Diffstat (limited to 'net')
-rw-r--r--net/6lowpan/nhc.c8
-rw-r--r--net/802/fc.c2
-rw-r--r--net/802/fddi.c11
-rw-r--r--net/802/hippi.c16
-rw-r--r--net/8021q/vlan.c11
-rw-r--r--net/8021q/vlan.h2
-rw-r--r--net/8021q/vlan_dev.c10
-rw-r--r--net/9p/client.c20
-rw-r--r--net/Kconfig22
-rw-r--r--net/Makefile3
-rw-r--r--net/appletalk/ddp.c4
-rw-r--r--net/atm/br2684.c6
-rw-r--r--net/atm/common.c4
-rw-r--r--net/atm/lec.c19
-rw-r--r--net/atm/mpc.c2
-rw-r--r--net/atm/mpoa_caches.c43
-rw-r--r--net/atm/svc.c7
-rw-r--r--net/ax25/af_ax25.c7
-rw-r--r--net/ax25/ax25_addr.c2
-rw-r--r--net/ax25/ax25_dev.c2
-rw-r--r--net/ax25/ax25_ds_in.c2
-rw-r--r--net/ax25/ax25_ds_subr.c2
-rw-r--r--net/ax25/ax25_ds_timer.c2
-rw-r--r--net/ax25/ax25_iface.c2
-rw-r--r--net/ax25/ax25_in.c2
-rw-r--r--net/ax25/ax25_ip.c2
-rw-r--r--net/ax25/ax25_out.c2
-rw-r--r--net/ax25/ax25_route.c2
-rw-r--r--net/ax25/ax25_std_in.c2
-rw-r--r--net/ax25/ax25_std_subr.c2
-rw-r--r--net/ax25/ax25_std_timer.c2
-rw-r--r--net/ax25/ax25_subr.c4
-rw-r--r--net/ax25/ax25_timer.c2
-rw-r--r--net/ax25/ax25_uid.c2
-rw-r--r--net/batman-adv/Kconfig2
-rw-r--r--net/batman-adv/Makefile2
-rw-r--r--net/batman-adv/bat_algo.c2
-rw-r--r--net/batman-adv/bat_algo.h2
-rw-r--r--net/batman-adv/bat_iv_ogm.c71
-rw-r--r--net/batman-adv/bat_iv_ogm.h2
-rw-r--r--net/batman-adv/bat_v.c24
-rw-r--r--net/batman-adv/bat_v.h2
-rw-r--r--net/batman-adv/bat_v_elp.c73
-rw-r--r--net/batman-adv/bat_v_elp.h2
-rw-r--r--net/batman-adv/bat_v_ogm.c77
-rw-r--r--net/batman-adv/bat_v_ogm.h2
-rw-r--r--net/batman-adv/bitarray.c2
-rw-r--r--net/batman-adv/bitarray.h2
-rw-r--r--net/batman-adv/bridge_loop_avoidance.c3
-rw-r--r--net/batman-adv/bridge_loop_avoidance.h20
-rw-r--r--net/batman-adv/debugfs.c30
-rw-r--r--net/batman-adv/debugfs.h2
-rw-r--r--net/batman-adv/distributed-arp-table.c87
-rw-r--r--net/batman-adv/distributed-arp-table.h2
-rw-r--r--net/batman-adv/fragmentation.c118
-rw-r--r--net/batman-adv/fragmentation.h4
-rw-r--r--net/batman-adv/gateway_client.c11
-rw-r--r--net/batman-adv/gateway_client.h2
-rw-r--r--net/batman-adv/gateway_common.c7
-rw-r--r--net/batman-adv/gateway_common.h2
-rw-r--r--net/batman-adv/hard-interface.c227
-rw-r--r--net/batman-adv/hard-interface.h23
-rw-r--r--net/batman-adv/hash.c2
-rw-r--r--net/batman-adv/hash.h32
-rw-r--r--net/batman-adv/icmp_socket.c7
-rw-r--r--net/batman-adv/icmp_socket.h2
-rw-r--r--net/batman-adv/log.c6
-rw-r--r--net/batman-adv/log.h14
-rw-r--r--net/batman-adv/main.c18
-rw-r--r--net/batman-adv/main.h30
-rw-r--r--net/batman-adv/multicast.c72
-rw-r--r--net/batman-adv/multicast.h8
-rw-r--r--net/batman-adv/netlink.c33
-rw-r--r--net/batman-adv/netlink.h2
-rw-r--r--net/batman-adv/network-coding.c45
-rw-r--r--net/batman-adv/network-coding.h2
-rw-r--r--net/batman-adv/originator.c25
-rw-r--r--net/batman-adv/originator.h2
-rw-r--r--net/batman-adv/packet.h14
-rw-r--r--net/batman-adv/routing.c181
-rw-r--r--net/batman-adv/routing.h2
-rw-r--r--net/batman-adv/send.c419
-rw-r--r--net/batman-adv/send.h13
-rw-r--r--net/batman-adv/soft-interface.c35
-rw-r--r--net/batman-adv/soft-interface.h2
-rw-r--r--net/batman-adv/sysfs.c55
-rw-r--r--net/batman-adv/sysfs.h2
-rw-r--r--net/batman-adv/tp_meter.c10
-rw-r--r--net/batman-adv/tp_meter.h2
-rw-r--r--net/batman-adv/translation-table.c43
-rw-r--r--net/batman-adv/translation-table.h2
-rw-r--r--net/batman-adv/tvlv.c7
-rw-r--r--net/batman-adv/tvlv.h2
-rw-r--r--net/batman-adv/types.h51
-rw-r--r--net/bluetooth/6lowpan.c2
-rw-r--r--net/bluetooth/Makefile2
-rw-r--r--net/bluetooth/a2mp.c4
-rw-r--r--net/bluetooth/af_bluetooth.c4
-rw-r--r--net/bluetooth/amp.c4
-rw-r--r--net/bluetooth/bnep/netdev.c3
-rw-r--r--net/bluetooth/cmtp/capi.c2
-rw-r--r--net/bluetooth/hci_event.c2
-rw-r--r--net/bluetooth/hci_request.c2
-rw-r--r--net/bluetooth/hci_sock.c6
-rw-r--r--net/bluetooth/l2cap_core.c10
-rw-r--r--net/bluetooth/l2cap_sock.c3
-rw-r--r--net/bluetooth/rfcomm/sock.c4
-rw-r--r--net/bluetooth/sco.c3
-rw-r--r--net/bluetooth/smp.c85
-rw-r--r--net/bluetooth/smp.h1
-rw-r--r--net/bridge/Makefile5
-rw-r--r--net/bridge/br_device.c38
-rw-r--r--net/bridge/br_fdb.c225
-rw-r--r--net/bridge/br_forward.c44
-rw-r--r--net/bridge/br_if.c3
-rw-r--r--net/bridge/br_input.c18
-rw-r--r--net/bridge/br_ioctl.c4
-rw-r--r--net/bridge/br_mdb.c2
-rw-r--r--net/bridge/br_multicast.c340
-rw-r--r--net/bridge/br_netfilter_hooks.c63
-rw-r--r--net/bridge/br_netfilter_ipv6.c2
-rw-r--r--net/bridge/br_netlink.c218
-rw-r--r--net/bridge/br_netlink_tunnel.c294
-rw-r--r--net/bridge/br_private.h93
-rw-r--r--net/bridge/br_private_stp.h1
-rw-r--r--net/bridge/br_private_tunnel.h83
-rw-r--r--net/bridge/br_stp.c67
-rw-r--r--net/bridge/br_stp_if.c18
-rw-r--r--net/bridge/br_stp_timer.c4
-rw-r--r--net/bridge/br_sysfs_br.c43
-rw-r--r--net/bridge/br_sysfs_if.c3
-rw-r--r--net/bridge/br_vlan.c24
-rw-r--r--net/bridge/br_vlan_tunnel.c205
-rw-r--r--net/bridge/netfilter/Kconfig1
-rw-r--r--net/bridge/netfilter/ebt_among.c2
-rw-r--r--net/bridge/netfilter/ebt_arpreply.c3
-rw-r--r--net/bridge/netfilter/ebt_limit.c1
-rw-r--r--net/bridge/netfilter/ebt_log.c13
-rw-r--r--net/bridge/netfilter/ebt_nflog.c6
-rw-r--r--net/bridge/netfilter/ebt_redirect.c6
-rw-r--r--net/bridge/netfilter/ebtable_broute.c2
-rw-r--r--net/bridge/netfilter/ebtables.c86
-rw-r--r--net/bridge/netfilter/nf_log_bridge.c17
-rw-r--r--net/bridge/netfilter/nft_meta_bridge.c2
-rw-r--r--net/bridge/netfilter/nft_reject_bridge.c30
-rw-r--r--net/caif/caif_dev.c2
-rw-r--r--net/caif/caif_socket.c2
-rw-r--r--net/caif/cfcnfg.c9
-rw-r--r--net/caif/chnl_net.c1
-rw-r--r--net/can/af_can.c12
-rw-r--r--net/can/af_can.h3
-rw-r--r--net/can/bcm.c59
-rw-r--r--net/can/gw.c4
-rw-r--r--net/can/raw.c4
-rw-r--r--net/ceph/auth.c4
-rw-r--r--net/ceph/auth_x.c197
-rw-r--r--net/ceph/auth_x.h3
-rw-r--r--net/ceph/ceph_common.c15
-rw-r--r--net/ceph/cls_lock_client.c14
-rw-r--r--net/ceph/crush/crush.c5
-rw-r--r--net/ceph/crush/mapper.c229
-rw-r--r--net/ceph/crypto.c465
-rw-r--r--net/ceph/crypto.h26
-rw-r--r--net/ceph/messenger.c73
-rw-r--r--net/ceph/mon_client.c12
-rw-r--r--net/ceph/osd_client.c199
-rw-r--r--net/ceph/osdmap.c101
-rw-r--r--net/ceph/snapshot.c2
-rw-r--r--net/compat.c36
-rw-r--r--net/core/Makefile2
-rw-r--r--net/core/datagram.c101
-rw-r--r--net/core/dev.c1433
-rw-r--r--net/core/devlink.c122
-rw-r--r--net/core/drop_monitor.c60
-rw-r--r--net/core/dst.c1
-rw-r--r--net/core/ethtool.c150
-rw-r--r--net/core/fib_rules.c78
-rw-r--r--net/core/filter.c562
-rw-r--r--net/core/flow.c60
-rw-r--r--net/core/flow_dissector.c95
-rw-r--r--net/core/gen_estimator.c296
-rw-r--r--net/core/gen_stats.c20
-rw-r--r--net/core/gro_cells.c92
-rw-r--r--net/core/lwt_bpf.c397
-rw-r--r--net/core/lwtunnel.c85
-rw-r--r--net/core/neighbour.c22
-rw-r--r--net/core/net-sysfs.c66
-rw-r--r--net/core/net_namespace.c79
-rw-r--r--net/core/netclassid_cgroup.c34
-rw-r--r--net/core/netpoll.c16
-rw-r--r--net/core/netprio_cgroup.c3
-rw-r--r--net/core/pktgen.c6
-rw-r--r--net/core/request_sock.c2
-rw-r--r--net/core/rtnetlink.c106
-rw-r--r--net/core/scm.c5
-rw-r--r--net/core/secure_seq.c177
-rw-r--r--net/core/skbuff.c172
-rw-r--r--net/core/sock.c239
-rw-r--r--net/core/stream.c29
-rw-r--r--net/core/sysctl_net_core.c49
-rw-r--r--net/core/utils.c2
-rw-r--r--net/dccp/ccids/ccid2.c1
-rw-r--r--net/dccp/input.c13
-rw-r--r--net/dccp/ipv4.c18
-rw-r--r--net/dccp/ipv6.c22
-rw-r--r--net/dccp/minisocks.c29
-rw-r--r--net/dccp/output.c1
-rw-r--r--net/decnet/af_decnet.c23
-rw-r--r--net/decnet/dn_dev.c4
-rw-r--r--net/decnet/dn_fib.c2
-rw-r--r--net/decnet/dn_table.c2
-rw-r--r--net/decnet/sysctl_net_decnet.c2
-rw-r--r--net/dns_resolver/dns_query.c6
-rw-r--r--net/dsa/Kconfig16
-rw-r--r--net/dsa/Makefile2
-rw-r--r--net/dsa/dsa.c262
-rw-r--r--net/dsa/dsa2.c254
-rw-r--r--net/dsa/dsa_priv.h25
-rw-r--r--net/dsa/slave.c497
-rw-r--r--net/dsa/switch.c85
-rw-r--r--net/dsa/tag_brcm.c11
-rw-r--r--net/dsa/tag_dsa.c10
-rw-r--r--net/dsa/tag_edsa.c10
-rw-r--r--net/dsa/tag_qca.c4
-rw-r--r--net/dsa/tag_trailer.c6
-rw-r--r--net/ethernet/eth.c39
-rw-r--r--net/hsr/hsr_device.c3
-rw-r--r--net/hsr/hsr_netlink.c23
-rw-r--r--net/hsr/hsr_slave.c3
-rw-r--r--net/ieee802154/6lowpan/6lowpan_i.h2
-rw-r--r--net/ieee802154/Makefile2
-rw-r--r--net/ieee802154/netlink.c24
-rw-r--r--net/ieee802154/nl-phy.c6
-rw-r--r--net/ieee802154/nl802154.c44
-rw-r--r--net/ieee802154/socket.c4
-rw-r--r--net/ife/Kconfig16
-rw-r--r--net/ife/Makefile5
-rw-r--r--net/ife/ife.c142
-rw-r--r--net/ipv4/Kconfig22
-rw-r--r--net/ipv4/Makefile2
-rw-r--r--net/ipv4/af_inet.c67
-rw-r--r--net/ipv4/ah4.c3
-rw-r--r--net/ipv4/arp.c12
-rw-r--r--net/ipv4/cipso_ipv4.c4
-rw-r--r--net/ipv4/devinet.c5
-rw-r--r--net/ipv4/esp4.c332
-rw-r--r--net/ipv4/esp4_offload.c106
-rw-r--r--net/ipv4/fib_frontend.c25
-rw-r--r--net/ipv4/fib_semantics.c85
-rw-r--r--net/ipv4/fib_trie.c195
-rw-r--r--net/ipv4/fou.c23
-rw-r--r--net/ipv4/icmp.c133
-rw-r--r--net/ipv4/igmp.c10
-rw-r--r--net/ipv4/inet_connection_sock.c284
-rw-r--r--net/ipv4/inet_diag.c75
-rw-r--r--net/ipv4/inet_hashtables.c19
-rw-r--r--net/ipv4/inet_timewait_sock.c3
-rw-r--r--net/ipv4/ip_fragment.c25
-rw-r--r--net/ipv4/ip_gre.c6
-rw-r--r--net/ipv4/ip_options.c2
-rw-r--r--net/ipv4/ip_output.c53
-rw-r--r--net/ipv4/ip_sockglue.c80
-rw-r--r--net/ipv4/ip_tunnel.c10
-rw-r--r--net/ipv4/ip_tunnel_core.c12
-rw-r--r--net/ipv4/ip_vti.c2
-rw-r--r--net/ipv4/ipconfig.c4
-rw-r--r--net/ipv4/ipip.c4
-rw-r--r--net/ipv4/ipmr.c296
-rw-r--r--net/ipv4/netfilter.c7
-rw-r--r--net/ipv4/netfilter/Kconfig14
-rw-r--r--net/ipv4/netfilter/Makefile3
-rw-r--r--net/ipv4/netfilter/arp_tables.c59
-rw-r--r--net/ipv4/netfilter/ip_tables.c65
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c44
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c11
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c4
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c19
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c18
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c151
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c15
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c45
-rw-r--r--net/ipv4/netfilter/nf_dup_ipv4.c7
-rw-r--r--net/ipv4/netfilter/nf_log_arp.c4
-rw-r--r--net/ipv4/netfilter/nf_log_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c5
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c20
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_socket_ipv4.c163
-rw-r--r--net/ipv4/netfilter/nft_dup_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nft_fib_ipv4.c236
-rw-r--r--net/ipv4/netfilter/nft_masq_ipv4.c23
-rw-r--r--net/ipv4/netfilter/nft_redir_ipv4.c22
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c4
-rw-r--r--net/ipv4/ping.c27
-rw-r--r--net/ipv4/proc.c5
-rw-r--r--net/ipv4/raw.c43
-rw-r--r--net/ipv4/raw_diag.c266
-rw-r--r--net/ipv4/route.c172
-rw-r--r--net/ipv4/syncookies.c24
-rw-r--r--net/ipv4/sysctl_net_ipv4.c126
-rw-r--r--net/ipv4/tcp.c220
-rw-r--r--net/ipv4/tcp_bbr.c32
-rw-r--r--net/ipv4/tcp_cdg.c2
-rw-r--r--net/ipv4/tcp_cong.c14
-rw-r--r--net/ipv4/tcp_dctcp.c1
-rw-r--r--net/ipv4/tcp_fastopen.c57
-rw-r--r--net/ipv4/tcp_highspeed.c11
-rw-r--r--net/ipv4/tcp_hybla.c1
-rw-r--r--net/ipv4/tcp_illinois.c10
-rw-r--r--net/ipv4/tcp_input.c349
-rw-r--r--net/ipv4/tcp_ipv4.c90
-rw-r--r--net/ipv4/tcp_lp.c1
-rw-r--r--net/ipv4/tcp_metrics.c32
-rw-r--r--net/ipv4/tcp_minisocks.c30
-rw-r--r--net/ipv4/tcp_output.c339
-rw-r--r--net/ipv4/tcp_recovery.c149
-rw-r--r--net/ipv4/tcp_scalable.c15
-rw-r--r--net/ipv4/tcp_timer.c15
-rw-r--r--net/ipv4/tcp_vegas.c1
-rw-r--r--net/ipv4/tcp_veno.c10
-rw-r--r--net/ipv4/tcp_westwood.c1
-rw-r--r--net/ipv4/tcp_yeah.c10
-rw-r--r--net/ipv4/udp.c373
-rw-r--r--net/ipv4/udplite.c3
-rw-r--r--net/ipv4/xfrm4_input.c6
-rw-r--r--net/ipv4/xfrm4_mode_transport.c4
-rw-r--r--net/ipv4/xfrm4_policy.c9
-rw-r--r--net/ipv4/xfrm4_protocol.c3
-rw-r--r--net/ipv4/xfrm4_state.c8
-rw-r--r--net/ipv6/Kconfig49
-rw-r--r--net/ipv6/Makefile5
-rw-r--r--net/ipv6/addrconf.c206
-rw-r--r--net/ipv6/af_inet6.c33
-rw-r--r--net/ipv6/ah6.c8
-rw-r--r--net/ipv6/datagram.c34
-rw-r--r--net/ipv6/esp6.c323
-rw-r--r--net/ipv6/esp6_offload.c108
-rw-r--r--net/ipv6/exthdrs.c244
-rw-r--r--net/ipv6/icmp.c77
-rw-r--r--net/ipv6/ila/ila_lwt.c95
-rw-r--r--net/ipv6/ila/ila_xlat.c43
-rw-r--r--net/ipv6/inet6_connection_sock.c43
-rw-r--r--net/ipv6/inet6_hashtables.c46
-rw-r--r--net/ipv6/ip6_fib.c24
-rw-r--r--net/ipv6/ip6_flowlabel.c2
-rw-r--r--net/ipv6/ip6_gre.c57
-rw-r--r--net/ipv6/ip6_input.c7
-rw-r--r--net/ipv6/ip6_offload.c7
-rw-r--r--net/ipv6/ip6_output.c47
-rw-r--r--net/ipv6/ip6_tunnel.c54
-rw-r--r--net/ipv6/ip6_vti.c46
-rw-r--r--net/ipv6/ip6mr.c41
-rw-r--r--net/ipv6/ipcomp6.c5
-rw-r--r--net/ipv6/ipv6_sockglue.c36
-rw-r--r--net/ipv6/mcast.c50
-rw-r--r--net/ipv6/mip6.c2
-rw-r--r--net/ipv6/ndisc.c29
-rw-r--r--net/ipv6/netfilter.c1
-rw-r--r--net/ipv6/netfilter/Kconfig14
-rw-r--r--net/ipv6/netfilter/Makefile3
-rw-r--r--net/ipv6/netfilter/ip6_tables.c66
-rw-r--r--net/ipv6/netfilter/ip6t_MASQUERADE.c2
-rw-r--r--net/ipv6/netfilter/ip6t_NPT.c2
-rw-r--r--net/ipv6/netfilter/ip6t_REJECT.c23
-rw-r--r--net/ipv6/netfilter/ip6t_SYNPROXY.c19
-rw-r--r--net/ipv6/netfilter/ip6t_rpfilter.c11
-rw-r--r--net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c146
-rw-r--r--net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c20
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c1
-rw-r--r--net/ipv6/netfilter/nf_defrag_ipv6_hooks.c46
-rw-r--r--net/ipv6/netfilter/nf_dup_ipv6.c7
-rw-r--r--net/ipv6/netfilter/nf_log_ipv6.c4
-rw-r--r--net/ipv6/netfilter/nf_reject_ipv6.c3
-rw-r--r--net/ipv6/netfilter/nf_socket_ipv6.c151
-rw-r--r--net/ipv6/netfilter/nft_dup_ipv6.c2
-rw-r--r--net/ipv6/netfilter/nft_fib_ipv6.c270
-rw-r--r--net/ipv6/netfilter/nft_masq_ipv6.c22
-rw-r--r--net/ipv6/netfilter/nft_redir_ipv6.c22
-rw-r--r--net/ipv6/netfilter/nft_reject_ipv6.c6
-rw-r--r--net/ipv6/ping.c8
-rw-r--r--net/ipv6/raw.c20
-rw-r--r--net/ipv6/reassembly.c7
-rw-r--r--net/ipv6/route.c401
-rw-r--r--net/ipv6/seg6.c500
-rw-r--r--net/ipv6/seg6_hmac.c448
-rw-r--r--net/ipv6/seg6_iptunnel.c436
-rw-r--r--net/ipv6/sit.c19
-rw-r--r--net/ipv6/syncookies.c43
-rw-r--r--net/ipv6/tcp_ipv6.c96
-rw-r--r--net/ipv6/udp.c88
-rw-r--r--net/ipv6/udplite.c3
-rw-r--r--net/ipv6/xfrm6_input.c22
-rw-r--r--net/ipv6/xfrm6_mode_transport.c4
-rw-r--r--net/ipv6/xfrm6_policy.c9
-rw-r--r--net/ipv6/xfrm6_protocol.c3
-rw-r--r--net/ipv6/xfrm6_tunnel.c2
-rw-r--r--net/ipx/af_ipx.c4
-rw-r--r--net/irda/af_irda.c8
-rw-r--r--net/irda/ircomm/ircomm_tty.c4
-rw-r--r--net/irda/ircomm/ircomm_tty_ioctl.c2
-rw-r--r--net/irda/irda_device.c2
-rw-r--r--net/irda/irlan/irlan_eth.c4
-rw-r--r--net/irda/irnet/irnet.h3
-rw-r--r--net/irda/irnet/irnet_ppp.c15
-rw-r--r--net/irda/irnet/irnet_ppp.h11
-rw-r--r--net/irda/irnetlink.c22
-rw-r--r--net/irda/irproc.c1
-rw-r--r--net/irda/irqueue.c34
-rw-r--r--net/iucv/af_iucv.c63
-rw-r--r--net/iucv/iucv.c124
-rw-r--r--net/kcm/kcmsock.c54
-rw-r--r--net/key/af_key.c95
-rw-r--r--net/l2tp/l2tp_core.c170
-rw-r--r--net/l2tp/l2tp_core.h20
-rw-r--r--net/l2tp/l2tp_debugfs.c10
-rw-r--r--net/l2tp/l2tp_eth.c18
-rw-r--r--net/l2tp/l2tp_ip.c97
-rw-r--r--net/l2tp/l2tp_ip6.c79
-rw-r--r--net/l2tp/l2tp_netlink.c111
-rw-r--r--net/l2tp/l2tp_ppp.c163
-rw-r--r--net/lapb/lapb_iface.c2
-rw-r--r--net/lapb/lapb_in.c2
-rw-r--r--net/lapb/lapb_out.c2
-rw-r--r--net/lapb/lapb_subr.c2
-rw-r--r--net/lapb/lapb_timer.c2
-rw-r--r--net/llc/af_llc.c30
-rw-r--r--net/llc/llc_conn.c3
-rw-r--r--net/llc/llc_sap.c3
-rw-r--r--net/mac80211/Kconfig1
-rw-r--r--net/mac80211/Makefile3
-rw-r--r--net/mac80211/aes_cmac.c126
-rw-r--r--net/mac80211/aes_cmac.h11
-rw-r--r--net/mac80211/agg-rx.c11
-rw-r--r--net/mac80211/cfg.c51
-rw-r--r--net/mac80211/chan.c7
-rw-r--r--net/mac80211/debugfs.c36
-rw-r--r--net/mac80211/debugfs_netdev.c14
-rw-r--r--net/mac80211/debugfs_sta.c10
-rw-r--r--net/mac80211/fils_aead.c334
-rw-r--r--net/mac80211/fils_aead.h19
-rw-r--r--net/mac80211/ibss.c4
-rw-r--r--net/mac80211/ieee80211_i.h38
-rw-r--r--net/mac80211/iface.c59
-rw-r--r--net/mac80211/key.c3
-rw-r--r--net/mac80211/key.h2
-rw-r--r--net/mac80211/main.c18
-rw-r--r--net/mac80211/mesh.c11
-rw-r--r--net/mac80211/mesh.h2
-rw-r--r--net/mac80211/mesh_plink.c16
-rw-r--r--net/mac80211/mesh_sync.c27
-rw-r--r--net/mac80211/mlme.c94
-rw-r--r--net/mac80211/pm.c1
-rw-r--r--net/mac80211/rc80211_minstrel.c21
-rw-r--r--net/mac80211/rc80211_minstrel.h33
-rw-r--r--net/mac80211/rc80211_minstrel_debugfs.c24
-rw-r--r--net/mac80211/rc80211_minstrel_ht.c68
-rw-r--r--net/mac80211/rc80211_minstrel_ht.h6
-rw-r--r--net/mac80211/rc80211_minstrel_ht_debugfs.c32
-rw-r--r--net/mac80211/rx.c172
-rw-r--r--net/mac80211/scan.c8
-rw-r--r--net/mac80211/sta_info.c59
-rw-r--r--net/mac80211/sta_info.h12
-rw-r--r--net/mac80211/status.c17
-rw-r--r--net/mac80211/trace.h27
-rw-r--r--net/mac80211/tx.c211
-rw-r--r--net/mac80211/util.c61
-rw-r--r--net/mac80211/vht.c8
-rw-r--r--net/mac80211/wep.c3
-rw-r--r--net/mac80211/wme.c23
-rw-r--r--net/mac80211/wpa.c5
-rw-r--r--net/mac802154/Makefile2
-rw-r--r--net/mac802154/llsec.c2
-rw-r--r--net/mac802154/util.c4
-rw-r--r--net/mpls/af_mpls.c452
-rw-r--r--net/mpls/internal.h58
-rw-r--r--net/mpls/mpls_iptunnel.c19
-rw-r--r--net/netfilter/Kconfig70
-rw-r--r--net/netfilter/Makefile23
-rw-r--r--net/netfilter/core.c102
-rw-r--r--net/netfilter/ipset/Kconfig9
-rw-r--r--net/netfilter/ipset/Makefile1
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_gen.h31
-rw-r--r--net/netfilter/ipset/ip_set_core.c22
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h256
-rw-r--r--net/netfilter/ipset/ip_set_hash_ip.c10
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmac.c315
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmark.c10
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipport.c6
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportip.c6
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportnet.c10
-rw-r--r--net/netfilter/ipset/ip_set_hash_net.c8
-rw-r--r--net/netfilter/ipset/ip_set_hash_netiface.c10
-rw-r--r--net/netfilter/ipset/ip_set_hash_netnet.c8
-rw-r--r--net/netfilter/ipset/ip_set_hash_netport.c10
-rw-r--r--net/netfilter/ipset/ip_set_hash_netportnet.c10
-rw-r--r--net/netfilter/ipset/ip_set_list_set.c46
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c42
-rw-r--r--net/netfilter/ipvs/ip_vs_dh.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c54
-rw-r--r--net/netfilter/nf_conntrack_core.c144
-rw-r--r--net/netfilter/nf_conntrack_ecache.c2
-rw-r--r--net/netfilter/nf_conntrack_expect.c18
-rw-r--r--net/netfilter/nf_conntrack_extend.c13
-rw-r--r--net/netfilter/nf_conntrack_ftp.c2
-rw-r--r--net/netfilter/nf_conntrack_helper.c56
-rw-r--r--net/netfilter/nf_conntrack_netlink.c75
-rw-r--r--net/netfilter/nf_conntrack_proto.c158
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c102
-rw-r--r--net/netfilter/nf_conntrack_proto_gre.c13
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c132
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c1
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c124
-rw-r--r--net/netfilter/nf_conntrack_proto_udplite.c409
-rw-r--r--net/netfilter/nf_conntrack_sip.c14
-rw-r--r--net/netfilter/nf_conntrack_standalone.c13
-rw-r--r--net/netfilter/nf_dup_netdev.c35
-rw-r--r--net/netfilter/nf_internals.h5
-rw-r--r--net/netfilter/nf_log.c25
-rw-r--r--net/netfilter/nf_log_common.c28
-rw-r--r--net/netfilter/nf_log_netdev.c81
-rw-r--r--net/netfilter/nf_nat_core.c14
-rw-r--r--net/netfilter/nf_nat_helper.c2
-rw-r--r--net/netfilter/nf_nat_proto_dccp.c36
-rw-r--r--net/netfilter/nf_nat_proto_sctp.c53
-rw-r--r--net/netfilter/nf_nat_proto_udp.c78
-rw-r--r--net/netfilter/nf_nat_proto_udplite.c106
-rw-r--r--net/netfilter/nf_nat_redirect.c2
-rw-r--r--net/netfilter/nf_queue.c36
-rw-r--r--net/netfilter/nf_synproxy_core.c2
-rw-r--r--net/netfilter/nf_tables_api.c996
-rw-r--r--net/netfilter/nf_tables_core.c91
-rw-r--r--net/netfilter/nf_tables_trace.c8
-rw-r--r--net/netfilter/nfnetlink.c94
-rw-r--r--net/netfilter/nfnetlink_cthelper.c289
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c2
-rw-r--r--net/netfilter/nfnetlink_log.c7
-rw-r--r--net/netfilter/nfnetlink_queue.c19
-rw-r--r--net/netfilter/nft_bitwise.c13
-rw-r--r--net/netfilter/nft_byteorder.c13
-rw-r--r--net/netfilter/nft_cmp.c16
-rw-r--r--net/netfilter/nft_counter.c223
-rw-r--r--net/netfilter/nft_ct.c306
-rw-r--r--net/netfilter/nft_dynset.c16
-rw-r--r--net/netfilter/nft_exthdr.c139
-rw-r--r--net/netfilter/nft_fib.c159
-rw-r--r--net/netfilter/nft_fib_inet.c82
-rw-r--r--net/netfilter/nft_fwd_netdev.c4
-rw-r--r--net/netfilter/nft_hash.c12
-rw-r--r--net/netfilter/nft_immediate.c16
-rw-r--r--net/netfilter/nft_log.c8
-rw-r--r--net/netfilter/nft_lookup.c21
-rw-r--r--net/netfilter/nft_masq.c6
-rw-r--r--net/netfilter/nft_meta.c74
-rw-r--r--net/netfilter/nft_nat.c19
-rw-r--r--net/netfilter/nft_numgen.c2
-rw-r--r--net/netfilter/nft_objref.c228
-rw-r--r--net/netfilter/nft_payload.c145
-rw-r--r--net/netfilter/nft_queue.c4
-rw-r--r--net/netfilter/nft_quota.c160
-rw-r--r--net/netfilter/nft_range.c13
-rw-r--r--net/netfilter/nft_redir.c6
-rw-r--r--net/netfilter/nft_reject_inet.c18
-rw-r--r--net/netfilter/nft_rt.c153
-rw-r--r--net/netfilter/nft_set_bitmap.c307
-rw-r--r--net/netfilter/nft_set_hash.c35
-rw-r--r--net/netfilter/nft_set_rbtree.c31
-rw-r--r--net/netfilter/x_tables.c130
-rw-r--r--net/netfilter/xt_AUDIT.c10
-rw-r--r--net/netfilter/xt_CONNSECMARK.c4
-rw-r--r--net/netfilter/xt_CT.c21
-rw-r--r--net/netfilter/xt_LOG.c6
-rw-r--r--net/netfilter/xt_NETMAP.c31
-rw-r--r--net/netfilter/xt_NFLOG.c6
-rw-r--r--net/netfilter/xt_NFQUEUE.c4
-rw-r--r--net/netfilter/xt_RATEEST.c5
-rw-r--r--net/netfilter/xt_REDIRECT.c16
-rw-r--r--net/netfilter/xt_TCPMSS.c10
-rw-r--r--net/netfilter/xt_TEE.c6
-rw-r--r--net/netfilter/xt_TPROXY.c36
-rw-r--r--net/netfilter/xt_addrtype.c10
-rw-r--r--net/netfilter/xt_bpf.c98
-rw-r--r--net/netfilter/xt_cgroup.c1
-rw-r--r--net/netfilter/xt_cluster.c2
-rw-r--r--net/netfilter/xt_connbytes.c4
-rw-r--r--net/netfilter/xt_connlabel.c6
-rw-r--r--net/netfilter/xt_connlimit.c19
-rw-r--r--net/netfilter/xt_connmark.c8
-rw-r--r--net/netfilter/xt_conntrack.c12
-rw-r--r--net/netfilter/xt_devgroup.c4
-rw-r--r--net/netfilter/xt_dscp.c2
-rw-r--r--net/netfilter/xt_hashlimit.c31
-rw-r--r--net/netfilter/xt_helper.c4
-rw-r--r--net/netfilter/xt_ipvs.c4
-rw-r--r--net/netfilter/xt_limit.c2
-rw-r--r--net/netfilter/xt_multiport.c52
-rw-r--r--net/netfilter/xt_nat.c18
-rw-r--r--net/netfilter/xt_nfacct.c2
-rw-r--r--net/netfilter/xt_osf.c10
-rw-r--r--net/netfilter/xt_owner.c4
-rw-r--r--net/netfilter/xt_pkttype.c5
-rw-r--r--net/netfilter/xt_policy.c4
-rw-r--r--net/netfilter/xt_quota.c1
-rw-r--r--net/netfilter/xt_rateest.c29
-rw-r--r--net/netfilter/xt_recent.c12
-rw-r--r--net/netfilter/xt_set.c38
-rw-r--r--net/netfilter/xt_socket.c336
-rw-r--r--net/netfilter/xt_state.c4
-rw-r--r--net/netfilter/xt_string.c1
-rw-r--r--net/netfilter/xt_time.c2
-rw-r--r--net/netlabel/netlabel_calipso.c21
-rw-r--r--net/netlabel/netlabel_cipso_v4.c22
-rw-r--r--net/netlabel/netlabel_kapi.c5
-rw-r--r--net/netlabel/netlabel_mgmt.c21
-rw-r--r--net/netlabel/netlabel_unlabeled.c21
-rw-r--r--net/netlink/af_netlink.c57
-rw-r--r--net/netlink/genetlink.c325
-rw-r--r--net/netrom/af_netrom.c5
-rw-r--r--net/nfc/llcp_sock.c3
-rw-r--r--net/nfc/netlink.c34
-rw-r--r--net/openvswitch/actions.c176
-rw-r--r--net/openvswitch/conntrack.c326
-rw-r--r--net/openvswitch/conntrack.h14
-rw-r--r--net/openvswitch/datapath.c34
-rw-r--r--net/openvswitch/datapath.h2
-rw-r--r--net/openvswitch/flow.c153
-rw-r--r--net/openvswitch/flow.h77
-rw-r--r--net/openvswitch/flow_netlink.c274
-rw-r--r--net/openvswitch/flow_netlink.h7
-rw-r--r--net/openvswitch/vport-internal_dev.c16
-rw-r--r--net/openvswitch/vport-netdev.c10
-rw-r--r--net/openvswitch/vport.c48
-rw-r--r--net/openvswitch/vport.h3
-rw-r--r--net/packet/af_packet.c213
-rw-r--r--net/packet/diag.c3
-rw-r--r--net/phonet/pep-gprs.c12
-rw-r--r--net/phonet/pep.c16
-rw-r--r--net/phonet/pn_dev.c2
-rw-r--r--net/phonet/socket.c6
-rw-r--r--net/psample/Kconfig15
-rw-r--r--net/psample/Makefile5
-rw-r--r--net/psample/psample.c301
-rw-r--r--net/qrtr/qrtr.c8
-rw-r--r--net/rds/af_rds.c35
-rw-r--r--net/rds/bind.c4
-rw-r--r--net/rds/connection.c29
-rw-r--r--net/rds/ib.c20
-rw-r--r--net/rds/ib.h30
-rw-r--r--net/rds/ib_cm.c136
-rw-r--r--net/rds/ib_frmr.c16
-rw-r--r--net/rds/ib_mr.h3
-rw-r--r--net/rds/ib_recv.c14
-rw-r--r--net/rds/ib_send.c30
-rw-r--r--net/rds/ib_stats.c2
-rw-r--r--net/rds/message.c1
-rw-r--r--net/rds/page.c29
-rw-r--r--net/rds/rdma.c24
-rw-r--r--net/rds/rdma_transport.c16
-rw-r--r--net/rds/rds.h44
-rw-r--r--net/rds/recv.c72
-rw-r--r--net/rds/send.c59
-rw-r--r--net/rds/tcp.c67
-rw-r--r--net/rds/tcp.h2
-rw-r--r--net/rds/tcp_connect.c14
-rw-r--r--net/rds/tcp_listen.c43
-rw-r--r--net/rds/tcp_recv.c5
-rw-r--r--net/rds/tcp_send.c3
-rw-r--r--net/rds/threads.c3
-rw-r--r--net/rds/transport.c4
-rw-r--r--net/rfkill/Kconfig11
-rw-r--r--net/rfkill/Makefile1
-rw-r--r--net/rfkill/core.c100
-rw-r--r--net/rfkill/rfkill-regulator.c154
-rw-r--r--net/rose/af_rose.c7
-rw-r--r--net/rose/rose_route.c2
-rw-r--r--net/rxrpc/Makefile12
-rw-r--r--net/rxrpc/af_rxrpc.c31
-rw-r--r--net/rxrpc/ar-internal.h196
-rw-r--r--net/rxrpc/call_accept.c51
-rw-r--r--net/rxrpc/call_object.c36
-rw-r--r--net/rxrpc/conn_client.c14
-rw-r--r--net/rxrpc/conn_event.c4
-rw-r--r--net/rxrpc/conn_object.c1
-rw-r--r--net/rxrpc/input.c51
-rw-r--r--net/rxrpc/key.c2
-rw-r--r--net/rxrpc/misc.c151
-rw-r--r--net/rxrpc/proc.c9
-rw-r--r--net/rxrpc/recvmsg.c53
-rw-r--r--net/rxrpc/sendmsg.c99
-rw-r--r--net/sched/Kconfig14
-rw-r--r--net/sched/Makefile1
-rw-r--r--net/sched/act_api.c84
-rw-r--r--net/sched/act_bpf.c19
-rw-r--r--net/sched/act_connmark.c5
-rw-r--r--net/sched/act_csum.c32
-rw-r--r--net/sched/act_gact.c2
-rw-r--r--net/sched/act_ife.c119
-rw-r--r--net/sched/act_ipt.c16
-rw-r--r--net/sched/act_mirred.c94
-rw-r--r--net/sched/act_nat.c2
-rw-r--r--net/sched/act_pedit.c222
-rw-r--r--net/sched/act_police.c23
-rw-r--r--net/sched/act_sample.c276
-rw-r--r--net/sched/act_simple.c2
-rw-r--r--net/sched/act_skbedit.c23
-rw-r--r--net/sched/act_skbmod.c3
-rw-r--r--net/sched/act_tunnel_key.c19
-rw-r--r--net/sched/act_vlan.c2
-rw-r--r--net/sched/cls_api.c218
-rw-r--r--net/sched/cls_bpf.c62
-rw-r--r--net/sched/cls_flow.c2
-rw-r--r--net/sched/cls_flower.c409
-rw-r--r--net/sched/cls_matchall.c160
-rw-r--r--net/sched/cls_u32.c11
-rw-r--r--net/sched/em_ipset.c17
-rw-r--r--net/sched/em_meta.c10
-rw-r--r--net/sched/sch_api.c52
-rw-r--r--net/sched/sch_atm.c1
-rw-r--r--net/sched/sch_cbq.c9
-rw-r--r--net/sched/sch_choke.c1
-rw-r--r--net/sched/sch_drr.c6
-rw-r--r--net/sched/sch_dsmark.c11
-rw-r--r--net/sched/sch_fq.c18
-rw-r--r--net/sched/sch_fq_codel.c7
-rw-r--r--net/sched/sch_generic.c6
-rw-r--r--net/sched/sch_hfsc.c6
-rw-r--r--net/sched/sch_hhf.c8
-rw-r--r--net/sched/sch_htb.c7
-rw-r--r--net/sched/sch_ingress.c1
-rw-r--r--net/sched/sch_mq.c10
-rw-r--r--net/sched/sch_mqprio.c19
-rw-r--r--net/sched/sch_multiq.c2
-rw-r--r--net/sched/sch_netem.c6
-rw-r--r--net/sched/sch_prio.c2
-rw-r--r--net/sched/sch_qfq.c8
-rw-r--r--net/sched/sch_sfb.c1
-rw-r--r--net/sched/sch_sfq.c4
-rw-r--r--net/sched/sch_teql.c10
-rw-r--r--net/sctp/Makefile2
-rw-r--r--net/sctp/associola.c59
-rw-r--r--net/sctp/bind_addr.c3
-rw-r--r--net/sctp/chunk.c139
-rw-r--r--net/sctp/debug.c5
-rw-r--r--net/sctp/endpointola.c6
-rw-r--r--net/sctp/input.c130
-rw-r--r--net/sctp/ipv6.c26
-rw-r--r--net/sctp/objcnt.c2
-rw-r--r--net/sctp/offload.c2
-rw-r--r--net/sctp/output.c526
-rw-r--r--net/sctp/outqueue.c55
-rw-r--r--net/sctp/primitive.c3
-rw-r--r--net/sctp/proc.c4
-rw-r--r--net/sctp/protocol.c70
-rw-r--r--net/sctp/sm_make_chunk.c361
-rw-r--r--net/sctp/sm_sideeffect.c38
-rw-r--r--net/sctp/sm_statefuns.c232
-rw-r--r--net/sctp/sm_statetable.c70
-rw-r--r--net/sctp/socket.c238
-rw-r--r--net/sctp/ssnmap.c125
-rw-r--r--net/sctp/stream.c506
-rw-r--r--net/sctp/transport.c58
-rw-r--r--net/sctp/ulpevent.c29
-rw-r--r--net/sctp/ulpqueue.c36
-rw-r--r--net/smc/Kconfig20
-rw-r--r--net/smc/Makefile4
-rw-r--r--net/smc/af_smc.c1409
-rw-r--r--net/smc/smc.h274
-rw-r--r--net/smc/smc_cdc.c304
-rw-r--r--net/smc/smc_cdc.h218
-rw-r--r--net/smc/smc_clc.c282
-rw-r--r--net/smc/smc_clc.h116
-rw-r--r--net/smc/smc_close.c444
-rw-r--r--net/smc/smc_close.h28
-rw-r--r--net/smc/smc_core.c682
-rw-r--r--net/smc/smc_core.h181
-rw-r--r--net/smc/smc_diag.c215
-rw-r--r--net/smc/smc_ib.c466
-rw-r--r--net/smc/smc_ib.h71
-rw-r--r--net/smc/smc_llc.c158
-rw-r--r--net/smc/smc_llc.h63
-rw-r--r--net/smc/smc_pnet.c534
-rw-r--r--net/smc/smc_pnet.h23
-rw-r--r--net/smc/smc_rx.c219
-rw-r--r--net/smc/smc_rx.h23
-rw-r--r--net/smc/smc_tx.c485
-rw-r--r--net/smc/smc_tx.h35
-rw-r--r--net/smc/smc_wr.c614
-rw-r--r--net/smc/smc_wr.h106
-rw-r--r--net/socket.c68
-rw-r--r--net/strparser/strparser.c1
-rw-r--r--net/sunrpc/auth.c16
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c11
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_crypto.c12
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c3
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.c2
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c6
-rw-r--r--net/sunrpc/auth_null.c3
-rw-r--r--net/sunrpc/auth_unix.c18
-rw-r--r--net/sunrpc/cache.c125
-rw-r--r--net/sunrpc/clnt.c58
-rw-r--r--net/sunrpc/debugfs.c35
-rw-r--r--net/sunrpc/netns.h2
-rw-r--r--net/sunrpc/stats.c10
-rw-r--r--net/sunrpc/sunrpc_syms.c3
-rw-r--r--net/sunrpc/svc.c42
-rw-r--r--net/sunrpc/svc_xprt.c16
-rw-r--r--net/sunrpc/svcauth.c18
-rw-r--r--net/sunrpc/svcauth_unix.c4
-rw-r--r--net/sunrpc/svcsock.c32
-rw-r--r--net/sunrpc/sysctl.c2
-rw-r--r--net/sunrpc/xdr.c34
-rw-r--r--net/sunrpc/xprt.c5
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c4
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c5
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c105
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c118
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c23
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_marshal.c299
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c43
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c138
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c168
-rw-r--r--net/sunrpc/xprtrdma/transport.c40
-rw-r--r--net/sunrpc/xprtrdma/verbs.c134
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h61
-rw-r--r--net/sunrpc/xprtsock.c106
-rw-r--r--net/switchdev/switchdev.c5
-rw-r--r--net/tipc/bcast.c204
-rw-r--r--net/tipc/bcast.h33
-rw-r--r--net/tipc/bearer.c15
-rw-r--r--net/tipc/bearer.h8
-rw-r--r--net/tipc/core.c2
-rw-r--r--net/tipc/core.h2
-rw-r--r--net/tipc/discover.c4
-rw-r--r--net/tipc/link.c89
-rw-r--r--net/tipc/msg.c37
-rw-r--r--net/tipc/msg.h15
-rw-r--r--net/tipc/name_distr.c2
-rw-r--r--net/tipc/name_table.c128
-rw-r--r--net/tipc/name_table.h24
-rw-r--r--net/tipc/net.c4
-rw-r--r--net/tipc/netlink.c27
-rw-r--r--net/tipc/netlink_compat.c25
-rw-r--r--net/tipc/node.c63
-rw-r--r--net/tipc/node.h4
-rw-r--r--net/tipc/server.c48
-rw-r--r--net/tipc/socket.c985
-rw-r--r--net/tipc/subscr.c127
-rw-r--r--net/tipc/subscr.h1
-rw-r--r--net/tipc/udp_media.c8
-rw-r--r--net/unix/af_unix.c87
-rw-r--r--net/unix/garbage.c17
-rw-r--r--net/vmw_vsock/af_vsock.c18
-rw-r--r--net/vmw_vsock/virtio_transport.c103
-rw-r--r--net/vmw_vsock/virtio_transport_common.c35
-rw-r--r--net/vmw_vsock/vmci_transport_notify.c30
-rw-r--r--net/vmw_vsock/vmci_transport_notify_qstate.c30
-rw-r--r--net/wimax/stack.c22
-rw-r--r--net/wireless/Makefile3
-rw-r--r--net/wireless/core.c39
-rw-r--r--net/wireless/core.h13
-rw-r--r--net/wireless/debugfs.c10
-rw-r--r--net/wireless/lib80211_crypt_tkip.c2
-rw-r--r--net/wireless/mesh.c2
-rw-r--r--net/wireless/mlme.c47
-rw-r--r--net/wireless/nl80211.c951
-rw-r--r--net/wireless/nl80211.h10
-rw-r--r--net/wireless/of.c138
-rw-r--r--net/wireless/rdev-ops.h24
-rw-r--r--net/wireless/reg.c27
-rw-r--r--net/wireless/scan.c9
-rw-r--r--net/wireless/sme.c88
-rw-r--r--net/wireless/sysfs.c16
-rw-r--r--net/wireless/trace.h64
-rw-r--r--net/wireless/util.c157
-rw-r--r--net/wireless/wext-core.c67
-rw-r--r--net/wireless/wext-sme.c23
-rw-r--r--net/x25/af_x25.c7
-rw-r--r--net/x25/sysctl_net_x25.c2
-rw-r--r--net/x25/x25_link.c2
-rw-r--r--net/xfrm/Kconfig5
-rw-r--r--net/xfrm/xfrm_input.c111
-rw-r--r--net/xfrm/xfrm_output.c8
-rw-r--r--net/xfrm/xfrm_policy.c173
-rw-r--r--net/xfrm/xfrm_state.c98
-rw-r--r--net/xfrm/xfrm_user.c11
890 files changed, 39173 insertions, 15469 deletions
diff --git a/net/6lowpan/nhc.c b/net/6lowpan/nhc.c
index 7008d53e455c..4fa2fdda174d 100644
--- a/net/6lowpan/nhc.c
+++ b/net/6lowpan/nhc.c
@@ -27,8 +27,8 @@ static int lowpan_nhc_insert(struct lowpan_nhc *nhc)
 
 	/* Figure out where to put new node */
 	while (*new) {
-		struct lowpan_nhc *this = container_of(*new, struct lowpan_nhc,
+		struct lowpan_nhc *this = rb_entry(*new, struct lowpan_nhc,
 						       node);
 		int result, len_dif, len;
 
 		len_dif = nhc->idlen - this->idlen;
@@ -69,8 +69,8 @@ static struct lowpan_nhc *lowpan_nhc_by_nhcid(const struct sk_buff *skb)
 	const u8 *nhcid_skb_ptr = skb->data;
 
 	while (node) {
-		struct lowpan_nhc *nhc = container_of(node, struct lowpan_nhc,
+		struct lowpan_nhc *nhc = rb_entry(node, struct lowpan_nhc,
 						      node);
 		u8 nhcid_skb_ptr_masked[LOWPAN_NHC_MAX_ID_LEN];
 		int result, i;
 
diff --git a/net/802/fc.c b/net/802/fc.c
index 7b9219022418..1bb496ea997e 100644
--- a/net/802/fc.c
+++ b/net/802/fc.c
@@ -10,7 +10,7 @@
  *		v 1.0 03/22/99
  */
 
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
diff --git a/net/802/fddi.c b/net/802/fddi.c
index 7d3a0af954e8..6356623fc238 100644
--- a/net/802/fddi.c
+++ b/net/802/fddi.c
@@ -141,15 +141,6 @@ __be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
 
 EXPORT_SYMBOL(fddi_type_trans);
 
-int fddi_change_mtu(struct net_device *dev, int new_mtu)
-{
-	if ((new_mtu < FDDI_K_SNAP_HLEN) || (new_mtu > FDDI_K_SNAP_DLEN))
-		return -EINVAL;
-	dev->mtu = new_mtu;
-	return 0;
-}
-EXPORT_SYMBOL(fddi_change_mtu);
-
 static const struct header_ops fddi_header_ops = {
 	.create		= fddi_header,
 };
@@ -161,6 +152,8 @@ static void fddi_setup(struct net_device *dev)
 	dev->type		= ARPHRD_FDDI;
 	dev->hard_header_len	= FDDI_K_SNAP_HLEN+3;	/* Assume 802.2 SNAP hdr len + 3 pad bytes */
 	dev->mtu		= FDDI_K_SNAP_DLEN;	/* Assume max payload of 802.2 SNAP frame */
+	dev->min_mtu		= FDDI_K_SNAP_HLEN;
+	dev->max_mtu		= FDDI_K_SNAP_DLEN;
 	dev->addr_len		= FDDI_K_ALEN;
 	dev->tx_queue_len	= 100;	/* Long queues on FDDI */
 	dev->flags		= IFF_BROADCAST | IFF_MULTICAST;
diff --git a/net/802/hippi.c b/net/802/hippi.c
index ade1a52cdcff..4460606e9c36 100644
--- a/net/802/hippi.c
+++ b/net/802/hippi.c
@@ -34,7 +34,7 @@
 #include <linux/errno.h>
 #include <net/arp.h>
 #include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 /*
  * Create the HIPPI MAC header for an arbitrary protocol layer
@@ -116,18 +116,6 @@ __be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev)
 
 EXPORT_SYMBOL(hippi_type_trans);
 
-int hippi_change_mtu(struct net_device *dev, int new_mtu)
-{
-	/*
-	 * HIPPI's got these nice large MTUs.
-	 */
-	if ((new_mtu < 68) || (new_mtu > 65280))
-		return -EINVAL;
-	dev->mtu = new_mtu;
-	return 0;
-}
-EXPORT_SYMBOL(hippi_change_mtu);
-
 /*
  * For HIPPI we will actually use the lower 4 bytes of the hardware
  * address as the I-FIELD rather than the actual hardware address.
@@ -174,6 +162,8 @@ static void hippi_setup(struct net_device *dev)
 	dev->type		= ARPHRD_HIPPI;
 	dev->hard_header_len	= HIPPI_HLEN;
 	dev->mtu		= 65280;
+	dev->min_mtu		= 68;
+	dev->max_mtu		= 65280;
 	dev->addr_len		= HIPPI_ALEN;
 	dev->tx_queue_len	= 25 /* 5 */;
 	memset(dev->broadcast, 0xFF, HIPPI_ALEN);
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index f2531ad66b68..467069b73ce1 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -34,7 +34,7 @@
 #include <net/rtnetlink.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 #include <linux/if_vlan.h>
 #include "vlan.h"
@@ -44,7 +44,7 @@
 
 /* Global VLAN variables */
 
-int vlan_net_id __read_mostly;
+unsigned int vlan_net_id __read_mostly;
 
 const char vlan_fullname[] = "802.1Q VLAN Support";
 const char vlan_version[] = DRV_VERSION;
@@ -515,8 +515,8 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
 		return -EFAULT;
 
 	/* Null terminate this sucker, just in case. */
-	args.device1[23] = 0;
-	args.u.device2[23] = 0;
+	args.device1[sizeof(args.device1) - 1] = 0;
+	args.u.device2[sizeof(args.u.device2) - 1] = 0;
 
 	rtnl_lock();
 
@@ -571,8 +571,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
 		err = -EPERM;
 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			break;
-		if ((args.u.name_type >= 0) &&
-		    (args.u.name_type < VLAN_NAME_TYPE_HIGHEST)) {
+		if (args.u.name_type < VLAN_NAME_TYPE_HIGHEST) {
 			struct vlan_net *vn;
 
 			vn = net_generic(net, vlan_net_id);
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index cc1557978066..df8bd65dd370 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -159,7 +159,7 @@ void vlan_netlink_fini(void);
 
 extern struct rtnl_link_ops vlan_link_ops;
 
-extern int vlan_net_id;
+extern unsigned int vlan_net_id;
 
 struct proc_dir_entry;
 
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index fbfacd51aa34..e97ab824e368 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -671,7 +671,8 @@ static int vlan_ethtool_get_ts_info(struct net_device *dev,
 	return 0;
 }
 
-static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
+static void vlan_dev_get_stats64(struct net_device *dev,
+				 struct rtnl_link_stats64 *stats)
 {
 	struct vlan_pcpu_stats *p;
 	u32 rx_errors = 0, tx_dropped = 0;
@@ -702,8 +703,6 @@ static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, st
 	}
 	stats->rx_errors = rx_errors;
 	stats->tx_dropped = tx_dropped;
-
-	return stats;
 }
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -792,8 +791,6 @@ static const struct net_device_ops vlan_netdev_ops = {
 	.ndo_netpoll_cleanup	= vlan_dev_netpoll_cleanup,
 #endif
 	.ndo_fix_features	= vlan_dev_fix_features,
-	.ndo_neigh_construct	= netdev_default_l2upper_neigh_construct,
-	.ndo_neigh_destroy	= netdev_default_l2upper_neigh_destroy,
 	.ndo_fdb_add		= switchdev_port_fdb_add,
 	.ndo_fdb_del		= switchdev_port_fdb_del,
 	.ndo_fdb_dump		= switchdev_port_fdb_dump,
@@ -826,5 +823,8 @@ void vlan_setup(struct net_device *dev)
 	dev->destructor		= vlan_dev_free;
 	dev->ethtool_ops	= &vlan_ethtool_ops;
 
+	dev->min_mtu		= 0;
+	dev->max_mtu		= ETH_MAX_MTU;
+
 	eth_zero_addr(dev->broadcast);
 }
diff --git a/net/9p/client.c b/net/9p/client.c
index 3fc94a49ccd5..3ce672af1596 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -32,7 +32,7 @@
 #include <linux/idr.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/uaccess.h>
 #include <linux/uio.h>
 #include <net/9p/9p.h>
@@ -1101,7 +1101,7 @@ void p9_client_begin_disconnect(struct p9_client *clnt)
 EXPORT_SYMBOL(p9_client_begin_disconnect);
 
 struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid,
-	char *uname, kuid_t n_uname, char *aname)
+	const char *uname, kuid_t n_uname, const char *aname)
 {
 	int err = 0;
 	struct p9_req_t *req;
@@ -1149,7 +1149,7 @@ error:
 EXPORT_SYMBOL(p9_client_attach);
 
 struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname,
-		char **wnames, int clone)
+		const unsigned char * const *wnames, int clone)
 {
 	int err;
 	struct p9_client *clnt;
@@ -1271,7 +1271,7 @@ error:
 }
 EXPORT_SYMBOL(p9_client_open);
 
-int p9_client_create_dotl(struct p9_fid *ofid, char *name, u32 flags, u32 mode,
+int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags, u32 mode,
 		kgid_t gid, struct p9_qid *qid)
 {
 	int err = 0;
@@ -1316,7 +1316,7 @@ error:
 }
 EXPORT_SYMBOL(p9_client_create_dotl);
 
-int p9_client_fcreate(struct p9_fid *fid, char *name, u32 perm, int mode,
+int p9_client_fcreate(struct p9_fid *fid, const char *name, u32 perm, int mode,
 		     char *extension)
 {
 	int err;
@@ -1361,8 +1361,8 @@ error:
 }
 EXPORT_SYMBOL(p9_client_fcreate);
 
-int p9_client_symlink(struct p9_fid *dfid, char *name, char *symtgt, kgid_t gid,
-		struct p9_qid *qid)
+int p9_client_symlink(struct p9_fid *dfid, const char *name,
+		const char *symtgt, kgid_t gid, struct p9_qid *qid)
 {
 	int err = 0;
 	struct p9_client *clnt;
@@ -1395,7 +1395,7 @@ error:
 }
 EXPORT_SYMBOL(p9_client_symlink);
 
-int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, char *newname)
+int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, const char *newname)
 {
 	struct p9_client *clnt;
 	struct p9_req_t *req;
@@ -2117,7 +2117,7 @@ error:
 }
 EXPORT_SYMBOL(p9_client_readdir);
 
-int p9_client_mknod_dotl(struct p9_fid *fid, char *name, int mode,
+int p9_client_mknod_dotl(struct p9_fid *fid, const char *name, int mode,
 			dev_t rdev, kgid_t gid, struct p9_qid *qid)
 {
 	int err;
@@ -2148,7 +2148,7 @@ error:
 }
 EXPORT_SYMBOL(p9_client_mknod_dotl);
 
-int p9_client_mkdir_dotl(struct p9_fid *fid, char *name, int mode,
+int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode,
 				kgid_t gid, struct p9_qid *qid)
 {
 	int err;
diff --git a/net/Kconfig b/net/Kconfig
index 7b6cd340b72b..102f781a0131 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -57,6 +57,7 @@ source "net/packet/Kconfig"
 source "net/unix/Kconfig"
 source "net/xfrm/Kconfig"
 source "net/iucv/Kconfig"
+source "net/smc/Kconfig"
 
 config INET
 	bool "TCP/IP networking"
@@ -258,10 +259,6 @@ config XPS
 config HWBM
 	bool
 
-config SOCK_CGROUP_DATA
-	bool
-	default n
-
 config CGROUP_NET_PRIO
 	bool "Network priority cgroup"
 	depends on CGROUPS
@@ -300,7 +297,8 @@ config BPF_JIT
 
 	  Note, admin should enable this feature changing:
 	  /proc/sys/net/core/bpf_jit_enable
 	  /proc/sys/net/core/bpf_jit_harden (optional)
+	  /proc/sys/net/core/bpf_jit_kallsyms (optional)
 
 config NET_FLOW_LIMIT
 	bool
@@ -393,6 +391,8 @@ source "net/9p/Kconfig"
 source "net/caif/Kconfig"
 source "net/ceph/Kconfig"
 source "net/nfc/Kconfig"
+source "net/psample/Kconfig"
+source "net/ife/Kconfig"
 
 config LWTUNNEL
 	bool "Network light weight tunnels"
@@ -402,10 +402,22 @@ config LWTUNNEL
 	  weight tunnel endpoint. Tunnel encapsulation parameters are stored
 	  with light weight tunnel state associated with fib routes.
 
+config LWTUNNEL_BPF
+	bool "Execute BPF program as route nexthop action"
+	depends on LWTUNNEL
+	default y if LWTUNNEL=y
+	---help---
+	  Allows to run BPF programs as a nexthop action following a route
+	  lookup for incoming and outgoing packets.
+
 config DST_CACHE
 	bool
 	default n
 
+config GRO_CELLS
+	bool
+	default n
+
 config NET_DEVLINK
 	tristate "Network physical/parent device Netlink interface"
 	help
diff --git a/net/Makefile b/net/Makefile
index 4cafaa2b4667..9b681550e3a3 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -51,6 +51,7 @@ obj-$(CONFIG_MAC80211)	+= mac80211/
 obj-$(CONFIG_TIPC)		+= tipc/
 obj-$(CONFIG_NETLABEL)		+= netlabel/
 obj-$(CONFIG_IUCV)		+= iucv/
+obj-$(CONFIG_SMC)		+= smc/
 obj-$(CONFIG_RFKILL)		+= rfkill/
 obj-$(CONFIG_NET_9P)		+= 9p/
 obj-$(CONFIG_CAIF)		+= caif/
@@ -69,6 +70,8 @@ obj-$(CONFIG_DNS_RESOLVER)	+= dns_resolver/
 obj-$(CONFIG_CEPH_LIB)		+= ceph/
 obj-$(CONFIG_BATMAN_ADV)	+= batman-adv/
 obj-$(CONFIG_NFC)		+= nfc/
+obj-$(CONFIG_PSAMPLE)		+= psample/
+obj-$(CONFIG_NET_IFE)		+= ife/
 obj-$(CONFIG_OPENVSWITCH)	+= openvswitch/
 obj-$(CONFIG_VSOCKETS)	+= vmw_vsock/
 obj-$(CONFIG_MPLS)	+= mpls/
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 10d2bdce686e..465cc24b41e5 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1656,7 +1656,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 	ddp->deh_dport = usat->sat_port;
 	ddp->deh_sport = at->src_port;
 
-	SOCK_DEBUG(sk, "SK %p: Copy user data (%Zd bytes).\n", sk, len);
+	SOCK_DEBUG(sk, "SK %p: Copy user data (%zd bytes).\n", sk, len);
 
 	err = memcpy_from_msg(skb_put(skb, len), msg, len);
 	if (err) {
@@ -1720,7 +1720,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 		 */
 		aarp_send_ddp(dev, skb, &usat->sat_addr, NULL);
 	}
-	SOCK_DEBUG(sk, "SK %p: Done write (%Zd).\n", sk, len);
+	SOCK_DEBUG(sk, "SK %p: Done write (%zd).\n", sk, len);
 
 out:
 	release_sock(sk);
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index aa0047c5c467..fca84e111c89 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -620,14 +620,12 @@ error:
 static const struct net_device_ops br2684_netdev_ops = {
 	.ndo_start_xmit 	= br2684_start_xmit,
 	.ndo_set_mac_address	= br2684_mac_addr,
-	.ndo_change_mtu		= eth_change_mtu,
 	.ndo_validate_addr	= eth_validate_addr,
 };
 
 static const struct net_device_ops br2684_netdev_ops_routed = {
 	.ndo_start_xmit 	= br2684_start_xmit,
 	.ndo_set_mac_address	= br2684_mac_addr,
-	.ndo_change_mtu		= eth_change_mtu
 };
 
 static void br2684_setup(struct net_device *netdev)
@@ -651,7 +649,9 @@ static void br2684_setup_routed(struct net_device *netdev)
 	netdev->hard_header_len = sizeof(llc_oui_ipv4); /* worst case */
 	netdev->netdev_ops = &br2684_netdev_ops_routed;
 	netdev->addr_len = 0;
-	netdev->mtu = 1500;
+	netdev->mtu = ETH_DATA_LEN;
+	netdev->min_mtu = 0;
+	netdev->max_mtu = ETH_MAX_MTU;
 	netdev->type = ARPHRD_PPP;
 	netdev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
 	netdev->tx_queue_len = 100;
diff --git a/net/atm/common.c b/net/atm/common.c
index 6dc12305799e..9613381f5db0 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -13,7 +13,7 @@
 #include <linux/errno.h>	/* error codes */
 #include <linux/capability.h>
 #include <linux/mm.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/time.h>		/* struct timeval */
 #include <linux/skbuff.h>
 #include <linux/bitops.h>
@@ -630,7 +630,7 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t size)
 		goto out;
 	skb->dev = NULL; /* for paths shared with net_device interfaces */
 	ATM_SKB(skb)->atm_options = vcc->atm_options;
-	if (copy_from_iter(skb_put(skb, size), size, &m->msg_iter) != size) {
+	if (!copy_from_iter_full(skb_put(skb, size), size, &m->msg_iter)) {
 		kfree_skb(skb);
 		error = -EFAULT;
 		goto out;
diff --git a/net/atm/lec.c b/net/atm/lec.c
index 5d2693826afb..09cfe87f0a44 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -111,9 +111,9 @@ static inline void lec_arp_put(struct lec_arp_table *entry)
 }
 
 static struct lane2_ops lane2_ops = {
-	lane2_resolve,			/* resolve,		spec 3.1.3 */
-	lane2_associate_req,		/* associate_req,	spec 3.1.4 */
-	NULL				/* associate indicator,	spec 3.1.5 */
+	.resolve = lane2_resolve,		/* spec 3.1.3 */
+	.associate_req = lane2_associate_req,	/* spec 3.1.4 */
+	.associate_indicator = NULL		/* spec 3.1.5 */
 };
 
 static unsigned char bus_mac[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
@@ -544,15 +544,6 @@ send_to_lecd(struct lec_priv *priv, atmlec_msg_type type,
 	return 0;
 }
 
-/* shamelessly stolen from drivers/net/net_init.c */
-static int lec_change_mtu(struct net_device *dev, int new_mtu)
-{
-	if ((new_mtu < 68) || (new_mtu > 18190))
-		return -EINVAL;
-	dev->mtu = new_mtu;
-	return 0;
-}
-
 static void lec_set_multicast_list(struct net_device *dev)
 {
 	/*
@@ -565,7 +556,6 @@ static const struct net_device_ops lec_netdev_ops = {
 	.ndo_open		= lec_open,
 	.ndo_stop		= lec_close,
 	.ndo_start_xmit		= lec_start_xmit,
-	.ndo_change_mtu		= lec_change_mtu,
 	.ndo_tx_timeout		= lec_tx_timeout,
 	.ndo_set_rx_mode	= lec_set_multicast_list,
 };
@@ -742,6 +732,7 @@ static int lecd_attach(struct atm_vcc *vcc, int arg)
 		if (!dev_lec[i])
 			return -ENOMEM;
 		dev_lec[i]->netdev_ops = &lec_netdev_ops;
+		dev_lec[i]->max_mtu = 18190;
 		snprintf(dev_lec[i]->name, IFNAMSIZ, "lec%d", i);
 		if (register_netdev(dev_lec[i])) {
 			free_netdev(dev_lec[i]);
@@ -1068,7 +1059,9 @@ static void __exit lane_module_cleanup(void)
 {
 	int i;
 
+#ifdef CONFIG_PROC_FS
 	remove_proc_entry("lec", atm_proc_root);
+#endif
 
 	deregister_atm_ioctl(&lane_ioctl_ops);
 
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 3b3b1a292ec8..a190800572bd 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -451,7 +451,7 @@ static void lane2_assoc_ind(struct net_device *dev, const u8 *mac_addr,
 		return;
 	}
 	if (end_of_tlvs - tlvs != 0)
-		pr_info("(%s) ignoring %Zd bytes of trailing TLV garbage\n",
+		pr_info("(%s) ignoring %zd bytes of trailing TLV garbage\n",
 			dev->name, end_of_tlvs - tlvs);
 }
 
diff --git a/net/atm/mpoa_caches.c b/net/atm/mpoa_caches.c
index 9e60e74c807d..a89fdebeffda 100644
--- a/net/atm/mpoa_caches.c
+++ b/net/atm/mpoa_caches.c
@@ -535,33 +535,32 @@ static void eg_destroy_cache(struct mpoa_client *mpc)
535 535
536 536
537static const struct in_cache_ops ingress_ops = { 537static const struct in_cache_ops ingress_ops = {
538 in_cache_add_entry, /* add_entry */ 538 .add_entry = in_cache_add_entry,
539 in_cache_get, /* get */ 539 .get = in_cache_get,
540 in_cache_get_with_mask, /* get_with_mask */ 540 .get_with_mask = in_cache_get_with_mask,
541 in_cache_get_by_vcc, /* get_by_vcc */ 541 .get_by_vcc = in_cache_get_by_vcc,
542 in_cache_put, /* put */ 542 .put = in_cache_put,
543 in_cache_remove_entry, /* remove_entry */ 543 .remove_entry = in_cache_remove_entry,
544 cache_hit, /* cache_hit */ 544 .cache_hit = cache_hit,
545 clear_count_and_expired, /* clear_count */ 545 .clear_count = clear_count_and_expired,
546 check_resolving_entries, /* check_resolving */ 546 .check_resolving = check_resolving_entries,
547 refresh_entries, /* refresh */ 547 .refresh = refresh_entries,
548 in_destroy_cache /* destroy_cache */ 548 .destroy_cache = in_destroy_cache
549}; 549};
550 550
551static const struct eg_cache_ops egress_ops = { 551static const struct eg_cache_ops egress_ops = {
552 eg_cache_add_entry, /* add_entry */ 552 .add_entry = eg_cache_add_entry,
553 eg_cache_get_by_cache_id, /* get_by_cache_id */ 553 .get_by_cache_id = eg_cache_get_by_cache_id,
554 eg_cache_get_by_tag, /* get_by_tag */ 554 .get_by_tag = eg_cache_get_by_tag,
555 eg_cache_get_by_vcc, /* get_by_vcc */ 555 .get_by_vcc = eg_cache_get_by_vcc,
556 eg_cache_get_by_src_ip, /* get_by_src_ip */ 556 .get_by_src_ip = eg_cache_get_by_src_ip,
557 eg_cache_put, /* put */ 557 .put = eg_cache_put,
558 eg_cache_remove_entry, /* remove_entry */ 558 .remove_entry = eg_cache_remove_entry,
559 update_eg_cache_entry, /* update */ 559 .update = update_eg_cache_entry,
560 clear_expired, /* clear_expired */ 560 .clear_expired = clear_expired,
561 eg_destroy_cache /* destroy_cache */ 561 .destroy_cache = eg_destroy_cache
562}; 562};
563 563
564
565void atm_mpoa_init_cache(struct mpoa_client *mpc) 564void atm_mpoa_init_cache(struct mpoa_client *mpc)
566{ 565{
567 mpc->in_ops = &ingress_ops; 566 mpc->in_ops = &ingress_ops;
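The lane2_ops, ingress_ops and egress_ops tables are switched from positional to designated initializers. A minimal standalone C illustration of the idiom (the demo struct and handlers are made up for the example):

#include <stdio.h>

struct cache_ops {
	int  (*add_entry)(int key);
	void (*remove_entry)(int key);
};

static int demo_add(int key)
{
	printf("add %d\n", key);
	return 0;
}

static void demo_remove(int key)
{
	printf("remove %d\n", key);
}

/* Designated initializers bind each handler to its field by name, so the
 * table stays correct even if fields are later reordered or new ones added.
 */
static const struct cache_ops demo_ops = {
	.add_entry	= demo_add,
	.remove_entry	= demo_remove,
};

int main(void)
{
	demo_ops.add_entry(1);
	demo_ops.remove_entry(1);
	return 0;
}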
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 878563a8354d..5589de7086af 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -10,7 +10,7 @@
10#include <linux/kernel.h> /* printk */ 10#include <linux/kernel.h> /* printk */
11#include <linux/skbuff.h> 11#include <linux/skbuff.h>
12#include <linux/wait.h> 12#include <linux/wait.h>
13#include <linux/sched.h> /* jiffies and HZ */ 13#include <linux/sched/signal.h>
14#include <linux/fcntl.h> /* O_NONBLOCK */ 14#include <linux/fcntl.h> /* O_NONBLOCK */
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/atm.h> /* ATM stuff */ 16#include <linux/atm.h> /* ATM stuff */
@@ -318,7 +318,8 @@ out:
318 return error; 318 return error;
319} 319}
320 320
321static int svc_accept(struct socket *sock, struct socket *newsock, int flags) 321static int svc_accept(struct socket *sock, struct socket *newsock, int flags,
322 bool kern)
322{ 323{
323 struct sock *sk = sock->sk; 324 struct sock *sk = sock->sk;
324 struct sk_buff *skb; 325 struct sk_buff *skb;
@@ -329,7 +330,7 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags)
329 330
330 lock_sock(sk); 331 lock_sock(sk);
331 332
332 error = svc_create(sock_net(sk), newsock, 0, 0); 333 error = svc_create(sock_net(sk), newsock, 0, kern);
333 if (error) 334 if (error)
334 goto out; 335 goto out;
335 336
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 2fdebabbfacd..b7c486752b3a 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -20,7 +20,7 @@
20#include <linux/socket.h> 20#include <linux/socket.h>
21#include <linux/in.h> 21#include <linux/in.h>
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include <linux/sched.h> 23#include <linux/sched/signal.h>
24#include <linux/timer.h> 24#include <linux/timer.h>
25#include <linux/string.h> 25#include <linux/string.h>
26#include <linux/sockios.h> 26#include <linux/sockios.h>
@@ -32,7 +32,7 @@
32#include <linux/if_arp.h> 32#include <linux/if_arp.h>
33#include <linux/skbuff.h> 33#include <linux/skbuff.h>
34#include <net/sock.h> 34#include <net/sock.h>
35#include <asm/uaccess.h> 35#include <linux/uaccess.h>
36#include <linux/fcntl.h> 36#include <linux/fcntl.h>
37#include <linux/termios.h> /* For TIOCINQ/OUTQ */ 37#include <linux/termios.h> /* For TIOCINQ/OUTQ */
38#include <linux/mm.h> 38#include <linux/mm.h>
@@ -1320,7 +1320,8 @@ out_release:
1320 return err; 1320 return err;
1321} 1321}
1322 1322
1323static int ax25_accept(struct socket *sock, struct socket *newsock, int flags) 1323static int ax25_accept(struct socket *sock, struct socket *newsock, int flags,
1324 bool kern)
1324{ 1325{
1325 struct sk_buff *skb; 1326 struct sk_buff *skb;
1326 struct sock *newsk; 1327 struct sock *newsk;
diff --git a/net/ax25/ax25_addr.c b/net/ax25/ax25_addr.c
index e7c9b0ea17a1..ac2542b7be88 100644
--- a/net/ax25/ax25_addr.c
+++ b/net/ax25/ax25_addr.c
@@ -21,7 +21,7 @@
21#include <linux/netdevice.h> 21#include <linux/netdevice.h>
22#include <linux/skbuff.h> 22#include <linux/skbuff.h>
23#include <net/sock.h> 23#include <net/sock.h>
24#include <asm/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/fcntl.h> 25#include <linux/fcntl.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/interrupt.h> 27#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c
index 3d106767b272..9a3a301e1e2f 100644
--- a/net/ax25/ax25_dev.c
+++ b/net/ax25/ax25_dev.c
@@ -23,7 +23,7 @@
23#include <linux/if_arp.h> 23#include <linux/if_arp.h>
24#include <linux/skbuff.h> 24#include <linux/skbuff.h>
25#include <net/sock.h> 25#include <net/sock.h>
26#include <asm/uaccess.h> 26#include <linux/uaccess.h>
27#include <linux/fcntl.h> 27#include <linux/fcntl.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/interrupt.h> 29#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c
index 9bd31e88aeca..891596e74278 100644
--- a/net/ax25/ax25_ds_in.c
+++ b/net/ax25/ax25_ds_in.c
@@ -22,7 +22,7 @@
22#include <linux/skbuff.h> 22#include <linux/skbuff.h>
23#include <net/sock.h> 23#include <net/sock.h>
24#include <net/tcp_states.h> 24#include <net/tcp_states.h>
25#include <asm/uaccess.h> 25#include <linux/uaccess.h>
26#include <linux/fcntl.h> 26#include <linux/fcntl.h>
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/interrupt.h> 28#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_ds_subr.c b/net/ax25/ax25_ds_subr.c
index e05bd57b5afd..28827e81ba2b 100644
--- a/net/ax25/ax25_ds_subr.c
+++ b/net/ax25/ax25_ds_subr.c
@@ -23,7 +23,7 @@
23#include <linux/netdevice.h> 23#include <linux/netdevice.h>
24#include <linux/skbuff.h> 24#include <linux/skbuff.h>
25#include <net/sock.h> 25#include <net/sock.h>
26#include <asm/uaccess.h> 26#include <linux/uaccess.h>
27#include <linux/fcntl.h> 27#include <linux/fcntl.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/interrupt.h> 29#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c
index 5237dff6941d..5fb2104b7304 100644
--- a/net/ax25/ax25_ds_timer.c
+++ b/net/ax25/ax25_ds_timer.c
@@ -24,7 +24,7 @@
24#include <linux/netdevice.h> 24#include <linux/netdevice.h>
25#include <linux/skbuff.h> 25#include <linux/skbuff.h>
26#include <net/sock.h> 26#include <net/sock.h>
27#include <asm/uaccess.h> 27#include <linux/uaccess.h>
28#include <linux/fcntl.h> 28#include <linux/fcntl.h>
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/interrupt.h> 30#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_iface.c b/net/ax25/ax25_iface.c
index 7f16e8a931b2..8c07c28569e4 100644
--- a/net/ax25/ax25_iface.c
+++ b/net/ax25/ax25_iface.c
@@ -23,7 +23,7 @@
23#include <linux/netdevice.h> 23#include <linux/netdevice.h>
24#include <linux/skbuff.h> 24#include <linux/skbuff.h>
25#include <net/sock.h> 25#include <net/sock.h>
26#include <asm/uaccess.h> 26#include <linux/uaccess.h>
27#include <linux/fcntl.h> 27#include <linux/fcntl.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/interrupt.h> 29#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c
index bb5a0e4e98d9..860752639b1a 100644
--- a/net/ax25/ax25_in.c
+++ b/net/ax25/ax25_in.c
@@ -25,7 +25,7 @@
25#include <linux/skbuff.h> 25#include <linux/skbuff.h>
26#include <net/sock.h> 26#include <net/sock.h>
27#include <net/tcp_states.h> 27#include <net/tcp_states.h>
28#include <asm/uaccess.h> 28#include <linux/uaccess.h>
29#include <linux/fcntl.h> 29#include <linux/fcntl.h>
30#include <linux/mm.h> 30#include <linux/mm.h>
31#include <linux/interrupt.h> 31#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c
index 2fa3be965101..183b1c583d56 100644
--- a/net/ax25/ax25_ip.c
+++ b/net/ax25/ax25_ip.c
@@ -23,7 +23,7 @@
23#include <linux/if_arp.h> 23#include <linux/if_arp.h>
24#include <linux/skbuff.h> 24#include <linux/skbuff.h>
25#include <net/sock.h> 25#include <net/sock.h>
26#include <asm/uaccess.h> 26#include <linux/uaccess.h>
27#include <linux/fcntl.h> 27#include <linux/fcntl.h>
28#include <linux/termios.h> /* For TIOCINQ/OUTQ */ 28#include <linux/termios.h> /* For TIOCINQ/OUTQ */
29#include <linux/mm.h> 29#include <linux/mm.h>
diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c
index 8ddd41baa81c..b11a5f466fcc 100644
--- a/net/ax25/ax25_out.c
+++ b/net/ax25/ax25_out.c
@@ -25,7 +25,7 @@
25#include <linux/netdevice.h> 25#include <linux/netdevice.h>
26#include <linux/skbuff.h> 26#include <linux/skbuff.h>
27#include <net/sock.h> 27#include <net/sock.h>
28#include <asm/uaccess.h> 28#include <linux/uaccess.h>
29#include <linux/fcntl.h> 29#include <linux/fcntl.h>
30#include <linux/mm.h> 30#include <linux/mm.h>
31#include <linux/interrupt.h> 31#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
index d39097737e38..e1fda27cb27c 100644
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -31,7 +31,7 @@
31#include <linux/skbuff.h> 31#include <linux/skbuff.h>
32#include <linux/spinlock.h> 32#include <linux/spinlock.h>
33#include <net/sock.h> 33#include <net/sock.h>
34#include <asm/uaccess.h> 34#include <linux/uaccess.h>
35#include <linux/fcntl.h> 35#include <linux/fcntl.h>
36#include <linux/mm.h> 36#include <linux/mm.h>
37#include <linux/interrupt.h> 37#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_std_in.c b/net/ax25/ax25_std_in.c
index 3fbf8f7b2cf4..8632b86e843e 100644
--- a/net/ax25/ax25_std_in.c
+++ b/net/ax25/ax25_std_in.c
@@ -29,7 +29,7 @@
29#include <linux/skbuff.h> 29#include <linux/skbuff.h>
30#include <net/sock.h> 30#include <net/sock.h>
31#include <net/tcp_states.h> 31#include <net/tcp_states.h>
32#include <asm/uaccess.h> 32#include <linux/uaccess.h>
33#include <linux/fcntl.h> 33#include <linux/fcntl.h>
34#include <linux/mm.h> 34#include <linux/mm.h>
35#include <linux/interrupt.h> 35#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_std_subr.c b/net/ax25/ax25_std_subr.c
index 8b66a41e538f..94bd06396a43 100644
--- a/net/ax25/ax25_std_subr.c
+++ b/net/ax25/ax25_std_subr.c
@@ -20,7 +20,7 @@
20#include <linux/netdevice.h> 20#include <linux/netdevice.h>
21#include <linux/skbuff.h> 21#include <linux/skbuff.h>
22#include <net/sock.h> 22#include <net/sock.h>
23#include <asm/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/fcntl.h> 24#include <linux/fcntl.h>
25#include <linux/mm.h> 25#include <linux/mm.h>
26#include <linux/interrupt.h> 26#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_std_timer.c b/net/ax25/ax25_std_timer.c
index 2c0d6ef66f9d..30bbc675261d 100644
--- a/net/ax25/ax25_std_timer.c
+++ b/net/ax25/ax25_std_timer.c
@@ -24,7 +24,7 @@
24#include <linux/skbuff.h> 24#include <linux/skbuff.h>
25#include <net/sock.h> 25#include <net/sock.h>
26#include <net/tcp_states.h> 26#include <net/tcp_states.h>
27#include <asm/uaccess.h> 27#include <linux/uaccess.h>
28#include <linux/fcntl.h> 28#include <linux/fcntl.h>
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/interrupt.h> 30#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c
index 655a7d4c96e1..038b109b2be7 100644
--- a/net/ax25/ax25_subr.c
+++ b/net/ax25/ax25_subr.c
@@ -25,7 +25,7 @@
25#include <linux/skbuff.h> 25#include <linux/skbuff.h>
26#include <net/sock.h> 26#include <net/sock.h>
27#include <net/tcp_states.h> 27#include <net/tcp_states.h>
28#include <asm/uaccess.h> 28#include <linux/uaccess.h>
29#include <linux/fcntl.h> 29#include <linux/fcntl.h>
30#include <linux/mm.h> 30#include <linux/mm.h>
31#include <linux/interrupt.h> 31#include <linux/interrupt.h>
@@ -264,7 +264,7 @@ void ax25_disconnect(ax25_cb *ax25, int reason)
264{ 264{
265 ax25_clear_queues(ax25); 265 ax25_clear_queues(ax25);
266 266
267 if (!sock_flag(ax25->sk, SOCK_DESTROY)) 267 if (!ax25->sk || !sock_flag(ax25->sk, SOCK_DESTROY))
268 ax25_stop_heartbeat(ax25); 268 ax25_stop_heartbeat(ax25);
269 ax25_stop_t1timer(ax25); 269 ax25_stop_t1timer(ax25);
270 ax25_stop_t2timer(ax25); 270 ax25_stop_t2timer(ax25);
diff --git a/net/ax25/ax25_timer.c b/net/ax25/ax25_timer.c
index c3cffa79bafb..23a6f38a80bf 100644
--- a/net/ax25/ax25_timer.c
+++ b/net/ax25/ax25_timer.c
@@ -28,7 +28,7 @@
28#include <linux/netdevice.h> 28#include <linux/netdevice.h>
29#include <linux/skbuff.h> 29#include <linux/skbuff.h>
30#include <net/sock.h> 30#include <net/sock.h>
31#include <asm/uaccess.h> 31#include <linux/uaccess.h>
32#include <linux/fcntl.h> 32#include <linux/fcntl.h>
33#include <linux/mm.h> 33#include <linux/mm.h>
34#include <linux/interrupt.h> 34#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c
index 4ad2fb7bcd35..0403b0def7e6 100644
--- a/net/ax25/ax25_uid.c
+++ b/net/ax25/ax25_uid.c
@@ -25,7 +25,7 @@
25#include <linux/if_arp.h> 25#include <linux/if_arp.h>
26#include <linux/skbuff.h> 26#include <linux/skbuff.h>
27#include <net/sock.h> 27#include <net/sock.h>
28#include <asm/uaccess.h> 28#include <linux/uaccess.h>
29#include <linux/fcntl.h> 29#include <linux/fcntl.h>
30#include <linux/mm.h> 30#include <linux/mm.h>
31#include <linux/interrupt.h> 31#include <linux/interrupt.h>
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index f20742cbae6d..b73b96a2854b 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -17,7 +17,7 @@ config BATMAN_ADV
17 17
18config BATMAN_ADV_BATMAN_V 18config BATMAN_ADV_BATMAN_V
19 bool "B.A.T.M.A.N. V protocol (experimental)" 19 bool "B.A.T.M.A.N. V protocol (experimental)"
20 depends on BATMAN_ADV && CFG80211=y || (CFG80211=m && BATMAN_ADV=m) 20 depends on BATMAN_ADV && !(CFG80211=m && BATMAN_ADV=y)
21 default n 21 default n
22 help 22 help
23 This option enables the B.A.T.M.A.N. V protocol, the successor 23 This option enables the B.A.T.M.A.N. V protocol, the successor
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index f724d3c98a81..915987bc6d29 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -1,5 +1,5 @@
1# 1#
2# Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 2# Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
3# 3#
4# Marek Lindner, Simon Wunderlich 4# Marek Lindner, Simon Wunderlich
5# 5#
diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c
index 623d04302aa2..44fd073b7546 100644
--- a/net/batman-adv/bat_algo.c
+++ b/net/batman-adv/bat_algo.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
index 3b5b69cdd12b..29f6312f9bf1 100644
--- a/net/batman-adv/bat_algo.h
+++ b/net/batman-adv/bat_algo.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Linus Lüssing 3 * Marek Lindner, Linus Lüssing
4 * 4 *
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index e2d18d0b1f06..71343d0fec94 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
@@ -698,7 +698,7 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
698 698
699 forw_packet_aggr->skb = netdev_alloc_skb_ip_align(NULL, skb_size); 699 forw_packet_aggr->skb = netdev_alloc_skb_ip_align(NULL, skb_size);
700 if (!forw_packet_aggr->skb) { 700 if (!forw_packet_aggr->skb) {
701 batadv_forw_packet_free(forw_packet_aggr); 701 batadv_forw_packet_free(forw_packet_aggr, true);
702 return; 702 return;
703 } 703 }
704 704
@@ -717,17 +717,10 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
717 if (direct_link) 717 if (direct_link)
718 forw_packet_aggr->direct_link_flags |= 1; 718 forw_packet_aggr->direct_link_flags |= 1;
719 719
720 /* add new packet to packet list */
721 spin_lock_bh(&bat_priv->forw_bat_list_lock);
722 hlist_add_head(&forw_packet_aggr->list, &bat_priv->forw_bat_list);
723 spin_unlock_bh(&bat_priv->forw_bat_list_lock);
724
725 /* start timer for this packet */
726 INIT_DELAYED_WORK(&forw_packet_aggr->delayed_work, 720 INIT_DELAYED_WORK(&forw_packet_aggr->delayed_work,
727 batadv_iv_send_outstanding_bat_ogm_packet); 721 batadv_iv_send_outstanding_bat_ogm_packet);
728 queue_delayed_work(batadv_event_workqueue, 722
729 &forw_packet_aggr->delayed_work, 723 batadv_forw_packet_ogmv1_queue(bat_priv, forw_packet_aggr, send_time);
730 send_time - jiffies);
731} 724}
732 725
733/* aggregate a new packet into the existing ogm packet */ 726/* aggregate a new packet into the existing ogm packet */
@@ -1272,7 +1265,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
1272 */ 1265 */
1273 tq_iface_penalty = BATADV_TQ_MAX_VALUE; 1266 tq_iface_penalty = BATADV_TQ_MAX_VALUE;
1274 if (if_outgoing && (if_incoming == if_outgoing) && 1267 if (if_outgoing && (if_incoming == if_outgoing) &&
1275 batadv_is_wifi_netdev(if_outgoing->net_dev)) 1268 batadv_is_wifi_hardif(if_outgoing))
1276 tq_iface_penalty = batadv_hop_penalty(BATADV_TQ_MAX_VALUE, 1269 tq_iface_penalty = batadv_hop_penalty(BATADV_TQ_MAX_VALUE,
1277 bat_priv); 1270 bat_priv);
1278 1271
@@ -1611,7 +1604,7 @@ out:
1611 if (hardif_neigh) 1604 if (hardif_neigh)
1612 batadv_hardif_neigh_put(hardif_neigh); 1605 batadv_hardif_neigh_put(hardif_neigh);
1613 1606
1614 kfree_skb(skb_priv); 1607 consume_skb(skb_priv);
1615} 1608}
1616 1609
1617/** 1610/**
@@ -1783,17 +1776,17 @@ static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work)
1783 struct delayed_work *delayed_work; 1776 struct delayed_work *delayed_work;
1784 struct batadv_forw_packet *forw_packet; 1777 struct batadv_forw_packet *forw_packet;
1785 struct batadv_priv *bat_priv; 1778 struct batadv_priv *bat_priv;
1779 bool dropped = false;
1786 1780
1787 delayed_work = to_delayed_work(work); 1781 delayed_work = to_delayed_work(work);
1788 forw_packet = container_of(delayed_work, struct batadv_forw_packet, 1782 forw_packet = container_of(delayed_work, struct batadv_forw_packet,
1789 delayed_work); 1783 delayed_work);
1790 bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface); 1784 bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface);
1791 spin_lock_bh(&bat_priv->forw_bat_list_lock);
1792 hlist_del(&forw_packet->list);
1793 spin_unlock_bh(&bat_priv->forw_bat_list_lock);
1794 1785
1795 if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) 1786 if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) {
1787 dropped = true;
1796 goto out; 1788 goto out;
1789 }
1797 1790
1798 batadv_iv_ogm_emit(forw_packet); 1791 batadv_iv_ogm_emit(forw_packet);
1799 1792
@@ -1810,7 +1803,10 @@ static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work)
1810 batadv_iv_ogm_schedule(forw_packet->if_incoming); 1803 batadv_iv_ogm_schedule(forw_packet->if_incoming);
1811 1804
1812out: 1805out:
1813 batadv_forw_packet_free(forw_packet); 1806 /* do we get something for free()? */
1807 if (batadv_forw_packet_steal(forw_packet,
1808 &bat_priv->forw_bat_list_lock))
1809 batadv_forw_packet_free(forw_packet, dropped);
1814} 1810}
1815 1811
1816static int batadv_iv_ogm_receive(struct sk_buff *skb, 1812static int batadv_iv_ogm_receive(struct sk_buff *skb,
@@ -1820,17 +1816,18 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb,
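The reworked worker above no longer unlinks the forw_packet at the top; batadv_forw_packet_steal() does the unlink under the list lock at the end, and only a successful steal may free the packet. A rough userspace analogue of that ownership rule (types and names invented for the sketch):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct forw_stub {
	struct forw_stub *next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct forw_stub *forw_list;

/* Whoever unlinks the entry under the list lock owns it and is the only
 * caller allowed to free it, so a purge path and the delayed worker can
 * never both free the same packet.
 */
static bool forw_steal(struct forw_stub *pkt)
{
	struct forw_stub **p;
	bool stolen = false;

	pthread_mutex_lock(&list_lock);
	for (p = &forw_list; *p; p = &(*p)->next) {
		if (*p == pkt) {
			*p = pkt->next;
			stolen = true;
			break;
		}
	}
	pthread_mutex_unlock(&list_lock);

	return stolen;
}

int main(void)
{
	struct forw_stub *pkt = calloc(1, sizeof(*pkt));

	if (!pkt)
		return 1;
	pkt->next = forw_list;
	forw_list = pkt;

	printf("first steal:  %s\n", forw_steal(pkt) ? "yes" : "no");
	printf("second steal: %s\n", forw_steal(pkt) ? "yes" : "no");
	free(pkt);
	return 0;
}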
1820 struct batadv_ogm_packet *ogm_packet; 1816 struct batadv_ogm_packet *ogm_packet;
1821 u8 *packet_pos; 1817 u8 *packet_pos;
1822 int ogm_offset; 1818 int ogm_offset;
1823 bool ret; 1819 bool res;
1820 int ret = NET_RX_DROP;
1824 1821
1825 ret = batadv_check_management_packet(skb, if_incoming, BATADV_OGM_HLEN); 1822 res = batadv_check_management_packet(skb, if_incoming, BATADV_OGM_HLEN);
1826 if (!ret) 1823 if (!res)
1827 return NET_RX_DROP; 1824 goto free_skb;
1828 1825
1829 /* did we receive a B.A.T.M.A.N. IV OGM packet on an interface 1826 /* did we receive a B.A.T.M.A.N. IV OGM packet on an interface
1830 * that does not have B.A.T.M.A.N. IV enabled ? 1827 * that does not have B.A.T.M.A.N. IV enabled ?
1831 */ 1828 */
1832 if (bat_priv->algo_ops->iface.enable != batadv_iv_ogm_iface_enable) 1829 if (bat_priv->algo_ops->iface.enable != batadv_iv_ogm_iface_enable)
1833 return NET_RX_DROP; 1830 goto free_skb;
1834 1831
1835 batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX); 1832 batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX);
1836 batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES, 1833 batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES,
@@ -1851,8 +1848,15 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb,
1851 ogm_packet = (struct batadv_ogm_packet *)packet_pos; 1848 ogm_packet = (struct batadv_ogm_packet *)packet_pos;
1852 } 1849 }
1853 1850
1854 kfree_skb(skb); 1851 ret = NET_RX_SUCCESS;
1855 return NET_RX_SUCCESS; 1852
1853free_skb:
1854 if (ret == NET_RX_SUCCESS)
1855 consume_skb(skb);
1856 else
1857 kfree_skb(skb);
1858
1859 return ret;
1856} 1860}
1857 1861
1858#ifdef CONFIG_BATMAN_ADV_DEBUGFS 1862#ifdef CONFIG_BATMAN_ADV_DEBUGFS
@@ -2473,6 +2477,16 @@ static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface)
2473 batadv_iv_ogm_schedule(hard_iface); 2477 batadv_iv_ogm_schedule(hard_iface);
2474} 2478}
2475 2479
2480/**
2481 * batadv_iv_init_sel_class - initialize GW selection class
2482 * @bat_priv: the bat priv with all the soft interface information
2483 */
2484static void batadv_iv_init_sel_class(struct batadv_priv *bat_priv)
2485{
2486 /* set default TQ difference threshold to 20 */
2487 atomic_set(&bat_priv->gw.sel_class, 20);
2488}
2489
2476static struct batadv_gw_node * 2490static struct batadv_gw_node *
2477batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv) 2491batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
2478{ 2492{
@@ -2486,7 +2500,7 @@ batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
2486 struct batadv_orig_node *orig_node; 2500 struct batadv_orig_node *orig_node;
2487 2501
2488 rcu_read_lock(); 2502 rcu_read_lock();
2489 hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { 2503 hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
2490 orig_node = gw_node->orig_node; 2504 orig_node = gw_node->orig_node;
2491 router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); 2505 router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT);
2492 if (!router) 2506 if (!router)
@@ -2674,7 +2688,7 @@ static void batadv_iv_gw_print(struct batadv_priv *bat_priv,
2674 " Gateway (#/255) Nexthop [outgoingIF]: advertised uplink bandwidth\n"); 2688 " Gateway (#/255) Nexthop [outgoingIF]: advertised uplink bandwidth\n");
2675 2689
2676 rcu_read_lock(); 2690 rcu_read_lock();
2677 hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { 2691 hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
2678 /* fails if orig_node has no router */ 2692 /* fails if orig_node has no router */
2679 if (batadv_iv_gw_write_buffer_text(bat_priv, seq, gw_node) < 0) 2693 if (batadv_iv_gw_write_buffer_text(bat_priv, seq, gw_node) < 0)
2680 continue; 2694 continue;
@@ -2774,7 +2788,7 @@ static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb,
2774 int idx = 0; 2788 int idx = 0;
2775 2789
2776 rcu_read_lock(); 2790 rcu_read_lock();
2777 hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { 2791 hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
2778 if (idx++ < idx_skip) 2792 if (idx++ < idx_skip)
2779 continue; 2793 continue;
2780 2794
@@ -2819,6 +2833,7 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = {
2819 .del_if = batadv_iv_ogm_orig_del_if, 2833 .del_if = batadv_iv_ogm_orig_del_if,
2820 }, 2834 },
2821 .gw = { 2835 .gw = {
2836 .init_sel_class = batadv_iv_init_sel_class,
2822 .get_best_gw_node = batadv_iv_gw_get_best_gw_node, 2837 .get_best_gw_node = batadv_iv_gw_get_best_gw_node,
2823 .is_eligible = batadv_iv_gw_is_eligible, 2838 .is_eligible = batadv_iv_gw_is_eligible,
2824#ifdef CONFIG_BATMAN_ADV_DEBUGFS 2839#ifdef CONFIG_BATMAN_ADV_DEBUGFS
diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h
index b9f3550faaf7..ae2ab526bdb1 100644
--- a/net/batman-adv/bat_iv_ogm.h
+++ b/net/batman-adv/bat_iv_ogm.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index e79f6f01182e..a36c8e7291d6 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Linus Lüssing, Marek Lindner 3 * Linus Lüssing, Marek Lindner
4 * 4 *
@@ -668,6 +668,16 @@ err_ifinfo1:
668 return ret; 668 return ret;
669} 669}
670 670
671/**
672 * batadv_v_init_sel_class - initialize GW selection class
673 * @bat_priv: the bat priv with all the soft interface information
674 */
675static void batadv_v_init_sel_class(struct batadv_priv *bat_priv)
676{
677 /* set default throughput difference threshold to 5Mbps */
678 atomic_set(&bat_priv->gw.sel_class, 50);
679}
680
671static ssize_t batadv_v_store_sel_class(struct batadv_priv *bat_priv, 681static ssize_t batadv_v_store_sel_class(struct batadv_priv *bat_priv,
672 char *buff, size_t count) 682 char *buff, size_t count)
673{ 683{
@@ -750,7 +760,7 @@ batadv_v_gw_get_best_gw_node(struct batadv_priv *bat_priv)
750 u32 max_bw = 0, bw; 760 u32 max_bw = 0, bw;
751 761
752 rcu_read_lock(); 762 rcu_read_lock();
753 hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { 763 hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
754 if (!kref_get_unless_zero(&gw_node->refcount)) 764 if (!kref_get_unless_zero(&gw_node->refcount))
755 continue; 765 continue;
756 766
@@ -787,7 +797,7 @@ static bool batadv_v_gw_is_eligible(struct batadv_priv *bat_priv,
787 struct batadv_orig_node *curr_gw_orig, 797 struct batadv_orig_node *curr_gw_orig,
788 struct batadv_orig_node *orig_node) 798 struct batadv_orig_node *orig_node)
789{ 799{
790 struct batadv_gw_node *curr_gw = NULL, *orig_gw = NULL; 800 struct batadv_gw_node *curr_gw, *orig_gw = NULL;
791 u32 gw_throughput, orig_throughput, threshold; 801 u32 gw_throughput, orig_throughput, threshold;
792 bool ret = false; 802 bool ret = false;
793 803
@@ -889,7 +899,7 @@ static void batadv_v_gw_print(struct batadv_priv *bat_priv,
889 " Gateway ( throughput) Nexthop [outgoingIF]: advertised uplink bandwidth\n"); 899 " Gateway ( throughput) Nexthop [outgoingIF]: advertised uplink bandwidth\n");
890 900
891 rcu_read_lock(); 901 rcu_read_lock();
892 hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { 902 hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
893 /* fails if orig_node has no router */ 903 /* fails if orig_node has no router */
894 if (batadv_v_gw_write_buffer_text(bat_priv, seq, gw_node) < 0) 904 if (batadv_v_gw_write_buffer_text(bat_priv, seq, gw_node) < 0)
895 continue; 905 continue;
@@ -1009,7 +1019,7 @@ static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb,
1009 int idx = 0; 1019 int idx = 0;
1010 1020
1011 rcu_read_lock(); 1021 rcu_read_lock();
1012 hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { 1022 hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
1013 if (idx++ < idx_skip) 1023 if (idx++ < idx_skip)
1014 continue; 1024 continue;
1015 1025
@@ -1052,6 +1062,7 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = {
1052 .dump = batadv_v_orig_dump, 1062 .dump = batadv_v_orig_dump,
1053 }, 1063 },
1054 .gw = { 1064 .gw = {
1065 .init_sel_class = batadv_v_init_sel_class,
1055 .store_sel_class = batadv_v_store_sel_class, 1066 .store_sel_class = batadv_v_store_sel_class,
1056 .show_sel_class = batadv_v_show_sel_class, 1067 .show_sel_class = batadv_v_show_sel_class,
1057 .get_best_gw_node = batadv_v_gw_get_best_gw_node, 1068 .get_best_gw_node = batadv_v_gw_get_best_gw_node,
@@ -1092,9 +1103,6 @@ int batadv_v_mesh_init(struct batadv_priv *bat_priv)
1092 if (ret < 0) 1103 if (ret < 0)
1093 return ret; 1104 return ret;
1094 1105
1095 /* set default throughput difference threshold to 5Mbps */
1096 atomic_set(&bat_priv->gw.sel_class, 50);
1097
1098 return 0; 1106 return 0;
1099} 1107}
1100 1108
diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h
index 83b77639729e..dd7c4b647e6b 100644
--- a/net/batman-adv/bat_v.h
+++ b/net/batman-adv/bat_v.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Linus Lüssing 3 * Marek Lindner, Linus Lüssing
4 * 4 *
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index ee08540ce503..b90c9903e246 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Linus Lüssing, Marek Lindner 3 * Linus Lüssing, Marek Lindner
4 * 4 *
@@ -75,6 +75,7 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
75{ 75{
76 struct batadv_hard_iface *hard_iface = neigh->if_incoming; 76 struct batadv_hard_iface *hard_iface = neigh->if_incoming;
77 struct ethtool_link_ksettings link_settings; 77 struct ethtool_link_ksettings link_settings;
78 struct net_device *real_netdev;
78 struct station_info sinfo; 79 struct station_info sinfo;
79 u32 throughput; 80 u32 throughput;
80 int ret; 81 int ret;
@@ -89,23 +90,27 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
89 /* if this is a wireless device, then ask its throughput through 90 /* if this is a wireless device, then ask its throughput through
90 * cfg80211 API 91 * cfg80211 API
91 */ 92 */
92 if (batadv_is_wifi_netdev(hard_iface->net_dev)) { 93 if (batadv_is_wifi_hardif(hard_iface)) {
93 if (hard_iface->net_dev->ieee80211_ptr) { 94 if (!batadv_is_cfg80211_hardif(hard_iface))
94 ret = cfg80211_get_station(hard_iface->net_dev, 95 /* unsupported WiFi driver version */
95 neigh->addr, &sinfo); 96 goto default_throughput;
96 if (ret == -ENOENT) { 97
97 /* Node is not associated anymore! It would be 98 real_netdev = batadv_get_real_netdev(hard_iface->net_dev);
98 * possible to delete this neighbor. For now set 99 if (!real_netdev)
99 * the throughput metric to 0. 100 goto default_throughput;
100 */ 101
101 return 0; 102 ret = cfg80211_get_station(real_netdev, neigh->addr, &sinfo);
102 } 103
103 if (!ret) 104 dev_put(real_netdev);
104 return sinfo.expected_throughput / 100; 105 if (ret == -ENOENT) {
106 /* Node is not associated anymore! It would be
107 * possible to delete this neighbor. For now set
108 * the throughput metric to 0.
109 */
110 return 0;
105 } 111 }
106 112 if (!ret)
107 /* unsupported WiFi driver version */ 113 return sinfo.expected_throughput / 100;
108 goto default_throughput;
109 } 114 }
110 115
111 /* if not a wifi interface, check if this device provides data via 116 /* if not a wifi interface, check if this device provides data via
@@ -187,7 +192,7 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh)
187 int elp_skb_len; 192 int elp_skb_len;
188 193
189 /* this probing routine is for Wifi neighbours only */ 194 /* this probing routine is for Wifi neighbours only */
190 if (!batadv_is_wifi_netdev(hard_iface->net_dev)) 195 if (!batadv_is_wifi_hardif(hard_iface))
191 return true; 196 return true;
192 197
193 /* probe the neighbor only if no unicast packets have been sent 198 /* probe the neighbor only if no unicast packets have been sent
@@ -352,7 +357,7 @@ int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface)
352 /* warn the user (again) if no throughput data is available */ 357
353 hard_iface->bat_v.flags &= ~BATADV_WARNING_DEFAULT; 358 hard_iface->bat_v.flags &= ~BATADV_WARNING_DEFAULT;
354 359
355 if (batadv_is_wifi_netdev(hard_iface->net_dev)) 360 if (batadv_is_wifi_hardif(hard_iface))
356 hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX; 361 hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX;
357 362
358 INIT_DELAYED_WORK(&hard_iface->bat_v.elp_wq, 363 INIT_DELAYED_WORK(&hard_iface->bat_v.elp_wq,
@@ -492,20 +497,21 @@ int batadv_v_elp_packet_recv(struct sk_buff *skb,
492 struct batadv_elp_packet *elp_packet; 497 struct batadv_elp_packet *elp_packet;
493 struct batadv_hard_iface *primary_if; 498 struct batadv_hard_iface *primary_if;
494 struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb); 499 struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb);
495 bool ret; 500 bool res;
501 int ret = NET_RX_DROP;
496 502
497 ret = batadv_check_management_packet(skb, if_incoming, BATADV_ELP_HLEN); 503 res = batadv_check_management_packet(skb, if_incoming, BATADV_ELP_HLEN);
498 if (!ret) 504 if (!res)
499 return NET_RX_DROP; 505 goto free_skb;
500 506
501 if (batadv_is_my_mac(bat_priv, ethhdr->h_source)) 507 if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
502 return NET_RX_DROP; 508 goto free_skb;
503 509
504 /* did we receive a B.A.T.M.A.N. V ELP packet on an interface 510 /* did we receive a B.A.T.M.A.N. V ELP packet on an interface
505 * that does not have B.A.T.M.A.N. V ELP enabled ? 511 * that does not have B.A.T.M.A.N. V ELP enabled ?
506 */ 512 */
507 if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0) 513 if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0)
508 return NET_RX_DROP; 514 goto free_skb;
509 515
510 elp_packet = (struct batadv_elp_packet *)skb->data; 516 elp_packet = (struct batadv_elp_packet *)skb->data;
511 517
@@ -516,14 +522,19 @@ int batadv_v_elp_packet_recv(struct sk_buff *skb,
516 522
517 primary_if = batadv_primary_if_get_selected(bat_priv); 523 primary_if = batadv_primary_if_get_selected(bat_priv);
518 if (!primary_if) 524 if (!primary_if)
519 goto out; 525 goto free_skb;
520 526
521 batadv_v_elp_neigh_update(bat_priv, ethhdr->h_source, if_incoming, 527 batadv_v_elp_neigh_update(bat_priv, ethhdr->h_source, if_incoming,
522 elp_packet); 528 elp_packet);
523 529
524out: 530 ret = NET_RX_SUCCESS;
525 if (primary_if) 531 batadv_hardif_put(primary_if);
526 batadv_hardif_put(primary_if); 532
527 consume_skb(skb); 533free_skb:
528 return NET_RX_SUCCESS; 534 if (ret == NET_RX_SUCCESS)
535 consume_skb(skb);
536 else
537 kfree_skb(skb);
538
539 return ret;
529} 540}
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
index be17c0b1369e..376ead280ab9 100644
--- a/net/batman-adv/bat_v_elp.h
+++ b/net/batman-adv/bat_v_elp.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Linus Lüssing, Marek Lindner 3 * Linus Lüssing, Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 1aeeadca620c..03a35c9f456d 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Antonio Quartulli 3 * Antonio Quartulli
4 * 4 *
@@ -140,6 +140,7 @@ static void batadv_v_ogm_send(struct work_struct *work)
140 unsigned char *ogm_buff, *pkt_buff; 140 unsigned char *ogm_buff, *pkt_buff;
141 int ogm_buff_len; 141 int ogm_buff_len;
142 u16 tvlv_len = 0; 142 u16 tvlv_len = 0;
143 int ret;
143 144
144 bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work); 145 bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work);
145 bat_priv = container_of(bat_v, struct batadv_priv, bat_v); 146 bat_priv = container_of(bat_v, struct batadv_priv, bat_v);
@@ -182,6 +183,31 @@ static void batadv_v_ogm_send(struct work_struct *work)
182 if (!kref_get_unless_zero(&hard_iface->refcount)) 183 if (!kref_get_unless_zero(&hard_iface->refcount))
183 continue; 184 continue;
184 185
186 ret = batadv_hardif_no_broadcast(hard_iface, NULL, NULL);
187 if (ret) {
188 char *type;
189
190 switch (ret) {
191 case BATADV_HARDIF_BCAST_NORECIPIENT:
192 type = "no neighbor";
193 break;
194 case BATADV_HARDIF_BCAST_DUPFWD:
195 type = "single neighbor is source";
196 break;
197 case BATADV_HARDIF_BCAST_DUPORIG:
198 type = "single neighbor is originator";
199 break;
200 default:
201 type = "unknown";
202 }
203
 204 batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "OGM2 from ourselves on %s suppressed: %s\n",
205 hard_iface->net_dev->name, type);
206
207 batadv_hardif_put(hard_iface);
208 continue;
209 }
210
185 batadv_dbg(BATADV_DBG_BATMAN, bat_priv, 211 batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
186 "Sending own OGM2 packet (originator %pM, seqno %u, throughput %u, TTL %d) on interface %s [%pM]\n", 212 "Sending own OGM2 packet (originator %pM, seqno %u, throughput %u, TTL %d) on interface %s [%pM]\n",
187 ogm_packet->orig, ntohl(ogm_packet->seqno), 213 ogm_packet->orig, ntohl(ogm_packet->seqno),
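The block added here, and repeated in batadv_v_ogm_process() further down, turns the batadv_hardif_no_broadcast() result into a log string before suppressing the OGM2. A compile-and-run version of the same switch (enum names shortened for the standalone example):

#include <stdio.h>

enum bcast_verdict {
	BCAST_OK,
	BCAST_NORECIPIENT,
	BCAST_DUPFWD,
	BCAST_DUPORIG,
};

/* Map the suppression reason to a log string, as the added switch does. */
static const char *verdict_str(enum bcast_verdict verdict)
{
	switch (verdict) {
	case BCAST_NORECIPIENT:
		return "no neighbor";
	case BCAST_DUPFWD:
		return "single neighbor is source";
	case BCAST_DUPORIG:
		return "single neighbor is originator";
	default:
		return "unknown";
	}
}

int main(void)
{
	printf("OGM2 suppressed: %s\n", verdict_str(BCAST_DUPFWD));
	return 0;
}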
@@ -401,7 +427,7 @@ static int batadv_v_ogm_metric_update(struct batadv_priv *bat_priv,
401 struct batadv_hard_iface *if_incoming, 427 struct batadv_hard_iface *if_incoming,
402 struct batadv_hard_iface *if_outgoing) 428 struct batadv_hard_iface *if_outgoing)
403{ 429{
404 struct batadv_orig_ifinfo *orig_ifinfo = NULL; 430 struct batadv_orig_ifinfo *orig_ifinfo;
405 struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; 431 struct batadv_neigh_ifinfo *neigh_ifinfo = NULL;
406 bool protection_started = false; 432 bool protection_started = false;
407 int ret = -EINVAL; 433 int ret = -EINVAL;
@@ -486,7 +512,7 @@ static bool batadv_v_ogm_route_update(struct batadv_priv *bat_priv,
486 struct batadv_hard_iface *if_outgoing) 512 struct batadv_hard_iface *if_outgoing)
487{ 513{
488 struct batadv_neigh_node *router = NULL; 514 struct batadv_neigh_node *router = NULL;
489 struct batadv_orig_node *orig_neigh_node = NULL; 515 struct batadv_orig_node *orig_neigh_node;
490 struct batadv_neigh_node *orig_neigh_router = NULL; 516 struct batadv_neigh_node *orig_neigh_router = NULL;
491 struct batadv_neigh_ifinfo *router_ifinfo = NULL, *neigh_ifinfo = NULL; 517 struct batadv_neigh_ifinfo *router_ifinfo = NULL, *neigh_ifinfo = NULL;
492 u32 router_throughput, neigh_throughput; 518 u32 router_throughput, neigh_throughput;
@@ -651,6 +677,7 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset,
651 struct batadv_hard_iface *hard_iface; 677 struct batadv_hard_iface *hard_iface;
652 struct batadv_ogm2_packet *ogm_packet; 678 struct batadv_ogm2_packet *ogm_packet;
653 u32 ogm_throughput, link_throughput, path_throughput; 679 u32 ogm_throughput, link_throughput, path_throughput;
680 int ret;
654 681
655 ethhdr = eth_hdr(skb); 682 ethhdr = eth_hdr(skb);
656 ogm_packet = (struct batadv_ogm2_packet *)(skb->data + ogm_offset); 683 ogm_packet = (struct batadv_ogm2_packet *)(skb->data + ogm_offset);
@@ -716,6 +743,35 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset,
716 if (!kref_get_unless_zero(&hard_iface->refcount)) 743 if (!kref_get_unless_zero(&hard_iface->refcount))
717 continue; 744 continue;
718 745
746 ret = batadv_hardif_no_broadcast(hard_iface,
747 ogm_packet->orig,
748 hardif_neigh->orig);
749
750 if (ret) {
751 char *type;
752
753 switch (ret) {
754 case BATADV_HARDIF_BCAST_NORECIPIENT:
755 type = "no neighbor";
756 break;
757 case BATADV_HARDIF_BCAST_DUPFWD:
758 type = "single neighbor is source";
759 break;
760 case BATADV_HARDIF_BCAST_DUPORIG:
761 type = "single neighbor is originator";
762 break;
763 default:
764 type = "unknown";
765 }
766
 767 batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "OGM2 packet from %pM on %s suppressed: %s\n",
768 ogm_packet->orig, hard_iface->net_dev->name,
769 type);
770
771 batadv_hardif_put(hard_iface);
772 continue;
773 }
774
719 batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet, 775 batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet,
720 orig_node, neigh_node, 776 orig_node, neigh_node,
721 if_incoming, hard_iface); 777 if_incoming, hard_iface);
@@ -754,18 +810,18 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb,
754 * B.A.T.M.A.N. V enabled ? 810 * B.A.T.M.A.N. V enabled ?
755 */ 811 */
756 if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0) 812 if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0)
757 return NET_RX_DROP; 813 goto free_skb;
758 814
759 if (!batadv_check_management_packet(skb, if_incoming, BATADV_OGM2_HLEN)) 815 if (!batadv_check_management_packet(skb, if_incoming, BATADV_OGM2_HLEN))
760 return NET_RX_DROP; 816 goto free_skb;
761 817
762 if (batadv_is_my_mac(bat_priv, ethhdr->h_source)) 818 if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
763 return NET_RX_DROP; 819 goto free_skb;
764 820
765 ogm_packet = (struct batadv_ogm2_packet *)skb->data; 821 ogm_packet = (struct batadv_ogm2_packet *)skb->data;
766 822
767 if (batadv_is_my_mac(bat_priv, ogm_packet->orig)) 823 if (batadv_is_my_mac(bat_priv, ogm_packet->orig))
768 return NET_RX_DROP; 824 goto free_skb;
769 825
770 batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX); 826 batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX);
771 batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES, 827 batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES,
@@ -786,7 +842,12 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb,
786 } 842 }
787 843
788 ret = NET_RX_SUCCESS; 844 ret = NET_RX_SUCCESS;
789 consume_skb(skb); 845
846free_skb:
847 if (ret == NET_RX_SUCCESS)
848 consume_skb(skb);
849 else
850 kfree_skb(skb);
790 851
791 return ret; 852 return ret;
792} 853}
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
index 4c4d45caa422..2068770b542d 100644
--- a/net/batman-adv/bat_v_ogm.h
+++ b/net/batman-adv/bat_v_ogm.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Antonio Quartulli 3 * Antonio Quartulli
4 * 4 *
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
index 032271421a20..2b070c7e31da 100644
--- a/net/batman-adv/bitarray.c
+++ b/net/batman-adv/bitarray.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Simon Wunderlich, Marek Lindner 3 * Simon Wunderlich, Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index 0e6e9d09078c..cc262c9d97e0 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Simon Wunderlich, Marek Lindner 3 * Simon Wunderlich, Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index e7f690b571ea..ba8420d8a992 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Simon Wunderlich 3 * Simon Wunderlich
4 * 4 *
@@ -449,7 +449,6 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac,
449 batadv_inc_counter(bat_priv, BATADV_CNT_RX); 449 batadv_inc_counter(bat_priv, BATADV_CNT_RX);
450 batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, 450 batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
451 skb->len + ETH_HLEN); 451 skb->len + ETH_HLEN);
452 soft_iface->last_rx = jiffies;
453 452
454 netif_rx(skb); 453 netif_rx(skb);
455out: 454out:
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index 1ae93e46fb98..e157986bd01c 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Simon Wunderlich 3 * Simon Wunderlich
4 * 4 *
@@ -20,6 +20,8 @@
20 20
21#include "main.h" 21#include "main.h"
22 22
23#include <linux/compiler.h>
24#include <linux/stddef.h>
23#include <linux/types.h> 25#include <linux/types.h>
24 26
25struct net_device; 27struct net_device;
@@ -27,6 +29,22 @@ struct netlink_callback;
27struct seq_file; 29struct seq_file;
28struct sk_buff; 30struct sk_buff;
29 31
32/**
33 * batadv_bla_is_loopdetect_mac - check if the mac address is from a loop detect
34 * frame sent by bridge loop avoidance
35 * @mac: mac address to check
36 *
 37 * Return: true if it looks like a loop detect frame
38 * (mac starts with BA:BE), false otherwise
39 */
40static inline bool batadv_bla_is_loopdetect_mac(const uint8_t *mac)
41{
42 if (mac[0] == 0xba && mac[1] == 0xbe)
43 return true;
44
45 return false;
46}
47
30#ifdef CONFIG_BATMAN_ADV_BLA 48#ifdef CONFIG_BATMAN_ADV_BLA
31bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, 49bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
32 unsigned short vid, bool is_bcast); 50 unsigned short vid, bool is_bcast);
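For reference, the new loop-detect helper is plain prefix matching and can be exercised outside the kernel; a standalone copy for illustration only:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Standalone copy of the BA:BE prefix test from the new inline above. */
static bool is_loopdetect_mac(const uint8_t *mac)
{
	return mac[0] == 0xba && mac[1] == 0xbe;
}

int main(void)
{
	const uint8_t mac[6] = { 0xba, 0xbe, 0x00, 0x11, 0x22, 0x33 };

	printf("loopdetect frame: %s\n", is_loopdetect_mac(mac) ? "yes" : "no");
	return 0;
}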
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index b4ffba7dd583..e32ad47c6efd 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
@@ -19,7 +19,7 @@
19#include "main.h" 19#include "main.h"
20 20
21#include <linux/debugfs.h> 21#include <linux/debugfs.h>
22#include <linux/device.h> 22#include <linux/err.h>
23#include <linux/errno.h> 23#include <linux/errno.h>
24#include <linux/export.h> 24#include <linux/export.h>
25#include <linux/fs.h> 25#include <linux/fs.h>
@@ -186,7 +186,7 @@ struct batadv_debuginfo batadv_debuginfo_##_name = { \
186/* the following attributes are general and therefore they will be directly 186/* the following attributes are general and therefore they will be directly
187 * placed in the BATADV_DEBUGFS_SUBDIR subdirectory of debugfs 187 * placed in the BATADV_DEBUGFS_SUBDIR subdirectory of debugfs
188 */ 188 */
189static BATADV_DEBUGINFO(routing_algos, S_IRUGO, batadv_algorithms_open); 189static BATADV_DEBUGINFO(routing_algos, 0444, batadv_algorithms_open);
190 190
191static struct batadv_debuginfo *batadv_general_debuginfos[] = { 191static struct batadv_debuginfo *batadv_general_debuginfos[] = {
192 &batadv_debuginfo_routing_algos, 192 &batadv_debuginfo_routing_algos,
@@ -194,26 +194,24 @@ static struct batadv_debuginfo *batadv_general_debuginfos[] = {
194}; 194};
195 195
196/* The following attributes are per soft interface */ 196/* The following attributes are per soft interface */
197static BATADV_DEBUGINFO(neighbors, S_IRUGO, neighbors_open); 197static BATADV_DEBUGINFO(neighbors, 0444, neighbors_open);
198static BATADV_DEBUGINFO(originators, S_IRUGO, batadv_originators_open); 198static BATADV_DEBUGINFO(originators, 0444, batadv_originators_open);
199static BATADV_DEBUGINFO(gateways, S_IRUGO, batadv_gateways_open); 199static BATADV_DEBUGINFO(gateways, 0444, batadv_gateways_open);
200static BATADV_DEBUGINFO(transtable_global, S_IRUGO, 200static BATADV_DEBUGINFO(transtable_global, 0444, batadv_transtable_global_open);
201 batadv_transtable_global_open);
202#ifdef CONFIG_BATMAN_ADV_BLA 201#ifdef CONFIG_BATMAN_ADV_BLA
203static BATADV_DEBUGINFO(bla_claim_table, S_IRUGO, batadv_bla_claim_table_open); 202static BATADV_DEBUGINFO(bla_claim_table, 0444, batadv_bla_claim_table_open);
204static BATADV_DEBUGINFO(bla_backbone_table, S_IRUGO, 203static BATADV_DEBUGINFO(bla_backbone_table, 0444,
205 batadv_bla_backbone_table_open); 204 batadv_bla_backbone_table_open);
206#endif 205#endif
207#ifdef CONFIG_BATMAN_ADV_DAT 206#ifdef CONFIG_BATMAN_ADV_DAT
208static BATADV_DEBUGINFO(dat_cache, S_IRUGO, batadv_dat_cache_open); 207static BATADV_DEBUGINFO(dat_cache, 0444, batadv_dat_cache_open);
209#endif 208#endif
210static BATADV_DEBUGINFO(transtable_local, S_IRUGO, 209static BATADV_DEBUGINFO(transtable_local, 0444, batadv_transtable_local_open);
211 batadv_transtable_local_open);
212#ifdef CONFIG_BATMAN_ADV_NC 210#ifdef CONFIG_BATMAN_ADV_NC
213static BATADV_DEBUGINFO(nc_nodes, S_IRUGO, batadv_nc_nodes_open); 211static BATADV_DEBUGINFO(nc_nodes, 0444, batadv_nc_nodes_open);
214#endif 212#endif
215#ifdef CONFIG_BATMAN_ADV_MCAST 213#ifdef CONFIG_BATMAN_ADV_MCAST
216static BATADV_DEBUGINFO(mcast_flags, S_IRUGO, batadv_mcast_flags_open); 214static BATADV_DEBUGINFO(mcast_flags, 0444, batadv_mcast_flags_open);
217#endif 215#endif
218 216
219static struct batadv_debuginfo *batadv_mesh_debuginfos[] = { 217static struct batadv_debuginfo *batadv_mesh_debuginfos[] = {
@@ -253,7 +251,7 @@ struct batadv_debuginfo batadv_hardif_debuginfo_##_name = { \
253 }, \ 251 }, \
254} 252}
255 253
256static BATADV_HARDIF_DEBUGINFO(originators, S_IRUGO, 254static BATADV_HARDIF_DEBUGINFO(originators, 0444,
257 batadv_originators_hardif_open); 255 batadv_originators_hardif_open);
258 256
259static struct batadv_debuginfo *batadv_hardif_debuginfos[] = { 257static struct batadv_debuginfo *batadv_hardif_debuginfos[] = {
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index e49121ee55f6..9c5d4a65b98c 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index e257efdc5d03..1bfd1dbc2feb 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Antonio Quartulli 3 * Antonio Quartulli
4 * 4 *
@@ -369,12 +369,11 @@ out:
369 * batadv_dbg_arp - print a debug message containing all the ARP packet details 369 * batadv_dbg_arp - print a debug message containing all the ARP packet details
370 * @bat_priv: the bat priv with all the soft interface information 370 * @bat_priv: the bat priv with all the soft interface information
371 * @skb: ARP packet 371 * @skb: ARP packet
372 * @type: ARP type
373 * @hdr_size: size of the possible header before the ARP packet 372 * @hdr_size: size of the possible header before the ARP packet
374 * @msg: message to print together with the debugging information 373 * @msg: message to print together with the debugging information
375 */ 374 */
376static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb, 375static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb,
377 u16 type, int hdr_size, char *msg) 376 int hdr_size, char *msg)
378{ 377{
379 struct batadv_unicast_4addr_packet *unicast_4addr_packet; 378 struct batadv_unicast_4addr_packet *unicast_4addr_packet;
380 struct batadv_bcast_packet *bcast_pkt; 379 struct batadv_bcast_packet *bcast_pkt;
@@ -441,7 +440,7 @@ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb,
441#else 440#else
442 441
443static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb, 442static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb,
444 u16 type, int hdr_size, char *msg) 443 int hdr_size, char *msg)
445{ 444{
446} 445}
447 446
@@ -950,6 +949,41 @@ static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size)
950} 949}
951 950
952/** 951/**
952 * batadv_dat_arp_create_reply - create an ARP Reply
953 * @bat_priv: the bat priv with all the soft interface information
954 * @ip_src: ARP sender IP
955 * @ip_dst: ARP target IP
956 * @hw_src: Ethernet source and ARP sender MAC
957 * @hw_dst: Ethernet destination and ARP target MAC
958 * @vid: VLAN identifier (optional, set to zero otherwise)
959 *
960 * Creates an ARP Reply from the given values, optionally encapsulated in a
961 * VLAN header.
962 *
963 * Return: An skb containing an ARP Reply.
964 */
965static struct sk_buff *
966batadv_dat_arp_create_reply(struct batadv_priv *bat_priv, __be32 ip_src,
967 __be32 ip_dst, u8 *hw_src, u8 *hw_dst,
968 unsigned short vid)
969{
970 struct sk_buff *skb;
971
972 skb = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_dst, bat_priv->soft_iface,
973 ip_src, hw_dst, hw_src, hw_dst);
974 if (!skb)
975 return NULL;
976
977 skb_reset_mac_header(skb);
978
979 if (vid & BATADV_VLAN_HAS_TAG)
980 skb = vlan_insert_tag(skb, htons(ETH_P_8021Q),
981 vid & VLAN_VID_MASK);
982
983 return skb;
984}
985
986/**
953 * batadv_dat_snoop_outgoing_arp_request - snoop the ARP request and try to 987 * batadv_dat_snoop_outgoing_arp_request - snoop the ARP request and try to
954 * answer using DAT 988 * answer using DAT
955 * @bat_priv: the bat priv with all the soft interface information 989 * @bat_priv: the bat priv with all the soft interface information
@@ -983,8 +1017,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
983 if (type != ARPOP_REQUEST) 1017 if (type != ARPOP_REQUEST)
984 goto out; 1018 goto out;
985 1019
986 batadv_dbg_arp(bat_priv, skb, type, hdr_size, 1020 batadv_dbg_arp(bat_priv, skb, hdr_size, "Parsing outgoing ARP REQUEST");
987 "Parsing outgoing ARP REQUEST");
988 1021
989 ip_src = batadv_arp_ip_src(skb, hdr_size); 1022 ip_src = batadv_arp_ip_src(skb, hdr_size);
990 hw_src = batadv_arp_hw_src(skb, hdr_size); 1023 hw_src = batadv_arp_hw_src(skb, hdr_size);
@@ -1007,25 +1040,16 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
1007 goto out; 1040 goto out;
1008 } 1041 }
1009 1042
1010 skb_new = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_src, 1043 skb_new = batadv_dat_arp_create_reply(bat_priv, ip_dst, ip_src,
1011 bat_priv->soft_iface, ip_dst, hw_src, 1044 dat_entry->mac_addr,
1012 dat_entry->mac_addr, hw_src); 1045 hw_src, vid);
1013 if (!skb_new) 1046 if (!skb_new)
1014 goto out; 1047 goto out;
1015 1048
1016 if (vid & BATADV_VLAN_HAS_TAG) {
1017 skb_new = vlan_insert_tag(skb_new, htons(ETH_P_8021Q),
1018 vid & VLAN_VID_MASK);
1019 if (!skb_new)
1020 goto out;
1021 }
1022
1023 skb_reset_mac_header(skb_new);
1024 skb_new->protocol = eth_type_trans(skb_new, 1049 skb_new->protocol = eth_type_trans(skb_new,
1025 bat_priv->soft_iface); 1050 bat_priv->soft_iface);
1026 bat_priv->stats.rx_packets++; 1051 bat_priv->stats.rx_packets++;
1027 bat_priv->stats.rx_bytes += skb->len + ETH_HLEN + hdr_size; 1052 bat_priv->stats.rx_bytes += skb->len + ETH_HLEN + hdr_size;
1028 bat_priv->soft_iface->last_rx = jiffies;
1029 1053
1030 netif_rx(skb_new); 1054 netif_rx(skb_new);
1031 batadv_dbg(BATADV_DBG_DAT, bat_priv, "ARP request replied locally\n"); 1055 batadv_dbg(BATADV_DBG_DAT, bat_priv, "ARP request replied locally\n");
@@ -1075,8 +1099,7 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
1075 ip_src = batadv_arp_ip_src(skb, hdr_size); 1099 ip_src = batadv_arp_ip_src(skb, hdr_size);
1076 ip_dst = batadv_arp_ip_dst(skb, hdr_size); 1100 ip_dst = batadv_arp_ip_dst(skb, hdr_size);
1077 1101
1078 batadv_dbg_arp(bat_priv, skb, type, hdr_size, 1102 batadv_dbg_arp(bat_priv, skb, hdr_size, "Parsing incoming ARP REQUEST");
1079 "Parsing incoming ARP REQUEST");
1080 1103
1081 batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid); 1104 batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid);
1082 1105
@@ -1084,25 +1107,11 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
1084 if (!dat_entry) 1107 if (!dat_entry)
1085 goto out; 1108 goto out;
1086 1109
1087 skb_new = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_src, 1110 skb_new = batadv_dat_arp_create_reply(bat_priv, ip_dst, ip_src,
1088 bat_priv->soft_iface, ip_dst, hw_src, 1111 dat_entry->mac_addr, hw_src, vid);
1089 dat_entry->mac_addr, hw_src);
1090
1091 if (!skb_new) 1112 if (!skb_new)
1092 goto out; 1113 goto out;
1093 1114
1094 /* the rest of the TX path assumes that the mac_header offset pointing
1095 * to the inner Ethernet header has been set, therefore reset it now.
1096 */
1097 skb_reset_mac_header(skb_new);
1098
1099 if (vid & BATADV_VLAN_HAS_TAG) {
1100 skb_new = vlan_insert_tag(skb_new, htons(ETH_P_8021Q),
1101 vid & VLAN_VID_MASK);
1102 if (!skb_new)
1103 goto out;
1104 }
1105
1106 /* To preserve backwards compatibility, the node has to choose the outgoing 1115 /* To preserve backwards compatibility, the node has to choose the outgoing
1107 * format based on the incoming request packet type. The assumption is 1116 * format based on the incoming request packet type. The assumption is
1108 * that a node not using the 4addr packet format doesn't support it. 1117 * that a node not using the 4addr packet format doesn't support it.
@@ -1149,8 +1158,7 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv,
1149 if (type != ARPOP_REPLY) 1158 if (type != ARPOP_REPLY)
1150 return; 1159 return;
1151 1160
1152 batadv_dbg_arp(bat_priv, skb, type, hdr_size, 1161 batadv_dbg_arp(bat_priv, skb, hdr_size, "Parsing outgoing ARP REPLY");
1153 "Parsing outgoing ARP REPLY");
1154 1162
1155 hw_src = batadv_arp_hw_src(skb, hdr_size); 1163 hw_src = batadv_arp_hw_src(skb, hdr_size);
1156 ip_src = batadv_arp_ip_src(skb, hdr_size); 1164 ip_src = batadv_arp_ip_src(skb, hdr_size);
@@ -1195,8 +1203,7 @@ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
1195 if (type != ARPOP_REPLY) 1203 if (type != ARPOP_REPLY)
1196 goto out; 1204 goto out;
1197 1205
1198 batadv_dbg_arp(bat_priv, skb, type, hdr_size, 1206 batadv_dbg_arp(bat_priv, skb, hdr_size, "Parsing incoming ARP REPLY");
1199 "Parsing incoming ARP REPLY");
1200 1207
1201 hw_src = batadv_arp_hw_src(skb, hdr_size); 1208 hw_src = batadv_arp_hw_src(skb, hdr_size);
1202 ip_src = batadv_arp_ip_src(skb, hdr_size); 1209 ip_src = batadv_arp_ip_src(skb, hdr_size);
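Both reply-generating snoop paths above now delegate reply construction to batadv_dat_arp_create_reply(); only the tail of that helper (the VLAN re-tagging and the return) is visible at the very top of this hunk. For orientation, a minimal sketch of what such a helper plausibly looks like, reconstructed from the removed open-coded logic and the visible call sites (parameter names and ordering are assumptions, not guaranteed to match the patch):

static struct sk_buff *
batadv_dat_arp_create_reply(struct batadv_priv *bat_priv, __be32 ip_src,
			    __be32 ip_dst, u8 *hw_src, u8 *hw_dst,
			    unsigned short vid)
{
	struct sk_buff *skb;

	/* build the raw ARP reply on the soft interface */
	skb = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_dst, bat_priv->soft_iface,
			 ip_src, hw_dst, hw_src, hw_dst);
	if (!skb)
		return NULL;

	/* the rest of the TX path expects mac_header to point at the
	 * inner Ethernet header, so reset it here
	 */
	skb_reset_mac_header(skb);

	/* re-add the VLAN tag that was stripped on reception, if any */
	if (vid & BATADV_VLAN_HAS_TAG)
		skb = vlan_insert_tag(skb, htons(ETH_P_8021Q),
				      vid & VLAN_VID_MASK);

	return skb;
}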
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
index 813ecea96cf9..ec364a3c1c66 100644
--- a/net/batman-adv/distributed-arp-table.h
+++ b/net/batman-adv/distributed-arp-table.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Antonio Quartulli 3 * Antonio Quartulli
4 * 4 *
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 0934730fb7ff..8f964beaac28 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Martin Hundebøll <martin@hundeboll.net> 3 * Martin Hundebøll <martin@hundeboll.net>
4 * 4 *
@@ -20,6 +20,7 @@
20 20
21#include <linux/atomic.h> 21#include <linux/atomic.h>
22#include <linux/byteorder/generic.h> 22#include <linux/byteorder/generic.h>
23#include <linux/errno.h>
23#include <linux/etherdevice.h> 24#include <linux/etherdevice.h>
24#include <linux/fs.h> 25#include <linux/fs.h>
25#include <linux/if_ether.h> 26#include <linux/if_ether.h>
@@ -42,17 +43,23 @@
42/** 43/**
43 * batadv_frag_clear_chain - delete entries in the fragment buffer chain 44 * batadv_frag_clear_chain - delete entries in the fragment buffer chain
44 * @head: head of chain with entries. 45 * @head: head of chain with entries.
46 * @dropped: whether the chain is cleared because all fragments are dropped
45 * 47 *
46 * Free fragments in the passed hlist. Should be called with appropriate lock. 48 * Free fragments in the passed hlist. Should be called with appropriate lock.
47 */ 49 */
48static void batadv_frag_clear_chain(struct hlist_head *head) 50static void batadv_frag_clear_chain(struct hlist_head *head, bool dropped)
49{ 51{
50 struct batadv_frag_list_entry *entry; 52 struct batadv_frag_list_entry *entry;
51 struct hlist_node *node; 53 struct hlist_node *node;
52 54
53 hlist_for_each_entry_safe(entry, node, head, list) { 55 hlist_for_each_entry_safe(entry, node, head, list) {
54 hlist_del(&entry->list); 56 hlist_del(&entry->list);
55 kfree_skb(entry->skb); 57
58 if (dropped)
59 kfree_skb(entry->skb);
60 else
61 consume_skb(entry->skb);
62
56 kfree(entry); 63 kfree(entry);
57 } 64 }
58} 65}
@@ -73,7 +80,7 @@ void batadv_frag_purge_orig(struct batadv_orig_node *orig_node,
73 spin_lock_bh(&chain->lock); 80 spin_lock_bh(&chain->lock);
74 81
75 if (!check_cb || check_cb(chain)) { 82 if (!check_cb || check_cb(chain)) {
76 batadv_frag_clear_chain(&chain->head); 83 batadv_frag_clear_chain(&chain->fragment_list, true);
77 chain->size = 0; 84 chain->size = 0;
78 } 85 }
79 86
@@ -117,8 +124,8 @@ static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain,
117 if (chain->seqno == seqno) 124 if (chain->seqno == seqno)
118 return false; 125 return false;
119 126
120 if (!hlist_empty(&chain->head)) 127 if (!hlist_empty(&chain->fragment_list))
121 batadv_frag_clear_chain(&chain->head); 128 batadv_frag_clear_chain(&chain->fragment_list, true);
122 129
123 chain->size = 0; 130 chain->size = 0;
124 chain->seqno = seqno; 131 chain->seqno = seqno;
@@ -176,7 +183,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node,
176 chain = &orig_node->fragments[bucket]; 183 chain = &orig_node->fragments[bucket];
177 spin_lock_bh(&chain->lock); 184 spin_lock_bh(&chain->lock);
178 if (batadv_frag_init_chain(chain, seqno)) { 185 if (batadv_frag_init_chain(chain, seqno)) {
179 hlist_add_head(&frag_entry_new->list, &chain->head); 186 hlist_add_head(&frag_entry_new->list, &chain->fragment_list);
180 chain->size = skb->len - hdr_size; 187 chain->size = skb->len - hdr_size;
181 chain->timestamp = jiffies; 188 chain->timestamp = jiffies;
182 chain->total_size = ntohs(frag_packet->total_size); 189 chain->total_size = ntohs(frag_packet->total_size);
@@ -185,7 +192,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node,
185 } 192 }
186 193
187 /* Find the position for the new fragment. */ 194 /* Find the position for the new fragment. */
188 hlist_for_each_entry(frag_entry_curr, &chain->head, list) { 195 hlist_for_each_entry(frag_entry_curr, &chain->fragment_list, list) {
189 /* Drop packet if fragment already exists. */ 196 /* Drop packet if fragment already exists. */
190 if (frag_entry_curr->no == frag_entry_new->no) 197 if (frag_entry_curr->no == frag_entry_new->no)
191 goto err_unlock; 198 goto err_unlock;
@@ -220,11 +227,11 @@ out:
220 * exceeds the maximum size of one merged packet. Don't allow 227 * exceeds the maximum size of one merged packet. Don't allow
221 * packets to have different total_size. 228 * packets to have different total_size.
222 */ 229 */
223 batadv_frag_clear_chain(&chain->head); 230 batadv_frag_clear_chain(&chain->fragment_list, true);
224 chain->size = 0; 231 chain->size = 0;
225 } else if (ntohs(frag_packet->total_size) == chain->size) { 232 } else if (ntohs(frag_packet->total_size) == chain->size) {
226 /* All fragments received. Hand over chain to caller. */ 233 /* All fragments received. Hand over chain to caller. */
227 hlist_move_list(&chain->head, chain_out); 234 hlist_move_list(&chain->fragment_list, chain_out);
228 chain->size = 0; 235 chain->size = 0;
229 } 236 }
230 237
@@ -232,8 +239,10 @@ err_unlock:
232 spin_unlock_bh(&chain->lock); 239 spin_unlock_bh(&chain->lock);
233 240
234err: 241err:
235 if (!ret) 242 if (!ret) {
236 kfree(frag_entry_new); 243 kfree(frag_entry_new);
244 kfree_skb(skb);
245 }
237 246
238 return ret; 247 return ret;
239} 248}
@@ -252,8 +261,9 @@ batadv_frag_merge_packets(struct hlist_head *chain)
252{ 261{
253 struct batadv_frag_packet *packet; 262 struct batadv_frag_packet *packet;
254 struct batadv_frag_list_entry *entry; 263 struct batadv_frag_list_entry *entry;
255 struct sk_buff *skb_out = NULL; 264 struct sk_buff *skb_out;
256 int size, hdr_size = sizeof(struct batadv_frag_packet); 265 int size, hdr_size = sizeof(struct batadv_frag_packet);
266 bool dropped = false;
257 267
258 /* Remove first entry, as this is the destination for the rest of the 268 /* Remove first entry, as this is the destination for the rest of the
259 * fragments. 269 * fragments.
@@ -270,6 +280,7 @@ batadv_frag_merge_packets(struct hlist_head *chain)
270 if (pskb_expand_head(skb_out, 0, size - skb_out->len, GFP_ATOMIC) < 0) { 280 if (pskb_expand_head(skb_out, 0, size - skb_out->len, GFP_ATOMIC) < 0) {
271 kfree_skb(skb_out); 281 kfree_skb(skb_out);
272 skb_out = NULL; 282 skb_out = NULL;
283 dropped = true;
273 goto free; 284 goto free;
274 } 285 }
275 286
@@ -291,7 +302,7 @@ batadv_frag_merge_packets(struct hlist_head *chain)
291 302
292free: 303free:
293 /* Locking is not needed, because 'chain' is not part of any orig. */ 304 /* Locking is not needed, because 'chain' is not part of any orig. */
294 batadv_frag_clear_chain(chain); 305 batadv_frag_clear_chain(chain, dropped);
295 return skb_out; 306 return skb_out;
296} 307}
297 308
@@ -304,7 +315,7 @@ free:
304 * 315 *
305 * There are three possible outcomes: 1) Packet is merged: Return true and 316 * There are three possible outcomes: 1) Packet is merged: Return true and
306 * set *skb to merged packet; 2) Packet is buffered: Return true and set *skb 317 * set *skb to merged packet; 2) Packet is buffered: Return true and set *skb
307 * to NULL; 3) Error: Return false and leave skb as is. 318 * to NULL; 3) Error: Return false and free skb.
308 * 319 *
309 * Return: true when packet is merged or buffered, false when skb is not 320 * Return: true when packet is merged or buffered, false when skb is not
310 * used. 321 * used.
@@ -329,9 +340,9 @@ bool batadv_frag_skb_buffer(struct sk_buff **skb,
329 goto out_err; 340 goto out_err;
330 341
331out: 342out:
332 *skb = skb_out;
333 ret = true; 343 ret = true;
334out_err: 344out_err:
345 *skb = skb_out;
335 return ret; 346 return ret;
336} 347}
337 348
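With *skb now written on the error path as well, a caller can rely on the pointer being either the merged packet, NULL while fragments are still being buffered, or NULL after a failure in which the buffer was already freed. A hedged sketch of the resulting calling convention (the wrapper function is illustrative only, not code from this patch):

static int example_recv_frag(struct sk_buff *skb,
			     struct batadv_orig_node *orig_node_src)
{
	/* hand the fragment to the buffering/merging code */
	if (!batadv_frag_skb_buffer(&skb, orig_node_src))
		return NET_RX_DROP;	/* skb was already freed, do not touch it */

	if (!skb)			/* buffered, waiting for more fragments */
		return NET_RX_SUCCESS;

	/* skb now points at the fully merged packet; process it further */
	return NET_RX_SUCCESS;
}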
@@ -352,7 +363,7 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb,
352 struct batadv_orig_node *orig_node_src) 363 struct batadv_orig_node *orig_node_src)
353{ 364{
354 struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface); 365 struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
355 struct batadv_orig_node *orig_node_dst = NULL; 366 struct batadv_orig_node *orig_node_dst;
356 struct batadv_neigh_node *neigh_node = NULL; 367 struct batadv_neigh_node *neigh_node = NULL;
357 struct batadv_frag_packet *packet; 368 struct batadv_frag_packet *packet;
358 u16 total_size; 369 u16 total_size;
@@ -393,7 +404,7 @@ out:
393 * batadv_frag_create - create a fragment from skb 404 * batadv_frag_create - create a fragment from skb
394 * @skb: skb to create fragment from 405 * @skb: skb to create fragment from
395 * @frag_head: header to use in new fragment 406 * @frag_head: header to use in new fragment
396 * @mtu: size of new fragment 407 * @fragment_size: size of new fragment
397 * 408 *
398 * Split the passed skb into two fragments: A new one with size matching the 409 * Split the passed skb into two fragments: A new one with size matching the
399 * passed mtu and the old one with the rest. The new skb contains data from the 410 * passed mtu and the old one with the rest. The new skb contains data from the
@@ -403,11 +414,11 @@ out:
403 */ 414 */
404static struct sk_buff *batadv_frag_create(struct sk_buff *skb, 415static struct sk_buff *batadv_frag_create(struct sk_buff *skb,
405 struct batadv_frag_packet *frag_head, 416 struct batadv_frag_packet *frag_head,
406 unsigned int mtu) 417 unsigned int fragment_size)
407{ 418{
408 struct sk_buff *skb_fragment; 419 struct sk_buff *skb_fragment;
409 unsigned int header_size = sizeof(*frag_head); 420 unsigned int header_size = sizeof(*frag_head);
410 unsigned int fragment_size = mtu - header_size; 421 unsigned int mtu = fragment_size + header_size;
411 422
412 skb_fragment = netdev_alloc_skb(NULL, mtu + ETH_HLEN); 423 skb_fragment = netdev_alloc_skb(NULL, mtu + ETH_HLEN);
413 if (!skb_fragment) 424 if (!skb_fragment)
@@ -433,8 +444,7 @@ err:
433 * @orig_node: final destination of the created fragments 444 * @orig_node: final destination of the created fragments
434 * @neigh_node: next-hop of the created fragments 445 * @neigh_node: next-hop of the created fragments
435 * 446 *
436 * Return: the netdev tx status or -1 in case of error. 447 * Return: the netdev tx status or a negative errno code on a failure
437 * When -1 is returned the skb is not consumed.
438 */ 448 */
439int batadv_frag_send_packet(struct sk_buff *skb, 449int batadv_frag_send_packet(struct sk_buff *skb,
440 struct batadv_orig_node *orig_node, 450 struct batadv_orig_node *orig_node,
@@ -446,24 +456,33 @@ int batadv_frag_send_packet(struct sk_buff *skb,
446 struct sk_buff *skb_fragment; 456 struct sk_buff *skb_fragment;
447 unsigned int mtu = neigh_node->if_incoming->net_dev->mtu; 457 unsigned int mtu = neigh_node->if_incoming->net_dev->mtu;
448 unsigned int header_size = sizeof(frag_header); 458 unsigned int header_size = sizeof(frag_header);
449 unsigned int max_fragment_size, max_packet_size; 459 unsigned int max_fragment_size, num_fragments;
450 int ret = -1; 460 int ret;
451 461
452 /* To avoid merge and refragmentation at next-hops we never send 462 /* To avoid merge and refragmentation at next-hops we never send
453 * fragments larger than BATADV_FRAG_MAX_FRAG_SIZE 463 * fragments larger than BATADV_FRAG_MAX_FRAG_SIZE
454 */ 464 */
455 mtu = min_t(unsigned int, mtu, BATADV_FRAG_MAX_FRAG_SIZE); 465 mtu = min_t(unsigned int, mtu, BATADV_FRAG_MAX_FRAG_SIZE);
456 max_fragment_size = mtu - header_size; 466 max_fragment_size = mtu - header_size;
457 max_packet_size = max_fragment_size * BATADV_FRAG_MAX_FRAGMENTS; 467
468 if (skb->len == 0 || max_fragment_size == 0)
469 return -EINVAL;
470
471 num_fragments = (skb->len - 1) / max_fragment_size + 1;
472 max_fragment_size = (skb->len - 1) / num_fragments + 1;
458 473
459 /* Don't even try to fragment, if we need more than 16 fragments */ 474 /* Don't even try to fragment, if we need more than 16 fragments */
460 if (skb->len > max_packet_size) 475 if (num_fragments > BATADV_FRAG_MAX_FRAGMENTS) {
461 goto out; 476 ret = -EAGAIN;
477 goto free_skb;
478 }
462 479
463 bat_priv = orig_node->bat_priv; 480 bat_priv = orig_node->bat_priv;
464 primary_if = batadv_primary_if_get_selected(bat_priv); 481 primary_if = batadv_primary_if_get_selected(bat_priv);
465 if (!primary_if) 482 if (!primary_if) {
466 goto out; 483 ret = -EINVAL;
484 goto free_skb;
485 }
467 486
468 /* Create one header to be copied to all fragments */ 487 /* Create one header to be copied to all fragments */
469 frag_header.packet_type = BATADV_UNICAST_FRAG; 488 frag_header.packet_type = BATADV_UNICAST_FRAG;
@@ -487,35 +506,37 @@ int batadv_frag_send_packet(struct sk_buff *skb,
487 506
488 /* Eat and send fragments from the tail of skb */ 507 /* Eat and send fragments from the tail of skb */
489 while (skb->len > max_fragment_size) { 508 while (skb->len > max_fragment_size) {
490 skb_fragment = batadv_frag_create(skb, &frag_header, mtu); 509 /* The initial check in this function should cover this case */
491 if (!skb_fragment) 510 if (unlikely(frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1)) {
492 goto out; 511 ret = -EINVAL;
512 goto put_primary_if;
513 }
514
515 skb_fragment = batadv_frag_create(skb, &frag_header,
516 max_fragment_size);
517 if (!skb_fragment) {
518 ret = -ENOMEM;
519 goto put_primary_if;
520 }
493 521
494 batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX); 522 batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX);
495 batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES, 523 batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES,
496 skb_fragment->len + ETH_HLEN); 524 skb_fragment->len + ETH_HLEN);
497 ret = batadv_send_unicast_skb(skb_fragment, neigh_node); 525 ret = batadv_send_unicast_skb(skb_fragment, neigh_node);
498 if (ret != NET_XMIT_SUCCESS) { 526 if (ret != NET_XMIT_SUCCESS) {
499 /* return -1 so that the caller can free the original 527 ret = NET_XMIT_DROP;
500 * skb 528 goto put_primary_if;
501 */
502 ret = -1;
503 goto out;
504 } 529 }
505 530
506 frag_header.no++; 531 frag_header.no++;
507
508 /* The initial check in this function should cover this case */
509 if (frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1) {
510 ret = -1;
511 goto out;
512 }
513 } 532 }
514 533
515 /* Make room for the fragment header. */ 534 /* Make room for the fragment header. */
516 if (batadv_skb_head_push(skb, header_size) < 0 || 535 if (batadv_skb_head_push(skb, header_size) < 0 ||
517 pskb_expand_head(skb, header_size + ETH_HLEN, 0, GFP_ATOMIC) < 0) 536 pskb_expand_head(skb, header_size + ETH_HLEN, 0, GFP_ATOMIC) < 0) {
518 goto out; 537 ret = -ENOMEM;
538 goto put_primary_if;
539 }
519 540
520 memcpy(skb->data, &frag_header, header_size); 541 memcpy(skb->data, &frag_header, header_size);
521 542
@@ -524,10 +545,13 @@ int batadv_frag_send_packet(struct sk_buff *skb,
524 batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES, 545 batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES,
525 skb->len + ETH_HLEN); 546 skb->len + ETH_HLEN);
526 ret = batadv_send_unicast_skb(skb, neigh_node); 547 ret = batadv_send_unicast_skb(skb, neigh_node);
548 /* skb was consumed */
549 skb = NULL;
527 550
528out: 551put_primary_if:
529 if (primary_if) 552 batadv_hardif_put(primary_if);
530 batadv_hardif_put(primary_if); 553free_skb:
554 kfree_skb(skb);
531 555
532 return ret; 556 return ret;
533} 557}
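The sizing logic above now spreads the payload evenly over the minimum number of fragments instead of filling every fragment up to the MTU, and failures are reported as negative errno codes with the skb consumed on every path. A small worked example of the integer arithmetic (values chosen for illustration only):

	unsigned int skb_len = 3000;		/* payload to fragment */
	unsigned int max_fragment_size = 1400;	/* mtu - header_size */
	unsigned int num_fragments;

	num_fragments = (skb_len - 1) / max_fragment_size + 1;	/* = 3 */
	max_fragment_size = (skb_len - 1) / num_fragments + 1;	/* = 1000 */
	/* result: three fragments of about 1000 bytes each,
	 * instead of 1400 + 1400 + 200 with the old scheme
	 */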
diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h
index 3202fe329e63..1a2d6c308745 100644
--- a/net/batman-adv/fragmentation.h
+++ b/net/batman-adv/fragmentation.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Martin Hundebøll <martin@hundeboll.net> 3 * Martin Hundebøll <martin@hundeboll.net>
4 * 4 *
@@ -47,7 +47,7 @@ int batadv_frag_send_packet(struct sk_buff *skb,
47static inline bool 47static inline bool
48batadv_frag_check_entry(struct batadv_frag_table_entry *frags_entry) 48batadv_frag_check_entry(struct batadv_frag_table_entry *frags_entry)
49{ 49{
50 if (!hlist_empty(&frags_entry->head) && 50 if (!hlist_empty(&frags_entry->fragment_list) &&
51 batadv_has_timed_out(frags_entry->timestamp, BATADV_FRAG_TIMEOUT)) 51 batadv_has_timed_out(frags_entry->timestamp, BATADV_FRAG_TIMEOUT))
52 return true; 52 return true;
53 return false; 53 return false;
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index de055d64debe..de9955d5224d 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
@@ -348,7 +348,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
348 348
349 spin_lock_bh(&bat_priv->gw.list_lock); 349 spin_lock_bh(&bat_priv->gw.list_lock);
350 kref_get(&gw_node->refcount); 350 kref_get(&gw_node->refcount);
351 hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.list); 351 hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.gateway_list);
352 spin_unlock_bh(&bat_priv->gw.list_lock); 352 spin_unlock_bh(&bat_priv->gw.list_lock);
353 353
354 batadv_dbg(BATADV_DBG_BATMAN, bat_priv, 354 batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
@@ -376,7 +376,8 @@ struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv,
376 struct batadv_gw_node *gw_node_tmp, *gw_node = NULL; 376 struct batadv_gw_node *gw_node_tmp, *gw_node = NULL;
377 377
378 rcu_read_lock(); 378 rcu_read_lock();
379 hlist_for_each_entry_rcu(gw_node_tmp, &bat_priv->gw.list, list) { 379 hlist_for_each_entry_rcu(gw_node_tmp, &bat_priv->gw.gateway_list,
380 list) {
380 if (gw_node_tmp->orig_node != orig_node) 381 if (gw_node_tmp->orig_node != orig_node)
381 continue; 382 continue;
382 383
@@ -475,7 +476,7 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv)
475 476
476 spin_lock_bh(&bat_priv->gw.list_lock); 477 spin_lock_bh(&bat_priv->gw.list_lock);
477 hlist_for_each_entry_safe(gw_node, node_tmp, 478 hlist_for_each_entry_safe(gw_node, node_tmp,
478 &bat_priv->gw.list, list) { 479 &bat_priv->gw.gateway_list, list) {
479 hlist_del_init_rcu(&gw_node->list); 480 hlist_del_init_rcu(&gw_node->list);
480 batadv_gw_node_put(gw_node); 481 batadv_gw_node_put(gw_node);
481 } 482 }
@@ -704,7 +705,7 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv,
704{ 705{
705 struct batadv_neigh_node *neigh_curr = NULL; 706 struct batadv_neigh_node *neigh_curr = NULL;
706 struct batadv_neigh_node *neigh_old = NULL; 707 struct batadv_neigh_node *neigh_old = NULL;
707 struct batadv_orig_node *orig_dst_node = NULL; 708 struct batadv_orig_node *orig_dst_node;
708 struct batadv_gw_node *gw_node = NULL; 709 struct batadv_gw_node *gw_node = NULL;
709 struct batadv_gw_node *curr_gw = NULL; 710 struct batadv_gw_node *curr_gw = NULL;
710 struct batadv_neigh_ifinfo *curr_ifinfo, *old_ifinfo; 711 struct batadv_neigh_ifinfo *curr_ifinfo, *old_ifinfo;
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index 859166d03561..3baa3d466e5e 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index 21184810d89f..33940c5c74a8 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
@@ -253,6 +253,11 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
253 */ 253 */
254void batadv_gw_init(struct batadv_priv *bat_priv) 254void batadv_gw_init(struct batadv_priv *bat_priv)
255{ 255{
256 if (bat_priv->algo_ops->gw.init_sel_class)
257 bat_priv->algo_ops->gw.init_sel_class(bat_priv);
258 else
259 atomic_set(&bat_priv->gw.sel_class, 1);
260
256 batadv_tvlv_handler_register(bat_priv, batadv_gw_tvlv_ogm_handler_v1, 261 batadv_tvlv_handler_register(bat_priv, batadv_gw_tvlv_ogm_handler_v1,
257 NULL, BATADV_TVLV_GW, 1, 262 NULL, BATADV_TVLV_GW, 1,
258 BATADV_TVLV_HANDLER_OGM_CIFNOTFND); 263 BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
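batadv_gw_init() now defers the gateway selection-class default to the routing algorithm when it provides an init_sel_class hook, and only falls back to sel_class = 1 otherwise. For illustration, a hypothetical algorithm could register the hook roughly as below, assuming the algo_ops structure exposes a gw.init_sel_class member as the hunk implies; the function and the chosen default are assumptions, not taken from this patch:

static void batadv_example_gw_init_sel_class(struct batadv_priv *bat_priv)
{
	/* hypothetical default: select gateways using a class value of 50 */
	atomic_set(&bat_priv->gw.sel_class, 50);
}

static struct batadv_algo_ops batadv_example_ops __read_mostly = {
	.name = "EXAMPLE",
	.gw = {
		.init_sel_class = batadv_example_gw_init_sel_class,
	},
};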
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
index 8a5e1ddf1175..0a6a97d201f2 100644
--- a/net/batman-adv/gateway_common.h
+++ b/net/batman-adv/gateway_common.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 08ce36147c4c..e348f76ea8c1 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
@@ -92,8 +92,8 @@ out:
92 * 92 *
93 * Return: result of rtnl_link_ops->get_link_net or @fallback_net 93 * Return: result of rtnl_link_ops->get_link_net or @fallback_net
94 */ 94 */
95static const struct net *batadv_getlink_net(const struct net_device *netdev, 95static struct net *batadv_getlink_net(const struct net_device *netdev,
96 const struct net *fallback_net) 96 struct net *fallback_net)
97{ 97{
98 if (!netdev->rtnl_link_ops) 98 if (!netdev->rtnl_link_ops)
99 return fallback_net; 99 return fallback_net;
@@ -116,9 +116,9 @@ static const struct net *batadv_getlink_net(const struct net_device *netdev,
116 * Return: true if the devices are each others parent, otherwise false 116 * Return: true if the devices are each others parent, otherwise false
117 */ 117 */
118static bool batadv_mutual_parents(const struct net_device *dev1, 118static bool batadv_mutual_parents(const struct net_device *dev1,
119 const struct net *net1, 119 struct net *net1,
120 const struct net_device *dev2, 120 const struct net_device *dev2,
121 const struct net *net2) 121 struct net *net2)
122{ 122{
123 int dev1_parent_iflink = dev_get_iflink(dev1); 123 int dev1_parent_iflink = dev_get_iflink(dev1);
124 int dev2_parent_iflink = dev_get_iflink(dev2); 124 int dev2_parent_iflink = dev_get_iflink(dev2);
@@ -154,7 +154,7 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev)
154{ 154{
155 struct net *net = dev_net(net_dev); 155 struct net *net = dev_net(net_dev);
156 struct net_device *parent_dev; 156 struct net_device *parent_dev;
157 const struct net *parent_net; 157 struct net *parent_net;
158 bool ret; 158 bool ret;
159 159
160 /* check if this is a batman-adv mesh interface */ 160 /* check if this is a batman-adv mesh interface */
@@ -202,13 +202,77 @@ static bool batadv_is_valid_iface(const struct net_device *net_dev)
202} 202}
203 203
204/** 204/**
205 * batadv_is_wifi_netdev - check if the given net_device struct is a wifi 205 * batadv_get_real_netdevice - check if the given netdev struct is a virtual
206 * interface 206 * interface on top of another 'real' interface
207 * @netdev: the device to check
208 *
209 * Callers must hold the rtnl semaphore. You may want batadv_get_real_netdev()
210 * instead of this.
211 *
212 * Return: the 'real' net device or the original net device, or NULL in case
213 * of an error.
214 */
215static struct net_device *batadv_get_real_netdevice(struct net_device *netdev)
216{
217 struct batadv_hard_iface *hard_iface = NULL;
218 struct net_device *real_netdev = NULL;
219 struct net *real_net;
220 struct net *net;
221 int ifindex;
222
223 ASSERT_RTNL();
224
225 if (!netdev)
226 return NULL;
227
228 if (netdev->ifindex == dev_get_iflink(netdev)) {
229 dev_hold(netdev);
230 return netdev;
231 }
232
233 hard_iface = batadv_hardif_get_by_netdev(netdev);
234 if (!hard_iface || !hard_iface->soft_iface)
235 goto out;
236
237 net = dev_net(hard_iface->soft_iface);
238 ifindex = dev_get_iflink(netdev);
239 real_net = batadv_getlink_net(netdev, net);
240 real_netdev = dev_get_by_index(real_net, ifindex);
241
242out:
243 if (hard_iface)
244 batadv_hardif_put(hard_iface);
245 return real_netdev;
246}
247
248/**
249 * batadv_get_real_netdev - check if the given net_device struct is a virtual
250 * interface on top of another 'real' interface
207 * @net_device: the device to check 251 * @net_device: the device to check
208 * 252 *
209 * Return: true if the net device is a 802.11 wireless device, false otherwise. 253 * Return: the 'real' net device or the original net device, or NULL in case
254 * of an error.
210 */ 255 */
211bool batadv_is_wifi_netdev(struct net_device *net_device) 256struct net_device *batadv_get_real_netdev(struct net_device *net_device)
257{
258 struct net_device *real_netdev;
259
260 rtnl_lock();
261 real_netdev = batadv_get_real_netdevice(net_device);
262 rtnl_unlock();
263
264 return real_netdev;
265}
266
267/**
268 * batadv_is_wext_netdev - check if the given net_device struct is a
269 * wext wifi interface
270 * @net_device: the device to check
271 *
272 * Return: true if the net device is a wext wireless device, false
273 * otherwise.
274 */
275static bool batadv_is_wext_netdev(struct net_device *net_device)
212{ 276{
213 if (!net_device) 277 if (!net_device)
214 return false; 278 return false;
@@ -221,6 +285,22 @@ bool batadv_is_wifi_netdev(struct net_device *net_device)
221 return true; 285 return true;
222#endif 286#endif
223 287
288 return false;
289}
290
291/**
292 * batadv_is_cfg80211_netdev - check if the given net_device struct is a
293 * cfg80211 wifi interface
294 * @net_device: the device to check
295 *
296 * Return: true if the net device is a cfg80211 wireless device, false
297 * otherwise.
298 */
299static bool batadv_is_cfg80211_netdev(struct net_device *net_device)
300{
301 if (!net_device)
302 return false;
303
224 /* cfg80211 drivers have to set ieee80211_ptr */ 304 /* cfg80211 drivers have to set ieee80211_ptr */
225 if (net_device->ieee80211_ptr) 305 if (net_device->ieee80211_ptr)
226 return true; 306 return true;
@@ -228,6 +308,125 @@ bool batadv_is_wifi_netdev(struct net_device *net_device)
228 return false; 308 return false;
229} 309}
230 310
311/**
312 * batadv_wifi_flags_evaluate - calculate wifi flags for net_device
313 * @net_device: the device to check
314 *
315 * Return: batadv_hard_iface_wifi_flags flags of the device
316 */
317static u32 batadv_wifi_flags_evaluate(struct net_device *net_device)
318{
319 u32 wifi_flags = 0;
320 struct net_device *real_netdev;
321
322 if (batadv_is_wext_netdev(net_device))
323 wifi_flags |= BATADV_HARDIF_WIFI_WEXT_DIRECT;
324
325 if (batadv_is_cfg80211_netdev(net_device))
326 wifi_flags |= BATADV_HARDIF_WIFI_CFG80211_DIRECT;
327
328 real_netdev = batadv_get_real_netdevice(net_device);
329 if (!real_netdev)
330 return wifi_flags;
331
332 if (real_netdev == net_device)
333 goto out;
334
335 if (batadv_is_wext_netdev(real_netdev))
336 wifi_flags |= BATADV_HARDIF_WIFI_WEXT_INDIRECT;
337
338 if (batadv_is_cfg80211_netdev(real_netdev))
339 wifi_flags |= BATADV_HARDIF_WIFI_CFG80211_INDIRECT;
340
341out:
342 dev_put(real_netdev);
343 return wifi_flags;
344}
345
346/**
347 * batadv_is_cfg80211_hardif - check if the given hardif is a cfg80211 wifi
348 * interface
349 * @hard_iface: the device to check
350 *
351 * Return: true if the net device is a cfg80211 wireless device, false
352 * otherwise.
353 */
354bool batadv_is_cfg80211_hardif(struct batadv_hard_iface *hard_iface)
355{
356 u32 allowed_flags = 0;
357
358 allowed_flags |= BATADV_HARDIF_WIFI_CFG80211_DIRECT;
359 allowed_flags |= BATADV_HARDIF_WIFI_CFG80211_INDIRECT;
360
361 return !!(hard_iface->wifi_flags & allowed_flags);
362}
363
364/**
365 * batadv_is_wifi_hardif - check if the given hardif is a wifi interface
366 * @hard_iface: the device to check
367 *
368 * Return: true if the net device is a 802.11 wireless device, false otherwise.
369 */
370bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface)
371{
372 if (!hard_iface)
373 return false;
374
375 return hard_iface->wifi_flags != 0;
376}
377
378/**
379 * batadv_hardif_no_broadcast - check whether (re)broadcast is necessary
380 * @if_outgoing: the outgoing interface checked and considered for (re)broadcast
381 * @orig_addr: the originator of this packet
382 * @orig_neigh: originator address of the forwarder we just got the packet from
383 * (NULL if we originated)
384 *
385 * Checks whether a packet needs to be (re)broadcasted on the given interface.
386 *
387 * Return:
388 * BATADV_HARDIF_BCAST_NORECIPIENT: No neighbor on interface
389 * BATADV_HARDIF_BCAST_DUPFWD: Just one neighbor, but it is the forwarder
390 * BATADV_HARDIF_BCAST_DUPORIG: Just one neighbor, but it is the originator
391 * BATADV_HARDIF_BCAST_OK: Several neighbors, must broadcast
392 */
393int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing,
394 u8 *orig_addr, u8 *orig_neigh)
395{
396 struct batadv_hardif_neigh_node *hardif_neigh;
397 struct hlist_node *first;
398 int ret = BATADV_HARDIF_BCAST_OK;
399
400 rcu_read_lock();
401
402 /* 0 neighbors -> no (re)broadcast */
403 first = rcu_dereference(hlist_first_rcu(&if_outgoing->neigh_list));
404 if (!first) {
405 ret = BATADV_HARDIF_BCAST_NORECIPIENT;
406 goto out;
407 }
408
409 /* >1 neighbors -> (re)broadcast */
410 if (rcu_dereference(hlist_next_rcu(first)))
411 goto out;
412
413 hardif_neigh = hlist_entry(first, struct batadv_hardif_neigh_node,
414 list);
415
416 /* 1 neighbor, is the originator -> no rebroadcast */
417 if (orig_addr && batadv_compare_eth(hardif_neigh->orig, orig_addr)) {
418 ret = BATADV_HARDIF_BCAST_DUPORIG;
419 /* 1 neighbor, is the one we received from -> no rebroadcast */
420 } else if (orig_neigh &&
421 batadv_compare_eth(hardif_neigh->orig, orig_neigh)) {
422 ret = BATADV_HARDIF_BCAST_DUPFWD;
423 }
424
425out:
426 rcu_read_unlock();
427 return ret;
428}
429
231static struct batadv_hard_iface * 430static struct batadv_hard_iface *
232batadv_hardif_get_active(const struct net_device *soft_iface) 431batadv_hardif_get_active(const struct net_device *soft_iface)
233{ 432{
@@ -697,7 +896,8 @@ batadv_hardif_add_interface(struct net_device *net_dev)
697 kref_init(&hard_iface->refcount); 896 kref_init(&hard_iface->refcount);
698 897
699 hard_iface->num_bcasts = BATADV_NUM_BCASTS_DEFAULT; 898 hard_iface->num_bcasts = BATADV_NUM_BCASTS_DEFAULT;
700 if (batadv_is_wifi_netdev(net_dev)) 899 hard_iface->wifi_flags = batadv_wifi_flags_evaluate(net_dev);
900 if (batadv_is_wifi_hardif(hard_iface))
701 hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; 901 hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS;
702 902
703 batadv_v_hardif_init(hard_iface); 903 batadv_v_hardif_init(hard_iface);
@@ -806,6 +1006,11 @@ static int batadv_hard_if_event(struct notifier_block *this,
806 if (hard_iface == primary_if) 1006 if (hard_iface == primary_if)
807 batadv_primary_if_update_addr(bat_priv, NULL); 1007 batadv_primary_if_update_addr(bat_priv, NULL);
808 break; 1008 break;
1009 case NETDEV_CHANGEUPPER:
1010 hard_iface->wifi_flags = batadv_wifi_flags_evaluate(net_dev);
1011 if (batadv_is_wifi_hardif(hard_iface))
1012 hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS;
1013 break;
809 default: 1014 default:
810 break; 1015 break;
811 } 1016 }
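The per-interface wifi detection is now cached in hard_iface->wifi_flags and split into direct and indirect wext/cfg80211 bits, so callers can distinguish "any kind of wifi" from "wifi that can be queried through cfg80211". A hedged usage sketch (the caller and the numbers are illustrative; the real consumers live in the B.A.T.M.A.N. V code):

/* illustrative consumer of the new wifi flag helpers */
static u32 example_default_throughput(struct batadv_hard_iface *hard_iface)
{
	/* cfg80211 devices can report real per-station rates elsewhere */
	if (batadv_is_cfg80211_hardif(hard_iface))
		return 0;

	/* other wifi devices: assume a modest fixed estimate */
	if (batadv_is_wifi_hardif(hard_iface))
		return 1000;	/* 100 Mbit/s in 100 kbit/s units */

	/* wired: assume 1 Gbit/s */
	return 10000;
}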
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index a76724d369bf..9f9890ff7a22 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
@@ -40,6 +40,20 @@ enum batadv_hard_if_state {
40}; 40};
41 41
42/** 42/**
43 * enum batadv_hard_if_bcast - broadcast avoidance options
44 * @BATADV_HARDIF_BCAST_OK: Do broadcast on according hard interface
45 * @BATADV_HARDIF_BCAST_NORECIPIENT: Broadcast not needed, there is no recipient
46 * @BATADV_HARDIF_BCAST_DUPFWD: There is just the neighbor we got it from
47 * @BATADV_HARDIF_BCAST_DUPORIG: There is just the originator
48 */
49enum batadv_hard_if_bcast {
50 BATADV_HARDIF_BCAST_OK = 0,
51 BATADV_HARDIF_BCAST_NORECIPIENT,
52 BATADV_HARDIF_BCAST_DUPFWD,
53 BATADV_HARDIF_BCAST_DUPORIG,
54};
55
56/**
43 * enum batadv_hard_if_cleanup - Cleanup modi for soft_iface after slave removal 57 * enum batadv_hard_if_cleanup - Cleanup modi for soft_iface after slave removal
44 * @BATADV_IF_CLEANUP_KEEP: Don't automatically delete soft-interface 58 * @BATADV_IF_CLEANUP_KEEP: Don't automatically delete soft-interface
45 * @BATADV_IF_CLEANUP_AUTO: Delete soft-interface after last slave was removed 59 * @BATADV_IF_CLEANUP_AUTO: Delete soft-interface after last slave was removed
@@ -51,8 +65,9 @@ enum batadv_hard_if_cleanup {
51 65
52extern struct notifier_block batadv_hard_if_notifier; 66extern struct notifier_block batadv_hard_if_notifier;
53 67
54bool batadv_is_wifi_netdev(struct net_device *net_device); 68struct net_device *batadv_get_real_netdev(struct net_device *net_device);
55bool batadv_is_wifi_iface(int ifindex); 69bool batadv_is_cfg80211_hardif(struct batadv_hard_iface *hard_iface);
70bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface);
56struct batadv_hard_iface* 71struct batadv_hard_iface*
57batadv_hardif_get_by_netdev(const struct net_device *net_dev); 72batadv_hardif_get_by_netdev(const struct net_device *net_dev);
58int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, 73int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
@@ -63,6 +78,8 @@ void batadv_hardif_remove_interfaces(void);
63int batadv_hardif_min_mtu(struct net_device *soft_iface); 78int batadv_hardif_min_mtu(struct net_device *soft_iface);
64void batadv_update_min_mtu(struct net_device *soft_iface); 79void batadv_update_min_mtu(struct net_device *soft_iface);
65void batadv_hardif_release(struct kref *ref); 80void batadv_hardif_release(struct kref *ref);
81int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing,
82 u8 *orig_addr, u8 *orig_neigh);
66 83
67/** 84/**
68 * batadv_hardif_put - decrement the hard interface refcounter and possibly 85 * batadv_hardif_put - decrement the hard interface refcounter and possibly
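Together with the new enum above, batadv_hardif_no_broadcast() lets the broadcast path skip interfaces where a rebroadcast could not reach anyone new. A hedged sketch of how a sender might act on the return codes (the wrapper function is illustrative only, not code from this patch):

static bool example_should_rebroadcast(struct batadv_hard_iface *hard_iface,
				       u8 *orig_addr, u8 *orig_neigh)
{
	switch (batadv_hardif_no_broadcast(hard_iface, orig_addr, orig_neigh)) {
	case BATADV_HARDIF_BCAST_NORECIPIENT:	/* no neighbor at all */
	case BATADV_HARDIF_BCAST_DUPFWD:	/* only the forwarder listens */
	case BATADV_HARDIF_BCAST_DUPORIG:	/* only the originator listens */
		return false;
	case BATADV_HARDIF_BCAST_OK:
	default:
		return true;
	}
}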
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
index a0a0fdb85805..b5f7e13918ac 100644
--- a/net/batman-adv/hash.c
+++ b/net/batman-adv/hash.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Simon Wunderlich, Marek Lindner 3 * Simon Wunderlich, Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index cbbf87075f06..0c905e91c5e2 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Simon Wunderlich, Marek Lindner 3 * Simon Wunderlich, Marek Lindner
4 * 4 *
@@ -61,36 +61,6 @@ void batadv_hash_set_lock_class(struct batadv_hashtable *hash,
61/* free only the hashtable and the hash itself. */ 61/* free only the hashtable and the hash itself. */
62void batadv_hash_destroy(struct batadv_hashtable *hash); 62void batadv_hash_destroy(struct batadv_hashtable *hash);
63 63
64/* remove the hash structure. if hashdata_free_cb != NULL, this function will be
65 * called to remove the elements inside of the hash. if you don't remove the
66 * elements, memory might be leaked.
67 */
68static inline void batadv_hash_delete(struct batadv_hashtable *hash,
69 batadv_hashdata_free_cb free_cb,
70 void *arg)
71{
72 struct hlist_head *head;
73 struct hlist_node *node, *node_tmp;
74 spinlock_t *list_lock; /* spinlock to protect write access */
75 u32 i;
76
77 for (i = 0; i < hash->size; i++) {
78 head = &hash->table[i];
79 list_lock = &hash->list_locks[i];
80
81 spin_lock_bh(list_lock);
82 hlist_for_each_safe(node, node_tmp, head) {
83 hlist_del_rcu(node);
84
85 if (free_cb)
86 free_cb(node, arg);
87 }
88 spin_unlock_bh(list_lock);
89 }
90
91 batadv_hash_destroy(hash);
92}
93
94/** 64/**
95 * batadv_hash_add - adds data to the hashtable 65 * batadv_hash_add - adds data to the hashtable
96 * @hash: storage hash table 66 * @hash: storage hash table
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index 378cc1119d66..6308c9f0fd96 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
@@ -38,7 +38,6 @@
38#include <linux/skbuff.h> 38#include <linux/skbuff.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/spinlock.h> 40#include <linux/spinlock.h>
41#include <linux/stat.h>
42#include <linux/stddef.h> 41#include <linux/stddef.h>
43#include <linux/string.h> 42#include <linux/string.h>
44#include <linux/uaccess.h> 43#include <linux/uaccess.h>
@@ -322,8 +321,8 @@ int batadv_socket_setup(struct batadv_priv *bat_priv)
322 if (!bat_priv->debug_dir) 321 if (!bat_priv->debug_dir)
323 goto err; 322 goto err;
324 323
325 d = debugfs_create_file(BATADV_ICMP_SOCKET, S_IFREG | S_IWUSR | S_IRUSR, 324 d = debugfs_create_file(BATADV_ICMP_SOCKET, 0600, bat_priv->debug_dir,
326 bat_priv->debug_dir, bat_priv, &batadv_fops); 325 bat_priv, &batadv_fops);
327 if (!d) 326 if (!d)
328 goto err; 327 goto err;
329 328
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
index e44a7da51431..f3fec40aae86 100644
--- a/net/batman-adv/icmp_socket.h
+++ b/net/batman-adv/icmp_socket.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index 56dc532f7a2c..4ef4bde2cc2d 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
@@ -31,7 +31,6 @@
31#include <linux/sched.h> /* for linux/wait.h */ 31#include <linux/sched.h> /* for linux/wait.h */
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/spinlock.h> 33#include <linux/spinlock.h>
34#include <linux/stat.h>
35#include <linux/stddef.h> 34#include <linux/stddef.h>
36#include <linux/types.h> 35#include <linux/types.h>
37#include <linux/uaccess.h> 36#include <linux/uaccess.h>
@@ -212,8 +211,7 @@ int batadv_debug_log_setup(struct batadv_priv *bat_priv)
212 spin_lock_init(&bat_priv->debug_log->lock); 211 spin_lock_init(&bat_priv->debug_log->lock);
213 init_waitqueue_head(&bat_priv->debug_log->queue_wait); 212 init_waitqueue_head(&bat_priv->debug_log->queue_wait);
214 213
215 d = debugfs_create_file("log", S_IFREG | S_IRUSR, 214 d = debugfs_create_file("log", 0400, bat_priv->debug_dir, bat_priv,
216 bat_priv->debug_dir, bat_priv,
217 &batadv_log_fops); 215 &batadv_log_fops);
218 if (!d) 216 if (!d)
219 goto err; 217 goto err;
diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h
index d2905a855d1b..7a2b9f4da078 100644
--- a/net/batman-adv/log.h
+++ b/net/batman-adv/log.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
@@ -71,12 +71,12 @@ int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...)
71__printf(2, 3); 71__printf(2, 3);
72 72
73/* possibly ratelimited debug output */ 73/* possibly ratelimited debug output */
74#define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) \ 74#define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) \
75 do { \ 75 do { \
76 if (atomic_read(&bat_priv->log_level) & type && \ 76 if (atomic_read(&(bat_priv)->log_level) & (type) && \
77 (!ratelimited || net_ratelimit())) \ 77 (!(ratelimited) || net_ratelimit())) \
78 batadv_debug_log(bat_priv, fmt, ## arg);\ 78 batadv_debug_log(bat_priv, fmt, ## arg); \
79 } \ 79 } \
80 while (0) 80 while (0)
81#else /* !CONFIG_BATMAN_ADV_DEBUG */ 81#else /* !CONFIG_BATMAN_ADV_DEBUG */
82__printf(4, 5) 82__printf(4, 5)
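The added parentheses around the macro parameters are not cosmetic: without them an argument containing operators can bind to the surrounding expression. A small stand-alone illustration of the failure mode this prevents (plain C, not kernel code):

#define BAD_MASK_TEST(flags, type)	((flags) & type)
#define GOOD_MASK_TEST(flags, type)	((flags) & (type))

/* BAD_MASK_TEST(f, A | B) expands to ((f) & A | B); since '&' binds
 * tighter than '|', B is OR-ed in unconditionally and the test is
 * non-zero whenever B is, regardless of f.
 */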
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 2c017ab47557..5000c540614d 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
@@ -23,6 +23,7 @@
23#include <linux/crc32c.h> 23#include <linux/crc32c.h>
24#include <linux/errno.h> 24#include <linux/errno.h>
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/genetlink.h>
26#include <linux/if_ether.h> 27#include <linux/if_ether.h>
27#include <linux/if_vlan.h> 28#include <linux/if_vlan.h>
28#include <linux/init.h> 29#include <linux/init.h>
@@ -44,6 +45,7 @@
44#include <linux/workqueue.h> 45#include <linux/workqueue.h>
45#include <net/dsfield.h> 46#include <net/dsfield.h>
46#include <net/rtnetlink.h> 47#include <net/rtnetlink.h>
48#include <uapi/linux/batman_adv.h>
47 49
48#include "bat_algo.h" 50#include "bat_algo.h"
49#include "bat_iv_ogm.h" 51#include "bat_iv_ogm.h"
@@ -160,7 +162,7 @@ int batadv_mesh_init(struct net_device *soft_iface)
160 162
161 INIT_HLIST_HEAD(&bat_priv->forw_bat_list); 163 INIT_HLIST_HEAD(&bat_priv->forw_bat_list);
162 INIT_HLIST_HEAD(&bat_priv->forw_bcast_list); 164 INIT_HLIST_HEAD(&bat_priv->forw_bcast_list);
163 INIT_HLIST_HEAD(&bat_priv->gw.list); 165 INIT_HLIST_HEAD(&bat_priv->gw.gateway_list);
164#ifdef CONFIG_BATMAN_ADV_MCAST 166#ifdef CONFIG_BATMAN_ADV_MCAST
165 INIT_HLIST_HEAD(&bat_priv->mcast.want_all_unsnoopables_list); 167 INIT_HLIST_HEAD(&bat_priv->mcast.want_all_unsnoopables_list);
166 INIT_HLIST_HEAD(&bat_priv->mcast.want_all_ipv4_list); 168 INIT_HLIST_HEAD(&bat_priv->mcast.want_all_ipv4_list);
@@ -402,6 +404,8 @@ void batadv_skb_set_priority(struct sk_buff *skb, int offset)
402static int batadv_recv_unhandled_packet(struct sk_buff *skb, 404static int batadv_recv_unhandled_packet(struct sk_buff *skb,
403 struct batadv_hard_iface *recv_if) 405 struct batadv_hard_iface *recv_if)
404{ 406{
407 kfree_skb(skb);
408
405 return NET_RX_DROP; 409 return NET_RX_DROP;
406} 410}
407 411
@@ -416,7 +420,6 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
416 struct batadv_ogm_packet *batadv_ogm_packet; 420 struct batadv_ogm_packet *batadv_ogm_packet;
417 struct batadv_hard_iface *hard_iface; 421 struct batadv_hard_iface *hard_iface;
418 u8 idx; 422 u8 idx;
419 int ret;
420 423
421 hard_iface = container_of(ptype, struct batadv_hard_iface, 424 hard_iface = container_of(ptype, struct batadv_hard_iface,
422 batman_adv_ptype); 425 batman_adv_ptype);
@@ -466,14 +469,8 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
466 /* reset control block to avoid left overs from previous users */ 469 /* reset control block to avoid left overs from previous users */
467 memset(skb->cb, 0, sizeof(struct batadv_skb_cb)); 470 memset(skb->cb, 0, sizeof(struct batadv_skb_cb));
468 471
469 /* all receive handlers return whether they received or reused
470 * the supplied skb. if not, we have to free the skb.
471 */
472 idx = batadv_ogm_packet->packet_type; 472 idx = batadv_ogm_packet->packet_type;
473 ret = (*batadv_rx_handler[idx])(skb, hard_iface); 473 (*batadv_rx_handler[idx])(skb, hard_iface);
474
475 if (ret == NET_RX_DROP)
476 kfree_skb(skb);
477 474
478 batadv_hardif_put(hard_iface); 475 batadv_hardif_put(hard_iface);
479 476
@@ -653,3 +650,4 @@ MODULE_DESCRIPTION(BATADV_DRIVER_DESC);
653MODULE_SUPPORTED_DEVICE(BATADV_DRIVER_DEVICE); 650MODULE_SUPPORTED_DEVICE(BATADV_DRIVER_DEVICE);
654MODULE_VERSION(BATADV_SOURCE_VERSION); 651MODULE_VERSION(BATADV_SOURCE_VERSION);
655MODULE_ALIAS_RTNL_LINK("batadv"); 652MODULE_ALIAS_RTNL_LINK("batadv");
653MODULE_ALIAS_GENL_FAMILY(BATADV_NL_NAME);
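The receive-path change above also inverts skb ownership: batadv_batman_skb_recv() no longer frees on NET_RX_DROP, so every registered rx handler, including the unhandled-packet fallback, must free or consume the buffer itself. A hedged sketch of the handler contract after this patch (the validation helper is hypothetical):

/* illustrative handler skeleton following the new ownership rule */
static int example_recv_handler(struct sk_buff *skb,
				struct batadv_hard_iface *recv_if)
{
	if (!example_packet_is_valid(skb)) {	/* hypothetical check */
		kfree_skb(skb);		/* the handler frees on drop ... */
		return NET_RX_DROP;
	}

	netif_rx(skb);			/* ... or passes ownership onwards */
	return NET_RX_SUCCESS;
}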
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 09af21e27639..57a8103dbce7 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
@@ -24,7 +24,7 @@
24#define BATADV_DRIVER_DEVICE "batman-adv" 24#define BATADV_DRIVER_DEVICE "batman-adv"
25 25
26#ifndef BATADV_SOURCE_VERSION 26#ifndef BATADV_SOURCE_VERSION
27#define BATADV_SOURCE_VERSION "2016.4" 27#define BATADV_SOURCE_VERSION "2017.0"
28#endif 28#endif
29 29
30/* B.A.T.M.A.N. parameters */ 30/* B.A.T.M.A.N. parameters */
@@ -48,6 +48,7 @@
48#define BATADV_TT_CLIENT_TEMP_TIMEOUT 600000 /* in milliseconds */ 48#define BATADV_TT_CLIENT_TEMP_TIMEOUT 600000 /* in milliseconds */
49#define BATADV_TT_WORK_PERIOD 5000 /* 5 seconds */ 49#define BATADV_TT_WORK_PERIOD 5000 /* 5 seconds */
50#define BATADV_ORIG_WORK_PERIOD 1000 /* 1 second */ 50#define BATADV_ORIG_WORK_PERIOD 1000 /* 1 second */
51#define BATADV_MCAST_WORK_PERIOD 500 /* 0.5 seconds */
51#define BATADV_DAT_ENTRY_TIMEOUT (5 * 60000) /* 5 mins in milliseconds */ 52#define BATADV_DAT_ENTRY_TIMEOUT (5 * 60000) /* 5 mins in milliseconds */
52/* sliding packet range of received originator messages in sequence numbers 53/* sliding packet range of received originator messages in sequence numbers
53 * (should be a multiple of our word size) 54 * (should be a multiple of our word size)
@@ -185,7 +186,6 @@ enum batadv_uev_type {
185 186
186#include <linux/bitops.h> /* for packet.h */ 187#include <linux/bitops.h> /* for packet.h */
187#include <linux/compiler.h> 188#include <linux/compiler.h>
188#include <linux/cpumask.h>
189#include <linux/etherdevice.h> 189#include <linux/etherdevice.h>
190#include <linux/if_ether.h> /* for packet.h */ 190#include <linux/if_ether.h> /* for packet.h */
191#include <linux/if_vlan.h> 191#include <linux/if_vlan.h>
@@ -200,8 +200,8 @@ struct packet_type;
200struct seq_file; 200struct seq_file;
201struct sk_buff; 201struct sk_buff;
202 202
203#define BATADV_PRINT_VID(vid) ((vid & BATADV_VLAN_HAS_TAG) ? \ 203#define BATADV_PRINT_VID(vid) (((vid) & BATADV_VLAN_HAS_TAG) ? \
204 (int)(vid & VLAN_VID_MASK) : -1) 204 (int)((vid) & VLAN_VID_MASK) : -1)
205 205
206extern struct list_head batadv_hardif_list; 206extern struct list_head batadv_hardif_list;
207 207
@@ -284,26 +284,6 @@ static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx,
284 284
285#define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1) 285#define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1)
286 286
287/**
288 * batadv_sum_counter - Sum the cpu-local counters for index 'idx'
289 * @bat_priv: the bat priv with all the soft interface information
290 * @idx: index of counter to sum up
291 *
292 * Return: sum of all cpu-local counters
293 */
294static inline u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx)
295{
296 u64 *counters, sum = 0;
297 int cpu;
298
299 for_each_possible_cpu(cpu) {
300 counters = per_cpu_ptr(bat_priv->bat_counters, cpu);
301 sum += counters[idx];
302 }
303
304 return sum;
305}
306
307/* Define a macro to reach the control buffer of the skb. The members of the 287/* Define a macro to reach the control buffer of the skb. The members of the
308 * control buffer are defined in struct batadv_skb_cb in types.h. 288 * control buffer are defined in struct batadv_skb_cb in types.h.
309 * The macro is inspired by the similar macro TCP_SKB_CB() in tcp.h. 289 * The macro is inspired by the similar macro TCP_SKB_CB() in tcp.h.
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 13661f43386f..952ba81a565b 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2014-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2014-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Linus Lüssing 3 * Linus Lüssing
4 * 4 *
@@ -33,6 +33,7 @@
33#include <linux/in6.h> 33#include <linux/in6.h>
34#include <linux/ip.h> 34#include <linux/ip.h>
35#include <linux/ipv6.h> 35#include <linux/ipv6.h>
36#include <linux/jiffies.h>
36#include <linux/kernel.h> 37#include <linux/kernel.h>
37#include <linux/kref.h> 38#include <linux/kref.h>
38#include <linux/list.h> 39#include <linux/list.h>
@@ -48,6 +49,7 @@
48#include <linux/stddef.h> 49#include <linux/stddef.h>
49#include <linux/string.h> 50#include <linux/string.h>
50#include <linux/types.h> 51#include <linux/types.h>
52#include <linux/workqueue.h>
51#include <net/addrconf.h> 53#include <net/addrconf.h>
52#include <net/if_inet6.h> 54#include <net/if_inet6.h>
53#include <net/ip.h> 55#include <net/ip.h>
@@ -60,6 +62,18 @@
60#include "translation-table.h" 62#include "translation-table.h"
61#include "tvlv.h" 63#include "tvlv.h"
62 64
65static void batadv_mcast_mla_update(struct work_struct *work);
66
67/**
68 * batadv_mcast_start_timer - schedule the multicast periodic worker
69 * @bat_priv: the bat priv with all the soft interface information
70 */
71static void batadv_mcast_start_timer(struct batadv_priv *bat_priv)
72{
73 queue_delayed_work(batadv_event_workqueue, &bat_priv->mcast.work,
74 msecs_to_jiffies(BATADV_MCAST_WORK_PERIOD));
75}
76
63/** 77/**
64 * batadv_mcast_get_bridge - get the bridge on top of the softif if it exists 78 * batadv_mcast_get_bridge - get the bridge on top of the softif if it exists
65 * @soft_iface: netdev struct of the mesh interface 79 * @soft_iface: netdev struct of the mesh interface
@@ -231,19 +245,15 @@ out:
231 245
232/** 246/**
233 * batadv_mcast_mla_list_free - free a list of multicast addresses 247 * batadv_mcast_mla_list_free - free a list of multicast addresses
234 * @bat_priv: the bat priv with all the soft interface information
235 * @mcast_list: the list to free 248 * @mcast_list: the list to free
236 * 249 *
237 * Removes and frees all items in the given mcast_list. 250 * Removes and frees all items in the given mcast_list.
238 */ 251 */
239static void batadv_mcast_mla_list_free(struct batadv_priv *bat_priv, 252static void batadv_mcast_mla_list_free(struct hlist_head *mcast_list)
240 struct hlist_head *mcast_list)
241{ 253{
242 struct batadv_hw_addr *mcast_entry; 254 struct batadv_hw_addr *mcast_entry;
243 struct hlist_node *tmp; 255 struct hlist_node *tmp;
244 256
245 lockdep_assert_held(&bat_priv->tt.commit_lock);
246
247 hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) { 257 hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) {
248 hlist_del(&mcast_entry->list); 258 hlist_del(&mcast_entry->list);
249 kfree(mcast_entry); 259 kfree(mcast_entry);
@@ -259,6 +269,8 @@ static void batadv_mcast_mla_list_free(struct batadv_priv *bat_priv,
259 * translation table except the ones listed in the given mcast_list. 269 * translation table except the ones listed in the given mcast_list.
260 * 270 *
261 * If mcast_list is NULL then all are retracted. 271 * If mcast_list is NULL then all are retracted.
272 *
273 * Do not call outside of the mcast worker! (or cancel mcast worker first)
262 */ 274 */
263static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv, 275static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv,
264 struct hlist_head *mcast_list) 276 struct hlist_head *mcast_list)
@@ -266,7 +278,7 @@ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv,
266 struct batadv_hw_addr *mcast_entry; 278 struct batadv_hw_addr *mcast_entry;
267 struct hlist_node *tmp; 279 struct hlist_node *tmp;
268 280
269 lockdep_assert_held(&bat_priv->tt.commit_lock); 281 WARN_ON(delayed_work_pending(&bat_priv->mcast.work));
270 282
271 hlist_for_each_entry_safe(mcast_entry, tmp, &bat_priv->mcast.mla_list, 283 hlist_for_each_entry_safe(mcast_entry, tmp, &bat_priv->mcast.mla_list,
272 list) { 284 list) {
@@ -291,6 +303,8 @@ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv,
291 * 303 *
292 * Adds multicast listener announcements from the given mcast_list to the 304 * Adds multicast listener announcements from the given mcast_list to the
293 * translation table if they have not been added yet. 305 * translation table if they have not been added yet.
306 *
307 * Do not call outside of the mcast worker! (or cancel mcast worker first)
294 */ 308 */
295static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv, 309static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv,
296 struct hlist_head *mcast_list) 310 struct hlist_head *mcast_list)
@@ -298,7 +312,7 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv,
298 struct batadv_hw_addr *mcast_entry; 312 struct batadv_hw_addr *mcast_entry;
299 struct hlist_node *tmp; 313 struct hlist_node *tmp;
300 314
301 lockdep_assert_held(&bat_priv->tt.commit_lock); 315 WARN_ON(delayed_work_pending(&bat_priv->mcast.work));
302 316
303 if (!mcast_list) 317 if (!mcast_list)
304 return; 318 return;
@@ -532,13 +546,18 @@ update:
532} 546}
533 547
534/** 548/**
535 * batadv_mcast_mla_update - update the own MLAs 549 * __batadv_mcast_mla_update - update the own MLAs
536 * @bat_priv: the bat priv with all the soft interface information 550 * @bat_priv: the bat priv with all the soft interface information
537 * 551 *
538 * Updates the own multicast listener announcements in the translation 552 * Updates the own multicast listener announcements in the translation
539 * table as well as the own, announced multicast tvlv container. 553 * table as well as the own, announced multicast tvlv container.
554 *
555 * Note that non-conflicting reads and writes to bat_priv->mcast.mla_list
556 * in batadv_mcast_mla_tt_retract() and batadv_mcast_mla_tt_add() are
557 * ensured by the non-parallel execution of the worker this function
558 * belongs to.
540 */ 559 */
541void batadv_mcast_mla_update(struct batadv_priv *bat_priv) 560static void __batadv_mcast_mla_update(struct batadv_priv *bat_priv)
542{ 561{
543 struct net_device *soft_iface = bat_priv->soft_iface; 562 struct net_device *soft_iface = bat_priv->soft_iface;
544 struct hlist_head mcast_list = HLIST_HEAD_INIT; 563 struct hlist_head mcast_list = HLIST_HEAD_INIT;
@@ -560,7 +579,30 @@ update:
560 batadv_mcast_mla_tt_add(bat_priv, &mcast_list); 579 batadv_mcast_mla_tt_add(bat_priv, &mcast_list);
561 580
562out: 581out:
563 batadv_mcast_mla_list_free(bat_priv, &mcast_list); 582 batadv_mcast_mla_list_free(&mcast_list);
583}
584
585/**
586 * batadv_mcast_mla_update - update the own MLAs
587 * @work: kernel work struct
588 *
589 * Updates the own multicast listener announcements in the translation
590 * table as well as the own, announced multicast tvlv container.
591 *
592 * In the end, reschedules the work timer.
593 */
594static void batadv_mcast_mla_update(struct work_struct *work)
595{
596 struct delayed_work *delayed_work;
597 struct batadv_priv_mcast *priv_mcast;
598 struct batadv_priv *bat_priv;
599
600 delayed_work = to_delayed_work(work);
601 priv_mcast = container_of(delayed_work, struct batadv_priv_mcast, work);
602 bat_priv = container_of(priv_mcast, struct batadv_priv, mcast);
603
604 __batadv_mcast_mla_update(bat_priv);
605 batadv_mcast_start_timer(bat_priv);
564} 606}
565 607
566/** 608/**
@@ -1132,6 +1174,9 @@ void batadv_mcast_init(struct batadv_priv *bat_priv)
1132 batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler, 1174 batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler,
1133 NULL, BATADV_TVLV_MCAST, 2, 1175 NULL, BATADV_TVLV_MCAST, 2,
1134 BATADV_TVLV_HANDLER_OGM_CIFNOTFND); 1176 BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
1177
1178 INIT_DELAYED_WORK(&bat_priv->mcast.work, batadv_mcast_mla_update);
1179 batadv_mcast_start_timer(bat_priv);
1135} 1180}
1136 1181
1137#ifdef CONFIG_BATMAN_ADV_DEBUGFS 1182#ifdef CONFIG_BATMAN_ADV_DEBUGFS
@@ -1243,12 +1288,13 @@ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset)
1243 */ 1288 */
1244void batadv_mcast_free(struct batadv_priv *bat_priv) 1289void batadv_mcast_free(struct batadv_priv *bat_priv)
1245{ 1290{
1291 cancel_delayed_work_sync(&bat_priv->mcast.work);
1292
1246 batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 2); 1293 batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 2);
1247 batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 2); 1294 batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 2);
1248 1295
1249 spin_lock_bh(&bat_priv->tt.commit_lock); 1296 /* safely calling outside of worker, as worker was canceled above */
1250 batadv_mcast_mla_tt_retract(bat_priv, NULL); 1297 batadv_mcast_mla_tt_retract(bat_priv, NULL);
1251 spin_unlock_bh(&bat_priv->tt.commit_lock);
1252} 1298}
1253 1299
1254/** 1300/**
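The multicast.c hunks above replace the call-in from the TT commit path with a periodically self-rescheduling worker. A minimal sketch of that delayed-work pattern in a generic module context (all names below are illustrative, not batman-adv symbols):

	#include <linux/jiffies.h>
	#include <linux/workqueue.h>

	struct my_priv {
		struct delayed_work work;
	};

	static void my_start_timer(struct my_priv *priv)
	{
		queue_delayed_work(system_wq, &priv->work, msecs_to_jiffies(500));
	}

	static void my_worker(struct work_struct *work)
	{
		struct delayed_work *dwork = to_delayed_work(work);
		struct my_priv *priv = container_of(dwork, struct my_priv, work);

		/* periodic update goes here; state touched only from this
		 * worker needs no extra locking because the work item never
		 * runs in parallel with itself
		 */

		my_start_timer(priv);	/* reschedule ourselves */
	}

	static void my_init(struct my_priv *priv)
	{
		INIT_DELAYED_WORK(&priv->work, my_worker);
		my_start_timer(priv);
	}

	static void my_free(struct my_priv *priv)
	{
		/* after this, worker-only state may be touched directly */
		cancel_delayed_work_sync(&priv->work);
	}

Because the worker never runs concurrently with itself, state touched only from the worker (mcast.mla_list here) needs no external lock, which is why the lockdep_assert_held() calls could become WARN_ON(delayed_work_pending()) checks and why batadv_mcast_free() may call batadv_mcast_mla_tt_retract() directly once the work was cancelled.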
diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
index 1fb00ba84907..2a78cddab0e9 100644
--- a/net/batman-adv/multicast.h
+++ b/net/batman-adv/multicast.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2014-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2014-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Linus Lüssing 3 * Linus Lüssing
4 * 4 *
@@ -39,8 +39,6 @@ enum batadv_forw_mode {
39 39
40#ifdef CONFIG_BATMAN_ADV_MCAST 40#ifdef CONFIG_BATMAN_ADV_MCAST
41 41
42void batadv_mcast_mla_update(struct batadv_priv *bat_priv);
43
44enum batadv_forw_mode 42enum batadv_forw_mode
45batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, 43batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
46 struct batadv_orig_node **mcast_single_orig); 44 struct batadv_orig_node **mcast_single_orig);
@@ -55,10 +53,6 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node);
55 53
56#else 54#else
57 55
58static inline void batadv_mcast_mla_update(struct batadv_priv *bat_priv)
59{
60}
61
62static inline enum batadv_forw_mode 56static inline enum batadv_forw_mode
63batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, 57batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
64 struct batadv_orig_node **mcast_single_orig) 58 struct batadv_orig_node **mcast_single_orig)
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 64cb6acbe0a6..ab13b4d58733 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Matthias Schiffer 3 * Matthias Schiffer
4 * 4 *
@@ -20,11 +20,14 @@
20 20
21#include <linux/atomic.h> 21#include <linux/atomic.h>
22#include <linux/byteorder/generic.h> 22#include <linux/byteorder/generic.h>
23#include <linux/cache.h>
23#include <linux/errno.h> 24#include <linux/errno.h>
25#include <linux/export.h>
24#include <linux/fs.h> 26#include <linux/fs.h>
25#include <linux/genetlink.h> 27#include <linux/genetlink.h>
26#include <linux/if_ether.h> 28#include <linux/if_ether.h>
27#include <linux/init.h> 29#include <linux/init.h>
30#include <linux/kernel.h>
28#include <linux/netdevice.h> 31#include <linux/netdevice.h>
29#include <linux/netlink.h> 32#include <linux/netlink.h>
30#include <linux/printk.h> 33#include <linux/printk.h>
@@ -48,14 +51,7 @@
48#include "tp_meter.h" 51#include "tp_meter.h"
49#include "translation-table.h" 52#include "translation-table.h"
50 53
51struct genl_family batadv_netlink_family = { 54struct genl_family batadv_netlink_family;
52 .id = GENL_ID_GENERATE,
53 .hdrsize = 0,
54 .name = BATADV_NL_NAME,
55 .version = 1,
56 .maxattr = BATADV_ATTR_MAX,
57 .netnsok = true,
58};
59 55
60/* multicast groups */ 56/* multicast groups */
61enum batadv_netlink_multicast_groups { 57enum batadv_netlink_multicast_groups {
@@ -534,7 +530,7 @@ batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb)
534 return msg->len; 530 return msg->len;
535} 531}
536 532
537static struct genl_ops batadv_netlink_ops[] = { 533static const struct genl_ops batadv_netlink_ops[] = {
538 { 534 {
539 .cmd = BATADV_CMD_GET_MESH_INFO, 535 .cmd = BATADV_CMD_GET_MESH_INFO,
540 .flags = GENL_ADMIN_PERM, 536 .flags = GENL_ADMIN_PERM,
@@ -610,6 +606,19 @@ static struct genl_ops batadv_netlink_ops[] = {
610 606
611}; 607};
612 608
609struct genl_family batadv_netlink_family __ro_after_init = {
610 .hdrsize = 0,
611 .name = BATADV_NL_NAME,
612 .version = 1,
613 .maxattr = BATADV_ATTR_MAX,
614 .netnsok = true,
615 .module = THIS_MODULE,
616 .ops = batadv_netlink_ops,
617 .n_ops = ARRAY_SIZE(batadv_netlink_ops),
618 .mcgrps = batadv_netlink_mcgrps,
619 .n_mcgrps = ARRAY_SIZE(batadv_netlink_mcgrps),
620};
621
613/** 622/**
614 * batadv_netlink_register - register batadv genl netlink family 623 * batadv_netlink_register - register batadv genl netlink family
615 */ 624 */
@@ -617,9 +626,7 @@ void __init batadv_netlink_register(void)
617{ 626{
618 int ret; 627 int ret;
619 628
620 ret = genl_register_family_with_ops_groups(&batadv_netlink_family, 629 ret = genl_register_family(&batadv_netlink_family);
621 batadv_netlink_ops,
622 batadv_netlink_mcgrps);
623 if (ret) 630 if (ret)
624 pr_warn("unable to register netlink family"); 631 pr_warn("unable to register netlink family");
625} 632}
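The netlink.c hunks track the 4.10-era genetlink API rework: the family no longer asks for GENL_ID_GENERATE, carries its ops and multicast groups itself, and is registered with genl_register_family(). A hedged sketch of that registration style for a hypothetical family (not the batadv definitions):

	#include <linux/cache.h>
	#include <linux/export.h>
	#include <linux/init.h>
	#include <linux/kernel.h>
	#include <net/genetlink.h>

	static int my_doit(struct sk_buff *skb, struct genl_info *info)
	{
		return 0;
	}

	static const struct genl_ops my_genl_ops[] = {
		{
			.cmd = 1,
			.flags = GENL_ADMIN_PERM,
			.doit = my_doit,
		},
	};

	static const struct genl_multicast_group my_genl_mcgrps[] = {
		{ .name = "events" },
	};

	static struct genl_family my_genl_family __ro_after_init = {
		.hdrsize = 0,
		.name = "my_family",
		.version = 1,
		.maxattr = 0,
		.netnsok = true,
		.module = THIS_MODULE,
		.ops = my_genl_ops,
		.n_ops = ARRAY_SIZE(my_genl_ops),
		.mcgrps = my_genl_mcgrps,
		.n_mcgrps = ARRAY_SIZE(my_genl_mcgrps),
	};

	static int __init my_netlink_register(void)
	{
		/* the family id is now always assigned by the genetlink core */
		return genl_register_family(&my_genl_family);
	}

Marking the family __ro_after_init works because the core only writes to it (the assigned id) during registration from an __init context, before the section is sealed read-only.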
diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h
index 52eb16281aba..f1cd8c5da966 100644
--- a/net/batman-adv/netlink.h
+++ b/net/batman-adv/netlink.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Matthias Schiffer 3 * Matthias Schiffer
4 * 4 *
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index e3baf697a35c..e1f6fc72fe3e 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Martin Hundebøll, Jeppe Ledet-Pedersen 3 * Martin Hundebøll, Jeppe Ledet-Pedersen
4 * 4 *
@@ -44,7 +44,6 @@
44#include <linux/skbuff.h> 44#include <linux/skbuff.h>
45#include <linux/slab.h> 45#include <linux/slab.h>
46#include <linux/spinlock.h> 46#include <linux/spinlock.h>
47#include <linux/stat.h>
48#include <linux/stddef.h> 47#include <linux/stddef.h>
49#include <linux/string.h> 48#include <linux/string.h>
50#include <linux/workqueue.h> 49#include <linux/workqueue.h>
@@ -261,10 +260,16 @@ static void batadv_nc_path_put(struct batadv_nc_path *nc_path)
261/** 260/**
262 * batadv_nc_packet_free - frees nc packet 261 * batadv_nc_packet_free - frees nc packet
263 * @nc_packet: the nc packet to free 262 * @nc_packet: the nc packet to free
263 * @dropped: whether the packet is freed because it is dropped
264 */ 264 */
265static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet) 265static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet,
266 bool dropped)
266{ 267{
267 kfree_skb(nc_packet->skb); 268 if (dropped)
269 kfree_skb(nc_packet->skb);
270 else
271 consume_skb(nc_packet->skb);
272
268 batadv_nc_path_put(nc_packet->nc_path); 273 batadv_nc_path_put(nc_packet->nc_path);
269 kfree(nc_packet); 274 kfree(nc_packet);
270} 275}
@@ -577,7 +582,7 @@ static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet)
577{ 582{
578 batadv_send_unicast_skb(nc_packet->skb, nc_packet->neigh_node); 583 batadv_send_unicast_skb(nc_packet->skb, nc_packet->neigh_node);
579 nc_packet->skb = NULL; 584 nc_packet->skb = NULL;
580 batadv_nc_packet_free(nc_packet); 585 batadv_nc_packet_free(nc_packet, false);
581} 586}
582 587
583/** 588/**
@@ -611,7 +616,7 @@ static bool batadv_nc_sniffed_purge(struct batadv_priv *bat_priv,
611 616
612 /* purge nc packet */ 617 /* purge nc packet */
613 list_del(&nc_packet->list); 618 list_del(&nc_packet->list);
614 batadv_nc_packet_free(nc_packet); 619 batadv_nc_packet_free(nc_packet, true);
615 620
616 res = true; 621 res = true;
617 622
@@ -1209,11 +1214,11 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
1209 } 1214 }
1210 1215
1211 /* skb_src is now coded into skb_dest, so free it */ 1216 /* skb_src is now coded into skb_dest, so free it */
1212 kfree_skb(skb_src); 1217 consume_skb(skb_src);
1213 1218
1214 /* avoid duplicate free of skb from nc_packet */ 1219 /* avoid duplicate free of skb from nc_packet */
1215 nc_packet->skb = NULL; 1220 nc_packet->skb = NULL;
1216 batadv_nc_packet_free(nc_packet); 1221 batadv_nc_packet_free(nc_packet, false);
1217 1222
1218 /* Send the coded packet and return true */ 1223 /* Send the coded packet and return true */
1219 batadv_send_unicast_skb(skb_dest, first_dest); 1224 batadv_send_unicast_skb(skb_dest, first_dest);
@@ -1400,7 +1405,7 @@ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv,
1400 /* batadv_nc_skb_store_for_decoding() clones the skb, so we must free 1405 /* batadv_nc_skb_store_for_decoding() clones the skb, so we must free
1401 * our ref 1406 * our ref
1402 */ 1407 */
1403 kfree_skb(skb); 1408 consume_skb(skb);
1404} 1409}
1405 1410
1406/** 1411/**
@@ -1724,7 +1729,7 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb,
1724 ether_addr_copy(unicast_packet->dest, orig_dest); 1729 ether_addr_copy(unicast_packet->dest, orig_dest);
1725 unicast_packet->ttvn = ttvn; 1730 unicast_packet->ttvn = ttvn;
1726 1731
1727 batadv_nc_packet_free(nc_packet); 1732 batadv_nc_packet_free(nc_packet, false);
1728 return unicast_packet; 1733 return unicast_packet;
1729} 1734}
1730 1735
@@ -1814,11 +1819,11 @@ static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
1814 1819
1815 /* Check if network coding is enabled */ 1820 /* Check if network coding is enabled */
1816 if (!atomic_read(&bat_priv->network_coding)) 1821 if (!atomic_read(&bat_priv->network_coding))
1817 return NET_RX_DROP; 1822 goto free_skb;
1818 1823
1819 /* Make sure we can access (and remove) header */ 1824 /* Make sure we can access (and remove) header */
1820 if (unlikely(!pskb_may_pull(skb, hdr_size))) 1825 if (unlikely(!pskb_may_pull(skb, hdr_size)))
1821 return NET_RX_DROP; 1826 goto free_skb;
1822 1827
1823 coded_packet = (struct batadv_coded_packet *)skb->data; 1828 coded_packet = (struct batadv_coded_packet *)skb->data;
1824 ethhdr = eth_hdr(skb); 1829 ethhdr = eth_hdr(skb);
@@ -1826,7 +1831,7 @@ static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
1826 /* Verify frame is destined for us */ 1831 /* Verify frame is destined for us */
1827 if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest) && 1832 if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest) &&
1828 !batadv_is_my_mac(bat_priv, coded_packet->second_dest)) 1833 !batadv_is_my_mac(bat_priv, coded_packet->second_dest))
1829 return NET_RX_DROP; 1834 goto free_skb;
1830 1835
1831 /* Update stat counter */ 1836 /* Update stat counter */
1832 if (batadv_is_my_mac(bat_priv, coded_packet->second_dest)) 1837 if (batadv_is_my_mac(bat_priv, coded_packet->second_dest))
@@ -1836,7 +1841,7 @@ static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
1836 coded_packet); 1841 coded_packet);
1837 if (!nc_packet) { 1842 if (!nc_packet) {
1838 batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED); 1843 batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED);
1839 return NET_RX_DROP; 1844 goto free_skb;
1840 } 1845 }
1841 1846
1842 /* Make skb's linear, because decoding accesses the entire buffer */ 1847 /* Make skb's linear, because decoding accesses the entire buffer */
@@ -1861,7 +1866,10 @@ static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
1861 return batadv_recv_unicast_packet(skb, recv_if); 1866 return batadv_recv_unicast_packet(skb, recv_if);
1862 1867
1863free_nc_packet: 1868free_nc_packet:
1864 batadv_nc_packet_free(nc_packet); 1869 batadv_nc_packet_free(nc_packet, true);
1870free_skb:
1871 kfree_skb(skb);
1872
1865 return NET_RX_DROP; 1873 return NET_RX_DROP;
1866} 1874}
1867 1875
@@ -1961,17 +1969,16 @@ int batadv_nc_init_debugfs(struct batadv_priv *bat_priv)
1961 if (!nc_dir) 1969 if (!nc_dir)
1962 goto out; 1970 goto out;
1963 1971
1964 file = debugfs_create_u8("min_tq", S_IRUGO | S_IWUSR, nc_dir, 1972 file = debugfs_create_u8("min_tq", 0644, nc_dir, &bat_priv->nc.min_tq);
1965 &bat_priv->nc.min_tq);
1966 if (!file) 1973 if (!file)
1967 goto out; 1974 goto out;
1968 1975
1969 file = debugfs_create_u32("max_fwd_delay", S_IRUGO | S_IWUSR, nc_dir, 1976 file = debugfs_create_u32("max_fwd_delay", 0644, nc_dir,
1970 &bat_priv->nc.max_fwd_delay); 1977 &bat_priv->nc.max_fwd_delay);
1971 if (!file) 1978 if (!file)
1972 goto out; 1979 goto out;
1973 1980
1974 file = debugfs_create_u32("max_buffer_time", S_IRUGO | S_IWUSR, nc_dir, 1981 file = debugfs_create_u32("max_buffer_time", 0644, nc_dir,
1975 &bat_priv->nc.max_buffer_time); 1982 &bat_priv->nc.max_buffer_time);
1976 if (!file) 1983 if (!file)
1977 goto out; 1984 goto out;
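A recurring change in network-coding.c (and later in send.c) is the split between kfree_skb() and consume_skb(): both free the skb, but only kfree_skb() is reported to drop monitors, so success paths should use consume_skb(). The helper pattern, reduced to its core (hypothetical name):

	#include <linux/skbuff.h>

	/* free an skb owned by some wrapper object; report it to drop
	 * monitors only when it really was dropped, so tools like
	 * dropwatch stay useful
	 */
	static void my_packet_free(struct sk_buff *skb, bool dropped)
	{
		if (dropped)
			kfree_skb(skb);		/* counted as a packet drop */
		else
			consume_skb(skb);	/* normal end of life, not a drop */
	}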
diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h
index d6d7fb4ec5d5..c66efb81d2f4 100644
--- a/net/batman-adv/network-coding.h
+++ b/net/batman-adv/network-coding.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Martin Hundebøll, Jeppe Ledet-Pedersen 3 * Martin Hundebøll, Jeppe Ledet-Pedersen
4 * 4 *
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 7c8d16086f0f..8e2a4b205257 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
@@ -364,7 +364,7 @@ struct batadv_orig_ifinfo *
364batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, 364batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node,
365 struct batadv_hard_iface *if_outgoing) 365 struct batadv_hard_iface *if_outgoing)
366{ 366{
367 struct batadv_orig_ifinfo *orig_ifinfo = NULL; 367 struct batadv_orig_ifinfo *orig_ifinfo;
368 unsigned long reset_time; 368 unsigned long reset_time;
369 369
370 spin_lock_bh(&orig_node->neigh_list_lock); 370 spin_lock_bh(&orig_node->neigh_list_lock);
@@ -512,15 +512,17 @@ batadv_neigh_node_get(const struct batadv_orig_node *orig_node,
512 * batadv_hardif_neigh_create - create a hardif neighbour node 512 * batadv_hardif_neigh_create - create a hardif neighbour node
513 * @hard_iface: the interface this neighbour is connected to 513 * @hard_iface: the interface this neighbour is connected to
514 * @neigh_addr: the interface address of the neighbour to retrieve 514 * @neigh_addr: the interface address of the neighbour to retrieve
515 * @orig_node: originator object representing the neighbour
515 * 516 *
516 * Return: the hardif neighbour node if found or created or NULL otherwise. 517 * Return: the hardif neighbour node if found or created or NULL otherwise.
517 */ 518 */
518static struct batadv_hardif_neigh_node * 519static struct batadv_hardif_neigh_node *
519batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface, 520batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface,
520 const u8 *neigh_addr) 521 const u8 *neigh_addr,
522 struct batadv_orig_node *orig_node)
521{ 523{
522 struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); 524 struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
523 struct batadv_hardif_neigh_node *hardif_neigh = NULL; 525 struct batadv_hardif_neigh_node *hardif_neigh;
524 526
525 spin_lock_bh(&hard_iface->neigh_list_lock); 527 spin_lock_bh(&hard_iface->neigh_list_lock);
526 528
@@ -536,6 +538,7 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface,
536 kref_get(&hard_iface->refcount); 538 kref_get(&hard_iface->refcount);
537 INIT_HLIST_NODE(&hardif_neigh->list); 539 INIT_HLIST_NODE(&hardif_neigh->list);
538 ether_addr_copy(hardif_neigh->addr, neigh_addr); 540 ether_addr_copy(hardif_neigh->addr, neigh_addr);
541 ether_addr_copy(hardif_neigh->orig, orig_node->orig);
539 hardif_neigh->if_incoming = hard_iface; 542 hardif_neigh->if_incoming = hard_iface;
540 hardif_neigh->last_seen = jiffies; 543 hardif_neigh->last_seen = jiffies;
541 544
@@ -556,21 +559,23 @@ out:
556 * node 559 * node
557 * @hard_iface: the interface this neighbour is connected to 560 * @hard_iface: the interface this neighbour is connected to
558 * @neigh_addr: the interface address of the neighbour to retrieve 561 * @neigh_addr: the interface address of the neighbour to retrieve
562 * @orig_node: originator object representing the neighbour
559 * 563 *
560 * Return: the hardif neighbour node if found or created or NULL otherwise. 564 * Return: the hardif neighbour node if found or created or NULL otherwise.
561 */ 565 */
562static struct batadv_hardif_neigh_node * 566static struct batadv_hardif_neigh_node *
563batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface, 567batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface,
564 const u8 *neigh_addr) 568 const u8 *neigh_addr,
569 struct batadv_orig_node *orig_node)
565{ 570{
566 struct batadv_hardif_neigh_node *hardif_neigh = NULL; 571 struct batadv_hardif_neigh_node *hardif_neigh;
567 572
568 /* first check without locking to avoid the overhead */ 573 /* first check without locking to avoid the overhead */
569 hardif_neigh = batadv_hardif_neigh_get(hard_iface, neigh_addr); 574 hardif_neigh = batadv_hardif_neigh_get(hard_iface, neigh_addr);
570 if (hardif_neigh) 575 if (hardif_neigh)
571 return hardif_neigh; 576 return hardif_neigh;
572 577
573 return batadv_hardif_neigh_create(hard_iface, neigh_addr); 578 return batadv_hardif_neigh_create(hard_iface, neigh_addr, orig_node);
574} 579}
575 580
576/** 581/**
@@ -630,7 +635,7 @@ batadv_neigh_node_create(struct batadv_orig_node *orig_node,
630 goto out; 635 goto out;
631 636
632 hardif_neigh = batadv_hardif_neigh_get_or_create(hard_iface, 637 hardif_neigh = batadv_hardif_neigh_get_or_create(hard_iface,
633 neigh_addr); 638 neigh_addr, orig_node);
634 if (!hardif_neigh) 639 if (!hardif_neigh)
635 goto out; 640 goto out;
636 641
@@ -683,7 +688,7 @@ batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node,
683 struct batadv_hard_iface *hard_iface, 688 struct batadv_hard_iface *hard_iface,
684 const u8 *neigh_addr) 689 const u8 *neigh_addr)
685{ 690{
686 struct batadv_neigh_node *neigh_node = NULL; 691 struct batadv_neigh_node *neigh_node;
687 692
688 /* first check without locking to avoid the overhead */ 693 /* first check without locking to avoid the overhead */
689 neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr); 694 neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr);
@@ -1021,7 +1026,7 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
1021 batadv_orig_node_vlan_put(vlan); 1026 batadv_orig_node_vlan_put(vlan);
1022 1027
1023 for (i = 0; i < BATADV_FRAG_BUFFER_COUNT; i++) { 1028 for (i = 0; i < BATADV_FRAG_BUFFER_COUNT; i++) {
1024 INIT_HLIST_HEAD(&orig_node->fragments[i].head); 1029 INIT_HLIST_HEAD(&orig_node->fragments[i].fragment_list);
1025 spin_lock_init(&orig_node->fragments[i].lock); 1030 spin_lock_init(&orig_node->fragments[i].lock);
1026 orig_node->fragments[i].size = 0; 1031 orig_node->fragments[i].size = 0;
1027 } 1032 }
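batadv_hardif_neigh_get_or_create() and batadv_neigh_node_get_or_create() keep the usual two-step shape: a lockless lookup first, then a locked create path that re-checks before inserting. A self-contained sketch of that pattern (simplified; the real code additionally takes kref references before leaving the RCU section):

	#include <linux/etherdevice.h>
	#include <linux/rculist.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>

	struct my_neigh {
		struct hlist_node list;
		u8 addr[ETH_ALEN];
	};

	static HLIST_HEAD(my_neigh_list);
	static DEFINE_SPINLOCK(my_neigh_lock);

	/* lockless lookup under RCU */
	static struct my_neigh *my_neigh_get(const u8 *addr)
	{
		struct my_neigh *n;

		rcu_read_lock();
		hlist_for_each_entry_rcu(n, &my_neigh_list, list) {
			if (ether_addr_equal(n->addr, addr))
				break;
		}
		rcu_read_unlock();

		return n;
	}

	static struct my_neigh *my_neigh_get_or_create(const u8 *addr)
	{
		struct my_neigh *n;

		/* first check without locking to avoid the overhead */
		n = my_neigh_get(addr);
		if (n)
			return n;

		spin_lock_bh(&my_neigh_lock);

		/* re-check under the lock, then allocate and publish */
		n = my_neigh_get(addr);
		if (!n) {
			n = kzalloc(sizeof(*n), GFP_ATOMIC);
			if (n) {
				ether_addr_copy(n->addr, addr);
				hlist_add_head_rcu(&n->list, &my_neigh_list);
			}
		}

		spin_unlock_bh(&my_neigh_lock);

		return n;
	}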
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index ebc56183f358..d94220a6d21a 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index 6afc0b86950e..8e8a5db197cb 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
@@ -21,7 +21,7 @@
21#include <asm/byteorder.h> 21#include <asm/byteorder.h>
22#include <linux/types.h> 22#include <linux/types.h>
23 23
24#define batadv_tp_is_error(n) ((u8)n > 127 ? 1 : 0) 24#define batadv_tp_is_error(n) ((u8)(n) > 127 ? 1 : 0)
25 25
26/** 26/**
27 * enum batadv_packettype - types for batman-adv encapsulated packets 27 * enum batadv_packettype - types for batman-adv encapsulated packets
@@ -252,16 +252,6 @@ struct batadv_elp_packet {
252#define BATADV_ELP_HLEN sizeof(struct batadv_elp_packet) 252#define BATADV_ELP_HLEN sizeof(struct batadv_elp_packet)
253 253
254/** 254/**
255 * enum batadv_icmp_user_cmd_type - types for batman-adv icmp cmd modes
256 * @BATADV_TP_START: start a throughput meter run
257 * @BATADV_TP_STOP: stop a throughput meter run
258 */
259enum batadv_icmp_user_cmd_type {
260 BATADV_TP_START = 0,
261 BATADV_TP_STOP = 2,
262};
263
264/**
265 * struct batadv_icmp_header - common members among all the ICMP packets 255 * struct batadv_icmp_header - common members among all the ICMP packets
266 * @packet_type: batman-adv packet type, part of the general header 256 * @packet_type: batman-adv packet type, part of the general header
267 * @version: batman-adv protocol version, part of the general header 257 * @version: batman-adv protocol version, part of the general header
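The packet.h change only adds parentheses around the macro argument, but that matters as soon as the argument is an expression. A standalone illustration (plain C, hypothetical macro names):

	#include <stdint.h>
	#include <stdio.h>

	/* without parentheses around the argument, only the first operand is cast */
	#define TP_IS_ERROR_OLD(n)	((uint8_t)n > 127 ? 1 : 0)
	/* the fixed variant casts the whole argument expression */
	#define TP_IS_ERROR_NEW(n)	((uint8_t)(n) > 127 ? 1 : 0)

	int main(void)
	{
		int a = 100, b = 200;

		/* OLD(a + b) expands to ((uint8_t)a + b > 127): 100 + 200 = 300 -> 1 */
		/* NEW(a + b) expands to ((uint8_t)(a + b) > 127): 300 truncates to 44 -> 0 */
		printf("%d %d\n", TP_IS_ERROR_OLD(a + b), TP_IS_ERROR_NEW(a + b));
		return 0;
	}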
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 7e8dc648b95a..7fd740b6e36d 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
@@ -196,8 +196,8 @@ bool batadv_check_management_packet(struct sk_buff *skb,
196 if (!is_broadcast_ether_addr(ethhdr->h_dest)) 196 if (!is_broadcast_ether_addr(ethhdr->h_dest))
197 return false; 197 return false;
198 198
199 /* packet with broadcast sender address */ 199 /* packet with invalid sender address */
200 if (is_broadcast_ether_addr(ethhdr->h_source)) 200 if (!is_valid_ether_addr(ethhdr->h_source))
201 return false; 201 return false;
202 202
203 /* create a copy of the skb, if needed, to modify it. */ 203 /* create a copy of the skb, if needed, to modify it. */
@@ -262,11 +262,11 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv,
262 icmph->ttl = BATADV_TTL; 262 icmph->ttl = BATADV_TTL;
263 263
264 res = batadv_send_skb_to_orig(skb, orig_node, NULL); 264 res = batadv_send_skb_to_orig(skb, orig_node, NULL);
265 if (res == -1) 265 if (res == NET_XMIT_SUCCESS)
266 goto out; 266 ret = NET_RX_SUCCESS;
267
268 ret = NET_RX_SUCCESS;
269 267
268 /* skb was consumed */
269 skb = NULL;
270 break; 270 break;
271 case BATADV_TP: 271 case BATADV_TP:
272 if (!pskb_may_pull(skb, sizeof(struct batadv_icmp_tp_packet))) 272 if (!pskb_may_pull(skb, sizeof(struct batadv_icmp_tp_packet)))
@@ -274,6 +274,8 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv,
274 274
275 batadv_tp_meter_recv(bat_priv, skb); 275 batadv_tp_meter_recv(bat_priv, skb);
276 ret = NET_RX_SUCCESS; 276 ret = NET_RX_SUCCESS;
277 /* skb was consumed */
278 skb = NULL;
277 goto out; 279 goto out;
278 default: 280 default:
279 /* drop unknown type */ 281 /* drop unknown type */
@@ -284,6 +286,9 @@ out:
284 batadv_hardif_put(primary_if); 286 batadv_hardif_put(primary_if);
285 if (orig_node) 287 if (orig_node)
286 batadv_orig_node_put(orig_node); 288 batadv_orig_node_put(orig_node);
289
290 kfree_skb(skb);
291
287 return ret; 292 return ret;
288} 293}
289 294
@@ -325,14 +330,20 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv,
325 icmp_packet->ttl = BATADV_TTL; 330 icmp_packet->ttl = BATADV_TTL;
326 331
327 res = batadv_send_skb_to_orig(skb, orig_node, NULL); 332 res = batadv_send_skb_to_orig(skb, orig_node, NULL);
328 if (res != -1) 333 if (res == NET_RX_SUCCESS)
329 ret = NET_RX_SUCCESS; 334 ret = NET_XMIT_SUCCESS;
335
336 /* skb was consumed */
337 skb = NULL;
330 338
331out: 339out:
332 if (primary_if) 340 if (primary_if)
333 batadv_hardif_put(primary_if); 341 batadv_hardif_put(primary_if);
334 if (orig_node) 342 if (orig_node)
335 batadv_orig_node_put(orig_node); 343 batadv_orig_node_put(orig_node);
344
345 kfree_skb(skb);
346
336 return ret; 347 return ret;
337} 348}
338 349
@@ -349,21 +360,21 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
349 360
350 /* drop packet if it has not necessary minimum size */ 361 /* drop packet if it has not necessary minimum size */
351 if (unlikely(!pskb_may_pull(skb, hdr_size))) 362 if (unlikely(!pskb_may_pull(skb, hdr_size)))
352 goto out; 363 goto free_skb;
353 364
354 ethhdr = eth_hdr(skb); 365 ethhdr = eth_hdr(skb);
355 366
356 /* packet with unicast indication but broadcast recipient */ 367 /* packet with unicast indication but non-unicast recipient */
357 if (is_broadcast_ether_addr(ethhdr->h_dest)) 368 if (!is_valid_ether_addr(ethhdr->h_dest))
358 goto out; 369 goto free_skb;
359 370
360 /* packet with broadcast sender address */ 371 /* packet with broadcast/multicast sender address */
361 if (is_broadcast_ether_addr(ethhdr->h_source)) 372 if (is_multicast_ether_addr(ethhdr->h_source))
362 goto out; 373 goto free_skb;
363 374
364 /* not for me */ 375 /* not for me */
365 if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest)) 376 if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest))
366 goto out; 377 goto free_skb;
367 378
368 icmph = (struct batadv_icmp_header *)skb->data; 379 icmph = (struct batadv_icmp_header *)skb->data;
369 380
@@ -372,17 +383,17 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
372 icmph->msg_type == BATADV_ECHO_REQUEST) && 383 icmph->msg_type == BATADV_ECHO_REQUEST) &&
373 (skb->len >= sizeof(struct batadv_icmp_packet_rr))) { 384 (skb->len >= sizeof(struct batadv_icmp_packet_rr))) {
374 if (skb_linearize(skb) < 0) 385 if (skb_linearize(skb) < 0)
375 goto out; 386 goto free_skb;
376 387
377 /* create a copy of the skb, if needed, to modify it. */ 388 /* create a copy of the skb, if needed, to modify it. */
378 if (skb_cow(skb, ETH_HLEN) < 0) 389 if (skb_cow(skb, ETH_HLEN) < 0)
379 goto out; 390 goto free_skb;
380 391
381 ethhdr = eth_hdr(skb); 392 ethhdr = eth_hdr(skb);
382 icmph = (struct batadv_icmp_header *)skb->data; 393 icmph = (struct batadv_icmp_header *)skb->data;
383 icmp_packet_rr = (struct batadv_icmp_packet_rr *)icmph; 394 icmp_packet_rr = (struct batadv_icmp_packet_rr *)icmph;
384 if (icmp_packet_rr->rr_cur >= BATADV_RR_LEN) 395 if (icmp_packet_rr->rr_cur >= BATADV_RR_LEN)
385 goto out; 396 goto free_skb;
386 397
387 ether_addr_copy(icmp_packet_rr->rr[icmp_packet_rr->rr_cur], 398 ether_addr_copy(icmp_packet_rr->rr[icmp_packet_rr->rr_cur],
388 ethhdr->h_dest); 399 ethhdr->h_dest);
@@ -400,11 +411,11 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
400 /* get routing information */ 411 /* get routing information */
401 orig_node = batadv_orig_hash_find(bat_priv, icmph->dst); 412 orig_node = batadv_orig_hash_find(bat_priv, icmph->dst);
402 if (!orig_node) 413 if (!orig_node)
403 goto out; 414 goto free_skb;
404 415
405 /* create a copy of the skb, if needed, to modify it. */ 416 /* create a copy of the skb, if needed, to modify it. */
406 if (skb_cow(skb, ETH_HLEN) < 0) 417 if (skb_cow(skb, ETH_HLEN) < 0)
407 goto out; 418 goto put_orig_node;
408 419
409 icmph = (struct batadv_icmp_header *)skb->data; 420 icmph = (struct batadv_icmp_header *)skb->data;
410 421
@@ -413,12 +424,18 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
413 424
414 /* route it */ 425 /* route it */
415 res = batadv_send_skb_to_orig(skb, orig_node, recv_if); 426 res = batadv_send_skb_to_orig(skb, orig_node, recv_if);
416 if (res != -1) 427 if (res == NET_XMIT_SUCCESS)
417 ret = NET_RX_SUCCESS; 428 ret = NET_RX_SUCCESS;
418 429
419out: 430 /* skb was consumed */
431 skb = NULL;
432
433put_orig_node:
420 if (orig_node) 434 if (orig_node)
421 batadv_orig_node_put(orig_node); 435 batadv_orig_node_put(orig_node);
436free_skb:
437 kfree_skb(skb);
438
422 return ret; 439 return ret;
423} 440}
424 441
@@ -445,12 +462,12 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv,
445 462
446 ethhdr = eth_hdr(skb); 463 ethhdr = eth_hdr(skb);
447 464
448 /* packet with unicast indication but broadcast recipient */ 465 /* packet with unicast indication but non-unicast recipient */
449 if (is_broadcast_ether_addr(ethhdr->h_dest)) 466 if (!is_valid_ether_addr(ethhdr->h_dest))
450 return -EBADR; 467 return -EBADR;
451 468
452 /* packet with broadcast sender address */ 469 /* packet with broadcast/multicast sender address */
453 if (is_broadcast_ether_addr(ethhdr->h_source)) 470 if (is_multicast_ether_addr(ethhdr->h_source))
454 return -EBADR; 471 return -EBADR;
455 472
456 /* not for me */ 473 /* not for me */
@@ -667,18 +684,18 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
667 if (unicast_packet->ttl < 2) { 684 if (unicast_packet->ttl < 2) {
668 pr_debug("Warning - can't forward unicast packet from %pM to %pM: ttl exceeded\n", 685 pr_debug("Warning - can't forward unicast packet from %pM to %pM: ttl exceeded\n",
669 ethhdr->h_source, unicast_packet->dest); 686 ethhdr->h_source, unicast_packet->dest);
670 goto out; 687 goto free_skb;
671 } 688 }
672 689
673 /* get routing information */ 690 /* get routing information */
674 orig_node = batadv_orig_hash_find(bat_priv, unicast_packet->dest); 691 orig_node = batadv_orig_hash_find(bat_priv, unicast_packet->dest);
675 692
676 if (!orig_node) 693 if (!orig_node)
677 goto out; 694 goto free_skb;
678 695
679 /* create a copy of the skb, if needed, to modify it. */ 696 /* create a copy of the skb, if needed, to modify it. */
680 if (skb_cow(skb, ETH_HLEN) < 0) 697 if (skb_cow(skb, ETH_HLEN) < 0)
681 goto out; 698 goto put_orig_node;
682 699
683 /* decrement ttl */ 700 /* decrement ttl */
684 unicast_packet = (struct batadv_unicast_packet *)skb->data; 701 unicast_packet = (struct batadv_unicast_packet *)skb->data;
@@ -702,22 +719,24 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
702 719
703 len = skb->len; 720 len = skb->len;
704 res = batadv_send_skb_to_orig(skb, orig_node, recv_if); 721 res = batadv_send_skb_to_orig(skb, orig_node, recv_if);
705 if (res == -1)
706 goto out;
707 722
708 /* translate transmit result into receive result */ 723 /* translate transmit result into receive result */
709 if (res == NET_XMIT_SUCCESS) { 724 if (res == NET_XMIT_SUCCESS) {
725 ret = NET_RX_SUCCESS;
710 /* skb was transmitted and consumed */ 726 /* skb was transmitted and consumed */
711 batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD); 727 batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD);
712 batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES, 728 batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES,
713 len + ETH_HLEN); 729 len + ETH_HLEN);
714 } 730 }
715 731
716 ret = NET_RX_SUCCESS; 732 /* skb was consumed */
733 skb = NULL;
734
735put_orig_node:
736 batadv_orig_node_put(orig_node);
737free_skb:
738 kfree_skb(skb);
717 739
718out:
719 if (orig_node)
720 batadv_orig_node_put(orig_node);
721 return ret; 740 return ret;
722} 741}
723 742
@@ -902,14 +921,18 @@ int batadv_recv_unhandled_unicast_packet(struct sk_buff *skb,
902 921
903 check = batadv_check_unicast_packet(bat_priv, skb, hdr_size); 922 check = batadv_check_unicast_packet(bat_priv, skb, hdr_size);
904 if (check < 0) 923 if (check < 0)
905 return NET_RX_DROP; 924 goto free_skb;
906 925
907 /* we don't know about this type, drop it. */ 926 /* we don't know about this type, drop it. */
908 unicast_packet = (struct batadv_unicast_packet *)skb->data; 927 unicast_packet = (struct batadv_unicast_packet *)skb->data;
909 if (batadv_is_my_mac(bat_priv, unicast_packet->dest)) 928 if (batadv_is_my_mac(bat_priv, unicast_packet->dest))
910 return NET_RX_DROP; 929 goto free_skb;
911 930
912 return batadv_route_unicast_packet(skb, recv_if); 931 return batadv_route_unicast_packet(skb, recv_if);
932
933free_skb:
934 kfree_skb(skb);
935 return NET_RX_DROP;
913} 936}
914 937
915int batadv_recv_unicast_packet(struct sk_buff *skb, 938int batadv_recv_unicast_packet(struct sk_buff *skb,
@@ -923,6 +946,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
923 int check, hdr_size = sizeof(*unicast_packet); 946 int check, hdr_size = sizeof(*unicast_packet);
924 enum batadv_subtype subtype; 947 enum batadv_subtype subtype;
925 bool is4addr; 948 bool is4addr;
949 int ret = NET_RX_DROP;
926 950
927 unicast_packet = (struct batadv_unicast_packet *)skb->data; 951 unicast_packet = (struct batadv_unicast_packet *)skb->data;
928 unicast_4addr_packet = (struct batadv_unicast_4addr_packet *)skb->data; 952 unicast_4addr_packet = (struct batadv_unicast_4addr_packet *)skb->data;
@@ -942,9 +966,9 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
942 batadv_nc_skb_store_sniffed_unicast(bat_priv, skb); 966 batadv_nc_skb_store_sniffed_unicast(bat_priv, skb);
943 967
944 if (check < 0) 968 if (check < 0)
945 return NET_RX_DROP; 969 goto free_skb;
946 if (!batadv_check_unicast_ttvn(bat_priv, skb, hdr_size)) 970 if (!batadv_check_unicast_ttvn(bat_priv, skb, hdr_size))
947 return NET_RX_DROP; 971 goto free_skb;
948 972
949 /* packet for me */ 973 /* packet for me */
950 if (batadv_is_my_mac(bat_priv, unicast_packet->dest)) { 974 if (batadv_is_my_mac(bat_priv, unicast_packet->dest)) {
@@ -982,7 +1006,14 @@ rx_success:
982 return NET_RX_SUCCESS; 1006 return NET_RX_SUCCESS;
983 } 1007 }
984 1008
985 return batadv_route_unicast_packet(skb, recv_if); 1009 ret = batadv_route_unicast_packet(skb, recv_if);
1010 /* skb was consumed */
1011 skb = NULL;
1012
1013free_skb:
1014 kfree_skb(skb);
1015
1016 return ret;
986} 1017}
987 1018
988/** 1019/**
@@ -1004,15 +1035,15 @@ int batadv_recv_unicast_tvlv(struct sk_buff *skb,
1004 int ret = NET_RX_DROP; 1035 int ret = NET_RX_DROP;
1005 1036
1006 if (batadv_check_unicast_packet(bat_priv, skb, hdr_size) < 0) 1037 if (batadv_check_unicast_packet(bat_priv, skb, hdr_size) < 0)
1007 return NET_RX_DROP; 1038 goto free_skb;
1008 1039
1009 /* the header is likely to be modified while forwarding */ 1040 /* the header is likely to be modified while forwarding */
1010 if (skb_cow(skb, hdr_size) < 0) 1041 if (skb_cow(skb, hdr_size) < 0)
1011 return NET_RX_DROP; 1042 goto free_skb;
1012 1043
1013 /* packet needs to be linearized to access the tvlv content */ 1044 /* packet needs to be linearized to access the tvlv content */
1014 if (skb_linearize(skb) < 0) 1045 if (skb_linearize(skb) < 0)
1015 return NET_RX_DROP; 1046 goto free_skb;
1016 1047
1017 unicast_tvlv_packet = (struct batadv_unicast_tvlv_packet *)skb->data; 1048 unicast_tvlv_packet = (struct batadv_unicast_tvlv_packet *)skb->data;
1018 1049
@@ -1020,17 +1051,21 @@ int batadv_recv_unicast_tvlv(struct sk_buff *skb,
1020 tvlv_buff_len = ntohs(unicast_tvlv_packet->tvlv_len); 1051 tvlv_buff_len = ntohs(unicast_tvlv_packet->tvlv_len);
1021 1052
1022 if (tvlv_buff_len > skb->len - hdr_size) 1053 if (tvlv_buff_len > skb->len - hdr_size)
1023 return NET_RX_DROP; 1054 goto free_skb;
1024 1055
1025 ret = batadv_tvlv_containers_process(bat_priv, false, NULL, 1056 ret = batadv_tvlv_containers_process(bat_priv, false, NULL,
1026 unicast_tvlv_packet->src, 1057 unicast_tvlv_packet->src,
1027 unicast_tvlv_packet->dst, 1058 unicast_tvlv_packet->dst,
1028 tvlv_buff, tvlv_buff_len); 1059 tvlv_buff, tvlv_buff_len);
1029 1060
1030 if (ret != NET_RX_SUCCESS) 1061 if (ret != NET_RX_SUCCESS) {
1031 ret = batadv_route_unicast_packet(skb, recv_if); 1062 ret = batadv_route_unicast_packet(skb, recv_if);
1032 else 1063 /* skb was consumed */
1033 consume_skb(skb); 1064 skb = NULL;
1065 }
1066
1067free_skb:
1068 kfree_skb(skb);
1034 1069
1035 return ret; 1070 return ret;
1036} 1071}
@@ -1056,20 +1091,22 @@ int batadv_recv_frag_packet(struct sk_buff *skb,
1056 1091
1057 if (batadv_check_unicast_packet(bat_priv, skb, 1092 if (batadv_check_unicast_packet(bat_priv, skb,
1058 sizeof(*frag_packet)) < 0) 1093 sizeof(*frag_packet)) < 0)
1059 goto out; 1094 goto free_skb;
1060 1095
1061 frag_packet = (struct batadv_frag_packet *)skb->data; 1096 frag_packet = (struct batadv_frag_packet *)skb->data;
1062 orig_node_src = batadv_orig_hash_find(bat_priv, frag_packet->orig); 1097 orig_node_src = batadv_orig_hash_find(bat_priv, frag_packet->orig);
1063 if (!orig_node_src) 1098 if (!orig_node_src)
1064 goto out; 1099 goto free_skb;
1065 1100
1066 skb->priority = frag_packet->priority + 256; 1101 skb->priority = frag_packet->priority + 256;
1067 1102
1068 /* Route the fragment if it is not for us and too big to be merged. */ 1103 /* Route the fragment if it is not for us and too big to be merged. */
1069 if (!batadv_is_my_mac(bat_priv, frag_packet->dest) && 1104 if (!batadv_is_my_mac(bat_priv, frag_packet->dest) &&
1070 batadv_frag_skb_fwd(skb, recv_if, orig_node_src)) { 1105 batadv_frag_skb_fwd(skb, recv_if, orig_node_src)) {
1106 /* skb was consumed */
1107 skb = NULL;
1071 ret = NET_RX_SUCCESS; 1108 ret = NET_RX_SUCCESS;
1072 goto out; 1109 goto put_orig_node;
1073 } 1110 }
1074 1111
1075 batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_RX); 1112 batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_RX);
@@ -1077,20 +1114,24 @@ int batadv_recv_frag_packet(struct sk_buff *skb,
1077 1114
1078 /* Add fragment to buffer and merge if possible. */ 1115 /* Add fragment to buffer and merge if possible. */
1079 if (!batadv_frag_skb_buffer(&skb, orig_node_src)) 1116 if (!batadv_frag_skb_buffer(&skb, orig_node_src))
1080 goto out; 1117 goto put_orig_node;
1081 1118
1082 /* Deliver merged packet to the appropriate handler, if it was 1119 /* Deliver merged packet to the appropriate handler, if it was
1083 * merged 1120 * merged
1084 */ 1121 */
1085 if (skb) 1122 if (skb) {
1086 batadv_batman_skb_recv(skb, recv_if->net_dev, 1123 batadv_batman_skb_recv(skb, recv_if->net_dev,
1087 &recv_if->batman_adv_ptype, NULL); 1124 &recv_if->batman_adv_ptype, NULL);
1125 /* skb was consumed */
1126 skb = NULL;
1127 }
1088 1128
1089 ret = NET_RX_SUCCESS; 1129 ret = NET_RX_SUCCESS;
1090 1130
1091out: 1131put_orig_node:
1092 if (orig_node_src) 1132 batadv_orig_node_put(orig_node_src);
1093 batadv_orig_node_put(orig_node_src); 1133free_skb:
1134 kfree_skb(skb);
1094 1135
1095 return ret; 1136 return ret;
1096} 1137}
@@ -1109,35 +1150,35 @@ int batadv_recv_bcast_packet(struct sk_buff *skb,
1109 1150
1110 /* drop packet if it has not necessary minimum size */ 1151 /* drop packet if it has not necessary minimum size */
1111 if (unlikely(!pskb_may_pull(skb, hdr_size))) 1152 if (unlikely(!pskb_may_pull(skb, hdr_size)))
1112 goto out; 1153 goto free_skb;
1113 1154
1114 ethhdr = eth_hdr(skb); 1155 ethhdr = eth_hdr(skb);
1115 1156
1116 /* packet with broadcast indication but unicast recipient */ 1157 /* packet with broadcast indication but unicast recipient */
1117 if (!is_broadcast_ether_addr(ethhdr->h_dest)) 1158 if (!is_broadcast_ether_addr(ethhdr->h_dest))
1118 goto out; 1159 goto free_skb;
1119 1160
1120 /* packet with broadcast sender address */ 1161 /* packet with broadcast/multicast sender address */
1121 if (is_broadcast_ether_addr(ethhdr->h_source)) 1162 if (is_multicast_ether_addr(ethhdr->h_source))
1122 goto out; 1163 goto free_skb;
1123 1164
1124 /* ignore broadcasts sent by myself */ 1165 /* ignore broadcasts sent by myself */
1125 if (batadv_is_my_mac(bat_priv, ethhdr->h_source)) 1166 if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
1126 goto out; 1167 goto free_skb;
1127 1168
1128 bcast_packet = (struct batadv_bcast_packet *)skb->data; 1169 bcast_packet = (struct batadv_bcast_packet *)skb->data;
1129 1170
1130 /* ignore broadcasts originated by myself */ 1171 /* ignore broadcasts originated by myself */
1131 if (batadv_is_my_mac(bat_priv, bcast_packet->orig)) 1172 if (batadv_is_my_mac(bat_priv, bcast_packet->orig))
1132 goto out; 1173 goto free_skb;
1133 1174
1134 if (bcast_packet->ttl < 2) 1175 if (bcast_packet->ttl < 2)
1135 goto out; 1176 goto free_skb;
1136 1177
1137 orig_node = batadv_orig_hash_find(bat_priv, bcast_packet->orig); 1178 orig_node = batadv_orig_hash_find(bat_priv, bcast_packet->orig);
1138 1179
1139 if (!orig_node) 1180 if (!orig_node)
1140 goto out; 1181 goto free_skb;
1141 1182
1142 spin_lock_bh(&orig_node->bcast_seqno_lock); 1183 spin_lock_bh(&orig_node->bcast_seqno_lock);
1143 1184
@@ -1165,18 +1206,18 @@ int batadv_recv_bcast_packet(struct sk_buff *skb,
1165 1206
1166 /* check whether this has been sent by another originator before */ 1207 /* check whether this has been sent by another originator before */
1167 if (batadv_bla_check_bcast_duplist(bat_priv, skb)) 1208 if (batadv_bla_check_bcast_duplist(bat_priv, skb))
1168 goto out; 1209 goto free_skb;
1169 1210
1170 batadv_skb_set_priority(skb, sizeof(struct batadv_bcast_packet)); 1211 batadv_skb_set_priority(skb, sizeof(struct batadv_bcast_packet));
1171 1212
1172 /* rebroadcast packet */ 1213 /* rebroadcast packet */
1173 batadv_add_bcast_packet_to_list(bat_priv, skb, 1); 1214 batadv_add_bcast_packet_to_list(bat_priv, skb, 1, false);
1174 1215
1175 /* don't hand the broadcast up if it is from an originator 1216 /* don't hand the broadcast up if it is from an originator
1176 * from the same backbone. 1217 * from the same backbone.
1177 */ 1218 */
1178 if (batadv_bla_is_backbone_gw(skb, orig_node, hdr_size)) 1219 if (batadv_bla_is_backbone_gw(skb, orig_node, hdr_size))
1179 goto out; 1220 goto free_skb;
1180 1221
1181 if (batadv_dat_snoop_incoming_arp_request(bat_priv, skb, hdr_size)) 1222 if (batadv_dat_snoop_incoming_arp_request(bat_priv, skb, hdr_size))
1182 goto rx_success; 1223 goto rx_success;
@@ -1192,6 +1233,8 @@ rx_success:
1192 1233
1193spin_unlock: 1234spin_unlock:
1194 spin_unlock_bh(&orig_node->bcast_seqno_lock); 1235 spin_unlock_bh(&orig_node->bcast_seqno_lock);
1236free_skb:
1237 kfree_skb(skb);
1195out: 1238out:
1196 if (orig_node) 1239 if (orig_node)
1197 batadv_orig_node_put(orig_node); 1240 batadv_orig_node_put(orig_node);
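The routing.c rework gives every receive handler unconditional ownership of the skb: whenever a callee consumes it, the local pointer is set to NULL, and a single kfree_skb() at the exit label covers all remaining paths, since kfree_skb(NULL) is a no-op. A skeleton of that shape with made-up helpers:

	#include <linux/if_ether.h>
	#include <linux/netdevice.h>
	#include <linux/skbuff.h>

	/* stand-ins for the real checking/forwarding callees */
	static bool my_sanity_check(struct sk_buff *skb)
	{
		return skb->len >= ETH_HLEN;
	}

	static int my_forward(struct sk_buff *skb)
	{
		consume_skb(skb);		/* callee always consumes */
		return NET_XMIT_SUCCESS;
	}

	static int my_recv_handler(struct sk_buff *skb)
	{
		int ret = NET_RX_DROP;

		if (!my_sanity_check(skb))
			goto free_skb;

		if (my_forward(skb) == NET_XMIT_SUCCESS)
			ret = NET_RX_SUCCESS;

		/* skb was consumed by my_forward() on both outcomes */
		skb = NULL;

	free_skb:
		kfree_skb(skb);			/* no-op when skb is NULL */
		return ret;
	}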
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
index 05c3ff42e181..5ede16c32f15 100644
--- a/net/batman-adv/routing.h
+++ b/net/batman-adv/routing.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 8d4e1f578574..1489ec27daff 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
@@ -19,6 +19,7 @@
19#include "main.h" 19#include "main.h"
20 20
21#include <linux/atomic.h> 21#include <linux/atomic.h>
22#include <linux/bug.h>
22#include <linux/byteorder/generic.h> 23#include <linux/byteorder/generic.h>
23#include <linux/errno.h> 24#include <linux/errno.h>
24#include <linux/etherdevice.h> 25#include <linux/etherdevice.h>
@@ -64,8 +65,11 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work);
64 * If neigh_node is NULL, then the packet is broadcasted using hard_iface, 65 * If neigh_node is NULL, then the packet is broadcasted using hard_iface,
65 * otherwise it is sent as unicast to the given neighbor. 66 * otherwise it is sent as unicast to the given neighbor.
66 * 67 *
67 * Return: NET_TX_DROP in case of error or the result of dev_queue_xmit(skb) 68 * Regardless of the return value, the skb is consumed.
68 * otherwise 69 *
70 * Return: A negative errno code is returned on a failure. A success does not
71 * guarantee the frame will be transmitted as it may be dropped due
72 * to congestion or traffic shaping.
69 */ 73 */
70int batadv_send_skb_packet(struct sk_buff *skb, 74int batadv_send_skb_packet(struct sk_buff *skb,
71 struct batadv_hard_iface *hard_iface, 75 struct batadv_hard_iface *hard_iface,
@@ -111,15 +115,9 @@ int batadv_send_skb_packet(struct sk_buff *skb,
111 /* dev_queue_xmit() returns a negative result on error. However on 115 /* dev_queue_xmit() returns a negative result on error. However on
112 * congestion and traffic shaping, it drops and returns NET_XMIT_DROP 116 * congestion and traffic shaping, it drops and returns NET_XMIT_DROP
113 * (which is > 0). This will not be treated as an error. 117 * (which is > 0). This will not be treated as an error.
114 *
115 * a negative value cannot be returned because it could be interepreted
116 * as not consumed skb by callers of batadv_send_skb_to_orig.
117 */ 118 */
118 ret = dev_queue_xmit(skb); 119 ret = dev_queue_xmit(skb);
119 if (ret < 0) 120 return net_xmit_eval(ret);
120 ret = NET_XMIT_DROP;
121
122 return ret;
123send_skb_err: 121send_skb_err:
124 kfree_skb(skb); 122 kfree_skb(skb);
125 return NET_XMIT_DROP; 123 return NET_XMIT_DROP;
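batadv_send_skb_packet() now filters the dev_queue_xmit() result through net_xmit_eval(), so congestion is not reported as an error while real failures (NET_XMIT_DROP, negative errno) are passed through. Roughly, the macro from include/net/sock.h behaves like this helper:

	#include <linux/netdevice.h>

	/* net_xmit_eval() semantics, approximately:
	 *   NET_XMIT_CN   -> 0   (congestion: dropped by the queue, not an error)
	 *   anything else -> passed through (0, NET_XMIT_DROP, negative errno)
	 */
	static inline int my_xmit_eval(int e)
	{
		return e == NET_XMIT_CN ? 0 : e;
	}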
@@ -165,11 +163,9 @@ int batadv_send_unicast_skb(struct sk_buff *skb,
165 * host, NULL can be passed as recv_if and no interface alternating is 163 * host, NULL can be passed as recv_if and no interface alternating is
166 * attempted. 164 * attempted.
167 * 165 *
168 * Return: -1 on failure (and the skb is not consumed), -EINPROGRESS if the 166 * Return: negative errno code on a failure, -EINPROGRESS if the skb is
169 * skb is buffered for later transmit or the NET_XMIT status returned by the 167 * buffered for later transmit or the NET_XMIT status returned by the
170 * lower routine if the packet has been passed down. 168 * lower routine if the packet has been passed down.
171 *
172 * If the returning value is not -1 the skb has been consumed.
173 */ 169 */
174int batadv_send_skb_to_orig(struct sk_buff *skb, 170int batadv_send_skb_to_orig(struct sk_buff *skb,
175 struct batadv_orig_node *orig_node, 171 struct batadv_orig_node *orig_node,
@@ -177,12 +173,14 @@ int batadv_send_skb_to_orig(struct sk_buff *skb,
177{ 173{
178 struct batadv_priv *bat_priv = orig_node->bat_priv; 174 struct batadv_priv *bat_priv = orig_node->bat_priv;
179 struct batadv_neigh_node *neigh_node; 175 struct batadv_neigh_node *neigh_node;
180 int ret = -1; 176 int ret;
181 177
182 /* batadv_find_router() increases neigh_nodes refcount if found. */ 178 /* batadv_find_router() increases neigh_nodes refcount if found. */
183 neigh_node = batadv_find_router(bat_priv, orig_node, recv_if); 179 neigh_node = batadv_find_router(bat_priv, orig_node, recv_if);
184 if (!neigh_node) 180 if (!neigh_node) {
185 goto out; 181 ret = -EINVAL;
182 goto free_skb;
183 }
186 184
187 /* Check if the skb is too large to send in one piece and fragment 185 /* Check if the skb is too large to send in one piece and fragment
188 * it if needed. 186 * it if needed.
@@ -191,8 +189,10 @@ int batadv_send_skb_to_orig(struct sk_buff *skb,
191 skb->len > neigh_node->if_incoming->net_dev->mtu) { 189 skb->len > neigh_node->if_incoming->net_dev->mtu) {
192 /* Fragment and send packet. */ 190 /* Fragment and send packet. */
193 ret = batadv_frag_send_packet(skb, orig_node, neigh_node); 191 ret = batadv_frag_send_packet(skb, orig_node, neigh_node);
192 /* skb was consumed */
193 skb = NULL;
194 194
195 goto out; 195 goto put_neigh_node;
196 } 196 }
197 197
198 /* try to network code the packet, if it is received on an interface 198 /* try to network code the packet, if it is received on an interface
@@ -204,9 +204,13 @@ int batadv_send_skb_to_orig(struct sk_buff *skb,
204 else 204 else
205 ret = batadv_send_unicast_skb(skb, neigh_node); 205 ret = batadv_send_unicast_skb(skb, neigh_node);
206 206
207out: 207 /* skb was consumed */
208 if (neigh_node) 208 skb = NULL;
209 batadv_neigh_node_put(neigh_node); 209
210put_neigh_node:
211 batadv_neigh_node_put(neigh_node);
212free_skb:
213 kfree_skb(skb);
210 214
211 return ret; 215 return ret;
212} 216}
@@ -327,7 +331,7 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv,
327{ 331{
328 struct batadv_unicast_packet *unicast_packet; 332 struct batadv_unicast_packet *unicast_packet;
329 struct ethhdr *ethhdr; 333 struct ethhdr *ethhdr;
330 int res, ret = NET_XMIT_DROP; 334 int ret = NET_XMIT_DROP;
331 335
332 if (!orig_node) 336 if (!orig_node)
333 goto out; 337 goto out;
@@ -364,13 +368,12 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv,
364 if (batadv_tt_global_client_is_roaming(bat_priv, ethhdr->h_dest, vid)) 368 if (batadv_tt_global_client_is_roaming(bat_priv, ethhdr->h_dest, vid))
365 unicast_packet->ttvn = unicast_packet->ttvn - 1; 369 unicast_packet->ttvn = unicast_packet->ttvn - 1;
366 370
367 res = batadv_send_skb_to_orig(skb, orig_node, NULL); 371 ret = batadv_send_skb_to_orig(skb, orig_node, NULL);
368 if (res != -1) 372 /* skb was consumed */
369 ret = NET_XMIT_SUCCESS; 373 skb = NULL;
370 374
371out: 375out:
372 if (ret == NET_XMIT_DROP) 376 kfree_skb(skb);
373 kfree_skb(skb);
374 return ret; 377 return ret;
375} 378}
376 379
@@ -451,13 +454,19 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
451/** 454/**
452 * batadv_forw_packet_free - free a forwarding packet 455 * batadv_forw_packet_free - free a forwarding packet
453 * @forw_packet: The packet to free 456 * @forw_packet: The packet to free
457 * @dropped: whether the packet is freed because it is dropped
454 * 458 *
455 * This frees a forwarding packet and releases any resources it might 459 * This frees a forwarding packet and releases any resources it might
456 * have claimed. 460 * have claimed.
457 */ 461 */
458void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet) 462void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet,
463 bool dropped)
459{ 464{
460 kfree_skb(forw_packet->skb); 465 if (dropped)
466 kfree_skb(forw_packet->skb);
467 else
468 consume_skb(forw_packet->skb);
469
461 if (forw_packet->if_incoming) 470 if (forw_packet->if_incoming)
462 batadv_hardif_put(forw_packet->if_incoming); 471 batadv_hardif_put(forw_packet->if_incoming);
463 if (forw_packet->if_outgoing) 472 if (forw_packet->if_outgoing)
@@ -514,6 +523,8 @@ batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming,
514 if (if_outgoing) 523 if (if_outgoing)
515 kref_get(&if_outgoing->refcount); 524 kref_get(&if_outgoing->refcount);
516 525
526 INIT_HLIST_NODE(&forw_packet->list);
527 INIT_HLIST_NODE(&forw_packet->cleanup_list);
517 forw_packet->skb = NULL; 528 forw_packet->skb = NULL;
518 forw_packet->queue_left = queue_left; 529 forw_packet->queue_left = queue_left;
519 forw_packet->if_incoming = if_incoming; 530 forw_packet->if_incoming = if_incoming;
@@ -529,19 +540,191 @@ err:
529 return NULL; 540 return NULL;
530} 541}
531 542
543/**
544 * batadv_forw_packet_was_stolen - check whether someone stole this packet
545 * @forw_packet: the forwarding packet to check
546 *
547 * This function checks whether the given forwarding packet was claimed by
548 * someone else for free().
549 *
550 * Return: True if someone stole it, false otherwise.
551 */
552static bool
553batadv_forw_packet_was_stolen(struct batadv_forw_packet *forw_packet)
554{
555 return !hlist_unhashed(&forw_packet->cleanup_list);
556}
557
558/**
559 * batadv_forw_packet_steal - claim a forw_packet for free()
560 * @forw_packet: the forwarding packet to steal
561 * @lock: a key to the store to steal from (e.g. forw_{bat,bcast}_list_lock)
562 *
563 * This function tries to steal a specific forw_packet from global
564 * visibility for the purpose of getting it for free(). That means
565 * the caller is *not* allowed to requeue it afterwards.
566 *
567 * Return: True if stealing was successful. False if someone else stole it
568 * before us.
569 */
570bool batadv_forw_packet_steal(struct batadv_forw_packet *forw_packet,
571 spinlock_t *lock)
572{
573 /* did purging routine steal it earlier? */
574 spin_lock_bh(lock);
575 if (batadv_forw_packet_was_stolen(forw_packet)) {
576 spin_unlock_bh(lock);
577 return false;
578 }
579
580 hlist_del_init(&forw_packet->list);
581
582 /* Just to spot misuse of this function */
583 hlist_add_fake(&forw_packet->cleanup_list);
584
585 spin_unlock_bh(lock);
586 return true;
587}
588
589/**
590 * batadv_forw_packet_list_steal - claim a list of forward packets for free()
591 * @forw_list: the to be stolen forward packets
592 * @cleanup_list: a backup pointer, to be able to dispose the packet later
593 * @hard_iface: the interface to steal forward packets from
594 *
595 * This function claims responsibility to free any forw_packet queued on the
596 * given hard_iface. If hard_iface is NULL forwarding packets on all hard
597 * interfaces will be claimed.
598 *
599 * The packets are being moved from the forw_list to the cleanup_list, which
600 * allows already running threads to notice the claiming.
601 */
532static void 602static void
533_batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, 603batadv_forw_packet_list_steal(struct hlist_head *forw_list,
534 struct batadv_forw_packet *forw_packet, 604 struct hlist_head *cleanup_list,
535 unsigned long send_time) 605 const struct batadv_hard_iface *hard_iface)
536{ 606{
537 /* add new packet to packet list */ 607 struct batadv_forw_packet *forw_packet;
538 spin_lock_bh(&bat_priv->forw_bcast_list_lock); 608 struct hlist_node *safe_tmp_node;
539 hlist_add_head(&forw_packet->list, &bat_priv->forw_bcast_list); 609
540 spin_unlock_bh(&bat_priv->forw_bcast_list_lock); 610 hlist_for_each_entry_safe(forw_packet, safe_tmp_node,
611 forw_list, list) {
612 /* if purge_outstanding_packets() was called with an argument
613 * we delete only packets belonging to the given interface
614 */
615 if (hard_iface &&
616 (forw_packet->if_incoming != hard_iface) &&
617 (forw_packet->if_outgoing != hard_iface))
618 continue;
619
620 hlist_del(&forw_packet->list);
621 hlist_add_head(&forw_packet->cleanup_list, cleanup_list);
622 }
623}
624
625/**
626 * batadv_forw_packet_list_free - free a list of forward packets
627 * @head: a list of to be freed forw_packets
628 *
629 * This function cancels the scheduling of any packet in the provided list,
630 * waits for any possibly running packet forwarding thread to finish and
631 * finally, safely frees this forward packet.
632 *
633 * This function might sleep.
634 */
635static void batadv_forw_packet_list_free(struct hlist_head *head)
636{
637 struct batadv_forw_packet *forw_packet;
638 struct hlist_node *safe_tmp_node;
541 639
542 /* start timer for this packet */ 640 hlist_for_each_entry_safe(forw_packet, safe_tmp_node, head,
543 queue_delayed_work(batadv_event_workqueue, &forw_packet->delayed_work, 641 cleanup_list) {
544 send_time); 642 cancel_delayed_work_sync(&forw_packet->delayed_work);
643
644 hlist_del(&forw_packet->cleanup_list);
645 batadv_forw_packet_free(forw_packet, true);
646 }
647}
648
649/**
650 * batadv_forw_packet_queue - try to queue a forwarding packet
651 * @forw_packet: the forwarding packet to queue
652 * @lock: a key to the store (e.g. forw_{bat,bcast}_list_lock)
653 * @head: the shelve to queue it on (e.g. forw_{bat,bcast}_list)
654 * @send_time: timestamp (jiffies) when the packet is to be sent
655 *
656 * This function tries to (re)queue a forwarding packet. Requeuing
657 * is prevented if the according interface is shutting down
658 * (e.g. if batadv_forw_packet_list_steal() was called for this
659 * packet earlier).
660 *
661 * Calling batadv_forw_packet_queue() after a call to
662 * batadv_forw_packet_steal() is forbidden!
663 *
664 * Caller needs to ensure that forw_packet->delayed_work was initialized.
665 */
666static void batadv_forw_packet_queue(struct batadv_forw_packet *forw_packet,
667 spinlock_t *lock, struct hlist_head *head,
668 unsigned long send_time)
669{
670 spin_lock_bh(lock);
671
672 /* did purging routine steal it from us? */
673 if (batadv_forw_packet_was_stolen(forw_packet)) {
674 /* If you got it for free() without trouble, then
675 * don't get back into the queue after stealing...
676 */
677 WARN_ONCE(hlist_fake(&forw_packet->cleanup_list),
678 "Requeuing after batadv_forw_packet_steal() not allowed!\n");
679
680 spin_unlock_bh(lock);
681 return;
682 }
683
684 hlist_del_init(&forw_packet->list);
685 hlist_add_head(&forw_packet->list, head);
686
687 queue_delayed_work(batadv_event_workqueue,
688 &forw_packet->delayed_work,
689 send_time - jiffies);
690 spin_unlock_bh(lock);
691}
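
batadv_forw_packet_queue() above refuses to (re)queue a packet once the purging path has claimed it; the claim is signalled through the fake-linked cleanup_list node (hlist_add_fake()/hlist_fake() further up). As a rough, self-contained userspace analogue of that guard, and not the kernel implementation, the same idea can be sketched with an explicit flag and a pthread mutex (all names below are made up for illustration):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for batadv_forw_packet: only the fields needed
 * to show the "stolen" guard are modelled here.
 */
struct fake_forw_packet {
	bool stolen;	/* analogue of hlist_fake(&cleanup_list) */
	bool queued;	/* analogue of being linked on a forw_*_list */
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* analogue of batadv_forw_packet_steal(): claim the packet for freeing */
static void fake_forw_packet_steal(struct fake_forw_packet *p)
{
	pthread_mutex_lock(&list_lock);
	p->queued = false;
	p->stolen = true;	/* from now on, re-queueing must be refused */
	pthread_mutex_unlock(&list_lock);
}

/* analogue of batadv_forw_packet_queue(): only queue if not stolen */
static bool fake_forw_packet_queue(struct fake_forw_packet *p)
{
	bool queued = false;

	pthread_mutex_lock(&list_lock);
	if (!p->stolen) {	/* purge path did not claim it */
		p->queued = true;
		queued = true;
	}
	pthread_mutex_unlock(&list_lock);
	return queued;
}

int main(void)
{
	struct fake_forw_packet pkt = { 0 };

	printf("queue before steal: %d\n", fake_forw_packet_queue(&pkt)); /* 1 */
	fake_forw_packet_steal(&pkt);
	printf("queue after steal:  %d\n", fake_forw_packet_queue(&pkt)); /* 0 */
	return 0;
}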
692
693/**
694 * batadv_forw_packet_bcast_queue - try to queue a broadcast packet
695 * @bat_priv: the bat priv with all the soft interface information
696 * @forw_packet: the forwarding packet to queue
697 * @send_time: timestamp (jiffies) when the packet is to be sent
698 *
699 * This function tries to (re)queue a broadcast packet.
700 *
701 * Caller needs to ensure that forw_packet->delayed_work was initialized.
702 */
703static void
704batadv_forw_packet_bcast_queue(struct batadv_priv *bat_priv,
705 struct batadv_forw_packet *forw_packet,
706 unsigned long send_time)
707{
708 batadv_forw_packet_queue(forw_packet, &bat_priv->forw_bcast_list_lock,
709 &bat_priv->forw_bcast_list, send_time);
710}
711
712/**
713 * batadv_forw_packet_ogmv1_queue - try to queue an OGMv1 packet
714 * @bat_priv: the bat priv with all the soft interface information
715 * @forw_packet: the forwarding packet to queue
716 * @send_time: timestamp (jiffies) when the packet is to be sent
717 *
718 * This function tries to (re)queue an OGMv1 packet.
719 *
720 * Caller needs to ensure that forw_packet->delayed_work was initialized.
721 */
722void batadv_forw_packet_ogmv1_queue(struct batadv_priv *bat_priv,
723 struct batadv_forw_packet *forw_packet,
724 unsigned long send_time)
725{
726 batadv_forw_packet_queue(forw_packet, &bat_priv->forw_bat_list_lock,
727 &bat_priv->forw_bat_list, send_time);
545} 728}
546 729
547/** 730/**
@@ -549,6 +732,7 @@ _batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
549 * @bat_priv: the bat priv with all the soft interface information 732 * @bat_priv: the bat priv with all the soft interface information
550 * @skb: broadcast packet to add 733 * @skb: broadcast packet to add
551 * @delay: number of jiffies to wait before sending 734 * @delay: number of jiffies to wait before sending
735 * @own_packet: true if it is a self-generated broadcast packet
552 * 736 *
553 * add a broadcast packet to the queue and setup timers. broadcast packets 737 * add a broadcast packet to the queue and setup timers. broadcast packets
554 * are sent multiple times to increase probability for being received. 738 * are sent multiple times to increase probability for being received.
@@ -560,9 +744,10 @@ _batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
560 */ 744 */
561int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, 745int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
562 const struct sk_buff *skb, 746 const struct sk_buff *skb,
563 unsigned long delay) 747 unsigned long delay,
748 bool own_packet)
564{ 749{
565 struct batadv_hard_iface *primary_if = NULL; 750 struct batadv_hard_iface *primary_if;
566 struct batadv_forw_packet *forw_packet; 751 struct batadv_forw_packet *forw_packet;
567 struct batadv_bcast_packet *bcast_packet; 752 struct batadv_bcast_packet *bcast_packet;
568 struct sk_buff *newskb; 753 struct sk_buff *newskb;
@@ -586,18 +771,17 @@ int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
586 bcast_packet = (struct batadv_bcast_packet *)newskb->data; 771 bcast_packet = (struct batadv_bcast_packet *)newskb->data;
587 bcast_packet->ttl--; 772 bcast_packet->ttl--;
588 773
589 skb_reset_mac_header(newskb);
590
591 forw_packet->skb = newskb; 774 forw_packet->skb = newskb;
775 forw_packet->own = own_packet;
592 776
593 INIT_DELAYED_WORK(&forw_packet->delayed_work, 777 INIT_DELAYED_WORK(&forw_packet->delayed_work,
594 batadv_send_outstanding_bcast_packet); 778 batadv_send_outstanding_bcast_packet);
595 779
596 _batadv_add_bcast_packet_to_list(bat_priv, forw_packet, delay); 780 batadv_forw_packet_bcast_queue(bat_priv, forw_packet, jiffies + delay);
597 return NETDEV_TX_OK; 781 return NETDEV_TX_OK;
598 782
599err_packet_free: 783err_packet_free:
600 batadv_forw_packet_free(forw_packet); 784 batadv_forw_packet_free(forw_packet, true);
601err: 785err:
602 return NETDEV_TX_BUSY; 786 return NETDEV_TX_BUSY;
603} 787}
@@ -605,11 +789,18 @@ err:
605static void batadv_send_outstanding_bcast_packet(struct work_struct *work) 789static void batadv_send_outstanding_bcast_packet(struct work_struct *work)
606{ 790{
607 struct batadv_hard_iface *hard_iface; 791 struct batadv_hard_iface *hard_iface;
792 struct batadv_hardif_neigh_node *neigh_node;
608 struct delayed_work *delayed_work; 793 struct delayed_work *delayed_work;
609 struct batadv_forw_packet *forw_packet; 794 struct batadv_forw_packet *forw_packet;
795 struct batadv_bcast_packet *bcast_packet;
610 struct sk_buff *skb1; 796 struct sk_buff *skb1;
611 struct net_device *soft_iface; 797 struct net_device *soft_iface;
612 struct batadv_priv *bat_priv; 798 struct batadv_priv *bat_priv;
799 unsigned long send_time = jiffies + msecs_to_jiffies(5);
800 bool dropped = false;
801 u8 *neigh_addr;
802 u8 *orig_neigh;
803 int ret = 0;
613 804
614 delayed_work = to_delayed_work(work); 805 delayed_work = to_delayed_work(work);
615 forw_packet = container_of(delayed_work, struct batadv_forw_packet, 806 forw_packet = container_of(delayed_work, struct batadv_forw_packet,
@@ -617,15 +808,17 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work)
617 soft_iface = forw_packet->if_incoming->soft_iface; 808 soft_iface = forw_packet->if_incoming->soft_iface;
618 bat_priv = netdev_priv(soft_iface); 809 bat_priv = netdev_priv(soft_iface);
619 810
620 spin_lock_bh(&bat_priv->forw_bcast_list_lock); 811 if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) {
621 hlist_del(&forw_packet->list); 812 dropped = true;
622 spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
623
624 if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING)
625 goto out; 813 goto out;
814 }
626 815
627 if (batadv_dat_drop_broadcast_packet(bat_priv, forw_packet)) 816 if (batadv_dat_drop_broadcast_packet(bat_priv, forw_packet)) {
817 dropped = true;
628 goto out; 818 goto out;
819 }
820
821 bcast_packet = (struct batadv_bcast_packet *)forw_packet->skb->data;
629 822
630 /* rebroadcast packet */ 823 /* rebroadcast packet */
631 rcu_read_lock(); 824 rcu_read_lock();
@@ -636,6 +829,49 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work)
636 if (forw_packet->num_packets >= hard_iface->num_bcasts) 829 if (forw_packet->num_packets >= hard_iface->num_bcasts)
637 continue; 830 continue;
638 831
832 if (forw_packet->own) {
833 neigh_node = NULL;
834 } else {
835 neigh_addr = eth_hdr(forw_packet->skb)->h_source;
836 neigh_node = batadv_hardif_neigh_get(hard_iface,
837 neigh_addr);
838 }
839
840 orig_neigh = neigh_node ? neigh_node->orig : NULL;
841
842 ret = batadv_hardif_no_broadcast(hard_iface, bcast_packet->orig,
843 orig_neigh);
844
845 if (ret) {
846 char *type;
847
848 switch (ret) {
849 case BATADV_HARDIF_BCAST_NORECIPIENT:
850 type = "no neighbor";
851 break;
852 case BATADV_HARDIF_BCAST_DUPFWD:
853 type = "single neighbor is source";
854 break;
855 case BATADV_HARDIF_BCAST_DUPORIG:
856 type = "single neighbor is originator";
857 break;
858 default:
859 type = "unknown";
860 }
861
862 batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "BCAST packet from orig %pM on %s suppressed: %s\n",
863 bcast_packet->orig,
864 hard_iface->net_dev->name, type);
865
866 if (neigh_node)
867 batadv_hardif_neigh_put(neigh_node);
868
869 continue;
870 }
871
872 if (neigh_node)
873 batadv_hardif_neigh_put(neigh_node);
874
639 if (!kref_get_unless_zero(&hard_iface->refcount)) 875 if (!kref_get_unless_zero(&hard_iface->refcount))
640 continue; 876 continue;
641 877
@@ -652,22 +888,34 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work)
652 888
653 /* if we still have some more bcasts to send */ 889 /* if we still have some more bcasts to send */
654 if (forw_packet->num_packets < BATADV_NUM_BCASTS_MAX) { 890 if (forw_packet->num_packets < BATADV_NUM_BCASTS_MAX) {
655 _batadv_add_bcast_packet_to_list(bat_priv, forw_packet, 891 batadv_forw_packet_bcast_queue(bat_priv, forw_packet,
656 msecs_to_jiffies(5)); 892 send_time);
657 return; 893 return;
658 } 894 }
659 895
660out: 896out:
661 batadv_forw_packet_free(forw_packet); 897 /* do we get something for free()? */
898 if (batadv_forw_packet_steal(forw_packet,
899 &bat_priv->forw_bcast_list_lock))
900 batadv_forw_packet_free(forw_packet, dropped);
662} 901}
663 902
903/**
904 * batadv_purge_outstanding_packets - stop/purge scheduled bcast/OGMv1 packets
905 * @bat_priv: the bat priv with all the soft interface information
906 * @hard_iface: the hard interface to cancel and purge bcast/ogm packets on
907 *
908 * This method cancels and purges any broadcast and OGMv1 packet on the given
909 * hard_iface. If hard_iface is NULL, broadcast and OGMv1 packets on all hard
910 * interfaces will be canceled and purged.
911 *
912 * This function might sleep.
913 */
664void 914void
665batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, 915batadv_purge_outstanding_packets(struct batadv_priv *bat_priv,
666 const struct batadv_hard_iface *hard_iface) 916 const struct batadv_hard_iface *hard_iface)
667{ 917{
668 struct batadv_forw_packet *forw_packet; 918 struct hlist_head head = HLIST_HEAD_INIT;
669 struct hlist_node *safe_tmp_node;
670 bool pending;
671 919
672 if (hard_iface) 920 if (hard_iface)
673 batadv_dbg(BATADV_DBG_BATMAN, bat_priv, 921 batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
@@ -677,57 +925,18 @@ batadv_purge_outstanding_packets(struct batadv_priv *bat_priv,
677 batadv_dbg(BATADV_DBG_BATMAN, bat_priv, 925 batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
678 "purge_outstanding_packets()\n"); 926 "purge_outstanding_packets()\n");
679 927
680 /* free bcast list */ 928 /* claim bcast list for free() */
681 spin_lock_bh(&bat_priv->forw_bcast_list_lock); 929 spin_lock_bh(&bat_priv->forw_bcast_list_lock);
682 hlist_for_each_entry_safe(forw_packet, safe_tmp_node, 930 batadv_forw_packet_list_steal(&bat_priv->forw_bcast_list, &head,
683 &bat_priv->forw_bcast_list, list) { 931 hard_iface);
684 /* if purge_outstanding_packets() was called with an argument
685 * we delete only packets belonging to the given interface
686 */
687 if ((hard_iface) &&
688 (forw_packet->if_incoming != hard_iface) &&
689 (forw_packet->if_outgoing != hard_iface))
690 continue;
691
692 spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
693
694 /* batadv_send_outstanding_bcast_packet() will lock the list to
695 * delete the item from the list
696 */
697 pending = cancel_delayed_work_sync(&forw_packet->delayed_work);
698 spin_lock_bh(&bat_priv->forw_bcast_list_lock);
699
700 if (pending) {
701 hlist_del(&forw_packet->list);
702 batadv_forw_packet_free(forw_packet);
703 }
704 }
705 spin_unlock_bh(&bat_priv->forw_bcast_list_lock); 932 spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
706 933
707 /* free batman packet list */ 934 /* claim batman packet list for free() */
708 spin_lock_bh(&bat_priv->forw_bat_list_lock); 935 spin_lock_bh(&bat_priv->forw_bat_list_lock);
709 hlist_for_each_entry_safe(forw_packet, safe_tmp_node, 936 batadv_forw_packet_list_steal(&bat_priv->forw_bat_list, &head,
710 &bat_priv->forw_bat_list, list) { 937 hard_iface);
711 /* if purge_outstanding_packets() was called with an argument
712 * we delete only packets belonging to the given interface
713 */
714 if ((hard_iface) &&
715 (forw_packet->if_incoming != hard_iface) &&
716 (forw_packet->if_outgoing != hard_iface))
717 continue;
718
719 spin_unlock_bh(&bat_priv->forw_bat_list_lock);
720
721 /* send_outstanding_bat_packet() will lock the list to
722 * delete the item from the list
723 */
724 pending = cancel_delayed_work_sync(&forw_packet->delayed_work);
725 spin_lock_bh(&bat_priv->forw_bat_list_lock);
726
727 if (pending) {
728 hlist_del(&forw_packet->list);
729 batadv_forw_packet_free(forw_packet);
730 }
731 }
732 spin_unlock_bh(&bat_priv->forw_bat_list_lock); 938 spin_unlock_bh(&bat_priv->forw_bat_list_lock);
939
940 /* then cancel or wait for packet workers to finish and free */
941 batadv_forw_packet_list_free(&head);
733} 942}
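
The rewritten purge path splits the work in two phases: batadv_forw_packet_list_steal() claims the queued packets while the spinlock is held, and batadv_forw_packet_list_free() then cancels the delayed work and frees them with no lock held, since cancel_delayed_work_sync() may sleep (hence the "This function might sleep" notes above). A simplified, self-contained userspace analogue of this claim-then-wait pattern, with a plain thread standing in for the delayed work (all names are illustrative):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* one pending "forward packet" with its own worker thread */
struct pending {
	pthread_t worker;
	struct pending *next;
};

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static struct pending *queue_head;

static void *send_worker(void *arg)
{
	(void)arg;
	usleep(1000);	/* pretend to transmit */
	return NULL;
}

static void enqueue(struct pending *p)
{
	pthread_create(&p->worker, NULL, send_worker, p);

	pthread_mutex_lock(&queue_lock);
	p->next = queue_head;
	queue_head = p;
	pthread_mutex_unlock(&queue_lock);
}

/* phase 1: steal the whole list while holding the lock (cheap, never sleeps) */
static struct pending *steal_all(void)
{
	struct pending *stolen;

	pthread_mutex_lock(&queue_lock);
	stolen = queue_head;
	queue_head = NULL;
	pthread_mutex_unlock(&queue_lock);
	return stolen;
}

/* phase 2: wait for the workers and free, with no lock held (may block) */
static void free_all(struct pending *list)
{
	while (list) {
		struct pending *next = list->next;

		pthread_join(list->worker, NULL); /* analogue of cancel_delayed_work_sync() */
		free(list);
		list = next;
	}
}

int main(void)
{
	for (int i = 0; i < 4; i++)
		enqueue(calloc(1, sizeof(struct pending)));

	free_all(steal_all());
	puts("purged");
	return 0;
}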
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index 999f78683d9e..f21166d10323 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
@@ -21,18 +21,24 @@
21#include "main.h" 21#include "main.h"
22 22
23#include <linux/compiler.h> 23#include <linux/compiler.h>
24#include <linux/spinlock.h>
24#include <linux/types.h> 25#include <linux/types.h>
25 26
26#include "packet.h" 27#include "packet.h"
27 28
28struct sk_buff; 29struct sk_buff;
29 30
30void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet); 31void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet,
32 bool dropped);
31struct batadv_forw_packet * 33struct batadv_forw_packet *
32batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming, 34batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming,
33 struct batadv_hard_iface *if_outgoing, 35 struct batadv_hard_iface *if_outgoing,
34 atomic_t *queue_left, 36 atomic_t *queue_left,
35 struct batadv_priv *bat_priv); 37 struct batadv_priv *bat_priv);
38bool batadv_forw_packet_steal(struct batadv_forw_packet *packet, spinlock_t *l);
39void batadv_forw_packet_ogmv1_queue(struct batadv_priv *bat_priv,
40 struct batadv_forw_packet *forw_packet,
41 unsigned long send_time);
36 42
37int batadv_send_skb_to_orig(struct sk_buff *skb, 43int batadv_send_skb_to_orig(struct sk_buff *skb,
38 struct batadv_orig_node *orig_node, 44 struct batadv_orig_node *orig_node,
@@ -46,7 +52,8 @@ int batadv_send_unicast_skb(struct sk_buff *skb,
46 struct batadv_neigh_node *neigh_node); 52 struct batadv_neigh_node *neigh_node);
47int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, 53int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
48 const struct sk_buff *skb, 54 const struct sk_buff *skb,
49 unsigned long delay); 55 unsigned long delay,
56 bool own_packet);
50void 57void
51batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, 58batadv_purge_outstanding_packets(struct batadv_priv *bat_priv,
52 const struct batadv_hard_iface *hard_iface); 59 const struct batadv_hard_iface *hard_iface);
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 49e16b6e0ba3..d042c99af028 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
@@ -22,6 +22,7 @@
22#include <linux/byteorder/generic.h> 22#include <linux/byteorder/generic.h>
23#include <linux/cache.h> 23#include <linux/cache.h>
24#include <linux/compiler.h> 24#include <linux/compiler.h>
25#include <linux/cpumask.h>
25#include <linux/errno.h> 26#include <linux/errno.h>
26#include <linux/etherdevice.h> 27#include <linux/etherdevice.h>
27#include <linux/ethtool.h> 28#include <linux/ethtool.h>
@@ -116,6 +117,26 @@ static int batadv_interface_release(struct net_device *dev)
116 return 0; 117 return 0;
117} 118}
118 119
120/**
121 * batadv_sum_counter - Sum the cpu-local counters for index 'idx'
122 * @bat_priv: the bat priv with all the soft interface information
123 * @idx: index of counter to sum up
124 *
125 * Return: sum of all cpu-local counters
126 */
127static u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx)
128{
129 u64 *counters, sum = 0;
130 int cpu;
131
132 for_each_possible_cpu(cpu) {
133 counters = per_cpu_ptr(bat_priv->bat_counters, cpu);
134 sum += counters[idx];
135 }
136
137 return sum;
138}
139
119static struct net_device_stats *batadv_interface_stats(struct net_device *dev) 140static struct net_device_stats *batadv_interface_stats(struct net_device *dev)
120{ 141{
121 struct batadv_priv *bat_priv = netdev_priv(dev); 142 struct batadv_priv *bat_priv = netdev_priv(dev);
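
batadv_sum_counter() above folds the per-CPU copies of one statistics counter into a single value; keeping a private counter block per CPU makes increments cheap and pushes the cost to the (rare) read side. A minimal userspace sketch of the same summation, with a fixed array standing in for per_cpu_ptr() (names and sizes are illustrative):

#include <stdio.h>
#include <stdint.h>

#define NUM_CPUS     4	/* stand-in for the set of possible CPUs */
#define NUM_COUNTERS 2	/* e.g. one slot per BATADV_CNT_* index */

/* one private counter block per CPU, as handed out by per_cpu_ptr() */
static uint64_t counters[NUM_CPUS][NUM_COUNTERS];

/* mirrors the for_each_possible_cpu() summation loop above */
static uint64_t sum_counter(size_t idx)
{
	uint64_t sum = 0;

	for (int cpu = 0; cpu < NUM_CPUS; cpu++)
		sum += counters[cpu][idx];
	return sum;
}

int main(void)
{
	counters[0][0] = 10;	/* e.g. CPU 0 counted 10 packets */
	counters[3][0] = 5;	/* CPU 3 counted 5 more */
	printf("counter 0 = %llu\n", (unsigned long long)sum_counter(0));
	return 0;
}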
@@ -237,7 +258,8 @@ static int batadv_interface_tx(struct sk_buff *skb,
237 ethhdr = eth_hdr(skb); 258 ethhdr = eth_hdr(skb);
238 259
239 /* Register the client MAC in the transtable */ 260 /* Register the client MAC in the transtable */
240 if (!is_multicast_ether_addr(ethhdr->h_source)) { 261 if (!is_multicast_ether_addr(ethhdr->h_source) &&
262 !batadv_bla_is_loopdetect_mac(ethhdr->h_source)) {
241 client_added = batadv_tt_local_add(soft_iface, ethhdr->h_source, 263 client_added = batadv_tt_local_add(soft_iface, ethhdr->h_source,
242 vid, skb->skb_iif, 264 vid, skb->skb_iif,
243 skb->mark); 265 skb->mark);
@@ -336,12 +358,12 @@ send:
336 seqno = atomic_inc_return(&bat_priv->bcast_seqno); 358 seqno = atomic_inc_return(&bat_priv->bcast_seqno);
337 bcast_packet->seqno = htonl(seqno); 359 bcast_packet->seqno = htonl(seqno);
338 360
339 batadv_add_bcast_packet_to_list(bat_priv, skb, brd_delay); 361 batadv_add_bcast_packet_to_list(bat_priv, skb, brd_delay, true);
340 362
341 /* a copy is stored in the bcast list, therefore removing 363 /* a copy is stored in the bcast list, therefore removing
342 * the original skb. 364 * the original skb.
343 */ 365 */
344 kfree_skb(skb); 366 consume_skb(skb);
345 367
346 /* unicast packet */ 368 /* unicast packet */
347 } else { 369 } else {
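
Replacing kfree_skb() with consume_skb() for the successfully queued broadcast follows the usual networking convention: kfree_skb() accounts the buffer as dropped (visible to drop monitoring), while consume_skb() frees a buffer that was handled normally. A tiny userspace analogue of why a reason-aware free helps instrumentation (assumed convention, illustrative only):

#include <stdio.h>
#include <stdlib.h>

static unsigned long dropped_count;

/* analogue of kfree_skb(): freeing because the packet was dropped */
static void drop_buf(void *buf)
{
	dropped_count++;	/* drop monitors hook in here in the kernel */
	free(buf);
}

/* analogue of consume_skb(): freeing after successful handling */
static void consume_buf(void *buf)
{
	free(buf);		/* no drop accounting */
}

int main(void)
{
	consume_buf(malloc(16));	/* original skb consumed after queueing a copy */
	drop_buf(malloc(16));		/* an actual error path */
	printf("drops: %lu\n", dropped_count);
	return 0;
}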
@@ -365,7 +387,7 @@ send:
365 ret = batadv_send_skb_via_tt(bat_priv, skb, dst_hint, 387 ret = batadv_send_skb_via_tt(bat_priv, skb, dst_hint,
366 vid); 388 vid);
367 } 389 }
368 if (ret == NET_XMIT_DROP) 390 if (ret != NET_XMIT_SUCCESS)
369 goto dropped_freed; 391 goto dropped_freed;
370 } 392 }
371 393
@@ -460,8 +482,6 @@ void batadv_interface_rx(struct net_device *soft_iface,
460 batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, 482 batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
461 skb->len + ETH_HLEN); 483 skb->len + ETH_HLEN);
462 484
463 soft_iface->last_rx = jiffies;
464
465 /* Let the bridge loop avoidance check the packet. If will 485 /* Let the bridge loop avoidance check the packet. If will
466 * not handle it, we can safely push it up. 486 * not handle it, we can safely push it up.
467 */ 487 */
@@ -799,7 +819,6 @@ static int batadv_softif_init_late(struct net_device *dev)
799 atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0); 819 atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0);
800#endif 820#endif
801 atomic_set(&bat_priv->gw.mode, BATADV_GW_MODE_OFF); 821 atomic_set(&bat_priv->gw.mode, BATADV_GW_MODE_OFF);
802 atomic_set(&bat_priv->gw.sel_class, 20);
803 atomic_set(&bat_priv->gw.bandwidth_down, 100); 822 atomic_set(&bat_priv->gw.bandwidth_down, 100);
804 atomic_set(&bat_priv->gw.bandwidth_up, 20); 823 atomic_set(&bat_priv->gw.bandwidth_up, 20);
805 atomic_set(&bat_priv->orig_interval, 1000); 824 atomic_set(&bat_priv->orig_interval, 1000);
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
index ec303ddbf647..639c3abb214a 100644
--- a/net/batman-adv/soft-interface.h
+++ b/net/batman-adv/soft-interface.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index 02d96f224c60..0ae8b30e4eaa 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
@@ -33,7 +33,6 @@
33#include <linux/rcupdate.h> 33#include <linux/rcupdate.h>
34#include <linux/rtnetlink.h> 34#include <linux/rtnetlink.h>
35#include <linux/slab.h> 35#include <linux/slab.h>
36#include <linux/stat.h>
37#include <linux/stddef.h> 36#include <linux/stddef.h>
38#include <linux/string.h> 37#include <linux/string.h>
39#include <linux/stringify.h> 38#include <linux/stringify.h>
@@ -666,41 +665,36 @@ static ssize_t batadv_store_isolation_mark(struct kobject *kobj,
666 return count; 665 return count;
667} 666}
668 667
669BATADV_ATTR_SIF_BOOL(aggregated_ogms, S_IRUGO | S_IWUSR, NULL); 668BATADV_ATTR_SIF_BOOL(aggregated_ogms, 0644, NULL);
670BATADV_ATTR_SIF_BOOL(bonding, S_IRUGO | S_IWUSR, NULL); 669BATADV_ATTR_SIF_BOOL(bonding, 0644, NULL);
671#ifdef CONFIG_BATMAN_ADV_BLA 670#ifdef CONFIG_BATMAN_ADV_BLA
672BATADV_ATTR_SIF_BOOL(bridge_loop_avoidance, S_IRUGO | S_IWUSR, 671BATADV_ATTR_SIF_BOOL(bridge_loop_avoidance, 0644, batadv_bla_status_update);
673 batadv_bla_status_update);
674#endif 672#endif
675#ifdef CONFIG_BATMAN_ADV_DAT 673#ifdef CONFIG_BATMAN_ADV_DAT
676BATADV_ATTR_SIF_BOOL(distributed_arp_table, S_IRUGO | S_IWUSR, 674BATADV_ATTR_SIF_BOOL(distributed_arp_table, 0644, batadv_dat_status_update);
677 batadv_dat_status_update);
678#endif 675#endif
679BATADV_ATTR_SIF_BOOL(fragmentation, S_IRUGO | S_IWUSR, batadv_update_min_mtu); 676BATADV_ATTR_SIF_BOOL(fragmentation, 0644, batadv_update_min_mtu);
680static BATADV_ATTR(routing_algo, S_IRUGO, batadv_show_bat_algo, NULL); 677static BATADV_ATTR(routing_algo, 0444, batadv_show_bat_algo, NULL);
681static BATADV_ATTR(gw_mode, S_IRUGO | S_IWUSR, batadv_show_gw_mode, 678static BATADV_ATTR(gw_mode, 0644, batadv_show_gw_mode, batadv_store_gw_mode);
682 batadv_store_gw_mode); 679BATADV_ATTR_SIF_UINT(orig_interval, orig_interval, 0644, 2 * BATADV_JITTER,
683BATADV_ATTR_SIF_UINT(orig_interval, orig_interval, S_IRUGO | S_IWUSR, 680 INT_MAX, NULL);
684 2 * BATADV_JITTER, INT_MAX, NULL); 681BATADV_ATTR_SIF_UINT(hop_penalty, hop_penalty, 0644, 0, BATADV_TQ_MAX_VALUE,
685BATADV_ATTR_SIF_UINT(hop_penalty, hop_penalty, S_IRUGO | S_IWUSR, 0, 682 NULL);
686 BATADV_TQ_MAX_VALUE, NULL); 683static BATADV_ATTR(gw_sel_class, 0644, batadv_show_gw_sel_class,
687static BATADV_ATTR(gw_sel_class, S_IRUGO | S_IWUSR, batadv_show_gw_sel_class,
688 batadv_store_gw_sel_class); 684 batadv_store_gw_sel_class);
689static BATADV_ATTR(gw_bandwidth, S_IRUGO | S_IWUSR, batadv_show_gw_bwidth, 685static BATADV_ATTR(gw_bandwidth, 0644, batadv_show_gw_bwidth,
690 batadv_store_gw_bwidth); 686 batadv_store_gw_bwidth);
691#ifdef CONFIG_BATMAN_ADV_MCAST 687#ifdef CONFIG_BATMAN_ADV_MCAST
692BATADV_ATTR_SIF_BOOL(multicast_mode, S_IRUGO | S_IWUSR, NULL); 688BATADV_ATTR_SIF_BOOL(multicast_mode, 0644, NULL);
693#endif 689#endif
694#ifdef CONFIG_BATMAN_ADV_DEBUG 690#ifdef CONFIG_BATMAN_ADV_DEBUG
695BATADV_ATTR_SIF_UINT(log_level, log_level, S_IRUGO | S_IWUSR, 0, 691BATADV_ATTR_SIF_UINT(log_level, log_level, 0644, 0, BATADV_DBG_ALL, NULL);
696 BATADV_DBG_ALL, NULL);
697#endif 692#endif
698#ifdef CONFIG_BATMAN_ADV_NC 693#ifdef CONFIG_BATMAN_ADV_NC
699BATADV_ATTR_SIF_BOOL(network_coding, S_IRUGO | S_IWUSR, 694BATADV_ATTR_SIF_BOOL(network_coding, 0644, batadv_nc_status_update);
700 batadv_nc_status_update);
701#endif 695#endif
702static BATADV_ATTR(isolation_mark, S_IRUGO | S_IWUSR, 696static BATADV_ATTR(isolation_mark, 0644, batadv_show_isolation_mark,
703 batadv_show_isolation_mark, batadv_store_isolation_mark); 697 batadv_store_isolation_mark);
704 698
705static struct batadv_attribute *batadv_mesh_attrs[] = { 699static struct batadv_attribute *batadv_mesh_attrs[] = {
706 &batadv_attr_aggregated_ogms, 700 &batadv_attr_aggregated_ogms,
@@ -731,7 +725,7 @@ static struct batadv_attribute *batadv_mesh_attrs[] = {
731 NULL, 725 NULL,
732}; 726};
733 727
734BATADV_ATTR_VLAN_BOOL(ap_isolation, S_IRUGO | S_IWUSR, NULL); 728BATADV_ATTR_VLAN_BOOL(ap_isolation, 0644, NULL);
735 729
736/* array of vlan specific sysfs attributes */ 730/* array of vlan specific sysfs attributes */
737static struct batadv_attribute *batadv_vlan_attrs[] = { 731static struct batadv_attribute *batadv_vlan_attrs[] = {
@@ -1116,14 +1110,13 @@ static ssize_t batadv_show_throughput_override(struct kobject *kobj,
1116 1110
1117#endif 1111#endif
1118 1112
1119static BATADV_ATTR(mesh_iface, S_IRUGO | S_IWUSR, batadv_show_mesh_iface, 1113static BATADV_ATTR(mesh_iface, 0644, batadv_show_mesh_iface,
1120 batadv_store_mesh_iface); 1114 batadv_store_mesh_iface);
1121static BATADV_ATTR(iface_status, S_IRUGO, batadv_show_iface_status, NULL); 1115static BATADV_ATTR(iface_status, 0444, batadv_show_iface_status, NULL);
1122#ifdef CONFIG_BATMAN_ADV_BATMAN_V 1116#ifdef CONFIG_BATMAN_ADV_BATMAN_V
1123BATADV_ATTR_HIF_UINT(elp_interval, bat_v.elp_interval, S_IRUGO | S_IWUSR, 1117BATADV_ATTR_HIF_UINT(elp_interval, bat_v.elp_interval, 0644,
1124 2 * BATADV_JITTER, INT_MAX, NULL); 1118 2 * BATADV_JITTER, INT_MAX, NULL);
1125static BATADV_ATTR(throughput_override, S_IRUGO | S_IWUSR, 1119static BATADV_ATTR(throughput_override, 0644, batadv_show_throughput_override,
1126 batadv_show_throughput_override,
1127 batadv_store_throughput_override); 1120 batadv_store_throughput_override);
1128#endif 1121#endif
1129 1122
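
The sysfs hunk above swaps the symbolic S_I* permission macros for the equivalent octal literals. The values do not change; a quick userspace check of the equivalences (illustrative only, with S_IRUGO spelled out since it is kernel-internal):

#include <stdio.h>
#include <sys/stat.h>

/* S_IRUGO is a kernel-only shorthand; spell it out with the POSIX macros */
#define S_IRUGO (S_IRUSR | S_IRGRP | S_IROTH)

int main(void)
{
	printf("S_IRUGO | S_IWUSR = %04o\n", (unsigned)(S_IRUGO | S_IWUSR)); /* 0644 */
	printf("S_IRUGO           = %04o\n", (unsigned)S_IRUGO);             /* 0444 */
	return 0;
}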
diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h
index c76021b4e198..e487412e256b 100644
--- a/net/batman-adv/sysfs.h
+++ b/net/batman-adv/sysfs.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner 3 * Marek Lindner
4 * 4 *
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 8af1611b8ab2..c94ebdecdc3d 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Edo Monticelli, Antonio Quartulli 3 * Edo Monticelli, Antonio Quartulli
4 * 4 *
@@ -23,7 +23,7 @@
23#include <linux/byteorder/generic.h> 23#include <linux/byteorder/generic.h>
24#include <linux/cache.h> 24#include <linux/cache.h>
25#include <linux/compiler.h> 25#include <linux/compiler.h>
26#include <linux/device.h> 26#include <linux/err.h>
27#include <linux/etherdevice.h> 27#include <linux/etherdevice.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/if_ether.h> 29#include <linux/if_ether.h>
@@ -615,9 +615,6 @@ static int batadv_tp_send_msg(struct batadv_tp_vars *tp_vars, const u8 *src,
615 batadv_tp_fill_prerandom(tp_vars, data, data_len); 615 batadv_tp_fill_prerandom(tp_vars, data, data_len);
616 616
617 r = batadv_send_skb_to_orig(skb, orig_node, NULL); 617 r = batadv_send_skb_to_orig(skb, orig_node, NULL);
618 if (r == -1)
619 kfree_skb(skb);
620
621 if (r == NET_XMIT_SUCCESS) 618 if (r == NET_XMIT_SUCCESS)
622 return 0; 619 return 0;
623 620
@@ -1207,9 +1204,6 @@ static int batadv_tp_send_ack(struct batadv_priv *bat_priv, const u8 *dst,
1207 1204
1208 /* send the ack */ 1205 /* send the ack */
1209 r = batadv_send_skb_to_orig(skb, orig_node, NULL); 1206 r = batadv_send_skb_to_orig(skb, orig_node, NULL);
1210 if (r == -1)
1211 kfree_skb(skb);
1212
1213 if (unlikely(r < 0) || (r == NET_XMIT_DROP)) { 1207 if (unlikely(r < 0) || (r == NET_XMIT_DROP)) {
1214 ret = BATADV_TP_REASON_DST_UNREACHABLE; 1208 ret = BATADV_TP_REASON_DST_UNREACHABLE;
1215 goto out; 1209 goto out;
diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h
index ba922c425e56..a8ada5c123bd 100644
--- a/net/batman-adv/tp_meter.h
+++ b/net/batman-adv/tp_meter.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Edo Monticelli, Antonio Quartulli 3 * Edo Monticelli, Antonio Quartulli
4 * 4 *
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 0dc85eb1cb7a..6077a87d46f0 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich, Antonio Quartulli 3 * Marek Lindner, Simon Wunderlich, Antonio Quartulli
4 * 4 *
@@ -56,7 +56,6 @@
56#include "hard-interface.h" 56#include "hard-interface.h"
57#include "hash.h" 57#include "hash.h"
58#include "log.h" 58#include "log.h"
59#include "multicast.h"
60#include "netlink.h" 59#include "netlink.h"
61#include "originator.h" 60#include "originator.h"
62#include "packet.h" 61#include "packet.h"
@@ -647,6 +646,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
647 struct net *net = dev_net(soft_iface); 646 struct net *net = dev_net(soft_iface);
648 struct batadv_softif_vlan *vlan; 647 struct batadv_softif_vlan *vlan;
649 struct net_device *in_dev = NULL; 648 struct net_device *in_dev = NULL;
649 struct batadv_hard_iface *in_hardif = NULL;
650 struct hlist_head *head; 650 struct hlist_head *head;
651 struct batadv_tt_orig_list_entry *orig_entry; 651 struct batadv_tt_orig_list_entry *orig_entry;
652 int hash_added, table_size, packet_size_max; 652 int hash_added, table_size, packet_size_max;
@@ -658,6 +658,9 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
658 if (ifindex != BATADV_NULL_IFINDEX) 658 if (ifindex != BATADV_NULL_IFINDEX)
659 in_dev = dev_get_by_index(net, ifindex); 659 in_dev = dev_get_by_index(net, ifindex);
660 660
661 if (in_dev)
662 in_hardif = batadv_hardif_get_by_netdev(in_dev);
663
661 tt_local = batadv_tt_local_hash_find(bat_priv, addr, vid); 664 tt_local = batadv_tt_local_hash_find(bat_priv, addr, vid);
662 665
663 if (!is_multicast_ether_addr(addr)) 666 if (!is_multicast_ether_addr(addr))
@@ -731,7 +734,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
731 */ 734 */
732 tt_local->common.flags = BATADV_TT_CLIENT_NEW; 735 tt_local->common.flags = BATADV_TT_CLIENT_NEW;
733 tt_local->common.vid = vid; 736 tt_local->common.vid = vid;
734 if (batadv_is_wifi_netdev(in_dev)) 737 if (batadv_is_wifi_hardif(in_hardif))
735 tt_local->common.flags |= BATADV_TT_CLIENT_WIFI; 738 tt_local->common.flags |= BATADV_TT_CLIENT_WIFI;
736 kref_init(&tt_local->common.refcount); 739 kref_init(&tt_local->common.refcount);
737 tt_local->last_seen = jiffies; 740 tt_local->last_seen = jiffies;
@@ -791,7 +794,7 @@ check_roaming:
791 */ 794 */
792 remote_flags = tt_local->common.flags & BATADV_TT_REMOTE_MASK; 795 remote_flags = tt_local->common.flags & BATADV_TT_REMOTE_MASK;
793 796
794 if (batadv_is_wifi_netdev(in_dev)) 797 if (batadv_is_wifi_hardif(in_hardif))
795 tt_local->common.flags |= BATADV_TT_CLIENT_WIFI; 798 tt_local->common.flags |= BATADV_TT_CLIENT_WIFI;
796 else 799 else
797 tt_local->common.flags &= ~BATADV_TT_CLIENT_WIFI; 800 tt_local->common.flags &= ~BATADV_TT_CLIENT_WIFI;
@@ -815,6 +818,8 @@ check_roaming:
815 818
816 ret = true; 819 ret = true;
817out: 820out:
821 if (in_hardif)
822 batadv_hardif_put(in_hardif);
818 if (in_dev) 823 if (in_dev)
819 dev_put(in_dev); 824 dev_put(in_dev);
820 if (tt_local) 825 if (tt_local)
@@ -3709,7 +3714,6 @@ static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv, u16 flags,
3709{ 3714{
3710 struct batadv_hashtable *hash = bat_priv->tt.local_hash; 3715 struct batadv_hashtable *hash = bat_priv->tt.local_hash;
3711 struct batadv_tt_common_entry *tt_common_entry; 3716 struct batadv_tt_common_entry *tt_common_entry;
3712 u16 changed_num = 0;
3713 struct hlist_head *head; 3717 struct hlist_head *head;
3714 u32 i; 3718 u32 i;
3715 3719
@@ -3731,7 +3735,6 @@ static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv, u16 flags,
3731 continue; 3735 continue;
3732 tt_common_entry->flags &= ~flags; 3736 tt_common_entry->flags &= ~flags;
3733 } 3737 }
3734 changed_num++;
3735 3738
3736 if (!count) 3739 if (!count)
3737 continue; 3740 continue;
@@ -3795,9 +3798,6 @@ static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv)
3795{ 3798{
3796 lockdep_assert_held(&bat_priv->tt.commit_lock); 3799 lockdep_assert_held(&bat_priv->tt.commit_lock);
3797 3800
3798 /* Update multicast addresses in local translation table */
3799 batadv_mcast_mla_update(bat_priv);
3800
3801 if (atomic_read(&bat_priv->tt.local_changes) < 1) { 3801 if (atomic_read(&bat_priv->tt.local_changes) < 1) {
3802 if (!batadv_atomic_dec_not_zero(&bat_priv->tt.ogm_append_cnt)) 3802 if (!batadv_atomic_dec_not_zero(&bat_priv->tt.ogm_append_cnt))
3803 batadv_tt_tvlv_container_update(bat_priv); 3803 batadv_tt_tvlv_container_update(bat_priv);
@@ -3835,8 +3835,8 @@ void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv)
3835bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst, 3835bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst,
3836 unsigned short vid) 3836 unsigned short vid)
3837{ 3837{
3838 struct batadv_tt_local_entry *tt_local_entry = NULL; 3838 struct batadv_tt_local_entry *tt_local_entry;
3839 struct batadv_tt_global_entry *tt_global_entry = NULL; 3839 struct batadv_tt_global_entry *tt_global_entry;
3840 struct batadv_softif_vlan *vlan; 3840 struct batadv_softif_vlan *vlan;
3841 bool ret = false; 3841 bool ret = false;
3842 3842
@@ -3845,27 +3845,24 @@ bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst,
3845 return false; 3845 return false;
3846 3846
3847 if (!atomic_read(&vlan->ap_isolation)) 3847 if (!atomic_read(&vlan->ap_isolation))
3848 goto out; 3848 goto vlan_put;
3849 3849
3850 tt_local_entry = batadv_tt_local_hash_find(bat_priv, dst, vid); 3850 tt_local_entry = batadv_tt_local_hash_find(bat_priv, dst, vid);
3851 if (!tt_local_entry) 3851 if (!tt_local_entry)
3852 goto out; 3852 goto vlan_put;
3853 3853
3854 tt_global_entry = batadv_tt_global_hash_find(bat_priv, src, vid); 3854 tt_global_entry = batadv_tt_global_hash_find(bat_priv, src, vid);
3855 if (!tt_global_entry) 3855 if (!tt_global_entry)
3856 goto out; 3856 goto local_entry_put;
3857 3857
3858 if (!_batadv_is_ap_isolated(tt_local_entry, tt_global_entry)) 3858 if (_batadv_is_ap_isolated(tt_local_entry, tt_global_entry))
3859 goto out; 3859 ret = true;
3860
3861 ret = true;
3862 3860
3863out: 3861 batadv_tt_global_entry_put(tt_global_entry);
3862local_entry_put:
3863 batadv_tt_local_entry_put(tt_local_entry);
3864vlan_put:
3864 batadv_softif_vlan_put(vlan); 3865 batadv_softif_vlan_put(vlan);
3865 if (tt_global_entry)
3866 batadv_tt_global_entry_put(tt_global_entry);
3867 if (tt_local_entry)
3868 batadv_tt_local_entry_put(tt_local_entry);
3869 return ret; 3866 return ret;
3870} 3867}
3871 3868
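
The batadv_is_ap_isolated() cleanup above replaces NULL-initialised pointers and conditional puts with one label per held reference, released in reverse order of acquisition. A small self-contained sketch of that error-unwind idiom (resources and names are invented for illustration):

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

/* dummy "references" standing in for the vlan / tt_local / tt_global lookups */
static void *acquire(const char *name) { printf("get %s\n", name); return malloc(1); }
static void release(void *res, const char *name) { printf("put %s\n", name); free(res); }

static bool check_isolated(bool have_local, bool have_global)
{
	bool ret = false;
	void *vlan, *local, *global;

	vlan = acquire("vlan");
	if (!vlan)
		return false;

	local = have_local ? acquire("local") : NULL;
	if (!local)
		goto vlan_put;		/* only the vlan reference is held */

	global = have_global ? acquire("global") : NULL;
	if (!global)
		goto local_put;		/* vlan + local are held */

	ret = true;			/* all lookups succeeded */

	release(global, "global");
local_put:
	release(local, "local");
vlan_put:
	release(vlan, "vlan");
	return ret;
}

int main(void)
{
	check_isolated(true, false);	/* exercises the partial-cleanup path */
	return 0;
}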
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
index 783fdba84db2..411d586191da 100644
--- a/net/batman-adv/translation-table.h
+++ b/net/batman-adv/translation-table.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich, Antonio Quartulli 3 * Marek Lindner, Simon Wunderlich, Antonio Quartulli
4 * 4 *
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
index 77654f055f24..1d9e267caec9 100644
--- a/net/batman-adv/tvlv.c
+++ b/net/batman-adv/tvlv.c
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
@@ -600,7 +600,6 @@ void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src,
600 unsigned char *tvlv_buff; 600 unsigned char *tvlv_buff;
601 unsigned int tvlv_len; 601 unsigned int tvlv_len;
602 ssize_t hdr_len = sizeof(*unicast_tvlv_packet); 602 ssize_t hdr_len = sizeof(*unicast_tvlv_packet);
603 int res;
604 603
605 orig_node = batadv_orig_hash_find(bat_priv, dst); 604 orig_node = batadv_orig_hash_find(bat_priv, dst);
606 if (!orig_node) 605 if (!orig_node)
@@ -633,9 +632,7 @@ void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src,
633 tvlv_buff += sizeof(*tvlv_hdr); 632 tvlv_buff += sizeof(*tvlv_hdr);
634 memcpy(tvlv_buff, tvlv_value, tvlv_value_len); 633 memcpy(tvlv_buff, tvlv_value, tvlv_value_len);
635 634
636 res = batadv_send_skb_to_orig(skb, orig_node, NULL); 635 batadv_send_skb_to_orig(skb, orig_node, NULL);
637 if (res == -1)
638 kfree_skb(skb);
639out: 636out:
640 batadv_orig_node_put(orig_node); 637 batadv_orig_node_put(orig_node);
641} 638}
diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h
index e4369b547b43..4d01400ada30 100644
--- a/net/batman-adv/tvlv.h
+++ b/net/batman-adv/tvlv.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index b3dd1a381aad..246f21b4973b 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1,4 +1,4 @@
1/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: 1/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors:
2 * 2 *
3 * Marek Lindner, Simon Wunderlich 3 * Marek Lindner, Simon Wunderlich
4 * 4 *
@@ -119,12 +119,28 @@ struct batadv_hard_iface_bat_v {
119}; 119};
120 120
121/** 121/**
122 * enum batadv_hard_iface_wifi_flags - Flags describing the wifi configuration
123 * of a batadv_hard_iface
124 * @BATADV_HARDIF_WIFI_WEXT_DIRECT: it is a wext wifi device
125 * @BATADV_HARDIF_WIFI_CFG80211_DIRECT: it is a cfg80211 wifi device
126 * @BATADV_HARDIF_WIFI_WEXT_INDIRECT: link device is a wext wifi device
127 * @BATADV_HARDIF_WIFI_CFG80211_INDIRECT: link device is a cfg80211 wifi device
128 */
129enum batadv_hard_iface_wifi_flags {
130 BATADV_HARDIF_WIFI_WEXT_DIRECT = BIT(0),
131 BATADV_HARDIF_WIFI_CFG80211_DIRECT = BIT(1),
132 BATADV_HARDIF_WIFI_WEXT_INDIRECT = BIT(2),
133 BATADV_HARDIF_WIFI_CFG80211_INDIRECT = BIT(3),
134};
135
136/**
122 * struct batadv_hard_iface - network device known to batman-adv 137 * struct batadv_hard_iface - network device known to batman-adv
123 * @list: list node for batadv_hardif_list 138 * @list: list node for batadv_hardif_list
124 * @if_num: identifier of the interface 139 * @if_num: identifier of the interface
125 * @if_status: status of the interface for batman-adv 140 * @if_status: status of the interface for batman-adv
126 * @net_dev: pointer to the net_device
127 * @num_bcasts: number of payload re-broadcasts on this interface (ARQ) 141 * @num_bcasts: number of payload re-broadcasts on this interface (ARQ)
142 * @wifi_flags: flags whether this is (directly or indirectly) a wifi interface
143 * @net_dev: pointer to the net_device
128 * @hardif_obj: kobject of the per interface sysfs "mesh" directory 144 * @hardif_obj: kobject of the per interface sysfs "mesh" directory
129 * @refcount: number of contexts the object is used 145 * @refcount: number of contexts the object is used
130 * @batman_adv_ptype: packet type describing packets that should be processed by 146 * @batman_adv_ptype: packet type describing packets that should be processed by
@@ -141,8 +157,9 @@ struct batadv_hard_iface {
141 struct list_head list; 157 struct list_head list;
142 s16 if_num; 158 s16 if_num;
143 char if_status; 159 char if_status;
144 struct net_device *net_dev;
145 u8 num_bcasts; 160 u8 num_bcasts;
161 u32 wifi_flags;
162 struct net_device *net_dev;
146 struct kobject *hardif_obj; 163 struct kobject *hardif_obj;
147 struct kref refcount; 164 struct kref refcount;
148 struct packet_type batman_adv_ptype; 165 struct packet_type batman_adv_ptype;
@@ -184,7 +201,7 @@ struct batadv_orig_ifinfo {
184 201
185/** 202/**
186 * struct batadv_frag_table_entry - head in the fragment buffer table 203 * struct batadv_frag_table_entry - head in the fragment buffer table
187 * @head: head of list with fragments 204 * @fragment_list: head of list with fragments
188 * @lock: lock to protect the list of fragments 205 * @lock: lock to protect the list of fragments
189 * @timestamp: time (jiffie) of last received fragment 206 * @timestamp: time (jiffie) of last received fragment
190 * @seqno: sequence number of the fragments in the list 207 * @seqno: sequence number of the fragments in the list
@@ -192,8 +209,8 @@ struct batadv_orig_ifinfo {
192 * @total_size: expected size of the assembled packet 209 * @total_size: expected size of the assembled packet
193 */ 210 */
194struct batadv_frag_table_entry { 211struct batadv_frag_table_entry {
195 struct hlist_head head; 212 struct hlist_head fragment_list;
196 spinlock_t lock; /* protects head */ 213 spinlock_t lock; /* protects fragment_list */
197 unsigned long timestamp; 214 unsigned long timestamp;
198 u16 seqno; 215 u16 seqno;
199 u16 size; 216 u16 size;
@@ -385,7 +402,7 @@ struct batadv_gw_node {
385 struct rcu_head rcu; 402 struct rcu_head rcu;
386}; 403};
387 404
388DECLARE_EWMA(throughput, 1024, 8) 405DECLARE_EWMA(throughput, 10, 8)
389 406
390/** 407/**
391 * struct batadv_hardif_neigh_node_bat_v - B.A.T.M.A.N. V private neighbor 408 * struct batadv_hardif_neigh_node_bat_v - B.A.T.M.A.N. V private neighbor
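
DECLARE_EWMA() instantiates an exponentially weighted moving average for the estimated throughput; the changed arguments retune that average (their exact macro semantics are not shown in this hunk, so the interpretation below is an assumption). The underlying recurrence is the usual one:

#include <stdio.h>

/* Classic EWMA recurrence: avg += (sample - avg) / weight.
 * Using weight = 8 below matches the last DECLARE_EWMA() argument above,
 * but treating it as the averaging factor is an assumption for illustration.
 */
static double ewma_add(double avg, double sample, double weight)
{
	return avg + (sample - avg) / weight;
}

int main(void)
{
	double avg = 0.0;
	const double samples[] = { 100, 100, 50, 100 };

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		avg = ewma_add(avg, samples[i], 8);
		printf("sample %.0f -> avg %.2f\n", samples[i], avg);
	}
	return 0;
}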
@@ -408,6 +425,7 @@ struct batadv_hardif_neigh_node_bat_v {
408 * struct batadv_hardif_neigh_node - unique neighbor per hard-interface 425 * struct batadv_hardif_neigh_node - unique neighbor per hard-interface
409 * @list: list node for batadv_hard_iface::neigh_list 426 * @list: list node for batadv_hard_iface::neigh_list
410 * @addr: the MAC address of the neighboring interface 427 * @addr: the MAC address of the neighboring interface
428 * @orig: the address of the originator this neighbor node belongs to
411 * @if_incoming: pointer to incoming hard-interface 429 * @if_incoming: pointer to incoming hard-interface
412 * @last_seen: when last packet via this neighbor was received 430 * @last_seen: when last packet via this neighbor was received
413 * @bat_v: B.A.T.M.A.N. V private data 431 * @bat_v: B.A.T.M.A.N. V private data
@@ -417,6 +435,7 @@ struct batadv_hardif_neigh_node_bat_v {
417struct batadv_hardif_neigh_node { 435struct batadv_hardif_neigh_node {
418 struct hlist_node list; 436 struct hlist_node list;
419 u8 addr[ETH_ALEN]; 437 u8 addr[ETH_ALEN];
438 u8 orig[ETH_ALEN];
420 struct batadv_hard_iface *if_incoming; 439 struct batadv_hard_iface *if_incoming;
421 unsigned long last_seen; 440 unsigned long last_seen;
422#ifdef CONFIG_BATMAN_ADV_BATMAN_V 441#ifdef CONFIG_BATMAN_ADV_BATMAN_V
@@ -706,8 +725,8 @@ struct batadv_priv_debug_log {
706 725
707/** 726/**
708 * struct batadv_priv_gw - per mesh interface gateway data 727 * struct batadv_priv_gw - per mesh interface gateway data
709 * @list: list of available gateway nodes 728 * @gateway_list: list of available gateway nodes
710 * @list_lock: lock protecting gw_list & curr_gw 729 * @list_lock: lock protecting gateway_list & curr_gw
711 * @curr_gw: pointer to currently selected gateway node 730 * @curr_gw: pointer to currently selected gateway node
712 * @mode: gateway operation: off, client or server (see batadv_gw_modes) 731 * @mode: gateway operation: off, client or server (see batadv_gw_modes)
713 * @sel_class: gateway selection class (applies if gw_mode client) 732 * @sel_class: gateway selection class (applies if gw_mode client)
@@ -716,8 +735,8 @@ struct batadv_priv_debug_log {
716 * @reselect: bool indicating a gateway re-selection is in progress 735 * @reselect: bool indicating a gateway re-selection is in progress
717 */ 736 */
718struct batadv_priv_gw { 737struct batadv_priv_gw {
719 struct hlist_head list; 738 struct hlist_head gateway_list;
720 spinlock_t list_lock; /* protects gw_list & curr_gw */ 739 spinlock_t list_lock; /* protects gateway_list & curr_gw */
721 struct batadv_gw_node __rcu *curr_gw; /* rcu protected pointer */ 740 struct batadv_gw_node __rcu *curr_gw; /* rcu protected pointer */
722 atomic_t mode; 741 atomic_t mode;
723 atomic_t sel_class; 742 atomic_t sel_class;
@@ -785,9 +804,10 @@ struct batadv_mcast_querier_state {
785 * @num_want_all_ipv6: counter for items in want_all_ipv6_list 804 * @num_want_all_ipv6: counter for items in want_all_ipv6_list
786 * @want_lists_lock: lock for protecting modifications to mcast want lists 805 * @want_lists_lock: lock for protecting modifications to mcast want lists
787 * (traversals are rcu-locked) 806 * (traversals are rcu-locked)
807 * @work: work queue callback item for multicast TT and TVLV updates
788 */ 808 */
789struct batadv_priv_mcast { 809struct batadv_priv_mcast {
790 struct hlist_head mla_list; 810 struct hlist_head mla_list; /* see __batadv_mcast_mla_update() */
791 struct hlist_head want_all_unsnoopables_list; 811 struct hlist_head want_all_unsnoopables_list;
792 struct hlist_head want_all_ipv4_list; 812 struct hlist_head want_all_ipv4_list;
793 struct hlist_head want_all_ipv6_list; 813 struct hlist_head want_all_ipv6_list;
@@ -802,6 +822,7 @@ struct batadv_priv_mcast {
802 atomic_t num_want_all_ipv6; 822 atomic_t num_want_all_ipv6;
803 /* protects want_all_{unsnoopables,ipv4,ipv6}_list */ 823 /* protects want_all_{unsnoopables,ipv4,ipv6}_list */
804 spinlock_t want_lists_lock; 824 spinlock_t want_lists_lock;
825 struct delayed_work work;
805}; 826};
806#endif 827#endif
807 828
@@ -1363,7 +1384,8 @@ struct batadv_skb_cb {
1363 1384
1364/** 1385/**
1365 * struct batadv_forw_packet - structure for bcast packets to be sent/forwarded 1386 * struct batadv_forw_packet - structure for bcast packets to be sent/forwarded
1366 * @list: list node for batadv_socket_client::queue_list 1387 * @list: list node for batadv_priv::forw_{bat,bcast}_list
1388 * @cleanup_list: list node for purging functions
1367 * @send_time: execution time for delayed_work (packet sending) 1389 * @send_time: execution time for delayed_work (packet sending)
1368 * @own: bool for locally generated packets (local OGMs are re-scheduled after 1390 * @own: bool for locally generated packets (local OGMs are re-scheduled after
1369 * sending) 1391 * sending)
@@ -1380,6 +1402,7 @@ struct batadv_skb_cb {
1380 */ 1402 */
1381struct batadv_forw_packet { 1403struct batadv_forw_packet {
1382 struct hlist_node list; 1404 struct hlist_node list;
1405 struct hlist_node cleanup_list;
1383 unsigned long send_time; 1406 unsigned long send_time;
1384 u8 own; 1407 u8 own;
1385 struct sk_buff *skb; 1408 struct sk_buff *skb;
@@ -1466,6 +1489,7 @@ struct batadv_algo_orig_ops {
1466 1489
1467/** 1490/**
1468 * struct batadv_algo_gw_ops - mesh algorithm callbacks (GW specific) 1491 * struct batadv_algo_gw_ops - mesh algorithm callbacks (GW specific)
1492 * @init_sel_class: initialize GW selection class (optional)
1469 * @store_sel_class: parse and stores a new GW selection class (optional) 1493 * @store_sel_class: parse and stores a new GW selection class (optional)
1470 * @show_sel_class: prints the current GW selection class (optional) 1494 * @show_sel_class: prints the current GW selection class (optional)
1471 * @get_best_gw_node: select the best GW from the list of available nodes 1495 * @get_best_gw_node: select the best GW from the list of available nodes
@@ -1476,6 +1500,7 @@ struct batadv_algo_orig_ops {
1476 * @dump: dump gateways to a netlink socket (optional) 1500 * @dump: dump gateways to a netlink socket (optional)
1477 */ 1501 */
1478struct batadv_algo_gw_ops { 1502struct batadv_algo_gw_ops {
1503 void (*init_sel_class)(struct batadv_priv *bat_priv);
1479 ssize_t (*store_sel_class)(struct batadv_priv *bat_priv, char *buff, 1504 ssize_t (*store_sel_class)(struct batadv_priv *bat_priv, char *buff,
1480 size_t count); 1505 size_t count);
1481 ssize_t (*show_sel_class)(struct batadv_priv *bat_priv, char *buff); 1506 ssize_t (*show_sel_class)(struct batadv_priv *bat_priv, char *buff);
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 1904a93f47d5..d491529332f4 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -920,7 +920,7 @@ static void chan_close_cb(struct l2cap_chan *chan)
920 BT_DBG("dev %p removing %speer %p", dev, 920 BT_DBG("dev %p removing %speer %p", dev,
921 last ? "last " : "1 ", peer); 921 last ? "last " : "1 ", peer);
922 BT_DBG("chan %p orig refcnt %d", chan, 922 BT_DBG("chan %p orig refcnt %d", chan,
923 atomic_read(&chan->kref.refcount)); 923 kref_read(&chan->kref));
924 924
925 l2cap_chan_put(chan); 925 l2cap_chan_put(chan);
926 break; 926 break;
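
The Bluetooth hunks replace direct reads of kref.refcount with the kref_read() accessor, so callers no longer depend on the refcount's internal representation. A minimal analogue of the accessor pattern (not the kernel's kref implementation; names are illustrative):

#include <stdio.h>

/* opaque-ish refcount wrapper; callers should not touch .count directly */
struct fake_kref {
	unsigned int count;
};

static void fake_kref_init(struct fake_kref *k) { k->count = 1; }
static void fake_kref_get(struct fake_kref *k)  { k->count++; }

/* accessor in the spirit of kref_read(): debug/reporting use only */
static unsigned int fake_kref_read(const struct fake_kref *k)
{
	return k->count;
}

int main(void)
{
	struct fake_kref ref;

	fake_kref_init(&ref);
	fake_kref_get(&ref);
	printf("refcnt %u\n", fake_kref_read(&ref)); /* 2 */
	return 0;
}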
diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile
index b3ff12eb9b6d..4bfaa19a5573 100644
--- a/net/bluetooth/Makefile
+++ b/net/bluetooth/Makefile
@@ -20,5 +20,3 @@ bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o
20bluetooth-$(CONFIG_BT_LEDS) += leds.o 20bluetooth-$(CONFIG_BT_LEDS) += leds.o
21bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o 21bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o
22bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o 22bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o
23
24subdir-ccflags-y += -D__CHECK_ENDIAN__
diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c
index 5f123c3320a7..f0095fd79818 100644
--- a/net/bluetooth/a2mp.c
+++ b/net/bluetooth/a2mp.c
@@ -810,7 +810,7 @@ static struct l2cap_chan *a2mp_chan_open(struct l2cap_conn *conn, bool locked)
810/* AMP Manager functions */ 810/* AMP Manager functions */
811struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr) 811struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr)
812{ 812{
813 BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount)); 813 BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref));
814 814
815 kref_get(&mgr->kref); 815 kref_get(&mgr->kref);
816 816
@@ -833,7 +833,7 @@ static void amp_mgr_destroy(struct kref *kref)
833 833
834int amp_mgr_put(struct amp_mgr *mgr) 834int amp_mgr_put(struct amp_mgr *mgr)
835{ 835{
836 BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount)); 836 BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref));
837 837
838 return kref_put(&mgr->kref, &amp_mgr_destroy); 838 return kref_put(&mgr->kref, &amp_mgr_destroy);
839} 839}
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 1aff2da9bc74..69e1f7d362a8 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -27,6 +27,8 @@
27#include <linux/module.h> 27#include <linux/module.h>
28#include <linux/debugfs.h> 28#include <linux/debugfs.h>
29#include <linux/stringify.h> 29#include <linux/stringify.h>
30#include <linux/sched/signal.h>
31
30#include <asm/ioctls.h> 32#include <asm/ioctls.h>
31 33
32#include <net/bluetooth/bluetooth.h> 34#include <net/bluetooth/bluetooth.h>
@@ -245,7 +247,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
245 if (err == 0) { 247 if (err == 0) {
246 sock_recv_ts_and_drops(msg, sk, skb); 248 sock_recv_ts_and_drops(msg, sk, skb);
247 249
248 if (bt_sk(sk)->skb_msg_name) 250 if (msg->msg_name && bt_sk(sk)->skb_msg_name)
249 bt_sk(sk)->skb_msg_name(skb, msg->msg_name, 251 bt_sk(sk)->skb_msg_name(skb, msg->msg_name,
250 &msg->msg_namelen); 252 &msg->msg_namelen);
251 } 253 }
diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c
index e32f34189007..02a4ccc04e1e 100644
--- a/net/bluetooth/amp.c
+++ b/net/bluetooth/amp.c
@@ -24,7 +24,7 @@
24void amp_ctrl_get(struct amp_ctrl *ctrl) 24void amp_ctrl_get(struct amp_ctrl *ctrl)
25{ 25{
26 BT_DBG("ctrl %p orig refcnt %d", ctrl, 26 BT_DBG("ctrl %p orig refcnt %d", ctrl,
27 atomic_read(&ctrl->kref.refcount)); 27 kref_read(&ctrl->kref));
28 28
29 kref_get(&ctrl->kref); 29 kref_get(&ctrl->kref);
30} 30}
@@ -42,7 +42,7 @@ static void amp_ctrl_destroy(struct kref *kref)
42int amp_ctrl_put(struct amp_ctrl *ctrl) 42int amp_ctrl_put(struct amp_ctrl *ctrl)
43{ 43{
44 BT_DBG("ctrl %p orig refcnt %d", ctrl, 44 BT_DBG("ctrl %p orig refcnt %d", ctrl,
45 atomic_read(&ctrl->kref.refcount)); 45 kref_read(&ctrl->kref));
46 46
47 return kref_put(&ctrl->kref, &amp_ctrl_destroy); 47 return kref_put(&ctrl->kref, &amp_ctrl_destroy);
48} 48}
diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c
index f4fcb4a9d5c1..2b875edf77e1 100644
--- a/net/bluetooth/bnep/netdev.c
+++ b/net/bluetooth/bnep/netdev.c
@@ -211,7 +211,6 @@ static const struct net_device_ops bnep_netdev_ops = {
211 .ndo_set_rx_mode = bnep_net_set_mc_list, 211 .ndo_set_rx_mode = bnep_net_set_mc_list,
212 .ndo_set_mac_address = bnep_net_set_mac_addr, 212 .ndo_set_mac_address = bnep_net_set_mac_addr,
213 .ndo_tx_timeout = bnep_net_timeout, 213 .ndo_tx_timeout = bnep_net_timeout,
214 .ndo_change_mtu = eth_change_mtu,
215 214
216}; 215};
217 216
@@ -222,6 +221,8 @@ void bnep_net_setup(struct net_device *dev)
222 dev->addr_len = ETH_ALEN; 221 dev->addr_len = ETH_ALEN;
223 222
224 ether_setup(dev); 223 ether_setup(dev);
224 dev->min_mtu = 0;
225 dev->max_mtu = ETH_MAX_MTU;
225 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 226 dev->priv_flags &= ~IFF_TX_SKB_SHARING;
226 dev->netdev_ops = &bnep_netdev_ops; 227 dev->netdev_ops = &bnep_netdev_ops;
227 228
diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c
index 46ac686c8911..bb308224099c 100644
--- a/net/bluetooth/cmtp/capi.c
+++ b/net/bluetooth/cmtp/capi.c
@@ -26,7 +26,7 @@
26#include <linux/types.h> 26#include <linux/types.h>
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/kernel.h> 28#include <linux/kernel.h>
29#include <linux/sched.h> 29#include <linux/sched/signal.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/poll.h> 31#include <linux/poll.h>
32#include <linux/fcntl.h> 32#include <linux/fcntl.h>
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index e17aacbc5630..0b4dba08a14e 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -4749,7 +4749,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
4749 case LE_ADV_SCAN_RSP: 4749 case LE_ADV_SCAN_RSP:
4750 break; 4750 break;
4751 default: 4751 default:
4752 BT_ERR_RATELIMITED("Unknown advetising packet type: 0x%02x", 4752 BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x",
4753 type); 4753 type);
4754 return; 4754 return;
4755 } 4755 }
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 1015d9c8d97d..b5faff458d8b 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -21,6 +21,8 @@
21 SOFTWARE IS DISCLAIMED. 21 SOFTWARE IS DISCLAIMED.
22*/ 22*/
23 23
24#include <linux/sched/signal.h>
25
24#include <net/bluetooth/bluetooth.h> 26#include <net/bluetooth/bluetooth.h>
25#include <net/bluetooth/hci_core.h> 27#include <net/bluetooth/hci_core.h>
26#include <net/bluetooth/mgmt.h> 28#include <net/bluetooth/mgmt.h>
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 48f9471e7c85..f64d6566021f 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -851,7 +851,7 @@ static int hci_sock_release(struct socket *sock)
851 851
852 if (hdev) { 852 if (hdev) {
853 if (hci_pi(sk)->channel == HCI_CHANNEL_USER) { 853 if (hci_pi(sk)->channel == HCI_CHANNEL_USER) {
854 /* When releasing an user channel exclusive access, 854 /* When releasing a user channel exclusive access,
855 * call hci_dev_do_close directly instead of calling 855 * call hci_dev_do_close directly instead of calling
856 * hci_dev_close to ensure the exclusive access will 856 * hci_dev_close to ensure the exclusive access will
857 * be released and the controller brought back down. 857 * be released and the controller brought back down.
@@ -1172,7 +1172,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
1172 /* In case the transport is already up and 1172 /* In case the transport is already up and
1173 * running, clear the error here. 1173 * running, clear the error here.
1174 * 1174 *
1175 * This can happen when opening an user 1175 * This can happen when opening a user
1176 * channel and HCI_AUTO_OFF grace period 1176 * channel and HCI_AUTO_OFF grace period
1177 * is still active. 1177 * is still active.
1178 */ 1178 */
@@ -1190,7 +1190,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
1190 if (!hci_sock_gen_cookie(sk)) { 1190 if (!hci_sock_gen_cookie(sk)) {
1191 /* In the case when a cookie has already been assigned, 1191 /* In the case when a cookie has already been assigned,
1192 * this socket will transition from a raw socket into 1192 * this socket will transition from a raw socket into
1193 * an user channel socket. For a clean transition, send 1193 * a user channel socket. For a clean transition, send
1194 * the close notification first. 1194 * the close notification first.
1195 */ 1195 */
1196 skb = create_monitor_ctrl_close(sk); 1196 skb = create_monitor_ctrl_close(sk);
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 577f1c01454a..fc7f321a3823 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -481,14 +481,14 @@ static void l2cap_chan_destroy(struct kref *kref)
481 481
482void l2cap_chan_hold(struct l2cap_chan *c) 482void l2cap_chan_hold(struct l2cap_chan *c)
483{ 483{
484 BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount)); 484 BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref));
485 485
486 kref_get(&c->kref); 486 kref_get(&c->kref);
487} 487}
488 488
489void l2cap_chan_put(struct l2cap_chan *c) 489void l2cap_chan_put(struct l2cap_chan *c)
490{ 490{
491 BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount)); 491 BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref));
492 492
493 kref_put(&c->kref, l2cap_chan_destroy); 493 kref_put(&c->kref, l2cap_chan_destroy);
494} 494}
@@ -2127,7 +2127,7 @@ static inline int l2cap_skbuff_fromiovec(struct l2cap_chan *chan,
2127 struct sk_buff **frag; 2127 struct sk_buff **frag;
2128 int sent = 0; 2128 int sent = 0;
2129 2129
2130 if (copy_from_iter(skb_put(skb, count), count, &msg->msg_iter) != count) 2130 if (!copy_from_iter_full(skb_put(skb, count), count, &msg->msg_iter))
2131 return -EFAULT; 2131 return -EFAULT;
2132 2132
2133 sent += count; 2133 sent += count;
@@ -2147,8 +2147,8 @@ static inline int l2cap_skbuff_fromiovec(struct l2cap_chan *chan,
2147 2147
2148 *frag = tmp; 2148 *frag = tmp;
2149 2149
2150 if (copy_from_iter(skb_put(*frag, count), count, 2150 if (!copy_from_iter_full(skb_put(*frag, count), count,
2151 &msg->msg_iter) != count) 2151 &msg->msg_iter))
2152 return -EFAULT; 2152 return -EFAULT;
2153 2153
2154 sent += count; 2154 sent += count;
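Two API moves show up in this file: kref_read() replaces open-coded reads of kref.refcount (the refcount is no longer a plain atomic_t), and copy_from_iter_full() replaces the copy_from_iter() != count idiom because it copies all-or-nothing and rewinds the iterator on failure. A small sketch of both idioms, using a hypothetical example_obj/example_copy pair that is not in the patch:

#include <linux/errno.h>
#include <linux/kref.h>
#include <linux/printk.h>
#include <linux/uio.h>

struct example_obj {
        struct kref kref;
};

static void example_show_refcnt(struct example_obj *obj)
{
        /* kref_read() is the accessor for the current reference count */
        pr_debug("obj %p refcnt %u\n", obj, kref_read(&obj->kref));
}

static int example_copy(void *dst, size_t len, struct iov_iter *from)
{
        /* all-or-nothing: a short copy returns false and the iterator
         * is rolled back, so no "!= len" comparison is needed
         */
        if (!copy_from_iter_full(dst, len, from))
                return -EFAULT;
        return 0;
}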
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index a8ba752732c9..507b80d59dec 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -29,6 +29,7 @@
29 29
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/export.h> 31#include <linux/export.h>
32#include <linux/sched/signal.h>
32 33
33#include <net/bluetooth/bluetooth.h> 34#include <net/bluetooth/bluetooth.h>
34#include <net/bluetooth/hci_core.h> 35#include <net/bluetooth/hci_core.h>
@@ -300,7 +301,7 @@ done:
300} 301}
301 302
302static int l2cap_sock_accept(struct socket *sock, struct socket *newsock, 303static int l2cap_sock_accept(struct socket *sock, struct socket *newsock,
303 int flags) 304 int flags, bool kern)
304{ 305{
305 DEFINE_WAIT_FUNC(wait, woken_wake_function); 306 DEFINE_WAIT_FUNC(wait, woken_wake_function);
306 struct sock *sk = sock->sk, *nsk; 307 struct sock *sk = sock->sk, *nsk;
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 7511df72347f..ac3c650cb234 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -27,6 +27,7 @@
27 27
28#include <linux/export.h> 28#include <linux/export.h>
29#include <linux/debugfs.h> 29#include <linux/debugfs.h>
30#include <linux/sched/signal.h>
30 31
31#include <net/bluetooth/bluetooth.h> 32#include <net/bluetooth/bluetooth.h>
32#include <net/bluetooth/hci_core.h> 33#include <net/bluetooth/hci_core.h>
@@ -470,7 +471,8 @@ done:
470 return err; 471 return err;
471} 472}
472 473
473static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int flags) 474static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int flags,
475 bool kern)
474{ 476{
475 DEFINE_WAIT_FUNC(wait, woken_wake_function); 477 DEFINE_WAIT_FUNC(wait, woken_wake_function);
476 struct sock *sk = sock->sk, *nsk; 478 struct sock *sk = sock->sk, *nsk;
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 3125ce670c2f..728e0c8dc8e7 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -27,6 +27,7 @@
27#include <linux/module.h> 27#include <linux/module.h>
28#include <linux/debugfs.h> 28#include <linux/debugfs.h>
29#include <linux/seq_file.h> 29#include <linux/seq_file.h>
30#include <linux/sched/signal.h>
30 31
31#include <net/bluetooth/bluetooth.h> 32#include <net/bluetooth/bluetooth.h>
32#include <net/bluetooth/hci_core.h> 33#include <net/bluetooth/hci_core.h>
@@ -626,7 +627,7 @@ done:
626} 627}
627 628
628static int sco_sock_accept(struct socket *sock, struct socket *newsock, 629static int sco_sock_accept(struct socket *sock, struct socket *newsock,
629 int flags) 630 int flags, bool kern)
630{ 631{
631 DEFINE_WAIT_FUNC(wait, woken_wake_function); 632 DEFINE_WAIT_FUNC(wait, woken_wake_function);
632 struct sock *sk = sock->sk, *ch; 633 struct sock *sk = sock->sk, *ch;
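The accept() callbacks in l2cap_sock.c, rfcomm/sock.c and sco.c all gain a trailing bool kern argument; kernel_accept() passes true and the accept(2) path passes false, so a protocol can tell in-kernel accepts apart from user space ones. A stub showing only the new callback shape; the protocol and its ops table are hypothetical:

#include <linux/errno.h>
#include <linux/net.h>

static int example_sock_accept(struct socket *sock, struct socket *newsock,
                               int flags, bool kern)
{
        /* "kern" is true for in-kernel accepts (e.g. via kernel_accept()),
         * false for a user space accept(2)
         */
        return -EOPNOTSUPP;
}

static const struct proto_ops example_ops = {
        .family = PF_UNSPEC,
        .accept = example_sock_accept,
};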
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 43faf2aea2ab..fae391f1871f 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -57,7 +57,7 @@
57#define SMP_TIMEOUT msecs_to_jiffies(30000) 57#define SMP_TIMEOUT msecs_to_jiffies(30000)
58 58
59#define AUTH_REQ_MASK(dev) (hci_dev_test_flag(dev, HCI_SC_ENABLED) ? \ 59#define AUTH_REQ_MASK(dev) (hci_dev_test_flag(dev, HCI_SC_ENABLED) ? \
60 0x1f : 0x07) 60 0x3f : 0x07)
61#define KEY_DIST_MASK 0x07 61#define KEY_DIST_MASK 0x07
62 62
63/* Maximum message length that can be passed to aes_cmac */ 63/* Maximum message length that can be passed to aes_cmac */
@@ -76,6 +76,7 @@ enum {
76 SMP_FLAG_DHKEY_PENDING, 76 SMP_FLAG_DHKEY_PENDING,
77 SMP_FLAG_REMOTE_OOB, 77 SMP_FLAG_REMOTE_OOB,
78 SMP_FLAG_LOCAL_OOB, 78 SMP_FLAG_LOCAL_OOB,
79 SMP_FLAG_CT2,
79}; 80};
80 81
81struct smp_dev { 82struct smp_dev {
@@ -357,6 +358,22 @@ static int smp_h6(struct crypto_shash *tfm_cmac, const u8 w[16],
357 return err; 358 return err;
358} 359}
359 360
361static int smp_h7(struct crypto_shash *tfm_cmac, const u8 w[16],
362 const u8 salt[16], u8 res[16])
363{
364 int err;
365
366 SMP_DBG("w %16phN salt %16phN", w, salt);
367
368 err = aes_cmac(tfm_cmac, salt, w, 16, res);
369 if (err)
370 return err;
371
372 SMP_DBG("res %16phN", res);
373
374 return err;
375}
376
360/* The following functions map to the legacy SMP crypto functions e, c1, 377/* The following functions map to the legacy SMP crypto functions e, c1,
361 * s1 and ah. 378 * s1 and ah.
362 */ 379 */
@@ -1130,20 +1147,31 @@ static void sc_add_ltk(struct smp_chan *smp)
1130 1147
1131static void sc_generate_link_key(struct smp_chan *smp) 1148static void sc_generate_link_key(struct smp_chan *smp)
1132{ 1149{
1133 /* These constants are as specified in the core specification. 1150 /* From core spec. Spells out in ASCII as 'lebr'. */
1134 * In ASCII they spell out to 'tmp1' and 'lebr'.
1135 */
1136 const u8 tmp1[4] = { 0x31, 0x70, 0x6d, 0x74 };
1137 const u8 lebr[4] = { 0x72, 0x62, 0x65, 0x6c }; 1151 const u8 lebr[4] = { 0x72, 0x62, 0x65, 0x6c };
1138 1152
1139 smp->link_key = kzalloc(16, GFP_KERNEL); 1153 smp->link_key = kzalloc(16, GFP_KERNEL);
1140 if (!smp->link_key) 1154 if (!smp->link_key)
1141 return; 1155 return;
1142 1156
1143 if (smp_h6(smp->tfm_cmac, smp->tk, tmp1, smp->link_key)) { 1157 if (test_bit(SMP_FLAG_CT2, &smp->flags)) {
1144 kzfree(smp->link_key); 1158 /* SALT = 0x00000000000000000000000000000000746D7031 */
1145 smp->link_key = NULL; 1159 const u8 salt[16] = { 0x31, 0x70, 0x6d, 0x74 };
1146 return; 1160
1161 if (smp_h7(smp->tfm_cmac, smp->tk, salt, smp->link_key)) {
1162 kzfree(smp->link_key);
1163 smp->link_key = NULL;
1164 return;
1165 }
1166 } else {
1167 /* From core spec. Spells out in ASCII as 'tmp1'. */
1168 const u8 tmp1[4] = { 0x31, 0x70, 0x6d, 0x74 };
1169
1170 if (smp_h6(smp->tfm_cmac, smp->tk, tmp1, smp->link_key)) {
1171 kzfree(smp->link_key);
1172 smp->link_key = NULL;
1173 return;
1174 }
1147 } 1175 }
1148 1176
1149 if (smp_h6(smp->tfm_cmac, smp->link_key, lebr, smp->link_key)) { 1177 if (smp_h6(smp->tfm_cmac, smp->link_key, lebr, smp->link_key)) {
@@ -1169,10 +1197,7 @@ static void smp_allow_key_dist(struct smp_chan *smp)
1169 1197
1170static void sc_generate_ltk(struct smp_chan *smp) 1198static void sc_generate_ltk(struct smp_chan *smp)
1171{ 1199{
1172 /* These constants are as specified in the core specification. 1200 /* From core spec. Spells out in ASCII as 'brle'. */
1173 * In ASCII they spell out to 'tmp2' and 'brle'.
1174 */
1175 const u8 tmp2[4] = { 0x32, 0x70, 0x6d, 0x74 };
1176 const u8 brle[4] = { 0x65, 0x6c, 0x72, 0x62 }; 1201 const u8 brle[4] = { 0x65, 0x6c, 0x72, 0x62 };
1177 struct hci_conn *hcon = smp->conn->hcon; 1202 struct hci_conn *hcon = smp->conn->hcon;
1178 struct hci_dev *hdev = hcon->hdev; 1203 struct hci_dev *hdev = hcon->hdev;
@@ -1187,8 +1212,19 @@ static void sc_generate_ltk(struct smp_chan *smp)
1187 if (key->type == HCI_LK_DEBUG_COMBINATION) 1212 if (key->type == HCI_LK_DEBUG_COMBINATION)
1188 set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags); 1213 set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags);
1189 1214
1190 if (smp_h6(smp->tfm_cmac, key->val, tmp2, smp->tk)) 1215 if (test_bit(SMP_FLAG_CT2, &smp->flags)) {
1191 return; 1216 /* SALT = 0x00000000000000000000000000000000746D7032 */
1217 const u8 salt[16] = { 0x32, 0x70, 0x6d, 0x74 };
1218
1219 if (smp_h7(smp->tfm_cmac, key->val, salt, smp->tk))
1220 return;
1221 } else {
1222 /* From core spec. Spells out in ASCII as 'tmp2'. */
1223 const u8 tmp2[4] = { 0x32, 0x70, 0x6d, 0x74 };
1224
1225 if (smp_h6(smp->tfm_cmac, key->val, tmp2, smp->tk))
1226 return;
1227 }
1192 1228
1193 if (smp_h6(smp->tfm_cmac, smp->tk, brle, smp->tk)) 1229 if (smp_h6(smp->tfm_cmac, smp->tk, brle, smp->tk))
1194 return; 1230 return;
@@ -1669,6 +1705,7 @@ static void build_bredr_pairing_cmd(struct smp_chan *smp,
1669 if (!rsp) { 1705 if (!rsp) {
1670 memset(req, 0, sizeof(*req)); 1706 memset(req, 0, sizeof(*req));
1671 1707
1708 req->auth_req = SMP_AUTH_CT2;
1672 req->init_key_dist = local_dist; 1709 req->init_key_dist = local_dist;
1673 req->resp_key_dist = remote_dist; 1710 req->resp_key_dist = remote_dist;
1674 req->max_key_size = conn->hcon->enc_key_size; 1711 req->max_key_size = conn->hcon->enc_key_size;
@@ -1680,6 +1717,7 @@ static void build_bredr_pairing_cmd(struct smp_chan *smp,
1680 1717
1681 memset(rsp, 0, sizeof(*rsp)); 1718 memset(rsp, 0, sizeof(*rsp));
1682 1719
1720 rsp->auth_req = SMP_AUTH_CT2;
1683 rsp->max_key_size = conn->hcon->enc_key_size; 1721 rsp->max_key_size = conn->hcon->enc_key_size;
1684 rsp->init_key_dist = req->init_key_dist & remote_dist; 1722 rsp->init_key_dist = req->init_key_dist & remote_dist;
1685 rsp->resp_key_dist = req->resp_key_dist & local_dist; 1723 rsp->resp_key_dist = req->resp_key_dist & local_dist;
@@ -1744,6 +1782,9 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb)
1744 1782
1745 build_bredr_pairing_cmd(smp, req, &rsp); 1783 build_bredr_pairing_cmd(smp, req, &rsp);
1746 1784
1785 if (req->auth_req & SMP_AUTH_CT2)
1786 set_bit(SMP_FLAG_CT2, &smp->flags);
1787
1747 key_size = min(req->max_key_size, rsp.max_key_size); 1788 key_size = min(req->max_key_size, rsp.max_key_size);
1748 if (check_enc_key_size(conn, key_size)) 1789 if (check_enc_key_size(conn, key_size))
1749 return SMP_ENC_KEY_SIZE; 1790 return SMP_ENC_KEY_SIZE;
@@ -1761,9 +1802,13 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb)
1761 1802
1762 build_pairing_cmd(conn, req, &rsp, auth); 1803 build_pairing_cmd(conn, req, &rsp, auth);
1763 1804
1764 if (rsp.auth_req & SMP_AUTH_SC) 1805 if (rsp.auth_req & SMP_AUTH_SC) {
1765 set_bit(SMP_FLAG_SC, &smp->flags); 1806 set_bit(SMP_FLAG_SC, &smp->flags);
1766 1807
1808 if (rsp.auth_req & SMP_AUTH_CT2)
1809 set_bit(SMP_FLAG_CT2, &smp->flags);
1810 }
1811
1767 if (conn->hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT) 1812 if (conn->hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT)
1768 sec_level = BT_SECURITY_MEDIUM; 1813 sec_level = BT_SECURITY_MEDIUM;
1769 else 1814 else
@@ -1917,6 +1962,9 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb)
1917 */ 1962 */
1918 smp->remote_key_dist &= rsp->resp_key_dist; 1963 smp->remote_key_dist &= rsp->resp_key_dist;
1919 1964
1965 if ((req->auth_req & SMP_AUTH_CT2) && (auth & SMP_AUTH_CT2))
1966 set_bit(SMP_FLAG_CT2, &smp->flags);
1967
1920 /* For BR/EDR this means we're done and can start phase 3 */ 1968 /* For BR/EDR this means we're done and can start phase 3 */
1921 if (conn->hcon->type == ACL_LINK) { 1969 if (conn->hcon->type == ACL_LINK) {
1922 /* Clear bits which are generated but not distributed */ 1970 /* Clear bits which are generated but not distributed */
@@ -2312,8 +2360,11 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level)
2312 2360
2313 authreq = seclevel_to_authreq(sec_level); 2361 authreq = seclevel_to_authreq(sec_level);
2314 2362
2315 if (hci_dev_test_flag(hcon->hdev, HCI_SC_ENABLED)) 2363 if (hci_dev_test_flag(hcon->hdev, HCI_SC_ENABLED)) {
2316 authreq |= SMP_AUTH_SC; 2364 authreq |= SMP_AUTH_SC;
2365 if (hci_dev_test_flag(hcon->hdev, HCI_SSP_ENABLED))
2366 authreq |= SMP_AUTH_CT2;
2367 }
2317 2368
2318 /* Require MITM if IO Capability allows or the security level 2369 /* Require MITM if IO Capability allows or the security level
2319 * requires it. 2370 * requires it.
diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h
index ffcc70b6b199..0ff6247eaa6c 100644
--- a/net/bluetooth/smp.h
+++ b/net/bluetooth/smp.h
@@ -57,6 +57,7 @@ struct smp_cmd_pairing {
57#define SMP_AUTH_MITM 0x04 57#define SMP_AUTH_MITM 0x04
58#define SMP_AUTH_SC 0x08 58#define SMP_AUTH_SC 0x08
59#define SMP_AUTH_KEYPRESS 0x10 59#define SMP_AUTH_KEYPRESS 0x10
60#define SMP_AUTH_CT2 0x20
60 61
61#define SMP_CMD_PAIRING_CONFIRM 0x03 62#define SMP_CMD_PAIRING_CONFIRM 0x03
62struct smp_cmd_pairing_confirm { 63struct smp_cmd_pairing_confirm {
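The SMP changes advertise a new CT2 bit (0x20) in the AuthReq field and, when both sides set it, derive the cross-transport keys with h7 instead of h6: h7 is AES-CMAC keyed with a 16-byte salt, with the existing key as the message. A sketch of the resulting selection logic, written as if it lived next to the originals in net/bluetooth/smp.c (smp_h6(), smp_h7() and SMP_FLAG_CT2 are from the patch; the wrapper itself is hypothetical):

static int example_link_key_input(struct smp_chan *smp, u8 res[16])
{
        if (test_bit(SMP_FLAG_CT2, &smp->flags)) {
                /* SALT = 0x00000000000000000000000000000000746D7031 */
                const u8 salt[16] = { 0x31, 0x70, 0x6d, 0x74 };

                return smp_h7(smp->tfm_cmac, smp->tk, salt, res);
        } else {
                /* legacy path: h6 keyed with the LTK, 'tmp1' as input */
                const u8 tmp1[4] = { 0x31, 0x70, 0x6d, 0x74 };

                return smp_h6(smp->tfm_cmac, smp->tk, tmp1, res);
        }
}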
diff --git a/net/bridge/Makefile b/net/bridge/Makefile
index 0aefc011b668..40b1ede527ca 100644
--- a/net/bridge/Makefile
+++ b/net/bridge/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_BRIDGE) += bridge.o
6 6
7bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ 7bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \
8 br_ioctl.o br_stp.o br_stp_bpdu.o \ 8 br_ioctl.o br_stp.o br_stp_bpdu.o \
9 br_stp_if.o br_stp_timer.o br_netlink.o 9 br_stp_if.o br_stp_timer.o br_netlink.o \
10 br_netlink_tunnel.o
10 11
11bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o 12bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o
12 13
@@ -18,7 +19,7 @@ obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o
18 19
19bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o 20bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o
20 21
21bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o 22bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o
22 23
23bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o 24bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o
24 25
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 89a687f3c0a3..90f49a194249 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -19,7 +19,7 @@
19#include <linux/list.h> 19#include <linux/list.h>
20#include <linux/netfilter_bridge.h> 20#include <linux/netfilter_bridge.h>
21 21
22#include <asm/uaccess.h> 22#include <linux/uaccess.h>
23#include "br_private.h" 23#include "br_private.h"
24 24
25#define COMMON_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | \ 25#define COMMON_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | \
@@ -79,7 +79,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
79 br_multicast_flood(mdst, skb, false, true); 79 br_multicast_flood(mdst, skb, false, true);
80 else 80 else
81 br_flood(br, skb, BR_PKT_MULTICAST, false, true); 81 br_flood(br, skb, BR_PKT_MULTICAST, false, true);
82 } else if ((dst = __br_fdb_get(br, dest, vid)) != NULL) { 82 } else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) {
83 br_forward(dst->dst, skb, false, true); 83 br_forward(dst->dst, skb, false, true);
84 } else { 84 } else {
85 br_flood(br, skb, BR_PKT_UNICAST, false, true); 85 br_flood(br, skb, BR_PKT_UNICAST, false, true);
@@ -119,6 +119,15 @@ static int br_dev_init(struct net_device *dev)
119 return err; 119 return err;
120} 120}
121 121
122static void br_dev_uninit(struct net_device *dev)
123{
124 struct net_bridge *br = netdev_priv(dev);
125
126 br_multicast_uninit_stats(br);
127 br_vlan_flush(br);
128 free_percpu(br->stats);
129}
130
122static int br_dev_open(struct net_device *dev) 131static int br_dev_open(struct net_device *dev)
123{ 132{
124 struct net_bridge *br = netdev_priv(dev); 133 struct net_bridge *br = netdev_priv(dev);
@@ -153,8 +162,8 @@ static int br_dev_stop(struct net_device *dev)
153 return 0; 162 return 0;
154} 163}
155 164
156static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev, 165static void br_get_stats64(struct net_device *dev,
157 struct rtnl_link_stats64 *stats) 166 struct rtnl_link_stats64 *stats)
158{ 167{
159 struct net_bridge *br = netdev_priv(dev); 168 struct net_bridge *br = netdev_priv(dev);
160 struct pcpu_sw_netstats tmp, sum = { 0 }; 169 struct pcpu_sw_netstats tmp, sum = { 0 };
@@ -178,14 +187,12 @@ static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev,
178 stats->tx_packets = sum.tx_packets; 187 stats->tx_packets = sum.tx_packets;
179 stats->rx_bytes = sum.rx_bytes; 188 stats->rx_bytes = sum.rx_bytes;
180 stats->rx_packets = sum.rx_packets; 189 stats->rx_packets = sum.rx_packets;
181
182 return stats;
183} 190}
184 191
185static int br_change_mtu(struct net_device *dev, int new_mtu) 192static int br_change_mtu(struct net_device *dev, int new_mtu)
186{ 193{
187 struct net_bridge *br = netdev_priv(dev); 194 struct net_bridge *br = netdev_priv(dev);
188 if (new_mtu < 68 || new_mtu > br_min_mtu(br)) 195 if (new_mtu > br_min_mtu(br))
189 return -EINVAL; 196 return -EINVAL;
190 197
191 dev->mtu = new_mtu; 198 dev->mtu = new_mtu;
@@ -334,6 +341,7 @@ static const struct net_device_ops br_netdev_ops = {
334 .ndo_open = br_dev_open, 341 .ndo_open = br_dev_open,
335 .ndo_stop = br_dev_stop, 342 .ndo_stop = br_dev_stop,
336 .ndo_init = br_dev_init, 343 .ndo_init = br_dev_init,
344 .ndo_uninit = br_dev_uninit,
337 .ndo_start_xmit = br_dev_xmit, 345 .ndo_start_xmit = br_dev_xmit,
338 .ndo_get_stats64 = br_get_stats64, 346 .ndo_get_stats64 = br_get_stats64,
339 .ndo_set_mac_address = br_set_mac_address, 347 .ndo_set_mac_address = br_set_mac_address,
@@ -349,8 +357,6 @@ static const struct net_device_ops br_netdev_ops = {
349 .ndo_add_slave = br_add_slave, 357 .ndo_add_slave = br_add_slave,
350 .ndo_del_slave = br_del_slave, 358 .ndo_del_slave = br_del_slave,
351 .ndo_fix_features = br_fix_features, 359 .ndo_fix_features = br_fix_features,
352 .ndo_neigh_construct = netdev_default_l2upper_neigh_construct,
353 .ndo_neigh_destroy = netdev_default_l2upper_neigh_destroy,
354 .ndo_fdb_add = br_fdb_add, 360 .ndo_fdb_add = br_fdb_add,
355 .ndo_fdb_del = br_fdb_delete, 361 .ndo_fdb_del = br_fdb_delete,
356 .ndo_fdb_dump = br_fdb_dump, 362 .ndo_fdb_dump = br_fdb_dump,
@@ -360,14 +366,6 @@ static const struct net_device_ops br_netdev_ops = {
360 .ndo_features_check = passthru_features_check, 366 .ndo_features_check = passthru_features_check,
361}; 367};
362 368
363static void br_dev_free(struct net_device *dev)
364{
365 struct net_bridge *br = netdev_priv(dev);
366
367 free_percpu(br->stats);
368 free_netdev(dev);
369}
370
371static struct device_type br_type = { 369static struct device_type br_type = {
372 .name = "bridge", 370 .name = "bridge",
373}; 371};
@@ -380,7 +378,7 @@ void br_dev_setup(struct net_device *dev)
380 ether_setup(dev); 378 ether_setup(dev);
381 379
382 dev->netdev_ops = &br_netdev_ops; 380 dev->netdev_ops = &br_netdev_ops;
383 dev->destructor = br_dev_free; 381 dev->destructor = free_netdev;
384 dev->ethtool_ops = &br_ethtool_ops; 382 dev->ethtool_ops = &br_ethtool_ops;
385 SET_NETDEV_DEVTYPE(dev, &br_type); 383 SET_NETDEV_DEVTYPE(dev, &br_type);
386 dev->priv_flags = IFF_EBRIDGE | IFF_NO_QUEUE; 384 dev->priv_flags = IFF_EBRIDGE | IFF_NO_QUEUE;
@@ -409,9 +407,11 @@ void br_dev_setup(struct net_device *dev)
409 br->bridge_max_age = br->max_age = 20 * HZ; 407 br->bridge_max_age = br->max_age = 20 * HZ;
410 br->bridge_hello_time = br->hello_time = 2 * HZ; 408 br->bridge_hello_time = br->hello_time = 2 * HZ;
411 br->bridge_forward_delay = br->forward_delay = 15 * HZ; 409 br->bridge_forward_delay = br->forward_delay = 15 * HZ;
412 br->ageing_time = BR_DEFAULT_AGEING_TIME; 410 br->bridge_ageing_time = br->ageing_time = BR_DEFAULT_AGEING_TIME;
411 dev->max_mtu = ETH_MAX_MTU;
413 412
414 br_netfilter_rtable_init(br); 413 br_netfilter_rtable_init(br);
415 br_stp_timer_init(br); 414 br_stp_timer_init(br);
416 br_multicast_init(br); 415 br_multicast_init(br);
416 INIT_DELAYED_WORK(&br->gc_work, br_fdb_cleanup);
417} 417}
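Besides switching the bridge to br_fdb_find_rcu() and a delayed-work garbage collector (see br_fdb.c below), this file picks up two driver-API changes: per-netdev private state is now torn down in .ndo_uninit (with dev->destructor reduced to free_netdev), and .ndo_get_stats64 no longer returns a pointer. A sketch of the new stats callback shape; the device and the counters it copies are hypothetical:

#include <linux/netdevice.h>

static void example_get_stats64(struct net_device *dev,
                                struct rtnl_link_stats64 *stats)
{
        /* fill *stats in place; with the void return the core always
         * uses the buffer it passed in
         */
        stats->rx_packets = dev->stats.rx_packets;
        stats->tx_packets = dev->stats.tx_packets;
        stats->rx_bytes   = dev->stats.rx_bytes;
        stats->tx_bytes   = dev->stats.tx_bytes;
}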
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 6b43c8c88f19..6e08b7199dd7 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -28,9 +28,6 @@
28#include "br_private.h" 28#include "br_private.h"
29 29
30static struct kmem_cache *br_fdb_cache __read_mostly; 30static struct kmem_cache *br_fdb_cache __read_mostly;
31static struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head,
32 const unsigned char *addr,
33 __u16 vid);
34static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, 31static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
35 const unsigned char *addr, u16 vid); 32 const unsigned char *addr, u16 vid);
36static void fdb_notify(struct net_bridge *br, 33static void fdb_notify(struct net_bridge *br,
@@ -68,7 +65,7 @@ static inline unsigned long hold_time(const struct net_bridge *br)
68static inline int has_expired(const struct net_bridge *br, 65static inline int has_expired(const struct net_bridge *br,
69 const struct net_bridge_fdb_entry *fdb) 66 const struct net_bridge_fdb_entry *fdb)
70{ 67{
71 return !fdb->is_static && 68 return !fdb->is_static && !fdb->added_by_external_learn &&
72 time_before_eq(fdb->updated + hold_time(br), jiffies); 69 time_before_eq(fdb->updated + hold_time(br), jiffies);
73} 70}
74 71
@@ -86,6 +83,47 @@ static void fdb_rcu_free(struct rcu_head *head)
86 kmem_cache_free(br_fdb_cache, ent); 83 kmem_cache_free(br_fdb_cache, ent);
87} 84}
88 85
86static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head,
87 const unsigned char *addr,
88 __u16 vid)
89{
90 struct net_bridge_fdb_entry *f;
91
92 WARN_ON_ONCE(!rcu_read_lock_held());
93
94 hlist_for_each_entry_rcu(f, head, hlist)
95 if (ether_addr_equal(f->addr.addr, addr) && f->vlan_id == vid)
96 break;
97
98 return f;
99}
100
101/* requires bridge hash_lock */
102static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br,
103 const unsigned char *addr,
104 __u16 vid)
105{
106 struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
107 struct net_bridge_fdb_entry *fdb;
108
109 lockdep_assert_held_once(&br->hash_lock);
110
111 rcu_read_lock();
112 fdb = fdb_find_rcu(head, addr, vid);
113 rcu_read_unlock();
114
115 return fdb;
116}
117
118struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
119 const unsigned char *addr,
120 __u16 vid)
121{
122 struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
123
124 return fdb_find_rcu(head, addr, vid);
125}
126
89/* When a static FDB entry is added, the mac address from the entry is 127/* When a static FDB entry is added, the mac address from the entry is
90 * added to the bridge private HW address list and all required ports 128 * added to the bridge private HW address list and all required ports
91 * are then updated with the new information. 129 * are then updated with the new information.
@@ -154,7 +192,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
154 if (f->added_by_external_learn) 192 if (f->added_by_external_learn)
155 fdb_del_external_learn(f); 193 fdb_del_external_learn(f);
156 194
157 hlist_del_rcu(&f->hlist); 195 hlist_del_init_rcu(&f->hlist);
158 fdb_notify(br, f, RTM_DELNEIGH); 196 fdb_notify(br, f, RTM_DELNEIGH);
159 call_rcu(&f->rcu, fdb_rcu_free); 197 call_rcu(&f->rcu, fdb_rcu_free);
160} 198}
@@ -198,11 +236,10 @@ void br_fdb_find_delete_local(struct net_bridge *br,
198 const struct net_bridge_port *p, 236 const struct net_bridge_port *p,
199 const unsigned char *addr, u16 vid) 237 const unsigned char *addr, u16 vid)
200{ 238{
201 struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
202 struct net_bridge_fdb_entry *f; 239 struct net_bridge_fdb_entry *f;
203 240
204 spin_lock_bh(&br->hash_lock); 241 spin_lock_bh(&br->hash_lock);
205 f = fdb_find(head, addr, vid); 242 f = br_fdb_find(br, addr, vid);
206 if (f && f->is_local && !f->added_by_user && f->dst == p) 243 if (f && f->is_local && !f->added_by_user && f->dst == p)
207 fdb_delete_local(br, p, f); 244 fdb_delete_local(br, p, f);
208 spin_unlock_bh(&br->hash_lock); 245 spin_unlock_bh(&br->hash_lock);
@@ -266,7 +303,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
266 spin_lock_bh(&br->hash_lock); 303 spin_lock_bh(&br->hash_lock);
267 304
268 /* If old entry was unassociated with any port, then delete it. */ 305 /* If old entry was unassociated with any port, then delete it. */
269 f = __br_fdb_get(br, br->dev->dev_addr, 0); 306 f = br_fdb_find(br, br->dev->dev_addr, 0);
270 if (f && f->is_local && !f->dst && !f->added_by_user) 307 if (f && f->is_local && !f->dst && !f->added_by_user)
271 fdb_delete_local(br, NULL, f); 308 fdb_delete_local(br, NULL, f);
272 309
@@ -281,7 +318,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
281 list_for_each_entry(v, &vg->vlan_list, vlist) { 318 list_for_each_entry(v, &vg->vlan_list, vlist) {
282 if (!br_vlan_should_use(v)) 319 if (!br_vlan_should_use(v))
283 continue; 320 continue;
284 f = __br_fdb_get(br, br->dev->dev_addr, v->vid); 321 f = br_fdb_find(br, br->dev->dev_addr, v->vid);
285 if (f && f->is_local && !f->dst && !f->added_by_user) 322 if (f && f->is_local && !f->dst && !f->added_by_user)
286 fdb_delete_local(br, NULL, f); 323 fdb_delete_local(br, NULL, f);
287 fdb_insert(br, NULL, newaddr, v->vid); 324 fdb_insert(br, NULL, newaddr, v->vid);
@@ -290,34 +327,43 @@ out:
290 spin_unlock_bh(&br->hash_lock); 327 spin_unlock_bh(&br->hash_lock);
291} 328}
292 329
293void br_fdb_cleanup(unsigned long _data) 330void br_fdb_cleanup(struct work_struct *work)
294{ 331{
295 struct net_bridge *br = (struct net_bridge *)_data; 332 struct net_bridge *br = container_of(work, struct net_bridge,
333 gc_work.work);
296 unsigned long delay = hold_time(br); 334 unsigned long delay = hold_time(br);
297 unsigned long next_timer = jiffies + br->ageing_time; 335 unsigned long work_delay = delay;
336 unsigned long now = jiffies;
298 int i; 337 int i;
299 338
300 spin_lock(&br->hash_lock);
301 for (i = 0; i < BR_HASH_SIZE; i++) { 339 for (i = 0; i < BR_HASH_SIZE; i++) {
302 struct net_bridge_fdb_entry *f; 340 struct net_bridge_fdb_entry *f;
303 struct hlist_node *n; 341 struct hlist_node *n;
304 342
343 if (!br->hash[i].first)
344 continue;
345
346 spin_lock_bh(&br->hash_lock);
305 hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) { 347 hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) {
306 unsigned long this_timer; 348 unsigned long this_timer;
349
307 if (f->is_static) 350 if (f->is_static)
308 continue; 351 continue;
309 if (f->added_by_external_learn) 352 if (f->added_by_external_learn)
310 continue; 353 continue;
311 this_timer = f->updated + delay; 354 this_timer = f->updated + delay;
312 if (time_before_eq(this_timer, jiffies)) 355 if (time_after(this_timer, now))
356 work_delay = min(work_delay, this_timer - now);
357 else
313 fdb_delete(br, f); 358 fdb_delete(br, f);
314 else if (time_before(this_timer, next_timer))
315 next_timer = this_timer;
316 } 359 }
360 spin_unlock_bh(&br->hash_lock);
361 cond_resched();
317 } 362 }
318 spin_unlock(&br->hash_lock);
319 363
320 mod_timer(&br->gc_timer, round_jiffies_up(next_timer)); 364 /* Cleanup minimum 10 milliseconds apart */
365 work_delay = max_t(unsigned long, work_delay, msecs_to_jiffies(10));
366 mod_delayed_work(system_long_wq, &br->gc_work, work_delay);
321} 367}
322 368
323/* Completely flush all dynamic entries in forwarding database.*/ 369/* Completely flush all dynamic entries in forwarding database.*/
@@ -371,26 +417,6 @@ void br_fdb_delete_by_port(struct net_bridge *br,
371 spin_unlock_bh(&br->hash_lock); 417 spin_unlock_bh(&br->hash_lock);
372} 418}
373 419
374/* No locking or refcounting, assumes caller has rcu_read_lock */
375struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
376 const unsigned char *addr,
377 __u16 vid)
378{
379 struct net_bridge_fdb_entry *fdb;
380
381 hlist_for_each_entry_rcu(fdb,
382 &br->hash[br_mac_hash(addr, vid)], hlist) {
383 if (ether_addr_equal(fdb->addr.addr, addr) &&
384 fdb->vlan_id == vid) {
385 if (unlikely(has_expired(br, fdb)))
386 break;
387 return fdb;
388 }
389 }
390
391 return NULL;
392}
393
394#if IS_ENABLED(CONFIG_ATM_LANE) 420#if IS_ENABLED(CONFIG_ATM_LANE)
395/* Interface used by ATM LANE hook to test 421/* Interface used by ATM LANE hook to test
396 * if an addr is on some other bridge port */ 422 * if an addr is on some other bridge port */
@@ -405,7 +431,7 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr)
405 if (!port) 431 if (!port)
406 ret = 0; 432 ret = 0;
407 else { 433 else {
408 fdb = __br_fdb_get(port->br, addr, 0); 434 fdb = br_fdb_find_rcu(port->br, addr, 0);
409 ret = fdb && fdb->dst && fdb->dst->dev != dev && 435 ret = fdb && fdb->dst && fdb->dst->dev != dev &&
410 fdb->dst->state == BR_STATE_FORWARDING; 436 fdb->dst->state == BR_STATE_FORWARDING;
411 } 437 }
@@ -467,34 +493,6 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf,
467 return num; 493 return num;
468} 494}
469 495
470static struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head,
471 const unsigned char *addr,
472 __u16 vid)
473{
474 struct net_bridge_fdb_entry *fdb;
475
476 hlist_for_each_entry(fdb, head, hlist) {
477 if (ether_addr_equal(fdb->addr.addr, addr) &&
478 fdb->vlan_id == vid)
479 return fdb;
480 }
481 return NULL;
482}
483
484static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head,
485 const unsigned char *addr,
486 __u16 vid)
487{
488 struct net_bridge_fdb_entry *fdb;
489
490 hlist_for_each_entry_rcu(fdb, head, hlist) {
491 if (ether_addr_equal(fdb->addr.addr, addr) &&
492 fdb->vlan_id == vid)
493 return fdb;
494 }
495 return NULL;
496}
497
498static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, 496static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
499 struct net_bridge_port *source, 497 struct net_bridge_port *source,
500 const unsigned char *addr, 498 const unsigned char *addr,
@@ -528,16 +526,15 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
528 if (!is_valid_ether_addr(addr)) 526 if (!is_valid_ether_addr(addr))
529 return -EINVAL; 527 return -EINVAL;
530 528
531 fdb = fdb_find(head, addr, vid); 529 fdb = br_fdb_find(br, addr, vid);
532 if (fdb) { 530 if (fdb) {
533 /* it is okay to have multiple ports with same 531 /* it is okay to have multiple ports with same
534 * address, just use the first one. 532 * address, just use the first one.
535 */ 533 */
536 if (fdb->is_local) 534 if (fdb->is_local)
537 return 0; 535 return 0;
538 br_warn(br, "adding interface %s with same address " 536 br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n",
539 "as a received packet\n", 537 source ? source->dev->name : br->dev->name, addr, vid);
540 source ? source->dev->name : br->dev->name);
541 fdb_delete(br, fdb); 538 fdb_delete(br, fdb);
542 } 539 }
543 540
@@ -583,16 +580,18 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
583 /* attempt to update an entry for a local interface */ 580 /* attempt to update an entry for a local interface */
584 if (unlikely(fdb->is_local)) { 581 if (unlikely(fdb->is_local)) {
585 if (net_ratelimit()) 582 if (net_ratelimit())
586 br_warn(br, "received packet on %s with " 583 br_warn(br, "received packet on %s with own address as source address (addr:%pM, vlan:%u)\n",
587 "own address as source address\n", 584 source->dev->name, addr, vid);
588 source->dev->name);
589 } else { 585 } else {
586 unsigned long now = jiffies;
587
590 /* fastpath: update of existing entry */ 588 /* fastpath: update of existing entry */
591 if (unlikely(source != fdb->dst)) { 589 if (unlikely(source != fdb->dst)) {
592 fdb->dst = source; 590 fdb->dst = source;
593 fdb_modified = true; 591 fdb_modified = true;
594 } 592 }
595 fdb->updated = jiffies; 593 if (now != fdb->updated)
594 fdb->updated = now;
596 if (unlikely(added_by_user)) 595 if (unlikely(added_by_user))
597 fdb->added_by_user = 1; 596 fdb->added_by_user = 1;
598 if (unlikely(fdb_modified)) 597 if (unlikely(fdb_modified))
@@ -600,7 +599,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
600 } 599 }
601 } else { 600 } else {
602 spin_lock(&br->hash_lock); 601 spin_lock(&br->hash_lock);
603 if (likely(!fdb_find(head, addr, vid))) { 602 if (likely(!fdb_find_rcu(head, addr, vid))) {
604 fdb = fdb_create(head, source, addr, vid, 0, 0); 603 fdb = fdb_create(head, source, addr, vid, 0, 0);
605 if (fdb) { 604 if (fdb) {
606 if (unlikely(added_by_user)) 605 if (unlikely(added_by_user))
@@ -784,7 +783,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
784 return -EINVAL; 783 return -EINVAL;
785 } 784 }
786 785
787 fdb = fdb_find(head, addr, vid); 786 fdb = br_fdb_find(br, addr, vid);
788 if (fdb == NULL) { 787 if (fdb == NULL) {
789 if (!(flags & NLM_F_CREATE)) 788 if (!(flags & NLM_F_CREATE))
790 return -ENOENT; 789 return -ENOENT;
@@ -931,55 +930,30 @@ out:
931 return err; 930 return err;
932} 931}
933 932
934static int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr, 933static int fdb_delete_by_addr_and_port(struct net_bridge *br,
935 u16 vid) 934 const struct net_bridge_port *p,
936{
937 struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
938 struct net_bridge_fdb_entry *fdb;
939
940 fdb = fdb_find(head, addr, vid);
941 if (!fdb)
942 return -ENOENT;
943
944 fdb_delete(br, fdb);
945 return 0;
946}
947
948static int __br_fdb_delete_by_addr(struct net_bridge *br,
949 const unsigned char *addr, u16 vid)
950{
951 int err;
952
953 spin_lock_bh(&br->hash_lock);
954 err = fdb_delete_by_addr(br, addr, vid);
955 spin_unlock_bh(&br->hash_lock);
956
957 return err;
958}
959
960static int fdb_delete_by_addr_and_port(struct net_bridge_port *p,
961 const u8 *addr, u16 vlan) 935 const u8 *addr, u16 vlan)
962{ 936{
963 struct net_bridge *br = p->br;
964 struct hlist_head *head = &br->hash[br_mac_hash(addr, vlan)];
965 struct net_bridge_fdb_entry *fdb; 937 struct net_bridge_fdb_entry *fdb;
966 938
967 fdb = fdb_find(head, addr, vlan); 939 fdb = br_fdb_find(br, addr, vlan);
968 if (!fdb || fdb->dst != p) 940 if (!fdb || fdb->dst != p)
969 return -ENOENT; 941 return -ENOENT;
970 942
971 fdb_delete(br, fdb); 943 fdb_delete(br, fdb);
944
972 return 0; 945 return 0;
973} 946}
974 947
975static int __br_fdb_delete(struct net_bridge_port *p, 948static int __br_fdb_delete(struct net_bridge *br,
949 const struct net_bridge_port *p,
976 const unsigned char *addr, u16 vid) 950 const unsigned char *addr, u16 vid)
977{ 951{
978 int err; 952 int err;
979 953
980 spin_lock_bh(&p->br->hash_lock); 954 spin_lock_bh(&br->hash_lock);
981 err = fdb_delete_by_addr_and_port(p, addr, vid); 955 err = fdb_delete_by_addr_and_port(br, p, addr, vid);
982 spin_unlock_bh(&p->br->hash_lock); 956 spin_unlock_bh(&br->hash_lock);
983 957
984 return err; 958 return err;
985} 959}
@@ -992,7 +966,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
992 struct net_bridge_vlan_group *vg; 966 struct net_bridge_vlan_group *vg;
993 struct net_bridge_port *p = NULL; 967 struct net_bridge_port *p = NULL;
994 struct net_bridge_vlan *v; 968 struct net_bridge_vlan *v;
995 struct net_bridge *br = NULL; 969 struct net_bridge *br;
996 int err; 970 int err;
997 971
998 if (dev->priv_flags & IFF_EBRIDGE) { 972 if (dev->priv_flags & IFF_EBRIDGE) {
@@ -1006,6 +980,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
1006 return -EINVAL; 980 return -EINVAL;
1007 } 981 }
1008 vg = nbp_vlan_group(p); 982 vg = nbp_vlan_group(p);
983 br = p->br;
1009 } 984 }
1010 985
1011 if (vid) { 986 if (vid) {
@@ -1015,30 +990,20 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
1015 return -EINVAL; 990 return -EINVAL;
1016 } 991 }
1017 992
1018 if (dev->priv_flags & IFF_EBRIDGE) 993 err = __br_fdb_delete(br, p, addr, vid);
1019 err = __br_fdb_delete_by_addr(br, addr, vid);
1020 else
1021 err = __br_fdb_delete(p, addr, vid);
1022 } else { 994 } else {
1023 err = -ENOENT; 995 err = -ENOENT;
1024 if (dev->priv_flags & IFF_EBRIDGE) 996 err &= __br_fdb_delete(br, p, addr, 0);
1025 err = __br_fdb_delete_by_addr(br, addr, 0);
1026 else
1027 err &= __br_fdb_delete(p, addr, 0);
1028
1029 if (!vg || !vg->num_vlans) 997 if (!vg || !vg->num_vlans)
1030 goto out; 998 return err;
1031 999
1032 list_for_each_entry(v, &vg->vlan_list, vlist) { 1000 list_for_each_entry(v, &vg->vlan_list, vlist) {
1033 if (!br_vlan_should_use(v)) 1001 if (!br_vlan_should_use(v))
1034 continue; 1002 continue;
1035 if (dev->priv_flags & IFF_EBRIDGE) 1003 err &= __br_fdb_delete(br, p, addr, v->vid);
1036 err = __br_fdb_delete_by_addr(br, addr, v->vid);
1037 else
1038 err &= __br_fdb_delete(p, addr, v->vid);
1039 } 1004 }
1040 } 1005 }
1041out: 1006
1042 return err; 1007 return err;
1043} 1008}
1044 1009
@@ -1109,7 +1074,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
1109 spin_lock_bh(&br->hash_lock); 1074 spin_lock_bh(&br->hash_lock);
1110 1075
1111 head = &br->hash[br_mac_hash(addr, vid)]; 1076 head = &br->hash[br_mac_hash(addr, vid)];
1112 fdb = fdb_find(head, addr, vid); 1077 fdb = br_fdb_find(br, addr, vid);
1113 if (!fdb) { 1078 if (!fdb) {
1114 fdb = fdb_create(head, p, addr, vid, 0, 0); 1079 fdb = fdb_create(head, p, addr, vid, 0, 0);
1115 if (!fdb) { 1080 if (!fdb) {
@@ -1137,15 +1102,13 @@ err_unlock:
1137int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, 1102int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
1138 const unsigned char *addr, u16 vid) 1103 const unsigned char *addr, u16 vid)
1139{ 1104{
1140 struct hlist_head *head;
1141 struct net_bridge_fdb_entry *fdb; 1105 struct net_bridge_fdb_entry *fdb;
1142 int err = 0; 1106 int err = 0;
1143 1107
1144 ASSERT_RTNL(); 1108 ASSERT_RTNL();
1145 spin_lock_bh(&br->hash_lock); 1109 spin_lock_bh(&br->hash_lock);
1146 1110
1147 head = &br->hash[br_mac_hash(addr, vid)]; 1111 fdb = br_fdb_find(br, addr, vid);
1148 fdb = fdb_find(head, addr, vid);
1149 if (fdb && fdb->added_by_external_learn) 1112 if (fdb && fdb->added_by_external_learn)
1150 fdb_delete(br, fdb); 1113 fdb_delete(br, fdb);
1151 else 1114 else
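The FDB garbage collector moves from a timer (gc_timer) to a delayed work item (gc_work) on system_long_wq: the work callback takes the hash lock per bucket with spin_lock_bh(), cond_resched()s between buckets, and reschedules itself no more often than every 10 ms. A generic sketch of that timer-to-workqueue pattern; the example_ctx structure and function names are hypothetical:

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

struct example_ctx {
        struct delayed_work gc_work;
};

static void example_gc(struct work_struct *work)
{
        struct example_ctx *ctx = container_of(work, struct example_ctx,
                                               gc_work.work);
        unsigned long delay = HZ;       /* recomputed while scanning */

        /* scan and age out entries here; unlike a timer callback this
         * runs in process context and may sleep between buckets
         */

        mod_delayed_work(system_long_wq, &ctx->gc_work,
                         max_t(unsigned long, delay, msecs_to_jiffies(10)));
}

static void example_ctx_init(struct example_ctx *ctx)
{
        INIT_DELAYED_WORK(&ctx->gc_work, example_gc);
        mod_delayed_work(system_long_wq, &ctx->gc_work, HZ);
}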
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 7cb41aee4c82..902af6ba481c 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -80,7 +80,7 @@ static void __br_forward(const struct net_bridge_port *to,
80 int br_hook; 80 int br_hook;
81 81
82 vg = nbp_vlan_group_rcu(to); 82 vg = nbp_vlan_group_rcu(to);
83 skb = br_handle_vlan(to->br, vg, skb); 83 skb = br_handle_vlan(to->br, to, vg, skb);
84 if (!skb) 84 if (!skb)
85 return; 85 return;
86 86
@@ -186,8 +186,9 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
186 /* Do not flood unicast traffic to ports that turn it off */ 186 /* Do not flood unicast traffic to ports that turn it off */
187 if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD)) 187 if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD))
188 continue; 188 continue;
189 /* Do not flood if mc off, except for traffic we originate */
189 if (pkt_type == BR_PKT_MULTICAST && 190 if (pkt_type == BR_PKT_MULTICAST &&
190 !(p->flags & BR_MCAST_FLOOD)) 191 !(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev)
191 continue; 192 continue;
192 193
193 /* Do not flood to ports that enable proxy ARP */ 194 /* Do not flood to ports that enable proxy ARP */
@@ -220,6 +221,31 @@ out:
220} 221}
221 222
222#ifdef CONFIG_BRIDGE_IGMP_SNOOPING 223#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
224static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb,
225 const unsigned char *addr, bool local_orig)
226{
227 struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
228 const unsigned char *src = eth_hdr(skb)->h_source;
229
230 if (!should_deliver(p, skb))
231 return;
232
233 /* Even with hairpin, no soliloquies - prevent breaking IPv6 DAD */
234 if (skb->dev == p->dev && ether_addr_equal(src, addr))
235 return;
236
237 skb = skb_copy(skb, GFP_ATOMIC);
238 if (!skb) {
239 dev->stats.tx_dropped++;
240 return;
241 }
242
243 if (!is_broadcast_ether_addr(addr))
244 memcpy(eth_hdr(skb)->h_dest, addr, ETH_ALEN);
245
246 __br_forward(p, skb, local_orig);
247}
248
223/* called with rcu_read_lock */ 249/* called with rcu_read_lock */
224void br_multicast_flood(struct net_bridge_mdb_entry *mdst, 250void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
225 struct sk_buff *skb, 251 struct sk_buff *skb,
@@ -241,10 +267,20 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
241 rport = rp ? hlist_entry(rp, struct net_bridge_port, rlist) : 267 rport = rp ? hlist_entry(rp, struct net_bridge_port, rlist) :
242 NULL; 268 NULL;
243 269
244 port = (unsigned long)lport > (unsigned long)rport ? 270 if ((unsigned long)lport > (unsigned long)rport) {
245 lport : rport; 271 port = lport;
272
273 if (port->flags & BR_MULTICAST_TO_UNICAST) {
274 maybe_deliver_addr(lport, skb, p->eth_addr,
275 local_orig);
276 goto delivered;
277 }
278 } else {
279 port = rport;
280 }
246 281
247 prev = maybe_deliver(prev, port, skb, local_orig); 282 prev = maybe_deliver(prev, port, skb, local_orig);
283delivered:
248 if (IS_ERR(prev)) 284 if (IS_ERR(prev))
249 goto out; 285 goto out;
250 if (prev == port) 286 if (prev == port)
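The new maybe_deliver_addr() implements multicast-to-unicast: on ports flagged BR_MULTICAST_TO_UNICAST, each subscriber gets its own copy of the frame with the destination MAC rewritten to that subscriber's unicast address (taken from the port group's eth_addr). A stripped-down sketch of just the rewrite step; the helper name is hypothetical and the port/group bookkeeping is left out:

#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>

static struct sk_buff *example_mcast_to_ucast(struct sk_buff *skb,
                                              const unsigned char *addr)
{
        struct sk_buff *copy = skb_copy(skb, GFP_ATOMIC);

        if (!copy)
                return NULL;

        /* an all-ones eth_addr means "no subscriber address recorded" */
        if (!is_broadcast_ether_addr(addr))
                memcpy(eth_hdr(copy)->h_dest, addr, ETH_ALEN);

        return copy;
}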
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index ed0dd3340084..56a2a72e7738 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -311,9 +311,8 @@ void br_dev_delete(struct net_device *dev, struct list_head *head)
311 311
312 br_fdb_delete_by_port(br, NULL, 0, 1); 312 br_fdb_delete_by_port(br, NULL, 0, 1);
313 313
314 br_vlan_flush(br);
315 br_multicast_dev_del(br); 314 br_multicast_dev_del(br);
316 del_timer_sync(&br->gc_timer); 315 cancel_delayed_work_sync(&br->gc_work);
317 316
318 br_sysfs_delbr(br->dev); 317 br_sysfs_delbr(br->dev);
319 unregister_netdevice_queue(br->dev, head); 318 unregister_netdevice_queue(br->dev, head);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 855b72fbe1da..013f2290bfa5 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -21,6 +21,7 @@
21#include <linux/export.h> 21#include <linux/export.h>
22#include <linux/rculist.h> 22#include <linux/rculist.h>
23#include "br_private.h" 23#include "br_private.h"
24#include "br_private_tunnel.h"
24 25
25/* Hook for brouter */ 26/* Hook for brouter */
26br_should_route_hook_t __rcu *br_should_route_hook __read_mostly; 27br_should_route_hook_t __rcu *br_should_route_hook __read_mostly;
@@ -29,6 +30,7 @@ EXPORT_SYMBOL(br_should_route_hook);
29static int 30static int
30br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb) 31br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb)
31{ 32{
33 br_drop_fake_rtable(skb);
32 return netif_receive_skb(skb); 34 return netif_receive_skb(skb);
33} 35}
34 36
@@ -57,7 +59,7 @@ static int br_pass_frame_up(struct sk_buff *skb)
57 59
58 indev = skb->dev; 60 indev = skb->dev;
59 skb->dev = brdev; 61 skb->dev = brdev;
60 skb = br_handle_vlan(br, vg, skb); 62 skb = br_handle_vlan(br, NULL, vg, skb);
61 if (!skb) 63 if (!skb)
62 return NET_RX_DROP; 64 return NET_RX_DROP;
63 /* update the multicast stats if the packet is IGMP/MLD */ 65 /* update the multicast stats if the packet is IGMP/MLD */
@@ -113,7 +115,7 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
113 return; 115 return;
114 } 116 }
115 117
116 f = __br_fdb_get(br, n->ha, vid); 118 f = br_fdb_find_rcu(br, n->ha, vid);
117 if (f && ((p->flags & BR_PROXYARP) || 119 if (f && ((p->flags & BR_PROXYARP) ||
118 (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)))) { 120 (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)))) {
119 arp_send(ARPOP_REPLY, ETH_P_ARP, sip, skb->dev, tip, 121 arp_send(ARPOP_REPLY, ETH_P_ARP, sip, skb->dev, tip,
@@ -188,16 +190,19 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
188 } 190 }
189 break; 191 break;
190 case BR_PKT_UNICAST: 192 case BR_PKT_UNICAST:
191 dst = __br_fdb_get(br, dest, vid); 193 dst = br_fdb_find_rcu(br, dest, vid);
192 default: 194 default:
193 break; 195 break;
194 } 196 }
195 197
196 if (dst) { 198 if (dst) {
199 unsigned long now = jiffies;
200
197 if (dst->is_local) 201 if (dst->is_local)
198 return br_pass_frame_up(skb); 202 return br_pass_frame_up(skb);
199 203
200 dst->used = jiffies; 204 if (now != dst->used)
205 dst->used = now;
201 br_forward(dst->dst, skb, local_rcv, false); 206 br_forward(dst->dst, skb, local_rcv, false);
202 } else { 207 } else {
203 if (!mcast_hit) 208 if (!mcast_hit)
@@ -261,6 +266,11 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
261 return RX_HANDLER_CONSUMED; 266 return RX_HANDLER_CONSUMED;
262 267
263 p = br_port_get_rcu(skb->dev); 268 p = br_port_get_rcu(skb->dev);
269 if (p->flags & BR_VLAN_TUNNEL) {
270 if (br_handle_ingress_vlan_tunnel(skb, p,
271 nbp_vlan_group_rcu(p)))
272 goto drop;
273 }
264 274
265 if (unlikely(is_link_local_ether_addr(dest))) { 275 if (unlikely(is_link_local_ether_addr(dest))) {
266 u16 fwd_mask = p->br->group_fwd_mask_required; 276 u16 fwd_mask = p->br->group_fwd_mask_required;
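Two smaller points in br_input.c: br_drop_fake_rtable() is now called before the frame reaches netif_receive_skb(), and the hot-path timestamp update only stores when the value actually changes. A sketch of that conditional-store pattern; the helper is hypothetical:

#include <linux/jiffies.h>

static inline void example_touch(unsigned long *stamp)
{
        unsigned long now = jiffies;

        /* jiffies only advances once per tick, so most packets on a busy
         * path would store the same value; skipping the redundant write
         * keeps the cache line clean
         */
        if (*stamp != now)
                *stamp = now;
}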
diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index d99b2009771a..7970f8540cbb 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -18,7 +18,7 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/times.h> 19#include <linux/times.h>
20#include <net/net_namespace.h> 20#include <net/net_namespace.h>
21#include <asm/uaccess.h> 21#include <linux/uaccess.h>
22#include "br_private.h" 22#include "br_private.h"
23 23
24static int get_bridge_ifindices(struct net *net, int *indices, int num) 24static int get_bridge_ifindices(struct net *net, int *indices, int num)
@@ -149,7 +149,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
149 b.hello_timer_value = br_timer_value(&br->hello_timer); 149 b.hello_timer_value = br_timer_value(&br->hello_timer);
150 b.tcn_timer_value = br_timer_value(&br->tcn_timer); 150 b.tcn_timer_value = br_timer_value(&br->tcn_timer);
151 b.topology_change_timer_value = br_timer_value(&br->topology_change_timer); 151 b.topology_change_timer_value = br_timer_value(&br->topology_change_timer);
152 b.gc_timer_value = br_timer_value(&br->gc_timer); 152 b.gc_timer_value = br_timer_value(&br->gc_work.timer);
153 rcu_read_unlock(); 153 rcu_read_unlock();
154 154
155 if (copy_to_user((void __user *)args[1], &b, sizeof(b))) 155 if (copy_to_user((void __user *)args[1], &b, sizeof(b)))
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 7dbc80d01eb0..056e6ac49d8f 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -531,7 +531,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
531 break; 531 break;
532 } 532 }
533 533
534 p = br_multicast_new_port_group(port, group, *pp, state); 534 p = br_multicast_new_port_group(port, group, *pp, state, NULL);
535 if (unlikely(!p)) 535 if (unlikely(!p))
536 return -ENOMEM; 536 return -ENOMEM;
537 rcu_assign_pointer(*pp, p); 537 rcu_assign_pointer(*pp, p);
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 2136e45f5277..faa7261a992f 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -25,7 +25,9 @@
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/timer.h> 26#include <linux/timer.h>
27#include <linux/inetdevice.h> 27#include <linux/inetdevice.h>
28#include <linux/mroute.h>
28#include <net/ip.h> 29#include <net/ip.h>
30#include <net/switchdev.h>
29#if IS_ENABLED(CONFIG_IPV6) 31#if IS_ENABLED(CONFIG_IPV6)
30#include <net/ipv6.h> 32#include <net/ipv6.h>
31#include <net/mld.h> 33#include <net/mld.h>
@@ -42,12 +44,15 @@ static void br_multicast_add_router(struct net_bridge *br,
42static void br_ip4_multicast_leave_group(struct net_bridge *br, 44static void br_ip4_multicast_leave_group(struct net_bridge *br,
43 struct net_bridge_port *port, 45 struct net_bridge_port *port,
44 __be32 group, 46 __be32 group,
45 __u16 vid); 47 __u16 vid,
48 const unsigned char *src);
49
50static void __del_port_router(struct net_bridge_port *p);
46#if IS_ENABLED(CONFIG_IPV6) 51#if IS_ENABLED(CONFIG_IPV6)
47static void br_ip6_multicast_leave_group(struct net_bridge *br, 52static void br_ip6_multicast_leave_group(struct net_bridge *br,
48 struct net_bridge_port *port, 53 struct net_bridge_port *port,
49 const struct in6_addr *group, 54 const struct in6_addr *group,
50 __u16 vid); 55 __u16 vid, const unsigned char *src);
51#endif 56#endif
52unsigned int br_mdb_rehash_seq; 57unsigned int br_mdb_rehash_seq;
53 58
@@ -364,13 +369,18 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
364 __be32 group, 369 __be32 group,
365 u8 *igmp_type) 370 u8 *igmp_type)
366{ 371{
372 struct igmpv3_query *ihv3;
373 size_t igmp_hdr_size;
367 struct sk_buff *skb; 374 struct sk_buff *skb;
368 struct igmphdr *ih; 375 struct igmphdr *ih;
369 struct ethhdr *eth; 376 struct ethhdr *eth;
370 struct iphdr *iph; 377 struct iphdr *iph;
371 378
379 igmp_hdr_size = sizeof(*ih);
380 if (br->multicast_igmp_version == 3)
381 igmp_hdr_size = sizeof(*ihv3);
372 skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*iph) + 382 skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*iph) +
373 sizeof(*ih) + 4); 383 igmp_hdr_size + 4);
374 if (!skb) 384 if (!skb)
375 goto out; 385 goto out;
376 386
@@ -395,7 +405,7 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
395 iph->version = 4; 405 iph->version = 4;
396 iph->ihl = 6; 406 iph->ihl = 6;
397 iph->tos = 0xc0; 407 iph->tos = 0xc0;
398 iph->tot_len = htons(sizeof(*iph) + sizeof(*ih) + 4); 408 iph->tot_len = htons(sizeof(*iph) + igmp_hdr_size + 4);
399 iph->id = 0; 409 iph->id = 0;
400 iph->frag_off = htons(IP_DF); 410 iph->frag_off = htons(IP_DF);
401 iph->ttl = 1; 411 iph->ttl = 1;
@@ -411,17 +421,37 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
411 skb_put(skb, 24); 421 skb_put(skb, 24);
412 422
413 skb_set_transport_header(skb, skb->len); 423 skb_set_transport_header(skb, skb->len);
414 ih = igmp_hdr(skb);
415 *igmp_type = IGMP_HOST_MEMBERSHIP_QUERY; 424 *igmp_type = IGMP_HOST_MEMBERSHIP_QUERY;
416 ih->type = IGMP_HOST_MEMBERSHIP_QUERY;
417 ih->code = (group ? br->multicast_last_member_interval :
418 br->multicast_query_response_interval) /
419 (HZ / IGMP_TIMER_SCALE);
420 ih->group = group;
421 ih->csum = 0;
422 ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));
423 skb_put(skb, sizeof(*ih));
424 425
426 switch (br->multicast_igmp_version) {
427 case 2:
428 ih = igmp_hdr(skb);
429 ih->type = IGMP_HOST_MEMBERSHIP_QUERY;
430 ih->code = (group ? br->multicast_last_member_interval :
431 br->multicast_query_response_interval) /
432 (HZ / IGMP_TIMER_SCALE);
433 ih->group = group;
434 ih->csum = 0;
435 ih->csum = ip_compute_csum((void *)ih, sizeof(*ih));
436 break;
437 case 3:
438 ihv3 = igmpv3_query_hdr(skb);
439 ihv3->type = IGMP_HOST_MEMBERSHIP_QUERY;
440 ihv3->code = (group ? br->multicast_last_member_interval :
441 br->multicast_query_response_interval) /
442 (HZ / IGMP_TIMER_SCALE);
443 ihv3->group = group;
444 ihv3->qqic = br->multicast_query_interval / HZ;
445 ihv3->nsrcs = 0;
446 ihv3->resv = 0;
447 ihv3->suppress = 0;
448 ihv3->qrv = 2;
449 ihv3->csum = 0;
450 ihv3->csum = ip_compute_csum((void *)ihv3, sizeof(*ihv3));
451 break;
452 }
453
454 skb_put(skb, igmp_hdr_size);
425 __skb_pull(skb, sizeof(*eth)); 455 __skb_pull(skb, sizeof(*eth));
426 456
427out: 457out:
@@ -433,15 +463,20 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
433 const struct in6_addr *grp, 463 const struct in6_addr *grp,
434 u8 *igmp_type) 464 u8 *igmp_type)
435{ 465{
436 struct sk_buff *skb; 466 struct mld2_query *mld2q;
467 unsigned long interval;
437 struct ipv6hdr *ip6h; 468 struct ipv6hdr *ip6h;
438 struct mld_msg *mldq; 469 struct mld_msg *mldq;
470 size_t mld_hdr_size;
471 struct sk_buff *skb;
439 struct ethhdr *eth; 472 struct ethhdr *eth;
440 u8 *hopopt; 473 u8 *hopopt;
441 unsigned long interval;
442 474
475 mld_hdr_size = sizeof(*mldq);
476 if (br->multicast_mld_version == 2)
477 mld_hdr_size = sizeof(*mld2q);
443 skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*ip6h) + 478 skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*ip6h) +
444 8 + sizeof(*mldq)); 479 8 + mld_hdr_size);
445 if (!skb) 480 if (!skb)
446 goto out; 481 goto out;
447 482
@@ -460,7 +495,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
460 ip6h = ipv6_hdr(skb); 495 ip6h = ipv6_hdr(skb);
461 496
462 *(__force __be32 *)ip6h = htonl(0x60000000); 497 *(__force __be32 *)ip6h = htonl(0x60000000);
463 ip6h->payload_len = htons(8 + sizeof(*mldq)); 498 ip6h->payload_len = htons(8 + mld_hdr_size);
464 ip6h->nexthdr = IPPROTO_HOPOPTS; 499 ip6h->nexthdr = IPPROTO_HOPOPTS;
465 ip6h->hop_limit = 1; 500 ip6h->hop_limit = 1;
466 ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1)); 501 ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1));
@@ -488,26 +523,47 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
488 523
489 /* ICMPv6 */ 524 /* ICMPv6 */
490 skb_set_transport_header(skb, skb->len); 525 skb_set_transport_header(skb, skb->len);
491 mldq = (struct mld_msg *) icmp6_hdr(skb);
492
493 interval = ipv6_addr_any(grp) ? 526 interval = ipv6_addr_any(grp) ?
494 br->multicast_query_response_interval : 527 br->multicast_query_response_interval :
495 br->multicast_last_member_interval; 528 br->multicast_last_member_interval;
496
497 *igmp_type = ICMPV6_MGM_QUERY; 529 *igmp_type = ICMPV6_MGM_QUERY;
498 mldq->mld_type = ICMPV6_MGM_QUERY; 530 switch (br->multicast_mld_version) {
499 mldq->mld_code = 0; 531 case 1:
500 mldq->mld_cksum = 0; 532 mldq = (struct mld_msg *)icmp6_hdr(skb);
501 mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval)); 533 mldq->mld_type = ICMPV6_MGM_QUERY;
502 mldq->mld_reserved = 0; 534 mldq->mld_code = 0;
503 mldq->mld_mca = *grp; 535 mldq->mld_cksum = 0;
504 536 mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval));
505 /* checksum */ 537 mldq->mld_reserved = 0;
506 mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, 538 mldq->mld_mca = *grp;
507 sizeof(*mldq), IPPROTO_ICMPV6, 539 mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
508 csum_partial(mldq, 540 sizeof(*mldq), IPPROTO_ICMPV6,
509 sizeof(*mldq), 0)); 541 csum_partial(mldq,
510 skb_put(skb, sizeof(*mldq)); 542 sizeof(*mldq),
543 0));
544 break;
545 case 2:
546 mld2q = (struct mld2_query *)icmp6_hdr(skb);
547 mld2q->mld2q_mrc = htons((u16)jiffies_to_msecs(interval));
548 mld2q->mld2q_type = ICMPV6_MGM_QUERY;
549 mld2q->mld2q_code = 0;
550 mld2q->mld2q_cksum = 0;
551 mld2q->mld2q_resv1 = 0;
552 mld2q->mld2q_resv2 = 0;
553 mld2q->mld2q_suppress = 0;
554 mld2q->mld2q_qrv = 2;
555 mld2q->mld2q_nsrcs = 0;
556 mld2q->mld2q_qqic = br->multicast_query_interval / HZ;
557 mld2q->mld2q_mca = *grp;
558 mld2q->mld2q_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
559 sizeof(*mld2q),
560 IPPROTO_ICMPV6,
561 csum_partial(mld2q,
562 sizeof(*mld2q),
563 0));
564 break;
565 }
566 skb_put(skb, mld_hdr_size);
511 567
512 __skb_pull(skb, sizeof(*eth)); 568 __skb_pull(skb, sizeof(*eth));
513 569
@@ -607,7 +663,8 @@ err:
607} 663}
608 664
609struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br, 665struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br,
610 struct net_bridge_port *port, struct br_ip *group) 666 struct net_bridge_port *p,
667 struct br_ip *group)
611{ 668{
612 struct net_bridge_mdb_htable *mdb; 669 struct net_bridge_mdb_htable *mdb;
613 struct net_bridge_mdb_entry *mp; 670 struct net_bridge_mdb_entry *mp;
@@ -623,7 +680,7 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br,
623 } 680 }
624 681
625 hash = br_ip_hash(mdb, group); 682 hash = br_ip_hash(mdb, group);
626 mp = br_multicast_get_group(br, port, group, hash); 683 mp = br_multicast_get_group(br, p, group, hash);
627 switch (PTR_ERR(mp)) { 684 switch (PTR_ERR(mp)) {
628 case 0: 685 case 0:
629 break; 686 break;
@@ -658,7 +715,8 @@ struct net_bridge_port_group *br_multicast_new_port_group(
658 struct net_bridge_port *port, 715 struct net_bridge_port *port,
659 struct br_ip *group, 716 struct br_ip *group,
660 struct net_bridge_port_group __rcu *next, 717 struct net_bridge_port_group __rcu *next,
661 unsigned char flags) 718 unsigned char flags,
719 const unsigned char *src)
662{ 720{
663 struct net_bridge_port_group *p; 721 struct net_bridge_port_group *p;
664 722
@@ -673,16 +731,36 @@ struct net_bridge_port_group *br_multicast_new_port_group(
673 hlist_add_head(&p->mglist, &port->mglist); 731 hlist_add_head(&p->mglist, &port->mglist);
674 setup_timer(&p->timer, br_multicast_port_group_expired, 732 setup_timer(&p->timer, br_multicast_port_group_expired,
675 (unsigned long)p); 733 (unsigned long)p);
734
735 if (src)
736 memcpy(p->eth_addr, src, ETH_ALEN);
737 else
738 memset(p->eth_addr, 0xff, ETH_ALEN);
739
676 return p; 740 return p;
677} 741}
678 742
743static bool br_port_group_equal(struct net_bridge_port_group *p,
744 struct net_bridge_port *port,
745 const unsigned char *src)
746{
747 if (p->port != port)
748 return false;
749
750 if (!(port->flags & BR_MULTICAST_TO_UNICAST))
751 return true;
752
753 return ether_addr_equal(src, p->eth_addr);
754}
755
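br_port_group_equal() above only compares the recorded source MAC when the port has multicast-to-unicast forwarding enabled; otherwise matching on the port alone is enough. A small user-space sketch of that predicate, where the struct layout and the flag constant are stand-ins rather than the bridge's real types:

#include <stdbool.h>
#include <string.h>
#include <stdio.h>

#define ETH_ALEN 6
#define FLAG_MCAST_TO_UNICAST 0x1   /* stand-in for BR_MULTICAST_TO_UNICAST */

struct port {
        unsigned int flags;
};

struct port_group {
        struct port *port;
        unsigned char eth_addr[ETH_ALEN];
};

/* Same shape as br_port_group_equal(): the source MAC only matters
 * when the port forwards multicast as unicast copies. */
static bool port_group_equal(const struct port_group *pg,
                             const struct port *port,
                             const unsigned char *src)
{
        if (pg->port != port)
                return false;
        if (!(port->flags & FLAG_MCAST_TO_UNICAST))
                return true;
        return memcmp(src, pg->eth_addr, ETH_ALEN) == 0;
}

int main(void)
{
        struct port p = { .flags = FLAG_MCAST_TO_UNICAST };
        struct port_group pg = { .port = &p,
                                 .eth_addr = { 0, 1, 2, 3, 4, 5 } };
        unsigned char src[ETH_ALEN] = { 0, 1, 2, 3, 4, 5 };

        printf("%s\n", port_group_equal(&pg, &p, src) ? "match" : "no match");
        return 0;
}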
679static int br_multicast_add_group(struct net_bridge *br, 756static int br_multicast_add_group(struct net_bridge *br,
680 struct net_bridge_port *port, 757 struct net_bridge_port *port,
681 struct br_ip *group) 758 struct br_ip *group,
759 const unsigned char *src)
682{ 760{
683 struct net_bridge_mdb_entry *mp;
684 struct net_bridge_port_group *p;
685 struct net_bridge_port_group __rcu **pp; 761 struct net_bridge_port_group __rcu **pp;
762 struct net_bridge_port_group *p;
763 struct net_bridge_mdb_entry *mp;
686 unsigned long now = jiffies; 764 unsigned long now = jiffies;
687 int err; 765 int err;
688 766
@@ -705,13 +783,13 @@ static int br_multicast_add_group(struct net_bridge *br,
705 for (pp = &mp->ports; 783 for (pp = &mp->ports;
706 (p = mlock_dereference(*pp, br)) != NULL; 784 (p = mlock_dereference(*pp, br)) != NULL;
707 pp = &p->next) { 785 pp = &p->next) {
708 if (p->port == port) 786 if (br_port_group_equal(p, port, src))
709 goto found; 787 goto found;
710 if ((unsigned long)p->port < (unsigned long)port) 788 if ((unsigned long)p->port < (unsigned long)port)
711 break; 789 break;
712 } 790 }
713 791
714 p = br_multicast_new_port_group(port, group, *pp, 0); 792 p = br_multicast_new_port_group(port, group, *pp, 0, src);
715 if (unlikely(!p)) 793 if (unlikely(!p))
716 goto err; 794 goto err;
717 rcu_assign_pointer(*pp, p); 795 rcu_assign_pointer(*pp, p);
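The walk above keeps mp->ports ordered by the numeric value of the port pointer and bails out as soon as an entry compares below the new port. A sketch of the same insert-into-descending-order pattern over a plain singly linked list, with an unsigned long key standing in for the pointer value:

#include <stdio.h>

struct node {
        unsigned long key;       /* stands in for the port pointer value */
        struct node *next;
};

/* Insert while keeping the list in descending key order, the ordering
 * br_multicast_add_group() maintains on mp->ports. */
static void insert_sorted(struct node **head, struct node *n)
{
        struct node **pp = head;

        while (*pp && (*pp)->key >= n->key)
                pp = &(*pp)->next;
        n->next = *pp;
        *pp = n;
}

int main(void)
{
        struct node a = { .key = 10 }, b = { .key = 30 }, c = { .key = 20 };
        struct node *head = NULL;

        insert_sorted(&head, &a);
        insert_sorted(&head, &b);
        insert_sorted(&head, &c);
        for (struct node *n = head; n; n = n->next)
                printf("%lu ", n->key);        /* prints 30 20 10 */
        printf("\n");
        return 0;
}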
@@ -730,7 +808,8 @@ err:
730static int br_ip4_multicast_add_group(struct net_bridge *br, 808static int br_ip4_multicast_add_group(struct net_bridge *br,
731 struct net_bridge_port *port, 809 struct net_bridge_port *port,
732 __be32 group, 810 __be32 group,
733 __u16 vid) 811 __u16 vid,
812 const unsigned char *src)
734{ 813{
735 struct br_ip br_group; 814 struct br_ip br_group;
736 815
@@ -741,14 +820,15 @@ static int br_ip4_multicast_add_group(struct net_bridge *br,
741 br_group.proto = htons(ETH_P_IP); 820 br_group.proto = htons(ETH_P_IP);
742 br_group.vid = vid; 821 br_group.vid = vid;
743 822
744 return br_multicast_add_group(br, port, &br_group); 823 return br_multicast_add_group(br, port, &br_group, src);
745} 824}
746 825
747#if IS_ENABLED(CONFIG_IPV6) 826#if IS_ENABLED(CONFIG_IPV6)
748static int br_ip6_multicast_add_group(struct net_bridge *br, 827static int br_ip6_multicast_add_group(struct net_bridge *br,
749 struct net_bridge_port *port, 828 struct net_bridge_port *port,
750 const struct in6_addr *group, 829 const struct in6_addr *group,
751 __u16 vid) 830 __u16 vid,
831 const unsigned char *src)
752{ 832{
753 struct br_ip br_group; 833 struct br_ip br_group;
754 834
@@ -759,7 +839,7 @@ static int br_ip6_multicast_add_group(struct net_bridge *br,
759 br_group.proto = htons(ETH_P_IPV6); 839 br_group.proto = htons(ETH_P_IPV6);
760 br_group.vid = vid; 840 br_group.vid = vid;
761 841
762 return br_multicast_add_group(br, port, &br_group); 842 return br_multicast_add_group(br, port, &br_group, src);
763} 843}
764#endif 844#endif
765 845
@@ -771,16 +851,10 @@ static void br_multicast_router_expired(unsigned long data)
771 spin_lock(&br->multicast_lock); 851 spin_lock(&br->multicast_lock);
772 if (port->multicast_router == MDB_RTR_TYPE_DISABLED || 852 if (port->multicast_router == MDB_RTR_TYPE_DISABLED ||
773 port->multicast_router == MDB_RTR_TYPE_PERM || 853 port->multicast_router == MDB_RTR_TYPE_PERM ||
774 timer_pending(&port->multicast_router_timer) || 854 timer_pending(&port->multicast_router_timer))
775 hlist_unhashed(&port->rlist))
776 goto out; 855 goto out;
777 856
778 hlist_del_init_rcu(&port->rlist); 857 __del_port_router(port);
779 br_rtr_notify(br->dev, port, RTM_DELMDB);
780 /* Don't allow timer refresh if the router expired */
781 if (port->multicast_router == MDB_RTR_TYPE_TEMP)
782 port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
783
784out: 858out:
785 spin_unlock(&br->multicast_lock); 859 spin_unlock(&br->multicast_lock);
786} 860}
@@ -860,9 +934,9 @@ static void br_multicast_send_query(struct net_bridge *br,
860 struct net_bridge_port *port, 934 struct net_bridge_port *port,
861 struct bridge_mcast_own_query *own_query) 935 struct bridge_mcast_own_query *own_query)
862{ 936{
863 unsigned long time;
864 struct br_ip br_group;
865 struct bridge_mcast_other_query *other_query = NULL; 937 struct bridge_mcast_other_query *other_query = NULL;
938 struct br_ip br_group;
939 unsigned long time;
866 940
867 if (!netif_running(br->dev) || br->multicast_disabled || 941 if (!netif_running(br->dev) || br->multicast_disabled ||
868 !br->multicast_querier) 942 !br->multicast_querier)
@@ -929,6 +1003,18 @@ static void br_ip6_multicast_port_query_expired(unsigned long data)
929} 1003}
930#endif 1004#endif
931 1005
1006static void br_mc_disabled_update(struct net_device *dev, bool value)
1007{
1008 struct switchdev_attr attr = {
1009 .orig_dev = dev,
1010 .id = SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
1011 .flags = SWITCHDEV_F_DEFER,
1012 .u.mc_disabled = value,
1013 };
1014
1015 switchdev_port_attr_set(dev, &attr);
1016}
1017
932int br_multicast_add_port(struct net_bridge_port *port) 1018int br_multicast_add_port(struct net_bridge_port *port)
933{ 1019{
934 port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; 1020 port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
@@ -941,6 +1027,8 @@ int br_multicast_add_port(struct net_bridge_port *port)
941 setup_timer(&port->ip6_own_query.timer, 1027 setup_timer(&port->ip6_own_query.timer,
942 br_ip6_multicast_port_query_expired, (unsigned long)port); 1028 br_ip6_multicast_port_query_expired, (unsigned long)port);
943#endif 1029#endif
1030 br_mc_disabled_update(port->dev, port->br->multicast_disabled);
1031
944 port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats); 1032 port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats);
945 if (!port->mcast_stats) 1033 if (!port->mcast_stats)
946 return -ENOMEM; 1034 return -ENOMEM;
@@ -1008,13 +1096,8 @@ void br_multicast_disable_port(struct net_bridge_port *port)
1008 if (!(pg->flags & MDB_PG_FLAGS_PERMANENT)) 1096 if (!(pg->flags & MDB_PG_FLAGS_PERMANENT))
1009 br_multicast_del_pg(br, pg); 1097 br_multicast_del_pg(br, pg);
1010 1098
1011 if (!hlist_unhashed(&port->rlist)) { 1099 __del_port_router(port);
1012 hlist_del_init_rcu(&port->rlist); 1100
1013 br_rtr_notify(br->dev, port, RTM_DELMDB);
1014 /* Don't allow timer refresh if disabling */
1015 if (port->multicast_router == MDB_RTR_TYPE_TEMP)
1016 port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
1017 }
1018 del_timer(&port->multicast_router_timer); 1101 del_timer(&port->multicast_router_timer);
1019 del_timer(&port->ip4_own_query.timer); 1102 del_timer(&port->ip4_own_query.timer);
1020#if IS_ENABLED(CONFIG_IPV6) 1103#if IS_ENABLED(CONFIG_IPV6)
@@ -1028,6 +1111,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
1028 struct sk_buff *skb, 1111 struct sk_buff *skb,
1029 u16 vid) 1112 u16 vid)
1030{ 1113{
1114 const unsigned char *src;
1031 struct igmpv3_report *ih; 1115 struct igmpv3_report *ih;
1032 struct igmpv3_grec *grec; 1116 struct igmpv3_grec *grec;
1033 int i; 1117 int i;
@@ -1068,12 +1152,14 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
1068 continue; 1152 continue;
1069 } 1153 }
1070 1154
1155 src = eth_hdr(skb)->h_source;
1071 if ((type == IGMPV3_CHANGE_TO_INCLUDE || 1156 if ((type == IGMPV3_CHANGE_TO_INCLUDE ||
1072 type == IGMPV3_MODE_IS_INCLUDE) && 1157 type == IGMPV3_MODE_IS_INCLUDE) &&
1073 ntohs(grec->grec_nsrcs) == 0) { 1158 ntohs(grec->grec_nsrcs) == 0) {
1074 br_ip4_multicast_leave_group(br, port, group, vid); 1159 br_ip4_multicast_leave_group(br, port, group, vid, src);
1075 } else { 1160 } else {
1076 err = br_ip4_multicast_add_group(br, port, group, vid); 1161 err = br_ip4_multicast_add_group(br, port, group, vid,
1162 src);
1077 if (err) 1163 if (err)
1078 break; 1164 break;
1079 } 1165 }
@@ -1088,6 +1174,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
1088 struct sk_buff *skb, 1174 struct sk_buff *skb,
1089 u16 vid) 1175 u16 vid)
1090{ 1176{
1177 const unsigned char *src;
1091 struct icmp6hdr *icmp6h; 1178 struct icmp6hdr *icmp6h;
1092 struct mld2_grec *grec; 1179 struct mld2_grec *grec;
1093 int i; 1180 int i;
@@ -1135,14 +1222,16 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
1135 continue; 1222 continue;
1136 } 1223 }
1137 1224
1225 src = eth_hdr(skb)->h_source;
1138 if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE || 1226 if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE ||
1139 grec->grec_type == MLD2_MODE_IS_INCLUDE) && 1227 grec->grec_type == MLD2_MODE_IS_INCLUDE) &&
1140 ntohs(*nsrcs) == 0) { 1228 ntohs(*nsrcs) == 0) {
1141 br_ip6_multicast_leave_group(br, port, &grec->grec_mca, 1229 br_ip6_multicast_leave_group(br, port, &grec->grec_mca,
1142 vid); 1230 vid, src);
1143 } else { 1231 } else {
1144 err = br_ip6_multicast_add_group(br, port, 1232 err = br_ip6_multicast_add_group(br, port,
1145 &grec->grec_mca, vid); 1233 &grec->grec_mca, vid,
1234 src);
1146 if (err) 1235 if (err)
1147 break; 1236 break;
1148 } 1237 }
@@ -1228,6 +1317,19 @@ br_multicast_update_query_timer(struct net_bridge *br,
1228 mod_timer(&query->timer, jiffies + br->multicast_querier_interval); 1317 mod_timer(&query->timer, jiffies + br->multicast_querier_interval);
1229} 1318}
1230 1319
1320static void br_port_mc_router_state_change(struct net_bridge_port *p,
1321 bool is_mc_router)
1322{
1323 struct switchdev_attr attr = {
1324 .orig_dev = p->dev,
1325 .id = SWITCHDEV_ATTR_ID_PORT_MROUTER,
1326 .flags = SWITCHDEV_F_DEFER,
1327 .u.mrouter = is_mc_router,
1328 };
1329
1330 switchdev_port_attr_set(p->dev, &attr);
1331}
1332
1231/* 1333/*
1232 * Add port to router_list 1334 * Add port to router_list
1233 * list is maintained ordered by pointer value 1335 * list is maintained ordered by pointer value
@@ -1253,6 +1355,7 @@ static void br_multicast_add_router(struct net_bridge *br,
1253 else 1355 else
1254 hlist_add_head_rcu(&port->rlist, &br->router_list); 1356 hlist_add_head_rcu(&port->rlist, &br->router_list);
1255 br_rtr_notify(br->dev, port, RTM_NEWMDB); 1357 br_rtr_notify(br->dev, port, RTM_NEWMDB);
1358 br_port_mc_router_state_change(port, true);
1256} 1359}
1257 1360
1258static void br_multicast_mark_router(struct net_bridge *br, 1361static void br_multicast_mark_router(struct net_bridge *br,
@@ -1458,7 +1561,8 @@ br_multicast_leave_group(struct net_bridge *br,
1458 struct net_bridge_port *port, 1561 struct net_bridge_port *port,
1459 struct br_ip *group, 1562 struct br_ip *group,
1460 struct bridge_mcast_other_query *other_query, 1563 struct bridge_mcast_other_query *other_query,
1461 struct bridge_mcast_own_query *own_query) 1564 struct bridge_mcast_own_query *own_query,
1565 const unsigned char *src)
1462{ 1566{
1463 struct net_bridge_mdb_htable *mdb; 1567 struct net_bridge_mdb_htable *mdb;
1464 struct net_bridge_mdb_entry *mp; 1568 struct net_bridge_mdb_entry *mp;
@@ -1482,7 +1586,7 @@ br_multicast_leave_group(struct net_bridge *br,
1482 for (pp = &mp->ports; 1586 for (pp = &mp->ports;
1483 (p = mlock_dereference(*pp, br)) != NULL; 1587 (p = mlock_dereference(*pp, br)) != NULL;
1484 pp = &p->next) { 1588 pp = &p->next) {
1485 if (p->port != port) 1589 if (!br_port_group_equal(p, port, src))
1486 continue; 1590 continue;
1487 1591
1488 rcu_assign_pointer(*pp, p->next); 1592 rcu_assign_pointer(*pp, p->next);
@@ -1513,7 +1617,7 @@ br_multicast_leave_group(struct net_bridge *br,
1513 for (p = mlock_dereference(mp->ports, br); 1617 for (p = mlock_dereference(mp->ports, br);
1514 p != NULL; 1618 p != NULL;
1515 p = mlock_dereference(p->next, br)) { 1619 p = mlock_dereference(p->next, br)) {
1516 if (p->port != port) 1620 if (!br_port_group_equal(p, port, src))
1517 continue; 1621 continue;
1518 1622
1519 if (!hlist_unhashed(&p->mglist) && 1623 if (!hlist_unhashed(&p->mglist) &&
@@ -1564,7 +1668,8 @@ out:
1564static void br_ip4_multicast_leave_group(struct net_bridge *br, 1668static void br_ip4_multicast_leave_group(struct net_bridge *br,
1565 struct net_bridge_port *port, 1669 struct net_bridge_port *port,
1566 __be32 group, 1670 __be32 group,
1567 __u16 vid) 1671 __u16 vid,
1672 const unsigned char *src)
1568{ 1673{
1569 struct br_ip br_group; 1674 struct br_ip br_group;
1570 struct bridge_mcast_own_query *own_query; 1675 struct bridge_mcast_own_query *own_query;
@@ -1579,14 +1684,15 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br,
1579 br_group.vid = vid; 1684 br_group.vid = vid;
1580 1685
1581 br_multicast_leave_group(br, port, &br_group, &br->ip4_other_query, 1686 br_multicast_leave_group(br, port, &br_group, &br->ip4_other_query,
1582 own_query); 1687 own_query, src);
1583} 1688}
1584 1689
1585#if IS_ENABLED(CONFIG_IPV6) 1690#if IS_ENABLED(CONFIG_IPV6)
1586static void br_ip6_multicast_leave_group(struct net_bridge *br, 1691static void br_ip6_multicast_leave_group(struct net_bridge *br,
1587 struct net_bridge_port *port, 1692 struct net_bridge_port *port,
1588 const struct in6_addr *group, 1693 const struct in6_addr *group,
1589 __u16 vid) 1694 __u16 vid,
1695 const unsigned char *src)
1590{ 1696{
1591 struct br_ip br_group; 1697 struct br_ip br_group;
1592 struct bridge_mcast_own_query *own_query; 1698 struct bridge_mcast_own_query *own_query;
@@ -1601,7 +1707,7 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br,
1601 br_group.vid = vid; 1707 br_group.vid = vid;
1602 1708
1603 br_multicast_leave_group(br, port, &br_group, &br->ip6_other_query, 1709 br_multicast_leave_group(br, port, &br_group, &br->ip6_other_query,
1604 own_query); 1710 own_query, src);
1605} 1711}
1606#endif 1712#endif
1607 1713
@@ -1638,20 +1744,40 @@ static void br_multicast_err_count(const struct net_bridge *br,
1638 u64_stats_update_end(&pstats->syncp); 1744 u64_stats_update_end(&pstats->syncp);
1639} 1745}
1640 1746
1747static void br_multicast_pim(struct net_bridge *br,
1748 struct net_bridge_port *port,
1749 const struct sk_buff *skb)
1750{
1751 unsigned int offset = skb_transport_offset(skb);
1752 struct pimhdr *pimhdr, _pimhdr;
1753
1754 pimhdr = skb_header_pointer(skb, offset, sizeof(_pimhdr), &_pimhdr);
1755 if (!pimhdr || pim_hdr_version(pimhdr) != PIM_VERSION ||
1756 pim_hdr_type(pimhdr) != PIM_TYPE_HELLO)
1757 return;
1758
1759 br_multicast_mark_router(br, port);
1760}
1761
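br_multicast_pim() reads the PIM header through skb_header_pointer(), so the access stays safe even when the header is not in the linear part of the skb, and only a PIMv2 Hello marks the port as a router. A hedged user-space analog of that bounds-checked read over a flat buffer; the simplified pim_hdr layout and the offset are assumptions for illustration:

#include <stddef.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>

struct pim_hdr {                      /* simplified stand-in */
        uint8_t type_ver;             /* version in high nibble, type in low */
        uint8_t reserved;
        uint16_t csum;
};

/* Like skb_header_pointer(): return the header only if the requested
 * bytes exist, copying them into the caller's buffer.  The bounds
 * check is the part that matters. */
static const void *header_pointer(const uint8_t *pkt, size_t pkt_len,
                                  size_t offset, size_t len, void *copy)
{
        if (offset + len > pkt_len)
                return NULL;
        memcpy(copy, pkt + offset, len);
        return copy;
}

int main(void)
{
        uint8_t pkt[32] = { [20] = 0x20 };   /* PIMv2, type 0 (Hello) at offset 20 */
        struct pim_hdr hdr;
        const struct pim_hdr *ph = header_pointer(pkt, sizeof(pkt), 20,
                                                  sizeof(hdr), &hdr);

        if (ph && (ph->type_ver >> 4) == 2 && (ph->type_ver & 0xf) == 0)
                printf("PIMv2 Hello seen, would mark port as router\n");
        return 0;
}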
1641static int br_multicast_ipv4_rcv(struct net_bridge *br, 1762static int br_multicast_ipv4_rcv(struct net_bridge *br,
1642 struct net_bridge_port *port, 1763 struct net_bridge_port *port,
1643 struct sk_buff *skb, 1764 struct sk_buff *skb,
1644 u16 vid) 1765 u16 vid)
1645{ 1766{
1646 struct sk_buff *skb_trimmed = NULL; 1767 struct sk_buff *skb_trimmed = NULL;
1768 const unsigned char *src;
1647 struct igmphdr *ih; 1769 struct igmphdr *ih;
1648 int err; 1770 int err;
1649 1771
1650 err = ip_mc_check_igmp(skb, &skb_trimmed); 1772 err = ip_mc_check_igmp(skb, &skb_trimmed);
1651 1773
1652 if (err == -ENOMSG) { 1774 if (err == -ENOMSG) {
1653 if (!ipv4_is_local_multicast(ip_hdr(skb)->daddr)) 1775 if (!ipv4_is_local_multicast(ip_hdr(skb)->daddr)) {
1654 BR_INPUT_SKB_CB(skb)->mrouters_only = 1; 1776 BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
1777 } else if (pim_ipv4_all_pim_routers(ip_hdr(skb)->daddr)) {
1778 if (ip_hdr(skb)->protocol == IPPROTO_PIM)
1779 br_multicast_pim(br, port, skb);
1780 }
1655 return 0; 1781 return 0;
1656 } else if (err < 0) { 1782 } else if (err < 0) {
1657 br_multicast_err_count(br, port, skb->protocol); 1783 br_multicast_err_count(br, port, skb->protocol);
@@ -1659,13 +1785,14 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
1659 } 1785 }
1660 1786
1661 ih = igmp_hdr(skb); 1787 ih = igmp_hdr(skb);
1788 src = eth_hdr(skb)->h_source;
1662 BR_INPUT_SKB_CB(skb)->igmp = ih->type; 1789 BR_INPUT_SKB_CB(skb)->igmp = ih->type;
1663 1790
1664 switch (ih->type) { 1791 switch (ih->type) {
1665 case IGMP_HOST_MEMBERSHIP_REPORT: 1792 case IGMP_HOST_MEMBERSHIP_REPORT:
1666 case IGMPV2_HOST_MEMBERSHIP_REPORT: 1793 case IGMPV2_HOST_MEMBERSHIP_REPORT:
1667 BR_INPUT_SKB_CB(skb)->mrouters_only = 1; 1794 BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
1668 err = br_ip4_multicast_add_group(br, port, ih->group, vid); 1795 err = br_ip4_multicast_add_group(br, port, ih->group, vid, src);
1669 break; 1796 break;
1670 case IGMPV3_HOST_MEMBERSHIP_REPORT: 1797 case IGMPV3_HOST_MEMBERSHIP_REPORT:
1671 err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid); 1798 err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid);
@@ -1674,7 +1801,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
1674 err = br_ip4_multicast_query(br, port, skb_trimmed, vid); 1801 err = br_ip4_multicast_query(br, port, skb_trimmed, vid);
1675 break; 1802 break;
1676 case IGMP_HOST_LEAVE_MESSAGE: 1803 case IGMP_HOST_LEAVE_MESSAGE:
1677 br_ip4_multicast_leave_group(br, port, ih->group, vid); 1804 br_ip4_multicast_leave_group(br, port, ih->group, vid, src);
1678 break; 1805 break;
1679 } 1806 }
1680 1807
@@ -1694,6 +1821,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
1694 u16 vid) 1821 u16 vid)
1695{ 1822{
1696 struct sk_buff *skb_trimmed = NULL; 1823 struct sk_buff *skb_trimmed = NULL;
1824 const unsigned char *src;
1697 struct mld_msg *mld; 1825 struct mld_msg *mld;
1698 int err; 1826 int err;
1699 1827
@@ -1713,8 +1841,10 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
1713 1841
1714 switch (mld->mld_type) { 1842 switch (mld->mld_type) {
1715 case ICMPV6_MGM_REPORT: 1843 case ICMPV6_MGM_REPORT:
1844 src = eth_hdr(skb)->h_source;
1716 BR_INPUT_SKB_CB(skb)->mrouters_only = 1; 1845 BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
1717 err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid); 1846 err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid,
1847 src);
1718 break; 1848 break;
1719 case ICMPV6_MLD2_REPORT: 1849 case ICMPV6_MLD2_REPORT:
1720 err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid); 1850 err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid);
@@ -1723,7 +1853,8 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
1723 err = br_ip6_multicast_query(br, port, skb_trimmed, vid); 1853 err = br_ip6_multicast_query(br, port, skb_trimmed, vid);
1724 break; 1854 break;
1725 case ICMPV6_MGM_REDUCTION: 1855 case ICMPV6_MGM_REDUCTION:
1726 br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid); 1856 src = eth_hdr(skb)->h_source;
1857 br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid, src);
1727 break; 1858 break;
1728 } 1859 }
1729 1860
@@ -1811,7 +1942,9 @@ void br_multicast_init(struct net_bridge *br)
1811 1942
1812 br->ip4_other_query.delay_time = 0; 1943 br->ip4_other_query.delay_time = 0;
1813 br->ip4_querier.port = NULL; 1944 br->ip4_querier.port = NULL;
1945 br->multicast_igmp_version = 2;
1814#if IS_ENABLED(CONFIG_IPV6) 1946#if IS_ENABLED(CONFIG_IPV6)
1947 br->multicast_mld_version = 1;
1815 br->ip6_other_query.delay_time = 0; 1948 br->ip6_other_query.delay_time = 0;
1816 br->ip6_querier.port = NULL; 1949 br->ip6_querier.port = NULL;
1817#endif 1950#endif
@@ -1898,8 +2031,6 @@ void br_multicast_dev_del(struct net_bridge *br)
1898 2031
1899out: 2032out:
1900 spin_unlock_bh(&br->multicast_lock); 2033 spin_unlock_bh(&br->multicast_lock);
1901
1902 free_percpu(br->mcast_stats);
1903} 2034}
1904 2035
1905int br_multicast_set_router(struct net_bridge *br, unsigned long val) 2036int br_multicast_set_router(struct net_bridge *br, unsigned long val)
@@ -1930,6 +2061,11 @@ static void __del_port_router(struct net_bridge_port *p)
1930 return; 2061 return;
1931 hlist_del_init_rcu(&p->rlist); 2062 hlist_del_init_rcu(&p->rlist);
1932 br_rtr_notify(p->br->dev, p, RTM_DELMDB); 2063 br_rtr_notify(p->br->dev, p, RTM_DELMDB);
2064 br_port_mc_router_state_change(p, false);
2065
2066 /* don't allow timer refresh */
2067 if (p->multicast_router == MDB_RTR_TYPE_TEMP)
2068 p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
1933} 2069}
1934 2070
1935int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) 2071int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
@@ -2007,6 +2143,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val)
2007 if (br->multicast_disabled == !val) 2143 if (br->multicast_disabled == !val)
2008 goto unlock; 2144 goto unlock;
2009 2145
2146 br_mc_disabled_update(br->dev, !val);
2010 br->multicast_disabled = !val; 2147 br->multicast_disabled = !val;
2011 if (br->multicast_disabled) 2148 if (br->multicast_disabled)
2012 goto unlock; 2149 goto unlock;
@@ -2112,6 +2249,44 @@ unlock:
2112 return err; 2249 return err;
2113} 2250}
2114 2251
2252int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val)
2253{
2254 /* Currently we support only version 2 and 3 */
2255 switch (val) {
2256 case 2:
2257 case 3:
2258 break;
2259 default:
2260 return -EINVAL;
2261 }
2262
2263 spin_lock_bh(&br->multicast_lock);
2264 br->multicast_igmp_version = val;
2265 spin_unlock_bh(&br->multicast_lock);
2266
2267 return 0;
2268}
2269
2270#if IS_ENABLED(CONFIG_IPV6)
2271int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val)
2272{
2273 /* Currently we support version 1 and 2 */
2274 switch (val) {
2275 case 1:
2276 case 2:
2277 break;
2278 default:
2279 return -EINVAL;
2280 }
2281
2282 spin_lock_bh(&br->multicast_lock);
2283 br->multicast_mld_version = val;
2284 spin_unlock_bh(&br->multicast_lock);
2285
2286 return 0;
2287}
2288#endif
2289
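Both setters above follow the same validate-then-store shape: reject unsupported versions up front, then write the field under multicast_lock so readers on the packet path see a consistent value. A minimal user-space sketch of that pattern, with a pthread mutex standing in for the spinlock:

#include <pthread.h>
#include <errno.h>
#include <stdio.h>

static pthread_mutex_t cfg_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long igmp_version = 2;

/* Same pattern as br_multicast_set_igmp_version(): reject anything but
 * the supported versions, then update the field under the lock. */
static int set_igmp_version(unsigned long val)
{
        switch (val) {
        case 2:
        case 3:
                break;
        default:
                return -EINVAL;
        }

        pthread_mutex_lock(&cfg_lock);
        igmp_version = val;
        pthread_mutex_unlock(&cfg_lock);
        return 0;
}

int main(void)
{
        printf("set 3 -> %d, set 1 -> %d\n",
               set_igmp_version(3), set_igmp_version(1));
        return 0;
}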
2115/** 2290/**
2116 * br_multicast_list_adjacent - Returns snooped multicast addresses 2291 * br_multicast_list_adjacent - Returns snooped multicast addresses
2117 * @dev: The bridge port adjacent to which to retrieve addresses 2292 * @dev: The bridge port adjacent to which to retrieve addresses
@@ -2354,6 +2529,11 @@ int br_multicast_init_stats(struct net_bridge *br)
2354 return 0; 2529 return 0;
2355} 2530}
2356 2531
2532void br_multicast_uninit_stats(struct net_bridge *br)
2533{
2534 free_percpu(br->mcast_stats);
2535}
2536
2357static void mcast_stats_add_dir(u64 *dst, u64 *src) 2537static void mcast_stats_add_dir(u64 *dst, u64 *src)
2358{ 2538{
2359 dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX]; 2539 dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX];
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 2fe9345c1407..1f1e62095464 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -40,13 +40,13 @@
40#include <net/netfilter/br_netfilter.h> 40#include <net/netfilter/br_netfilter.h>
41#include <net/netns/generic.h> 41#include <net/netns/generic.h>
42 42
43#include <asm/uaccess.h> 43#include <linux/uaccess.h>
44#include "br_private.h" 44#include "br_private.h"
45#ifdef CONFIG_SYSCTL 45#ifdef CONFIG_SYSCTL
46#include <linux/sysctl.h> 46#include <linux/sysctl.h>
47#endif 47#endif
48 48
49static int brnf_net_id __read_mostly; 49static unsigned int brnf_net_id __read_mostly;
50 50
51struct brnf_net { 51struct brnf_net {
52 bool enabled; 52 bool enabled;
@@ -399,7 +399,7 @@ bridged_dnat:
399 br_nf_hook_thresh(NF_BR_PRE_ROUTING, 399 br_nf_hook_thresh(NF_BR_PRE_ROUTING,
400 net, sk, skb, skb->dev, 400 net, sk, skb, skb->dev,
401 NULL, 401 NULL,
402 br_nf_pre_routing_finish); 402 br_nf_pre_routing_finish_bridge);
403 return 0; 403 return 0;
404 } 404 }
405 ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); 405 ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
@@ -521,21 +521,6 @@ static unsigned int br_nf_pre_routing(void *priv,
521} 521}
522 522
523 523
524/* PF_BRIDGE/LOCAL_IN ************************************************/
525/* The packet is locally destined, which requires a real
526 * dst_entry, so detach the fake one. On the way up, the
527 * packet would pass through PRE_ROUTING again (which already
528 * took place when the packet entered the bridge), but we
529 * register an IPv4 PRE_ROUTING 'sabotage' hook that will
530 * prevent this from happening. */
531static unsigned int br_nf_local_in(void *priv,
532 struct sk_buff *skb,
533 const struct nf_hook_state *state)
534{
535 br_drop_fake_rtable(skb);
536 return NF_ACCEPT;
537}
538
539/* PF_BRIDGE/FORWARD *************************************************/ 524/* PF_BRIDGE/FORWARD *************************************************/
540static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) 525static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
541{ 526{
@@ -561,8 +546,8 @@ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff
561 } 546 }
562 nf_bridge_push_encap_header(skb); 547 nf_bridge_push_encap_header(skb);
563 548
564 NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_FORWARD, net, sk, skb, 549 br_nf_hook_thresh(NF_BR_FORWARD, net, sk, skb, in, skb->dev,
565 in, skb->dev, br_forward_finish, 1); 550 br_forward_finish);
566 return 0; 551 return 0;
567} 552}
568 553
@@ -721,18 +706,20 @@ static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb)
721 706
722static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) 707static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
723{ 708{
724 struct nf_bridge_info *nf_bridge; 709 struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
725 unsigned int mtu_reserved; 710 unsigned int mtu, mtu_reserved;
726 711
727 mtu_reserved = nf_bridge_mtu_reduction(skb); 712 mtu_reserved = nf_bridge_mtu_reduction(skb);
713 mtu = skb->dev->mtu;
714
715 if (nf_bridge->frag_max_size && nf_bridge->frag_max_size < mtu)
716 mtu = nf_bridge->frag_max_size;
728 717
729 if (skb_is_gso(skb) || skb->len + mtu_reserved <= skb->dev->mtu) { 718 if (skb_is_gso(skb) || skb->len + mtu_reserved <= mtu) {
730 nf_bridge_info_free(skb); 719 nf_bridge_info_free(skb);
731 return br_dev_queue_push_xmit(net, sk, skb); 720 return br_dev_queue_push_xmit(net, sk, skb);
732 } 721 }
733 722
734 nf_bridge = nf_bridge_info_get(skb);
735
736 /* This is wrong! We should preserve the original fragment 723 /* This is wrong! We should preserve the original fragment
737 * boundaries by preserving frag_list rather than refragmenting. 724 * boundaries by preserving frag_list rather than refragmenting.
738 */ 725 */
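With this change the queue-xmit path fragments against the smaller of the device MTU and the recorded frag_max_size rather than the device MTU alone. The selection itself is just a guarded minimum, sketched here with shortened names:

#include <stdio.h>

/* Mirror of the mtu selection in br_nf_dev_queue_xmit(): prefer the
 * recorded frag_max_size when it is set and tighter than the device MTU. */
static unsigned int effective_mtu(unsigned int dev_mtu,
                                  unsigned int frag_max_size)
{
        if (frag_max_size && frag_max_size < dev_mtu)
                return frag_max_size;
        return dev_mtu;
}

int main(void)
{
        printf("%u %u\n", effective_mtu(1500, 0), effective_mtu(1500, 1400));
        return 0;
}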
@@ -845,8 +832,10 @@ static unsigned int ip_sabotage_in(void *priv,
845 struct sk_buff *skb, 832 struct sk_buff *skb,
846 const struct nf_hook_state *state) 833 const struct nf_hook_state *state)
847{ 834{
848 if (skb->nf_bridge && !skb->nf_bridge->in_prerouting) 835 if (skb->nf_bridge && !skb->nf_bridge->in_prerouting) {
849 return NF_STOP; 836 state->okfn(state->net, state->sk, skb);
837 return NF_STOLEN;
838 }
850 839
851 return NF_ACCEPT; 840 return NF_ACCEPT;
852} 841}
@@ -906,12 +895,6 @@ static struct nf_hook_ops br_nf_ops[] __read_mostly = {
906 .priority = NF_BR_PRI_BRNF, 895 .priority = NF_BR_PRI_BRNF,
907 }, 896 },
908 { 897 {
909 .hook = br_nf_local_in,
910 .pf = NFPROTO_BRIDGE,
911 .hooknum = NF_BR_LOCAL_IN,
912 .priority = NF_BR_PRI_BRNF,
913 },
914 {
915 .hook = br_nf_forward_ip, 898 .hook = br_nf_forward_ip,
916 .pf = NFPROTO_BRIDGE, 899 .pf = NFPROTO_BRIDGE,
917 .hooknum = NF_BR_FORWARD, 900 .hooknum = NF_BR_FORWARD,
@@ -1006,20 +989,20 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net,
1006 struct nf_hook_state state; 989 struct nf_hook_state state;
1007 int ret; 990 int ret;
1008 991
1009 elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]); 992 for (elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]);
1010 993 elem && nf_hook_entry_priority(elem) <= NF_BR_PRI_BRNF;
1011 while (elem && (elem->ops.priority <= NF_BR_PRI_BRNF)) 994 elem = rcu_dereference(elem->next))
1012 elem = rcu_dereference(elem->next); 995 ;
1013 996
1014 if (!elem) 997 if (!elem)
1015 return okfn(net, sk, skb); 998 return okfn(net, sk, skb);
1016 999
1017 /* We may already have this, but read-locks nest anyway */ 1000 /* We may already have this, but read-locks nest anyway */
1018 rcu_read_lock(); 1001 rcu_read_lock();
1019 nf_hook_state_init(&state, elem, hook, NF_BR_PRI_BRNF + 1, 1002 nf_hook_state_init(&state, hook, NFPROTO_BRIDGE, indev, outdev,
1020 NFPROTO_BRIDGE, indev, outdev, sk, net, okfn); 1003 sk, net, okfn);
1021 1004
1022 ret = nf_hook_slow(skb, &state); 1005 ret = nf_hook_slow(skb, &state, elem);
1023 rcu_read_unlock(); 1006 rcu_read_unlock();
1024 if (ret == 1) 1007 if (ret == 1)
1025 ret = okfn(net, sk, skb); 1008 ret = okfn(net, sk, skb);
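The rewritten br_nf_hook_thresh() skips every hook entry whose priority is at or below NF_BR_PRI_BRNF and hands the rest of the chain to nf_hook_slow() starting from the first higher-priority entry. A sketch of that skip over a generic list kept in ascending priority order:

#include <stdio.h>

struct hook {
        int priority;
        struct hook *next;
};

/* Find the first entry strictly above the threshold in a list sorted
 * by ascending priority, as br_nf_hook_thresh() does with
 * NF_BR_PRI_BRNF. */
static struct hook *first_above(struct hook *head, int threshold)
{
        struct hook *h;

        for (h = head; h && h->priority <= threshold; h = h->next)
                ;
        return h;
}

int main(void)
{
        struct hook c = { .priority = 10, .next = NULL };
        struct hook b = { .priority = 0,  .next = &c };
        struct hook a = { .priority = -5, .next = &b };
        struct hook *h = first_above(&a, 0);

        printf("resume at priority %d\n", h ? h->priority : -1);  /* 10 */
        return 0;
}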
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index 5989661c659f..96c072e71ea2 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -38,7 +38,7 @@
38#include <net/route.h> 38#include <net/route.h>
39#include <net/netfilter/br_netfilter.h> 39#include <net/netfilter/br_netfilter.h>
40 40
41#include <asm/uaccess.h> 41#include <linux/uaccess.h>
42#include "br_private.h" 42#include "br_private.h"
43#ifdef CONFIG_SYSCTL 43#ifdef CONFIG_SYSCTL
44#include <linux/sysctl.h> 44#include <linux/sysctl.h>
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index e99037c6f7b7..225ef7d53701 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -20,6 +20,7 @@
20 20
21#include "br_private.h" 21#include "br_private.h"
22#include "br_private_stp.h" 22#include "br_private_stp.h"
23#include "br_private_tunnel.h"
23 24
24static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg, 25static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg,
25 u32 filter_mask) 26 u32 filter_mask)
@@ -95,9 +96,10 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev,
95 u32 filter_mask) 96 u32 filter_mask)
96{ 97{
97 struct net_bridge_vlan_group *vg = NULL; 98 struct net_bridge_vlan_group *vg = NULL;
98 struct net_bridge_port *p; 99 struct net_bridge_port *p = NULL;
99 struct net_bridge *br; 100 struct net_bridge *br;
100 int num_vlan_infos; 101 int num_vlan_infos;
102 size_t vinfo_sz = 0;
101 103
102 rcu_read_lock(); 104 rcu_read_lock();
103 if (br_port_exists(dev)) { 105 if (br_port_exists(dev)) {
@@ -110,8 +112,13 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev,
110 num_vlan_infos = br_get_num_vlan_infos(vg, filter_mask); 112 num_vlan_infos = br_get_num_vlan_infos(vg, filter_mask);
111 rcu_read_unlock(); 113 rcu_read_unlock();
112 114
115 if (p && (p->flags & BR_VLAN_TUNNEL))
116 vinfo_sz += br_get_vlan_tunnel_info_size(vg);
117
113 /* Each VLAN is returned in bridge_vlan_info along with flags */ 118 /* Each VLAN is returned in bridge_vlan_info along with flags */
114 return num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info)); 119 vinfo_sz += num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info));
120
121 return vinfo_sz;
115} 122}
116 123
117static inline size_t br_port_info_size(void) 124static inline size_t br_port_info_size(void)
@@ -123,10 +130,12 @@ static inline size_t br_port_info_size(void)
123 + nla_total_size(1) /* IFLA_BRPORT_GUARD */ 130 + nla_total_size(1) /* IFLA_BRPORT_GUARD */
124 + nla_total_size(1) /* IFLA_BRPORT_PROTECT */ 131 + nla_total_size(1) /* IFLA_BRPORT_PROTECT */
125 + nla_total_size(1) /* IFLA_BRPORT_FAST_LEAVE */ 132 + nla_total_size(1) /* IFLA_BRPORT_FAST_LEAVE */
133 + nla_total_size(1) /* IFLA_BRPORT_MCAST_TO_UCAST */
126 + nla_total_size(1) /* IFLA_BRPORT_LEARNING */ 134 + nla_total_size(1) /* IFLA_BRPORT_LEARNING */
127 + nla_total_size(1) /* IFLA_BRPORT_UNICAST_FLOOD */ 135 + nla_total_size(1) /* IFLA_BRPORT_UNICAST_FLOOD */
128 + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */ 136 + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */
129 + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */ 137 + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */
138 + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */
130 + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ 139 + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */
131 + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ 140 + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */
132 + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ 141 + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */
@@ -173,6 +182,8 @@ static int br_port_fill_attrs(struct sk_buff *skb,
173 !!(p->flags & BR_ROOT_BLOCK)) || 182 !!(p->flags & BR_ROOT_BLOCK)) ||
174 nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, 183 nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE,
175 !!(p->flags & BR_MULTICAST_FAST_LEAVE)) || 184 !!(p->flags & BR_MULTICAST_FAST_LEAVE)) ||
185 nla_put_u8(skb, IFLA_BRPORT_MCAST_TO_UCAST,
186 !!(p->flags & BR_MULTICAST_TO_UNICAST)) ||
176 nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) || 187 nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) ||
177 nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, 188 nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD,
178 !!(p->flags & BR_FLOOD)) || 189 !!(p->flags & BR_FLOOD)) ||
@@ -191,7 +202,9 @@ static int br_port_fill_attrs(struct sk_buff *skb,
191 nla_put_u16(skb, IFLA_BRPORT_NO, p->port_no) || 202 nla_put_u16(skb, IFLA_BRPORT_NO, p->port_no) ||
192 nla_put_u8(skb, IFLA_BRPORT_TOPOLOGY_CHANGE_ACK, 203 nla_put_u8(skb, IFLA_BRPORT_TOPOLOGY_CHANGE_ACK,
193 p->topology_change_ack) || 204 p->topology_change_ack) ||
194 nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending)) 205 nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) ||
206 nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags &
207 BR_VLAN_TUNNEL)))
195 return -EMSGSIZE; 208 return -EMSGSIZE;
196 209
197 timerval = br_timer_value(&p->message_age_timer); 210 timerval = br_timer_value(&p->message_age_timer);
@@ -414,6 +427,9 @@ static int br_fill_ifinfo(struct sk_buff *skb,
414 err = br_fill_ifvlaninfo_compressed(skb, vg); 427 err = br_fill_ifvlaninfo_compressed(skb, vg);
415 else 428 else
416 err = br_fill_ifvlaninfo(skb, vg); 429 err = br_fill_ifvlaninfo(skb, vg);
430
431 if (port && (port->flags & BR_VLAN_TUNNEL))
432 err = br_fill_vlan_tunnel_info(skb, vg);
417 rcu_read_unlock(); 433 rcu_read_unlock();
418 if (err) 434 if (err)
419 goto nla_put_failure; 435 goto nla_put_failure;
@@ -514,60 +530,88 @@ static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p,
514 return err; 530 return err;
515} 531}
516 532
533static int br_process_vlan_info(struct net_bridge *br,
534 struct net_bridge_port *p, int cmd,
535 struct bridge_vlan_info *vinfo_curr,
536 struct bridge_vlan_info **vinfo_last)
537{
538 if (!vinfo_curr->vid || vinfo_curr->vid >= VLAN_VID_MASK)
539 return -EINVAL;
540
541 if (vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
542 /* check if we are already processing a range */
543 if (*vinfo_last)
544 return -EINVAL;
545 *vinfo_last = vinfo_curr;
546 /* don't allow range of pvids */
547 if ((*vinfo_last)->flags & BRIDGE_VLAN_INFO_PVID)
548 return -EINVAL;
549 return 0;
550 }
551
552 if (*vinfo_last) {
553 struct bridge_vlan_info tmp_vinfo;
554 int v, err;
555
556 if (!(vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END))
557 return -EINVAL;
558
559 if (vinfo_curr->vid <= (*vinfo_last)->vid)
560 return -EINVAL;
561
562 memcpy(&tmp_vinfo, *vinfo_last,
563 sizeof(struct bridge_vlan_info));
564 for (v = (*vinfo_last)->vid; v <= vinfo_curr->vid; v++) {
565 tmp_vinfo.vid = v;
566 err = br_vlan_info(br, p, cmd, &tmp_vinfo);
567 if (err)
568 break;
569 }
570 *vinfo_last = NULL;
571
572 return 0;
573 }
574
575 return br_vlan_info(br, p, cmd, vinfo_curr);
576}
577
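br_process_vlan_info() above is stateful across attributes: a RANGE_BEGIN entry is parked in *vinfo_last, and only the matching RANGE_END triggers the per-VID expansion. A user-space sketch of that flow, with stand-in flag values and a print standing in for br_vlan_info(); the pvid-in-range check is omitted:

#include <stdio.h>
#include <errno.h>

#define F_RANGE_BEGIN 0x1   /* stand-in for BRIDGE_VLAN_INFO_RANGE_BEGIN */
#define F_RANGE_END   0x2   /* stand-in for BRIDGE_VLAN_INFO_RANGE_END */

struct vinfo {
        unsigned short vid;
        unsigned short flags;
};

static void apply_vid(int vid)
{
        printf("apply vid %d\n", vid);
}

/* Same shape as br_process_vlan_info(): remember a RANGE_BEGIN entry,
 * expand per-VID when the RANGE_END arrives, otherwise apply a single
 * entry immediately. */
static int process_vinfo(const struct vinfo *cur, const struct vinfo **last)
{
        if (cur->flags & F_RANGE_BEGIN) {
                if (*last)
                        return -EINVAL;        /* already inside a range */
                *last = cur;
                return 0;
        }

        if (*last) {
                if (!(cur->flags & F_RANGE_END) || cur->vid <= (*last)->vid)
                        return -EINVAL;
                for (int v = (*last)->vid; v <= cur->vid; v++)
                        apply_vid(v);
                *last = NULL;
                return 0;
        }

        apply_vid(cur->vid);
        return 0;
}

int main(void)
{
        struct vinfo begin = { .vid = 10, .flags = F_RANGE_BEGIN };
        struct vinfo end   = { .vid = 12, .flags = F_RANGE_END };
        const struct vinfo *last = NULL;

        process_vinfo(&begin, &last);
        process_vinfo(&end, &last);        /* applies vids 10..12 */
        return 0;
}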
517static int br_afspec(struct net_bridge *br, 578static int br_afspec(struct net_bridge *br,
518 struct net_bridge_port *p, 579 struct net_bridge_port *p,
519 struct nlattr *af_spec, 580 struct nlattr *af_spec,
520 int cmd) 581 int cmd)
521{ 582{
522 struct bridge_vlan_info *vinfo_start = NULL; 583 struct bridge_vlan_info *vinfo_curr = NULL;
523 struct bridge_vlan_info *vinfo = NULL; 584 struct bridge_vlan_info *vinfo_last = NULL;
524 struct nlattr *attr; 585 struct nlattr *attr;
525 int err = 0; 586 struct vtunnel_info tinfo_last = {};
526 int rem; 587 struct vtunnel_info tinfo_curr = {};
588 int err = 0, rem;
527 589
528 nla_for_each_nested(attr, af_spec, rem) { 590 nla_for_each_nested(attr, af_spec, rem) {
529 if (nla_type(attr) != IFLA_BRIDGE_VLAN_INFO) 591 err = 0;
530 continue; 592 switch (nla_type(attr)) {
531 if (nla_len(attr) != sizeof(struct bridge_vlan_info)) 593 case IFLA_BRIDGE_VLAN_TUNNEL_INFO:
532 return -EINVAL; 594 if (!(p->flags & BR_VLAN_TUNNEL))
533 vinfo = nla_data(attr);
534 if (!vinfo->vid || vinfo->vid >= VLAN_VID_MASK)
535 return -EINVAL;
536 if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
537 if (vinfo_start)
538 return -EINVAL;
539 vinfo_start = vinfo;
540 /* don't allow range of pvids */
541 if (vinfo_start->flags & BRIDGE_VLAN_INFO_PVID)
542 return -EINVAL; 595 return -EINVAL;
543 continue; 596 err = br_parse_vlan_tunnel_info(attr, &tinfo_curr);
544 } 597 if (err)
545 598 return err;
546 if (vinfo_start) { 599 err = br_process_vlan_tunnel_info(br, p, cmd,
547 struct bridge_vlan_info tmp_vinfo; 600 &tinfo_curr,
548 int v; 601 &tinfo_last);
549 602 if (err)
550 if (!(vinfo->flags & BRIDGE_VLAN_INFO_RANGE_END)) 603 return err;
551 return -EINVAL; 604 break;
552 605 case IFLA_BRIDGE_VLAN_INFO:
553 if (vinfo->vid <= vinfo_start->vid) 606 if (nla_len(attr) != sizeof(struct bridge_vlan_info))
554 return -EINVAL; 607 return -EINVAL;
555 608 vinfo_curr = nla_data(attr);
556 memcpy(&tmp_vinfo, vinfo_start, 609 err = br_process_vlan_info(br, p, cmd, vinfo_curr,
557 sizeof(struct bridge_vlan_info)); 610 &vinfo_last);
558 611 if (err)
559 for (v = vinfo_start->vid; v <= vinfo->vid; v++) { 612 return err;
560 tmp_vinfo.vid = v;
561 err = br_vlan_info(br, p, cmd, &tmp_vinfo);
562 if (err)
563 break;
564 }
565 vinfo_start = NULL;
566 } else {
567 err = br_vlan_info(br, p, cmd, vinfo);
568 }
569 if (err)
570 break; 613 break;
614 }
571 } 615 }
572 616
573 return err; 617 return err;
@@ -586,6 +630,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
586 [IFLA_BRPORT_PROXYARP] = { .type = NLA_U8 }, 630 [IFLA_BRPORT_PROXYARP] = { .type = NLA_U8 },
587 [IFLA_BRPORT_PROXYARP_WIFI] = { .type = NLA_U8 }, 631 [IFLA_BRPORT_PROXYARP_WIFI] = { .type = NLA_U8 },
588 [IFLA_BRPORT_MULTICAST_ROUTER] = { .type = NLA_U8 }, 632 [IFLA_BRPORT_MULTICAST_ROUTER] = { .type = NLA_U8 },
633 [IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 },
589}; 634};
590 635
591/* Change the state of the port and notify spanning tree */ 636/* Change the state of the port and notify spanning tree */
@@ -626,8 +671,9 @@ static void br_set_port_flag(struct net_bridge_port *p, struct nlattr *tb[],
626/* Process bridge protocol info on port */ 671/* Process bridge protocol info on port */
627static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) 672static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
628{ 673{
629 int err;
630 unsigned long old_flags = p->flags; 674 unsigned long old_flags = p->flags;
675 bool br_vlan_tunnel_old = false;
676 int err;
631 677
632 br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE); 678 br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE);
633 br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD); 679 br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD);
@@ -636,9 +682,15 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
636 br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING); 682 br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING);
637 br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); 683 br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD);
638 br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD); 684 br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD);
685 br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_TO_UCAST, BR_MULTICAST_TO_UNICAST);
639 br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP); 686 br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP);
640 br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI); 687 br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI);
641 688
689 br_vlan_tunnel_old = (p->flags & BR_VLAN_TUNNEL) ? true : false;
690 br_set_port_flag(p, tb, IFLA_BRPORT_VLAN_TUNNEL, BR_VLAN_TUNNEL);
691 if (br_vlan_tunnel_old && !(p->flags & BR_VLAN_TUNNEL))
692 nbp_vlan_tunnel_info_flush(p);
693
642 if (tb[IFLA_BRPORT_COST]) { 694 if (tb[IFLA_BRPORT_COST]) {
643 err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST])); 695 err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST]));
644 if (err) 696 if (err)
@@ -781,20 +833,6 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[])
781 return 0; 833 return 0;
782} 834}
783 835
784static int br_dev_newlink(struct net *src_net, struct net_device *dev,
785 struct nlattr *tb[], struct nlattr *data[])
786{
787 struct net_bridge *br = netdev_priv(dev);
788
789 if (tb[IFLA_ADDRESS]) {
790 spin_lock_bh(&br->lock);
791 br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS]));
792 spin_unlock_bh(&br->lock);
793 }
794
795 return register_netdevice(dev);
796}
797
798static int br_port_slave_changelink(struct net_device *brdev, 836static int br_port_slave_changelink(struct net_device *brdev,
799 struct net_device *dev, 837 struct net_device *dev,
800 struct nlattr *tb[], 838 struct nlattr *tb[],
@@ -858,6 +896,8 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
858 [IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 }, 896 [IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 },
859 [IFLA_BR_VLAN_STATS_ENABLED] = { .type = NLA_U8 }, 897 [IFLA_BR_VLAN_STATS_ENABLED] = { .type = NLA_U8 },
860 [IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 }, 898 [IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 },
899 [IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 },
900 [IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 },
861}; 901};
862 902
863static int br_changelink(struct net_device *brdev, struct nlattr *tb[], 903static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
@@ -1069,6 +1109,26 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
1069 mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]); 1109 mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]);
1070 br->multicast_stats_enabled = !!mcast_stats; 1110 br->multicast_stats_enabled = !!mcast_stats;
1071 } 1111 }
1112
1113 if (data[IFLA_BR_MCAST_IGMP_VERSION]) {
1114 __u8 igmp_version;
1115
1116 igmp_version = nla_get_u8(data[IFLA_BR_MCAST_IGMP_VERSION]);
1117 err = br_multicast_set_igmp_version(br, igmp_version);
1118 if (err)
1119 return err;
1120 }
1121
1122#if IS_ENABLED(CONFIG_IPV6)
1123 if (data[IFLA_BR_MCAST_MLD_VERSION]) {
1124 __u8 mld_version;
1125
1126 mld_version = nla_get_u8(data[IFLA_BR_MCAST_MLD_VERSION]);
1127 err = br_multicast_set_mld_version(br, mld_version);
1128 if (err)
1129 return err;
1130 }
1131#endif
1072#endif 1132#endif
1073#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 1133#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
1074 if (data[IFLA_BR_NF_CALL_IPTABLES]) { 1134 if (data[IFLA_BR_NF_CALL_IPTABLES]) {
@@ -1093,6 +1153,28 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
1093 return 0; 1153 return 0;
1094} 1154}
1095 1155
1156static int br_dev_newlink(struct net *src_net, struct net_device *dev,
1157 struct nlattr *tb[], struct nlattr *data[])
1158{
1159 struct net_bridge *br = netdev_priv(dev);
1160 int err;
1161
1162 if (tb[IFLA_ADDRESS]) {
1163 spin_lock_bh(&br->lock);
1164 br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS]));
1165 spin_unlock_bh(&br->lock);
1166 }
1167
1168 err = register_netdevice(dev);
1169 if (err)
1170 return err;
1171
1172 err = br_changelink(dev, tb, data);
1173 if (err)
1174 unregister_netdevice(dev);
1175 return err;
1176}
1177
1096static size_t br_get_size(const struct net_device *brdev) 1178static size_t br_get_size(const struct net_device *brdev)
1097{ 1179{
1098 return nla_total_size(sizeof(u32)) + /* IFLA_BR_FORWARD_DELAY */ 1180 return nla_total_size(sizeof(u32)) + /* IFLA_BR_FORWARD_DELAY */
@@ -1135,6 +1217,8 @@ static size_t br_get_size(const struct net_device *brdev)
1135 nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_INTVL */ 1217 nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_INTVL */
1136 nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */ 1218 nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */
1137 nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */ 1219 nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */
1220 nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_IGMP_VERSION */
1221 nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_MLD_VERSION */
1138#endif 1222#endif
1139#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 1223#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
1140 nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */ 1224 nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */
@@ -1166,7 +1250,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
1166 if (nla_put_u64_64bit(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval, 1250 if (nla_put_u64_64bit(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval,
1167 IFLA_BR_PAD)) 1251 IFLA_BR_PAD))
1168 return -EMSGSIZE; 1252 return -EMSGSIZE;
1169 clockval = br_timer_value(&br->gc_timer); 1253 clockval = br_timer_value(&br->gc_work.timer);
1170 if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD)) 1254 if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD))
1171 return -EMSGSIZE; 1255 return -EMSGSIZE;
1172 1256
@@ -1210,9 +1294,15 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
1210 nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT, 1294 nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT,
1211 br->multicast_last_member_count) || 1295 br->multicast_last_member_count) ||
1212 nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT, 1296 nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT,
1213 br->multicast_startup_query_count)) 1297 br->multicast_startup_query_count) ||
1298 nla_put_u8(skb, IFLA_BR_MCAST_IGMP_VERSION,
1299 br->multicast_igmp_version))
1214 return -EMSGSIZE; 1300 return -EMSGSIZE;
1215 1301#if IS_ENABLED(CONFIG_IPV6)
1302 if (nla_put_u8(skb, IFLA_BR_MCAST_MLD_VERSION,
1303 br->multicast_mld_version))
1304 return -EMSGSIZE;
1305#endif
1216 clockval = jiffies_to_clock_t(br->multicast_last_member_interval); 1306 clockval = jiffies_to_clock_t(br->multicast_last_member_interval);
1217 if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval, 1307 if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval,
1218 IFLA_BR_PAD)) 1308 IFLA_BR_PAD))
diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c
new file mode 100644
index 000000000000..c913491495ab
--- /dev/null
+++ b/net/bridge/br_netlink_tunnel.c
@@ -0,0 +1,294 @@
1/*
2 * Bridge per vlan tunnel port dst_metadata netlink control interface
3 *
4 * Authors:
5 * Roopa Prabhu <roopa@cumulusnetworks.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#include <linux/kernel.h>
14#include <linux/slab.h>
15#include <linux/etherdevice.h>
16#include <net/rtnetlink.h>
17#include <net/net_namespace.h>
18#include <net/sock.h>
19#include <uapi/linux/if_bridge.h>
20#include <net/dst_metadata.h>
21
22#include "br_private.h"
23#include "br_private_tunnel.h"
24
25static size_t __get_vlan_tinfo_size(void)
26{
27 return nla_total_size(0) + /* nest IFLA_BRIDGE_VLAN_TUNNEL_INFO */
28 nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_VLAN_TUNNEL_ID */
29 nla_total_size(sizeof(u16)) + /* IFLA_BRIDGE_VLAN_TUNNEL_VID */
30 nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_VLAN_TUNNEL_FLAGS */
31}
32
33static bool vlan_tunid_inrange(struct net_bridge_vlan *v_curr,
34 struct net_bridge_vlan *v_last)
35{
36 __be32 tunid_curr = tunnel_id_to_key32(v_curr->tinfo.tunnel_id);
37 __be32 tunid_last = tunnel_id_to_key32(v_last->tinfo.tunnel_id);
38
39 return (be32_to_cpu(tunid_curr) - be32_to_cpu(tunid_last)) == 1;
40}
41
42static int __get_num_vlan_tunnel_infos(struct net_bridge_vlan_group *vg)
43{
44 struct net_bridge_vlan *v, *vtbegin = NULL, *vtend = NULL;
45 int num_tinfos = 0;
46
47 /* Count number of vlan infos */
48 list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
49 /* only a context, bridge vlan not activated */
50 if (!br_vlan_should_use(v) || !v->tinfo.tunnel_id)
51 continue;
52
53 if (!vtbegin) {
54 goto initvars;
55 } else if ((v->vid - vtend->vid) == 1 &&
56 vlan_tunid_inrange(v, vtend)) {
57 vtend = v;
58 continue;
59 } else {
60 if ((vtend->vid - vtbegin->vid) > 0)
61 num_tinfos += 2;
62 else
63 num_tinfos += 1;
64 }
65initvars:
66 vtbegin = v;
67 vtend = v;
68 }
69
70 if (vtbegin && vtend) {
71 if ((vtend->vid - vtbegin->vid) > 0)
72 num_tinfos += 2;
73 else
74 num_tinfos += 1;
75 }
76
77 return num_tinfos;
78}
79
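The counting loop above needs two netlink entries for every run where both the VID and the tunnel id step by exactly one, and a single entry for an isolated VLAN. The same counting over a plain sorted array, as a sketch:

#include <stdio.h>

struct vt {
        unsigned short vid;
        unsigned int tunid;
};

/* Count entries the way __get_num_vlan_tunnel_infos() does: a run of
 * consecutive (vid, tunid) pairs costs two entries (range begin + end),
 * a lone pair costs one. */
static int count_tinfos(const struct vt *v, int n)
{
        int count = 0, i = 0;

        while (i < n) {
                int j = i;

                while (j + 1 < n &&
                       v[j + 1].vid == v[j].vid + 1 &&
                       v[j + 1].tunid == v[j].tunid + 1)
                        j++;
                count += (j > i) ? 2 : 1;
                i = j + 1;
        }
        return count;
}

int main(void)
{
        struct vt v[] = {
                { 10, 100 }, { 11, 101 }, { 12, 102 },   /* one range */
                { 20, 300 },                             /* isolated  */
        };

        printf("%d entries\n", count_tinfos(v, 4));      /* prints 3 */
        return 0;
}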
80int br_get_vlan_tunnel_info_size(struct net_bridge_vlan_group *vg)
81{
82 int num_tinfos;
83
84 if (!vg)
85 return 0;
86
87 rcu_read_lock();
88 num_tinfos = __get_num_vlan_tunnel_infos(vg);
89 rcu_read_unlock();
90
91 return num_tinfos * __get_vlan_tinfo_size();
92}
93
94static int br_fill_vlan_tinfo(struct sk_buff *skb, u16 vid,
95 __be64 tunnel_id, u16 flags)
96{
97 __be32 tid = tunnel_id_to_key32(tunnel_id);
98 struct nlattr *tmap;
99
100 tmap = nla_nest_start(skb, IFLA_BRIDGE_VLAN_TUNNEL_INFO);
101 if (!tmap)
102 return -EMSGSIZE;
103 if (nla_put_u32(skb, IFLA_BRIDGE_VLAN_TUNNEL_ID,
104 be32_to_cpu(tid)))
105 goto nla_put_failure;
106 if (nla_put_u16(skb, IFLA_BRIDGE_VLAN_TUNNEL_VID,
107 vid))
108 goto nla_put_failure;
109 if (nla_put_u16(skb, IFLA_BRIDGE_VLAN_TUNNEL_FLAGS,
110 flags))
111 goto nla_put_failure;
112 nla_nest_end(skb, tmap);
113
114 return 0;
115
116nla_put_failure:
117 nla_nest_cancel(skb, tmap);
118
119 return -EMSGSIZE;
120}
121
122static int br_fill_vlan_tinfo_range(struct sk_buff *skb,
123 struct net_bridge_vlan *vtbegin,
124 struct net_bridge_vlan *vtend)
125{
126 int err;
127
128 if (vtend && (vtend->vid - vtbegin->vid) > 0) {
129 /* add range to skb */
130 err = br_fill_vlan_tinfo(skb, vtbegin->vid,
131 vtbegin->tinfo.tunnel_id,
132 BRIDGE_VLAN_INFO_RANGE_BEGIN);
133 if (err)
134 return err;
135
136 err = br_fill_vlan_tinfo(skb, vtend->vid,
137 vtend->tinfo.tunnel_id,
138 BRIDGE_VLAN_INFO_RANGE_END);
139 if (err)
140 return err;
141 } else {
142 err = br_fill_vlan_tinfo(skb, vtbegin->vid,
143 vtbegin->tinfo.tunnel_id,
144 0);
145 if (err)
146 return err;
147 }
148
149 return 0;
150}
151
152int br_fill_vlan_tunnel_info(struct sk_buff *skb,
153 struct net_bridge_vlan_group *vg)
154{
155 struct net_bridge_vlan *vtbegin = NULL;
156 struct net_bridge_vlan *vtend = NULL;
157 struct net_bridge_vlan *v;
158 int err;
159
160 /* Count number of vlan infos */
161 list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
162 /* only a context, bridge vlan not activated */
163 if (!br_vlan_should_use(v))
164 continue;
165
166 if (!v->tinfo.tunnel_dst)
167 continue;
168
169 if (!vtbegin) {
170 goto initvars;
171 } else if ((v->vid - vtend->vid) == 1 &&
172 vlan_tunid_inrange(v, vtend)) {
173 vtend = v;
174 continue;
175 } else {
176 err = br_fill_vlan_tinfo_range(skb, vtbegin, vtend);
177 if (err)
178 return err;
179 }
180initvars:
181 vtbegin = v;
182 vtend = v;
183 }
184
185 if (vtbegin) {
186 err = br_fill_vlan_tinfo_range(skb, vtbegin, vtend);
187 if (err)
188 return err;
189 }
190
191 return 0;
192}
193
194static const struct nla_policy vlan_tunnel_policy[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1] = {
195 [IFLA_BRIDGE_VLAN_TUNNEL_ID] = { .type = NLA_U32 },
196 [IFLA_BRIDGE_VLAN_TUNNEL_VID] = { .type = NLA_U16 },
197 [IFLA_BRIDGE_VLAN_TUNNEL_FLAGS] = { .type = NLA_U16 },
198};
199
200static int br_vlan_tunnel_info(struct net_bridge_port *p, int cmd,
201 u16 vid, u32 tun_id)
202{
203 int err = 0;
204
205 if (!p)
206 return -EINVAL;
207
208 switch (cmd) {
209 case RTM_SETLINK:
210 err = nbp_vlan_tunnel_info_add(p, vid, tun_id);
211 break;
212 case RTM_DELLINK:
213 nbp_vlan_tunnel_info_delete(p, vid);
214 break;
215 }
216
217 return err;
218}
219
220int br_parse_vlan_tunnel_info(struct nlattr *attr,
221 struct vtunnel_info *tinfo)
222{
223 struct nlattr *tb[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1];
224 u32 tun_id;
225 u16 vid, flags = 0;
226 int err;
227
228 memset(tinfo, 0, sizeof(*tinfo));
229
230 err = nla_parse_nested(tb, IFLA_BRIDGE_VLAN_TUNNEL_MAX,
231 attr, vlan_tunnel_policy);
232 if (err < 0)
233 return err;
234
235 if (!tb[IFLA_BRIDGE_VLAN_TUNNEL_ID] ||
236 !tb[IFLA_BRIDGE_VLAN_TUNNEL_VID])
237 return -EINVAL;
238
239 tun_id = nla_get_u32(tb[IFLA_BRIDGE_VLAN_TUNNEL_ID]);
240 vid = nla_get_u16(tb[IFLA_BRIDGE_VLAN_TUNNEL_VID]);
241 if (vid >= VLAN_VID_MASK)
242 return -ERANGE;
243
244 if (tb[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS])
245 flags = nla_get_u16(tb[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS]);
246
247 tinfo->tunid = tun_id;
248 tinfo->vid = vid;
249 tinfo->flags = flags;
250
251 return 0;
252}
253
254int br_process_vlan_tunnel_info(struct net_bridge *br,
255 struct net_bridge_port *p, int cmd,
256 struct vtunnel_info *tinfo_curr,
257 struct vtunnel_info *tinfo_last)
258{
259 int err;
260
261 if (tinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
262 if (tinfo_last->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN)
263 return -EINVAL;
264 memcpy(tinfo_last, tinfo_curr, sizeof(struct vtunnel_info));
265 } else if (tinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END) {
266 int t, v;
267
268 if (!(tinfo_last->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN))
269 return -EINVAL;
270 if ((tinfo_curr->vid - tinfo_last->vid) !=
271 (tinfo_curr->tunid - tinfo_last->tunid))
272 return -EINVAL;
273 t = tinfo_last->tunid;
274 for (v = tinfo_last->vid; v <= tinfo_curr->vid; v++) {
275 err = br_vlan_tunnel_info(p, cmd, v, t);
276 if (err)
277 return err;
278 t++;
279 }
280 memset(tinfo_last, 0, sizeof(struct vtunnel_info));
281 memset(tinfo_curr, 0, sizeof(struct vtunnel_info));
282 } else {
283 if (tinfo_last->flags)
284 return -EINVAL;
285 err = br_vlan_tunnel_info(p, cmd, tinfo_curr->vid,
286 tinfo_curr->tunid);
287 if (err)
288 return err;
289 memset(tinfo_last, 0, sizeof(struct vtunnel_info));
290 memset(tinfo_curr, 0, sizeof(struct vtunnel_info));
291 }
292
293 return 0;
294}
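br_process_vlan_tunnel_info() only expands a range when the VID span and the tunnel-id span are the same length, then walks both in lockstep. A small sketch of that check and expansion, with a print in place of the per-VID nbp_vlan_tunnel_info_add() call:

#include <stdio.h>
#include <errno.h>

/* Same check the function above applies before expanding a range: the
 * VID span and the tunnel-id span must be the same length, then each
 * VID gets the correspondingly offset tunnel id. */
static int expand_range(unsigned short vid_begin, unsigned short vid_end,
                        unsigned int tun_begin, unsigned int tun_end)
{
        unsigned int tun = tun_begin;

        if ((unsigned int)(vid_end - vid_begin) != tun_end - tun_begin)
                return -EINVAL;

        for (int v = vid_begin; v <= vid_end; v++, tun++)
                printf("vid %d -> tunnel %u\n", v, tun);
        return 0;
}

int main(void)
{
        expand_range(10, 12, 1000, 1002);                           /* three mappings */
        printf("mismatch: %d\n", expand_range(10, 12, 1000, 1005)); /* -EINVAL */
        return 0;
}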
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 1b63177e0ccd..0d177280aa84 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -91,6 +91,11 @@ struct br_vlan_stats {
 	struct u64_stats_sync syncp;
 };
 
+struct br_tunnel_info {
+	__be64			tunnel_id;
+	struct metadata_dst	*tunnel_dst;
+};
+
 /**
  * struct net_bridge_vlan - per-vlan entry
  *
@@ -113,6 +118,7 @@ struct br_vlan_stats {
  */
 struct net_bridge_vlan {
 	struct rhash_head	vnode;
+	struct rhash_head	tnode;
 	u16			vid;
 	u16			flags;
 	struct br_vlan_stats __percpu	*stats;
@@ -124,6 +130,9 @@ struct net_bridge_vlan {
 		atomic_t	refcnt;
 		struct net_bridge_vlan	*brvlan;
 	};
+
+	struct br_tunnel_info	tinfo;
+
 	struct list_head	vlist;
 
 	struct rcu_head		rcu;
@@ -145,24 +154,27 @@ struct net_bridge_vlan {
  */
 struct net_bridge_vlan_group {
 	struct rhashtable	vlan_hash;
+	struct rhashtable	tunnel_hash;
 	struct list_head	vlan_list;
 	u16			num_vlans;
 	u16			pvid;
 };
 
-struct net_bridge_fdb_entry
-{
+struct net_bridge_fdb_entry {
 	struct hlist_node	hlist;
 	struct net_bridge_port	*dst;
 
-	unsigned long		updated;
-	unsigned long		used;
 	mac_addr		addr;
 	__u16			vlan_id;
 	unsigned char		is_local:1,
 				is_static:1,
 				added_by_user:1,
 				added_by_external_learn:1;
+
+	/* write-heavy members should not affect lookups */
+	unsigned long		updated ____cacheline_aligned_in_smp;
+	unsigned long		used;
+
 	struct rcu_head		rcu;
 };
 
@@ -177,6 +189,7 @@ struct net_bridge_port_group {
 	struct timer_list	timer;
 	struct br_ip		addr;
 	unsigned char		flags;
+	unsigned char		eth_addr[ETH_ALEN];
 };
 
 struct net_bridge_mdb_entry
@@ -201,12 +214,16 @@ struct net_bridge_mdb_htable
 	u32			ver;
 };
 
-struct net_bridge_port
-{
+struct net_bridge_port {
 	struct net_bridge	*br;
 	struct net_device	*dev;
 	struct list_head	list;
 
+	unsigned long		flags;
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+	struct net_bridge_vlan_group	__rcu *vlgrp;
+#endif
+
 	/* STP */
 	u8			priority;
 	u8			state;
@@ -227,8 +244,6 @@ struct net_bridge_port
 	struct kobject		kobj;
 	struct rcu_head		rcu;
 
-	unsigned long		flags;
-
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	struct bridge_mcast_own_query	ip4_own_query;
 #if IS_ENABLED(CONFIG_IPV6)
@@ -248,9 +263,6 @@ struct net_bridge_port
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	struct netpoll		*np;
 #endif
-#ifdef CONFIG_BRIDGE_VLAN_FILTERING
-	struct net_bridge_vlan_group	__rcu *vlgrp;
-#endif
 #ifdef CONFIG_NET_SWITCHDEV
 	int			offload_fwd_mark;
 #endif
@@ -272,14 +284,21 @@ static inline struct net_bridge_port *br_port_get_rtnl(const struct net_device *
 		rtnl_dereference(dev->rx_handler_data) : NULL;
 }
 
-struct net_bridge
-{
+struct net_bridge {
 	spinlock_t		lock;
+	spinlock_t		hash_lock;
 	struct list_head	port_list;
 	struct net_device	*dev;
-
 	struct pcpu_sw_netstats	__percpu *stats;
-	spinlock_t		hash_lock;
+	/* These fields are accessed on each packet */
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+	u8			vlan_enabled;
+	u8			vlan_stats_enabled;
+	__be16			vlan_proto;
+	u16			default_pvid;
+	struct net_bridge_vlan_group	__rcu *vlgrp;
+#endif
+
 	struct hlist_head	hash[BR_HASH_SIZE];
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	union {
@@ -297,17 +316,20 @@ struct net_bridge
 	bridge_id		designated_root;
 	bridge_id		bridge_id;
 	u32			root_path_cost;
+	unsigned char		topology_change;
+	unsigned char		topology_change_detected;
+	u16			root_port;
 	unsigned long		max_age;
 	unsigned long		hello_time;
 	unsigned long		forward_delay;
-	unsigned long		bridge_max_age;
 	unsigned long		ageing_time;
+	unsigned long		bridge_max_age;
 	unsigned long		bridge_hello_time;
 	unsigned long		bridge_forward_delay;
+	unsigned long		bridge_ageing_time;
 
 	u8			group_addr[ETH_ALEN];
 	bool			group_addr_set;
-	u16			root_port;
 
 	enum {
 		BR_NO_STP,	/* no spanning tree */
@@ -315,9 +337,6 @@ struct net_bridge
 		BR_USER_STP,	/* new RSTP in userspace */
 	} stp_enabled;
 
-	unsigned char		topology_change;
-	unsigned char		topology_change_detected;
-
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	unsigned char		multicast_router;
 
@@ -333,6 +352,8 @@ struct net_bridge
 	u32			multicast_last_member_count;
 	u32			multicast_startup_query_count;
 
+	u8			multicast_igmp_version;
+
 	unsigned long		multicast_last_member_interval;
 	unsigned long		multicast_membership_interval;
 	unsigned long		multicast_querier_interval;
@@ -353,27 +374,20 @@ struct net_bridge
 	struct bridge_mcast_other_query	ip6_other_query;
 	struct bridge_mcast_own_query	ip6_own_query;
 	struct bridge_mcast_querier	ip6_querier;
+	u8			multicast_mld_version;
 #endif /* IS_ENABLED(CONFIG_IPV6) */
 #endif
 
 	struct timer_list	hello_timer;
 	struct timer_list	tcn_timer;
 	struct timer_list	topology_change_timer;
-	struct timer_list	gc_timer;
+	struct delayed_work	gc_work;
 	struct kobject		*ifobj;
 	u32			auto_cnt;
 
 #ifdef CONFIG_NET_SWITCHDEV
 	int			offload_fwd_mark;
 #endif
-
-#ifdef CONFIG_BRIDGE_VLAN_FILTERING
-	struct net_bridge_vlan_group	__rcu *vlgrp;
-	u8			vlan_enabled;
-	u8			vlan_stats_enabled;
-	__be16			vlan_proto;
-	u16			default_pvid;
-#endif
 };
 
 struct br_input_skb_cb {
@@ -490,11 +504,12 @@ void br_fdb_find_delete_local(struct net_bridge *br,
 			      const unsigned char *addr, u16 vid);
 void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr);
 void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr);
-void br_fdb_cleanup(unsigned long arg);
+void br_fdb_cleanup(struct work_struct *work);
 void br_fdb_delete_by_port(struct net_bridge *br,
 			   const struct net_bridge_port *p, u16 vid, int do_all);
-struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
-					  const unsigned char *addr, __u16 vid);
+struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
+					     const unsigned char *addr,
+					     __u16 vid);
 int br_fdb_test_addr(struct net_device *dev, unsigned char *addr);
 int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count,
 		   unsigned long off);
@@ -582,6 +597,10 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val);
 int br_multicast_toggle(struct net_bridge *br, unsigned long val);
 int br_multicast_set_querier(struct net_bridge *br, unsigned long val);
 int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val);
+int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val);
+#if IS_ENABLED(CONFIG_IPV6)
+int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val);
+#endif
 struct net_bridge_mdb_entry *
 br_mdb_ip_get(struct net_bridge_mdb_htable *mdb, struct br_ip *dst);
 struct net_bridge_mdb_entry *
@@ -591,7 +610,7 @@ void br_multicast_free_pg(struct rcu_head *head);
 struct net_bridge_port_group *
 br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group,
 			    struct net_bridge_port_group __rcu *next,
-			    unsigned char flags);
+			    unsigned char flags, const unsigned char *src);
 void br_mdb_init(void);
 void br_mdb_uninit(void);
 void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
@@ -601,6 +620,7 @@ void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port,
 void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p,
 			const struct sk_buff *skb, u8 type, u8 dir);
 int br_multicast_init_stats(struct net_bridge *br);
+void br_multicast_uninit_stats(struct net_bridge *br);
 void br_multicast_get_stats(const struct net_bridge *br,
 			    const struct net_bridge_port *p,
 			    struct br_mcast_stats *dest);
@@ -741,6 +761,10 @@ static inline int br_multicast_init_stats(struct net_bridge *br)
 	return 0;
 }
 
+static inline void br_multicast_uninit_stats(struct net_bridge *br)
+{
+}
+
 static inline int br_multicast_igmp_type(const struct sk_buff *skb)
 {
 	return 0;
@@ -756,6 +780,7 @@ bool br_allowed_egress(struct net_bridge_vlan_group *vg,
 		       const struct sk_buff *skb);
 bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid);
 struct sk_buff *br_handle_vlan(struct net_bridge *br,
+			       const struct net_bridge_port *port,
 			       struct net_bridge_vlan_group *vg,
 			       struct sk_buff *skb);
 int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags);
@@ -855,6 +880,7 @@ static inline bool br_should_learn(struct net_bridge_port *p,
 }
 
 static inline struct sk_buff *br_handle_vlan(struct net_bridge *br,
					     const struct net_bridge_port *port,
 					     struct net_bridge_vlan_group *vg,
 					     struct sk_buff *skb)
 {
@@ -992,6 +1018,7 @@ void __br_set_forward_delay(struct net_bridge *br, unsigned long t);
 int br_set_forward_delay(struct net_bridge *br, unsigned long x);
 int br_set_hello_time(struct net_bridge *br, unsigned long x);
 int br_set_max_age(struct net_bridge *br, unsigned long x);
+int __set_ageing_time(struct net_device *dev, unsigned long t);
 int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time);
 
 
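The net_bridge_fdb_entry reshuffle above is a cache-line split: the fields written on every forwarded packet (updated, used) are pushed onto their own cache line with ____cacheline_aligned_in_smp, so lookups, which only read addr/vlan_id and the flag bits, do not suffer false sharing with those hot writes. A generic sketch of the pattern (struct and field names invented for illustration):

	/* Illustrative only: read-mostly lookup fields first, write-heavy
	 * bookkeeping on a separate cache line. */
	struct example_lookup_entry {
		u64		key;		/* read on every lookup */
		u32		value;

		/* written on every hit; keep off the lookup cache line */
		unsigned long	last_used ____cacheline_aligned_in_smp;
		unsigned long	hits;
	};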
diff --git a/net/bridge/br_private_stp.h b/net/bridge/br_private_stp.h
index 2fe910c4e170..3f7543a29b76 100644
--- a/net/bridge/br_private_stp.h
+++ b/net/bridge/br_private_stp.h
@@ -61,6 +61,7 @@ void br_received_tcn_bpdu(struct net_bridge_port *p);
 void br_transmit_config(struct net_bridge_port *p);
 void br_transmit_tcn(struct net_bridge *br);
 void br_topology_change_detection(struct net_bridge *br);
+void __br_set_topology_change(struct net_bridge *br, unsigned char val);
 
 /* br_stp_bpdu.c */
 void br_send_config_bpdu(struct net_bridge_port *, struct br_config_bpdu *);
diff --git a/net/bridge/br_private_tunnel.h b/net/bridge/br_private_tunnel.h
new file mode 100644
index 000000000000..4a447a378ab3
--- /dev/null
+++ b/net/bridge/br_private_tunnel.h
@@ -0,0 +1,83 @@
1/*
2 * Bridge per vlan tunnels
3 *
4 * Authors:
5 * Roopa Prabhu <roopa@cumulusnetworks.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#ifndef _BR_PRIVATE_TUNNEL_H
14#define _BR_PRIVATE_TUNNEL_H
15
16struct vtunnel_info {
17 u32 tunid;
18 u16 vid;
19 u16 flags;
20};
21
22/* br_netlink_tunnel.c */
23int br_parse_vlan_tunnel_info(struct nlattr *attr,
24 struct vtunnel_info *tinfo);
25int br_process_vlan_tunnel_info(struct net_bridge *br,
26 struct net_bridge_port *p,
27 int cmd,
28 struct vtunnel_info *tinfo_curr,
29 struct vtunnel_info *tinfo_last);
30int br_get_vlan_tunnel_info_size(struct net_bridge_vlan_group *vg);
31int br_fill_vlan_tunnel_info(struct sk_buff *skb,
32 struct net_bridge_vlan_group *vg);
33
34#ifdef CONFIG_BRIDGE_VLAN_FILTERING
35/* br_vlan_tunnel.c */
36int vlan_tunnel_init(struct net_bridge_vlan_group *vg);
37void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg);
38int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid);
39int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id);
40void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port);
41void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
42 struct net_bridge_vlan *vlan);
43int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
44 struct net_bridge_port *p,
45 struct net_bridge_vlan_group *vg);
46int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
47 struct net_bridge_vlan *vlan);
48#else
49static inline int vlan_tunnel_init(struct net_bridge_vlan_group *vg)
50{
51 return 0;
52}
53
54static inline int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port,
55 u16 vid)
56{
57 return 0;
58}
59
60static inline int nbp_vlan_tunnel_info_add(struct net_bridge_port *port,
61 u16 vid, u32 tun_id)
62{
63 return 0;
64}
65
66static inline void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port)
67{
68}
69
70static inline void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
71 struct net_bridge_vlan *vlan)
72{
73}
74
75static inline int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
76 struct net_bridge_port *p,
77 struct net_bridge_vlan_group *vg)
78{
79 return 0;
80}
81#endif
82
83#endif
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 9258b8ef14ff..8f56c2d1f1a7 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -234,7 +234,7 @@ static void br_record_config_timeout_values(struct net_bridge *br,
 	br->max_age = bpdu->max_age;
 	br->hello_time = bpdu->hello_time;
 	br->forward_delay = bpdu->forward_delay;
-	br->topology_change = bpdu->topology_change;
+	__br_set_topology_change(br, bpdu->topology_change);
 }
 
 /* called under bridge lock */
@@ -344,7 +344,7 @@ void br_topology_change_detection(struct net_bridge *br)
 		  isroot ? "propagating" : "sending tcn bpdu");
 
 	if (isroot) {
-		br->topology_change = 1;
+		__br_set_topology_change(br, 1);
 		mod_timer(&br->topology_change_timer, jiffies
 			  + br->bridge_forward_delay + br->bridge_max_age);
 	} else if (!br->topology_change_detected) {
@@ -562,6 +562,24 @@ int br_set_max_age(struct net_bridge *br, unsigned long val)
 
 }
 
+/* called under bridge lock */
+int __set_ageing_time(struct net_device *dev, unsigned long t)
+{
+	struct switchdev_attr attr = {
+		.orig_dev = dev,
+		.id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
+		.flags = SWITCHDEV_F_SKIP_EOPNOTSUPP | SWITCHDEV_F_DEFER,
+		.u.ageing_time = jiffies_to_clock_t(t),
+	};
+	int err;
+
+	err = switchdev_port_attr_set(dev, &attr);
+	if (err && err != -EOPNOTSUPP)
+		return err;
+
+	return 0;
+}
+
 /* Set time interval that dynamic forwarding entries live
  * For pure software bridge, allow values outside the 802.1
  * standard specification for special cases:
@@ -572,25 +590,52 @@ int br_set_max_age(struct net_bridge *br, unsigned long val)
  */
 int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time)
 {
-	struct switchdev_attr attr = {
-		.orig_dev = br->dev,
-		.id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
-		.flags = SWITCHDEV_F_SKIP_EOPNOTSUPP,
-		.u.ageing_time = ageing_time,
-	};
 	unsigned long t = clock_t_to_jiffies(ageing_time);
 	int err;
 
-	err = switchdev_port_attr_set(br->dev, &attr);
-	if (err && err != -EOPNOTSUPP)
+	err = __set_ageing_time(br->dev, t);
+	if (err)
 		return err;
 
+	spin_lock_bh(&br->lock);
+	br->bridge_ageing_time = t;
 	br->ageing_time = t;
-	mod_timer(&br->gc_timer, jiffies);
+	spin_unlock_bh(&br->lock);
+
+	mod_delayed_work(system_long_wq, &br->gc_work, 0);
 
 	return 0;
 }
 
+/* called under bridge lock */
+void __br_set_topology_change(struct net_bridge *br, unsigned char val)
+{
+	unsigned long t;
+	int err;
+
+	if (br->stp_enabled == BR_KERNEL_STP && br->topology_change != val) {
+		/* On topology change, set the bridge ageing time to twice the
+		 * forward delay. Otherwise, restore its default ageing time.
+		 */
+
+		if (val) {
+			t = 2 * br->forward_delay;
+			br_debug(br, "decreasing ageing time to %lu\n", t);
+		} else {
+			t = br->bridge_ageing_time;
+			br_debug(br, "restoring ageing time to %lu\n", t);
+		}
+
+		err = __set_ageing_time(br->dev, t);
+		if (err)
+			br_warn(br, "error offloading ageing time\n");
+		else
+			br->ageing_time = t;
+	}
+
+	br->topology_change = val;
+}
+
 void __br_set_forward_delay(struct net_bridge *br, unsigned long t)
 {
 	br->bridge_forward_delay = t;
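A worked example for __br_set_topology_change() above: with the stock bridge defaults (forward delay 15 s, ageing time 300 s), entering the topology-change state under kernel STP drops the effective ageing time to 2 * 15 s = 30 s, so stale forwarding entries are flushed quickly while the tree reconverges; once the flag clears, ageing is restored from bridge_ageing_time (300 s unless the administrator changed it). Both transitions are also pushed down to switchdev hardware through __set_ageing_time().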
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index d8ad73b38de2..08341d2aa9c9 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -36,12 +36,6 @@ static inline port_id br_make_port_id(__u8 priority, __u16 port_no)
 /* called under bridge lock */
 void br_init_port(struct net_bridge_port *p)
 {
-	struct switchdev_attr attr = {
-		.orig_dev = p->dev,
-		.id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
-		.flags = SWITCHDEV_F_SKIP_EOPNOTSUPP | SWITCHDEV_F_DEFER,
-		.u.ageing_time = jiffies_to_clock_t(p->br->ageing_time),
-	};
 	int err;
 
 	p->port_id = br_make_port_id(p->priority, p->port_no);
@@ -50,9 +44,9 @@ void br_init_port(struct net_bridge_port *p)
 	p->topology_change_ack = 0;
 	p->config_pending = 0;
 
-	err = switchdev_port_attr_set(p->dev, &attr);
-	if (err && err != -EOPNOTSUPP)
-		netdev_err(p->dev, "failed to set HW ageing time\n");
+	err = __set_ageing_time(p->dev, p->br->ageing_time);
+	if (err)
+		netdev_err(p->dev, "failed to offload ageing time\n");
 }
 
 /* NO locks held */
@@ -63,7 +57,7 @@ void br_stp_enable_bridge(struct net_bridge *br)
 	spin_lock_bh(&br->lock);
 	if (br->stp_enabled == BR_KERNEL_STP)
 		mod_timer(&br->hello_timer, jiffies + br->hello_time);
-	mod_timer(&br->gc_timer, jiffies + HZ/10);
+	mod_delayed_work(system_long_wq, &br->gc_work, HZ / 10);
 
 	br_config_bpdu_generation(br);
 
@@ -87,14 +81,14 @@ void br_stp_disable_bridge(struct net_bridge *br)
 
 	}
 
-	br->topology_change = 0;
+	__br_set_topology_change(br, 0);
 	br->topology_change_detected = 0;
 	spin_unlock_bh(&br->lock);
 
 	del_timer_sync(&br->hello_timer);
 	del_timer_sync(&br->topology_change_timer);
 	del_timer_sync(&br->tcn_timer);
-	del_timer_sync(&br->gc_timer);
+	cancel_delayed_work_sync(&br->gc_work);
 }
 
 /* called under bridge lock */
diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
index da058b85aa22..c98b3e5c140a 100644
--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -125,7 +125,7 @@ static void br_topology_change_timer_expired(unsigned long arg)
 	br_debug(br, "topo change timer expired\n");
 	spin_lock(&br->lock);
 	br->topology_change_detected = 0;
-	br->topology_change = 0;
+	__br_set_topology_change(br, 0);
 	spin_unlock(&br->lock);
 }
 
@@ -153,8 +153,6 @@ void br_stp_timer_init(struct net_bridge *br)
 	setup_timer(&br->topology_change_timer,
 		    br_topology_change_timer_expired,
 		    (unsigned long) br);
-
-	setup_timer(&br->gc_timer, br_fdb_cleanup, (unsigned long) br);
 }
 
 void br_stp_port_timer_init(struct net_bridge_port *p)
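With the gc_timer gone, FDB garbage collection now runs from the delayed work item declared in br_private.h above, and br_fdb_cleanup() takes a struct work_struct * (its body lives in br_fdb.c, outside this diff). A minimal sketch of that pattern, with placeholder logic and a made-up function name:

	/* Illustrative only: the delayed_work shape that replaces the old
	 * timer-based callback.  Assumes INIT_DELAYED_WORK(&br->gc_work, ...)
	 * was done at bridge setup. */
	static void example_fdb_gc(struct work_struct *work)
	{
		struct net_bridge *br = container_of(work, struct net_bridge,
						     gc_work.work);

		/* ... walk the forwarding database and expire stale entries ... */

		/* re-arm; the timer version would have used mod_timer() here */
		mod_delayed_work(system_long_wq, &br->gc_work, HZ);
	}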
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index f88c4df3f91e..0b5dd607444c 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -19,6 +19,7 @@
19#include <linux/rtnetlink.h> 19#include <linux/rtnetlink.h>
20#include <linux/spinlock.h> 20#include <linux/spinlock.h>
21#include <linux/times.h> 21#include <linux/times.h>
22#include <linux/sched/signal.h>
22 23
23#include "br_private.h" 24#include "br_private.h"
24 25
@@ -263,7 +264,7 @@ static ssize_t gc_timer_show(struct device *d, struct device_attribute *attr,
263 char *buf) 264 char *buf)
264{ 265{
265 struct net_bridge *br = to_bridge(d); 266 struct net_bridge *br = to_bridge(d);
266 return sprintf(buf, "%ld\n", br_timer_value(&br->gc_timer)); 267 return sprintf(buf, "%ld\n", br_timer_value(&br->gc_work.timer));
267} 268}
268static DEVICE_ATTR_RO(gc_timer); 269static DEVICE_ATTR_RO(gc_timer);
269 270
@@ -440,6 +441,23 @@ static ssize_t hash_max_store(struct device *d, struct device_attribute *attr,
440} 441}
441static DEVICE_ATTR_RW(hash_max); 442static DEVICE_ATTR_RW(hash_max);
442 443
444static ssize_t multicast_igmp_version_show(struct device *d,
445 struct device_attribute *attr,
446 char *buf)
447{
448 struct net_bridge *br = to_bridge(d);
449
450 return sprintf(buf, "%u\n", br->multicast_igmp_version);
451}
452
453static ssize_t multicast_igmp_version_store(struct device *d,
454 struct device_attribute *attr,
455 const char *buf, size_t len)
456{
457 return store_bridge_parm(d, buf, len, br_multicast_set_igmp_version);
458}
459static DEVICE_ATTR_RW(multicast_igmp_version);
460
443static ssize_t multicast_last_member_count_show(struct device *d, 461static ssize_t multicast_last_member_count_show(struct device *d,
444 struct device_attribute *attr, 462 struct device_attribute *attr,
445 char *buf) 463 char *buf)
@@ -642,6 +660,25 @@ static ssize_t multicast_stats_enabled_store(struct device *d,
642 return store_bridge_parm(d, buf, len, set_stats_enabled); 660 return store_bridge_parm(d, buf, len, set_stats_enabled);
643} 661}
644static DEVICE_ATTR_RW(multicast_stats_enabled); 662static DEVICE_ATTR_RW(multicast_stats_enabled);
663
664#if IS_ENABLED(CONFIG_IPV6)
665static ssize_t multicast_mld_version_show(struct device *d,
666 struct device_attribute *attr,
667 char *buf)
668{
669 struct net_bridge *br = to_bridge(d);
670
671 return sprintf(buf, "%u\n", br->multicast_mld_version);
672}
673
674static ssize_t multicast_mld_version_store(struct device *d,
675 struct device_attribute *attr,
676 const char *buf, size_t len)
677{
678 return store_bridge_parm(d, buf, len, br_multicast_set_mld_version);
679}
680static DEVICE_ATTR_RW(multicast_mld_version);
681#endif
645#endif 682#endif
646#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 683#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
647static ssize_t nf_call_iptables_show( 684static ssize_t nf_call_iptables_show(
@@ -809,6 +846,10 @@ static struct attribute *bridge_attrs[] = {
809 &dev_attr_multicast_query_response_interval.attr, 846 &dev_attr_multicast_query_response_interval.attr,
810 &dev_attr_multicast_startup_query_interval.attr, 847 &dev_attr_multicast_startup_query_interval.attr,
811 &dev_attr_multicast_stats_enabled.attr, 848 &dev_attr_multicast_stats_enabled.attr,
849 &dev_attr_multicast_igmp_version.attr,
850#if IS_ENABLED(CONFIG_IPV6)
851 &dev_attr_multicast_mld_version.attr,
852#endif
812#endif 853#endif
813#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 854#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
814 &dev_attr_nf_call_iptables.attr, 855 &dev_attr_nf_call_iptables.attr,
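Usage note (not part of the patch): with these attributes registered, the querier versions should be readable and writable from userspace as /sys/class/net/&lt;bridge&gt;/bridge/multicast_igmp_version (presumably accepting 2 or 3) and, on IPv6 builds, multicast_mld_version (presumably 1 or 2), alongside the existing multicast sysfs knobs.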
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 8bd569695e76..79aee759aba5 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -17,6 +17,7 @@
 #include <linux/if_bridge.h>
 #include <linux/rtnetlink.h>
 #include <linux/spinlock.h>
+#include <linux/sched/signal.h>
 
 #include "br_private.h"
 
@@ -188,6 +189,7 @@ static BRPORT_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router,
 		   store_multicast_router);
 
 BRPORT_ATTR_FLAG(multicast_fast_leave, BR_MULTICAST_FAST_LEAVE);
+BRPORT_ATTR_FLAG(multicast_to_unicast, BR_MULTICAST_TO_UNICAST);
 #endif
 
 static const struct brport_attribute *brport_attrs[] = {
@@ -214,6 +216,7 @@ static const struct brport_attribute *brport_attrs[] = {
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	&brport_attr_multicast_router,
 	&brport_attr_multicast_fast_leave,
+	&brport_attr_multicast_to_unicast,
 #endif
 	&brport_attr_proxyarp,
 	&brport_attr_proxyarp_wifi,
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index b6de4f457161..b838213c408e 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -5,6 +5,7 @@
 #include <net/switchdev.h>
 
 #include "br_private.h"
+#include "br_private_tunnel.h"
 
 static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg,
 			      const void *ptr)
@@ -310,6 +311,7 @@ static int __vlan_del(struct net_bridge_vlan *v)
 	}
 
 	if (masterv != v) {
+		vlan_tunnel_info_del(vg, v);
 		rhashtable_remove_fast(&vg->vlan_hash, &v->vnode,
 				       br_vlan_rht_params);
 		__vlan_del_list(v);
@@ -325,6 +327,7 @@ static void __vlan_group_free(struct net_bridge_vlan_group *vg)
 {
 	WARN_ON(!list_empty(&vg->vlan_list));
 	rhashtable_destroy(&vg->vlan_hash);
+	vlan_tunnel_deinit(vg);
 	kfree(vg);
 }
 
@@ -338,6 +341,7 @@ static void __vlan_flush(struct net_bridge_vlan_group *vg)
 }
 
 struct sk_buff *br_handle_vlan(struct net_bridge *br,
+			       const struct net_bridge_port *p,
 			       struct net_bridge_vlan_group *vg,
 			       struct sk_buff *skb)
 {
@@ -378,6 +382,12 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br,
 
 	if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)
 		skb->vlan_tci = 0;
+
+	if (p && (p->flags & BR_VLAN_TUNNEL) &&
+	    br_handle_egress_vlan_tunnel(skb, v)) {
+		kfree_skb(skb);
+		return NULL;
+	}
 out:
 	return skb;
 }
@@ -613,6 +623,8 @@ int br_vlan_delete(struct net_bridge *br, u16 vid)
 	br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid);
 	br_fdb_delete_by_port(br, NULL, vid, 0);
 
+	vlan_tunnel_info_del(vg, v);
+
 	return __vlan_del(v);
 }
 
@@ -918,6 +930,9 @@ int br_vlan_init(struct net_bridge *br)
 	ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params);
 	if (ret)
 		goto err_rhtbl;
+	ret = vlan_tunnel_init(vg);
+	if (ret)
+		goto err_tunnel_init;
 	INIT_LIST_HEAD(&vg->vlan_list);
 	br->vlan_proto = htons(ETH_P_8021Q);
 	br->default_pvid = 1;
@@ -932,6 +947,8 @@ out:
 	return ret;
 
 err_vlan_add:
+	vlan_tunnel_deinit(vg);
+err_tunnel_init:
 	rhashtable_destroy(&vg->vlan_hash);
 err_rhtbl:
 	kfree(vg);
@@ -961,6 +978,9 @@ int nbp_vlan_init(struct net_bridge_port *p)
 	ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params);
 	if (ret)
 		goto err_rhtbl;
+	ret = vlan_tunnel_init(vg);
+	if (ret)
+		goto err_tunnel_init;
 	INIT_LIST_HEAD(&vg->vlan_list);
 	rcu_assign_pointer(p->vlgrp, vg);
 	if (p->br->default_pvid) {
@@ -976,9 +996,11 @@ out:
 err_vlan_add:
 	RCU_INIT_POINTER(p->vlgrp, NULL);
 	synchronize_rcu();
+	vlan_tunnel_deinit(vg);
+err_tunnel_init:
 	rhashtable_destroy(&vg->vlan_hash);
-err_vlan_enabled:
 err_rhtbl:
+err_vlan_enabled:
 	kfree(vg);
 
 	goto out;
diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c
new file mode 100644
index 000000000000..6d2c4eed2dc8
--- /dev/null
+++ b/net/bridge/br_vlan_tunnel.c
@@ -0,0 +1,205 @@
1/*
2 * Bridge per vlan tunnel port dst_metadata handling code
3 *
4 * Authors:
5 * Roopa Prabhu <roopa@cumulusnetworks.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#include <linux/kernel.h>
14#include <linux/netdevice.h>
15#include <linux/rtnetlink.h>
16#include <linux/slab.h>
17#include <net/switchdev.h>
18#include <net/dst_metadata.h>
19
20#include "br_private.h"
21#include "br_private_tunnel.h"
22
23static inline int br_vlan_tunid_cmp(struct rhashtable_compare_arg *arg,
24 const void *ptr)
25{
26 const struct net_bridge_vlan *vle = ptr;
27 __be64 tunid = *(__be64 *)arg->key;
28
29 return vle->tinfo.tunnel_id != tunid;
30}
31
32static const struct rhashtable_params br_vlan_tunnel_rht_params = {
33 .head_offset = offsetof(struct net_bridge_vlan, tnode),
34 .key_offset = offsetof(struct net_bridge_vlan, tinfo.tunnel_id),
35 .key_len = sizeof(__be64),
36 .nelem_hint = 3,
37 .locks_mul = 1,
38 .obj_cmpfn = br_vlan_tunid_cmp,
39 .automatic_shrinking = true,
40};
41
42static struct net_bridge_vlan *br_vlan_tunnel_lookup(struct rhashtable *tbl,
43 u64 tunnel_id)
44{
45 return rhashtable_lookup_fast(tbl, &tunnel_id,
46 br_vlan_tunnel_rht_params);
47}
48
49void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
50 struct net_bridge_vlan *vlan)
51{
52 if (!vlan->tinfo.tunnel_dst)
53 return;
54 rhashtable_remove_fast(&vg->tunnel_hash, &vlan->tnode,
55 br_vlan_tunnel_rht_params);
56 vlan->tinfo.tunnel_id = 0;
57 dst_release(&vlan->tinfo.tunnel_dst->dst);
58 vlan->tinfo.tunnel_dst = NULL;
59}
60
61static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg,
62 struct net_bridge_vlan *vlan, u32 tun_id)
63{
64 struct metadata_dst *metadata = NULL;
65 __be64 key = key32_to_tunnel_id(cpu_to_be32(tun_id));
66 int err;
67
68 if (vlan->tinfo.tunnel_dst)
69 return -EEXIST;
70
71 metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY,
72 key, 0);
73 if (!metadata)
74 return -EINVAL;
75
76 metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_BRIDGE;
77 vlan->tinfo.tunnel_dst = metadata;
78 vlan->tinfo.tunnel_id = key;
79
80 err = rhashtable_lookup_insert_fast(&vg->tunnel_hash, &vlan->tnode,
81 br_vlan_tunnel_rht_params);
82 if (err)
83 goto out;
84
85 return 0;
86out:
87 dst_release(&vlan->tinfo.tunnel_dst->dst);
88 vlan->tinfo.tunnel_dst = NULL;
89 vlan->tinfo.tunnel_id = 0;
90
91 return err;
92}
93
94/* Must be protected by RTNL.
95 * Must be called with vid in range from 1 to 4094 inclusive.
96 */
97int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id)
98{
99 struct net_bridge_vlan_group *vg;
100 struct net_bridge_vlan *vlan;
101
102 ASSERT_RTNL();
103
104 vg = nbp_vlan_group(port);
105 vlan = br_vlan_find(vg, vid);
106 if (!vlan)
107 return -EINVAL;
108
109 return __vlan_tunnel_info_add(vg, vlan, tun_id);
110}
111
112/* Must be protected by RTNL.
113 * Must be called with vid in range from 1 to 4094 inclusive.
114 */
115int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid)
116{
117 struct net_bridge_vlan_group *vg;
118 struct net_bridge_vlan *v;
119
120 ASSERT_RTNL();
121
122 vg = nbp_vlan_group(port);
123 v = br_vlan_find(vg, vid);
124 if (!v)
125 return -ENOENT;
126
127 vlan_tunnel_info_del(vg, v);
128
129 return 0;
130}
131
132static void __vlan_tunnel_info_flush(struct net_bridge_vlan_group *vg)
133{
134 struct net_bridge_vlan *vlan, *tmp;
135
136 list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist)
137 vlan_tunnel_info_del(vg, vlan);
138}
139
140void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port)
141{
142 struct net_bridge_vlan_group *vg;
143
144 ASSERT_RTNL();
145
146 vg = nbp_vlan_group(port);
147 __vlan_tunnel_info_flush(vg);
148}
149
150int vlan_tunnel_init(struct net_bridge_vlan_group *vg)
151{
152 return rhashtable_init(&vg->tunnel_hash, &br_vlan_tunnel_rht_params);
153}
154
155void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg)
156{
157 rhashtable_destroy(&vg->tunnel_hash);
158}
159
160int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
161 struct net_bridge_port *p,
162 struct net_bridge_vlan_group *vg)
163{
164 struct ip_tunnel_info *tinfo = skb_tunnel_info(skb);
165 struct net_bridge_vlan *vlan;
166
167 if (!vg || !tinfo)
168 return 0;
169
170 /* if already tagged, ignore */
171 if (skb_vlan_tagged(skb))
172 return 0;
173
174 /* lookup vid, given tunnel id */
175 vlan = br_vlan_tunnel_lookup(&vg->tunnel_hash, tinfo->key.tun_id);
176 if (!vlan)
177 return 0;
178
179 skb_dst_drop(skb);
180
181 __vlan_hwaccel_put_tag(skb, p->br->vlan_proto, vlan->vid);
182
183 return 0;
184}
185
186int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
187 struct net_bridge_vlan *vlan)
188{
189 int err;
190
191 if (!vlan || !vlan->tinfo.tunnel_id)
192 return 0;
193
194 if (unlikely(!skb_vlan_tag_present(skb)))
195 return 0;
196
197 skb_dst_drop(skb);
198 err = skb_vlan_pop(skb);
199 if (err)
200 return err;
201
202 skb_dst_set(skb, dst_clone(&vlan->tinfo.tunnel_dst->dst));
203
204 return 0;
205}
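Tying the two directions above together: at ingress the bridge reads the tunnel key already attached to the skb's metadata dst and maps it to a VLAN tag via tunnel_hash, while at egress it pops the tag and attaches the per-VLAN metadata_dst so a collect-metadata tunnel device can recover the tunnel id from the skb. A minimal sketch of reading that key back out, using the same accessors as the code above (the helper name is made up):

	/* Illustrative only: what a later consumer of the egress dst sees. */
	static __be64 example_skb_tunnel_key(const struct sk_buff *skb)
	{
		const struct ip_tunnel_info *info = skb_tunnel_info(skb);

		return info ? info->key.tun_id : 0;
	}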
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index 9cebf47ac840..e7ef1a1ef3a6 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -22,6 +22,7 @@ config NFT_BRIDGE_REJECT
 
 config NF_LOG_BRIDGE
 	tristate "Bridge packet logging"
+	select NF_LOG_COMMON
 
 endif # NF_TABLES_BRIDGE
 
diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c
index 9024283d2bca..279527f8b1fe 100644
--- a/net/bridge/netfilter/ebt_among.c
+++ b/net/bridge/netfilter/ebt_among.c
@@ -187,7 +187,7 @@ static int ebt_among_mt_check(const struct xt_mtchk_param *par)
 	expected_length += ebt_mac_wormhash_size(wh_src);
 
 	if (em->match_size != EBT_ALIGN(expected_length)) {
-		pr_info("wrong size: %d against expected %d, rounded to %Zd\n",
+		pr_info("wrong size: %d against expected %d, rounded to %zd\n",
 			em->match_size, expected_length,
 			EBT_ALIGN(expected_length));
 		return -EINVAL;
diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c
index 070cf134a22f..5929309beaa1 100644
--- a/net/bridge/netfilter/ebt_arpreply.c
+++ b/net/bridge/netfilter/ebt_arpreply.c
@@ -51,7 +51,8 @@ ebt_arpreply_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	if (diptr == NULL)
 		return EBT_DROP;
 
-	arp_send(ARPOP_REPLY, ETH_P_ARP, *siptr, (struct net_device *)par->in,
+	arp_send(ARPOP_REPLY, ETH_P_ARP, *siptr,
+		 (struct net_device *)xt_in(par),
 		 *diptr, shp, info->mac, shp);
 
 	return info->target;
diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c
index 517e78befcb2..61a9f1be1263 100644
--- a/net/bridge/netfilter/ebt_limit.c
+++ b/net/bridge/netfilter/ebt_limit.c
@@ -105,6 +105,7 @@ static struct xt_match ebt_limit_mt_reg __read_mostly = {
 	.match		= ebt_limit_mt,
 	.checkentry	= ebt_limit_mt_check,
 	.matchsize	= sizeof(struct ebt_limit_info),
+	.usersize	= offsetof(struct ebt_limit_info, prev),
 #ifdef CONFIG_COMPAT
 	.compatsize	= sizeof(struct ebt_compat_limit_info),
 #endif
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 9a11086ba6ff..98b9c8e8615e 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -78,7 +78,7 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
 	unsigned int bitmask;
 
 	/* FIXME: Disabled from containers until syslog ns is supported */
-	if (!net_eq(net, &init_net))
+	if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
 		return;
 
 	spin_lock_bh(&ebt_log_lock);
@@ -179,7 +179,7 @@ ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct ebt_log_info *info = par->targinfo;
 	struct nf_loginfo li;
-	struct net *net = par->net;
+	struct net *net = xt_net(par);
 
 	li.type = NF_LOG_TYPE_LOG;
 	li.u.log.level = info->loglevel;
@@ -190,11 +190,12 @@ ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	 * nf_log_packet() with NFT_LOG_TYPE_LOG here. --Pablo
 	 */
 	if (info->bitmask & EBT_LOG_NFLOG)
-		nf_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb,
-			      par->in, par->out, &li, "%s", info->prefix);
+		nf_log_packet(net, NFPROTO_BRIDGE, xt_hooknum(par), skb,
+			      xt_in(par), xt_out(par), &li, "%s",
+			      info->prefix);
 	else
-		ebt_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb, par->in,
-			       par->out, &li, info->prefix);
+		ebt_log_packet(net, NFPROTO_BRIDGE, xt_hooknum(par), skb,
+			       xt_in(par), xt_out(par), &li, info->prefix);
 	return EBT_CONTINUE;
 }
 
diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c
index 54816150608e..c1dc48686200 100644
--- a/net/bridge/netfilter/ebt_nflog.c
+++ b/net/bridge/netfilter/ebt_nflog.c
@@ -23,16 +23,16 @@ static unsigned int
 ebt_nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct ebt_nflog_info *info = par->targinfo;
+	struct net *net = xt_net(par);
 	struct nf_loginfo li;
-	struct net *net = par->net;
 
 	li.type = NF_LOG_TYPE_ULOG;
 	li.u.ulog.copy_len = info->len;
 	li.u.ulog.group = info->group;
 	li.u.ulog.qthreshold = info->threshold;
 
-	nf_log_packet(net, PF_BRIDGE, par->hooknum, skb, par->in,
-		      par->out, &li, "%s", info->prefix);
+	nf_log_packet(net, PF_BRIDGE, xt_hooknum(par), skb, xt_in(par),
+		      xt_out(par), &li, "%s", info->prefix);
 	return EBT_CONTINUE;
 }
 
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index 2e7c4f974340..8d2a85e0594e 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -23,12 +23,12 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	if (!skb_make_writable(skb, 0))
 		return EBT_DROP;
 
-	if (par->hooknum != NF_BR_BROUTING)
+	if (xt_hooknum(par) != NF_BR_BROUTING)
 		/* rcu_read_lock()ed by nf_hook_thresh */
 		ether_addr_copy(eth_hdr(skb)->h_dest,
-				br_port_get_rcu(par->in)->br->dev->dev_addr);
+				br_port_get_rcu(xt_in(par))->br->dev->dev_addr);
 	else
-		ether_addr_copy(eth_hdr(skb)->h_dest, par->in->dev_addr);
+		ether_addr_copy(eth_hdr(skb)->h_dest, xt_in(par)->dev_addr);
 	skb->pkt_type = PACKET_HOST;
 	return info->target;
 }
diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
index ec94c6f1ae88..8fe36dc3aab2 100644
--- a/net/bridge/netfilter/ebtable_broute.c
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -53,7 +53,7 @@ static int ebt_broute(struct sk_buff *skb)
 	struct nf_hook_state state;
 	int ret;
 
-	nf_hook_state_init(&state, NULL, NF_BR_BROUTING, INT_MIN,
+	nf_hook_state_init(&state, NF_BR_BROUTING,
 			   NFPROTO_BRIDGE, skb->dev, NULL, NULL,
 			   dev_net(skb->dev), NULL);
 
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index f5c11bbe27db..79b69917f521 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -23,7 +23,7 @@
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <linux/smp.h>
 #include <linux/cpumask.h>
 #include <linux/audit.h>
@@ -194,12 +194,8 @@ unsigned int ebt_do_table(struct sk_buff *skb,
 	const struct ebt_table_info *private;
 	struct xt_action_param acpar;
 
-	acpar.family = NFPROTO_BRIDGE;
-	acpar.net = state->net;
-	acpar.in = state->in;
-	acpar.out = state->out;
+	acpar.state = state;
 	acpar.hotdrop = false;
-	acpar.hooknum = hook;
 
 	read_lock_bh(&table->lock);
 	private = table->private;
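Storing the whole nf_hook_state in acpar.state is what lets the ebt_* matches and targets in this series switch from the removed acpar.in/out/hooknum/net fields to the xt_in()/xt_out()/xt_hooknum()/xt_net() accessors seen in the ebt_arpreply, ebt_log, ebt_nflog and ebt_redirect hunks above: those helpers simply read the corresponding members back out of the stored state.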
@@ -1350,56 +1346,72 @@ static int update_counters(struct net *net, const void __user *user,
1350 hlp.num_counters, user, len); 1346 hlp.num_counters, user, len);
1351} 1347}
1352 1348
1353static inline int ebt_make_matchname(const struct ebt_entry_match *m, 1349static inline int ebt_obj_to_user(char __user *um, const char *_name,
1354 const char *base, char __user *ubase) 1350 const char *data, int entrysize,
1351 int usersize, int datasize)
1355{ 1352{
1356 char __user *hlp = ubase + ((char *)m - base); 1353 char name[EBT_FUNCTION_MAXNAMELEN] = {0};
1357 char name[EBT_FUNCTION_MAXNAMELEN] = {};
1358 1354
1359 /* ebtables expects 32 bytes long names but xt_match names are 29 bytes 1355 /* ebtables expects 32 bytes long names but xt_match names are 29 bytes
1360 * long. Copy 29 bytes and fill remaining bytes with zeroes. 1356 * long. Copy 29 bytes and fill remaining bytes with zeroes.
1361 */ 1357 */
1362 strlcpy(name, m->u.match->name, sizeof(name)); 1358 strlcpy(name, _name, sizeof(name));
1363 if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN)) 1359 if (copy_to_user(um, name, EBT_FUNCTION_MAXNAMELEN) ||
1360 put_user(datasize, (int __user *)(um + EBT_FUNCTION_MAXNAMELEN)) ||
1361 xt_data_to_user(um + entrysize, data, usersize, datasize))
1364 return -EFAULT; 1362 return -EFAULT;
1363
1365 return 0; 1364 return 0;
1366} 1365}
1367 1366
1368static inline int ebt_make_watchername(const struct ebt_entry_watcher *w, 1367static inline int ebt_match_to_user(const struct ebt_entry_match *m,
1369 const char *base, char __user *ubase) 1368 const char *base, char __user *ubase)
1370{ 1369{
1371 char __user *hlp = ubase + ((char *)w - base); 1370 return ebt_obj_to_user(ubase + ((char *)m - base),
1372 char name[EBT_FUNCTION_MAXNAMELEN] = {}; 1371 m->u.match->name, m->data, sizeof(*m),
1372 m->u.match->usersize, m->match_size);
1373}
1373 1374
1374 strlcpy(name, w->u.watcher->name, sizeof(name)); 1375static inline int ebt_watcher_to_user(const struct ebt_entry_watcher *w,
1375 if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN)) 1376 const char *base, char __user *ubase)
1376 return -EFAULT; 1377{
1377 return 0; 1378 return ebt_obj_to_user(ubase + ((char *)w - base),
1379 w->u.watcher->name, w->data, sizeof(*w),
1380 w->u.watcher->usersize, w->watcher_size);
1378} 1381}
1379 1382
1380static inline int ebt_make_names(struct ebt_entry *e, const char *base, 1383static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base,
1381 char __user *ubase) 1384 char __user *ubase)
1382{ 1385{
1383 int ret; 1386 int ret;
1384 char __user *hlp; 1387 char __user *hlp;
1385 const struct ebt_entry_target *t; 1388 const struct ebt_entry_target *t;
1386 char name[EBT_FUNCTION_MAXNAMELEN] = {};
1387 1389
1388 if (e->bitmask == 0) 1390 if (e->bitmask == 0) {
1391 /* special case !EBT_ENTRY_OR_ENTRIES */
1392 if (copy_to_user(ubase + ((char *)e - base), e,
1393 sizeof(struct ebt_entries)))
1394 return -EFAULT;
1389 return 0; 1395 return 0;
1396 }
1397
1398 if (copy_to_user(ubase + ((char *)e - base), e, sizeof(*e)))
1399 return -EFAULT;
1390 1400
1391 hlp = ubase + (((char *)e + e->target_offset) - base); 1401 hlp = ubase + (((char *)e + e->target_offset) - base);
1392 t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); 1402 t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
1393 1403
1394 ret = EBT_MATCH_ITERATE(e, ebt_make_matchname, base, ubase); 1404 ret = EBT_MATCH_ITERATE(e, ebt_match_to_user, base, ubase);
1395 if (ret != 0) 1405 if (ret != 0)
1396 return ret; 1406 return ret;
1397 ret = EBT_WATCHER_ITERATE(e, ebt_make_watchername, base, ubase); 1407 ret = EBT_WATCHER_ITERATE(e, ebt_watcher_to_user, base, ubase);
1398 if (ret != 0) 1408 if (ret != 0)
1399 return ret; 1409 return ret;
1400 strlcpy(name, t->u.target->name, sizeof(name)); 1410 ret = ebt_obj_to_user(hlp, t->u.target->name, t->data, sizeof(*t),
1401 if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN)) 1411 t->u.target->usersize, t->target_size);
1402 return -EFAULT; 1412 if (ret != 0)
1413 return ret;
1414
1403 return 0; 1415 return 0;
1404} 1416}
1405 1417
@@ -1479,13 +1491,9 @@ static int copy_everything_to_user(struct ebt_table *t, void __user *user,
 	if (ret)
 		return ret;
 
-	if (copy_to_user(tmp.entries, entries, entries_size)) {
-		BUGPRINT("Couldn't copy entries to userspace\n");
-		return -EFAULT;
-	}
 	/* set the match/watcher/target names right */
 	return EBT_ENTRY_ITERATE(entries, entries_size,
-				 ebt_make_names, entries, tmp.entries);
+				 ebt_entry_to_user, entries, tmp.entries);
 }
 
 static int do_ebt_set_ctl(struct sock *sk,
@@ -1634,8 +1642,10 @@ static int compat_match_to_user(struct ebt_entry_match *m, void __user **dstptr,
 	if (match->compat_to_user) {
 		if (match->compat_to_user(cm->data, m->data))
 			return -EFAULT;
-	} else if (copy_to_user(cm->data, m->data, msize))
+	} else {
+		if (xt_data_to_user(cm->data, m->data, match->usersize, msize))
 			return -EFAULT;
+	}
 
 	*size -= ebt_compat_entry_padsize() + off;
 	*dstptr = cm->data;
@@ -1661,8 +1671,10 @@ static int compat_target_to_user(struct ebt_entry_target *t,
 	if (target->compat_to_user) {
 		if (target->compat_to_user(cm->data, t->data))
 			return -EFAULT;
-	} else if (copy_to_user(cm->data, t->data, tsize))
-		return -EFAULT;
+	} else {
+		if (xt_data_to_user(cm->data, t->data, target->usersize, tsize))
+			return -EFAULT;
+	}
 
 	*size -= ebt_compat_entry_padsize() + off;
 	*dstptr = cm->data;
diff --git a/net/bridge/netfilter/nf_log_bridge.c b/net/bridge/netfilter/nf_log_bridge.c
index 1663df598545..bd2b3c78f59b 100644
--- a/net/bridge/netfilter/nf_log_bridge.c
+++ b/net/bridge/netfilter/nf_log_bridge.c
@@ -24,21 +24,8 @@ static void nf_log_bridge_packet(struct net *net, u_int8_t pf,
 			      const struct nf_loginfo *loginfo,
 			      const char *prefix)
 {
-	switch (eth_hdr(skb)->h_proto) {
-	case htons(ETH_P_IP):
-		nf_log_packet(net, NFPROTO_IPV4, hooknum, skb, in, out,
-			      loginfo, "%s", prefix);
-		break;
-	case htons(ETH_P_IPV6):
-		nf_log_packet(net, NFPROTO_IPV6, hooknum, skb, in, out,
-			      loginfo, "%s", prefix);
-		break;
-	case htons(ETH_P_ARP):
-	case htons(ETH_P_RARP):
-		nf_log_packet(net, NFPROTO_ARP, hooknum, skb, in, out,
-			      loginfo, "%s", prefix);
-		break;
-	}
+	nf_log_l2packet(net, pf, eth_hdr(skb)->h_proto, hooknum, skb,
+			in, out, loginfo, prefix);
 }
 
 static struct nf_logger nf_bridge_logger __read_mostly = {
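This is the counterpart of the Kconfig change above: NF_LOG_BRIDGE now selects NF_LOG_COMMON because the bridge logger no longer dispatches per ethertype itself but hands the frame to nf_log_l2packet(), the shared layer-2 logging helper provided by the common netfilter logging module.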
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
index ad47a921b701..5974dbc1ea24 100644
--- a/net/bridge/netfilter/nft_meta_bridge.c
+++ b/net/bridge/netfilter/nft_meta_bridge.c
@@ -23,7 +23,7 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr,
 			   const struct nft_pktinfo *pkt)
 {
 	const struct nft_meta *priv = nft_expr_priv(expr);
-	const struct net_device *in = pkt->in, *out = pkt->out;
+	const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
 	u32 *dest = &regs->data[priv->dreg];
 	const struct net_bridge_port *p;
 
diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index 4b3df6b0e3b9..206dc266ecd2 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -315,17 +315,20 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr,
315 case htons(ETH_P_IP): 315 case htons(ETH_P_IP):
316 switch (priv->type) { 316 switch (priv->type) {
317 case NFT_REJECT_ICMP_UNREACH: 317 case NFT_REJECT_ICMP_UNREACH:
318 nft_reject_br_send_v4_unreach(pkt->net, pkt->skb, 318 nft_reject_br_send_v4_unreach(nft_net(pkt), pkt->skb,
319 pkt->in, pkt->hook, 319 nft_in(pkt),
320 nft_hook(pkt),
320 priv->icmp_code); 321 priv->icmp_code);
321 break; 322 break;
322 case NFT_REJECT_TCP_RST: 323 case NFT_REJECT_TCP_RST:
323 nft_reject_br_send_v4_tcp_reset(pkt->net, pkt->skb, 324 nft_reject_br_send_v4_tcp_reset(nft_net(pkt), pkt->skb,
324 pkt->in, pkt->hook); 325 nft_in(pkt),
326 nft_hook(pkt));
325 break; 327 break;
326 case NFT_REJECT_ICMPX_UNREACH: 328 case NFT_REJECT_ICMPX_UNREACH:
327 nft_reject_br_send_v4_unreach(pkt->net, pkt->skb, 329 nft_reject_br_send_v4_unreach(nft_net(pkt), pkt->skb,
328 pkt->in, pkt->hook, 330 nft_in(pkt),
331 nft_hook(pkt),
329 nft_reject_icmp_code(priv->icmp_code)); 332 nft_reject_icmp_code(priv->icmp_code));
330 break; 333 break;
331 } 334 }
@@ -333,17 +336,20 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr,
333 case htons(ETH_P_IPV6): 336 case htons(ETH_P_IPV6):
334 switch (priv->type) { 337 switch (priv->type) {
335 case NFT_REJECT_ICMP_UNREACH: 338 case NFT_REJECT_ICMP_UNREACH:
336 nft_reject_br_send_v6_unreach(pkt->net, pkt->skb, 339 nft_reject_br_send_v6_unreach(nft_net(pkt), pkt->skb,
337 pkt->in, pkt->hook, 340 nft_in(pkt),
341 nft_hook(pkt),
338 priv->icmp_code); 342 priv->icmp_code);
339 break; 343 break;
340 case NFT_REJECT_TCP_RST: 344 case NFT_REJECT_TCP_RST:
341 nft_reject_br_send_v6_tcp_reset(pkt->net, pkt->skb, 345 nft_reject_br_send_v6_tcp_reset(nft_net(pkt), pkt->skb,
342 pkt->in, pkt->hook); 346 nft_in(pkt),
347 nft_hook(pkt));
343 break; 348 break;
344 case NFT_REJECT_ICMPX_UNREACH: 349 case NFT_REJECT_ICMPX_UNREACH:
345 nft_reject_br_send_v6_unreach(pkt->net, pkt->skb, 350 nft_reject_br_send_v6_unreach(nft_net(pkt), pkt->skb,
346 pkt->in, pkt->hook, 351 nft_in(pkt),
352 nft_hook(pkt),
347 nft_reject_icmpv6_code(priv->icmp_code)); 353 nft_reject_icmpv6_code(priv->icmp_code));
348 break; 354 break;
349 } 355 }
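
Both bridge expressions above (nft_meta_bridge and nft_reject_bridge) stop dereferencing struct nft_pktinfo members directly and read packet metadata through the nft_net()/nft_in()/nft_out()/nft_hook() accessors instead. A minimal sketch of the resulting idiom in an eval routine follows; the accessor internals are not shown in this diff and are assumed here to be thin wrappers around the hook state carried by the pktinfo, and example_eval is a hypothetical expression.

static void example_eval(const struct nft_expr *expr,
			 struct nft_regs *regs,
			 const struct nft_pktinfo *pkt)
{
	struct net *net = nft_net(pkt);			/* was pkt->net  */
	const struct net_device *in = nft_in(pkt);	/* was pkt->in   */
	const struct net_device *out = nft_out(pkt);	/* was pkt->out  */
	unsigned int hook = nft_hook(pkt);		/* was pkt->hook */

	pr_debug("netns %p hook %u in %s out %s\n", net, hook,
		 in ? in->name : "<none>", out ? out->name : "<none>");
}
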
diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
index d730a0f68f46..2d38b6e34203 100644
--- a/net/caif/caif_dev.c
+++ b/net/caif/caif_dev.c
@@ -52,7 +52,7 @@ struct caif_net {
52 struct caif_device_entry_list caifdevs; 52 struct caif_device_entry_list caifdevs;
53}; 53};
54 54
55static int caif_net_id; 55static unsigned int caif_net_id;
56static int q_high = 50; /* Percent */ 56static int q_high = 50; /* Percent */
57 57
58struct cfcnfg *get_cfcnfg(struct net *net) 58struct cfcnfg *get_cfcnfg(struct net *net)
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 92cbbd2afddb..adcad344c843 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -9,7 +9,7 @@
9#include <linux/fs.h> 9#include <linux/fs.h>
10#include <linux/init.h> 10#include <linux/init.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sched.h> 12#include <linux/sched/signal.h>
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/mutex.h> 14#include <linux/mutex.h>
15#include <linux/list.h> 15#include <linux/list.h>
diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c
index fa39fc298708..273cb07f57d8 100644
--- a/net/caif/cfcnfg.c
+++ b/net/caif/cfcnfg.c
@@ -390,8 +390,7 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
390 rcu_read_lock(); 390 rcu_read_lock();
391 391
392 if (adapt_layer == NULL) { 392 if (adapt_layer == NULL) {
393 pr_debug("link setup response but no client exist," 393 pr_debug("link setup response but no client exist, send linkdown back\n");
394 "send linkdown back\n");
395 cfctrl_linkdown_req(cnfg->ctrl, channel_id, NULL); 394 cfctrl_linkdown_req(cnfg->ctrl, channel_id, NULL);
396 goto unlock; 395 goto unlock;
397 } 396 }
@@ -401,8 +400,7 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
401 400
402 phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid); 401 phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid);
403 if (phyinfo == NULL) { 402 if (phyinfo == NULL) {
404 pr_err("ERROR: Link Layer Device disappeared" 403 pr_err("ERROR: Link Layer Device disappeared while connecting\n");
405 "while connecting\n");
406 goto unlock; 404 goto unlock;
407 } 405 }
408 406
@@ -436,8 +434,7 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
436 servicel = cfdbgl_create(channel_id, &phyinfo->dev_info); 434 servicel = cfdbgl_create(channel_id, &phyinfo->dev_info);
437 break; 435 break;
438 default: 436 default:
439 pr_err("Protocol error. Link setup response " 437 pr_err("Protocol error. Link setup response - unknown channel type\n");
440 "- unknown channel type\n");
441 goto unlock; 438 goto unlock;
442 } 439 }
443 if (!servicel) 440 if (!servicel)
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 3408ed51b611..1816fc9f1ee7 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -44,7 +44,6 @@ enum caif_states {
44 44
45struct chnl_net { 45struct chnl_net {
46 struct cflayer chnl; 46 struct cflayer chnl;
47 struct net_device_stats stats;
48 struct caif_connect_request conn_req; 47 struct caif_connect_request conn_req;
49 struct list_head list_field; 48 struct list_head list_field;
50 struct net_device *netdev; 49 struct net_device *netdev;
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 1108079d934f..5488e4a6ccd0 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -445,6 +445,7 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask,
445 * @func: callback function on filter match 445 * @func: callback function on filter match
446 * @data: returned parameter for callback function 446 * @data: returned parameter for callback function
447 * @ident: string for calling module identification 447 * @ident: string for calling module identification
448 * @sk: socket pointer (might be NULL)
448 * 449 *
449 * Description: 450 * Description:
450 * Invokes the callback function with the received sk_buff and the given 451 * Invokes the callback function with the received sk_buff and the given
@@ -468,7 +469,7 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask,
468 */ 469 */
469int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask, 470int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask,
470 void (*func)(struct sk_buff *, void *), void *data, 471 void (*func)(struct sk_buff *, void *), void *data,
471 char *ident) 472 char *ident, struct sock *sk)
472{ 473{
473 struct receiver *r; 474 struct receiver *r;
474 struct hlist_head *rl; 475 struct hlist_head *rl;
@@ -496,6 +497,7 @@ int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask,
496 r->func = func; 497 r->func = func;
497 r->data = data; 498 r->data = data;
498 r->ident = ident; 499 r->ident = ident;
500 r->sk = sk;
499 501
500 hlist_add_head_rcu(&r->list, rl); 502 hlist_add_head_rcu(&r->list, rl);
501 d->entries++; 503 d->entries++;
@@ -520,8 +522,11 @@ EXPORT_SYMBOL(can_rx_register);
520static void can_rx_delete_receiver(struct rcu_head *rp) 522static void can_rx_delete_receiver(struct rcu_head *rp)
521{ 523{
522 struct receiver *r = container_of(rp, struct receiver, rcu); 524 struct receiver *r = container_of(rp, struct receiver, rcu);
525 struct sock *sk = r->sk;
523 526
524 kmem_cache_free(rcv_cache, r); 527 kmem_cache_free(rcv_cache, r);
528 if (sk)
529 sock_put(sk);
525} 530}
526 531
527/** 532/**
@@ -596,8 +601,11 @@ void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask,
596 spin_unlock(&can_rcvlists_lock); 601 spin_unlock(&can_rcvlists_lock);
597 602
598 /* schedule the receiver item for deletion */ 603 /* schedule the receiver item for deletion */
599 if (r) 604 if (r) {
605 if (r->sk)
606 sock_hold(r->sk);
600 call_rcu(&r->rcu, can_rx_delete_receiver); 607 call_rcu(&r->rcu, can_rx_delete_receiver);
608 }
601} 609}
602EXPORT_SYMBOL(can_rx_unregister); 610EXPORT_SYMBOL(can_rx_unregister);
603 611
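
The af_can.c hunks above thread the owning socket into can_rx_register() and hold a reference on it across the RCU-deferred teardown: once a receiver has been unlinked, a concurrent reader can still invoke its callback (and touch the socket) until the grace period ends, so the socket must not be freed before the receiver is. Condensed into one place, the required ordering looks like the sketch below; the function names are hypothetical wrappers, everything else mirrors the hunks above.

static void receiver_free_rcu(struct rcu_head *rp)
{
	struct receiver *r = container_of(rp, struct receiver, rcu);
	struct sock *sk = r->sk;

	kmem_cache_free(rcv_cache, r);		/* receiver is gone ...          */
	if (sk)
		sock_put(sk);			/* ... only now may the sock die */
}

static void receiver_unregister(struct receiver *r)
{
	if (r->sk)
		sock_hold(r->sk);		/* pin the sock across the grace period */
	call_rcu(&r->rcu, receiver_free_rcu);
}
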
diff --git a/net/can/af_can.h b/net/can/af_can.h
index fca0fe9fc45a..b86f5129e838 100644
--- a/net/can/af_can.h
+++ b/net/can/af_can.h
@@ -50,13 +50,14 @@
50 50
51struct receiver { 51struct receiver {
52 struct hlist_node list; 52 struct hlist_node list;
53 struct rcu_head rcu;
54 canid_t can_id; 53 canid_t can_id;
55 canid_t mask; 54 canid_t mask;
56 unsigned long matches; 55 unsigned long matches;
57 void (*func)(struct sk_buff *, void *); 56 void (*func)(struct sk_buff *, void *);
58 void *data; 57 void *data;
59 char *ident; 58 char *ident;
59 struct sock *sk;
60 struct rcu_head rcu;
60}; 61};
61 62
62#define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS) 63#define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS)
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 436a7537e6a9..95d13b233c65 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -199,11 +199,11 @@ static int bcm_proc_show(struct seq_file *m, void *v)
199 199
200 seq_printf(m, "%c ", (op->flags & RX_CHECK_DLC) ? 'd' : ' '); 200 seq_printf(m, "%c ", (op->flags & RX_CHECK_DLC) ? 'd' : ' ');
201 201
202 if (op->kt_ival1.tv64) 202 if (op->kt_ival1)
203 seq_printf(m, "timeo=%lld ", 203 seq_printf(m, "timeo=%lld ",
204 (long long)ktime_to_us(op->kt_ival1)); 204 (long long)ktime_to_us(op->kt_ival1));
205 205
206 if (op->kt_ival2.tv64) 206 if (op->kt_ival2)
207 seq_printf(m, "thr=%lld ", 207 seq_printf(m, "thr=%lld ",
208 (long long)ktime_to_us(op->kt_ival2)); 208 (long long)ktime_to_us(op->kt_ival2));
209 209
@@ -226,11 +226,11 @@ static int bcm_proc_show(struct seq_file *m, void *v)
226 else 226 else
227 seq_printf(m, "[%u] ", op->nframes); 227 seq_printf(m, "[%u] ", op->nframes);
228 228
229 if (op->kt_ival1.tv64) 229 if (op->kt_ival1)
230 seq_printf(m, "t1=%lld ", 230 seq_printf(m, "t1=%lld ",
231 (long long)ktime_to_us(op->kt_ival1)); 231 (long long)ktime_to_us(op->kt_ival1));
232 232
233 if (op->kt_ival2.tv64) 233 if (op->kt_ival2)
234 seq_printf(m, "t2=%lld ", 234 seq_printf(m, "t2=%lld ",
235 (long long)ktime_to_us(op->kt_ival2)); 235 (long long)ktime_to_us(op->kt_ival2));
236 236
@@ -365,11 +365,11 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
365 365
366static void bcm_tx_start_timer(struct bcm_op *op) 366static void bcm_tx_start_timer(struct bcm_op *op)
367{ 367{
368 if (op->kt_ival1.tv64 && op->count) 368 if (op->kt_ival1 && op->count)
369 hrtimer_start(&op->timer, 369 hrtimer_start(&op->timer,
370 ktime_add(ktime_get(), op->kt_ival1), 370 ktime_add(ktime_get(), op->kt_ival1),
371 HRTIMER_MODE_ABS); 371 HRTIMER_MODE_ABS);
372 else if (op->kt_ival2.tv64) 372 else if (op->kt_ival2)
373 hrtimer_start(&op->timer, 373 hrtimer_start(&op->timer,
374 ktime_add(ktime_get(), op->kt_ival2), 374 ktime_add(ktime_get(), op->kt_ival2),
375 HRTIMER_MODE_ABS); 375 HRTIMER_MODE_ABS);
@@ -380,7 +380,7 @@ static void bcm_tx_timeout_tsklet(unsigned long data)
380 struct bcm_op *op = (struct bcm_op *)data; 380 struct bcm_op *op = (struct bcm_op *)data;
381 struct bcm_msg_head msg_head; 381 struct bcm_msg_head msg_head;
382 382
383 if (op->kt_ival1.tv64 && (op->count > 0)) { 383 if (op->kt_ival1 && (op->count > 0)) {
384 384
385 op->count--; 385 op->count--;
386 if (!op->count && (op->flags & TX_COUNTEVT)) { 386 if (!op->count && (op->flags & TX_COUNTEVT)) {
@@ -398,7 +398,7 @@ static void bcm_tx_timeout_tsklet(unsigned long data)
398 } 398 }
399 bcm_can_tx(op); 399 bcm_can_tx(op);
400 400
401 } else if (op->kt_ival2.tv64) 401 } else if (op->kt_ival2)
402 bcm_can_tx(op); 402 bcm_can_tx(op);
403 403
404 bcm_tx_start_timer(op); 404 bcm_tx_start_timer(op);
@@ -459,7 +459,7 @@ static void bcm_rx_update_and_send(struct bcm_op *op,
459 lastdata->flags |= (RX_RECV|RX_THR); 459 lastdata->flags |= (RX_RECV|RX_THR);
460 460
461 /* throttling mode inactive ? */ 461 /* throttling mode inactive ? */
462 if (!op->kt_ival2.tv64) { 462 if (!op->kt_ival2) {
463 /* send RX_CHANGED to the user immediately */ 463 /* send RX_CHANGED to the user immediately */
464 bcm_rx_changed(op, lastdata); 464 bcm_rx_changed(op, lastdata);
465 return; 465 return;
@@ -470,7 +470,7 @@ static void bcm_rx_update_and_send(struct bcm_op *op,
470 return; 470 return;
471 471
472 /* first reception with enabled throttling mode */ 472 /* first reception with enabled throttling mode */
473 if (!op->kt_lastmsg.tv64) 473 if (!op->kt_lastmsg)
474 goto rx_changed_settime; 474 goto rx_changed_settime;
475 475
476 /* got a second frame inside a potential throttle period? */ 476 /* got a second frame inside a potential throttle period? */
@@ -537,7 +537,7 @@ static void bcm_rx_starttimer(struct bcm_op *op)
537 if (op->flags & RX_NO_AUTOTIMER) 537 if (op->flags & RX_NO_AUTOTIMER)
538 return; 538 return;
539 539
540 if (op->kt_ival1.tv64) 540 if (op->kt_ival1)
541 hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL); 541 hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL);
542} 542}
543 543
@@ -643,7 +643,7 @@ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer)
643 return HRTIMER_RESTART; 643 return HRTIMER_RESTART;
644 } else { 644 } else {
645 /* rearm throttle handling */ 645 /* rearm throttle handling */
646 op->kt_lastmsg = ktime_set(0, 0); 646 op->kt_lastmsg = 0;
647 return HRTIMER_NORESTART; 647 return HRTIMER_NORESTART;
648 } 648 }
649} 649}
@@ -734,14 +734,23 @@ static struct bcm_op *bcm_find_op(struct list_head *ops,
734 734
735static void bcm_remove_op(struct bcm_op *op) 735static void bcm_remove_op(struct bcm_op *op)
736{ 736{
737 hrtimer_cancel(&op->timer); 737 if (op->tsklet.func) {
738 hrtimer_cancel(&op->thrtimer); 738 while (test_bit(TASKLET_STATE_SCHED, &op->tsklet.state) ||
739 739 test_bit(TASKLET_STATE_RUN, &op->tsklet.state) ||
740 if (op->tsklet.func) 740 hrtimer_active(&op->timer)) {
741 tasklet_kill(&op->tsklet); 741 hrtimer_cancel(&op->timer);
742 tasklet_kill(&op->tsklet);
743 }
744 }
742 745
743 if (op->thrtsklet.func) 746 if (op->thrtsklet.func) {
744 tasklet_kill(&op->thrtsklet); 747 while (test_bit(TASKLET_STATE_SCHED, &op->thrtsklet.state) ||
748 test_bit(TASKLET_STATE_RUN, &op->thrtsklet.state) ||
749 hrtimer_active(&op->thrtimer)) {
750 hrtimer_cancel(&op->thrtimer);
751 tasklet_kill(&op->thrtsklet);
752 }
753 }
745 754
746 if ((op->frames) && (op->frames != &op->sframe)) 755 if ((op->frames) && (op->frames != &op->sframe))
747 kfree(op->frames); 756 kfree(op->frames);
@@ -1005,7 +1014,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
1005 op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2); 1014 op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2);
1006 1015
1007 /* disable an active timer due to zero values? */ 1016 /* disable an active timer due to zero values? */
1008 if (!op->kt_ival1.tv64 && !op->kt_ival2.tv64) 1017 if (!op->kt_ival1 && !op->kt_ival2)
1009 hrtimer_cancel(&op->timer); 1018 hrtimer_cancel(&op->timer);
1010 } 1019 }
1011 1020
@@ -1189,19 +1198,19 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
1189 op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2); 1198 op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2);
1190 1199
1191 /* disable an active timer due to zero value? */ 1200 /* disable an active timer due to zero value? */
1192 if (!op->kt_ival1.tv64) 1201 if (!op->kt_ival1)
1193 hrtimer_cancel(&op->timer); 1202 hrtimer_cancel(&op->timer);
1194 1203
1195 /* 1204 /*
1196 * In any case cancel the throttle timer, flush 1205 * In any case cancel the throttle timer, flush
1197 * potentially blocked msgs and reset throttle handling 1206 * potentially blocked msgs and reset throttle handling
1198 */ 1207 */
1199 op->kt_lastmsg = ktime_set(0, 0); 1208 op->kt_lastmsg = 0;
1200 hrtimer_cancel(&op->thrtimer); 1209 hrtimer_cancel(&op->thrtimer);
1201 bcm_rx_thr_flush(op, 1); 1210 bcm_rx_thr_flush(op, 1);
1202 } 1211 }
1203 1212
1204 if ((op->flags & STARTTIMER) && op->kt_ival1.tv64) 1213 if ((op->flags & STARTTIMER) && op->kt_ival1)
1205 hrtimer_start(&op->timer, op->kt_ival1, 1214 hrtimer_start(&op->timer, op->kt_ival1,
1206 HRTIMER_MODE_REL); 1215 HRTIMER_MODE_REL);
1207 } 1216 }
@@ -1216,7 +1225,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
1216 err = can_rx_register(dev, op->can_id, 1225 err = can_rx_register(dev, op->can_id,
1217 REGMASK(op->can_id), 1226 REGMASK(op->can_id),
1218 bcm_rx_handler, op, 1227 bcm_rx_handler, op,
1219 "bcm"); 1228 "bcm", sk);
1220 1229
1221 op->rx_reg_dev = dev; 1230 op->rx_reg_dev = dev;
1222 dev_put(dev); 1231 dev_put(dev);
@@ -1225,7 +1234,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
1225 } else 1234 } else
1226 err = can_rx_register(NULL, op->can_id, 1235 err = can_rx_register(NULL, op->can_id,
1227 REGMASK(op->can_id), 1236 REGMASK(op->can_id),
1228 bcm_rx_handler, op, "bcm"); 1237 bcm_rx_handler, op, "bcm", sk);
1229 if (err) { 1238 if (err) {
1230 /* this bcm rx op is broken -> remove it */ 1239 /* this bcm rx op is broken -> remove it */
1231 list_del(&op->list); 1240 list_del(&op->list);
diff --git a/net/can/gw.c b/net/can/gw.c
index 455168718c2e..7056a1a2bb70 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -429,7 +429,7 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data)
429 429
430 /* clear the skb timestamp if not configured the other way */ 430 /* clear the skb timestamp if not configured the other way */
431 if (!(gwj->flags & CGW_FLAGS_CAN_SRC_TSTAMP)) 431 if (!(gwj->flags & CGW_FLAGS_CAN_SRC_TSTAMP))
432 nskb->tstamp.tv64 = 0; 432 nskb->tstamp = 0;
433 433
434 /* send to netdevice */ 434 /* send to netdevice */
435 if (can_send(nskb, gwj->flags & CGW_FLAGS_CAN_ECHO)) 435 if (can_send(nskb, gwj->flags & CGW_FLAGS_CAN_ECHO))
@@ -442,7 +442,7 @@ static inline int cgw_register_filter(struct cgw_job *gwj)
442{ 442{
443 return can_rx_register(gwj->src.dev, gwj->ccgw.filter.can_id, 443 return can_rx_register(gwj->src.dev, gwj->ccgw.filter.can_id,
444 gwj->ccgw.filter.can_mask, can_can_gw_rcv, 444 gwj->ccgw.filter.can_mask, can_can_gw_rcv,
445 gwj, "gw"); 445 gwj, "gw", NULL);
446} 446}
447 447
448static inline void cgw_unregister_filter(struct cgw_job *gwj) 448static inline void cgw_unregister_filter(struct cgw_job *gwj)
diff --git a/net/can/raw.c b/net/can/raw.c
index b075f028d7e2..6dc546a06673 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -190,7 +190,7 @@ static int raw_enable_filters(struct net_device *dev, struct sock *sk,
190 for (i = 0; i < count; i++) { 190 for (i = 0; i < count; i++) {
191 err = can_rx_register(dev, filter[i].can_id, 191 err = can_rx_register(dev, filter[i].can_id,
192 filter[i].can_mask, 192 filter[i].can_mask,
193 raw_rcv, sk, "raw"); 193 raw_rcv, sk, "raw", sk);
194 if (err) { 194 if (err) {
195 /* clean up successfully registered filters */ 195 /* clean up successfully registered filters */
196 while (--i >= 0) 196 while (--i >= 0)
@@ -211,7 +211,7 @@ static int raw_enable_errfilter(struct net_device *dev, struct sock *sk,
211 211
212 if (err_mask) 212 if (err_mask)
213 err = can_rx_register(dev, 0, err_mask | CAN_ERR_FLAG, 213 err = can_rx_register(dev, 0, err_mask | CAN_ERR_FLAG,
214 raw_rcv, sk, "raw"); 214 raw_rcv, sk, "raw", sk);
215 215
216 return err; 216 return err;
217} 217}
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index c822b3ae1bd3..48bb8d95195b 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -315,13 +315,13 @@ int ceph_auth_update_authorizer(struct ceph_auth_client *ac,
315EXPORT_SYMBOL(ceph_auth_update_authorizer); 315EXPORT_SYMBOL(ceph_auth_update_authorizer);
316 316
317int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, 317int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
318 struct ceph_authorizer *a, size_t len) 318 struct ceph_authorizer *a)
319{ 319{
320 int ret = 0; 320 int ret = 0;
321 321
322 mutex_lock(&ac->mutex); 322 mutex_lock(&ac->mutex);
323 if (ac->ops && ac->ops->verify_authorizer_reply) 323 if (ac->ops && ac->ops->verify_authorizer_reply)
324 ret = ac->ops->verify_authorizer_reply(ac, a, len); 324 ret = ac->ops->verify_authorizer_reply(ac, a);
325 mutex_unlock(&ac->mutex); 325 mutex_unlock(&ac->mutex);
326 return ret; 326 return ret;
327} 327}
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index a0905f04bd13..2034fb926670 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -39,56 +39,58 @@ static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
39 return need != 0; 39 return need != 0;
40} 40}
41 41
42static int ceph_x_encrypt_offset(void)
43{
44 return sizeof(u32) + sizeof(struct ceph_x_encrypt_header);
45}
46
42static int ceph_x_encrypt_buflen(int ilen) 47static int ceph_x_encrypt_buflen(int ilen)
43{ 48{
44 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 + 49 return ceph_x_encrypt_offset() + ilen + 16;
45 sizeof(u32);
46} 50}
47 51
48static int ceph_x_encrypt(struct ceph_crypto_key *secret, 52static int ceph_x_encrypt(struct ceph_crypto_key *secret, void *buf,
49 void *ibuf, int ilen, void *obuf, size_t olen) 53 int buf_len, int plaintext_len)
50{ 54{
51 struct ceph_x_encrypt_header head = { 55 struct ceph_x_encrypt_header *hdr = buf + sizeof(u32);
52 .struct_v = 1, 56 int ciphertext_len;
53 .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
54 };
55 size_t len = olen - sizeof(u32);
56 int ret; 57 int ret;
57 58
58 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len, 59 hdr->struct_v = 1;
59 &head, sizeof(head), ibuf, ilen); 60 hdr->magic = cpu_to_le64(CEPHX_ENC_MAGIC);
61
62 ret = ceph_crypt(secret, true, buf + sizeof(u32), buf_len - sizeof(u32),
63 plaintext_len + sizeof(struct ceph_x_encrypt_header),
64 &ciphertext_len);
60 if (ret) 65 if (ret)
61 return ret; 66 return ret;
62 ceph_encode_32(&obuf, len); 67
63 return len + sizeof(u32); 68 ceph_encode_32(&buf, ciphertext_len);
69 return sizeof(u32) + ciphertext_len;
64} 70}
65 71
66static int ceph_x_decrypt(struct ceph_crypto_key *secret, 72static int ceph_x_decrypt(struct ceph_crypto_key *secret, void **p, void *end)
67 void **p, void *end, void **obuf, size_t olen)
68{ 73{
69 struct ceph_x_encrypt_header head; 74 struct ceph_x_encrypt_header *hdr = *p + sizeof(u32);
70 size_t head_len = sizeof(head); 75 int ciphertext_len, plaintext_len;
71 int len, ret; 76 int ret;
72
73 len = ceph_decode_32(p);
74 if (*p + len > end)
75 return -EINVAL;
76 77
77 dout("ceph_x_decrypt len %d\n", len); 78 ceph_decode_32_safe(p, end, ciphertext_len, e_inval);
78 if (*obuf == NULL) { 79 ceph_decode_need(p, end, ciphertext_len, e_inval);
79 *obuf = kmalloc(len, GFP_NOFS);
80 if (!*obuf)
81 return -ENOMEM;
82 olen = len;
83 }
84 80
85 ret = ceph_decrypt2(secret, &head, &head_len, *obuf, &olen, *p, len); 81 ret = ceph_crypt(secret, false, *p, end - *p, ciphertext_len,
82 &plaintext_len);
86 if (ret) 83 if (ret)
87 return ret; 84 return ret;
88 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC) 85
86 if (hdr->struct_v != 1 || le64_to_cpu(hdr->magic) != CEPHX_ENC_MAGIC)
89 return -EPERM; 87 return -EPERM;
90 *p += len; 88
91 return olen; 89 *p += ciphertext_len;
90 return plaintext_len - sizeof(struct ceph_x_encrypt_header);
91
92e_inval:
93 return -EINVAL;
92} 94}
93 95
94/* 96/*
@@ -143,13 +145,10 @@ static int process_one_ticket(struct ceph_auth_client *ac,
143 int type; 145 int type;
144 u8 tkt_struct_v, blob_struct_v; 146 u8 tkt_struct_v, blob_struct_v;
145 struct ceph_x_ticket_handler *th; 147 struct ceph_x_ticket_handler *th;
146 void *dbuf = NULL;
147 void *dp, *dend; 148 void *dp, *dend;
148 int dlen; 149 int dlen;
149 char is_enc; 150 char is_enc;
150 struct timespec validity; 151 struct timespec validity;
151 struct ceph_crypto_key old_key;
152 void *ticket_buf = NULL;
153 void *tp, *tpend; 152 void *tp, *tpend;
154 void **ptp; 153 void **ptp;
155 struct ceph_crypto_key new_session_key; 154 struct ceph_crypto_key new_session_key;
@@ -174,20 +173,17 @@ static int process_one_ticket(struct ceph_auth_client *ac,
174 } 173 }
175 174
176 /* blob for me */ 175 /* blob for me */
177 dlen = ceph_x_decrypt(secret, p, end, &dbuf, 0); 176 dp = *p + ceph_x_encrypt_offset();
178 if (dlen <= 0) { 177 ret = ceph_x_decrypt(secret, p, end);
179 ret = dlen; 178 if (ret < 0)
180 goto out; 179 goto out;
181 } 180 dout(" decrypted %d bytes\n", ret);
182 dout(" decrypted %d bytes\n", dlen); 181 dend = dp + ret;
183 dp = dbuf;
184 dend = dp + dlen;
185 182
186 tkt_struct_v = ceph_decode_8(&dp); 183 tkt_struct_v = ceph_decode_8(&dp);
187 if (tkt_struct_v != 1) 184 if (tkt_struct_v != 1)
188 goto bad; 185 goto bad;
189 186
190 memcpy(&old_key, &th->session_key, sizeof(old_key));
191 ret = ceph_crypto_key_decode(&new_session_key, &dp, dend); 187 ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
192 if (ret) 188 if (ret)
193 goto out; 189 goto out;
@@ -203,15 +199,13 @@ static int process_one_ticket(struct ceph_auth_client *ac,
203 ceph_decode_8_safe(p, end, is_enc, bad); 199 ceph_decode_8_safe(p, end, is_enc, bad);
204 if (is_enc) { 200 if (is_enc) {
205 /* encrypted */ 201 /* encrypted */
206 dout(" encrypted ticket\n"); 202 tp = *p + ceph_x_encrypt_offset();
207 dlen = ceph_x_decrypt(&old_key, p, end, &ticket_buf, 0); 203 ret = ceph_x_decrypt(&th->session_key, p, end);
208 if (dlen < 0) { 204 if (ret < 0)
209 ret = dlen;
210 goto out; 205 goto out;
211 } 206 dout(" encrypted ticket, decrypted %d bytes\n", ret);
212 tp = ticket_buf;
213 ptp = &tp; 207 ptp = &tp;
214 tpend = *ptp + dlen; 208 tpend = tp + ret;
215 } else { 209 } else {
216 /* unencrypted */ 210 /* unencrypted */
217 ptp = p; 211 ptp = p;
@@ -242,8 +236,6 @@ static int process_one_ticket(struct ceph_auth_client *ac,
242 xi->have_keys |= th->service; 236 xi->have_keys |= th->service;
243 237
244out: 238out:
245 kfree(ticket_buf);
246 kfree(dbuf);
247 return ret; 239 return ret;
248 240
249bad: 241bad:
@@ -294,7 +286,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
294{ 286{
295 int maxlen; 287 int maxlen;
296 struct ceph_x_authorize_a *msg_a; 288 struct ceph_x_authorize_a *msg_a;
297 struct ceph_x_authorize_b msg_b; 289 struct ceph_x_authorize_b *msg_b;
298 void *p, *end; 290 void *p, *end;
299 int ret; 291 int ret;
300 int ticket_blob_len = 292 int ticket_blob_len =
@@ -308,8 +300,8 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
308 if (ret) 300 if (ret)
309 goto out_au; 301 goto out_au;
310 302
311 maxlen = sizeof(*msg_a) + sizeof(msg_b) + 303 maxlen = sizeof(*msg_a) + ticket_blob_len +
312 ceph_x_encrypt_buflen(ticket_blob_len); 304 ceph_x_encrypt_buflen(sizeof(*msg_b));
313 dout(" need len %d\n", maxlen); 305 dout(" need len %d\n", maxlen);
314 if (au->buf && au->buf->alloc_len < maxlen) { 306 if (au->buf && au->buf->alloc_len < maxlen) {
315 ceph_buffer_put(au->buf); 307 ceph_buffer_put(au->buf);
@@ -343,18 +335,19 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
343 p += ticket_blob_len; 335 p += ticket_blob_len;
344 end = au->buf->vec.iov_base + au->buf->vec.iov_len; 336 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
345 337
338 msg_b = p + ceph_x_encrypt_offset();
339 msg_b->struct_v = 1;
346 get_random_bytes(&au->nonce, sizeof(au->nonce)); 340 get_random_bytes(&au->nonce, sizeof(au->nonce));
347 msg_b.struct_v = 1; 341 msg_b->nonce = cpu_to_le64(au->nonce);
348 msg_b.nonce = cpu_to_le64(au->nonce); 342 ret = ceph_x_encrypt(&au->session_key, p, end - p, sizeof(*msg_b));
349 ret = ceph_x_encrypt(&au->session_key, &msg_b, sizeof(msg_b),
350 p, end - p);
351 if (ret < 0) 343 if (ret < 0)
352 goto out_au; 344 goto out_au;
345
353 p += ret; 346 p += ret;
347 WARN_ON(p > end);
354 au->buf->vec.iov_len = p - au->buf->vec.iov_base; 348 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
355 dout(" built authorizer nonce %llx len %d\n", au->nonce, 349 dout(" built authorizer nonce %llx len %d\n", au->nonce,
356 (int)au->buf->vec.iov_len); 350 (int)au->buf->vec.iov_len);
357 BUG_ON(au->buf->vec.iov_len > maxlen);
358 return 0; 351 return 0;
359 352
360out_au: 353out_au:
@@ -452,8 +445,9 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
452 if (need & CEPH_ENTITY_TYPE_AUTH) { 445 if (need & CEPH_ENTITY_TYPE_AUTH) {
453 struct ceph_x_authenticate *auth = (void *)(head + 1); 446 struct ceph_x_authenticate *auth = (void *)(head + 1);
454 void *p = auth + 1; 447 void *p = auth + 1;
455 struct ceph_x_challenge_blob tmp; 448 void *enc_buf = xi->auth_authorizer.enc_buf;
456 char tmp_enc[40]; 449 struct ceph_x_challenge_blob *blob = enc_buf +
450 ceph_x_encrypt_offset();
457 u64 *u; 451 u64 *u;
458 452
459 if (p > end) 453 if (p > end)
@@ -464,16 +458,16 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
464 458
465 /* encrypt and hash */ 459 /* encrypt and hash */
466 get_random_bytes(&auth->client_challenge, sizeof(u64)); 460 get_random_bytes(&auth->client_challenge, sizeof(u64));
467 tmp.client_challenge = auth->client_challenge; 461 blob->client_challenge = auth->client_challenge;
468 tmp.server_challenge = cpu_to_le64(xi->server_challenge); 462 blob->server_challenge = cpu_to_le64(xi->server_challenge);
469 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp), 463 ret = ceph_x_encrypt(&xi->secret, enc_buf, CEPHX_AU_ENC_BUF_LEN,
470 tmp_enc, sizeof(tmp_enc)); 464 sizeof(*blob));
471 if (ret < 0) 465 if (ret < 0)
472 return ret; 466 return ret;
473 467
474 auth->struct_v = 1; 468 auth->struct_v = 1;
475 auth->key = 0; 469 auth->key = 0;
476 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++) 470 for (u = (u64 *)enc_buf; u + 1 <= (u64 *)(enc_buf + ret); u++)
477 auth->key ^= *(__le64 *)u; 471 auth->key ^= *(__le64 *)u;
478 dout(" server_challenge %llx client_challenge %llx key %llx\n", 472 dout(" server_challenge %llx client_challenge %llx key %llx\n",
479 xi->server_challenge, le64_to_cpu(auth->client_challenge), 473 xi->server_challenge, le64_to_cpu(auth->client_challenge),
@@ -600,8 +594,8 @@ static int ceph_x_create_authorizer(
600 auth->authorizer = (struct ceph_authorizer *) au; 594 auth->authorizer = (struct ceph_authorizer *) au;
601 auth->authorizer_buf = au->buf->vec.iov_base; 595 auth->authorizer_buf = au->buf->vec.iov_base;
602 auth->authorizer_buf_len = au->buf->vec.iov_len; 596 auth->authorizer_buf_len = au->buf->vec.iov_len;
603 auth->authorizer_reply_buf = au->reply_buf; 597 auth->authorizer_reply_buf = au->enc_buf;
604 auth->authorizer_reply_buf_len = sizeof (au->reply_buf); 598 auth->authorizer_reply_buf_len = CEPHX_AU_ENC_BUF_LEN;
605 auth->sign_message = ac->ops->sign_message; 599 auth->sign_message = ac->ops->sign_message;
606 auth->check_message_signature = ac->ops->check_message_signature; 600 auth->check_message_signature = ac->ops->check_message_signature;
607 601
@@ -629,27 +623,25 @@ static int ceph_x_update_authorizer(
629} 623}
630 624
631static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, 625static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
632 struct ceph_authorizer *a, size_t len) 626 struct ceph_authorizer *a)
633{ 627{
634 struct ceph_x_authorizer *au = (void *)a; 628 struct ceph_x_authorizer *au = (void *)a;
635 int ret = 0; 629 void *p = au->enc_buf;
636 struct ceph_x_authorize_reply reply; 630 struct ceph_x_authorize_reply *reply = p + ceph_x_encrypt_offset();
637 void *preply = &reply; 631 int ret;
638 void *p = au->reply_buf;
639 void *end = p + sizeof(au->reply_buf);
640 632
641 ret = ceph_x_decrypt(&au->session_key, &p, end, &preply, sizeof(reply)); 633 ret = ceph_x_decrypt(&au->session_key, &p, p + CEPHX_AU_ENC_BUF_LEN);
642 if (ret < 0) 634 if (ret < 0)
643 return ret; 635 return ret;
644 if (ret != sizeof(reply)) 636 if (ret != sizeof(*reply))
645 return -EPERM; 637 return -EPERM;
646 638
647 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one)) 639 if (au->nonce + 1 != le64_to_cpu(reply->nonce_plus_one))
648 ret = -EPERM; 640 ret = -EPERM;
649 else 641 else
650 ret = 0; 642 ret = 0;
651 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n", 643 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
652 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret); 644 au->nonce, le64_to_cpu(reply->nonce_plus_one), ret);
653 return ret; 645 return ret;
654} 646}
655 647
@@ -704,35 +696,48 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
704 invalidate_ticket(ac, CEPH_ENTITY_TYPE_AUTH); 696 invalidate_ticket(ac, CEPH_ENTITY_TYPE_AUTH);
705} 697}
706 698
707static int calcu_signature(struct ceph_x_authorizer *au, 699static int calc_signature(struct ceph_x_authorizer *au, struct ceph_msg *msg,
708 struct ceph_msg *msg, __le64 *sig) 700 __le64 *psig)
709{ 701{
702 void *enc_buf = au->enc_buf;
703 struct {
704 __le32 len;
705 __le32 header_crc;
706 __le32 front_crc;
707 __le32 middle_crc;
708 __le32 data_crc;
709 } __packed *sigblock = enc_buf + ceph_x_encrypt_offset();
710 int ret; 710 int ret;
711 char tmp_enc[40]; 711
712 __le32 tmp[5] = { 712 sigblock->len = cpu_to_le32(4*sizeof(u32));
713 cpu_to_le32(16), msg->hdr.crc, msg->footer.front_crc, 713 sigblock->header_crc = msg->hdr.crc;
714 msg->footer.middle_crc, msg->footer.data_crc, 714 sigblock->front_crc = msg->footer.front_crc;
715 }; 715 sigblock->middle_crc = msg->footer.middle_crc;
716 ret = ceph_x_encrypt(&au->session_key, &tmp, sizeof(tmp), 716 sigblock->data_crc = msg->footer.data_crc;
717 tmp_enc, sizeof(tmp_enc)); 717 ret = ceph_x_encrypt(&au->session_key, enc_buf, CEPHX_AU_ENC_BUF_LEN,
718 sizeof(*sigblock));
718 if (ret < 0) 719 if (ret < 0)
719 return ret; 720 return ret;
720 *sig = *(__le64*)(tmp_enc + 4); 721
722 *psig = *(__le64 *)(enc_buf + sizeof(u32));
721 return 0; 723 return 0;
722} 724}
723 725
724static int ceph_x_sign_message(struct ceph_auth_handshake *auth, 726static int ceph_x_sign_message(struct ceph_auth_handshake *auth,
725 struct ceph_msg *msg) 727 struct ceph_msg *msg)
726{ 728{
729 __le64 sig;
727 int ret; 730 int ret;
728 731
729 if (ceph_test_opt(from_msgr(msg->con->msgr), NOMSGSIGN)) 732 if (ceph_test_opt(from_msgr(msg->con->msgr), NOMSGSIGN))
730 return 0; 733 return 0;
731 734
732 ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer, 735 ret = calc_signature((struct ceph_x_authorizer *)auth->authorizer,
733 msg, &msg->footer.sig); 736 msg, &sig);
734 if (ret < 0) 737 if (ret)
735 return ret; 738 return ret;
739
740 msg->footer.sig = sig;
736 msg->footer.flags |= CEPH_MSG_FOOTER_SIGNED; 741 msg->footer.flags |= CEPH_MSG_FOOTER_SIGNED;
737 return 0; 742 return 0;
738} 743}
@@ -746,9 +751,9 @@ static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth,
746 if (ceph_test_opt(from_msgr(msg->con->msgr), NOMSGSIGN)) 751 if (ceph_test_opt(from_msgr(msg->con->msgr), NOMSGSIGN))
747 return 0; 752 return 0;
748 753
749 ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer, 754 ret = calc_signature((struct ceph_x_authorizer *)auth->authorizer,
750 msg, &sig_check); 755 msg, &sig_check);
751 if (ret < 0) 756 if (ret)
752 return ret; 757 return ret;
753 if (sig_check == msg->footer.sig) 758 if (sig_check == msg->footer.sig)
754 return 0; 759 return 0;
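
The auth_x.c conversions above all follow one buffer convention: rather than encrypting from a temporary into a separate output buffer, the caller builds the plaintext directly inside the destination buffer at ceph_x_encrypt_offset(), and ceph_x_encrypt() then encrypts it in place and fills in the leading length word. A condensed caller-side sketch of that layout; the payload struct and enc_buf are borrowed from the hunks above purely to illustrate the convention, not to mirror any single call site exactly.

	/*
	 * enc_buf layout around a ceph_x_encrypt() call:
	 *
	 *   [ u32 len ][ struct ceph_x_encrypt_header ][ payload ... ]
	 *   ^buf        ^buf + sizeof(u32)              ^buf + ceph_x_encrypt_offset()
	 */
	void *enc_buf = au->enc_buf;		/* CEPHX_AU_ENC_BUF_LEN bytes */
	struct ceph_x_authorize_b *msg_b = enc_buf + ceph_x_encrypt_offset();
	int ret;

	msg_b->struct_v = 1;
	msg_b->nonce = cpu_to_le64(au->nonce);

	ret = ceph_x_encrypt(&au->session_key, enc_buf,
			     CEPHX_AU_ENC_BUF_LEN, sizeof(*msg_b));
	if (ret < 0)
		return ret;
	/* ret == sizeof(u32) + ciphertext length: bytes now occupied in enc_buf */
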
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
index 21a5af904bae..48e9ad41bd2a 100644
--- a/net/ceph/auth_x.h
+++ b/net/ceph/auth_x.h
@@ -24,6 +24,7 @@ struct ceph_x_ticket_handler {
24 unsigned long renew_after, expires; 24 unsigned long renew_after, expires;
25}; 25};
26 26
27#define CEPHX_AU_ENC_BUF_LEN 128 /* big enough for encrypted blob */
27 28
28struct ceph_x_authorizer { 29struct ceph_x_authorizer {
29 struct ceph_authorizer base; 30 struct ceph_authorizer base;
@@ -32,7 +33,7 @@ struct ceph_x_authorizer {
32 unsigned int service; 33 unsigned int service;
33 u64 nonce; 34 u64 nonce;
34 u64 secret_id; 35 u64 secret_id;
35 char reply_buf[128]; /* big enough for encrypted blob */ 36 char enc_buf[CEPHX_AU_ENC_BUF_LEN] __aligned(8);
36}; 37};
37 38
38struct ceph_x_info { 39struct ceph_x_info {
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 464e88599b9d..108533859a53 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -230,6 +230,7 @@ enum {
230 Opt_osdkeepalivetimeout, 230 Opt_osdkeepalivetimeout,
231 Opt_mount_timeout, 231 Opt_mount_timeout,
232 Opt_osd_idle_ttl, 232 Opt_osd_idle_ttl,
233 Opt_osd_request_timeout,
233 Opt_last_int, 234 Opt_last_int,
234 /* int args above */ 235 /* int args above */
235 Opt_fsid, 236 Opt_fsid,
@@ -256,6 +257,7 @@ static match_table_t opt_tokens = {
256 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"}, 257 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
257 {Opt_mount_timeout, "mount_timeout=%d"}, 258 {Opt_mount_timeout, "mount_timeout=%d"},
258 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, 259 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
260 {Opt_osd_request_timeout, "osd_request_timeout=%d"},
259 /* int args above */ 261 /* int args above */
260 {Opt_fsid, "fsid=%s"}, 262 {Opt_fsid, "fsid=%s"},
261 {Opt_name, "name=%s"}, 263 {Opt_name, "name=%s"},
@@ -361,6 +363,7 @@ ceph_parse_options(char *options, const char *dev_name,
361 opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; 363 opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
362 opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; 364 opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
363 opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; 365 opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
366 opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
364 367
365 /* get mon ip(s) */ 368 /* get mon ip(s) */
366 /* ip1[:port1][,ip2[:port2]...] */ 369 /* ip1[:port1][,ip2[:port2]...] */
@@ -473,6 +476,15 @@ ceph_parse_options(char *options, const char *dev_name,
473 } 476 }
474 opt->mount_timeout = msecs_to_jiffies(intval * 1000); 477 opt->mount_timeout = msecs_to_jiffies(intval * 1000);
475 break; 478 break;
479 case Opt_osd_request_timeout:
480 /* 0 is "wait forever" (i.e. infinite timeout) */
481 if (intval < 0 || intval > INT_MAX / 1000) {
482 pr_err("osd_request_timeout out of range\n");
483 err = -EINVAL;
484 goto out;
485 }
486 opt->osd_request_timeout = msecs_to_jiffies(intval * 1000);
487 break;
476 488
477 case Opt_share: 489 case Opt_share:
478 opt->flags &= ~CEPH_OPT_NOSHARE; 490 opt->flags &= ~CEPH_OPT_NOSHARE;
@@ -557,6 +569,9 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
557 if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) 569 if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
558 seq_printf(m, "osdkeepalivetimeout=%d,", 570 seq_printf(m, "osdkeepalivetimeout=%d,",
559 jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000); 571 jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000);
572 if (opt->osd_request_timeout != CEPH_OSD_REQUEST_TIMEOUT_DEFAULT)
573 seq_printf(m, "osd_request_timeout=%d,",
574 jiffies_to_msecs(opt->osd_request_timeout) / 1000);
560 575
561 /* drop redundant comma */ 576 /* drop redundant comma */
562 if (m->count != pos) 577 if (m->count != pos)
diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c
index 50f040fdb2a9..b9233b990399 100644
--- a/net/ceph/cls_lock_client.c
+++ b/net/ceph/cls_lock_client.c
@@ -69,8 +69,8 @@ int ceph_cls_lock(struct ceph_osd_client *osdc,
69 dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n", 69 dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n",
70 __func__, lock_name, type, cookie, tag, desc, flags); 70 __func__, lock_name, type, cookie, tag, desc, flags);
71 ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock", 71 ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock",
72 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 72 CEPH_OSD_FLAG_WRITE, lock_op_page,
73 lock_op_page, lock_op_buf_size, NULL, NULL); 73 lock_op_buf_size, NULL, NULL);
74 74
75 dout("%s: status %d\n", __func__, ret); 75 dout("%s: status %d\n", __func__, ret);
76 __free_page(lock_op_page); 76 __free_page(lock_op_page);
@@ -117,8 +117,8 @@ int ceph_cls_unlock(struct ceph_osd_client *osdc,
117 117
118 dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie); 118 dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie);
119 ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock", 119 ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock",
120 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 120 CEPH_OSD_FLAG_WRITE, unlock_op_page,
121 unlock_op_page, unlock_op_buf_size, NULL, NULL); 121 unlock_op_buf_size, NULL, NULL);
122 122
123 dout("%s: status %d\n", __func__, ret); 123 dout("%s: status %d\n", __func__, ret);
124 __free_page(unlock_op_page); 124 __free_page(unlock_op_page);
@@ -170,8 +170,8 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc,
170 dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name, 170 dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name,
171 cookie, ENTITY_NAME(*locker)); 171 cookie, ENTITY_NAME(*locker));
172 ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock", 172 ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock",
173 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 173 CEPH_OSD_FLAG_WRITE, break_op_page,
174 break_op_page, break_op_buf_size, NULL, NULL); 174 break_op_buf_size, NULL, NULL);
175 175
176 dout("%s: status %d\n", __func__, ret); 176 dout("%s: status %d\n", __func__, ret);
177 __free_page(break_op_page); 177 __free_page(break_op_page);
@@ -278,7 +278,7 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc,
278 int get_info_op_buf_size; 278 int get_info_op_buf_size;
279 int name_len = strlen(lock_name); 279 int name_len = strlen(lock_name);
280 struct page *get_info_op_page, *reply_page; 280 struct page *get_info_op_page, *reply_page;
281 size_t reply_len; 281 size_t reply_len = PAGE_SIZE;
282 void *p, *end; 282 void *p, *end;
283 int ret; 283 int ret;
284 284
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 80d7c3a97cb8..5bf94c04f645 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -45,7 +45,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
45 45
46void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) 46void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
47{ 47{
48 kfree(b->h.perm);
49 kfree(b->h.items); 48 kfree(b->h.items);
50 kfree(b); 49 kfree(b);
51} 50}
@@ -54,14 +53,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b)
54{ 53{
55 kfree(b->item_weights); 54 kfree(b->item_weights);
56 kfree(b->sum_weights); 55 kfree(b->sum_weights);
57 kfree(b->h.perm);
58 kfree(b->h.items); 56 kfree(b->h.items);
59 kfree(b); 57 kfree(b);
60} 58}
61 59
62void crush_destroy_bucket_tree(struct crush_bucket_tree *b) 60void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
63{ 61{
64 kfree(b->h.perm);
65 kfree(b->h.items); 62 kfree(b->h.items);
66 kfree(b->node_weights); 63 kfree(b->node_weights);
67 kfree(b); 64 kfree(b);
@@ -71,7 +68,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
71{ 68{
72 kfree(b->straws); 69 kfree(b->straws);
73 kfree(b->item_weights); 70 kfree(b->item_weights);
74 kfree(b->h.perm);
75 kfree(b->h.items); 71 kfree(b->h.items);
76 kfree(b); 72 kfree(b);
77} 73}
@@ -79,7 +75,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
79void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b) 75void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b)
80{ 76{
81 kfree(b->item_weights); 77 kfree(b->item_weights);
82 kfree(b->h.perm);
83 kfree(b->h.items); 78 kfree(b->h.items);
84 kfree(b); 79 kfree(b);
85} 80}
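
The kfree(b->h.perm) calls disappear from every bucket destructor because the permutation scratch (perm, perm_x, perm_n) no longer lives in the shared crush_map: the mapper.c changes below move it into a per-computation struct crush_work that the caller sets up with crush_init_workspace() and passes to crush_do_rule(), which also carves its three result-sized scratch vectors out of the same buffer. A caller-side sketch under those assumptions; the allocator and flags are illustrative, and the size arithmetic follows crush_do_rule() below.

	size_t wsize = map->working_size + 3 * result_max * sizeof(int);
	void *cwin = kmalloc(wsize, GFP_NOIO);
	int n;

	if (!cwin)
		return -ENOMEM;

	crush_init_workspace(map, cwin);	/* must precede the first crush_do_rule() */
	n = crush_do_rule(map, ruleno, x, result, result_max,
			  weight, weight_max, cwin);
	kfree(cwin);

If the workspace is cached between calls it has to be per-thread, and re-initialized when the map changes, as the comment added alongside crush_init_workspace() spells out.
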
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index a421e905331a..b5cd8c21bfdf 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -17,10 +17,12 @@
17# include <linux/kernel.h> 17# include <linux/kernel.h>
18# include <linux/crush/crush.h> 18# include <linux/crush/crush.h>
19# include <linux/crush/hash.h> 19# include <linux/crush/hash.h>
20# include <linux/crush/mapper.h>
20#else 21#else
21# include "crush_compat.h" 22# include "crush_compat.h"
22# include "crush.h" 23# include "crush.h"
23# include "hash.h" 24# include "hash.h"
25# include "mapper.h"
24#endif 26#endif
25#include "crush_ln_table.h" 27#include "crush_ln_table.h"
26 28
@@ -52,7 +54,6 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size
52 return -1; 54 return -1;
53} 55}
54 56
55
56/* 57/*
57 * bucket choose methods 58 * bucket choose methods
58 * 59 *
@@ -70,59 +71,60 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size
70 * Since this is expensive, we optimize for the r=0 case, which 71 * Since this is expensive, we optimize for the r=0 case, which
71 * captures the vast majority of calls. 72 * captures the vast majority of calls.
72 */ 73 */
73static int bucket_perm_choose(struct crush_bucket *bucket, 74static int bucket_perm_choose(const struct crush_bucket *bucket,
75 struct crush_work_bucket *work,
74 int x, int r) 76 int x, int r)
75{ 77{
76 unsigned int pr = r % bucket->size; 78 unsigned int pr = r % bucket->size;
77 unsigned int i, s; 79 unsigned int i, s;
78 80
79 /* start a new permutation if @x has changed */ 81 /* start a new permutation if @x has changed */
80 if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) { 82 if (work->perm_x != (__u32)x || work->perm_n == 0) {
81 dprintk("bucket %d new x=%d\n", bucket->id, x); 83 dprintk("bucket %d new x=%d\n", bucket->id, x);
82 bucket->perm_x = x; 84 work->perm_x = x;
83 85
84 /* optimize common r=0 case */ 86 /* optimize common r=0 case */
85 if (pr == 0) { 87 if (pr == 0) {
86 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) % 88 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
87 bucket->size; 89 bucket->size;
88 bucket->perm[0] = s; 90 work->perm[0] = s;
89 bucket->perm_n = 0xffff; /* magic value, see below */ 91 work->perm_n = 0xffff; /* magic value, see below */
90 goto out; 92 goto out;
91 } 93 }
92 94
93 for (i = 0; i < bucket->size; i++) 95 for (i = 0; i < bucket->size; i++)
94 bucket->perm[i] = i; 96 work->perm[i] = i;
95 bucket->perm_n = 0; 97 work->perm_n = 0;
96 } else if (bucket->perm_n == 0xffff) { 98 } else if (work->perm_n == 0xffff) {
97 /* clean up after the r=0 case above */ 99 /* clean up after the r=0 case above */
98 for (i = 1; i < bucket->size; i++) 100 for (i = 1; i < bucket->size; i++)
99 bucket->perm[i] = i; 101 work->perm[i] = i;
100 bucket->perm[bucket->perm[0]] = 0; 102 work->perm[work->perm[0]] = 0;
101 bucket->perm_n = 1; 103 work->perm_n = 1;
102 } 104 }
103 105
104 /* calculate permutation up to pr */ 106 /* calculate permutation up to pr */
105 for (i = 0; i < bucket->perm_n; i++) 107 for (i = 0; i < work->perm_n; i++)
106 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); 108 dprintk(" perm_choose have %d: %d\n", i, work->perm[i]);
107 while (bucket->perm_n <= pr) { 109 while (work->perm_n <= pr) {
108 unsigned int p = bucket->perm_n; 110 unsigned int p = work->perm_n;
109 /* no point in swapping the final entry */ 111 /* no point in swapping the final entry */
110 if (p < bucket->size - 1) { 112 if (p < bucket->size - 1) {
111 i = crush_hash32_3(bucket->hash, x, bucket->id, p) % 113 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
112 (bucket->size - p); 114 (bucket->size - p);
113 if (i) { 115 if (i) {
114 unsigned int t = bucket->perm[p + i]; 116 unsigned int t = work->perm[p + i];
115 bucket->perm[p + i] = bucket->perm[p]; 117 work->perm[p + i] = work->perm[p];
116 bucket->perm[p] = t; 118 work->perm[p] = t;
117 } 119 }
118 dprintk(" perm_choose swap %d with %d\n", p, p+i); 120 dprintk(" perm_choose swap %d with %d\n", p, p+i);
119 } 121 }
120 bucket->perm_n++; 122 work->perm_n++;
121 } 123 }
122 for (i = 0; i < bucket->size; i++) 124 for (i = 0; i < bucket->size; i++)
123 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]); 125 dprintk(" perm_choose %d: %d\n", i, work->perm[i]);
124 126
125 s = bucket->perm[pr]; 127 s = work->perm[pr];
126out: 128out:
127 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id, 129 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
128 bucket->size, x, r, pr, s); 130 bucket->size, x, r, pr, s);
@@ -130,14 +132,14 @@ out:
130} 132}
131 133
132/* uniform */ 134/* uniform */
133static int bucket_uniform_choose(struct crush_bucket_uniform *bucket, 135static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket,
134 int x, int r) 136 struct crush_work_bucket *work, int x, int r)
135{ 137{
136 return bucket_perm_choose(&bucket->h, x, r); 138 return bucket_perm_choose(&bucket->h, work, x, r);
137} 139}
138 140
139/* list */ 141/* list */
140static int bucket_list_choose(struct crush_bucket_list *bucket, 142static int bucket_list_choose(const struct crush_bucket_list *bucket,
141 int x, int r) 143 int x, int r)
142{ 144{
143 int i; 145 int i;
@@ -153,8 +155,9 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,
153 w *= bucket->sum_weights[i]; 155 w *= bucket->sum_weights[i];
154 w = w >> 16; 156 w = w >> 16;
155 /*dprintk(" scaled %llx\n", w);*/ 157 /*dprintk(" scaled %llx\n", w);*/
156 if (w < bucket->item_weights[i]) 158 if (w < bucket->item_weights[i]) {
157 return bucket->h.items[i]; 159 return bucket->h.items[i];
160 }
158 } 161 }
159 162
160 dprintk("bad list sums for bucket %d\n", bucket->h.id); 163 dprintk("bad list sums for bucket %d\n", bucket->h.id);
@@ -190,7 +193,7 @@ static int terminal(int x)
190 return x & 1; 193 return x & 1;
191} 194}
192 195
193static int bucket_tree_choose(struct crush_bucket_tree *bucket, 196static int bucket_tree_choose(const struct crush_bucket_tree *bucket,
194 int x, int r) 197 int x, int r)
195{ 198{
196 int n; 199 int n;
@@ -222,7 +225,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,
222 225
223/* straw */ 226/* straw */
224 227
225static int bucket_straw_choose(struct crush_bucket_straw *bucket, 228static int bucket_straw_choose(const struct crush_bucket_straw *bucket,
226 int x, int r) 229 int x, int r)
227{ 230{
228 __u32 i; 231 __u32 i;
@@ -299,7 +302,7 @@ static __u64 crush_ln(unsigned int xin)
299 * 302 *
300 */ 303 */
301 304
302static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket, 305static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
303 int x, int r) 306 int x, int r)
304{ 307{
305 unsigned int i, high = 0; 308 unsigned int i, high = 0;
@@ -342,37 +345,42 @@ static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
342 high_draw = draw; 345 high_draw = draw;
343 } 346 }
344 } 347 }
348
345 return bucket->h.items[high]; 349 return bucket->h.items[high];
346} 350}
347 351
348 352
349static int crush_bucket_choose(struct crush_bucket *in, int x, int r) 353static int crush_bucket_choose(const struct crush_bucket *in,
354 struct crush_work_bucket *work,
355 int x, int r)
350{ 356{
351 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); 357 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
352 BUG_ON(in->size == 0); 358 BUG_ON(in->size == 0);
353 switch (in->alg) { 359 switch (in->alg) {
354 case CRUSH_BUCKET_UNIFORM: 360 case CRUSH_BUCKET_UNIFORM:
355 return bucket_uniform_choose((struct crush_bucket_uniform *)in, 361 return bucket_uniform_choose(
356 x, r); 362 (const struct crush_bucket_uniform *)in,
363 work, x, r);
357 case CRUSH_BUCKET_LIST: 364 case CRUSH_BUCKET_LIST:
358 return bucket_list_choose((struct crush_bucket_list *)in, 365 return bucket_list_choose((const struct crush_bucket_list *)in,
359 x, r); 366 x, r);
360 case CRUSH_BUCKET_TREE: 367 case CRUSH_BUCKET_TREE:
361 return bucket_tree_choose((struct crush_bucket_tree *)in, 368 return bucket_tree_choose((const struct crush_bucket_tree *)in,
362 x, r); 369 x, r);
363 case CRUSH_BUCKET_STRAW: 370 case CRUSH_BUCKET_STRAW:
364 return bucket_straw_choose((struct crush_bucket_straw *)in, 371 return bucket_straw_choose(
365 x, r); 372 (const struct crush_bucket_straw *)in,
373 x, r);
366 case CRUSH_BUCKET_STRAW2: 374 case CRUSH_BUCKET_STRAW2:
367 return bucket_straw2_choose((struct crush_bucket_straw2 *)in, 375 return bucket_straw2_choose(
368 x, r); 376 (const struct crush_bucket_straw2 *)in,
377 x, r);
369 default: 378 default:
370 dprintk("unknown bucket %d alg %d\n", in->id, in->alg); 379 dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
371 return in->items[0]; 380 return in->items[0];
372 } 381 }
373} 382}
374 383
375
376/* 384/*
377 * true if device is marked "out" (failed, fully offloaded) 385 * true if device is marked "out" (failed, fully offloaded)
378 * of the cluster 386 * of the cluster
@@ -414,7 +422,8 @@ static int is_out(const struct crush_map *map,
414 * @parent_r: r value passed from the parent 422 * @parent_r: r value passed from the parent
415 */ 423 */
416static int crush_choose_firstn(const struct crush_map *map, 424static int crush_choose_firstn(const struct crush_map *map,
417 struct crush_bucket *bucket, 425 struct crush_work *work,
426 const struct crush_bucket *bucket,
418 const __u32 *weight, int weight_max, 427 const __u32 *weight, int weight_max,
419 int x, int numrep, int type, 428 int x, int numrep, int type,
420 int *out, int outpos, 429 int *out, int outpos,
@@ -432,7 +441,7 @@ static int crush_choose_firstn(const struct crush_map *map,
432 int rep; 441 int rep;
433 unsigned int ftotal, flocal; 442 unsigned int ftotal, flocal;
434 int retry_descent, retry_bucket, skip_rep; 443 int retry_descent, retry_bucket, skip_rep;
435 struct crush_bucket *in = bucket; 444 const struct crush_bucket *in = bucket;
436 int r; 445 int r;
437 int i; 446 int i;
438 int item = 0; 447 int item = 0;
@@ -471,9 +480,13 @@ static int crush_choose_firstn(const struct crush_map *map,
471 if (local_fallback_retries > 0 && 480 if (local_fallback_retries > 0 &&
472 flocal >= (in->size>>1) && 481 flocal >= (in->size>>1) &&
473 flocal > local_fallback_retries) 482 flocal > local_fallback_retries)
474 item = bucket_perm_choose(in, x, r); 483 item = bucket_perm_choose(
484 in, work->work[-1-in->id],
485 x, r);
475 else 486 else
476 item = crush_bucket_choose(in, x, r); 487 item = crush_bucket_choose(
488 in, work->work[-1-in->id],
489 x, r);
477 if (item >= map->max_devices) { 490 if (item >= map->max_devices) {
478 dprintk(" bad item %d\n", item); 491 dprintk(" bad item %d\n", item);
479 skip_rep = 1; 492 skip_rep = 1;
@@ -516,19 +529,21 @@ static int crush_choose_firstn(const struct crush_map *map,
516 sub_r = r >> (vary_r-1); 529 sub_r = r >> (vary_r-1);
517 else 530 else
518 sub_r = 0; 531 sub_r = 0;
519 if (crush_choose_firstn(map, 532 if (crush_choose_firstn(
520 map->buckets[-1-item], 533 map,
521 weight, weight_max, 534 work,
522 x, stable ? 1 : outpos+1, 0, 535 map->buckets[-1-item],
523 out2, outpos, count, 536 weight, weight_max,
524 recurse_tries, 0, 537 x, stable ? 1 : outpos+1, 0,
525 local_retries, 538 out2, outpos, count,
526 local_fallback_retries, 539 recurse_tries, 0,
527 0, 540 local_retries,
528 vary_r, 541 local_fallback_retries,
529 stable, 542 0,
530 NULL, 543 vary_r,
531 sub_r) <= outpos) 544 stable,
545 NULL,
546 sub_r) <= outpos)
532 /* didn't get leaf */ 547 /* didn't get leaf */
533 reject = 1; 548 reject = 1;
534 } else { 549 } else {
@@ -537,14 +552,12 @@ static int crush_choose_firstn(const struct crush_map *map,
537 } 552 }
538 } 553 }
539 554
540 if (!reject) { 555 if (!reject && !collide) {
541 /* out? */ 556 /* out? */
542 if (itemtype == 0) 557 if (itemtype == 0)
543 reject = is_out(map, weight, 558 reject = is_out(map, weight,
544 weight_max, 559 weight_max,
545 item, x); 560 item, x);
546 else
547 reject = 0;
548 } 561 }
549 562
550reject: 563reject:
@@ -598,7 +611,8 @@ reject:
598 * 611 *
599 */ 612 */
600static void crush_choose_indep(const struct crush_map *map, 613static void crush_choose_indep(const struct crush_map *map,
601 struct crush_bucket *bucket, 614 struct crush_work *work,
615 const struct crush_bucket *bucket,
602 const __u32 *weight, int weight_max, 616 const __u32 *weight, int weight_max,
603 int x, int left, int numrep, int type, 617 int x, int left, int numrep, int type,
604 int *out, int outpos, 618 int *out, int outpos,
@@ -608,7 +622,7 @@ static void crush_choose_indep(const struct crush_map *map,
608 int *out2, 622 int *out2,
609 int parent_r) 623 int parent_r)
610{ 624{
611 struct crush_bucket *in = bucket; 625 const struct crush_bucket *in = bucket;
612 int endpos = outpos + left; 626 int endpos = outpos + left;
613 int rep; 627 int rep;
614 unsigned int ftotal; 628 unsigned int ftotal;
@@ -676,7 +690,9 @@ static void crush_choose_indep(const struct crush_map *map,
676 break; 690 break;
677 } 691 }
678 692
679 item = crush_bucket_choose(in, x, r); 693 item = crush_bucket_choose(
694 in, work->work[-1-in->id],
695 x, r);
680 if (item >= map->max_devices) { 696 if (item >= map->max_devices) {
681 dprintk(" bad item %d\n", item); 697 dprintk(" bad item %d\n", item);
682 out[rep] = CRUSH_ITEM_NONE; 698 out[rep] = CRUSH_ITEM_NONE;
@@ -722,13 +738,15 @@ static void crush_choose_indep(const struct crush_map *map,
722 738
723 if (recurse_to_leaf) { 739 if (recurse_to_leaf) {
724 if (item < 0) { 740 if (item < 0) {
725 crush_choose_indep(map, 741 crush_choose_indep(
726 map->buckets[-1-item], 742 map,
727 weight, weight_max, 743 work,
728 x, 1, numrep, 0, 744 map->buckets[-1-item],
729 out2, rep, 745 weight, weight_max,
730 recurse_tries, 0, 746 x, 1, numrep, 0,
731 0, NULL, r); 747 out2, rep,
748 recurse_tries, 0,
749 0, NULL, r);
732 if (out2[rep] == CRUSH_ITEM_NONE) { 750 if (out2[rep] == CRUSH_ITEM_NONE) {
733 /* placed nothing; no leaf */ 751 /* placed nothing; no leaf */
734 break; 752 break;
@@ -779,6 +797,53 @@ static void crush_choose_indep(const struct crush_map *map,
779#endif 797#endif
780} 798}
781 799
800
801/*
802 * This takes a chunk of memory and sets it up to be a shiny new
803 * working area for a CRUSH placement computation. It must be called
804 * on any newly allocated memory before passing it in to
805 * crush_do_rule. It may be used repeatedly after that, so long as the
806 * map has not changed. If the map /has/ changed, you must make sure
807 * the working size is no smaller than what was allocated and re-run
808 * crush_init_workspace.
809 *
810 * If you do retain the working space between calls to crush, make it
811 * thread-local.
812 */
813void crush_init_workspace(const struct crush_map *map, void *v)
814{
815 struct crush_work *w = v;
816 __s32 b;
817
818 /*
819 * We work by moving through the available space and setting
820 * values and pointers as we go.
821 *
822 * It's a bit like Forth's use of the 'allot' word since we
823 * set the pointer first and then reserve the space for it to
824 * point to by incrementing the point.
825 */
826 v += sizeof(struct crush_work *);
827 w->work = v;
828 v += map->max_buckets * sizeof(struct crush_work_bucket *);
829 for (b = 0; b < map->max_buckets; ++b) {
830 if (!map->buckets[b])
831 continue;
832
833 w->work[b] = v;
834 switch (map->buckets[b]->alg) {
835 default:
836 v += sizeof(struct crush_work_bucket);
837 break;
838 }
839 w->work[b]->perm_x = 0;
840 w->work[b]->perm_n = 0;
841 w->work[b]->perm = v;
842 v += map->buckets[b]->size * sizeof(__u32);
843 }
844 BUG_ON(v - (void *)w != map->working_size);
845}
846
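
The workspace API introduced above changes how callers drive CRUSH: memory must be sized with crush_work_size(), initialized once with crush_init_workspace(), and may then be reused for any number of crush_do_rule() calls against the same map. A minimal sketch of that calling sequence follows; the helper name, GFP flag and error handling are assumptions, and serialization is left to the caller (the osdmap hunks further down use crush_workspace_mutex for this).

#include <linux/slab.h>
#include <linux/crush/crush.h>
#include <linux/crush/mapper.h>

/* Sketch of the new calling convention; not part of the patch. */
static int example_map_pg(const struct crush_map *crush, int ruleno, int x,
			  int *result, int result_max,
			  const __u32 *weight, int weight_max)
{
	size_t wsize = crush_work_size(crush, result_max);
	void *workspace = kmalloc(wsize, GFP_NOIO);	/* GFP flag assumed */
	int len;

	if (!workspace)
		return -ENOMEM;

	/* Must run on fresh memory before the first crush_do_rule(). */
	crush_init_workspace(crush, workspace);

	/*
	 * The workspace may be reused for further calls while the map is
	 * unchanged, but concurrent callers must not share it; do_crush()
	 * in osdmap.c serializes on crush_workspace_mutex instead of
	 * allocating per call like this sketch does.
	 */
	len = crush_do_rule(crush, ruleno, x, result, result_max,
			    weight, weight_max, workspace);

	kfree(workspace);
	return len;
}
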
782/** 847/**
783 * crush_do_rule - calculate a mapping with the given input and rule 848 * crush_do_rule - calculate a mapping with the given input and rule
784 * @map: the crush_map 849 * @map: the crush_map
@@ -788,24 +853,25 @@ static void crush_choose_indep(const struct crush_map *map,
788 * @result_max: maximum result size 853 * @result_max: maximum result size
789 * @weight: weight vector (for map leaves) 854 * @weight: weight vector (for map leaves)
790 * @weight_max: size of weight vector 855 * @weight_max: size of weight vector
791 * @scratch: scratch vector for private use; must be >= 3 * result_max 856 * @cwin: pointer to at least crush_work_size() bytes of memory
792 */ 857 */
793int crush_do_rule(const struct crush_map *map, 858int crush_do_rule(const struct crush_map *map,
794 int ruleno, int x, int *result, int result_max, 859 int ruleno, int x, int *result, int result_max,
795 const __u32 *weight, int weight_max, 860 const __u32 *weight, int weight_max,
796 int *scratch) 861 void *cwin)
797{ 862{
798 int result_len; 863 int result_len;
799 int *a = scratch; 864 struct crush_work *cw = cwin;
800 int *b = scratch + result_max; 865 int *a = cwin + map->working_size;
801 int *c = scratch + result_max*2; 866 int *b = a + result_max;
867 int *c = b + result_max;
868 int *w = a;
869 int *o = b;
802 int recurse_to_leaf; 870 int recurse_to_leaf;
803 int *w;
804 int wsize = 0; 871 int wsize = 0;
805 int *o;
806 int osize; 872 int osize;
807 int *tmp; 873 int *tmp;
808 struct crush_rule *rule; 874 const struct crush_rule *rule;
809 __u32 step; 875 __u32 step;
810 int i, j; 876 int i, j;
811 int numrep; 877 int numrep;
@@ -833,12 +899,10 @@ int crush_do_rule(const struct crush_map *map,
833 899
834 rule = map->rules[ruleno]; 900 rule = map->rules[ruleno];
835 result_len = 0; 901 result_len = 0;
836 w = a;
837 o = b;
838 902
839 for (step = 0; step < rule->len; step++) { 903 for (step = 0; step < rule->len; step++) {
840 int firstn = 0; 904 int firstn = 0;
841 struct crush_rule_step *curstep = &rule->steps[step]; 905 const struct crush_rule_step *curstep = &rule->steps[step];
842 906
843 switch (curstep->op) { 907 switch (curstep->op) {
844 case CRUSH_RULE_TAKE: 908 case CRUSH_RULE_TAKE:
@@ -934,6 +998,7 @@ int crush_do_rule(const struct crush_map *map,
934 recurse_tries = choose_tries; 998 recurse_tries = choose_tries;
935 osize += crush_choose_firstn( 999 osize += crush_choose_firstn(
936 map, 1000 map,
1001 cw,
937 map->buckets[bno], 1002 map->buckets[bno],
938 weight, weight_max, 1003 weight, weight_max,
939 x, numrep, 1004 x, numrep,
@@ -954,6 +1019,7 @@ int crush_do_rule(const struct crush_map *map,
954 numrep : (result_max-osize)); 1019 numrep : (result_max-osize));
955 crush_choose_indep( 1020 crush_choose_indep(
956 map, 1021 map,
1022 cw,
957 map->buckets[bno], 1023 map->buckets[bno],
958 weight, weight_max, 1024 weight, weight_max,
959 x, out_size, numrep, 1025 x, out_size, numrep,
@@ -995,5 +1061,6 @@ int crush_do_rule(const struct crush_map *map,
995 break; 1061 break;
996 } 1062 }
997 } 1063 }
1064
998 return result_len; 1065 return result_len;
999} 1066}
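
For orientation, crush_init_workspace() above walks the buffer in the order sketched below. The structure definitions live in the crush headers rather than in this file; the shapes shown here are inferred from the field accesses (work, perm_x, perm_n, perm) and should be read as an assumption, not a quote.

/*
 * Assumed layout of the working area set up above:
 *
 *   struct crush_work                              <- w
 *   struct crush_work_bucket *[map->max_buckets]   <- w->work
 *   then, for every bucket that exists:
 *     struct crush_work_bucket                     <- w->work[b]
 *     __u32 perm[bucket->size]                     <- w->work[b]->perm
 */
struct crush_work_bucket {
	__u32 perm_x;	/* @x for which the permutation below is valid */
	__u32 perm_n;	/* how many entries of perm are defined */
	__u32 *perm;	/* permutation of the bucket's items */
};

struct crush_work {
	struct crush_work_bucket **work;	/* one slot per bucket id */
};
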
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index db2847ac5f12..46008d5ac504 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -3,24 +3,72 @@
3 3
4#include <linux/err.h> 4#include <linux/err.h>
5#include <linux/scatterlist.h> 5#include <linux/scatterlist.h>
6#include <linux/sched.h>
6#include <linux/slab.h> 7#include <linux/slab.h>
7#include <crypto/aes.h> 8#include <crypto/aes.h>
8#include <crypto/skcipher.h> 9#include <crypto/skcipher.h>
9#include <linux/key-type.h> 10#include <linux/key-type.h>
11#include <linux/sched/mm.h>
10 12
11#include <keys/ceph-type.h> 13#include <keys/ceph-type.h>
12#include <keys/user-type.h> 14#include <keys/user-type.h>
13#include <linux/ceph/decode.h> 15#include <linux/ceph/decode.h>
14#include "crypto.h" 16#include "crypto.h"
15 17
18/*
19 * Set ->key and ->tfm. The rest of the key should be filled in before
20 * this function is called.
21 */
22static int set_secret(struct ceph_crypto_key *key, void *buf)
23{
24 unsigned int noio_flag;
25 int ret;
26
27 key->key = NULL;
28 key->tfm = NULL;
29
30 switch (key->type) {
31 case CEPH_CRYPTO_NONE:
32 return 0; /* nothing to do */
33 case CEPH_CRYPTO_AES:
34 break;
35 default:
36 return -ENOTSUPP;
37 }
38
39 WARN_ON(!key->len);
40 key->key = kmemdup(buf, key->len, GFP_NOIO);
41 if (!key->key) {
42 ret = -ENOMEM;
43 goto fail;
44 }
45
46 /* crypto_alloc_skcipher() allocates with GFP_KERNEL */
47 noio_flag = memalloc_noio_save();
48 key->tfm = crypto_alloc_skcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
49 memalloc_noio_restore(noio_flag);
50 if (IS_ERR(key->tfm)) {
51 ret = PTR_ERR(key->tfm);
52 key->tfm = NULL;
53 goto fail;
54 }
55
56 ret = crypto_skcipher_setkey(key->tfm, key->key, key->len);
57 if (ret)
58 goto fail;
59
60 return 0;
61
62fail:
63 ceph_crypto_key_destroy(key);
64 return ret;
65}
66
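
set_secret() above is also the first appearance in this series of the memalloc_noio_save()/memalloc_noio_restore() idiom, which reappears around sock_create_kern() in the messenger.c hunks below. A generic sketch of the pattern, with a placeholder function name:

#include <linux/sched/mm.h>
#include <crypto/skcipher.h>

/*
 * Placeholder name, illustrative only: force an API that allocates with
 * GFP_KERNEL internally to behave as GFP_NOIO on this path, so memory
 * reclaim cannot recurse into I/O (and back into ceph) underneath it.
 */
static struct crypto_skcipher *example_alloc_cipher_noio(void)
{
	unsigned int noio_flag;
	struct crypto_skcipher *tfm;

	noio_flag = memalloc_noio_save();
	tfm = crypto_alloc_skcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
	memalloc_noio_restore(noio_flag);

	return tfm;		/* may be an ERR_PTR, as in set_secret() */
}
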
16int ceph_crypto_key_clone(struct ceph_crypto_key *dst, 67int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
17 const struct ceph_crypto_key *src) 68 const struct ceph_crypto_key *src)
18{ 69{
19 memcpy(dst, src, sizeof(struct ceph_crypto_key)); 70 memcpy(dst, src, sizeof(struct ceph_crypto_key));
20 dst->key = kmemdup(src->key, src->len, GFP_NOFS); 71 return set_secret(dst, src->key);
21 if (!dst->key)
22 return -ENOMEM;
23 return 0;
24} 72}
25 73
26int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end) 74int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
@@ -37,16 +85,16 @@ int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
37 85
38int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end) 86int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
39{ 87{
88 int ret;
89
40 ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad); 90 ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
41 key->type = ceph_decode_16(p); 91 key->type = ceph_decode_16(p);
42 ceph_decode_copy(p, &key->created, sizeof(key->created)); 92 ceph_decode_copy(p, &key->created, sizeof(key->created));
43 key->len = ceph_decode_16(p); 93 key->len = ceph_decode_16(p);
44 ceph_decode_need(p, end, key->len, bad); 94 ceph_decode_need(p, end, key->len, bad);
45 key->key = kmalloc(key->len, GFP_NOFS); 95 ret = set_secret(key, *p);
46 if (!key->key) 96 *p += key->len;
47 return -ENOMEM; 97 return ret;
48 ceph_decode_copy(p, key->key, key->len);
49 return 0;
50 98
51bad: 99bad:
52 dout("failed to decode crypto key\n"); 100 dout("failed to decode crypto key\n");
@@ -80,9 +128,14 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
80 return 0; 128 return 0;
81} 129}
82 130
83static struct crypto_skcipher *ceph_crypto_alloc_cipher(void) 131void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
84{ 132{
85 return crypto_alloc_skcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); 133 if (key) {
134 kfree(key->key);
135 key->key = NULL;
136 crypto_free_skcipher(key->tfm);
137 key->tfm = NULL;
138 }
86} 139}
87 140
88static const u8 *aes_iv = (u8 *)CEPH_AES_IV; 141static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
@@ -157,372 +210,82 @@ static void teardown_sgtable(struct sg_table *sgt)
157 sg_free_table(sgt); 210 sg_free_table(sgt);
158} 211}
159 212
160static int ceph_aes_encrypt(const void *key, int key_len, 213static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt,
161 void *dst, size_t *dst_len, 214 void *buf, int buf_len, int in_len, int *pout_len)
162 const void *src, size_t src_len)
163{
164 struct scatterlist sg_in[2], prealloc_sg;
165 struct sg_table sg_out;
166 struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
167 SKCIPHER_REQUEST_ON_STACK(req, tfm);
168 int ret;
169 char iv[AES_BLOCK_SIZE];
170 size_t zero_padding = (0x10 - (src_len & 0x0f));
171 char pad[16];
172
173 if (IS_ERR(tfm))
174 return PTR_ERR(tfm);
175
176 memset(pad, zero_padding, zero_padding);
177
178 *dst_len = src_len + zero_padding;
179
180 sg_init_table(sg_in, 2);
181 sg_set_buf(&sg_in[0], src, src_len);
182 sg_set_buf(&sg_in[1], pad, zero_padding);
183 ret = setup_sgtable(&sg_out, &prealloc_sg, dst, *dst_len);
184 if (ret)
185 goto out_tfm;
186
187 crypto_skcipher_setkey((void *)tfm, key, key_len);
188 memcpy(iv, aes_iv, AES_BLOCK_SIZE);
189
190 skcipher_request_set_tfm(req, tfm);
191 skcipher_request_set_callback(req, 0, NULL, NULL);
192 skcipher_request_set_crypt(req, sg_in, sg_out.sgl,
193 src_len + zero_padding, iv);
194
195 /*
196 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
197 key, key_len, 1);
198 print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
199 src, src_len, 1);
200 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
201 pad, zero_padding, 1);
202 */
203 ret = crypto_skcipher_encrypt(req);
204 skcipher_request_zero(req);
205 if (ret < 0) {
206 pr_err("ceph_aes_crypt failed %d\n", ret);
207 goto out_sg;
208 }
209 /*
210 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
211 dst, *dst_len, 1);
212 */
213
214out_sg:
215 teardown_sgtable(&sg_out);
216out_tfm:
217 crypto_free_skcipher(tfm);
218 return ret;
219}
220
221static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
222 size_t *dst_len,
223 const void *src1, size_t src1_len,
224 const void *src2, size_t src2_len)
225{
226 struct scatterlist sg_in[3], prealloc_sg;
227 struct sg_table sg_out;
228 struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
229 SKCIPHER_REQUEST_ON_STACK(req, tfm);
230 int ret;
231 char iv[AES_BLOCK_SIZE];
232 size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
233 char pad[16];
234
235 if (IS_ERR(tfm))
236 return PTR_ERR(tfm);
237
238 memset(pad, zero_padding, zero_padding);
239
240 *dst_len = src1_len + src2_len + zero_padding;
241
242 sg_init_table(sg_in, 3);
243 sg_set_buf(&sg_in[0], src1, src1_len);
244 sg_set_buf(&sg_in[1], src2, src2_len);
245 sg_set_buf(&sg_in[2], pad, zero_padding);
246 ret = setup_sgtable(&sg_out, &prealloc_sg, dst, *dst_len);
247 if (ret)
248 goto out_tfm;
249
250 crypto_skcipher_setkey((void *)tfm, key, key_len);
251 memcpy(iv, aes_iv, AES_BLOCK_SIZE);
252
253 skcipher_request_set_tfm(req, tfm);
254 skcipher_request_set_callback(req, 0, NULL, NULL);
255 skcipher_request_set_crypt(req, sg_in, sg_out.sgl,
256 src1_len + src2_len + zero_padding, iv);
257
258 /*
259 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
260 key, key_len, 1);
261 print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
262 src1, src1_len, 1);
263 print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
264 src2, src2_len, 1);
265 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
266 pad, zero_padding, 1);
267 */
268 ret = crypto_skcipher_encrypt(req);
269 skcipher_request_zero(req);
270 if (ret < 0) {
271 pr_err("ceph_aes_crypt2 failed %d\n", ret);
272 goto out_sg;
273 }
274 /*
275 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
276 dst, *dst_len, 1);
277 */
278
279out_sg:
280 teardown_sgtable(&sg_out);
281out_tfm:
282 crypto_free_skcipher(tfm);
283 return ret;
284}
285
286static int ceph_aes_decrypt(const void *key, int key_len,
287 void *dst, size_t *dst_len,
288 const void *src, size_t src_len)
289{ 215{
290 struct sg_table sg_in; 216 SKCIPHER_REQUEST_ON_STACK(req, key->tfm);
291 struct scatterlist sg_out[2], prealloc_sg; 217 struct sg_table sgt;
292 struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher(); 218 struct scatterlist prealloc_sg;
293 SKCIPHER_REQUEST_ON_STACK(req, tfm); 219 char iv[AES_BLOCK_SIZE] __aligned(8);
294 char pad[16]; 220 int pad_byte = AES_BLOCK_SIZE - (in_len & (AES_BLOCK_SIZE - 1));
295 char iv[AES_BLOCK_SIZE]; 221 int crypt_len = encrypt ? in_len + pad_byte : in_len;
296 int ret; 222 int ret;
297 int last_byte;
298
299 if (IS_ERR(tfm))
300 return PTR_ERR(tfm);
301 223
302 sg_init_table(sg_out, 2); 224 WARN_ON(crypt_len > buf_len);
303 sg_set_buf(&sg_out[0], dst, *dst_len); 225 if (encrypt)
304 sg_set_buf(&sg_out[1], pad, sizeof(pad)); 226 memset(buf + in_len, pad_byte, pad_byte);
305 ret = setup_sgtable(&sg_in, &prealloc_sg, src, src_len); 227 ret = setup_sgtable(&sgt, &prealloc_sg, buf, crypt_len);
306 if (ret) 228 if (ret)
307 goto out_tfm; 229 return ret;
308 230
309 crypto_skcipher_setkey((void *)tfm, key, key_len);
310 memcpy(iv, aes_iv, AES_BLOCK_SIZE); 231 memcpy(iv, aes_iv, AES_BLOCK_SIZE);
311 232 skcipher_request_set_tfm(req, key->tfm);
312 skcipher_request_set_tfm(req, tfm);
313 skcipher_request_set_callback(req, 0, NULL, NULL); 233 skcipher_request_set_callback(req, 0, NULL, NULL);
314 skcipher_request_set_crypt(req, sg_in.sgl, sg_out, 234 skcipher_request_set_crypt(req, sgt.sgl, sgt.sgl, crypt_len, iv);
315 src_len, iv);
316 235
317 /* 236 /*
318 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1, 237 print_hex_dump(KERN_ERR, "key: ", DUMP_PREFIX_NONE, 16, 1,
319 key, key_len, 1); 238 key->key, key->len, 1);
320 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1, 239 print_hex_dump(KERN_ERR, " in: ", DUMP_PREFIX_NONE, 16, 1,
321 src, src_len, 1); 240 buf, crypt_len, 1);
322 */ 241 */
323 ret = crypto_skcipher_decrypt(req); 242 if (encrypt)
324 skcipher_request_zero(req); 243 ret = crypto_skcipher_encrypt(req);
325 if (ret < 0) {
326 pr_err("ceph_aes_decrypt failed %d\n", ret);
327 goto out_sg;
328 }
329
330 if (src_len <= *dst_len)
331 last_byte = ((char *)dst)[src_len - 1];
332 else 244 else
333 last_byte = pad[src_len - *dst_len - 1]; 245 ret = crypto_skcipher_decrypt(req);
334 if (last_byte <= 16 && src_len >= last_byte) {
335 *dst_len = src_len - last_byte;
336 } else {
337 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
338 last_byte, (int)src_len);
339 return -EPERM; /* bad padding */
340 }
341 /*
342 print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
343 dst, *dst_len, 1);
344 */
345
346out_sg:
347 teardown_sgtable(&sg_in);
348out_tfm:
349 crypto_free_skcipher(tfm);
350 return ret;
351}
352
353static int ceph_aes_decrypt2(const void *key, int key_len,
354 void *dst1, size_t *dst1_len,
355 void *dst2, size_t *dst2_len,
356 const void *src, size_t src_len)
357{
358 struct sg_table sg_in;
359 struct scatterlist sg_out[3], prealloc_sg;
360 struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
361 SKCIPHER_REQUEST_ON_STACK(req, tfm);
362 char pad[16];
363 char iv[AES_BLOCK_SIZE];
364 int ret;
365 int last_byte;
366
367 if (IS_ERR(tfm))
368 return PTR_ERR(tfm);
369
370 sg_init_table(sg_out, 3);
371 sg_set_buf(&sg_out[0], dst1, *dst1_len);
372 sg_set_buf(&sg_out[1], dst2, *dst2_len);
373 sg_set_buf(&sg_out[2], pad, sizeof(pad));
374 ret = setup_sgtable(&sg_in, &prealloc_sg, src, src_len);
375 if (ret)
376 goto out_tfm;
377
378 crypto_skcipher_setkey((void *)tfm, key, key_len);
379 memcpy(iv, aes_iv, AES_BLOCK_SIZE);
380
381 skcipher_request_set_tfm(req, tfm);
382 skcipher_request_set_callback(req, 0, NULL, NULL);
383 skcipher_request_set_crypt(req, sg_in.sgl, sg_out,
384 src_len, iv);
385
386 /*
387 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
388 key, key_len, 1);
389 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
390 src, src_len, 1);
391 */
392 ret = crypto_skcipher_decrypt(req);
393 skcipher_request_zero(req); 246 skcipher_request_zero(req);
394 if (ret < 0) { 247 if (ret) {
395 pr_err("ceph_aes_decrypt failed %d\n", ret); 248 pr_err("%s %scrypt failed: %d\n", __func__,
396 goto out_sg; 249 encrypt ? "en" : "de", ret);
397 } 250 goto out_sgt;
398
399 if (src_len <= *dst1_len)
400 last_byte = ((char *)dst1)[src_len - 1];
401 else if (src_len <= *dst1_len + *dst2_len)
402 last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
403 else
404 last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
405 if (last_byte <= 16 && src_len >= last_byte) {
406 src_len -= last_byte;
407 } else {
408 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
409 last_byte, (int)src_len);
410 return -EPERM; /* bad padding */
411 }
412
413 if (src_len < *dst1_len) {
414 *dst1_len = src_len;
415 *dst2_len = 0;
416 } else {
417 *dst2_len = src_len - *dst1_len;
418 } 251 }
419 /* 252 /*
420 print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1, 253 print_hex_dump(KERN_ERR, "out: ", DUMP_PREFIX_NONE, 16, 1,
421 dst1, *dst1_len, 1); 254 buf, crypt_len, 1);
422 print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
423 dst2, *dst2_len, 1);
424 */ 255 */
425 256
426out_sg: 257 if (encrypt) {
427 teardown_sgtable(&sg_in); 258 *pout_len = crypt_len;
428out_tfm: 259 } else {
429 crypto_free_skcipher(tfm); 260 pad_byte = *(char *)(buf + in_len - 1);
430 return ret; 261 if (pad_byte > 0 && pad_byte <= AES_BLOCK_SIZE &&
431} 262 in_len >= pad_byte) {
432 263 *pout_len = in_len - pad_byte;
433 264 } else {
434int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, 265 pr_err("%s got bad padding %d on in_len %d\n",
435 const void *src, size_t src_len) 266 __func__, pad_byte, in_len);
436{ 267 ret = -EPERM;
437 switch (secret->type) { 268 goto out_sgt;
438 case CEPH_CRYPTO_NONE:
439 if (*dst_len < src_len)
440 return -ERANGE;
441 memcpy(dst, src, src_len);
442 *dst_len = src_len;
443 return 0;
444
445 case CEPH_CRYPTO_AES:
446 return ceph_aes_decrypt(secret->key, secret->len, dst,
447 dst_len, src, src_len);
448
449 default:
450 return -EINVAL;
451 }
452}
453
454int ceph_decrypt2(struct ceph_crypto_key *secret,
455 void *dst1, size_t *dst1_len,
456 void *dst2, size_t *dst2_len,
457 const void *src, size_t src_len)
458{
459 size_t t;
460
461 switch (secret->type) {
462 case CEPH_CRYPTO_NONE:
463 if (*dst1_len + *dst2_len < src_len)
464 return -ERANGE;
465 t = min(*dst1_len, src_len);
466 memcpy(dst1, src, t);
467 *dst1_len = t;
468 src += t;
469 src_len -= t;
470 if (src_len) {
471 t = min(*dst2_len, src_len);
472 memcpy(dst2, src, t);
473 *dst2_len = t;
474 } 269 }
475 return 0;
476
477 case CEPH_CRYPTO_AES:
478 return ceph_aes_decrypt2(secret->key, secret->len,
479 dst1, dst1_len, dst2, dst2_len,
480 src, src_len);
481
482 default:
483 return -EINVAL;
484 } 270 }
485}
486
487int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
488 const void *src, size_t src_len)
489{
490 switch (secret->type) {
491 case CEPH_CRYPTO_NONE:
492 if (*dst_len < src_len)
493 return -ERANGE;
494 memcpy(dst, src, src_len);
495 *dst_len = src_len;
496 return 0;
497 271
498 case CEPH_CRYPTO_AES: 272out_sgt:
499 return ceph_aes_encrypt(secret->key, secret->len, dst, 273 teardown_sgtable(&sgt);
500 dst_len, src, src_len); 274 return ret;
501
502 default:
503 return -EINVAL;
504 }
505} 275}
506 276
507int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, 277int ceph_crypt(const struct ceph_crypto_key *key, bool encrypt,
508 const void *src1, size_t src1_len, 278 void *buf, int buf_len, int in_len, int *pout_len)
509 const void *src2, size_t src2_len)
510{ 279{
511 switch (secret->type) { 280 switch (key->type) {
512 case CEPH_CRYPTO_NONE: 281 case CEPH_CRYPTO_NONE:
513 if (*dst_len < src1_len + src2_len) 282 *pout_len = in_len;
514 return -ERANGE;
515 memcpy(dst, src1, src1_len);
516 memcpy(dst + src1_len, src2, src2_len);
517 *dst_len = src1_len + src2_len;
518 return 0; 283 return 0;
519
520 case CEPH_CRYPTO_AES: 284 case CEPH_CRYPTO_AES:
521 return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len, 285 return ceph_aes_crypt(key, encrypt, buf, buf_len, in_len,
522 src1, src1_len, src2, src2_len); 286 pout_len);
523
524 default: 287 default:
525 return -EINVAL; 288 return -ENOTSUPP;
526 } 289 }
527} 290}
528 291
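
The crypto rework above collapses the four encrypt/decrypt helpers into one in-place routine. Because encryption always appends PKCS#7-style padding, the caller's buffer needs up to AES_BLOCK_SIZE bytes of headroom beyond in_len. A hedged usage sketch; the payload length, buffer size and helper name are made up for illustration:

#include <crypto/aes.h>
#include <linux/string.h>

/* Sketch only: the payload length and buffer size are made up. */
static int example_roundtrip(const struct ceph_crypto_key *key,
			     const void *plaintext, int in_len)
{
	char buf[64 + AES_BLOCK_SIZE];
	int out_len;
	int ret;

	if (in_len > 64)
		return -E2BIG;
	memcpy(buf, plaintext, in_len);

	/*
	 * Padding is PKCS#7-style: for in_len = 20, pad_byte =
	 * 16 - (20 & 15) = 12, so out_len becomes 32; an exact multiple
	 * of the block size still gains a full 16-byte pad block.
	 */
	ret = ceph_crypt(key, true, buf, sizeof(buf), in_len, &out_len);
	if (ret)
		return ret;

	ret = ceph_crypt(key, false, buf, sizeof(buf), out_len, &out_len);
	/* on success, out_len == in_len again */
	return ret;
}
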
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h
index 2e9cab09f37b..58d83aa7740f 100644
--- a/net/ceph/crypto.h
+++ b/net/ceph/crypto.h
@@ -12,37 +12,19 @@ struct ceph_crypto_key {
12 struct ceph_timespec created; 12 struct ceph_timespec created;
13 int len; 13 int len;
14 void *key; 14 void *key;
15 struct crypto_skcipher *tfm;
15}; 16};
16 17
17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
18{
19 if (key) {
20 kfree(key->key);
21 key->key = NULL;
22 }
23}
24
25int ceph_crypto_key_clone(struct ceph_crypto_key *dst, 18int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
26 const struct ceph_crypto_key *src); 19 const struct ceph_crypto_key *src);
27int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end); 20int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end);
28int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end); 21int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end);
29int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in); 22int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
23void ceph_crypto_key_destroy(struct ceph_crypto_key *key);
30 24
31/* crypto.c */ 25/* crypto.c */
32int ceph_decrypt(struct ceph_crypto_key *secret, 26int ceph_crypt(const struct ceph_crypto_key *key, bool encrypt,
33 void *dst, size_t *dst_len, 27 void *buf, int buf_len, int in_len, int *pout_len);
34 const void *src, size_t src_len);
35int ceph_encrypt(struct ceph_crypto_key *secret,
36 void *dst, size_t *dst_len,
37 const void *src, size_t src_len);
38int ceph_decrypt2(struct ceph_crypto_key *secret,
39 void *dst1, size_t *dst1_len,
40 void *dst2, size_t *dst2_len,
41 const void *src, size_t src_len);
42int ceph_encrypt2(struct ceph_crypto_key *secret,
43 void *dst, size_t *dst_len,
44 const void *src1, size_t src1_len,
45 const void *src2, size_t src2_len);
46int ceph_crypto_init(void); 28int ceph_crypto_init(void);
47void ceph_crypto_shutdown(void); 29void ceph_crypto_shutdown(void);
48 30
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index a5502898ea33..f76bb3332613 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -7,6 +7,7 @@
7#include <linux/kthread.h> 7#include <linux/kthread.h>
8#include <linux/net.h> 8#include <linux/net.h>
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/sched/mm.h>
10#include <linux/slab.h> 11#include <linux/slab.h>
11#include <linux/socket.h> 12#include <linux/socket.h>
12#include <linux/string.h> 13#include <linux/string.h>
@@ -469,11 +470,16 @@ static int ceph_tcp_connect(struct ceph_connection *con)
469{ 470{
470 struct sockaddr_storage *paddr = &con->peer_addr.in_addr; 471 struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
471 struct socket *sock; 472 struct socket *sock;
473 unsigned int noio_flag;
472 int ret; 474 int ret;
473 475
474 BUG_ON(con->sock); 476 BUG_ON(con->sock);
477
478 /* sock_create_kern() allocates with GFP_KERNEL */
479 noio_flag = memalloc_noio_save();
475 ret = sock_create_kern(read_pnet(&con->msgr->net), paddr->ss_family, 480 ret = sock_create_kern(read_pnet(&con->msgr->net), paddr->ss_family,
476 SOCK_STREAM, IPPROTO_TCP, &sock); 481 SOCK_STREAM, IPPROTO_TCP, &sock);
482 memalloc_noio_restore(noio_flag);
477 if (ret) 483 if (ret)
478 return ret; 484 return ret;
479 sock->sk->sk_allocation = GFP_NOFS; 485 sock->sk->sk_allocation = GFP_NOFS;
@@ -520,7 +526,8 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
520 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; 526 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
521 int r; 527 int r;
522 528
523 r = kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags); 529 iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, len);
530 r = sock_recvmsg(sock, &msg, msg.msg_flags);
524 if (r == -EAGAIN) 531 if (r == -EAGAIN)
525 r = 0; 532 r = 0;
526 return r; 533 return r;
@@ -529,17 +536,20 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
529static int ceph_tcp_recvpage(struct socket *sock, struct page *page, 536static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
530 int page_offset, size_t length) 537 int page_offset, size_t length)
531{ 538{
532 void *kaddr; 539 struct bio_vec bvec = {
533 int ret; 540 .bv_page = page,
541 .bv_offset = page_offset,
542 .bv_len = length
543 };
544 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
545 int r;
534 546
535 BUG_ON(page_offset + length > PAGE_SIZE); 547 BUG_ON(page_offset + length > PAGE_SIZE);
536 548 iov_iter_bvec(&msg.msg_iter, READ | ITER_BVEC, &bvec, 1, length);
537 kaddr = kmap(page); 549 r = sock_recvmsg(sock, &msg, msg.msg_flags);
538 BUG_ON(!kaddr); 550 if (r == -EAGAIN)
539 ret = ceph_tcp_recvmsg(sock, kaddr + page_offset, length); 551 r = 0;
540 kunmap(page); 552 return r;
541
542 return ret;
543} 553}
544 554
545/* 555/*
@@ -579,18 +589,28 @@ static int __ceph_tcp_sendpage(struct socket *sock, struct page *page,
579static int ceph_tcp_sendpage(struct socket *sock, struct page *page, 589static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
580 int offset, size_t size, bool more) 590 int offset, size_t size, bool more)
581{ 591{
592 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
593 struct bio_vec bvec;
582 int ret; 594 int ret;
583 struct kvec iov;
584 595
585 /* sendpage cannot properly handle pages with page_count == 0, 596 /* sendpage cannot properly handle pages with page_count == 0,
586 * we need to fallback to sendmsg if that's the case */ 597 * we need to fallback to sendmsg if that's the case */
587 if (page_count(page) >= 1) 598 if (page_count(page) >= 1)
588 return __ceph_tcp_sendpage(sock, page, offset, size, more); 599 return __ceph_tcp_sendpage(sock, page, offset, size, more);
589 600
590 iov.iov_base = kmap(page) + offset; 601 bvec.bv_page = page;
591 iov.iov_len = size; 602 bvec.bv_offset = offset;
592 ret = ceph_tcp_sendmsg(sock, &iov, 1, size, more); 603 bvec.bv_len = size;
593 kunmap(page); 604
605 if (more)
606 msg.msg_flags |= MSG_MORE;
607 else
608 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
609
610 iov_iter_bvec(&msg.msg_iter, WRITE | ITER_BVEC, &bvec, 1, size);
611 ret = sock_sendmsg(sock, &msg);
612 if (ret == -EAGAIN)
613 ret = 0;
594 614
595 return ret; 615 return ret;
596} 616}
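
Both ceph_tcp_recvpage() and ceph_tcp_sendpage() above now attach a bio_vec to the msghdr's iterator instead of kmapping the page. The receive side of that idiom, pulled out of the hunk into a plain sketch (nothing here goes beyond what the diff shows, but the function name is a placeholder):

#include <linux/net.h>
#include <linux/uio.h>

/* Placeholder name; reads up to @len bytes from @sock straight into @page. */
static int example_recv_into_page(struct socket *sock, struct page *page,
				  int offset, size_t len)
{
	struct bio_vec bvec = {
		.bv_page = page,
		.bv_offset = offset,
		.bv_len = len,
	};
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
	int r;

	iov_iter_bvec(&msg.msg_iter, READ | ITER_BVEC, &bvec, 1, len);
	r = sock_recvmsg(sock, &msg, msg.msg_flags);
	return r == -EAGAIN ? 0 : r;
}
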
@@ -1393,15 +1413,9 @@ static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection
1393 return NULL; 1413 return NULL;
1394 } 1414 }
1395 1415
1396 /* Can't hold the mutex while getting authorizer */
1397 mutex_unlock(&con->mutex);
1398 auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry); 1416 auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry);
1399 mutex_lock(&con->mutex);
1400
1401 if (IS_ERR(auth)) 1417 if (IS_ERR(auth))
1402 return auth; 1418 return auth;
1403 if (con->state != CON_STATE_NEGOTIATING)
1404 return ERR_PTR(-EAGAIN);
1405 1419
1406 con->auth_reply_buf = auth->authorizer_reply_buf; 1420 con->auth_reply_buf = auth->authorizer_reply_buf;
1407 con->auth_reply_buf_len = auth->authorizer_reply_buf_len; 1421 con->auth_reply_buf_len = auth->authorizer_reply_buf_len;
@@ -2027,6 +2041,19 @@ static int process_connect(struct ceph_connection *con)
2027 2041
2028 dout("process_connect on %p tag %d\n", con, (int)con->in_tag); 2042 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
2029 2043
2044 if (con->auth_reply_buf) {
2045 /*
2046 * Any connection that defines ->get_authorizer()
2047 * should also define ->verify_authorizer_reply().
2048 * See get_connect_authorizer().
2049 */
2050 ret = con->ops->verify_authorizer_reply(con);
2051 if (ret < 0) {
2052 con->error_msg = "bad authorize reply";
2053 return ret;
2054 }
2055 }
2056
2030 switch (con->in_reply.tag) { 2057 switch (con->in_reply.tag) {
2031 case CEPH_MSGR_TAG_FEATURES: 2058 case CEPH_MSGR_TAG_FEATURES:
2032 pr_err("%s%lld %s feature set mismatch," 2059 pr_err("%s%lld %s feature set mismatch,"
@@ -3418,7 +3445,7 @@ static void ceph_msg_release(struct kref *kref)
3418struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) 3445struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
3419{ 3446{
3420 dout("%s %p (was %d)\n", __func__, msg, 3447 dout("%s %p (was %d)\n", __func__, msg,
3421 atomic_read(&msg->kref.refcount)); 3448 kref_read(&msg->kref));
3422 kref_get(&msg->kref); 3449 kref_get(&msg->kref);
3423 return msg; 3450 return msg;
3424} 3451}
@@ -3427,7 +3454,7 @@ EXPORT_SYMBOL(ceph_msg_get);
3427void ceph_msg_put(struct ceph_msg *msg) 3454void ceph_msg_put(struct ceph_msg *msg)
3428{ 3455{
3429 dout("%s %p (was %d)\n", __func__, msg, 3456 dout("%s %p (was %d)\n", __func__, msg,
3430 atomic_read(&msg->kref.refcount)); 3457 kref_read(&msg->kref));
3431 kref_put(&msg->kref, ceph_msg_release); 3458 kref_put(&msg->kref, ceph_msg_release);
3432} 3459}
3433EXPORT_SYMBOL(ceph_msg_put); 3460EXPORT_SYMBOL(ceph_msg_put);
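
The process_connect() hunk above relies on a contract: any connection that supplies ->get_authorizer() must also supply ->verify_authorizer_reply(), whose signature elsewhere in this diff has dropped the length argument. A sketch of an ops table honouring that contract; the callback bodies are stand-ins, not ceph code:

/* Stand-in callbacks; only the signatures matter for the contract. */
static struct ceph_auth_handshake *example_get_authorizer(
		struct ceph_connection *con, int *proto, int force_new)
{
	return NULL;	/* a real implementation builds/refreshes the ticket */
}

static int example_verify_authorizer_reply(struct ceph_connection *con)
{
	return 0;	/* a real implementation checks con->auth_reply_buf */
}

static const struct ceph_connection_operations example_con_ops = {
	/* ...get, put, dispatch, fault, ... */
	.get_authorizer		 = example_get_authorizer,
	.verify_authorizer_reply = example_verify_authorizer_reply,
};
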
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index a8effc8b7280..29a0ef351c5e 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -1028,21 +1028,21 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
1028 err = -ENOMEM; 1028 err = -ENOMEM;
1029 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK, 1029 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
1030 sizeof(struct ceph_mon_subscribe_ack), 1030 sizeof(struct ceph_mon_subscribe_ack),
1031 GFP_NOFS, true); 1031 GFP_KERNEL, true);
1032 if (!monc->m_subscribe_ack) 1032 if (!monc->m_subscribe_ack)
1033 goto out_auth; 1033 goto out_auth;
1034 1034
1035 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128, GFP_NOFS, 1035 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128,
1036 true); 1036 GFP_KERNEL, true);
1037 if (!monc->m_subscribe) 1037 if (!monc->m_subscribe)
1038 goto out_subscribe_ack; 1038 goto out_subscribe_ack;
1039 1039
1040 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS, 1040 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096,
1041 true); 1041 GFP_KERNEL, true);
1042 if (!monc->m_auth_reply) 1042 if (!monc->m_auth_reply)
1043 goto out_subscribe; 1043 goto out_subscribe;
1044 1044
1045 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS, true); 1045 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_KERNEL, true);
1046 monc->pending_auth = 0; 1046 monc->pending_auth = 0;
1047 if (!monc->m_auth) 1047 if (!monc->m_auth)
1048 goto out_auth_reply; 1048 goto out_auth_reply;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index e6ae15bc41b7..e15ea9e4c495 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -438,7 +438,7 @@ static void ceph_osdc_release_request(struct kref *kref)
438void ceph_osdc_get_request(struct ceph_osd_request *req) 438void ceph_osdc_get_request(struct ceph_osd_request *req)
439{ 439{
440 dout("%s %p (was %d)\n", __func__, req, 440 dout("%s %p (was %d)\n", __func__, req,
441 atomic_read(&req->r_kref.refcount)); 441 kref_read(&req->r_kref));
442 kref_get(&req->r_kref); 442 kref_get(&req->r_kref);
443} 443}
444EXPORT_SYMBOL(ceph_osdc_get_request); 444EXPORT_SYMBOL(ceph_osdc_get_request);
@@ -447,7 +447,7 @@ void ceph_osdc_put_request(struct ceph_osd_request *req)
447{ 447{
448 if (req) { 448 if (req) {
449 dout("%s %p (was %d)\n", __func__, req, 449 dout("%s %p (was %d)\n", __func__, req,
450 atomic_read(&req->r_kref.refcount)); 450 kref_read(&req->r_kref));
451 kref_put(&req->r_kref, ceph_osdc_release_request); 451 kref_put(&req->r_kref, ceph_osdc_release_request);
452 } 452 }
453} 453}
@@ -460,7 +460,6 @@ static void request_init(struct ceph_osd_request *req)
460 460
461 kref_init(&req->r_kref); 461 kref_init(&req->r_kref);
462 init_completion(&req->r_completion); 462 init_completion(&req->r_completion);
463 init_completion(&req->r_safe_completion);
464 RB_CLEAR_NODE(&req->r_node); 463 RB_CLEAR_NODE(&req->r_node);
465 RB_CLEAR_NODE(&req->r_mc_node); 464 RB_CLEAR_NODE(&req->r_mc_node);
466 INIT_LIST_HEAD(&req->r_unsafe_item); 465 INIT_LIST_HEAD(&req->r_unsafe_item);
@@ -487,11 +486,11 @@ static void request_reinit(struct ceph_osd_request *req)
487 struct ceph_msg *reply_msg = req->r_reply; 486 struct ceph_msg *reply_msg = req->r_reply;
488 487
489 dout("%s req %p\n", __func__, req); 488 dout("%s req %p\n", __func__, req);
490 WARN_ON(atomic_read(&req->r_kref.refcount) != 1); 489 WARN_ON(kref_read(&req->r_kref) != 1);
491 request_release_checks(req); 490 request_release_checks(req);
492 491
493 WARN_ON(atomic_read(&request_msg->kref.refcount) != 1); 492 WARN_ON(kref_read(&request_msg->kref) != 1);
494 WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1); 493 WARN_ON(kref_read(&reply_msg->kref) != 1);
495 target_destroy(&req->r_t); 494 target_destroy(&req->r_t);
496 495
497 request_init(req); 496 request_init(req);
@@ -672,7 +671,8 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
672 BUG_ON(length > previous); 671 BUG_ON(length > previous);
673 672
674 op->extent.length = length; 673 op->extent.length = length;
675 op->indata_len -= previous - length; 674 if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL)
675 op->indata_len -= previous - length;
676} 676}
677EXPORT_SYMBOL(osd_req_op_extent_update); 677EXPORT_SYMBOL(osd_req_op_extent_update);
678 678
@@ -1636,7 +1636,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
1636 bool need_send = false; 1636 bool need_send = false;
1637 bool promoted = false; 1637 bool promoted = false;
1638 1638
1639 WARN_ON(req->r_tid || req->r_got_reply); 1639 WARN_ON(req->r_tid);
1640 dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); 1640 dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
1641 1641
1642again: 1642again:
@@ -1704,18 +1704,13 @@ promote:
1704 1704
1705static void account_request(struct ceph_osd_request *req) 1705static void account_request(struct ceph_osd_request *req)
1706{ 1706{
1707 unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; 1707 WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK));
1708 1708 WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)));
1709 if (req->r_flags & CEPH_OSD_FLAG_READ) {
1710 WARN_ON(req->r_flags & mask);
1711 req->r_flags |= CEPH_OSD_FLAG_ACK;
1712 } else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
1713 WARN_ON(!(req->r_flags & mask));
1714 else
1715 WARN_ON(1);
1716 1709
1717 WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask); 1710 req->r_flags |= CEPH_OSD_FLAG_ONDISK;
1718 atomic_inc(&req->r_osdc->num_requests); 1711 atomic_inc(&req->r_osdc->num_requests);
1712
1713 req->r_start_stamp = jiffies;
1719} 1714}
1720 1715
1721static void submit_request(struct ceph_osd_request *req, bool wrlocked) 1716static void submit_request(struct ceph_osd_request *req, bool wrlocked)
@@ -1725,7 +1720,7 @@ static void submit_request(struct ceph_osd_request *req, bool wrlocked)
1725 __submit_request(req, wrlocked); 1720 __submit_request(req, wrlocked);
1726} 1721}
1727 1722
1728static void __finish_request(struct ceph_osd_request *req) 1723static void finish_request(struct ceph_osd_request *req)
1729{ 1724{
1730 struct ceph_osd_client *osdc = req->r_osdc; 1725 struct ceph_osd_client *osdc = req->r_osdc;
1731 struct ceph_osd *osd = req->r_osd; 1726 struct ceph_osd *osd = req->r_osd;
@@ -1747,32 +1742,26 @@ static void __finish_request(struct ceph_osd_request *req)
1747 ceph_msg_revoke_incoming(req->r_reply); 1742 ceph_msg_revoke_incoming(req->r_reply);
1748} 1743}
1749 1744
1750static void finish_request(struct ceph_osd_request *req)
1751{
1752 __finish_request(req);
1753 ceph_osdc_put_request(req);
1754}
1755
1756static void __complete_request(struct ceph_osd_request *req) 1745static void __complete_request(struct ceph_osd_request *req)
1757{ 1746{
1758 if (req->r_callback) 1747 if (req->r_callback) {
1748 dout("%s req %p tid %llu cb %pf result %d\n", __func__, req,
1749 req->r_tid, req->r_callback, req->r_result);
1759 req->r_callback(req); 1750 req->r_callback(req);
1760 else 1751 }
1761 complete_all(&req->r_completion);
1762} 1752}
1763 1753
1764/* 1754/*
1765 * Note that this is open-coded in handle_reply(), which has to deal 1755 * This is open-coded in handle_reply().
1766 * with ack vs commit, dup acks, etc.
1767 */ 1756 */
1768static void complete_request(struct ceph_osd_request *req, int err) 1757static void complete_request(struct ceph_osd_request *req, int err)
1769{ 1758{
1770 dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err); 1759 dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
1771 1760
1772 req->r_result = err; 1761 req->r_result = err;
1773 __finish_request(req); 1762 finish_request(req);
1774 __complete_request(req); 1763 __complete_request(req);
1775 complete_all(&req->r_safe_completion); 1764 complete_all(&req->r_completion);
1776 ceph_osdc_put_request(req); 1765 ceph_osdc_put_request(req);
1777} 1766}
1778 1767
@@ -1798,6 +1787,16 @@ static void cancel_request(struct ceph_osd_request *req)
1798 1787
1799 cancel_map_check(req); 1788 cancel_map_check(req);
1800 finish_request(req); 1789 finish_request(req);
1790 complete_all(&req->r_completion);
1791 ceph_osdc_put_request(req);
1792}
1793
1794static void abort_request(struct ceph_osd_request *req, int err)
1795{
1796 dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
1797
1798 cancel_map_check(req);
1799 complete_request(req, err);
1801} 1800}
1802 1801
1803static void check_pool_dne(struct ceph_osd_request *req) 1802static void check_pool_dne(struct ceph_osd_request *req)
@@ -2173,7 +2172,6 @@ static void linger_commit_cb(struct ceph_osd_request *req)
2173 mutex_lock(&lreq->lock); 2172 mutex_lock(&lreq->lock);
2174 dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq, 2173 dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
2175 lreq->linger_id, req->r_result); 2174 lreq->linger_id, req->r_result);
2176 WARN_ON(!__linger_registered(lreq));
2177 linger_reg_commit_complete(lreq, req->r_result); 2175 linger_reg_commit_complete(lreq, req->r_result);
2178 lreq->committed = true; 2176 lreq->committed = true;
2179 2177
@@ -2499,6 +2497,7 @@ static void handle_timeout(struct work_struct *work)
2499 container_of(work, struct ceph_osd_client, timeout_work.work); 2497 container_of(work, struct ceph_osd_client, timeout_work.work);
2500 struct ceph_options *opts = osdc->client->options; 2498 struct ceph_options *opts = osdc->client->options;
2501 unsigned long cutoff = jiffies - opts->osd_keepalive_timeout; 2499 unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
2500 unsigned long expiry_cutoff = jiffies - opts->osd_request_timeout;
2502 LIST_HEAD(slow_osds); 2501 LIST_HEAD(slow_osds);
2503 struct rb_node *n, *p; 2502 struct rb_node *n, *p;
2504 2503
@@ -2514,15 +2513,23 @@ static void handle_timeout(struct work_struct *work)
2514 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); 2513 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
2515 bool found = false; 2514 bool found = false;
2516 2515
2517 for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) { 2516 for (p = rb_first(&osd->o_requests); p; ) {
2518 struct ceph_osd_request *req = 2517 struct ceph_osd_request *req =
2519 rb_entry(p, struct ceph_osd_request, r_node); 2518 rb_entry(p, struct ceph_osd_request, r_node);
2520 2519
2520 p = rb_next(p); /* abort_request() */
2521
2521 if (time_before(req->r_stamp, cutoff)) { 2522 if (time_before(req->r_stamp, cutoff)) {
2522 dout(" req %p tid %llu on osd%d is laggy\n", 2523 dout(" req %p tid %llu on osd%d is laggy\n",
2523 req, req->r_tid, osd->o_osd); 2524 req, req->r_tid, osd->o_osd);
2524 found = true; 2525 found = true;
2525 } 2526 }
2527 if (opts->osd_request_timeout &&
2528 time_before(req->r_start_stamp, expiry_cutoff)) {
2529 pr_err_ratelimited("tid %llu on osd%d timeout\n",
2530 req->r_tid, osd->o_osd);
2531 abort_request(req, -ETIMEDOUT);
2532 }
2526 } 2533 }
2527 for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) { 2534 for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
2528 struct ceph_osd_linger_request *lreq = 2535 struct ceph_osd_linger_request *lreq =
@@ -2542,6 +2549,21 @@ static void handle_timeout(struct work_struct *work)
2542 list_move_tail(&osd->o_keepalive_item, &slow_osds); 2549 list_move_tail(&osd->o_keepalive_item, &slow_osds);
2543 } 2550 }
2544 2551
2552 if (opts->osd_request_timeout) {
2553 for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) {
2554 struct ceph_osd_request *req =
2555 rb_entry(p, struct ceph_osd_request, r_node);
2556
2557 p = rb_next(p); /* abort_request() */
2558
2559 if (time_before(req->r_start_stamp, expiry_cutoff)) {
2560 pr_err_ratelimited("tid %llu on osd%d timeout\n",
2561 req->r_tid, osdc->homeless_osd.o_osd);
2562 abort_request(req, -ETIMEDOUT);
2563 }
2564 }
2565 }
2566
2545 if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds)) 2567 if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
2546 maybe_request_map(osdc); 2568 maybe_request_map(osdc);
2547 2569
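
With the handle_timeout() changes above, any request older than osd_request_timeout is aborted with -ETIMEDOUT on both the per-OSD and homeless trees. From a synchronous caller's point of view that surfaces as the wait returning the error; a small sketch, with the helper name assumed:

/* Helper name assumed; shows what a waiter sees when the cutoff fires. */
static int example_sync_submit(struct ceph_osd_client *osdc,
			       struct ceph_osd_request *req)
{
	int ret;

	ceph_osdc_start_request(osdc, req, false);
	ret = ceph_osdc_wait_request(osdc, req);	/* waits on r_completion */
	if (ret == -ETIMEDOUT)
		/* handle_timeout() ran abort_request() -> complete_request() */
		pr_warn("osd request tid %llu timed out\n", req->r_tid);
	return ret;
}
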
@@ -2789,31 +2811,8 @@ e_inval:
2789} 2811}
2790 2812
2791/* 2813/*
2792 * We are done with @req if 2814 * Handle MOSDOpReply. Set ->r_result and call the callback if it is
2793 * - @m is a safe reply, or 2815 * specified.
2794 * - @m is an unsafe reply and we didn't want a safe one
2795 */
2796static bool done_request(const struct ceph_osd_request *req,
2797 const struct MOSDOpReply *m)
2798{
2799 return (m->result < 0 ||
2800 (m->flags & CEPH_OSD_FLAG_ONDISK) ||
2801 !(req->r_flags & CEPH_OSD_FLAG_ONDISK));
2802}
2803
2804/*
2805 * handle osd op reply. either call the callback if it is specified,
2806 * or do the completion to wake up the waiting thread.
2807 *
2808 * ->r_unsafe_callback is set? yes no
2809 *
2810 * first reply is OK (needed r_cb/r_completion, r_cb/r_completion,
2811 * any or needed/got safe) r_safe_completion r_safe_completion
2812 *
2813 * first reply is unsafe r_unsafe_cb(true) (nothing)
2814 *
2815 * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion,
2816 * r_safe_completion r_safe_completion
2817 */ 2816 */
2818static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) 2817static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
2819{ 2818{
@@ -2822,7 +2821,6 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
2822 struct MOSDOpReply m; 2821 struct MOSDOpReply m;
2823 u64 tid = le64_to_cpu(msg->hdr.tid); 2822 u64 tid = le64_to_cpu(msg->hdr.tid);
2824 u32 data_len = 0; 2823 u32 data_len = 0;
2825 bool already_acked;
2826 int ret; 2824 int ret;
2827 int i; 2825 int i;
2828 2826
@@ -2901,51 +2899,22 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
2901 le32_to_cpu(msg->hdr.data_len), req->r_tid); 2899 le32_to_cpu(msg->hdr.data_len), req->r_tid);
2902 goto fail_request; 2900 goto fail_request;
2903 } 2901 }
2904 dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__, 2902 dout("%s req %p tid %llu result %d data_len %u\n", __func__,
2905 req, req->r_tid, req->r_got_reply, m.result, data_len); 2903 req, req->r_tid, m.result, data_len);
2906
2907 already_acked = req->r_got_reply;
2908 if (!already_acked) {
2909 req->r_result = m.result ?: data_len;
2910 req->r_replay_version = m.replay_version; /* struct */
2911 req->r_got_reply = true;
2912 } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
2913 dout("req %p tid %llu dup ack\n", req, req->r_tid);
2914 goto out_unlock_session;
2915 }
2916
2917 if (done_request(req, &m)) {
2918 __finish_request(req);
2919 if (req->r_linger) {
2920 WARN_ON(req->r_unsafe_callback);
2921 dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
2922 __complete_request(req);
2923 }
2924 }
2925 2904
2905 /*
2906 * Since we only ever request ONDISK, we should only ever get
2907 * one (type of) reply back.
2908 */
2909 WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK));
2910 req->r_result = m.result ?: data_len;
2911 finish_request(req);
2926 mutex_unlock(&osd->lock); 2912 mutex_unlock(&osd->lock);
2927 up_read(&osdc->lock); 2913 up_read(&osdc->lock);
2928 2914
2929 if (done_request(req, &m)) { 2915 __complete_request(req);
2930 if (already_acked && req->r_unsafe_callback) { 2916 complete_all(&req->r_completion);
2931 dout("req %p tid %llu safe-cb\n", req, req->r_tid); 2917 ceph_osdc_put_request(req);
2932 req->r_unsafe_callback(req, false);
2933 } else if (!req->r_linger) {
2934 dout("req %p tid %llu cb\n", req, req->r_tid);
2935 __complete_request(req);
2936 }
2937 if (m.flags & CEPH_OSD_FLAG_ONDISK)
2938 complete_all(&req->r_safe_completion);
2939 ceph_osdc_put_request(req);
2940 } else {
2941 if (req->r_unsafe_callback) {
2942 dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
2943 req->r_unsafe_callback(req, true);
2944 } else {
2945 WARN_ON(1);
2946 }
2947 }
2948
2949 return; 2918 return;
2950 2919
2951fail_request: 2920fail_request:
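
After the handle_reply() simplification above there is only one completion event per request: r_result is set, r_callback (if any) runs once, and r_completion is completed. A sketch of an asynchronous user under that model; the reference-ownership arrangement shown is one possible choice, not something the patch mandates:

/* One possible ownership arrangement, not mandated by the patch. */
static void example_commit_cb(struct ceph_osd_request *req)
{
	/* r_result is the OSD's result code, or the data length on success */
	pr_debug("tid %llu finished: %d\n", req->r_tid, req->r_result);
	ceph_osdc_put_request(req);	/* drop the submitter's reference */
}

static void example_submit_async(struct ceph_osd_client *osdc,
				 struct ceph_osd_request *req)
{
	req->r_callback = example_commit_cb;	/* runs exactly once */
	ceph_osdc_start_request(osdc, req, false);
}
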
@@ -3471,9 +3440,8 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
3471EXPORT_SYMBOL(ceph_osdc_start_request); 3440EXPORT_SYMBOL(ceph_osdc_start_request);
3472 3441
3473/* 3442/*
3474 * Unregister a registered request. The request is not completed (i.e. 3443 * Unregister a registered request. The request is not completed:
3475 * no callbacks or wakeups) - higher layers are supposed to know what 3444 * ->r_result isn't set and __complete_request() isn't called.
3476 * they are canceling.
3477 */ 3445 */
3478void ceph_osdc_cancel_request(struct ceph_osd_request *req) 3446void ceph_osdc_cancel_request(struct ceph_osd_request *req)
3479{ 3447{
@@ -3500,9 +3468,6 @@ static int wait_request_timeout(struct ceph_osd_request *req,
3500 if (left <= 0) { 3468 if (left <= 0) {
3501 left = left ?: -ETIMEDOUT; 3469 left = left ?: -ETIMEDOUT;
3502 ceph_osdc_cancel_request(req); 3470 ceph_osdc_cancel_request(req);
3503
3504 /* kludge - need to to wake ceph_osdc_sync() */
3505 complete_all(&req->r_safe_completion);
3506 } else { 3471 } else {
3507 left = req->r_result; /* completed */ 3472 left = req->r_result; /* completed */
3508 } 3473 }
@@ -3549,7 +3514,7 @@ again:
3549 up_read(&osdc->lock); 3514 up_read(&osdc->lock);
3550 dout("%s waiting on req %p tid %llu last_tid %llu\n", 3515 dout("%s waiting on req %p tid %llu last_tid %llu\n",
3551 __func__, req, req->r_tid, last_tid); 3516 __func__, req, req->r_tid, last_tid);
3552 wait_for_completion(&req->r_safe_completion); 3517 wait_for_completion(&req->r_completion);
3553 ceph_osdc_put_request(req); 3518 ceph_osdc_put_request(req);
3554 goto again; 3519 goto again;
3555 } 3520 }
@@ -3608,7 +3573,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc,
3608 3573
3609 ceph_oid_copy(&lreq->t.base_oid, oid); 3574 ceph_oid_copy(&lreq->t.base_oid, oid);
3610 ceph_oloc_copy(&lreq->t.base_oloc, oloc); 3575 ceph_oloc_copy(&lreq->t.base_oloc, oloc);
3611 lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 3576 lreq->t.flags = CEPH_OSD_FLAG_WRITE;
3612 lreq->mtime = CURRENT_TIME; 3577 lreq->mtime = CURRENT_TIME;
3613 3578
3614 lreq->reg_req = alloc_linger_request(lreq); 3579 lreq->reg_req = alloc_linger_request(lreq);
@@ -3666,7 +3631,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
3666 3631
3667 ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); 3632 ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
3668 ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); 3633 ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
3669 req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 3634 req->r_flags = CEPH_OSD_FLAG_WRITE;
3670 req->r_mtime = CURRENT_TIME; 3635 req->r_mtime = CURRENT_TIME;
3671 osd_req_op_watch_init(req, 0, lreq->linger_id, 3636 osd_req_op_watch_init(req, 0, lreq->linger_id,
3672 CEPH_OSD_WATCH_OP_UNWATCH); 3637 CEPH_OSD_WATCH_OP_UNWATCH);
@@ -4031,7 +3996,7 @@ EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
4031 * Execute an OSD class method on an object. 3996 * Execute an OSD class method on an object.
4032 * 3997 *
4033 * @flags: CEPH_OSD_FLAG_* 3998 * @flags: CEPH_OSD_FLAG_*
4034 * @resp_len: out param for reply length 3999 * @resp_len: in/out param for reply length
4035 */ 4000 */
4036int ceph_osdc_call(struct ceph_osd_client *osdc, 4001int ceph_osdc_call(struct ceph_osd_client *osdc,
4037 struct ceph_object_id *oid, 4002 struct ceph_object_id *oid,
@@ -4044,6 +4009,9 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
4044 struct ceph_osd_request *req; 4009 struct ceph_osd_request *req;
4045 int ret; 4010 int ret;
4046 4011
4012 if (req_len > PAGE_SIZE || (resp_page && *resp_len > PAGE_SIZE))
4013 return -E2BIG;
4014
4047 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); 4015 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
4048 if (!req) 4016 if (!req)
4049 return -ENOMEM; 4017 return -ENOMEM;
@@ -4062,7 +4030,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
4062 0, false, false); 4030 0, false, false);
4063 if (resp_page) 4031 if (resp_page)
4064 osd_req_op_cls_response_data_pages(req, 0, &resp_page, 4032 osd_req_op_cls_response_data_pages(req, 0, &resp_page,
4065 PAGE_SIZE, 0, false, false); 4033 *resp_len, 0, false, false);
4066 4034
4067 ceph_osdc_start_request(osdc, req, false); 4035 ceph_osdc_start_request(osdc, req, false);
4068 ret = ceph_osdc_wait_request(osdc, req); 4036 ret = ceph_osdc_wait_request(osdc, req);
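
ceph_osdc_call() now treats resp_len as an in/out parameter and refuses request or response buffers larger than a page. A hedged usage sketch; the class and method names are placeholders and the page handling is only illustrative:

/* Placeholder class/method names; page handling is illustrative only. */
static int example_call(struct ceph_osd_client *osdc,
			struct ceph_object_id *oid,
			struct ceph_object_locator *oloc,
			struct page *req_page, size_t req_len)
{
	struct page *reply_page;
	size_t reply_len = PAGE_SIZE;	/* in: capacity, out: bytes returned */
	int ret;

	if (req_len > PAGE_SIZE)
		return -E2BIG;		/* ceph_osdc_call() now enforces this */

	reply_page = alloc_page(GFP_NOIO);
	if (!reply_page)
		return -ENOMEM;

	ret = ceph_osdc_call(osdc, oid, oloc, "some_class", "some_method",
			     CEPH_OSD_FLAG_READ, req_page, req_len,
			     reply_page, &reply_len);
	/* on success, only the first reply_len bytes of reply_page are valid */

	__free_page(reply_page);
	return ret;
}
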
@@ -4229,8 +4197,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
4229 int page_align = off & ~PAGE_MASK; 4197 int page_align = off & ~PAGE_MASK;
4230 4198
4231 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, 4199 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
4232 CEPH_OSD_OP_WRITE, 4200 CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
4233 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
4234 snapc, truncate_seq, truncate_size, 4201 snapc, truncate_seq, truncate_size,
4235 true); 4202 true);
4236 if (IS_ERR(req)) 4203 if (IS_ERR(req))
@@ -4478,13 +4445,13 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
4478} 4445}
4479 4446
4480 4447
4481static int verify_authorizer_reply(struct ceph_connection *con, int len) 4448static int verify_authorizer_reply(struct ceph_connection *con)
4482{ 4449{
4483 struct ceph_osd *o = con->private; 4450 struct ceph_osd *o = con->private;
4484 struct ceph_osd_client *osdc = o->o_osdc; 4451 struct ceph_osd_client *osdc = o->o_osdc;
4485 struct ceph_auth_client *ac = osdc->client->monc.auth; 4452 struct ceph_auth_client *ac = osdc->client->monc.auth;
4486 4453
4487 return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer, len); 4454 return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer);
4488} 4455}
4489 4456
4490static int invalidate_authorizer(struct ceph_connection *con) 4457static int invalidate_authorizer(struct ceph_connection *con)
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index d2436880b305..ffe9e904d4d1 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -153,6 +153,32 @@ bad:
153 return -EINVAL; 153 return -EINVAL;
154} 154}
155 155
156static void crush_finalize(struct crush_map *c)
157{
158 __s32 b;
159
160 /* Space for the array of pointers to per-bucket workspace */
161 c->working_size = sizeof(struct crush_work) +
162 c->max_buckets * sizeof(struct crush_work_bucket *);
163
164 for (b = 0; b < c->max_buckets; b++) {
165 if (!c->buckets[b])
166 continue;
167
168 switch (c->buckets[b]->alg) {
169 default:
170 /*
171 * The base case, permutation variables and
172 * the pointer to the permutation array.
173 */
174 c->working_size += sizeof(struct crush_work_bucket);
175 break;
176 }
177 /* Every bucket has a permutation array. */
178 c->working_size += c->buckets[b]->size * sizeof(__u32);
179 }
180}
181
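
crush_finalize() above precomputes exactly what crush_init_workspace() in mapper.c will later consume. A worked example with made-up numbers, assuming a 64-bit kernel and the structure shapes implied by the code:

/*
 * Worked example with made-up numbers: max_buckets = 4, of which three
 * exist with 8, 8 and 4 items, on a 64-bit kernel:
 *
 *   sizeof(struct crush_work)                     =   8
 *   4 * sizeof(struct crush_work_bucket *)        =  32
 *   3 * sizeof(struct crush_work_bucket)          =  48
 *   (8 + 8 + 4) * sizeof(__u32)                   =  80
 *                                  c->working_size = 168 bytes
 *
 * crush_work_size(c, CEPH_PG_MAX_SIZE) later adds room for the three
 * result vectors that crush_do_rule() carves out past working_size.
 */
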
156static struct crush_map *crush_decode(void *pbyval, void *end) 182static struct crush_map *crush_decode(void *pbyval, void *end)
157{ 183{
158 struct crush_map *c; 184 struct crush_map *c;
@@ -246,10 +272,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
246 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); 272 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
247 if (b->items == NULL) 273 if (b->items == NULL)
248 goto badmem; 274 goto badmem;
249 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
250 if (b->perm == NULL)
251 goto badmem;
252 b->perm_n = 0;
253 275
254 ceph_decode_need(p, end, b->size*sizeof(u32), bad); 276 ceph_decode_need(p, end, b->size*sizeof(u32), bad);
255 for (j = 0; j < b->size; j++) 277 for (j = 0; j < b->size; j++)
@@ -369,6 +391,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
369 c->chooseleaf_stable); 391 c->chooseleaf_stable);
370 392
371done: 393done:
394 crush_finalize(c);
372 dout("crush_decode success\n"); 395 dout("crush_decode success\n");
373 return c; 396 return c;
374 397
@@ -719,7 +742,7 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
719 map->pool_max = -1; 742 map->pool_max = -1;
720 map->pg_temp = RB_ROOT; 743 map->pg_temp = RB_ROOT;
721 map->primary_temp = RB_ROOT; 744 map->primary_temp = RB_ROOT;
722 mutex_init(&map->crush_scratch_mutex); 745 mutex_init(&map->crush_workspace_mutex);
723 746
724 return map; 747 return map;
725} 748}
@@ -753,6 +776,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
753 kfree(map->osd_weight); 776 kfree(map->osd_weight);
754 kfree(map->osd_addr); 777 kfree(map->osd_addr);
755 kfree(map->osd_primary_affinity); 778 kfree(map->osd_primary_affinity);
779 kfree(map->crush_workspace);
756 kfree(map); 780 kfree(map);
757} 781}
758 782
@@ -808,6 +832,31 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
808 return 0; 832 return 0;
809} 833}
810 834
835static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
836{
837 void *workspace;
838 size_t work_size;
839
840 if (IS_ERR(crush))
841 return PTR_ERR(crush);
842
843 work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
844 dout("%s work_size %zu bytes\n", __func__, work_size);
845 workspace = kmalloc(work_size, GFP_NOIO);
846 if (!workspace) {
847 crush_destroy(crush);
848 return -ENOMEM;
849 }
850 crush_init_workspace(crush, workspace);
851
852 if (map->crush)
853 crush_destroy(map->crush);
854 kfree(map->crush_workspace);
855 map->crush = crush;
856 map->crush_workspace = workspace;
857 return 0;
858}
859
811#define OSDMAP_WRAPPER_COMPAT_VER 7 860#define OSDMAP_WRAPPER_COMPAT_VER 7
812#define OSDMAP_CLIENT_DATA_COMPAT_VER 1 861#define OSDMAP_CLIENT_DATA_COMPAT_VER 1
813 862
@@ -1214,13 +1263,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
1214 1263
1215 /* crush */ 1264 /* crush */
1216 ceph_decode_32_safe(p, end, len, e_inval); 1265 ceph_decode_32_safe(p, end, len, e_inval);
1217 map->crush = crush_decode(*p, min(*p + len, end)); 1266 err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end)));
1218 if (IS_ERR(map->crush)) { 1267 if (err)
1219 err = PTR_ERR(map->crush);
1220 map->crush = NULL;
1221 goto bad; 1268 goto bad;
1222 }
1223 *p += len;
1224 1269
1225 /* ignore the rest */ 1270 /* ignore the rest */
1226 *p = end; 1271 *p = end;
@@ -1334,7 +1379,6 @@ static int decode_new_up_state_weight(void **p, void *end,
1334 if ((map->osd_state[osd] & CEPH_OSD_EXISTS) && 1379 if ((map->osd_state[osd] & CEPH_OSD_EXISTS) &&
1335 (xorstate & CEPH_OSD_EXISTS)) { 1380 (xorstate & CEPH_OSD_EXISTS)) {
1336 pr_info("osd%d does not exist\n", osd); 1381 pr_info("osd%d does not exist\n", osd);
1337 map->osd_weight[osd] = CEPH_OSD_IN;
1338 ret = set_primary_affinity(map, osd, 1382 ret = set_primary_affinity(map, osd,
1339 CEPH_OSD_DEFAULT_PRIMARY_AFFINITY); 1383 CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1340 if (ret) 1384 if (ret)
@@ -1375,7 +1419,6 @@ e_inval:
1375struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, 1419struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
1376 struct ceph_osdmap *map) 1420 struct ceph_osdmap *map)
1377{ 1421{
1378 struct crush_map *newcrush = NULL;
1379 struct ceph_fsid fsid; 1422 struct ceph_fsid fsid;
1380 u32 epoch = 0; 1423 u32 epoch = 0;
1381 struct ceph_timespec modified; 1424 struct ceph_timespec modified;
@@ -1414,12 +1457,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
1414 /* new crush? */ 1457 /* new crush? */
1415 ceph_decode_32_safe(p, end, len, e_inval); 1458 ceph_decode_32_safe(p, end, len, e_inval);
1416 if (len > 0) { 1459 if (len > 0) {
1417 newcrush = crush_decode(*p, min(*p+len, end)); 1460 err = osdmap_set_crush(map,
1418 if (IS_ERR(newcrush)) { 1461 crush_decode(*p, min(*p + len, end)));
1419 err = PTR_ERR(newcrush); 1462 if (err)
1420 newcrush = NULL;
1421 goto bad; 1463 goto bad;
1422 }
1423 *p += len; 1464 *p += len;
1424 } 1465 }
1425 1466
@@ -1439,12 +1480,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
1439 1480
1440 map->epoch++; 1481 map->epoch++;
1441 map->modified = modified; 1482 map->modified = modified;
1442 if (newcrush) {
1443 if (map->crush)
1444 crush_destroy(map->crush);
1445 map->crush = newcrush;
1446 newcrush = NULL;
1447 }
1448 1483
1449 /* new_pools */ 1484 /* new_pools */
1450 err = decode_new_pools(p, end, map); 1485 err = decode_new_pools(p, end, map);
@@ -1505,8 +1540,6 @@ bad:
1505 print_hex_dump(KERN_DEBUG, "osdmap: ", 1540 print_hex_dump(KERN_DEBUG, "osdmap: ",
1506 DUMP_PREFIX_OFFSET, 16, 1, 1541 DUMP_PREFIX_OFFSET, 16, 1,
1507 start, end - start, true); 1542 start, end - start, true);
1508 if (newcrush)
1509 crush_destroy(newcrush);
1510 return ERR_PTR(err); 1543 return ERR_PTR(err);
1511} 1544}
1512 1545
@@ -1942,10 +1975,10 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
1942 1975
1943 BUG_ON(result_max > CEPH_PG_MAX_SIZE); 1976 BUG_ON(result_max > CEPH_PG_MAX_SIZE);
1944 1977
1945 mutex_lock(&map->crush_scratch_mutex); 1978 mutex_lock(&map->crush_workspace_mutex);
1946 r = crush_do_rule(map->crush, ruleno, x, result, result_max, 1979 r = crush_do_rule(map->crush, ruleno, x, result, result_max,
1947 weight, weight_max, map->crush_scratch_ary); 1980 weight, weight_max, map->crush_workspace);
1948 mutex_unlock(&map->crush_scratch_mutex); 1981 mutex_unlock(&map->crush_workspace_mutex);
1949 1982
1950 return r; 1983 return r;
1951} 1984}
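The rename from crush_scratch_mutex to crush_workspace_mutex keeps the locking model unchanged: there is a single shared workspace per osdmap, so every CRUSH computation still serializes on the mutex around it. A small pthread sketch of that shared-scratch-buffer-under-a-mutex arrangement; the names and the stand-in mapping function are hypothetical.

/* Hypothetical sketch: one scratch buffer shared by all callers and
 * protected by a mutex, mirroring how do_crush() serializes on
 * crush_workspace_mutex. */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t ws_lock = PTHREAD_MUTEX_INITIALIZER;
static char workspace[4096];          /* shared scratch area */

static int do_mapping(int x, int *out, int out_max)
{
	int n, i;

	pthread_mutex_lock(&ws_lock);
	memset(workspace, 0, sizeof(workspace));   /* reset scratch for this run */
	n = out_max < 3 ? out_max : 3;
	for (i = 0; i < n; i++)
		out[i] = (x + i) % 10;             /* stand-in for crush_do_rule() */
	pthread_mutex_unlock(&ws_lock);
	return n;
}

int main(void)
{
	int osds[4];
	int n = do_mapping(7, osds, 4);

	printf("mapped to %d osds, first=%d\n", n, osds[0]);
	return 0;
}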
@@ -1978,8 +2011,14 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
1978 return; 2011 return;
1979 } 2012 }
1980 2013
1981 len = do_crush(osdmap, ruleno, pps, raw->osds, 2014 if (pi->size > ARRAY_SIZE(raw->osds)) {
1982 min_t(int, pi->size, ARRAY_SIZE(raw->osds)), 2015 pr_err_ratelimited("pool %lld ruleset %d type %d too wide: size %d > %zu\n",
2016 pi->id, pi->crush_ruleset, pi->type, pi->size,
2017 ARRAY_SIZE(raw->osds));
2018 return;
2019 }
2020
2021 len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size,
1983 osdmap->osd_weight, osdmap->max_osd); 2022 osdmap->osd_weight, osdmap->max_osd);
1984 if (len < 0) { 2023 if (len < 0) {
1985 pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", 2024 pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
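Instead of silently clamping pi->size to the capacity of raw->osds, pg_to_raw_osds() now refuses oversized pools with a rate-limited error before calling into CRUSH. An illustrative sketch of rejecting rather than truncating a caller-supplied size against a fixed-capacity array (sizes and names are made up):

/* Sketch of refusing an out-of-range size instead of clamping it,
 * as pg_to_raw_osds() now does. Names are illustrative. */
#include <stdio.h>

#define MAX_OSDS 16

struct raw { int osds[MAX_OSDS]; int len; };

static int fill_raw(struct raw *r, unsigned int pool_size)
{
	unsigned int i;

	if (pool_size > MAX_OSDS) {
		fprintf(stderr, "pool too wide: size %u > %d\n",
			pool_size, MAX_OSDS);
		return -1;          /* refuse rather than truncate */
	}

	for (i = 0; i < pool_size; i++)
		r->osds[i] = (int)i;
	r->len = (int)pool_size;
	return 0;
}

int main(void)
{
	struct raw r;

	printf("size 3  -> %d\n", fill_raw(&r, 3));
	printf("size 99 -> %d\n", fill_raw(&r, 99));
	return 0;
}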
diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c
index 154683f5f14c..705414e78ae0 100644
--- a/net/ceph/snapshot.c
+++ b/net/ceph/snapshot.c
@@ -18,8 +18,6 @@
18 * 02110-1301, USA. 18 * 02110-1301, USA.
19 */ 19 */
20 20
21#include <stddef.h>
22
23#include <linux/types.h> 21#include <linux/types.h>
24#include <linux/export.h> 22#include <linux/export.h>
25#include <linux/ceph/libceph.h> 23#include <linux/ceph/libceph.h>
diff --git a/net/compat.c b/net/compat.c
index 1cd2ec046164..aba929e5250f 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -22,13 +22,14 @@
22#include <linux/filter.h> 22#include <linux/filter.h>
23#include <linux/compat.h> 23#include <linux/compat.h>
24#include <linux/security.h> 24#include <linux/security.h>
25#include <linux/audit.h>
25#include <linux/export.h> 26#include <linux/export.h>
26 27
27#include <net/scm.h> 28#include <net/scm.h>
28#include <net/sock.h> 29#include <net/sock.h>
29#include <net/ip.h> 30#include <net/ip.h>
30#include <net/ipv6.h> 31#include <net/ipv6.h>
31#include <asm/uaccess.h> 32#include <linux/uaccess.h>
32#include <net/compat.h> 33#include <net/compat.h>
33 34
34int get_compat_msghdr(struct msghdr *kmsg, 35int get_compat_msghdr(struct msghdr *kmsg,
@@ -90,11 +91,11 @@ int get_compat_msghdr(struct msghdr *kmsg,
90#define CMSG_COMPAT_ALIGN(len) ALIGN((len), sizeof(s32)) 91#define CMSG_COMPAT_ALIGN(len) ALIGN((len), sizeof(s32))
91 92
92#define CMSG_COMPAT_DATA(cmsg) \ 93#define CMSG_COMPAT_DATA(cmsg) \
93 ((void __user *)((char __user *)(cmsg) + CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)))) 94 ((void __user *)((char __user *)(cmsg) + sizeof(struct compat_cmsghdr)))
94#define CMSG_COMPAT_SPACE(len) \ 95#define CMSG_COMPAT_SPACE(len) \
95 (CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + CMSG_COMPAT_ALIGN(len)) 96 (sizeof(struct compat_cmsghdr) + CMSG_COMPAT_ALIGN(len))
96#define CMSG_COMPAT_LEN(len) \ 97#define CMSG_COMPAT_LEN(len) \
97 (CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + (len)) 98 (sizeof(struct compat_cmsghdr) + (len))
98 99
99#define CMSG_COMPAT_FIRSTHDR(msg) \ 100#define CMSG_COMPAT_FIRSTHDR(msg) \
100 (((msg)->msg_controllen) >= sizeof(struct compat_cmsghdr) ? \ 101 (((msg)->msg_controllen) >= sizeof(struct compat_cmsghdr) ? \
@@ -130,6 +131,9 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
130 __kernel_size_t kcmlen, tmp; 131 __kernel_size_t kcmlen, tmp;
131 int err = -EFAULT; 132 int err = -EFAULT;
132 133
134 BUILD_BUG_ON(sizeof(struct compat_cmsghdr) !=
135 CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)));
136
133 kcmlen = 0; 137 kcmlen = 0;
134 kcmsg_base = kcmsg = (struct cmsghdr *)stackbuf; 138 kcmsg_base = kcmsg = (struct cmsghdr *)stackbuf;
135 ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg); 139 ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg);
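Dropping CMSG_COMPAT_ALIGN() around sizeof(struct compat_cmsghdr) is only safe because that size is already a multiple of the compat alignment, which the new BUILD_BUG_ON pins down at compile time. A userspace analogue using _Static_assert is sketched below; the struct layout and macro names are invented for illustration and are not the kernel's compat definitions.

/* Userspace analogue of the BUILD_BUG_ON added above: assert at compile
 * time that the header size is already aligned, so ALIGN() can be dropped.
 * The struct layout here is illustrative, not the kernel's compat_cmsghdr. */
#include <stdint.h>
#include <stdio.h>

#define ALIGN_TO(x, a)   (((x) + (a) - 1) & ~((a) - 1))
#define CMSG_ALIGNMENT   sizeof(int32_t)

struct demo_cmsghdr {
	uint32_t cmsg_len;
	int32_t  cmsg_level;
	int32_t  cmsg_type;
};

_Static_assert(sizeof(struct demo_cmsghdr) ==
	       ALIGN_TO(sizeof(struct demo_cmsghdr), CMSG_ALIGNMENT),
	       "header must already be aligned");

int main(void)
{
	/* With the assertion holding, these two expressions are equal. */
	printf("%zu == %zu\n", sizeof(struct demo_cmsghdr),
	       ALIGN_TO(sizeof(struct demo_cmsghdr), CMSG_ALIGNMENT));
	return 0;
}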
@@ -141,8 +145,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
141 if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg)) 145 if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
142 return -EINVAL; 146 return -EINVAL;
143 147
144 tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) + 148 tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr));
145 CMSG_ALIGN(sizeof(struct cmsghdr)));
146 tmp = CMSG_ALIGN(tmp); 149 tmp = CMSG_ALIGN(tmp);
147 kcmlen += tmp; 150 kcmlen += tmp;
148 ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen); 151 ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen);
@@ -168,8 +171,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
168 goto Efault; 171 goto Efault;
169 if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg)) 172 if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
170 goto Einval; 173 goto Einval;
171 tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) + 174 tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr));
172 CMSG_ALIGN(sizeof(struct cmsghdr)));
173 if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp)) 175 if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp))
174 goto Einval; 176 goto Einval;
175 kcmsg->cmsg_len = tmp; 177 kcmsg->cmsg_len = tmp;
@@ -178,7 +180,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
178 __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) || 180 __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) ||
179 copy_from_user(CMSG_DATA(kcmsg), 181 copy_from_user(CMSG_DATA(kcmsg),
180 CMSG_COMPAT_DATA(ucmsg), 182 CMSG_COMPAT_DATA(ucmsg),
181 (ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))))) 183 (ucmlen - sizeof(*ucmsg))))
182 goto Efault; 184 goto Efault;
183 185
184 /* Advance. */ 186 /* Advance. */
@@ -781,14 +783,24 @@ COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg,
781 783
782COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) 784COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args)
783{ 785{
784 int ret; 786 u32 a[AUDITSC_ARGS];
785 u32 a[6]; 787 unsigned int len;
786 u32 a0, a1; 788 u32 a0, a1;
789 int ret;
787 790
788 if (call < SYS_SOCKET || call > SYS_SENDMMSG) 791 if (call < SYS_SOCKET || call > SYS_SENDMMSG)
789 return -EINVAL; 792 return -EINVAL;
790 if (copy_from_user(a, args, nas[call])) 793 len = nas[call];
794 if (len > sizeof(a))
795 return -EINVAL;
796
797 if (copy_from_user(a, args, len))
791 return -EFAULT; 798 return -EFAULT;
799
800 ret = audit_socketcall_compat(len / sizeof(a[0]), a);
801 if (ret)
802 return ret;
803
792 a0 = a[0]; 804 a0 = a[0];
793 a1 = a[1]; 805 a1 = a[1];
794 806
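compat socketcall now bounds-checks the table-driven argument length against its on-stack array and hands the arguments to the audit layer before dispatching. The sketch below isolates the length-validation step; the nas[] table contents and call numbers are invented for the example and the audit call is omitted.

/* Sketch of validating a per-call argument count against a fixed
 * buffer before copying, as the compat socketcall path now does.
 * The nas[] table below is illustrative. */
#include <errno.h>
#include <stdio.h>
#include <string.h>

static const unsigned char nas[] = { 3, 3, 3, 2, 3, 3, 3, 4 };

static int dispatch(int call, const unsigned int *args, size_t args_len)
{
	unsigned int a[6];
	size_t len;

	if (call < 0 || (size_t)call >= sizeof(nas) / sizeof(nas[0]))
		return -EINVAL;
	len = nas[call] * sizeof(a[0]);
	if (len > sizeof(a))
		return -EINVAL;          /* never copy past the local buffer */
	if (len > args_len)
		return -EFAULT;
	memcpy(a, args, len);
	printf("call %d: a0=%u a1=%u\n", call, a[0], a[1]);
	return 0;
}

int main(void)
{
	unsigned int args[4] = { 1, 2, 3, 4 };

	dispatch(0, args, sizeof(args));
	printf("bad call -> %d\n", dispatch(42, args, sizeof(args)));
	return 0;
}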
diff --git a/net/core/Makefile b/net/core/Makefile
index d6508c2ddca5..79f9479e9658 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -24,6 +24,8 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
24obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o 24obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
25obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o 25obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
26obj-$(CONFIG_LWTUNNEL) += lwtunnel.o 26obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
27obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
27obj-$(CONFIG_DST_CACHE) += dst_cache.o 28obj-$(CONFIG_DST_CACHE) += dst_cache.o
28obj-$(CONFIG_HWBM) += hwbm.o 29obj-$(CONFIG_HWBM) += hwbm.o
29obj-$(CONFIG_NET_DEVLINK) += devlink.o 30obj-$(CONFIG_NET_DEVLINK) += devlink.o
31obj-$(CONFIG_GRO_CELLS) += gro_cells.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index b7de71f8d5d3..f4947e737f34 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -36,7 +36,7 @@
36#include <linux/module.h> 36#include <linux/module.h>
37#include <linux/types.h> 37#include <linux/types.h>
38#include <linux/kernel.h> 38#include <linux/kernel.h>
39#include <asm/uaccess.h> 39#include <linux/uaccess.h>
40#include <linux/mm.h> 40#include <linux/mm.h>
41#include <linux/interrupt.h> 41#include <linux/interrupt.h>
42#include <linux/errno.h> 42#include <linux/errno.h>
@@ -165,6 +165,7 @@ done:
165 * __skb_try_recv_datagram - Receive a datagram skbuff 165 * __skb_try_recv_datagram - Receive a datagram skbuff
166 * @sk: socket 166 * @sk: socket
167 * @flags: MSG_ flags 167 * @flags: MSG_ flags
168 * @destructor: invoked under the receive lock on successful dequeue
168 * @peeked: returns non-zero if this packet has been seen before 169 * @peeked: returns non-zero if this packet has been seen before
169 * @off: an offset in bytes to peek skb from. Returns an offset 170 * @off: an offset in bytes to peek skb from. Returns an offset
170 * within an skb where data actually starts 171 * within an skb where data actually starts
@@ -197,6 +198,8 @@ done:
197 * the standard around please. 198 * the standard around please.
198 */ 199 */
199struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, 200struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
201 void (*destructor)(struct sock *sk,
202 struct sk_buff *skb),
200 int *peeked, int *off, int *err, 203 int *peeked, int *off, int *err,
201 struct sk_buff **last) 204 struct sk_buff **last)
202{ 205{
@@ -211,6 +214,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
211 if (error) 214 if (error)
212 goto no_packet; 215 goto no_packet;
213 216
217 *peeked = 0;
214 do { 218 do {
215 /* Again only user level code calls this function, so nothing 219 /* Again only user level code calls this function, so nothing
216 * interrupt level will suddenly eat the receive_queue. 220 * interrupt level will suddenly eat the receive_queue.
@@ -224,26 +228,28 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
224 spin_lock_irqsave(&queue->lock, cpu_flags); 228 spin_lock_irqsave(&queue->lock, cpu_flags);
225 skb_queue_walk(queue, skb) { 229 skb_queue_walk(queue, skb) {
226 *last = skb; 230 *last = skb;
227 *peeked = skb->peeked;
228 if (flags & MSG_PEEK) { 231 if (flags & MSG_PEEK) {
229 if (_off >= skb->len && (skb->len || _off || 232 if (_off >= skb->len && (skb->len || _off ||
230 skb->peeked)) { 233 skb->peeked)) {
231 _off -= skb->len; 234 _off -= skb->len;
232 continue; 235 continue;
233 } 236 }
234 237 if (!skb->len) {
235 skb = skb_set_peeked(skb); 238 skb = skb_set_peeked(skb);
236 error = PTR_ERR(skb); 239 if (IS_ERR(skb)) {
237 if (IS_ERR(skb)) { 240 error = PTR_ERR(skb);
238 spin_unlock_irqrestore(&queue->lock, 241 spin_unlock_irqrestore(&queue->lock,
239 cpu_flags); 242 cpu_flags);
240 goto no_packet; 243 goto no_packet;
244 }
241 } 245 }
242 246 *peeked = 1;
243 atomic_inc(&skb->users); 247 atomic_inc(&skb->users);
244 } else 248 } else {
245 __skb_unlink(skb, queue); 249 __skb_unlink(skb, queue);
246 250 if (destructor)
251 destructor(sk, skb);
252 }
247 spin_unlock_irqrestore(&queue->lock, cpu_flags); 253 spin_unlock_irqrestore(&queue->lock, cpu_flags);
248 *off = _off; 254 *off = _off;
249 return skb; 255 return skb;
@@ -262,6 +268,8 @@ no_packet:
262EXPORT_SYMBOL(__skb_try_recv_datagram); 268EXPORT_SYMBOL(__skb_try_recv_datagram);
263 269
264struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, 270struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
271 void (*destructor)(struct sock *sk,
272 struct sk_buff *skb),
265 int *peeked, int *off, int *err) 273 int *peeked, int *off, int *err)
266{ 274{
267 struct sk_buff *skb, *last; 275 struct sk_buff *skb, *last;
@@ -270,8 +278,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
270 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 278 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
271 279
272 do { 280 do {
273 skb = __skb_try_recv_datagram(sk, flags, peeked, off, err, 281 skb = __skb_try_recv_datagram(sk, flags, destructor, peeked,
274 &last); 282 off, err, &last);
275 if (skb) 283 if (skb)
276 return skb; 284 return skb;
277 285
@@ -290,7 +298,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
290 int peeked, off = 0; 298 int peeked, off = 0;
291 299
292 return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), 300 return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
293 &peeked, &off, err); 301 NULL, &peeked, &off, err);
294} 302}
295EXPORT_SYMBOL(skb_recv_datagram); 303EXPORT_SYMBOL(skb_recv_datagram);
296 304
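__skb_try_recv_datagram() and __skb_recv_datagram() grow an optional destructor that runs under the receive-queue lock at the moment an skb is unlinked, so per-socket accounting can be adjusted atomically with the dequeue. A minimal sketch of a dequeue with such an under-the-lock callback follows; the queue and item types are hypothetical.

/* Sketch of a dequeue that invokes an optional destructor callback
 * while still holding the queue lock, mirroring the new parameter of
 * __skb_try_recv_datagram(). Types and names are hypothetical. */
#include <pthread.h>
#include <stdio.h>

struct item { struct item *next; int len; };

struct queue {
	pthread_mutex_t lock;
	struct item *head;
	int mem_charged;
};

static struct item *dequeue(struct queue *q,
			    void (*destructor)(struct queue *q, struct item *it))
{
	struct item *it;

	pthread_mutex_lock(&q->lock);
	it = q->head;
	if (it) {
		q->head = it->next;
		if (destructor)
			destructor(q, it);   /* runs under the queue lock */
	}
	pthread_mutex_unlock(&q->lock);
	return it;
}

static void uncharge(struct queue *q, struct item *it)
{
	q->mem_charged -= it->len;           /* accounting stays consistent */
}

int main(void)
{
	struct item a = { NULL, 100 };
	struct queue q = { PTHREAD_MUTEX_INITIALIZER, &a, 100 };

	dequeue(&q, uncharge);
	printf("charged after dequeue: %d\n", q.mem_charged);
	return 0;
}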
@@ -323,6 +331,31 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
323} 331}
324EXPORT_SYMBOL(__skb_free_datagram_locked); 332EXPORT_SYMBOL(__skb_free_datagram_locked);
325 333
334int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb,
335 unsigned int flags,
336 void (*destructor)(struct sock *sk,
337 struct sk_buff *skb))
338{
339 int err = 0;
340
341 if (flags & MSG_PEEK) {
342 err = -ENOENT;
343 spin_lock_bh(&sk->sk_receive_queue.lock);
344 if (skb == skb_peek(&sk->sk_receive_queue)) {
345 __skb_unlink(skb, &sk->sk_receive_queue);
346 atomic_dec(&skb->users);
347 if (destructor)
348 destructor(sk, skb);
349 err = 0;
350 }
351 spin_unlock_bh(&sk->sk_receive_queue.lock);
352 }
353
354 atomic_inc(&sk->sk_drops);
355 return err;
356}
357EXPORT_SYMBOL(__sk_queue_drop_skb);
358
326/** 359/**
327 * skb_kill_datagram - Free a datagram skbuff forcibly 360 * skb_kill_datagram - Free a datagram skbuff forcibly
328 * @sk: socket 361 * @sk: socket
@@ -346,23 +379,10 @@ EXPORT_SYMBOL(__skb_free_datagram_locked);
346 379
347int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) 380int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
348{ 381{
349 int err = 0; 382 int err = __sk_queue_drop_skb(sk, skb, flags, NULL);
350
351 if (flags & MSG_PEEK) {
352 err = -ENOENT;
353 spin_lock_bh(&sk->sk_receive_queue.lock);
354 if (skb == skb_peek(&sk->sk_receive_queue)) {
355 __skb_unlink(skb, &sk->sk_receive_queue);
356 atomic_dec(&skb->users);
357 err = 0;
358 }
359 spin_unlock_bh(&sk->sk_receive_queue.lock);
360 }
361 383
362 kfree_skb(skb); 384 kfree_skb(skb);
363 atomic_inc(&sk->sk_drops);
364 sk_mem_reclaim_partial(sk); 385 sk_mem_reclaim_partial(sk);
365
366 return err; 386 return err;
367} 387}
368EXPORT_SYMBOL(skb_kill_datagram); 388EXPORT_SYMBOL(skb_kill_datagram);
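skb_kill_datagram() is reduced to the new __sk_queue_drop_skb() helper, which only unlinks a peeked skb if it is still at the head of the receive queue and reports -ENOENT otherwise. A simplified sketch of that conditional drop, using a single-linked queue and hypothetical names:

/* Sketch of the __sk_queue_drop_skb() idea: after a peek, an item is
 * only unlinked if it is still at the head of the queue; otherwise the
 * drop is reported as -ENOENT. */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct item { struct item *next; };
struct queue { pthread_mutex_t lock; struct item *head; };

static int drop_peeked(struct queue *q, struct item *it)
{
	int err = -ENOENT;

	pthread_mutex_lock(&q->lock);
	if (it == q->head) {          /* nobody consumed it since the peek */
		q->head = it->next;
		err = 0;
	}
	pthread_mutex_unlock(&q->lock);
	return err;
}

int main(void)
{
	struct item a = { NULL };
	struct queue q = { PTHREAD_MUTEX_INITIALIZER, &a };

	printf("first drop:  %d\n", drop_peeked(&q, &a));
	printf("second drop: %d\n", drop_peeked(&q, &a));
	return 0;
}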
@@ -378,7 +398,7 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
378 struct iov_iter *to, int len) 398 struct iov_iter *to, int len)
379{ 399{
380 int start = skb_headlen(skb); 400 int start = skb_headlen(skb);
381 int i, copy = start - offset; 401 int i, copy = start - offset, start_off = offset, n;
382 struct sk_buff *frag_iter; 402 struct sk_buff *frag_iter;
383 403
384 trace_skb_copy_datagram_iovec(skb, len); 404 trace_skb_copy_datagram_iovec(skb, len);
@@ -387,11 +407,12 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
387 if (copy > 0) { 407 if (copy > 0) {
388 if (copy > len) 408 if (copy > len)
389 copy = len; 409 copy = len;
390 if (copy_to_iter(skb->data + offset, copy, to) != copy) 410 n = copy_to_iter(skb->data + offset, copy, to);
411 offset += n;
412 if (n != copy)
391 goto short_copy; 413 goto short_copy;
392 if ((len -= copy) == 0) 414 if ((len -= copy) == 0)
393 return 0; 415 return 0;
394 offset += copy;
395 } 416 }
396 417
397 /* Copy paged appendix. Hmm... why does this look so complicated? */ 418 /* Copy paged appendix. Hmm... why does this look so complicated? */
@@ -405,13 +426,14 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
405 if ((copy = end - offset) > 0) { 426 if ((copy = end - offset) > 0) {
406 if (copy > len) 427 if (copy > len)
407 copy = len; 428 copy = len;
408 if (copy_page_to_iter(skb_frag_page(frag), 429 n = copy_page_to_iter(skb_frag_page(frag),
409 frag->page_offset + offset - 430 frag->page_offset + offset -
410 start, copy, to) != copy) 431 start, copy, to);
432 offset += n;
433 if (n != copy)
411 goto short_copy; 434 goto short_copy;
412 if (!(len -= copy)) 435 if (!(len -= copy))
413 return 0; 436 return 0;
414 offset += copy;
415 } 437 }
416 start = end; 438 start = end;
417 } 439 }
@@ -443,6 +465,7 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
443 */ 465 */
444 466
445fault: 467fault:
468 iov_iter_revert(to, offset - start_off);
446 return -EFAULT; 469 return -EFAULT;
447 470
448short_copy: 471short_copy:
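Because offset is now advanced by the number of bytes copy_to_iter()/copy_page_to_iter() actually consumed, the fault path can call iov_iter_revert(to, offset - start_off) and hand the destination iterator back untouched. A userspace sketch of the track-consumed-bytes-and-rewind-on-failure idea, with a flat buffer standing in for an iov_iter:

/* Sketch of tracking how many bytes were consumed so a failure path
 * can rewind the destination cursor, analogous to iov_iter_revert().
 * The "iterator" here is just an offset into a flat buffer. */
#include <stdio.h>
#include <string.h>

struct iter { char *buf; size_t cap; size_t pos; };

static size_t iter_copy(struct iter *it, const char *src, size_t len)
{
	size_t n = it->cap - it->pos;

	if (n > len)
		n = len;
	memcpy(it->buf + it->pos, src, n);
	it->pos += n;
	return n;                      /* may be a short copy */
}

static int copy_two_chunks(struct iter *it, const char *a, size_t alen,
			   const char *b, size_t blen)
{
	size_t done = 0, n;

	n = iter_copy(it, a, alen); done += n;
	if (n != alen)
		goto fault;
	n = iter_copy(it, b, blen); done += n;
	if (n != blen)
		goto fault;
	return 0;
fault:
	it->pos -= done;               /* revert everything we consumed */
	return -1;
}

int main(void)
{
	char dst[8];
	struct iter it = { dst, sizeof(dst), 0 };

	printf("ret=%d pos=%zu\n",
	       copy_two_chunks(&it, "abcd", 4, "efghij", 6), it.pos);
	return 0;
}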
@@ -593,7 +616,7 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
593 __wsum *csump) 616 __wsum *csump)
594{ 617{
595 int start = skb_headlen(skb); 618 int start = skb_headlen(skb);
596 int i, copy = start - offset; 619 int i, copy = start - offset, start_off = offset;
597 struct sk_buff *frag_iter; 620 struct sk_buff *frag_iter;
598 int pos = 0; 621 int pos = 0;
599 int n; 622 int n;
@@ -603,11 +626,11 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
603 if (copy > len) 626 if (copy > len)
604 copy = len; 627 copy = len;
605 n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to); 628 n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to);
629 offset += n;
606 if (n != copy) 630 if (n != copy)
607 goto fault; 631 goto fault;
608 if ((len -= copy) == 0) 632 if ((len -= copy) == 0)
609 return 0; 633 return 0;
610 offset += copy;
611 pos = copy; 634 pos = copy;
612 } 635 }
613 636
@@ -629,12 +652,12 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
629 offset - start, copy, 652 offset - start, copy,
630 &csum2, to); 653 &csum2, to);
631 kunmap(page); 654 kunmap(page);
655 offset += n;
632 if (n != copy) 656 if (n != copy)
633 goto fault; 657 goto fault;
634 *csump = csum_block_add(*csump, csum2, pos); 658 *csump = csum_block_add(*csump, csum2, pos);
635 if (!(len -= copy)) 659 if (!(len -= copy))
636 return 0; 660 return 0;
637 offset += copy;
638 pos += copy; 661 pos += copy;
639 } 662 }
640 start = end; 663 start = end;
@@ -667,6 +690,7 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
667 return 0; 690 return 0;
668 691
669fault: 692fault:
693 iov_iter_revert(to, offset - start_off);
670 return -EFAULT; 694 return -EFAULT;
671} 695}
672 696
@@ -751,6 +775,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
751 } 775 }
752 return 0; 776 return 0;
753csum_error: 777csum_error:
778 iov_iter_revert(&msg->msg_iter, chunk);
754 return -EINVAL; 779 return -EINVAL;
755fault: 780fault:
756 return -EFAULT; 781 return -EFAULT;
diff --git a/net/core/dev.c b/net/core/dev.c
index 6666b28b6815..533a6d6f6092 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * NET3 Protocol independent device support routines. 2 * NET3 Protocol independent device support routines.
3 * 3 *
4 * This program is free software; you can redistribute it and/or 4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License 5 * modify it under the terms of the GNU General Public License
@@ -7,7 +7,7 @@
7 * 2 of the License, or (at your option) any later version. 7 * 2 of the License, or (at your option) any later version.
8 * 8 *
9 * Derived from the non IP parts of dev.c 1.0.19 9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro 10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * 13 *
@@ -21,9 +21,9 @@
21 * 21 *
22 * Changes: 22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set 23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called 24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a 25 * before net_dev_init & also removed a
26 * few lines of code in the process. 26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back. 27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant 28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe. 29 * stunts to keep the queue safe.
@@ -36,7 +36,7 @@
36 * Alan Cox : 100 backlog just doesn't cut it when 36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8) 37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager. 38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths. 39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass 40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler 41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before 42 * Alan Cox : Network driver sets packet type before
@@ -46,7 +46,7 @@
46 * Richard Kooijman: Timestamp fixes. 46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR 47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection. 48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close 49 * Alan Cox : Fixed nasty side effect of device close
50 * changes. 50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to 51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address() 52 * set_mac_address()
@@ -67,12 +67,12 @@
67 * Paul Rusty Russell : SIOCSIFNAME 67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code 68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait 69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt 70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling 71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback 72 * - netif_rx() feedback
73 */ 73 */
74 74
75#include <asm/uaccess.h> 75#include <linux/uaccess.h>
76#include <linux/bitops.h> 76#include <linux/bitops.h>
77#include <linux/capability.h> 77#include <linux/capability.h>
78#include <linux/cpu.h> 78#include <linux/cpu.h>
@@ -139,7 +139,6 @@
139#include <linux/errqueue.h> 139#include <linux/errqueue.h>
140#include <linux/hrtimer.h> 140#include <linux/hrtimer.h>
141#include <linux/netfilter_ingress.h> 141#include <linux/netfilter_ingress.h>
142#include <linux/sctp.h>
143#include <linux/crash_dump.h> 142#include <linux/crash_dump.h>
144 143
145#include "net-sysfs.h" 144#include "net-sysfs.h"
@@ -193,7 +192,8 @@ static seqcount_t devnet_rename_seq;
193 192
194static inline void dev_base_seq_inc(struct net *net) 193static inline void dev_base_seq_inc(struct net *net)
195{ 194{
196 while (++net->dev_base_seq == 0); 195 while (++net->dev_base_seq == 0)
196 ;
197} 197}
198 198
199static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) 199static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
@@ -275,8 +275,8 @@ EXPORT_PER_CPU_SYMBOL(softnet_data);
275 * register_netdevice() inits txq->_xmit_lock and sets lockdep class 275 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
276 * according to dev->type 276 * according to dev->type
277 */ 277 */
278static const unsigned short netdev_lock_type[] = 278static const unsigned short netdev_lock_type[] = {
279 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, 279 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
280 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, 280 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
281 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, 281 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
282 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, 282 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
@@ -292,22 +292,22 @@ static const unsigned short netdev_lock_type[] =
292 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, 292 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
293 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; 293 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
294 294
295static const char *const netdev_lock_name[] = 295static const char *const netdev_lock_name[] = {
296 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", 296 "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
297 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", 297 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
298 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", 298 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
299 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", 299 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
300 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", 300 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
301 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", 301 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
302 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", 302 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
303 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", 303 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
304 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", 304 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
305 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", 305 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
306 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", 306 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
307 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", 307 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
308 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", 308 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
309 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", 309 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
310 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; 310 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
311 311
312static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; 312static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
313static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; 313static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
@@ -353,10 +353,11 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
353#endif 353#endif
354 354
355/******************************************************************************* 355/*******************************************************************************
356 *
357 * Protocol management and registration routines
358 *
359 *******************************************************************************/
356 360
357 Protocol management and registration routines
358
359*******************************************************************************/
360 361
361/* 362/*
362 * Add a protocol ID to the list. Now that the input handler is 363 * Add a protocol ID to the list. Now that the input handler is
@@ -539,10 +540,10 @@ void dev_remove_offload(struct packet_offload *po)
539EXPORT_SYMBOL(dev_remove_offload); 540EXPORT_SYMBOL(dev_remove_offload);
540 541
541/****************************************************************************** 542/******************************************************************************
542 543 *
543 Device Boot-time Settings Routines 544 * Device Boot-time Settings Routines
544 545 *
545*******************************************************************************/ 546 ******************************************************************************/
546 547
547/* Boot time configuration table */ 548/* Boot time configuration table */
548static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; 549static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
@@ -575,13 +576,13 @@ static int netdev_boot_setup_add(char *name, struct ifmap *map)
575} 576}
576 577
577/** 578/**
578 * netdev_boot_setup_check - check boot time settings 579 * netdev_boot_setup_check - check boot time settings
579 * @dev: the netdevice 580 * @dev: the netdevice
580 * 581 *
581 * Check boot time settings for the device. 582 * Check boot time settings for the device.
582 * The found settings are set for the device to be used 583 * The found settings are set for the device to be used
583 * later in the device probing. 584 * later in the device probing.
584 * Returns 0 if no settings found, 1 if they are. 585 * Returns 0 if no settings found, 1 if they are.
585 */ 586 */
586int netdev_boot_setup_check(struct net_device *dev) 587int netdev_boot_setup_check(struct net_device *dev)
587{ 588{
@@ -591,10 +592,10 @@ int netdev_boot_setup_check(struct net_device *dev)
591 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { 592 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
592 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && 593 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
593 !strcmp(dev->name, s[i].name)) { 594 !strcmp(dev->name, s[i].name)) {
594 dev->irq = s[i].map.irq; 595 dev->irq = s[i].map.irq;
595 dev->base_addr = s[i].map.base_addr; 596 dev->base_addr = s[i].map.base_addr;
596 dev->mem_start = s[i].map.mem_start; 597 dev->mem_start = s[i].map.mem_start;
597 dev->mem_end = s[i].map.mem_end; 598 dev->mem_end = s[i].map.mem_end;
598 return 1; 599 return 1;
599 } 600 }
600 } 601 }
@@ -604,14 +605,14 @@ EXPORT_SYMBOL(netdev_boot_setup_check);
604 605
605 606
606/** 607/**
607 * netdev_boot_base - get address from boot time settings 608 * netdev_boot_base - get address from boot time settings
608 * @prefix: prefix for network device 609 * @prefix: prefix for network device
609 * @unit: id for network device 610 * @unit: id for network device
610 * 611 *
611 * Check boot time settings for the base address of device. 612 * Check boot time settings for the base address of device.
612 * The found settings are set for the device to be used 613 * The found settings are set for the device to be used
613 * later in the device probing. 614 * later in the device probing.
614 * Returns 0 if no settings found. 615 * Returns 0 if no settings found.
615 */ 616 */
616unsigned long netdev_boot_base(const char *prefix, int unit) 617unsigned long netdev_boot_base(const char *prefix, int unit)
617{ 618{
@@ -664,10 +665,10 @@ int __init netdev_boot_setup(char *str)
664__setup("netdev=", netdev_boot_setup); 665__setup("netdev=", netdev_boot_setup);
665 666
666/******************************************************************************* 667/*******************************************************************************
667 668 *
668 Device Interface Subroutines 669 * Device Interface Subroutines
669 670 *
670*******************************************************************************/ 671 *******************************************************************************/
671 672
672/** 673/**
673 * dev_get_iflink - get 'iflink' value of a interface 674 * dev_get_iflink - get 'iflink' value of a interface
@@ -738,15 +739,15 @@ struct net_device *__dev_get_by_name(struct net *net, const char *name)
738EXPORT_SYMBOL(__dev_get_by_name); 739EXPORT_SYMBOL(__dev_get_by_name);
739 740
740/** 741/**
741 * dev_get_by_name_rcu - find a device by its name 742 * dev_get_by_name_rcu - find a device by its name
742 * @net: the applicable net namespace 743 * @net: the applicable net namespace
743 * @name: name to find 744 * @name: name to find
744 * 745 *
745 * Find an interface by name. 746 * Find an interface by name.
746 * If the name is found a pointer to the device is returned. 747 * If the name is found a pointer to the device is returned.
747 * If the name is not found then %NULL is returned. 748 * If the name is not found then %NULL is returned.
748 * The reference counters are not incremented so the caller must be 749 * The reference counters are not incremented so the caller must be
749 * careful with locks. The caller must hold RCU lock. 750 * careful with locks. The caller must hold RCU lock.
750 */ 751 */
751 752
752struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) 753struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
@@ -1290,8 +1291,8 @@ void netdev_state_change(struct net_device *dev)
1290EXPORT_SYMBOL(netdev_state_change); 1291EXPORT_SYMBOL(netdev_state_change);
1291 1292
1292/** 1293/**
1293 * netdev_notify_peers - notify network peers about existence of @dev 1294 * netdev_notify_peers - notify network peers about existence of @dev
1294 * @dev: network device 1295 * @dev: network device
1295 * 1296 *
1296 * Generate traffic such that interested network peers are aware of 1297 * Generate traffic such that interested network peers are aware of
1297 * @dev, such as by generating a gratuitous ARP. This may be used when 1298 * @dev, such as by generating a gratuitous ARP. This may be used when
@@ -1303,6 +1304,7 @@ void netdev_notify_peers(struct net_device *dev)
1303{ 1304{
1304 rtnl_lock(); 1305 rtnl_lock();
1305 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); 1306 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1307 call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1306 rtnl_unlock(); 1308 rtnl_unlock();
1307} 1309}
1308EXPORT_SYMBOL(netdev_notify_peers); 1310EXPORT_SYMBOL(netdev_notify_peers);
@@ -1519,17 +1521,17 @@ static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1519static int dev_boot_phase = 1; 1521static int dev_boot_phase = 1;
1520 1522
1521/** 1523/**
1522 * register_netdevice_notifier - register a network notifier block 1524 * register_netdevice_notifier - register a network notifier block
1523 * @nb: notifier 1525 * @nb: notifier
1524 * 1526 *
1525 * Register a notifier to be called when network device events occur. 1527 * Register a notifier to be called when network device events occur.
1526 * The notifier passed is linked into the kernel structures and must 1528 * The notifier passed is linked into the kernel structures and must
1527 * not be reused until it has been unregistered. A negative errno code 1529 * not be reused until it has been unregistered. A negative errno code
1528 * is returned on a failure. 1530 * is returned on a failure.
1529 * 1531 *
1530 * When registered all registration and up events are replayed 1532 * When registered all registration and up events are replayed
1531 * to the new notifier to allow device to have a race free 1533 * to the new notifier to allow device to have a race free
1532 * view of the network device list. 1534 * view of the network device list.
1533 */ 1535 */
1534 1536
1535int register_netdevice_notifier(struct notifier_block *nb) 1537int register_netdevice_notifier(struct notifier_block *nb)
@@ -1586,17 +1588,17 @@ outroll:
1586EXPORT_SYMBOL(register_netdevice_notifier); 1588EXPORT_SYMBOL(register_netdevice_notifier);
1587 1589
1588/** 1590/**
1589 * unregister_netdevice_notifier - unregister a network notifier block 1591 * unregister_netdevice_notifier - unregister a network notifier block
1590 * @nb: notifier 1592 * @nb: notifier
1591 * 1593 *
1592 * Unregister a notifier previously registered by 1594 * Unregister a notifier previously registered by
1593 * register_netdevice_notifier(). The notifier is unlinked into the 1595 * register_netdevice_notifier(). The notifier is unlinked into the
1594 * kernel structures and may then be reused. A negative errno code 1596 * kernel structures and may then be reused. A negative errno code
1595 * is returned on a failure. 1597 * is returned on a failure.
1596 * 1598 *
1597 * After unregistering unregister and down device events are synthesized 1599 * After unregistering unregister and down device events are synthesized
1598 * for all devices on the device list to the removed notifier to remove 1600 * for all devices on the device list to the removed notifier to remove
1599 * the need for special case cleanup code. 1601 * the need for special case cleanup code.
1600 */ 1602 */
1601 1603
1602int unregister_netdevice_notifier(struct notifier_block *nb) 1604int unregister_netdevice_notifier(struct notifier_block *nb)
@@ -1696,50 +1698,72 @@ EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1696 1698
1697static struct static_key netstamp_needed __read_mostly; 1699static struct static_key netstamp_needed __read_mostly;
1698#ifdef HAVE_JUMP_LABEL 1700#ifdef HAVE_JUMP_LABEL
1699/* We are not allowed to call static_key_slow_dec() from irq context
1700 * If net_disable_timestamp() is called from irq context, defer the
1701 * static_key_slow_dec() calls.
1702 */
1703static atomic_t netstamp_needed_deferred; 1701static atomic_t netstamp_needed_deferred;
1702static atomic_t netstamp_wanted;
1703static void netstamp_clear(struct work_struct *work)
1704{
1705 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1706 int wanted;
1707
1708 wanted = atomic_add_return(deferred, &netstamp_wanted);
1709 if (wanted > 0)
1710 static_key_enable(&netstamp_needed);
1711 else
1712 static_key_disable(&netstamp_needed);
1713}
1714static DECLARE_WORK(netstamp_work, netstamp_clear);
1704#endif 1715#endif
1705 1716
1706void net_enable_timestamp(void) 1717void net_enable_timestamp(void)
1707{ 1718{
1708#ifdef HAVE_JUMP_LABEL 1719#ifdef HAVE_JUMP_LABEL
1709 int deferred = atomic_xchg(&netstamp_needed_deferred, 0); 1720 int wanted;
1710 1721
1711 if (deferred) { 1722 while (1) {
1712 while (--deferred) 1723 wanted = atomic_read(&netstamp_wanted);
1713 static_key_slow_dec(&netstamp_needed); 1724 if (wanted <= 0)
1714 return; 1725 break;
1726 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1727 return;
1715 } 1728 }
1716#endif 1729 atomic_inc(&netstamp_needed_deferred);
1730 schedule_work(&netstamp_work);
1731#else
1717 static_key_slow_inc(&netstamp_needed); 1732 static_key_slow_inc(&netstamp_needed);
1733#endif
1718} 1734}
1719EXPORT_SYMBOL(net_enable_timestamp); 1735EXPORT_SYMBOL(net_enable_timestamp);
1720 1736
1721void net_disable_timestamp(void) 1737void net_disable_timestamp(void)
1722{ 1738{
1723#ifdef HAVE_JUMP_LABEL 1739#ifdef HAVE_JUMP_LABEL
1724 if (in_interrupt()) { 1740 int wanted;
1725 atomic_inc(&netstamp_needed_deferred); 1741
1726 return; 1742 while (1) {
1743 wanted = atomic_read(&netstamp_wanted);
1744 if (wanted <= 1)
1745 break;
1746 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1747 return;
1727 } 1748 }
1728#endif 1749 atomic_dec(&netstamp_needed_deferred);
1750 schedule_work(&netstamp_work);
1751#else
1729 static_key_slow_dec(&netstamp_needed); 1752 static_key_slow_dec(&netstamp_needed);
1753#endif
1730} 1754}
1731EXPORT_SYMBOL(net_disable_timestamp); 1755EXPORT_SYMBOL(net_disable_timestamp);
1732 1756
1733static inline void net_timestamp_set(struct sk_buff *skb) 1757static inline void net_timestamp_set(struct sk_buff *skb)
1734{ 1758{
1735 skb->tstamp.tv64 = 0; 1759 skb->tstamp = 0;
1736 if (static_key_false(&netstamp_needed)) 1760 if (static_key_false(&netstamp_needed))
1737 __net_timestamp(skb); 1761 __net_timestamp(skb);
1738} 1762}
1739 1763
1740#define net_timestamp_check(COND, SKB) \ 1764#define net_timestamp_check(COND, SKB) \
1741 if (static_key_false(&netstamp_needed)) { \ 1765 if (static_key_false(&netstamp_needed)) { \
1742 if ((COND) && !(SKB)->tstamp.tv64) \ 1766 if ((COND) && !(SKB)->tstamp) \
1743 __net_timestamp(SKB); \ 1767 __net_timestamp(SKB); \
1744 } \ 1768 } \
1745 1769
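net_enable_timestamp()/net_disable_timestamp() now maintain a netstamp_wanted count with a cmpxchg loop: while the count stays away from zero the call is a plain reference-count adjustment, and only transitions across zero defer the static-key flip to a workqueue. A C11-atomics sketch of that loop follows; the slow path here is stubbed with a print and an immediate counter update that stands in for the deferred work.

/* Sketch of the cmpxchg loop used by net_enable_timestamp(): bump a
 * "wanted" counter only while it stays positive; crossing zero falls
 * back to a slow path (modelled here by a print plus a direct counter
 * update, standing in for the deferred static-key work). */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int wanted = 0;

static void enable(void)
{
	int w;

	for (;;) {
		w = atomic_load(&wanted);
		if (w <= 0)
			break;                       /* key not enabled yet */
		if (atomic_compare_exchange_weak(&wanted, &w, w + 1))
			return;                      /* fast path: just a ref */
	}
	printf("slow path: schedule work to enable the key\n");
	atomic_fetch_add(&wanted, 1);                /* models the work completing */
}

static void disable(void)
{
	int w;

	for (;;) {
		w = atomic_load(&wanted);
		if (w <= 1)
			break;                       /* last user */
		if (atomic_compare_exchange_weak(&wanted, &w, w - 1))
			return;
	}
	printf("slow path: schedule work to disable the key\n");
	atomic_fetch_sub(&wanted, 1);                /* models the work completing */
}

int main(void)
{
	enable();   /* first user -> slow path */
	enable();   /* fast path */
	disable();  /* fast path */
	disable();  /* last user -> slow path */
	printf("wanted=%d\n", atomic_load(&wanted));
	return 0;
}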
@@ -1944,37 +1968,80 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1944 } 1968 }
1945} 1969}
1946 1970
1971int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
1972{
1973 if (dev->num_tc) {
1974 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1975 int i;
1976
1977 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
1978 if ((txq - tc->offset) < tc->count)
1979 return i;
1980 }
1981
1982 return -1;
1983 }
1984
1985 return 0;
1986}
1987
1947#ifdef CONFIG_XPS 1988#ifdef CONFIG_XPS
1948static DEFINE_MUTEX(xps_map_mutex); 1989static DEFINE_MUTEX(xps_map_mutex);
1949#define xmap_dereference(P) \ 1990#define xmap_dereference(P) \
1950 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) 1991 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1951 1992
1952static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps, 1993static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
1953 int cpu, u16 index) 1994 int tci, u16 index)
1954{ 1995{
1955 struct xps_map *map = NULL; 1996 struct xps_map *map = NULL;
1956 int pos; 1997 int pos;
1957 1998
1958 if (dev_maps) 1999 if (dev_maps)
1959 map = xmap_dereference(dev_maps->cpu_map[cpu]); 2000 map = xmap_dereference(dev_maps->cpu_map[tci]);
2001 if (!map)
2002 return false;
1960 2003
1961 for (pos = 0; map && pos < map->len; pos++) { 2004 for (pos = map->len; pos--;) {
1962 if (map->queues[pos] == index) { 2005 if (map->queues[pos] != index)
1963 if (map->len > 1) { 2006 continue;
1964 map->queues[pos] = map->queues[--map->len]; 2007
1965 } else { 2008 if (map->len > 1) {
1966 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL); 2009 map->queues[pos] = map->queues[--map->len];
1967 kfree_rcu(map, rcu);
1968 map = NULL;
1969 }
1970 break; 2010 break;
1971 } 2011 }
2012
2013 RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
2014 kfree_rcu(map, rcu);
2015 return false;
1972 } 2016 }
1973 2017
1974 return map; 2018 return true;
1975} 2019}
1976 2020
1977static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) 2021static bool remove_xps_queue_cpu(struct net_device *dev,
2022 struct xps_dev_maps *dev_maps,
2023 int cpu, u16 offset, u16 count)
2024{
2025 int num_tc = dev->num_tc ? : 1;
2026 bool active = false;
2027 int tci;
2028
2029 for (tci = cpu * num_tc; num_tc--; tci++) {
2030 int i, j;
2031
2032 for (i = count, j = offset; i--; j++) {
2033 if (!remove_xps_queue(dev_maps, cpu, j))
2034 break;
2035 }
2036
2037 active |= i < 0;
2038 }
2039
2040 return active;
2041}
2042
2043static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2044 u16 count)
1978{ 2045{
1979 struct xps_dev_maps *dev_maps; 2046 struct xps_dev_maps *dev_maps;
1980 int cpu, i; 2047 int cpu, i;
@@ -1986,21 +2053,16 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1986 if (!dev_maps) 2053 if (!dev_maps)
1987 goto out_no_maps; 2054 goto out_no_maps;
1988 2055
1989 for_each_possible_cpu(cpu) { 2056 for_each_possible_cpu(cpu)
1990 for (i = index; i < dev->num_tx_queues; i++) { 2057 active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
1991 if (!remove_xps_queue(dev_maps, cpu, i)) 2058 offset, count);
1992 break;
1993 }
1994 if (i == dev->num_tx_queues)
1995 active = true;
1996 }
1997 2059
1998 if (!active) { 2060 if (!active) {
1999 RCU_INIT_POINTER(dev->xps_maps, NULL); 2061 RCU_INIT_POINTER(dev->xps_maps, NULL);
2000 kfree_rcu(dev_maps, rcu); 2062 kfree_rcu(dev_maps, rcu);
2001 } 2063 }
2002 2064
2003 for (i = index; i < dev->num_tx_queues; i++) 2065 for (i = offset + (count - 1); count--; i--)
2004 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), 2066 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2005 NUMA_NO_NODE); 2067 NUMA_NO_NODE);
2006 2068
@@ -2008,6 +2070,11 @@ out_no_maps:
2008 mutex_unlock(&xps_map_mutex); 2070 mutex_unlock(&xps_map_mutex);
2009} 2071}
2010 2072
2073static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2074{
2075 netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2076}
2077
2011static struct xps_map *expand_xps_map(struct xps_map *map, 2078static struct xps_map *expand_xps_map(struct xps_map *map,
2012 int cpu, u16 index) 2079 int cpu, u16 index)
2013{ 2080{
@@ -2047,20 +2114,28 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2047 u16 index) 2114 u16 index)
2048{ 2115{
2049 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; 2116 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2117 int i, cpu, tci, numa_node_id = -2;
2118 int maps_sz, num_tc = 1, tc = 0;
2050 struct xps_map *map, *new_map; 2119 struct xps_map *map, *new_map;
2051 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2052 int cpu, numa_node_id = -2;
2053 bool active = false; 2120 bool active = false;
2054 2121
2122 if (dev->num_tc) {
2123 num_tc = dev->num_tc;
2124 tc = netdev_txq_to_tc(dev, index);
2125 if (tc < 0)
2126 return -EINVAL;
2127 }
2128
2129 maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
2130 if (maps_sz < L1_CACHE_BYTES)
2131 maps_sz = L1_CACHE_BYTES;
2132
2055 mutex_lock(&xps_map_mutex); 2133 mutex_lock(&xps_map_mutex);
2056 2134
2057 dev_maps = xmap_dereference(dev->xps_maps); 2135 dev_maps = xmap_dereference(dev->xps_maps);
2058 2136
2059 /* allocate memory for queue storage */ 2137 /* allocate memory for queue storage */
2060 for_each_online_cpu(cpu) { 2138 for_each_cpu_and(cpu, cpu_online_mask, mask) {
2061 if (!cpumask_test_cpu(cpu, mask))
2062 continue;
2063
2064 if (!new_dev_maps) 2139 if (!new_dev_maps)
2065 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); 2140 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2066 if (!new_dev_maps) { 2141 if (!new_dev_maps) {
@@ -2068,25 +2143,38 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2068 return -ENOMEM; 2143 return -ENOMEM;
2069 } 2144 }
2070 2145
2071 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : 2146 tci = cpu * num_tc + tc;
2147 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
2072 NULL; 2148 NULL;
2073 2149
2074 map = expand_xps_map(map, cpu, index); 2150 map = expand_xps_map(map, cpu, index);
2075 if (!map) 2151 if (!map)
2076 goto error; 2152 goto error;
2077 2153
2078 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); 2154 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2079 } 2155 }
2080 2156
2081 if (!new_dev_maps) 2157 if (!new_dev_maps)
2082 goto out_no_new_maps; 2158 goto out_no_new_maps;
2083 2159
2084 for_each_possible_cpu(cpu) { 2160 for_each_possible_cpu(cpu) {
2161 /* copy maps belonging to foreign traffic classes */
2162 for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
2163 /* fill in the new device map from the old device map */
2164 map = xmap_dereference(dev_maps->cpu_map[tci]);
2165 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2166 }
2167
 2168 /* We need to explicitly update tci as previous loop
2169 * could break out early if dev_maps is NULL.
2170 */
2171 tci = cpu * num_tc + tc;
2172
2085 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { 2173 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2086 /* add queue to CPU maps */ 2174 /* add queue to CPU maps */
2087 int pos = 0; 2175 int pos = 0;
2088 2176
2089 map = xmap_dereference(new_dev_maps->cpu_map[cpu]); 2177 map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2090 while ((pos < map->len) && (map->queues[pos] != index)) 2178 while ((pos < map->len) && (map->queues[pos] != index))
2091 pos++; 2179 pos++;
2092 2180
@@ -2100,26 +2188,36 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2100#endif 2188#endif
2101 } else if (dev_maps) { 2189 } else if (dev_maps) {
2102 /* fill in the new device map from the old device map */ 2190 /* fill in the new device map from the old device map */
2103 map = xmap_dereference(dev_maps->cpu_map[cpu]); 2191 map = xmap_dereference(dev_maps->cpu_map[tci]);
2104 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); 2192 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2105 } 2193 }
2106 2194
2195 /* copy maps belonging to foreign traffic classes */
2196 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2197 /* fill in the new device map from the old device map */
2198 map = xmap_dereference(dev_maps->cpu_map[tci]);
2199 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2200 }
2107 } 2201 }
2108 2202
2109 rcu_assign_pointer(dev->xps_maps, new_dev_maps); 2203 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2110 2204
2111 /* Cleanup old maps */ 2205 /* Cleanup old maps */
2112 if (dev_maps) { 2206 if (!dev_maps)
2113 for_each_possible_cpu(cpu) { 2207 goto out_no_old_maps;
2114 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); 2208
2115 map = xmap_dereference(dev_maps->cpu_map[cpu]); 2209 for_each_possible_cpu(cpu) {
2210 for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2211 new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2212 map = xmap_dereference(dev_maps->cpu_map[tci]);
2116 if (map && map != new_map) 2213 if (map && map != new_map)
2117 kfree_rcu(map, rcu); 2214 kfree_rcu(map, rcu);
2118 } 2215 }
2119
2120 kfree_rcu(dev_maps, rcu);
2121 } 2216 }
2122 2217
2218 kfree_rcu(dev_maps, rcu);
2219
2220out_no_old_maps:
2123 dev_maps = new_dev_maps; 2221 dev_maps = new_dev_maps;
2124 active = true; 2222 active = true;
2125 2223
@@ -2134,11 +2232,12 @@ out_no_new_maps:
2134 2232
2135 /* removes queue from unused CPUs */ 2233 /* removes queue from unused CPUs */
2136 for_each_possible_cpu(cpu) { 2234 for_each_possible_cpu(cpu) {
2137 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) 2235 for (i = tc, tci = cpu * num_tc; i--; tci++)
2138 continue; 2236 active |= remove_xps_queue(dev_maps, tci, index);
2139 2237 if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
2140 if (remove_xps_queue(dev_maps, cpu, index)) 2238 active |= remove_xps_queue(dev_maps, tci, index);
2141 active = true; 2239 for (i = num_tc - tc, tci++; --i; tci++)
2240 active |= remove_xps_queue(dev_maps, tci, index);
2142 } 2241 }
2143 2242
2144 /* free map if not active */ 2243 /* free map if not active */
@@ -2154,11 +2253,14 @@ out_no_maps:
2154error: 2253error:
2155 /* remove any maps that we added */ 2254 /* remove any maps that we added */
2156 for_each_possible_cpu(cpu) { 2255 for_each_possible_cpu(cpu) {
2157 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); 2256 for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2158 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : 2257 new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2159 NULL; 2258 map = dev_maps ?
2160 if (new_map && new_map != map) 2259 xmap_dereference(dev_maps->cpu_map[tci]) :
2161 kfree(new_map); 2260 NULL;
2261 if (new_map && new_map != map)
2262 kfree(new_map);
2263 }
2162 } 2264 }
2163 2265
2164 mutex_unlock(&xps_map_mutex); 2266 mutex_unlock(&xps_map_mutex);
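Throughout this XPS rework, the per-CPU map array is addressed with tci = cpu * num_tc + tc, flattening a (cpu, traffic class) pair into one index of a single cpu_map[] array. A small sketch of that flattening, with arbitrary sizes and stored values:

/* Sketch of the per-CPU, per-traffic-class index flattening used above:
 * the entry for (cpu, tc) lives at tci = cpu * num_tc + tc in one flat
 * array. Sizes and contents are illustrative. */
#include <stdio.h>
#include <stdlib.h>

struct dev_maps { int num_cpus; int num_tc; int *cpu_map; };

static int *map_entry(struct dev_maps *m, int cpu, int tc)
{
	int tci = cpu * m->num_tc + tc;      /* same formula as the patch */

	return &m->cpu_map[tci];
}

int main(void)
{
	struct dev_maps m = { 4, 3, NULL };
	int cpu, tc;

	m.cpu_map = calloc((size_t)(m.num_cpus * m.num_tc), sizeof(int));
	if (!m.cpu_map)
		return 1;

	for (cpu = 0; cpu < m.num_cpus; cpu++)
		for (tc = 0; tc < m.num_tc; tc++)
			*map_entry(&m, cpu, tc) = cpu * 100 + tc;

	printf("cpu 2, tc 1 -> %d\n", *map_entry(&m, 2, 1));
	free(m.cpu_map);
	return 0;
}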
@@ -2169,6 +2271,44 @@ error:
2169EXPORT_SYMBOL(netif_set_xps_queue); 2271EXPORT_SYMBOL(netif_set_xps_queue);
2170 2272
2171#endif 2273#endif
2274void netdev_reset_tc(struct net_device *dev)
2275{
2276#ifdef CONFIG_XPS
2277 netif_reset_xps_queues_gt(dev, 0);
2278#endif
2279 dev->num_tc = 0;
2280 memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2281 memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2282}
2283EXPORT_SYMBOL(netdev_reset_tc);
2284
2285int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2286{
2287 if (tc >= dev->num_tc)
2288 return -EINVAL;
2289
2290#ifdef CONFIG_XPS
2291 netif_reset_xps_queues(dev, offset, count);
2292#endif
2293 dev->tc_to_txq[tc].count = count;
2294 dev->tc_to_txq[tc].offset = offset;
2295 return 0;
2296}
2297EXPORT_SYMBOL(netdev_set_tc_queue);
2298
2299int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2300{
2301 if (num_tc > TC_MAX_QUEUE)
2302 return -EINVAL;
2303
2304#ifdef CONFIG_XPS
2305 netif_reset_xps_queues_gt(dev, 0);
2306#endif
2307 dev->num_tc = num_tc;
2308 return 0;
2309}
2310EXPORT_SYMBOL(netdev_set_num_tc);
2311
2172/* 2312/*
2173 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues 2313 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2174 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. 2314 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
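netdev_set_tc_queue() records each traffic class as a contiguous [offset, offset + count) range of tx queues, and netdev_txq_to_tc(), added earlier in this patch, walks those ranges to map a queue back to its class. A compact sketch of that table and reverse lookup, with made-up queue counts:

/* Sketch of the tc_to_txq table filled by netdev_set_tc_queue() and
 * the reverse lookup netdev_txq_to_tc() performs: each traffic class
 * owns a contiguous [offset, offset + count) range of tx queues. */
#include <stdio.h>

#define MAX_TC 8

struct tc_range { unsigned int count, offset; };

static struct tc_range tc_to_txq[MAX_TC];
static int num_tc;

static int set_tc_queue(int tc, unsigned int count, unsigned int offset)
{
	if (tc < 0 || tc >= num_tc)
		return -1;
	tc_to_txq[tc].count = count;
	tc_to_txq[tc].offset = offset;
	return 0;
}

static int txq_to_tc(unsigned int txq)
{
	int i;

	for (i = 0; i < num_tc; i++)
		if (txq - tc_to_txq[i].offset < tc_to_txq[i].count)
			return i;
	return -1;                        /* queue not covered by any class */
}

int main(void)
{
	num_tc = 2;
	set_tc_queue(0, 4, 0);            /* tc0: queues 0..3 */
	set_tc_queue(1, 4, 4);            /* tc1: queues 4..7 */
	printf("txq 5 -> tc %d, txq 9 -> tc %d\n", txq_to_tc(5), txq_to_tc(9));
	return 0;
}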
@@ -2293,28 +2433,6 @@ void netif_schedule_queue(struct netdev_queue *txq)
2293} 2433}
2294EXPORT_SYMBOL(netif_schedule_queue); 2434EXPORT_SYMBOL(netif_schedule_queue);
2295 2435
2296/**
2297 * netif_wake_subqueue - allow sending packets on subqueue
2298 * @dev: network device
2299 * @queue_index: sub queue index
2300 *
2301 * Resume individual transmit queue of a device with multiple transmit queues.
2302 */
2303void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2304{
2305 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2306
2307 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2308 struct Qdisc *q;
2309
2310 rcu_read_lock();
2311 q = rcu_dereference(txq->qdisc);
2312 __netif_schedule(q);
2313 rcu_read_unlock();
2314 }
2315}
2316EXPORT_SYMBOL(netif_wake_subqueue);
2317
2318void netif_tx_wake_queue(struct netdev_queue *dev_queue) 2436void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2319{ 2437{
2320 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { 2438 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
@@ -2408,6 +2526,7 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2408 2526
2409 if (dev->num_tc) { 2527 if (dev->num_tc) {
2410 u8 tc = netdev_get_prio_tc_map(dev, skb->priority); 2528 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2529
2411 qoffset = dev->tc_to_txq[tc].offset; 2530 qoffset = dev->tc_to_txq[tc].offset;
2412 qcount = dev->tc_to_txq[tc].count; 2531 qcount = dev->tc_to_txq[tc].count;
2413 } 2532 }
@@ -2487,141 +2606,6 @@ out:
2487} 2606}
2488EXPORT_SYMBOL(skb_checksum_help); 2607EXPORT_SYMBOL(skb_checksum_help);
2489 2608
2490/* skb_csum_offload_check - Driver helper function to determine if a device
2491 * with limited checksum offload capabilities is able to offload the checksum
2492 * for a given packet.
2493 *
2494 * Arguments:
2495 * skb - sk_buff for the packet in question
2496 * spec - contains the description of what device can offload
2497 * csum_encapped - returns true if the checksum being offloaded is
2498 * encpasulated. That is it is checksum for the transport header
2499 * in the inner headers.
2500 * checksum_help - when set indicates that helper function should
2501 * call skb_checksum_help if offload checks fail
2502 *
2503 * Returns:
2504 * true: Packet has passed the checksum checks and should be offloadable to
2505 * the device (a driver may still need to check for additional
2506 * restrictions of its device)
2507 * false: Checksum is not offloadable. If checksum_help was set then
2508 * skb_checksum_help was called to resolve checksum for non-GSO
2509 * packets and when IP protocol is not SCTP
2510 */
2511bool __skb_csum_offload_chk(struct sk_buff *skb,
2512 const struct skb_csum_offl_spec *spec,
2513 bool *csum_encapped,
2514 bool csum_help)
2515{
2516 struct iphdr *iph;
2517 struct ipv6hdr *ipv6;
2518 void *nhdr;
2519 int protocol;
2520 u8 ip_proto;
2521
2522 if (skb->protocol == htons(ETH_P_8021Q) ||
2523 skb->protocol == htons(ETH_P_8021AD)) {
2524 if (!spec->vlan_okay)
2525 goto need_help;
2526 }
2527
2528 /* We check whether the checksum refers to a transport layer checksum in
2529 * the outermost header or an encapsulated transport layer checksum that
2530 * corresponds to the inner headers of the skb. If the checksum is for
2531 * something else in the packet we need help.
2532 */
2533 if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
2534 /* Non-encapsulated checksum */
2535 protocol = eproto_to_ipproto(vlan_get_protocol(skb));
2536 nhdr = skb_network_header(skb);
2537 *csum_encapped = false;
2538 if (spec->no_not_encapped)
2539 goto need_help;
2540 } else if (skb->encapsulation && spec->encap_okay &&
2541 skb_checksum_start_offset(skb) ==
2542 skb_inner_transport_offset(skb)) {
2543 /* Encapsulated checksum */
2544 *csum_encapped = true;
2545 switch (skb->inner_protocol_type) {
2546 case ENCAP_TYPE_ETHER:
2547 protocol = eproto_to_ipproto(skb->inner_protocol);
2548 break;
2549 case ENCAP_TYPE_IPPROTO:
2550 protocol = skb->inner_protocol;
2551 break;
2552 }
2553 nhdr = skb_inner_network_header(skb);
2554 } else {
2555 goto need_help;
2556 }
2557
2558 switch (protocol) {
2559 case IPPROTO_IP:
2560 if (!spec->ipv4_okay)
2561 goto need_help;
2562 iph = nhdr;
2563 ip_proto = iph->protocol;
2564 if (iph->ihl != 5 && !spec->ip_options_okay)
2565 goto need_help;
2566 break;
2567 case IPPROTO_IPV6:
2568 if (!spec->ipv6_okay)
2569 goto need_help;
2570 if (spec->no_encapped_ipv6 && *csum_encapped)
2571 goto need_help;
2572 ipv6 = nhdr;
2573 nhdr += sizeof(*ipv6);
2574 ip_proto = ipv6->nexthdr;
2575 break;
2576 default:
2577 goto need_help;
2578 }
2579
2580ip_proto_again:
2581 switch (ip_proto) {
2582 case IPPROTO_TCP:
2583 if (!spec->tcp_okay ||
2584 skb->csum_offset != offsetof(struct tcphdr, check))
2585 goto need_help;
2586 break;
2587 case IPPROTO_UDP:
2588 if (!spec->udp_okay ||
2589 skb->csum_offset != offsetof(struct udphdr, check))
2590 goto need_help;
2591 break;
2592 case IPPROTO_SCTP:
2593 if (!spec->sctp_okay ||
2594 skb->csum_offset != offsetof(struct sctphdr, checksum))
2595 goto cant_help;
2596 break;
2597 case NEXTHDR_HOP:
2598 case NEXTHDR_ROUTING:
2599 case NEXTHDR_DEST: {
2600 u8 *opthdr = nhdr;
2601
2602 if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
2603 goto need_help;
2604
2605 ip_proto = opthdr[0];
2606 nhdr += (opthdr[1] + 1) << 3;
2607
2608 goto ip_proto_again;
2609 }
2610 default:
2611 goto need_help;
2612 }
2613
2614 /* Passed the tests for offloading checksum */
2615 return true;
2616
2617need_help:
2618 if (csum_help && !skb_shinfo(skb)->gso_size)
2619 skb_checksum_help(skb);
2620cant_help:
2621 return false;
2622}
2623EXPORT_SYMBOL(__skb_csum_offload_chk);
2624
2625__be16 skb_network_protocol(struct sk_buff *skb, int *depth) 2609__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2626{ 2610{
2627 __be16 type = skb->protocol; 2611 __be16 type = skb->protocol;
@@ -2679,9 +2663,10 @@ EXPORT_SYMBOL(skb_mac_gso_segment);
2679static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) 2663static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2680{ 2664{
2681 if (tx_path) 2665 if (tx_path)
2682 return skb->ip_summed != CHECKSUM_PARTIAL; 2666 return skb->ip_summed != CHECKSUM_PARTIAL &&
2683 else 2667 skb->ip_summed != CHECKSUM_NONE;
2684 return skb->ip_summed == CHECKSUM_NONE; 2668
2669 return skb->ip_summed == CHECKSUM_NONE;
2685} 2670}
2686 2671
2687/** 2672/**
@@ -2700,11 +2685,12 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2700struct sk_buff *__skb_gso_segment(struct sk_buff *skb, 2685struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2701 netdev_features_t features, bool tx_path) 2686 netdev_features_t features, bool tx_path)
2702{ 2687{
2688 struct sk_buff *segs;
2689
2703 if (unlikely(skb_needs_check(skb, tx_path))) { 2690 if (unlikely(skb_needs_check(skb, tx_path))) {
2704 int err; 2691 int err;
2705 2692
2706 skb_warn_bad_offload(skb); 2693 /* We're going to init ->check field in TCP or UDP header */
2707
2708 err = skb_cow_head(skb, 0); 2694 err = skb_cow_head(skb, 0);
2709 if (err < 0) 2695 if (err < 0)
2710 return ERR_PTR(err); 2696 return ERR_PTR(err);
@@ -2732,7 +2718,12 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2732 skb_reset_mac_header(skb); 2718 skb_reset_mac_header(skb);
2733 skb_reset_mac_len(skb); 2719 skb_reset_mac_len(skb);
2734 2720
2735 return skb_mac_gso_segment(skb, features); 2721 segs = skb_mac_gso_segment(skb, features);
2722
2723 if (unlikely(skb_needs_check(skb, tx_path)))
2724 skb_warn_bad_offload(skb);
2725
2726 return segs;
2736} 2727}
2737EXPORT_SYMBOL(__skb_gso_segment); 2728EXPORT_SYMBOL(__skb_gso_segment);
2738 2729
@@ -2757,9 +2748,11 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2757{ 2748{
2758#ifdef CONFIG_HIGHMEM 2749#ifdef CONFIG_HIGHMEM
2759 int i; 2750 int i;
2751
2760 if (!(dev->features & NETIF_F_HIGHDMA)) { 2752 if (!(dev->features & NETIF_F_HIGHDMA)) {
2761 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2753 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2762 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2754 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2755
2763 if (PageHighMem(skb_frag_page(frag))) 2756 if (PageHighMem(skb_frag_page(frag)))
2764 return 1; 2757 return 1;
2765 } 2758 }
@@ -2773,6 +2766,7 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2773 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2766 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2774 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2767 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2775 dma_addr_t addr = page_to_phys(skb_frag_page(frag)); 2768 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2769
2776 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) 2770 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2777 return 1; 2771 return 1;
2778 } 2772 }
@@ -2815,9 +2809,9 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
2815 if (skb->ip_summed != CHECKSUM_NONE && 2809 if (skb->ip_summed != CHECKSUM_NONE &&
2816 !can_checksum_protocol(features, type)) { 2810 !can_checksum_protocol(features, type)) {
2817 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); 2811 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2818 } else if (illegal_highdma(skb->dev, skb)) {
2819 features &= ~NETIF_F_SG;
2820 } 2812 }
2813 if (illegal_highdma(skb->dev, skb))
2814 features &= ~NETIF_F_SG;
2821 2815
2822 return features; 2816 return features;
2823} 2817}
@@ -3173,9 +3167,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3173 if (!cl) 3167 if (!cl)
3174 return skb; 3168 return skb;
3175 3169
3176 /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set 3170 /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3177 * earlier by the caller.
3178 */
3179 qdisc_bstats_cpu_update(cl->q, skb); 3171 qdisc_bstats_cpu_update(cl->q, skb);
3180 3172
3181 switch (tc_classify(skb, cl, &cl_res, false)) { 3173 switch (tc_classify(skb, cl, &cl_res, false)) {
@@ -3216,8 +3208,14 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3216 rcu_read_lock(); 3208 rcu_read_lock();
3217 dev_maps = rcu_dereference(dev->xps_maps); 3209 dev_maps = rcu_dereference(dev->xps_maps);
3218 if (dev_maps) { 3210 if (dev_maps) {
3219 map = rcu_dereference( 3211 unsigned int tci = skb->sender_cpu - 1;
3220 dev_maps->cpu_map[skb->sender_cpu - 1]); 3212
3213 if (dev->num_tc) {
3214 tci *= dev->num_tc;
3215 tci += netdev_get_prio_tc_map(dev, skb->priority);
3216 }
3217
3218 map = rcu_dereference(dev_maps->cpu_map[tci]);
3221 if (map) { 3219 if (map) {
3222 if (map->len == 1) 3220 if (map->len == 1)
3223 queue_index = map->queues[0]; 3221 queue_index = map->queues[0];
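In the XPS hunk above, the CPU map is no longer indexed by the sending CPU alone once traffic classes exist; each CPU owns num_tc consecutive slots. A small sketch of that index arithmetic (helper name and layout are illustrative only):

    /* Sketch: derive the xps_maps slot used above. skb->sender_cpu is 1-based,
     * so slot 0 belongs to CPU 0; with traffic classes each CPU gets num_tc
     * consecutive slots, one per class. */
    static unsigned int xps_slot(unsigned int sender_cpu, int num_tc, unsigned int tc)
    {
            unsigned int tci = sender_cpu - 1;

            if (num_tc)
                    tci = tci * num_tc + tc;
            return tci;
    }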
@@ -3244,6 +3242,7 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3244 if (queue_index < 0 || skb->ooo_okay || 3242 if (queue_index < 0 || skb->ooo_okay ||
3245 queue_index >= dev->real_num_tx_queues) { 3243 queue_index >= dev->real_num_tx_queues) {
3246 int new_index = get_xps_queue(dev, skb); 3244 int new_index = get_xps_queue(dev, skb);
3245
3247 if (new_index < 0) 3246 if (new_index < 0)
3248 new_index = skb_tx_hash(dev, skb); 3247 new_index = skb_tx_hash(dev, skb);
3249 3248
@@ -3273,6 +3272,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3273 3272
3274 if (dev->real_num_tx_queues != 1) { 3273 if (dev->real_num_tx_queues != 1) {
3275 const struct net_device_ops *ops = dev->netdev_ops; 3274 const struct net_device_ops *ops = dev->netdev_ops;
3275
3276 if (ops->ndo_select_queue) 3276 if (ops->ndo_select_queue)
3277 queue_index = ops->ndo_select_queue(dev, skb, accel_priv, 3277 queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3278 __netdev_pick_tx); 3278 __netdev_pick_tx);
@@ -3334,7 +3334,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3334 3334
3335 qdisc_pkt_len_init(skb); 3335 qdisc_pkt_len_init(skb);
3336#ifdef CONFIG_NET_CLS_ACT 3336#ifdef CONFIG_NET_CLS_ACT
3337 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); 3337 skb->tc_at_ingress = 0;
3338# ifdef CONFIG_NET_EGRESS 3338# ifdef CONFIG_NET_EGRESS
3339 if (static_key_false(&egress_needed)) { 3339 if (static_key_false(&egress_needed)) {
3340 skb = sch_handle_egress(skb, &rc, dev); 3340 skb = sch_handle_egress(skb, &rc, dev);
@@ -3361,16 +3361,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3361 } 3361 }
3362 3362
3363 /* The device has no queue. Common case for software devices: 3363 /* The device has no queue. Common case for software devices:
3364 loopback, all the sorts of tunnels... 3364 * loopback, all the sorts of tunnels...
3365 3365
3366 Really, it is unlikely that netif_tx_lock protection is necessary 3366 * Really, it is unlikely that netif_tx_lock protection is necessary
3367 here. (f.e. loopback and IP tunnels are clean ignoring statistics 3367 * here. (f.e. loopback and IP tunnels are clean ignoring statistics
3368 counters.) 3368 * counters.)
3369 However, it is possible, that they rely on protection 3369 * However, it is possible, that they rely on protection
3370 made by us here. 3370 * made by us here.
3371 3371
3372 Check this and shot the lock. It is not prone from deadlocks. 3372 * Check this and shot the lock. It is not prone from deadlocks.
3373 Either shot noqueue qdisc, it is even simpler 8) 3373 * Either shot noqueue qdisc, it is even simpler 8)
3374 */ 3374 */
3375 if (dev->flags & IFF_UP) { 3375 if (dev->flags & IFF_UP) {
3376 int cpu = smp_processor_id(); /* ok because BHs are off */ 3376 int cpu = smp_processor_id(); /* ok because BHs are off */
@@ -3432,16 +3432,20 @@ int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3432EXPORT_SYMBOL(dev_queue_xmit_accel); 3432EXPORT_SYMBOL(dev_queue_xmit_accel);
3433 3433
3434 3434
3435/*======================================================================= 3435/*************************************************************************
3436 Receiver routines 3436 * Receiver routines
3437 =======================================================================*/ 3437 *************************************************************************/
3438 3438
3439int netdev_max_backlog __read_mostly = 1000; 3439int netdev_max_backlog __read_mostly = 1000;
3440EXPORT_SYMBOL(netdev_max_backlog); 3440EXPORT_SYMBOL(netdev_max_backlog);
3441 3441
3442int netdev_tstamp_prequeue __read_mostly = 1; 3442int netdev_tstamp_prequeue __read_mostly = 1;
3443int netdev_budget __read_mostly = 300; 3443int netdev_budget __read_mostly = 300;
3444int weight_p __read_mostly = 64; /* old backlog weight */ 3444int weight_p __read_mostly = 64; /* old backlog weight */
3445int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
3446int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
3447int dev_rx_weight __read_mostly = 64;
3448int dev_tx_weight __read_mostly = 64;
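The four new knobs split the old single weight_p between RX backlog processing and TX completion work. How the derived values are refreshed lives in the sysctl handler rather than in this hunk, so the arithmetic below is an assumption for illustration:

    /* Assumed relationship between the base weight and the per-direction biases;
     * dev_rx_weight feeds process_backlog(), dev_tx_weight bounds net_tx_action(). */
    static void refresh_dev_weights(void)
    {
            dev_rx_weight = weight_p * dev_weight_rx_bias;
            dev_tx_weight = weight_p * dev_weight_tx_bias;
    }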
3445 3449
3446/* Called with irq disabled */ 3450/* Called with irq disabled */
3447static inline void ____napi_schedule(struct softnet_data *sd, 3451static inline void ____napi_schedule(struct softnet_data *sd,
@@ -3461,6 +3465,8 @@ EXPORT_SYMBOL(rps_cpu_mask);
3461 3465
3462struct static_key rps_needed __read_mostly; 3466struct static_key rps_needed __read_mostly;
3463EXPORT_SYMBOL(rps_needed); 3467EXPORT_SYMBOL(rps_needed);
3468struct static_key rfs_needed __read_mostly;
3469EXPORT_SYMBOL(rfs_needed);
3464 3470
3465static struct rps_dev_flow * 3471static struct rps_dev_flow *
3466set_rps_cpu(struct net_device *dev, struct sk_buff *skb, 3472set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
@@ -3796,6 +3802,7 @@ static int netif_rx_internal(struct sk_buff *skb)
3796#endif 3802#endif
3797 { 3803 {
3798 unsigned int qtail; 3804 unsigned int qtail;
3805
3799 ret = enqueue_to_backlog(skb, get_cpu(), &qtail); 3806 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3800 put_cpu(); 3807 put_cpu();
3801 } 3808 }
@@ -3855,6 +3862,7 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
3855 3862
3856 while (clist) { 3863 while (clist) {
3857 struct sk_buff *skb = clist; 3864 struct sk_buff *skb = clist;
3865
3858 clist = clist->next; 3866 clist = clist->next;
3859 3867
3860 WARN_ON(atomic_read(&skb->users)); 3868 WARN_ON(atomic_read(&skb->users));
@@ -3928,7 +3936,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3928 } 3936 }
3929 3937
3930 qdisc_skb_cb(skb)->pkt_len = skb->len; 3938 qdisc_skb_cb(skb)->pkt_len = skb->len;
3931 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); 3939 skb->tc_at_ingress = 1;
3932 qdisc_bstats_cpu_update(cl->q, skb); 3940 qdisc_bstats_cpu_update(cl->q, skb);
3933 3941
3934 switch (tc_classify(skb, cl, &cl_res, false)) { 3942 switch (tc_classify(skb, cl, &cl_res, false)) {
@@ -3993,9 +4001,7 @@ int netdev_rx_handler_register(struct net_device *dev,
3993 rx_handler_func_t *rx_handler, 4001 rx_handler_func_t *rx_handler,
3994 void *rx_handler_data) 4002 void *rx_handler_data)
3995{ 4003{
3996 ASSERT_RTNL(); 4004 if (netdev_is_rx_handler_busy(dev))
3997
3998 if (dev->rx_handler)
3999 return -EBUSY; 4005 return -EBUSY;
4000 4006
4001 /* Note: rx_handler_data must be set before rx_handler */ 4007 /* Note: rx_handler_data must be set before rx_handler */
@@ -4101,12 +4107,8 @@ another_round:
4101 goto out; 4107 goto out;
4102 } 4108 }
4103 4109
4104#ifdef CONFIG_NET_CLS_ACT 4110 if (skb_skip_tc_classify(skb))
4105 if (skb->tc_verd & TC_NCLS) { 4111 goto skip_classify;
4106 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4107 goto ncls;
4108 }
4109#endif
4110 4112
4111 if (pfmemalloc) 4113 if (pfmemalloc)
4112 goto skip_taps; 4114 goto skip_taps;
@@ -4134,10 +4136,8 @@ skip_taps:
4134 goto out; 4136 goto out;
4135 } 4137 }
4136#endif 4138#endif
4137#ifdef CONFIG_NET_CLS_ACT 4139 skb_reset_tc(skb);
4138 skb->tc_verd = 0; 4140skip_classify:
4139ncls:
4140#endif
4141 if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) 4141 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4142 goto drop; 4142 goto drop;
4143 4143
@@ -4453,7 +4453,9 @@ static void skb_gro_reset_offset(struct sk_buff *skb)
4453 pinfo->nr_frags && 4453 pinfo->nr_frags &&
4454 !PageHighMem(skb_frag_page(frag0))) { 4454 !PageHighMem(skb_frag_page(frag0))) {
4455 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); 4455 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4456 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); 4456 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4457 skb_frag_size(frag0),
4458 skb->end - skb->tail);
4457 } 4459 }
4458} 4460}
4459 4461
@@ -4491,7 +4493,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
4491 if (!(skb->dev->features & NETIF_F_GRO)) 4493 if (!(skb->dev->features & NETIF_F_GRO))
4492 goto normal; 4494 goto normal;
4493 4495
4494 if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad) 4496 if (skb->csum_bad)
4495 goto normal; 4497 goto normal;
4496 4498
4497 gro_list_prepare(napi, skb); 4499 gro_list_prepare(napi, skb);
@@ -4504,7 +4506,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
4504 skb_set_network_header(skb, skb_gro_offset(skb)); 4506 skb_set_network_header(skb, skb_gro_offset(skb));
4505 skb_reset_mac_len(skb); 4507 skb_reset_mac_len(skb);
4506 NAPI_GRO_CB(skb)->same_flow = 0; 4508 NAPI_GRO_CB(skb)->same_flow = 0;
4507 NAPI_GRO_CB(skb)->flush = 0; 4509 NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
4508 NAPI_GRO_CB(skb)->free = 0; 4510 NAPI_GRO_CB(skb)->free = 0;
4509 NAPI_GRO_CB(skb)->encap_mark = 0; 4511 NAPI_GRO_CB(skb)->encap_mark = 0;
4510 NAPI_GRO_CB(skb)->recursion_counter = 0; 4512 NAPI_GRO_CB(skb)->recursion_counter = 0;
@@ -4536,6 +4538,11 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
4536 if (&ptype->list == head) 4538 if (&ptype->list == head)
4537 goto normal; 4539 goto normal;
4538 4540
4541 if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
4542 ret = GRO_CONSUMED;
4543 goto ok;
4544 }
4545
4539 same_flow = NAPI_GRO_CB(skb)->same_flow; 4546 same_flow = NAPI_GRO_CB(skb)->same_flow;
4540 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; 4547 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4541 4548
@@ -4631,6 +4638,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4631 case GRO_MERGED_FREE: 4638 case GRO_MERGED_FREE:
4632 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) { 4639 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4633 skb_dst_drop(skb); 4640 skb_dst_drop(skb);
4641 secpath_reset(skb);
4634 kmem_cache_free(skbuff_head_cache, skb); 4642 kmem_cache_free(skbuff_head_cache, skb);
4635 } else { 4643 } else {
4636 __kfree_skb(skb); 4644 __kfree_skb(skb);
@@ -4639,6 +4647,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4639 4647
4640 case GRO_HELD: 4648 case GRO_HELD:
4641 case GRO_MERGED: 4649 case GRO_MERGED:
4650 case GRO_CONSUMED:
4642 break; 4651 break;
4643 } 4652 }
4644 4653
@@ -4671,6 +4680,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4671 skb->encapsulation = 0; 4680 skb->encapsulation = 0;
4672 skb_shinfo(skb)->gso_type = 0; 4681 skb_shinfo(skb)->gso_type = 0;
4673 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); 4682 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4683 secpath_reset(skb);
4674 4684
4675 napi->skb = skb; 4685 napi->skb = skb;
4676} 4686}
@@ -4709,6 +4719,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
4709 break; 4719 break;
4710 4720
4711 case GRO_MERGED: 4721 case GRO_MERGED:
4722 case GRO_CONSUMED:
4712 break; 4723 break;
4713 } 4724 }
4714 4725
@@ -4845,7 +4856,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
4845 net_rps_action_and_irq_enable(sd); 4856 net_rps_action_and_irq_enable(sd);
4846 } 4857 }
4847 4858
4848 napi->weight = weight_p; 4859 napi->weight = dev_rx_weight;
4849 while (again) { 4860 while (again) {
4850 struct sk_buff *skb; 4861 struct sk_buff *skb;
4851 4862
@@ -4901,6 +4912,39 @@ void __napi_schedule(struct napi_struct *n)
4901EXPORT_SYMBOL(__napi_schedule); 4912EXPORT_SYMBOL(__napi_schedule);
4902 4913
4903/** 4914/**
4915 * napi_schedule_prep - check if napi can be scheduled
4916 * @n: napi context
4917 *
4918 * Test if NAPI routine is already running, and if not mark
4919 * it as running. This is used as a condition variable
4920 * to ensure that only one NAPI poll instance runs. We also make
4921 * sure there is no pending NAPI disable.
4922 */
4923bool napi_schedule_prep(struct napi_struct *n)
4924{
4925 unsigned long val, new;
4926
4927 do {
4928 val = READ_ONCE(n->state);
4929 if (unlikely(val & NAPIF_STATE_DISABLE))
4930 return false;
4931 new = val | NAPIF_STATE_SCHED;
4932
4933 /* Sets STATE_MISSED bit if STATE_SCHED was already set
4934 * This was suggested by Alexander Duyck, as compiler
4935 * emits better code than :
4936 * if (val & NAPIF_STATE_SCHED)
4937 * new |= NAPIF_STATE_MISSED;
4938 */
4939 new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
4940 NAPIF_STATE_MISSED;
4941 } while (cmpxchg(&n->state, val, new) != val);
4942
4943 return !(val & NAPIF_STATE_SCHED);
4944}
4945EXPORT_SYMBOL(napi_schedule_prep);
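The comment inside napi_schedule_prep() above alludes to a branchless construction; spelled out on its own (same NAPIF_* names as in the hunk), the two forms compute the same word:

    /* Illustration only: NAPIF_STATE_SCHED is a single bit, so the division
     * yields 0 or 1, and the multiply conditionally sets NAPIF_STATE_MISSED
     * without a branch. */
    static unsigned long sched_or_missed(unsigned long val)
    {
            unsigned long new = val | NAPIF_STATE_SCHED;

            /* branchy equivalent:
             *      if (val & NAPIF_STATE_SCHED)
             *              new |= NAPIF_STATE_MISSED;
             */
            new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * NAPIF_STATE_MISSED;

            return new;
    }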
4946
4947/**
4904 * __napi_schedule_irqoff - schedule for receive 4948 * __napi_schedule_irqoff - schedule for receive
4905 * @n: entry to schedule 4949 * @n: entry to schedule
4906 * 4950 *
@@ -4912,26 +4956,19 @@ void __napi_schedule_irqoff(struct napi_struct *n)
4912} 4956}
4913EXPORT_SYMBOL(__napi_schedule_irqoff); 4957EXPORT_SYMBOL(__napi_schedule_irqoff);
4914 4958
4915void __napi_complete(struct napi_struct *n) 4959bool napi_complete_done(struct napi_struct *n, int work_done)
4916{
4917 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4918
4919 list_del_init(&n->poll_list);
4920 smp_mb__before_atomic();
4921 clear_bit(NAPI_STATE_SCHED, &n->state);
4922}
4923EXPORT_SYMBOL(__napi_complete);
4924
4925void napi_complete_done(struct napi_struct *n, int work_done)
4926{ 4960{
4927 unsigned long flags; 4961 unsigned long flags, val, new;
4928 4962
4929 /* 4963 /*
4930 * don't let napi dequeue from the cpu poll list 4964 * 1) Don't let napi dequeue from the cpu poll list
4931 * just in case its running on a different cpu 4965 * just in case its running on a different cpu.
4966 * 2) If we are busy polling, do nothing here, we have
4967 * the guarantee we will be called later.
4932 */ 4968 */
4933 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) 4969 if (unlikely(n->state & (NAPIF_STATE_NPSVC |
4934 return; 4970 NAPIF_STATE_IN_BUSY_POLL)))
4971 return false;
4935 4972
4936 if (n->gro_list) { 4973 if (n->gro_list) {
4937 unsigned long timeout = 0; 4974 unsigned long timeout = 0;
@@ -4945,14 +4982,34 @@ void napi_complete_done(struct napi_struct *n, int work_done)
4945 else 4982 else
4946 napi_gro_flush(n, false); 4983 napi_gro_flush(n, false);
4947 } 4984 }
4948 if (likely(list_empty(&n->poll_list))) { 4985 if (unlikely(!list_empty(&n->poll_list))) {
4949 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4950 } else {
4951 /* If n->poll_list is not empty, we need to mask irqs */ 4986 /* If n->poll_list is not empty, we need to mask irqs */
4952 local_irq_save(flags); 4987 local_irq_save(flags);
4953 __napi_complete(n); 4988 list_del_init(&n->poll_list);
4954 local_irq_restore(flags); 4989 local_irq_restore(flags);
4955 } 4990 }
4991
4992 do {
4993 val = READ_ONCE(n->state);
4994
4995 WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
4996
4997 new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
4998
4999 /* If STATE_MISSED was set, leave STATE_SCHED set,
5000 * because we will call napi->poll() one more time.
5001 * This C code was suggested by Alexander Duyck to help gcc.
5002 */
5003 new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
5004 NAPIF_STATE_SCHED;
5005 } while (cmpxchg(&n->state, val, new) != val);
5006
5007 if (unlikely(val & NAPIF_STATE_MISSED)) {
5008 __napi_schedule(n);
5009 return false;
5010 }
5011
5012 return true;
4956} 5013}
4957EXPORT_SYMBOL(napi_complete_done); 5014EXPORT_SYMBOL(napi_complete_done);
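Since napi_complete_done() now reports whether the NAPI context really went idle, the driver-side pattern it enables looks roughly like the sketch below (the example_* helpers are hypothetical stand-ins for a driver's own routines):

    /* Sketch of a poll routine built on the new return value: interrupts are
     * re-armed only when napi_complete_done() says no further poll is pending. */
    static int example_napi_poll(struct napi_struct *napi, int budget)
    {
            int work_done = example_clean_rx_ring(napi, budget);    /* hypothetical */

            if (work_done < budget && napi_complete_done(napi, work_done))
                    example_enable_rx_irq(napi);                    /* hypothetical */

            return work_done;
    }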
4958 5015
@@ -4970,13 +5027,50 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
4970} 5027}
4971 5028
4972#if defined(CONFIG_NET_RX_BUSY_POLL) 5029#if defined(CONFIG_NET_RX_BUSY_POLL)
5030
4973#define BUSY_POLL_BUDGET 8 5031#define BUSY_POLL_BUDGET 8
5032
5033static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
5034{
5035 int rc;
5036
5037 /* Busy polling means there is a high chance device driver hard irq
5038 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
5039 * set in napi_schedule_prep().
5040 * Since we are about to call napi->poll() once more, we can safely
5041 * clear NAPI_STATE_MISSED.
5042 *
5043 * Note: x86 could use a single "lock and ..." instruction
5044 * to perform these two clear_bit()
5045 */
5046 clear_bit(NAPI_STATE_MISSED, &napi->state);
5047 clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
5048
5049 local_bh_disable();
5050
5051 /* All we really want here is to re-enable device interrupts.
5052 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
5053 */
5054 rc = napi->poll(napi, BUSY_POLL_BUDGET);
5055 netpoll_poll_unlock(have_poll_lock);
5056 if (rc == BUSY_POLL_BUDGET)
5057 __napi_schedule(napi);
5058 local_bh_enable();
5059 if (local_softirq_pending())
5060 do_softirq();
5061}
5062
4974bool sk_busy_loop(struct sock *sk, int nonblock) 5063bool sk_busy_loop(struct sock *sk, int nonblock)
4975{ 5064{
4976 unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; 5065 unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
4977 int (*busy_poll)(struct napi_struct *dev); 5066 int (*napi_poll)(struct napi_struct *napi, int budget);
5067 void *have_poll_lock = NULL;
4978 struct napi_struct *napi; 5068 struct napi_struct *napi;
4979 int rc = false; 5069 int rc;
5070
5071restart:
5072 rc = false;
5073 napi_poll = NULL;
4980 5074
4981 rcu_read_lock(); 5075 rcu_read_lock();
4982 5076
@@ -4984,39 +5078,54 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
4984 if (!napi) 5078 if (!napi)
4985 goto out; 5079 goto out;
4986 5080
4987 /* Note: ndo_busy_poll method is optional in linux-4.5 */ 5081 preempt_disable();
4988 busy_poll = napi->dev->netdev_ops->ndo_busy_poll; 5082 for (;;) {
4989
4990 do {
4991 rc = 0; 5083 rc = 0;
4992 local_bh_disable(); 5084 local_bh_disable();
4993 if (busy_poll) { 5085 if (!napi_poll) {
4994 rc = busy_poll(napi); 5086 unsigned long val = READ_ONCE(napi->state);
4995 } else if (napi_schedule_prep(napi)) { 5087
4996 void *have = netpoll_poll_lock(napi); 5088 /* If multiple threads are competing for this napi,
4997 5089 * we avoid dirtying napi->state as much as we can.
4998 if (test_bit(NAPI_STATE_SCHED, &napi->state)) { 5090 */
4999 rc = napi->poll(napi, BUSY_POLL_BUDGET); 5091 if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
5000 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); 5092 NAPIF_STATE_IN_BUSY_POLL))
5001 if (rc == BUSY_POLL_BUDGET) { 5093 goto count;
5002 napi_complete_done(napi, rc); 5094 if (cmpxchg(&napi->state, val,
5003 napi_schedule(napi); 5095 val | NAPIF_STATE_IN_BUSY_POLL |
5004 } 5096 NAPIF_STATE_SCHED) != val)
5005 } 5097 goto count;
5006 netpoll_poll_unlock(have); 5098 have_poll_lock = netpoll_poll_lock(napi);
5099 napi_poll = napi->poll;
5007 } 5100 }
5101 rc = napi_poll(napi, BUSY_POLL_BUDGET);
5102 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
5103count:
5008 if (rc > 0) 5104 if (rc > 0)
5009 __NET_ADD_STATS(sock_net(sk), 5105 __NET_ADD_STATS(sock_net(sk),
5010 LINUX_MIB_BUSYPOLLRXPACKETS, rc); 5106 LINUX_MIB_BUSYPOLLRXPACKETS, rc);
5011 local_bh_enable(); 5107 local_bh_enable();
5012 5108
5013 if (rc == LL_FLUSH_FAILED) 5109 if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
5014 break; /* permanent failure */ 5110 busy_loop_timeout(end_time))
5111 break;
5015 5112
5113 if (unlikely(need_resched())) {
5114 if (napi_poll)
5115 busy_poll_stop(napi, have_poll_lock);
5116 preempt_enable();
5117 rcu_read_unlock();
5118 cond_resched();
5119 rc = !skb_queue_empty(&sk->sk_receive_queue);
5120 if (rc || busy_loop_timeout(end_time))
5121 return rc;
5122 goto restart;
5123 }
5016 cpu_relax(); 5124 cpu_relax();
5017 } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && 5125 }
5018 !need_resched() && !busy_loop_timeout(end_time)); 5126 if (napi_poll)
5019 5127 busy_poll_stop(napi, have_poll_lock);
5128 preempt_enable();
5020 rc = !skb_queue_empty(&sk->sk_receive_queue); 5129 rc = !skb_queue_empty(&sk->sk_receive_queue);
5021out: 5130out:
5022 rcu_read_unlock(); 5131 rcu_read_unlock();
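sk_busy_loop() only spins for sockets that have opted into busy polling. A minimal userspace sketch of opting in (the 50 microsecond budget is just an example; the same effect can come from the net.core.busy_read sysctl):

    #include <sys/socket.h>

    /* Ask the kernel to busy-poll in sk_busy_loop() for up to 'usecs'
     * microseconds before a blocking receive gives up and sleeps. */
    static int enable_busy_poll(int sock_fd)
    {
            int usecs = 50;         /* illustrative value */

            return setsockopt(sock_fd, SOL_SOCKET, SO_BUSY_POLL,
                              &usecs, sizeof(usecs));
    }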
@@ -5026,7 +5135,7 @@ EXPORT_SYMBOL(sk_busy_loop);
5026 5135
5027#endif /* CONFIG_NET_RX_BUSY_POLL */ 5136#endif /* CONFIG_NET_RX_BUSY_POLL */
5028 5137
5029void napi_hash_add(struct napi_struct *napi) 5138static void napi_hash_add(struct napi_struct *napi)
5030{ 5139{
5031 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) || 5140 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5032 test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) 5141 test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
@@ -5046,7 +5155,6 @@ void napi_hash_add(struct napi_struct *napi)
5046 5155
5047 spin_unlock(&napi_hash_lock); 5156 spin_unlock(&napi_hash_lock);
5048} 5157}
5049EXPORT_SYMBOL_GPL(napi_hash_add);
5050 5158
5051/* Warning : caller is responsible to make sure rcu grace period 5159/* Warning : caller is responsible to make sure rcu grace period
5052 * is respected before freeing memory containing @napi 5160 * is respected before freeing memory containing @napi
@@ -5071,8 +5179,13 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5071 struct napi_struct *napi; 5179 struct napi_struct *napi;
5072 5180
5073 napi = container_of(timer, struct napi_struct, timer); 5181 napi = container_of(timer, struct napi_struct, timer);
5074 if (napi->gro_list) 5182
5075 napi_schedule(napi); 5183 /* Note : we use a relaxed variant of napi_schedule_prep() not setting
5184 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
5185 */
5186 if (napi->gro_list && !napi_disable_pending(napi) &&
5187 !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
5188 __napi_schedule_irqoff(napi);
5076 5189
5077 return HRTIMER_NORESTART; 5190 return HRTIMER_NORESTART;
5078} 5191}
@@ -5094,7 +5207,6 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5094 list_add(&napi->dev_list, &dev->napi_list); 5207 list_add(&napi->dev_list, &dev->napi_list);
5095 napi->dev = dev; 5208 napi->dev = dev;
5096#ifdef CONFIG_NETPOLL 5209#ifdef CONFIG_NETPOLL
5097 spin_lock_init(&napi->poll_lock);
5098 napi->poll_owner = -1; 5210 napi->poll_owner = -1;
5099#endif 5211#endif
5100 set_bit(NAPI_STATE_SCHED, &napi->state); 5212 set_bit(NAPI_STATE_SCHED, &napi->state);
@@ -5212,7 +5324,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
5212 5324
5213 if (list_empty(&list)) { 5325 if (list_empty(&list)) {
5214 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) 5326 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5215 return; 5327 goto out;
5216 break; 5328 break;
5217 } 5329 }
5218 5330
@@ -5230,7 +5342,6 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
5230 } 5342 }
5231 } 5343 }
5232 5344
5233 __kfree_skb_flush();
5234 local_irq_disable(); 5345 local_irq_disable();
5235 5346
5236 list_splice_tail_init(&sd->poll_list, &list); 5347 list_splice_tail_init(&sd->poll_list, &list);
@@ -5240,6 +5351,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
5240 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 5351 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5241 5352
5242 net_rps_action_and_irq_enable(sd); 5353 net_rps_action_and_irq_enable(sd);
5354out:
5355 __kfree_skb_flush();
5243} 5356}
5244 5357
5245struct netdev_adjacent { 5358struct netdev_adjacent {
@@ -5270,6 +5383,13 @@ static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5270 return NULL; 5383 return NULL;
5271} 5384}
5272 5385
5386static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
5387{
5388 struct net_device *dev = data;
5389
5390 return upper_dev == dev;
5391}
5392
5273/** 5393/**
5274 * netdev_has_upper_dev - Check if device is linked to an upper device 5394 * netdev_has_upper_dev - Check if device is linked to an upper device
5275 * @dev: device 5395 * @dev: device
@@ -5284,11 +5404,30 @@ bool netdev_has_upper_dev(struct net_device *dev,
5284{ 5404{
5285 ASSERT_RTNL(); 5405 ASSERT_RTNL();
5286 5406
5287 return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper); 5407 return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5408 upper_dev);
5288} 5409}
5289EXPORT_SYMBOL(netdev_has_upper_dev); 5410EXPORT_SYMBOL(netdev_has_upper_dev);
5290 5411
5291/** 5412/**
5413 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
5414 * @dev: device
5415 * @upper_dev: upper device to check
5416 *
5417 * Find out if a device is linked to the specified upper device and return true
5418 * in case it is. Note that this checks the entire upper device chain.
5419 * The caller must hold rcu lock.
5420 */
5421
5422bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5423 struct net_device *upper_dev)
5424{
5425 return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5426 upper_dev);
5427}
5428EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5429
5430/**
5292 * netdev_has_any_upper_dev - Check if device is linked to some device 5431 * netdev_has_any_upper_dev - Check if device is linked to some device
5293 * @dev: device 5432 * @dev: device
5294 * 5433 *
@@ -5299,7 +5438,7 @@ static bool netdev_has_any_upper_dev(struct net_device *dev)
5299{ 5438{
5300 ASSERT_RTNL(); 5439 ASSERT_RTNL();
5301 5440
5302 return !list_empty(&dev->all_adj_list.upper); 5441 return !list_empty(&dev->adj_list.upper);
5303} 5442}
5304 5443
5305/** 5444/**
@@ -5326,6 +5465,20 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5326} 5465}
5327EXPORT_SYMBOL(netdev_master_upper_dev_get); 5466EXPORT_SYMBOL(netdev_master_upper_dev_get);
5328 5467
5468/**
5469 * netdev_has_any_lower_dev - Check if device is linked to some device
5470 * @dev: device
5471 *
5472 * Find out if a device is linked to a lower device and return true in case
5473 * it is. The caller must hold the RTNL lock.
5474 */
5475static bool netdev_has_any_lower_dev(struct net_device *dev)
5476{
5477 ASSERT_RTNL();
5478
5479 return !list_empty(&dev->adj_list.lower);
5480}
5481
5329void *netdev_adjacent_get_private(struct list_head *adj_list) 5482void *netdev_adjacent_get_private(struct list_head *adj_list)
5330{ 5483{
5331 struct netdev_adjacent *adj; 5484 struct netdev_adjacent *adj;
@@ -5362,16 +5515,8 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5362} 5515}
5363EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); 5516EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5364 5517
5365/** 5518static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5366 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list 5519 struct list_head **iter)
5367 * @dev: device
5368 * @iter: list_head ** of the current position
5369 *
5370 * Gets the next device from the dev's upper list, starting from iter
5371 * position. The caller must hold RCU read lock.
5372 */
5373struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5374 struct list_head **iter)
5375{ 5520{
5376 struct netdev_adjacent *upper; 5521 struct netdev_adjacent *upper;
5377 5522
@@ -5379,14 +5524,41 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5379 5524
5380 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5525 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5381 5526
5382 if (&upper->list == &dev->all_adj_list.upper) 5527 if (&upper->list == &dev->adj_list.upper)
5383 return NULL; 5528 return NULL;
5384 5529
5385 *iter = &upper->list; 5530 *iter = &upper->list;
5386 5531
5387 return upper->dev; 5532 return upper->dev;
5388} 5533}
5389EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); 5534
5535int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
5536 int (*fn)(struct net_device *dev,
5537 void *data),
5538 void *data)
5539{
5540 struct net_device *udev;
5541 struct list_head *iter;
5542 int ret;
5543
5544 for (iter = &dev->adj_list.upper,
5545 udev = netdev_next_upper_dev_rcu(dev, &iter);
5546 udev;
5547 udev = netdev_next_upper_dev_rcu(dev, &iter)) {
5548 /* first is the upper device itself */
5549 ret = fn(udev, data);
5550 if (ret)
5551 return ret;
5552
5553 /* then look at all of its upper devices */
5554 ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
5555 if (ret)
5556 return ret;
5557 }
5558
5559 return 0;
5560}
5561EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
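A hedged example of how a caller could use the new recursive walker; the callback signature comes from the hunk, the counting logic is only illustrative:

    /* Count every device stacked above @dev. Must run under rcu_read_lock(),
     * matching the _rcu walker's contract. */
    static int count_upper(struct net_device *upper, void *data)
    {
            int *n = data;

            (*n)++;
            return 0;       /* returning non-zero would stop the walk early */
    }

    static int example_count_all_uppers(struct net_device *dev)
    {
            int n = 0;

            netdev_walk_all_upper_dev_rcu(dev, count_upper, &n);
            return n;
    }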
5390 5562
5391/** 5563/**
5392 * netdev_lower_get_next_private - Get the next ->private from the 5564 * netdev_lower_get_next_private - Get the next ->private from the
@@ -5469,55 +5641,90 @@ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5469} 5641}
5470EXPORT_SYMBOL(netdev_lower_get_next); 5642EXPORT_SYMBOL(netdev_lower_get_next);
5471 5643
5472/** 5644static struct net_device *netdev_next_lower_dev(struct net_device *dev,
5473 * netdev_all_lower_get_next - Get the next device from all lower neighbour list 5645 struct list_head **iter)
5474 * @dev: device
5475 * @iter: list_head ** of the current position
5476 *
5477 * Gets the next netdev_adjacent from the dev's all lower neighbour
5478 * list, starting from iter position. The caller must hold RTNL lock or
5479 * its own locking that guarantees that the neighbour all lower
5480 * list will remain unchanged.
5481 */
5482struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter)
5483{ 5646{
5484 struct netdev_adjacent *lower; 5647 struct netdev_adjacent *lower;
5485 5648
5486 lower = list_entry(*iter, struct netdev_adjacent, list); 5649 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5487 5650
5488 if (&lower->list == &dev->all_adj_list.lower) 5651 if (&lower->list == &dev->adj_list.lower)
5489 return NULL; 5652 return NULL;
5490 5653
5491 *iter = lower->list.next; 5654 *iter = &lower->list;
5492 5655
5493 return lower->dev; 5656 return lower->dev;
5494} 5657}
5495EXPORT_SYMBOL(netdev_all_lower_get_next);
5496 5658
5497/** 5659int netdev_walk_all_lower_dev(struct net_device *dev,
5498 * netdev_all_lower_get_next_rcu - Get the next device from all 5660 int (*fn)(struct net_device *dev,
5499 * lower neighbour list, RCU variant 5661 void *data),
5500 * @dev: device 5662 void *data)
5501 * @iter: list_head ** of the current position 5663{
5502 * 5664 struct net_device *ldev;
5503 * Gets the next netdev_adjacent from the dev's all lower neighbour 5665 struct list_head *iter;
5504 * list, starting from iter position. The caller must hold RCU read lock. 5666 int ret;
5505 */ 5667
5506struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev, 5668 for (iter = &dev->adj_list.lower,
5507 struct list_head **iter) 5669 ldev = netdev_next_lower_dev(dev, &iter);
5670 ldev;
5671 ldev = netdev_next_lower_dev(dev, &iter)) {
5672 /* first is the lower device itself */
5673 ret = fn(ldev, data);
5674 if (ret)
5675 return ret;
5676
5677 /* then look at all of its lower devices */
5678 ret = netdev_walk_all_lower_dev(ldev, fn, data);
5679 if (ret)
5680 return ret;
5681 }
5682
5683 return 0;
5684}
5685EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
5686
5687static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
5688 struct list_head **iter)
5508{ 5689{
5509 struct netdev_adjacent *lower; 5690 struct netdev_adjacent *lower;
5510 5691
5511 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5692 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5512 5693 if (&lower->list == &dev->adj_list.lower)
5513 if (&lower->list == &dev->all_adj_list.lower)
5514 return NULL; 5694 return NULL;
5515 5695
5516 *iter = &lower->list; 5696 *iter = &lower->list;
5517 5697
5518 return lower->dev; 5698 return lower->dev;
5519} 5699}
5520EXPORT_SYMBOL(netdev_all_lower_get_next_rcu); 5700
5701int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
5702 int (*fn)(struct net_device *dev,
5703 void *data),
5704 void *data)
5705{
5706 struct net_device *ldev;
5707 struct list_head *iter;
5708 int ret;
5709
5710 for (iter = &dev->adj_list.lower,
5711 ldev = netdev_next_lower_dev_rcu(dev, &iter);
5712 ldev;
5713 ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
5714 /* first is the lower device itself */
5715 ret = fn(ldev, data);
5716 if (ret)
5717 return ret;
5718
5719 /* then look at all of its lower devices */
5720 ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
5721 if (ret)
5722 return ret;
5723 }
5724
5725 return 0;
5726}
5727EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
5521 5728
5522/** 5729/**
5523 * netdev_lower_get_first_private_rcu - Get the first ->private from the 5730 * netdev_lower_get_first_private_rcu - Get the first ->private from the
@@ -5564,6 +5771,7 @@ static int netdev_adjacent_sysfs_add(struct net_device *dev,
5564 struct list_head *dev_list) 5771 struct list_head *dev_list)
5565{ 5772{
5566 char linkname[IFNAMSIZ+7]; 5773 char linkname[IFNAMSIZ+7];
5774
5567 sprintf(linkname, dev_list == &dev->adj_list.upper ? 5775 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5568 "upper_%s" : "lower_%s", adj_dev->name); 5776 "upper_%s" : "lower_%s", adj_dev->name);
5569 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), 5777 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
@@ -5574,6 +5782,7 @@ static void netdev_adjacent_sysfs_del(struct net_device *dev,
5574 struct list_head *dev_list) 5782 struct list_head *dev_list)
5575{ 5783{
5576 char linkname[IFNAMSIZ+7]; 5784 char linkname[IFNAMSIZ+7];
5785
5577 sprintf(linkname, dev_list == &dev->adj_list.upper ? 5786 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5578 "upper_%s" : "lower_%s", name); 5787 "upper_%s" : "lower_%s", name);
5579 sysfs_remove_link(&(dev->dev.kobj), linkname); 5788 sysfs_remove_link(&(dev->dev.kobj), linkname);
@@ -5590,7 +5799,6 @@ static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5590 5799
5591static int __netdev_adjacent_dev_insert(struct net_device *dev, 5800static int __netdev_adjacent_dev_insert(struct net_device *dev,
5592 struct net_device *adj_dev, 5801 struct net_device *adj_dev,
5593 u16 ref_nr,
5594 struct list_head *dev_list, 5802 struct list_head *dev_list,
5595 void *private, bool master) 5803 void *private, bool master)
5596{ 5804{
@@ -5600,7 +5808,10 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
5600 adj = __netdev_find_adj(adj_dev, dev_list); 5808 adj = __netdev_find_adj(adj_dev, dev_list);
5601 5809
5602 if (adj) { 5810 if (adj) {
5603 adj->ref_nr += ref_nr; 5811 adj->ref_nr += 1;
5812 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
5813 dev->name, adj_dev->name, adj->ref_nr);
5814
5604 return 0; 5815 return 0;
5605 } 5816 }
5606 5817
@@ -5610,12 +5821,12 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
5610 5821
5611 adj->dev = adj_dev; 5822 adj->dev = adj_dev;
5612 adj->master = master; 5823 adj->master = master;
5613 adj->ref_nr = ref_nr; 5824 adj->ref_nr = 1;
5614 adj->private = private; 5825 adj->private = private;
5615 dev_hold(adj_dev); 5826 dev_hold(adj_dev);
5616 5827
5617 pr_debug("dev_hold for %s, because of link added from %s to %s\n", 5828 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
5618 adj_dev->name, dev->name, adj_dev->name); 5829 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
5619 5830
5620 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { 5831 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5621 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); 5832 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
@@ -5654,17 +5865,22 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
5654{ 5865{
5655 struct netdev_adjacent *adj; 5866 struct netdev_adjacent *adj;
5656 5867
5868 pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
5869 dev->name, adj_dev->name, ref_nr);
5870
5657 adj = __netdev_find_adj(adj_dev, dev_list); 5871 adj = __netdev_find_adj(adj_dev, dev_list);
5658 5872
5659 if (!adj) { 5873 if (!adj) {
5660 pr_err("tried to remove device %s from %s\n", 5874 pr_err("Adjacency does not exist for device %s from %s\n",
5661 dev->name, adj_dev->name); 5875 dev->name, adj_dev->name);
5662 BUG(); 5876 WARN_ON(1);
5877 return;
5663 } 5878 }
5664 5879
5665 if (adj->ref_nr > ref_nr) { 5880 if (adj->ref_nr > ref_nr) {
5666 pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name, 5881 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
5667 ref_nr, adj->ref_nr-ref_nr); 5882 dev->name, adj_dev->name, ref_nr,
5883 adj->ref_nr - ref_nr);
5668 adj->ref_nr -= ref_nr; 5884 adj->ref_nr -= ref_nr;
5669 return; 5885 return;
5670 } 5886 }
@@ -5676,7 +5892,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
5676 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); 5892 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5677 5893
5678 list_del_rcu(&adj->list); 5894 list_del_rcu(&adj->list);
5679 pr_debug("dev_put for %s, because link removed from %s to %s\n", 5895 pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
5680 adj_dev->name, dev->name, adj_dev->name); 5896 adj_dev->name, dev->name, adj_dev->name);
5681 dev_put(adj_dev); 5897 dev_put(adj_dev);
5682 kfree_rcu(adj, rcu); 5898 kfree_rcu(adj, rcu);
@@ -5684,38 +5900,27 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
5684 5900
5685static int __netdev_adjacent_dev_link_lists(struct net_device *dev, 5901static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5686 struct net_device *upper_dev, 5902 struct net_device *upper_dev,
5687 u16 ref_nr,
5688 struct list_head *up_list, 5903 struct list_head *up_list,
5689 struct list_head *down_list, 5904 struct list_head *down_list,
5690 void *private, bool master) 5905 void *private, bool master)
5691{ 5906{
5692 int ret; 5907 int ret;
5693 5908
5694 ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list, 5909 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
5695 private, master); 5910 private, master);
5696 if (ret) 5911 if (ret)
5697 return ret; 5912 return ret;
5698 5913
5699 ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list, 5914 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
5700 private, false); 5915 private, false);
5701 if (ret) { 5916 if (ret) {
5702 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); 5917 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
5703 return ret; 5918 return ret;
5704 } 5919 }
5705 5920
5706 return 0; 5921 return 0;
5707} 5922}
5708 5923
5709static int __netdev_adjacent_dev_link(struct net_device *dev,
5710 struct net_device *upper_dev,
5711 u16 ref_nr)
5712{
5713 return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
5714 &dev->all_adj_list.upper,
5715 &upper_dev->all_adj_list.lower,
5716 NULL, false);
5717}
5718
5719static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, 5924static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5720 struct net_device *upper_dev, 5925 struct net_device *upper_dev,
5721 u16 ref_nr, 5926 u16 ref_nr,
@@ -5726,40 +5931,19 @@ static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5726 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); 5931 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5727} 5932}
5728 5933
5729static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5730 struct net_device *upper_dev,
5731 u16 ref_nr)
5732{
5733 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
5734 &dev->all_adj_list.upper,
5735 &upper_dev->all_adj_list.lower);
5736}
5737
5738static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, 5934static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5739 struct net_device *upper_dev, 5935 struct net_device *upper_dev,
5740 void *private, bool master) 5936 void *private, bool master)
5741{ 5937{
5742 int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1); 5938 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5743 5939 &dev->adj_list.upper,
5744 if (ret) 5940 &upper_dev->adj_list.lower,
5745 return ret; 5941 private, master);
5746
5747 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
5748 &dev->adj_list.upper,
5749 &upper_dev->adj_list.lower,
5750 private, master);
5751 if (ret) {
5752 __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5753 return ret;
5754 }
5755
5756 return 0;
5757} 5942}
5758 5943
5759static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, 5944static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5760 struct net_device *upper_dev) 5945 struct net_device *upper_dev)
5761{ 5946{
5762 __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5763 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, 5947 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5764 &dev->adj_list.upper, 5948 &dev->adj_list.upper,
5765 &upper_dev->adj_list.lower); 5949 &upper_dev->adj_list.lower);
@@ -5770,7 +5954,6 @@ static int __netdev_upper_dev_link(struct net_device *dev,
5770 void *upper_priv, void *upper_info) 5954 void *upper_priv, void *upper_info)
5771{ 5955{
5772 struct netdev_notifier_changeupper_info changeupper_info; 5956 struct netdev_notifier_changeupper_info changeupper_info;
5773 struct netdev_adjacent *i, *j, *to_i, *to_j;
5774 int ret = 0; 5957 int ret = 0;
5775 5958
5776 ASSERT_RTNL(); 5959 ASSERT_RTNL();
@@ -5779,10 +5962,10 @@ static int __netdev_upper_dev_link(struct net_device *dev,
5779 return -EBUSY; 5962 return -EBUSY;
5780 5963
5781 /* To prevent loops, check if dev is not upper device to upper_dev. */ 5964 /* To prevent loops, check if dev is not upper device to upper_dev. */
5782 if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper)) 5965 if (netdev_has_upper_dev(upper_dev, dev))
5783 return -EBUSY; 5966 return -EBUSY;
5784 5967
5785 if (__netdev_find_adj(upper_dev, &dev->adj_list.upper)) 5968 if (netdev_has_upper_dev(dev, upper_dev))
5786 return -EEXIST; 5969 return -EEXIST;
5787 5970
5788 if (master && netdev_master_upper_dev_get(dev)) 5971 if (master && netdev_master_upper_dev_get(dev))
@@ -5804,80 +5987,15 @@ static int __netdev_upper_dev_link(struct net_device *dev,
5804 if (ret) 5987 if (ret)
5805 return ret; 5988 return ret;
5806 5989
5807 /* Now that we linked these devs, make all the upper_dev's
5808 * all_adj_list.upper visible to every dev's all_adj_list.lower an
5809 * versa, and don't forget the devices itself. All of these
5810 * links are non-neighbours.
5811 */
5812 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5813 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5814 pr_debug("Interlinking %s with %s, non-neighbour\n",
5815 i->dev->name, j->dev->name);
5816 ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
5817 if (ret)
5818 goto rollback_mesh;
5819 }
5820 }
5821
5822 /* add dev to every upper_dev's upper device */
5823 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5824 pr_debug("linking %s's upper device %s with %s\n",
5825 upper_dev->name, i->dev->name, dev->name);
5826 ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
5827 if (ret)
5828 goto rollback_upper_mesh;
5829 }
5830
5831 /* add upper_dev to every dev's lower device */
5832 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5833 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5834 i->dev->name, upper_dev->name);
5835 ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
5836 if (ret)
5837 goto rollback_lower_mesh;
5838 }
5839
5840 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, 5990 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5841 &changeupper_info.info); 5991 &changeupper_info.info);
5842 ret = notifier_to_errno(ret); 5992 ret = notifier_to_errno(ret);
5843 if (ret) 5993 if (ret)
5844 goto rollback_lower_mesh; 5994 goto rollback;
5845 5995
5846 return 0; 5996 return 0;
5847 5997
5848rollback_lower_mesh: 5998rollback:
5849 to_i = i;
5850 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5851 if (i == to_i)
5852 break;
5853 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5854 }
5855
5856 i = NULL;
5857
5858rollback_upper_mesh:
5859 to_i = i;
5860 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5861 if (i == to_i)
5862 break;
5863 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5864 }
5865
5866 i = j = NULL;
5867
5868rollback_mesh:
5869 to_i = i;
5870 to_j = j;
5871 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5872 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5873 if (i == to_i && j == to_j)
5874 break;
5875 __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5876 }
5877 if (i == to_i)
5878 break;
5879 }
5880
5881 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); 5999 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5882 6000
5883 return ret; 6001 return ret;
@@ -5934,7 +6052,7 @@ void netdev_upper_dev_unlink(struct net_device *dev,
5934 struct net_device *upper_dev) 6052 struct net_device *upper_dev)
5935{ 6053{
5936 struct netdev_notifier_changeupper_info changeupper_info; 6054 struct netdev_notifier_changeupper_info changeupper_info;
5937 struct netdev_adjacent *i, *j; 6055
5938 ASSERT_RTNL(); 6056 ASSERT_RTNL();
5939 6057
5940 changeupper_info.upper_dev = upper_dev; 6058 changeupper_info.upper_dev = upper_dev;
@@ -5946,23 +6064,6 @@ void netdev_upper_dev_unlink(struct net_device *dev,
5946 6064
5947 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); 6065 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5948 6066
5949 /* Here is the tricky part. We must remove all dev's lower
5950 * devices from all upper_dev's upper devices and vice
5951 * versa, to maintain the graph relationship.
5952 */
5953 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5954 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5955 __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5956
5957 /* remove also the devices itself from lower/upper device
5958 * list
5959 */
5960 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5961 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5962
5963 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5964 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5965
5966 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, 6067 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5967 &changeupper_info.info); 6068 &changeupper_info.info);
5968} 6069}
@@ -6118,50 +6219,6 @@ void netdev_lower_state_changed(struct net_device *lower_dev,
6118} 6219}
6119EXPORT_SYMBOL(netdev_lower_state_changed); 6220EXPORT_SYMBOL(netdev_lower_state_changed);
6120 6221
6121int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6122 struct neighbour *n)
6123{
6124 struct net_device *lower_dev, *stop_dev;
6125 struct list_head *iter;
6126 int err;
6127
6128 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6129 if (!lower_dev->netdev_ops->ndo_neigh_construct)
6130 continue;
6131 err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6132 if (err) {
6133 stop_dev = lower_dev;
6134 goto rollback;
6135 }
6136 }
6137 return 0;
6138
6139rollback:
6140 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6141 if (lower_dev == stop_dev)
6142 break;
6143 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6144 continue;
6145 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6146 }
6147 return err;
6148}
6149EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6150
6151void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6152 struct neighbour *n)
6153{
6154 struct net_device *lower_dev;
6155 struct list_head *iter;
6156
6157 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6158 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6159 continue;
6160 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6161 }
6162}
6163EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6164
6165static void dev_change_rx_flags(struct net_device *dev, int flags) 6222static void dev_change_rx_flags(struct net_device *dev, int flags)
6166{ 6223{
6167 const struct net_device_ops *ops = dev->netdev_ops; 6224 const struct net_device_ops *ops = dev->netdev_ops;
@@ -6414,8 +6471,8 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
6414 } 6471 }
6415 6472
6416 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI 6473 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6417 is important. Some (broken) drivers set IFF_PROMISC, when 6474 * is important. Some (broken) drivers set IFF_PROMISC, when
6418 IFF_ALLMULTI is requested not asking us and not reporting. 6475 * IFF_ALLMULTI is requested not asking us and not reporting.
6419 */ 6476 */
6420 if ((flags ^ dev->gflags) & IFF_ALLMULTI) { 6477 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6421 int inc = (flags & IFF_ALLMULTI) ? 1 : -1; 6478 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
@@ -6500,9 +6557,18 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
6500 if (new_mtu == dev->mtu) 6557 if (new_mtu == dev->mtu)
6501 return 0; 6558 return 0;
6502 6559
6503 /* MTU must be positive. */ 6560 /* MTU must be positive, and in range */
6504 if (new_mtu < 0) 6561 if (new_mtu < 0 || new_mtu < dev->min_mtu) {
6562 net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
6563 dev->name, new_mtu, dev->min_mtu);
6564 return -EINVAL;
6565 }
6566
6567 if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
6568 net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
6569 dev->name, new_mtu, dev->max_mtu);
6505 return -EINVAL; 6570 return -EINVAL;
6571 }
6506 6572
6507 if (!netif_device_present(dev)) 6573 if (!netif_device_present(dev))
6508 return -ENODEV; 6574 return -ENODEV;
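
The dev_set_mtu() hunk above moves range validation into the core: new_mtu is rejected below dev->min_mtu, and above dev->max_mtu whenever the latter is non-zero (zero keeps the old "no upper bound" behaviour). A hedged sketch of how a driver could advertise its range at setup time and rely on this core check instead of open-coding it in ndo_change_mtu; example_setup() is an invented name and ETH_MIN_MTU is assumed to be available from if_ether.h in this same series:

/* Sketch only: example_setup() is hypothetical, values are illustrative. */
static void example_setup(struct net_device *dev)
{
        ether_setup(dev);
        dev->min_mtu = ETH_MIN_MTU;     /* lowest MTU dev_set_mtu() will accept */
        dev->max_mtu = 9000;            /* leave at 0 to keep "no upper limit"  */
}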
@@ -6649,33 +6715,48 @@ EXPORT_SYMBOL(dev_change_proto_down);
6649 * dev_change_xdp_fd - set or clear a bpf program for a device rx path 6715 * dev_change_xdp_fd - set or clear a bpf program for a device rx path
6650 * @dev: device 6716 * @dev: device
6651 * @fd: new program fd or negative value to clear 6717 * @fd: new program fd or negative value to clear
6718 * @flags: xdp-related flags
6652 * 6719 *
6653 * Set or clear a bpf program for a device 6720 * Set or clear a bpf program for a device
6654 */ 6721 */
6655int dev_change_xdp_fd(struct net_device *dev, int fd) 6722int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
6656{ 6723{
6657 const struct net_device_ops *ops = dev->netdev_ops; 6724 const struct net_device_ops *ops = dev->netdev_ops;
6658 struct bpf_prog *prog = NULL; 6725 struct bpf_prog *prog = NULL;
6659 struct netdev_xdp xdp = {}; 6726 struct netdev_xdp xdp;
6660 int err; 6727 int err;
6661 6728
6729 ASSERT_RTNL();
6730
6662 if (!ops->ndo_xdp) 6731 if (!ops->ndo_xdp)
6663 return -EOPNOTSUPP; 6732 return -EOPNOTSUPP;
6664 if (fd >= 0) { 6733 if (fd >= 0) {
6734 if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
6735 memset(&xdp, 0, sizeof(xdp));
6736 xdp.command = XDP_QUERY_PROG;
6737
6738 err = ops->ndo_xdp(dev, &xdp);
6739 if (err < 0)
6740 return err;
6741 if (xdp.prog_attached)
6742 return -EBUSY;
6743 }
6744
6665 prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); 6745 prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6666 if (IS_ERR(prog)) 6746 if (IS_ERR(prog))
6667 return PTR_ERR(prog); 6747 return PTR_ERR(prog);
6668 } 6748 }
6669 6749
6750 memset(&xdp, 0, sizeof(xdp));
6670 xdp.command = XDP_SETUP_PROG; 6751 xdp.command = XDP_SETUP_PROG;
6671 xdp.prog = prog; 6752 xdp.prog = prog;
6753
6672 err = ops->ndo_xdp(dev, &xdp); 6754 err = ops->ndo_xdp(dev, &xdp);
6673 if (err < 0 && prog) 6755 if (err < 0 && prog)
6674 bpf_prog_put(prog); 6756 bpf_prog_put(prog);
6675 6757
6676 return err; 6758 return err;
6677} 6759}
6678EXPORT_SYMBOL(dev_change_xdp_fd);
6679 6760
6680/** 6761/**
6681 * dev_new_index - allocate an ifindex 6762 * dev_new_index - allocate an ifindex
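
The reworked dev_change_xdp_fd() above now takes a flags argument and, when XDP_FLAGS_UPDATE_IF_NOEXIST is set, issues an XDP_QUERY_PROG before XDP_SETUP_PROG so it can return -EBUSY instead of silently replacing an existing program. A rough sketch of the ndo_xdp callback shape a driver has to provide for this to work; struct example_priv and example_attach_prog() are invented:

/* Sketch only: example_priv / example_attach_prog() are hypothetical. */
static int example_ndo_xdp(struct net_device *dev, struct netdev_xdp *xdp)
{
        struct example_priv *priv = netdev_priv(dev);

        switch (xdp->command) {
        case XDP_SETUP_PROG:
                /* xdp->prog is NULL when the program is being cleared */
                return example_attach_prog(priv, xdp->prog);
        case XDP_QUERY_PROG:
                xdp->prog_attached = !!priv->xdp_prog;
                return 0;
        default:
                return -EINVAL;
        }
}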
@@ -6688,6 +6769,7 @@ EXPORT_SYMBOL(dev_change_xdp_fd);
6688static int dev_new_index(struct net *net) 6769static int dev_new_index(struct net *net)
6689{ 6770{
6690 int ifindex = net->ifindex; 6771 int ifindex = net->ifindex;
6772
6691 for (;;) { 6773 for (;;) {
6692 if (++ifindex <= 0) 6774 if (++ifindex <= 0)
6693 ifindex = 1; 6775 ifindex = 1;
@@ -6754,8 +6836,8 @@ static void rollback_registered_many(struct list_head *head)
6754 6836
6755 6837
6756 /* Notify protocols, that we are about to destroy 6838 /* Notify protocols, that we are about to destroy
6757 this device. They should clean all the things. 6839 * this device. They should clean all the things.
6758 */ 6840 */
6759 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 6841 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6760 6842
6761 if (!dev->rtnl_link_ops || 6843 if (!dev->rtnl_link_ops ||
@@ -6777,6 +6859,7 @@ static void rollback_registered_many(struct list_head *head)
6777 6859
6778 /* Notifier chain MUST detach us all upper devices. */ 6860 /* Notifier chain MUST detach us all upper devices. */
6779 WARN_ON(netdev_has_any_upper_dev(dev)); 6861 WARN_ON(netdev_has_any_upper_dev(dev));
6862 WARN_ON(netdev_has_any_lower_dev(dev));
6780 6863
6781 /* Remove entries from kobject tree */ 6864 /* Remove entries from kobject tree */
6782 netdev_unregister_kobject(dev); 6865 netdev_unregister_kobject(dev);
@@ -6912,13 +6995,6 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
6912 features &= ~dev->gso_partial_features; 6995 features &= ~dev->gso_partial_features;
6913 } 6996 }
6914 6997
6915#ifdef CONFIG_NET_RX_BUSY_POLL
6916 if (dev->netdev_ops->ndo_busy_poll)
6917 features |= NETIF_F_BUSY_POLL;
6918 else
6919#endif
6920 features &= ~NETIF_F_BUSY_POLL;
6921
6922 return features; 6998 return features;
6923} 6999}
6924 7000
@@ -7107,6 +7183,7 @@ void netif_tx_stop_all_queues(struct net_device *dev)
7107 7183
7108 for (i = 0; i < dev->num_tx_queues; i++) { 7184 for (i = 0; i < dev->num_tx_queues; i++) {
7109 struct netdev_queue *txq = netdev_get_tx_queue(dev, i); 7185 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7186
7110 netif_tx_stop_queue(txq); 7187 netif_tx_stop_queue(txq);
7111 } 7188 }
7112} 7189}
@@ -7581,17 +7658,17 @@ void netdev_freemem(struct net_device *dev)
7581} 7658}
7582 7659
7583/** 7660/**
7584 * alloc_netdev_mqs - allocate network device 7661 * alloc_netdev_mqs - allocate network device
7585 * @sizeof_priv: size of private data to allocate space for 7662 * @sizeof_priv: size of private data to allocate space for
7586 * @name: device name format string 7663 * @name: device name format string
7587 * @name_assign_type: origin of device name 7664 * @name_assign_type: origin of device name
7588 * @setup: callback to initialize device 7665 * @setup: callback to initialize device
7589 * @txqs: the number of TX subqueues to allocate 7666 * @txqs: the number of TX subqueues to allocate
7590 * @rxqs: the number of RX subqueues to allocate 7667 * @rxqs: the number of RX subqueues to allocate
7591 * 7668 *
7592 * Allocates a struct net_device with private data area for driver use 7669 * Allocates a struct net_device with private data area for driver use
7593 * and performs basic initialization. Also allocates subqueue structs 7670 * and performs basic initialization. Also allocates subqueue structs
7594 * for each queue on the device. 7671 * for each queue on the device.
7595 */ 7672 */
7596struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, 7673struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7597 unsigned char name_assign_type, 7674 unsigned char name_assign_type,
@@ -7655,8 +7732,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7655 INIT_LIST_HEAD(&dev->link_watch_list); 7732 INIT_LIST_HEAD(&dev->link_watch_list);
7656 INIT_LIST_HEAD(&dev->adj_list.upper); 7733 INIT_LIST_HEAD(&dev->adj_list.upper);
7657 INIT_LIST_HEAD(&dev->adj_list.lower); 7734 INIT_LIST_HEAD(&dev->adj_list.lower);
7658 INIT_LIST_HEAD(&dev->all_adj_list.upper);
7659 INIT_LIST_HEAD(&dev->all_adj_list.lower);
7660 INIT_LIST_HEAD(&dev->ptype_all); 7735 INIT_LIST_HEAD(&dev->ptype_all);
7661 INIT_LIST_HEAD(&dev->ptype_specific); 7736 INIT_LIST_HEAD(&dev->ptype_specific);
7662#ifdef CONFIG_NET_SCHED 7737#ifdef CONFIG_NET_SCHED
@@ -7667,7 +7742,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7667 7742
7668 if (!dev->tx_queue_len) { 7743 if (!dev->tx_queue_len) {
7669 dev->priv_flags |= IFF_NO_QUEUE; 7744 dev->priv_flags |= IFF_NO_QUEUE;
7670 dev->tx_queue_len = 1; 7745 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
7671 } 7746 }
7672 7747
7673 dev->num_tx_queues = txqs; 7748 dev->num_tx_queues = txqs;
@@ -7705,13 +7780,13 @@ free_dev:
7705EXPORT_SYMBOL(alloc_netdev_mqs); 7780EXPORT_SYMBOL(alloc_netdev_mqs);
7706 7781
7707/** 7782/**
7708 * free_netdev - free network device 7783 * free_netdev - free network device
7709 * @dev: device 7784 * @dev: device
7710 * 7785 *
7711 * This function does the last stage of destroying an allocated device 7786 * This function does the last stage of destroying an allocated device
7712 * interface. The reference to the device object is released. 7787 * interface. The reference to the device object is released. If this
7713 * If this is the last reference then it will be freed. 7788 * is the last reference then it will be freed. Must be called in process
7714 * Must be called in process context. 7789 * context.
7715 */ 7790 */
7716void free_netdev(struct net_device *dev) 7791void free_netdev(struct net_device *dev)
7717{ 7792{
@@ -7893,12 +7968,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
7893 dev_shutdown(dev); 7968 dev_shutdown(dev);
7894 7969
7895 /* Notify protocols, that we are about to destroy 7970 /* Notify protocols, that we are about to destroy
7896 this device. They should clean all the things. 7971 * this device. They should clean all the things.
7897 7972 *
7898 Note that dev->reg_state stays at NETREG_REGISTERED. 7973 * Note that dev->reg_state stays at NETREG_REGISTERED.
7899 This is wanted because this way 8021q and macvlan know 7974 * This is wanted because this way 8021q and macvlan know
7900 the device is just moving and can keep their slaves up. 7975 * the device is just moving and can keep their slaves up.
7901 */ 7976 */
7902 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 7977 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7903 rcu_barrier(); 7978 rcu_barrier();
7904 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 7979 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
@@ -7948,18 +8023,13 @@ out:
7948} 8023}
7949EXPORT_SYMBOL_GPL(dev_change_net_namespace); 8024EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7950 8025
7951static int dev_cpu_callback(struct notifier_block *nfb, 8026static int dev_cpu_dead(unsigned int oldcpu)
7952 unsigned long action,
7953 void *ocpu)
7954{ 8027{
7955 struct sk_buff **list_skb; 8028 struct sk_buff **list_skb;
7956 struct sk_buff *skb; 8029 struct sk_buff *skb;
7957 unsigned int cpu, oldcpu = (unsigned long)ocpu; 8030 unsigned int cpu;
7958 struct softnet_data *sd, *oldsd; 8031 struct softnet_data *sd, *oldsd;
7959 8032
7960 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7961 return NOTIFY_OK;
7962
7963 local_irq_disable(); 8033 local_irq_disable();
7964 cpu = smp_processor_id(); 8034 cpu = smp_processor_id();
7965 sd = &per_cpu(softnet_data, cpu); 8035 sd = &per_cpu(softnet_data, cpu);
@@ -8009,10 +8079,9 @@ static int dev_cpu_callback(struct notifier_block *nfb,
8009 input_queue_head_incr(oldsd); 8079 input_queue_head_incr(oldsd);
8010 } 8080 }
8011 8081
8012 return NOTIFY_OK; 8082 return 0;
8013} 8083}
8014 8084
8015
8016/** 8085/**
8017 * netdev_increment_features - increment feature set by one 8086 * netdev_increment_features - increment feature set by one
8018 * @all: current feature set 8087 * @all: current feature set
@@ -8346,7 +8415,9 @@ static int __init net_dev_init(void)
8346 open_softirq(NET_TX_SOFTIRQ, net_tx_action); 8415 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8347 open_softirq(NET_RX_SOFTIRQ, net_rx_action); 8416 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8348 8417
8349 hotcpu_notifier(dev_cpu_callback, 0); 8418 rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
8419 NULL, dev_cpu_dead);
8420 WARN_ON(rc < 0);
8350 dst_subsys_init(); 8421 dst_subsys_init();
8351 rc = 0; 8422 rc = 0;
8352out: 8423out:
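
net_dev_init() above replaces hotcpu_notifier() with the cpuhp state machine: dev_cpu_dead() is registered as the teardown callback for CPUHP_NET_DEV_DEAD and runs on a surviving CPU once the victim is gone, so the softnet backlog can be drained without the old CPU_DEAD/CPU_DEAD_FROZEN switch. A hedged sketch of the same registration pattern for some other subsystem; CPUHP_EXAMPLE_DEAD and example_cpu_dead() are invented names:

/* Sketch only: the state constant and callback are hypothetical. */
static int example_cpu_dead(unsigned int oldcpu)
{
        /* migrate or drain per-CPU state that belonged to @oldcpu */
        return 0;
}

static int __init example_init(void)
{
        /* _nocalls: register without invoking the callback for CPUs already up */
        return cpuhp_setup_state_nocalls(CPUHP_EXAMPLE_DEAD, "net/example:dead",
                                         NULL, example_cpu_dead);
}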
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 1b5063088f1a..e9c1e6acfb6d 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -341,15 +341,7 @@ static void devlink_nl_post_doit(const struct genl_ops *ops,
341 mutex_unlock(&devlink_mutex); 341 mutex_unlock(&devlink_mutex);
342} 342}
343 343
344static struct genl_family devlink_nl_family = { 344static struct genl_family devlink_nl_family;
345 .id = GENL_ID_GENERATE,
346 .name = DEVLINK_GENL_NAME,
347 .version = DEVLINK_GENL_VERSION,
348 .maxattr = DEVLINK_ATTR_MAX,
349 .netnsok = true,
350 .pre_doit = devlink_nl_pre_doit,
351 .post_doit = devlink_nl_post_doit,
352};
353 345
354enum devlink_multicast_groups { 346enum devlink_multicast_groups {
355 DEVLINK_MCGRP_CONFIG, 347 DEVLINK_MCGRP_CONFIG,
@@ -608,6 +600,8 @@ static int devlink_port_type_set(struct devlink *devlink,
608 if (devlink->ops && devlink->ops->port_type_set) { 600 if (devlink->ops && devlink->ops->port_type_set) {
609 if (port_type == DEVLINK_PORT_TYPE_NOTSET) 601 if (port_type == DEVLINK_PORT_TYPE_NOTSET)
610 return -EINVAL; 602 return -EINVAL;
603 if (port_type == devlink_port->type)
604 return 0;
611 err = devlink->ops->port_type_set(devlink_port, port_type); 605 err = devlink->ops->port_type_set(devlink_port, port_type);
612 if (err) 606 if (err)
613 return err; 607 return err;
@@ -1398,52 +1392,68 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb,
1398 return -EOPNOTSUPP; 1392 return -EOPNOTSUPP;
1399} 1393}
1400 1394
1401static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, 1395static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
1402 enum devlink_command cmd, u32 portid, 1396 enum devlink_command cmd, u32 portid,
1403 u32 seq, int flags, u16 mode) 1397 u32 seq, int flags)
1404{ 1398{
1399 const struct devlink_ops *ops = devlink->ops;
1405 void *hdr; 1400 void *hdr;
1401 int err = 0;
1402 u16 mode;
1403 u8 inline_mode;
1406 1404
1407 hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); 1405 hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
1408 if (!hdr) 1406 if (!hdr)
1409 return -EMSGSIZE; 1407 return -EMSGSIZE;
1410 1408
1411 if (devlink_nl_put_handle(msg, devlink)) 1409 err = devlink_nl_put_handle(msg, devlink);
1410 if (err)
1412 goto nla_put_failure; 1411 goto nla_put_failure;
1413 1412
1414 if (nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode)) 1413 if (ops->eswitch_mode_get) {
1415 goto nla_put_failure; 1414 err = ops->eswitch_mode_get(devlink, &mode);
1415 if (err)
1416 goto nla_put_failure;
1417 err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode);
1418 if (err)
1419 goto nla_put_failure;
1420 }
1421
1422 if (ops->eswitch_inline_mode_get) {
1423 err = ops->eswitch_inline_mode_get(devlink, &inline_mode);
1424 if (err)
1425 goto nla_put_failure;
1426 err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE,
1427 inline_mode);
1428 if (err)
1429 goto nla_put_failure;
1430 }
1416 1431
1417 genlmsg_end(msg, hdr); 1432 genlmsg_end(msg, hdr);
1418 return 0; 1433 return 0;
1419 1434
1420nla_put_failure: 1435nla_put_failure:
1421 genlmsg_cancel(msg, hdr); 1436 genlmsg_cancel(msg, hdr);
1422 return -EMSGSIZE; 1437 return err;
1423} 1438}
1424 1439
1425static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb, 1440static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb,
1426 struct genl_info *info) 1441 struct genl_info *info)
1427{ 1442{
1428 struct devlink *devlink = info->user_ptr[0]; 1443 struct devlink *devlink = info->user_ptr[0];
1429 const struct devlink_ops *ops = devlink->ops; 1444 const struct devlink_ops *ops = devlink->ops;
1430 struct sk_buff *msg; 1445 struct sk_buff *msg;
1431 u16 mode;
1432 int err; 1446 int err;
1433 1447
1434 if (!ops || !ops->eswitch_mode_get) 1448 if (!ops)
1435 return -EOPNOTSUPP; 1449 return -EOPNOTSUPP;
1436 1450
1437 err = ops->eswitch_mode_get(devlink, &mode);
1438 if (err)
1439 return err;
1440
1441 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 1451 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
1442 if (!msg) 1452 if (!msg)
1443 return -ENOMEM; 1453 return -ENOMEM;
1444 1454
1445 err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET, 1455 err = devlink_nl_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET,
1446 info->snd_portid, info->snd_seq, 0, mode); 1456 info->snd_portid, info->snd_seq, 0);
1447 1457
1448 if (err) { 1458 if (err) {
1449 nlmsg_free(msg); 1459 nlmsg_free(msg);
@@ -1453,21 +1463,38 @@ static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb,
1453 return genlmsg_reply(msg, info); 1463 return genlmsg_reply(msg, info);
1454} 1464}
1455 1465
1456static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb, 1466static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
1457 struct genl_info *info) 1467 struct genl_info *info)
1458{ 1468{
1459 struct devlink *devlink = info->user_ptr[0]; 1469 struct devlink *devlink = info->user_ptr[0];
1460 const struct devlink_ops *ops = devlink->ops; 1470 const struct devlink_ops *ops = devlink->ops;
1461 u16 mode; 1471 u16 mode;
1472 u8 inline_mode;
1473 int err = 0;
1462 1474
1463 if (!info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) 1475 if (!ops)
1464 return -EINVAL; 1476 return -EOPNOTSUPP;
1477
1478 if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) {
1479 if (!ops->eswitch_mode_set)
1480 return -EOPNOTSUPP;
1481 mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
1482 err = ops->eswitch_mode_set(devlink, mode);
1483 if (err)
1484 return err;
1485 }
1465 1486
1466 mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); 1487 if (info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]) {
1488 if (!ops->eswitch_inline_mode_set)
1489 return -EOPNOTSUPP;
1490 inline_mode = nla_get_u8(
1491 info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]);
1492 err = ops->eswitch_inline_mode_set(devlink, inline_mode);
1493 if (err)
1494 return err;
1495 }
1467 1496
1468 if (ops && ops->eswitch_mode_set) 1497 return 0;
1469 return ops->eswitch_mode_set(devlink, mode);
1470 return -EOPNOTSUPP;
1471} 1498}
1472 1499
1473static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { 1500static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
@@ -1484,6 +1511,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
1484 [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 }, 1511 [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 },
1485 [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, 1512 [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 },
1486 [DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 }, 1513 [DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 },
1514 [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 },
1487}; 1515};
1488 1516
1489static const struct genl_ops devlink_nl_ops[] = { 1517static const struct genl_ops devlink_nl_ops[] = {
@@ -1603,21 +1631,35 @@ static const struct genl_ops devlink_nl_ops[] = {
1603 DEVLINK_NL_FLAG_LOCK_PORTS, 1631 DEVLINK_NL_FLAG_LOCK_PORTS,
1604 }, 1632 },
1605 { 1633 {
1606 .cmd = DEVLINK_CMD_ESWITCH_MODE_GET, 1634 .cmd = DEVLINK_CMD_ESWITCH_GET,
1607 .doit = devlink_nl_cmd_eswitch_mode_get_doit, 1635 .doit = devlink_nl_cmd_eswitch_get_doit,
1608 .policy = devlink_nl_policy, 1636 .policy = devlink_nl_policy,
1609 .flags = GENL_ADMIN_PERM, 1637 .flags = GENL_ADMIN_PERM,
1610 .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, 1638 .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
1611 }, 1639 },
1612 { 1640 {
1613 .cmd = DEVLINK_CMD_ESWITCH_MODE_SET, 1641 .cmd = DEVLINK_CMD_ESWITCH_SET,
1614 .doit = devlink_nl_cmd_eswitch_mode_set_doit, 1642 .doit = devlink_nl_cmd_eswitch_set_doit,
1615 .policy = devlink_nl_policy, 1643 .policy = devlink_nl_policy,
1616 .flags = GENL_ADMIN_PERM, 1644 .flags = GENL_ADMIN_PERM,
1617 .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, 1645 .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
1618 }, 1646 },
1619}; 1647};
1620 1648
1649static struct genl_family devlink_nl_family __ro_after_init = {
1650 .name = DEVLINK_GENL_NAME,
1651 .version = DEVLINK_GENL_VERSION,
1652 .maxattr = DEVLINK_ATTR_MAX,
1653 .netnsok = true,
1654 .pre_doit = devlink_nl_pre_doit,
1655 .post_doit = devlink_nl_post_doit,
1656 .module = THIS_MODULE,
1657 .ops = devlink_nl_ops,
1658 .n_ops = ARRAY_SIZE(devlink_nl_ops),
1659 .mcgrps = devlink_nl_mcgrps,
1660 .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps),
1661};
1662
1621/** 1663/**
1622 * devlink_alloc - Allocate new devlink instance resources 1664 * devlink_alloc - Allocate new devlink instance resources
1623 * 1665 *
@@ -1840,9 +1882,7 @@ EXPORT_SYMBOL_GPL(devlink_sb_unregister);
1840 1882
1841static int __init devlink_module_init(void) 1883static int __init devlink_module_init(void)
1842{ 1884{
1843 return genl_register_family_with_ops_groups(&devlink_nl_family, 1885 return genl_register_family(&devlink_nl_family);
1844 devlink_nl_ops,
1845 devlink_nl_mcgrps);
1846} 1886}
1847 1887
1848static void __exit devlink_module_exit(void) 1888static void __exit devlink_module_exit(void)
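
The devlink changes above (and the drop_monitor ones below) convert to the newer genetlink registration style: no GENL_ID_GENERATE, the ops/mcgrps arrays are referenced from a statically defined family, and a single genl_register_family() call replaces genl_register_family_with_ops_groups(). Condensed, the pattern looks roughly like this for a hypothetical "example" family:

/* Sketch only: family name, ops and attribute count are placeholders. */
static const struct genl_ops example_nl_ops[] = {
        /* { .cmd = ..., .doit = ..., .policy = ... }, */
};

static struct genl_family example_nl_family __ro_after_init = {
        .name    = "example",
        .version = 1,
        .maxattr = 0,
        .module  = THIS_MODULE,
        .ops     = example_nl_ops,
        .n_ops   = ARRAY_SIZE(example_nl_ops),
};

static int __init example_init(void)
{
        return genl_register_family(&example_nl_family);
}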
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 72cfb0c61125..fb55327dcfea 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -59,12 +59,7 @@ struct dm_hw_stat_delta {
59 unsigned long last_drop_val; 59 unsigned long last_drop_val;
60}; 60};
61 61
62static struct genl_family net_drop_monitor_family = { 62static struct genl_family net_drop_monitor_family;
63 .id = GENL_ID_GENERATE,
64 .hdrsize = 0,
65 .name = "NET_DM",
66 .version = 2,
67};
68 63
69static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); 64static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
70 65
@@ -80,6 +75,7 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
80 struct nlattr *nla; 75 struct nlattr *nla;
81 struct sk_buff *skb; 76 struct sk_buff *skb;
82 unsigned long flags; 77 unsigned long flags;
78 void *msg_header;
83 79
84 al = sizeof(struct net_dm_alert_msg); 80 al = sizeof(struct net_dm_alert_msg);
85 al += dm_hit_limit * sizeof(struct net_dm_drop_point); 81 al += dm_hit_limit * sizeof(struct net_dm_drop_point);
@@ -87,21 +83,41 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
87 83
88 skb = genlmsg_new(al, GFP_KERNEL); 84 skb = genlmsg_new(al, GFP_KERNEL);
89 85
90 if (skb) { 86 if (!skb)
91 genlmsg_put(skb, 0, 0, &net_drop_monitor_family, 87 goto err;
92 0, NET_DM_CMD_ALERT); 88
93 nla = nla_reserve(skb, NLA_UNSPEC, 89 msg_header = genlmsg_put(skb, 0, 0, &net_drop_monitor_family,
94 sizeof(struct net_dm_alert_msg)); 90 0, NET_DM_CMD_ALERT);
95 msg = nla_data(nla); 91 if (!msg_header) {
96 memset(msg, 0, al); 92 nlmsg_free(skb);
97 } else { 93 skb = NULL;
98 mod_timer(&data->send_timer, jiffies + HZ / 10); 94 goto err;
95 }
96 nla = nla_reserve(skb, NLA_UNSPEC,
97 sizeof(struct net_dm_alert_msg));
98 if (!nla) {
99 nlmsg_free(skb);
100 skb = NULL;
101 goto err;
99 } 102 }
103 msg = nla_data(nla);
104 memset(msg, 0, al);
105 goto out;
100 106
107err:
108 mod_timer(&data->send_timer, jiffies + HZ / 10);
109out:
101 spin_lock_irqsave(&data->lock, flags); 110 spin_lock_irqsave(&data->lock, flags);
102 swap(data->skb, skb); 111 swap(data->skb, skb);
103 spin_unlock_irqrestore(&data->lock, flags); 112 spin_unlock_irqrestore(&data->lock, flags);
104 113
114 if (skb) {
115 struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data;
116 struct genlmsghdr *gnlh = (struct genlmsghdr *)nlmsg_data(nlh);
117
118 genlmsg_end(skb, genlmsg_data(gnlh));
119 }
120
105 return skb; 121 return skb;
106} 122}
107 123
@@ -351,6 +367,17 @@ static const struct genl_ops dropmon_ops[] = {
351 }, 367 },
352}; 368};
353 369
370static struct genl_family net_drop_monitor_family __ro_after_init = {
371 .hdrsize = 0,
372 .name = "NET_DM",
373 .version = 2,
374 .module = THIS_MODULE,
375 .ops = dropmon_ops,
376 .n_ops = ARRAY_SIZE(dropmon_ops),
377 .mcgrps = dropmon_mcgrps,
378 .n_mcgrps = ARRAY_SIZE(dropmon_mcgrps),
379};
380
354static struct notifier_block dropmon_net_notifier = { 381static struct notifier_block dropmon_net_notifier = {
355 .notifier_call = dropmon_net_event 382 .notifier_call = dropmon_net_event
356}; 383};
@@ -367,8 +394,7 @@ static int __init init_net_drop_monitor(void)
367 return -ENOSPC; 394 return -ENOSPC;
368 } 395 }
369 396
370 rc = genl_register_family_with_ops_groups(&net_drop_monitor_family, 397 rc = genl_register_family(&net_drop_monitor_family);
371 dropmon_ops, dropmon_mcgrps);
372 if (rc) { 398 if (rc) {
373 pr_err("Could not create drop monitor netlink family\n"); 399 pr_err("Could not create drop monitor netlink family\n");
374 return rc; 400 return rc;
diff --git a/net/core/dst.c b/net/core/dst.c
index b5cbbe07f786..960e503b5a52 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -190,7 +190,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
190 dst->__use = 0; 190 dst->__use = 0;
191 dst->lastuse = jiffies; 191 dst->lastuse = jiffies;
192 dst->flags = flags; 192 dst->flags = flags;
193 dst->pending_confirm = 0;
194 dst->next = NULL; 193 dst->next = NULL;
195 if (!(flags & DST_NOCOUNT)) 194 if (!(flags & DST_NOCOUNT))
196 dst_entries_add(ops, 1); 195 dst_entries_add(ops, 1);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 047a1752ece1..aecb2c7241b6 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -24,7 +24,7 @@
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/rtnetlink.h> 26#include <linux/rtnetlink.h>
27#include <linux/sched.h> 27#include <linux/sched/signal.h>
28#include <linux/net.h> 28#include <linux/net.h>
29 29
30/* 30/*
@@ -102,7 +102,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
102 [NETIF_F_RXFCS_BIT] = "rx-fcs", 102 [NETIF_F_RXFCS_BIT] = "rx-fcs",
103 [NETIF_F_RXALL_BIT] = "rx-all", 103 [NETIF_F_RXALL_BIT] = "rx-all",
104 [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", 104 [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
105 [NETIF_F_BUSY_POLL_BIT] = "busy-poll",
106 [NETIF_F_HW_TC_BIT] = "hw-tc-offload", 105 [NETIF_F_HW_TC_BIT] = "hw-tc-offload",
107}; 106};
108 107
@@ -119,6 +118,12 @@ tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
119 [ETHTOOL_TX_COPYBREAK] = "tx-copybreak", 118 [ETHTOOL_TX_COPYBREAK] = "tx-copybreak",
120}; 119};
121 120
121static const char
122phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
123 [ETHTOOL_ID_UNSPEC] = "Unspec",
124 [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift",
125};
126
122static int ethtool_get_features(struct net_device *dev, void __user *useraddr) 127static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
123{ 128{
124 struct ethtool_gfeatures cmd = { 129 struct ethtool_gfeatures cmd = {
@@ -227,6 +232,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
227 if (sset == ETH_SS_TUNABLES) 232 if (sset == ETH_SS_TUNABLES)
228 return ARRAY_SIZE(tunable_strings); 233 return ARRAY_SIZE(tunable_strings);
229 234
235 if (sset == ETH_SS_PHY_TUNABLES)
236 return ARRAY_SIZE(phy_tunable_strings);
237
230 if (sset == ETH_SS_PHY_STATS) { 238 if (sset == ETH_SS_PHY_STATS) {
231 if (dev->phydev) 239 if (dev->phydev)
232 return phy_get_sset_count(dev->phydev); 240 return phy_get_sset_count(dev->phydev);
@@ -253,6 +261,8 @@ static void __ethtool_get_strings(struct net_device *dev,
253 sizeof(rss_hash_func_strings)); 261 sizeof(rss_hash_func_strings));
254 else if (stringset == ETH_SS_TUNABLES) 262 else if (stringset == ETH_SS_TUNABLES)
255 memcpy(data, tunable_strings, sizeof(tunable_strings)); 263 memcpy(data, tunable_strings, sizeof(tunable_strings));
264 else if (stringset == ETH_SS_PHY_TUNABLES)
265 memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings));
256 else if (stringset == ETH_SS_PHY_STATS) { 266 else if (stringset == ETH_SS_PHY_STATS) {
257 struct phy_device *phydev = dev->phydev; 267 struct phy_device *phydev = dev->phydev;
258 268
@@ -1394,9 +1404,12 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
1394 if (regs.len > reglen) 1404 if (regs.len > reglen)
1395 regs.len = reglen; 1405 regs.len = reglen;
1396 1406
1397 regbuf = vzalloc(reglen); 1407 regbuf = NULL;
1398 if (reglen && !regbuf) 1408 if (reglen) {
1399 return -ENOMEM; 1409 regbuf = vzalloc(reglen);
1410 if (!regbuf)
1411 return -ENOMEM;
1412 }
1400 1413
1401 ops->get_regs(dev, &regs, regbuf); 1414 ops->get_regs(dev, &regs, regbuf);
1402 1415
@@ -1701,7 +1714,7 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
1701static noinline_for_stack int ethtool_set_channels(struct net_device *dev, 1714static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
1702 void __user *useraddr) 1715 void __user *useraddr)
1703{ 1716{
1704 struct ethtool_channels channels, max; 1717 struct ethtool_channels channels, max = { .cmd = ETHTOOL_GCHANNELS };
1705 u32 max_rx_in_use = 0; 1718 u32 max_rx_in_use = 0;
1706 1719
1707 if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) 1720 if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels)
@@ -1806,11 +1819,13 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
1806 ret = __ethtool_get_sset_count(dev, gstrings.string_set); 1819 ret = __ethtool_get_sset_count(dev, gstrings.string_set);
1807 if (ret < 0) 1820 if (ret < 0)
1808 return ret; 1821 return ret;
1822 if (ret > S32_MAX / ETH_GSTRING_LEN)
1823 return -ENOMEM;
1824 WARN_ON_ONCE(!ret);
1809 1825
1810 gstrings.len = ret; 1826 gstrings.len = ret;
1811 1827 data = vzalloc(gstrings.len * ETH_GSTRING_LEN);
1812 data = kcalloc(gstrings.len, ETH_GSTRING_LEN, GFP_USER); 1828 if (gstrings.len && !data)
1813 if (!data)
1814 return -ENOMEM; 1829 return -ENOMEM;
1815 1830
1816 __ethtool_get_strings(dev, gstrings.string_set, data); 1831 __ethtool_get_strings(dev, gstrings.string_set, data);
@@ -1819,12 +1834,13 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
1819 if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) 1834 if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
1820 goto out; 1835 goto out;
1821 useraddr += sizeof(gstrings); 1836 useraddr += sizeof(gstrings);
1822 if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) 1837 if (gstrings.len &&
1838 copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
1823 goto out; 1839 goto out;
1824 ret = 0; 1840 ret = 0;
1825 1841
1826out: 1842out:
1827 kfree(data); 1843 vfree(data);
1828 return ret; 1844 return ret;
1829} 1845}
1830 1846
@@ -1901,14 +1917,15 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
1901 n_stats = ops->get_sset_count(dev, ETH_SS_STATS); 1917 n_stats = ops->get_sset_count(dev, ETH_SS_STATS);
1902 if (n_stats < 0) 1918 if (n_stats < 0)
1903 return n_stats; 1919 return n_stats;
1904 WARN_ON(n_stats == 0); 1920 if (n_stats > S32_MAX / sizeof(u64))
1905 1921 return -ENOMEM;
1922 WARN_ON_ONCE(!n_stats);
1906 if (copy_from_user(&stats, useraddr, sizeof(stats))) 1923 if (copy_from_user(&stats, useraddr, sizeof(stats)))
1907 return -EFAULT; 1924 return -EFAULT;
1908 1925
1909 stats.n_stats = n_stats; 1926 stats.n_stats = n_stats;
1910 data = kmalloc(n_stats * sizeof(u64), GFP_USER); 1927 data = vzalloc(n_stats * sizeof(u64));
1911 if (!data) 1928 if (n_stats && !data)
1912 return -ENOMEM; 1929 return -ENOMEM;
1913 1930
1914 ops->get_ethtool_stats(dev, &stats, data); 1931 ops->get_ethtool_stats(dev, &stats, data);
@@ -1917,12 +1934,12 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
1917 if (copy_to_user(useraddr, &stats, sizeof(stats))) 1934 if (copy_to_user(useraddr, &stats, sizeof(stats)))
1918 goto out; 1935 goto out;
1919 useraddr += sizeof(stats); 1936 useraddr += sizeof(stats);
1920 if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64))) 1937 if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64)))
1921 goto out; 1938 goto out;
1922 ret = 0; 1939 ret = 0;
1923 1940
1924 out: 1941 out:
1925 kfree(data); 1942 vfree(data);
1926 return ret; 1943 return ret;
1927} 1944}
1928 1945
@@ -1937,17 +1954,18 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
1937 return -EOPNOTSUPP; 1954 return -EOPNOTSUPP;
1938 1955
1939 n_stats = phy_get_sset_count(phydev); 1956 n_stats = phy_get_sset_count(phydev);
1940
1941 if (n_stats < 0) 1957 if (n_stats < 0)
1942 return n_stats; 1958 return n_stats;
1943 WARN_ON(n_stats == 0); 1959 if (n_stats > S32_MAX / sizeof(u64))
1960 return -ENOMEM;
1961 WARN_ON_ONCE(!n_stats);
1944 1962
1945 if (copy_from_user(&stats, useraddr, sizeof(stats))) 1963 if (copy_from_user(&stats, useraddr, sizeof(stats)))
1946 return -EFAULT; 1964 return -EFAULT;
1947 1965
1948 stats.n_stats = n_stats; 1966 stats.n_stats = n_stats;
1949 data = kmalloc_array(n_stats, sizeof(u64), GFP_USER); 1967 data = vzalloc(n_stats * sizeof(u64));
1950 if (!data) 1968 if (n_stats && !data)
1951 return -ENOMEM; 1969 return -ENOMEM;
1952 1970
1953 mutex_lock(&phydev->lock); 1971 mutex_lock(&phydev->lock);
@@ -1958,12 +1976,12 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
1958 if (copy_to_user(useraddr, &stats, sizeof(stats))) 1976 if (copy_to_user(useraddr, &stats, sizeof(stats)))
1959 goto out; 1977 goto out;
1960 useraddr += sizeof(stats); 1978 useraddr += sizeof(stats);
1961 if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64))) 1979 if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64)))
1962 goto out; 1980 goto out;
1963 ret = 0; 1981 ret = 0;
1964 1982
1965 out: 1983 out:
1966 kfree(data); 1984 vfree(data);
1967 return ret; 1985 return ret;
1968} 1986}
1969 1987
@@ -2422,6 +2440,85 @@ static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
2422 }; 2440 };
2423} 2441}
2424 2442
2443static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna)
2444{
2445 switch (tuna->id) {
2446 case ETHTOOL_PHY_DOWNSHIFT:
2447 if (tuna->len != sizeof(u8) ||
2448 tuna->type_id != ETHTOOL_TUNABLE_U8)
2449 return -EINVAL;
2450 break;
2451 default:
2452 return -EINVAL;
2453 }
2454
2455 return 0;
2456}
2457
2458static int get_phy_tunable(struct net_device *dev, void __user *useraddr)
2459{
2460 int ret;
2461 struct ethtool_tunable tuna;
2462 struct phy_device *phydev = dev->phydev;
2463 void *data;
2464
2465 if (!(phydev && phydev->drv && phydev->drv->get_tunable))
2466 return -EOPNOTSUPP;
2467
2468 if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
2469 return -EFAULT;
2470 ret = ethtool_phy_tunable_valid(&tuna);
2471 if (ret)
2472 return ret;
2473 data = kmalloc(tuna.len, GFP_USER);
2474 if (!data)
2475 return -ENOMEM;
2476 mutex_lock(&phydev->lock);
2477 ret = phydev->drv->get_tunable(phydev, &tuna, data);
2478 mutex_unlock(&phydev->lock);
2479 if (ret)
2480 goto out;
2481 useraddr += sizeof(tuna);
2482 ret = -EFAULT;
2483 if (copy_to_user(useraddr, data, tuna.len))
2484 goto out;
2485 ret = 0;
2486
2487out:
2488 kfree(data);
2489 return ret;
2490}
2491
2492static int set_phy_tunable(struct net_device *dev, void __user *useraddr)
2493{
2494 int ret;
2495 struct ethtool_tunable tuna;
2496 struct phy_device *phydev = dev->phydev;
2497 void *data;
2498
2499 if (!(phydev && phydev->drv && phydev->drv->set_tunable))
2500 return -EOPNOTSUPP;
2501 if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
2502 return -EFAULT;
2503 ret = ethtool_phy_tunable_valid(&tuna);
2504 if (ret)
2505 return ret;
2506 data = kmalloc(tuna.len, GFP_USER);
2507 if (!data)
2508 return -ENOMEM;
2509 useraddr += sizeof(tuna);
2510 ret = -EFAULT;
2511 if (copy_from_user(data, useraddr, tuna.len))
2512 goto out;
2513 mutex_lock(&phydev->lock);
2514 ret = phydev->drv->set_tunable(phydev, &tuna, data);
2515 mutex_unlock(&phydev->lock);
2516
2517out:
2518 kfree(data);
2519 return ret;
2520}
2521
2425/* The main entry point in this file. Called from net/core/dev_ioctl.c */ 2522/* The main entry point in this file. Called from net/core/dev_ioctl.c */
2426 2523
2427int dev_ethtool(struct net *net, struct ifreq *ifr) 2524int dev_ethtool(struct net *net, struct ifreq *ifr)
@@ -2479,6 +2576,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
2479 case ETHTOOL_GET_TS_INFO: 2576 case ETHTOOL_GET_TS_INFO:
2480 case ETHTOOL_GEEE: 2577 case ETHTOOL_GEEE:
2481 case ETHTOOL_GTUNABLE: 2578 case ETHTOOL_GTUNABLE:
2579 case ETHTOOL_PHY_GTUNABLE:
2482 case ETHTOOL_GLINKSETTINGS: 2580 case ETHTOOL_GLINKSETTINGS:
2483 break; 2581 break;
2484 default: 2582 default:
@@ -2685,6 +2783,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
2685 case ETHTOOL_SLINKSETTINGS: 2783 case ETHTOOL_SLINKSETTINGS:
2686 rc = ethtool_set_link_ksettings(dev, useraddr); 2784 rc = ethtool_set_link_ksettings(dev, useraddr);
2687 break; 2785 break;
2786 case ETHTOOL_PHY_GTUNABLE:
2787 rc = get_phy_tunable(dev, useraddr);
2788 break;
2789 case ETHTOOL_PHY_STUNABLE:
2790 rc = set_phy_tunable(dev, useraddr);
2791 break;
2688 default: 2792 default:
2689 rc = -EOPNOTSUPP; 2793 rc = -EOPNOTSUPP;
2690 } 2794 }
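
The ethtool hunks above add the ETHTOOL_PHY_GTUNABLE/ETHTOOL_PHY_STUNABLE ioctls and route them to phydev->drv->get_tunable()/set_tunable() under phydev->lock, with ETHTOOL_PHY_DOWNSHIFT as the only tunable validated so far. A hedged sketch of how a PHY driver might wire up the pair; example_get_downshift()/example_set_downshift() are invented, and the prototypes are inferred from the calls in this diff:

/* Sketch only: the downshift helpers are hypothetical. */
static int example_get_tunable(struct phy_device *phydev,
                               struct ethtool_tunable *tuna, void *data)
{
        switch (tuna->id) {
        case ETHTOOL_PHY_DOWNSHIFT:
                return example_get_downshift(phydev, (u8 *)data);
        default:
                return -EOPNOTSUPP;
        }
}

static int example_set_tunable(struct phy_device *phydev,
                               struct ethtool_tunable *tuna, const void *data)
{
        switch (tuna->id) {
        case ETHTOOL_PHY_DOWNSHIFT:
                return example_set_downshift(phydev, *(const u8 *)data);
        default:
                return -EOPNOTSUPP;
        }
}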
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index be4629c344a6..b6791d94841d 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -18,6 +18,11 @@
18#include <net/fib_rules.h> 18#include <net/fib_rules.h>
19#include <net/ip_tunnels.h> 19#include <net/ip_tunnels.h>
20 20
21static const struct fib_kuid_range fib_kuid_range_unset = {
22 KUIDT_INIT(0),
23 KUIDT_INIT(~0),
24};
25
21int fib_default_rule_add(struct fib_rules_ops *ops, 26int fib_default_rule_add(struct fib_rules_ops *ops,
22 u32 pref, u32 table, u32 flags) 27 u32 pref, u32 table, u32 flags)
23{ 28{
@@ -33,6 +38,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
33 r->table = table; 38 r->table = table;
34 r->flags = flags; 39 r->flags = flags;
35 r->fr_net = ops->fro_net; 40 r->fr_net = ops->fro_net;
41 r->uid_range = fib_kuid_range_unset;
36 42
37 r->suppress_prefixlen = -1; 43 r->suppress_prefixlen = -1;
38 r->suppress_ifgroup = -1; 44 r->suppress_ifgroup = -1;
@@ -172,6 +178,34 @@ void fib_rules_unregister(struct fib_rules_ops *ops)
172} 178}
173EXPORT_SYMBOL_GPL(fib_rules_unregister); 179EXPORT_SYMBOL_GPL(fib_rules_unregister);
174 180
181static int uid_range_set(struct fib_kuid_range *range)
182{
183 return uid_valid(range->start) && uid_valid(range->end);
184}
185
186static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb)
187{
188 struct fib_rule_uid_range *in;
189 struct fib_kuid_range out;
190
191 in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]);
192
193 out.start = make_kuid(current_user_ns(), in->start);
194 out.end = make_kuid(current_user_ns(), in->end);
195
196 return out;
197}
198
199static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range)
200{
201 struct fib_rule_uid_range out = {
202 from_kuid_munged(current_user_ns(), range->start),
203 from_kuid_munged(current_user_ns(), range->end)
204 };
205
206 return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out);
207}
208
175static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, 209static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
176 struct flowi *fl, int flags, 210 struct flowi *fl, int flags,
177 struct fib_lookup_arg *arg) 211 struct fib_lookup_arg *arg)
@@ -193,6 +227,10 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
193 if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg)) 227 if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg))
194 goto out; 228 goto out;
195 229
230 if (uid_lt(fl->flowi_uid, rule->uid_range.start) ||
231 uid_gt(fl->flowi_uid, rule->uid_range.end))
232 goto out;
233
196 ret = ops->match(rule, fl, flags); 234 ret = ops->match(rule, fl, flags);
197out: 235out:
198 return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; 236 return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
@@ -305,6 +343,10 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
305 if (r->l3mdev != rule->l3mdev) 343 if (r->l3mdev != rule->l3mdev)
306 continue; 344 continue;
307 345
346 if (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
347 !uid_eq(r->uid_range.end, rule->uid_range.end))
348 continue;
349
308 if (!ops->compare(r, frh, tb)) 350 if (!ops->compare(r, frh, tb))
309 continue; 351 continue;
310 return 1; 352 return 1;
@@ -429,6 +471,21 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh)
429 if (rule->l3mdev && rule->table) 471 if (rule->l3mdev && rule->table)
430 goto errout_free; 472 goto errout_free;
431 473
474 if (tb[FRA_UID_RANGE]) {
475 if (current_user_ns() != net->user_ns) {
476 err = -EPERM;
477 goto errout_free;
478 }
479
480 rule->uid_range = nla_get_kuid_range(tb);
481
482 if (!uid_range_set(&rule->uid_range) ||
483 !uid_lte(rule->uid_range.start, rule->uid_range.end))
484 goto errout_free;
485 } else {
486 rule->uid_range = fib_kuid_range_unset;
487 }
488
432 if ((nlh->nlmsg_flags & NLM_F_EXCL) && 489 if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
433 rule_exists(ops, frh, tb, rule)) { 490 rule_exists(ops, frh, tb, rule)) {
434 err = -EEXIST; 491 err = -EEXIST;
@@ -497,6 +554,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
497 struct fib_rules_ops *ops = NULL; 554 struct fib_rules_ops *ops = NULL;
498 struct fib_rule *rule, *tmp; 555 struct fib_rule *rule, *tmp;
499 struct nlattr *tb[FRA_MAX+1]; 556 struct nlattr *tb[FRA_MAX+1];
557 struct fib_kuid_range range;
500 int err = -EINVAL; 558 int err = -EINVAL;
501 559
502 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) 560 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
@@ -516,6 +574,14 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
516 if (err < 0) 574 if (err < 0)
517 goto errout; 575 goto errout;
518 576
577 if (tb[FRA_UID_RANGE]) {
578 range = nla_get_kuid_range(tb);
579 if (!uid_range_set(&range))
580 goto errout;
581 } else {
582 range = fib_kuid_range_unset;
583 }
584
519 list_for_each_entry(rule, &ops->rules_list, list) { 585 list_for_each_entry(rule, &ops->rules_list, list) {
520 if (frh->action && (frh->action != rule->action)) 586 if (frh->action && (frh->action != rule->action))
521 continue; 587 continue;
@@ -552,6 +618,11 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
552 (rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV]))) 618 (rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV])))
553 continue; 619 continue;
554 620
621 if (uid_range_set(&range) &&
622 (!uid_eq(rule->uid_range.start, range.start) ||
623 !uid_eq(rule->uid_range.end, range.end)))
624 continue;
625
555 if (!ops->compare(rule, frh, tb)) 626 if (!ops->compare(rule, frh, tb))
556 continue; 627 continue;
557 628
@@ -619,7 +690,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
619 + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ 690 + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
620 + nla_total_size(4) /* FRA_FWMARK */ 691 + nla_total_size(4) /* FRA_FWMARK */
621 + nla_total_size(4) /* FRA_FWMASK */ 692 + nla_total_size(4) /* FRA_FWMASK */
622 + nla_total_size_64bit(8); /* FRA_TUN_ID */ 693 + nla_total_size_64bit(8) /* FRA_TUN_ID */
694 + nla_total_size(sizeof(struct fib_kuid_range));
623 695
624 if (ops->nlmsg_payload) 696 if (ops->nlmsg_payload)
625 payload += ops->nlmsg_payload(rule); 697 payload += ops->nlmsg_payload(rule);
@@ -679,7 +751,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
679 (rule->tun_id && 751 (rule->tun_id &&
680 nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) || 752 nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) ||
681 (rule->l3mdev && 753 (rule->l3mdev &&
682 nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev))) 754 nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) ||
755 (uid_range_set(&rule->uid_range) &&
756 nla_put_uid_range(skb, &rule->uid_range)))
683 goto nla_put_failure; 757 goto nla_put_failure;
684 758
685 if (rule->suppress_ifgroup != -1) { 759 if (rule->suppress_ifgroup != -1) {
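
The fib_rules hunks above add an FRA_UID_RANGE attribute (a struct fib_rule_uid_range carrying start/end UIDs, translated into kuids in the caller's namespace), match it against flowi_uid in fib_rule_match(), and fall back to an all-encompassing default range for rules created without it. The matching iproute2 selector is presumably the "uidrange" keyword, though that is outside this diff. A small sketch restating the kernel-side match semantics:

/* Sketch only: restates the inclusive-range check added above. */
static bool example_uid_matches(kuid_t flow_uid,
                                const struct fib_kuid_range *range)
{
        /* a rule matches when range->start <= flow_uid <= range->end */
        return !(uid_lt(flow_uid, range->start) ||
                 uid_gt(flow_uid, range->end));
}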
diff --git a/net/core/filter.c b/net/core/filter.c
index b391209838ef..ebaeaf2e46e8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -30,6 +30,7 @@
30#include <linux/inet.h> 30#include <linux/inet.h>
31#include <linux/netdevice.h> 31#include <linux/netdevice.h>
32#include <linux/if_packet.h> 32#include <linux/if_packet.h>
33#include <linux/if_arp.h>
33#include <linux/gfp.h> 34#include <linux/gfp.h>
34#include <net/ip.h> 35#include <net/ip.h>
35#include <net/protocol.h> 36#include <net/protocol.h>
@@ -39,7 +40,7 @@
39#include <net/flow_dissector.h> 40#include <net/flow_dissector.h>
40#include <linux/errno.h> 41#include <linux/errno.h>
41#include <linux/timer.h> 42#include <linux/timer.h>
42#include <asm/uaccess.h> 43#include <linux/uaccess.h>
43#include <asm/unaligned.h> 44#include <asm/unaligned.h>
44#include <linux/filter.h> 45#include <linux/filter.h>
45#include <linux/ratelimit.h> 46#include <linux/ratelimit.h>
@@ -75,8 +76,13 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
75 * allow SOCK_MEMALLOC sockets to use it as this socket is 76 * allow SOCK_MEMALLOC sockets to use it as this socket is
76 * helping free memory 77 * helping free memory
77 */ 78 */
78 if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) 79 if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
80 NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
79 return -ENOMEM; 81 return -ENOMEM;
82 }
83 err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
84 if (err)
85 return err;
80 86
81 err = security_sock_rcv_skb(sk, skb); 87 err = security_sock_rcv_skb(sk, skb);
82 if (err) 88 if (err)
@@ -1411,8 +1417,8 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
1411 .ret_type = RET_INTEGER, 1417 .ret_type = RET_INTEGER,
1412 .arg1_type = ARG_PTR_TO_CTX, 1418 .arg1_type = ARG_PTR_TO_CTX,
1413 .arg2_type = ARG_ANYTHING, 1419 .arg2_type = ARG_ANYTHING,
1414 .arg3_type = ARG_PTR_TO_STACK, 1420 .arg3_type = ARG_PTR_TO_MEM,
1415 .arg4_type = ARG_CONST_STACK_SIZE, 1421 .arg4_type = ARG_CONST_SIZE,
1416 .arg5_type = ARG_ANYTHING, 1422 .arg5_type = ARG_ANYTHING,
1417}; 1423};
1418 1424
@@ -1442,8 +1448,8 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
1442 .ret_type = RET_INTEGER, 1448 .ret_type = RET_INTEGER,
1443 .arg1_type = ARG_PTR_TO_CTX, 1449 .arg1_type = ARG_PTR_TO_CTX,
1444 .arg2_type = ARG_ANYTHING, 1450 .arg2_type = ARG_ANYTHING,
1445 .arg3_type = ARG_PTR_TO_RAW_STACK, 1451 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
1446 .arg4_type = ARG_CONST_STACK_SIZE, 1452 .arg4_type = ARG_CONST_SIZE,
1447}; 1453};
1448 1454
1449BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) 1455BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
@@ -1517,10 +1523,11 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
1517{ 1523{
1518 bool is_pseudo = flags & BPF_F_PSEUDO_HDR; 1524 bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
1519 bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; 1525 bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
1526 bool do_mforce = flags & BPF_F_MARK_ENFORCE;
1520 __sum16 *ptr; 1527 __sum16 *ptr;
1521 1528
1522 if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | 1529 if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
1523 BPF_F_HDR_FIELD_MASK))) 1530 BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
1524 return -EINVAL; 1531 return -EINVAL;
1525 if (unlikely(offset > 0xffff || offset & 1)) 1532 if (unlikely(offset > 0xffff || offset & 1))
1526 return -EFAULT; 1533 return -EFAULT;
@@ -1528,7 +1535,7 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
1528 return -EFAULT; 1535 return -EFAULT;
1529 1536
1530 ptr = (__sum16 *)(skb->data + offset); 1537 ptr = (__sum16 *)(skb->data + offset);
1531 if (is_mmzero && !*ptr) 1538 if (is_mmzero && !do_mforce && !*ptr)
1532 return 0; 1539 return 0;
1533 1540
1534 switch (flags & BPF_F_HDR_FIELD_MASK) { 1541 switch (flags & BPF_F_HDR_FIELD_MASK) {
@@ -1596,10 +1603,10 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
1596 .gpl_only = false, 1603 .gpl_only = false,
1597 .pkt_access = true, 1604 .pkt_access = true,
1598 .ret_type = RET_INTEGER, 1605 .ret_type = RET_INTEGER,
1599 .arg1_type = ARG_PTR_TO_STACK, 1606 .arg1_type = ARG_PTR_TO_MEM,
1600 .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, 1607 .arg2_type = ARG_CONST_SIZE_OR_ZERO,
1601 .arg3_type = ARG_PTR_TO_STACK, 1608 .arg3_type = ARG_PTR_TO_MEM,
1602 .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO, 1609 .arg4_type = ARG_CONST_SIZE_OR_ZERO,
1603 .arg5_type = ARG_ANYTHING, 1610 .arg5_type = ARG_ANYTHING,
1604}; 1611};
1605 1612
@@ -1684,6 +1691,12 @@ static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
1684static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, 1691static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
1685 u32 flags) 1692 u32 flags)
1686{ 1693{
1694 /* Verify that a link layer header is carried */
1695 if (unlikely(skb->mac_header >= skb->network_header)) {
1696 kfree_skb(skb);
1697 return -ERANGE;
1698 }
1699
1687 bpf_push_mac_rcsum(skb); 1700 bpf_push_mac_rcsum(skb);
1688 return flags & BPF_F_INGRESS ? 1701 return flags & BPF_F_INGRESS ?
1689 __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); 1702 __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
@@ -1692,17 +1705,10 @@ static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
1692static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, 1705static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
1693 u32 flags) 1706 u32 flags)
1694{ 1707{
1695 switch (dev->type) { 1708 if (dev_is_mac_header_xmit(dev))
1696 case ARPHRD_TUNNEL:
1697 case ARPHRD_TUNNEL6:
1698 case ARPHRD_SIT:
1699 case ARPHRD_IPGRE:
1700 case ARPHRD_VOID:
1701 case ARPHRD_NONE:
1702 return __bpf_redirect_no_mac(skb, dev, flags);
1703 default:
1704 return __bpf_redirect_common(skb, dev, flags); 1709 return __bpf_redirect_common(skb, dev, flags);
1705 } 1710 else
1711 return __bpf_redirect_no_mac(skb, dev, flags);
1706} 1712}
1707 1713
1708BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) 1714BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
@@ -2190,16 +2196,79 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
2190 .arg3_type = ARG_ANYTHING, 2196 .arg3_type = ARG_ANYTHING,
2191}; 2197};
2192 2198
2193bool bpf_helper_changes_skb_data(void *func) 2199BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
2200 u64, flags)
2201{
2202 u32 max_len = __bpf_skb_max_len(skb);
2203 u32 new_len = skb->len + head_room;
2204 int ret;
2205
2206 if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) ||
2207 new_len < skb->len))
2208 return -EINVAL;
2209
2210 ret = skb_cow(skb, head_room);
2211 if (likely(!ret)) {
2212 /* Idea for this helper is that we currently only
2213 * allow to expand on mac header. This means that
2214 * skb->protocol network header, etc, stay as is.
2215 * Compared to bpf_skb_change_tail(), we're more
2216 * flexible due to not needing to linearize or
2217 * reset GSO. Intention for this helper is to be
2218 * used by an L3 skb that needs to push mac header
2219 * for redirection into L2 device.
2220 */
2221 __skb_push(skb, head_room);
2222 memset(skb->data, 0, head_room);
2223 skb_reset_mac_header(skb);
2224 }
2225
2226 bpf_compute_data_end(skb);
2227 return 0;
2228}
2229
2230static const struct bpf_func_proto bpf_skb_change_head_proto = {
2231 .func = bpf_skb_change_head,
2232 .gpl_only = false,
2233 .ret_type = RET_INTEGER,
2234 .arg1_type = ARG_PTR_TO_CTX,
2235 .arg2_type = ARG_ANYTHING,
2236 .arg3_type = ARG_ANYTHING,
2237};
2238
2239BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
2240{
2241 void *data = xdp->data + offset;
2242
2243 if (unlikely(data < xdp->data_hard_start ||
2244 data > xdp->data_end - ETH_HLEN))
2245 return -EINVAL;
2246
2247 xdp->data = data;
2248
2249 return 0;
2250}
2251
2252static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
2253 .func = bpf_xdp_adjust_head,
2254 .gpl_only = false,
2255 .ret_type = RET_INTEGER,
2256 .arg1_type = ARG_PTR_TO_CTX,
2257 .arg2_type = ARG_ANYTHING,
2258};
2259
2260bool bpf_helper_changes_pkt_data(void *func)
2194{ 2261{
2195 if (func == bpf_skb_vlan_push || 2262 if (func == bpf_skb_vlan_push ||
2196 func == bpf_skb_vlan_pop || 2263 func == bpf_skb_vlan_pop ||
2197 func == bpf_skb_store_bytes || 2264 func == bpf_skb_store_bytes ||
2198 func == bpf_skb_change_proto || 2265 func == bpf_skb_change_proto ||
2266 func == bpf_skb_change_head ||
2199 func == bpf_skb_change_tail || 2267 func == bpf_skb_change_tail ||
2200 func == bpf_skb_pull_data || 2268 func == bpf_skb_pull_data ||
2201 func == bpf_l3_csum_replace || 2269 func == bpf_l3_csum_replace ||
2202 func == bpf_l4_csum_replace) 2270 func == bpf_l4_csum_replace ||
2271 func == bpf_xdp_adjust_head)
2203 return true; 2272 return true;
2204 2273
2205 return false; 2274 return false;
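
bpf_xdp_adjust_head() above moves xdp->data by a signed offset, bounded by data_hard_start below and by data_end - ETH_HLEN above, and is now listed in bpf_helper_changes_pkt_data(). A hedged sketch of an XDP program using it to pop a hypothetical outer encapsulation (outer Ethernet header plus an invented 8-byte shim); it assumes the usual samples/bpf "bpf_helpers.h" wrappers for SEC() and the helper prototype:

/* Sketch only: struct example_tag and the encapsulation are hypothetical. */
#include <uapi/linux/bpf.h>
#include <linux/if_ether.h>
#include "bpf_helpers.h"

struct example_tag {
        __u32 id;
        __u32 reserved;
};

SEC("xdp")
int xdp_pop_example_encap(struct xdp_md *ctx)
{
        int pop = sizeof(struct ethhdr) + sizeof(struct example_tag);

        /* a positive offset moves xdp->data forward; the helper fails if
         * fewer than ETH_HLEN bytes would remain before data_end */
        if (bpf_xdp_adjust_head(ctx, pop))
                return XDP_DROP;

        return XDP_PASS;
}

char _license[] SEC("license") = "GPL";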
@@ -2239,8 +2308,8 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
2239 .arg1_type = ARG_PTR_TO_CTX, 2308 .arg1_type = ARG_PTR_TO_CTX,
2240 .arg2_type = ARG_CONST_MAP_PTR, 2309 .arg2_type = ARG_CONST_MAP_PTR,
2241 .arg3_type = ARG_ANYTHING, 2310 .arg3_type = ARG_ANYTHING,
2242 .arg4_type = ARG_PTR_TO_STACK, 2311 .arg4_type = ARG_PTR_TO_MEM,
2243 .arg5_type = ARG_CONST_STACK_SIZE, 2312 .arg5_type = ARG_CONST_SIZE,
2244}; 2313};
2245 2314
2246static unsigned short bpf_tunnel_key_af(u64 flags) 2315static unsigned short bpf_tunnel_key_af(u64 flags)
@@ -2310,8 +2379,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
2310 .gpl_only = false, 2379 .gpl_only = false,
2311 .ret_type = RET_INTEGER, 2380 .ret_type = RET_INTEGER,
2312 .arg1_type = ARG_PTR_TO_CTX, 2381 .arg1_type = ARG_PTR_TO_CTX,
2313 .arg2_type = ARG_PTR_TO_RAW_STACK, 2382 .arg2_type = ARG_PTR_TO_UNINIT_MEM,
2314 .arg3_type = ARG_CONST_STACK_SIZE, 2383 .arg3_type = ARG_CONST_SIZE,
2315 .arg4_type = ARG_ANYTHING, 2384 .arg4_type = ARG_ANYTHING,
2316}; 2385};
2317 2386
@@ -2345,8 +2414,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
2345 .gpl_only = false, 2414 .gpl_only = false,
2346 .ret_type = RET_INTEGER, 2415 .ret_type = RET_INTEGER,
2347 .arg1_type = ARG_PTR_TO_CTX, 2416 .arg1_type = ARG_PTR_TO_CTX,
2348 .arg2_type = ARG_PTR_TO_RAW_STACK, 2417 .arg2_type = ARG_PTR_TO_UNINIT_MEM,
2349 .arg3_type = ARG_CONST_STACK_SIZE, 2418 .arg3_type = ARG_CONST_SIZE,
2350}; 2419};
2351 2420
2352static struct metadata_dst __percpu *md_dst; 2421static struct metadata_dst __percpu *md_dst;
@@ -2416,8 +2485,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
2416 .gpl_only = false, 2485 .gpl_only = false,
2417 .ret_type = RET_INTEGER, 2486 .ret_type = RET_INTEGER,
2418 .arg1_type = ARG_PTR_TO_CTX, 2487 .arg1_type = ARG_PTR_TO_CTX,
2419 .arg2_type = ARG_PTR_TO_STACK, 2488 .arg2_type = ARG_PTR_TO_MEM,
2420 .arg3_type = ARG_CONST_STACK_SIZE, 2489 .arg3_type = ARG_CONST_SIZE,
2421 .arg4_type = ARG_ANYTHING, 2490 .arg4_type = ARG_ANYTHING,
2422}; 2491};
2423 2492
@@ -2442,8 +2511,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
2442 .gpl_only = false, 2511 .gpl_only = false,
2443 .ret_type = RET_INTEGER, 2512 .ret_type = RET_INTEGER,
2444 .arg1_type = ARG_PTR_TO_CTX, 2513 .arg1_type = ARG_PTR_TO_CTX,
2445 .arg2_type = ARG_PTR_TO_STACK, 2514 .arg2_type = ARG_PTR_TO_MEM,
2446 .arg3_type = ARG_CONST_STACK_SIZE, 2515 .arg3_type = ARG_CONST_SIZE,
2447}; 2516};
2448 2517
2449static const struct bpf_func_proto * 2518static const struct bpf_func_proto *
@@ -2515,8 +2584,8 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
2515 if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data))) 2584 if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
2516 return -EFAULT; 2585 return -EFAULT;
2517 2586
2518 return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size, 2587 return bpf_event_output(map, flags, meta, meta_size, xdp->data,
2519 bpf_xdp_copy); 2588 xdp_size, bpf_xdp_copy);
2520} 2589}
2521 2590
2522static const struct bpf_func_proto bpf_xdp_event_output_proto = { 2591static const struct bpf_func_proto bpf_xdp_event_output_proto = {
@@ -2526,12 +2595,12 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
2526 .arg1_type = ARG_PTR_TO_CTX, 2595 .arg1_type = ARG_PTR_TO_CTX,
2527 .arg2_type = ARG_CONST_MAP_PTR, 2596 .arg2_type = ARG_CONST_MAP_PTR,
2528 .arg3_type = ARG_ANYTHING, 2597 .arg3_type = ARG_ANYTHING,
2529 .arg4_type = ARG_PTR_TO_STACK, 2598 .arg4_type = ARG_PTR_TO_MEM,
2530 .arg5_type = ARG_CONST_STACK_SIZE, 2599 .arg5_type = ARG_CONST_SIZE,
2531}; 2600};
2532 2601
2533static const struct bpf_func_proto * 2602static const struct bpf_func_proto *
2534sk_filter_func_proto(enum bpf_func_id func_id) 2603bpf_base_func_proto(enum bpf_func_id func_id)
2535{ 2604{
2536 switch (func_id) { 2605 switch (func_id) {
2537 case BPF_FUNC_map_lookup_elem: 2606 case BPF_FUNC_map_lookup_elem:
@@ -2544,6 +2613,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
2544 return &bpf_get_prandom_u32_proto; 2613 return &bpf_get_prandom_u32_proto;
2545 case BPF_FUNC_get_smp_processor_id: 2614 case BPF_FUNC_get_smp_processor_id:
2546 return &bpf_get_raw_smp_processor_id_proto; 2615 return &bpf_get_raw_smp_processor_id_proto;
2616 case BPF_FUNC_get_numa_node_id:
2617 return &bpf_get_numa_node_id_proto;
2547 case BPF_FUNC_tail_call: 2618 case BPF_FUNC_tail_call:
2548 return &bpf_tail_call_proto; 2619 return &bpf_tail_call_proto;
2549 case BPF_FUNC_ktime_get_ns: 2620 case BPF_FUNC_ktime_get_ns:
@@ -2557,6 +2628,17 @@ sk_filter_func_proto(enum bpf_func_id func_id)
2557} 2628}
2558 2629
2559static const struct bpf_func_proto * 2630static const struct bpf_func_proto *
2631sk_filter_func_proto(enum bpf_func_id func_id)
2632{
2633 switch (func_id) {
2634 case BPF_FUNC_skb_load_bytes:
2635 return &bpf_skb_load_bytes_proto;
2636 default:
2637 return bpf_base_func_proto(func_id);
2638 }
2639}
2640
2641static const struct bpf_func_proto *
2560tc_cls_act_func_proto(enum bpf_func_id func_id) 2642tc_cls_act_func_proto(enum bpf_func_id func_id)
2561{ 2643{
2562 switch (func_id) { 2644 switch (func_id) {
@@ -2611,7 +2693,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
2611 case BPF_FUNC_skb_under_cgroup: 2693 case BPF_FUNC_skb_under_cgroup:
2612 return &bpf_skb_under_cgroup_proto; 2694 return &bpf_skb_under_cgroup_proto;
2613 default: 2695 default:
2614 return sk_filter_func_proto(func_id); 2696 return bpf_base_func_proto(func_id);
2615 } 2697 }
2616} 2698}
2617 2699
@@ -2623,20 +2705,106 @@ xdp_func_proto(enum bpf_func_id func_id)
2623 return &bpf_xdp_event_output_proto; 2705 return &bpf_xdp_event_output_proto;
2624 case BPF_FUNC_get_smp_processor_id: 2706 case BPF_FUNC_get_smp_processor_id:
2625 return &bpf_get_smp_processor_id_proto; 2707 return &bpf_get_smp_processor_id_proto;
2708 case BPF_FUNC_xdp_adjust_head:
2709 return &bpf_xdp_adjust_head_proto;
2626 default: 2710 default:
2627 return sk_filter_func_proto(func_id); 2711 return bpf_base_func_proto(func_id);
2628 } 2712 }
2629} 2713}
2630 2714
2631static bool __is_valid_access(int off, int size, enum bpf_access_type type) 2715static const struct bpf_func_proto *
2716cg_skb_func_proto(enum bpf_func_id func_id)
2717{
2718 switch (func_id) {
2719 case BPF_FUNC_skb_load_bytes:
2720 return &bpf_skb_load_bytes_proto;
2721 default:
2722 return bpf_base_func_proto(func_id);
2723 }
2724}
2725
2726static const struct bpf_func_proto *
2727lwt_inout_func_proto(enum bpf_func_id func_id)
2728{
2729 switch (func_id) {
2730 case BPF_FUNC_skb_load_bytes:
2731 return &bpf_skb_load_bytes_proto;
2732 case BPF_FUNC_skb_pull_data:
2733 return &bpf_skb_pull_data_proto;
2734 case BPF_FUNC_csum_diff:
2735 return &bpf_csum_diff_proto;
2736 case BPF_FUNC_get_cgroup_classid:
2737 return &bpf_get_cgroup_classid_proto;
2738 case BPF_FUNC_get_route_realm:
2739 return &bpf_get_route_realm_proto;
2740 case BPF_FUNC_get_hash_recalc:
2741 return &bpf_get_hash_recalc_proto;
2742 case BPF_FUNC_perf_event_output:
2743 return &bpf_skb_event_output_proto;
2744 case BPF_FUNC_get_smp_processor_id:
2745 return &bpf_get_smp_processor_id_proto;
2746 case BPF_FUNC_skb_under_cgroup:
2747 return &bpf_skb_under_cgroup_proto;
2748 default:
2749 return bpf_base_func_proto(func_id);
2750 }
2751}
2752
2753static const struct bpf_func_proto *
2754lwt_xmit_func_proto(enum bpf_func_id func_id)
2755{
2756 switch (func_id) {
2757 case BPF_FUNC_skb_get_tunnel_key:
2758 return &bpf_skb_get_tunnel_key_proto;
2759 case BPF_FUNC_skb_set_tunnel_key:
2760 return bpf_get_skb_set_tunnel_proto(func_id);
2761 case BPF_FUNC_skb_get_tunnel_opt:
2762 return &bpf_skb_get_tunnel_opt_proto;
2763 case BPF_FUNC_skb_set_tunnel_opt:
2764 return bpf_get_skb_set_tunnel_proto(func_id);
2765 case BPF_FUNC_redirect:
2766 return &bpf_redirect_proto;
2767 case BPF_FUNC_clone_redirect:
2768 return &bpf_clone_redirect_proto;
2769 case BPF_FUNC_skb_change_tail:
2770 return &bpf_skb_change_tail_proto;
2771 case BPF_FUNC_skb_change_head:
2772 return &bpf_skb_change_head_proto;
2773 case BPF_FUNC_skb_store_bytes:
2774 return &bpf_skb_store_bytes_proto;
2775 case BPF_FUNC_csum_update:
2776 return &bpf_csum_update_proto;
2777 case BPF_FUNC_l3_csum_replace:
2778 return &bpf_l3_csum_replace_proto;
2779 case BPF_FUNC_l4_csum_replace:
2780 return &bpf_l4_csum_replace_proto;
2781 case BPF_FUNC_set_hash_invalid:
2782 return &bpf_set_hash_invalid_proto;
2783 default:
2784 return lwt_inout_func_proto(func_id);
2785 }
2786}
2787
2788static bool __is_valid_access(int off, int size)
2632{ 2789{
2633 if (off < 0 || off >= sizeof(struct __sk_buff)) 2790 if (off < 0 || off >= sizeof(struct __sk_buff))
2634 return false; 2791 return false;
2792
2635 /* The verifier guarantees that size > 0. */ 2793 /* The verifier guarantees that size > 0. */
2636 if (off % size != 0) 2794 if (off % size != 0)
2637 return false; 2795 return false;
2638 if (size != sizeof(__u32)) 2796
2639 return false; 2797 switch (off) {
2798 case offsetof(struct __sk_buff, cb[0]) ...
2799 offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
2800 if (off + size >
2801 offsetof(struct __sk_buff, cb[4]) + sizeof(__u32))
2802 return false;
2803 break;
2804 default:
2805 if (size != sizeof(__u32))
2806 return false;
2807 }
2640 2808
2641 return true; 2809 return true;
2642} 2810}
@@ -2655,14 +2823,71 @@ static bool sk_filter_is_valid_access(int off, int size,
2655 if (type == BPF_WRITE) { 2823 if (type == BPF_WRITE) {
2656 switch (off) { 2824 switch (off) {
2657 case offsetof(struct __sk_buff, cb[0]) ... 2825 case offsetof(struct __sk_buff, cb[0]) ...
2658 offsetof(struct __sk_buff, cb[4]): 2826 offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
2827 break;
2828 default:
2829 return false;
2830 }
2831 }
2832
2833 return __is_valid_access(off, size);
2834}
2835
2836static bool lwt_is_valid_access(int off, int size,
2837 enum bpf_access_type type,
2838 enum bpf_reg_type *reg_type)
2839{
2840 switch (off) {
2841 case offsetof(struct __sk_buff, tc_classid):
2842 return false;
2843 }
2844
2845 if (type == BPF_WRITE) {
2846 switch (off) {
2847 case offsetof(struct __sk_buff, mark):
2848 case offsetof(struct __sk_buff, priority):
2849 case offsetof(struct __sk_buff, cb[0]) ...
2850 offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
2851 break;
2852 default:
2853 return false;
2854 }
2855 }
2856
2857 switch (off) {
2858 case offsetof(struct __sk_buff, data):
2859 *reg_type = PTR_TO_PACKET;
2860 break;
2861 case offsetof(struct __sk_buff, data_end):
2862 *reg_type = PTR_TO_PACKET_END;
2863 break;
2864 }
2865
2866 return __is_valid_access(off, size);
2867}
2868
2869static bool sock_filter_is_valid_access(int off, int size,
2870 enum bpf_access_type type,
2871 enum bpf_reg_type *reg_type)
2872{
2873 if (type == BPF_WRITE) {
2874 switch (off) {
2875 case offsetof(struct bpf_sock, bound_dev_if):
2659 break; 2876 break;
2660 default: 2877 default:
2661 return false; 2878 return false;
2662 } 2879 }
2663 } 2880 }
2664 2881
2665 return __is_valid_access(off, size, type); 2882 if (off < 0 || off + size > sizeof(struct bpf_sock))
2883 return false;
2884 /* The verifier guarantees that size > 0. */
2885 if (off % size != 0)
2886 return false;
2887 if (size != sizeof(__u32))
2888 return false;
2889
2890 return true;
2666} 2891}
2667 2892
2668static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, 2893static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
@@ -2714,7 +2939,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
2714 case offsetof(struct __sk_buff, tc_index): 2939 case offsetof(struct __sk_buff, tc_index):
2715 case offsetof(struct __sk_buff, priority): 2940 case offsetof(struct __sk_buff, priority):
2716 case offsetof(struct __sk_buff, cb[0]) ... 2941 case offsetof(struct __sk_buff, cb[0]) ...
2717 offsetof(struct __sk_buff, cb[4]): 2942 offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
2718 case offsetof(struct __sk_buff, tc_classid): 2943 case offsetof(struct __sk_buff, tc_classid):
2719 break; 2944 break;
2720 default: 2945 default:
@@ -2731,11 +2956,10 @@ static bool tc_cls_act_is_valid_access(int off, int size,
2731 break; 2956 break;
2732 } 2957 }
2733 2958
2734 return __is_valid_access(off, size, type); 2959 return __is_valid_access(off, size);
2735} 2960}
2736 2961
2737static bool __is_valid_xdp_access(int off, int size, 2962static bool __is_valid_xdp_access(int off, int size)
2738 enum bpf_access_type type)
2739{ 2963{
2740 if (off < 0 || off >= sizeof(struct xdp_md)) 2964 if (off < 0 || off >= sizeof(struct xdp_md))
2741 return false; 2965 return false;
@@ -2763,7 +2987,7 @@ static bool xdp_is_valid_access(int off, int size,
2763 break; 2987 break;
2764 } 2988 }
2765 2989
2766 return __is_valid_xdp_access(off, size, type); 2990 return __is_valid_xdp_access(off, size);
2767} 2991}
2768 2992
2769void bpf_warn_invalid_xdp_action(u32 act) 2993void bpf_warn_invalid_xdp_action(u32 act)
@@ -2772,32 +2996,33 @@ void bpf_warn_invalid_xdp_action(u32 act)
2772} 2996}
2773EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); 2997EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
2774 2998
2775static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, 2999static u32 bpf_convert_ctx_access(enum bpf_access_type type,
2776 int src_reg, int ctx_off, 3000 const struct bpf_insn *si,
2777 struct bpf_insn *insn_buf, 3001 struct bpf_insn *insn_buf,
2778 struct bpf_prog *prog) 3002 struct bpf_prog *prog)
2779{ 3003{
2780 struct bpf_insn *insn = insn_buf; 3004 struct bpf_insn *insn = insn_buf;
3005 int off;
2781 3006
2782 switch (ctx_off) { 3007 switch (si->off) {
2783 case offsetof(struct __sk_buff, len): 3008 case offsetof(struct __sk_buff, len):
2784 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); 3009 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
2785 3010
2786 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 3011 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
2787 offsetof(struct sk_buff, len)); 3012 offsetof(struct sk_buff, len));
2788 break; 3013 break;
2789 3014
2790 case offsetof(struct __sk_buff, protocol): 3015 case offsetof(struct __sk_buff, protocol):
2791 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); 3016 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
2792 3017
2793 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 3018 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
2794 offsetof(struct sk_buff, protocol)); 3019 offsetof(struct sk_buff, protocol));
2795 break; 3020 break;
2796 3021
2797 case offsetof(struct __sk_buff, vlan_proto): 3022 case offsetof(struct __sk_buff, vlan_proto):
2798 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); 3023 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
2799 3024
2800 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 3025 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
2801 offsetof(struct sk_buff, vlan_proto)); 3026 offsetof(struct sk_buff, vlan_proto));
2802 break; 3027 break;
2803 3028
@@ -2805,17 +3030,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
2805 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4); 3030 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);
2806 3031
2807 if (type == BPF_WRITE) 3032 if (type == BPF_WRITE)
2808 *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, 3033 *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
2809 offsetof(struct sk_buff, priority)); 3034 offsetof(struct sk_buff, priority));
2810 else 3035 else
2811 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 3036 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
2812 offsetof(struct sk_buff, priority)); 3037 offsetof(struct sk_buff, priority));
2813 break; 3038 break;
2814 3039
2815 case offsetof(struct __sk_buff, ingress_ifindex): 3040 case offsetof(struct __sk_buff, ingress_ifindex):
2816 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4); 3041 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);
2817 3042
2818 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 3043 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
2819 offsetof(struct sk_buff, skb_iif)); 3044 offsetof(struct sk_buff, skb_iif));
2820 break; 3045 break;
2821 3046
@@ -2823,17 +3048,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
2823 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); 3048 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
2824 3049
2825 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), 3050 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
2826 dst_reg, src_reg, 3051 si->dst_reg, si->src_reg,
2827 offsetof(struct sk_buff, dev)); 3052 offsetof(struct sk_buff, dev));
2828 *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); 3053 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
2829 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, 3054 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
2830 offsetof(struct net_device, ifindex)); 3055 offsetof(struct net_device, ifindex));
2831 break; 3056 break;
2832 3057
2833 case offsetof(struct __sk_buff, hash): 3058 case offsetof(struct __sk_buff, hash):
2834 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); 3059 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
2835 3060
2836 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 3061 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
2837 offsetof(struct sk_buff, hash)); 3062 offsetof(struct sk_buff, hash));
2838 break; 3063 break;
2839 3064
@@ -2841,63 +3066,77 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
2841 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); 3066 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
2842 3067
2843 if (type == BPF_WRITE) 3068 if (type == BPF_WRITE)
2844 *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, 3069 *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
2845 offsetof(struct sk_buff, mark)); 3070 offsetof(struct sk_buff, mark));
2846 else 3071 else
2847 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 3072 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
2848 offsetof(struct sk_buff, mark)); 3073 offsetof(struct sk_buff, mark));
2849 break; 3074 break;
2850 3075
2851 case offsetof(struct __sk_buff, pkt_type): 3076 case offsetof(struct __sk_buff, pkt_type):
2852 return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn); 3077 return convert_skb_access(SKF_AD_PKTTYPE, si->dst_reg,
3078 si->src_reg, insn);
2853 3079
2854 case offsetof(struct __sk_buff, queue_mapping): 3080 case offsetof(struct __sk_buff, queue_mapping):
2855 return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn); 3081 return convert_skb_access(SKF_AD_QUEUE, si->dst_reg,
3082 si->src_reg, insn);
2856 3083
2857 case offsetof(struct __sk_buff, vlan_present): 3084 case offsetof(struct __sk_buff, vlan_present):
2858 return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, 3085 return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
2859 dst_reg, src_reg, insn); 3086 si->dst_reg, si->src_reg, insn);
2860 3087
2861 case offsetof(struct __sk_buff, vlan_tci): 3088 case offsetof(struct __sk_buff, vlan_tci):
2862 return convert_skb_access(SKF_AD_VLAN_TAG, 3089 return convert_skb_access(SKF_AD_VLAN_TAG,
2863 dst_reg, src_reg, insn); 3090 si->dst_reg, si->src_reg, insn);
2864 3091
2865 case offsetof(struct __sk_buff, cb[0]) ... 3092 case offsetof(struct __sk_buff, cb[0]) ...
2866 offsetof(struct __sk_buff, cb[4]): 3093 offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
2867 BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); 3094 BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
3095 BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
3096 offsetof(struct qdisc_skb_cb, data)) %
3097 sizeof(__u64));
2868 3098
2869 prog->cb_access = 1; 3099 prog->cb_access = 1;
2870 ctx_off -= offsetof(struct __sk_buff, cb[0]); 3100 off = si->off;
2871 ctx_off += offsetof(struct sk_buff, cb); 3101 off -= offsetof(struct __sk_buff, cb[0]);
2872 ctx_off += offsetof(struct qdisc_skb_cb, data); 3102 off += offsetof(struct sk_buff, cb);
3103 off += offsetof(struct qdisc_skb_cb, data);
2873 if (type == BPF_WRITE) 3104 if (type == BPF_WRITE)
2874 *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off); 3105 *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg,
3106 si->src_reg, off);
2875 else 3107 else
2876 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off); 3108 *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
3109 si->src_reg, off);
2877 break; 3110 break;
2878 3111
2879 case offsetof(struct __sk_buff, tc_classid): 3112 case offsetof(struct __sk_buff, tc_classid):
2880 ctx_off -= offsetof(struct __sk_buff, tc_classid); 3113 BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, tc_classid) != 2);
2881 ctx_off += offsetof(struct sk_buff, cb); 3114
2882 ctx_off += offsetof(struct qdisc_skb_cb, tc_classid); 3115 off = si->off;
3116 off -= offsetof(struct __sk_buff, tc_classid);
3117 off += offsetof(struct sk_buff, cb);
3118 off += offsetof(struct qdisc_skb_cb, tc_classid);
2883 if (type == BPF_WRITE) 3119 if (type == BPF_WRITE)
2884 *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off); 3120 *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg,
3121 si->src_reg, off);
2885 else 3122 else
2886 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off); 3123 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg,
3124 si->src_reg, off);
2887 break; 3125 break;
2888 3126
2889 case offsetof(struct __sk_buff, data): 3127 case offsetof(struct __sk_buff, data):
2890 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), 3128 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
2891 dst_reg, src_reg, 3129 si->dst_reg, si->src_reg,
2892 offsetof(struct sk_buff, data)); 3130 offsetof(struct sk_buff, data));
2893 break; 3131 break;
2894 3132
2895 case offsetof(struct __sk_buff, data_end): 3133 case offsetof(struct __sk_buff, data_end):
2896 ctx_off -= offsetof(struct __sk_buff, data_end); 3134 off = si->off;
2897 ctx_off += offsetof(struct sk_buff, cb); 3135 off -= offsetof(struct __sk_buff, data_end);
2898 ctx_off += offsetof(struct bpf_skb_data_end, data_end); 3136 off += offsetof(struct sk_buff, cb);
2899 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg, 3137 off += offsetof(struct bpf_skb_data_end, data_end);
2900 ctx_off); 3138 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
3139 si->src_reg, off);
2901 break; 3140 break;
2902 3141
2903 case offsetof(struct __sk_buff, tc_index): 3142 case offsetof(struct __sk_buff, tc_index):
@@ -2905,65 +3144,107 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
2905 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); 3144 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);
2906 3145
2907 if (type == BPF_WRITE) 3146 if (type == BPF_WRITE)
2908 *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, 3147 *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg,
2909 offsetof(struct sk_buff, tc_index)); 3148 offsetof(struct sk_buff, tc_index));
2910 else 3149 else
2911 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 3150 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
2912 offsetof(struct sk_buff, tc_index)); 3151 offsetof(struct sk_buff, tc_index));
2913 break;
2914#else 3152#else
2915 if (type == BPF_WRITE) 3153 if (type == BPF_WRITE)
2916 *insn++ = BPF_MOV64_REG(dst_reg, dst_reg); 3154 *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
2917 else 3155 else
2918 *insn++ = BPF_MOV64_IMM(dst_reg, 0); 3156 *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
2919 break;
2920#endif 3157#endif
3158 break;
3159 }
3160
3161 return insn - insn_buf;
3162}
3163
3164static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
3165 const struct bpf_insn *si,
3166 struct bpf_insn *insn_buf,
3167 struct bpf_prog *prog)
3168{
3169 struct bpf_insn *insn = insn_buf;
3170
3171 switch (si->off) {
3172 case offsetof(struct bpf_sock, bound_dev_if):
3173 BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);
3174
3175 if (type == BPF_WRITE)
3176 *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
3177 offsetof(struct sock, sk_bound_dev_if));
3178 else
3179 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
3180 offsetof(struct sock, sk_bound_dev_if));
3181 break;
3182
3183 case offsetof(struct bpf_sock, family):
3184 BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
3185
3186 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
3187 offsetof(struct sock, sk_family));
3188 break;
3189
3190 case offsetof(struct bpf_sock, type):
3191 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
3192 offsetof(struct sock, __sk_flags_offset));
3193 *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
3194 *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
3195 break;
3196
3197 case offsetof(struct bpf_sock, protocol):
3198 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
3199 offsetof(struct sock, __sk_flags_offset));
3200 *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
3201 *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT);
3202 break;
2921 } 3203 }
2922 3204
2923 return insn - insn_buf; 3205 return insn - insn_buf;
2924} 3206}
2925 3207
2926static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg, 3208static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
2927 int src_reg, int ctx_off, 3209 const struct bpf_insn *si,
2928 struct bpf_insn *insn_buf, 3210 struct bpf_insn *insn_buf,
2929 struct bpf_prog *prog) 3211 struct bpf_prog *prog)
2930{ 3212{
2931 struct bpf_insn *insn = insn_buf; 3213 struct bpf_insn *insn = insn_buf;
2932 3214
2933 switch (ctx_off) { 3215 switch (si->off) {
2934 case offsetof(struct __sk_buff, ifindex): 3216 case offsetof(struct __sk_buff, ifindex):
2935 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); 3217 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
2936 3218
2937 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), 3219 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
2938 dst_reg, src_reg, 3220 si->dst_reg, si->src_reg,
2939 offsetof(struct sk_buff, dev)); 3221 offsetof(struct sk_buff, dev));
2940 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, 3222 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
2941 offsetof(struct net_device, ifindex)); 3223 offsetof(struct net_device, ifindex));
2942 break; 3224 break;
2943 default: 3225 default:
2944 return sk_filter_convert_ctx_access(type, dst_reg, src_reg, 3226 return bpf_convert_ctx_access(type, si, insn_buf, prog);
2945 ctx_off, insn_buf, prog);
2946 } 3227 }
2947 3228
2948 return insn - insn_buf; 3229 return insn - insn_buf;
2949} 3230}
2950 3231
2951static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, 3232static u32 xdp_convert_ctx_access(enum bpf_access_type type,
2952 int src_reg, int ctx_off, 3233 const struct bpf_insn *si,
2953 struct bpf_insn *insn_buf, 3234 struct bpf_insn *insn_buf,
2954 struct bpf_prog *prog) 3235 struct bpf_prog *prog)
2955{ 3236{
2956 struct bpf_insn *insn = insn_buf; 3237 struct bpf_insn *insn = insn_buf;
2957 3238
2958 switch (ctx_off) { 3239 switch (si->off) {
2959 case offsetof(struct xdp_md, data): 3240 case offsetof(struct xdp_md, data):
2960 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), 3241 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
2961 dst_reg, src_reg, 3242 si->dst_reg, si->src_reg,
2962 offsetof(struct xdp_buff, data)); 3243 offsetof(struct xdp_buff, data));
2963 break; 3244 break;
2964 case offsetof(struct xdp_md, data_end): 3245 case offsetof(struct xdp_md, data_end):
2965 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), 3246 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
2966 dst_reg, src_reg, 3247 si->dst_reg, si->src_reg,
2967 offsetof(struct xdp_buff, data_end)); 3248 offsetof(struct xdp_buff, data_end));
2968 break; 3249 break;
2969 } 3250 }
@@ -2974,7 +3255,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
2974static const struct bpf_verifier_ops sk_filter_ops = { 3255static const struct bpf_verifier_ops sk_filter_ops = {
2975 .get_func_proto = sk_filter_func_proto, 3256 .get_func_proto = sk_filter_func_proto,
2976 .is_valid_access = sk_filter_is_valid_access, 3257 .is_valid_access = sk_filter_is_valid_access,
2977 .convert_ctx_access = sk_filter_convert_ctx_access, 3258 .convert_ctx_access = bpf_convert_ctx_access,
2978}; 3259};
2979 3260
2980static const struct bpf_verifier_ops tc_cls_act_ops = { 3261static const struct bpf_verifier_ops tc_cls_act_ops = {
@@ -2990,32 +3271,87 @@ static const struct bpf_verifier_ops xdp_ops = {
2990 .convert_ctx_access = xdp_convert_ctx_access, 3271 .convert_ctx_access = xdp_convert_ctx_access,
2991}; 3272};
2992 3273
2993static struct bpf_prog_type_list sk_filter_type __read_mostly = { 3274static const struct bpf_verifier_ops cg_skb_ops = {
3275 .get_func_proto = cg_skb_func_proto,
3276 .is_valid_access = sk_filter_is_valid_access,
3277 .convert_ctx_access = bpf_convert_ctx_access,
3278};
3279
3280static const struct bpf_verifier_ops lwt_inout_ops = {
3281 .get_func_proto = lwt_inout_func_proto,
3282 .is_valid_access = lwt_is_valid_access,
3283 .convert_ctx_access = bpf_convert_ctx_access,
3284};
3285
3286static const struct bpf_verifier_ops lwt_xmit_ops = {
3287 .get_func_proto = lwt_xmit_func_proto,
3288 .is_valid_access = lwt_is_valid_access,
3289 .convert_ctx_access = bpf_convert_ctx_access,
3290 .gen_prologue = tc_cls_act_prologue,
3291};
3292
3293static const struct bpf_verifier_ops cg_sock_ops = {
3294 .get_func_proto = bpf_base_func_proto,
3295 .is_valid_access = sock_filter_is_valid_access,
3296 .convert_ctx_access = sock_filter_convert_ctx_access,
3297};
3298
3299static struct bpf_prog_type_list sk_filter_type __ro_after_init = {
2994 .ops = &sk_filter_ops, 3300 .ops = &sk_filter_ops,
2995 .type = BPF_PROG_TYPE_SOCKET_FILTER, 3301 .type = BPF_PROG_TYPE_SOCKET_FILTER,
2996}; 3302};
2997 3303
2998static struct bpf_prog_type_list sched_cls_type __read_mostly = { 3304static struct bpf_prog_type_list sched_cls_type __ro_after_init = {
2999 .ops = &tc_cls_act_ops, 3305 .ops = &tc_cls_act_ops,
3000 .type = BPF_PROG_TYPE_SCHED_CLS, 3306 .type = BPF_PROG_TYPE_SCHED_CLS,
3001}; 3307};
3002 3308
3003static struct bpf_prog_type_list sched_act_type __read_mostly = { 3309static struct bpf_prog_type_list sched_act_type __ro_after_init = {
3004 .ops = &tc_cls_act_ops, 3310 .ops = &tc_cls_act_ops,
3005 .type = BPF_PROG_TYPE_SCHED_ACT, 3311 .type = BPF_PROG_TYPE_SCHED_ACT,
3006}; 3312};
3007 3313
3008static struct bpf_prog_type_list xdp_type __read_mostly = { 3314static struct bpf_prog_type_list xdp_type __ro_after_init = {
3009 .ops = &xdp_ops, 3315 .ops = &xdp_ops,
3010 .type = BPF_PROG_TYPE_XDP, 3316 .type = BPF_PROG_TYPE_XDP,
3011}; 3317};
3012 3318
3319static struct bpf_prog_type_list cg_skb_type __ro_after_init = {
3320 .ops = &cg_skb_ops,
3321 .type = BPF_PROG_TYPE_CGROUP_SKB,
3322};
3323
3324static struct bpf_prog_type_list lwt_in_type __ro_after_init = {
3325 .ops = &lwt_inout_ops,
3326 .type = BPF_PROG_TYPE_LWT_IN,
3327};
3328
3329static struct bpf_prog_type_list lwt_out_type __ro_after_init = {
3330 .ops = &lwt_inout_ops,
3331 .type = BPF_PROG_TYPE_LWT_OUT,
3332};
3333
3334static struct bpf_prog_type_list lwt_xmit_type __ro_after_init = {
3335 .ops = &lwt_xmit_ops,
3336 .type = BPF_PROG_TYPE_LWT_XMIT,
3337};
3338
3339static struct bpf_prog_type_list cg_sock_type __ro_after_init = {
3340 .ops = &cg_sock_ops,
3341 .type = BPF_PROG_TYPE_CGROUP_SOCK
3342};
3343
3013static int __init register_sk_filter_ops(void) 3344static int __init register_sk_filter_ops(void)
3014{ 3345{
3015 bpf_register_prog_type(&sk_filter_type); 3346 bpf_register_prog_type(&sk_filter_type);
3016 bpf_register_prog_type(&sched_cls_type); 3347 bpf_register_prog_type(&sched_cls_type);
3017 bpf_register_prog_type(&sched_act_type); 3348 bpf_register_prog_type(&sched_act_type);
3018 bpf_register_prog_type(&xdp_type); 3349 bpf_register_prog_type(&xdp_type);
3350 bpf_register_prog_type(&cg_skb_type);
3351 bpf_register_prog_type(&cg_sock_type);
3352 bpf_register_prog_type(&lwt_in_type);
3353 bpf_register_prog_type(&lwt_out_type);
3354 bpf_register_prog_type(&lwt_xmit_type);
3019 3355
3020 return 0; 3356 return 0;
3021} 3357}
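
The filter.c hunks above do two things: every program type now falls back to a shared bpf_base_func_proto() for the generic helpers, and __is_valid_access() accepts 1-, 2- and 4-byte loads and stores anywhere inside the 20-byte cb[] scratch area of struct __sk_buff, while other fields remain 4-byte only. The small user-space sketch below mirrors that relaxed range check; the struct layout is illustrative, not the kernel's __sk_buff, and the bound cb[4] + sizeof(__u32) - 1 is exactly the one the new case ranges use.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct fake_sk_buff {                   /* illustrative stand-in, not __sk_buff */
        uint32_t len;
        uint32_t cb[5];                 /* 20-byte scratch area */
        uint32_t mark;
};

static bool ctx_access_ok(int off, int size)
{
        int cb_start = offsetof(struct fake_sk_buff, cb[0]);
        int cb_end   = offsetof(struct fake_sk_buff, cb[4]) + sizeof(uint32_t);

        if (off < 0 || off + size > (int)sizeof(struct fake_sk_buff))
                return false;
        if (off % size != 0)            /* accesses must stay naturally aligned */
                return false;
        if (off >= cb_start && off < cb_end)
                return off + size <= cb_end;    /* 1/2/4-byte access inside cb[] */
        return size == sizeof(uint32_t);        /* everything else: 4 bytes only */
}

int main(void)
{
        printf("2-byte load at cb[0]+2: %d\n",
               ctx_access_ok(offsetof(struct fake_sk_buff, cb[0]) + 2, 2));
        printf("4-byte load at mark:    %d\n",
               ctx_access_ok(offsetof(struct fake_sk_buff, mark), 4));
        printf("2-byte load at len:     %d\n",
               ctx_access_ok(offsetof(struct fake_sk_buff, len), 2));
        return 0;
}
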
diff --git a/net/core/flow.c b/net/core/flow.c
index 18e8893d4be5..f765c11d8df5 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -417,28 +417,20 @@ static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
417 return 0; 417 return 0;
418} 418}
419 419
420static int flow_cache_cpu(struct notifier_block *nfb, 420static int flow_cache_cpu_up_prep(unsigned int cpu, struct hlist_node *node)
421 unsigned long action,
422 void *hcpu)
423{ 421{
424 struct flow_cache *fc = container_of(nfb, struct flow_cache, 422 struct flow_cache *fc = hlist_entry_safe(node, struct flow_cache, node);
425 hotcpu_notifier); 423
426 int res, cpu = (unsigned long) hcpu; 424 return flow_cache_cpu_prepare(fc, cpu);
425}
426
427static int flow_cache_cpu_dead(unsigned int cpu, struct hlist_node *node)
428{
429 struct flow_cache *fc = hlist_entry_safe(node, struct flow_cache, node);
427 struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); 430 struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
428 431
429 switch (action) { 432 __flow_cache_shrink(fc, fcp, 0);
430 case CPU_UP_PREPARE: 433 return 0;
431 case CPU_UP_PREPARE_FROZEN:
432 res = flow_cache_cpu_prepare(fc, cpu);
433 if (res)
434 return notifier_from_errno(res);
435 break;
436 case CPU_DEAD:
437 case CPU_DEAD_FROZEN:
438 __flow_cache_shrink(fc, fcp, 0);
439 break;
440 }
441 return NOTIFY_OK;
442} 434}
443 435
444int flow_cache_init(struct net *net) 436int flow_cache_init(struct net *net)
@@ -465,18 +457,8 @@ int flow_cache_init(struct net *net)
465 if (!fc->percpu) 457 if (!fc->percpu)
466 return -ENOMEM; 458 return -ENOMEM;
467 459
468 cpu_notifier_register_begin(); 460 if (cpuhp_state_add_instance(CPUHP_NET_FLOW_PREPARE, &fc->node))
469 461 goto err;
470 for_each_online_cpu(i) {
471 if (flow_cache_cpu_prepare(fc, i))
472 goto err;
473 }
474 fc->hotcpu_notifier = (struct notifier_block){
475 .notifier_call = flow_cache_cpu,
476 };
477 __register_hotcpu_notifier(&fc->hotcpu_notifier);
478
479 cpu_notifier_register_done();
480 462
481 setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, 463 setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
482 (unsigned long) fc); 464 (unsigned long) fc);
@@ -492,8 +474,6 @@ err:
492 fcp->hash_table = NULL; 474 fcp->hash_table = NULL;
493 } 475 }
494 476
495 cpu_notifier_register_done();
496
497 free_percpu(fc->percpu); 477 free_percpu(fc->percpu);
498 fc->percpu = NULL; 478 fc->percpu = NULL;
499 479
@@ -507,7 +487,8 @@ void flow_cache_fini(struct net *net)
507 struct flow_cache *fc = &net->xfrm.flow_cache_global; 487 struct flow_cache *fc = &net->xfrm.flow_cache_global;
508 488
509 del_timer_sync(&fc->rnd_timer); 489 del_timer_sync(&fc->rnd_timer);
510 unregister_hotcpu_notifier(&fc->hotcpu_notifier); 490
491 cpuhp_state_remove_instance_nocalls(CPUHP_NET_FLOW_PREPARE, &fc->node);
511 492
512 for_each_possible_cpu(i) { 493 for_each_possible_cpu(i) {
513 struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i); 494 struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
@@ -519,3 +500,14 @@ void flow_cache_fini(struct net *net)
519 fc->percpu = NULL; 500 fc->percpu = NULL;
520} 501}
521EXPORT_SYMBOL(flow_cache_fini); 502EXPORT_SYMBOL(flow_cache_fini);
503
504void __init flow_cache_hp_init(void)
505{
506 int ret;
507
508 ret = cpuhp_setup_state_multi(CPUHP_NET_FLOW_PREPARE,
509 "net/flow:prepare",
510 flow_cache_cpu_up_prep,
511 flow_cache_cpu_dead);
512 WARN_ON(ret < 0);
513}
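
The flow.c conversion drops the raw CPU notifier in favour of the multi-instance hotplug state machine: the callbacks are registered once at boot with cpuhp_setup_state_multi(), and each flow cache then attaches itself as an instance through its hlist_node. Below is a kernel-style sketch of that pattern; my_cache and the surrounding function names are hypothetical, only the cpuhp_* calls, hlist_entry_safe() and CPUHP_NET_FLOW_PREPARE are taken from the patch, and the snippet is an outline rather than a buildable module.

#include <linux/cpuhotplug.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/list.h>

struct my_cache {
        struct hlist_node node;         /* instance hook for the cpuhp core */
        /* per-cpu data for the object would live here */
};

static int my_cache_cpu_up_prep(unsigned int cpu, struct hlist_node *node)
{
        struct my_cache *c = hlist_entry_safe(node, struct my_cache, node);

        /* allocate/initialise this object's state for @cpu, return 0 or -errno */
        return c ? 0 : -EINVAL;
}

static int my_cache_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
        struct my_cache *c = hlist_entry_safe(node, struct my_cache, node);

        if (c) {
                /* release or shrink this object's state for @cpu */
        }
        return 0;
}

static int __init my_cache_hp_init(void)
{
        /* register the callbacks once; instances are added later */
        return cpuhp_setup_state_multi(CPUHP_NET_FLOW_PREPARE,
                                       "net/flow:prepare",
                                       my_cache_cpu_up_prep,
                                       my_cache_cpu_dead);
}

static int my_cache_attach(struct my_cache *c)
{
        /* runs the up_prep callback for every CPU already online */
        return cpuhp_state_add_instance(CPUHP_NET_FLOW_PREPARE, &c->node);
}

static void my_cache_detach(struct my_cache *c)
{
        /* _nocalls: unhook without invoking the teardown callback */
        cpuhp_state_remove_instance_nocalls(CPUHP_NET_FLOW_PREPARE, &c->node);
}
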
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index c6d8207ffa7e..d98d4998213d 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -58,6 +58,28 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
58EXPORT_SYMBOL(skb_flow_dissector_init); 58EXPORT_SYMBOL(skb_flow_dissector_init);
59 59
60/** 60/**
61 * skb_flow_get_be16 - extract be16 entity
62 * @skb: sk_buff to extract from
63 * @poff: offset to extract at
64 * @data: raw buffer pointer to the packet
65 * @hlen: packet header length
66 *
 67 * The function will try to retrieve a be16 entity at
68 * offset poff
69 */
70static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff,
71 void *data, int hlen)
72{
73 __be16 *u, _u;
74
75 u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u);
76 if (u)
77 return *u;
78
79 return 0;
80}
81
82/**
61 * __skb_flow_get_ports - extract the upper layer ports and return them 83 * __skb_flow_get_ports - extract the upper layer ports and return them
62 * @skb: sk_buff to extract the ports from 84 * @skb: sk_buff to extract the ports from
63 * @thoff: transport header offset 85 * @thoff: transport header offset
@@ -116,7 +138,9 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
116 struct flow_dissector_key_control *key_control; 138 struct flow_dissector_key_control *key_control;
117 struct flow_dissector_key_basic *key_basic; 139 struct flow_dissector_key_basic *key_basic;
118 struct flow_dissector_key_addrs *key_addrs; 140 struct flow_dissector_key_addrs *key_addrs;
141 struct flow_dissector_key_arp *key_arp;
119 struct flow_dissector_key_ports *key_ports; 142 struct flow_dissector_key_ports *key_ports;
143 struct flow_dissector_key_icmp *key_icmp;
120 struct flow_dissector_key_tags *key_tags; 144 struct flow_dissector_key_tags *key_tags;
121 struct flow_dissector_key_vlan *key_vlan; 145 struct flow_dissector_key_vlan *key_vlan;
122 struct flow_dissector_key_keyid *key_keyid; 146 struct flow_dissector_key_keyid *key_keyid;
@@ -356,6 +380,62 @@ mpls:
356 380
357 nhoff += FCOE_HEADER_LEN; 381 nhoff += FCOE_HEADER_LEN;
358 goto out_good; 382 goto out_good;
383
384 case htons(ETH_P_ARP):
385 case htons(ETH_P_RARP): {
386 struct {
387 unsigned char ar_sha[ETH_ALEN];
388 unsigned char ar_sip[4];
389 unsigned char ar_tha[ETH_ALEN];
390 unsigned char ar_tip[4];
391 } *arp_eth, _arp_eth;
392 const struct arphdr *arp;
393 struct arphdr _arp;
394
395 arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data,
396 hlen, &_arp);
397 if (!arp)
398 goto out_bad;
399
400 if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
401 arp->ar_pro != htons(ETH_P_IP) ||
402 arp->ar_hln != ETH_ALEN ||
403 arp->ar_pln != 4 ||
404 (arp->ar_op != htons(ARPOP_REPLY) &&
405 arp->ar_op != htons(ARPOP_REQUEST)))
406 goto out_bad;
407
408 arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp),
409 sizeof(_arp_eth), data,
410 hlen,
411 &_arp_eth);
412 if (!arp_eth)
413 goto out_bad;
414
415 if (dissector_uses_key(flow_dissector,
416 FLOW_DISSECTOR_KEY_ARP)) {
417
418 key_arp = skb_flow_dissector_target(flow_dissector,
419 FLOW_DISSECTOR_KEY_ARP,
420 target_container);
421
422 memcpy(&key_arp->sip, arp_eth->ar_sip,
423 sizeof(key_arp->sip));
424 memcpy(&key_arp->tip, arp_eth->ar_tip,
425 sizeof(key_arp->tip));
426
427 /* Only store the lower byte of the opcode;
428 * this covers ARPOP_REPLY and ARPOP_REQUEST.
429 */
430 key_arp->op = ntohs(arp->ar_op) & 0xff;
431
432 ether_addr_copy(key_arp->sha, arp_eth->ar_sha);
433 ether_addr_copy(key_arp->tha, arp_eth->ar_tha);
434 }
435
436 goto out_good;
437 }
438
359 default: 439 default:
360 goto out_bad; 440 goto out_bad;
361 } 441 }
@@ -445,8 +525,9 @@ ip_proto_again:
445 if (hdr->flags & GRE_ACK) 525 if (hdr->flags & GRE_ACK)
446 offset += sizeof(((struct pptp_gre_header *)0)->ack); 526 offset += sizeof(((struct pptp_gre_header *)0)->ack);
447 527
448 ppp_hdr = skb_header_pointer(skb, nhoff + offset, 528 ppp_hdr = __skb_header_pointer(skb, nhoff + offset,
449 sizeof(_ppp_hdr), _ppp_hdr); 529 sizeof(_ppp_hdr),
530 data, hlen, _ppp_hdr);
450 if (!ppp_hdr) 531 if (!ppp_hdr)
451 goto out_bad; 532 goto out_bad;
452 533
@@ -546,6 +627,14 @@ ip_proto_again:
546 data, hlen); 627 data, hlen);
547 } 628 }
548 629
630 if (dissector_uses_key(flow_dissector,
631 FLOW_DISSECTOR_KEY_ICMP)) {
632 key_icmp = skb_flow_dissector_target(flow_dissector,
633 FLOW_DISSECTOR_KEY_ICMP,
634 target_container);
635 key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen);
636 }
637
549out_good: 638out_good:
550 ret = true; 639 ret = true;
551 640
@@ -726,7 +815,7 @@ EXPORT_SYMBOL(make_flow_keys_digest);
726 815
727static struct flow_dissector flow_keys_dissector_symmetric __read_mostly; 816static struct flow_dissector flow_keys_dissector_symmetric __read_mostly;
728 817
729u32 __skb_get_hash_symmetric(struct sk_buff *skb) 818u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
730{ 819{
731 struct flow_keys keys; 820 struct flow_keys keys;
732 821
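
The dissector extension above accepts only Ethernet/IPv4 ARP frames (hardware type ARPHRD_ETHER, protocol ETH_P_IP, 6-byte hardware and 4-byte protocol addresses, opcode REQUEST or REPLY) and records only the low byte of the opcode. A user-space sketch of the same sanity check and of the payload layout the patch reads follows; the struct names are local stand-ins, not kernel types, and the numeric constants are spelled out for clarity.

#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MY_ETH_ALEN 6

struct arp_fixed_hdr {                  /* fixed part, like struct arphdr */
        uint16_t ar_hrd;                /* hardware type, 1 = Ethernet */
        uint16_t ar_pro;                /* protocol, 0x0800 = IPv4 */
        uint8_t  ar_hln;                /* hardware address length */
        uint8_t  ar_pln;                /* protocol address length */
        uint16_t ar_op;                 /* 1 = request, 2 = reply */
} __attribute__((packed));

struct arp_eth_body {                   /* Ethernet/IPv4 payload read above */
        uint8_t ar_sha[MY_ETH_ALEN];
        uint8_t ar_sip[4];
        uint8_t ar_tha[MY_ETH_ALEN];
        uint8_t ar_tip[4];
} __attribute__((packed));

static bool arp_is_sane(const struct arp_fixed_hdr *arp)
{
        return ntohs(arp->ar_hrd) == 1 &&
               ntohs(arp->ar_pro) == 0x0800 &&
               arp->ar_hln == MY_ETH_ALEN && arp->ar_pln == 4 &&
               (ntohs(arp->ar_op) == 1 || ntohs(arp->ar_op) == 2);
}

int main(void)
{
        struct arp_fixed_hdr h = {
                .ar_hrd = htons(1), .ar_pro = htons(0x0800),
                .ar_hln = MY_ETH_ALEN, .ar_pln = 4, .ar_op = htons(1),
        };
        struct arp_eth_body body = {
                .ar_sip = { 192, 168, 0, 1 },
                .ar_tip = { 192, 168, 0, 2 },
        };
        /* like the dissector, keep only the low byte of the opcode */
        int op = ntohs(h.ar_op) & 0xff;

        printf("sane=%d op=%d sip=%d.%d.%d.%d tip=%d.%d.%d.%d\n",
               arp_is_sane(&h), op,
               body.ar_sip[0], body.ar_sip[1], body.ar_sip[2], body.ar_sip[3],
               body.ar_tip[0], body.ar_tip[1], body.ar_tip[2], body.ar_tip[3]);
        return 0;
}
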
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index cad8e791f28e..0385dece1f6f 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -7,13 +7,14 @@
7 * 2 of the License, or (at your option) any later version. 7 * 2 of the License, or (at your option) any later version.
8 * 8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 * Eric Dumazet <edumazet@google.com>
10 * 11 *
11 * Changes: 12 * Changes:
12 * Jamal Hadi Salim - moved it to net/core and reshulfed 13 * Jamal Hadi Salim - moved it to net/core and reshulfed
13 * names to make it usable in general net subsystem. 14 * names to make it usable in general net subsystem.
14 */ 15 */
15 16
16#include <asm/uaccess.h> 17#include <linux/uaccess.h>
17#include <linux/bitops.h> 18#include <linux/bitops.h>
18#include <linux/module.h> 19#include <linux/module.h>
19#include <linux/types.h> 20#include <linux/types.h>
@@ -30,165 +31,79 @@
30#include <linux/skbuff.h> 31#include <linux/skbuff.h>
31#include <linux/rtnetlink.h> 32#include <linux/rtnetlink.h>
32#include <linux/init.h> 33#include <linux/init.h>
33#include <linux/rbtree.h>
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <linux/seqlock.h>
35#include <net/sock.h> 36#include <net/sock.h>
36#include <net/gen_stats.h> 37#include <net/gen_stats.h>
37 38
38/* 39/* This code is NOT intended to be used for statistics collection,
39 This code is NOT intended to be used for statistics collection, 40 * its purpose is to provide a base for statistical multiplexing
40 its purpose is to provide a base for statistical multiplexing 41 * for controlled load service.
41 for controlled load service. 42 * If you need only statistics, run a user level daemon which
42 If you need only statistics, run a user level daemon which 43 * periodically reads byte counters.
43 periodically reads byte counters.
44
45 Unfortunately, rate estimation is not a very easy task.
46 F.e. I did not find a simple way to estimate the current peak rate
47 and even failed to formulate the problem 8)8)
48
49 So I preferred not to built an estimator into the scheduler,
50 but run this task separately.
51 Ideally, it should be kernel thread(s), but for now it runs
52 from timers, which puts apparent top bounds on the number of rated
53 flows, has minimal overhead on small, but is enough
54 to handle controlled load service, sets of aggregates.
55
56 We measure rate over A=(1<<interval) seconds and evaluate EWMA:
57
58 avrate = avrate*(1-W) + rate*W
59
60 where W is chosen as negative power of 2: W = 2^(-ewma_log)
61
62 The resulting time constant is:
63
64 T = A/(-ln(1-W))
65
66
67 NOTES.
68
69 * avbps and avpps are scaled by 2^5.
70 * both values are reported as 32 bit unsigned values. bps can
71 overflow for fast links : max speed being 34360Mbit/sec
72 * Minimal interval is HZ/4=250msec (it is the greatest common divisor
73 for HZ=100 and HZ=1024 8)), maximal interval
74 is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
75 are too expensive, longer ones can be implemented
76 at user level painlessly.
77 */ 44 */
78 45
79#define EST_MAX_INTERVAL 5 46struct net_rate_estimator {
80
81struct gen_estimator
82{
83 struct list_head list;
84 struct gnet_stats_basic_packed *bstats; 47 struct gnet_stats_basic_packed *bstats;
85 struct gnet_stats_rate_est64 *rate_est;
86 spinlock_t *stats_lock; 48 spinlock_t *stats_lock;
87 seqcount_t *running; 49 seqcount_t *running;
88 int ewma_log; 50 struct gnet_stats_basic_cpu __percpu *cpu_bstats;
51 u8 ewma_log;
52 u8 intvl_log; /* period : (250ms << intvl_log) */
53
54 seqcount_t seq;
89 u32 last_packets; 55 u32 last_packets;
90 unsigned long avpps;
91 u64 last_bytes; 56 u64 last_bytes;
57
58 u64 avpps;
92 u64 avbps; 59 u64 avbps;
93 struct rcu_head e_rcu;
94 struct rb_node node;
95 struct gnet_stats_basic_cpu __percpu *cpu_bstats;
96 struct rcu_head head;
97};
98 60
99struct gen_estimator_head 61 unsigned long next_jiffies;
100{ 62 struct timer_list timer;
101 struct timer_list timer; 63 struct rcu_head rcu;
102 struct list_head list;
103}; 64};
104 65
105static struct gen_estimator_head elist[EST_MAX_INTERVAL+1]; 66static void est_fetch_counters(struct net_rate_estimator *e,
106 67 struct gnet_stats_basic_packed *b)
107/* Protects against NULL dereference */
108static DEFINE_RWLOCK(est_lock);
109
110/* Protects against soft lockup during large deletion */
111static struct rb_root est_root = RB_ROOT;
112static DEFINE_SPINLOCK(est_tree_lock);
113
114static void est_timer(unsigned long arg)
115{ 68{
116 int idx = (int)arg; 69 if (e->stats_lock)
117 struct gen_estimator *e; 70 spin_lock(e->stats_lock);
118 71
119 rcu_read_lock(); 72 __gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats);
120 list_for_each_entry_rcu(e, &elist[idx].list, list) { 73
121 struct gnet_stats_basic_packed b = {0}; 74 if (e->stats_lock)
122 unsigned long rate; 75 spin_unlock(e->stats_lock);
123 u64 brate;
124
125 if (e->stats_lock)
126 spin_lock(e->stats_lock);
127 read_lock(&est_lock);
128 if (e->bstats == NULL)
129 goto skip;
130
131 __gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats);
132
133 brate = (b.bytes - e->last_bytes)<<(7 - idx);
134 e->last_bytes = b.bytes;
135 e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
136 WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5);
137
138 rate = b.packets - e->last_packets;
139 rate <<= (7 - idx);
140 e->last_packets = b.packets;
141 e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
142 WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5);
143skip:
144 read_unlock(&est_lock);
145 if (e->stats_lock)
146 spin_unlock(e->stats_lock);
147 }
148 76
149 if (!list_empty(&elist[idx].list))
150 mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
151 rcu_read_unlock();
152} 77}
153 78
154static void gen_add_node(struct gen_estimator *est) 79static void est_timer(unsigned long arg)
155{ 80{
156 struct rb_node **p = &est_root.rb_node, *parent = NULL; 81 struct net_rate_estimator *est = (struct net_rate_estimator *)arg;
82 struct gnet_stats_basic_packed b;
83 u64 rate, brate;
157 84
158 while (*p) { 85 est_fetch_counters(est, &b);
159 struct gen_estimator *e; 86 brate = (b.bytes - est->last_bytes) << (8 - est->ewma_log);
87 brate -= (est->avbps >> est->ewma_log);
160 88
161 parent = *p; 89 rate = (u64)(b.packets - est->last_packets) << (8 - est->ewma_log);
162 e = rb_entry(parent, struct gen_estimator, node); 90 rate -= (est->avpps >> est->ewma_log);
163 91
164 if (est->bstats > e->bstats) 92 write_seqcount_begin(&est->seq);
165 p = &parent->rb_right; 93 est->avbps += brate;
166 else 94 est->avpps += rate;
167 p = &parent->rb_left; 95 write_seqcount_end(&est->seq);
168 }
169 rb_link_node(&est->node, parent, p);
170 rb_insert_color(&est->node, &est_root);
171}
172 96
173static 97 est->last_bytes = b.bytes;
174struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats, 98 est->last_packets = b.packets;
175 const struct gnet_stats_rate_est64 *rate_est)
176{
177 struct rb_node *p = est_root.rb_node;
178
179 while (p) {
180 struct gen_estimator *e;
181 99
182 e = rb_entry(p, struct gen_estimator, node); 100 est->next_jiffies += ((HZ/4) << est->intvl_log);
183 101
184 if (bstats > e->bstats) 102 if (unlikely(time_after_eq(jiffies, est->next_jiffies))) {
185 p = p->rb_right; 103 /* Ouch... timer was delayed. */
186 else if (bstats < e->bstats || rate_est != e->rate_est) 104 est->next_jiffies = jiffies + 1;
187 p = p->rb_left;
188 else
189 return e;
190 } 105 }
191 return NULL; 106 mod_timer(&est->timer, est->next_jiffies);
192} 107}
193 108
194/** 109/**
@@ -211,83 +126,76 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
211 */ 126 */
212int gen_new_estimator(struct gnet_stats_basic_packed *bstats, 127int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
213 struct gnet_stats_basic_cpu __percpu *cpu_bstats, 128 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
214 struct gnet_stats_rate_est64 *rate_est, 129 struct net_rate_estimator __rcu **rate_est,
215 spinlock_t *stats_lock, 130 spinlock_t *stats_lock,
216 seqcount_t *running, 131 seqcount_t *running,
217 struct nlattr *opt) 132 struct nlattr *opt)
218{ 133{
219 struct gen_estimator *est;
220 struct gnet_estimator *parm = nla_data(opt); 134 struct gnet_estimator *parm = nla_data(opt);
221 struct gnet_stats_basic_packed b = {0}; 135 struct net_rate_estimator *old, *est;
222 int idx; 136 struct gnet_stats_basic_packed b;
137 int intvl_log;
223 138
224 if (nla_len(opt) < sizeof(*parm)) 139 if (nla_len(opt) < sizeof(*parm))
225 return -EINVAL; 140 return -EINVAL;
226 141
142 /* allowed timer periods are :
143 * -2 : 250ms, -1 : 500ms, 0 : 1 sec
144 * 1 : 2 sec, 2 : 4 sec, 3 : 8 sec
145 */
227 if (parm->interval < -2 || parm->interval > 3) 146 if (parm->interval < -2 || parm->interval > 3)
228 return -EINVAL; 147 return -EINVAL;
229 148
230 est = kzalloc(sizeof(*est), GFP_KERNEL); 149 est = kzalloc(sizeof(*est), GFP_KERNEL);
231 if (est == NULL) 150 if (!est)
232 return -ENOBUFS; 151 return -ENOBUFS;
233 152
234 __gnet_stats_copy_basic(running, &b, cpu_bstats, bstats); 153 seqcount_init(&est->seq);
235 154 intvl_log = parm->interval + 2;
236 idx = parm->interval + 2;
237 est->bstats = bstats; 155 est->bstats = bstats;
238 est->rate_est = rate_est;
239 est->stats_lock = stats_lock; 156 est->stats_lock = stats_lock;
240 est->running = running; 157 est->running = running;
241 est->ewma_log = parm->ewma_log; 158 est->ewma_log = parm->ewma_log;
242 est->last_bytes = b.bytes; 159 est->intvl_log = intvl_log;
243 est->avbps = rate_est->bps<<5;
244 est->last_packets = b.packets;
245 est->avpps = rate_est->pps<<10;
246 est->cpu_bstats = cpu_bstats; 160 est->cpu_bstats = cpu_bstats;
247 161
248 spin_lock_bh(&est_tree_lock); 162 est_fetch_counters(est, &b);
249 if (!elist[idx].timer.function) { 163 est->last_bytes = b.bytes;
250 INIT_LIST_HEAD(&elist[idx].list); 164 est->last_packets = b.packets;
251 setup_timer(&elist[idx].timer, est_timer, idx); 165 old = rcu_dereference_protected(*rate_est, 1);
166 if (old) {
167 del_timer_sync(&old->timer);
168 est->avbps = old->avbps;
169 est->avpps = old->avpps;
252 } 170 }
253 171
254 if (list_empty(&elist[idx].list)) 172 est->next_jiffies = jiffies + ((HZ/4) << intvl_log);
255 mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx)); 173 setup_timer(&est->timer, est_timer, (unsigned long)est);
256 174 mod_timer(&est->timer, est->next_jiffies);
257 list_add_rcu(&est->list, &elist[idx].list);
258 gen_add_node(est);
259 spin_unlock_bh(&est_tree_lock);
260 175
176 rcu_assign_pointer(*rate_est, est);
177 if (old)
178 kfree_rcu(old, rcu);
261 return 0; 179 return 0;
262} 180}
263EXPORT_SYMBOL(gen_new_estimator); 181EXPORT_SYMBOL(gen_new_estimator);
264 182
265/** 183/**
266 * gen_kill_estimator - remove a rate estimator 184 * gen_kill_estimator - remove a rate estimator
267 * @bstats: basic statistics 185 * @rate_est: rate estimator
268 * @rate_est: rate estimator statistics
269 * 186 *
270 * Removes the rate estimator specified by &bstats and &rate_est. 187 * Removes the rate estimator.
271 * 188 *
272 * Note : Caller should respect an RCU grace period before freeing stats_lock
273 */ 189 */
274void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, 190void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est)
275 struct gnet_stats_rate_est64 *rate_est)
276{ 191{
277 struct gen_estimator *e; 192 struct net_rate_estimator *est;
278
279 spin_lock_bh(&est_tree_lock);
280 while ((e = gen_find_node(bstats, rate_est))) {
281 rb_erase(&e->node, &est_root);
282 193
283 write_lock(&est_lock); 194 est = xchg((__force struct net_rate_estimator **)rate_est, NULL);
284 e->bstats = NULL; 195 if (est) {
285 write_unlock(&est_lock); 196 del_timer_sync(&est->timer);
286 197 kfree_rcu(est, rcu);
287 list_del_rcu(&e->list);
288 kfree_rcu(e, e_rcu);
289 } 198 }
290 spin_unlock_bh(&est_tree_lock);
291} 199}
292EXPORT_SYMBOL(gen_kill_estimator); 200EXPORT_SYMBOL(gen_kill_estimator);
293 201
@@ -307,33 +215,47 @@ EXPORT_SYMBOL(gen_kill_estimator);
307 */ 215 */
308int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, 216int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
309 struct gnet_stats_basic_cpu __percpu *cpu_bstats, 217 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
310 struct gnet_stats_rate_est64 *rate_est, 218 struct net_rate_estimator __rcu **rate_est,
311 spinlock_t *stats_lock, 219 spinlock_t *stats_lock,
312 seqcount_t *running, struct nlattr *opt) 220 seqcount_t *running, struct nlattr *opt)
313{ 221{
314 gen_kill_estimator(bstats, rate_est); 222 return gen_new_estimator(bstats, cpu_bstats, rate_est,
315 return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt); 223 stats_lock, running, opt);
316} 224}
317EXPORT_SYMBOL(gen_replace_estimator); 225EXPORT_SYMBOL(gen_replace_estimator);
318 226
319/** 227/**
320 * gen_estimator_active - test if estimator is currently in use 228 * gen_estimator_active - test if estimator is currently in use
321 * @bstats: basic statistics 229 * @rate_est: rate estimator
322 * @rate_est: rate estimator statistics
323 * 230 *
324 * Returns true if estimator is active, and false if not. 231 * Returns true if estimator is active, and false if not.
325 */ 232 */
326bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, 233bool gen_estimator_active(struct net_rate_estimator __rcu **rate_est)
327 const struct gnet_stats_rate_est64 *rate_est)
328{ 234{
329 bool res; 235 return !!rcu_access_pointer(*rate_est);
236}
237EXPORT_SYMBOL(gen_estimator_active);
330 238
331 ASSERT_RTNL(); 239bool gen_estimator_read(struct net_rate_estimator __rcu **rate_est,
240 struct gnet_stats_rate_est64 *sample)
241{
242 struct net_rate_estimator *est;
243 unsigned seq;
244
245 rcu_read_lock();
246 est = rcu_dereference(*rate_est);
247 if (!est) {
248 rcu_read_unlock();
249 return false;
250 }
332 251
333 spin_lock_bh(&est_tree_lock); 252 do {
334 res = gen_find_node(bstats, rate_est) != NULL; 253 seq = read_seqcount_begin(&est->seq);
335 spin_unlock_bh(&est_tree_lock); 254 sample->bps = est->avbps >> 8;
255 sample->pps = est->avpps >> 8;
256 } while (read_seqcount_retry(&est->seq, seq));
336 257
337 return res; 258 rcu_read_unlock();
259 return true;
338} 260}
339EXPORT_SYMBOL(gen_estimator_active); 261EXPORT_SYMBOL(gen_estimator_read);
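
The rewritten estimator keeps the EWMA described in the deleted comment, avrate = avrate*(1-W) + rate*W with W = 2^(-ewma_log), but stores the averages scaled by 2^8, drives each estimator from its own timer, and lets readers sample avbps/avpps locklessly through the seqcount in gen_estimator_read(). The standalone sketch below replays the fixed-point update exactly as est_timer() performs it; the traffic figure and the number of periods are made up for illustration.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        unsigned int ewma_log = 3;      /* W = 2^-3, i.e. parm->ewma_log */
        uint64_t avbps = 0;             /* EWMA kept scaled by 2^8, as in the patch */
        uint64_t last_bytes = 0, bytes = 0;

        for (int i = 1; i <= 20; i++) {
                uint64_t brate;

                bytes += 250000;        /* pretend 250 kB arrive each period */

                /* the update done by est_timer() above */
                brate  = (bytes - last_bytes) << (8 - ewma_log);
                brate -= (avbps >> ewma_log);
                avbps += brate;
                last_bytes = bytes;

                /* readers see avbps >> 8 (gen_estimator_read()); in this
                 * sketch it converges to the smoothed per-period byte
                 * count, 250000 */
                printf("period %2d: avg = %llu bytes\n", i,
                       (unsigned long long)(avbps >> 8));
        }
        return 0;
}
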
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 508e051304fb..87f28557b329 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -194,8 +194,7 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
194/** 194/**
195 * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV 195 * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
196 * @d: dumping handle 196 * @d: dumping handle
197 * @b: basic statistics 197 * @rate_est: rate estimator
198 * @r: rate estimator statistics
199 * 198 *
200 * Appends the rate estimator statistics to the top level TLV created by 199 * Appends the rate estimator statistics to the top level TLV created by
201 * gnet_stats_start_copy(). 200 * gnet_stats_start_copy().
@@ -205,18 +204,17 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
205 */ 204 */
206int 205int
207gnet_stats_copy_rate_est(struct gnet_dump *d, 206gnet_stats_copy_rate_est(struct gnet_dump *d,
208 const struct gnet_stats_basic_packed *b, 207 struct net_rate_estimator __rcu **rate_est)
209 struct gnet_stats_rate_est64 *r)
210{ 208{
209 struct gnet_stats_rate_est64 sample;
211 struct gnet_stats_rate_est est; 210 struct gnet_stats_rate_est est;
212 int res; 211 int res;
213 212
214 if (b && !gen_estimator_active(b, r)) 213 if (!gen_estimator_read(rate_est, &sample))
215 return 0; 214 return 0;
216 215 est.bps = min_t(u64, UINT_MAX, sample.bps);
217 est.bps = min_t(u64, UINT_MAX, r->bps);
218 /* we have some time before reaching 2^32 packets per second */ 216 /* we have some time before reaching 2^32 packets per second */
219 est.pps = r->pps; 217 est.pps = sample.pps;
220 218
221 if (d->compat_tc_stats) { 219 if (d->compat_tc_stats) {
222 d->tc_stats.bps = est.bps; 220 d->tc_stats.bps = est.bps;
@@ -226,11 +224,11 @@ gnet_stats_copy_rate_est(struct gnet_dump *d,
226 if (d->tail) { 224 if (d->tail) {
227 res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est), 225 res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est),
228 TCA_STATS_PAD); 226 TCA_STATS_PAD);
229 if (res < 0 || est.bps == r->bps) 227 if (res < 0 || est.bps == sample.bps)
230 return res; 228 return res;
231 /* emit 64bit stats only if needed */ 229 /* emit 64bit stats only if needed */
232 return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r), 230 return gnet_stats_copy(d, TCA_STATS_RATE_EST64, &sample,
233 TCA_STATS_PAD); 231 sizeof(sample), TCA_STATS_PAD);
234 } 232 }
235 233
236 return 0; 234 return 0;
diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
new file mode 100644
index 000000000000..c98bbfbd26b8
--- /dev/null
+++ b/net/core/gro_cells.c
@@ -0,0 +1,92 @@
1#include <linux/skbuff.h>
2#include <linux/slab.h>
3#include <linux/netdevice.h>
4#include <net/gro_cells.h>
5
6struct gro_cell {
7 struct sk_buff_head napi_skbs;
8 struct napi_struct napi;
9};
10
11int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
12{
13 struct net_device *dev = skb->dev;
14 struct gro_cell *cell;
15
16 if (!gcells->cells || skb_cloned(skb) || !(dev->features & NETIF_F_GRO))
17 return netif_rx(skb);
18
19 cell = this_cpu_ptr(gcells->cells);
20
21 if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) {
22 atomic_long_inc(&dev->rx_dropped);
23 kfree_skb(skb);
24 return NET_RX_DROP;
25 }
26
27 __skb_queue_tail(&cell->napi_skbs, skb);
28 if (skb_queue_len(&cell->napi_skbs) == 1)
29 napi_schedule(&cell->napi);
30 return NET_RX_SUCCESS;
31}
32EXPORT_SYMBOL(gro_cells_receive);
33
34/* called under BH context */
35static int gro_cell_poll(struct napi_struct *napi, int budget)
36{
37 struct gro_cell *cell = container_of(napi, struct gro_cell, napi);
38 struct sk_buff *skb;
39 int work_done = 0;
40
41 while (work_done < budget) {
42 skb = __skb_dequeue(&cell->napi_skbs);
43 if (!skb)
44 break;
45 napi_gro_receive(napi, skb);
46 work_done++;
47 }
48
49 if (work_done < budget)
50 napi_complete_done(napi, work_done);
51 return work_done;
52}
53
54int gro_cells_init(struct gro_cells *gcells, struct net_device *dev)
55{
56 int i;
57
58 gcells->cells = alloc_percpu(struct gro_cell);
59 if (!gcells->cells)
60 return -ENOMEM;
61
62 for_each_possible_cpu(i) {
63 struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
64
65 __skb_queue_head_init(&cell->napi_skbs);
66
67 set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state);
68
69 netif_napi_add(dev, &cell->napi, gro_cell_poll,
70 NAPI_POLL_WEIGHT);
71 napi_enable(&cell->napi);
72 }
73 return 0;
74}
75EXPORT_SYMBOL(gro_cells_init);
76
77void gro_cells_destroy(struct gro_cells *gcells)
78{
79 int i;
80
81 if (!gcells->cells)
82 return;
83 for_each_possible_cpu(i) {
84 struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
85
86 netif_napi_del(&cell->napi);
87 __skb_queue_purge(&cell->napi_skbs);
88 }
89 free_percpu(gcells->cells);
90 gcells->cells = NULL;
91}
92EXPORT_SYMBOL(gro_cells_destroy);
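
gro_cells.c gives tunnel and overlay drivers a per-CPU skb queue drained by a per-CPU NAPI instance, so decapsulated packets can be fed through GRO instead of plain netif_rx(). A kernel-style sketch of how a hypothetical driver could wire up the three exported helpers follows; only the gro_cells_* calls are taken from the file above, while my_tunnel and its functions are illustrative.

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/gro_cells.h>

struct my_tunnel {                      /* hypothetical tunnel private data */
        struct gro_cells gro_cells;
        struct net_device *dev;
};

static int my_tunnel_init(struct my_tunnel *t, struct net_device *dev)
{
        t->dev = dev;
        /* allocates one gro_cell (queue + NAPI) per possible CPU */
        return gro_cells_init(&t->gro_cells, dev);
}

static int my_tunnel_deliver(struct my_tunnel *t, struct sk_buff *skb)
{
        skb->dev = t->dev;
        /* queues on this CPU's cell; falls back to netif_rx() if GRO is off */
        return gro_cells_receive(&t->gro_cells, skb);
}

static void my_tunnel_uninit(struct my_tunnel *t)
{
        /* deletes the per-CPU NAPIs and purges any queued skbs */
        gro_cells_destroy(&t->gro_cells);
}
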
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
new file mode 100644
index 000000000000..0cfe7b0216c3
--- /dev/null
+++ b/net/core/lwt_bpf.c
@@ -0,0 +1,397 @@
1/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12
13#include <linux/kernel.h>
14#include <linux/module.h>
15#include <linux/skbuff.h>
16#include <linux/types.h>
17#include <linux/bpf.h>
18#include <net/lwtunnel.h>
19
20struct bpf_lwt_prog {
21 struct bpf_prog *prog;
22 char *name;
23};
24
25struct bpf_lwt {
26 struct bpf_lwt_prog in;
27 struct bpf_lwt_prog out;
28 struct bpf_lwt_prog xmit;
29 int family;
30};
31
32#define MAX_PROG_NAME 256
33
34static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
35{
36 return (struct bpf_lwt *)lwt->data;
37}
38
39#define NO_REDIRECT false
40#define CAN_REDIRECT true
41
42static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
43 struct dst_entry *dst, bool can_redirect)
44{
45 int ret;
46
47 /* Preempt disable is needed to protect per-cpu redirect_info between
48 * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
49 * access to maps strictly require a rcu_read_lock() for protection,
50 * mixing with BH RCU lock doesn't work.
51 */
52 preempt_disable();
53 rcu_read_lock();
54 bpf_compute_data_end(skb);
55 ret = bpf_prog_run_save_cb(lwt->prog, skb);
56 rcu_read_unlock();
57
58 switch (ret) {
59 case BPF_OK:
60 break;
61
62 case BPF_REDIRECT:
63 if (unlikely(!can_redirect)) {
64 pr_warn_once("Illegal redirect return code in prog %s\n",
65 lwt->name ? : "<unknown>");
66 ret = BPF_OK;
67 } else {
68 ret = skb_do_redirect(skb);
69 if (ret == 0)
70 ret = BPF_REDIRECT;
71 }
72 break;
73
74 case BPF_DROP:
75 kfree_skb(skb);
76 ret = -EPERM;
77 break;
78
79 default:
80 pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret);
81 kfree_skb(skb);
82 ret = -EINVAL;
83 break;
84 }
85
86 preempt_enable();
87
88 return ret;
89}
90
91static int bpf_input(struct sk_buff *skb)
92{
93 struct dst_entry *dst = skb_dst(skb);
94 struct bpf_lwt *bpf;
95 int ret;
96
97 bpf = bpf_lwt_lwtunnel(dst->lwtstate);
98 if (bpf->in.prog) {
99 ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
100 if (ret < 0)
101 return ret;
102 }
103
104 if (unlikely(!dst->lwtstate->orig_input)) {
105 pr_warn_once("orig_input not set on dst for prog %s\n",
106 bpf->out.name);
107 kfree_skb(skb);
108 return -EINVAL;
109 }
110
111 return dst->lwtstate->orig_input(skb);
112}
113
114static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
115{
116 struct dst_entry *dst = skb_dst(skb);
117 struct bpf_lwt *bpf;
118 int ret;
119
120 bpf = bpf_lwt_lwtunnel(dst->lwtstate);
121 if (bpf->out.prog) {
122 ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
123 if (ret < 0)
124 return ret;
125 }
126
127 if (unlikely(!dst->lwtstate->orig_output)) {
128 pr_warn_once("orig_output not set on dst for prog %s\n",
129 bpf->out.name);
130 kfree_skb(skb);
131 return -EINVAL;
132 }
133
134 return dst->lwtstate->orig_output(net, sk, skb);
135}
136
137static int xmit_check_hhlen(struct sk_buff *skb)
138{
139 int hh_len = skb_dst(skb)->dev->hard_header_len;
140
141 if (skb_headroom(skb) < hh_len) {
142 int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
143
144 if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
145 return -ENOMEM;
146 }
147
148 return 0;
149}
150
151static int bpf_xmit(struct sk_buff *skb)
152{
153 struct dst_entry *dst = skb_dst(skb);
154 struct bpf_lwt *bpf;
155
156 bpf = bpf_lwt_lwtunnel(dst->lwtstate);
157 if (bpf->xmit.prog) {
158 int ret;
159
160 ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
161 switch (ret) {
162 case BPF_OK:
163 /* If the header was expanded, headroom might be too
164 * small for L2 header to come, expand as needed.
165 */
166 ret = xmit_check_hhlen(skb);
167 if (unlikely(ret))
168 return ret;
169
170 return LWTUNNEL_XMIT_CONTINUE;
171 case BPF_REDIRECT:
172 return LWTUNNEL_XMIT_DONE;
173 default:
174 return ret;
175 }
176 }
177
178 return LWTUNNEL_XMIT_CONTINUE;
179}
180
181static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
182{
183 if (prog->prog)
184 bpf_prog_put(prog->prog);
185
186 kfree(prog->name);
187}
188
189static void bpf_destroy_state(struct lwtunnel_state *lwt)
190{
191 struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
192
193 bpf_lwt_prog_destroy(&bpf->in);
194 bpf_lwt_prog_destroy(&bpf->out);
195 bpf_lwt_prog_destroy(&bpf->xmit);
196}
197
198static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
199 [LWT_BPF_PROG_FD] = { .type = NLA_U32, },
200 [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
201 .len = MAX_PROG_NAME },
202};
203
204static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
205 enum bpf_prog_type type)
206{
207 struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
208 struct bpf_prog *p;
209 int ret;
210 u32 fd;
211
212 ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy);
213 if (ret < 0)
214 return ret;
215
216 if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
217 return -EINVAL;
218
219 prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL);
220 if (!prog->name)
221 return -ENOMEM;
222
223 fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
224 p = bpf_prog_get_type(fd, type);
225 if (IS_ERR(p))
226 return PTR_ERR(p);
227
228 prog->prog = p;
229
230 return 0;
231}
232
233static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
234 [LWT_BPF_IN] = { .type = NLA_NESTED, },
235 [LWT_BPF_OUT] = { .type = NLA_NESTED, },
236 [LWT_BPF_XMIT] = { .type = NLA_NESTED, },
237 [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 },
238};
239
240static int bpf_build_state(struct nlattr *nla,
241 unsigned int family, const void *cfg,
242 struct lwtunnel_state **ts)
243{
244 struct nlattr *tb[LWT_BPF_MAX + 1];
245 struct lwtunnel_state *newts;
246 struct bpf_lwt *bpf;
247 int ret;
248
249 if (family != AF_INET && family != AF_INET6)
250 return -EAFNOSUPPORT;
251
252 ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy);
253 if (ret < 0)
254 return ret;
255
256 if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
257 return -EINVAL;
258
259 newts = lwtunnel_state_alloc(sizeof(*bpf));
260 if (!newts)
261 return -ENOMEM;
262
263 newts->type = LWTUNNEL_ENCAP_BPF;
264 bpf = bpf_lwt_lwtunnel(newts);
265
266 if (tb[LWT_BPF_IN]) {
267 newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
268 ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
269 BPF_PROG_TYPE_LWT_IN);
270 if (ret < 0)
271 goto errout;
272 }
273
274 if (tb[LWT_BPF_OUT]) {
275 newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
276 ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
277 BPF_PROG_TYPE_LWT_OUT);
278 if (ret < 0)
279 goto errout;
280 }
281
282 if (tb[LWT_BPF_XMIT]) {
283 newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
284 ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
285 BPF_PROG_TYPE_LWT_XMIT);
286 if (ret < 0)
287 goto errout;
288 }
289
290 if (tb[LWT_BPF_XMIT_HEADROOM]) {
291 u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);
292
293 if (headroom > LWT_BPF_MAX_HEADROOM) {
294 ret = -ERANGE;
295 goto errout;
296 }
297
298 newts->headroom = headroom;
299 }
300
301 bpf->family = family;
302 *ts = newts;
303
304 return 0;
305
306errout:
307 bpf_destroy_state(newts);
308 kfree(newts);
309 return ret;
310}
311
312static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
313 struct bpf_lwt_prog *prog)
314{
315 struct nlattr *nest;
316
317 if (!prog->prog)
318 return 0;
319
320 nest = nla_nest_start(skb, attr);
321 if (!nest)
322 return -EMSGSIZE;
323
324 if (prog->name &&
325 nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
326 return -EMSGSIZE;
327
328 return nla_nest_end(skb, nest);
329}
330
331static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
332{
333 struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
334
335 if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
336 bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
337 bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
338 return -EMSGSIZE;
339
340 return 0;
341}
342
343static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
344{
345 int nest_len = nla_total_size(sizeof(struct nlattr)) +
346 nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
347 0;
348
349 return nest_len + /* LWT_BPF_IN */
350 nest_len + /* LWT_BPF_OUT */
351 nest_len + /* LWT_BPF_XMIT */
352 0;
353}
354
355static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
356{
357 /* FIXME:
358 * The LWT state is currently rebuilt for delete requests which
359 * results in a new bpf_prog instance. Comparing names for now.
360 */
361 if (!a->name && !b->name)
362 return 0;
363
364 if (!a->name || !b->name)
365 return 1;
366
367 return strcmp(a->name, b->name);
368}
369
370static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
371{
372 struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
373 struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);
374
375 return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
376 bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
377 bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
378}
379
380static const struct lwtunnel_encap_ops bpf_encap_ops = {
381 .build_state = bpf_build_state,
382 .destroy_state = bpf_destroy_state,
383 .input = bpf_input,
384 .output = bpf_output,
385 .xmit = bpf_xmit,
386 .fill_encap = bpf_fill_encap_info,
387 .get_encap_size = bpf_encap_nlsize,
388 .cmp_encap = bpf_encap_cmp,
389 .owner = THIS_MODULE,
390};
391
392static int __init bpf_lwt_init(void)
393{
394 return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
395}
396
397subsys_initcall(bpf_lwt_init)
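
For context (not part of the patch): each of the three hooks above runs a program of the matching BPF_PROG_TYPE_LWT_{IN,OUT,XMIT} type and acts on its BPF_OK / BPF_DROP / BPF_REDIRECT verdict. A minimal, hedged sketch of such a program follows; it assumes the usual clang -target bpf build against the kernel UAPI headers, and the __section() helper is defined locally so nothing beyond <uapi/linux/bpf.h> is needed.

	/* lwt_ok_kern.c - illustrative only: lets every packet continue */
	#include <uapi/linux/bpf.h>

	#ifndef __section
	# define __section(NAME) __attribute__((section(NAME), used))
	#endif

	__section("xmit")
	int lwt_xmit_prog(struct __sk_buff *skb)
	{
		/* BPF_DROP would free the skb; BPF_REDIRECT is only honoured
		 * at the xmit hook (see run_lwt_bpf() above)
		 */
		return BPF_OK;
	}

	char __license[] __section("license") = "GPL";

With a recent enough iproute2 this can be attached along the lines of "ip route add 10.1.1.0/24 encap bpf xmit obj lwt_ok_kern.o section xmit dev eth0" (exact syntax depends on the iproute2 version).
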
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index e5f84c26ba1a..6df9f8fabf0c 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -26,6 +26,7 @@
26#include <net/lwtunnel.h> 26#include <net/lwtunnel.h>
27#include <net/rtnetlink.h> 27#include <net/rtnetlink.h>
28#include <net/ip6_fib.h> 28#include <net/ip6_fib.h>
29#include <net/nexthop.h>
29 30
30#ifdef CONFIG_MODULES 31#ifdef CONFIG_MODULES
31 32
@@ -39,6 +40,10 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
39 return "MPLS"; 40 return "MPLS";
40 case LWTUNNEL_ENCAP_ILA: 41 case LWTUNNEL_ENCAP_ILA:
41 return "ILA"; 42 return "ILA";
43 case LWTUNNEL_ENCAP_SEG6:
44 return "SEG6";
45 case LWTUNNEL_ENCAP_BPF:
46 return "BPF";
42 case LWTUNNEL_ENCAP_IP6: 47 case LWTUNNEL_ENCAP_IP6:
43 case LWTUNNEL_ENCAP_IP: 48 case LWTUNNEL_ENCAP_IP:
44 case LWTUNNEL_ENCAP_NONE: 49 case LWTUNNEL_ENCAP_NONE:
@@ -96,7 +101,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
96} 101}
97EXPORT_SYMBOL(lwtunnel_encap_del_ops); 102EXPORT_SYMBOL(lwtunnel_encap_del_ops);
98 103
99int lwtunnel_build_state(struct net_device *dev, u16 encap_type, 104int lwtunnel_build_state(u16 encap_type,
100 struct nlattr *encap, unsigned int family, 105 struct nlattr *encap, unsigned int family,
101 const void *cfg, struct lwtunnel_state **lws) 106 const void *cfg, struct lwtunnel_state **lws)
102{ 107{
@@ -110,25 +115,91 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
110 ret = -EOPNOTSUPP; 115 ret = -EOPNOTSUPP;
111 rcu_read_lock(); 116 rcu_read_lock();
112 ops = rcu_dereference(lwtun_encaps[encap_type]); 117 ops = rcu_dereference(lwtun_encaps[encap_type]);
118 if (likely(ops && ops->build_state && try_module_get(ops->owner))) {
119 ret = ops->build_state(encap, family, cfg, lws);
120 if (ret)
121 module_put(ops->owner);
122 }
123 rcu_read_unlock();
124
125 return ret;
126}
127EXPORT_SYMBOL(lwtunnel_build_state);
128
129int lwtunnel_valid_encap_type(u16 encap_type)
130{
131 const struct lwtunnel_encap_ops *ops;
132 int ret = -EINVAL;
133
134 if (encap_type == LWTUNNEL_ENCAP_NONE ||
135 encap_type > LWTUNNEL_ENCAP_MAX)
136 return ret;
137
138 rcu_read_lock();
139 ops = rcu_dereference(lwtun_encaps[encap_type]);
140 rcu_read_unlock();
113#ifdef CONFIG_MODULES 141#ifdef CONFIG_MODULES
114 if (!ops) { 142 if (!ops) {
115 const char *encap_type_str = lwtunnel_encap_str(encap_type); 143 const char *encap_type_str = lwtunnel_encap_str(encap_type);
116 144
117 if (encap_type_str) { 145 if (encap_type_str) {
118 rcu_read_unlock(); 146 __rtnl_unlock();
119 request_module("rtnl-lwt-%s", encap_type_str); 147 request_module("rtnl-lwt-%s", encap_type_str);
148 rtnl_lock();
149
120 rcu_read_lock(); 150 rcu_read_lock();
121 ops = rcu_dereference(lwtun_encaps[encap_type]); 151 ops = rcu_dereference(lwtun_encaps[encap_type]);
152 rcu_read_unlock();
122 } 153 }
123 } 154 }
124#endif 155#endif
125 if (likely(ops && ops->build_state)) 156 return ops ? 0 : -EOPNOTSUPP;
126 ret = ops->build_state(dev, encap, family, cfg, lws); 157}
127 rcu_read_unlock(); 158EXPORT_SYMBOL(lwtunnel_valid_encap_type);
128 159
129 return ret; 160int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining)
161{
162 struct rtnexthop *rtnh = (struct rtnexthop *)attr;
163 struct nlattr *nla_entype;
164 struct nlattr *attrs;
165 struct nlattr *nla;
166 u16 encap_type;
167 int attrlen;
168
169 while (rtnh_ok(rtnh, remaining)) {
170 attrlen = rtnh_attrlen(rtnh);
171 if (attrlen > 0) {
172 attrs = rtnh_attrs(rtnh);
173 nla = nla_find(attrs, attrlen, RTA_ENCAP);
174 nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
175
176 if (nla_entype) {
177 encap_type = nla_get_u16(nla_entype);
178
179 if (lwtunnel_valid_encap_type(encap_type) != 0)
180 return -EOPNOTSUPP;
181 }
182 }
183 rtnh = rtnh_next(rtnh, &remaining);
184 }
185
186 return 0;
130} 187}
131EXPORT_SYMBOL(lwtunnel_build_state); 188EXPORT_SYMBOL(lwtunnel_valid_encap_type_attr);
189
190void lwtstate_free(struct lwtunnel_state *lws)
191{
192 const struct lwtunnel_encap_ops *ops = lwtun_encaps[lws->type];
193
194 if (ops->destroy_state) {
195 ops->destroy_state(lws);
196 kfree_rcu(lws, rcu);
197 } else {
198 kfree(lws);
199 }
200 module_put(ops->owner);
201}
202EXPORT_SYMBOL(lwtstate_free);
132 203
133int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) 204int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate)
134{ 205{
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 2ae929f9bd06..4526cbd7e28a 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -100,6 +100,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh)
100 neigh->parms->neigh_cleanup(neigh); 100 neigh->parms->neigh_cleanup(neigh);
101 101
102 __neigh_notify(neigh, RTM_DELNEIGH, 0); 102 __neigh_notify(neigh, RTM_DELNEIGH, 0);
103 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
103 neigh_release(neigh); 104 neigh_release(neigh);
104} 105}
105 106
@@ -859,7 +860,8 @@ static void neigh_probe(struct neighbour *neigh)
859 if (skb) 860 if (skb)
860 skb = skb_clone(skb, GFP_ATOMIC); 861 skb = skb_clone(skb, GFP_ATOMIC);
861 write_unlock(&neigh->lock); 862 write_unlock(&neigh->lock);
862 neigh->ops->solicit(neigh, skb); 863 if (neigh->ops->solicit)
864 neigh->ops->solicit(neigh, skb);
863 atomic_inc(&neigh->probes); 865 atomic_inc(&neigh->probes);
864 kfree_skb(skb); 866 kfree_skb(skb);
865} 867}
@@ -2291,13 +2293,10 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
2291 for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0; 2293 for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0;
2292 n != NULL; 2294 n != NULL;
2293 n = rcu_dereference_bh(n->next)) { 2295 n = rcu_dereference_bh(n->next)) {
2294 if (!net_eq(dev_net(n->dev), net)) 2296 if (idx < s_idx || !net_eq(dev_net(n->dev), net))
2295 continue; 2297 goto next;
2296 if (neigh_ifindex_filtered(n->dev, filter_idx)) 2298 if (neigh_ifindex_filtered(n->dev, filter_idx) ||
2297 continue; 2299 neigh_master_filtered(n->dev, filter_master_idx))
2298 if (neigh_master_filtered(n->dev, filter_master_idx))
2299 continue;
2300 if (idx < s_idx)
2301 goto next; 2300 goto next;
2302 if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, 2301 if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
2303 cb->nlh->nlmsg_seq, 2302 cb->nlh->nlmsg_seq,
@@ -2332,9 +2331,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
2332 if (h > s_h) 2331 if (h > s_h)
2333 s_idx = 0; 2332 s_idx = 0;
2334 for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { 2333 for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) {
2335 if (pneigh_net(n) != net) 2334 if (idx < s_idx || pneigh_net(n) != net)
2336 continue;
2337 if (idx < s_idx)
2338 goto next; 2335 goto next;
2339 if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, 2336 if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
2340 cb->nlh->nlmsg_seq, 2337 cb->nlh->nlmsg_seq,
@@ -2927,7 +2924,8 @@ static void neigh_proc_update(struct ctl_table *ctl, int write)
2927 return; 2924 return;
2928 2925
2929 set_bit(index, p->data_state); 2926 set_bit(index, p->data_state);
2930 call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); 2927 if (index == NEIGH_VAR_DELAY_PROBE_TIME)
2928 call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
2931 if (!dev) /* NULL dev means this is default value */ 2929 if (!dev) /* NULL dev means this is default value */
2932 neigh_copy_dflt_parms(net, p, index); 2930 neigh_copy_dflt_parms(net, p, index);
2933} 2931}
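
For context (not part of the patch): with the extra call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, ...) in neigh_cleanup_and_release(), netevent listeners are now told about neighbour removal as well. A hedged sketch of such a listener (the "foo" names are illustrative):

	#include <linux/kernel.h>
	#include <linux/notifier.h>
	#include <net/netevent.h>
	#include <net/neighbour.h>

	static int foo_netevent_cb(struct notifier_block *nb,
				   unsigned long event, void *ptr)
	{
		struct neighbour *n;

		switch (event) {
		case NETEVENT_NEIGH_UPDATE:
			n = ptr;
			/* drop or re-validate anything cached for n here;
			 * the neighbour may be in the middle of going away
			 */
			pr_debug("neigh %pM changed or removed\n", n->ha);
			break;
		}
		return NOTIFY_DONE;
	}

	static struct notifier_block foo_netevent_nb = {
		.notifier_call = foo_netevent_cb,
	};

	/* register_netevent_notifier(&foo_netevent_nb) at init time,
	 * unregister_netevent_notifier() on teardown
	 */
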
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 6e4f34721080..65ea0ff4017c 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -15,6 +15,7 @@
15#include <net/switchdev.h> 15#include <net/switchdev.h>
16#include <linux/if_arp.h> 16#include <linux/if_arp.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/sched/signal.h>
18#include <linux/nsproxy.h> 19#include <linux/nsproxy.h>
19#include <net/sock.h> 20#include <net/sock.h>
20#include <net/net_namespace.h> 21#include <net/net_namespace.h>
@@ -950,10 +951,13 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
950 } 951 }
951 952
952 while (--i >= new_num) { 953 while (--i >= new_num) {
954 struct kobject *kobj = &dev->_rx[i].kobj;
955
956 if (!atomic_read(&dev_net(dev)->count))
957 kobj->uevent_suppress = 1;
953 if (dev->sysfs_rx_queue_group) 958 if (dev->sysfs_rx_queue_group)
954 sysfs_remove_group(&dev->_rx[i].kobj, 959 sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
955 dev->sysfs_rx_queue_group); 960 kobject_put(kobj);
956 kobject_put(&dev->_rx[i].kobj);
957 } 961 }
958 962
959 return error; 963 return error;
@@ -1021,7 +1025,6 @@ static ssize_t show_trans_timeout(struct netdev_queue *queue,
1021 return sprintf(buf, "%lu", trans_timeout); 1025 return sprintf(buf, "%lu", trans_timeout);
1022} 1026}
1023 1027
1024#ifdef CONFIG_XPS
1025static unsigned int get_netdev_queue_index(struct netdev_queue *queue) 1028static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
1026{ 1029{
1027 struct net_device *dev = queue->dev; 1030 struct net_device *dev = queue->dev;
@@ -1033,6 +1036,21 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
1033 return i; 1036 return i;
1034} 1037}
1035 1038
1039static ssize_t show_traffic_class(struct netdev_queue *queue,
1040 struct netdev_queue_attribute *attribute,
1041 char *buf)
1042{
1043 struct net_device *dev = queue->dev;
1044 int index = get_netdev_queue_index(queue);
1045 int tc = netdev_txq_to_tc(dev, index);
1046
1047 if (tc < 0)
1048 return -EINVAL;
1049
1050 return sprintf(buf, "%u\n", tc);
1051}
1052
1053#ifdef CONFIG_XPS
1036static ssize_t show_tx_maxrate(struct netdev_queue *queue, 1054static ssize_t show_tx_maxrate(struct netdev_queue *queue,
1037 struct netdev_queue_attribute *attribute, 1055 struct netdev_queue_attribute *attribute,
1038 char *buf) 1056 char *buf)
@@ -1075,6 +1093,9 @@ static struct netdev_queue_attribute queue_tx_maxrate =
1075static struct netdev_queue_attribute queue_trans_timeout = 1093static struct netdev_queue_attribute queue_trans_timeout =
1076 __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL); 1094 __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL);
1077 1095
1096static struct netdev_queue_attribute queue_traffic_class =
1097 __ATTR(traffic_class, S_IRUGO, show_traffic_class, NULL);
1098
1078#ifdef CONFIG_BQL 1099#ifdef CONFIG_BQL
1079/* 1100/*
1080 * Byte queue limits sysfs structures and functions. 1101 * Byte queue limits sysfs structures and functions.
@@ -1190,29 +1211,38 @@ static ssize_t show_xps_map(struct netdev_queue *queue,
1190 struct netdev_queue_attribute *attribute, char *buf) 1211 struct netdev_queue_attribute *attribute, char *buf)
1191{ 1212{
1192 struct net_device *dev = queue->dev; 1213 struct net_device *dev = queue->dev;
1214 int cpu, len, num_tc = 1, tc = 0;
1193 struct xps_dev_maps *dev_maps; 1215 struct xps_dev_maps *dev_maps;
1194 cpumask_var_t mask; 1216 cpumask_var_t mask;
1195 unsigned long index; 1217 unsigned long index;
1196 int i, len;
1197 1218
1198 if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) 1219 if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
1199 return -ENOMEM; 1220 return -ENOMEM;
1200 1221
1201 index = get_netdev_queue_index(queue); 1222 index = get_netdev_queue_index(queue);
1202 1223
1224 if (dev->num_tc) {
1225 num_tc = dev->num_tc;
1226 tc = netdev_txq_to_tc(dev, index);
1227 if (tc < 0)
1228 return -EINVAL;
1229 }
1230
1203 rcu_read_lock(); 1231 rcu_read_lock();
1204 dev_maps = rcu_dereference(dev->xps_maps); 1232 dev_maps = rcu_dereference(dev->xps_maps);
1205 if (dev_maps) { 1233 if (dev_maps) {
1206 for_each_possible_cpu(i) { 1234 for_each_possible_cpu(cpu) {
1207 struct xps_map *map = 1235 int i, tci = cpu * num_tc + tc;
1208 rcu_dereference(dev_maps->cpu_map[i]); 1236 struct xps_map *map;
1209 if (map) { 1237
1210 int j; 1238 map = rcu_dereference(dev_maps->cpu_map[tci]);
1211 for (j = 0; j < map->len; j++) { 1239 if (!map)
1212 if (map->queues[j] == index) { 1240 continue;
1213 cpumask_set_cpu(i, mask); 1241
1214 break; 1242 for (i = map->len; i--;) {
1215 } 1243 if (map->queues[i] == index) {
1244 cpumask_set_cpu(cpu, mask);
1245 break;
1216 } 1246 }
1217 } 1247 }
1218 } 1248 }
@@ -1260,6 +1290,7 @@ static struct netdev_queue_attribute xps_cpus_attribute =
1260 1290
1261static struct attribute *netdev_queue_default_attrs[] = { 1291static struct attribute *netdev_queue_default_attrs[] = {
1262 &queue_trans_timeout.attr, 1292 &queue_trans_timeout.attr,
1293 &queue_traffic_class.attr,
1263#ifdef CONFIG_XPS 1294#ifdef CONFIG_XPS
1264 &xps_cpus_attribute.attr, 1295 &xps_cpus_attribute.attr,
1265 &queue_tx_maxrate.attr, 1296 &queue_tx_maxrate.attr,
@@ -1340,6 +1371,8 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
1340 while (--i >= new_num) { 1371 while (--i >= new_num) {
1341 struct netdev_queue *queue = dev->_tx + i; 1372 struct netdev_queue *queue = dev->_tx + i;
1342 1373
1374 if (!atomic_read(&dev_net(dev)->count))
1375 queue->kobj.uevent_suppress = 1;
1343#ifdef CONFIG_BQL 1376#ifdef CONFIG_BQL
1344 sysfs_remove_group(&queue->kobj, &dql_group); 1377 sysfs_remove_group(&queue->kobj, &dql_group);
1345#endif 1378#endif
@@ -1525,6 +1558,9 @@ void netdev_unregister_kobject(struct net_device *ndev)
1525{ 1558{
1526 struct device *dev = &(ndev->dev); 1559 struct device *dev = &(ndev->dev);
1527 1560
1561 if (!atomic_read(&dev_net(ndev)->count))
1562 dev_set_uevent_suppress(dev, 1);
1563
1528 kobject_get(&dev->kobj); 1564 kobject_get(&dev->kobj);
1529 1565
1530 remove_queue_kobjects(ndev); 1566 remove_queue_kobjects(ndev);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 7001da910c6b..652468ff65b7 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -16,6 +16,8 @@
16#include <linux/export.h> 16#include <linux/export.h>
17#include <linux/user_namespace.h> 17#include <linux/user_namespace.h>
18#include <linux/net_namespace.h> 18#include <linux/net_namespace.h>
19#include <linux/sched/task.h>
20
19#include <net/sock.h> 21#include <net/sock.h>
20#include <net/netlink.h> 22#include <net/netlink.h>
21#include <net/net_namespace.h> 23#include <net/net_namespace.h>
@@ -39,6 +41,9 @@ EXPORT_SYMBOL(init_net);
39 41
40static bool init_net_initialized; 42static bool init_net_initialized;
41 43
44#define MIN_PERNET_OPS_ID \
45 ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
46
42#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ 47#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */
43 48
44static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; 49static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
@@ -46,27 +51,28 @@ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
46static struct net_generic *net_alloc_generic(void) 51static struct net_generic *net_alloc_generic(void)
47{ 52{
48 struct net_generic *ng; 53 struct net_generic *ng;
49 size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]); 54 unsigned int generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
50 55
51 ng = kzalloc(generic_size, GFP_KERNEL); 56 ng = kzalloc(generic_size, GFP_KERNEL);
52 if (ng) 57 if (ng)
53 ng->len = max_gen_ptrs; 58 ng->s.len = max_gen_ptrs;
54 59
55 return ng; 60 return ng;
56} 61}
57 62
58static int net_assign_generic(struct net *net, int id, void *data) 63static int net_assign_generic(struct net *net, unsigned int id, void *data)
59{ 64{
60 struct net_generic *ng, *old_ng; 65 struct net_generic *ng, *old_ng;
61 66
62 BUG_ON(!mutex_is_locked(&net_mutex)); 67 BUG_ON(!mutex_is_locked(&net_mutex));
63 BUG_ON(id == 0); 68 BUG_ON(id < MIN_PERNET_OPS_ID);
64 69
65 old_ng = rcu_dereference_protected(net->gen, 70 old_ng = rcu_dereference_protected(net->gen,
66 lockdep_is_held(&net_mutex)); 71 lockdep_is_held(&net_mutex));
67 ng = old_ng; 72 if (old_ng->s.len > id) {
68 if (old_ng->len >= id) 73 old_ng->ptr[id] = data;
69 goto assign; 74 return 0;
75 }
70 76
71 ng = net_alloc_generic(); 77 ng = net_alloc_generic();
72 if (ng == NULL) 78 if (ng == NULL)
@@ -83,12 +89,12 @@ static int net_assign_generic(struct net *net, int id, void *data)
83 * the old copy for kfree after a grace period. 89 * the old copy for kfree after a grace period.
84 */ 90 */
85 91
86 memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*)); 92 memcpy(&ng->ptr[MIN_PERNET_OPS_ID], &old_ng->ptr[MIN_PERNET_OPS_ID],
93 (old_ng->s.len - MIN_PERNET_OPS_ID) * sizeof(void *));
94 ng->ptr[id] = data;
87 95
88 rcu_assign_pointer(net->gen, ng); 96 rcu_assign_pointer(net->gen, ng);
89 kfree_rcu(old_ng, rcu); 97 kfree_rcu(old_ng, s.rcu);
90assign:
91 ng->ptr[id - 1] = data;
92 return 0; 98 return 0;
93} 99}
94 100
@@ -122,8 +128,7 @@ out:
122static void ops_free(const struct pernet_operations *ops, struct net *net) 128static void ops_free(const struct pernet_operations *ops, struct net *net)
123{ 129{
124 if (ops->id && ops->size) { 130 if (ops->id && ops->size) {
125 int id = *ops->id; 131 kfree(net_generic(net, *ops->id));
126 kfree(net_generic(net, id));
127 } 132 }
128} 133}
129 134
@@ -215,16 +220,15 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id);
215 */ 220 */
216int peernet2id_alloc(struct net *net, struct net *peer) 221int peernet2id_alloc(struct net *net, struct net *peer)
217{ 222{
218 unsigned long flags;
219 bool alloc; 223 bool alloc;
220 int id; 224 int id;
221 225
222 if (atomic_read(&net->count) == 0) 226 if (atomic_read(&net->count) == 0)
223 return NETNSA_NSID_NOT_ASSIGNED; 227 return NETNSA_NSID_NOT_ASSIGNED;
224 spin_lock_irqsave(&net->nsid_lock, flags); 228 spin_lock_bh(&net->nsid_lock);
225 alloc = atomic_read(&peer->count) == 0 ? false : true; 229 alloc = atomic_read(&peer->count) == 0 ? false : true;
226 id = __peernet2id_alloc(net, peer, &alloc); 230 id = __peernet2id_alloc(net, peer, &alloc);
227 spin_unlock_irqrestore(&net->nsid_lock, flags); 231 spin_unlock_bh(&net->nsid_lock);
228 if (alloc && id >= 0) 232 if (alloc && id >= 0)
229 rtnl_net_notifyid(net, RTM_NEWNSID, id); 233 rtnl_net_notifyid(net, RTM_NEWNSID, id);
230 return id; 234 return id;
@@ -233,12 +237,11 @@ int peernet2id_alloc(struct net *net, struct net *peer)
233/* This function returns, if assigned, the id of a peer netns. */ 237/* This function returns, if assigned, the id of a peer netns. */
234int peernet2id(struct net *net, struct net *peer) 238int peernet2id(struct net *net, struct net *peer)
235{ 239{
236 unsigned long flags;
237 int id; 240 int id;
238 241
239 spin_lock_irqsave(&net->nsid_lock, flags); 242 spin_lock_bh(&net->nsid_lock);
240 id = __peernet2id(net, peer); 243 id = __peernet2id(net, peer);
241 spin_unlock_irqrestore(&net->nsid_lock, flags); 244 spin_unlock_bh(&net->nsid_lock);
242 return id; 245 return id;
243} 246}
244EXPORT_SYMBOL(peernet2id); 247EXPORT_SYMBOL(peernet2id);
@@ -253,18 +256,17 @@ bool peernet_has_id(struct net *net, struct net *peer)
253 256
254struct net *get_net_ns_by_id(struct net *net, int id) 257struct net *get_net_ns_by_id(struct net *net, int id)
255{ 258{
256 unsigned long flags;
257 struct net *peer; 259 struct net *peer;
258 260
259 if (id < 0) 261 if (id < 0)
260 return NULL; 262 return NULL;
261 263
262 rcu_read_lock(); 264 rcu_read_lock();
263 spin_lock_irqsave(&net->nsid_lock, flags); 265 spin_lock_bh(&net->nsid_lock);
264 peer = idr_find(&net->netns_ids, id); 266 peer = idr_find(&net->netns_ids, id);
265 if (peer) 267 if (peer)
266 get_net(peer); 268 get_net(peer);
267 spin_unlock_irqrestore(&net->nsid_lock, flags); 269 spin_unlock_bh(&net->nsid_lock);
268 rcu_read_unlock(); 270 rcu_read_unlock();
269 271
270 return peer; 272 return peer;
@@ -384,7 +386,14 @@ struct net *copy_net_ns(unsigned long flags,
384 386
385 get_user_ns(user_ns); 387 get_user_ns(user_ns);
386 388
387 mutex_lock(&net_mutex); 389 rv = mutex_lock_killable(&net_mutex);
390 if (rv < 0) {
391 net_free(net);
392 dec_net_namespaces(ucounts);
393 put_user_ns(user_ns);
394 return ERR_PTR(rv);
395 }
396
388 net->ucounts = ucounts; 397 net->ucounts = ucounts;
389 rv = setup_net(net, user_ns); 398 rv = setup_net(net, user_ns);
390 if (rv == 0) { 399 if (rv == 0) {
@@ -427,17 +436,17 @@ static void cleanup_net(struct work_struct *work)
427 for_each_net(tmp) { 436 for_each_net(tmp) {
428 int id; 437 int id;
429 438
430 spin_lock_irq(&tmp->nsid_lock); 439 spin_lock_bh(&tmp->nsid_lock);
431 id = __peernet2id(tmp, net); 440 id = __peernet2id(tmp, net);
432 if (id >= 0) 441 if (id >= 0)
433 idr_remove(&tmp->netns_ids, id); 442 idr_remove(&tmp->netns_ids, id);
434 spin_unlock_irq(&tmp->nsid_lock); 443 spin_unlock_bh(&tmp->nsid_lock);
435 if (id >= 0) 444 if (id >= 0)
436 rtnl_net_notifyid(tmp, RTM_DELNSID, id); 445 rtnl_net_notifyid(tmp, RTM_DELNSID, id);
437 } 446 }
438 spin_lock_irq(&net->nsid_lock); 447 spin_lock_bh(&net->nsid_lock);
439 idr_destroy(&net->netns_ids); 448 idr_destroy(&net->netns_ids);
440 spin_unlock_irq(&net->nsid_lock); 449 spin_unlock_bh(&net->nsid_lock);
441 450
442 } 451 }
443 rtnl_unlock(); 452 rtnl_unlock();
@@ -566,7 +575,6 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
566{ 575{
567 struct net *net = sock_net(skb->sk); 576 struct net *net = sock_net(skb->sk);
568 struct nlattr *tb[NETNSA_MAX + 1]; 577 struct nlattr *tb[NETNSA_MAX + 1];
569 unsigned long flags;
570 struct net *peer; 578 struct net *peer;
571 int nsid, err; 579 int nsid, err;
572 580
@@ -587,15 +595,15 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
587 if (IS_ERR(peer)) 595 if (IS_ERR(peer))
588 return PTR_ERR(peer); 596 return PTR_ERR(peer);
589 597
590 spin_lock_irqsave(&net->nsid_lock, flags); 598 spin_lock_bh(&net->nsid_lock);
591 if (__peernet2id(net, peer) >= 0) { 599 if (__peernet2id(net, peer) >= 0) {
592 spin_unlock_irqrestore(&net->nsid_lock, flags); 600 spin_unlock_bh(&net->nsid_lock);
593 err = -EEXIST; 601 err = -EEXIST;
594 goto out; 602 goto out;
595 } 603 }
596 604
597 err = alloc_netid(net, peer, nsid); 605 err = alloc_netid(net, peer, nsid);
598 spin_unlock_irqrestore(&net->nsid_lock, flags); 606 spin_unlock_bh(&net->nsid_lock);
599 if (err >= 0) { 607 if (err >= 0) {
600 rtnl_net_notifyid(net, RTM_NEWNSID, err); 608 rtnl_net_notifyid(net, RTM_NEWNSID, err);
601 err = 0; 609 err = 0;
@@ -717,11 +725,10 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
717 .idx = 0, 725 .idx = 0,
718 .s_idx = cb->args[0], 726 .s_idx = cb->args[0],
719 }; 727 };
720 unsigned long flags;
721 728
722 spin_lock_irqsave(&net->nsid_lock, flags); 729 spin_lock_bh(&net->nsid_lock);
723 idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); 730 idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb);
724 spin_unlock_irqrestore(&net->nsid_lock, flags); 731 spin_unlock_bh(&net->nsid_lock);
725 732
726 cb->args[0] = net_cb.idx; 733 cb->args[0] = net_cb.idx;
727 return skb->len; 734 return skb->len;
@@ -868,7 +875,7 @@ static int register_pernet_operations(struct list_head *list,
868 875
869 if (ops->id) { 876 if (ops->id) {
870again: 877again:
871 error = ida_get_new_above(&net_generic_ids, 1, ops->id); 878 error = ida_get_new_above(&net_generic_ids, MIN_PERNET_OPS_ID, ops->id);
872 if (error < 0) { 879 if (error < 0) {
873 if (error == -EAGAIN) { 880 if (error == -EAGAIN) {
874 ida_pre_get(&net_generic_ids, GFP_KERNEL); 881 ida_pre_get(&net_generic_ids, GFP_KERNEL);
@@ -876,7 +883,7 @@ again:
876 } 883 }
877 return error; 884 return error;
878 } 885 }
879 max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id); 886 max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1);
880 } 887 }
881 error = __register_pernet_operations(list, ops); 888 error = __register_pernet_operations(list, ops);
882 if (error) { 889 if (error) {
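
For context (not part of the patch): the reindexing above (ids handed out from MIN_PERNET_OPS_ID upwards and used directly as ptr[id], instead of the old 1-based ptr[id - 1]) is invisible to pernet users, which keep following the usual pattern sketched here (the "foo" names are illustrative):

	#include <net/net_namespace.h>
	#include <net/netns/generic.h>

	struct foo_net {
		int counter;
	};

	static unsigned int foo_net_id __read_mostly;

	static struct foo_net *foo_pernet(struct net *net)
	{
		/* now simply net->gen->ptr[foo_net_id] under RCU */
		return net_generic(net, foo_net_id);
	}

	static struct pernet_operations foo_net_ops = {
		.id   = &foo_net_id,		/* filled in at registration */
		.size = sizeof(struct foo_net),	/* allocated per netns and
						 * stored via net_assign_generic()
						 */
	};

	/* register_pernet_subsys(&foo_net_ops) at init,
	 * unregister_pernet_subsys() on exit
	 */
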
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 11fce17274f6..029a61ac6cdd 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -12,6 +12,8 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/cgroup.h> 13#include <linux/cgroup.h>
14#include <linux/fdtable.h> 14#include <linux/fdtable.h>
15#include <linux/sched/task.h>
16
15#include <net/cls_cgroup.h> 17#include <net/cls_cgroup.h>
16#include <net/sock.h> 18#include <net/sock.h>
17 19
@@ -69,27 +71,17 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n)
69 return 0; 71 return 0;
70} 72}
71 73
72static void update_classid(struct cgroup_subsys_state *css, void *v) 74static void cgrp_attach(struct cgroup_taskset *tset)
73{ 75{
74 struct css_task_iter it; 76 struct cgroup_subsys_state *css;
75 struct task_struct *p; 77 struct task_struct *p;
76 78
77 css_task_iter_start(css, &it); 79 cgroup_taskset_for_each(p, css, tset) {
78 while ((p = css_task_iter_next(&it))) {
79 task_lock(p); 80 task_lock(p);
80 iterate_fd(p->files, 0, update_classid_sock, v); 81 iterate_fd(p->files, 0, update_classid_sock,
82 (void *)(unsigned long)css_cls_state(css)->classid);
81 task_unlock(p); 83 task_unlock(p);
82 } 84 }
83 css_task_iter_end(&it);
84}
85
86static void cgrp_attach(struct cgroup_taskset *tset)
87{
88 struct cgroup_subsys_state *css;
89
90 cgroup_taskset_first(tset, &css);
91 update_classid(css,
92 (void *)(unsigned long)css_cls_state(css)->classid);
93} 85}
94 86
95static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) 87static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft)
@@ -101,12 +93,22 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
101 u64 value) 93 u64 value)
102{ 94{
103 struct cgroup_cls_state *cs = css_cls_state(css); 95 struct cgroup_cls_state *cs = css_cls_state(css);
96 struct css_task_iter it;
97 struct task_struct *p;
104 98
105 cgroup_sk_alloc_disable(); 99 cgroup_sk_alloc_disable();
106 100
107 cs->classid = (u32)value; 101 cs->classid = (u32)value;
108 102
109 update_classid(css, (void *)(unsigned long)cs->classid); 103 css_task_iter_start(css, &it);
104 while ((p = css_task_iter_next(&it))) {
105 task_lock(p);
106 iterate_fd(p->files, 0, update_classid_sock,
107 (void *)(unsigned long)cs->classid);
108 task_unlock(p);
109 }
110 css_task_iter_end(&it);
111
110 return 0; 112 return 0;
111} 113}
112 114
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 53599bd0c82d..29be2466970c 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -105,15 +105,21 @@ static void queue_process(struct work_struct *work)
105 while ((skb = skb_dequeue(&npinfo->txq))) { 105 while ((skb = skb_dequeue(&npinfo->txq))) {
106 struct net_device *dev = skb->dev; 106 struct net_device *dev = skb->dev;
107 struct netdev_queue *txq; 107 struct netdev_queue *txq;
108 unsigned int q_index;
108 109
109 if (!netif_device_present(dev) || !netif_running(dev)) { 110 if (!netif_device_present(dev) || !netif_running(dev)) {
110 kfree_skb(skb); 111 kfree_skb(skb);
111 continue; 112 continue;
112 } 113 }
113 114
114 txq = skb_get_tx_queue(dev, skb);
115
116 local_irq_save(flags); 115 local_irq_save(flags);
116 /* check if skb->queue_mapping is still valid */
117 q_index = skb_get_queue_mapping(skb);
118 if (unlikely(q_index >= dev->real_num_tx_queues)) {
119 q_index = q_index % dev->real_num_tx_queues;
120 skb_set_queue_mapping(skb, q_index);
121 }
122 txq = netdev_get_tx_queue(dev, q_index);
117 HARD_TX_LOCK(dev, txq, smp_processor_id()); 123 HARD_TX_LOCK(dev, txq, smp_processor_id());
118 if (netif_xmit_frozen_or_stopped(txq) || 124 if (netif_xmit_frozen_or_stopped(txq) ||
119 netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) { 125 netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) {
@@ -171,12 +177,12 @@ static void poll_one_napi(struct napi_struct *napi)
171static void poll_napi(struct net_device *dev) 177static void poll_napi(struct net_device *dev)
172{ 178{
173 struct napi_struct *napi; 179 struct napi_struct *napi;
180 int cpu = smp_processor_id();
174 181
175 list_for_each_entry(napi, &dev->napi_list, dev_list) { 182 list_for_each_entry(napi, &dev->napi_list, dev_list) {
176 if (napi->poll_owner != smp_processor_id() && 183 if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) {
177 spin_trylock(&napi->poll_lock)) {
178 poll_one_napi(napi); 184 poll_one_napi(napi);
179 spin_unlock(&napi->poll_lock); 185 smp_store_release(&napi->poll_owner, -1);
180 } 186 }
181 } 187 }
182} 188}
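
For context (not part of the patch): poll_napi() now claims a NAPI instance by atomically flipping poll_owner from -1 to the local CPU id and releases it with smp_store_release(), rather than taking poll_lock. The bare pattern, as an illustrative sketch (names made up, primitives from <linux/atomic.h>):

	/* true if this CPU now owns the context */
	static bool foo_try_claim(int *owner, int cpu)
	{
		return cmpxchg(owner, -1, cpu) == -1;
	}

	static void foo_release(int *owner)
	{
		/* pairs with the acquiring cmpxchg() in foo_try_claim() */
		smp_store_release(owner, -1);
	}
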
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 2ec86fc552df..0f9275ee5595 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -13,12 +13,15 @@
13 13
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/types.h> 15#include <linux/types.h>
16#include <linux/module.h>
16#include <linux/string.h> 17#include <linux/string.h>
17#include <linux/errno.h> 18#include <linux/errno.h>
18#include <linux/skbuff.h> 19#include <linux/skbuff.h>
19#include <linux/cgroup.h> 20#include <linux/cgroup.h>
20#include <linux/rcupdate.h> 21#include <linux/rcupdate.h>
21#include <linux/atomic.h> 22#include <linux/atomic.h>
23#include <linux/sched/task.h>
24
22#include <net/rtnetlink.h> 25#include <net/rtnetlink.h>
23#include <net/pkt_cls.h> 26#include <net/pkt_cls.h>
24#include <net/sock.h> 27#include <net/sock.h>
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 306b8f0e03c1..96947f5d41e4 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -413,7 +413,7 @@ struct pktgen_hdr {
413}; 413};
414 414
415 415
416static int pg_net_id __read_mostly; 416static unsigned int pg_net_id __read_mostly;
417 417
418struct pktgen_net { 418struct pktgen_net {
419 struct net *net; 419 struct net *net;
@@ -3439,9 +3439,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
3439 /* skb was 'freed' by stack, so clean few 3439 /* skb was 'freed' by stack, so clean few
3440 * bits and reuse it 3440 * bits and reuse it
3441 */ 3441 */
3442#ifdef CONFIG_NET_CLS_ACT 3442 skb_reset_tc(skb);
3443 skb->tc_verd = 0; /* reset reclass/redir ttl */
3444#endif
3445 } while (--burst > 0); 3443 } while (--burst > 0);
3446 goto out; /* Skips xmit_mode M_START_XMIT */ 3444 goto out; /* Skips xmit_mode M_START_XMIT */
3447 } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) { 3445 } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 5d26056b6d8f..9b8727c67b58 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -34,8 +34,6 @@
34 * and it will increase in proportion to the memory of machine. 34 * and it will increase in proportion to the memory of machine.
35 * Note : Dont forget somaxconn that may limit backlog too. 35 * Note : Dont forget somaxconn that may limit backlog too.
36 */ 36 */
37int sysctl_max_syn_backlog = 256;
38EXPORT_SYMBOL(sysctl_max_syn_backlog);
39 37
40void reqsk_queue_alloc(struct request_sock_queue *queue) 38void reqsk_queue_alloc(struct request_sock_queue *queue)
41{ 39{
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a6196cf844f6..c4e84c558240 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -40,7 +40,7 @@
40#include <linux/pci.h> 40#include <linux/pci.h>
41#include <linux/etherdevice.h> 41#include <linux/etherdevice.h>
42 42
43#include <asm/uaccess.h> 43#include <linux/uaccess.h>
44 44
45#include <linux/inet.h> 45#include <linux/inet.h>
46#include <linux/netdevice.h> 46#include <linux/netdevice.h>
@@ -837,8 +837,7 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
837static inline int rtnl_vfinfo_size(const struct net_device *dev, 837static inline int rtnl_vfinfo_size(const struct net_device *dev,
838 u32 ext_filter_mask) 838 u32 ext_filter_mask)
839{ 839{
840 if (dev->dev.parent && dev_is_pci(dev->dev.parent) && 840 if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) {
841 (ext_filter_mask & RTEXT_FILTER_VF)) {
842 int num_vfs = dev_num_vf(dev->dev.parent); 841 int num_vfs = dev_num_vf(dev->dev.parent);
843 size_t size = nla_total_size(0); 842 size_t size = nla_total_size(0);
844 size += num_vfs * 843 size += num_vfs *
@@ -877,8 +876,6 @@ static size_t rtnl_port_size(const struct net_device *dev,
877{ 876{
878 size_t port_size = nla_total_size(4) /* PORT_VF */ 877 size_t port_size = nla_total_size(4) /* PORT_VF */
879 + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */ 878 + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */
880 + nla_total_size(sizeof(struct ifla_port_vsi))
881 /* PORT_VSI_TYPE */
882 + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */ 879 + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */
883 + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */ 880 + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */
884 + nla_total_size(1) /* PROT_VDP_REQUEST */ 881 + nla_total_size(1) /* PROT_VDP_REQUEST */
@@ -1492,19 +1489,25 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
1492 [IFLA_PORT_VF] = { .type = NLA_U32 }, 1489 [IFLA_PORT_VF] = { .type = NLA_U32 },
1493 [IFLA_PORT_PROFILE] = { .type = NLA_STRING, 1490 [IFLA_PORT_PROFILE] = { .type = NLA_STRING,
1494 .len = PORT_PROFILE_MAX }, 1491 .len = PORT_PROFILE_MAX },
1495 [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY,
1496 .len = sizeof(struct ifla_port_vsi)},
1497 [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY, 1492 [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY,
1498 .len = PORT_UUID_MAX }, 1493 .len = PORT_UUID_MAX },
1499 [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING, 1494 [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING,
1500 .len = PORT_UUID_MAX }, 1495 .len = PORT_UUID_MAX },
1501 [IFLA_PORT_REQUEST] = { .type = NLA_U8, }, 1496 [IFLA_PORT_REQUEST] = { .type = NLA_U8, },
1502 [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, 1497 [IFLA_PORT_RESPONSE] = { .type = NLA_U16, },
1498
1499 /* Unused, but we need to keep it here since user space could
1500 * fill it. It's also broken with regard to NLA_BINARY use in
1501 * combination with structs.
1502 */
1503 [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY,
1504 .len = sizeof(struct ifla_port_vsi) },
1503}; 1505};
1504 1506
1505static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { 1507static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
1506 [IFLA_XDP_FD] = { .type = NLA_S32 }, 1508 [IFLA_XDP_FD] = { .type = NLA_S32 },
1507 [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, 1509 [IFLA_XDP_ATTACHED] = { .type = NLA_U8 },
1510 [IFLA_XDP_FLAGS] = { .type = NLA_U32 },
1508}; 1511};
1509 1512
1510static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) 1513static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
@@ -2164,6 +2167,7 @@ static int do_setlink(const struct sk_buff *skb,
2164 2167
2165 if (tb[IFLA_XDP]) { 2168 if (tb[IFLA_XDP]) {
2166 struct nlattr *xdp[IFLA_XDP_MAX + 1]; 2169 struct nlattr *xdp[IFLA_XDP_MAX + 1];
2170 u32 xdp_flags = 0;
2167 2171
2168 err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP], 2172 err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP],
2169 ifla_xdp_policy); 2173 ifla_xdp_policy);
@@ -2174,9 +2178,19 @@ static int do_setlink(const struct sk_buff *skb,
2174 err = -EINVAL; 2178 err = -EINVAL;
2175 goto errout; 2179 goto errout;
2176 } 2180 }
2181
2182 if (xdp[IFLA_XDP_FLAGS]) {
2183 xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]);
2184 if (xdp_flags & ~XDP_FLAGS_MASK) {
2185 err = -EINVAL;
2186 goto errout;
2187 }
2188 }
2189
2177 if (xdp[IFLA_XDP_FD]) { 2190 if (xdp[IFLA_XDP_FD]) {
2178 err = dev_change_xdp_fd(dev, 2191 err = dev_change_xdp_fd(dev,
2179 nla_get_s32(xdp[IFLA_XDP_FD])); 2192 nla_get_s32(xdp[IFLA_XDP_FD]),
2193 xdp_flags);
2180 if (err) 2194 if (err)
2181 goto errout; 2195 goto errout;
2182 status |= DO_SETLINK_NOTIFY; 2196 status |= DO_SETLINK_NOTIFY;
@@ -2344,7 +2358,6 @@ struct net_device *rtnl_create_link(struct net *net,
2344 const char *ifname, unsigned char name_assign_type, 2358 const char *ifname, unsigned char name_assign_type,
2345 const struct rtnl_link_ops *ops, struct nlattr *tb[]) 2359 const struct rtnl_link_ops *ops, struct nlattr *tb[])
2346{ 2360{
2347 int err;
2348 struct net_device *dev; 2361 struct net_device *dev;
2349 unsigned int num_tx_queues = 1; 2362 unsigned int num_tx_queues = 1;
2350 unsigned int num_rx_queues = 1; 2363 unsigned int num_rx_queues = 1;
@@ -2359,11 +2372,10 @@ struct net_device *rtnl_create_link(struct net *net,
2359 else if (ops->get_num_rx_queues) 2372 else if (ops->get_num_rx_queues)
2360 num_rx_queues = ops->get_num_rx_queues(); 2373 num_rx_queues = ops->get_num_rx_queues();
2361 2374
2362 err = -ENOMEM;
2363 dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, 2375 dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type,
2364 ops->setup, num_tx_queues, num_rx_queues); 2376 ops->setup, num_tx_queues, num_rx_queues);
2365 if (!dev) 2377 if (!dev)
2366 goto err; 2378 return ERR_PTR(-ENOMEM);
2367 2379
2368 dev_net_set(dev, net); 2380 dev_net_set(dev, net);
2369 dev->rtnl_link_ops = ops; 2381 dev->rtnl_link_ops = ops;
@@ -2389,9 +2401,6 @@ struct net_device *rtnl_create_link(struct net *net,
2389 dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); 2401 dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
2390 2402
2391 return dev; 2403 return dev;
2392
2393err:
2394 return ERR_PTR(err);
2395} 2404}
2396EXPORT_SYMBOL(rtnl_create_link); 2405EXPORT_SYMBOL(rtnl_create_link);
2397 2406
@@ -2559,7 +2568,7 @@ replay:
2559 return -ENODEV; 2568 return -ENODEV;
2560 } 2569 }
2561 2570
2562 if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO]) 2571 if (tb[IFLA_MAP] || tb[IFLA_PROTINFO])
2563 return -EOPNOTSUPP; 2572 return -EOPNOTSUPP;
2564 2573
2565 if (!ops) { 2574 if (!ops) {
@@ -2641,6 +2650,11 @@ replay:
2641 if (err < 0) 2650 if (err < 0)
2642 goto out_unregister; 2651 goto out_unregister;
2643 } 2652 }
2653 if (tb[IFLA_MASTER]) {
2654 err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]));
2655 if (err)
2656 goto out_unregister;
2657 }
2644out: 2658out:
2645 if (link_net) 2659 if (link_net)
2646 put_net(link_net); 2660 put_net(link_net);
@@ -3165,7 +3179,7 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb,
3165 err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc); 3179 err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc);
3166 if (err) 3180 if (err)
3167 goto out; 3181 goto out;
3168 nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc); 3182 err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc);
3169out: 3183out:
3170 netif_addr_unlock_bh(dev); 3184 netif_addr_unlock_bh(dev);
3171 return err; 3185 return err;
@@ -3671,7 +3685,7 @@ static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev,
3671 if (!size) 3685 if (!size)
3672 continue; 3686 continue;
3673 3687
3674 if (!dev->netdev_ops->ndo_has_offload_stats(attr_id)) 3688 if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
3675 continue; 3689 continue;
3676 3690
3677 attr = nla_reserve_64bit(skb, attr_id, size, 3691 attr = nla_reserve_64bit(skb, attr_id, size,
@@ -3712,7 +3726,7 @@ static int rtnl_get_offload_stats_size(const struct net_device *dev)
3712 3726
3713 for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; 3727 for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
3714 attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { 3728 attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
3715 if (!dev->netdev_ops->ndo_has_offload_stats(attr_id)) 3729 if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
3716 continue; 3730 continue;
3717 size = rtnl_get_offload_stats_attr_size(attr_id); 3731 size = rtnl_get_offload_stats_attr_size(attr_id);
3718 nla_size += nla_total_size_64bit(size); 3732 nla_size += nla_total_size_64bit(size);
@@ -3817,6 +3831,39 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
3817 *idxattr = 0; 3831 *idxattr = 0;
3818 } 3832 }
3819 3833
3834 if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) {
3835 struct rtnl_af_ops *af_ops;
3836
3837 *idxattr = IFLA_STATS_AF_SPEC;
3838 attr = nla_nest_start(skb, IFLA_STATS_AF_SPEC);
3839 if (!attr)
3840 goto nla_put_failure;
3841
3842 list_for_each_entry(af_ops, &rtnl_af_ops, list) {
3843 if (af_ops->fill_stats_af) {
3844 struct nlattr *af;
3845 int err;
3846
3847 af = nla_nest_start(skb, af_ops->family);
3848 if (!af)
3849 goto nla_put_failure;
3850
3851 err = af_ops->fill_stats_af(skb, dev);
3852
3853 if (err == -ENODATA)
3854 nla_nest_cancel(skb, af);
3855 else if (err < 0)
3856 goto nla_put_failure;
3857
3858 nla_nest_end(skb, af);
3859 }
3860 }
3861
3862 nla_nest_end(skb, attr);
3863
3864 *idxattr = 0;
3865 }
3866
3820 nlmsg_end(skb, nlh); 3867 nlmsg_end(skb, nlh);
3821 3868
3822 return 0; 3869 return 0;
@@ -3873,6 +3920,23 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
3873 if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) 3920 if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
3874 size += rtnl_get_offload_stats_size(dev); 3921 size += rtnl_get_offload_stats_size(dev);
3875 3922
3923 if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) {
3924 struct rtnl_af_ops *af_ops;
3925
3926 /* for IFLA_STATS_AF_SPEC */
3927 size += nla_total_size(0);
3928
3929 list_for_each_entry(af_ops, &rtnl_af_ops, list) {
3930 if (af_ops->get_stats_af_size) {
3931 size += nla_total_size(
3932 af_ops->get_stats_af_size(dev));
3933
3934 /* for AF_* */
3935 size += nla_total_size(0);
3936 }
3937 }
3938 }
3939
3876 return size; 3940 return size;
3877} 3941}
3878 3942
@@ -3886,6 +3950,9 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh)
3886 u32 filter_mask; 3950 u32 filter_mask;
3887 int err; 3951 int err;
3888 3952
3953 if (nlmsg_len(nlh) < sizeof(*ifsm))
3954 return -EINVAL;
3955
3889 ifsm = nlmsg_data(nlh); 3956 ifsm = nlmsg_data(nlh);
3890 if (ifsm->ifindex > 0) 3957 if (ifsm->ifindex > 0)
3891 dev = __dev_get_by_index(net, ifsm->ifindex); 3958 dev = __dev_get_by_index(net, ifsm->ifindex);
@@ -3935,6 +4002,9 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
3935 4002
3936 cb->seq = net->dev_base_seq; 4003 cb->seq = net->dev_base_seq;
3937 4004
4005 if (nlmsg_len(cb->nlh) < sizeof(*ifsm))
4006 return -EINVAL;
4007
3938 ifsm = nlmsg_data(cb->nlh); 4008 ifsm = nlmsg_data(cb->nlh);
3939 filter_mask = ifsm->filter_mask; 4009 filter_mask = ifsm->filter_mask;
3940 if (!filter_mask) 4010 if (!filter_mask)
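
For context (not part of the patch): the IFLA_STATS_AF_SPEC nest added above only carries data for address families that implement the two new rtnl_af_ops hooks. A hedged sketch of what such an implementation might look like (the "foo" names, attribute type and single counter are illustrative, not an in-tree user):

	static size_t foo_get_stats_af_size(const struct net_device *dev)
	{
		/* worst-case size of what foo_fill_stats_af() may emit */
		return nla_total_size_64bit(sizeof(u64));
	}

	static int foo_fill_stats_af(struct sk_buff *skb,
				     const struct net_device *dev)
	{
		u64 counter = 0;	/* gather the real per-device value */

		/* returning -ENODATA would make rtnl_fill_statsinfo()
		 * cancel the per-family nest instead of failing the dump
		 */
		if (nla_put_u64_64bit(skb, 1 /* illustrative attr type */,
				      counter, 0 /* illustrative pad attr */))
			return -EMSGSIZE;
		return 0;
	}

	static struct rtnl_af_ops foo_af_ops = {
		.family            = AF_INET6,	/* whichever family this is */
		.fill_stats_af     = foo_fill_stats_af,
		.get_stats_af_size = foo_get_stats_af_size,
	};

	/* hooked up with rtnl_af_register(&foo_af_ops) */
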
diff --git a/net/core/scm.c b/net/core/scm.c
index 2696aefdc148..b1ff8a441748 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -14,6 +14,7 @@
14#include <linux/capability.h> 14#include <linux/capability.h>
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/sched/user.h>
17#include <linux/mm.h> 18#include <linux/mm.h>
18#include <linux/kernel.h> 19#include <linux/kernel.h>
19#include <linux/stat.h> 20#include <linux/stat.h>
@@ -29,7 +30,7 @@
29#include <linux/nsproxy.h> 30#include <linux/nsproxy.h>
30#include <linux/slab.h> 31#include <linux/slab.h>
31 32
32#include <asm/uaccess.h> 33#include <linux/uaccess.h>
33 34
34#include <net/protocol.h> 35#include <net/protocol.h>
35#include <linux/skbuff.h> 36#include <linux/skbuff.h>
@@ -71,7 +72,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
71 struct file **fpp; 72 struct file **fpp;
72 int i, num; 73 int i, num;
73 74
74 num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int); 75 num = (cmsg->cmsg_len - sizeof(struct cmsghdr))/sizeof(int);
75 76
76 if (num <= 0) 77 if (num <= 0)
77 return 0; 78 return 0;
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index fd3ce461fbe6..d28da7d363f1 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -1,3 +1,7 @@
1/*
2 * Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
3 */
4
1#include <linux/kernel.h> 5#include <linux/kernel.h>
2#include <linux/init.h> 6#include <linux/init.h>
3#include <linux/cryptohash.h> 7#include <linux/cryptohash.h>
@@ -8,17 +12,20 @@
8#include <linux/ktime.h> 12#include <linux/ktime.h>
9#include <linux/string.h> 13#include <linux/string.h>
10#include <linux/net.h> 14#include <linux/net.h>
11 15#include <linux/siphash.h>
12#include <net/secure_seq.h> 16#include <net/secure_seq.h>
13 17
14#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET) 18#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET)
15#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4) 19#include <linux/in6.h>
20#include <net/tcp.h>
16 21
17static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned; 22static siphash_key_t net_secret __read_mostly;
23static siphash_key_t ts_secret __read_mostly;
18 24
19static __always_inline void net_secret_init(void) 25static __always_inline void net_secret_init(void)
20{ 26{
21 net_get_random_once(net_secret, sizeof(net_secret)); 27 net_get_random_once(&ts_secret, sizeof(ts_secret));
28 net_get_random_once(&net_secret, sizeof(net_secret));
22} 29}
23#endif 30#endif
24 31
@@ -40,81 +47,98 @@ static u32 seq_scale(u32 seq)
40#endif 47#endif
41 48
42#if IS_ENABLED(CONFIG_IPV6) 49#if IS_ENABLED(CONFIG_IPV6)
43__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, 50static u32 secure_tcpv6_ts_off(const __be32 *saddr, const __be32 *daddr)
44 __be16 sport, __be16 dport)
45{ 51{
46 u32 secret[MD5_MESSAGE_BYTES / 4]; 52 const struct {
47 u32 hash[MD5_DIGEST_WORDS]; 53 struct in6_addr saddr;
48 u32 i; 54 struct in6_addr daddr;
55 } __aligned(SIPHASH_ALIGNMENT) combined = {
56 .saddr = *(struct in6_addr *)saddr,
57 .daddr = *(struct in6_addr *)daddr,
58 };
59
60 if (sysctl_tcp_timestamps != 1)
61 return 0;
62
63 return siphash(&combined, offsetofend(typeof(combined), daddr),
64 &ts_secret);
65}
49 66
67u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
68 __be16 sport, __be16 dport, u32 *tsoff)
69{
70 const struct {
71 struct in6_addr saddr;
72 struct in6_addr daddr;
73 __be16 sport;
74 __be16 dport;
75 } __aligned(SIPHASH_ALIGNMENT) combined = {
76 .saddr = *(struct in6_addr *)saddr,
77 .daddr = *(struct in6_addr *)daddr,
78 .sport = sport,
79 .dport = dport
80 };
81 u64 hash;
50 net_secret_init(); 82 net_secret_init();
51 memcpy(hash, saddr, 16); 83 hash = siphash(&combined, offsetofend(typeof(combined), dport),
52 for (i = 0; i < 4; i++) 84 &net_secret);
53 secret[i] = net_secret[i] + (__force u32)daddr[i]; 85 *tsoff = secure_tcpv6_ts_off(saddr, daddr);
54 secret[4] = net_secret[4] + 86 return seq_scale(hash);
55 (((__force u16)sport << 16) + (__force u16)dport);
56 for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
57 secret[i] = net_secret[i];
58
59 md5_transform(hash, secret);
60
61 return seq_scale(hash[0]);
62} 87}
63EXPORT_SYMBOL(secure_tcpv6_sequence_number); 88EXPORT_SYMBOL(secure_tcpv6_sequence_number);
64 89
65u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, 90u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
66 __be16 dport) 91 __be16 dport)
67{ 92{
68 u32 secret[MD5_MESSAGE_BYTES / 4]; 93 const struct {
69 u32 hash[MD5_DIGEST_WORDS]; 94 struct in6_addr saddr;
70 u32 i; 95 struct in6_addr daddr;
71 96 __be16 dport;
97 } __aligned(SIPHASH_ALIGNMENT) combined = {
98 .saddr = *(struct in6_addr *)saddr,
99 .daddr = *(struct in6_addr *)daddr,
100 .dport = dport
101 };
72 net_secret_init(); 102 net_secret_init();
73 memcpy(hash, saddr, 16); 103 return siphash(&combined, offsetofend(typeof(combined), dport),
74 for (i = 0; i < 4; i++) 104 &net_secret);
75 secret[i] = net_secret[i] + (__force u32) daddr[i];
76 secret[4] = net_secret[4] + (__force u32)dport;
77 for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
78 secret[i] = net_secret[i];
79
80 md5_transform(hash, secret);
81
82 return hash[0];
83} 105}
84EXPORT_SYMBOL(secure_ipv6_port_ephemeral); 106EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
85#endif 107#endif
86 108
87#ifdef CONFIG_INET 109#ifdef CONFIG_INET
88 110static u32 secure_tcp_ts_off(__be32 saddr, __be32 daddr)
89__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
90 __be16 sport, __be16 dport)
91{ 111{
92 u32 hash[MD5_DIGEST_WORDS]; 112 if (sysctl_tcp_timestamps != 1)
113 return 0;
93 114
94 net_secret_init(); 115 return siphash_2u32((__force u32)saddr, (__force u32)daddr,
95 hash[0] = (__force u32)saddr; 116 &ts_secret);
96 hash[1] = (__force u32)daddr; 117}
97 hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
98 hash[3] = net_secret[15];
99 118
100 md5_transform(hash, net_secret); 119/* secure_tcp_sequence_number(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d),
120 * but fortunately, `sport' cannot be 0 in any circumstances. If this changes,
121 * it would be easy enough to have the former function use siphash_4u32, passing
122 * the arguments as separate u32.
123 */
101 124
102 return seq_scale(hash[0]); 125u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
126 __be16 sport, __be16 dport, u32 *tsoff)
127{
128 u64 hash;
129 net_secret_init();
130 hash = siphash_3u32((__force u32)saddr, (__force u32)daddr,
131 (__force u32)sport << 16 | (__force u32)dport,
132 &net_secret);
133 *tsoff = secure_tcp_ts_off(saddr, daddr);
134 return seq_scale(hash);
103} 135}
104 136
105u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) 137u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
106{ 138{
107 u32 hash[MD5_DIGEST_WORDS];
108
109 net_secret_init(); 139 net_secret_init();
110 hash[0] = (__force u32)saddr; 140 return siphash_3u32((__force u32)saddr, (__force u32)daddr,
111 hash[1] = (__force u32)daddr; 141 (__force u16)dport, &net_secret);
112 hash[2] = (__force u32)dport ^ net_secret[14];
113 hash[3] = net_secret[15];
114
115 md5_transform(hash, net_secret);
116
117 return hash[0];
118} 142}
119EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); 143EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
120#endif 144#endif
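The comment above secure_tcp_sequence_number() points out that, with sport == 0, the sequence-number hash and the ephemeral-port hash would be fed identical inputs. A minimal userspace sketch of that overlap (plain C with illustrative values, not kernel code):

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint16_t sport = 0, dport = 443;

		/* Third word handed to siphash_3u32() on the sequence-number path... */
		uint32_t seq_word = (uint32_t)sport << 16 | dport;
		/* ...and on the ephemeral-port path. */
		uint32_t port_word = dport;

		/* With sport == 0 both call sites would hash the same three words. */
		assert(seq_word == port_word);
		return 0;
	}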
@@ -123,21 +147,13 @@ EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
123u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, 147u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
124 __be16 sport, __be16 dport) 148 __be16 sport, __be16 dport)
125{ 149{
126 u32 hash[MD5_DIGEST_WORDS];
127 u64 seq; 150 u64 seq;
128
129 net_secret_init(); 151 net_secret_init();
130 hash[0] = (__force u32)saddr; 152 seq = siphash_3u32((__force u32)saddr, (__force u32)daddr,
131 hash[1] = (__force u32)daddr; 153 (__force u32)sport << 16 | (__force u32)dport,
132 hash[2] = ((__force u16)sport << 16) + (__force u16)dport; 154 &net_secret);
133 hash[3] = net_secret[15];
134
135 md5_transform(hash, net_secret);
136
137 seq = hash[0] | (((u64)hash[1]) << 32);
138 seq += ktime_get_real_ns(); 155 seq += ktime_get_real_ns();
139 seq &= (1ull << 48) - 1; 156 seq &= (1ull << 48) - 1;
140
141 return seq; 157 return seq;
142} 158}
143EXPORT_SYMBOL(secure_dccp_sequence_number); 159EXPORT_SYMBOL(secure_dccp_sequence_number);
@@ -146,26 +162,23 @@ EXPORT_SYMBOL(secure_dccp_sequence_number);
146u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, 162u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
147 __be16 sport, __be16 dport) 163 __be16 sport, __be16 dport)
148{ 164{
149 u32 secret[MD5_MESSAGE_BYTES / 4]; 165 const struct {
150 u32 hash[MD5_DIGEST_WORDS]; 166 struct in6_addr saddr;
167 struct in6_addr daddr;
168 __be16 sport;
169 __be16 dport;
170 } __aligned(SIPHASH_ALIGNMENT) combined = {
171 .saddr = *(struct in6_addr *)saddr,
172 .daddr = *(struct in6_addr *)daddr,
173 .sport = sport,
174 .dport = dport
175 };
151 u64 seq; 176 u64 seq;
152 u32 i;
153
154 net_secret_init(); 177 net_secret_init();
155 memcpy(hash, saddr, 16); 178 seq = siphash(&combined, offsetofend(typeof(combined), dport),
156 for (i = 0; i < 4; i++) 179 &net_secret);
157 secret[i] = net_secret[i] + (__force u32)daddr[i];
158 secret[4] = net_secret[4] +
159 (((__force u16)sport << 16) + (__force u16)dport);
160 for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
161 secret[i] = net_secret[i];
162
163 md5_transform(hash, secret);
164
165 seq = hash[0] | (((u64)hash[1]) << 32);
166 seq += ktime_get_real_ns(); 180 seq += ktime_get_real_ns();
167 seq &= (1ull << 48) - 1; 181 seq &= (1ull << 48) - 1;
168
169 return seq; 182 return seq;
170} 183}
171EXPORT_SYMBOL(secure_dccpv6_sequence_number); 184EXPORT_SYMBOL(secure_dccpv6_sequence_number);
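All of the IPv6 helpers converted above share one pattern: the addresses and ports are packed into an on-stack struct aligned to SIPHASH_ALIGNMENT, and only the bytes up to the last member are hashed, with offsetofend() keeping any trailing padding out of the siphash() input. A standalone sketch of that bounding trick (userspace C on a typical LP64 target; offsetofend() is redefined locally and the struct members are stand-ins, not the kernel layout):

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	#define offsetofend(type, member) \
		(offsetof(type, member) + sizeof(((type *)0)->member))

	/* Stand-in for the on-stack key material hashed above. */
	struct combined {
		uint64_t saddr;
		uint64_t daddr;
		uint16_t dport;	/* compiler pads the struct out to 24 bytes */
	};

	int main(void)
	{
		/* Only offsetofend(..., dport) == 18 bytes would be hashed, so the
		 * 6 bytes of trailing padding never reach the hash function. */
		printf("hash %zu of %zu bytes\n",
		       offsetofend(struct combined, dport),
		       sizeof(struct combined));
		return 0;
	}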
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 1e3e0087245b..f86bf69cfb8d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -72,7 +72,7 @@
72#include <net/ip6_checksum.h> 72#include <net/ip6_checksum.h>
73#include <net/xfrm.h> 73#include <net/xfrm.h>
74 74
75#include <asm/uaccess.h> 75#include <linux/uaccess.h>
76#include <trace/events/skb.h> 76#include <trace/events/skb.h>
77#include <linux/highmem.h> 77#include <linux/highmem.h>
78#include <linux/capability.h> 78#include <linux/capability.h>
@@ -271,7 +271,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
271 atomic_set(&fclones->fclone_ref, 1); 271 atomic_set(&fclones->fclone_ref, 1);
272 272
273 fclones->skb2.fclone = SKB_FCLONE_CLONE; 273 fclones->skb2.fclone = SKB_FCLONE_CLONE;
274 fclones->skb2.pfmemalloc = pfmemalloc;
275 } 274 }
276out: 275out:
277 return skb; 276 return skb;
@@ -354,7 +353,7 @@ EXPORT_SYMBOL(build_skb);
354 353
355struct napi_alloc_cache { 354struct napi_alloc_cache {
356 struct page_frag_cache page; 355 struct page_frag_cache page;
357 size_t skb_count; 356 unsigned int skb_count;
358 void *skb_cache[NAPI_SKB_CACHE_SIZE]; 357 void *skb_cache[NAPI_SKB_CACHE_SIZE];
359}; 358};
360 359
@@ -369,7 +368,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
369 368
370 local_irq_save(flags); 369 local_irq_save(flags);
371 nc = this_cpu_ptr(&netdev_alloc_cache); 370 nc = this_cpu_ptr(&netdev_alloc_cache);
372 data = __alloc_page_frag(nc, fragsz, gfp_mask); 371 data = page_frag_alloc(nc, fragsz, gfp_mask);
373 local_irq_restore(flags); 372 local_irq_restore(flags);
374 return data; 373 return data;
375} 374}
@@ -391,7 +390,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
391{ 390{
392 struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); 391 struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
393 392
394 return __alloc_page_frag(&nc->page, fragsz, gfp_mask); 393 return page_frag_alloc(&nc->page, fragsz, gfp_mask);
395} 394}
396 395
397void *napi_alloc_frag(unsigned int fragsz) 396void *napi_alloc_frag(unsigned int fragsz)
@@ -441,7 +440,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
441 local_irq_save(flags); 440 local_irq_save(flags);
442 441
443 nc = this_cpu_ptr(&netdev_alloc_cache); 442 nc = this_cpu_ptr(&netdev_alloc_cache);
444 data = __alloc_page_frag(nc, len, gfp_mask); 443 data = page_frag_alloc(nc, len, gfp_mask);
445 pfmemalloc = nc->pfmemalloc; 444 pfmemalloc = nc->pfmemalloc;
446 445
447 local_irq_restore(flags); 446 local_irq_restore(flags);
@@ -505,7 +504,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
505 if (sk_memalloc_socks()) 504 if (sk_memalloc_socks())
506 gfp_mask |= __GFP_MEMALLOC; 505 gfp_mask |= __GFP_MEMALLOC;
507 506
508 data = __alloc_page_frag(&nc->page, len, gfp_mask); 507 data = page_frag_alloc(&nc->page, len, gfp_mask);
509 if (unlikely(!data)) 508 if (unlikely(!data))
510 return NULL; 509 return NULL;
511 510
@@ -655,7 +654,7 @@ static void skb_release_head_state(struct sk_buff *skb)
655 skb->destructor(skb); 654 skb->destructor(skb);
656 } 655 }
657#if IS_ENABLED(CONFIG_NF_CONNTRACK) 656#if IS_ENABLED(CONFIG_NF_CONNTRACK)
658 nf_conntrack_put(skb->nfct); 657 nf_conntrack_put(skb_nfct(skb));
659#endif 658#endif
660#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 659#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
661 nf_bridge_put(skb->nf_bridge); 660 nf_bridge_put(skb->nf_bridge);
@@ -878,9 +877,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
878#endif 877#endif
879#ifdef CONFIG_NET_SCHED 878#ifdef CONFIG_NET_SCHED
880 CHECK_SKB_FIELD(tc_index); 879 CHECK_SKB_FIELD(tc_index);
881#ifdef CONFIG_NET_CLS_ACT
882 CHECK_SKB_FIELD(tc_verd);
883#endif
884#endif 880#endif
885 881
886} 882}
@@ -1195,10 +1191,10 @@ EXPORT_SYMBOL(__pskb_copy_fclone);
1195int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, 1191int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
1196 gfp_t gfp_mask) 1192 gfp_t gfp_mask)
1197{ 1193{
1198 int i; 1194 int i, osize = skb_end_offset(skb);
1199 u8 *data; 1195 int size = osize + nhead + ntail;
1200 int size = nhead + skb_end_offset(skb) + ntail;
1201 long off; 1196 long off;
1197 u8 *data;
1202 1198
1203 BUG_ON(nhead < 0); 1199 BUG_ON(nhead < 0);
1204 1200
@@ -1260,6 +1256,14 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
1260 skb->hdr_len = 0; 1256 skb->hdr_len = 0;
1261 skb->nohdr = 0; 1257 skb->nohdr = 0;
1262 atomic_set(&skb_shinfo(skb)->dataref, 1); 1258 atomic_set(&skb_shinfo(skb)->dataref, 1);
1259
1260 /* It is not generally safe to change skb->truesize.
 1261	 * For the moment, we only really care about the rx path, or
1262 * when skb is orphaned (not attached to a socket).
1263 */
1264 if (!skb->sk || skb->destructor == sock_edemux)
1265 skb->truesize += size - osize;
1266
1263 return 0; 1267 return 0;
1264 1268
1265nofrags: 1269nofrags:
@@ -2656,7 +2660,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
2656 struct skb_frag_struct *fragfrom, *fragto; 2660 struct skb_frag_struct *fragfrom, *fragto;
2657 2661
2658 BUG_ON(shiftlen > skb->len); 2662 BUG_ON(shiftlen > skb->len);
2659 BUG_ON(skb_headlen(skb)); /* Would corrupt stream */ 2663
2664 if (skb_headlen(skb))
2665 return 0;
2660 2666
2661 todo = shiftlen; 2667 todo = shiftlen;
2662 from = 0; 2668 from = 0;
@@ -3076,22 +3082,32 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
3076 if (sg && csum && (mss != GSO_BY_FRAGS)) { 3082 if (sg && csum && (mss != GSO_BY_FRAGS)) {
3077 if (!(features & NETIF_F_GSO_PARTIAL)) { 3083 if (!(features & NETIF_F_GSO_PARTIAL)) {
3078 struct sk_buff *iter; 3084 struct sk_buff *iter;
3085 unsigned int frag_len;
3079 3086
3080 if (!list_skb || 3087 if (!list_skb ||
3081 !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) 3088 !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
3082 goto normal; 3089 goto normal;
3083 3090
3084 /* Split the buffer at the frag_list pointer. 3091 /* If we get here then all the required
3085 * This is based on the assumption that all 3092 * GSO features except frag_list are supported.
 3086 * buffers in the chain excluding the last 3093 * Try to split the SKB into multiple GSO SKBs
3087 * containing the same amount of data. 3094 * with no frag_list.
3095 * Currently we can do that only when the buffers don't
3096 * have a linear part and all the buffers except
3097 * the last are of the same length.
3088 */ 3098 */
3099 frag_len = list_skb->len;
3089 skb_walk_frags(head_skb, iter) { 3100 skb_walk_frags(head_skb, iter) {
3101 if (frag_len != iter->len && iter->next)
3102 goto normal;
3090 if (skb_headlen(iter)) 3103 if (skb_headlen(iter))
3091 goto normal; 3104 goto normal;
3092 3105
3093 len -= iter->len; 3106 len -= iter->len;
3094 } 3107 }
3108
3109 if (len != frag_len)
3110 goto normal;
3095 } 3111 }
3096 3112
3097 /* GSO partial only requires that we trim off any excess that 3113 /* GSO partial only requires that we trim off any excess that
@@ -3688,6 +3704,15 @@ static void sock_rmem_free(struct sk_buff *skb)
3688 atomic_sub(skb->truesize, &sk->sk_rmem_alloc); 3704 atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
3689} 3705}
3690 3706
3707static void skb_set_err_queue(struct sk_buff *skb)
3708{
3709 /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING.
3710 * So, it is safe to (mis)use it to mark skbs on the error queue.
3711 */
3712 skb->pkt_type = PACKET_OUTGOING;
3713 BUILD_BUG_ON(PACKET_OUTGOING == 0);
3714}
3715
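skb_set_err_queue() relies on PACKET_OUTGOING being non-zero so the marker can never collide with a zero-initialized pkt_type; the BUILD_BUG_ON() turns that assumption into a build failure. A rough standalone equivalent with C11 static_assert (the constant value here is a stand-in, not taken from the uapi header):

	#include <assert.h>	/* static_assert */

	#define PACKET_OUTGOING_SKETCH 4	/* stand-in for the uapi constant */

	/* Break the build, rather than misbehave at runtime, if the marker value
	 * could ever be confused with the zero-initialized default. */
	static_assert(PACKET_OUTGOING_SKETCH != 0,
		      "error-queue marker must be non-zero");

	int main(void) { return 0; }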
3691/* 3716/*
3692 * Note: We dont mem charge error packets (no sk_forward_alloc changes) 3717 * Note: We dont mem charge error packets (no sk_forward_alloc changes)
3693 */ 3718 */
@@ -3701,6 +3726,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
3701 skb->sk = sk; 3726 skb->sk = sk;
3702 skb->destructor = sock_rmem_free; 3727 skb->destructor = sock_rmem_free;
3703 atomic_add(skb->truesize, &sk->sk_rmem_alloc); 3728 atomic_add(skb->truesize, &sk->sk_rmem_alloc);
3729 skb_set_err_queue(skb);
3704 3730
3705 /* before exiting rcu section, make sure dst is refcounted */ 3731 /* before exiting rcu section, make sure dst is refcounted */
3706 skb_dst_force(skb); 3732 skb_dst_force(skb);
@@ -3712,21 +3738,29 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
3712} 3738}
3713EXPORT_SYMBOL(sock_queue_err_skb); 3739EXPORT_SYMBOL(sock_queue_err_skb);
3714 3740
3741static bool is_icmp_err_skb(const struct sk_buff *skb)
3742{
3743 return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
3744 SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6);
3745}
3746
3715struct sk_buff *sock_dequeue_err_skb(struct sock *sk) 3747struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
3716{ 3748{
3717 struct sk_buff_head *q = &sk->sk_error_queue; 3749 struct sk_buff_head *q = &sk->sk_error_queue;
3718 struct sk_buff *skb, *skb_next; 3750 struct sk_buff *skb, *skb_next = NULL;
3751 bool icmp_next = false;
3719 unsigned long flags; 3752 unsigned long flags;
3720 int err = 0;
3721 3753
3722 spin_lock_irqsave(&q->lock, flags); 3754 spin_lock_irqsave(&q->lock, flags);
3723 skb = __skb_dequeue(q); 3755 skb = __skb_dequeue(q);
3724 if (skb && (skb_next = skb_peek(q))) 3756 if (skb && (skb_next = skb_peek(q)))
3725 err = SKB_EXT_ERR(skb_next)->ee.ee_errno; 3757 icmp_next = is_icmp_err_skb(skb_next);
3726 spin_unlock_irqrestore(&q->lock, flags); 3758 spin_unlock_irqrestore(&q->lock, flags);
3727 3759
3728 sk->sk_err = err; 3760 if (is_icmp_err_skb(skb) && !icmp_next)
3729 if (err) 3761 sk->sk_err = 0;
3762
3763 if (skb_next)
3730 sk->sk_error_report(sk); 3764 sk->sk_error_report(sk);
3731 3765
3732 return skb; 3766 return skb;
@@ -3769,16 +3803,21 @@ EXPORT_SYMBOL(skb_clone_sk);
3769 3803
3770static void __skb_complete_tx_timestamp(struct sk_buff *skb, 3804static void __skb_complete_tx_timestamp(struct sk_buff *skb,
3771 struct sock *sk, 3805 struct sock *sk,
3772 int tstype) 3806 int tstype,
3807 bool opt_stats)
3773{ 3808{
3774 struct sock_exterr_skb *serr; 3809 struct sock_exterr_skb *serr;
3775 int err; 3810 int err;
3776 3811
3812 BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb));
3813
3777 serr = SKB_EXT_ERR(skb); 3814 serr = SKB_EXT_ERR(skb);
3778 memset(serr, 0, sizeof(*serr)); 3815 memset(serr, 0, sizeof(*serr));
3779 serr->ee.ee_errno = ENOMSG; 3816 serr->ee.ee_errno = ENOMSG;
3780 serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; 3817 serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
3781 serr->ee.ee_info = tstype; 3818 serr->ee.ee_info = tstype;
3819 serr->opt_stats = opt_stats;
3820 serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
3782 if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { 3821 if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
3783 serr->ee.ee_data = skb_shinfo(skb)->tskey; 3822 serr->ee.ee_data = skb_shinfo(skb)->tskey;
3784 if (sk->sk_protocol == IPPROTO_TCP && 3823 if (sk->sk_protocol == IPPROTO_TCP &&
@@ -3814,13 +3853,14 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
3814 if (!skb_may_tx_timestamp(sk, false)) 3853 if (!skb_may_tx_timestamp(sk, false))
3815 return; 3854 return;
3816 3855
3817 /* take a reference to prevent skb_orphan() from freeing the socket */ 3856 /* Take a reference to prevent skb_orphan() from freeing the socket,
3818 sock_hold(sk); 3857 * but only if the socket refcount is not zero.
3819 3858 */
3820 *skb_hwtstamps(skb) = *hwtstamps; 3859 if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) {
3821 __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); 3860 *skb_hwtstamps(skb) = *hwtstamps;
3822 3861 __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
3823 sock_put(sk); 3862 sock_put(sk);
3863 }
3824} 3864}
3825EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); 3865EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
3826 3866
@@ -3829,7 +3869,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
3829 struct sock *sk, int tstype) 3869 struct sock *sk, int tstype)
3830{ 3870{
3831 struct sk_buff *skb; 3871 struct sk_buff *skb;
3832 bool tsonly; 3872 bool tsonly, opt_stats = false;
3833 3873
3834 if (!sk) 3874 if (!sk)
3835 return; 3875 return;
@@ -3838,10 +3878,19 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
3838 if (!skb_may_tx_timestamp(sk, tsonly)) 3878 if (!skb_may_tx_timestamp(sk, tsonly))
3839 return; 3879 return;
3840 3880
3841 if (tsonly) 3881 if (tsonly) {
3842 skb = alloc_skb(0, GFP_ATOMIC); 3882#ifdef CONFIG_INET
3843 else 3883 if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
3884 sk->sk_protocol == IPPROTO_TCP &&
3885 sk->sk_type == SOCK_STREAM) {
3886 skb = tcp_get_timestamping_opt_stats(sk);
3887 opt_stats = true;
3888 } else
3889#endif
3890 skb = alloc_skb(0, GFP_ATOMIC);
3891 } else {
3844 skb = skb_clone(orig_skb, GFP_ATOMIC); 3892 skb = skb_clone(orig_skb, GFP_ATOMIC);
3893 }
3845 if (!skb) 3894 if (!skb)
3846 return; 3895 return;
3847 3896
@@ -3855,7 +3904,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
3855 else 3904 else
3856 skb->tstamp = ktime_get_real(); 3905 skb->tstamp = ktime_get_real();
3857 3906
3858 __skb_complete_tx_timestamp(skb, sk, tstype); 3907 __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
3859} 3908}
3860EXPORT_SYMBOL_GPL(__skb_tstamp_tx); 3909EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
3861 3910
@@ -3871,7 +3920,7 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
3871{ 3920{
3872 struct sock *sk = skb->sk; 3921 struct sock *sk = skb->sk;
3873 struct sock_exterr_skb *serr; 3922 struct sock_exterr_skb *serr;
3874 int err; 3923 int err = 1;
3875 3924
3876 skb->wifi_acked_valid = 1; 3925 skb->wifi_acked_valid = 1;
3877 skb->wifi_acked = acked; 3926 skb->wifi_acked = acked;
@@ -3881,14 +3930,15 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
3881 serr->ee.ee_errno = ENOMSG; 3930 serr->ee.ee_errno = ENOMSG;
3882 serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; 3931 serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
3883 3932
3884 /* take a reference to prevent skb_orphan() from freeing the socket */ 3933 /* Take a reference to prevent skb_orphan() from freeing the socket,
3885 sock_hold(sk); 3934 * but only if the socket refcount is not zero.
3886 3935 */
3887 err = sock_queue_err_skb(sk, skb); 3936 if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) {
3937 err = sock_queue_err_skb(sk, skb);
3938 sock_put(sk);
3939 }
3888 if (err) 3940 if (err)
3889 kfree_skb(skb); 3941 kfree_skb(skb);
3890
3891 sock_put(sk);
3892} 3942}
3893EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); 3943EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
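Both completion paths above now take the socket reference with atomic_inc_not_zero(), i.e. only while the refcount is still non-zero, instead of an unconditional sock_hold(). A sketch of that pattern using C11 atomics (illustrative only; the kernel's atomic_t helpers differ in detail):

	#include <stdatomic.h>
	#include <stdbool.h>

	static bool get_ref_if_live(atomic_int *refcnt)
	{
		int old = atomic_load(refcnt);

		while (old != 0) {
			/* Bump the count only if it is still non-zero; a zero count
			 * means the object is being torn down and must not be revived. */
			if (atomic_compare_exchange_weak(refcnt, &old, old + 1))
				return true;
		}
		return false;
	}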
3894 3944
@@ -4350,7 +4400,7 @@ EXPORT_SYMBOL(skb_try_coalesce);
4350 */ 4400 */
4351void skb_scrub_packet(struct sk_buff *skb, bool xnet) 4401void skb_scrub_packet(struct sk_buff *skb, bool xnet)
4352{ 4402{
4353 skb->tstamp.tv64 = 0; 4403 skb->tstamp = 0;
4354 skb->pkt_type = PACKET_HOST; 4404 skb->pkt_type = PACKET_HOST;
4355 skb->skb_iif = 0; 4405 skb->skb_iif = 0;
4356 skb->ignore_df = 0; 4406 skb->ignore_df = 0;
@@ -4913,3 +4963,35 @@ struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
4913 return clone; 4963 return clone;
4914} 4964}
4915EXPORT_SYMBOL(pskb_extract); 4965EXPORT_SYMBOL(pskb_extract);
4966
4967/**
4968 * skb_condense - try to get rid of fragments/frag_list if possible
4969 * @skb: buffer
4970 *
4971 * Can be used to save memory before skb is added to a busy queue.
4972 * If packet has bytes in frags and enough tail room in skb->head,
4973 * pull all of them, so that we can free the frags right now and adjust
4974 * truesize.
4975 * Notes:
 4976 * We do not reallocate skb->head, thus this cannot fail.
4977 * Caller must re-evaluate skb->truesize if needed.
4978 */
4979void skb_condense(struct sk_buff *skb)
4980{
4981 if (skb->data_len) {
4982 if (skb->data_len > skb->end - skb->tail ||
4983 skb_cloned(skb))
4984 return;
4985
4986 /* Nice, we can free page frag(s) right now */
4987 __pskb_pull_tail(skb, skb->data_len);
4988 }
 4989 /* At this point, skb->truesize might be overestimated,
4990 * because skb had a fragment, and fragments do not tell
4991 * their truesize.
4992 * When we pulled its content into skb->head, fragment
4993 * was freed, but __pskb_pull_tail() could not possibly
4994 * adjust skb->truesize, not knowing the frag truesize.
4995 */
4996 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4997}
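skb_condense() pulls the paged data into the linear area only when that is both possible and safe: the fragments have to fit in the existing tail room, and the skb must not be cloned, since shared fragment data cannot be freed from under another user. A condensed sketch of just that decision, over a hypothetical simplified buffer struct (not the real sk_buff fields):

	#include <stdbool.h>
	#include <stddef.h>

	struct buf {
		size_t data_len;	/* bytes held in page fragments      */
		size_t tailroom;	/* spare bytes after the linear data */
		bool cloned;		/* data shared with another buffer?  */
	};

	static bool should_condense(const struct buf *b)
	{
		if (!b->data_len)
			return false;		/* nothing to pull                 */
		if (b->data_len > b->tailroom)
			return false;		/* frags would not fit in the head */
		if (b->cloned)
			return false;		/* cannot free shared frag data    */
		return true;
	}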
diff --git a/net/core/sock.c b/net/core/sock.c
index 00a074dbfe9b..2c4f574168fb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -118,7 +118,7 @@
118#include <linux/memcontrol.h> 118#include <linux/memcontrol.h>
119#include <linux/prefetch.h> 119#include <linux/prefetch.h>
120 120
121#include <asm/uaccess.h> 121#include <linux/uaccess.h>
122 122
123#include <linux/netdevice.h> 123#include <linux/netdevice.h>
124#include <net/protocol.h> 124#include <net/protocol.h>
@@ -197,66 +197,55 @@ EXPORT_SYMBOL(sk_net_capable);
197 197
198/* 198/*
199 * Each address family might have different locking rules, so we have 199 * Each address family might have different locking rules, so we have
200 * one slock key per address family: 200 * one slock key per address family and separate keys for internal and
201 * userspace sockets.
201 */ 202 */
202static struct lock_class_key af_family_keys[AF_MAX]; 203static struct lock_class_key af_family_keys[AF_MAX];
204static struct lock_class_key af_family_kern_keys[AF_MAX];
203static struct lock_class_key af_family_slock_keys[AF_MAX]; 205static struct lock_class_key af_family_slock_keys[AF_MAX];
206static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
204 207
205/* 208/*
206 * Make lock validator output more readable. (we pre-construct these 209 * Make lock validator output more readable. (we pre-construct these
207 * strings build-time, so that runtime initialization of socket 210 * strings build-time, so that runtime initialization of socket
208 * locks is fast): 211 * locks is fast):
209 */ 212 */
213
214#define _sock_locks(x) \
215 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
216 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
217 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
218 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
219 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
220 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
221 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
222 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
223 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
224 x "27" , x "28" , x "AF_CAN" , \
225 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
226 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
227 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
228 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
229 x "AF_QIPCRTR", x "AF_SMC" , x "AF_MAX"
230
210static const char *const af_family_key_strings[AF_MAX+1] = { 231static const char *const af_family_key_strings[AF_MAX+1] = {
211 "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" , 232 _sock_locks("sk_lock-")
212 "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK",
213 "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" ,
214 "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" ,
215 "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
216 "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
217 "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
218 "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
219 "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
220 "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
221 "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
222 "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
223 "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
224 "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" ,
225 "sk_lock-AF_MAX"
226}; 233};
227static const char *const af_family_slock_key_strings[AF_MAX+1] = { 234static const char *const af_family_slock_key_strings[AF_MAX+1] = {
228 "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , 235 _sock_locks("slock-")
229 "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK",
230 "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" ,
231 "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" ,
232 "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
233 "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
234 "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
235 "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
236 "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
237 "slock-27" , "slock-28" , "slock-AF_CAN" ,
238 "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
239 "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
240 "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
241 "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" ,
242 "slock-AF_MAX"
243}; 236};
244static const char *const af_family_clock_key_strings[AF_MAX+1] = { 237static const char *const af_family_clock_key_strings[AF_MAX+1] = {
245 "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , 238 _sock_locks("clock-")
246 "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK", 239};
247 "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" , 240
248 "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" , 241static const char *const af_family_kern_key_strings[AF_MAX+1] = {
249 "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" , 242 _sock_locks("k-sk_lock-")
250 "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" , 243};
251 "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" , 244static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
252 "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" , 245 _sock_locks("k-slock-")
253 "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" , 246};
254 "clock-27" , "clock-28" , "clock-AF_CAN" , 247static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
255 "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , 248 _sock_locks("k-clock-")
256 "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
257 "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
258 "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" ,
259 "clock-AF_MAX"
260}; 249};
261 250
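The _sock_locks() macro leans on adjacent string literals being concatenated at compile time, so a single list of family names can be stamped out with any prefix instead of maintaining six hand-written tables. A tiny sketch of the same trick with a shortened, illustrative name list:

	#include <stdio.h>

	#define _names(x) x "UNIX", x "INET", x "INET6"

	static const char *const lock_names[]  = { _names("sk_lock-") };
	static const char *const slock_names[] = { _names("slock-") };

	int main(void)
	{
		/* Prints: sk_lock-UNIX slock-INET6 */
		printf("%s %s\n", lock_names[0], slock_names[2]);
		return 0;
	}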
262/* 251/*
@@ -264,6 +253,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
264 * so split the lock classes by using a per-AF key: 253 * so split the lock classes by using a per-AF key:
265 */ 254 */
266static struct lock_class_key af_callback_keys[AF_MAX]; 255static struct lock_class_key af_callback_keys[AF_MAX];
256static struct lock_class_key af_kern_callback_keys[AF_MAX];
267 257
268/* Take into consideration the size of the struct sk_buff overhead in the 258/* Take into consideration the size of the struct sk_buff overhead in the
269 * determination of these values, since that is non-constant across 259 * determination of these values, since that is non-constant across
@@ -367,7 +357,7 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
367 if (tv.tv_sec == 0 && tv.tv_usec == 0) 357 if (tv.tv_sec == 0 && tv.tv_usec == 0)
368 return 0; 358 return 0;
369 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1)) 359 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
370 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ); 360 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
371 return 0; 361 return 0;
372} 362}
373 363
@@ -502,6 +492,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
502 492
503 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { 493 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
504 sk_tx_queue_clear(sk); 494 sk_tx_queue_clear(sk);
495 sk->sk_dst_pending_confirm = 0;
505 RCU_INIT_POINTER(sk->sk_dst_cache, NULL); 496 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
506 dst_release(dst); 497 dst_release(dst);
507 return NULL; 498 return NULL;
@@ -762,11 +753,8 @@ set_rcvbuf:
762 goto set_rcvbuf; 753 goto set_rcvbuf;
763 754
764 case SO_KEEPALIVE: 755 case SO_KEEPALIVE:
765#ifdef CONFIG_INET 756 if (sk->sk_prot->keepalive)
766 if (sk->sk_protocol == IPPROTO_TCP && 757 sk->sk_prot->keepalive(sk, valbool);
767 sk->sk_type == SOCK_STREAM)
768 tcp_set_keepalive(sk, valbool);
769#endif
770 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); 758 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
771 break; 759 break;
772 760
@@ -854,6 +842,13 @@ set_rcvbuf:
854 sk->sk_tskey = 0; 842 sk->sk_tskey = 0;
855 } 843 }
856 } 844 }
845
846 if (val & SOF_TIMESTAMPING_OPT_STATS &&
847 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
848 ret = -EINVAL;
849 break;
850 }
851
857 sk->sk_tsflags = val; 852 sk->sk_tsflags = val;
858 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 853 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
859 sock_enable_timestamp(sk, 854 sock_enable_timestamp(sk,
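The new check rejects SOF_TIMESTAMPING_OPT_STATS unless SOF_TIMESTAMPING_OPT_TSONLY is also set, because the statistics are carried on the timestamp-only skb. A minimal sketch of that flag dependency (the bit values are made up, not the uapi ones):

	#include <stdbool.h>

	#define OPT_TSONLY (1u << 0)	/* stand-in bit values */
	#define OPT_STATS  (1u << 1)

	static bool tsflags_valid(unsigned int val)
	{
		/* Stats piggy-back on the ts-only skb, so they make no sense alone. */
		if ((val & OPT_STATS) && !(val & OPT_TSONLY))
			return false;
		return true;
	}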
@@ -1141,7 +1136,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
1141 v.tm.tv_usec = 0; 1136 v.tm.tv_usec = 0;
1142 } else { 1137 } else {
1143 v.tm.tv_sec = sk->sk_rcvtimeo / HZ; 1138 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1144 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ; 1139 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1145 } 1140 }
1146 break; 1141 break;
1147 1142
@@ -1152,7 +1147,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
1152 v.tm.tv_usec = 0; 1147 v.tm.tv_usec = 0;
1153 } else { 1148 } else {
1154 v.tm.tv_sec = sk->sk_sndtimeo / HZ; 1149 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1155 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ; 1150 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1156 } 1151 }
1157 break; 1152 break;
1158 1153
@@ -1288,7 +1283,16 @@ lenout:
1288 */ 1283 */
1289static inline void sock_lock_init(struct sock *sk) 1284static inline void sock_lock_init(struct sock *sk)
1290{ 1285{
1291 sock_lock_init_class_and_name(sk, 1286 if (sk->sk_kern_sock)
1287 sock_lock_init_class_and_name(
1288 sk,
1289 af_family_kern_slock_key_strings[sk->sk_family],
1290 af_family_kern_slock_keys + sk->sk_family,
1291 af_family_kern_key_strings[sk->sk_family],
1292 af_family_kern_keys + sk->sk_family);
1293 else
1294 sock_lock_init_class_and_name(
1295 sk,
1292 af_family_slock_key_strings[sk->sk_family], 1296 af_family_slock_key_strings[sk->sk_family],
1293 af_family_slock_keys + sk->sk_family, 1297 af_family_slock_keys + sk->sk_family,
1294 af_family_key_strings[sk->sk_family], 1298 af_family_key_strings[sk->sk_family],
@@ -1394,6 +1398,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1394 * why we need sk_prot_creator -acme 1398 * why we need sk_prot_creator -acme
1395 */ 1399 */
1396 sk->sk_prot = sk->sk_prot_creator = prot; 1400 sk->sk_prot = sk->sk_prot_creator = prot;
1401 sk->sk_kern_sock = kern;
1397 sock_lock_init(sk); 1402 sock_lock_init(sk);
1398 sk->sk_net_refcnt = kern ? 0 : 1; 1403 sk->sk_net_refcnt = kern ? 0 : 1;
1399 if (likely(sk->sk_net_refcnt)) 1404 if (likely(sk->sk_net_refcnt))
@@ -1437,6 +1442,11 @@ static void __sk_destruct(struct rcu_head *head)
1437 pr_debug("%s: optmem leakage (%d bytes) detected\n", 1442 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1438 __func__, atomic_read(&sk->sk_omem_alloc)); 1443 __func__, atomic_read(&sk->sk_omem_alloc));
1439 1444
1445 if (sk->sk_frag.page) {
1446 put_page(sk->sk_frag.page);
1447 sk->sk_frag.page = NULL;
1448 }
1449
1440 if (sk->sk_peer_cred) 1450 if (sk->sk_peer_cred)
1441 put_cred(sk->sk_peer_cred); 1451 put_cred(sk->sk_peer_cred);
1442 put_pid(sk->sk_peer_pid); 1452 put_pid(sk->sk_peer_pid);
@@ -1515,6 +1525,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1515 af_family_clock_key_strings[newsk->sk_family]); 1525 af_family_clock_key_strings[newsk->sk_family]);
1516 1526
1517 newsk->sk_dst_cache = NULL; 1527 newsk->sk_dst_cache = NULL;
1528 newsk->sk_dst_pending_confirm = 0;
1518 newsk->sk_wmem_queued = 0; 1529 newsk->sk_wmem_queued = 0;
1519 newsk->sk_forward_alloc = 0; 1530 newsk->sk_forward_alloc = 0;
1520 atomic_set(&newsk->sk_drops, 0); 1531 atomic_set(&newsk->sk_drops, 0);
@@ -1533,11 +1544,13 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1533 is_charged = sk_filter_charge(newsk, filter); 1544 is_charged = sk_filter_charge(newsk, filter);
1534 1545
1535 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 1546 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1536 /* It is still raw copy of parent, so invalidate 1547 /* We need to make sure that we don't uncharge the new
1537 * destructor and make plain sk_free() */ 1548 * socket if we couldn't charge it in the first place
1538 newsk->sk_destruct = NULL; 1549 * as otherwise we uncharge the parent's filter.
1539 bh_unlock_sock(newsk); 1550 */
1540 sk_free(newsk); 1551 if (!is_charged)
1552 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1553 sk_free_unlock_clone(newsk);
1541 newsk = NULL; 1554 newsk = NULL;
1542 goto out; 1555 goto out;
1543 } 1556 }
@@ -1586,6 +1599,16 @@ out:
1586} 1599}
1587EXPORT_SYMBOL_GPL(sk_clone_lock); 1600EXPORT_SYMBOL_GPL(sk_clone_lock);
1588 1601
1602void sk_free_unlock_clone(struct sock *sk)
1603{
1604 /* It is still raw copy of parent, so invalidate
1605 * destructor and make plain sk_free() */
1606 sk->sk_destruct = NULL;
1607 bh_unlock_sock(sk);
1608 sk_free(sk);
1609}
1610EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1611
1589void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 1612void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1590{ 1613{
1591 u32 max_segs = 1; 1614 u32 max_segs = 1;
@@ -2080,37 +2103,31 @@ void __sk_flush_backlog(struct sock *sk)
2080 */ 2103 */
2081int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) 2104int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2082{ 2105{
2106 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2083 int rc; 2107 int rc;
2084 DEFINE_WAIT(wait);
2085 2108
2086 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2109 add_wait_queue(sk_sleep(sk), &wait);
2087 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2110 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2088 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb); 2111 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2089 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2112 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2090 finish_wait(sk_sleep(sk), &wait); 2113 remove_wait_queue(sk_sleep(sk), &wait);
2091 return rc; 2114 return rc;
2092} 2115}
2093EXPORT_SYMBOL(sk_wait_data); 2116EXPORT_SYMBOL(sk_wait_data);
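sk_wait_data() now registers the waiter once, with add_wait_queue()/remove_wait_queue() bracketing sk_wait_event(), instead of re-arming it with prepare_to_wait() on every pass. The discipline being preserved is the usual lost-wakeup one: register the waiter (or hold the lock), re-test the condition, and only then sleep. A userspace pthreads analogy of that discipline (not the kernel mechanism itself, which uses wait queues and woken_wake_function):

	#include <pthread.h>
	#include <stdbool.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
	static bool data_ready;

	void wait_for_data(void)
	{
		pthread_mutex_lock(&lock);
		while (!data_ready)		/* re-test after every wakeup */
			pthread_cond_wait(&cond, &lock);
		pthread_mutex_unlock(&lock);
	}

	void signal_data(void)
	{
		pthread_mutex_lock(&lock);
		data_ready = true;
		pthread_cond_signal(&cond);
		pthread_mutex_unlock(&lock);
	}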
2094 2117
2095/** 2118/**
2096 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 2119 * __sk_mem_raise_allocated - increase memory_allocated
2097 * @sk: socket 2120 * @sk: socket
2098 * @size: memory size to allocate 2121 * @size: memory size to allocate
2122 * @amt: pages to allocate
2099 * @kind: allocation type 2123 * @kind: allocation type
2100 * 2124 *
2101 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 2125 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2102 * rmem allocation. This function assumes that protocols which have
2103 * memory_pressure use sk_wmem_queued as write buffer accounting.
2104 */ 2126 */
2105int __sk_mem_schedule(struct sock *sk, int size, int kind) 2127int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2106{ 2128{
2107 struct proto *prot = sk->sk_prot; 2129 struct proto *prot = sk->sk_prot;
2108 int amt = sk_mem_pages(size); 2130 long allocated = sk_memory_allocated_add(sk, amt);
2109 long allocated;
2110
2111 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2112
2113 allocated = sk_memory_allocated_add(sk, amt);
2114 2131
2115 if (mem_cgroup_sockets_enabled && sk->sk_memcg && 2132 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2116 !mem_cgroup_charge_skmem(sk->sk_memcg, amt)) 2133 !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
@@ -2171,9 +2188,6 @@ suppress_allocation:
2171 2188
2172 trace_sock_exceed_buf_limit(sk, prot, allocated); 2189 trace_sock_exceed_buf_limit(sk, prot, allocated);
2173 2190
2174 /* Alas. Undo changes. */
2175 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2176
2177 sk_memory_allocated_sub(sk, amt); 2191 sk_memory_allocated_sub(sk, amt);
2178 2192
2179 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2193 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
@@ -2181,18 +2195,40 @@ suppress_allocation:
2181 2195
2182 return 0; 2196 return 0;
2183} 2197}
2198EXPORT_SYMBOL(__sk_mem_raise_allocated);
2199
2200/**
2201 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2202 * @sk: socket
2203 * @size: memory size to allocate
2204 * @kind: allocation type
2205 *
2206 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2207 * rmem allocation. This function assumes that protocols which have
2208 * memory_pressure use sk_wmem_queued as write buffer accounting.
2209 */
2210int __sk_mem_schedule(struct sock *sk, int size, int kind)
2211{
2212 int ret, amt = sk_mem_pages(size);
2213
2214 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2215 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2216 if (!ret)
2217 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2218 return ret;
2219}
2184EXPORT_SYMBOL(__sk_mem_schedule); 2220EXPORT_SYMBOL(__sk_mem_schedule);
2185 2221
2186/** 2222/**
2187 * __sk_mem_reclaim - reclaim memory_allocated 2223 * __sk_mem_reduce_allocated - reclaim memory_allocated
2188 * @sk: socket 2224 * @sk: socket
2189 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) 2225 * @amount: number of quanta
2226 *
2227 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2190 */ 2228 */
2191void __sk_mem_reclaim(struct sock *sk, int amount) 2229void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2192{ 2230{
2193 amount >>= SK_MEM_QUANTUM_SHIFT;
2194 sk_memory_allocated_sub(sk, amount); 2231 sk_memory_allocated_sub(sk, amount);
2195 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2196 2232
2197 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2233 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2198 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); 2234 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
@@ -2201,6 +2237,19 @@ void __sk_mem_reclaim(struct sock *sk, int amount)
2201 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 2237 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2202 sk_leave_memory_pressure(sk); 2238 sk_leave_memory_pressure(sk);
2203} 2239}
2240EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2241
2242/**
2243 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2244 * @sk: socket
2245 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2246 */
2247void __sk_mem_reclaim(struct sock *sk, int amount)
2248{
2249 amount >>= SK_MEM_QUANTUM_SHIFT;
2250 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2251 __sk_mem_reduce_allocated(sk, amount);
2252}
2204EXPORT_SYMBOL(__sk_mem_reclaim); 2253EXPORT_SYMBOL(__sk_mem_reclaim);
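After the split, __sk_mem_schedule() and __sk_mem_reclaim() keep adjusting the per-socket sk_forward_alloc in bytes and delegate the protocol-wide accounting, tracked in page-sized quanta, to __sk_mem_raise_allocated() and __sk_mem_reduce_allocated(). A sketch of the two units of accounting (assuming SK_MEM_QUANTUM is a 4 KiB page, the common configuration):

	#define SK_MEM_QUANTUM		4096
	#define SK_MEM_QUANTUM_SHIFT	12

	static long memory_allocated;	/* protocol-wide, counted in quanta */
	static int sk_forward_alloc;	/* per-socket, counted in bytes     */

	static int sk_mem_pages_sketch(int bytes)
	{
		return (bytes + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT;
	}

	static void mem_schedule(int bytes)
	{
		int amt = sk_mem_pages_sketch(bytes);

		sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;	/* bytes  */
		memory_allocated += amt;				/* quanta */
	}

	static void mem_reclaim(int bytes)
	{
		int amt = bytes >> SK_MEM_QUANTUM_SHIFT;	/* whole quanta only */

		sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
		memory_allocated -= amt;
	}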
2205 2254
2206int sk_set_peek_off(struct sock *sk, int val) 2255int sk_set_peek_off(struct sock *sk, int val)
@@ -2239,7 +2288,8 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2239} 2288}
2240EXPORT_SYMBOL(sock_no_socketpair); 2289EXPORT_SYMBOL(sock_no_socketpair);
2241 2290
2242int sock_no_accept(struct socket *sock, struct socket *newsock, int flags) 2291int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2292 bool kern)
2243{ 2293{
2244 return -EOPNOTSUPP; 2294 return -EOPNOTSUPP;
2245} 2295}
@@ -2436,11 +2486,21 @@ void sock_init_data(struct socket *sock, struct sock *sk)
2436 sk->sk_type = sock->type; 2486 sk->sk_type = sock->type;
2437 sk->sk_wq = sock->wq; 2487 sk->sk_wq = sock->wq;
2438 sock->sk = sk; 2488 sock->sk = sk;
2439 } else 2489 sk->sk_uid = SOCK_INODE(sock)->i_uid;
2490 } else {
2440 sk->sk_wq = NULL; 2491 sk->sk_wq = NULL;
2492 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
2493 }
2441 2494
2442 rwlock_init(&sk->sk_callback_lock); 2495 rwlock_init(&sk->sk_callback_lock);
2443 lockdep_set_class_and_name(&sk->sk_callback_lock, 2496 if (sk->sk_kern_sock)
2497 lockdep_set_class_and_name(
2498 &sk->sk_callback_lock,
2499 af_kern_callback_keys + sk->sk_family,
2500 af_family_kern_clock_key_strings[sk->sk_family]);
2501 else
2502 lockdep_set_class_and_name(
2503 &sk->sk_callback_lock,
2444 af_callback_keys + sk->sk_family, 2504 af_callback_keys + sk->sk_family,
2445 af_family_clock_key_strings[sk->sk_family]); 2505 af_family_clock_key_strings[sk->sk_family]);
2446 2506
@@ -2738,11 +2798,6 @@ void sk_common_release(struct sock *sk)
2738 2798
2739 sk_refcnt_debug_release(sk); 2799 sk_refcnt_debug_release(sk);
2740 2800
2741 if (sk->sk_frag.page) {
2742 put_page(sk->sk_frag.page);
2743 sk->sk_frag.page = NULL;
2744 }
2745
2746 sock_put(sk); 2801 sock_put(sk);
2747} 2802}
2748EXPORT_SYMBOL(sk_common_release); 2803EXPORT_SYMBOL(sk_common_release);
diff --git a/net/core/stream.c b/net/core/stream.c
index 1086c8b280a8..20231dbb1da0 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -13,6 +13,7 @@
13 */ 13 */
14 14
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/sched/signal.h>
16#include <linux/net.h> 17#include <linux/net.h>
17#include <linux/signal.h> 18#include <linux/signal.h>
18#include <linux/tcp.h> 19#include <linux/tcp.h>
@@ -53,8 +54,8 @@ void sk_stream_write_space(struct sock *sk)
53 */ 54 */
54int sk_stream_wait_connect(struct sock *sk, long *timeo_p) 55int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
55{ 56{
57 DEFINE_WAIT_FUNC(wait, woken_wake_function);
56 struct task_struct *tsk = current; 58 struct task_struct *tsk = current;
57 DEFINE_WAIT(wait);
58 int done; 59 int done;
59 60
60 do { 61 do {
@@ -68,13 +69,13 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
68 if (signal_pending(tsk)) 69 if (signal_pending(tsk))
69 return sock_intr_errno(*timeo_p); 70 return sock_intr_errno(*timeo_p);
70 71
71 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 72 add_wait_queue(sk_sleep(sk), &wait);
72 sk->sk_write_pending++; 73 sk->sk_write_pending++;
73 done = sk_wait_event(sk, timeo_p, 74 done = sk_wait_event(sk, timeo_p,
74 !sk->sk_err && 75 !sk->sk_err &&
75 !((1 << sk->sk_state) & 76 !((1 << sk->sk_state) &
76 ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))); 77 ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait);
77 finish_wait(sk_sleep(sk), &wait); 78 remove_wait_queue(sk_sleep(sk), &wait);
78 sk->sk_write_pending--; 79 sk->sk_write_pending--;
79 } while (!done); 80 } while (!done);
80 return 0; 81 return 0;
@@ -94,16 +95,16 @@ static inline int sk_stream_closing(struct sock *sk)
94void sk_stream_wait_close(struct sock *sk, long timeout) 95void sk_stream_wait_close(struct sock *sk, long timeout)
95{ 96{
96 if (timeout) { 97 if (timeout) {
97 DEFINE_WAIT(wait); 98 DEFINE_WAIT_FUNC(wait, woken_wake_function);
99
100 add_wait_queue(sk_sleep(sk), &wait);
98 101
99 do { 102 do {
100 prepare_to_wait(sk_sleep(sk), &wait, 103 if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk), &wait))
101 TASK_INTERRUPTIBLE);
102 if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk)))
103 break; 104 break;
104 } while (!signal_pending(current) && timeout); 105 } while (!signal_pending(current) && timeout);
105 106
106 finish_wait(sk_sleep(sk), &wait); 107 remove_wait_queue(sk_sleep(sk), &wait);
107 } 108 }
108} 109}
109EXPORT_SYMBOL(sk_stream_wait_close); 110EXPORT_SYMBOL(sk_stream_wait_close);
@@ -119,16 +120,16 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
119 long vm_wait = 0; 120 long vm_wait = 0;
120 long current_timeo = *timeo_p; 121 long current_timeo = *timeo_p;
121 bool noblock = (*timeo_p ? false : true); 122 bool noblock = (*timeo_p ? false : true);
122 DEFINE_WAIT(wait); 123 DEFINE_WAIT_FUNC(wait, woken_wake_function);
123 124
124 if (sk_stream_memory_free(sk)) 125 if (sk_stream_memory_free(sk))
125 current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2; 126 current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;
126 127
128 add_wait_queue(sk_sleep(sk), &wait);
129
127 while (1) { 130 while (1) {
128 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 131 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
129 132
130 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
131
132 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) 133 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
133 goto do_error; 134 goto do_error;
134 if (!*timeo_p) { 135 if (!*timeo_p) {
@@ -147,7 +148,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
147 sk_wait_event(sk, &current_timeo, sk->sk_err || 148 sk_wait_event(sk, &current_timeo, sk->sk_err ||
148 (sk->sk_shutdown & SEND_SHUTDOWN) || 149 (sk->sk_shutdown & SEND_SHUTDOWN) ||
149 (sk_stream_memory_free(sk) && 150 (sk_stream_memory_free(sk) &&
150 !vm_wait)); 151 !vm_wait), &wait);
151 sk->sk_write_pending--; 152 sk->sk_write_pending--;
152 153
153 if (vm_wait) { 154 if (vm_wait) {
@@ -161,7 +162,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
161 *timeo_p = current_timeo; 162 *timeo_p = current_timeo;
162 } 163 }
163out: 164out:
164 finish_wait(sk_sleep(sk), &wait); 165 remove_wait_queue(sk_sleep(sk), &wait);
165 return err; 166 return err;
166 167
167do_error: 168do_error:
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 0df2aa652530..7f9cc400eca0 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -79,10 +79,13 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
79 79
80 if (sock_table != orig_sock_table) { 80 if (sock_table != orig_sock_table) {
81 rcu_assign_pointer(rps_sock_flow_table, sock_table); 81 rcu_assign_pointer(rps_sock_flow_table, sock_table);
82 if (sock_table) 82 if (sock_table) {
83 static_key_slow_inc(&rps_needed); 83 static_key_slow_inc(&rps_needed);
84 static_key_slow_inc(&rfs_needed);
85 }
84 if (orig_sock_table) { 86 if (orig_sock_table) {
85 static_key_slow_dec(&rps_needed); 87 static_key_slow_dec(&rps_needed);
88 static_key_slow_dec(&rfs_needed);
86 synchronize_rcu(); 89 synchronize_rcu();
87 vfree(orig_sock_table); 90 vfree(orig_sock_table);
88 } 91 }
@@ -219,6 +222,21 @@ static int set_default_qdisc(struct ctl_table *table, int write,
219} 222}
220#endif 223#endif
221 224
225static int proc_do_dev_weight(struct ctl_table *table, int write,
226 void __user *buffer, size_t *lenp, loff_t *ppos)
227{
228 int ret;
229
230 ret = proc_dointvec(table, write, buffer, lenp, ppos);
231 if (ret != 0)
232 return ret;
233
234 dev_rx_weight = weight_p * dev_weight_rx_bias;
235 dev_tx_weight = weight_p * dev_weight_tx_bias;
236
237 return ret;
238}
239
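proc_do_dev_weight() recomputes the derived budgets whenever any of the three sysctls is written, so dev_rx_weight and dev_tx_weight always equal the base weight scaled by the matching bias. A sketch of that relationship (the default values are assumptions for illustration, not taken from the patch):

	static int weight_p = 64;		/* net.core.dev_weight         */
	static int dev_weight_rx_bias = 1;	/* net.core.dev_weight_rx_bias */
	static int dev_weight_tx_bias = 1;	/* net.core.dev_weight_tx_bias */
	static int dev_rx_weight, dev_tx_weight;

	static void recompute_dev_weights(void)
	{
		/* Called after every successful sysctl write, as above. */
		dev_rx_weight = weight_p * dev_weight_rx_bias;
		dev_tx_weight = weight_p * dev_weight_tx_bias;
	}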
222static int proc_do_rss_key(struct ctl_table *table, int write, 240static int proc_do_rss_key(struct ctl_table *table, int write,
223 void __user *buffer, size_t *lenp, loff_t *ppos) 241 void __user *buffer, size_t *lenp, loff_t *ppos)
224{ 242{
@@ -270,7 +288,21 @@ static struct ctl_table net_core_table[] = {
270 .data = &weight_p, 288 .data = &weight_p,
271 .maxlen = sizeof(int), 289 .maxlen = sizeof(int),
272 .mode = 0644, 290 .mode = 0644,
273 .proc_handler = proc_dointvec 291 .proc_handler = proc_do_dev_weight,
292 },
293 {
294 .procname = "dev_weight_rx_bias",
295 .data = &dev_weight_rx_bias,
296 .maxlen = sizeof(int),
297 .mode = 0644,
298 .proc_handler = proc_do_dev_weight,
299 },
300 {
301 .procname = "dev_weight_tx_bias",
302 .data = &dev_weight_tx_bias,
303 .maxlen = sizeof(int),
304 .mode = 0644,
305 .proc_handler = proc_do_dev_weight,
274 }, 306 },
275 { 307 {
276 .procname = "netdev_max_backlog", 308 .procname = "netdev_max_backlog",
@@ -302,6 +334,13 @@ static struct ctl_table net_core_table[] = {
302 .mode = 0600, 334 .mode = 0600,
303 .proc_handler = proc_dointvec, 335 .proc_handler = proc_dointvec,
304 }, 336 },
337 {
338 .procname = "bpf_jit_kallsyms",
339 .data = &bpf_jit_kallsyms,
340 .maxlen = sizeof(int),
341 .mode = 0600,
342 .proc_handler = proc_dointvec,
343 },
305# endif 344# endif
306#endif 345#endif
307 { 346 {
@@ -369,14 +408,16 @@ static struct ctl_table net_core_table[] = {
369 .data = &sysctl_net_busy_poll, 408 .data = &sysctl_net_busy_poll,
370 .maxlen = sizeof(unsigned int), 409 .maxlen = sizeof(unsigned int),
371 .mode = 0644, 410 .mode = 0644,
372 .proc_handler = proc_dointvec 411 .proc_handler = proc_dointvec_minmax,
412 .extra1 = &zero,
373 }, 413 },
374 { 414 {
375 .procname = "busy_read", 415 .procname = "busy_read",
376 .data = &sysctl_net_busy_read, 416 .data = &sysctl_net_busy_read,
377 .maxlen = sizeof(unsigned int), 417 .maxlen = sizeof(unsigned int),
378 .mode = 0644, 418 .mode = 0644,
379 .proc_handler = proc_dointvec 419 .proc_handler = proc_dointvec_minmax,
420 .extra1 = &zero,
380 }, 421 },
381#endif 422#endif
382#ifdef CONFIG_NET_SCHED 423#ifdef CONFIG_NET_SCHED
diff --git a/net/core/utils.c b/net/core/utils.c
index cf5622b9ccc4..6592d7bbed39 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -31,7 +31,7 @@
31#include <net/net_ratelimit.h> 31#include <net/net_ratelimit.h>
32 32
33#include <asm/byteorder.h> 33#include <asm/byteorder.h>
34#include <asm/uaccess.h> 34#include <linux/uaccess.h>
35 35
36DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10); 36DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10);
37/* 37/*
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index f053198e730c..5e3a7302f774 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -749,6 +749,7 @@ static void ccid2_hc_tx_exit(struct sock *sk)
749 for (i = 0; i < hc->tx_seqbufc; i++) 749 for (i = 0; i < hc->tx_seqbufc; i++)
750 kfree(hc->tx_seqbuf[i]); 750 kfree(hc->tx_seqbuf[i]);
751 hc->tx_seqbufc = 0; 751 hc->tx_seqbufc = 0;
752 dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
752} 753}
753 754
754static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) 755static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
diff --git a/net/dccp/input.c b/net/dccp/input.c
index ba347184bda9..4a05d7876850 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -577,6 +577,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
577 struct dccp_sock *dp = dccp_sk(sk); 577 struct dccp_sock *dp = dccp_sk(sk);
578 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); 578 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
579 const int old_state = sk->sk_state; 579 const int old_state = sk->sk_state;
580 bool acceptable;
580 int queued = 0; 581 int queued = 0;
581 582
582 /* 583 /*
@@ -603,10 +604,16 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
603 */ 604 */
604 if (sk->sk_state == DCCP_LISTEN) { 605 if (sk->sk_state == DCCP_LISTEN) {
605 if (dh->dccph_type == DCCP_PKT_REQUEST) { 606 if (dh->dccph_type == DCCP_PKT_REQUEST) {
606 if (inet_csk(sk)->icsk_af_ops->conn_request(sk, 607 /* It is possible that we process SYN packets from backlog,
607 skb) < 0) 608 * so we need to make sure to disable BH right there.
609 */
610 local_bh_disable();
611 acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0;
612 local_bh_enable();
613 if (!acceptable)
608 return 1; 614 return 1;
609 goto discard; 615 consume_skb(skb);
616 return 0;
610 } 617 }
611 if (dh->dccph_type == DCCP_PKT_RESET) 618 if (dh->dccph_type == DCCP_PKT_RESET)
612 goto discard; 619 goto discard;
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index edbe59d203ef..b99168b0fabf 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -289,7 +289,8 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
289 289
290 switch (type) { 290 switch (type) {
291 case ICMP_REDIRECT: 291 case ICMP_REDIRECT:
292 dccp_do_redirect(skb, sk); 292 if (!sock_owned_by_user(sk))
293 dccp_do_redirect(skb, sk);
293 goto out; 294 goto out;
294 case ICMP_SOURCE_QUENCH: 295 case ICMP_SOURCE_QUENCH:
295 /* Just silently ignore these. */ 296 /* Just silently ignore these. */
@@ -590,13 +591,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
590 if (inet_csk_reqsk_queue_is_full(sk)) 591 if (inet_csk_reqsk_queue_is_full(sk))
591 goto drop; 592 goto drop;
592 593
593 /* 594 if (sk_acceptq_is_full(sk))
594 * Accept backlog is full. If we have already queued enough
595 * of warm entries in syn queue, drop request. It is better than
596 * clogging syn queue with openreqs with exponentially increasing
597 * timeout.
598 */
599 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
600 goto drop; 595 goto drop;
601 596
602 req = inet_reqsk_alloc(&dccp_request_sock_ops, sk, true); 597 req = inet_reqsk_alloc(&dccp_request_sock_ops, sk, true);
@@ -910,7 +905,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
910 .getsockopt = ip_getsockopt, 905 .getsockopt = ip_getsockopt,
911 .addr2sockaddr = inet_csk_addr2sockaddr, 906 .addr2sockaddr = inet_csk_addr2sockaddr,
912 .sockaddr_len = sizeof(struct sockaddr_in), 907 .sockaddr_len = sizeof(struct sockaddr_in),
913 .bind_conflict = inet_csk_bind_conflict,
914#ifdef CONFIG_COMPAT 908#ifdef CONFIG_COMPAT
915 .compat_setsockopt = compat_ip_setsockopt, 909 .compat_setsockopt = compat_ip_setsockopt,
916 .compat_getsockopt = compat_ip_getsockopt, 910 .compat_getsockopt = compat_ip_getsockopt,
@@ -1024,9 +1018,15 @@ static void __net_exit dccp_v4_exit_net(struct net *net)
1024 inet_ctl_sock_destroy(net->dccp.v4_ctl_sk); 1018 inet_ctl_sock_destroy(net->dccp.v4_ctl_sk);
1025} 1019}
1026 1020
1021static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list)
1022{
1023 inet_twsk_purge(&dccp_hashinfo, AF_INET);
1024}
1025
1027static struct pernet_operations dccp_v4_ops = { 1026static struct pernet_operations dccp_v4_ops = {
1028 .init = dccp_v4_init_net, 1027 .init = dccp_v4_init_net,
1029 .exit = dccp_v4_exit_net, 1028 .exit = dccp_v4_exit_net,
1029 .exit_batch = dccp_v4_exit_batch,
1030}; 1030};
1031 1031
1032static int __init dccp_v4_init(void) 1032static int __init dccp_v4_init(void)
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 715e5d1dc107..d9b6a4e403e7 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -122,10 +122,12 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
122 np = inet6_sk(sk); 122 np = inet6_sk(sk);
123 123
124 if (type == NDISC_REDIRECT) { 124 if (type == NDISC_REDIRECT) {
125 struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); 125 if (!sock_owned_by_user(sk)) {
126 struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
126 127
127 if (dst) 128 if (dst)
128 dst->ops->redirect(dst, sk, skb); 129 dst->ops->redirect(dst, sk, skb);
130 }
129 goto out; 131 goto out;
130 } 132 }
131 133
@@ -227,7 +229,7 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req
227 opt = ireq->ipv6_opt; 229 opt = ireq->ipv6_opt;
228 if (!opt) 230 if (!opt)
229 opt = rcu_dereference(np->opt); 231 opt = rcu_dereference(np->opt);
230 err = ip6_xmit(sk, skb, &fl6, opt, np->tclass); 232 err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass);
231 rcu_read_unlock(); 233 rcu_read_unlock();
232 err = net_xmit_eval(err); 234 err = net_xmit_eval(err);
233 } 235 }
@@ -281,7 +283,7 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
281 dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); 283 dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
282 if (!IS_ERR(dst)) { 284 if (!IS_ERR(dst)) {
283 skb_dst_set(skb, dst); 285 skb_dst_set(skb, dst);
284 ip6_xmit(ctl_sk, skb, &fl6, NULL, 0); 286 ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0);
285 DCCP_INC_STATS(DCCP_MIB_OUTSEGS); 287 DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
286 DCCP_INC_STATS(DCCP_MIB_OUTRSTS); 288 DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
287 return; 289 return;
@@ -326,7 +328,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
326 if (inet_csk_reqsk_queue_is_full(sk)) 328 if (inet_csk_reqsk_queue_is_full(sk))
327 goto drop; 329 goto drop;
328 330
329 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) 331 if (sk_acceptq_is_full(sk))
330 goto drop; 332 goto drop;
331 333
332 req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk, true); 334 req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk, true);
@@ -937,7 +939,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
937 .getsockopt = ipv6_getsockopt, 939 .getsockopt = ipv6_getsockopt,
938 .addr2sockaddr = inet6_csk_addr2sockaddr, 940 .addr2sockaddr = inet6_csk_addr2sockaddr,
939 .sockaddr_len = sizeof(struct sockaddr_in6), 941 .sockaddr_len = sizeof(struct sockaddr_in6),
940 .bind_conflict = inet6_csk_bind_conflict,
941#ifdef CONFIG_COMPAT 942#ifdef CONFIG_COMPAT
942 .compat_setsockopt = compat_ipv6_setsockopt, 943 .compat_setsockopt = compat_ipv6_setsockopt,
943 .compat_getsockopt = compat_ipv6_getsockopt, 944 .compat_getsockopt = compat_ipv6_getsockopt,
@@ -958,7 +959,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
958 .getsockopt = ipv6_getsockopt, 959 .getsockopt = ipv6_getsockopt,
959 .addr2sockaddr = inet6_csk_addr2sockaddr, 960 .addr2sockaddr = inet6_csk_addr2sockaddr,
960 .sockaddr_len = sizeof(struct sockaddr_in6), 961 .sockaddr_len = sizeof(struct sockaddr_in6),
961 .bind_conflict = inet6_csk_bind_conflict,
962#ifdef CONFIG_COMPAT 962#ifdef CONFIG_COMPAT
963 .compat_setsockopt = compat_ipv6_setsockopt, 963 .compat_setsockopt = compat_ipv6_setsockopt,
964 .compat_getsockopt = compat_ipv6_getsockopt, 964 .compat_getsockopt = compat_ipv6_getsockopt,
@@ -1077,9 +1077,15 @@ static void __net_exit dccp_v6_exit_net(struct net *net)
1077 inet_ctl_sock_destroy(net->dccp.v6_ctl_sk); 1077 inet_ctl_sock_destroy(net->dccp.v6_ctl_sk);
1078} 1078}
1079 1079
1080static void __net_exit dccp_v6_exit_batch(struct list_head *net_exit_list)
1081{
1082 inet_twsk_purge(&dccp_hashinfo, AF_INET6);
1083}
1084
1080static struct pernet_operations dccp_v6_ops = { 1085static struct pernet_operations dccp_v6_ops = {
1081 .init = dccp_v6_init_net, 1086 .init = dccp_v6_init_net,
1082 .exit = dccp_v6_exit_net, 1087 .exit = dccp_v6_exit_net,
1088 .exit_batch = dccp_v6_exit_batch,
1083}; 1089};
1084 1090
1085static int __init dccp_v6_init(void) 1091static int __init dccp_v6_init(void)
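Note: the new exit_batch hook purges DCCPv6 timewait sockets once for a whole batch of dying namespaces instead of leaving them to expire on their own. A minimal sketch of the pernet wiring, reusing the init/exit handlers named in the hunk; the batch callback name is illustrative:

#include <net/net_namespace.h>
#include <net/inet_timewait_sock.h>

static void __net_exit example_exit_batch(struct list_head *net_exit_list)
{
	/* One purge covers every namespace in the batch; the regular
	 * ->exit callback still destroys the per-namespace ctl socket.
	 */
	inet_twsk_purge(&dccp_hashinfo, AF_INET6);
}

static struct pernet_operations example_pernet_ops = {
	.init       = dccp_v6_init_net,
	.exit       = dccp_v6_exit_net,
	.exit_batch = example_exit_batch,
};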
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 53eddf99e4f6..abd07a443219 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -119,10 +119,7 @@ struct sock *dccp_create_openreq_child(const struct sock *sk,
119 * Activate features: initialise CCIDs, sequence windows etc. 119 * Activate features: initialise CCIDs, sequence windows etc.
120 */ 120 */
121 if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { 121 if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) {
122 /* It is still raw copy of parent, so invalidate 122 sk_free_unlock_clone(newsk);
123 * destructor and make plain sk_free() */
124 newsk->sk_destruct = NULL;
125 sk_free(newsk);
126 return NULL; 123 return NULL;
127 } 124 }
128 dccp_init_xmit_timers(newsk); 125 dccp_init_xmit_timers(newsk);
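Note: dropping the open-coded destructor reset in favour of sk_free_unlock_clone() keeps teardown of a half-initialised clone in one helper that also releases the lock the clone was returned with. A sketch of the intended error path, assuming a clone obtained via sk_clone_lock(); example_activate_child() is a placeholder for whatever per-protocol setup can fail:

#include <net/sock.h>

static struct sock *example_clone_child(struct sock *sk)
{
	struct sock *newsk;

	newsk = sk_clone_lock(sk, GFP_ATOMIC);	/* still a raw copy of the parent */
	if (!newsk)
		return NULL;

	if (example_activate_child(newsk)) {
		/* Free the half-initialised clone without running the
		 * parent's destructor on state the child never owned;
		 * the helper also drops the lock sk_clone_lock() took.
		 */
		sk_free_unlock_clone(newsk);
		return NULL;
	}

	return newsk;
}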
@@ -145,6 +142,13 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
145 struct dccp_request_sock *dreq = dccp_rsk(req); 142 struct dccp_request_sock *dreq = dccp_rsk(req);
146 bool own_req; 143 bool own_req;
147 144
145 /* TCP/DCCP listeners became lockless.
146 * DCCP stores complex state in its request_sock, so we need
147 * a protection for them, now this code runs without being protected
148 * by the parent (listener) lock.
149 */
150 spin_lock_bh(&dreq->dreq_lock);
151
148 /* Check for retransmitted REQUEST */ 152 /* Check for retransmitted REQUEST */
149 if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) { 153 if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
150 154
@@ -159,7 +163,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
159 inet_rtx_syn_ack(sk, req); 163 inet_rtx_syn_ack(sk, req);
160 } 164 }
161 /* Network Duplicate, discard packet */ 165 /* Network Duplicate, discard packet */
162 return NULL; 166 goto out;
163 } 167 }
164 168
165 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; 169 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
@@ -185,20 +189,20 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
185 189
186 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, 190 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
187 req, &own_req); 191 req, &own_req);
188 if (!child) 192 if (child) {
189 goto listen_overflow; 193 child = inet_csk_complete_hashdance(sk, child, req, own_req);
190 194 goto out;
191 return inet_csk_complete_hashdance(sk, child, req, own_req); 195 }
192 196
193listen_overflow:
194 dccp_pr_debug("listen_overflow!\n");
195 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; 197 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
196drop: 198drop:
197 if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET) 199 if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET)
198 req->rsk_ops->send_reset(sk, skb); 200 req->rsk_ops->send_reset(sk, skb);
199 201
200 inet_csk_reqsk_queue_drop(sk, req); 202 inet_csk_reqsk_queue_drop(sk, req);
201 return NULL; 203out:
204 spin_unlock_bh(&dreq->dreq_lock);
205 return child;
202} 206}
203 207
204EXPORT_SYMBOL_GPL(dccp_check_req); 208EXPORT_SYMBOL_GPL(dccp_check_req);
@@ -249,6 +253,7 @@ int dccp_reqsk_init(struct request_sock *req,
249{ 253{
250 struct dccp_request_sock *dreq = dccp_rsk(req); 254 struct dccp_request_sock *dreq = dccp_rsk(req);
251 255
256 spin_lock_init(&dreq->dreq_lock);
252 inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport; 257 inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport;
253 inet_rsk(req)->ir_num = ntohs(dccp_hdr(skb)->dccph_dport); 258 inet_rsk(req)->ir_num = ntohs(dccp_hdr(skb)->dccph_dport);
254 inet_rsk(req)->acked = 0; 259 inet_rsk(req)->acked = 0;
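Note: since listeners no longer take the parent lock, the spinlock added to the request sock is what serialises two packets matching the same request. A condensed sketch of the shape introduced above; example_process_request() stands in for the body of dccp_check_req():

#include <linux/dccp.h>
#include <linux/spinlock.h>

static struct sock *example_check_req(struct sock *sk, struct sk_buff *skb,
				      struct request_sock *req)
{
	struct dccp_request_sock *dreq = dccp_rsk(req);
	struct sock *child;

	/* dreq_lock is initialised in dccp_reqsk_init() above; take it
	 * in BH context here, since this runs from packet reception.
	 */
	spin_lock_bh(&dreq->dreq_lock);
	child = example_process_request(sk, skb, req);
	spin_unlock_bh(&dreq->dreq_lock);

	return child;
}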
diff --git a/net/dccp/output.c b/net/dccp/output.c
index b66c84db0766..91a15b3c4915 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -14,6 +14,7 @@
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/skbuff.h> 15#include <linux/skbuff.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/sched/signal.h>
17 18
18#include <net/inet_sock.h> 19#include <net/inet_sock.h>
19#include <net/sock.h> 20#include <net/sock.h>
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 13d6b1a6e0fc..7de5b40a5d0d 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -106,7 +106,7 @@ Version 0.0.6 2.1.110 07-aug-98 Eduardo Marcelo Serrat
106#include <linux/socket.h> 106#include <linux/socket.h>
107#include <linux/in.h> 107#include <linux/in.h>
108#include <linux/kernel.h> 108#include <linux/kernel.h>
109#include <linux/sched.h> 109#include <linux/sched/signal.h>
110#include <linux/timer.h> 110#include <linux/timer.h>
111#include <linux/string.h> 111#include <linux/string.h>
112#include <linux/sockios.h> 112#include <linux/sockios.h>
@@ -1070,7 +1070,8 @@ static struct sk_buff *dn_wait_for_connect(struct sock *sk, long *timeo)
1070 return skb == NULL ? ERR_PTR(err) : skb; 1070 return skb == NULL ? ERR_PTR(err) : skb;
1071} 1071}
1072 1072
1073static int dn_accept(struct socket *sock, struct socket *newsock, int flags) 1073static int dn_accept(struct socket *sock, struct socket *newsock, int flags,
1074 bool kern)
1074{ 1075{
1075 struct sock *sk = sock->sk, *newsk; 1076 struct sock *sk = sock->sk, *newsk;
1076 struct sk_buff *skb = NULL; 1077 struct sk_buff *skb = NULL;
@@ -1099,7 +1100,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags)
1099 1100
1100 cb = DN_SKB_CB(skb); 1101 cb = DN_SKB_CB(skb);
1101 sk->sk_ack_backlog--; 1102 sk->sk_ack_backlog--;
1102 newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, 0); 1103 newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, kern);
1103 if (newsk == NULL) { 1104 if (newsk == NULL) {
1104 release_sock(sk); 1105 release_sock(sk);
1105 kfree_skb(skb); 1106 kfree_skb(skb);
@@ -1718,7 +1719,7 @@ static int dn_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
1718 * See if there is data ready to read, sleep if there isn't 1719 * See if there is data ready to read, sleep if there isn't
1719 */ 1720 */
1720 for(;;) { 1721 for(;;) {
1721 DEFINE_WAIT(wait); 1722 DEFINE_WAIT_FUNC(wait, woken_wake_function);
1722 1723
1723 if (sk->sk_err) 1724 if (sk->sk_err)
1724 goto out; 1725 goto out;
@@ -1749,11 +1750,11 @@ static int dn_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
1749 goto out; 1750 goto out;
1750 } 1751 }
1751 1752
1752 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 1753 add_wait_queue(sk_sleep(sk), &wait);
1753 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 1754 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
1754 sk_wait_event(sk, &timeo, dn_data_ready(sk, queue, flags, target)); 1755 sk_wait_event(sk, &timeo, dn_data_ready(sk, queue, flags, target), &wait);
1755 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 1756 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
1756 finish_wait(sk_sleep(sk), &wait); 1757 remove_wait_queue(sk_sleep(sk), &wait);
1757 } 1758 }
1758 1759
1759 skb_queue_walk_safe(queue, skb, n) { 1760 skb_queue_walk_safe(queue, skb, n) {
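Note: the receive and send loops move from prepare_to_wait()/finish_wait() to a wait entry driven by woken_wake_function, so a wakeup arriving between the condition test and the sleep is not lost, and sk_wait_event() now receives the entry explicitly. A minimal sketch of the new pattern; example_ready() is a stand-in for the protocol's readiness check:

#include <linux/errno.h>
#include <linux/wait.h>
#include <net/sock.h>

static int example_wait_for_data(struct sock *sk, long timeo)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	/* Sleeps until example_ready(sk) is true, the timeout expires or
	 * a signal is pending; the wait entry stays armed the whole time.
	 */
	sk_wait_event(sk, &timeo, example_ready(sk), &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);

	return timeo ? 0 : -EAGAIN;
}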
@@ -1999,19 +2000,19 @@ static int dn_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
1999 * size. 2000 * size.
2000 */ 2001 */
2001 if (dn_queue_too_long(scp, queue, flags)) { 2002 if (dn_queue_too_long(scp, queue, flags)) {
2002 DEFINE_WAIT(wait); 2003 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2003 2004
2004 if (flags & MSG_DONTWAIT) { 2005 if (flags & MSG_DONTWAIT) {
2005 err = -EWOULDBLOCK; 2006 err = -EWOULDBLOCK;
2006 goto out; 2007 goto out;
2007 } 2008 }
2008 2009
2009 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2010 add_wait_queue(sk_sleep(sk), &wait);
2010 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2011 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2011 sk_wait_event(sk, &timeo, 2012 sk_wait_event(sk, &timeo,
2012 !dn_queue_too_long(scp, queue, flags)); 2013 !dn_queue_too_long(scp, queue, flags), &wait);
2013 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2014 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2014 finish_wait(sk_sleep(sk), &wait); 2015 remove_wait_queue(sk_sleep(sk), &wait);
2015 continue; 2016 continue;
2016 } 2017 }
2017 2018
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index b2c26b081134..8fdd9f492b0e 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -42,7 +42,7 @@
42#include <linux/notifier.h> 42#include <linux/notifier.h>
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/jiffies.h> 44#include <linux/jiffies.h>
45#include <asm/uaccess.h> 45#include <linux/uaccess.h>
46#include <net/net_namespace.h> 46#include <net/net_namespace.h>
47#include <net/neighbour.h> 47#include <net/neighbour.h>
48#include <net/dst.h> 48#include <net/dst.h>
@@ -201,7 +201,7 @@ static struct dn_dev_sysctl_table {
201 .extra1 = &min_t3, 201 .extra1 = &min_t3,
202 .extra2 = &max_t3 202 .extra2 = &max_t3
203 }, 203 },
204 {0} 204 { }
205 }, 205 },
206}; 206};
207 207
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index a796fc7cbc35..7af0ba6157a1 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -31,7 +31,7 @@
31#include <linux/timer.h> 31#include <linux/timer.h>
32#include <linux/spinlock.h> 32#include <linux/spinlock.h>
33#include <linux/atomic.h> 33#include <linux/atomic.h>
34#include <asm/uaccess.h> 34#include <linux/uaccess.h>
35#include <net/neighbour.h> 35#include <net/neighbour.h>
36#include <net/dst.h> 36#include <net/dst.h>
37#include <net/flow.h> 37#include <net/flow.h>
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index 1540b506e3e0..232675480756 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -25,7 +25,7 @@
25#include <linux/timer.h> 25#include <linux/timer.h>
26#include <linux/spinlock.h> 26#include <linux/spinlock.h>
27#include <linux/atomic.h> 27#include <linux/atomic.h>
28#include <asm/uaccess.h> 28#include <linux/uaccess.h>
29#include <linux/route.h> /* RTF_xxx */ 29#include <linux/route.h> /* RTF_xxx */
30#include <net/neighbour.h> 30#include <net/neighbour.h>
31#include <net/netlink.h> 31#include <net/netlink.h>
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
index 5325b541c526..6c7da6c29bf0 100644
--- a/net/decnet/sysctl_net_decnet.c
+++ b/net/decnet/sysctl_net_decnet.c
@@ -22,7 +22,7 @@
22#include <net/dst.h> 22#include <net/dst.h>
23#include <net/flow.h> 23#include <net/flow.h>
24 24
25#include <asm/uaccess.h> 25#include <linux/uaccess.h>
26 26
27#include <net/dn.h> 27#include <net/dn.h>
28#include <net/dn_dev.h> 28#include <net/dn_dev.h>
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
index ecc28cff08ab..af781010753b 100644
--- a/net/dns_resolver/dns_query.c
+++ b/net/dns_resolver/dns_query.c
@@ -37,8 +37,10 @@
37 37
38#include <linux/module.h> 38#include <linux/module.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/cred.h>
40#include <linux/dns_resolver.h> 41#include <linux/dns_resolver.h>
41#include <linux/err.h> 42#include <linux/err.h>
43
42#include <keys/dns_resolver-type.h> 44#include <keys/dns_resolver-type.h>
43#include <keys/user-type.h> 45#include <keys/user-type.h>
44 46
@@ -70,7 +72,7 @@ int dns_query(const char *type, const char *name, size_t namelen,
70 const char *options, char **_result, time64_t *_expiry) 72 const char *options, char **_result, time64_t *_expiry)
71{ 73{
72 struct key *rkey; 74 struct key *rkey;
73 const struct user_key_payload *upayload; 75 struct user_key_payload *upayload;
74 const struct cred *saved_cred; 76 const struct cred *saved_cred;
75 size_t typelen, desclen; 77 size_t typelen, desclen;
76 char *desc, *cp; 78 char *desc, *cp;
@@ -141,7 +143,7 @@ int dns_query(const char *type, const char *name, size_t namelen,
141 if (ret) 143 if (ret)
142 goto put; 144 goto put;
143 145
144 upayload = user_key_payload(rkey); 146 upayload = user_key_payload_locked(rkey);
145 len = upayload->datalen; 147 len = upayload->datalen;
146 148
147 ret = -ENOMEM; 149 ret = -ENOMEM;
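Note: switching to user_key_payload_locked() makes explicit that dns_query() reads the payload while holding the key semaphore rather than under RCU, which is also why the const qualifier can go. A short sketch of the expected calling convention, assuming, as dns_query() does, that the payload is only touched between down_read() and up_read(); the helper name is illustrative:

#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/key.h>
#include <keys/user-type.h>

static int example_read_key(struct key *key, void *buf, size_t buflen)
{
	const struct user_key_payload *payload;
	int len;

	down_read(&key->sem);		/* the "locked" in the accessor's name */
	payload = user_key_payload_locked(key);
	len = min_t(size_t, payload->datalen, buflen);
	memcpy(buf, payload->data, len);
	up_read(&key->sem);

	return len;
}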
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 96e47c539bee..9649238eef40 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -1,12 +1,13 @@
1config HAVE_NET_DSA 1config HAVE_NET_DSA
2 def_bool y 2 def_bool y
3 depends on NETDEVICES && !S390 3 depends on INET && NETDEVICES && !S390
4 4
5# Drivers must select NET_DSA and the appropriate tagging format 5# Drivers must select NET_DSA and the appropriate tagging format
6 6
7config NET_DSA 7config NET_DSA
8 tristate "Distributed Switch Architecture" 8 tristate "Distributed Switch Architecture"
9 depends on HAVE_NET_DSA && NET_SWITCHDEV 9 depends on HAVE_NET_DSA
10 select NET_SWITCHDEV
10 select PHYLIB 11 select PHYLIB
11 ---help--- 12 ---help---
12 Say Y if you want to enable support for the hardware switches supported 13 Say Y if you want to enable support for the hardware switches supported
@@ -14,17 +15,6 @@ config NET_DSA
14 15
15if NET_DSA 16if NET_DSA
16 17
17config NET_DSA_HWMON
18 bool "Distributed Switch Architecture HWMON support"
19 default y
20 depends on HWMON && !(NET_DSA=y && HWMON=m)
21 ---help---
22 Say Y if you want to expose thermal sensor data on switches supported
23 by the Distributed Switch Architecture.
24
25 Some of those switches contain thermal sensors. This data is available
26 via the hwmon sysfs interface and exposes the onboard sensors.
27
28# tagging formats 18# tagging formats
29config NET_DSA_TAG_BRCM 19config NET_DSA_TAG_BRCM
30 bool 20 bool
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index a3380ed0e0be..31d343796251 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,6 +1,6 @@
1# the core 1# the core
2obj-$(CONFIG_NET_DSA) += dsa_core.o 2obj-$(CONFIG_NET_DSA) += dsa_core.o
3dsa_core-y += dsa.o slave.o dsa2.o 3dsa_core-y += dsa.o slave.o dsa2.o switch.o
4 4
5# tagging formats 5# tagging formats
6dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o 6dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 7899919cd9f0..b6d4f6a23f06 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -9,9 +9,7 @@
9 * (at your option) any later version. 9 * (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/ctype.h>
13#include <linux/device.h> 12#include <linux/device.h>
14#include <linux/hwmon.h>
15#include <linux/list.h> 13#include <linux/list.h>
16#include <linux/platform_device.h> 14#include <linux/platform_device.h>
17#include <linux/slab.h> 15#include <linux/slab.h>
@@ -27,8 +25,6 @@
27#include <linux/gpio/consumer.h> 25#include <linux/gpio/consumer.h>
28#include "dsa_priv.h" 26#include "dsa_priv.h"
29 27
30char dsa_driver_version[] = "0.1";
31
32static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb, 28static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb,
33 struct net_device *dev) 29 struct net_device *dev)
34{ 30{
@@ -64,27 +60,27 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = {
64static DEFINE_MUTEX(dsa_switch_drivers_mutex); 60static DEFINE_MUTEX(dsa_switch_drivers_mutex);
65static LIST_HEAD(dsa_switch_drivers); 61static LIST_HEAD(dsa_switch_drivers);
66 62
67void register_switch_driver(struct dsa_switch_ops *ops) 63void register_switch_driver(struct dsa_switch_driver *drv)
68{ 64{
69 mutex_lock(&dsa_switch_drivers_mutex); 65 mutex_lock(&dsa_switch_drivers_mutex);
70 list_add_tail(&ops->list, &dsa_switch_drivers); 66 list_add_tail(&drv->list, &dsa_switch_drivers);
71 mutex_unlock(&dsa_switch_drivers_mutex); 67 mutex_unlock(&dsa_switch_drivers_mutex);
72} 68}
73EXPORT_SYMBOL_GPL(register_switch_driver); 69EXPORT_SYMBOL_GPL(register_switch_driver);
74 70
75void unregister_switch_driver(struct dsa_switch_ops *ops) 71void unregister_switch_driver(struct dsa_switch_driver *drv)
76{ 72{
77 mutex_lock(&dsa_switch_drivers_mutex); 73 mutex_lock(&dsa_switch_drivers_mutex);
78 list_del_init(&ops->list); 74 list_del_init(&drv->list);
79 mutex_unlock(&dsa_switch_drivers_mutex); 75 mutex_unlock(&dsa_switch_drivers_mutex);
80} 76}
81EXPORT_SYMBOL_GPL(unregister_switch_driver); 77EXPORT_SYMBOL_GPL(unregister_switch_driver);
82 78
83static struct dsa_switch_ops * 79static const struct dsa_switch_ops *
84dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr, 80dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
85 const char **_name, void **priv) 81 const char **_name, void **priv)
86{ 82{
87 struct dsa_switch_ops *ret; 83 const struct dsa_switch_ops *ret;
88 struct list_head *list; 84 struct list_head *list;
89 const char *name; 85 const char *name;
90 86
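Note: with the ops table now const, the mutable list head moves into a small dsa_switch_driver wrapper, and legacy drivers register that wrapper instead of the ops themselves. A hedged sketch of the registration side; the example_* names and the empty ops table are placeholders for a real driver:

#include <linux/module.h>
#include <net/dsa.h>

static const struct dsa_switch_ops example_switch_ops = {
	/* .probe, .get_tag_protocol, .setup, ... supplied by the driver */
};

static struct dsa_switch_driver example_switch_driver = {
	.ops = &example_switch_ops,
};

static int __init example_init(void)
{
	register_switch_driver(&example_switch_driver);
	return 0;
}
module_init(example_init);

static void __exit example_exit(void)
{
	unregister_switch_driver(&example_switch_driver);
}
module_exit(example_exit);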
@@ -93,9 +89,11 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
93 89
94 mutex_lock(&dsa_switch_drivers_mutex); 90 mutex_lock(&dsa_switch_drivers_mutex);
95 list_for_each(list, &dsa_switch_drivers) { 91 list_for_each(list, &dsa_switch_drivers) {
96 struct dsa_switch_ops *ops; 92 const struct dsa_switch_ops *ops;
93 struct dsa_switch_driver *drv;
97 94
98 ops = list_entry(list, struct dsa_switch_ops, list); 95 drv = list_entry(list, struct dsa_switch_driver, list);
96 ops = drv->ops;
99 97
100 name = ops->probe(parent, host_dev, sw_addr, priv); 98 name = ops->probe(parent, host_dev, sw_addr, priv);
101 if (name != NULL) { 99 if (name != NULL) {
@@ -110,109 +108,11 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
110 return ret; 108 return ret;
111} 109}
112 110
113/* hwmon support ************************************************************/
114
115#ifdef CONFIG_NET_DSA_HWMON
116
117static ssize_t temp1_input_show(struct device *dev,
118 struct device_attribute *attr, char *buf)
119{
120 struct dsa_switch *ds = dev_get_drvdata(dev);
121 int temp, ret;
122
123 ret = ds->ops->get_temp(ds, &temp);
124 if (ret < 0)
125 return ret;
126
127 return sprintf(buf, "%d\n", temp * 1000);
128}
129static DEVICE_ATTR_RO(temp1_input);
130
131static ssize_t temp1_max_show(struct device *dev,
132 struct device_attribute *attr, char *buf)
133{
134 struct dsa_switch *ds = dev_get_drvdata(dev);
135 int temp, ret;
136
137 ret = ds->ops->get_temp_limit(ds, &temp);
138 if (ret < 0)
139 return ret;
140
141 return sprintf(buf, "%d\n", temp * 1000);
142}
143
144static ssize_t temp1_max_store(struct device *dev,
145 struct device_attribute *attr, const char *buf,
146 size_t count)
147{
148 struct dsa_switch *ds = dev_get_drvdata(dev);
149 int temp, ret;
150
151 ret = kstrtoint(buf, 0, &temp);
152 if (ret < 0)
153 return ret;
154
155 ret = ds->ops->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000));
156 if (ret < 0)
157 return ret;
158
159 return count;
160}
161static DEVICE_ATTR_RW(temp1_max);
162
163static ssize_t temp1_max_alarm_show(struct device *dev,
164 struct device_attribute *attr, char *buf)
165{
166 struct dsa_switch *ds = dev_get_drvdata(dev);
167 bool alarm;
168 int ret;
169
170 ret = ds->ops->get_temp_alarm(ds, &alarm);
171 if (ret < 0)
172 return ret;
173
174 return sprintf(buf, "%d\n", alarm);
175}
176static DEVICE_ATTR_RO(temp1_max_alarm);
177
178static struct attribute *dsa_hwmon_attrs[] = {
179 &dev_attr_temp1_input.attr, /* 0 */
180 &dev_attr_temp1_max.attr, /* 1 */
181 &dev_attr_temp1_max_alarm.attr, /* 2 */
182 NULL
183};
184
185static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj,
186 struct attribute *attr, int index)
187{
188 struct device *dev = container_of(kobj, struct device, kobj);
189 struct dsa_switch *ds = dev_get_drvdata(dev);
190 struct dsa_switch_ops *ops = ds->ops;
191 umode_t mode = attr->mode;
192
193 if (index == 1) {
194 if (!ops->get_temp_limit)
195 mode = 0;
196 else if (!ops->set_temp_limit)
197 mode &= ~S_IWUSR;
198 } else if (index == 2 && !ops->get_temp_alarm) {
199 mode = 0;
200 }
201 return mode;
202}
203
204static const struct attribute_group dsa_hwmon_group = {
205 .attrs = dsa_hwmon_attrs,
206 .is_visible = dsa_hwmon_attrs_visible,
207};
208__ATTRIBUTE_GROUPS(dsa_hwmon);
209
210#endif /* CONFIG_NET_DSA_HWMON */
211
212/* basic switch operations **************************************************/ 111/* basic switch operations **************************************************/
213int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, 112int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
214 struct device_node *port_dn, int port) 113 struct dsa_port *dport, int port)
215{ 114{
115 struct device_node *port_dn = dport->dn;
216 struct phy_device *phydev; 116 struct phy_device *phydev;
217 int ret, mode; 117 int ret, mode;
218 118
@@ -242,15 +142,15 @@ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
242 142
243static int dsa_cpu_dsa_setups(struct dsa_switch *ds, struct device *dev) 143static int dsa_cpu_dsa_setups(struct dsa_switch *ds, struct device *dev)
244{ 144{
245 struct device_node *port_dn; 145 struct dsa_port *dport;
246 int ret, port; 146 int ret, port;
247 147
248 for (port = 0; port < DSA_MAX_PORTS; port++) { 148 for (port = 0; port < ds->num_ports; port++) {
249 if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) 149 if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
250 continue; 150 continue;
251 151
252 port_dn = ds->ports[port].dn; 152 dport = &ds->ports[port];
253 ret = dsa_cpu_dsa_setup(ds, dev, port_dn, port); 153 ret = dsa_cpu_dsa_setup(ds, dev, dport, port);
254 if (ret) 154 if (ret)
255 return ret; 155 return ret;
256 } 156 }
@@ -308,7 +208,7 @@ void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds)
308 208
309static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) 209static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
310{ 210{
311 struct dsa_switch_ops *ops = ds->ops; 211 const struct dsa_switch_ops *ops = ds->ops;
312 struct dsa_switch_tree *dst = ds->dst; 212 struct dsa_switch_tree *dst = ds->dst;
313 struct dsa_chip_data *cd = ds->cd; 213 struct dsa_chip_data *cd = ds->cd;
314 bool valid_name_found = false; 214 bool valid_name_found = false;
@@ -318,7 +218,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
318 /* 218 /*
319 * Validate supplied switch configuration. 219 * Validate supplied switch configuration.
320 */ 220 */
321 for (i = 0; i < DSA_MAX_PORTS; i++) { 221 for (i = 0; i < ds->num_ports; i++) {
322 char *name; 222 char *name;
323 223
324 name = cd->port_names[i]; 224 name = cd->port_names[i];
@@ -326,13 +226,12 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
326 continue; 226 continue;
327 227
328 if (!strcmp(name, "cpu")) { 228 if (!strcmp(name, "cpu")) {
329 if (dst->cpu_switch != -1) { 229 if (dst->cpu_switch) {
330 netdev_err(dst->master_netdev, 230 netdev_err(dst->master_netdev,
331 "multiple cpu ports?!\n"); 231 "multiple cpu ports?!\n");
332 ret = -EINVAL; 232 return -EINVAL;
333 goto out;
334 } 233 }
335 dst->cpu_switch = index; 234 dst->cpu_switch = ds;
336 dst->cpu_port = i; 235 dst->cpu_port = i;
337 ds->cpu_port_mask |= 1 << i; 236 ds->cpu_port_mask |= 1 << i;
338 } else if (!strcmp(name, "dsa")) { 237 } else if (!strcmp(name, "dsa")) {
@@ -343,10 +242,8 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
343 valid_name_found = true; 242 valid_name_found = true;
344 } 243 }
345 244
346 if (!valid_name_found && i == DSA_MAX_PORTS) { 245 if (!valid_name_found && i == ds->num_ports)
347 ret = -EINVAL; 246 return -EINVAL;
348 goto out;
349 }
350 247
351 /* Make the built-in MII bus mask match the number of ports, 248 /* Make the built-in MII bus mask match the number of ports,
352 * switch drivers can override this later 249 * switch drivers can override this later
@@ -358,15 +255,13 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
358 * tagging protocol to the preferred tagging format of this 255 * tagging protocol to the preferred tagging format of this
359 * switch. 256 * switch.
360 */ 257 */
361 if (dst->cpu_switch == index) { 258 if (dst->cpu_switch == ds) {
362 enum dsa_tag_protocol tag_protocol; 259 enum dsa_tag_protocol tag_protocol;
363 260
364 tag_protocol = ops->get_tag_protocol(ds); 261 tag_protocol = ops->get_tag_protocol(ds);
365 dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); 262 dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol);
366 if (IS_ERR(dst->tag_ops)) { 263 if (IS_ERR(dst->tag_ops))
367 ret = PTR_ERR(dst->tag_ops); 264 return PTR_ERR(dst->tag_ops);
368 goto out;
369 }
370 265
371 dst->rcv = dst->tag_ops->rcv; 266 dst->rcv = dst->tag_ops->rcv;
372 } 267 }
@@ -378,85 +273,55 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
378 */ 273 */
379 ret = ops->setup(ds); 274 ret = ops->setup(ds);
380 if (ret < 0) 275 if (ret < 0)
381 goto out; 276 return ret;
277
278 ret = dsa_switch_register_notifier(ds);
279 if (ret)
280 return ret;
382 281
383 if (ops->set_addr) { 282 if (ops->set_addr) {
384 ret = ops->set_addr(ds, dst->master_netdev->dev_addr); 283 ret = ops->set_addr(ds, dst->master_netdev->dev_addr);
385 if (ret < 0) 284 if (ret < 0)
386 goto out; 285 return ret;
387 } 286 }
388 287
389 if (!ds->slave_mii_bus && ops->phy_read) { 288 if (!ds->slave_mii_bus && ops->phy_read) {
390 ds->slave_mii_bus = devm_mdiobus_alloc(parent); 289 ds->slave_mii_bus = devm_mdiobus_alloc(parent);
391 if (!ds->slave_mii_bus) { 290 if (!ds->slave_mii_bus)
392 ret = -ENOMEM; 291 return -ENOMEM;
393 goto out;
394 }
395 dsa_slave_mii_bus_init(ds); 292 dsa_slave_mii_bus_init(ds);
396 293
397 ret = mdiobus_register(ds->slave_mii_bus); 294 ret = mdiobus_register(ds->slave_mii_bus);
398 if (ret < 0) 295 if (ret < 0)
399 goto out; 296 return ret;
400 } 297 }
401 298
402 /* 299 /*
403 * Create network devices for physical switch ports. 300 * Create network devices for physical switch ports.
404 */ 301 */
405 for (i = 0; i < DSA_MAX_PORTS; i++) { 302 for (i = 0; i < ds->num_ports; i++) {
406 ds->ports[i].dn = cd->port_dn[i]; 303 ds->ports[i].dn = cd->port_dn[i];
407 304
408 if (!(ds->enabled_port_mask & (1 << i))) 305 if (!(ds->enabled_port_mask & (1 << i)))
409 continue; 306 continue;
410 307
411 ret = dsa_slave_create(ds, parent, i, cd->port_names[i]); 308 ret = dsa_slave_create(ds, parent, i, cd->port_names[i]);
412 if (ret < 0) { 309 if (ret < 0)
413 netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s): %d\n", 310 netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s): %d\n",
414 index, i, cd->port_names[i], ret); 311 index, i, cd->port_names[i], ret);
415 ret = 0;
416 }
417 } 312 }
418 313
419 /* Perform configuration of the CPU and DSA ports */ 314 /* Perform configuration of the CPU and DSA ports */
420 ret = dsa_cpu_dsa_setups(ds, parent); 315 ret = dsa_cpu_dsa_setups(ds, parent);
421 if (ret < 0) { 316 if (ret < 0)
422 netdev_err(dst->master_netdev, "[%d] : can't configure CPU and DSA ports\n", 317 netdev_err(dst->master_netdev, "[%d] : can't configure CPU and DSA ports\n",
423 index); 318 index);
424 ret = 0;
425 }
426 319
427 ret = dsa_cpu_port_ethtool_setup(ds); 320 ret = dsa_cpu_port_ethtool_setup(ds);
428 if (ret) 321 if (ret)
429 return ret; 322 return ret;
430 323
431#ifdef CONFIG_NET_DSA_HWMON 324 return 0;
432 /* If the switch provides a temperature sensor,
433 * register with hardware monitoring subsystem.
434 * Treat registration error as non-fatal and ignore it.
435 */
436 if (ops->get_temp) {
437 const char *netname = netdev_name(dst->master_netdev);
438 char hname[IFNAMSIZ + 1];
439 int i, j;
440
441 /* Create valid hwmon 'name' attribute */
442 for (i = j = 0; i < IFNAMSIZ && netname[i]; i++) {
443 if (isalnum(netname[i]))
444 hname[j++] = netname[i];
445 }
446 hname[j] = '\0';
447 scnprintf(ds->hwmon_name, sizeof(ds->hwmon_name), "%s_dsa%d",
448 hname, index);
449 ds->hwmon_dev = hwmon_device_register_with_groups(NULL,
450 ds->hwmon_name, ds, dsa_hwmon_groups);
451 if (IS_ERR(ds->hwmon_dev))
452 ds->hwmon_dev = NULL;
453 }
454#endif /* CONFIG_NET_DSA_HWMON */
455
456 return ret;
457
458out:
459 return ret;
460} 325}
461 326
462static struct dsa_switch * 327static struct dsa_switch *
@@ -464,7 +329,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
464 struct device *parent, struct device *host_dev) 329 struct device *parent, struct device *host_dev)
465{ 330{
466 struct dsa_chip_data *cd = dst->pd->chip + index; 331 struct dsa_chip_data *cd = dst->pd->chip + index;
467 struct dsa_switch_ops *ops; 332 const struct dsa_switch_ops *ops;
468 struct dsa_switch *ds; 333 struct dsa_switch *ds;
469 int ret; 334 int ret;
470 const char *name; 335 const char *name;
@@ -486,8 +351,8 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
486 /* 351 /*
487 * Allocate and initialise switch state. 352 * Allocate and initialise switch state.
488 */ 353 */
489 ds = devm_kzalloc(parent, sizeof(*ds), GFP_KERNEL); 354 ds = dsa_switch_alloc(parent, DSA_MAX_PORTS);
490 if (ds == NULL) 355 if (!ds)
491 return ERR_PTR(-ENOMEM); 356 return ERR_PTR(-ENOMEM);
492 357
493 ds->dst = dst; 358 ds->dst = dst;
@@ -495,7 +360,6 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
495 ds->cd = cd; 360 ds->cd = cd;
496 ds->ops = ops; 361 ds->ops = ops;
497 ds->priv = priv; 362 ds->priv = priv;
498 ds->dev = parent;
499 363
500 ret = dsa_switch_setup_one(ds, parent); 364 ret = dsa_switch_setup_one(ds, parent);
501 if (ret) 365 if (ret)
@@ -504,8 +368,10 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
504 return ds; 368 return ds;
505} 369}
506 370
507void dsa_cpu_dsa_destroy(struct device_node *port_dn) 371void dsa_cpu_dsa_destroy(struct dsa_port *port)
508{ 372{
373 struct device_node *port_dn = port->dn;
374
509 if (of_phy_is_fixed_link(port_dn)) 375 if (of_phy_is_fixed_link(port_dn))
510 of_phy_deregister_fixed_link(port_dn); 376 of_phy_deregister_fixed_link(port_dn);
511} 377}
@@ -514,13 +380,8 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
514{ 380{
515 int port; 381 int port;
516 382
517#ifdef CONFIG_NET_DSA_HWMON
518 if (ds->hwmon_dev)
519 hwmon_device_unregister(ds->hwmon_dev);
520#endif
521
522 /* Destroy network devices for physical switch ports. */ 383 /* Destroy network devices for physical switch ports. */
523 for (port = 0; port < DSA_MAX_PORTS; port++) { 384 for (port = 0; port < ds->num_ports; port++) {
524 if (!(ds->enabled_port_mask & (1 << port))) 385 if (!(ds->enabled_port_mask & (1 << port)))
525 continue; 386 continue;
526 387
@@ -531,10 +392,10 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
531 } 392 }
532 393
533 /* Disable configuration of the CPU and DSA ports */ 394 /* Disable configuration of the CPU and DSA ports */
534 for (port = 0; port < DSA_MAX_PORTS; port++) { 395 for (port = 0; port < ds->num_ports; port++) {
535 if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) 396 if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
536 continue; 397 continue;
537 dsa_cpu_dsa_destroy(ds->ports[port].dn); 398 dsa_cpu_dsa_destroy(&ds->ports[port]);
538 399
539 /* Clearing a bit which is not set does no harm */ 400 /* Clearing a bit which is not set does no harm */
540 ds->cpu_port_mask |= ~(1 << port); 401 ds->cpu_port_mask |= ~(1 << port);
@@ -543,6 +404,8 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
543 404
544 if (ds->slave_mii_bus && ds->ops->phy_read) 405 if (ds->slave_mii_bus && ds->ops->phy_read)
545 mdiobus_unregister(ds->slave_mii_bus); 406 mdiobus_unregister(ds->slave_mii_bus);
407
408 dsa_switch_unregister_notifier(ds);
546} 409}
547 410
548#ifdef CONFIG_PM_SLEEP 411#ifdef CONFIG_PM_SLEEP
@@ -551,7 +414,7 @@ int dsa_switch_suspend(struct dsa_switch *ds)
551 int i, ret = 0; 414 int i, ret = 0;
552 415
553 /* Suspend slave network devices */ 416 /* Suspend slave network devices */
554 for (i = 0; i < DSA_MAX_PORTS; i++) { 417 for (i = 0; i < ds->num_ports; i++) {
555 if (!dsa_is_port_initialized(ds, i)) 418 if (!dsa_is_port_initialized(ds, i))
556 continue; 419 continue;
557 420
@@ -578,7 +441,7 @@ int dsa_switch_resume(struct dsa_switch *ds)
578 return ret; 441 return ret;
579 442
580 /* Resume slave network devices */ 443 /* Resume slave network devices */
581 for (i = 0; i < DSA_MAX_PORTS; i++) { 444 for (i = 0; i < ds->num_ports; i++) {
582 if (!dsa_is_port_initialized(ds, i)) 445 if (!dsa_is_port_initialized(ds, i))
583 continue; 446 continue;
584 447
@@ -629,7 +492,7 @@ struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev)
629} 492}
630EXPORT_SYMBOL_GPL(dsa_host_dev_to_mii_bus); 493EXPORT_SYMBOL_GPL(dsa_host_dev_to_mii_bus);
631 494
632static struct net_device *dev_to_net_device(struct device *dev) 495struct net_device *dsa_dev_to_net_device(struct device *dev)
633{ 496{
634 struct device *d; 497 struct device *d;
635 498
@@ -646,6 +509,7 @@ static struct net_device *dev_to_net_device(struct device *dev)
646 509
647 return NULL; 510 return NULL;
648} 511}
512EXPORT_SYMBOL_GPL(dsa_dev_to_net_device);
649 513
650#ifdef CONFIG_OF 514#ifdef CONFIG_OF
651static int dsa_of_setup_routing_table(struct dsa_platform_data *pd, 515static int dsa_of_setup_routing_table(struct dsa_platform_data *pd,
@@ -898,7 +762,6 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev,
898 762
899 dst->pd = pd; 763 dst->pd = pd;
900 dst->master_netdev = dev; 764 dst->master_netdev = dev;
901 dst->cpu_switch = -1;
902 dst->cpu_port = -1; 765 dst->cpu_port = -1;
903 766
904 for (i = 0; i < pd->nr_chips; i++) { 767 for (i = 0; i < pd->nr_chips; i++) {
@@ -940,9 +803,6 @@ static int dsa_probe(struct platform_device *pdev)
940 struct dsa_switch_tree *dst; 803 struct dsa_switch_tree *dst;
941 int ret; 804 int ret;
942 805
943 pr_notice_once("Distributed Switch Architecture driver version %s\n",
944 dsa_driver_version);
945
946 if (pdev->dev.of_node) { 806 if (pdev->dev.of_node) {
947 ret = dsa_of_probe(&pdev->dev); 807 ret = dsa_of_probe(&pdev->dev);
948 if (ret) 808 if (ret)
@@ -958,7 +818,7 @@ static int dsa_probe(struct platform_device *pdev)
958 dev = pd->of_netdev; 818 dev = pd->of_netdev;
959 dev_hold(dev); 819 dev_hold(dev);
960 } else { 820 } else {
961 dev = dev_to_net_device(pd->netdev); 821 dev = dsa_dev_to_net_device(pd->netdev);
962 } 822 }
963 if (dev == NULL) { 823 if (dev == NULL) {
964 ret = -EPROBE_DEFER; 824 ret = -EPROBE_DEFER;
@@ -1013,7 +873,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst)
1013 dsa_switch_destroy(ds); 873 dsa_switch_destroy(ds);
1014 } 874 }
1015 875
1016 dsa_cpu_port_ethtool_restore(dst->ds[0]); 876 dsa_cpu_port_ethtool_restore(dst->cpu_switch);
1017 877
1018 dev_put(dst->master_netdev); 878 dev_put(dst->master_netdev);
1019} 879}
@@ -1050,10 +910,6 @@ static struct packet_type dsa_pack_type __read_mostly = {
1050 .func = dsa_switch_rcv, 910 .func = dsa_switch_rcv,
1051}; 911};
1052 912
1053static struct notifier_block dsa_netdevice_nb __read_mostly = {
1054 .notifier_call = dsa_slave_netdevice_event,
1055};
1056
1057#ifdef CONFIG_PM_SLEEP 913#ifdef CONFIG_PM_SLEEP
1058static int dsa_suspend(struct device *d) 914static int dsa_suspend(struct device *d)
1059{ 915{
@@ -1111,7 +967,9 @@ static int __init dsa_init_module(void)
1111{ 967{
1112 int rc; 968 int rc;
1113 969
1114 register_netdevice_notifier(&dsa_netdevice_nb); 970 rc = dsa_slave_register_notifier();
971 if (rc)
972 return rc;
1115 973
1116 rc = platform_driver_register(&dsa_driver); 974 rc = platform_driver_register(&dsa_driver);
1117 if (rc) 975 if (rc)
@@ -1125,7 +983,7 @@ module_init(dsa_init_module);
1125 983
1126static void __exit dsa_cleanup_module(void) 984static void __exit dsa_cleanup_module(void)
1127{ 985{
1128 unregister_netdevice_notifier(&dsa_netdevice_nb); 986 dsa_slave_unregister_notifier();
1129 dev_remove_pack(&dsa_pack_type); 987 dev_remove_pack(&dsa_pack_type);
1130 platform_driver_unregister(&dsa_driver); 988 platform_driver_unregister(&dsa_driver);
1131} 989}
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 5fff951a0a49..737be6470c7f 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -57,7 +57,6 @@ static struct dsa_switch_tree *dsa_add_dst(u32 tree)
57 if (!dst) 57 if (!dst)
58 return NULL; 58 return NULL;
59 dst->tree = tree; 59 dst->tree = tree;
60 dst->cpu_switch = -1;
61 INIT_LIST_HEAD(&dst->list); 60 INIT_LIST_HEAD(&dst->list);
62 list_add_tail(&dsa_switch_trees, &dst->list); 61 list_add_tail(&dsa_switch_trees, &dst->list);
63 kref_init(&dst->refcount); 62 kref_init(&dst->refcount);
@@ -79,47 +78,43 @@ static void dsa_dst_del_ds(struct dsa_switch_tree *dst,
79 kref_put(&dst->refcount, dsa_free_dst); 78 kref_put(&dst->refcount, dsa_free_dst);
80} 79}
81 80
82static bool dsa_port_is_dsa(struct device_node *port) 81/* For platform data configurations, we need to have a valid name argument to
82 * differentiate a disabled port from an enabled one
83 */
84static bool dsa_port_is_valid(struct dsa_port *port)
83{ 85{
84 const char *name; 86 return !!(port->dn || port->name);
85 87}
86 name = of_get_property(port, "label", NULL);
87 if (!name)
88 return false;
89 88
90 if (!strcmp(name, "dsa")) 89static bool dsa_port_is_dsa(struct dsa_port *port)
90{
91 if (port->name && !strcmp(port->name, "dsa"))
91 return true; 92 return true;
92 93 else
93 return false; 94 return !!of_parse_phandle(port->dn, "link", 0);
94} 95}
95 96
96static bool dsa_port_is_cpu(struct device_node *port) 97static bool dsa_port_is_cpu(struct dsa_port *port)
97{ 98{
98 const char *name; 99 if (port->name && !strcmp(port->name, "cpu"))
99
100 name = of_get_property(port, "label", NULL);
101 if (!name)
102 return false;
103
104 if (!strcmp(name, "cpu"))
105 return true; 100 return true;
106 101 else
107 return false; 102 return !!of_parse_phandle(port->dn, "ethernet", 0);
108} 103}
109 104
110static bool dsa_ds_find_port(struct dsa_switch *ds, 105static bool dsa_ds_find_port_dn(struct dsa_switch *ds,
111 struct device_node *port) 106 struct device_node *port)
112{ 107{
113 u32 index; 108 u32 index;
114 109
115 for (index = 0; index < DSA_MAX_PORTS; index++) 110 for (index = 0; index < ds->num_ports; index++)
116 if (ds->ports[index].dn == port) 111 if (ds->ports[index].dn == port)
117 return true; 112 return true;
118 return false; 113 return false;
119} 114}
120 115
121static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst, 116static struct dsa_switch *dsa_dst_find_port_dn(struct dsa_switch_tree *dst,
122 struct device_node *port) 117 struct device_node *port)
123{ 118{
124 struct dsa_switch *ds; 119 struct dsa_switch *ds;
125 u32 index; 120 u32 index;
@@ -129,7 +124,7 @@ static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst,
129 if (!ds) 124 if (!ds)
130 continue; 125 continue;
131 126
132 if (dsa_ds_find_port(ds, port)) 127 if (dsa_ds_find_port_dn(ds, port))
133 return ds; 128 return ds;
134 } 129 }
135 130
@@ -138,7 +133,7 @@ static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst,
138 133
139static int dsa_port_complete(struct dsa_switch_tree *dst, 134static int dsa_port_complete(struct dsa_switch_tree *dst,
140 struct dsa_switch *src_ds, 135 struct dsa_switch *src_ds,
141 struct device_node *port, 136 struct dsa_port *port,
142 u32 src_port) 137 u32 src_port)
143{ 138{
144 struct device_node *link; 139 struct device_node *link;
@@ -146,11 +141,11 @@ static int dsa_port_complete(struct dsa_switch_tree *dst,
146 struct dsa_switch *dst_ds; 141 struct dsa_switch *dst_ds;
147 142
148 for (index = 0;; index++) { 143 for (index = 0;; index++) {
149 link = of_parse_phandle(port, "link", index); 144 link = of_parse_phandle(port->dn, "link", index);
150 if (!link) 145 if (!link)
151 break; 146 break;
152 147
153 dst_ds = dsa_dst_find_port(dst, link); 148 dst_ds = dsa_dst_find_port_dn(dst, link);
154 of_node_put(link); 149 of_node_put(link);
155 150
156 if (!dst_ds) 151 if (!dst_ds)
@@ -169,13 +164,13 @@ static int dsa_port_complete(struct dsa_switch_tree *dst,
169 */ 164 */
170static int dsa_ds_complete(struct dsa_switch_tree *dst, struct dsa_switch *ds) 165static int dsa_ds_complete(struct dsa_switch_tree *dst, struct dsa_switch *ds)
171{ 166{
172 struct device_node *port; 167 struct dsa_port *port;
173 u32 index; 168 u32 index;
174 int err; 169 int err;
175 170
176 for (index = 0; index < DSA_MAX_PORTS; index++) { 171 for (index = 0; index < ds->num_ports; index++) {
177 port = ds->ports[index].dn; 172 port = &ds->ports[index];
178 if (!port) 173 if (!dsa_port_is_valid(port))
179 continue; 174 continue;
180 175
181 if (!dsa_port_is_dsa(port)) 176 if (!dsa_port_is_dsa(port))
@@ -215,7 +210,7 @@ static int dsa_dst_complete(struct dsa_switch_tree *dst)
215 return 0; 210 return 0;
216} 211}
217 212
218static int dsa_dsa_port_apply(struct device_node *port, u32 index, 213static int dsa_dsa_port_apply(struct dsa_port *port, u32 index,
219 struct dsa_switch *ds) 214 struct dsa_switch *ds)
220{ 215{
221 int err; 216 int err;
@@ -230,13 +225,13 @@ static int dsa_dsa_port_apply(struct device_node *port, u32 index,
230 return 0; 225 return 0;
231} 226}
232 227
233static void dsa_dsa_port_unapply(struct device_node *port, u32 index, 228static void dsa_dsa_port_unapply(struct dsa_port *port, u32 index,
234 struct dsa_switch *ds) 229 struct dsa_switch *ds)
235{ 230{
236 dsa_cpu_dsa_destroy(port); 231 dsa_cpu_dsa_destroy(port);
237} 232}
238 233
239static int dsa_cpu_port_apply(struct device_node *port, u32 index, 234static int dsa_cpu_port_apply(struct dsa_port *port, u32 index,
240 struct dsa_switch *ds) 235 struct dsa_switch *ds)
241{ 236{
242 int err; 237 int err;
@@ -253,7 +248,7 @@ static int dsa_cpu_port_apply(struct device_node *port, u32 index,
253 return 0; 248 return 0;
254} 249}
255 250
256static void dsa_cpu_port_unapply(struct device_node *port, u32 index, 251static void dsa_cpu_port_unapply(struct dsa_port *port, u32 index,
257 struct dsa_switch *ds) 252 struct dsa_switch *ds)
258{ 253{
259 dsa_cpu_dsa_destroy(port); 254 dsa_cpu_dsa_destroy(port);
@@ -261,25 +256,29 @@ static void dsa_cpu_port_unapply(struct device_node *port, u32 index,
261 256
262} 257}
263 258
264static int dsa_user_port_apply(struct device_node *port, u32 index, 259static int dsa_user_port_apply(struct dsa_port *port, u32 index,
265 struct dsa_switch *ds) 260 struct dsa_switch *ds)
266{ 261{
267 const char *name; 262 const char *name = port->name;
268 int err; 263 int err;
269 264
270 name = of_get_property(port, "label", NULL); 265 if (port->dn)
266 name = of_get_property(port->dn, "label", NULL);
267 if (!name)
268 name = "eth%d";
271 269
272 err = dsa_slave_create(ds, ds->dev, index, name); 270 err = dsa_slave_create(ds, ds->dev, index, name);
273 if (err) { 271 if (err) {
274 dev_warn(ds->dev, "Failed to create slave %d: %d\n", 272 dev_warn(ds->dev, "Failed to create slave %d: %d\n",
275 index, err); 273 index, err);
274 ds->ports[index].netdev = NULL;
276 return err; 275 return err;
277 } 276 }
278 277
279 return 0; 278 return 0;
280} 279}
281 280
282static void dsa_user_port_unapply(struct device_node *port, u32 index, 281static void dsa_user_port_unapply(struct dsa_port *port, u32 index,
283 struct dsa_switch *ds) 282 struct dsa_switch *ds)
284{ 283{
285 if (ds->ports[index].netdev) { 284 if (ds->ports[index].netdev) {
@@ -291,7 +290,7 @@ static void dsa_user_port_unapply(struct device_node *port, u32 index,
291 290
292static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) 291static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
293{ 292{
294 struct device_node *port; 293 struct dsa_port *port;
295 u32 index; 294 u32 index;
296 int err; 295 int err;
297 296
@@ -306,6 +305,10 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
306 if (err < 0) 305 if (err < 0)
307 return err; 306 return err;
308 307
308 err = dsa_switch_register_notifier(ds);
309 if (err)
310 return err;
311
309 if (ds->ops->set_addr) { 312 if (ds->ops->set_addr) {
310 err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr); 313 err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr);
311 if (err < 0) 314 if (err < 0)
@@ -324,9 +327,9 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
324 return err; 327 return err;
325 } 328 }
326 329
327 for (index = 0; index < DSA_MAX_PORTS; index++) { 330 for (index = 0; index < ds->num_ports; index++) {
328 port = ds->ports[index].dn; 331 port = &ds->ports[index];
329 if (!port) 332 if (!dsa_port_is_valid(port))
330 continue; 333 continue;
331 334
332 if (dsa_port_is_dsa(port)) { 335 if (dsa_port_is_dsa(port)) {
@@ -353,12 +356,12 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
353 356
354static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds) 357static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
355{ 358{
356 struct device_node *port; 359 struct dsa_port *port;
357 u32 index; 360 u32 index;
358 361
359 for (index = 0; index < DSA_MAX_PORTS; index++) { 362 for (index = 0; index < ds->num_ports; index++) {
360 port = ds->ports[index].dn; 363 port = &ds->ports[index];
361 if (!port) 364 if (!dsa_port_is_valid(port))
362 continue; 365 continue;
363 366
364 if (dsa_port_is_dsa(port)) { 367 if (dsa_port_is_dsa(port)) {
@@ -376,6 +379,8 @@ static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
376 379
377 if (ds->slave_mii_bus && ds->ops->phy_read) 380 if (ds->slave_mii_bus && ds->ops->phy_read)
378 mdiobus_unregister(ds->slave_mii_bus); 381 mdiobus_unregister(ds->slave_mii_bus);
382
383 dsa_switch_unregister_notifier(ds);
379} 384}
380 385
381static int dsa_dst_apply(struct dsa_switch_tree *dst) 386static int dsa_dst_apply(struct dsa_switch_tree *dst)
@@ -394,9 +399,11 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst)
394 return err; 399 return err;
395 } 400 }
396 401
397 err = dsa_cpu_port_ethtool_setup(dst->ds[0]); 402 if (dst->cpu_switch) {
398 if (err) 403 err = dsa_cpu_port_ethtool_setup(dst->cpu_switch);
399 return err; 404 if (err)
405 return err;
406 }
400 407
401 /* If we use a tagging format that doesn't have an ethertype 408 /* If we use a tagging format that doesn't have an ethertype
402 * field, make sure that all packets from this point on get 409 * field, make sure that all packets from this point on get
@@ -433,13 +440,14 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst)
433 dsa_ds_unapply(dst, ds); 440 dsa_ds_unapply(dst, ds);
434 } 441 }
435 442
436 dsa_cpu_port_ethtool_restore(dst->ds[0]); 443 if (dst->cpu_switch)
444 dsa_cpu_port_ethtool_restore(dst->cpu_switch);
437 445
438 pr_info("DSA: tree %d unapplied\n", dst->tree); 446 pr_info("DSA: tree %d unapplied\n", dst->tree);
439 dst->applied = false; 447 dst->applied = false;
440} 448}
441 449
442static int dsa_cpu_parse(struct device_node *port, u32 index, 450static int dsa_cpu_parse(struct dsa_port *port, u32 index,
443 struct dsa_switch_tree *dst, 451 struct dsa_switch_tree *dst,
444 struct dsa_switch *ds) 452 struct dsa_switch *ds)
445{ 453{
@@ -447,11 +455,16 @@ static int dsa_cpu_parse(struct device_node *port, u32 index,
447 struct net_device *ethernet_dev; 455 struct net_device *ethernet_dev;
448 struct device_node *ethernet; 456 struct device_node *ethernet;
449 457
450 ethernet = of_parse_phandle(port, "ethernet", 0); 458 if (port->dn) {
451 if (!ethernet) 459 ethernet = of_parse_phandle(port->dn, "ethernet", 0);
452 return -EINVAL; 460 if (!ethernet)
461 return -EINVAL;
462 ethernet_dev = of_find_net_device_by_node(ethernet);
463 } else {
464 ethernet_dev = dsa_dev_to_net_device(ds->cd->netdev[index]);
465 dev_put(ethernet_dev);
466 }
453 467
454 ethernet_dev = of_find_net_device_by_node(ethernet);
455 if (!ethernet_dev) 468 if (!ethernet_dev)
456 return -EPROBE_DEFER; 469 return -EPROBE_DEFER;
457 470
@@ -461,8 +474,8 @@ static int dsa_cpu_parse(struct device_node *port, u32 index,
461 if (!dst->master_netdev) 474 if (!dst->master_netdev)
462 dst->master_netdev = ethernet_dev; 475 dst->master_netdev = ethernet_dev;
463 476
464 if (dst->cpu_switch == -1) { 477 if (!dst->cpu_switch) {
465 dst->cpu_switch = ds->index; 478 dst->cpu_switch = ds;
466 dst->cpu_port = index; 479 dst->cpu_port = index;
467 } 480 }
468 481
@@ -480,13 +493,13 @@ static int dsa_cpu_parse(struct device_node *port, u32 index,
480 493
481static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds) 494static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds)
482{ 495{
483 struct device_node *port; 496 struct dsa_port *port;
484 u32 index; 497 u32 index;
485 int err; 498 int err;
486 499
487 for (index = 0; index < DSA_MAX_PORTS; index++) { 500 for (index = 0; index < ds->num_ports; index++) {
488 port = ds->ports[index].dn; 501 port = &ds->ports[index];
489 if (!port) 502 if (!dsa_port_is_valid(port))
490 continue; 503 continue;
491 504
492 if (dsa_port_is_cpu(port)) { 505 if (dsa_port_is_cpu(port)) {
@@ -538,7 +551,7 @@ static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds)
538 if (err) 551 if (err)
539 return err; 552 return err;
540 553
541 if (reg >= DSA_MAX_PORTS) 554 if (reg >= ds->num_ports)
542 return -EINVAL; 555 return -EINVAL;
543 556
544 ds->ports[reg].dn = port; 557 ds->ports[reg].dn = port;
@@ -547,14 +560,41 @@ static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds)
547 * to have access to a correct value, just like what 560 * to have access to a correct value, just like what
548 * net/dsa/dsa.c::dsa_switch_setup_one does. 561 * net/dsa/dsa.c::dsa_switch_setup_one does.
549 */ 562 */
550 if (!dsa_port_is_cpu(port)) 563 if (!dsa_port_is_cpu(&ds->ports[reg]))
551 ds->enabled_port_mask |= 1 << reg; 564 ds->enabled_port_mask |= 1 << reg;
552 } 565 }
553 566
554 return 0; 567 return 0;
555} 568}
556 569
557static int dsa_parse_member(struct device_node *np, u32 *tree, u32 *index) 570static int dsa_parse_ports(struct dsa_chip_data *cd, struct dsa_switch *ds)
571{
572 bool valid_name_found = false;
573 unsigned int i;
574
575 for (i = 0; i < DSA_MAX_PORTS; i++) {
576 if (!cd->port_names[i])
577 continue;
578
579 ds->ports[i].name = cd->port_names[i];
580
581 /* Initialize enabled_port_mask now for drv->setup()
582 * to have access to a correct value, just like what
583 * net/dsa/dsa.c::dsa_switch_setup_one does.
584 */
585 if (!dsa_port_is_cpu(&ds->ports[i]))
586 ds->enabled_port_mask |= 1 << i;
587
588 valid_name_found = true;
589 }
590
591 if (!valid_name_found && i == DSA_MAX_PORTS)
592 return -EINVAL;
593
594 return 0;
595}
596
597static int dsa_parse_member_dn(struct device_node *np, u32 *tree, u32 *index)
558{ 598{
559 int err; 599 int err;
560 600
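Note: for the platform-data path added here, a port is considered valid as soon as it has a name, "cpu" selects the upstream port, and the per-port netdev pointer identifies the device behind the master interface. A hedged sketch of what such a dsa_chip_data description might look like; the device reference and the port layout are placeholders:

#include <net/dsa.h>

extern struct device example_master_dev;	/* device of the DSA master interface */

static struct dsa_chip_data example_chip_data = {
	.port_names[0] = "lan1",
	.port_names[1] = "lan2",
	.port_names[5] = "cpu",
	.netdev[5]     = &example_master_dev,
};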
@@ -578,6 +618,18 @@ static int dsa_parse_member(struct device_node *np, u32 *tree, u32 *index)
578 return 0; 618 return 0;
579} 619}
580 620
621static int dsa_parse_member(struct dsa_chip_data *pd, u32 *tree, u32 *index)
622{
623 if (!pd)
624 return -ENODEV;
625
626 /* We do not support complex trees with dsa_chip_data */
627 *tree = 0;
628 *index = 0;
629
630 return 0;
631}
632
581static struct device_node *dsa_get_ports(struct dsa_switch *ds, 633static struct device_node *dsa_get_ports(struct dsa_switch *ds,
582 struct device_node *np) 634 struct device_node *np)
583{ 635{
@@ -592,23 +644,36 @@ static struct device_node *dsa_get_ports(struct dsa_switch *ds,
592 return ports; 644 return ports;
593} 645}
594 646
595static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np) 647static int _dsa_register_switch(struct dsa_switch *ds, struct device *dev)
596{ 648{
597 struct device_node *ports = dsa_get_ports(ds, np); 649 struct dsa_chip_data *pdata = dev->platform_data;
650 struct device_node *np = dev->of_node;
598 struct dsa_switch_tree *dst; 651 struct dsa_switch_tree *dst;
652 struct device_node *ports;
599 u32 tree, index; 653 u32 tree, index;
600 int i, err; 654 int i, err;
601 655
602 err = dsa_parse_member(np, &tree, &index); 656 if (np) {
603 if (err) 657 err = dsa_parse_member_dn(np, &tree, &index);
604 return err; 658 if (err)
659 return err;
605 660
606 if (IS_ERR(ports)) 661 ports = dsa_get_ports(ds, np);
607 return PTR_ERR(ports); 662 if (IS_ERR(ports))
663 return PTR_ERR(ports);
608 664
609 err = dsa_parse_ports_dn(ports, ds); 665 err = dsa_parse_ports_dn(ports, ds);
610 if (err) 666 if (err)
611 return err; 667 return err;
668 } else {
669 err = dsa_parse_member(pdata, &tree, &index);
670 if (err)
671 return err;
672
673 err = dsa_parse_ports(pdata, ds);
674 if (err)
675 return err;
676 }
612 677
613 dst = dsa_get_dst(tree); 678 dst = dsa_get_dst(tree);
614 if (!dst) { 679 if (!dst) {
@@ -624,6 +689,7 @@ static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np)
624 689
625 ds->dst = dst; 690 ds->dst = dst;
626 ds->index = index; 691 ds->index = index;
692 ds->cd = pdata;
627 693
628 /* Initialize the routing table */ 694 /* Initialize the routing table */
629 for (i = 0; i < DSA_MAX_SWITCHES; ++i) 695 for (i = 0; i < DSA_MAX_SWITCHES; ++i)
@@ -647,8 +713,14 @@ static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np)
647 } 713 }
648 714
649 err = dsa_dst_parse(dst); 715 err = dsa_dst_parse(dst);
650 if (err) 716 if (err) {
717 if (err == -EPROBE_DEFER) {
718 dsa_dst_del_ds(dst, ds, ds->index);
719 return err;
720 }
721
651 goto out_del_dst; 722 goto out_del_dst;
723 }
652 724
653 err = dsa_dst_apply(dst); 725 err = dsa_dst_apply(dst);
654 if (err) { 726 if (err) {
@@ -667,12 +739,34 @@ out:
667 return err; 739 return err;
668} 740}
669 741
670int dsa_register_switch(struct dsa_switch *ds, struct device_node *np) 742struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
743{
744 size_t size = sizeof(struct dsa_switch) + n * sizeof(struct dsa_port);
745 struct dsa_switch *ds;
746 int i;
747
748 ds = devm_kzalloc(dev, size, GFP_KERNEL);
749 if (!ds)
750 return NULL;
751
752 ds->dev = dev;
753 ds->num_ports = n;
754
755 for (i = 0; i < ds->num_ports; ++i) {
756 ds->ports[i].index = i;
757 ds->ports[i].ds = ds;
758 }
759
760 return ds;
761}
762EXPORT_SYMBOL_GPL(dsa_switch_alloc);
763
764int dsa_register_switch(struct dsa_switch *ds, struct device *dev)
671{ 765{
672 int err; 766 int err;
673 767
674 mutex_lock(&dsa2_mutex); 768 mutex_lock(&dsa2_mutex);
675 err = _dsa_register_switch(ds, np); 769 err = _dsa_register_switch(ds, dev);
676 mutex_unlock(&dsa2_mutex); 770 mutex_unlock(&dsa2_mutex);
677 771
678 return err; 772 return err;
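Note: dsa_switch_alloc() now sizes the switch for its real port count and pre-fills each dsa_port with its index and back-pointer, while dsa_register_switch() takes the device so it can fall back to platform data when there is no OF node. A hedged sketch of the resulting probe flow in a driver; everything prefixed example_ and the port count are placeholders:

#include <linux/platform_device.h>
#include <net/dsa.h>

#define EXAMPLE_NUM_PORTS	7	/* placeholder port count */

static const struct dsa_switch_ops example_switch_ops;	/* filled in by the driver */

static int example_probe(struct platform_device *pdev)
{
	struct dsa_switch *ds;

	ds = dsa_switch_alloc(&pdev->dev, EXAMPLE_NUM_PORTS);
	if (!ds)
		return -ENOMEM;

	ds->ops = &example_switch_ops;
	ds->priv = dev_get_platdata(&pdev->dev);	/* driver-private data, placeholder */

	return dsa_register_switch(ds, &pdev->dev);
}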
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 6cfd7388834e..0706a511244e 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -25,12 +25,8 @@ struct dsa_slave_priv {
25 struct sk_buff * (*xmit)(struct sk_buff *skb, 25 struct sk_buff * (*xmit)(struct sk_buff *skb,
26 struct net_device *dev); 26 struct net_device *dev);
27 27
28 /* 28 /* DSA port data, such as switch, port index, etc. */
29 * Which switch this port is a part of, and the port index 29 struct dsa_port *dp;
30 * for this port.
31 */
32 struct dsa_switch *parent;
33 u8 port;
34 30
35 /* 31 /*
36 * The phylib phy_device pointer for the PHY connected 32 * The phylib phy_device pointer for the PHY connected
@@ -42,17 +38,18 @@ struct dsa_slave_priv {
42 int old_pause; 38 int old_pause;
43 int old_duplex; 39 int old_duplex;
44 40
45 struct net_device *bridge_dev;
46#ifdef CONFIG_NET_POLL_CONTROLLER 41#ifdef CONFIG_NET_POLL_CONTROLLER
47 struct netpoll *netpoll; 42 struct netpoll *netpoll;
48#endif 43#endif
44
45 /* TC context */
46 struct list_head mall_tc_list;
49}; 47};
50 48
51/* dsa.c */ 49/* dsa.c */
52extern char dsa_driver_version[];
53int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, 50int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
54 struct device_node *port_dn, int port); 51 struct dsa_port *dport, int port);
55void dsa_cpu_dsa_destroy(struct device_node *port_dn); 52void dsa_cpu_dsa_destroy(struct dsa_port *dport);
56const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); 53const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
57int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds); 54int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds);
58void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds); 55void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds);
@@ -66,8 +63,12 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
66void dsa_slave_destroy(struct net_device *slave_dev); 63void dsa_slave_destroy(struct net_device *slave_dev);
67int dsa_slave_suspend(struct net_device *slave_dev); 64int dsa_slave_suspend(struct net_device *slave_dev);
68int dsa_slave_resume(struct net_device *slave_dev); 65int dsa_slave_resume(struct net_device *slave_dev);
69int dsa_slave_netdevice_event(struct notifier_block *unused, 66int dsa_slave_register_notifier(void);
70 unsigned long event, void *ptr); 67void dsa_slave_unregister_notifier(void);
68
69/* switch.c */
70int dsa_switch_register_notifier(struct dsa_switch *ds);
71void dsa_switch_unregister_notifier(struct dsa_switch *ds);
71 72
72/* tag_dsa.c */ 73/* tag_dsa.c */
73extern const struct dsa_device_ops dsa_netdev_ops; 74extern const struct dsa_device_ops dsa_netdev_ops;
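The net effect of the dsa_priv.h change above is that per-port state (owning switch, port index, bridge membership) now lives in struct dsa_port, and the slave private data only keeps the dp pointer. A small illustration of the resulting access pattern, not taken from the patch; the helper names are invented:

/* Illustrative helpers only; the members come from the patched headers. */
static inline int dsa_slave_port_index(const struct net_device *dev)
{
	struct dsa_slave_priv *p = netdev_priv(dev);

	return p->dp->index;		/* previously p->port */
}

static inline struct dsa_switch *dsa_slave_to_switch(const struct net_device *dev)
{
	struct dsa_slave_priv *p = netdev_priv(dev);

	return p->dp->ds;		/* previously p->parent */
}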
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 30e2e21d7619..c34872e1febc 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -16,12 +16,28 @@
16#include <linux/of_net.h> 16#include <linux/of_net.h>
17#include <linux/of_mdio.h> 17#include <linux/of_mdio.h>
18#include <linux/mdio.h> 18#include <linux/mdio.h>
19#include <linux/list.h>
19#include <net/rtnetlink.h> 20#include <net/rtnetlink.h>
20#include <net/switchdev.h> 21#include <net/switchdev.h>
22#include <net/pkt_cls.h>
23#include <net/tc_act/tc_mirred.h>
21#include <linux/if_bridge.h> 24#include <linux/if_bridge.h>
22#include <linux/netpoll.h> 25#include <linux/netpoll.h>
23#include "dsa_priv.h" 26#include "dsa_priv.h"
24 27
28static bool dsa_slave_dev_check(struct net_device *dev);
29
30static int dsa_slave_notify(struct net_device *dev, unsigned long e, void *v)
31{
32 struct dsa_slave_priv *p = netdev_priv(dev);
33 struct raw_notifier_head *nh = &p->dp->ds->dst->nh;
34 int err;
35
36 err = raw_notifier_call_chain(nh, e, v);
37
38 return notifier_to_errno(err);
39}
40
25/* slave mii_bus handling ***************************************************/ 41/* slave mii_bus handling ***************************************************/
26static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) 42static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg)
27{ 43{
@@ -61,17 +77,20 @@ static int dsa_slave_get_iflink(const struct net_device *dev)
61{ 77{
62 struct dsa_slave_priv *p = netdev_priv(dev); 78 struct dsa_slave_priv *p = netdev_priv(dev);
63 79
64 return p->parent->dst->master_netdev->ifindex; 80 return p->dp->ds->dst->master_netdev->ifindex;
65} 81}
66 82
67static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p) 83static inline bool dsa_port_is_bridged(struct dsa_port *dp)
68{ 84{
69 return !!p->bridge_dev; 85 return !!dp->bridge_dev;
70} 86}
71 87
72static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state) 88static void dsa_slave_set_state(struct net_device *dev, u8 state)
73{ 89{
74 struct dsa_port *dp = &ds->ports[port]; 90 struct dsa_slave_priv *p = netdev_priv(dev);
91 struct dsa_port *dp = p->dp;
92 struct dsa_switch *ds = dp->ds;
93 int port = dp->index;
75 94
76 if (ds->ops->port_stp_state_set) 95 if (ds->ops->port_stp_state_set)
77 ds->ops->port_stp_state_set(ds, port, state); 96 ds->ops->port_stp_state_set(ds, port, state);
@@ -96,9 +115,9 @@ static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state)
96static int dsa_slave_open(struct net_device *dev) 115static int dsa_slave_open(struct net_device *dev)
97{ 116{
98 struct dsa_slave_priv *p = netdev_priv(dev); 117 struct dsa_slave_priv *p = netdev_priv(dev);
99 struct net_device *master = p->parent->dst->master_netdev; 118 struct net_device *master = p->dp->ds->dst->master_netdev;
100 struct dsa_switch *ds = p->parent; 119 struct dsa_switch *ds = p->dp->ds;
101 u8 stp_state = dsa_port_is_bridged(p) ? 120 u8 stp_state = dsa_port_is_bridged(p->dp) ?
102 BR_STATE_BLOCKING : BR_STATE_FORWARDING; 121 BR_STATE_BLOCKING : BR_STATE_FORWARDING;
103 int err; 122 int err;
104 123
@@ -123,12 +142,12 @@ static int dsa_slave_open(struct net_device *dev)
123 } 142 }
124 143
125 if (ds->ops->port_enable) { 144 if (ds->ops->port_enable) {
126 err = ds->ops->port_enable(ds, p->port, p->phy); 145 err = ds->ops->port_enable(ds, p->dp->index, p->phy);
127 if (err) 146 if (err)
128 goto clear_promisc; 147 goto clear_promisc;
129 } 148 }
130 149
131 dsa_port_set_stp_state(ds, p->port, stp_state); 150 dsa_slave_set_state(dev, stp_state);
132 151
133 if (p->phy) 152 if (p->phy)
134 phy_start(p->phy); 153 phy_start(p->phy);
@@ -151,8 +170,8 @@ out:
151static int dsa_slave_close(struct net_device *dev) 170static int dsa_slave_close(struct net_device *dev)
152{ 171{
153 struct dsa_slave_priv *p = netdev_priv(dev); 172 struct dsa_slave_priv *p = netdev_priv(dev);
154 struct net_device *master = p->parent->dst->master_netdev; 173 struct net_device *master = p->dp->ds->dst->master_netdev;
155 struct dsa_switch *ds = p->parent; 174 struct dsa_switch *ds = p->dp->ds;
156 175
157 if (p->phy) 176 if (p->phy)
158 phy_stop(p->phy); 177 phy_stop(p->phy);
@@ -168,9 +187,9 @@ static int dsa_slave_close(struct net_device *dev)
168 dev_uc_del(master, dev->dev_addr); 187 dev_uc_del(master, dev->dev_addr);
169 188
170 if (ds->ops->port_disable) 189 if (ds->ops->port_disable)
171 ds->ops->port_disable(ds, p->port, p->phy); 190 ds->ops->port_disable(ds, p->dp->index, p->phy);
172 191
173 dsa_port_set_stp_state(ds, p->port, BR_STATE_DISABLED); 192 dsa_slave_set_state(dev, BR_STATE_DISABLED);
174 193
175 return 0; 194 return 0;
176} 195}
@@ -178,7 +197,7 @@ static int dsa_slave_close(struct net_device *dev)
178static void dsa_slave_change_rx_flags(struct net_device *dev, int change) 197static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
179{ 198{
180 struct dsa_slave_priv *p = netdev_priv(dev); 199 struct dsa_slave_priv *p = netdev_priv(dev);
181 struct net_device *master = p->parent->dst->master_netdev; 200 struct net_device *master = p->dp->ds->dst->master_netdev;
182 201
183 if (change & IFF_ALLMULTI) 202 if (change & IFF_ALLMULTI)
184 dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1); 203 dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1);
@@ -189,7 +208,7 @@ static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
189static void dsa_slave_set_rx_mode(struct net_device *dev) 208static void dsa_slave_set_rx_mode(struct net_device *dev)
190{ 209{
191 struct dsa_slave_priv *p = netdev_priv(dev); 210 struct dsa_slave_priv *p = netdev_priv(dev);
192 struct net_device *master = p->parent->dst->master_netdev; 211 struct net_device *master = p->dp->ds->dst->master_netdev;
193 212
194 dev_mc_sync(master, dev); 213 dev_mc_sync(master, dev);
195 dev_uc_sync(master, dev); 214 dev_uc_sync(master, dev);
@@ -198,7 +217,7 @@ static void dsa_slave_set_rx_mode(struct net_device *dev)
198static int dsa_slave_set_mac_address(struct net_device *dev, void *a) 217static int dsa_slave_set_mac_address(struct net_device *dev, void *a)
199{ 218{
200 struct dsa_slave_priv *p = netdev_priv(dev); 219 struct dsa_slave_priv *p = netdev_priv(dev);
201 struct net_device *master = p->parent->dst->master_netdev; 220 struct net_device *master = p->dp->ds->dst->master_netdev;
202 struct sockaddr *addr = a; 221 struct sockaddr *addr = a;
203 int err; 222 int err;
204 223
@@ -228,16 +247,17 @@ static int dsa_slave_port_vlan_add(struct net_device *dev,
228 struct switchdev_trans *trans) 247 struct switchdev_trans *trans)
229{ 248{
230 struct dsa_slave_priv *p = netdev_priv(dev); 249 struct dsa_slave_priv *p = netdev_priv(dev);
231 struct dsa_switch *ds = p->parent; 250 struct dsa_port *dp = p->dp;
251 struct dsa_switch *ds = dp->ds;
232 252
233 if (switchdev_trans_ph_prepare(trans)) { 253 if (switchdev_trans_ph_prepare(trans)) {
234 if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add) 254 if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add)
235 return -EOPNOTSUPP; 255 return -EOPNOTSUPP;
236 256
237 return ds->ops->port_vlan_prepare(ds, p->port, vlan, trans); 257 return ds->ops->port_vlan_prepare(ds, dp->index, vlan, trans);
238 } 258 }
239 259
240 ds->ops->port_vlan_add(ds, p->port, vlan, trans); 260 ds->ops->port_vlan_add(ds, dp->index, vlan, trans);
241 261
242 return 0; 262 return 0;
243} 263}
@@ -246,12 +266,12 @@ static int dsa_slave_port_vlan_del(struct net_device *dev,
246 const struct switchdev_obj_port_vlan *vlan) 266 const struct switchdev_obj_port_vlan *vlan)
247{ 267{
248 struct dsa_slave_priv *p = netdev_priv(dev); 268 struct dsa_slave_priv *p = netdev_priv(dev);
249 struct dsa_switch *ds = p->parent; 269 struct dsa_switch *ds = p->dp->ds;
250 270
251 if (!ds->ops->port_vlan_del) 271 if (!ds->ops->port_vlan_del)
252 return -EOPNOTSUPP; 272 return -EOPNOTSUPP;
253 273
254 return ds->ops->port_vlan_del(ds, p->port, vlan); 274 return ds->ops->port_vlan_del(ds, p->dp->index, vlan);
255} 275}
256 276
257static int dsa_slave_port_vlan_dump(struct net_device *dev, 277static int dsa_slave_port_vlan_dump(struct net_device *dev,
@@ -259,10 +279,10 @@ static int dsa_slave_port_vlan_dump(struct net_device *dev,
259 switchdev_obj_dump_cb_t *cb) 279 switchdev_obj_dump_cb_t *cb)
260{ 280{
261 struct dsa_slave_priv *p = netdev_priv(dev); 281 struct dsa_slave_priv *p = netdev_priv(dev);
262 struct dsa_switch *ds = p->parent; 282 struct dsa_switch *ds = p->dp->ds;
263 283
264 if (ds->ops->port_vlan_dump) 284 if (ds->ops->port_vlan_dump)
265 return ds->ops->port_vlan_dump(ds, p->port, vlan, cb); 285 return ds->ops->port_vlan_dump(ds, p->dp->index, vlan, cb);
266 286
267 return -EOPNOTSUPP; 287 return -EOPNOTSUPP;
268} 288}
@@ -272,16 +292,16 @@ static int dsa_slave_port_fdb_add(struct net_device *dev,
272 struct switchdev_trans *trans) 292 struct switchdev_trans *trans)
273{ 293{
274 struct dsa_slave_priv *p = netdev_priv(dev); 294 struct dsa_slave_priv *p = netdev_priv(dev);
275 struct dsa_switch *ds = p->parent; 295 struct dsa_switch *ds = p->dp->ds;
276 296
277 if (switchdev_trans_ph_prepare(trans)) { 297 if (switchdev_trans_ph_prepare(trans)) {
278 if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add) 298 if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add)
279 return -EOPNOTSUPP; 299 return -EOPNOTSUPP;
280 300
281 return ds->ops->port_fdb_prepare(ds, p->port, fdb, trans); 301 return ds->ops->port_fdb_prepare(ds, p->dp->index, fdb, trans);
282 } 302 }
283 303
284 ds->ops->port_fdb_add(ds, p->port, fdb, trans); 304 ds->ops->port_fdb_add(ds, p->dp->index, fdb, trans);
285 305
286 return 0; 306 return 0;
287} 307}
@@ -290,11 +310,11 @@ static int dsa_slave_port_fdb_del(struct net_device *dev,
290 const struct switchdev_obj_port_fdb *fdb) 310 const struct switchdev_obj_port_fdb *fdb)
291{ 311{
292 struct dsa_slave_priv *p = netdev_priv(dev); 312 struct dsa_slave_priv *p = netdev_priv(dev);
293 struct dsa_switch *ds = p->parent; 313 struct dsa_switch *ds = p->dp->ds;
294 int ret = -EOPNOTSUPP; 314 int ret = -EOPNOTSUPP;
295 315
296 if (ds->ops->port_fdb_del) 316 if (ds->ops->port_fdb_del)
297 ret = ds->ops->port_fdb_del(ds, p->port, fdb); 317 ret = ds->ops->port_fdb_del(ds, p->dp->index, fdb);
298 318
299 return ret; 319 return ret;
300} 320}
@@ -304,10 +324,10 @@ static int dsa_slave_port_fdb_dump(struct net_device *dev,
304 switchdev_obj_dump_cb_t *cb) 324 switchdev_obj_dump_cb_t *cb)
305{ 325{
306 struct dsa_slave_priv *p = netdev_priv(dev); 326 struct dsa_slave_priv *p = netdev_priv(dev);
307 struct dsa_switch *ds = p->parent; 327 struct dsa_switch *ds = p->dp->ds;
308 328
309 if (ds->ops->port_fdb_dump) 329 if (ds->ops->port_fdb_dump)
310 return ds->ops->port_fdb_dump(ds, p->port, fdb, cb); 330 return ds->ops->port_fdb_dump(ds, p->dp->index, fdb, cb);
311 331
312 return -EOPNOTSUPP; 332 return -EOPNOTSUPP;
313} 333}
@@ -317,16 +337,16 @@ static int dsa_slave_port_mdb_add(struct net_device *dev,
317 struct switchdev_trans *trans) 337 struct switchdev_trans *trans)
318{ 338{
319 struct dsa_slave_priv *p = netdev_priv(dev); 339 struct dsa_slave_priv *p = netdev_priv(dev);
320 struct dsa_switch *ds = p->parent; 340 struct dsa_switch *ds = p->dp->ds;
321 341
322 if (switchdev_trans_ph_prepare(trans)) { 342 if (switchdev_trans_ph_prepare(trans)) {
323 if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add) 343 if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add)
324 return -EOPNOTSUPP; 344 return -EOPNOTSUPP;
325 345
326 return ds->ops->port_mdb_prepare(ds, p->port, mdb, trans); 346 return ds->ops->port_mdb_prepare(ds, p->dp->index, mdb, trans);
327 } 347 }
328 348
329 ds->ops->port_mdb_add(ds, p->port, mdb, trans); 349 ds->ops->port_mdb_add(ds, p->dp->index, mdb, trans);
330 350
331 return 0; 351 return 0;
332} 352}
@@ -335,10 +355,10 @@ static int dsa_slave_port_mdb_del(struct net_device *dev,
335 const struct switchdev_obj_port_mdb *mdb) 355 const struct switchdev_obj_port_mdb *mdb)
336{ 356{
337 struct dsa_slave_priv *p = netdev_priv(dev); 357 struct dsa_slave_priv *p = netdev_priv(dev);
338 struct dsa_switch *ds = p->parent; 358 struct dsa_switch *ds = p->dp->ds;
339 359
340 if (ds->ops->port_mdb_del) 360 if (ds->ops->port_mdb_del)
341 return ds->ops->port_mdb_del(ds, p->port, mdb); 361 return ds->ops->port_mdb_del(ds, p->dp->index, mdb);
342 362
343 return -EOPNOTSUPP; 363 return -EOPNOTSUPP;
344} 364}
@@ -348,10 +368,10 @@ static int dsa_slave_port_mdb_dump(struct net_device *dev,
348 switchdev_obj_dump_cb_t *cb) 368 switchdev_obj_dump_cb_t *cb)
349{ 369{
350 struct dsa_slave_priv *p = netdev_priv(dev); 370 struct dsa_slave_priv *p = netdev_priv(dev);
351 struct dsa_switch *ds = p->parent; 371 struct dsa_switch *ds = p->dp->ds;
352 372
353 if (ds->ops->port_mdb_dump) 373 if (ds->ops->port_mdb_dump)
354 return ds->ops->port_mdb_dump(ds, p->port, mdb, cb); 374 return ds->ops->port_mdb_dump(ds, p->dp->index, mdb, cb);
355 375
356 return -EOPNOTSUPP; 376 return -EOPNOTSUPP;
357} 377}
@@ -371,12 +391,12 @@ static int dsa_slave_stp_state_set(struct net_device *dev,
371 struct switchdev_trans *trans) 391 struct switchdev_trans *trans)
372{ 392{
373 struct dsa_slave_priv *p = netdev_priv(dev); 393 struct dsa_slave_priv *p = netdev_priv(dev);
374 struct dsa_switch *ds = p->parent; 394 struct dsa_switch *ds = p->dp->ds;
375 395
376 if (switchdev_trans_ph_prepare(trans)) 396 if (switchdev_trans_ph_prepare(trans))
377 return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP; 397 return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP;
378 398
379 dsa_port_set_stp_state(ds, p->port, attr->u.stp_state); 399 dsa_slave_set_state(dev, attr->u.stp_state);
380 400
381 return 0; 401 return 0;
382} 402}
@@ -386,14 +406,14 @@ static int dsa_slave_vlan_filtering(struct net_device *dev,
386 struct switchdev_trans *trans) 406 struct switchdev_trans *trans)
387{ 407{
388 struct dsa_slave_priv *p = netdev_priv(dev); 408 struct dsa_slave_priv *p = netdev_priv(dev);
389 struct dsa_switch *ds = p->parent; 409 struct dsa_switch *ds = p->dp->ds;
390 410
391 /* bridge skips -EOPNOTSUPP, so skip the prepare phase */ 411 /* bridge skips -EOPNOTSUPP, so skip the prepare phase */
392 if (switchdev_trans_ph_prepare(trans)) 412 if (switchdev_trans_ph_prepare(trans))
393 return 0; 413 return 0;
394 414
395 if (ds->ops->port_vlan_filtering) 415 if (ds->ops->port_vlan_filtering)
396 return ds->ops->port_vlan_filtering(ds, p->port, 416 return ds->ops->port_vlan_filtering(ds, p->dp->index,
397 attr->u.vlan_filtering); 417 attr->u.vlan_filtering);
398 418
399 return 0; 419 return 0;
@@ -404,7 +424,7 @@ static int dsa_fastest_ageing_time(struct dsa_switch *ds,
404{ 424{
405 int i; 425 int i;
406 426
407 for (i = 0; i < DSA_MAX_PORTS; ++i) { 427 for (i = 0; i < ds->num_ports; ++i) {
408 struct dsa_port *dp = &ds->ports[i]; 428 struct dsa_port *dp = &ds->ports[i];
409 429
410 if (dp && dp->ageing_time && dp->ageing_time < ageing_time) 430 if (dp && dp->ageing_time && dp->ageing_time < ageing_time)
@@ -419,7 +439,7 @@ static int dsa_slave_ageing_time(struct net_device *dev,
419 struct switchdev_trans *trans) 439 struct switchdev_trans *trans)
420{ 440{
421 struct dsa_slave_priv *p = netdev_priv(dev); 441 struct dsa_slave_priv *p = netdev_priv(dev);
422 struct dsa_switch *ds = p->parent; 442 struct dsa_switch *ds = p->dp->ds;
423 unsigned long ageing_jiffies = clock_t_to_jiffies(attr->u.ageing_time); 443 unsigned long ageing_jiffies = clock_t_to_jiffies(attr->u.ageing_time);
424 unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies); 444 unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies);
425 445
@@ -428,7 +448,7 @@ static int dsa_slave_ageing_time(struct net_device *dev,
428 return 0; 448 return 0;
429 449
430 /* Keep the fastest ageing time in case of multiple bridges */ 450 /* Keep the fastest ageing time in case of multiple bridges */
431 ds->ports[p->port].ageing_time = ageing_time; 451 p->dp->ageing_time = ageing_time;
432 ageing_time = dsa_fastest_ageing_time(ds, ageing_time); 452 ageing_time = dsa_fastest_ageing_time(ds, ageing_time);
433 453
434 if (ds->ops->set_ageing_time) 454 if (ds->ops->set_ageing_time)
@@ -553,39 +573,58 @@ static int dsa_slave_bridge_port_join(struct net_device *dev,
553 struct net_device *br) 573 struct net_device *br)
554{ 574{
555 struct dsa_slave_priv *p = netdev_priv(dev); 575 struct dsa_slave_priv *p = netdev_priv(dev);
556 struct dsa_switch *ds = p->parent; 576 struct dsa_notifier_bridge_info info = {
557 int ret = -EOPNOTSUPP; 577 .sw_index = p->dp->ds->index,
578 .port = p->dp->index,
579 .br = br,
580 };
581 int err;
582
583 /* Here the port is already bridged. Reflect the current configuration
584 * so that drivers can program their chips accordingly.
585 */
586 p->dp->bridge_dev = br;
558 587
559 p->bridge_dev = br; 588 err = dsa_slave_notify(dev, DSA_NOTIFIER_BRIDGE_JOIN, &info);
560 589
561 if (ds->ops->port_bridge_join) 590 /* The bridging is rolled back on error */
562 ret = ds->ops->port_bridge_join(ds, p->port, br); 591 if (err)
592 p->dp->bridge_dev = NULL;
563 593
564 return ret == -EOPNOTSUPP ? 0 : ret; 594 return err;
565} 595}
566 596
567static void dsa_slave_bridge_port_leave(struct net_device *dev) 597static void dsa_slave_bridge_port_leave(struct net_device *dev,
598 struct net_device *br)
568{ 599{
569 struct dsa_slave_priv *p = netdev_priv(dev); 600 struct dsa_slave_priv *p = netdev_priv(dev);
570 struct dsa_switch *ds = p->parent; 601 struct dsa_notifier_bridge_info info = {
571 602 .sw_index = p->dp->ds->index,
603 .port = p->dp->index,
604 .br = br,
605 };
606 int err;
572 607
573 if (ds->ops->port_bridge_leave) 608 /* Here the port is already unbridged. Reflect the current configuration
574 ds->ops->port_bridge_leave(ds, p->port); 609 * so that drivers can program their chips accordingly.
610 */
611 p->dp->bridge_dev = NULL;
575 612
576 p->bridge_dev = NULL; 613 err = dsa_slave_notify(dev, DSA_NOTIFIER_BRIDGE_LEAVE, &info);
614 if (err)
615 netdev_err(dev, "failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n");
577 616
578 /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, 617 /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
579 * so allow it to be in BR_STATE_FORWARDING to be kept functional 618 * so allow it to be in BR_STATE_FORWARDING to be kept functional
580 */ 619 */
581 dsa_port_set_stp_state(ds, p->port, BR_STATE_FORWARDING); 620 dsa_slave_set_state(dev, BR_STATE_FORWARDING);
582} 621}
583 622
584static int dsa_slave_port_attr_get(struct net_device *dev, 623static int dsa_slave_port_attr_get(struct net_device *dev,
585 struct switchdev_attr *attr) 624 struct switchdev_attr *attr)
586{ 625{
587 struct dsa_slave_priv *p = netdev_priv(dev); 626 struct dsa_slave_priv *p = netdev_priv(dev);
588 struct dsa_switch *ds = p->parent; 627 struct dsa_switch *ds = p->dp->ds;
589 628
590 switch (attr->id) { 629 switch (attr->id) {
591 case SWITCHDEV_ATTR_ID_PORT_PARENT_ID: 630 case SWITCHDEV_ATTR_ID_PORT_PARENT_ID:
@@ -633,7 +672,7 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
633 /* Queue the SKB for transmission on the parent interface, but 672 /* Queue the SKB for transmission on the parent interface, but
634 * do not modify its EtherType 673 * do not modify its EtherType
635 */ 674 */
636 nskb->dev = p->parent->dst->master_netdev; 675 nskb->dev = p->dp->ds->dst->master_netdev;
637 dev_queue_xmit(nskb); 676 dev_queue_xmit(nskb);
638 677
639 return NETDEV_TX_OK; 678 return NETDEV_TX_OK;
@@ -641,28 +680,26 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
641 680
642/* ethtool operations *******************************************************/ 681/* ethtool operations *******************************************************/
643static int 682static int
644dsa_slave_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) 683dsa_slave_get_link_ksettings(struct net_device *dev,
684 struct ethtool_link_ksettings *cmd)
645{ 685{
646 struct dsa_slave_priv *p = netdev_priv(dev); 686 struct dsa_slave_priv *p = netdev_priv(dev);
647 int err; 687 int err = -EOPNOTSUPP;
648 688
649 err = -EOPNOTSUPP; 689 if (p->phy != NULL)
650 if (p->phy != NULL) { 690 err = phy_ethtool_ksettings_get(p->phy, cmd);
651 err = phy_read_status(p->phy);
652 if (err == 0)
653 err = phy_ethtool_gset(p->phy, cmd);
654 }
655 691
656 return err; 692 return err;
657} 693}
658 694
659static int 695static int
660dsa_slave_set_settings(struct net_device *dev, struct ethtool_cmd *cmd) 696dsa_slave_set_link_ksettings(struct net_device *dev,
697 const struct ethtool_link_ksettings *cmd)
661{ 698{
662 struct dsa_slave_priv *p = netdev_priv(dev); 699 struct dsa_slave_priv *p = netdev_priv(dev);
663 700
664 if (p->phy != NULL) 701 if (p->phy != NULL)
665 return phy_ethtool_sset(p->phy, cmd); 702 return phy_ethtool_ksettings_set(p->phy, cmd);
666 703
667 return -EOPNOTSUPP; 704 return -EOPNOTSUPP;
668} 705}
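The conversion above replaces the legacy ethtool_cmd hooks with the link_ksettings API and delegates entirely to phylib. As a rough illustration (not from the patch) of what the new structure carries, fixed fields sit under cmd.base while supported/advertised link modes are bitmaps; foo_report_phy_speed is a hypothetical caller:

static int foo_report_phy_speed(struct net_device *dev)
{
	struct dsa_slave_priv *p = netdev_priv(dev);
	struct ethtool_link_ksettings cmd;
	int err;

	if (!p->phy)
		return -EOPNOTSUPP;

	err = phy_ethtool_ksettings_get(p->phy, &cmd);
	if (err)
		return err;

	netdev_info(dev, "PHY reports %u Mb/s, %s duplex\n",
		    cmd.base.speed,
		    cmd.base.duplex == DUPLEX_FULL ? "full" : "half");

	return 0;
}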
@@ -671,7 +708,6 @@ static void dsa_slave_get_drvinfo(struct net_device *dev,
671 struct ethtool_drvinfo *drvinfo) 708 struct ethtool_drvinfo *drvinfo)
672{ 709{
673 strlcpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver)); 710 strlcpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver));
674 strlcpy(drvinfo->version, dsa_driver_version, sizeof(drvinfo->version));
675 strlcpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version)); 711 strlcpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version));
676 strlcpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info)); 712 strlcpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info));
677} 713}
@@ -679,10 +715,10 @@ static void dsa_slave_get_drvinfo(struct net_device *dev,
679static int dsa_slave_get_regs_len(struct net_device *dev) 715static int dsa_slave_get_regs_len(struct net_device *dev)
680{ 716{
681 struct dsa_slave_priv *p = netdev_priv(dev); 717 struct dsa_slave_priv *p = netdev_priv(dev);
682 struct dsa_switch *ds = p->parent; 718 struct dsa_switch *ds = p->dp->ds;
683 719
684 if (ds->ops->get_regs_len) 720 if (ds->ops->get_regs_len)
685 return ds->ops->get_regs_len(ds, p->port); 721 return ds->ops->get_regs_len(ds, p->dp->index);
686 722
687 return -EOPNOTSUPP; 723 return -EOPNOTSUPP;
688} 724}
@@ -691,10 +727,10 @@ static void
691dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p) 727dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
692{ 728{
693 struct dsa_slave_priv *p = netdev_priv(dev); 729 struct dsa_slave_priv *p = netdev_priv(dev);
694 struct dsa_switch *ds = p->parent; 730 struct dsa_switch *ds = p->dp->ds;
695 731
696 if (ds->ops->get_regs) 732 if (ds->ops->get_regs)
697 ds->ops->get_regs(ds, p->port, regs, _p); 733 ds->ops->get_regs(ds, p->dp->index, regs, _p);
698} 734}
699 735
700static int dsa_slave_nway_reset(struct net_device *dev) 736static int dsa_slave_nway_reset(struct net_device *dev)
@@ -722,7 +758,7 @@ static u32 dsa_slave_get_link(struct net_device *dev)
722static int dsa_slave_get_eeprom_len(struct net_device *dev) 758static int dsa_slave_get_eeprom_len(struct net_device *dev)
723{ 759{
724 struct dsa_slave_priv *p = netdev_priv(dev); 760 struct dsa_slave_priv *p = netdev_priv(dev);
725 struct dsa_switch *ds = p->parent; 761 struct dsa_switch *ds = p->dp->ds;
726 762
727 if (ds->cd && ds->cd->eeprom_len) 763 if (ds->cd && ds->cd->eeprom_len)
728 return ds->cd->eeprom_len; 764 return ds->cd->eeprom_len;
@@ -737,7 +773,7 @@ static int dsa_slave_get_eeprom(struct net_device *dev,
737 struct ethtool_eeprom *eeprom, u8 *data) 773 struct ethtool_eeprom *eeprom, u8 *data)
738{ 774{
739 struct dsa_slave_priv *p = netdev_priv(dev); 775 struct dsa_slave_priv *p = netdev_priv(dev);
740 struct dsa_switch *ds = p->parent; 776 struct dsa_switch *ds = p->dp->ds;
741 777
742 if (ds->ops->get_eeprom) 778 if (ds->ops->get_eeprom)
743 return ds->ops->get_eeprom(ds, eeprom, data); 779 return ds->ops->get_eeprom(ds, eeprom, data);
@@ -749,7 +785,7 @@ static int dsa_slave_set_eeprom(struct net_device *dev,
749 struct ethtool_eeprom *eeprom, u8 *data) 785 struct ethtool_eeprom *eeprom, u8 *data)
750{ 786{
751 struct dsa_slave_priv *p = netdev_priv(dev); 787 struct dsa_slave_priv *p = netdev_priv(dev);
752 struct dsa_switch *ds = p->parent; 788 struct dsa_switch *ds = p->dp->ds;
753 789
754 if (ds->ops->set_eeprom) 790 if (ds->ops->set_eeprom)
755 return ds->ops->set_eeprom(ds, eeprom, data); 791 return ds->ops->set_eeprom(ds, eeprom, data);
@@ -761,7 +797,7 @@ static void dsa_slave_get_strings(struct net_device *dev,
761 uint32_t stringset, uint8_t *data) 797 uint32_t stringset, uint8_t *data)
762{ 798{
763 struct dsa_slave_priv *p = netdev_priv(dev); 799 struct dsa_slave_priv *p = netdev_priv(dev);
764 struct dsa_switch *ds = p->parent; 800 struct dsa_switch *ds = p->dp->ds;
765 801
766 if (stringset == ETH_SS_STATS) { 802 if (stringset == ETH_SS_STATS) {
767 int len = ETH_GSTRING_LEN; 803 int len = ETH_GSTRING_LEN;
@@ -771,7 +807,7 @@ static void dsa_slave_get_strings(struct net_device *dev,
771 strncpy(data + 2 * len, "rx_packets", len); 807 strncpy(data + 2 * len, "rx_packets", len);
772 strncpy(data + 3 * len, "rx_bytes", len); 808 strncpy(data + 3 * len, "rx_bytes", len);
773 if (ds->ops->get_strings) 809 if (ds->ops->get_strings)
774 ds->ops->get_strings(ds, p->port, data + 4 * len); 810 ds->ops->get_strings(ds, p->dp->index, data + 4 * len);
775 } 811 }
776} 812}
777 813
@@ -780,7 +816,7 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev,
780 uint64_t *data) 816 uint64_t *data)
781{ 817{
782 struct dsa_switch_tree *dst = dev->dsa_ptr; 818 struct dsa_switch_tree *dst = dev->dsa_ptr;
783 struct dsa_switch *ds = dst->ds[0]; 819 struct dsa_switch *ds = dst->cpu_switch;
784 s8 cpu_port = dst->cpu_port; 820 s8 cpu_port = dst->cpu_port;
785 int count = 0; 821 int count = 0;
786 822
@@ -797,7 +833,7 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev,
797static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset) 833static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset)
798{ 834{
799 struct dsa_switch_tree *dst = dev->dsa_ptr; 835 struct dsa_switch_tree *dst = dev->dsa_ptr;
800 struct dsa_switch *ds = dst->ds[0]; 836 struct dsa_switch *ds = dst->cpu_switch;
801 int count = 0; 837 int count = 0;
802 838
803 if (dst->master_ethtool_ops.get_sset_count) 839 if (dst->master_ethtool_ops.get_sset_count)
@@ -813,7 +849,7 @@ static void dsa_cpu_port_get_strings(struct net_device *dev,
813 uint32_t stringset, uint8_t *data) 849 uint32_t stringset, uint8_t *data)
814{ 850{
815 struct dsa_switch_tree *dst = dev->dsa_ptr; 851 struct dsa_switch_tree *dst = dev->dsa_ptr;
816 struct dsa_switch *ds = dst->ds[0]; 852 struct dsa_switch *ds = dst->cpu_switch;
817 s8 cpu_port = dst->cpu_port; 853 s8 cpu_port = dst->cpu_port;
818 int len = ETH_GSTRING_LEN; 854 int len = ETH_GSTRING_LEN;
819 int mcount = 0, count; 855 int mcount = 0, count;
@@ -852,20 +888,20 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev,
852 uint64_t *data) 888 uint64_t *data)
853{ 889{
854 struct dsa_slave_priv *p = netdev_priv(dev); 890 struct dsa_slave_priv *p = netdev_priv(dev);
855 struct dsa_switch *ds = p->parent; 891 struct dsa_switch *ds = p->dp->ds;
856 892
857 data[0] = dev->stats.tx_packets; 893 data[0] = dev->stats.tx_packets;
858 data[1] = dev->stats.tx_bytes; 894 data[1] = dev->stats.tx_bytes;
859 data[2] = dev->stats.rx_packets; 895 data[2] = dev->stats.rx_packets;
860 data[3] = dev->stats.rx_bytes; 896 data[3] = dev->stats.rx_bytes;
861 if (ds->ops->get_ethtool_stats) 897 if (ds->ops->get_ethtool_stats)
862 ds->ops->get_ethtool_stats(ds, p->port, data + 4); 898 ds->ops->get_ethtool_stats(ds, p->dp->index, data + 4);
863} 899}
864 900
865static int dsa_slave_get_sset_count(struct net_device *dev, int sset) 901static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
866{ 902{
867 struct dsa_slave_priv *p = netdev_priv(dev); 903 struct dsa_slave_priv *p = netdev_priv(dev);
868 struct dsa_switch *ds = p->parent; 904 struct dsa_switch *ds = p->dp->ds;
869 905
870 if (sset == ETH_SS_STATS) { 906 if (sset == ETH_SS_STATS) {
871 int count; 907 int count;
@@ -883,20 +919,20 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
883static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w) 919static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
884{ 920{
885 struct dsa_slave_priv *p = netdev_priv(dev); 921 struct dsa_slave_priv *p = netdev_priv(dev);
886 struct dsa_switch *ds = p->parent; 922 struct dsa_switch *ds = p->dp->ds;
887 923
888 if (ds->ops->get_wol) 924 if (ds->ops->get_wol)
889 ds->ops->get_wol(ds, p->port, w); 925 ds->ops->get_wol(ds, p->dp->index, w);
890} 926}
891 927
892static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) 928static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
893{ 929{
894 struct dsa_slave_priv *p = netdev_priv(dev); 930 struct dsa_slave_priv *p = netdev_priv(dev);
895 struct dsa_switch *ds = p->parent; 931 struct dsa_switch *ds = p->dp->ds;
896 int ret = -EOPNOTSUPP; 932 int ret = -EOPNOTSUPP;
897 933
898 if (ds->ops->set_wol) 934 if (ds->ops->set_wol)
899 ret = ds->ops->set_wol(ds, p->port, w); 935 ret = ds->ops->set_wol(ds, p->dp->index, w);
900 936
901 return ret; 937 return ret;
902} 938}
@@ -904,13 +940,13 @@ static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
904static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) 940static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
905{ 941{
906 struct dsa_slave_priv *p = netdev_priv(dev); 942 struct dsa_slave_priv *p = netdev_priv(dev);
907 struct dsa_switch *ds = p->parent; 943 struct dsa_switch *ds = p->dp->ds;
908 int ret; 944 int ret;
909 945
910 if (!ds->ops->set_eee) 946 if (!ds->ops->set_eee)
911 return -EOPNOTSUPP; 947 return -EOPNOTSUPP;
912 948
913 ret = ds->ops->set_eee(ds, p->port, p->phy, e); 949 ret = ds->ops->set_eee(ds, p->dp->index, p->phy, e);
914 if (ret) 950 if (ret)
915 return ret; 951 return ret;
916 952
@@ -923,13 +959,13 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
923static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) 959static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
924{ 960{
925 struct dsa_slave_priv *p = netdev_priv(dev); 961 struct dsa_slave_priv *p = netdev_priv(dev);
926 struct dsa_switch *ds = p->parent; 962 struct dsa_switch *ds = p->dp->ds;
927 int ret; 963 int ret;
928 964
929 if (!ds->ops->get_eee) 965 if (!ds->ops->get_eee)
930 return -EOPNOTSUPP; 966 return -EOPNOTSUPP;
931 967
932 ret = ds->ops->get_eee(ds, p->port, e); 968 ret = ds->ops->get_eee(ds, p->dp->index, e);
933 if (ret) 969 if (ret)
934 return ret; 970 return ret;
935 971
@@ -944,7 +980,7 @@ static int dsa_slave_netpoll_setup(struct net_device *dev,
944 struct netpoll_info *ni) 980 struct netpoll_info *ni)
945{ 981{
946 struct dsa_slave_priv *p = netdev_priv(dev); 982 struct dsa_slave_priv *p = netdev_priv(dev);
947 struct dsa_switch *ds = p->parent; 983 struct dsa_switch *ds = p->dp->ds;
948 struct net_device *master = ds->dst->master_netdev; 984 struct net_device *master = ds->dst->master_netdev;
949 struct netpoll *netpoll; 985 struct netpoll *netpoll;
950 int err = 0; 986 int err = 0;
@@ -982,6 +1018,144 @@ static void dsa_slave_poll_controller(struct net_device *dev)
982} 1018}
983#endif 1019#endif
984 1020
1021static int dsa_slave_get_phys_port_name(struct net_device *dev,
1022 char *name, size_t len)
1023{
1024 struct dsa_slave_priv *p = netdev_priv(dev);
1025
1026 if (snprintf(name, len, "p%d", p->dp->index) >= len)
1027 return -EINVAL;
1028
1029 return 0;
1030}
1031
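The new ndo_get_phys_port_name above exposes the switch port index to userspace as "pN" (port index 2 becomes "p2"); the >= len check turns snprintf truncation into -EINVAL instead of handing back a clipped name. A small sketch of an in-kernel caller going through the generic helper (foo_log_port_name is illustrative):

static void foo_log_port_name(struct net_device *slave_dev)
{
	char name[IFNAMSIZ];

	/* Calls the device's ndo_get_phys_port_name */
	if (!dev_get_phys_port_name(slave_dev, name, sizeof(name)))
		netdev_info(slave_dev, "switch port name %s\n", name);
}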
1032static struct dsa_mall_tc_entry *
1033dsa_slave_mall_tc_entry_find(struct dsa_slave_priv *p,
1034 unsigned long cookie)
1035{
1036 struct dsa_mall_tc_entry *mall_tc_entry;
1037
1038 list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list)
1039 if (mall_tc_entry->cookie == cookie)
1040 return mall_tc_entry;
1041
1042 return NULL;
1043}
1044
1045static int dsa_slave_add_cls_matchall(struct net_device *dev,
1046 __be16 protocol,
1047 struct tc_cls_matchall_offload *cls,
1048 bool ingress)
1049{
1050 struct dsa_slave_priv *p = netdev_priv(dev);
1051 struct dsa_mall_tc_entry *mall_tc_entry;
1052 struct dsa_switch *ds = p->dp->ds;
1053 struct net *net = dev_net(dev);
1054 struct dsa_slave_priv *to_p;
1055 struct net_device *to_dev;
1056 const struct tc_action *a;
1057 int err = -EOPNOTSUPP;
1058 LIST_HEAD(actions);
1059 int ifindex;
1060
1061 if (!ds->ops->port_mirror_add)
1062 return err;
1063
1064 if (!tc_single_action(cls->exts))
1065 return err;
1066
1067 tcf_exts_to_list(cls->exts, &actions);
1068 a = list_first_entry(&actions, struct tc_action, list);
1069
1070 if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL)) {
1071 struct dsa_mall_mirror_tc_entry *mirror;
1072
1073 ifindex = tcf_mirred_ifindex(a);
1074 to_dev = __dev_get_by_index(net, ifindex);
1075 if (!to_dev)
1076 return -EINVAL;
1077
1078 if (!dsa_slave_dev_check(to_dev))
1079 return -EOPNOTSUPP;
1080
1081 mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
1082 if (!mall_tc_entry)
1083 return -ENOMEM;
1084
1085 mall_tc_entry->cookie = cls->cookie;
1086 mall_tc_entry->type = DSA_PORT_MALL_MIRROR;
1087 mirror = &mall_tc_entry->mirror;
1088
1089 to_p = netdev_priv(to_dev);
1090
1091 mirror->to_local_port = to_p->dp->index;
1092 mirror->ingress = ingress;
1093
1094 err = ds->ops->port_mirror_add(ds, p->dp->index, mirror,
1095 ingress);
1096 if (err) {
1097 kfree(mall_tc_entry);
1098 return err;
1099 }
1100
1101 list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
1102 }
1103
1104 return 0;
1105}
1106
1107static void dsa_slave_del_cls_matchall(struct net_device *dev,
1108 struct tc_cls_matchall_offload *cls)
1109{
1110 struct dsa_slave_priv *p = netdev_priv(dev);
1111 struct dsa_mall_tc_entry *mall_tc_entry;
1112 struct dsa_switch *ds = p->dp->ds;
1113
1114 if (!ds->ops->port_mirror_del)
1115 return;
1116
1117 mall_tc_entry = dsa_slave_mall_tc_entry_find(p, cls->cookie);
1118 if (!mall_tc_entry)
1119 return;
1120
1121 list_del(&mall_tc_entry->list);
1122
1123 switch (mall_tc_entry->type) {
1124 case DSA_PORT_MALL_MIRROR:
1125 ds->ops->port_mirror_del(ds, p->dp->index,
1126 &mall_tc_entry->mirror);
1127 break;
1128 default:
1129 WARN_ON(1);
1130 }
1131
1132 kfree(mall_tc_entry);
1133}
1134
1135static int dsa_slave_setup_tc(struct net_device *dev, u32 handle,
1136 __be16 protocol, struct tc_to_netdev *tc)
1137{
1138 bool ingress = TC_H_MAJ(handle) == TC_H_MAJ(TC_H_INGRESS);
1139 int ret = -EOPNOTSUPP;
1140
1141 switch (tc->type) {
1142 case TC_SETUP_MATCHALL:
1143 switch (tc->cls_mall->command) {
1144 case TC_CLSMATCHALL_REPLACE:
1145 return dsa_slave_add_cls_matchall(dev, protocol,
1146 tc->cls_mall,
1147 ingress);
1148 case TC_CLSMATCHALL_DESTROY:
1149 dsa_slave_del_cls_matchall(dev, tc->cls_mall);
1150 return 0;
1151 }
1152 default:
1153 break;
1154 }
1155
1156 return ret;
1157}
1158
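The matchall path above only offloads a single mirred mirror action on an ETH_P_ALL matchall classifier; anything else falls through to -EOPNOTSUPP. A hedged sketch of the driver callback it ultimately invokes; struct foo_priv and the foo_set_*_monitor helpers are invented for illustration, only the dsa_switch_ops signature is from this series:

static int foo_port_mirror_add(struct dsa_switch *ds, int port,
			       struct dsa_mall_mirror_tc_entry *mirror,
			       bool ingress)
{
	struct foo_priv *priv = ds->priv;

	/* Copy traffic seen on 'port' to the capture port picked by tc */
	if (ingress)
		return foo_set_ingress_monitor(priv, port,
					       mirror->to_local_port);

	return foo_set_egress_monitor(priv, port, mirror->to_local_port);
}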
985void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops) 1159void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops)
986{ 1160{
987 ops->get_sset_count = dsa_cpu_port_get_sset_count; 1161 ops->get_sset_count = dsa_cpu_port_get_sset_count;
@@ -989,9 +1163,31 @@ void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops)
989 ops->get_strings = dsa_cpu_port_get_strings; 1163 ops->get_strings = dsa_cpu_port_get_strings;
990} 1164}
991 1165
1166static int dsa_slave_get_rxnfc(struct net_device *dev,
1167 struct ethtool_rxnfc *nfc, u32 *rule_locs)
1168{
1169 struct dsa_slave_priv *p = netdev_priv(dev);
1170 struct dsa_switch *ds = p->dp->ds;
1171
1172 if (!ds->ops->get_rxnfc)
1173 return -EOPNOTSUPP;
1174
1175 return ds->ops->get_rxnfc(ds, p->dp->index, nfc, rule_locs);
1176}
1177
1178static int dsa_slave_set_rxnfc(struct net_device *dev,
1179 struct ethtool_rxnfc *nfc)
1180{
1181 struct dsa_slave_priv *p = netdev_priv(dev);
1182 struct dsa_switch *ds = p->dp->ds;
1183
1184 if (!ds->ops->set_rxnfc)
1185 return -EOPNOTSUPP;
1186
1187 return ds->ops->set_rxnfc(ds, p->dp->index, nfc);
1188}
1189
992static const struct ethtool_ops dsa_slave_ethtool_ops = { 1190static const struct ethtool_ops dsa_slave_ethtool_ops = {
993 .get_settings = dsa_slave_get_settings,
994 .set_settings = dsa_slave_set_settings,
995 .get_drvinfo = dsa_slave_get_drvinfo, 1191 .get_drvinfo = dsa_slave_get_drvinfo,
996 .get_regs_len = dsa_slave_get_regs_len, 1192 .get_regs_len = dsa_slave_get_regs_len,
997 .get_regs = dsa_slave_get_regs, 1193 .get_regs = dsa_slave_get_regs,
@@ -1007,6 +1203,10 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
1007 .get_wol = dsa_slave_get_wol, 1203 .get_wol = dsa_slave_get_wol,
1008 .set_eee = dsa_slave_set_eee, 1204 .set_eee = dsa_slave_set_eee,
1009 .get_eee = dsa_slave_get_eee, 1205 .get_eee = dsa_slave_get_eee,
1206 .get_link_ksettings = dsa_slave_get_link_ksettings,
1207 .set_link_ksettings = dsa_slave_set_link_ksettings,
1208 .get_rxnfc = dsa_slave_get_rxnfc,
1209 .set_rxnfc = dsa_slave_set_rxnfc,
1010}; 1210};
1011 1211
1012static const struct net_device_ops dsa_slave_netdev_ops = { 1212static const struct net_device_ops dsa_slave_netdev_ops = {
@@ -1029,6 +1229,8 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
1029 .ndo_bridge_getlink = switchdev_port_bridge_getlink, 1229 .ndo_bridge_getlink = switchdev_port_bridge_getlink,
1030 .ndo_bridge_setlink = switchdev_port_bridge_setlink, 1230 .ndo_bridge_setlink = switchdev_port_bridge_setlink,
1031 .ndo_bridge_dellink = switchdev_port_bridge_dellink, 1231 .ndo_bridge_dellink = switchdev_port_bridge_dellink,
1232 .ndo_get_phys_port_name = dsa_slave_get_phys_port_name,
1233 .ndo_setup_tc = dsa_slave_setup_tc,
1032}; 1234};
1033 1235
1034static const struct switchdev_ops dsa_slave_switchdev_ops = { 1236static const struct switchdev_ops dsa_slave_switchdev_ops = {
@@ -1046,7 +1248,7 @@ static struct device_type dsa_type = {
1046static void dsa_slave_adjust_link(struct net_device *dev) 1248static void dsa_slave_adjust_link(struct net_device *dev)
1047{ 1249{
1048 struct dsa_slave_priv *p = netdev_priv(dev); 1250 struct dsa_slave_priv *p = netdev_priv(dev);
1049 struct dsa_switch *ds = p->parent; 1251 struct dsa_switch *ds = p->dp->ds;
1050 unsigned int status_changed = 0; 1252 unsigned int status_changed = 0;
1051 1253
1052 if (p->old_link != p->phy->link) { 1254 if (p->old_link != p->phy->link) {
@@ -1065,7 +1267,7 @@ static void dsa_slave_adjust_link(struct net_device *dev)
1065 } 1267 }
1066 1268
1067 if (ds->ops->adjust_link && status_changed) 1269 if (ds->ops->adjust_link && status_changed)
1068 ds->ops->adjust_link(ds, p->port, p->phy); 1270 ds->ops->adjust_link(ds, p->dp->index, p->phy);
1069 1271
1070 if (status_changed) 1272 if (status_changed)
1071 phy_print_status(p->phy); 1273 phy_print_status(p->phy);
@@ -1079,9 +1281,9 @@ static int dsa_slave_fixed_link_update(struct net_device *dev,
1079 1281
1080 if (dev) { 1282 if (dev) {
1081 p = netdev_priv(dev); 1283 p = netdev_priv(dev);
1082 ds = p->parent; 1284 ds = p->dp->ds;
1083 if (ds->ops->fixed_link_update) 1285 if (ds->ops->fixed_link_update)
1084 ds->ops->fixed_link_update(ds, p->port, status); 1286 ds->ops->fixed_link_update(ds, p->dp->index, status);
1085 } 1287 }
1086 1288
1087 return 0; 1289 return 0;
@@ -1092,7 +1294,7 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p,
1092 struct net_device *slave_dev, 1294 struct net_device *slave_dev,
1093 int addr) 1295 int addr)
1094{ 1296{
1095 struct dsa_switch *ds = p->parent; 1297 struct dsa_switch *ds = p->dp->ds;
1096 1298
1097 p->phy = mdiobus_get_phy(ds->slave_mii_bus, addr); 1299 p->phy = mdiobus_get_phy(ds->slave_mii_bus, addr);
1098 if (!p->phy) { 1300 if (!p->phy) {
@@ -1103,22 +1305,20 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p,
1103 /* Use already configured phy mode */ 1305 /* Use already configured phy mode */
1104 if (p->phy_interface == PHY_INTERFACE_MODE_NA) 1306 if (p->phy_interface == PHY_INTERFACE_MODE_NA)
1105 p->phy_interface = p->phy->interface; 1307 p->phy_interface = p->phy->interface;
1106 phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link, 1308 return phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link,
1107 p->phy_interface); 1309 p->phy_interface);
1108
1109 return 0;
1110} 1310}
1111 1311
1112static int dsa_slave_phy_setup(struct dsa_slave_priv *p, 1312static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
1113 struct net_device *slave_dev) 1313 struct net_device *slave_dev)
1114{ 1314{
1115 struct dsa_switch *ds = p->parent; 1315 struct dsa_switch *ds = p->dp->ds;
1116 struct device_node *phy_dn, *port_dn; 1316 struct device_node *phy_dn, *port_dn;
1117 bool phy_is_fixed = false; 1317 bool phy_is_fixed = false;
1118 u32 phy_flags = 0; 1318 u32 phy_flags = 0;
1119 int mode, ret; 1319 int mode, ret;
1120 1320
1121 port_dn = ds->ports[p->port].dn; 1321 port_dn = p->dp->dn;
1122 mode = of_get_phy_mode(port_dn); 1322 mode = of_get_phy_mode(port_dn);
1123 if (mode < 0) 1323 if (mode < 0)
1124 mode = PHY_INTERFACE_MODE_NA; 1324 mode = PHY_INTERFACE_MODE_NA;
@@ -1139,7 +1339,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
1139 } 1339 }
1140 1340
1141 if (ds->ops->get_phy_flags) 1341 if (ds->ops->get_phy_flags)
1142 phy_flags = ds->ops->get_phy_flags(ds, p->port); 1342 phy_flags = ds->ops->get_phy_flags(ds, p->dp->index);
1143 1343
1144 if (phy_dn) { 1344 if (phy_dn) {
1145 int phy_id = of_mdio_parse_addr(&slave_dev->dev, phy_dn); 1345 int phy_id = of_mdio_parse_addr(&slave_dev->dev, phy_dn);
@@ -1174,9 +1374,10 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
1174 * MDIO bus instead 1374 * MDIO bus instead
1175 */ 1375 */
1176 if (!p->phy) { 1376 if (!p->phy) {
1177 ret = dsa_slave_phy_connect(p, slave_dev, p->port); 1377 ret = dsa_slave_phy_connect(p, slave_dev, p->dp->index);
1178 if (ret) { 1378 if (ret) {
1179 netdev_err(slave_dev, "failed to connect to port %d: %d\n", p->port, ret); 1379 netdev_err(slave_dev, "failed to connect to port %d: %d\n",
1380 p->dp->index, ret);
1180 if (phy_is_fixed) 1381 if (phy_is_fixed)
1181 of_phy_deregister_fixed_link(port_dn); 1382 of_phy_deregister_fixed_link(port_dn);
1182 return ret; 1383 return ret;
@@ -1201,6 +1402,8 @@ int dsa_slave_suspend(struct net_device *slave_dev)
1201{ 1402{
1202 struct dsa_slave_priv *p = netdev_priv(slave_dev); 1403 struct dsa_slave_priv *p = netdev_priv(slave_dev);
1203 1404
1405 netif_device_detach(slave_dev);
1406
1204 if (p->phy) { 1407 if (p->phy) {
1205 phy_stop(p->phy); 1408 phy_stop(p->phy);
1206 p->old_pause = -1; 1409 p->old_pause = -1;
@@ -1244,12 +1447,15 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
1244 if (slave_dev == NULL) 1447 if (slave_dev == NULL)
1245 return -ENOMEM; 1448 return -ENOMEM;
1246 1449
1247 slave_dev->features = master->vlan_features; 1450 slave_dev->features = master->vlan_features | NETIF_F_HW_TC;
1451 slave_dev->hw_features |= NETIF_F_HW_TC;
1248 slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; 1452 slave_dev->ethtool_ops = &dsa_slave_ethtool_ops;
1249 eth_hw_addr_inherit(slave_dev, master); 1453 eth_hw_addr_inherit(slave_dev, master);
1250 slave_dev->priv_flags |= IFF_NO_QUEUE; 1454 slave_dev->priv_flags |= IFF_NO_QUEUE;
1251 slave_dev->netdev_ops = &dsa_slave_netdev_ops; 1455 slave_dev->netdev_ops = &dsa_slave_netdev_ops;
1252 slave_dev->switchdev_ops = &dsa_slave_switchdev_ops; 1456 slave_dev->switchdev_ops = &dsa_slave_switchdev_ops;
1457 slave_dev->min_mtu = 0;
1458 slave_dev->max_mtu = ETH_MAX_MTU;
1253 SET_NETDEV_DEVTYPE(slave_dev, &dsa_type); 1459 SET_NETDEV_DEVTYPE(slave_dev, &dsa_type);
1254 1460
1255 netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one, 1461 netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one,
@@ -1260,8 +1466,8 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
1260 slave_dev->vlan_features = master->vlan_features; 1466 slave_dev->vlan_features = master->vlan_features;
1261 1467
1262 p = netdev_priv(slave_dev); 1468 p = netdev_priv(slave_dev);
1263 p->parent = ds; 1469 p->dp = &ds->ports[port];
1264 p->port = port; 1470 INIT_LIST_HEAD(&p->mall_tc_list);
1265 p->xmit = dst->tag_ops->xmit; 1471 p->xmit = dst->tag_ops->xmit;
1266 1472
1267 p->old_pause = -1; 1473 p->old_pause = -1;
@@ -1294,10 +1500,9 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
1294void dsa_slave_destroy(struct net_device *slave_dev) 1500void dsa_slave_destroy(struct net_device *slave_dev)
1295{ 1501{
1296 struct dsa_slave_priv *p = netdev_priv(slave_dev); 1502 struct dsa_slave_priv *p = netdev_priv(slave_dev);
1297 struct dsa_switch *ds = p->parent;
1298 struct device_node *port_dn; 1503 struct device_node *port_dn;
1299 1504
1300 port_dn = ds->ports[p->port].dn; 1505 port_dn = p->dp->dn;
1301 1506
1302 netif_carrier_off(slave_dev); 1507 netif_carrier_off(slave_dev);
1303 if (p->phy) { 1508 if (p->phy) {
@@ -1315,46 +1520,52 @@ static bool dsa_slave_dev_check(struct net_device *dev)
1315 return dev->netdev_ops == &dsa_slave_netdev_ops; 1520 return dev->netdev_ops == &dsa_slave_netdev_ops;
1316} 1521}
1317 1522
1318static int dsa_slave_port_upper_event(struct net_device *dev, 1523static int dsa_slave_changeupper(struct net_device *dev,
1319 unsigned long event, void *ptr) 1524 struct netdev_notifier_changeupper_info *info)
1320{ 1525{
1321 struct netdev_notifier_changeupper_info *info = ptr; 1526 int err = NOTIFY_DONE;
1322 struct net_device *upper = info->upper_dev;
1323 int err = 0;
1324 1527
1325 switch (event) { 1528 if (netif_is_bridge_master(info->upper_dev)) {
1326 case NETDEV_CHANGEUPPER: 1529 if (info->linking) {
1327 if (netif_is_bridge_master(upper)) { 1530 err = dsa_slave_bridge_port_join(dev, info->upper_dev);
1328 if (info->linking) 1531 err = notifier_from_errno(err);
1329 err = dsa_slave_bridge_port_join(dev, upper); 1532 } else {
1330 else 1533 dsa_slave_bridge_port_leave(dev, info->upper_dev);
1331 dsa_slave_bridge_port_leave(dev); 1534 err = NOTIFY_OK;
1332 } 1535 }
1333
1334 break;
1335 } 1536 }
1336 1537
1337 return notifier_from_errno(err); 1538 return err;
1338} 1539}
1339 1540
1340static int dsa_slave_port_event(struct net_device *dev, unsigned long event, 1541static int dsa_slave_netdevice_event(struct notifier_block *nb,
1341 void *ptr) 1542 unsigned long event, void *ptr)
1342{ 1543{
1343 switch (event) { 1544 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1344 case NETDEV_CHANGEUPPER: 1545
1345 return dsa_slave_port_upper_event(dev, event, ptr); 1546 if (dev->netdev_ops != &dsa_slave_netdev_ops)
1346 } 1547 return NOTIFY_DONE;
1548
1549 if (event == NETDEV_CHANGEUPPER)
1550 return dsa_slave_changeupper(dev, ptr);
1347 1551
1348 return NOTIFY_DONE; 1552 return NOTIFY_DONE;
1349} 1553}
1350 1554
1351int dsa_slave_netdevice_event(struct notifier_block *unused, 1555static struct notifier_block dsa_slave_nb __read_mostly = {
1352 unsigned long event, void *ptr) 1556 .notifier_call = dsa_slave_netdevice_event,
1557};
1558
1559int dsa_slave_register_notifier(void)
1353{ 1560{
1354 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 1561 return register_netdevice_notifier(&dsa_slave_nb);
1562}
1355 1563
1356 if (dsa_slave_dev_check(dev)) 1564void dsa_slave_unregister_notifier(void)
1357 return dsa_slave_port_event(dev, event, ptr); 1565{
1566 int err;
1358 1567
1359 return NOTIFY_DONE; 1568 err = unregister_netdevice_notifier(&dsa_slave_nb);
1569 if (err)
1570 pr_err("DSA: failed to unregister slave notifier (%d)\n", err);
1360} 1571}
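With the rework above, the netdevice notifier is registered once for the whole DSA core instead of being exported for per-event handling elsewhere; the expected callers of the new register/unregister helpers are the core module init and exit paths. A rough sketch under that assumption, with the function names treated as illustrative:

static int __init dsa_init_module(void)
{
	int rc;

	rc = dsa_slave_register_notifier();
	if (rc)
		return rc;

	/* remaining core initialization (tagging protocols, sysfs, ...) */

	return 0;
}

static void __exit dsa_cleanup_module(void)
{
	/* remaining core teardown */

	dsa_slave_unregister_notifier();
}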
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
new file mode 100644
index 000000000000..6456dacf9ae9
--- /dev/null
+++ b/net/dsa/switch.c
@@ -0,0 +1,85 @@
1/*
2 * Handling of a single switch chip, part of a switch fabric
3 *
4 * Copyright (c) 2017 Vivien Didelot <vivien.didelot@savoirfairelinux.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 */
11
12#include <linux/netdevice.h>
13#include <linux/notifier.h>
14#include <net/dsa.h>
15
16static int dsa_switch_bridge_join(struct dsa_switch *ds,
17 struct dsa_notifier_bridge_info *info)
18{
19 if (ds->index == info->sw_index && ds->ops->port_bridge_join)
20 return ds->ops->port_bridge_join(ds, info->port, info->br);
21
22 if (ds->index != info->sw_index)
23 dev_dbg(ds->dev, "crosschip DSA port %d.%d bridged to %s\n",
24 info->sw_index, info->port, netdev_name(info->br));
25
26 return 0;
27}
28
29static int dsa_switch_bridge_leave(struct dsa_switch *ds,
30 struct dsa_notifier_bridge_info *info)
31{
32 if (ds->index == info->sw_index && ds->ops->port_bridge_leave)
33 ds->ops->port_bridge_leave(ds, info->port, info->br);
34
35 if (ds->index != info->sw_index)
36 dev_dbg(ds->dev, "crosschip DSA port %d.%d unbridged from %s\n",
37 info->sw_index, info->port, netdev_name(info->br));
38
39 return 0;
40}
41
42static int dsa_switch_event(struct notifier_block *nb,
43 unsigned long event, void *info)
44{
45 struct dsa_switch *ds = container_of(nb, struct dsa_switch, nb);
46 int err;
47
48 switch (event) {
49 case DSA_NOTIFIER_BRIDGE_JOIN:
50 err = dsa_switch_bridge_join(ds, info);
51 break;
52 case DSA_NOTIFIER_BRIDGE_LEAVE:
53 err = dsa_switch_bridge_leave(ds, info);
54 break;
55 default:
56 err = -EOPNOTSUPP;
57 break;
58 }
59
60 /* Non-switchdev operations cannot be rolled back. If a DSA driver
61 * returns an error during the chained call, switch chips may be in an
62 * inconsistent state.
63 */
64 if (err)
65 dev_dbg(ds->dev, "breaking chain for DSA event %lu (%d)\n",
66 event, err);
67
68 return notifier_from_errno(err);
69}
70
71int dsa_switch_register_notifier(struct dsa_switch *ds)
72{
73 ds->nb.notifier_call = dsa_switch_event;
74
75 return raw_notifier_chain_register(&ds->dst->nh, &ds->nb);
76}
77
78void dsa_switch_unregister_notifier(struct dsa_switch *ds)
79{
80 int err;
81
82 err = raw_notifier_chain_unregister(&ds->dst->nh, &ds->nb);
83 if (err)
84 dev_err(ds->dev, "failed to unregister notifier (%d)\n", err);
85}
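The new switch.c above completes the bridge notifier path: dsa_slave_notify() in slave.c raises DSA_NOTIFIER_BRIDGE_JOIN or _LEAVE on the tree's raw notifier chain, every switch registered on that chain runs dsa_switch_event(), and only the chip owning the port calls its bridge op while the other chips just log the cross-chip event for now. A hedged sketch of the driver-side callback, with struct foo_priv and foo_apply_bridge_config invented for illustration:

static int foo_port_bridge_join(struct dsa_switch *ds, int port,
				struct net_device *br)
{
	struct foo_priv *priv = ds->priv;

	/* Record membership, then reprogram port-based VLAN isolation */
	priv->ports[port].bridge_dev = br;

	return foo_apply_bridge_config(priv, port);
}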
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index 21bffde6e4bf..5d925b6b2bb1 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -80,9 +80,9 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev
80 ((skb->priority << BRCM_IG_TC_SHIFT) & BRCM_IG_TC_MASK); 80 ((skb->priority << BRCM_IG_TC_SHIFT) & BRCM_IG_TC_MASK);
81 brcm_tag[1] = 0; 81 brcm_tag[1] = 0;
82 brcm_tag[2] = 0; 82 brcm_tag[2] = 0;
83 if (p->port == 8) 83 if (p->dp->index == 8)
84 brcm_tag[2] = BRCM_IG_DSTMAP2_MASK; 84 brcm_tag[2] = BRCM_IG_DSTMAP2_MASK;
85 brcm_tag[3] = (1 << p->port) & BRCM_IG_DSTMAP1_MASK; 85 brcm_tag[3] = (1 << p->dp->index) & BRCM_IG_DSTMAP1_MASK;
86 86
87 return skb; 87 return skb;
88 88
@@ -102,7 +102,7 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
102 if (unlikely(dst == NULL)) 102 if (unlikely(dst == NULL))
103 goto out_drop; 103 goto out_drop;
104 104
105 ds = dst->ds[0]; 105 ds = dst->cpu_switch;
106 106
107 skb = skb_unshare(skb, GFP_ATOMIC); 107 skb = skb_unshare(skb, GFP_ATOMIC);
108 if (skb == NULL) 108 if (skb == NULL)
@@ -121,13 +121,14 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
121 /* We should never see a reserved reason code without knowing how to 121 /* We should never see a reserved reason code without knowing how to
122 * handle it 122 * handle it
123 */ 123 */
124 WARN_ON(brcm_tag[2] & BRCM_EG_RC_RSVD); 124 if (unlikely(brcm_tag[2] & BRCM_EG_RC_RSVD))
125 goto out_drop;
125 126
126 /* Locate which port this is coming from */ 127 /* Locate which port this is coming from */
127 source_port = brcm_tag[3] & BRCM_EG_PID_MASK; 128 source_port = brcm_tag[3] & BRCM_EG_PID_MASK;
128 129
129 /* Validate port against switch setup, either the port is totally */ 130 /* Validate port against switch setup, either the port is totally */
130 if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) 131 if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
131 goto out_drop; 132 goto out_drop;
132 133
133 /* Remove Broadcom tag and update checksum */ 134 /* Remove Broadcom tag and update checksum */
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index bce79ffe342b..72579ceea381 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -33,8 +33,8 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
33 * Construct tagged FROM_CPU DSA tag from 802.1q tag. 33 * Construct tagged FROM_CPU DSA tag from 802.1q tag.
34 */ 34 */
35 dsa_header = skb->data + 2 * ETH_ALEN; 35 dsa_header = skb->data + 2 * ETH_ALEN;
36 dsa_header[0] = 0x60 | p->parent->index; 36 dsa_header[0] = 0x60 | p->dp->ds->index;
37 dsa_header[1] = p->port << 3; 37 dsa_header[1] = p->dp->index << 3;
38 38
39 /* 39 /*
40 * Move CFI field from byte 2 to byte 1. 40 * Move CFI field from byte 2 to byte 1.
@@ -54,8 +54,8 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
54 * Construct untagged FROM_CPU DSA tag. 54 * Construct untagged FROM_CPU DSA tag.
55 */ 55 */
56 dsa_header = skb->data + 2 * ETH_ALEN; 56 dsa_header = skb->data + 2 * ETH_ALEN;
57 dsa_header[0] = 0x40 | p->parent->index; 57 dsa_header[0] = 0x40 | p->dp->ds->index;
58 dsa_header[1] = p->port << 3; 58 dsa_header[1] = p->dp->index << 3;
59 dsa_header[2] = 0x00; 59 dsa_header[2] = 0x00;
60 dsa_header[3] = 0x00; 60 dsa_header[3] = 0x00;
61 } 61 }
@@ -114,7 +114,7 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev,
114 if (!ds) 114 if (!ds)
115 goto out_drop; 115 goto out_drop;
116 116
117 if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) 117 if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
118 goto out_drop; 118 goto out_drop;
119 119
120 /* 120 /*
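For concreteness, the FROM_CPU header bytes assembled in tag_dsa.c above work out as follows for a hypothetical switch index 1 and port index 3 (values chosen purely for illustration):

/*
 * dsa_header[0] = 0x60 | ds->index = 0x60 | 1 = 0x61  (tagged FROM_CPU)
 * dsa_header[1] = port index << 3  = 3 << 3  = 0x18  (port in bits 7:3)
 *
 * The untagged variant differs only in the first byte: 0x40 | 1 = 0x41.
 */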
diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c
index 6c1720e88537..648c051817a1 100644
--- a/net/dsa/tag_edsa.c
+++ b/net/dsa/tag_edsa.c
@@ -42,8 +42,8 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
42 edsa_header[1] = ETH_P_EDSA & 0xff; 42 edsa_header[1] = ETH_P_EDSA & 0xff;
43 edsa_header[2] = 0x00; 43 edsa_header[2] = 0x00;
44 edsa_header[3] = 0x00; 44 edsa_header[3] = 0x00;
45 edsa_header[4] = 0x60 | p->parent->index; 45 edsa_header[4] = 0x60 | p->dp->ds->index;
46 edsa_header[5] = p->port << 3; 46 edsa_header[5] = p->dp->index << 3;
47 47
48 /* 48 /*
49 * Move CFI field from byte 6 to byte 5. 49 * Move CFI field from byte 6 to byte 5.
@@ -67,8 +67,8 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
67 edsa_header[1] = ETH_P_EDSA & 0xff; 67 edsa_header[1] = ETH_P_EDSA & 0xff;
68 edsa_header[2] = 0x00; 68 edsa_header[2] = 0x00;
69 edsa_header[3] = 0x00; 69 edsa_header[3] = 0x00;
70 edsa_header[4] = 0x40 | p->parent->index; 70 edsa_header[4] = 0x40 | p->dp->ds->index;
71 edsa_header[5] = p->port << 3; 71 edsa_header[5] = p->dp->index << 3;
72 edsa_header[6] = 0x00; 72 edsa_header[6] = 0x00;
73 edsa_header[7] = 0x00; 73 edsa_header[7] = 0x00;
74 } 74 }
@@ -127,7 +127,7 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev,
127 if (!ds) 127 if (!ds)
128 goto out_drop; 128 goto out_drop;
129 129
130 if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) 130 if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
131 goto out_drop; 131 goto out_drop;
132 132
133 /* 133 /*
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
index 0c90cacee7aa..30240f343aea 100644
--- a/net/dsa/tag_qca.c
+++ b/net/dsa/tag_qca.c
@@ -54,7 +54,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
54 /* Set the version field, and set destination port information */ 54 /* Set the version field, and set destination port information */
55 hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S | 55 hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S |
56 QCA_HDR_XMIT_FROM_CPU | 56 QCA_HDR_XMIT_FROM_CPU |
57 BIT(p->port); 57 BIT(p->dp->index);
58 58
59 *phdr = htons(hdr); 59 *phdr = htons(hdr);
60 60
@@ -104,7 +104,7 @@ static int qca_tag_rcv(struct sk_buff *skb, struct net_device *dev,
104 /* This protocol doesn't support cascading multiple switches so it's 104 /* This protocol doesn't support cascading multiple switches so it's
105 * safe to assume the switch is first in the tree 105 * safe to assume the switch is first in the tree
106 */ 106 */
107 ds = dst->ds[0]; 107 ds = dst->cpu_switch;
108 if (!ds) 108 if (!ds)
109 goto out_drop; 109 goto out_drop;
110 110
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index 5e3903eb1afa..26f977176978 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -50,7 +50,7 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev)
50 50
51 trailer = skb_put(nskb, 4); 51 trailer = skb_put(nskb, 4);
52 trailer[0] = 0x80; 52 trailer[0] = 0x80;
53 trailer[1] = 1 << p->port; 53 trailer[1] = 1 << p->dp->index;
54 trailer[2] = 0x10; 54 trailer[2] = 0x10;
55 trailer[3] = 0x00; 55 trailer[3] = 0x00;
56 56
@@ -67,7 +67,7 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,
67 67
68 if (unlikely(dst == NULL)) 68 if (unlikely(dst == NULL))
69 goto out_drop; 69 goto out_drop;
70 ds = dst->ds[0]; 70 ds = dst->cpu_switch;
71 71
72 skb = skb_unshare(skb, GFP_ATOMIC); 72 skb = skb_unshare(skb, GFP_ATOMIC);
73 if (skb == NULL) 73 if (skb == NULL)
@@ -82,7 +82,7 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,
82 goto out_drop; 82 goto out_drop;
83 83
84 source_port = trailer[1] & 7; 84 source_port = trailer[1] & 7;
85 if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) 85 if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
86 goto out_drop; 86 goto out_drop;
87 87
88 pskb_trim_rcsum(skb, skb->len - 4); 88 pskb_trim_rcsum(skb, skb->len - 4);
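
For the receive direction, a small sketch of how the 4-byte trailer parsed above decomposes. Note the asymmetry: trailer_xmit() puts a destination port bitmask in byte 1 (1 << port), while trailer_rcv() reads the source port number from the low three bits. The validity check here is an assumption beyond what the hunks show.

    #include <stdint.h>

    static int parse_trailer_source_port(const uint8_t trailer[4])
    {
            /* assumed sanity check: byte 0 is always 0x80 on receive */
            if (trailer[0] != 0x80 || (trailer[1] & 0xf8) != 0x00)
                    return -1;
            return trailer[1] & 7;  /* same extraction as trailer_rcv() */
    }
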
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 02acfff36028..1446810047f5 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -62,6 +62,7 @@
62#include <net/dsa.h> 62#include <net/dsa.h>
63#include <net/flow_dissector.h> 63#include <net/flow_dissector.h>
64#include <linux/uaccess.h> 64#include <linux/uaccess.h>
65#include <net/pkt_sched.h>
65 66
66__setup("ether=", netdev_boot_setup); 67__setup("ether=", netdev_boot_setup);
67 68
@@ -322,8 +323,7 @@ EXPORT_SYMBOL(eth_mac_addr);
322 */ 323 */
323int eth_change_mtu(struct net_device *dev, int new_mtu) 324int eth_change_mtu(struct net_device *dev, int new_mtu)
324{ 325{
325 if (new_mtu < 68 || new_mtu > ETH_DATA_LEN) 326 netdev_warn(dev, "%s is deprecated\n", __func__);
326 return -EINVAL;
327 dev->mtu = new_mtu; 327 dev->mtu = new_mtu;
328 return 0; 328 return 0;
329} 329}
@@ -356,9 +356,12 @@ void ether_setup(struct net_device *dev)
356 dev->header_ops = &eth_header_ops; 356 dev->header_ops = &eth_header_ops;
357 dev->type = ARPHRD_ETHER; 357 dev->type = ARPHRD_ETHER;
358 dev->hard_header_len = ETH_HLEN; 358 dev->hard_header_len = ETH_HLEN;
359 dev->min_header_len = ETH_HLEN;
359 dev->mtu = ETH_DATA_LEN; 360 dev->mtu = ETH_DATA_LEN;
361 dev->min_mtu = ETH_MIN_MTU;
362 dev->max_mtu = ETH_DATA_LEN;
360 dev->addr_len = ETH_ALEN; 363 dev->addr_len = ETH_ALEN;
361 dev->tx_queue_len = 1000; /* Ethernet wants good queues */ 364 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
362 dev->flags = IFF_BROADCAST|IFF_MULTICAST; 365 dev->flags = IFF_BROADCAST|IFF_MULTICAST;
363 dev->priv_flags |= IFF_TX_SKB_SHARING; 366 dev->priv_flags |= IFF_TX_SKB_SHARING;
364 367
@@ -390,6 +393,34 @@ struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
390} 393}
391EXPORT_SYMBOL(alloc_etherdev_mqs); 394EXPORT_SYMBOL(alloc_etherdev_mqs);
392 395
396static void devm_free_netdev(struct device *dev, void *res)
397{
398 free_netdev(*(struct net_device **)res);
399}
400
401struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv,
402 unsigned int txqs, unsigned int rxqs)
403{
404 struct net_device **dr;
405 struct net_device *netdev;
406
407 dr = devres_alloc(devm_free_netdev, sizeof(*dr), GFP_KERNEL);
408 if (!dr)
409 return NULL;
410
411 netdev = alloc_etherdev_mqs(sizeof_priv, txqs, rxqs);
412 if (!netdev) {
413 devres_free(dr);
414 return NULL;
415 }
416
417 *dr = netdev;
418 devres_add(dev, dr);
419
420 return netdev;
421}
422EXPORT_SYMBOL(devm_alloc_etherdev_mqs);
423
393ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) 424ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
394{ 425{
395 return scnprintf(buf, PAGE_SIZE, "%*phC\n", len, addr); 426 return scnprintf(buf, PAGE_SIZE, "%*phC\n", len, addr);
@@ -444,7 +475,7 @@ struct sk_buff **eth_gro_receive(struct sk_buff **head,
444out_unlock: 475out_unlock:
445 rcu_read_unlock(); 476 rcu_read_unlock();
446out: 477out:
447 NAPI_GRO_CB(skb)->flush |= flush; 478 skb_gro_flush_final(skb, pp, flush);
448 479
449 return pp; 480 return pp;
450} 481}
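
The new devm_alloc_etherdev_mqs() ties the net_device's lifetime to a struct device through devres, so callers no longer need free_netdev() on their error and remove paths. A minimal sketch of how a driver probe might use it, assuming the declaration is exposed via <linux/etherdevice.h> by this series; the "foo" names are made up.

    #include <linux/etherdevice.h>
    #include <linux/platform_device.h>

    struct foo_priv {
            void __iomem *regs;             /* hypothetical private state */
    };

    static int foo_probe(struct platform_device *pdev)
    {
            struct net_device *netdev;

            /* freed by devm_free_netdev() when &pdev->dev is released */
            netdev = devm_alloc_etherdev_mqs(&pdev->dev, sizeof(struct foo_priv),
                                             1, 1);
            if (!netdev)
                    return -ENOMEM;

            SET_NETDEV_DEV(netdev, &pdev->dev);
            eth_hw_addr_random(netdev);
            /* ... hardware setup, netdev_ops wiring ... */
            return register_netdev(netdev); /* remove() still unregisters */
    }
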
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index 16737cd8dae8..c73160fb11e7 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -395,9 +395,10 @@ static struct device_type hsr_type = {
395 395
396void hsr_dev_setup(struct net_device *dev) 396void hsr_dev_setup(struct net_device *dev)
397{ 397{
398 random_ether_addr(dev->dev_addr); 398 eth_hw_addr_random(dev);
399 399
400 ether_setup(dev); 400 ether_setup(dev);
401 dev->min_mtu = 0;
401 dev->header_ops = &hsr_header_ops; 402 dev->header_ops = &hsr_header_ops;
402 dev->netdev_ops = &hsr_device_ops; 403 dev->netdev_ops = &hsr_device_ops;
403 SET_NETDEV_DEVTYPE(dev, &hsr_type); 404 SET_NETDEV_DEVTYPE(dev, &hsr_type);
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index d4d1617f43a8..1ab30e7d3f99 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -131,13 +131,7 @@ static const struct nla_policy hsr_genl_policy[HSR_A_MAX + 1] = {
131 [HSR_A_IF2_SEQ] = { .type = NLA_U16 }, 131 [HSR_A_IF2_SEQ] = { .type = NLA_U16 },
132}; 132};
133 133
134static struct genl_family hsr_genl_family = { 134static struct genl_family hsr_genl_family;
135 .id = GENL_ID_GENERATE,
136 .hdrsize = 0,
137 .name = "HSR",
138 .version = 1,
139 .maxattr = HSR_A_MAX,
140};
141 135
142static const struct genl_multicast_group hsr_mcgrps[] = { 136static const struct genl_multicast_group hsr_mcgrps[] = {
143 { .name = "hsr-network", }, 137 { .name = "hsr-network", },
@@ -467,6 +461,18 @@ static const struct genl_ops hsr_ops[] = {
467 }, 461 },
468}; 462};
469 463
464static struct genl_family hsr_genl_family __ro_after_init = {
465 .hdrsize = 0,
466 .name = "HSR",
467 .version = 1,
468 .maxattr = HSR_A_MAX,
469 .module = THIS_MODULE,
470 .ops = hsr_ops,
471 .n_ops = ARRAY_SIZE(hsr_ops),
472 .mcgrps = hsr_mcgrps,
473 .n_mcgrps = ARRAY_SIZE(hsr_mcgrps),
474};
475
470int __init hsr_netlink_init(void) 476int __init hsr_netlink_init(void)
471{ 477{
472 int rc; 478 int rc;
@@ -475,8 +481,7 @@ int __init hsr_netlink_init(void)
475 if (rc) 481 if (rc)
476 goto fail_rtnl_link_register; 482 goto fail_rtnl_link_register;
477 483
478 rc = genl_register_family_with_ops_groups(&hsr_genl_family, hsr_ops, 484 rc = genl_register_family(&hsr_genl_family);
479 hsr_mcgrps);
480 if (rc) 485 if (rc)
481 goto fail_genl_register_family; 486 goto fail_genl_register_family;
482 487
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index f5b60388d02f..56080da4aa77 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -12,6 +12,7 @@
12#include "hsr_slave.h" 12#include "hsr_slave.h"
13#include <linux/etherdevice.h> 13#include <linux/etherdevice.h>
14#include <linux/if_arp.h> 14#include <linux/if_arp.h>
15#include <linux/if_vlan.h>
15#include "hsr_main.h" 16#include "hsr_main.h"
16#include "hsr_device.h" 17#include "hsr_device.h"
17#include "hsr_forward.h" 18#include "hsr_forward.h"
@@ -81,7 +82,7 @@ static int hsr_check_dev_ok(struct net_device *dev)
81 return -EINVAL; 82 return -EINVAL;
82 } 83 }
83 84
84 if (dev->priv_flags & IFF_802_1Q_VLAN) { 85 if (is_vlan_dev(dev)) {
85 netdev_info(dev, "HSR on top of VLAN is not yet supported in this driver.\n"); 86 netdev_info(dev, "HSR on top of VLAN is not yet supported in this driver.\n");
86 return -EINVAL; 87 return -EINVAL;
87 } 88 }
diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h
index 5ac778962e4e..ac7c96b73ad5 100644
--- a/net/ieee802154/6lowpan/6lowpan_i.h
+++ b/net/ieee802154/6lowpan/6lowpan_i.h
@@ -7,7 +7,7 @@
7#include <net/inet_frag.h> 7#include <net/inet_frag.h>
8#include <net/6lowpan.h> 8#include <net/6lowpan.h>
9 9
10typedef unsigned __bitwise__ lowpan_rx_result; 10typedef unsigned __bitwise lowpan_rx_result;
11#define RX_CONTINUE ((__force lowpan_rx_result) 0u) 11#define RX_CONTINUE ((__force lowpan_rx_result) 0u)
12#define RX_DROP_UNUSABLE ((__force lowpan_rx_result) 1u) 12#define RX_DROP_UNUSABLE ((__force lowpan_rx_result) 1u)
13#define RX_DROP ((__force lowpan_rx_result) 2u) 13#define RX_DROP ((__force lowpan_rx_result) 2u)
diff --git a/net/ieee802154/Makefile b/net/ieee802154/Makefile
index 4adfd4d5471b..9b92ade687a3 100644
--- a/net/ieee802154/Makefile
+++ b/net/ieee802154/Makefile
@@ -7,5 +7,3 @@ ieee802154-y := netlink.o nl-mac.o nl-phy.o nl_policy.o core.o \
7ieee802154_socket-y := socket.o 7ieee802154_socket-y := socket.o
8 8
9CFLAGS_trace.o := -I$(src) 9CFLAGS_trace.o := -I$(src)
10
11ccflags-y += -D__CHECK_ENDIAN__
diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c
index c8133c07ceee..6bde9e5a5503 100644
--- a/net/ieee802154/netlink.c
+++ b/net/ieee802154/netlink.c
@@ -28,14 +28,6 @@
28static unsigned int ieee802154_seq_num; 28static unsigned int ieee802154_seq_num;
29static DEFINE_SPINLOCK(ieee802154_seq_lock); 29static DEFINE_SPINLOCK(ieee802154_seq_lock);
30 30
31struct genl_family nl802154_family = {
32 .id = GENL_ID_GENERATE,
33 .hdrsize = 0,
34 .name = IEEE802154_NL_NAME,
35 .version = 1,
36 .maxattr = IEEE802154_ATTR_MAX,
37};
38
39/* Requests to userspace */ 31/* Requests to userspace */
40struct sk_buff *ieee802154_nl_create(int flags, u8 req) 32struct sk_buff *ieee802154_nl_create(int flags, u8 req)
41{ 33{
@@ -139,11 +131,21 @@ static const struct genl_multicast_group ieee802154_mcgrps[] = {
139 [IEEE802154_BEACON_MCGRP] = { .name = IEEE802154_MCAST_BEACON_NAME, }, 131 [IEEE802154_BEACON_MCGRP] = { .name = IEEE802154_MCAST_BEACON_NAME, },
140}; 132};
141 133
134struct genl_family nl802154_family __ro_after_init = {
135 .hdrsize = 0,
136 .name = IEEE802154_NL_NAME,
137 .version = 1,
138 .maxattr = IEEE802154_ATTR_MAX,
139 .module = THIS_MODULE,
140 .ops = ieee8021154_ops,
141 .n_ops = ARRAY_SIZE(ieee8021154_ops),
142 .mcgrps = ieee802154_mcgrps,
143 .n_mcgrps = ARRAY_SIZE(ieee802154_mcgrps),
144};
145
142int __init ieee802154_nl_init(void) 146int __init ieee802154_nl_init(void)
143{ 147{
144 return genl_register_family_with_ops_groups(&nl802154_family, 148 return genl_register_family(&nl802154_family);
145 ieee8021154_ops,
146 ieee802154_mcgrps);
147} 149}
148 150
149void ieee802154_nl_exit(void) 151void ieee802154_nl_exit(void)
diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c
index 77d73014bde3..dc2960be51e0 100644
--- a/net/ieee802154/nl-phy.c
+++ b/net/ieee802154/nl-phy.c
@@ -286,9 +286,12 @@ int ieee802154_del_iface(struct sk_buff *skb, struct genl_info *info)
286 if (name[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1] != '\0') 286 if (name[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1] != '\0')
287 return -EINVAL; /* name should be null-terminated */ 287 return -EINVAL; /* name should be null-terminated */
288 288
289 rc = -ENODEV;
289 dev = dev_get_by_name(genl_info_net(info), name); 290 dev = dev_get_by_name(genl_info_net(info), name);
290 if (!dev) 291 if (!dev)
291 return -ENODEV; 292 return rc;
293 if (dev->type != ARPHRD_IEEE802154)
294 goto out;
292 295
293 phy = dev->ieee802154_ptr->wpan_phy; 296 phy = dev->ieee802154_ptr->wpan_phy;
294 BUG_ON(!phy); 297 BUG_ON(!phy);
@@ -342,6 +345,7 @@ nla_put_failure:
342 nlmsg_free(msg); 345 nlmsg_free(msg);
343out_dev: 346out_dev:
344 wpan_phy_put(phy); 347 wpan_phy_put(phy);
348out:
345 if (dev) 349 if (dev)
346 dev_put(dev); 350 dev_put(dev);
347 351
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index d90a4ed5b8a0..fc60cd061f39 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -26,23 +26,8 @@
26#include "rdev-ops.h" 26#include "rdev-ops.h"
27#include "core.h" 27#include "core.h"
28 28
29static int nl802154_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
30 struct genl_info *info);
31
32static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
33 struct genl_info *info);
34
35/* the netlink family */ 29/* the netlink family */
36static struct genl_family nl802154_fam = { 30static struct genl_family nl802154_fam;
37 .id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */
38 .name = NL802154_GENL_NAME, /* have users key off the name instead */
39 .hdrsize = 0, /* no private header */
40 .version = 1, /* no particular meaning now */
41 .maxattr = NL802154_ATTR_MAX,
42 .netnsok = true,
43 .pre_doit = nl802154_pre_doit,
44 .post_doit = nl802154_post_doit,
45};
46 31
47/* multicast groups */ 32/* multicast groups */
48enum nl802154_multicast_groups { 33enum nl802154_multicast_groups {
@@ -263,13 +248,14 @@ nl802154_prepare_wpan_dev_dump(struct sk_buff *skb,
263 248
264 if (!cb->args[0]) { 249 if (!cb->args[0]) {
265 err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl802154_fam.hdrsize, 250 err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl802154_fam.hdrsize,
266 nl802154_fam.attrbuf, nl802154_fam.maxattr, 251 genl_family_attrbuf(&nl802154_fam),
252 nl802154_fam.maxattr,
267 nl802154_policy); 253 nl802154_policy);
268 if (err) 254 if (err)
269 goto out_unlock; 255 goto out_unlock;
270 256
271 *wpan_dev = __cfg802154_wpan_dev_from_attrs(sock_net(skb->sk), 257 *wpan_dev = __cfg802154_wpan_dev_from_attrs(sock_net(skb->sk),
272 nl802154_fam.attrbuf); 258 genl_family_attrbuf(&nl802154_fam));
273 if (IS_ERR(*wpan_dev)) { 259 if (IS_ERR(*wpan_dev)) {
274 err = PTR_ERR(*wpan_dev); 260 err = PTR_ERR(*wpan_dev);
275 goto out_unlock; 261 goto out_unlock;
@@ -575,7 +561,7 @@ static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb,
575 struct netlink_callback *cb, 561 struct netlink_callback *cb,
576 struct nl802154_dump_wpan_phy_state *state) 562 struct nl802154_dump_wpan_phy_state *state)
577{ 563{
578 struct nlattr **tb = nl802154_fam.attrbuf; 564 struct nlattr **tb = genl_family_attrbuf(&nl802154_fam);
579 int ret = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl802154_fam.hdrsize, 565 int ret = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl802154_fam.hdrsize,
580 tb, nl802154_fam.maxattr, nl802154_policy); 566 tb, nl802154_fam.maxattr, nl802154_policy);
581 567
@@ -2476,11 +2462,25 @@ static const struct genl_ops nl802154_ops[] = {
2476#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ 2462#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
2477}; 2463};
2478 2464
2465static struct genl_family nl802154_fam __ro_after_init = {
2466 .name = NL802154_GENL_NAME, /* have users key off the name instead */
2467 .hdrsize = 0, /* no private header */
2468 .version = 1, /* no particular meaning now */
2469 .maxattr = NL802154_ATTR_MAX,
2470 .netnsok = true,
2471 .pre_doit = nl802154_pre_doit,
2472 .post_doit = nl802154_post_doit,
2473 .module = THIS_MODULE,
2474 .ops = nl802154_ops,
2475 .n_ops = ARRAY_SIZE(nl802154_ops),
2476 .mcgrps = nl802154_mcgrps,
2477 .n_mcgrps = ARRAY_SIZE(nl802154_mcgrps),
2478};
2479
2479/* initialisation/exit functions */ 2480/* initialisation/exit functions */
2480int nl802154_init(void) 2481int __init nl802154_init(void)
2481{ 2482{
2482 return genl_register_family_with_ops_groups(&nl802154_fam, nl802154_ops, 2483 return genl_register_family(&nl802154_fam);
2483 nl802154_mcgrps);
2484} 2484}
2485 2485
2486void nl802154_exit(void) 2486void nl802154_exit(void)
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index e0bd013a1e5e..eedba7670b51 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -279,7 +279,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
279 pr_debug("name = %s, mtu = %u\n", dev->name, mtu); 279 pr_debug("name = %s, mtu = %u\n", dev->name, mtu);
280 280
281 if (size > mtu) { 281 if (size > mtu) {
282 pr_debug("size = %Zu, mtu = %u\n", size, mtu); 282 pr_debug("size = %zu, mtu = %u\n", size, mtu);
283 err = -EMSGSIZE; 283 err = -EMSGSIZE;
284 goto out_dev; 284 goto out_dev;
285 } 285 }
@@ -645,7 +645,7 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
645 pr_debug("name = %s, mtu = %u\n", dev->name, mtu); 645 pr_debug("name = %s, mtu = %u\n", dev->name, mtu);
646 646
647 if (size > mtu) { 647 if (size > mtu) {
648 pr_debug("size = %Zu, mtu = %u\n", size, mtu); 648 pr_debug("size = %zu, mtu = %u\n", size, mtu);
649 err = -EMSGSIZE; 649 err = -EMSGSIZE;
650 goto out_dev; 650 goto out_dev;
651 } 651 }
diff --git a/net/ife/Kconfig b/net/ife/Kconfig
new file mode 100644
index 000000000000..31e48b652c7c
--- /dev/null
+++ b/net/ife/Kconfig
@@ -0,0 +1,16 @@
1#
2# IFE subsystem configuration
3#
4
5menuconfig NET_IFE
6 depends on NET
7 tristate "Inter-FE based on IETF ForCES InterFE LFB"
8 default n
9 help
10 Say Y here to add support of IFE encapsulation protocol
11 For details refer to netdev01 paper:
12 "Distributing Linux Traffic Control Classifier-Action Subsystem"
13 Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
14
15 To compile this support as a module, choose M here: the module will
16 be called ife.
diff --git a/net/ife/Makefile b/net/ife/Makefile
new file mode 100644
index 000000000000..2a90d97746cc
--- /dev/null
+++ b/net/ife/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the IFE encapsulation protocol
3#
4
5obj-$(CONFIG_NET_IFE) += ife.o
diff --git a/net/ife/ife.c b/net/ife/ife.c
new file mode 100644
index 000000000000..f360341c72eb
--- /dev/null
+++ b/net/ife/ife.c
@@ -0,0 +1,142 @@
1/*
2 * net/ife/ife.c - Inter-FE protocol based on ForCES WG InterFE LFB
3 * Copyright (c) 2015 Jamal Hadi Salim <jhs@mojatatu.com>
4 * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
5 *
6 * Refer to: draft-ietf-forces-interfelfb-03 and netdev01 paper:
7 * "Distributing Linux Traffic Control Classifier-Action Subsystem"
8 * Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation.
13 */
14
15#include <linux/types.h>
16#include <linux/kernel.h>
17#include <linux/string.h>
18#include <linux/errno.h>
19#include <linux/skbuff.h>
20#include <linux/rtnetlink.h>
21#include <linux/module.h>
22#include <linux/init.h>
23#include <net/net_namespace.h>
24#include <net/netlink.h>
25#include <net/pkt_sched.h>
26#include <linux/etherdevice.h>
27#include <net/ife.h>
28
29struct ifeheadr {
30 __be16 metalen;
31 u8 tlv_data[];
32};
33
34void *ife_encode(struct sk_buff *skb, u16 metalen)
35{
36 /* OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
37 * where ORIGDATA = original ethernet header ...
38 */
39 int hdrm = metalen + IFE_METAHDRLEN;
40 int total_push = hdrm + skb->dev->hard_header_len;
41 struct ifeheadr *ifehdr;
42 struct ethhdr *iethh; /* inner ether header */
43 int skboff = 0;
44 int err;
45
46 err = skb_cow_head(skb, total_push);
47 if (unlikely(err))
48 return NULL;
49
50 iethh = (struct ethhdr *) skb->data;
51
52 __skb_push(skb, total_push);
53 memcpy(skb->data, iethh, skb->dev->hard_header_len);
54 skb_reset_mac_header(skb);
55 skboff += skb->dev->hard_header_len;
56
57 /* total metadata length */
58 ifehdr = (struct ifeheadr *) (skb->data + skboff);
59 metalen += IFE_METAHDRLEN;
60 ifehdr->metalen = htons(metalen);
61
62 return ifehdr->tlv_data;
63}
64EXPORT_SYMBOL_GPL(ife_encode);
65
66void *ife_decode(struct sk_buff *skb, u16 *metalen)
67{
68 struct ifeheadr *ifehdr;
69 int total_pull;
70 u16 ifehdrln;
71
72 ifehdr = (struct ifeheadr *) (skb->data + skb->dev->hard_header_len);
73 ifehdrln = ntohs(ifehdr->metalen);
74 total_pull = skb->dev->hard_header_len + ifehdrln;
75
76 if (unlikely(ifehdrln < 2))
77 return NULL;
78
79 if (unlikely(!pskb_may_pull(skb, total_pull)))
80 return NULL;
81
82 skb_set_mac_header(skb, total_pull);
83 __skb_pull(skb, total_pull);
84 *metalen = ifehdrln - IFE_METAHDRLEN;
85
86 return &ifehdr->tlv_data;
87}
88EXPORT_SYMBOL_GPL(ife_decode);
89
90struct meta_tlvhdr {
91 __be16 type;
92 __be16 len;
93};
94
95/* Caller takes care of presenting data in network order
96 */
97void *ife_tlv_meta_decode(void *skbdata, u16 *attrtype, u16 *dlen, u16 *totlen)
98{
99 struct meta_tlvhdr *tlv = (struct meta_tlvhdr *) skbdata;
100
101 *dlen = ntohs(tlv->len) - NLA_HDRLEN;
102 *attrtype = ntohs(tlv->type);
103
104 if (totlen)
105 *totlen = nla_total_size(*dlen);
106
107 return skbdata + sizeof(struct meta_tlvhdr);
108}
109EXPORT_SYMBOL_GPL(ife_tlv_meta_decode);
110
111void *ife_tlv_meta_next(void *skbdata)
112{
113 struct meta_tlvhdr *tlv = (struct meta_tlvhdr *) skbdata;
114 u16 tlvlen = ntohs(tlv->len);
115
116 tlvlen = NLA_ALIGN(tlvlen);
117
118 return skbdata + tlvlen;
119}
120EXPORT_SYMBOL_GPL(ife_tlv_meta_next);
121
122/* Caller takes care of presenting data in network order
123 */
124int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
125{
126 __be32 *tlv = (__be32 *) (skbdata);
127 u16 totlen = nla_total_size(dlen); /*alignment + hdr */
128 char *dptr = (char *) tlv + NLA_HDRLEN;
129 u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN);
130
131 *tlv = htonl(htlv);
132 memset(dptr, 0, totlen - NLA_HDRLEN);
133 memcpy(dptr, dval, dlen);
134
135 return totlen;
136}
137EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
138
139MODULE_AUTHOR("Jamal Hadi Salim <jhs@mojatatu.com>");
140MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>");
141MODULE_DESCRIPTION("Inter-FE LFB action");
142MODULE_LICENSE("GPL");
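
A hedged sketch of how a caller (the tc act_ife user of this library, for instance) might chain the helpers above to push a single metadatum. It assumes <net/ife.h> and the netlink size helpers are available; per the comments in ife.c the value must already be in network byte order, and the error handling is illustrative only.

    static int example_append_one_meta(struct sk_buff *skb, u16 metaid, u16 value)
    {
            u16 metalen = nla_total_size(sizeof(__be16));   /* one aligned TLV */
            __be16 nval = htons(value);
            void *tlv;

            /* pushes the outer L2 header copy plus the 2-byte IFE meta header */
            tlv = ife_encode(skb, metalen);
            if (!tlv)
                    return -ENOMEM;

            ife_tlv_meta_encode(tlv, metaid, sizeof(nval), &nval);
            return 0;
    }
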
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index b54b3ca939db..91a2557942fa 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -187,6 +187,7 @@ config NET_IPGRE_DEMUX
187config NET_IP_TUNNEL 187config NET_IP_TUNNEL
188 tristate 188 tristate
189 select DST_CACHE 189 select DST_CACHE
190 select GRO_CELLS
190 default n 191 default n
191 192
192config NET_IPGRE 193config NET_IPGRE
@@ -360,6 +361,19 @@ config INET_ESP
360 361
361 If unsure, say Y. 362 If unsure, say Y.
362 363
364config INET_ESP_OFFLOAD
365 tristate "IP: ESP transformation offload"
366 depends on INET_ESP
367 select XFRM_OFFLOAD
368 default n
369 ---help---
370 Support for ESP transformation offload. This makes sense
371 only if this system really does IPsec and want to do it
372 with high throughput. A typical desktop system does not
373 need it, even if it does IPsec.
374
375 If unsure, say N.
376
363config INET_IPCOMP 377config INET_IPCOMP
364 tristate "IP: IPComp transformation" 378 tristate "IP: IPComp transformation"
365 select INET_XFRM_TUNNEL 379 select INET_XFRM_TUNNEL
@@ -430,6 +444,14 @@ config INET_UDP_DIAG
430 Support for UDP socket monitoring interface used by the ss tool. 444 Support for UDP socket monitoring interface used by the ss tool.
431 If unsure, say Y. 445 If unsure, say Y.
432 446
447config INET_RAW_DIAG
448 tristate "RAW: socket monitoring interface"
449 depends on INET_DIAG && (IPV6 || IPV6=n)
450 default n
451 ---help---
452 Support for RAW socket monitoring interface used by the ss tool.
453 If unsure, say Y.
454
433config INET_DIAG_DESTROY 455config INET_DIAG_DESTROY
434 bool "INET: allow privileged process to administratively close sockets" 456 bool "INET: allow privileged process to administratively close sockets"
435 depends on INET_DIAG 457 depends on INET_DIAG
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index bc6a6c8b9bcd..c6d4238ff94a 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_NET_IPVTI) += ip_vti.o
29obj-$(CONFIG_SYN_COOKIES) += syncookies.o 29obj-$(CONFIG_SYN_COOKIES) += syncookies.o
30obj-$(CONFIG_INET_AH) += ah4.o 30obj-$(CONFIG_INET_AH) += ah4.o
31obj-$(CONFIG_INET_ESP) += esp4.o 31obj-$(CONFIG_INET_ESP) += esp4.o
32obj-$(CONFIG_INET_ESP_OFFLOAD) += esp4_offload.o
32obj-$(CONFIG_INET_IPCOMP) += ipcomp.o 33obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
33obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o 34obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
34obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o 35obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o
@@ -40,6 +41,7 @@ obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
40obj-$(CONFIG_INET_DIAG) += inet_diag.o 41obj-$(CONFIG_INET_DIAG) += inet_diag.o
41obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o 42obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
42obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o 43obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
44obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
43obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o 45obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
44obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o 46obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
45obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o 47obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 215143246e4b..6b1fc6e4278e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -90,7 +90,7 @@
90#include <linux/random.h> 90#include <linux/random.h>
91#include <linux/slab.h> 91#include <linux/slab.h>
92 92
93#include <asm/uaccess.h> 93#include <linux/uaccess.h>
94 94
95#include <linux/inet.h> 95#include <linux/inet.h>
96#include <linux/igmp.h> 96#include <linux/igmp.h>
@@ -374,8 +374,18 @@ lookup_protocol:
374 374
375 if (sk->sk_prot->init) { 375 if (sk->sk_prot->init) {
376 err = sk->sk_prot->init(sk); 376 err = sk->sk_prot->init(sk);
377 if (err) 377 if (err) {
378 sk_common_release(sk); 378 sk_common_release(sk);
379 goto out;
380 }
381 }
382
383 if (!kern) {
384 err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
385 if (err) {
386 sk_common_release(sk);
387 goto out;
388 }
379 } 389 }
380out: 390out:
381 return err; 391 return err;
@@ -469,7 +479,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
469 479
470 snum = ntohs(addr->sin_port); 480 snum = ntohs(addr->sin_port);
471 err = -EACCES; 481 err = -EACCES;
472 if (snum && snum < PROT_SOCK && 482 if (snum && snum < inet_prot_sock(net) &&
473 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) 483 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
474 goto out; 484 goto out;
475 485
@@ -560,19 +570,30 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
560 * TCP 'magic' in here. 570 * TCP 'magic' in here.
561 */ 571 */
562int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, 572int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
563 int addr_len, int flags) 573 int addr_len, int flags, int is_sendmsg)
564{ 574{
565 struct sock *sk = sock->sk; 575 struct sock *sk = sock->sk;
566 int err; 576 int err;
567 long timeo; 577 long timeo;
568 578
569 if (addr_len < sizeof(uaddr->sa_family)) 579 /*
570 return -EINVAL; 580 * uaddr can be NULL and addr_len can be 0 if:
581 * sk is a TCP fastopen active socket and
582 * TCP_FASTOPEN_CONNECT sockopt is set and
583 * we already have a valid cookie for this socket.
584 * In this case, user can call write() after connect().
585 * write() will invoke tcp_sendmsg_fastopen() which calls
586 * __inet_stream_connect().
587 */
588 if (uaddr) {
589 if (addr_len < sizeof(uaddr->sa_family))
590 return -EINVAL;
571 591
572 if (uaddr->sa_family == AF_UNSPEC) { 592 if (uaddr->sa_family == AF_UNSPEC) {
573 err = sk->sk_prot->disconnect(sk, flags); 593 err = sk->sk_prot->disconnect(sk, flags);
574 sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; 594 sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
575 goto out; 595 goto out;
596 }
576 } 597 }
577 598
578 switch (sock->state) { 599 switch (sock->state) {
@@ -583,7 +604,10 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
583 err = -EISCONN; 604 err = -EISCONN;
584 goto out; 605 goto out;
585 case SS_CONNECTING: 606 case SS_CONNECTING:
586 err = -EALREADY; 607 if (inet_sk(sk)->defer_connect)
608 err = is_sendmsg ? -EINPROGRESS : -EISCONN;
609 else
610 err = -EALREADY;
587 /* Fall out of switch with err, set for this state */ 611 /* Fall out of switch with err, set for this state */
588 break; 612 break;
589 case SS_UNCONNECTED: 613 case SS_UNCONNECTED:
@@ -597,6 +621,9 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
597 621
598 sock->state = SS_CONNECTING; 622 sock->state = SS_CONNECTING;
599 623
624 if (!err && inet_sk(sk)->defer_connect)
625 goto out;
626
600 /* Just entered SS_CONNECTING state; the only 627 /* Just entered SS_CONNECTING state; the only
601 * difference is that return value in non-blocking 628 * difference is that return value in non-blocking
602 * case is EINPROGRESS, rather than EALREADY. 629 * case is EINPROGRESS, rather than EALREADY.
@@ -652,7 +679,7 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
652 int err; 679 int err;
653 680
654 lock_sock(sock->sk); 681 lock_sock(sock->sk);
655 err = __inet_stream_connect(sock, uaddr, addr_len, flags); 682 err = __inet_stream_connect(sock, uaddr, addr_len, flags, 0);
656 release_sock(sock->sk); 683 release_sock(sock->sk);
657 return err; 684 return err;
658} 685}
@@ -662,11 +689,12 @@ EXPORT_SYMBOL(inet_stream_connect);
662 * Accept a pending connection. The TCP layer now gives BSD semantics. 689 * Accept a pending connection. The TCP layer now gives BSD semantics.
663 */ 690 */
664 691
665int inet_accept(struct socket *sock, struct socket *newsock, int flags) 692int inet_accept(struct socket *sock, struct socket *newsock, int flags,
693 bool kern)
666{ 694{
667 struct sock *sk1 = sock->sk; 695 struct sock *sk1 = sock->sk;
668 int err = -EINVAL; 696 int err = -EINVAL;
669 struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err); 697 struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err, kern);
670 698
671 if (!sk2) 699 if (!sk2)
672 goto do_err; 700 goto do_err;
@@ -1396,7 +1424,7 @@ out_unlock:
1396 rcu_read_unlock(); 1424 rcu_read_unlock();
1397 1425
1398out: 1426out:
1399 NAPI_GRO_CB(skb)->flush |= flush; 1427 skb_gro_flush_final(skb, pp, flush);
1400 1428
1401 return pp; 1429 return pp;
1402} 1430}
@@ -1460,8 +1488,10 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
1460 int proto = iph->protocol; 1488 int proto = iph->protocol;
1461 int err = -ENOSYS; 1489 int err = -ENOSYS;
1462 1490
1463 if (skb->encapsulation) 1491 if (skb->encapsulation) {
1492 skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IP));
1464 skb_set_inner_network_header(skb, nhoff); 1493 skb_set_inner_network_header(skb, nhoff);
1494 }
1465 1495
1466 csum_replace2(&iph->check, iph->tot_len, newlen); 1496 csum_replace2(&iph->check, iph->tot_len, newlen);
1467 iph->tot_len = newlen; 1497 iph->tot_len = newlen;
@@ -1690,6 +1720,9 @@ static __net_init int inet_init_net(struct net *net)
1690 net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; 1720 net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
1691 net->ipv4.sysctl_ip_dynaddr = 0; 1721 net->ipv4.sysctl_ip_dynaddr = 0;
1692 net->ipv4.sysctl_ip_early_demux = 1; 1722 net->ipv4.sysctl_ip_early_demux = 1;
1723#ifdef CONFIG_SYSCTL
1724 net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
1725#endif
1693 1726
1694 return 0; 1727 return 0;
1695} 1728}
@@ -1821,8 +1854,6 @@ static int __init inet_init(void)
1821 1854
1822 ip_init(); 1855 ip_init();
1823 1856
1824 tcp_v4_init();
1825
1826 /* Setup TCP slab cache for open requests. */ 1857 /* Setup TCP slab cache for open requests. */
1827 tcp_init(); 1858 tcp_init();
1828 1859
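
The defer_connect handling added above backs the client side of TCP Fast Open: with the TCP_FASTOPEN_CONNECT socket option from the same series, connect() completes without emitting a SYN, and the first write() carries the data in the SYN via tcp_sendmsg_fastopen(). A userspace sketch, with the option value assumed from the uapi header:

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>
    #include <unistd.h>

    #ifndef TCP_FASTOPEN_CONNECT
    #define TCP_FASTOPEN_CONNECT 30         /* assumed uapi value */
    #endif

    static int fastopen_send(int fd, const struct sockaddr_in *dst,
                             const char *buf, size_t len)
    {
            int on = 1;

            if (setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN_CONNECT,
                           &on, sizeof(on)) < 0)
                    return -1;
            if (connect(fd, (const struct sockaddr *)dst, sizeof(*dst)) < 0)
                    return -1;              /* defer_connect: no SYN sent yet */
            return (int)write(fd, buf, len); /* SYN + data go out here */
    }
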
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index f2a71025a770..22377c8ff14b 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -270,6 +270,9 @@ static void ah_input_done(struct crypto_async_request *base, int err)
270 int ihl = ip_hdrlen(skb); 270 int ihl = ip_hdrlen(skb);
271 int ah_hlen = (ah->hdrlen + 2) << 2; 271 int ah_hlen = (ah->hdrlen + 2) << 2;
272 272
273 if (err)
274 goto out;
275
273 work_iph = AH_SKB_CB(skb)->tmp; 276 work_iph = AH_SKB_CB(skb)->tmp;
274 auth_data = ah_tmp_auth(work_iph, ihl); 277 auth_data = ah_tmp_auth(work_iph, ihl);
275 icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len); 278 icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 89a8cac4726a..51b27ae09fbd 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1263,7 +1263,7 @@ void __init arp_init(void)
1263/* 1263/*
1264 * ax25 -> ASCII conversion 1264 * ax25 -> ASCII conversion
1265 */ 1265 */
1266static char *ax2asc2(ax25_address *a, char *buf) 1266static void ax2asc2(ax25_address *a, char *buf)
1267{ 1267{
1268 char c, *s; 1268 char c, *s;
1269 int n; 1269 int n;
@@ -1285,10 +1285,10 @@ static char *ax2asc2(ax25_address *a, char *buf)
1285 *s++ = n + '0'; 1285 *s++ = n + '0';
1286 *s++ = '\0'; 1286 *s++ = '\0';
1287 1287
1288 if (*buf == '\0' || *buf == '-') 1288 if (*buf == '\0' || *buf == '-') {
1289 return "*"; 1289 buf[0] = '*';
1290 1290 buf[1] = '\0';
1291 return buf; 1291 }
1292} 1292}
1293#endif /* CONFIG_AX25 */ 1293#endif /* CONFIG_AX25 */
1294 1294
@@ -1322,7 +1322,7 @@ static void arp_format_neigh_entry(struct seq_file *seq,
1322 } 1322 }
1323#endif 1323#endif
1324 sprintf(tbuf, "%pI4", n->primary_key); 1324 sprintf(tbuf, "%pI4", n->primary_key);
1325 seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", 1325 seq_printf(seq, "%-16s 0x%-10x0x%-10x%-17s * %s\n",
1326 tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name); 1326 tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
1327 read_unlock(&n->lock); 1327 read_unlock(&n->lock);
1328} 1328}
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 72d6f056d863..ae206163c273 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1587,6 +1587,10 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
1587 goto validate_return_locked; 1587 goto validate_return_locked;
1588 } 1588 }
1589 1589
1590 if (opt_iter + 1 == opt_len) {
1591 err_offset = opt_iter;
1592 goto validate_return_locked;
1593 }
1590 tag_len = tag[1]; 1594 tag_len = tag[1];
1591 if (tag_len > (opt_len - opt_iter)) { 1595 if (tag_len > (opt_len - opt_iter)) {
1592 err_offset = opt_iter + 1; 1596 err_offset = opt_iter + 1;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 062a67ca9a21..cebedd545e5e 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -26,12 +26,13 @@
26 */ 26 */
27 27
28 28
29#include <asm/uaccess.h> 29#include <linux/uaccess.h>
30#include <linux/bitops.h> 30#include <linux/bitops.h>
31#include <linux/capability.h> 31#include <linux/capability.h>
32#include <linux/module.h> 32#include <linux/module.h>
33#include <linux/types.h> 33#include <linux/types.h>
34#include <linux/kernel.h> 34#include <linux/kernel.h>
35#include <linux/sched/signal.h>
35#include <linux/string.h> 36#include <linux/string.h>
36#include <linux/mm.h> 37#include <linux/mm.h>
37#include <linux/socket.h> 38#include <linux/socket.h>
@@ -65,8 +66,6 @@
65#include <net/net_namespace.h> 66#include <net/net_namespace.h>
66#include <net/addrconf.h> 67#include <net/addrconf.h>
67 68
68#include "fib_lookup.h"
69
70static struct ipv4_devconf ipv4_devconf = { 69static struct ipv4_devconf ipv4_devconf = {
71 .data = { 70 .data = {
72 [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, 71 [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 20fb25e3027b..b1e24446e297 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -18,6 +18,8 @@
18#include <net/protocol.h> 18#include <net/protocol.h>
19#include <net/udp.h> 19#include <net/udp.h>
20 20
21#include <linux/highmem.h>
22
21struct esp_skb_cb { 23struct esp_skb_cb {
22 struct xfrm_skb_cb xfrm; 24 struct xfrm_skb_cb xfrm;
23 void *tmp; 25 void *tmp;
@@ -92,11 +94,40 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
92 __alignof__(struct scatterlist)); 94 __alignof__(struct scatterlist));
93} 95}
94 96
97static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
98{
99 struct esp_output_extra *extra = esp_tmp_extra(tmp);
100 struct crypto_aead *aead = x->data;
101 int extralen = 0;
102 u8 *iv;
103 struct aead_request *req;
104 struct scatterlist *sg;
105
106 if (x->props.flags & XFRM_STATE_ESN)
107 extralen += sizeof(*extra);
108
109 extra = esp_tmp_extra(tmp);
110 iv = esp_tmp_iv(aead, tmp, extralen);
111 req = esp_tmp_req(aead, iv);
112
113 /* Unref skb_frag_pages in the src scatterlist if necessary.
114 * Skip the first sg which comes from skb->data.
115 */
116 if (req->src != req->dst)
117 for (sg = sg_next(req->src); sg; sg = sg_next(sg))
118 put_page(sg_page(sg));
119}
120
95static void esp_output_done(struct crypto_async_request *base, int err) 121static void esp_output_done(struct crypto_async_request *base, int err)
96{ 122{
97 struct sk_buff *skb = base->data; 123 struct sk_buff *skb = base->data;
124 void *tmp;
125 struct dst_entry *dst = skb_dst(skb);
126 struct xfrm_state *x = dst->xfrm;
98 127
99 kfree(ESP_SKB_CB(skb)->tmp); 128 tmp = ESP_SKB_CB(skb)->tmp;
129 esp_ssg_unref(x, tmp);
130 kfree(tmp);
100 xfrm_output_resume(skb, err); 131 xfrm_output_resume(skb, err);
101} 132}
102 133
@@ -120,6 +151,29 @@ static void esp_output_restore_header(struct sk_buff *skb)
120 sizeof(__be32)); 151 sizeof(__be32));
121} 152}
122 153
154static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb,
155 struct ip_esp_hdr *esph,
156 struct esp_output_extra *extra)
157{
158 struct xfrm_state *x = skb_dst(skb)->xfrm;
159
160 /* For ESN we move the header forward by 4 bytes to
161 * accomodate the high bits. We will move it back after
162 * encryption.
163 */
164 if ((x->props.flags & XFRM_STATE_ESN)) {
165 extra->esphoff = (unsigned char *)esph -
166 skb_transport_header(skb);
167 esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
168 extra->seqhi = esph->spi;
169 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
170 }
171
172 esph->spi = x->id.spi;
173
174 return esph;
175}
176
123static void esp_output_done_esn(struct crypto_async_request *base, int err) 177static void esp_output_done_esn(struct crypto_async_request *base, int err)
124{ 178{
125 struct sk_buff *skb = base->data; 179 struct sk_buff *skb = base->data;
@@ -128,18 +182,36 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
128 esp_output_done(base, err); 182 esp_output_done(base, err);
129} 183}
130 184
185static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
186{
187 /* Fill padding... */
188 if (tfclen) {
189 memset(tail, 0, tfclen);
190 tail += tfclen;
191 }
192 do {
193 int i;
194 for (i = 0; i < plen - 2; i++)
195 tail[i] = i + 1;
196 } while (0);
197 tail[plen - 2] = plen - 2;
198 tail[plen - 1] = proto;
199}
200
131static int esp_output(struct xfrm_state *x, struct sk_buff *skb) 201static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
132{ 202{
133 int err;
134 struct esp_output_extra *extra; 203 struct esp_output_extra *extra;
204 int err = -ENOMEM;
135 struct ip_esp_hdr *esph; 205 struct ip_esp_hdr *esph;
136 struct crypto_aead *aead; 206 struct crypto_aead *aead;
137 struct aead_request *req; 207 struct aead_request *req;
138 struct scatterlist *sg; 208 struct scatterlist *sg, *dsg;
139 struct sk_buff *trailer; 209 struct sk_buff *trailer;
210 struct page *page;
140 void *tmp; 211 void *tmp;
141 u8 *iv; 212 u8 *iv;
142 u8 *tail; 213 u8 *tail;
214 u8 *vaddr;
143 int blksize; 215 int blksize;
144 int clen; 216 int clen;
145 int alen; 217 int alen;
@@ -149,7 +221,9 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
149 int nfrags; 221 int nfrags;
150 int assoclen; 222 int assoclen;
151 int extralen; 223 int extralen;
224 int tailen;
152 __be64 seqno; 225 __be64 seqno;
226 __u8 proto = *skb_mac_header(skb);
153 227
154 /* skb is pure payload to encrypt */ 228 /* skb is pure payload to encrypt */
155 229
@@ -169,12 +243,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
169 blksize = ALIGN(crypto_aead_blocksize(aead), 4); 243 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
170 clen = ALIGN(skb->len + 2 + tfclen, blksize); 244 clen = ALIGN(skb->len + 2 + tfclen, blksize);
171 plen = clen - skb->len - tfclen; 245 plen = clen - skb->len - tfclen;
172 246 tailen = tfclen + plen + alen;
173 err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
174 if (err < 0)
175 goto error;
176 nfrags = err;
177
178 assoclen = sizeof(*esph); 247 assoclen = sizeof(*esph);
179 extralen = 0; 248 extralen = 0;
180 249
@@ -183,35 +252,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
183 assoclen += sizeof(__be32); 252 assoclen += sizeof(__be32);
184 } 253 }
185 254
186 tmp = esp_alloc_tmp(aead, nfrags, extralen);
187 if (!tmp) {
188 err = -ENOMEM;
189 goto error;
190 }
191
192 extra = esp_tmp_extra(tmp);
193 iv = esp_tmp_iv(aead, tmp, extralen);
194 req = esp_tmp_req(aead, iv);
195 sg = esp_req_sg(aead, req);
196
197 /* Fill padding... */
198 tail = skb_tail_pointer(trailer);
199 if (tfclen) {
200 memset(tail, 0, tfclen);
201 tail += tfclen;
202 }
203 do {
204 int i;
205 for (i = 0; i < plen - 2; i++)
206 tail[i] = i + 1;
207 } while (0);
208 tail[plen - 2] = plen - 2;
209 tail[plen - 1] = *skb_mac_header(skb);
210 pskb_put(skb, trailer, clen - skb->len + alen);
211
212 skb_push(skb, -skb_network_offset(skb));
213 esph = ip_esp_hdr(skb);
214 *skb_mac_header(skb) = IPPROTO_ESP; 255 *skb_mac_header(skb) = IPPROTO_ESP;
256 esph = ip_esp_hdr(skb);
215 257
216 /* this is non-NULL only with UDP Encapsulation */ 258 /* this is non-NULL only with UDP Encapsulation */
217 if (x->encap) { 259 if (x->encap) {
@@ -230,7 +272,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
230 uh = (struct udphdr *)esph; 272 uh = (struct udphdr *)esph;
231 uh->source = sport; 273 uh->source = sport;
232 uh->dest = dport; 274 uh->dest = dport;
233 uh->len = htons(skb->len - skb_transport_offset(skb)); 275 uh->len = htons(skb->len + tailen
276 - skb_transport_offset(skb));
234 uh->check = 0; 277 uh->check = 0;
235 278
236 switch (encap_type) { 279 switch (encap_type) {
@@ -248,31 +291,148 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
248 *skb_mac_header(skb) = IPPROTO_UDP; 291 *skb_mac_header(skb) = IPPROTO_UDP;
249 } 292 }
250 293
251 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); 294 if (!skb_cloned(skb)) {
295 if (tailen <= skb_availroom(skb)) {
296 nfrags = 1;
297 trailer = skb;
298 tail = skb_tail_pointer(trailer);
252 299
253 aead_request_set_callback(req, 0, esp_output_done, skb); 300 goto skip_cow;
301 } else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS)
302 && !skb_has_frag_list(skb)) {
303 int allocsize;
304 struct sock *sk = skb->sk;
305 struct page_frag *pfrag = &x->xfrag;
254 306
255 /* For ESN we move the header forward by 4 bytes to 307 allocsize = ALIGN(tailen, L1_CACHE_BYTES);
256 * accomodate the high bits. We will move it back after 308
257 * encryption. 309 spin_lock_bh(&x->lock);
258 */ 310
259 if ((x->props.flags & XFRM_STATE_ESN)) { 311 if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
260 extra->esphoff = (unsigned char *)esph - 312 spin_unlock_bh(&x->lock);
261 skb_transport_header(skb); 313 goto cow;
262 esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4); 314 }
263 extra->seqhi = esph->spi; 315
264 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); 316 page = pfrag->page;
265 aead_request_set_callback(req, 0, esp_output_done_esn, skb); 317 get_page(page);
318
319 vaddr = kmap_atomic(page);
320
321 tail = vaddr + pfrag->offset;
322
323 esp_output_fill_trailer(tail, tfclen, plen, proto);
324
325 kunmap_atomic(vaddr);
326
327 nfrags = skb_shinfo(skb)->nr_frags;
328
329 __skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
330 tailen);
331 skb_shinfo(skb)->nr_frags = ++nfrags;
332
333 pfrag->offset = pfrag->offset + allocsize;
334 nfrags++;
335
336 skb->len += tailen;
337 skb->data_len += tailen;
338 skb->truesize += tailen;
339 if (sk)
340 atomic_add(tailen, &sk->sk_wmem_alloc);
341
342 skb_push(skb, -skb_network_offset(skb));
343
344 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
345 esph->spi = x->id.spi;
346
347 tmp = esp_alloc_tmp(aead, nfrags + 2, extralen);
348 if (!tmp) {
349 spin_unlock_bh(&x->lock);
350 err = -ENOMEM;
351 goto error;
352 }
353
354 extra = esp_tmp_extra(tmp);
355 iv = esp_tmp_iv(aead, tmp, extralen);
356 req = esp_tmp_req(aead, iv);
357 sg = esp_req_sg(aead, req);
358 dsg = &sg[nfrags];
359
360 esph = esp_output_set_extra(skb, esph, extra);
361
362 sg_init_table(sg, nfrags);
363 skb_to_sgvec(skb, sg,
364 (unsigned char *)esph - skb->data,
365 assoclen + ivlen + clen + alen);
366
367 allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
368
369 if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
370 spin_unlock_bh(&x->lock);
371 err = -ENOMEM;
372 goto error;
373 }
374
375 skb_shinfo(skb)->nr_frags = 1;
376
377 page = pfrag->page;
378 get_page(page);
379 /* replace page frags in skb with new page */
380 __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
381 pfrag->offset = pfrag->offset + allocsize;
382
383 sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
384 skb_to_sgvec(skb, dsg,
385 (unsigned char *)esph - skb->data,
386 assoclen + ivlen + clen + alen);
387
388 spin_unlock_bh(&x->lock);
389
390 goto skip_cow2;
391 }
266 } 392 }
267 393
394cow:
395 err = skb_cow_data(skb, tailen, &trailer);
396 if (err < 0)
397 goto error;
398 nfrags = err;
399 tail = skb_tail_pointer(trailer);
400 esph = ip_esp_hdr(skb);
401
402skip_cow:
403 esp_output_fill_trailer(tail, tfclen, plen, proto);
404
405 pskb_put(skb, trailer, clen - skb->len + alen);
406 skb_push(skb, -skb_network_offset(skb));
407 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
268 esph->spi = x->id.spi; 408 esph->spi = x->id.spi;
269 409
410 tmp = esp_alloc_tmp(aead, nfrags, extralen);
411 if (!tmp) {
412 err = -ENOMEM;
413 goto error;
414 }
415
416 extra = esp_tmp_extra(tmp);
417 iv = esp_tmp_iv(aead, tmp, extralen);
418 req = esp_tmp_req(aead, iv);
419 sg = esp_req_sg(aead, req);
420 dsg = sg;
421
422 esph = esp_output_set_extra(skb, esph, extra);
423
270 sg_init_table(sg, nfrags); 424 sg_init_table(sg, nfrags);
271 skb_to_sgvec(skb, sg, 425 skb_to_sgvec(skb, sg,
272 (unsigned char *)esph - skb->data, 426 (unsigned char *)esph - skb->data,
273 assoclen + ivlen + clen + alen); 427 assoclen + ivlen + clen + alen);
274 428
275 aead_request_set_crypt(req, sg, sg, ivlen + clen, iv); 429skip_cow2:
430 if ((x->props.flags & XFRM_STATE_ESN))
431 aead_request_set_callback(req, 0, esp_output_done_esn, skb);
432 else
433 aead_request_set_callback(req, 0, esp_output_done, skb);
434
435 aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv);
276 aead_request_set_ad(req, assoclen); 436 aead_request_set_ad(req, assoclen);
277 437
278 seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low + 438 seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
@@ -298,6 +458,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
298 esp_output_restore_header(skb); 458 esp_output_restore_header(skb);
299 } 459 }
300 460
461 if (sg != dsg)
462 esp_ssg_unref(x, tmp);
301 kfree(tmp); 463 kfree(tmp);
302 464
303error: 465error:
@@ -401,6 +563,23 @@ static void esp_input_restore_header(struct sk_buff *skb)
401 __skb_pull(skb, 4); 563 __skb_pull(skb, 4);
402} 564}
403 565
566static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
567{
568 struct xfrm_state *x = xfrm_input_state(skb);
569 struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data;
570
571 /* For ESN we move the header forward by 4 bytes to
572 * accomodate the high bits. We will move it back after
573 * decryption.
574 */
575 if ((x->props.flags & XFRM_STATE_ESN)) {
576 esph = (void *)skb_push(skb, 4);
577 *seqhi = esph->spi;
578 esph->spi = esph->seq_no;
579 esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
580 }
581}
582
404static void esp_input_done_esn(struct crypto_async_request *base, int err) 583static void esp_input_done_esn(struct crypto_async_request *base, int err)
405{ 584{
406 struct sk_buff *skb = base->data; 585 struct sk_buff *skb = base->data;
@@ -437,12 +616,6 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
437 if (elen <= 0) 616 if (elen <= 0)
438 goto out; 617 goto out;
439 618
440 err = skb_cow_data(skb, 0, &trailer);
441 if (err < 0)
442 goto out;
443
444 nfrags = err;
445
446 assoclen = sizeof(*esph); 619 assoclen = sizeof(*esph);
447 seqhilen = 0; 620 seqhilen = 0;
448 621
@@ -451,6 +624,26 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
451 assoclen += seqhilen; 624 assoclen += seqhilen;
452 } 625 }
453 626
627 if (!skb_cloned(skb)) {
628 if (!skb_is_nonlinear(skb)) {
629 nfrags = 1;
630
631 goto skip_cow;
632 } else if (!skb_has_frag_list(skb)) {
633 nfrags = skb_shinfo(skb)->nr_frags;
634 nfrags++;
635
636 goto skip_cow;
637 }
638 }
639
640 err = skb_cow_data(skb, 0, &trailer);
641 if (err < 0)
642 goto out;
643
644 nfrags = err;
645
646skip_cow:
454 err = -ENOMEM; 647 err = -ENOMEM;
455 tmp = esp_alloc_tmp(aead, nfrags, seqhilen); 648 tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
456 if (!tmp) 649 if (!tmp)
@@ -462,26 +655,17 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
462 req = esp_tmp_req(aead, iv); 655 req = esp_tmp_req(aead, iv);
463 sg = esp_req_sg(aead, req); 656 sg = esp_req_sg(aead, req);
464 657
465 skb->ip_summed = CHECKSUM_NONE; 658 esp_input_set_header(skb, seqhi);
466 659
467 esph = (struct ip_esp_hdr *)skb->data; 660 sg_init_table(sg, nfrags);
661 skb_to_sgvec(skb, sg, 0, skb->len);
468 662
469 aead_request_set_callback(req, 0, esp_input_done, skb); 663 skb->ip_summed = CHECKSUM_NONE;
470 664
471 /* For ESN we move the header forward by 4 bytes to 665 if ((x->props.flags & XFRM_STATE_ESN))
472 * accomodate the high bits. We will move it back after
473 * decryption.
474 */
475 if ((x->props.flags & XFRM_STATE_ESN)) {
476 esph = (void *)skb_push(skb, 4);
477 *seqhi = esph->spi;
478 esph->spi = esph->seq_no;
479 esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
480 aead_request_set_callback(req, 0, esp_input_done_esn, skb); 666 aead_request_set_callback(req, 0, esp_input_done_esn, skb);
481 } 667 else
482 668 aead_request_set_callback(req, 0, esp_input_done, skb);
483 sg_init_table(sg, nfrags);
484 skb_to_sgvec(skb, sg, 0, skb->len);
485 669
486 aead_request_set_crypt(req, sg, sg, elen + ivlen, iv); 670 aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
487 aead_request_set_ad(req, assoclen); 671 aead_request_set_ad(req, assoclen);
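
The padding arithmetic that esp_output() keeps reusing above (clen, plen and the new tailen), collected in one standalone sketch; the input numbers are made up to show the relationship.

    #include <stdio.h>

    #define ALIGN_UP(x, a)  (((x) + (a) - 1) / (a) * (a))

    int main(void)
    {
            int skb_len = 1437;     /* example payload length */
            int tfclen  = 0;        /* no TFC padding requested */
            int alen    = 12;       /* ICV length, e.g. HMAC-SHA1-96 */
            int blksize = 16;       /* AES block size, already 4-aligned */

            /* ciphertext length: payload + TFC pad + 2 trailer bytes,
             * rounded up to the cipher block size */
            int clen   = ALIGN_UP(skb_len + 2 + tfclen, blksize);
            int plen   = clen - skb_len - tfclen;   /* pad incl. the 2 trailer bytes */
            int tailen = tfclen + plen + alen;      /* what the patch calls tailen */

            printf("clen=%d plen=%d tailen=%d\n", clen, plen, tailen);
            return 0;
    }
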
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
new file mode 100644
index 000000000000..1de442632406
--- /dev/null
+++ b/net/ipv4/esp4_offload.c
@@ -0,0 +1,106 @@
1/*
2 * IPV4 GSO/GRO offload support
3 * Linux INET implementation
4 *
5 * Copyright (C) 2016 secunet Security Networks AG
6 * Author: Steffen Klassert <steffen.klassert@secunet.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * ESP GRO support
13 */
14
15#include <linux/skbuff.h>
16#include <linux/init.h>
17#include <net/protocol.h>
18#include <crypto/aead.h>
19#include <crypto/authenc.h>
20#include <linux/err.h>
21#include <linux/module.h>
22#include <net/ip.h>
23#include <net/xfrm.h>
24#include <net/esp.h>
25#include <linux/scatterlist.h>
26#include <linux/kernel.h>
27#include <linux/slab.h>
28#include <linux/spinlock.h>
29#include <net/udp.h>
30
31static struct sk_buff **esp4_gro_receive(struct sk_buff **head,
32 struct sk_buff *skb)
33{
34 int offset = skb_gro_offset(skb);
35 struct xfrm_offload *xo;
36 struct xfrm_state *x;
37 __be32 seq;
38 __be32 spi;
39 int err;
40
41 skb_pull(skb, offset);
42
43 if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0)
44 goto out;
45
46 err = secpath_set(skb);
47 if (err)
48 goto out;
49
50 if (skb->sp->len == XFRM_MAX_DEPTH)
51 goto out;
52
53 x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
54 (xfrm_address_t *)&ip_hdr(skb)->daddr,
55 spi, IPPROTO_ESP, AF_INET);
56 if (!x)
57 goto out;
58
59 skb->sp->xvec[skb->sp->len++] = x;
60 skb->sp->olen++;
61
62 xo = xfrm_offload(skb);
63 if (!xo) {
64 xfrm_state_put(x);
65 goto out;
66 }
67 xo->flags |= XFRM_GRO;
68
69 XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
70 XFRM_SPI_SKB_CB(skb)->family = AF_INET;
71 XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
72 XFRM_SPI_SKB_CB(skb)->seq = seq;
73
74 /* We don't need to handle errors from xfrm_input, it does all
75 * the error handling and frees the resources on error. */
76 xfrm_input(skb, IPPROTO_ESP, spi, -2);
77
78 return ERR_PTR(-EINPROGRESS);
79out:
80 skb_push(skb, offset);
81 NAPI_GRO_CB(skb)->same_flow = 0;
82 NAPI_GRO_CB(skb)->flush = 1;
83
84 return NULL;
85}
86
87static const struct net_offload esp4_offload = {
88 .callbacks = {
89 .gro_receive = esp4_gro_receive,
90 },
91};
92
93static int __init esp4_offload_init(void)
94{
95 return inet_add_offload(&esp4_offload, IPPROTO_ESP);
96}
97
98static void __exit esp4_offload_exit(void)
99{
100 inet_del_offload(&esp4_offload, IPPROTO_ESP);
101}
102
103module_init(esp4_offload_init);
104module_exit(esp4_offload_exit);
105MODULE_LICENSE("GPL");
106MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 161fc0f0d752..8f2133ffc2ff 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -14,7 +14,7 @@
14 */ 14 */
15 15
16#include <linux/module.h> 16#include <linux/module.h>
17#include <asm/uaccess.h> 17#include <linux/uaccess.h>
18#include <linux/bitops.h> 18#include <linux/bitops.h>
19#include <linux/capability.h> 19#include <linux/capability.h>
20#include <linux/types.h> 20#include <linux/types.h>
@@ -46,6 +46,7 @@
46#include <net/rtnetlink.h> 46#include <net/rtnetlink.h>
47#include <net/xfrm.h> 47#include <net/xfrm.h>
48#include <net/l3mdev.h> 48#include <net/l3mdev.h>
49#include <net/lwtunnel.h>
49#include <trace/events/fib.h> 50#include <trace/events/fib.h>
50 51
51#ifndef CONFIG_IP_MULTIPLE_TABLES 52#ifndef CONFIG_IP_MULTIPLE_TABLES
@@ -85,7 +86,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
85 if (tb) 86 if (tb)
86 return tb; 87 return tb;
87 88
88 if (id == RT_TABLE_LOCAL) 89 if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules)
89 alias = fib_new_table(net, RT_TABLE_MAIN); 90 alias = fib_new_table(net, RT_TABLE_MAIN);
90 91
91 tb = fib_trie_table(id, alias); 92 tb = fib_trie_table(id, alias);
@@ -318,7 +319,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
318 int ret, no_addr; 319 int ret, no_addr;
319 struct fib_result res; 320 struct fib_result res;
320 struct flowi4 fl4; 321 struct flowi4 fl4;
321 struct net *net; 322 struct net *net = dev_net(dev);
322 bool dev_match; 323 bool dev_match;
323 324
324 fl4.flowi4_oif = 0; 325 fl4.flowi4_oif = 0;
@@ -331,6 +332,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
331 fl4.flowi4_scope = RT_SCOPE_UNIVERSE; 332 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
332 fl4.flowi4_tun_key.tun_id = 0; 333 fl4.flowi4_tun_key.tun_id = 0;
333 fl4.flowi4_flags = 0; 334 fl4.flowi4_flags = 0;
335 fl4.flowi4_uid = sock_net_uid(net, NULL);
334 336
335 no_addr = idev->ifa_list == NULL; 337 no_addr = idev->ifa_list == NULL;
336 338
@@ -338,13 +340,12 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
338 340
339 trace_fib_validate_source(dev, &fl4); 341 trace_fib_validate_source(dev, &fl4);
340 342
341 net = dev_net(dev);
342 if (fib_lookup(net, &fl4, &res, 0)) 343 if (fib_lookup(net, &fl4, &res, 0))
343 goto last_resort; 344 goto last_resort;
344 if (res.type != RTN_UNICAST && 345 if (res.type != RTN_UNICAST &&
345 (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) 346 (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
346 goto e_inval; 347 goto e_inval;
347 if (!rpf && !fib_num_tclassid_users(dev_net(dev)) && 348 if (!rpf && !fib_num_tclassid_users(net) &&
348 (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) 349 (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev)))
349 goto last_resort; 350 goto last_resort;
350 fib_combine_itag(itag, &res); 351 fib_combine_itag(itag, &res);
@@ -620,6 +621,8 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
620 [RTA_FLOW] = { .type = NLA_U32 }, 621 [RTA_FLOW] = { .type = NLA_U32 },
621 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 622 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
622 [RTA_ENCAP] = { .type = NLA_NESTED }, 623 [RTA_ENCAP] = { .type = NLA_NESTED },
624 [RTA_UID] = { .type = NLA_U32 },
625 [RTA_MARK] = { .type = NLA_U32 },
623}; 626};
624 627
625static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, 628static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
@@ -676,6 +679,10 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
676 cfg->fc_mx_len = nla_len(attr); 679 cfg->fc_mx_len = nla_len(attr);
677 break; 680 break;
678 case RTA_MULTIPATH: 681 case RTA_MULTIPATH:
682 err = lwtunnel_valid_encap_type_attr(nla_data(attr),
683 nla_len(attr));
684 if (err < 0)
685 goto errout;
679 cfg->fc_mp = nla_data(attr); 686 cfg->fc_mp = nla_data(attr);
680 cfg->fc_mp_len = nla_len(attr); 687 cfg->fc_mp_len = nla_len(attr);
681 break; 688 break;
@@ -690,6 +697,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
690 break; 697 break;
691 case RTA_ENCAP_TYPE: 698 case RTA_ENCAP_TYPE:
692 cfg->fc_encap_type = nla_get_u16(attr); 699 cfg->fc_encap_type = nla_get_u16(attr);
700 err = lwtunnel_valid_encap_type(cfg->fc_encap_type);
701 if (err < 0)
702 goto errout;
693 break; 703 break;
694 } 704 }
695 } 705 }
@@ -1073,7 +1083,8 @@ static void nl_fib_input(struct sk_buff *skb)
1073 1083
1074 net = sock_net(skb->sk); 1084 net = sock_net(skb->sk);
1075 nlh = nlmsg_hdr(skb); 1085 nlh = nlmsg_hdr(skb);
1076 if (skb->len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len || 1086 if (skb->len < nlmsg_total_size(sizeof(*frn)) ||
1087 skb->len < nlh->nlmsg_len ||
1077 nlmsg_len(nlh) < sizeof(*frn)) 1088 nlmsg_len(nlh) < sizeof(*frn))
1078 return; 1089 return;
1079 1090
@@ -1218,6 +1229,8 @@ static int __net_init ip_fib_net_init(struct net *net)
1218 int err; 1229 int err;
1219 size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ; 1230 size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
1220 1231
1232 net->ipv4.fib_seq = 0;
1233
1221 /* Avoid false sharing : Use at least a full cache line */ 1234 /* Avoid false sharing : Use at least a full cache line */
1222 size = max_t(size_t, size, L1_CACHE_BYTES); 1235 size = max_t(size_t, size, L1_CACHE_BYTES);
1223 1236
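The two rtm_to_fib_config() hunks above move lightweight-tunnel encap validation to netlink parse time. A condensed sketch of that flow, using only the helpers visible in the hunks (the surrounding attribute switch is abbreviated, so treat this as illustrative rather than the full function):

static int fib_cfg_check_encap_sketch(struct fib_config *cfg,
				      struct nlattr *attr, int type)
{
	int err = 0;

	switch (type) {
	case RTA_MULTIPATH:
		/* validate every RTA_ENCAP_TYPE nested in the rtnexthop
		 * list before the multipath payload is accepted */
		err = lwtunnel_valid_encap_type_attr(nla_data(attr),
						     nla_len(attr));
		break;
	case RTA_ENCAP_TYPE:
		cfg->fc_encap_type = nla_get_u16(attr);
		/* reject encap types with no registered lwtunnel ops
		 * before fib_create_info() ever sees them */
		err = lwtunnel_valid_encap_type(cfg->fc_encap_type);
		break;
	}

	return err;
}

The fib_semantics.c diff that follows shows the companion change in which lwtunnel_build_state() drops its net_device argument.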
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 388d3e21629b..317026a39cfa 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -13,7 +13,7 @@
13 * 2 of the License, or (at your option) any later version. 13 * 2 of the License, or (at your option) any later version.
14 */ 14 */
15 15
16#include <asm/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include <linux/types.h> 18#include <linux/types.h>
19#include <linux/kernel.h> 19#include <linux/kernel.h>
@@ -234,6 +234,7 @@ void free_fib_info(struct fib_info *fi)
234#endif 234#endif
235 call_rcu(&fi->rcu, free_fib_info_rcu); 235 call_rcu(&fi->rcu, free_fib_info_rcu);
236} 236}
237EXPORT_SYMBOL_GPL(free_fib_info);
237 238
238void fib_release_info(struct fib_info *fi) 239void fib_release_info(struct fib_info *fi)
239{ 240{
@@ -470,7 +471,6 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
470static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, 471static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
471 int remaining, struct fib_config *cfg) 472 int remaining, struct fib_config *cfg)
472{ 473{
473 struct net *net = cfg->fc_nlinfo.nl_net;
474 int ret; 474 int ret;
475 475
476 change_nexthops(fi) { 476 change_nexthops(fi) {
@@ -502,16 +502,14 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
502 nla = nla_find(attrs, attrlen, RTA_ENCAP); 502 nla = nla_find(attrs, attrlen, RTA_ENCAP);
503 if (nla) { 503 if (nla) {
504 struct lwtunnel_state *lwtstate; 504 struct lwtunnel_state *lwtstate;
505 struct net_device *dev = NULL;
506 struct nlattr *nla_entype; 505 struct nlattr *nla_entype;
507 506
508 nla_entype = nla_find(attrs, attrlen, 507 nla_entype = nla_find(attrs, attrlen,
509 RTA_ENCAP_TYPE); 508 RTA_ENCAP_TYPE);
510 if (!nla_entype) 509 if (!nla_entype)
511 goto err_inval; 510 goto err_inval;
512 if (cfg->fc_oif) 511
513 dev = __dev_get_by_index(net, cfg->fc_oif); 512 ret = lwtunnel_build_state(nla_get_u16(
514 ret = lwtunnel_build_state(dev, nla_get_u16(
515 nla_entype), 513 nla_entype),
516 nla, AF_INET, cfg, 514 nla, AF_INET, cfg,
517 &lwtstate); 515 &lwtstate);
@@ -596,21 +594,18 @@ static inline void fib_add_weight(struct fib_info *fi,
596 594
597#endif /* CONFIG_IP_ROUTE_MULTIPATH */ 595#endif /* CONFIG_IP_ROUTE_MULTIPATH */
598 596
599static int fib_encap_match(struct net *net, u16 encap_type, 597static int fib_encap_match(u16 encap_type,
600 struct nlattr *encap, 598 struct nlattr *encap,
601 int oif, const struct fib_nh *nh, 599 const struct fib_nh *nh,
602 const struct fib_config *cfg) 600 const struct fib_config *cfg)
603{ 601{
604 struct lwtunnel_state *lwtstate; 602 struct lwtunnel_state *lwtstate;
605 struct net_device *dev = NULL;
606 int ret, result = 0; 603 int ret, result = 0;
607 604
608 if (encap_type == LWTUNNEL_ENCAP_NONE) 605 if (encap_type == LWTUNNEL_ENCAP_NONE)
609 return 0; 606 return 0;
610 607
611 if (oif) 608 ret = lwtunnel_build_state(encap_type, encap,
612 dev = __dev_get_by_index(net, oif);
613 ret = lwtunnel_build_state(dev, encap_type, encap,
614 AF_INET, cfg, &lwtstate); 609 AF_INET, cfg, &lwtstate);
615 if (!ret) { 610 if (!ret) {
616 result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate); 611 result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
@@ -622,7 +617,6 @@ static int fib_encap_match(struct net *net, u16 encap_type,
622 617
623int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) 618int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
624{ 619{
625 struct net *net = cfg->fc_nlinfo.nl_net;
626#ifdef CONFIG_IP_ROUTE_MULTIPATH 620#ifdef CONFIG_IP_ROUTE_MULTIPATH
627 struct rtnexthop *rtnh; 621 struct rtnexthop *rtnh;
628 int remaining; 622 int remaining;
@@ -633,9 +627,8 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
633 627
634 if (cfg->fc_oif || cfg->fc_gw) { 628 if (cfg->fc_oif || cfg->fc_gw) {
635 if (cfg->fc_encap) { 629 if (cfg->fc_encap) {
636 if (fib_encap_match(net, cfg->fc_encap_type, 630 if (fib_encap_match(cfg->fc_encap_type,
637 cfg->fc_encap, cfg->fc_oif, 631 cfg->fc_encap, fi->fib_nh, cfg))
638 fi->fib_nh, cfg))
639 return 1; 632 return 1;
640 } 633 }
641 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && 634 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
@@ -1092,13 +1085,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
1092 1085
1093 if (cfg->fc_encap) { 1086 if (cfg->fc_encap) {
1094 struct lwtunnel_state *lwtstate; 1087 struct lwtunnel_state *lwtstate;
1095 struct net_device *dev = NULL;
1096 1088
1097 if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) 1089 if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE)
1098 goto err_inval; 1090 goto err_inval;
1099 if (cfg->fc_oif) 1091 err = lwtunnel_build_state(cfg->fc_encap_type,
1100 dev = __dev_get_by_index(net, cfg->fc_oif);
1101 err = lwtunnel_build_state(dev, cfg->fc_encap_type,
1102 cfg->fc_encap, AF_INET, cfg, 1092 cfg->fc_encap, AF_INET, cfg,
1103 &lwtstate); 1093 &lwtstate);
1104 if (err) 1094 if (err)
@@ -1278,8 +1268,9 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
1278 nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) 1268 nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
1279 goto nla_put_failure; 1269 goto nla_put_failure;
1280#endif 1270#endif
1281 if (fi->fib_nh->nh_lwtstate) 1271 if (fi->fib_nh->nh_lwtstate &&
1282 lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate); 1272 lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate) < 0)
1273 goto nla_put_failure;
1283 } 1274 }
1284#ifdef CONFIG_IP_ROUTE_MULTIPATH 1275#ifdef CONFIG_IP_ROUTE_MULTIPATH
1285 if (fi->fib_nhs > 1) { 1276 if (fi->fib_nhs > 1) {
@@ -1315,8 +1306,10 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
1315 nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) 1306 nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
1316 goto nla_put_failure; 1307 goto nla_put_failure;
1317#endif 1308#endif
1318 if (nh->nh_lwtstate) 1309 if (nh->nh_lwtstate &&
1319 lwtunnel_fill_encap(skb, nh->nh_lwtstate); 1310 lwtunnel_fill_encap(skb, nh->nh_lwtstate) < 0)
1311 goto nla_put_failure;
1312
1320 /* length of rtnetlink header + attributes */ 1313 /* length of rtnetlink header + attributes */
1321 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh; 1314 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
1322 } endfor_nexthops(fi); 1315 } endfor_nexthops(fi);
@@ -1362,6 +1355,36 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local)
1362 return ret; 1355 return ret;
1363} 1356}
1364 1357
1358static int call_fib_nh_notifiers(struct fib_nh *fib_nh,
1359 enum fib_event_type event_type)
1360{
1361 struct in_device *in_dev = __in_dev_get_rtnl(fib_nh->nh_dev);
1362 struct fib_nh_notifier_info info = {
1363 .fib_nh = fib_nh,
1364 };
1365
1366 switch (event_type) {
1367 case FIB_EVENT_NH_ADD:
1368 if (fib_nh->nh_flags & RTNH_F_DEAD)
1369 break;
1370 if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
1371 fib_nh->nh_flags & RTNH_F_LINKDOWN)
1372 break;
1373 return call_fib_notifiers(dev_net(fib_nh->nh_dev), event_type,
1374 &info.info);
1375 case FIB_EVENT_NH_DEL:
1376 if ((IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
1377 fib_nh->nh_flags & RTNH_F_LINKDOWN) ||
1378 (fib_nh->nh_flags & RTNH_F_DEAD))
1379 return call_fib_notifiers(dev_net(fib_nh->nh_dev),
1380 event_type, &info.info);
1381 default:
1382 break;
1383 }
1384
1385 return NOTIFY_DONE;
1386}
1387
1365/* Event force Flags Description 1388/* Event force Flags Description
1366 * NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host 1389 * NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host
1367 * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host 1390 * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host
@@ -1403,6 +1426,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
1403 nexthop_nh->nh_flags |= RTNH_F_LINKDOWN; 1426 nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
1404 break; 1427 break;
1405 } 1428 }
1429 call_fib_nh_notifiers(nexthop_nh,
1430 FIB_EVENT_NH_DEL);
1406 dead++; 1431 dead++;
1407 } 1432 }
1408#ifdef CONFIG_IP_ROUTE_MULTIPATH 1433#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -1433,7 +1458,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
1433} 1458}
1434 1459
1435/* Must be invoked inside of an RCU protected region. */ 1460/* Must be invoked inside of an RCU protected region. */
1436void fib_select_default(const struct flowi4 *flp, struct fib_result *res) 1461static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
1437{ 1462{
1438 struct fib_info *fi = NULL, *last_resort = NULL; 1463 struct fib_info *fi = NULL, *last_resort = NULL;
1439 struct hlist_head *fa_head = res->fa_head; 1464 struct hlist_head *fa_head = res->fa_head;
@@ -1557,6 +1582,7 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
1557 continue; 1582 continue;
1558 alive++; 1583 alive++;
1559 nexthop_nh->nh_flags &= ~nh_flags; 1584 nexthop_nh->nh_flags &= ~nh_flags;
1585 call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD);
1560 } endfor_nexthops(fi) 1586 } endfor_nexthops(fi)
1561 1587
1562 if (alive > 0) { 1588 if (alive > 0) {
@@ -1617,8 +1643,13 @@ void fib_select_multipath(struct fib_result *res, int hash)
1617void fib_select_path(struct net *net, struct fib_result *res, 1643void fib_select_path(struct net *net, struct fib_result *res,
1618 struct flowi4 *fl4, int mp_hash) 1644 struct flowi4 *fl4, int mp_hash)
1619{ 1645{
1646 bool oif_check;
1647
1648 oif_check = (fl4->flowi4_oif == 0 ||
1649 fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF);
1650
1620#ifdef CONFIG_IP_ROUTE_MULTIPATH 1651#ifdef CONFIG_IP_ROUTE_MULTIPATH
1621 if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0) { 1652 if (res->fi->fib_nhs > 1 && oif_check) {
1622 if (mp_hash < 0) 1653 if (mp_hash < 0)
1623 mp_hash = get_hash_from_flowi4(fl4) >> 1; 1654 mp_hash = get_hash_from_flowi4(fl4) >> 1;
1624 1655
@@ -1628,7 +1659,7 @@ void fib_select_path(struct net *net, struct fib_result *res,
1628#endif 1659#endif
1629 if (!res->prefixlen && 1660 if (!res->prefixlen &&
1630 res->table->tb_num_default > 1 && 1661 res->table->tb_num_default > 1 &&
1631 res->type == RTN_UNICAST && !fl4->flowi4_oif) 1662 res->type == RTN_UNICAST && oif_check)
1632 fib_select_default(fl4, res); 1663 fib_select_default(fl4, res);
1633 1664
1634 if (!fl4->saddr) 1665 if (!fl4->saddr)
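The call_fib_nh_notifiers() helper added above gates nexthop events on the same conditions the forwarding path already honours. A small illustrative restatement of that rule (field names follow the hunk; this helper does not exist in the patch):

static bool sketch_nh_is_usable(const struct fib_nh *nh,
				const struct in_device *in_dev)
{
	/* FIB_EVENT_NH_ADD is only sent when both checks below pass;
	 * FIB_EVENT_NH_DEL is only sent when at least one of them fails */
	if (nh->nh_flags & RTNH_F_DEAD)
		return false;
	if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
	    (nh->nh_flags & RTNH_F_LINKDOWN))
		return false;
	return true;
}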
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index e3665bf7a7f3..2f0d8233950f 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -50,7 +50,7 @@
50 50
51#define VERSION "0.409" 51#define VERSION "0.409"
52 52
53#include <asm/uaccess.h> 53#include <linux/uaccess.h>
54#include <linux/bitops.h> 54#include <linux/bitops.h>
55#include <linux/types.h> 55#include <linux/types.h>
56#include <linux/kernel.h> 56#include <linux/kernel.h>
@@ -84,31 +84,119 @@
84#include <trace/events/fib.h> 84#include <trace/events/fib.h>
85#include "fib_lookup.h" 85#include "fib_lookup.h"
86 86
87static BLOCKING_NOTIFIER_HEAD(fib_chain); 87static unsigned int fib_seq_sum(void)
88{
89 unsigned int fib_seq = 0;
90 struct net *net;
91
92 rtnl_lock();
93 for_each_net(net)
94 fib_seq += net->ipv4.fib_seq;
95 rtnl_unlock();
96
97 return fib_seq;
98}
99
100static ATOMIC_NOTIFIER_HEAD(fib_chain);
101
102static int call_fib_notifier(struct notifier_block *nb, struct net *net,
103 enum fib_event_type event_type,
104 struct fib_notifier_info *info)
105{
106 info->net = net;
107 return nb->notifier_call(nb, event_type, info);
108}
109
110static void fib_rules_notify(struct net *net, struct notifier_block *nb,
111 enum fib_event_type event_type)
112{
113#ifdef CONFIG_IP_MULTIPLE_TABLES
114 struct fib_notifier_info info;
115
116 if (net->ipv4.fib_has_custom_rules)
117 call_fib_notifier(nb, net, event_type, &info);
118#endif
119}
120
121static void fib_notify(struct net *net, struct notifier_block *nb,
122 enum fib_event_type event_type);
123
124static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
125 enum fib_event_type event_type, u32 dst,
126 int dst_len, struct fib_info *fi,
127 u8 tos, u8 type, u32 tb_id)
128{
129 struct fib_entry_notifier_info info = {
130 .dst = dst,
131 .dst_len = dst_len,
132 .fi = fi,
133 .tos = tos,
134 .type = type,
135 .tb_id = tb_id,
136 };
137 return call_fib_notifier(nb, net, event_type, &info.info);
138}
88 139
89int register_fib_notifier(struct notifier_block *nb) 140static bool fib_dump_is_consistent(struct notifier_block *nb,
141 void (*cb)(struct notifier_block *nb),
142 unsigned int fib_seq)
90{ 143{
91 return blocking_notifier_chain_register(&fib_chain, nb); 144 atomic_notifier_chain_register(&fib_chain, nb);
145 if (fib_seq == fib_seq_sum())
146 return true;
147 atomic_notifier_chain_unregister(&fib_chain, nb);
148 if (cb)
149 cb(nb);
150 return false;
151}
152
153#define FIB_DUMP_MAX_RETRIES 5
154int register_fib_notifier(struct notifier_block *nb,
155 void (*cb)(struct notifier_block *nb))
156{
157 int retries = 0;
158
159 do {
160 unsigned int fib_seq = fib_seq_sum();
161 struct net *net;
162
163 /* Mutex semantics guarantee that every change done to
164 * FIB tries before we read the change sequence counter
165 * is now visible to us.
166 */
167 rcu_read_lock();
168 for_each_net_rcu(net) {
169 fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD);
170 fib_notify(net, nb, FIB_EVENT_ENTRY_ADD);
171 }
172 rcu_read_unlock();
173
174 if (fib_dump_is_consistent(nb, cb, fib_seq))
175 return 0;
176 } while (++retries < FIB_DUMP_MAX_RETRIES);
177
178 return -EBUSY;
92} 179}
93EXPORT_SYMBOL(register_fib_notifier); 180EXPORT_SYMBOL(register_fib_notifier);
94 181
95int unregister_fib_notifier(struct notifier_block *nb) 182int unregister_fib_notifier(struct notifier_block *nb)
96{ 183{
97 return blocking_notifier_chain_unregister(&fib_chain, nb); 184 return atomic_notifier_chain_unregister(&fib_chain, nb);
98} 185}
99EXPORT_SYMBOL(unregister_fib_notifier); 186EXPORT_SYMBOL(unregister_fib_notifier);
100 187
101int call_fib_notifiers(struct net *net, enum fib_event_type event_type, 188int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
102 struct fib_notifier_info *info) 189 struct fib_notifier_info *info)
103{ 190{
191 net->ipv4.fib_seq++;
104 info->net = net; 192 info->net = net;
105 return blocking_notifier_call_chain(&fib_chain, event_type, info); 193 return atomic_notifier_call_chain(&fib_chain, event_type, info);
106} 194}
107 195
108static int call_fib_entry_notifiers(struct net *net, 196static int call_fib_entry_notifiers(struct net *net,
109 enum fib_event_type event_type, u32 dst, 197 enum fib_event_type event_type, u32 dst,
110 int dst_len, struct fib_info *fi, 198 int dst_len, struct fib_info *fi,
111 u8 tos, u8 type, u32 tb_id, u32 nlflags) 199 u8 tos, u8 type, u32 tb_id)
112{ 200{
113 struct fib_entry_notifier_info info = { 201 struct fib_entry_notifier_info info = {
114 .dst = dst, 202 .dst = dst,
@@ -117,7 +205,6 @@ static int call_fib_entry_notifiers(struct net *net,
117 .tos = tos, 205 .tos = tos,
118 .type = type, 206 .type = type,
119 .tb_id = tb_id, 207 .tb_id = tb_id,
120 .nlflags = nlflags,
121 }; 208 };
122 return call_fib_notifiers(net, event_type, &info.info); 209 return call_fib_notifiers(net, event_type, &info.info);
123} 210}
@@ -1109,6 +1196,7 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
1109int fib_table_insert(struct net *net, struct fib_table *tb, 1196int fib_table_insert(struct net *net, struct fib_table *tb,
1110 struct fib_config *cfg) 1197 struct fib_config *cfg)
1111{ 1198{
1199 enum fib_event_type event = FIB_EVENT_ENTRY_ADD;
1112 struct trie *t = (struct trie *)tb->tb_data; 1200 struct trie *t = (struct trie *)tb->tb_data;
1113 struct fib_alias *fa, *new_fa; 1201 struct fib_alias *fa, *new_fa;
1114 struct key_vector *l, *tp; 1202 struct key_vector *l, *tp;
@@ -1206,6 +1294,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
1206 new_fa->tb_id = tb->tb_id; 1294 new_fa->tb_id = tb->tb_id;
1207 new_fa->fa_default = -1; 1295 new_fa->fa_default = -1;
1208 1296
1297 call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE,
1298 key, plen, fi,
1299 new_fa->fa_tos, cfg->fc_type,
1300 tb->tb_id);
1301 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1302 tb->tb_id, &cfg->fc_nlinfo, nlflags);
1303
1209 hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); 1304 hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1210 1305
1211 alias_free_mem_rcu(fa); 1306 alias_free_mem_rcu(fa);
@@ -1214,13 +1309,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
1214 if (state & FA_S_ACCESSED) 1309 if (state & FA_S_ACCESSED)
1215 rt_cache_flush(cfg->fc_nlinfo.nl_net); 1310 rt_cache_flush(cfg->fc_nlinfo.nl_net);
1216 1311
1217 call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD,
1218 key, plen, fi,
1219 new_fa->fa_tos, cfg->fc_type,
1220 tb->tb_id, cfg->fc_nlflags);
1221 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1222 tb->tb_id, &cfg->fc_nlinfo, nlflags);
1223
1224 goto succeeded; 1312 goto succeeded;
1225 } 1313 }
1226 /* Error if we find a perfect match which 1314 /* Error if we find a perfect match which
@@ -1230,10 +1318,12 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
1230 if (fa_match) 1318 if (fa_match)
1231 goto out; 1319 goto out;
1232 1320
1233 if (cfg->fc_nlflags & NLM_F_APPEND) 1321 if (cfg->fc_nlflags & NLM_F_APPEND) {
1322 event = FIB_EVENT_ENTRY_APPEND;
1234 nlflags |= NLM_F_APPEND; 1323 nlflags |= NLM_F_APPEND;
1235 else 1324 } else {
1236 fa = fa_first; 1325 fa = fa_first;
1326 }
1237 } 1327 }
1238 err = -ENOENT; 1328 err = -ENOENT;
1239 if (!(cfg->fc_nlflags & NLM_F_CREATE)) 1329 if (!(cfg->fc_nlflags & NLM_F_CREATE))
@@ -1262,8 +1352,8 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
1262 tb->tb_num_default++; 1352 tb->tb_num_default++;
1263 1353
1264 rt_cache_flush(cfg->fc_nlinfo.nl_net); 1354 rt_cache_flush(cfg->fc_nlinfo.nl_net);
1265 call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, key, plen, fi, tos, 1355 call_fib_entry_notifiers(net, event, key, plen, fi, tos, cfg->fc_type,
1266 cfg->fc_type, tb->tb_id, cfg->fc_nlflags); 1356 tb->tb_id);
1267 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, 1357 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
1268 &cfg->fc_nlinfo, nlflags); 1358 &cfg->fc_nlinfo, nlflags);
1269succeeded: 1359succeeded:
@@ -1564,8 +1654,8 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
1564 return -ESRCH; 1654 return -ESRCH;
1565 1655
1566 call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, 1656 call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
1567 fa_to_delete->fa_info, tos, cfg->fc_type, 1657 fa_to_delete->fa_info, tos,
1568 tb->tb_id, 0); 1658 fa_to_delete->fa_type, tb->tb_id);
1569 rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, 1659 rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
1570 &cfg->fc_nlinfo, 0); 1660 &cfg->fc_nlinfo, 0);
1571 1661
@@ -1874,7 +1964,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
1874 hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { 1964 hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
1875 struct fib_info *fi = fa->fa_info; 1965 struct fib_info *fi = fa->fa_info;
1876 1966
1877 if (!fi || !(fi->fib_flags & RTNH_F_DEAD)) { 1967 if (!fi || !(fi->fib_flags & RTNH_F_DEAD) ||
1968 tb->tb_id != fa->tb_id) {
1878 slen = fa->fa_slen; 1969 slen = fa->fa_slen;
1879 continue; 1970 continue;
1880 } 1971 }
@@ -1883,7 +1974,7 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
1883 n->key, 1974 n->key,
1884 KEYLENGTH - fa->fa_slen, 1975 KEYLENGTH - fa->fa_slen,
1885 fi, fa->fa_tos, fa->fa_type, 1976 fi, fa->fa_tos, fa->fa_type,
1886 tb->tb_id, 0); 1977 tb->tb_id);
1887 hlist_del_rcu(&fa->fa_list); 1978 hlist_del_rcu(&fa->fa_list);
1888 fib_release_info(fa->fa_info); 1979 fib_release_info(fa->fa_info);
1889 alias_free_mem_rcu(fa); 1980 alias_free_mem_rcu(fa);
@@ -1903,6 +1994,62 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
1903 return found; 1994 return found;
1904} 1995}
1905 1996
1997static void fib_leaf_notify(struct net *net, struct key_vector *l,
1998 struct fib_table *tb, struct notifier_block *nb,
1999 enum fib_event_type event_type)
2000{
2001 struct fib_alias *fa;
2002
2003 hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
2004 struct fib_info *fi = fa->fa_info;
2005
2006 if (!fi)
2007 continue;
2008
2009 /* local and main table can share the same trie,
2010 * so don't notify twice for the same entry.
2011 */
2012 if (tb->tb_id != fa->tb_id)
2013 continue;
2014
2015 call_fib_entry_notifier(nb, net, event_type, l->key,
2016 KEYLENGTH - fa->fa_slen, fi, fa->fa_tos,
2017 fa->fa_type, fa->tb_id);
2018 }
2019}
2020
2021static void fib_table_notify(struct net *net, struct fib_table *tb,
2022 struct notifier_block *nb,
2023 enum fib_event_type event_type)
2024{
2025 struct trie *t = (struct trie *)tb->tb_data;
2026 struct key_vector *l, *tp = t->kv;
2027 t_key key = 0;
2028
2029 while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
2030 fib_leaf_notify(net, l, tb, nb, event_type);
2031
2032 key = l->key + 1;
2033 /* stop in case of wrap around */
2034 if (key < l->key)
2035 break;
2036 }
2037}
2038
2039static void fib_notify(struct net *net, struct notifier_block *nb,
2040 enum fib_event_type event_type)
2041{
2042 unsigned int h;
2043
2044 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2045 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2046 struct fib_table *tb;
2047
2048 hlist_for_each_entry_rcu(tb, head, tb_hlist)
2049 fib_table_notify(net, tb, nb, event_type);
2050 }
2051}
2052
1906static void __trie_free_rcu(struct rcu_head *head) 2053static void __trie_free_rcu(struct rcu_head *head)
1907{ 2054{
1908 struct fib_table *tb = container_of(head, struct fib_table, rcu); 2055 struct fib_table *tb = container_of(head, struct fib_table, rcu);
@@ -2241,7 +2388,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2241 2388
2242 seq_printf(seq, 2389 seq_printf(seq,
2243 "Basic info: size of leaf:" 2390 "Basic info: size of leaf:"
2244 " %Zd bytes, size of tnode: %Zd bytes.\n", 2391 " %zd bytes, size of tnode: %zd bytes.\n",
2245 LEAF_SIZE, TNODE_SIZE(0)); 2392 LEAF_SIZE, TNODE_SIZE(0));
2246 2393
2247 for (h = 0; h < FIB_TABLE_HASHSZ; h++) { 2394 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
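The fib_trie.c changes above replace the blocking notifier chain with an atomic one and make register_fib_notifier() replay the current FIB state, retrying up to FIB_DUMP_MAX_RETRIES times until the per-netns sequence counters are stable. A minimal sketch of a consumer under the new semantics; the handler and its wiring are hypothetical, only the registration API and the info structures come from the patch:

#include <linux/kernel.h>
#include <linux/notifier.h>
#include <net/ip_fib.h>

static int sketch_fib_event(struct notifier_block *nb,
			    unsigned long event, void *ptr)
{
	struct fib_notifier_info *info = ptr;
	struct fib_entry_notifier_info *fen_info;

	/* the chain is atomic now, so no sleeping in here */
	switch (event) {
	case FIB_EVENT_ENTRY_ADD:
	case FIB_EVENT_ENTRY_DEL:
		fen_info = container_of(info,
					struct fib_entry_notifier_info, info);
		/* program or remove fen_info->dst / fen_info->dst_len */
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block sketch_fib_nb = {
	.notifier_call = sketch_fib_event,
};

static void sketch_fib_dump_abort(struct notifier_block *nb)
{
	/* a retried dump came back inconsistent: throw away whatever
	 * ENTRY_ADD / RULE_ADD state was replayed so far */
}

static int sketch_fib_register(void)
{
	return register_fib_notifier(&sketch_fib_nb, sketch_fib_dump_abort);
}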
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 030d1531e897..805f6607f8d9 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -622,14 +622,7 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg)
622 return err; 622 return err;
623} 623}
624 624
625static struct genl_family fou_nl_family = { 625static struct genl_family fou_nl_family;
626 .id = GENL_ID_GENERATE,
627 .hdrsize = 0,
628 .name = FOU_GENL_NAME,
629 .version = FOU_GENL_VERSION,
630 .maxattr = FOU_ATTR_MAX,
631 .netnsok = true,
632};
633 626
634static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { 627static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
635 [FOU_ATTR_PORT] = { .type = NLA_U16, }, 628 [FOU_ATTR_PORT] = { .type = NLA_U16, },
@@ -831,6 +824,17 @@ static const struct genl_ops fou_nl_ops[] = {
831 }, 824 },
832}; 825};
833 826
827static struct genl_family fou_nl_family __ro_after_init = {
828 .hdrsize = 0,
829 .name = FOU_GENL_NAME,
830 .version = FOU_GENL_VERSION,
831 .maxattr = FOU_ATTR_MAX,
832 .netnsok = true,
833 .module = THIS_MODULE,
834 .ops = fou_nl_ops,
835 .n_ops = ARRAY_SIZE(fou_nl_ops),
836};
837
834size_t fou_encap_hlen(struct ip_tunnel_encap *e) 838size_t fou_encap_hlen(struct ip_tunnel_encap *e)
835{ 839{
836 return sizeof(struct udphdr); 840 return sizeof(struct udphdr);
@@ -1086,8 +1090,7 @@ static int __init fou_init(void)
1086 if (ret) 1090 if (ret)
1087 goto exit; 1091 goto exit;
1088 1092
1089 ret = genl_register_family_with_ops(&fou_nl_family, 1093 ret = genl_register_family(&fou_nl_family);
1090 fou_nl_ops);
1091 if (ret < 0) 1094 if (ret < 0)
1092 goto unregister; 1095 goto unregister;
1093 1096
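The fou.c hunks above follow the generic netlink rework in this release: GENL_ID_GENERATE is gone, the ops array hangs off the (now __ro_after_init) family, and a single genl_register_family() call replaces genl_register_family_with_ops(). The same pattern for a hypothetical "foo" family, as a minimal sketch:

#include <linux/module.h>
#include <net/genetlink.h>

static int foo_nl_cmd_doit(struct sk_buff *skb, struct genl_info *info)
{
	return 0;	/* hypothetical command handler */
}

static const struct genl_ops foo_nl_ops[] = {
	{
		.cmd  = 1,	/* hypothetical command number */
		.doit = foo_nl_cmd_doit,
	},
};

static struct genl_family foo_nl_family __ro_after_init = {
	.name    = "foo",	/* hypothetical family name */
	.version = 1,
	.maxattr = 0,
	.module  = THIS_MODULE,
	.ops     = foo_nl_ops,	/* ops travel with the family... */
	.n_ops   = ARRAY_SIZE(foo_nl_ops),
};

static int __init foo_nl_init(void)
{
	/* ...so registration is one call and the family id is always
	 * chosen automatically */
	return genl_register_family(&foo_nl_family);
}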
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 48734ee6293f..fc310db2708b 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -91,7 +91,7 @@
91#include <linux/errno.h> 91#include <linux/errno.h>
92#include <linux/timer.h> 92#include <linux/timer.h>
93#include <linux/init.h> 93#include <linux/init.h>
94#include <asm/uaccess.h> 94#include <linux/uaccess.h>
95#include <net/checksum.h> 95#include <net/checksum.h>
96#include <net/xfrm.h> 96#include <net/xfrm.h>
97#include <net/inet_common.h> 97#include <net/inet_common.h>
@@ -209,19 +209,17 @@ static struct sock *icmp_sk(struct net *net)
209 return *this_cpu_ptr(net->ipv4.icmp_sk); 209 return *this_cpu_ptr(net->ipv4.icmp_sk);
210} 210}
211 211
212/* Called with BH disabled */
212static inline struct sock *icmp_xmit_lock(struct net *net) 213static inline struct sock *icmp_xmit_lock(struct net *net)
213{ 214{
214 struct sock *sk; 215 struct sock *sk;
215 216
216 local_bh_disable();
217
218 sk = icmp_sk(net); 217 sk = icmp_sk(net);
219 218
220 if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { 219 if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
221 /* This can happen if the output path signals a 220 /* This can happen if the output path signals a
222 * dst_link_failure() for an outgoing ICMP packet. 221 * dst_link_failure() for an outgoing ICMP packet.
223 */ 222 */
224 local_bh_enable();
225 return NULL; 223 return NULL;
226 } 224 }
227 return sk; 225 return sk;
@@ -229,7 +227,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
229 227
230static inline void icmp_xmit_unlock(struct sock *sk) 228static inline void icmp_xmit_unlock(struct sock *sk)
231{ 229{
232 spin_unlock_bh(&sk->sk_lock.slock); 230 spin_unlock(&sk->sk_lock.slock);
233} 231}
234 232
235int sysctl_icmp_msgs_per_sec __read_mostly = 1000; 233int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
@@ -282,6 +280,33 @@ bool icmp_global_allow(void)
282} 280}
283EXPORT_SYMBOL(icmp_global_allow); 281EXPORT_SYMBOL(icmp_global_allow);
284 282
283static bool icmpv4_mask_allow(struct net *net, int type, int code)
284{
285 if (type > NR_ICMP_TYPES)
286 return true;
287
288 /* Don't limit PMTU discovery. */
289 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
290 return true;
291
292 /* Limit if icmp type is enabled in ratemask. */
293 if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
294 return true;
295
296 return false;
297}
298
299static bool icmpv4_global_allow(struct net *net, int type, int code)
300{
301 if (icmpv4_mask_allow(net, type, code))
302 return true;
303
304 if (icmp_global_allow())
305 return true;
306
307 return false;
308}
309
285/* 310/*
286 * Send an ICMP frame. 311 * Send an ICMP frame.
287 */ 312 */
@@ -290,34 +315,22 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
290 struct flowi4 *fl4, int type, int code) 315 struct flowi4 *fl4, int type, int code)
291{ 316{
292 struct dst_entry *dst = &rt->dst; 317 struct dst_entry *dst = &rt->dst;
318 struct inet_peer *peer;
293 bool rc = true; 319 bool rc = true;
320 int vif;
294 321
295 if (type > NR_ICMP_TYPES) 322 if (icmpv4_mask_allow(net, type, code))
296 goto out;
297
298 /* Don't limit PMTU discovery. */
299 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
300 goto out; 323 goto out;
301 324
302 /* No rate limit on loopback */ 325 /* No rate limit on loopback */
303 if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) 326 if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
304 goto out; 327 goto out;
305 328
306 /* Limit if icmp type is enabled in ratemask. */ 329 vif = l3mdev_master_ifindex(dst->dev);
307 if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask)) 330 peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
308 goto out; 331 rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit);
309 332 if (peer)
310 rc = false; 333 inet_putpeer(peer);
311 if (icmp_global_allow()) {
312 int vif = l3mdev_master_ifindex(dst->dev);
313 struct inet_peer *peer;
314
315 peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
316 rc = inet_peer_xrlim_allow(peer,
317 net->ipv4.sysctl_icmp_ratelimit);
318 if (peer)
319 inet_putpeer(peer);
320 }
321out: 334out:
322 return rc; 335 return rc;
323} 336}
@@ -396,13 +409,22 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
396 struct inet_sock *inet; 409 struct inet_sock *inet;
397 __be32 daddr, saddr; 410 __be32 daddr, saddr;
398 u32 mark = IP4_REPLY_MARK(net, skb->mark); 411 u32 mark = IP4_REPLY_MARK(net, skb->mark);
412 int type = icmp_param->data.icmph.type;
413 int code = icmp_param->data.icmph.code;
399 414
400 if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) 415 if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
401 return; 416 return;
402 417
418 /* Needed by both icmp_global_allow and icmp_xmit_lock */
419 local_bh_disable();
420
421 /* global icmp_msgs_per_sec */
422 if (!icmpv4_global_allow(net, type, code))
423 goto out_bh_enable;
424
403 sk = icmp_xmit_lock(net); 425 sk = icmp_xmit_lock(net);
404 if (!sk) 426 if (!sk)
405 return; 427 goto out_bh_enable;
406 inet = inet_sk(sk); 428 inet = inet_sk(sk);
407 429
408 icmp_param->data.icmph.checksum = 0; 430 icmp_param->data.icmph.checksum = 0;
@@ -425,6 +447,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
425 fl4.daddr = daddr; 447 fl4.daddr = daddr;
426 fl4.saddr = saddr; 448 fl4.saddr = saddr;
427 fl4.flowi4_mark = mark; 449 fl4.flowi4_mark = mark;
450 fl4.flowi4_uid = sock_net_uid(net, NULL);
428 fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); 451 fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
429 fl4.flowi4_proto = IPPROTO_ICMP; 452 fl4.flowi4_proto = IPPROTO_ICMP;
430 fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); 453 fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
@@ -432,12 +455,13 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
432 rt = ip_route_output_key(net, &fl4); 455 rt = ip_route_output_key(net, &fl4);
433 if (IS_ERR(rt)) 456 if (IS_ERR(rt))
434 goto out_unlock; 457 goto out_unlock;
435 if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type, 458 if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
436 icmp_param->data.icmph.code))
437 icmp_push_reply(icmp_param, &fl4, &ipc, &rt); 459 icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
438 ip_rt_put(rt); 460 ip_rt_put(rt);
439out_unlock: 461out_unlock:
440 icmp_xmit_unlock(sk); 462 icmp_xmit_unlock(sk);
463out_bh_enable:
464 local_bh_enable();
441} 465}
442 466
443#ifdef CONFIG_IP_ROUTE_MULTIPATH 467#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -473,6 +497,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
473 param->replyopts.opt.opt.faddr : iph->saddr); 497 param->replyopts.opt.opt.faddr : iph->saddr);
474 fl4->saddr = saddr; 498 fl4->saddr = saddr;
475 fl4->flowi4_mark = mark; 499 fl4->flowi4_mark = mark;
500 fl4->flowi4_uid = sock_net_uid(net, NULL);
476 fl4->flowi4_tos = RT_TOS(tos); 501 fl4->flowi4_tos = RT_TOS(tos);
477 fl4->flowi4_proto = IPPROTO_ICMP; 502 fl4->flowi4_proto = IPPROTO_ICMP;
478 fl4->fl4_icmp_type = type; 503 fl4->fl4_icmp_type = type;
@@ -569,7 +594,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
569{ 594{
570 struct iphdr *iph; 595 struct iphdr *iph;
571 int room; 596 int room;
572 struct icmp_bxm *icmp_param; 597 struct icmp_bxm icmp_param;
573 struct rtable *rt = skb_rtable(skb_in); 598 struct rtable *rt = skb_rtable(skb_in);
574 struct ipcm_cookie ipc; 599 struct ipcm_cookie ipc;
575 struct flowi4 fl4; 600 struct flowi4 fl4;
@@ -646,13 +671,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
646 } 671 }
647 } 672 }
648 673
649 icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC); 674 /* Needed by both icmp_global_allow and icmp_xmit_lock */
650 if (!icmp_param) 675 local_bh_disable();
651 return; 676
677 /* Check global sysctl_icmp_msgs_per_sec ratelimit */
678 if (!icmpv4_global_allow(net, type, code))
679 goto out_bh_enable;
652 680
653 sk = icmp_xmit_lock(net); 681 sk = icmp_xmit_lock(net);
654 if (!sk) 682 if (!sk)
655 goto out_free; 683 goto out_bh_enable;
656 684
657 /* 685 /*
658 * Construct source address and options. 686 * Construct source address and options.
@@ -679,7 +707,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
679 iph->tos; 707 iph->tos;
680 mark = IP4_REPLY_MARK(net, skb_in->mark); 708 mark = IP4_REPLY_MARK(net, skb_in->mark);
681 709
682 if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in)) 710 if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in))
683 goto out_unlock; 711 goto out_unlock;
684 712
685 713
@@ -687,25 +715,26 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
687 * Prepare data for ICMP header. 715 * Prepare data for ICMP header.
688 */ 716 */
689 717
690 icmp_param->data.icmph.type = type; 718 icmp_param.data.icmph.type = type;
691 icmp_param->data.icmph.code = code; 719 icmp_param.data.icmph.code = code;
692 icmp_param->data.icmph.un.gateway = info; 720 icmp_param.data.icmph.un.gateway = info;
693 icmp_param->data.icmph.checksum = 0; 721 icmp_param.data.icmph.checksum = 0;
694 icmp_param->skb = skb_in; 722 icmp_param.skb = skb_in;
695 icmp_param->offset = skb_network_offset(skb_in); 723 icmp_param.offset = skb_network_offset(skb_in);
696 inet_sk(sk)->tos = tos; 724 inet_sk(sk)->tos = tos;
697 sk->sk_mark = mark; 725 sk->sk_mark = mark;
698 ipc.addr = iph->saddr; 726 ipc.addr = iph->saddr;
699 ipc.opt = &icmp_param->replyopts.opt; 727 ipc.opt = &icmp_param.replyopts.opt;
700 ipc.tx_flags = 0; 728 ipc.tx_flags = 0;
701 ipc.ttl = 0; 729 ipc.ttl = 0;
702 ipc.tos = -1; 730 ipc.tos = -1;
703 731
704 rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, 732 rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
705 type, code, icmp_param); 733 type, code, &icmp_param);
706 if (IS_ERR(rt)) 734 if (IS_ERR(rt))
707 goto out_unlock; 735 goto out_unlock;
708 736
737 /* peer icmp_ratelimit */
709 if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code)) 738 if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
710 goto ende; 739 goto ende;
711 740
@@ -714,21 +743,21 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
714 room = dst_mtu(&rt->dst); 743 room = dst_mtu(&rt->dst);
715 if (room > 576) 744 if (room > 576)
716 room = 576; 745 room = 576;
717 room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen; 746 room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
718 room -= sizeof(struct icmphdr); 747 room -= sizeof(struct icmphdr);
719 748
720 icmp_param->data_len = skb_in->len - icmp_param->offset; 749 icmp_param.data_len = skb_in->len - icmp_param.offset;
721 if (icmp_param->data_len > room) 750 if (icmp_param.data_len > room)
722 icmp_param->data_len = room; 751 icmp_param.data_len = room;
723 icmp_param->head_len = sizeof(struct icmphdr); 752 icmp_param.head_len = sizeof(struct icmphdr);
724 753
725 icmp_push_reply(icmp_param, &fl4, &ipc, &rt); 754 icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
726ende: 755ende:
727 ip_rt_put(rt); 756 ip_rt_put(rt);
728out_unlock: 757out_unlock:
729 icmp_xmit_unlock(sk); 758 icmp_xmit_unlock(sk);
730out_free: 759out_bh_enable:
731 kfree(icmp_param); 760 local_bh_enable();
732out:; 761out:;
733} 762}
734EXPORT_SYMBOL(icmp_send); 763EXPORT_SYMBOL(icmp_send);
@@ -1045,12 +1074,12 @@ int icmp_rcv(struct sk_buff *skb)
1045 1074
1046 if (success) { 1075 if (success) {
1047 consume_skb(skb); 1076 consume_skb(skb);
1048 return 0; 1077 return NET_RX_SUCCESS;
1049 } 1078 }
1050 1079
1051drop: 1080drop:
1052 kfree_skb(skb); 1081 kfree_skb(skb);
1053 return 0; 1082 return NET_RX_DROP;
1054csum_error: 1083csum_error:
1055 __ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS); 1084 __ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS);
1056error: 1085error:
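The icmp.c changes above restructure the rate limiting so the cheap global token bucket is consulted before any socket lock is taken, with BH handling pulled out of icmp_xmit_lock() and into the callers, which also lets icmp_send() keep its icmp_bxm on the stack instead of kmalloc(). A condensed sketch of the resulting ordering, reusing the static helpers from this file, so it is illustrative rather than standalone:

static void sketch_icmp_tx(struct net *net, int type, int code)
{
	struct sock *sk;

	local_bh_disable();		/* callers own the BH state now */

	/* global icmp_msgs_per_sec limit, checked before locking */
	if (!icmpv4_global_allow(net, type, code))
		goto out_bh_enable;

	sk = icmp_xmit_lock(net);	/* spin_trylock only, BH already off */
	if (!sk)
		goto out_bh_enable;

	/* ... route lookup, then the per-destination inet_peer limit via
	 * icmpv4_xrlim_allow(), then icmp_push_reply() ... */

	icmp_xmit_unlock(sk);
out_bh_enable:
	local_bh_enable();
}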
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 15db786d50ed..44fd86de2823 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -72,7 +72,7 @@
72 72
73#include <linux/module.h> 73#include <linux/module.h>
74#include <linux/slab.h> 74#include <linux/slab.h>
75#include <asm/uaccess.h> 75#include <linux/uaccess.h>
76#include <linux/types.h> 76#include <linux/types.h>
77#include <linux/kernel.h> 77#include <linux/kernel.h>
78#include <linux/jiffies.h> 78#include <linux/jiffies.h>
@@ -219,9 +219,14 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
219static void igmp_gq_start_timer(struct in_device *in_dev) 219static void igmp_gq_start_timer(struct in_device *in_dev)
220{ 220{
221 int tv = prandom_u32() % in_dev->mr_maxdelay; 221 int tv = prandom_u32() % in_dev->mr_maxdelay;
222 unsigned long exp = jiffies + tv + 2;
223
224 if (in_dev->mr_gq_running &&
225 time_after_eq(exp, (in_dev->mr_gq_timer).expires))
226 return;
222 227
223 in_dev->mr_gq_running = 1; 228 in_dev->mr_gq_running = 1;
224 if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2)) 229 if (!mod_timer(&in_dev->mr_gq_timer, exp))
225 in_dev_hold(in_dev); 230 in_dev_hold(in_dev);
226} 231}
227 232
@@ -1167,6 +1172,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
1167 psf->sf_crcount = im->crcount; 1172 psf->sf_crcount = im->crcount;
1168 } 1173 }
1169 in_dev_put(pmc->interface); 1174 in_dev_put(pmc->interface);
1175 kfree(pmc);
1170 } 1176 }
1171 spin_unlock_bh(&im->lock); 1177 spin_unlock_bh(&im->lock);
1172} 1178}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 61a9deec2993..5e313c1ac94f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -31,6 +31,86 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
31EXPORT_SYMBOL(inet_csk_timer_bug_msg); 31EXPORT_SYMBOL(inet_csk_timer_bug_msg);
32#endif 32#endif
33 33
34#if IS_ENABLED(CONFIG_IPV6)
35/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
36 * only, and any IPv4 addresses if not IPv6 only
37 * match_wildcard == false: addresses must be exactly the same, i.e.
38 * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
39 * and 0.0.0.0 equals to 0.0.0.0 only
40 */
41static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
42 const struct in6_addr *sk2_rcv_saddr6,
43 __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
44 bool sk1_ipv6only, bool sk2_ipv6only,
45 bool match_wildcard)
46{
47 int addr_type = ipv6_addr_type(sk1_rcv_saddr6);
48 int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
49
50 /* if both are mapped, treat as IPv4 */
51 if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
52 if (!sk2_ipv6only) {
53 if (sk1_rcv_saddr == sk2_rcv_saddr)
54 return 1;
55 if (!sk1_rcv_saddr || !sk2_rcv_saddr)
56 return match_wildcard;
57 }
58 return 0;
59 }
60
61 if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
62 return 1;
63
64 if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
65 !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
66 return 1;
67
68 if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
69 !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED))
70 return 1;
71
72 if (sk2_rcv_saddr6 &&
73 ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6))
74 return 1;
75
76 return 0;
77}
78#endif
79
80/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses
81 * match_wildcard == false: addresses must be exactly the same, i.e.
82 * 0.0.0.0 only equals to 0.0.0.0
83 */
84static int ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
85 bool sk2_ipv6only, bool match_wildcard)
86{
87 if (!sk2_ipv6only) {
88 if (sk1_rcv_saddr == sk2_rcv_saddr)
89 return 1;
90 if (!sk1_rcv_saddr || !sk2_rcv_saddr)
91 return match_wildcard;
92 }
93 return 0;
94}
95
96int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
97 bool match_wildcard)
98{
99#if IS_ENABLED(CONFIG_IPV6)
100 if (sk->sk_family == AF_INET6)
101 return ipv6_rcv_saddr_equal(&sk->sk_v6_rcv_saddr,
102 inet6_rcv_saddr(sk2),
103 sk->sk_rcv_saddr,
104 sk2->sk_rcv_saddr,
105 ipv6_only_sock(sk),
106 ipv6_only_sock(sk2),
107 match_wildcard);
108#endif
109 return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr,
110 ipv6_only_sock(sk2), match_wildcard);
111}
112EXPORT_SYMBOL(inet_rcv_saddr_equal);
113
34void inet_get_local_port_range(struct net *net, int *low, int *high) 114void inet_get_local_port_range(struct net *net, int *low, int *high)
35{ 115{
36 unsigned int seq; 116 unsigned int seq;
@@ -44,12 +124,13 @@ void inet_get_local_port_range(struct net *net, int *low, int *high)
44} 124}
45EXPORT_SYMBOL(inet_get_local_port_range); 125EXPORT_SYMBOL(inet_get_local_port_range);
46 126
47int inet_csk_bind_conflict(const struct sock *sk, 127static int inet_csk_bind_conflict(const struct sock *sk,
48 const struct inet_bind_bucket *tb, bool relax) 128 const struct inet_bind_bucket *tb,
129 bool relax, bool reuseport_ok)
49{ 130{
50 struct sock *sk2; 131 struct sock *sk2;
51 int reuse = sk->sk_reuse; 132 bool reuse = sk->sk_reuse;
52 int reuseport = sk->sk_reuseport; 133 bool reuseport = !!sk->sk_reuseport && reuseport_ok;
53 kuid_t uid = sock_i_uid((struct sock *)sk); 134 kuid_t uid = sock_i_uid((struct sock *)sk);
54 135
55 /* 136 /*
@@ -61,7 +142,6 @@ int inet_csk_bind_conflict(const struct sock *sk,
61 142
62 sk_for_each_bound(sk2, &tb->owners) { 143 sk_for_each_bound(sk2, &tb->owners) {
63 if (sk != sk2 && 144 if (sk != sk2 &&
64 !inet_v6_ipv6only(sk2) &&
65 (!sk->sk_bound_dev_if || 145 (!sk->sk_bound_dev_if ||
66 !sk2->sk_bound_dev_if || 146 !sk2->sk_bound_dev_if ||
67 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { 147 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
@@ -71,53 +151,34 @@ int inet_csk_bind_conflict(const struct sock *sk,
71 rcu_access_pointer(sk->sk_reuseport_cb) || 151 rcu_access_pointer(sk->sk_reuseport_cb) ||
72 (sk2->sk_state != TCP_TIME_WAIT && 152 (sk2->sk_state != TCP_TIME_WAIT &&
73 !uid_eq(uid, sock_i_uid(sk2))))) { 153 !uid_eq(uid, sock_i_uid(sk2))))) {
74 154 if (inet_rcv_saddr_equal(sk, sk2, true))
75 if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
76 sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
77 break; 155 break;
78 } 156 }
79 if (!relax && reuse && sk2->sk_reuse && 157 if (!relax && reuse && sk2->sk_reuse &&
80 sk2->sk_state != TCP_LISTEN) { 158 sk2->sk_state != TCP_LISTEN) {
81 159 if (inet_rcv_saddr_equal(sk, sk2, true))
82 if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
83 sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
84 break; 160 break;
85 } 161 }
86 } 162 }
87 } 163 }
88 return sk2 != NULL; 164 return sk2 != NULL;
89} 165}
90EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
91 166
92/* Obtain a reference to a local port for the given sock, 167/*
93 * if snum is zero it means select any available local port. 168 * Find an open port number for the socket. Returns with the
94 * We try to allocate an odd port (and leave even ports for connect()) 169 * inet_bind_hashbucket lock held.
95 */ 170 */
96int inet_csk_get_port(struct sock *sk, unsigned short snum) 171static struct inet_bind_hashbucket *
172inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *port_ret)
97{ 173{
98 bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
99 struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; 174 struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
100 int ret = 1, attempts = 5, port = snum; 175 int port = 0;
101 int smallest_size = -1, smallest_port;
102 struct inet_bind_hashbucket *head; 176 struct inet_bind_hashbucket *head;
103 struct net *net = sock_net(sk); 177 struct net *net = sock_net(sk);
104 int i, low, high, attempt_half; 178 int i, low, high, attempt_half;
105 struct inet_bind_bucket *tb; 179 struct inet_bind_bucket *tb;
106 kuid_t uid = sock_i_uid(sk);
107 u32 remaining, offset; 180 u32 remaining, offset;
108 181
109 if (port) {
110have_port:
111 head = &hinfo->bhash[inet_bhashfn(net, port,
112 hinfo->bhash_size)];
113 spin_lock_bh(&head->lock);
114 inet_bind_bucket_for_each(tb, &head->chain)
115 if (net_eq(ib_net(tb), net) && tb->port == port)
116 goto tb_found;
117
118 goto tb_not_found;
119 }
120again:
121 attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; 182 attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
122other_half_scan: 183other_half_scan:
123 inet_get_local_port_range(net, &low, &high); 184 inet_get_local_port_range(net, &low, &high);
@@ -141,8 +202,6 @@ other_half_scan:
141 * We do the opposite to not pollute connect() users. 202 * We do the opposite to not pollute connect() users.
142 */ 203 */
143 offset |= 1U; 204 offset |= 1U;
144 smallest_size = -1;
145 smallest_port = low; /* avoid compiler warning */
146 205
147other_parity_scan: 206other_parity_scan:
148 port = low + offset; 207 port = low + offset;
@@ -156,29 +215,17 @@ other_parity_scan:
156 spin_lock_bh(&head->lock); 215 spin_lock_bh(&head->lock);
157 inet_bind_bucket_for_each(tb, &head->chain) 216 inet_bind_bucket_for_each(tb, &head->chain)
158 if (net_eq(ib_net(tb), net) && tb->port == port) { 217 if (net_eq(ib_net(tb), net) && tb->port == port) {
159 if (((tb->fastreuse > 0 && reuse) || 218 if (!inet_csk_bind_conflict(sk, tb, false, false))
160 (tb->fastreuseport > 0 && 219 goto success;
161 sk->sk_reuseport &&
162 !rcu_access_pointer(sk->sk_reuseport_cb) &&
163 uid_eq(tb->fastuid, uid))) &&
164 (tb->num_owners < smallest_size || smallest_size == -1)) {
165 smallest_size = tb->num_owners;
166 smallest_port = port;
167 }
168 if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false))
169 goto tb_found;
170 goto next_port; 220 goto next_port;
171 } 221 }
172 goto tb_not_found; 222 tb = NULL;
223 goto success;
173next_port: 224next_port:
174 spin_unlock_bh(&head->lock); 225 spin_unlock_bh(&head->lock);
175 cond_resched(); 226 cond_resched();
176 } 227 }
177 228
178 if (smallest_size != -1) {
179 port = smallest_port;
180 goto have_port;
181 }
182 offset--; 229 offset--;
183 if (!(offset & 1)) 230 if (!(offset & 1))
184 goto other_parity_scan; 231 goto other_parity_scan;
@@ -188,8 +235,74 @@ next_port:
188 attempt_half = 2; 235 attempt_half = 2;
189 goto other_half_scan; 236 goto other_half_scan;
190 } 237 }
191 return ret; 238 return NULL;
239success:
240 *port_ret = port;
241 *tb_ret = tb;
242 return head;
243}
192 244
245static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
246 struct sock *sk)
247{
248 kuid_t uid = sock_i_uid(sk);
249
250 if (tb->fastreuseport <= 0)
251 return 0;
252 if (!sk->sk_reuseport)
253 return 0;
254 if (rcu_access_pointer(sk->sk_reuseport_cb))
255 return 0;
256 if (!uid_eq(tb->fastuid, uid))
257 return 0;
258 /* We only need to check the rcv_saddr if this tb was once marked
259 * without fastreuseport and then was reset, as we can only know that
260 * the fast_*rcv_saddr doesn't have any conflicts with the socks on the
261 * owners list.
262 */
263 if (tb->fastreuseport == FASTREUSEPORT_ANY)
264 return 1;
265#if IS_ENABLED(CONFIG_IPV6)
266 if (tb->fast_sk_family == AF_INET6)
267 return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr,
268 &sk->sk_v6_rcv_saddr,
269 tb->fast_rcv_saddr,
270 sk->sk_rcv_saddr,
271 tb->fast_ipv6_only,
272 ipv6_only_sock(sk), true);
273#endif
274 return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr,
275 ipv6_only_sock(sk), true);
276}
277
278/* Obtain a reference to a local port for the given sock,
279 * if snum is zero it means select any available local port.
280 * We try to allocate an odd port (and leave even ports for connect())
281 */
282int inet_csk_get_port(struct sock *sk, unsigned short snum)
283{
284 bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
285 struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
286 int ret = 1, port = snum;
287 struct inet_bind_hashbucket *head;
288 struct net *net = sock_net(sk);
289 struct inet_bind_bucket *tb = NULL;
290 kuid_t uid = sock_i_uid(sk);
291
292 if (!port) {
293 head = inet_csk_find_open_port(sk, &tb, &port);
294 if (!head)
295 return ret;
296 if (!tb)
297 goto tb_not_found;
298 goto success;
299 }
300 head = &hinfo->bhash[inet_bhashfn(net, port,
301 hinfo->bhash_size)];
302 spin_lock_bh(&head->lock);
303 inet_bind_bucket_for_each(tb, &head->chain)
304 if (net_eq(ib_net(tb), net) && tb->port == port)
305 goto tb_found;
193tb_not_found: 306tb_not_found:
194 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, 307 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
195 net, head, port); 308 net, head, port);
@@ -200,38 +313,54 @@ tb_found:
200 if (sk->sk_reuse == SK_FORCE_REUSE) 313 if (sk->sk_reuse == SK_FORCE_REUSE)
201 goto success; 314 goto success;
202 315
203 if (((tb->fastreuse > 0 && reuse) || 316 if ((tb->fastreuse > 0 && reuse) ||
204 (tb->fastreuseport > 0 && 317 sk_reuseport_match(tb, sk))
205 !rcu_access_pointer(sk->sk_reuseport_cb) &&
206 sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
207 smallest_size == -1)
208 goto success; 318 goto success;
209 if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { 319 if (inet_csk_bind_conflict(sk, tb, true, true))
210 if ((reuse ||
211 (tb->fastreuseport > 0 &&
212 sk->sk_reuseport &&
213 !rcu_access_pointer(sk->sk_reuseport_cb) &&
214 uid_eq(tb->fastuid, uid))) &&
215 smallest_size != -1 && --attempts >= 0) {
216 spin_unlock_bh(&head->lock);
217 goto again;
218 }
219 goto fail_unlock; 320 goto fail_unlock;
321 }
322success:
323 if (!hlist_empty(&tb->owners)) {
324 tb->fastreuse = reuse;
325 if (sk->sk_reuseport) {
326 tb->fastreuseport = FASTREUSEPORT_ANY;
327 tb->fastuid = uid;
328 tb->fast_rcv_saddr = sk->sk_rcv_saddr;
329 tb->fast_ipv6_only = ipv6_only_sock(sk);
330#if IS_ENABLED(CONFIG_IPV6)
331 tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
332#endif
333 } else {
334 tb->fastreuseport = 0;
220 } 335 }
336 } else {
221 if (!reuse) 337 if (!reuse)
222 tb->fastreuse = 0; 338 tb->fastreuse = 0;
223 if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
224 tb->fastreuseport = 0;
225 } else {
226 tb->fastreuse = reuse;
227 if (sk->sk_reuseport) { 339 if (sk->sk_reuseport) {
228 tb->fastreuseport = 1; 340 /* We didn't match or we don't have fastreuseport set on
229 tb->fastuid = uid; 341 * the tb, but we have sk_reuseport set on this socket
342 * and we know that there are no bind conflicts with
343 * this socket in this tb, so reset our tb's reuseport
344 * settings so that any subsequent sockets that match
345 * our current socket will be put on the fast path.
346 *
347 * If we reset we need to set FASTREUSEPORT_STRICT so we
348 * do extra checking for all subsequent sk_reuseport
349 * socks.
350 */
351 if (!sk_reuseport_match(tb, sk)) {
352 tb->fastreuseport = FASTREUSEPORT_STRICT;
353 tb->fastuid = uid;
354 tb->fast_rcv_saddr = sk->sk_rcv_saddr;
355 tb->fast_ipv6_only = ipv6_only_sock(sk);
356#if IS_ENABLED(CONFIG_IPV6)
357 tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
358#endif
359 }
230 } else { 360 } else {
231 tb->fastreuseport = 0; 361 tb->fastreuseport = 0;
232 } 362 }
233 } 363 }
234success:
235 if (!inet_csk(sk)->icsk_bind_hash) 364 if (!inet_csk(sk)->icsk_bind_hash)
236 inet_bind_hash(sk, tb, port); 365 inet_bind_hash(sk, tb, port);
237 WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); 366 WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
@@ -295,7 +424,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
295/* 424/*
296 * This will accept the next outstanding connection. 425 * This will accept the next outstanding connection.
297 */ 426 */
298struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) 427struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
299{ 428{
300 struct inet_connection_sock *icsk = inet_csk(sk); 429 struct inet_connection_sock *icsk = inet_csk(sk);
301 struct request_sock_queue *queue = &icsk->icsk_accept_queue; 430 struct request_sock_queue *queue = &icsk->icsk_accept_queue;
@@ -415,7 +544,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
415 sk->sk_protocol, inet_sk_flowi_flags(sk), 544 sk->sk_protocol, inet_sk_flowi_flags(sk),
416 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, 545 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
417 ireq->ir_loc_addr, ireq->ir_rmt_port, 546 ireq->ir_loc_addr, ireq->ir_rmt_port,
418 htons(ireq->ir_num)); 547 htons(ireq->ir_num), sk->sk_uid);
419 security_req_classify_flow(req, flowi4_to_flowi(fl4)); 548 security_req_classify_flow(req, flowi4_to_flowi(fl4));
420 rt = ip_route_output_flow(net, fl4, sk); 549 rt = ip_route_output_flow(net, fl4, sk);
421 if (IS_ERR(rt)) 550 if (IS_ERR(rt))
@@ -452,7 +581,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
452 sk->sk_protocol, inet_sk_flowi_flags(sk), 581 sk->sk_protocol, inet_sk_flowi_flags(sk),
453 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, 582 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
454 ireq->ir_loc_addr, ireq->ir_rmt_port, 583 ireq->ir_loc_addr, ireq->ir_rmt_port,
455 htons(ireq->ir_num)); 584 htons(ireq->ir_num), sk->sk_uid);
456 security_req_classify_flow(req, flowi4_to_flowi(fl4)); 585 security_req_classify_flow(req, flowi4_to_flowi(fl4));
457 rt = ip_route_output_flow(net, fl4, sk); 586 rt = ip_route_output_flow(net, fl4, sk);
458 if (IS_ERR(rt)) 587 if (IS_ERR(rt))
@@ -707,9 +836,8 @@ void inet_csk_destroy_sock(struct sock *sk)
707 836
708 sk_refcnt_debug_release(sk); 837 sk_refcnt_debug_release(sk);
709 838
710 local_bh_disable();
711 percpu_counter_dec(sk->sk_prot->orphan_count); 839 percpu_counter_dec(sk->sk_prot->orphan_count);
712 local_bh_enable(); 840
713 sock_put(sk); 841 sock_put(sk);
714} 842}
715EXPORT_SYMBOL(inet_csk_destroy_sock); 843EXPORT_SYMBOL(inet_csk_destroy_sock);
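The inet_connection_sock.c rework above splits ephemeral-port selection out of inet_csk_get_port(), moves the rcv_saddr comparisons into the shared inet_rcv_saddr_equal() helpers, and caches enough per-bucket state (FASTREUSEPORT_ANY vs FASTREUSEPORT_STRICT) to keep SO_REUSEPORT binds on a fast path. A small illustrative helper mirroring the fast-path test used at tb_found; the names come from the hunks, the helper itself is hypothetical:

static bool sketch_can_skip_conflict_scan(struct inet_bind_bucket *tb,
					  struct sock *sk, bool reuse)
{
	/* SO_REUSEADDR fast path: the bucket's current owners are all
	 * reuse-compatible */
	if (tb->fastreuse > 0 && reuse)
		return true;
	/* SO_REUSEPORT fast path: same uid, and, once the bucket has been
	 * demoted to FASTREUSEPORT_STRICT, the same rcv_saddr as well */
	return sk_reuseport_match(tb, sk);
}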
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e4d16fc5bbb3..3828b3a805cd 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -200,13 +200,22 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
200 if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO)) 200 if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
201 goto errout; 201 goto errout;
202 202
203 /*
204 * RAW sockets might have user-defined protocols assigned,
205 * so report the one supplied on socket creation.
206 */
207 if (sk->sk_type == SOCK_RAW) {
208 if (nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))
209 goto errout;
210 }
211
203 if (!icsk) { 212 if (!icsk) {
204 handler->idiag_get_info(sk, r, NULL); 213 handler->idiag_get_info(sk, r, NULL);
205 goto out; 214 goto out;
206 } 215 }
207 216
208 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 217 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
209 icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 218 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
210 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 219 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
211 r->idiag_timer = 1; 220 r->idiag_timer = 1;
212 r->idiag_retrans = icsk->icsk_retransmits; 221 r->idiag_retrans = icsk->icsk_retransmits;
@@ -852,10 +861,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
852 struct netlink_callback *cb, 861 struct netlink_callback *cb,
853 const struct inet_diag_req_v2 *r, struct nlattr *bc) 862 const struct inet_diag_req_v2 *r, struct nlattr *bc)
854{ 863{
864 bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
855 struct net *net = sock_net(skb->sk); 865 struct net *net = sock_net(skb->sk);
856 int i, num, s_i, s_num;
857 u32 idiag_states = r->idiag_states; 866 u32 idiag_states = r->idiag_states;
858 bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); 867 int i, num, s_i, s_num;
868 struct sock *sk;
859 869
860 if (idiag_states & TCPF_SYN_RECV) 870 if (idiag_states & TCPF_SYN_RECV)
861 idiag_states |= TCPF_NEW_SYN_RECV; 871 idiag_states |= TCPF_NEW_SYN_RECV;
@@ -863,16 +873,15 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
863 s_num = num = cb->args[2]; 873 s_num = num = cb->args[2];
864 874
865 if (cb->args[0] == 0) { 875 if (cb->args[0] == 0) {
866 if (!(idiag_states & TCPF_LISTEN)) 876 if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport)
867 goto skip_listen_ht; 877 goto skip_listen_ht;
868 878
869 for (i = s_i; i < INET_LHTABLE_SIZE; i++) { 879 for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
870 struct inet_listen_hashbucket *ilb; 880 struct inet_listen_hashbucket *ilb;
871 struct sock *sk;
872 881
873 num = 0; 882 num = 0;
874 ilb = &hashinfo->listening_hash[i]; 883 ilb = &hashinfo->listening_hash[i];
875 spin_lock_bh(&ilb->lock); 884 spin_lock(&ilb->lock);
876 sk_for_each(sk, &ilb->head) { 885 sk_for_each(sk, &ilb->head) {
877 struct inet_sock *inet = inet_sk(sk); 886 struct inet_sock *inet = inet_sk(sk);
878 887
@@ -892,26 +901,18 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
892 r->id.idiag_sport) 901 r->id.idiag_sport)
893 goto next_listen; 902 goto next_listen;
894 903
895 if (r->id.idiag_dport ||
896 cb->args[3] > 0)
897 goto next_listen;
898
899 if (inet_csk_diag_dump(sk, skb, cb, r, 904 if (inet_csk_diag_dump(sk, skb, cb, r,
900 bc, net_admin) < 0) { 905 bc, net_admin) < 0) {
901 spin_unlock_bh(&ilb->lock); 906 spin_unlock(&ilb->lock);
902 goto done; 907 goto done;
903 } 908 }
904 909
905next_listen: 910next_listen:
906 cb->args[3] = 0;
907 cb->args[4] = 0;
908 ++num; 911 ++num;
909 } 912 }
910 spin_unlock_bh(&ilb->lock); 913 spin_unlock(&ilb->lock);
911 914
912 s_num = 0; 915 s_num = 0;
913 cb->args[3] = 0;
914 cb->args[4] = 0;
915 } 916 }
916skip_listen_ht: 917skip_listen_ht:
917 cb->args[0] = 1; 918 cb->args[0] = 1;
@@ -921,13 +922,14 @@ skip_listen_ht:
921 if (!(idiag_states & ~TCPF_LISTEN)) 922 if (!(idiag_states & ~TCPF_LISTEN))
922 goto out; 923 goto out;
923 924
925#define SKARR_SZ 16
924 for (i = s_i; i <= hashinfo->ehash_mask; i++) { 926 for (i = s_i; i <= hashinfo->ehash_mask; i++) {
925 struct inet_ehash_bucket *head = &hashinfo->ehash[i]; 927 struct inet_ehash_bucket *head = &hashinfo->ehash[i];
926 spinlock_t *lock = inet_ehash_lockp(hashinfo, i); 928 spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
927 struct hlist_nulls_node *node; 929 struct hlist_nulls_node *node;
928 struct sock *sk; 930 struct sock *sk_arr[SKARR_SZ];
929 931 int num_arr[SKARR_SZ];
930 num = 0; 932 int idx, accum, res;
931 933
932 if (hlist_nulls_empty(&head->chain)) 934 if (hlist_nulls_empty(&head->chain))
933 continue; 935 continue;
@@ -935,9 +937,12 @@ skip_listen_ht:
935 if (i > s_i) 937 if (i > s_i)
936 s_num = 0; 938 s_num = 0;
937 939
940next_chunk:
941 num = 0;
942 accum = 0;
938 spin_lock_bh(lock); 943 spin_lock_bh(lock);
939 sk_nulls_for_each(sk, node, &head->chain) { 944 sk_nulls_for_each(sk, node, &head->chain) {
940 int state, res; 945 int state;
941 946
942 if (!net_eq(sock_net(sk), net)) 947 if (!net_eq(sock_net(sk), net))
943 continue; 948 continue;
@@ -961,21 +966,35 @@ skip_listen_ht:
961 if (!inet_diag_bc_sk(bc, sk)) 966 if (!inet_diag_bc_sk(bc, sk))
962 goto next_normal; 967 goto next_normal;
963 968
964 res = sk_diag_fill(sk, skb, r, 969 sock_hold(sk);
970 num_arr[accum] = num;
971 sk_arr[accum] = sk;
972 if (++accum == SKARR_SZ)
973 break;
974next_normal:
975 ++num;
976 }
977 spin_unlock_bh(lock);
978 res = 0;
979 for (idx = 0; idx < accum; idx++) {
980 if (res >= 0) {
981 res = sk_diag_fill(sk_arr[idx], skb, r,
965 sk_user_ns(NETLINK_CB(cb->skb).sk), 982 sk_user_ns(NETLINK_CB(cb->skb).sk),
966 NETLINK_CB(cb->skb).portid, 983 NETLINK_CB(cb->skb).portid,
967 cb->nlh->nlmsg_seq, NLM_F_MULTI, 984 cb->nlh->nlmsg_seq, NLM_F_MULTI,
968 cb->nlh, net_admin); 985 cb->nlh, net_admin);
969 if (res < 0) { 986 if (res < 0)
970 spin_unlock_bh(lock); 987 num = num_arr[idx];
971 goto done;
972 } 988 }
973next_normal: 989 sock_gen_put(sk_arr[idx]);
974 ++num;
975 } 990 }
976 991 if (res < 0)
977 spin_unlock_bh(lock); 992 break;
978 cond_resched(); 993 cond_resched();
994 if (accum == SKARR_SZ) {
995 s_num = num + 1;
996 goto next_chunk;
997 }
979 } 998 }
980 999
981done: 1000done:
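The inet_diag_dump_icsk() rework above stops calling sk_diag_fill() with the ehash bucket lock held: up to SKARR_SZ sockets are collected under spin_lock_bh() (each with a reference taken), the lock is dropped, the batch is dumped, and the walk resumes at the saved position. A standalone sketch of that batching pattern, using a plain mutex and made-up entry/process names:

#include <pthread.h>
#include <stdio.h>

#define BATCH_SZ 16	/* analogous to SKARR_SZ */

struct entry {
	struct entry *next;
	int id;
};

static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry *bucket_head;

/* Work that must not run under the bucket lock (sk_diag_fill() builds a
 * netlink message and may block).
 */
static void process_entry(struct entry *e)
{
	printf("dumping entry %d\n", e->id);
}

static void walk_bucket(void)
{
	struct entry *batch[BATCH_SZ];
	int start = 0;			/* resume index, like num/cb->args[] */

	for (;;) {
		int n = 0, pos = 0, i;
		struct entry *e;

		pthread_mutex_lock(&bucket_lock);
		for (e = bucket_head; e; e = e->next, pos++) {
			if (pos < start)
				continue;	/* already dumped in a previous chunk */
			batch[n++] = e;		/* the kernel also grabs a refcount here */
			if (n == BATCH_SZ)
				break;
		}
		pthread_mutex_unlock(&bucket_lock);

		for (i = 0; i < n; i++)
			process_entry(batch[i]);

		if (n < BATCH_SZ)
			break;			/* whole chain visited */
		start = pos + 1;		/* continue right after the last batched entry */
	}
}

int main(void)
{
	struct entry a = { .id = 1 }, b = { .id = 2, .next = &a };

	bucket_head = &b;
	walk_bucket();
	return 0;
}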
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ca97835bfec4..8bea74298173 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -73,7 +73,6 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
73 tb->port = snum; 73 tb->port = snum;
74 tb->fastreuse = 0; 74 tb->fastreuse = 0;
75 tb->fastreuseport = 0; 75 tb->fastreuseport = 0;
76 tb->num_owners = 0;
77 INIT_HLIST_HEAD(&tb->owners); 76 INIT_HLIST_HEAD(&tb->owners);
78 hlist_add_head(&tb->node, &head->chain); 77 hlist_add_head(&tb->node, &head->chain);
79 } 78 }
@@ -96,7 +95,6 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
96{ 95{
97 inet_sk(sk)->inet_num = snum; 96 inet_sk(sk)->inet_num = snum;
98 sk_add_bind_node(sk, &tb->owners); 97 sk_add_bind_node(sk, &tb->owners);
99 tb->num_owners++;
100 inet_csk(sk)->icsk_bind_hash = tb; 98 inet_csk(sk)->icsk_bind_hash = tb;
101} 99}
102 100
@@ -114,7 +112,6 @@ static void __inet_put_port(struct sock *sk)
114 spin_lock(&head->lock); 112 spin_lock(&head->lock);
115 tb = inet_csk(sk)->icsk_bind_hash; 113 tb = inet_csk(sk)->icsk_bind_hash;
116 __sk_del_bind_node(sk); 114 __sk_del_bind_node(sk);
117 tb->num_owners--;
118 inet_csk(sk)->icsk_bind_hash = NULL; 115 inet_csk(sk)->icsk_bind_hash = NULL;
119 inet_sk(sk)->inet_num = 0; 116 inet_sk(sk)->inet_num = 0;
120 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); 117 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
@@ -435,10 +432,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
435EXPORT_SYMBOL_GPL(inet_ehash_nolisten); 432EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
436 433
437static int inet_reuseport_add_sock(struct sock *sk, 434static int inet_reuseport_add_sock(struct sock *sk,
438 struct inet_listen_hashbucket *ilb, 435 struct inet_listen_hashbucket *ilb)
439 int (*saddr_same)(const struct sock *sk1,
440 const struct sock *sk2,
441 bool match_wildcard))
442{ 436{
443 struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; 437 struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
444 struct sock *sk2; 438 struct sock *sk2;
@@ -451,7 +445,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
451 sk2->sk_bound_dev_if == sk->sk_bound_dev_if && 445 sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
452 inet_csk(sk2)->icsk_bind_hash == tb && 446 inet_csk(sk2)->icsk_bind_hash == tb &&
453 sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && 447 sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
454 saddr_same(sk, sk2, false)) 448 inet_rcv_saddr_equal(sk, sk2, false))
455 return reuseport_add_sock(sk, sk2); 449 return reuseport_add_sock(sk, sk2);
456 } 450 }
457 451
@@ -461,10 +455,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
461 return 0; 455 return 0;
462} 456}
463 457
464int __inet_hash(struct sock *sk, struct sock *osk, 458int __inet_hash(struct sock *sk, struct sock *osk)
465 int (*saddr_same)(const struct sock *sk1,
466 const struct sock *sk2,
467 bool match_wildcard))
468{ 459{
469 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 460 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
470 struct inet_listen_hashbucket *ilb; 461 struct inet_listen_hashbucket *ilb;
@@ -479,7 +470,7 @@ int __inet_hash(struct sock *sk, struct sock *osk,
479 470
480 spin_lock(&ilb->lock); 471 spin_lock(&ilb->lock);
481 if (sk->sk_reuseport) { 472 if (sk->sk_reuseport) {
482 err = inet_reuseport_add_sock(sk, ilb, saddr_same); 473 err = inet_reuseport_add_sock(sk, ilb);
483 if (err) 474 if (err)
484 goto unlock; 475 goto unlock;
485 } 476 }
@@ -503,7 +494,7 @@ int inet_hash(struct sock *sk)
503 494
504 if (sk->sk_state != TCP_CLOSE) { 495 if (sk->sk_state != TCP_CLOSE) {
505 local_bh_disable(); 496 local_bh_disable();
506 err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal); 497 err = __inet_hash(sk, NULL);
507 local_bh_enable(); 498 local_bh_enable();
508 } 499 }
509 500
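With the address comparison folded into a family-aware inet_rcv_saddr_equal(), __inet_hash() no longer needs the saddr_same callback. The loop in inet_reuseport_add_sock() amounts to the membership test below (illustrative types, not kernel structures): a new listener joins an existing SO_REUSEPORT group only when it is bound to the same device and local address and is owned by the same user.

#include <stdbool.h>
#include <stdint.h>

struct listener {
	int      bound_dev_if;
	uint32_t uid;
	bool     reuseport;
	uint32_t rcv_saddr;	/* 0 means wildcard */
};

/* Roughly the per-candidate check done while walking tb->owners; the last
 * comparison stands in for inet_rcv_saddr_equal(sk, sk2, false).
 */
static bool may_join_group(const struct listener *nw, const struct listener *old)
{
	return nw->reuseport && old->reuseport &&
	       nw->bound_dev_if == old->bound_dev_if &&
	       nw->uid == old->uid &&
	       nw->rcv_saddr == old->rcv_saddr;
}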
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index ddcd56c08d14..f8aff2c71cde 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -257,8 +257,7 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
257} 257}
258EXPORT_SYMBOL_GPL(__inet_twsk_schedule); 258EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
259 259
260void inet_twsk_purge(struct inet_hashinfo *hashinfo, 260void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family)
261 struct inet_timewait_death_row *twdr, int family)
262{ 261{
263 struct inet_timewait_sock *tw; 262 struct inet_timewait_sock *tw;
264 struct sock *sk; 263 struct sock *sk;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index bbe7f72db9c1..b3cdeec85f1f 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -198,6 +198,7 @@ static void ip_expire(unsigned long arg)
198 qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); 198 qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
199 net = container_of(qp->q.net, struct net, ipv4.frags); 199 net = container_of(qp->q.net, struct net, ipv4.frags);
200 200
201 rcu_read_lock();
201 spin_lock(&qp->q.lock); 202 spin_lock(&qp->q.lock);
202 203
203 if (qp->q.flags & INET_FRAG_COMPLETE) 204 if (qp->q.flags & INET_FRAG_COMPLETE)
@@ -207,7 +208,7 @@ static void ip_expire(unsigned long arg)
207 __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); 208 __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
208 209
209 if (!inet_frag_evicting(&qp->q)) { 210 if (!inet_frag_evicting(&qp->q)) {
210 struct sk_buff *head = qp->q.fragments; 211 struct sk_buff *clone, *head = qp->q.fragments;
211 const struct iphdr *iph; 212 const struct iphdr *iph;
212 int err; 213 int err;
213 214
@@ -216,32 +217,40 @@ static void ip_expire(unsigned long arg)
216 if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments) 217 if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
217 goto out; 218 goto out;
218 219
219 rcu_read_lock();
220 head->dev = dev_get_by_index_rcu(net, qp->iif); 220 head->dev = dev_get_by_index_rcu(net, qp->iif);
221 if (!head->dev) 221 if (!head->dev)
222 goto out_rcu_unlock; 222 goto out;
223
223 224
 223 224
 224 /* skb has no dst, perform route lookup again */ 225 /* skb has no dst, perform route lookup again */
225 iph = ip_hdr(head); 226 iph = ip_hdr(head);
226 err = ip_route_input_noref(head, iph->daddr, iph->saddr, 227 err = ip_route_input_noref(head, iph->daddr, iph->saddr,
227 iph->tos, head->dev); 228 iph->tos, head->dev);
228 if (err) 229 if (err)
229 goto out_rcu_unlock; 230 goto out;
230 231
231 /* Only an end host needs to send an ICMP 232 /* Only an end host needs to send an ICMP
232 * "Fragment Reassembly Timeout" message, per RFC792. 233 * "Fragment Reassembly Timeout" message, per RFC792.
233 */ 234 */
234 if (frag_expire_skip_icmp(qp->user) && 235 if (frag_expire_skip_icmp(qp->user) &&
235 (skb_rtable(head)->rt_type != RTN_LOCAL)) 236 (skb_rtable(head)->rt_type != RTN_LOCAL))
236 goto out_rcu_unlock; 237 goto out;
238
239 clone = skb_clone(head, GFP_ATOMIC);
237 240
238 /* Send an ICMP "Fragment Reassembly Timeout" message. */ 241 /* Send an ICMP "Fragment Reassembly Timeout" message. */
239 icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); 242 if (clone) {
240out_rcu_unlock: 243 spin_unlock(&qp->q.lock);
241 rcu_read_unlock(); 244 icmp_send(clone, ICMP_TIME_EXCEEDED,
245 ICMP_EXC_FRAGTIME, 0);
246 consume_skb(clone);
247 goto out_rcu_unlock;
248 }
242 } 249 }
243out: 250out:
244 spin_unlock(&qp->q.lock); 251 spin_unlock(&qp->q.lock);
252out_rcu_unlock:
253 rcu_read_unlock();
245 ipq_put(qp); 254 ipq_put(qp);
246} 255}
247 256
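The reworked ip_expire() above holds rcu_read_lock() for the whole handler and no longer calls icmp_send() under qp->q.lock: the head fragment is cloned while the lock is held, the lock is dropped, and the ICMP "Fragment Reassembly Timeout" is sent from the clone, which is then consumed. A standalone sketch of that copy-then-unlock-then-act shape (all names below are made up):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct frag_queue {
	pthread_spinlock_t lock;
	int complete;
	char head[64];		/* stands in for the first fragment's skb */
};

static void timeout_report(struct frag_queue *q)
{
	char *copy = NULL;

	pthread_spin_lock(&q->lock);
	if (!q->complete) {
		copy = malloc(sizeof(q->head));	/* like skb_clone(head, GFP_ATOMIC) */
		if (copy)
			memcpy(copy, q->head, sizeof(q->head));
	}
	pthread_spin_unlock(&q->lock);

	if (copy) {
		/* the slow path runs without the queue lock, as icmp_send() now does */
		printf("reassembly timeout for \"%s\"\n", copy);
		free(copy);			/* like consume_skb(clone) */
	}
}

int main(void)
{
	struct frag_queue q = { .complete = 0 };

	pthread_spin_init(&q.lock, PTHREAD_PROCESS_PRIVATE);
	strcpy(q.head, "datagram 42");
	timeout_report(&q);
	pthread_spin_destroy(&q.lock);
	return 0;
}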
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 576f705d8180..c9c1cb635d9a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -17,7 +17,7 @@
17#include <linux/types.h> 17#include <linux/types.h>
18#include <linux/kernel.h> 18#include <linux/kernel.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <asm/uaccess.h> 20#include <linux/uaccess.h>
21#include <linux/skbuff.h> 21#include <linux/skbuff.h>
22#include <linux/netdevice.h> 22#include <linux/netdevice.h>
23#include <linux/in.h> 23#include <linux/in.h>
@@ -113,8 +113,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
113static struct rtnl_link_ops ipgre_link_ops __read_mostly; 113static struct rtnl_link_ops ipgre_link_ops __read_mostly;
114static int ipgre_tunnel_init(struct net_device *dev); 114static int ipgre_tunnel_init(struct net_device *dev);
115 115
116static int ipgre_net_id __read_mostly; 116static unsigned int ipgre_net_id __read_mostly;
117static int gre_tap_net_id __read_mostly; 117static unsigned int gre_tap_net_id __read_mostly;
118 118
119static void ipgre_err(struct sk_buff *skb, u32 info, 119static void ipgre_err(struct sk_buff *skb, u32 info,
120 const struct tnl_ptk_info *tpi) 120 const struct tnl_ptk_info *tpi)
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 4d158ff1def1..93157f2f4758 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -15,7 +15,7 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/types.h> 17#include <linux/types.h>
18#include <asm/uaccess.h> 18#include <linux/uaccess.h>
19#include <asm/unaligned.h> 19#include <asm/unaligned.h>
20#include <linux/skbuff.h> 20#include <linux/skbuff.h>
21#include <linux/ip.h> 21#include <linux/ip.h>
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 877bdb02e887..7a3fd25e8913 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -42,7 +42,7 @@
42 * Hirokazu Takahashi: sendfile() on UDP works now. 42 * Hirokazu Takahashi: sendfile() on UDP works now.
43 */ 43 */
44 44
45#include <asm/uaccess.h> 45#include <linux/uaccess.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/types.h> 47#include <linux/types.h>
48#include <linux/kernel.h> 48#include <linux/kernel.h>
@@ -74,6 +74,7 @@
74#include <net/checksum.h> 74#include <net/checksum.h>
75#include <net/inetpeer.h> 75#include <net/inetpeer.h>
76#include <net/lwtunnel.h> 76#include <net/lwtunnel.h>
77#include <linux/bpf-cgroup.h>
77#include <linux/igmp.h> 78#include <linux/igmp.h>
78#include <linux/netfilter_ipv4.h> 79#include <linux/netfilter_ipv4.h>
79#include <linux/netfilter_bridge.h> 80#include <linux/netfilter_bridge.h>
@@ -221,7 +222,10 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
221 if (unlikely(!neigh)) 222 if (unlikely(!neigh))
222 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); 223 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
223 if (!IS_ERR(neigh)) { 224 if (!IS_ERR(neigh)) {
224 int res = dst_neigh_output(dst, neigh, skb); 225 int res;
226
227 sock_confirm_neigh(skb, neigh);
228 res = neigh_output(neigh, skb);
225 229
226 rcu_read_unlock_bh(); 230 rcu_read_unlock_bh();
227 return res; 231 return res;
@@ -287,6 +291,13 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
287static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 291static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
288{ 292{
289 unsigned int mtu; 293 unsigned int mtu;
294 int ret;
295
296 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
297 if (ret) {
298 kfree_skb(skb);
299 return ret;
300 }
290 301
291#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) 302#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
292 /* Policy lookup after SNAT yielded a new policy */ 303 /* Policy lookup after SNAT yielded a new policy */
@@ -305,6 +316,20 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
305 return ip_finish_output2(net, sk, skb); 316 return ip_finish_output2(net, sk, skb);
306} 317}
307 318
319static int ip_mc_finish_output(struct net *net, struct sock *sk,
320 struct sk_buff *skb)
321{
322 int ret;
323
324 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
325 if (ret) {
326 kfree_skb(skb);
327 return ret;
328 }
329
330 return dev_loopback_xmit(net, sk, skb);
331}
332
308int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) 333int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
309{ 334{
310 struct rtable *rt = skb_rtable(skb); 335 struct rtable *rt = skb_rtable(skb);
@@ -342,7 +367,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
342 if (newskb) 367 if (newskb)
343 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, 368 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
344 net, sk, newskb, NULL, newskb->dev, 369 net, sk, newskb, NULL, newskb->dev,
345 dev_loopback_xmit); 370 ip_mc_finish_output);
346 } 371 }
347 372
348 /* Multicasts with ttl 0 must not go beyond the host */ 373 /* Multicasts with ttl 0 must not go beyond the host */
@@ -358,7 +383,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
358 if (newskb) 383 if (newskb)
359 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, 384 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
360 net, sk, newskb, NULL, newskb->dev, 385 net, sk, newskb, NULL, newskb->dev,
361 dev_loopback_xmit); 386 ip_mc_finish_output);
362 } 387 }
363 388
364 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, 389 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
@@ -583,7 +608,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
583 */ 608 */
584 if (skb_has_frag_list(skb)) { 609 if (skb_has_frag_list(skb)) {
585 struct sk_buff *frag, *frag2; 610 struct sk_buff *frag, *frag2;
586 int first_len = skb_pagelen(skb); 611 unsigned int first_len = skb_pagelen(skb);
587 612
588 if (first_len - hlen > mtu || 613 if (first_len - hlen > mtu ||
589 ((first_len - hlen) & 7) || 614 ((first_len - hlen) & 7) ||
@@ -804,11 +829,11 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk
804 struct msghdr *msg = from; 829 struct msghdr *msg = from;
805 830
806 if (skb->ip_summed == CHECKSUM_PARTIAL) { 831 if (skb->ip_summed == CHECKSUM_PARTIAL) {
807 if (copy_from_iter(to, len, &msg->msg_iter) != len) 832 if (!copy_from_iter_full(to, len, &msg->msg_iter))
808 return -EFAULT; 833 return -EFAULT;
809 } else { 834 } else {
810 __wsum csum = 0; 835 __wsum csum = 0;
811 if (csum_and_copy_from_iter(to, len, &csum, &msg->msg_iter) != len) 836 if (!csum_and_copy_from_iter_full(to, len, &csum, &msg->msg_iter))
812 return -EFAULT; 837 return -EFAULT;
813 skb->csum = csum_block_add(skb->csum, csum, odd); 838 skb->csum = csum_block_add(skb->csum, csum, odd);
814 } 839 }
@@ -864,6 +889,9 @@ static inline int ip_ufo_append_data(struct sock *sk,
864 889
865 skb->csum = 0; 890 skb->csum = 0;
866 891
892 if (flags & MSG_CONFIRM)
893 skb_set_dst_pending_confirm(skb, 1);
894
867 __skb_queue_tail(queue, skb); 895 __skb_queue_tail(queue, skb);
868 } else if (skb_is_gso(skb)) { 896 } else if (skb_is_gso(skb)) {
869 goto append; 897 goto append;
@@ -936,9 +964,9 @@ static int __ip_append_data(struct sock *sk,
936 csummode = CHECKSUM_PARTIAL; 964 csummode = CHECKSUM_PARTIAL;
937 965
938 cork->length += length; 966 cork->length += length;
939 if (((length > mtu) || (skb && skb_is_gso(skb))) && 967 if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) &&
940 (sk->sk_protocol == IPPROTO_UDP) && 968 (sk->sk_protocol == IPPROTO_UDP) &&
941 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && 969 (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
942 (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) { 970 (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) {
943 err = ip_ufo_append_data(sk, queue, getfrag, from, length, 971 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
944 hh_len, fragheaderlen, transhdrlen, 972 hh_len, fragheaderlen, transhdrlen,
@@ -1064,6 +1092,9 @@ alloc_new_skb:
1064 exthdrlen = 0; 1092 exthdrlen = 0;
1065 csummode = CHECKSUM_NONE; 1093 csummode = CHECKSUM_NONE;
1066 1094
1095 if ((flags & MSG_CONFIRM) && !skb_prev)
1096 skb_set_dst_pending_confirm(skb, 1);
1097
1067 /* 1098 /*
1068 * Put the packet on the pending queue. 1099 * Put the packet on the pending queue.
1069 */ 1100 */
@@ -1594,7 +1625,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
1594 RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, 1625 RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
1595 ip_reply_arg_flowi_flags(arg), 1626 ip_reply_arg_flowi_flags(arg),
1596 daddr, saddr, 1627 daddr, saddr,
1597 tcp_hdr(skb)->source, tcp_hdr(skb)->dest); 1628 tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
1629 arg->uid);
1598 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); 1630 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1599 rt = ip_route_output_key(net, &fl4); 1631 rt = ip_route_output_key(net, &fl4);
1600 if (IS_ERR(rt)) 1632 if (IS_ERR(rt))
@@ -1606,6 +1638,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
1606 sk->sk_protocol = ip_hdr(skb)->protocol; 1638 sk->sk_protocol = ip_hdr(skb)->protocol;
1607 sk->sk_bound_dev_if = arg->bound_dev_if; 1639 sk->sk_bound_dev_if = arg->bound_dev_if;
1608 sk->sk_sndbuf = sysctl_wmem_default; 1640 sk->sk_sndbuf = sysctl_wmem_default;
1641 sk->sk_mark = fl4.flowi4_mark;
1609 err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, 1642 err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
1610 len, 0, &ipc, &rt, MSG_DONTWAIT); 1643 len, 0, &ipc, &rt, MSG_DONTWAIT);
1611 if (unlikely(err)) { 1644 if (unlikely(err)) {
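ip_finish_output() and the new ip_mc_finish_output() now run BPF_CGROUP_RUN_PROG_INET_EGRESS() first and free the skb when the attached program rejects it. A minimal cgroup/skb egress program of the kind that hook executes might look like the sketch below (libbpf-style section name; the 1500-byte cutoff is only an example, and the attach step is done separately with a cgroup-bpf loader of your choice):

/* Compile with: clang -O2 -target bpf -c egress_filter.c -o egress_filter.o
 * and attach at BPF_CGROUP_INET_EGRESS.
 */
#include <linux/bpf.h>

#ifndef SEC
#define SEC(name) __attribute__((section(name), used))
#endif

SEC("cgroup_skb/egress")
int egress_filter(struct __sk_buff *skb)
{
	/* 1 = allow; 0 = drop, in which case ip_finish_output() kfree_skb()s it */
	if (skb->len > 1500)
		return 0;
	return 1;
}

char _license[] SEC("license") = "GPL";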
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index b8a2d63d1fb8..1d46d05efb0f 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -44,7 +44,7 @@
44#include <net/ip_fib.h> 44#include <net/ip_fib.h>
45 45
46#include <linux/errqueue.h> 46#include <linux/errqueue.h>
47#include <asm/uaccess.h> 47#include <linux/uaccess.h>
48 48
49/* 49/*
50 * SOL_IP control messages. 50 * SOL_IP control messages.
@@ -97,6 +97,17 @@ static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
97 put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data); 97 put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data);
98} 98}
99 99
100static void ip_cmsg_recv_fragsize(struct msghdr *msg, struct sk_buff *skb)
101{
102 int val;
103
104 if (IPCB(skb)->frag_max_size == 0)
105 return;
106
107 val = IPCB(skb)->frag_max_size;
108 put_cmsg(msg, SOL_IP, IP_RECVFRAGSIZE, sizeof(val), &val);
109}
110
100static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb, 111static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
101 int tlen, int offset) 112 int tlen, int offset)
102{ 113{
@@ -105,10 +116,10 @@ static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
105 if (skb->ip_summed != CHECKSUM_COMPLETE) 116 if (skb->ip_summed != CHECKSUM_COMPLETE)
106 return; 117 return;
107 118
108 if (offset != 0) 119 if (offset != 0) {
109 csum = csum_sub(csum, 120 int tend_off = skb_transport_offset(skb) + tlen;
110 csum_partial(skb_transport_header(skb) + tlen, 121 csum = csum_sub(csum, skb_checksum(skb, tend_off, offset, 0));
111 offset, 0)); 122 }
112 123
113 put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum); 124 put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum);
114} 125}
@@ -137,7 +148,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
137 const struct iphdr *iph = ip_hdr(skb); 148 const struct iphdr *iph = ip_hdr(skb);
138 __be16 *ports = (__be16 *)skb_transport_header(skb); 149 __be16 *ports = (__be16 *)skb_transport_header(skb);
139 150
140 if (skb_transport_offset(skb) + 4 > skb->len) 151 if (skb_transport_offset(skb) + 4 > (int)skb->len)
141 return; 152 return;
142 153
143 /* All current transport protocols have the port numbers in the 154 /* All current transport protocols have the port numbers in the
@@ -153,10 +164,10 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
153 put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin); 164 put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
154} 165}
155 166
156void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, 167void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
157 int tlen, int offset) 168 struct sk_buff *skb, int tlen, int offset)
158{ 169{
159 struct inet_sock *inet = inet_sk(skb->sk); 170 struct inet_sock *inet = inet_sk(sk);
160 unsigned int flags = inet->cmsg_flags; 171 unsigned int flags = inet->cmsg_flags;
161 172
162 /* Ordered by supposed usage frequency */ 173 /* Ordered by supposed usage frequency */
@@ -218,6 +229,9 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb,
218 229
219 if (flags & IP_CMSG_CHECKSUM) 230 if (flags & IP_CMSG_CHECKSUM)
220 ip_cmsg_recv_checksum(msg, skb, tlen, offset); 231 ip_cmsg_recv_checksum(msg, skb, tlen, offset);
232
233 if (flags & IP_CMSG_RECVFRAGSIZE)
234 ip_cmsg_recv_fragsize(msg, skb);
221} 235}
222EXPORT_SYMBOL(ip_cmsg_recv_offset); 236EXPORT_SYMBOL(ip_cmsg_recv_offset);
223 237
@@ -258,7 +272,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
258 continue; 272 continue;
259 switch (cmsg->cmsg_type) { 273 switch (cmsg->cmsg_type) {
260 case IP_RETOPTS: 274 case IP_RETOPTS:
261 err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); 275 err = cmsg->cmsg_len - sizeof(struct cmsghdr);
262 276
263 /* Our caller is responsible for freeing ipc->opt */ 277 /* Our caller is responsible for freeing ipc->opt */
264 err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg), 278 err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg),
@@ -474,16 +488,15 @@ static bool ipv4_datagram_support_cmsg(const struct sock *sk,
474 return false; 488 return false;
475 489
476 /* Support IP_PKTINFO on tstamp packets if requested, to correlate 490 /* Support IP_PKTINFO on tstamp packets if requested, to correlate
477 * timestamp with egress dev. Not possible for packets without dev 491 * timestamp with egress dev. Not possible for packets without iif
478 * or without payload (SOF_TIMESTAMPING_OPT_TSONLY). 492 * or without payload (SOF_TIMESTAMPING_OPT_TSONLY).
479 */ 493 */
480 if ((!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) || 494 info = PKTINFO_SKB_CB(skb);
481 (!skb->dev)) 495 if (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG) ||
496 !info->ipi_ifindex)
482 return false; 497 return false;
483 498
484 info = PKTINFO_SKB_CB(skb);
485 info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr; 499 info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr;
486 info->ipi_ifindex = skb->dev->ifindex;
487 return true; 500 return true;
488} 501}
489 502
@@ -577,6 +590,7 @@ static bool setsockopt_needs_rtnl(int optname)
577 case MCAST_LEAVE_GROUP: 590 case MCAST_LEAVE_GROUP:
578 case MCAST_LEAVE_SOURCE_GROUP: 591 case MCAST_LEAVE_SOURCE_GROUP:
579 case MCAST_UNBLOCK_SOURCE: 592 case MCAST_UNBLOCK_SOURCE:
593 case IP_ROUTER_ALERT:
580 return true; 594 return true;
581 } 595 }
582 return false; 596 return false;
@@ -614,6 +628,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
614 case IP_MULTICAST_LOOP: 628 case IP_MULTICAST_LOOP:
615 case IP_RECVORIGDSTADDR: 629 case IP_RECVORIGDSTADDR:
616 case IP_CHECKSUM: 630 case IP_CHECKSUM:
631 case IP_RECVFRAGSIZE:
617 if (optlen >= sizeof(int)) { 632 if (optlen >= sizeof(int)) {
618 if (get_user(val, (int __user *) optval)) 633 if (get_user(val, (int __user *) optval))
619 return -EFAULT; 634 return -EFAULT;
@@ -726,6 +741,14 @@ static int do_ip_setsockopt(struct sock *sk, int level,
726 } 741 }
727 } 742 }
728 break; 743 break;
744 case IP_RECVFRAGSIZE:
745 if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM)
746 goto e_inval;
747 if (val)
748 inet->cmsg_flags |= IP_CMSG_RECVFRAGSIZE;
749 else
750 inet->cmsg_flags &= ~IP_CMSG_RECVFRAGSIZE;
751 break;
729 case IP_TOS: /* This sets both TOS and Precedence */ 752 case IP_TOS: /* This sets both TOS and Precedence */
730 if (sk->sk_type == SOCK_STREAM) { 753 if (sk->sk_type == SOCK_STREAM) {
731 val &= ~INET_ECN_MASK; 754 val &= ~INET_ECN_MASK;
@@ -820,6 +843,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
820 { 843 {
821 struct ip_mreqn mreq; 844 struct ip_mreqn mreq;
822 struct net_device *dev = NULL; 845 struct net_device *dev = NULL;
846 int midx;
823 847
824 if (sk->sk_type == SOCK_STREAM) 848 if (sk->sk_type == SOCK_STREAM)
825 goto e_inval; 849 goto e_inval;
@@ -864,11 +888,15 @@ static int do_ip_setsockopt(struct sock *sk, int level,
864 err = -EADDRNOTAVAIL; 888 err = -EADDRNOTAVAIL;
865 if (!dev) 889 if (!dev)
866 break; 890 break;
891
892 midx = l3mdev_master_ifindex(dev);
893
867 dev_put(dev); 894 dev_put(dev);
868 895
869 err = -EINVAL; 896 err = -EINVAL;
870 if (sk->sk_bound_dev_if && 897 if (sk->sk_bound_dev_if &&
871 mreq.imr_ifindex != sk->sk_bound_dev_if) 898 mreq.imr_ifindex != sk->sk_bound_dev_if &&
899 (!midx || midx != sk->sk_bound_dev_if))
872 break; 900 break;
873 901
874 inet->mc_index = mreq.imr_ifindex; 902 inet->mc_index = mreq.imr_ifindex;
@@ -1202,14 +1230,27 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
1202 * which has interface index (iif) as the first member of the 1230 * which has interface index (iif) as the first member of the
1203 * underlying inet{6}_skb_parm struct. This code then overlays 1231 * underlying inet{6}_skb_parm struct. This code then overlays
1204 * PKTINFO_SKB_CB and in_pktinfo also has iif as the first 1232 * PKTINFO_SKB_CB and in_pktinfo also has iif as the first
1205 * element so the iif is picked up from the prior IPCB 1233 * element so the iif is picked up from the prior IPCB. If iif
1234 * is the loopback interface, then return the sending interface
1235 * (e.g., process binds socket to eth0 for Tx which is
1236 * redirected to loopback in the rtable/dst).
1206 */ 1237 */
1238 if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX)
1239 pktinfo->ipi_ifindex = inet_iif(skb);
1240
1207 pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); 1241 pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
1208 } else { 1242 } else {
1209 pktinfo->ipi_ifindex = 0; 1243 pktinfo->ipi_ifindex = 0;
1210 pktinfo->ipi_spec_dst.s_addr = 0; 1244 pktinfo->ipi_spec_dst.s_addr = 0;
1211 } 1245 }
1212 skb_dst_drop(skb); 1246 /* We need to keep the dst for __ip_options_echo()
1247 * We could restrict the test to opt.ts_needtime || opt.srr,
1248 * but the following is good enough as IP options are not often used.
1249 */
1250 if (unlikely(IPCB(skb)->opt.optlen))
1251 skb_dst_force(skb);
1252 else
1253 skb_dst_drop(skb);
1213} 1254}
1214 1255
1215int ip_setsockopt(struct sock *sk, int level, 1256int ip_setsockopt(struct sock *sk, int level,
@@ -1357,6 +1398,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1357 case IP_CHECKSUM: 1398 case IP_CHECKSUM:
1358 val = (inet->cmsg_flags & IP_CMSG_CHECKSUM) != 0; 1399 val = (inet->cmsg_flags & IP_CMSG_CHECKSUM) != 0;
1359 break; 1400 break;
1401 case IP_RECVFRAGSIZE:
1402 val = (inet->cmsg_flags & IP_CMSG_RECVFRAGSIZE) != 0;
1403 break;
1360 case IP_TOS: 1404 case IP_TOS:
1361 val = inet->tos; 1405 val = inet->tos;
1362 break; 1406 break;
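Among the ip_sockglue.c changes, IP_RECVFRAGSIZE is a new option for SOCK_DGRAM and SOCK_RAW sockets: when a received datagram had to be reassembled, a SOL_IP control message reports the largest fragment size (frag_max_size). A hedged userspace sketch of reading it on a UDP socket (error handling trimmed; the fallback #define assumes the value used in linux/in.h):

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#ifndef IP_RECVFRAGSIZE
#define IP_RECVFRAGSIZE 25	/* from linux/in.h; define for older libc headers */
#endif

int main(void)
{
	int one = 1, fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in addr = { .sin_family = AF_INET,
				    .sin_port = htons(9000),
				    .sin_addr.s_addr = htonl(INADDR_ANY) };
	char data[65536], cbuf[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
			      .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };
	struct cmsghdr *cm;

	setsockopt(fd, SOL_IP, IP_RECVFRAGSIZE, &one, sizeof(one));
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));

	if (recvmsg(fd, &msg, 0) < 0)
		return 1;
	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
		if (cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVFRAGSIZE) {
			int frag;

			memcpy(&frag, CMSG_DATA(cm), sizeof(frag));
			printf("largest fragment: %d bytes\n", frag);
		}
	return 0;
}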
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 5719d6ba0824..823abaef006b 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -358,6 +358,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
358{ 358{
359 struct ip_tunnel *nt; 359 struct ip_tunnel *nt;
360 struct net_device *dev; 360 struct net_device *dev;
361 int t_hlen;
361 362
362 BUG_ON(!itn->fb_tunnel_dev); 363 BUG_ON(!itn->fb_tunnel_dev);
363 dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms); 364 dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
@@ -367,6 +368,9 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
367 dev->mtu = ip_tunnel_bind_dev(dev); 368 dev->mtu = ip_tunnel_bind_dev(dev);
368 369
369 nt = netdev_priv(dev); 370 nt = netdev_priv(dev);
371 t_hlen = nt->hlen + sizeof(struct iphdr);
372 dev->min_mtu = ETH_MIN_MTU;
373 dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
370 ip_tunnel_add(itn, nt); 374 ip_tunnel_add(itn, nt);
371 return nt; 375 return nt;
372} 376}
@@ -929,7 +933,7 @@ int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
929 int t_hlen = tunnel->hlen + sizeof(struct iphdr); 933 int t_hlen = tunnel->hlen + sizeof(struct iphdr);
930 int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen; 934 int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
931 935
932 if (new_mtu < 68) 936 if (new_mtu < ETH_MIN_MTU)
933 return -EINVAL; 937 return -EINVAL;
934 938
935 if (new_mtu > max_mtu) { 939 if (new_mtu > max_mtu) {
@@ -990,7 +994,7 @@ int ip_tunnel_get_iflink(const struct net_device *dev)
990} 994}
991EXPORT_SYMBOL(ip_tunnel_get_iflink); 995EXPORT_SYMBOL(ip_tunnel_get_iflink);
992 996
993int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, 997int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
994 struct rtnl_link_ops *ops, char *devname) 998 struct rtnl_link_ops *ops, char *devname)
995{ 999{
996 struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); 1000 struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
@@ -1192,7 +1196,7 @@ void ip_tunnel_uninit(struct net_device *dev)
1192EXPORT_SYMBOL_GPL(ip_tunnel_uninit); 1196EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1193 1197
1194/* Do least required initialization, rest of init is done in tunnel_init call */ 1198/* Do least required initialization, rest of init is done in tunnel_init call */
1195void ip_tunnel_setup(struct net_device *dev, int net_id) 1199void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1196{ 1200{
1197 struct ip_tunnel *tunnel = netdev_priv(dev); 1201 struct ip_tunnel *tunnel = netdev_priv(dev);
1198 tunnel->ip_tnl_net_id = net_id; 1202 tunnel->ip_tnl_net_id = net_id;
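ip_tunnel_create() now fills in the net_device MTU limits directly: min_mtu becomes ETH_MIN_MTU (68) and max_mtu is 0xFFF8 minus the link-layer and tunnel headers, the same bound __ip_tunnel_change_mtu() already enforced. Worked with example numbers that are not tied to any particular device:

#include <stdio.h>

int main(void)
{
	unsigned int iphdr_len       = 20;	/* sizeof(struct iphdr) */
	unsigned int encap_hlen      = 4;	/* e.g. a minimal GRE header (assumed) */
	unsigned int hard_header_len = 0;	/* typical for tunnel netdevs */
	unsigned int t_hlen          = encap_hlen + iphdr_len;

	/* 0xFFF8 keeps the outer IPv4 datagram under 64 KiB and 8-byte aligned */
	printf("max_mtu = %u\n", 0xFFF8 - hard_header_len - t_hlen);	/* 65504 */
	return 0;
}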
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index fed3d29f9eb3..a31f47ccaad9 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -188,8 +188,8 @@ int iptunnel_handle_offloads(struct sk_buff *skb,
188EXPORT_SYMBOL_GPL(iptunnel_handle_offloads); 188EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
189 189
190/* Often modified stats are per cpu, other are shared (netdev->stats) */ 190/* Often modified stats are per cpu, other are shared (netdev->stats) */
191struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, 191void ip_tunnel_get_stats64(struct net_device *dev,
192 struct rtnl_link_stats64 *tot) 192 struct rtnl_link_stats64 *tot)
193{ 193{
194 int i; 194 int i;
195 195
@@ -214,8 +214,6 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
214 tot->rx_bytes += rx_bytes; 214 tot->rx_bytes += rx_bytes;
215 tot->tx_bytes += tx_bytes; 215 tot->tx_bytes += tx_bytes;
216 } 216 }
217
218 return tot;
219} 217}
220EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); 218EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
221 219
@@ -228,7 +226,7 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
228 [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 }, 226 [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 },
229}; 227};
230 228
231static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, 229static int ip_tun_build_state(struct nlattr *attr,
232 unsigned int family, const void *cfg, 230 unsigned int family, const void *cfg,
233 struct lwtunnel_state **ts) 231 struct lwtunnel_state **ts)
234{ 232{
@@ -313,6 +311,7 @@ static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
313 .fill_encap = ip_tun_fill_encap_info, 311 .fill_encap = ip_tun_fill_encap_info,
314 .get_encap_size = ip_tun_encap_nlsize, 312 .get_encap_size = ip_tun_encap_nlsize,
315 .cmp_encap = ip_tun_cmp_encap, 313 .cmp_encap = ip_tun_cmp_encap,
314 .owner = THIS_MODULE,
316}; 315};
317 316
318static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { 317static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
@@ -324,7 +323,7 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
324 [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 }, 323 [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 },
325}; 324};
326 325
327static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr, 326static int ip6_tun_build_state(struct nlattr *attr,
328 unsigned int family, const void *cfg, 327 unsigned int family, const void *cfg,
329 struct lwtunnel_state **ts) 328 struct lwtunnel_state **ts)
330{ 329{
@@ -403,6 +402,7 @@ static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
403 .fill_encap = ip6_tun_fill_encap_info, 402 .fill_encap = ip6_tun_fill_encap_info,
404 .get_encap_size = ip6_tun_encap_nlsize, 403 .get_encap_size = ip6_tun_encap_nlsize,
405 .cmp_encap = ip_tun_cmp_encap, 404 .cmp_encap = ip_tun_cmp_encap,
405 .owner = THIS_MODULE,
406}; 406};
407 407
408void __init ip_tunnel_core_init(void) 408void __init ip_tunnel_core_init(void)
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 5d7944f394d9..8b14f1404c8f 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -46,7 +46,7 @@
46 46
47static struct rtnl_link_ops vti_link_ops __read_mostly; 47static struct rtnl_link_ops vti_link_ops __read_mostly;
48 48
49static int vti_net_id __read_mostly; 49static unsigned int vti_net_id __read_mostly;
50static int vti_tunnel_init(struct net_device *dev); 50static int vti_tunnel_init(struct net_device *dev);
51 51
52static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, 52static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 071a785c65eb..dfb2ab2dd3c8 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -61,7 +61,7 @@
61#include <net/ipconfig.h> 61#include <net/ipconfig.h>
62#include <net/route.h> 62#include <net/route.h>
63 63
64#include <asm/uaccess.h> 64#include <linux/uaccess.h>
65#include <net/checksum.h> 65#include <net/checksum.h>
66#include <asm/processor.h> 66#include <asm/processor.h>
67 67
@@ -306,7 +306,7 @@ static void __init ic_close_devs(void)
306 while ((d = next)) { 306 while ((d = next)) {
307 next = d->next; 307 next = d->next;
308 dev = d->dev; 308 dev = d->dev;
309 if ((!ic_dev || dev != ic_dev->dev) && !netdev_uses_dsa(dev)) { 309 if (d != ic_dev && !netdev_uses_dsa(dev)) {
310 pr_debug("IP-Config: Downing %s\n", dev->name); 310 pr_debug("IP-Config: Downing %s\n", dev->name);
311 dev_change_flags(dev, d->flags); 311 dev_change_flags(dev, d->flags);
312 } 312 }
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index c9392589c415..00d4229b6954 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -96,7 +96,7 @@
96#include <linux/types.h> 96#include <linux/types.h>
97#include <linux/kernel.h> 97#include <linux/kernel.h>
98#include <linux/slab.h> 98#include <linux/slab.h>
99#include <asm/uaccess.h> 99#include <linux/uaccess.h>
100#include <linux/skbuff.h> 100#include <linux/skbuff.h>
101#include <linux/netdevice.h> 101#include <linux/netdevice.h>
102#include <linux/in.h> 102#include <linux/in.h>
@@ -121,7 +121,7 @@ static bool log_ecn_error = true;
121module_param(log_ecn_error, bool, 0644); 121module_param(log_ecn_error, bool, 0644);
122MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); 122MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
123 123
124static int ipip_net_id __read_mostly; 124static unsigned int ipip_net_id __read_mostly;
125 125
126static int ipip_tunnel_init(struct net_device *dev); 126static int ipip_tunnel_init(struct net_device *dev);
127static struct rtnl_link_ops ipip_link_ops __read_mostly; 127static struct rtnl_link_ops ipip_link_ops __read_mostly;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 27089f5ebbb1..b036e85e093b 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -26,7 +26,7 @@
26 * 26 *
27 */ 27 */
28 28
29#include <asm/uaccess.h> 29#include <linux/uaccess.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/capability.h> 31#include <linux/capability.h>
32#include <linux/errno.h> 32#include <linux/errno.h>
@@ -137,6 +137,9 @@ static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
137 .flags = FIB_LOOKUP_NOREF, 137 .flags = FIB_LOOKUP_NOREF,
138 }; 138 };
139 139
140 /* update flow if oif or iif point to device enslaved to l3mdev */
141 l3mdev_update_flow(net, flowi4_to_flowi(flp4));
142
140 err = fib_rules_lookup(net->ipv4.mr_rules_ops, 143 err = fib_rules_lookup(net->ipv4.mr_rules_ops,
141 flowi4_to_flowi(flp4), 0, &arg); 144 flowi4_to_flowi(flp4), 0, &arg);
142 if (err < 0) 145 if (err < 0)
@@ -163,7 +166,9 @@ static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
163 return -EINVAL; 166 return -EINVAL;
164 } 167 }
165 168
166 mrt = ipmr_get_table(rule->fr_net, rule->table); 169 arg->table = fib_rule_get_table(rule, arg);
170
171 mrt = ipmr_get_table(rule->fr_net, arg->table);
167 if (!mrt) 172 if (!mrt)
168 return -EAGAIN; 173 return -EAGAIN;
169 res->mrt = mrt; 174 res->mrt = mrt;
@@ -294,10 +299,29 @@ static void __net_exit ipmr_rules_exit(struct net *net)
294} 299}
295#endif 300#endif
296 301
302static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
303 const void *ptr)
304{
305 const struct mfc_cache_cmp_arg *cmparg = arg->key;
306 struct mfc_cache *c = (struct mfc_cache *)ptr;
307
308 return cmparg->mfc_mcastgrp != c->mfc_mcastgrp ||
309 cmparg->mfc_origin != c->mfc_origin;
310}
311
312static const struct rhashtable_params ipmr_rht_params = {
313 .head_offset = offsetof(struct mfc_cache, mnode),
314 .key_offset = offsetof(struct mfc_cache, cmparg),
315 .key_len = sizeof(struct mfc_cache_cmp_arg),
316 .nelem_hint = 3,
317 .locks_mul = 1,
318 .obj_cmpfn = ipmr_hash_cmp,
319 .automatic_shrinking = true,
320};
321
297static struct mr_table *ipmr_new_table(struct net *net, u32 id) 322static struct mr_table *ipmr_new_table(struct net *net, u32 id)
298{ 323{
299 struct mr_table *mrt; 324 struct mr_table *mrt;
300 unsigned int i;
301 325
302 /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */ 326 /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
303 if (id != RT_TABLE_DEFAULT && id >= 1000000000) 327 if (id != RT_TABLE_DEFAULT && id >= 1000000000)
@@ -313,10 +337,8 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
313 write_pnet(&mrt->net, net); 337 write_pnet(&mrt->net, net);
314 mrt->id = id; 338 mrt->id = id;
315 339
316 /* Forwarding cache */ 340 rhltable_init(&mrt->mfc_hash, &ipmr_rht_params);
317 for (i = 0; i < MFC_LINES; i++) 341 INIT_LIST_HEAD(&mrt->mfc_cache_list);
318 INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);
319
320 INIT_LIST_HEAD(&mrt->mfc_unres_queue); 342 INIT_LIST_HEAD(&mrt->mfc_unres_queue);
321 343
322 setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process, 344 setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
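The (S,G) cache moves from the fixed MFC_LINES hash array to an rhltable keyed on the (mcastgrp, origin) pair; an rhltable (rather than a plain rhashtable) is used because entries with the same key can coexist and are told apart by mfc_parent, as ipmr_cache_find_parent() later in this diff shows. Note the comparison convention: ipmr_hash_cmp() returns 0 on a match and nonzero otherwise. A standalone equivalent of that key test:

#include <stdint.h>
#include <stdio.h>

/* Composite key, mirroring struct mfc_cache_cmp_arg (values are kept in
 * network byte order in the kernel).
 */
struct mfc_key {
	uint32_t mcastgrp;
	uint32_t origin;
};

/* Same convention as ipmr_hash_cmp(): 0 means "equal", nonzero "different". */
static int mfc_key_cmp(const struct mfc_key *want, const struct mfc_key *have)
{
	return want->mcastgrp != have->mcastgrp ||
	       want->origin   != have->origin;
}

int main(void)
{
	struct mfc_key a = { .mcastgrp = 0xe0000001, .origin = 0x0a000001 };
	struct mfc_key b = a;

	printf("%s\n", mfc_key_cmp(&a, &b) ? "different" : "equal");	/* equal */
	return 0;
}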
@@ -333,6 +355,7 @@ static void ipmr_free_table(struct mr_table *mrt)
333{ 355{
334 del_timer_sync(&mrt->ipmr_expire_timer); 356 del_timer_sync(&mrt->ipmr_expire_timer);
335 mroute_clean_tables(mrt, true); 357 mroute_clean_tables(mrt, true);
358 rhltable_destroy(&mrt->mfc_hash);
336 kfree(mrt); 359 kfree(mrt);
337} 360}
338 361
@@ -834,13 +857,17 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
834 __be32 origin, 857 __be32 origin,
835 __be32 mcastgrp) 858 __be32 mcastgrp)
836{ 859{
837 int line = MFC_HASH(mcastgrp, origin); 860 struct mfc_cache_cmp_arg arg = {
861 .mfc_mcastgrp = mcastgrp,
862 .mfc_origin = origin
863 };
864 struct rhlist_head *tmp, *list;
838 struct mfc_cache *c; 865 struct mfc_cache *c;
839 866
840 list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) { 867 list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
841 if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp) 868 rhl_for_each_entry_rcu(c, tmp, list, mnode)
842 return c; 869 return c;
843 } 870
844 return NULL; 871 return NULL;
845} 872}
846 873
@@ -848,13 +875,16 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
848static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt, 875static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
849 int vifi) 876 int vifi)
850{ 877{
851 int line = MFC_HASH(htonl(INADDR_ANY), htonl(INADDR_ANY)); 878 struct mfc_cache_cmp_arg arg = {
879 .mfc_mcastgrp = htonl(INADDR_ANY),
880 .mfc_origin = htonl(INADDR_ANY)
881 };
882 struct rhlist_head *tmp, *list;
852 struct mfc_cache *c; 883 struct mfc_cache *c;
853 884
854 list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) 885 list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
855 if (c->mfc_origin == htonl(INADDR_ANY) && 886 rhl_for_each_entry_rcu(c, tmp, list, mnode)
856 c->mfc_mcastgrp == htonl(INADDR_ANY) && 887 if (c->mfc_un.res.ttls[vifi] < 255)
857 c->mfc_un.res.ttls[vifi] < 255)
858 return c; 888 return c;
859 889
860 return NULL; 890 return NULL;
@@ -864,29 +894,51 @@ static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
864static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt, 894static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
865 __be32 mcastgrp, int vifi) 895 __be32 mcastgrp, int vifi)
866{ 896{
867 int line = MFC_HASH(mcastgrp, htonl(INADDR_ANY)); 897 struct mfc_cache_cmp_arg arg = {
898 .mfc_mcastgrp = mcastgrp,
899 .mfc_origin = htonl(INADDR_ANY)
900 };
901 struct rhlist_head *tmp, *list;
868 struct mfc_cache *c, *proxy; 902 struct mfc_cache *c, *proxy;
869 903
870 if (mcastgrp == htonl(INADDR_ANY)) 904 if (mcastgrp == htonl(INADDR_ANY))
871 goto skip; 905 goto skip;
872 906
873 list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) 907 list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
874 if (c->mfc_origin == htonl(INADDR_ANY) && 908 rhl_for_each_entry_rcu(c, tmp, list, mnode) {
875 c->mfc_mcastgrp == mcastgrp) { 909 if (c->mfc_un.res.ttls[vifi] < 255)
876 if (c->mfc_un.res.ttls[vifi] < 255) 910 return c;
877 return c; 911
878 912 /* It's ok if the vifi is part of the static tree */
879 /* It's ok if the vifi is part of the static tree */ 913 proxy = ipmr_cache_find_any_parent(mrt, c->mfc_parent);
880 proxy = ipmr_cache_find_any_parent(mrt, 914 if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
881 c->mfc_parent); 915 return c;
882 if (proxy && proxy->mfc_un.res.ttls[vifi] < 255) 916 }
883 return c;
884 }
885 917
886skip: 918skip:
887 return ipmr_cache_find_any_parent(mrt, vifi); 919 return ipmr_cache_find_any_parent(mrt, vifi);
888} 920}
889 921
922/* Look for a (S,G,iif) entry if parent != -1 */
923static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt,
924 __be32 origin, __be32 mcastgrp,
925 int parent)
926{
927 struct mfc_cache_cmp_arg arg = {
928 .mfc_mcastgrp = mcastgrp,
929 .mfc_origin = origin,
930 };
931 struct rhlist_head *tmp, *list;
932 struct mfc_cache *c;
933
934 list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
935 rhl_for_each_entry_rcu(c, tmp, list, mnode)
936 if (parent == -1 || parent == c->mfc_parent)
937 return c;
938
939 return NULL;
940}
941
890/* Allocate a multicast cache entry */ 942/* Allocate a multicast cache entry */
891static struct mfc_cache *ipmr_cache_alloc(void) 943static struct mfc_cache *ipmr_cache_alloc(void)
892{ 944{
@@ -1023,10 +1075,10 @@ static int ipmr_cache_report(struct mr_table *mrt,
1023static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, 1075static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
1024 struct sk_buff *skb) 1076 struct sk_buff *skb)
1025{ 1077{
1078 const struct iphdr *iph = ip_hdr(skb);
1079 struct mfc_cache *c;
1026 bool found = false; 1080 bool found = false;
1027 int err; 1081 int err;
1028 struct mfc_cache *c;
1029 const struct iphdr *iph = ip_hdr(skb);
1030 1082
1031 spin_lock_bh(&mfc_unres_lock); 1083 spin_lock_bh(&mfc_unres_lock);
1032 list_for_each_entry(c, &mrt->mfc_unres_queue, list) { 1084 list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
@@ -1090,46 +1142,39 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
1090 1142
1091static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) 1143static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
1092{ 1144{
1093 int line; 1145 struct mfc_cache *c;
1094 struct mfc_cache *c, *next;
1095 1146
1096 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); 1147 /* The entries are added/deleted only under RTNL */
1148 rcu_read_lock();
1149 c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
1150 mfc->mfcc_mcastgrp.s_addr, parent);
1151 rcu_read_unlock();
1152 if (!c)
1153 return -ENOENT;
1154 rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
1155 list_del_rcu(&c->list);
1156 mroute_netlink_event(mrt, c, RTM_DELROUTE);
1157 ipmr_cache_free(c);
1097 1158
1098 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) { 1159 return 0;
1099 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
1100 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
1101 (parent == -1 || parent == c->mfc_parent)) {
1102 list_del_rcu(&c->list);
1103 mroute_netlink_event(mrt, c, RTM_DELROUTE);
1104 ipmr_cache_free(c);
1105 return 0;
1106 }
1107 }
1108 return -ENOENT;
1109} 1160}
1110 1161
1111static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, 1162static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1112 struct mfcctl *mfc, int mrtsock, int parent) 1163 struct mfcctl *mfc, int mrtsock, int parent)
1113{ 1164{
1114 bool found = false;
1115 int line;
1116 struct mfc_cache *uc, *c; 1165 struct mfc_cache *uc, *c;
1166 bool found;
1167 int ret;
1117 1168
1118 if (mfc->mfcc_parent >= MAXVIFS) 1169 if (mfc->mfcc_parent >= MAXVIFS)
1119 return -ENFILE; 1170 return -ENFILE;
1120 1171
1121 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); 1172 /* The entries are added/deleted only under RTNL */
1122 1173 rcu_read_lock();
1123 list_for_each_entry(c, &mrt->mfc_cache_array[line], list) { 1174 c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
1124 if (c->mfc_origin == mfc->mfcc_origin.s_addr && 1175 mfc->mfcc_mcastgrp.s_addr, parent);
1125 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr && 1176 rcu_read_unlock();
1126 (parent == -1 || parent == c->mfc_parent)) { 1177 if (c) {
1127 found = true;
1128 break;
1129 }
1130 }
1131
1132 if (found) {
1133 write_lock_bh(&mrt_lock); 1178 write_lock_bh(&mrt_lock);
1134 c->mfc_parent = mfc->mfcc_parent; 1179 c->mfc_parent = mfc->mfcc_parent;
1135 ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls); 1180 ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
@@ -1155,8 +1200,14 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1155 if (!mrtsock) 1200 if (!mrtsock)
1156 c->mfc_flags |= MFC_STATIC; 1201 c->mfc_flags |= MFC_STATIC;
1157 1202
1158 list_add_rcu(&c->list, &mrt->mfc_cache_array[line]); 1203 ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->mnode,
1159 1204 ipmr_rht_params);
1205 if (ret) {
1206 pr_err("ipmr: rhtable insert error %d\n", ret);
1207 ipmr_cache_free(c);
1208 return ret;
1209 }
1210 list_add_tail_rcu(&c->list, &mrt->mfc_cache_list);
1160 /* Check to see if we resolved a queued list. If so we 1211 /* Check to see if we resolved a queued list. If so we
1161 * need to send on the frames and tidy up. 1212 * need to send on the frames and tidy up.
1162 */ 1213 */
@@ -1186,9 +1237,9 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1186/* Close the multicast socket, and clear the vif tables etc */ 1237/* Close the multicast socket, and clear the vif tables etc */
1187static void mroute_clean_tables(struct mr_table *mrt, bool all) 1238static void mroute_clean_tables(struct mr_table *mrt, bool all)
1188{ 1239{
1189 int i; 1240 struct mfc_cache *c, *tmp;
1190 LIST_HEAD(list); 1241 LIST_HEAD(list);
1191 struct mfc_cache *c, *next; 1242 int i;
1192 1243
1193 /* Shut down all active vif entries */ 1244 /* Shut down all active vif entries */
1194 for (i = 0; i < mrt->maxvif; i++) { 1245 for (i = 0; i < mrt->maxvif; i++) {
@@ -1199,19 +1250,18 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
1199 unregister_netdevice_many(&list); 1250 unregister_netdevice_many(&list);
1200 1251
1201 /* Wipe the cache */ 1252 /* Wipe the cache */
1202 for (i = 0; i < MFC_LINES; i++) { 1253 list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
1203 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { 1254 if (!all && (c->mfc_flags & MFC_STATIC))
1204 if (!all && (c->mfc_flags & MFC_STATIC)) 1255 continue;
1205 continue; 1256 rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
1206 list_del_rcu(&c->list); 1257 list_del_rcu(&c->list);
1207 mroute_netlink_event(mrt, c, RTM_DELROUTE); 1258 mroute_netlink_event(mrt, c, RTM_DELROUTE);
1208 ipmr_cache_free(c); 1259 ipmr_cache_free(c);
1209 }
1210 } 1260 }
1211 1261
1212 if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { 1262 if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
1213 spin_lock_bh(&mfc_unres_lock); 1263 spin_lock_bh(&mfc_unres_lock);
1214 list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { 1264 list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
1215 list_del(&c->list); 1265 list_del(&c->list);
1216 mroute_netlink_event(mrt, c, RTM_DELROUTE); 1266 mroute_netlink_event(mrt, c, RTM_DELROUTE);
1217 ipmr_destroy_unres(mrt, c); 1267 ipmr_destroy_unres(mrt, c);
@@ -1228,7 +1278,7 @@ static void mrtsock_destruct(struct sock *sk)
1228 struct net *net = sock_net(sk); 1278 struct net *net = sock_net(sk);
1229 struct mr_table *mrt; 1279 struct mr_table *mrt;
1230 1280
1231 rtnl_lock(); 1281 ASSERT_RTNL();
1232 ipmr_for_each_table(mrt, net) { 1282 ipmr_for_each_table(mrt, net) {
1233 if (sk == rtnl_dereference(mrt->mroute_sk)) { 1283 if (sk == rtnl_dereference(mrt->mroute_sk)) {
1234 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; 1284 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
@@ -1239,7 +1289,6 @@ static void mrtsock_destruct(struct sock *sk)
1239 mroute_clean_tables(mrt, false); 1289 mroute_clean_tables(mrt, false);
1240 } 1290 }
1241 } 1291 }
1242 rtnl_unlock();
1243} 1292}
1244 1293
1245/* Socket options and virtual interface manipulation. The whole 1294/* Socket options and virtual interface manipulation. The whole
@@ -1303,13 +1352,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
1303 if (sk != rcu_access_pointer(mrt->mroute_sk)) { 1352 if (sk != rcu_access_pointer(mrt->mroute_sk)) {
1304 ret = -EACCES; 1353 ret = -EACCES;
1305 } else { 1354 } else {
1306 /* We need to unlock here because mrtsock_destruct takes
1307 * care of rtnl itself and we can't change that due to
1308 * the IP_ROUTER_ALERT setsockopt which runs without it.
1309 */
1310 rtnl_unlock();
1311 ret = ip_ra_control(sk, 0, NULL); 1355 ret = ip_ra_control(sk, 0, NULL);
1312 goto out; 1356 goto out_unlock;
1313 } 1357 }
1314 break; 1358 break;
1315 case MRT_ADD_VIF: 1359 case MRT_ADD_VIF:
@@ -1420,7 +1464,6 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
1420 } 1464 }
1421out_unlock: 1465out_unlock:
1422 rtnl_unlock(); 1466 rtnl_unlock();
1423out:
1424 return ret; 1467 return ret;
1425} 1468}
1426 1469
@@ -1786,9 +1829,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
1786 struct sk_buff *skb, struct mfc_cache *cache, 1829 struct sk_buff *skb, struct mfc_cache *cache,
1787 int local) 1830 int local)
1788{ 1831{
1832 int true_vifi = ipmr_find_vif(mrt, skb->dev);
1789 int psend = -1; 1833 int psend = -1;
1790 int vif, ct; 1834 int vif, ct;
1791 int true_vifi = ipmr_find_vif(mrt, skb->dev);
1792 1835
1793 vif = cache->mfc_parent; 1836 vif = cache->mfc_parent;
1794 cache->mfc_un.res.pkt++; 1837 cache->mfc_un.res.pkt++;
@@ -1809,6 +1852,12 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
1809 1852
1810 /* Wrong interface: drop packet and (maybe) send PIM assert. */ 1853 /* Wrong interface: drop packet and (maybe) send PIM assert. */
1811 if (mrt->vif_table[vif].dev != skb->dev) { 1854 if (mrt->vif_table[vif].dev != skb->dev) {
1855 struct net_device *mdev;
1856
1857 mdev = l3mdev_master_dev_rcu(mrt->vif_table[vif].dev);
1858 if (mdev == skb->dev)
1859 goto forward;
1860
1812 if (rt_is_output_route(skb_rtable(skb))) { 1861 if (rt_is_output_route(skb_rtable(skb))) {
1813 /* It is our own packet, looped back. 1862 /* It is our own packet, looped back.
1814 * Very complicated situation... 1863 * Very complicated situation...
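The added lines teach the wrong-interface check about L3 master devices: when the expected VIF is enslaved to a VRF, the packet arrives with skb->dev set to the VRF device rather than the VIF itself, and that should not count as a wrong-interface event. A sketch of the check, assuming (as in the diff) that l3mdev_master_dev_rcu() returns the enslaving master or NULL and that the caller holds the RCU read lock:

    #include <net/l3mdev.h>

    /* true if skb really arrived on the wrong interface for this VIF */
    static bool example_wrong_iif(const struct net_device *vif_dev,
                                  const struct sk_buff *skb)
    {
        if (vif_dev == skb->dev)
            return false;
        if (l3mdev_master_dev_rcu(vif_dev) == skb->dev)
            return false;           /* delivered via the VIF's VRF master */
        return true;                /* drop and possibly send a PIM assert */
    }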
@@ -2053,7 +2102,7 @@ static int pim_rcv(struct sk_buff *skb)
2053 goto drop; 2102 goto drop;
2054 2103
2055 pim = (struct pimreghdr *)skb_transport_header(skb); 2104 pim = (struct pimreghdr *)skb_transport_header(skb);
2056 if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) || 2105 if (pim->type != ((PIM_VERSION << 4) | (PIM_TYPE_REGISTER)) ||
2057 (pim->flags & PIM_NULL_REGISTER) || 2106 (pim->flags & PIM_NULL_REGISTER) ||
2058 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && 2107 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
2059 csum_fold(skb_checksum(skb, 0, skb->len, 0)))) 2108 csum_fold(skb_checksum(skb, 0, skb->len, 0))))
@@ -2080,8 +2129,10 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2080 int ct; 2129 int ct;
2081 2130
2082 /* If cache is unresolved, don't try to parse IIF and OIF */ 2131 /* If cache is unresolved, don't try to parse IIF and OIF */
2083 if (c->mfc_parent >= MAXVIFS) 2132 if (c->mfc_parent >= MAXVIFS) {
2133 rtm->rtm_flags |= RTNH_F_UNRESOLVED;
2084 return -ENOENT; 2134 return -ENOENT;
2135 }
2085 2136
2086 if (VIF_EXISTS(mrt, c->mfc_parent) && 2137 if (VIF_EXISTS(mrt, c->mfc_parent) &&
2087 nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) 2138 nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
@@ -2123,7 +2174,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2123 2174
2124int ipmr_get_route(struct net *net, struct sk_buff *skb, 2175int ipmr_get_route(struct net *net, struct sk_buff *skb,
2125 __be32 saddr, __be32 daddr, 2176 __be32 saddr, __be32 daddr,
2126 struct rtmsg *rtm, int nowait, u32 portid) 2177 struct rtmsg *rtm, u32 portid)
2127{ 2178{
2128 struct mfc_cache *cache; 2179 struct mfc_cache *cache;
2129 struct mr_table *mrt; 2180 struct mr_table *mrt;
@@ -2147,11 +2198,6 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
2147 struct net_device *dev; 2198 struct net_device *dev;
2148 int vif = -1; 2199 int vif = -1;
2149 2200
2150 if (nowait) {
2151 rcu_read_unlock();
2152 return -EAGAIN;
2153 }
2154
2155 dev = skb->dev; 2201 dev = skb->dev;
2156 read_lock(&mrt_lock); 2202 read_lock(&mrt_lock);
2157 if (dev) 2203 if (dev)
@@ -2285,34 +2331,30 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2285 struct mr_table *mrt; 2331 struct mr_table *mrt;
2286 struct mfc_cache *mfc; 2332 struct mfc_cache *mfc;
2287 unsigned int t = 0, s_t; 2333 unsigned int t = 0, s_t;
2288 unsigned int h = 0, s_h;
2289 unsigned int e = 0, s_e; 2334 unsigned int e = 0, s_e;
2290 2335
2291 s_t = cb->args[0]; 2336 s_t = cb->args[0];
2292 s_h = cb->args[1]; 2337 s_e = cb->args[1];
2293 s_e = cb->args[2];
2294 2338
2295 rcu_read_lock(); 2339 rcu_read_lock();
2296 ipmr_for_each_table(mrt, net) { 2340 ipmr_for_each_table(mrt, net) {
2297 if (t < s_t) 2341 if (t < s_t)
2298 goto next_table; 2342 goto next_table;
2299 if (t > s_t) 2343 list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
2300 s_h = 0; 2344 if (e < s_e)
2301 for (h = s_h; h < MFC_LINES; h++) { 2345 goto next_entry;
2302 list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) { 2346 if (ipmr_fill_mroute(mrt, skb,
2303 if (e < s_e) 2347 NETLINK_CB(cb->skb).portid,
2304 goto next_entry; 2348 cb->nlh->nlmsg_seq,
2305 if (ipmr_fill_mroute(mrt, skb, 2349 mfc, RTM_NEWROUTE,
2306 NETLINK_CB(cb->skb).portid, 2350 NLM_F_MULTI) < 0)
2307 cb->nlh->nlmsg_seq, 2351 goto done;
2308 mfc, RTM_NEWROUTE,
2309 NLM_F_MULTI) < 0)
2310 goto done;
2311next_entry: 2352next_entry:
2312 e++; 2353 e++;
2313 }
2314 e = s_e = 0;
2315 } 2354 }
2355 e = 0;
2356 s_e = 0;
2357
2316 spin_lock_bh(&mfc_unres_lock); 2358 spin_lock_bh(&mfc_unres_lock);
2317 list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { 2359 list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
2318 if (e < s_e) 2360 if (e < s_e)
@@ -2329,16 +2371,15 @@ next_entry2:
2329 e++; 2371 e++;
2330 } 2372 }
2331 spin_unlock_bh(&mfc_unres_lock); 2373 spin_unlock_bh(&mfc_unres_lock);
2332 e = s_e = 0; 2374 e = 0;
2333 s_h = 0; 2375 s_e = 0;
2334next_table: 2376next_table:
2335 t++; 2377 t++;
2336 } 2378 }
2337done: 2379done:
2338 rcu_read_unlock(); 2380 rcu_read_unlock();
2339 2381
2340 cb->args[2] = e; 2382 cb->args[1] = e;
2341 cb->args[1] = h;
2342 cb->args[0] = t; 2383 cb->args[0] = t;
2343 2384
2344 return skb->len; 2385 return skb->len;
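With the hash buckets gone, the dump callback only needs to remember which table and which entry it stopped at, so cb->args shrinks from (table, bucket, entry) to (table, entry). A sketch of the resume logic mirrored from the hunk above, with hypothetical table/entry types:

    static int example_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
    {
        unsigned int t = 0, s_t = cb->args[0];   /* table to resume at */
        unsigned int e = 0, s_e = cb->args[1];   /* entry within that table */
        struct example_table *tbl;
        struct example_entry *ent;

        rcu_read_lock();
        list_for_each_entry_rcu(tbl, &example_tables, list) {
            if (t < s_t)
                goto next_table;
            list_for_each_entry_rcu(ent, &tbl->entries, list) {
                if (e < s_e)
                    goto next_entry;
                if (example_fill(skb, cb, ent) < 0)
                    goto done;              /* skb full: stop, keep position */
    next_entry:
                e++;
            }
            e = 0;
            s_e = 0;                         /* later tables start from entry 0 */
    next_table:
            t++;
        }
    done:
        rcu_read_unlock();
        cb->args[1] = e;
        cb->args[0] = t;
        return skb->len;
    }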
@@ -2548,7 +2589,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
2548 const char *name = vif->dev ? vif->dev->name : "none"; 2589 const char *name = vif->dev ? vif->dev->name : "none";
2549 2590
2550 seq_printf(seq, 2591 seq_printf(seq,
2551 "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", 2592 "%2zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
2552 vif - mrt->vif_table, 2593 vif - mrt->vif_table,
2553 name, vif->bytes_in, vif->pkt_in, 2594 name, vif->bytes_in, vif->pkt_in,
2554 vif->bytes_out, vif->pkt_out, 2595 vif->bytes_out, vif->pkt_out,
@@ -2582,10 +2623,8 @@ struct ipmr_mfc_iter {
2582 struct seq_net_private p; 2623 struct seq_net_private p;
2583 struct mr_table *mrt; 2624 struct mr_table *mrt;
2584 struct list_head *cache; 2625 struct list_head *cache;
2585 int ct;
2586}; 2626};
2587 2627
2588
2589static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net, 2628static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
2590 struct ipmr_mfc_iter *it, loff_t pos) 2629 struct ipmr_mfc_iter *it, loff_t pos)
2591{ 2630{
@@ -2593,12 +2632,10 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
2593 struct mfc_cache *mfc; 2632 struct mfc_cache *mfc;
2594 2633
2595 rcu_read_lock(); 2634 rcu_read_lock();
2596 for (it->ct = 0; it->ct < MFC_LINES; it->ct++) { 2635 it->cache = &mrt->mfc_cache_list;
2597 it->cache = &mrt->mfc_cache_array[it->ct]; 2636 list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
2598 list_for_each_entry_rcu(mfc, it->cache, list) 2637 if (pos-- == 0)
2599 if (pos-- == 0) 2638 return mfc;
2600 return mfc;
2601 }
2602 rcu_read_unlock(); 2639 rcu_read_unlock();
2603 2640
2604 spin_lock_bh(&mfc_unres_lock); 2641 spin_lock_bh(&mfc_unres_lock);
@@ -2625,17 +2662,16 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
2625 2662
2626 it->mrt = mrt; 2663 it->mrt = mrt;
2627 it->cache = NULL; 2664 it->cache = NULL;
2628 it->ct = 0;
2629 return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1) 2665 return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
2630 : SEQ_START_TOKEN; 2666 : SEQ_START_TOKEN;
2631} 2667}
2632 2668
2633static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2669static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2634{ 2670{
2635 struct mfc_cache *mfc = v;
2636 struct ipmr_mfc_iter *it = seq->private; 2671 struct ipmr_mfc_iter *it = seq->private;
2637 struct net *net = seq_file_net(seq); 2672 struct net *net = seq_file_net(seq);
2638 struct mr_table *mrt = it->mrt; 2673 struct mr_table *mrt = it->mrt;
2674 struct mfc_cache *mfc = v;
2639 2675
2640 ++*pos; 2676 ++*pos;
2641 2677
@@ -2648,19 +2684,9 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2648 if (it->cache == &mrt->mfc_unres_queue) 2684 if (it->cache == &mrt->mfc_unres_queue)
2649 goto end_of_list; 2685 goto end_of_list;
2650 2686
2651 BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
2652
2653 while (++it->ct < MFC_LINES) {
2654 it->cache = &mrt->mfc_cache_array[it->ct];
2655 if (list_empty(it->cache))
2656 continue;
2657 return list_first_entry(it->cache, struct mfc_cache, list);
2658 }
2659
2660 /* exhausted cache_array, show unresolved */ 2687 /* exhausted cache_array, show unresolved */
2661 rcu_read_unlock(); 2688 rcu_read_unlock();
2662 it->cache = &mrt->mfc_unres_queue; 2689 it->cache = &mrt->mfc_unres_queue;
2663 it->ct = 0;
2664 2690
2665 spin_lock_bh(&mfc_unres_lock); 2691 spin_lock_bh(&mfc_unres_lock);
2666 if (!list_empty(it->cache)) 2692 if (!list_empty(it->cache))
@@ -2680,7 +2706,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
2680 2706
2681 if (it->cache == &mrt->mfc_unres_queue) 2707 if (it->cache == &mrt->mfc_unres_queue)
2682 spin_unlock_bh(&mfc_unres_lock); 2708 spin_unlock_bh(&mfc_unres_lock);
2683 else if (it->cache == &mrt->mfc_cache_array[it->ct]) 2709 else if (it->cache == &mrt->mfc_cache_list)
2684 rcu_read_unlock(); 2710 rcu_read_unlock();
2685} 2711}
2686 2712
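The /proc iterator loses its bucket counter (it->ct) for the same reason: all resolved entries live on one RCU list, with the unresolved queue still protected by mfc_unres_lock, and ->stop releases whichever lock ->start/->next left held, keyed off it->cache. A compressed sketch of that walk, with hypothetical names:

    static void *example_seq_idx(struct example_iter *it, loff_t pos)
    {
        struct example_entry *ent;

        rcu_read_lock();
        it->cache = &it->mrt->resolved_list;
        list_for_each_entry_rcu(ent, it->cache, list)
            if (pos-- == 0)
                return ent;                  /* leave the RCU read lock held */
        rcu_read_unlock();

        spin_lock_bh(&example_unres_lock);
        it->cache = &it->mrt->unres_queue;
        list_for_each_entry(ent, it->cache, list)
            if (pos-- == 0)
                return ent;                  /* leave the spinlock held */
        spin_unlock_bh(&example_unres_lock);

        it->cache = NULL;
        return NULL;
    }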
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index b3cc1335adbc..c0cc6aa8cfaa 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -23,7 +23,8 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
23 struct rtable *rt; 23 struct rtable *rt;
24 struct flowi4 fl4 = {}; 24 struct flowi4 fl4 = {};
25 __be32 saddr = iph->saddr; 25 __be32 saddr = iph->saddr;
26 __u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; 26 const struct sock *sk = skb_to_full_sk(skb);
27 __u8 flags = sk ? inet_sk_flowi_flags(sk) : 0;
27 struct net_device *dev = skb_dst(skb)->dev; 28 struct net_device *dev = skb_dst(skb)->dev;
28 unsigned int hh_len; 29 unsigned int hh_len;
29 30
@@ -40,7 +41,7 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
40 fl4.daddr = iph->daddr; 41 fl4.daddr = iph->daddr;
41 fl4.saddr = saddr; 42 fl4.saddr = saddr;
42 fl4.flowi4_tos = RT_TOS(iph->tos); 43 fl4.flowi4_tos = RT_TOS(iph->tos);
43 fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; 44 fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
44 if (!fl4.flowi4_oif) 45 if (!fl4.flowi4_oif)
45 fl4.flowi4_oif = l3mdev_master_ifindex(dev); 46 fl4.flowi4_oif = l3mdev_master_ifindex(dev);
46 fl4.flowi4_mark = skb->mark; 47 fl4.flowi4_mark = skb->mark;
@@ -61,7 +62,7 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
61 xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) { 62 xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
62 struct dst_entry *dst = skb_dst(skb); 63 struct dst_entry *dst = skb_dst(skb);
63 skb_dst_set(skb, NULL); 64 skb_dst_set(skb, NULL);
64 dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0); 65 dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), sk, 0);
65 if (IS_ERR(dst)) 66 if (IS_ERR(dst))
66 return PTR_ERR(dst); 67 return PTR_ERR(dst);
67 skb_dst_set(skb, dst); 68 skb_dst_set(skb, dst);
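ip_route_me_harder() previously read flow hints straight from skb->sk, which may be a request or timewait minisocket without valid sk_bound_dev_if or flowi flags; skb_to_full_sk() resolves such a socket to the underlying full socket (the listener for a SYN_RECV request) or NULL. A minimal sketch of the pattern, assuming skb_to_full_sk()/inet_sk_flowi_flags() from <net/inet_sock.h>:

    #include <net/inet_sock.h>
    #include <net/flow.h>

    static void example_fill_flow(const struct sk_buff *skb, struct flowi4 *fl4)
    {
        const struct sock *sk = skb_to_full_sk(skb);   /* may be NULL */

        fl4->flowi4_flags = sk ? inet_sk_flowi_flags(sk) : 0;
        fl4->flowi4_oif   = sk ? sk->sk_bound_dev_if : 0;
    }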
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index d613309e3e5d..c11eb1744ab1 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -25,6 +25,12 @@ config NF_CONNTRACK_IPV4
25 25
26 To compile it as a module, choose M here. If unsure, say N. 26 To compile it as a module, choose M here. If unsure, say N.
27 27
28config NF_SOCKET_IPV4
29 tristate "IPv4 socket lookup support"
30 help
 31 This option enables the IPv4 socket lookup infrastructure. This
32 is required by the iptables socket match.
33
28if NF_TABLES 34if NF_TABLES
29 35
30config NF_TABLES_IPV4 36config NF_TABLES_IPV4
@@ -54,6 +60,14 @@ config NFT_DUP_IPV4
54 help 60 help
55 This module enables IPv4 packet duplication support for nf_tables. 61 This module enables IPv4 packet duplication support for nf_tables.
56 62
63config NFT_FIB_IPV4
64 select NFT_FIB
65 tristate "nf_tables fib / ip route lookup support"
66 help
67 This module enables IPv4 FIB lookups, e.g. for reverse path filtering.
 68 It also allows querying the FIB for the route type, e.g. local, unicast,
69 multicast or blackhole.
70
57endif # NF_TABLES_IPV4 71endif # NF_TABLES_IPV4
58 72
59config NF_TABLES_ARP 73config NF_TABLES_ARP
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 853328f8fd05..f462fee66ac8 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -14,6 +14,8 @@ obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
14# defrag 14# defrag
15obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o 15obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
16 16
17obj-$(CONFIG_NF_SOCKET_IPV4) += nf_socket_ipv4.o
18
17# logging 19# logging
18obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o 20obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o
19obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o 21obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o
@@ -34,6 +36,7 @@ obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o
34obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o 36obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
35obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o 37obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
36obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o 38obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
39obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o
37obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o 40obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o
38obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o 41obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o
39obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o 42obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 697538464e6e..6241a81fd7f5 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -24,7 +24,7 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <net/compat.h> 25#include <net/compat.h>
26#include <net/sock.h> 26#include <net/sock.h>
27#include <asm/uaccess.h> 27#include <linux/uaccess.h>
28 28
29#include <linux/netfilter/x_tables.h> 29#include <linux/netfilter/x_tables.h>
30#include <linux/netfilter_arp/arp_tables.h> 30#include <linux/netfilter_arp/arp_tables.h>
@@ -217,11 +217,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
217 */ 217 */
218 e = get_entry(table_base, private->hook_entry[hook]); 218 e = get_entry(table_base, private->hook_entry[hook]);
219 219
220 acpar.net = state->net; 220 acpar.state = state;
221 acpar.in = state->in;
222 acpar.out = state->out;
223 acpar.hooknum = hook;
224 acpar.family = NFPROTO_ARP;
225 acpar.hotdrop = false; 221 acpar.hotdrop = false;
226 222
227 arp = arp_hdr(skb); 223 arp = arp_hdr(skb);
@@ -415,17 +411,15 @@ static inline int check_target(struct arpt_entry *e, const char *name)
415} 411}
416 412
417static inline int 413static inline int
418find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) 414find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
415 struct xt_percpu_counter_alloc_state *alloc_state)
419{ 416{
420 struct xt_entry_target *t; 417 struct xt_entry_target *t;
421 struct xt_target *target; 418 struct xt_target *target;
422 unsigned long pcnt;
423 int ret; 419 int ret;
424 420
425 pcnt = xt_percpu_counter_alloc(); 421 if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
426 if (IS_ERR_VALUE(pcnt))
427 return -ENOMEM; 422 return -ENOMEM;
428 e->counters.pcnt = pcnt;
429 423
430 t = arpt_get_target(e); 424 t = arpt_get_target(e);
431 target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, 425 target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
@@ -443,7 +437,7 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
443err: 437err:
444 module_put(t->u.kernel.target->me); 438 module_put(t->u.kernel.target->me);
445out: 439out:
446 xt_percpu_counter_free(e->counters.pcnt); 440 xt_percpu_counter_free(&e->counters);
447 441
448 return ret; 442 return ret;
449} 443}
@@ -523,7 +517,7 @@ static inline void cleanup_entry(struct arpt_entry *e)
523 if (par.target->destroy != NULL) 517 if (par.target->destroy != NULL)
524 par.target->destroy(&par); 518 par.target->destroy(&par);
525 module_put(par.target->me); 519 module_put(par.target->me);
526 xt_percpu_counter_free(e->counters.pcnt); 520 xt_percpu_counter_free(&e->counters);
527} 521}
528 522
529/* Checks and translates the user-supplied table segment (held in 523/* Checks and translates the user-supplied table segment (held in
@@ -532,6 +526,7 @@ static inline void cleanup_entry(struct arpt_entry *e)
532static int translate_table(struct xt_table_info *newinfo, void *entry0, 526static int translate_table(struct xt_table_info *newinfo, void *entry0,
533 const struct arpt_replace *repl) 527 const struct arpt_replace *repl)
534{ 528{
529 struct xt_percpu_counter_alloc_state alloc_state = { 0 };
535 struct arpt_entry *iter; 530 struct arpt_entry *iter;
536 unsigned int *offsets; 531 unsigned int *offsets;
537 unsigned int i; 532 unsigned int i;
@@ -594,7 +589,8 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
594 /* Finally, each sanity check must pass */ 589 /* Finally, each sanity check must pass */
595 i = 0; 590 i = 0;
596 xt_entry_foreach(iter, entry0, newinfo->size) { 591 xt_entry_foreach(iter, entry0, newinfo->size) {
597 ret = find_check_entry(iter, repl->name, repl->size); 592 ret = find_check_entry(iter, repl->name, repl->size,
593 &alloc_state);
598 if (ret != 0) 594 if (ret != 0)
599 break; 595 break;
600 ++i; 596 ++i;
@@ -681,11 +677,6 @@ static int copy_entries_to_user(unsigned int total_size,
681 return PTR_ERR(counters); 677 return PTR_ERR(counters);
682 678
683 loc_cpu_entry = private->entries; 679 loc_cpu_entry = private->entries;
684 /* ... then copy entire thing ... */
685 if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
686 ret = -EFAULT;
687 goto free_counters;
688 }
689 680
690 /* FIXME: use iterator macros --RR */ 681 /* FIXME: use iterator macros --RR */
691 /* ... then go back and fix counters and names */ 682 /* ... then go back and fix counters and names */
@@ -693,6 +684,10 @@ static int copy_entries_to_user(unsigned int total_size,
693 const struct xt_entry_target *t; 684 const struct xt_entry_target *t;
694 685
695 e = (struct arpt_entry *)(loc_cpu_entry + off); 686 e = (struct arpt_entry *)(loc_cpu_entry + off);
687 if (copy_to_user(userptr + off, e, sizeof(*e))) {
688 ret = -EFAULT;
689 goto free_counters;
690 }
696 if (copy_to_user(userptr + off 691 if (copy_to_user(userptr + off
697 + offsetof(struct arpt_entry, counters), 692 + offsetof(struct arpt_entry, counters),
698 &counters[num], 693 &counters[num],
@@ -702,11 +697,7 @@ static int copy_entries_to_user(unsigned int total_size,
702 } 697 }
703 698
704 t = arpt_get_target_c(e); 699 t = arpt_get_target_c(e);
705 if (copy_to_user(userptr + off + e->target_offset 700 if (xt_target_to_user(t, userptr + off + e->target_offset)) {
706 + offsetof(struct xt_entry_target,
707 u.user.name),
708 t->u.kernel.target->name,
709 strlen(t->u.kernel.target->name)+1) != 0) {
710 ret = -EFAULT; 701 ret = -EFAULT;
711 goto free_counters; 702 goto free_counters;
712 } 703 }
@@ -809,7 +800,7 @@ static int get_info(struct net *net, void __user *user,
809#endif 800#endif
810 t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), 801 t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
811 "arptable_%s", name); 802 "arptable_%s", name);
812 if (!IS_ERR_OR_NULL(t)) { 803 if (t) {
813 struct arpt_getinfo info; 804 struct arpt_getinfo info;
814 const struct xt_table_info *private = t->private; 805 const struct xt_table_info *private = t->private;
815#ifdef CONFIG_COMPAT 806#ifdef CONFIG_COMPAT
@@ -838,7 +829,7 @@ static int get_info(struct net *net, void __user *user,
838 xt_table_unlock(t); 829 xt_table_unlock(t);
839 module_put(t->me); 830 module_put(t->me);
840 } else 831 } else
841 ret = t ? PTR_ERR(t) : -ENOENT; 832 ret = -ENOENT;
842#ifdef CONFIG_COMPAT 833#ifdef CONFIG_COMPAT
843 if (compat) 834 if (compat)
844 xt_compat_unlock(NFPROTO_ARP); 835 xt_compat_unlock(NFPROTO_ARP);
@@ -863,7 +854,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
863 get.name[sizeof(get.name) - 1] = '\0'; 854 get.name[sizeof(get.name) - 1] = '\0';
864 855
865 t = xt_find_table_lock(net, NFPROTO_ARP, get.name); 856 t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
866 if (!IS_ERR_OR_NULL(t)) { 857 if (t) {
867 const struct xt_table_info *private = t->private; 858 const struct xt_table_info *private = t->private;
868 859
869 if (get.size == private->size) 860 if (get.size == private->size)
@@ -875,7 +866,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
875 module_put(t->me); 866 module_put(t->me);
876 xt_table_unlock(t); 867 xt_table_unlock(t);
877 } else 868 } else
878 ret = t ? PTR_ERR(t) : -ENOENT; 869 ret = -ENOENT;
879 870
880 return ret; 871 return ret;
881} 872}
@@ -902,8 +893,8 @@ static int __do_replace(struct net *net, const char *name,
902 893
903 t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), 894 t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
904 "arptable_%s", name); 895 "arptable_%s", name);
905 if (IS_ERR_OR_NULL(t)) { 896 if (!t) {
906 ret = t ? PTR_ERR(t) : -ENOENT; 897 ret = -ENOENT;
907 goto free_newinfo_counters_untrans; 898 goto free_newinfo_counters_untrans;
908 } 899 }
909 900
@@ -1018,8 +1009,8 @@ static int do_add_counters(struct net *net, const void __user *user,
1018 return PTR_ERR(paddc); 1009 return PTR_ERR(paddc);
1019 1010
1020 t = xt_find_table_lock(net, NFPROTO_ARP, tmp.name); 1011 t = xt_find_table_lock(net, NFPROTO_ARP, tmp.name);
1021 if (IS_ERR_OR_NULL(t)) { 1012 if (!t) {
1022 ret = t ? PTR_ERR(t) : -ENOENT; 1013 ret = -ENOENT;
1023 goto free; 1014 goto free;
1024 } 1015 }
1025 1016
@@ -1408,7 +1399,7 @@ static int compat_get_entries(struct net *net,
1408 1399
1409 xt_compat_lock(NFPROTO_ARP); 1400 xt_compat_lock(NFPROTO_ARP);
1410 t = xt_find_table_lock(net, NFPROTO_ARP, get.name); 1401 t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
1411 if (!IS_ERR_OR_NULL(t)) { 1402 if (t) {
1412 const struct xt_table_info *private = t->private; 1403 const struct xt_table_info *private = t->private;
1413 struct xt_table_info info; 1404 struct xt_table_info info;
1414 1405
@@ -1423,7 +1414,7 @@ static int compat_get_entries(struct net *net,
1423 module_put(t->me); 1414 module_put(t->me);
1424 xt_table_unlock(t); 1415 xt_table_unlock(t);
1425 } else 1416 } else
1426 ret = t ? PTR_ERR(t) : -ENOENT; 1417 ret = -ENOENT;
1427 1418
1428 xt_compat_unlock(NFPROTO_ARP); 1419 xt_compat_unlock(NFPROTO_ARP);
1429 return ret; 1420 return ret;
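Instead of asking the percpu allocator for a small counter once per rule, find_check_entry() now threads an xt_percpu_counter_alloc_state through the translate pass so the x_tables core can hand out counter slots from larger percpu chunks. A sketch of how the state is shared across the loop, using the signatures visible in the hunks above:

    #include <linux/netfilter/x_tables.h>
    #include <linux/netfilter_arp/arp_tables.h>

    static int example_check_entries(void *entry0, unsigned int size)
    {
        struct xt_percpu_counter_alloc_state alloc_state = { 0 };
        struct arpt_entry *iter;

        xt_entry_foreach(iter, entry0, size) {
            if (!xt_percpu_counter_alloc(&alloc_state, &iter->counters))
                return -ENOMEM;
            /* per-entry target checks go here; on failure the caller
             * unwinds each entry with xt_percpu_counter_free(&iter->counters) */
        }
        return 0;
    }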
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 7c00ce90adb8..384b85713e06 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -20,7 +20,7 @@
20#include <linux/icmp.h> 20#include <linux/icmp.h>
21#include <net/ip.h> 21#include <net/ip.h>
22#include <net/compat.h> 22#include <net/compat.h>
23#include <asm/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/mutex.h> 24#include <linux/mutex.h>
25#include <linux/proc_fs.h> 25#include <linux/proc_fs.h>
26#include <linux/err.h> 26#include <linux/err.h>
@@ -261,11 +261,7 @@ ipt_do_table(struct sk_buff *skb,
261 acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET; 261 acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
262 acpar.thoff = ip_hdrlen(skb); 262 acpar.thoff = ip_hdrlen(skb);
263 acpar.hotdrop = false; 263 acpar.hotdrop = false;
264 acpar.net = state->net; 264 acpar.state = state;
265 acpar.in = state->in;
266 acpar.out = state->out;
267 acpar.family = NFPROTO_IPV4;
268 acpar.hooknum = hook;
269 265
270 IP_NF_ASSERT(table->valid_hooks & (1 << hook)); 266 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
271 local_bh_disable(); 267 local_bh_disable();
@@ -535,7 +531,8 @@ static int check_target(struct ipt_entry *e, struct net *net, const char *name)
535 531
536static int 532static int
537find_check_entry(struct ipt_entry *e, struct net *net, const char *name, 533find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
538 unsigned int size) 534 unsigned int size,
535 struct xt_percpu_counter_alloc_state *alloc_state)
539{ 536{
540 struct xt_entry_target *t; 537 struct xt_entry_target *t;
541 struct xt_target *target; 538 struct xt_target *target;
@@ -543,12 +540,9 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
543 unsigned int j; 540 unsigned int j;
544 struct xt_mtchk_param mtpar; 541 struct xt_mtchk_param mtpar;
545 struct xt_entry_match *ematch; 542 struct xt_entry_match *ematch;
546 unsigned long pcnt;
547 543
548 pcnt = xt_percpu_counter_alloc(); 544 if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
549 if (IS_ERR_VALUE(pcnt))
550 return -ENOMEM; 545 return -ENOMEM;
551 e->counters.pcnt = pcnt;
552 546
553 j = 0; 547 j = 0;
554 mtpar.net = net; 548 mtpar.net = net;
@@ -586,7 +580,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
586 cleanup_match(ematch, net); 580 cleanup_match(ematch, net);
587 } 581 }
588 582
589 xt_percpu_counter_free(e->counters.pcnt); 583 xt_percpu_counter_free(&e->counters);
590 584
591 return ret; 585 return ret;
592} 586}
@@ -674,7 +668,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net)
674 if (par.target->destroy != NULL) 668 if (par.target->destroy != NULL)
675 par.target->destroy(&par); 669 par.target->destroy(&par);
676 module_put(par.target->me); 670 module_put(par.target->me);
677 xt_percpu_counter_free(e->counters.pcnt); 671 xt_percpu_counter_free(&e->counters);
678} 672}
679 673
680/* Checks and translates the user-supplied table segment (held in 674/* Checks and translates the user-supplied table segment (held in
@@ -683,6 +677,7 @@ static int
683translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, 677translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
684 const struct ipt_replace *repl) 678 const struct ipt_replace *repl)
685{ 679{
680 struct xt_percpu_counter_alloc_state alloc_state = { 0 };
686 struct ipt_entry *iter; 681 struct ipt_entry *iter;
687 unsigned int *offsets; 682 unsigned int *offsets;
688 unsigned int i; 683 unsigned int i;
@@ -742,7 +737,8 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
742 /* Finally, each sanity check must pass */ 737 /* Finally, each sanity check must pass */
743 i = 0; 738 i = 0;
744 xt_entry_foreach(iter, entry0, newinfo->size) { 739 xt_entry_foreach(iter, entry0, newinfo->size) {
745 ret = find_check_entry(iter, net, repl->name, repl->size); 740 ret = find_check_entry(iter, net, repl->name, repl->size,
741 &alloc_state);
746 if (ret != 0) 742 if (ret != 0)
747 break; 743 break;
748 ++i; 744 ++i;
@@ -830,10 +826,6 @@ copy_entries_to_user(unsigned int total_size,
830 return PTR_ERR(counters); 826 return PTR_ERR(counters);
831 827
832 loc_cpu_entry = private->entries; 828 loc_cpu_entry = private->entries;
833 if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
834 ret = -EFAULT;
835 goto free_counters;
836 }
837 829
838 /* FIXME: use iterator macros --RR */ 830 /* FIXME: use iterator macros --RR */
839 /* ... then go back and fix counters and names */ 831 /* ... then go back and fix counters and names */
@@ -843,6 +835,10 @@ copy_entries_to_user(unsigned int total_size,
843 const struct xt_entry_target *t; 835 const struct xt_entry_target *t;
844 836
845 e = (struct ipt_entry *)(loc_cpu_entry + off); 837 e = (struct ipt_entry *)(loc_cpu_entry + off);
838 if (copy_to_user(userptr + off, e, sizeof(*e))) {
839 ret = -EFAULT;
840 goto free_counters;
841 }
846 if (copy_to_user(userptr + off 842 if (copy_to_user(userptr + off
847 + offsetof(struct ipt_entry, counters), 843 + offsetof(struct ipt_entry, counters),
848 &counters[num], 844 &counters[num],
@@ -856,23 +852,14 @@ copy_entries_to_user(unsigned int total_size,
856 i += m->u.match_size) { 852 i += m->u.match_size) {
857 m = (void *)e + i; 853 m = (void *)e + i;
858 854
859 if (copy_to_user(userptr + off + i 855 if (xt_match_to_user(m, userptr + off + i)) {
860 + offsetof(struct xt_entry_match,
861 u.user.name),
862 m->u.kernel.match->name,
863 strlen(m->u.kernel.match->name)+1)
864 != 0) {
865 ret = -EFAULT; 856 ret = -EFAULT;
866 goto free_counters; 857 goto free_counters;
867 } 858 }
868 } 859 }
869 860
870 t = ipt_get_target_c(e); 861 t = ipt_get_target_c(e);
871 if (copy_to_user(userptr + off + e->target_offset 862 if (xt_target_to_user(t, userptr + off + e->target_offset)) {
872 + offsetof(struct xt_entry_target,
873 u.user.name),
874 t->u.kernel.target->name,
875 strlen(t->u.kernel.target->name)+1) != 0) {
876 ret = -EFAULT; 863 ret = -EFAULT;
877 goto free_counters; 864 goto free_counters;
878 } 865 }
@@ -977,7 +964,7 @@ static int get_info(struct net *net, void __user *user,
977#endif 964#endif
978 t = try_then_request_module(xt_find_table_lock(net, AF_INET, name), 965 t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
979 "iptable_%s", name); 966 "iptable_%s", name);
980 if (!IS_ERR_OR_NULL(t)) { 967 if (t) {
981 struct ipt_getinfo info; 968 struct ipt_getinfo info;
982 const struct xt_table_info *private = t->private; 969 const struct xt_table_info *private = t->private;
983#ifdef CONFIG_COMPAT 970#ifdef CONFIG_COMPAT
@@ -1007,7 +994,7 @@ static int get_info(struct net *net, void __user *user,
1007 xt_table_unlock(t); 994 xt_table_unlock(t);
1008 module_put(t->me); 995 module_put(t->me);
1009 } else 996 } else
1010 ret = t ? PTR_ERR(t) : -ENOENT; 997 ret = -ENOENT;
1011#ifdef CONFIG_COMPAT 998#ifdef CONFIG_COMPAT
1012 if (compat) 999 if (compat)
1013 xt_compat_unlock(AF_INET); 1000 xt_compat_unlock(AF_INET);
@@ -1032,7 +1019,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
1032 get.name[sizeof(get.name) - 1] = '\0'; 1019 get.name[sizeof(get.name) - 1] = '\0';
1033 1020
1034 t = xt_find_table_lock(net, AF_INET, get.name); 1021 t = xt_find_table_lock(net, AF_INET, get.name);
1035 if (!IS_ERR_OR_NULL(t)) { 1022 if (t) {
1036 const struct xt_table_info *private = t->private; 1023 const struct xt_table_info *private = t->private;
1037 if (get.size == private->size) 1024 if (get.size == private->size)
1038 ret = copy_entries_to_user(private->size, 1025 ret = copy_entries_to_user(private->size,
@@ -1043,7 +1030,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
1043 module_put(t->me); 1030 module_put(t->me);
1044 xt_table_unlock(t); 1031 xt_table_unlock(t);
1045 } else 1032 } else
1046 ret = t ? PTR_ERR(t) : -ENOENT; 1033 ret = -ENOENT;
1047 1034
1048 return ret; 1035 return ret;
1049} 1036}
@@ -1068,8 +1055,8 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
1068 1055
1069 t = try_then_request_module(xt_find_table_lock(net, AF_INET, name), 1056 t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
1070 "iptable_%s", name); 1057 "iptable_%s", name);
1071 if (IS_ERR_OR_NULL(t)) { 1058 if (!t) {
1072 ret = t ? PTR_ERR(t) : -ENOENT; 1059 ret = -ENOENT;
1073 goto free_newinfo_counters_untrans; 1060 goto free_newinfo_counters_untrans;
1074 } 1061 }
1075 1062
@@ -1184,8 +1171,8 @@ do_add_counters(struct net *net, const void __user *user,
1184 return PTR_ERR(paddc); 1171 return PTR_ERR(paddc);
1185 1172
1186 t = xt_find_table_lock(net, AF_INET, tmp.name); 1173 t = xt_find_table_lock(net, AF_INET, tmp.name);
1187 if (IS_ERR_OR_NULL(t)) { 1174 if (!t) {
1188 ret = t ? PTR_ERR(t) : -ENOENT; 1175 ret = -ENOENT;
1189 goto free; 1176 goto free;
1190 } 1177 }
1191 1178
@@ -1630,7 +1617,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
1630 1617
1631 xt_compat_lock(AF_INET); 1618 xt_compat_lock(AF_INET);
1632 t = xt_find_table_lock(net, AF_INET, get.name); 1619 t = xt_find_table_lock(net, AF_INET, get.name);
1633 if (!IS_ERR_OR_NULL(t)) { 1620 if (t) {
1634 const struct xt_table_info *private = t->private; 1621 const struct xt_table_info *private = t->private;
1635 struct xt_table_info info; 1622 struct xt_table_info info;
1636 ret = compat_table_info(private, &info); 1623 ret = compat_table_info(private, &info);
@@ -1644,7 +1631,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
1644 module_put(t->me); 1631 module_put(t->me);
1645 xt_table_unlock(t); 1632 xt_table_unlock(t);
1646 } else 1633 } else
1647 ret = t ? PTR_ERR(t) : -ENOENT; 1634 ret = -ENOENT;
1648 1635
1649 xt_compat_unlock(AF_INET); 1636 xt_compat_unlock(AF_INET);
1650 return ret; 1637 return ret;
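copy_entries_to_user() used to copy the whole rule blob to userspace in one shot and then patch up names and counters, which also exposed the kernel-internal union members (live match/target pointers). The replacement copies each entry header separately and emits matches and targets through xt_match_to_user()/xt_target_to_user(), which are assumed here to write only the user-visible name, revision and data. A sketch of the per-entry copy-out:

    #include <linux/uaccess.h>
    #include <linux/netfilter_ipv4/ip_tables.h>

    static int example_copy_entry(const struct ipt_entry *e,
                                  void __user *userptr, unsigned int off)
    {
        const struct xt_entry_target *t = (const void *)e + e->target_offset;

        if (copy_to_user(userptr + off, e, sizeof(*e)))
            return -EFAULT;             /* entry header only, not the whole blob */
        if (xt_target_to_user(t, userptr + off + e->target_offset))
            return -EFAULT;             /* sanitized target representation */
        return 0;
    }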
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 4a9e6db9df8d..9b8841316e7b 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -62,7 +62,7 @@ struct clusterip_config {
62static const struct file_operations clusterip_proc_fops; 62static const struct file_operations clusterip_proc_fops;
63#endif 63#endif
64 64
65static int clusterip_net_id __read_mostly; 65static unsigned int clusterip_net_id __read_mostly;
66 66
67struct clusterip_net { 67struct clusterip_net {
68 struct list_head configs; 68 struct list_head configs;
@@ -144,6 +144,11 @@ clusterip_config_find_get(struct net *net, __be32 clusterip, int entry)
144 rcu_read_lock_bh(); 144 rcu_read_lock_bh();
145 c = __clusterip_config_find(net, clusterip); 145 c = __clusterip_config_find(net, clusterip);
146 if (c) { 146 if (c) {
147#ifdef CONFIG_PROC_FS
148 if (!c->pde)
149 c = NULL;
150 else
151#endif
147 if (unlikely(!atomic_inc_not_zero(&c->refcount))) 152 if (unlikely(!atomic_inc_not_zero(&c->refcount)))
148 c = NULL; 153 c = NULL;
149 else if (entry) 154 else if (entry)
@@ -166,14 +171,15 @@ clusterip_config_init_nodelist(struct clusterip_config *c,
166 171
167static struct clusterip_config * 172static struct clusterip_config *
168clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, 173clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
169 struct net_device *dev) 174 struct net_device *dev)
170{ 175{
176 struct net *net = dev_net(dev);
171 struct clusterip_config *c; 177 struct clusterip_config *c;
172 struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id); 178 struct clusterip_net *cn = net_generic(net, clusterip_net_id);
173 179
174 c = kzalloc(sizeof(*c), GFP_ATOMIC); 180 c = kzalloc(sizeof(*c), GFP_ATOMIC);
175 if (!c) 181 if (!c)
176 return NULL; 182 return ERR_PTR(-ENOMEM);
177 183
178 c->dev = dev; 184 c->dev = dev;
179 c->clusterip = ip; 185 c->clusterip = ip;
@@ -185,6 +191,17 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
185 atomic_set(&c->refcount, 1); 191 atomic_set(&c->refcount, 1);
186 atomic_set(&c->entries, 1); 192 atomic_set(&c->entries, 1);
187 193
194 spin_lock_bh(&cn->lock);
195 if (__clusterip_config_find(net, ip)) {
196 spin_unlock_bh(&cn->lock);
197 kfree(c);
198
199 return ERR_PTR(-EBUSY);
200 }
201
202 list_add_rcu(&c->list, &cn->configs);
203 spin_unlock_bh(&cn->lock);
204
188#ifdef CONFIG_PROC_FS 205#ifdef CONFIG_PROC_FS
189 { 206 {
190 char buffer[16]; 207 char buffer[16];
@@ -195,16 +212,16 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
195 cn->procdir, 212 cn->procdir,
196 &clusterip_proc_fops, c); 213 &clusterip_proc_fops, c);
197 if (!c->pde) { 214 if (!c->pde) {
215 spin_lock_bh(&cn->lock);
216 list_del_rcu(&c->list);
217 spin_unlock_bh(&cn->lock);
198 kfree(c); 218 kfree(c);
199 return NULL; 219
220 return ERR_PTR(-ENOMEM);
200 } 221 }
201 } 222 }
202#endif 223#endif
203 224
204 spin_lock_bh(&cn->lock);
205 list_add_rcu(&c->list, &cn->configs);
206 spin_unlock_bh(&cn->lock);
207
208 return c; 225 return c;
209} 226}
210 227
@@ -410,16 +427,16 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
410 427
411 config = clusterip_config_init(cipinfo, 428 config = clusterip_config_init(cipinfo,
412 e->ip.dst.s_addr, dev); 429 e->ip.dst.s_addr, dev);
413 if (!config) { 430 if (IS_ERR(config)) {
414 dev_put(dev); 431 dev_put(dev);
415 return -ENOMEM; 432 return PTR_ERR(config);
416 } 433 }
417 dev_mc_add(config->dev, config->clustermac); 434 dev_mc_add(config->dev, config->clustermac);
418 } 435 }
419 } 436 }
420 cipinfo->config = config; 437 cipinfo->config = config;
421 438
422 ret = nf_ct_l3proto_try_module_get(par->family); 439 ret = nf_ct_netns_get(par->net, par->family);
423 if (ret < 0) 440 if (ret < 0)
424 pr_info("cannot load conntrack support for proto=%u\n", 441 pr_info("cannot load conntrack support for proto=%u\n",
425 par->family); 442 par->family);
@@ -444,7 +461,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
444 461
445 clusterip_config_put(cipinfo->config); 462 clusterip_config_put(cipinfo->config);
446 463
447 nf_ct_l3proto_module_put(par->family); 464 nf_ct_netns_put(par->net, par->family);
448} 465}
449 466
450#ifdef CONFIG_COMPAT 467#ifdef CONFIG_COMPAT
@@ -468,6 +485,7 @@ static struct xt_target clusterip_tg_reg __read_mostly = {
468 .checkentry = clusterip_tg_check, 485 .checkentry = clusterip_tg_check,
469 .destroy = clusterip_tg_destroy, 486 .destroy = clusterip_tg_destroy,
470 .targetsize = sizeof(struct ipt_clusterip_tgt_info), 487 .targetsize = sizeof(struct ipt_clusterip_tgt_info),
488 .usersize = offsetof(struct ipt_clusterip_tgt_info, config),
471#ifdef CONFIG_COMPAT 489#ifdef CONFIG_COMPAT
472 .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info), 490 .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info),
473#endif /* CONFIG_COMPAT */ 491#endif /* CONFIG_COMPAT */
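Two things change in clusterip_config_init(): the new config is linked into cn->configs under cn->lock only after a duplicate check, closing the race where two rules for the same cluster IP could each create a config, and failures are reported as ERR_PTR() codes (-ENOMEM, -EBUSY) that clusterip_tg_check() passes on instead of a blanket -ENOMEM. A sketch of that check-then-publish pattern with hypothetical structures:

    #include <linux/err.h>
    #include <linux/slab.h>
    #include <linux/rculist.h>

    static struct example_cfg *example_cfg_init(struct example_net *cn, __be32 ip)
    {
        struct example_cfg *c = kzalloc(sizeof(*c), GFP_ATOMIC);

        if (!c)
            return ERR_PTR(-ENOMEM);
        c->ip = ip;

        spin_lock_bh(&cn->lock);
        if (example_cfg_find(cn, ip)) {      /* someone beat us to it */
            spin_unlock_bh(&cn->lock);
            kfree(c);
            return ERR_PTR(-EBUSY);
        }
        list_add_rcu(&c->list, &cn->configs);
        spin_unlock_bh(&cn->lock);

        return c;                            /* callers test IS_ERR(), not NULL */
    }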
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index da7f02a0b868..a03e4e7ef5f9 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -41,7 +41,7 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par)
41 pr_debug("bad rangesize %u\n", mr->rangesize); 41 pr_debug("bad rangesize %u\n", mr->rangesize);
42 return -EINVAL; 42 return -EINVAL;
43 } 43 }
44 return 0; 44 return nf_ct_netns_get(par->net, par->family);
45} 45}
46 46
47static unsigned int 47static unsigned int
@@ -55,7 +55,13 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
55 range.min_proto = mr->range[0].min; 55 range.min_proto = mr->range[0].min;
56 range.max_proto = mr->range[0].max; 56 range.max_proto = mr->range[0].max;
57 57
58 return nf_nat_masquerade_ipv4(skb, par->hooknum, &range, par->out); 58 return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range,
59 xt_out(par));
60}
61
62static void masquerade_tg_destroy(const struct xt_tgdtor_param *par)
63{
64 nf_ct_netns_put(par->net, par->family);
59} 65}
60 66
61static struct xt_target masquerade_tg_reg __read_mostly = { 67static struct xt_target masquerade_tg_reg __read_mostly = {
@@ -66,6 +72,7 @@ static struct xt_target masquerade_tg_reg __read_mostly = {
66 .table = "nat", 72 .table = "nat",
67 .hooks = 1 << NF_INET_POST_ROUTING, 73 .hooks = 1 << NF_INET_POST_ROUTING,
68 .checkentry = masquerade_tg_check, 74 .checkentry = masquerade_tg_check,
75 .destroy = masquerade_tg_destroy,
69 .me = THIS_MODULE, 76 .me = THIS_MODULE,
70}; 77};
71 78
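par->net, par->in, par->out and par->hooknum are gone from struct xt_action_param; the hook state is carried as a single pointer and targets and matches read it back through the xt_net()/xt_in()/xt_out()/xt_hooknum() accessors, as the MASQUERADE hunk does. The checkentry/destroy pair also pins conntrack for the rule's netns with nf_ct_netns_get()/nf_ct_netns_put(). A sketch of a target body after that conversion (names hypothetical):

    #include <linux/netfilter.h>
    #include <linux/netfilter/x_tables.h>

    static unsigned int example_tg(struct sk_buff *skb,
                                   const struct xt_action_param *par)
    {
        struct net *net = xt_net(par);                 /* was par->net */
        const struct net_device *out = xt_out(par);    /* was par->out */
        unsigned int hook = xt_hooknum(par);           /* was par->hooknum */

        pr_debug("hook %u out %s net %p\n",
                 hook, out ? out->name : "(none)", net);
        return NF_ACCEPT;
    }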
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 1d16c0f28df0..8bd0d7b26632 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -34,7 +34,7 @@ static unsigned int
34reject_tg(struct sk_buff *skb, const struct xt_action_param *par) 34reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
35{ 35{
36 const struct ipt_reject_info *reject = par->targinfo; 36 const struct ipt_reject_info *reject = par->targinfo;
37 int hook = par->hooknum; 37 int hook = xt_hooknum(par);
38 38
39 switch (reject->with) { 39 switch (reject->with) {
40 case IPT_ICMP_NET_UNREACHABLE: 40 case IPT_ICMP_NET_UNREACHABLE:
@@ -59,7 +59,7 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
59 nf_send_unreach(skb, ICMP_PKT_FILTERED, hook); 59 nf_send_unreach(skb, ICMP_PKT_FILTERED, hook);
60 break; 60 break;
61 case IPT_TCP_RESET: 61 case IPT_TCP_RESET:
62 nf_send_reset(par->net, skb, hook); 62 nf_send_reset(xt_net(par), skb, hook);
63 case IPT_ICMP_ECHOREPLY: 63 case IPT_ICMP_ECHOREPLY:
64 /* Doesn't happen. */ 64 /* Doesn't happen. */
65 break; 65 break;
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index db5b87509446..3240a2614e82 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -57,8 +57,7 @@ synproxy_send_tcp(struct net *net,
57 goto free_nskb; 57 goto free_nskb;
58 58
59 if (nfct) { 59 if (nfct) {
60 nskb->nfct = nfct; 60 nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
61 nskb->nfctinfo = ctinfo;
62 nf_conntrack_get(nfct); 61 nf_conntrack_get(nfct);
63 } 62 }
64 63
@@ -107,8 +106,8 @@ synproxy_send_client_synack(struct net *net,
107 106
108 synproxy_build_options(nth, opts); 107 synproxy_build_options(nth, opts);
109 108
110 synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, 109 synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
111 niph, nth, tcp_hdr_size); 110 IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
112} 111}
113 112
114static void 113static void
@@ -230,8 +229,8 @@ synproxy_send_client_ack(struct net *net,
230 229
231 synproxy_build_options(nth, opts); 230 synproxy_build_options(nth, opts);
232 231
233 synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, 232 synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
234 niph, nth, tcp_hdr_size); 233 IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
235} 234}
236 235
237static bool 236static bool
@@ -263,12 +262,12 @@ static unsigned int
263synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) 262synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
264{ 263{
265 const struct xt_synproxy_info *info = par->targinfo; 264 const struct xt_synproxy_info *info = par->targinfo;
266 struct net *net = par->net; 265 struct net *net = xt_net(par);
267 struct synproxy_net *snet = synproxy_pernet(net); 266 struct synproxy_net *snet = synproxy_pernet(net);
268 struct synproxy_options opts = {}; 267 struct synproxy_options opts = {};
269 struct tcphdr *th, _th; 268 struct tcphdr *th, _th;
270 269
271 if (nf_ip_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP)) 270 if (nf_ip_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP))
272 return NF_DROP; 271 return NF_DROP;
273 272
274 th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); 273 th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
@@ -418,12 +417,12 @@ static int synproxy_tg4_check(const struct xt_tgchk_param *par)
418 e->ip.invflags & XT_INV_PROTO) 417 e->ip.invflags & XT_INV_PROTO)
419 return -EINVAL; 418 return -EINVAL;
420 419
421 return nf_ct_l3proto_try_module_get(par->family); 420 return nf_ct_netns_get(par->net, par->family);
422} 421}
423 422
424static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) 423static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
425{ 424{
426 nf_ct_l3proto_module_put(par->family); 425 nf_ct_netns_put(par->net, par->family);
427} 426}
428 427
429static struct xt_target synproxy_tg4_reg __read_mostly = { 428static struct xt_target synproxy_tg4_reg __read_mostly = {
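The SYNPROXY changes follow the merge of skb->nfct and skb->nfctinfo into a single skb->_nfct word: conntrack is attached with nf_ct_set() and queried with skb_nfct() rather than by poking two fields. A sketch of attaching a conntrack entry to a locally built skb, mirroring synproxy_send_tcp() above:

    #include <net/netfilter/nf_conntrack.h>

    static void example_attach_ct(struct sk_buff *nskb, struct nf_conn *ct,
                                  enum ip_conntrack_info ctinfo)
    {
        if (!ct)
            return;
        nf_ct_set(nskb, ct, ctinfo);          /* pointer and ctinfo in one word */
        nf_conntrack_get(&ct->ct_general);    /* hold a reference for nskb */
    }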
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index 78cc64eddfc1..37fb9552e858 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -63,10 +63,10 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4,
63 return dev_match || flags & XT_RPFILTER_LOOSE; 63 return dev_match || flags & XT_RPFILTER_LOOSE;
64} 64}
65 65
66static bool rpfilter_is_local(const struct sk_buff *skb) 66static bool
67rpfilter_is_loopback(const struct sk_buff *skb, const struct net_device *in)
67{ 68{
68 const struct rtable *rt = skb_rtable(skb); 69 return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK;
69 return rt && (rt->rt_flags & RTCF_LOCAL);
70} 70}
71 71
72static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) 72static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
@@ -79,14 +79,16 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
79 info = par->matchinfo; 79 info = par->matchinfo;
80 invert = info->flags & XT_RPFILTER_INVERT; 80 invert = info->flags & XT_RPFILTER_INVERT;
81 81
82 if (rpfilter_is_local(skb)) 82 if (rpfilter_is_loopback(skb, xt_in(par)))
83 return true ^ invert; 83 return true ^ invert;
84 84
85 iph = ip_hdr(skb); 85 iph = ip_hdr(skb);
86 if (ipv4_is_multicast(iph->daddr)) { 86 if (ipv4_is_zeronet(iph->saddr)) {
87 if (ipv4_is_zeronet(iph->saddr)) 87 if (ipv4_is_lbcast(iph->daddr) ||
88 return ipv4_is_local_multicast(iph->daddr) ^ invert; 88 ipv4_is_local_multicast(iph->daddr))
89 return true ^ invert;
89 } 90 }
91
90 flow.flowi4_iif = LOOPBACK_IFINDEX; 92 flow.flowi4_iif = LOOPBACK_IFINDEX;
91 flow.daddr = iph->saddr; 93 flow.daddr = iph->saddr;
92 flow.saddr = rpfilter_get_saddr(iph->daddr); 94 flow.saddr = rpfilter_get_saddr(iph->daddr);
@@ -95,7 +97,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
95 flow.flowi4_tos = RT_TOS(iph->tos); 97 flow.flowi4_tos = RT_TOS(iph->tos);
96 flow.flowi4_scope = RT_SCOPE_UNIVERSE; 98 flow.flowi4_scope = RT_SCOPE_UNIVERSE;
97 99
98 return rpfilter_lookup_reverse(par->net, &flow, par->in, info->flags) ^ invert; 100 return rpfilter_lookup_reverse(xt_net(par), &flow, xt_in(par), info->flags) ^ invert;
99} 101}
100 102
101static int rpfilter_check(const struct xt_mtchk_param *par) 103static int rpfilter_check(const struct xt_mtchk_param *par)
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 713c09a74b90..2e14ed11a35c 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -31,6 +31,13 @@
31#include <net/netfilter/ipv4/nf_defrag_ipv4.h> 31#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
32#include <net/netfilter/nf_log.h> 32#include <net/netfilter/nf_log.h>
33 33
34static int conntrack4_net_id __read_mostly;
35static DEFINE_MUTEX(register_ipv4_hooks);
36
37struct conntrack4_net {
38 unsigned int users;
39};
40
34static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, 41static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
35 struct nf_conntrack_tuple *tuple) 42 struct nf_conntrack_tuple *tuple)
36{ 43{
@@ -158,6 +165,10 @@ static unsigned int ipv4_conntrack_local(void *priv,
158 if (skb->len < sizeof(struct iphdr) || 165 if (skb->len < sizeof(struct iphdr) ||
159 ip_hdrlen(skb) < sizeof(struct iphdr)) 166 ip_hdrlen(skb) < sizeof(struct iphdr))
160 return NF_ACCEPT; 167 return NF_ACCEPT;
168
169 if (ip_is_fragment(ip_hdr(skb))) /* IP_NODEFRAG setsockopt set */
170 return NF_ACCEPT;
171
161 return nf_conntrack_in(state->net, PF_INET, state->hook, skb); 172 return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
162} 173}
163 174
@@ -228,7 +239,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
228 } 239 }
229 240
230 if ((unsigned int) *len < sizeof(struct sockaddr_in)) { 241 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
231 pr_debug("SO_ORIGINAL_DST: len %d not %Zu\n", 242 pr_debug("SO_ORIGINAL_DST: len %d not %zu\n",
232 *len, sizeof(struct sockaddr_in)); 243 *len, sizeof(struct sockaddr_in));
233 return -EINVAL; 244 return -EINVAL;
234 } 245 }
@@ -307,9 +318,42 @@ static struct nf_sockopt_ops so_getorigdst = {
307 .owner = THIS_MODULE, 318 .owner = THIS_MODULE,
308}; 319};
309 320
310static int ipv4_init_net(struct net *net) 321static int ipv4_hooks_register(struct net *net)
311{ 322{
312 return 0; 323 struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id);
324 int err = 0;
325
326 mutex_lock(&register_ipv4_hooks);
327
328 cnet->users++;
329 if (cnet->users > 1)
330 goto out_unlock;
331
332 err = nf_defrag_ipv4_enable(net);
333 if (err) {
334 cnet->users = 0;
335 goto out_unlock;
336 }
337
338 err = nf_register_net_hooks(net, ipv4_conntrack_ops,
339 ARRAY_SIZE(ipv4_conntrack_ops));
340
341 if (err)
342 cnet->users = 0;
343 out_unlock:
344 mutex_unlock(&register_ipv4_hooks);
345 return err;
346}
347
348static void ipv4_hooks_unregister(struct net *net)
349{
350 struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id);
351
352 mutex_lock(&register_ipv4_hooks);
353 if (cnet->users && (--cnet->users == 0))
354 nf_unregister_net_hooks(net, ipv4_conntrack_ops,
355 ARRAY_SIZE(ipv4_conntrack_ops));
356 mutex_unlock(&register_ipv4_hooks);
313} 357}
314 358
315struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { 359struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
@@ -325,7 +369,8 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
325 .nlattr_to_tuple = ipv4_nlattr_to_tuple, 369 .nlattr_to_tuple = ipv4_nlattr_to_tuple,
326 .nla_policy = ipv4_nla_policy, 370 .nla_policy = ipv4_nla_policy,
327#endif 371#endif
328 .init_net = ipv4_init_net, 372 .net_ns_get = ipv4_hooks_register,
373 .net_ns_put = ipv4_hooks_unregister,
329 .me = THIS_MODULE, 374 .me = THIS_MODULE,
330}; 375};
331 376
@@ -336,52 +381,50 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
336MODULE_ALIAS("ip_conntrack"); 381MODULE_ALIAS("ip_conntrack");
337MODULE_LICENSE("GPL"); 382MODULE_LICENSE("GPL");
338 383
384static struct nf_conntrack_l4proto *builtin_l4proto4[] = {
385 &nf_conntrack_l4proto_tcp4,
386 &nf_conntrack_l4proto_udp4,
387 &nf_conntrack_l4proto_icmp,
388#ifdef CONFIG_NF_CT_PROTO_DCCP
389 &nf_conntrack_l4proto_dccp4,
390#endif
391#ifdef CONFIG_NF_CT_PROTO_SCTP
392 &nf_conntrack_l4proto_sctp4,
393#endif
394#ifdef CONFIG_NF_CT_PROTO_UDPLITE
395 &nf_conntrack_l4proto_udplite4,
396#endif
397};
398
339static int ipv4_net_init(struct net *net) 399static int ipv4_net_init(struct net *net)
340{ 400{
341 int ret = 0; 401 int ret = 0;
342 402
343 ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp4); 403 ret = nf_ct_l4proto_pernet_register(net, builtin_l4proto4,
344 if (ret < 0) { 404 ARRAY_SIZE(builtin_l4proto4));
345 pr_err("nf_conntrack_tcp4: pernet registration failed\n"); 405 if (ret < 0)
346 goto out_tcp; 406 return ret;
347 }
348 ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp4);
349 if (ret < 0) {
350 pr_err("nf_conntrack_udp4: pernet registration failed\n");
351 goto out_udp;
352 }
353 ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmp);
354 if (ret < 0) {
355 pr_err("nf_conntrack_icmp4: pernet registration failed\n");
356 goto out_icmp;
357 }
358 ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv4); 407 ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv4);
359 if (ret < 0) { 408 if (ret < 0) {
360 pr_err("nf_conntrack_ipv4: pernet registration failed\n"); 409 pr_err("nf_conntrack_ipv4: pernet registration failed\n");
361 goto out_ipv4; 410 nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4,
411 ARRAY_SIZE(builtin_l4proto4));
362 } 412 }
363 return 0;
364out_ipv4:
365 nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
366out_icmp:
367 nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
368out_udp:
369 nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
370out_tcp:
371 return ret; 413 return ret;
372} 414}
373 415
374static void ipv4_net_exit(struct net *net) 416static void ipv4_net_exit(struct net *net)
375{ 417{
376 nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv4); 418 nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv4);
377 nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp); 419 nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4,
378 nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4); 420 ARRAY_SIZE(builtin_l4proto4));
379 nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
380} 421}
381 422
382static struct pernet_operations ipv4_net_ops = { 423static struct pernet_operations ipv4_net_ops = {
383 .init = ipv4_net_init, 424 .init = ipv4_net_init,
384 .exit = ipv4_net_exit, 425 .exit = ipv4_net_exit,
426 .id = &conntrack4_net_id,
427 .size = sizeof(struct conntrack4_net),
385}; 428};
386 429
387static int __init nf_conntrack_l3proto_ipv4_init(void) 430static int __init nf_conntrack_l3proto_ipv4_init(void)
@@ -389,7 +432,6 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
389 int ret = 0; 432 int ret = 0;
390 433
391 need_conntrack(); 434 need_conntrack();
392 nf_defrag_ipv4_enable();
393 435
394 ret = nf_register_sockopt(&so_getorigdst); 436 ret = nf_register_sockopt(&so_getorigdst);
395 if (ret < 0) { 437 if (ret < 0) {
@@ -403,46 +445,21 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
403 goto cleanup_sockopt; 445 goto cleanup_sockopt;
404 } 446 }
405 447
406 ret = nf_register_hooks(ipv4_conntrack_ops, 448 ret = nf_ct_l4proto_register(builtin_l4proto4,
407 ARRAY_SIZE(ipv4_conntrack_ops)); 449 ARRAY_SIZE(builtin_l4proto4));
408 if (ret < 0) { 450 if (ret < 0)
409 pr_err("nf_conntrack_ipv4: can't register hooks.\n");
410 goto cleanup_pernet; 451 goto cleanup_pernet;
411 }
412
413 ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp4);
414 if (ret < 0) {
415 pr_err("nf_conntrack_ipv4: can't register tcp4 proto.\n");
416 goto cleanup_hooks;
417 }
418
419 ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp4);
420 if (ret < 0) {
421 pr_err("nf_conntrack_ipv4: can't register udp4 proto.\n");
422 goto cleanup_tcp4;
423 }
424
425 ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmp);
426 if (ret < 0) {
427 pr_err("nf_conntrack_ipv4: can't register icmpv4 proto.\n");
428 goto cleanup_udp4;
429 }
430 452
431 ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4); 453 ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4);
432 if (ret < 0) { 454 if (ret < 0) {
433 pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n"); 455 pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n");
434 goto cleanup_icmpv4; 456 goto cleanup_l4proto;
435 } 457 }
436 458
437 return ret; 459 return ret;
438 cleanup_icmpv4: 460cleanup_l4proto:
439 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp); 461 nf_ct_l4proto_unregister(builtin_l4proto4,
440 cleanup_udp4: 462 ARRAY_SIZE(builtin_l4proto4));
441 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
442 cleanup_tcp4:
443 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
444 cleanup_hooks:
445 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
446 cleanup_pernet: 463 cleanup_pernet:
447 unregister_pernet_subsys(&ipv4_net_ops); 464 unregister_pernet_subsys(&ipv4_net_ops);
448 cleanup_sockopt: 465 cleanup_sockopt:
@@ -454,10 +471,8 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void)
454{ 471{
455 synchronize_net(); 472 synchronize_net();
456 nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4); 473 nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
457 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp); 474 nf_ct_l4proto_unregister(builtin_l4proto4,
458 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4); 475 ARRAY_SIZE(builtin_l4proto4));
459 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
460 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
461 unregister_pernet_subsys(&ipv4_net_ops); 476 unregister_pernet_subsys(&ipv4_net_ops);
462 nf_unregister_sockopt(&so_getorigdst); 477 nf_unregister_sockopt(&so_getorigdst);
463} 478}
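The open-coded register/unregister ladder for tcp4, udp4 and icmp (plus the optional DCCP/SCTP/UDPlite trackers) collapses into one array handed to the array-based helpers, which unwind a partially registered array themselves; hook registration likewise moves behind net_ns_get/net_ns_put so a netns only pays for conntrack when a ruleset needs it. A sketch of the array idiom, assuming the array variants of nf_ct_l4proto_register()/unregister() used above:

    #include <net/netfilter/nf_conntrack_l4proto.h>
    #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>

    static struct nf_conntrack_l4proto *example_l4protos[] = {
        &nf_conntrack_l4proto_tcp4,
        &nf_conntrack_l4proto_udp4,
        &nf_conntrack_l4proto_icmp,
    };

    static int __init example_init(void)
    {
        return nf_ct_l4proto_register(example_l4protos,
                                      ARRAY_SIZE(example_l4protos));
    }

    static void __exit example_exit(void)
    {
        nf_ct_l4proto_unregister(example_l4protos,
                                 ARRAY_SIZE(example_l4protos));
    }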
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index d075b3cf2400..73c591d8a9a8 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -128,16 +128,16 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
128/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ 128/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
129static int 129static int
130icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, 130icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
131 enum ip_conntrack_info *ctinfo,
132 unsigned int hooknum) 131 unsigned int hooknum)
133{ 132{
134 struct nf_conntrack_tuple innertuple, origtuple; 133 struct nf_conntrack_tuple innertuple, origtuple;
135 const struct nf_conntrack_l4proto *innerproto; 134 const struct nf_conntrack_l4proto *innerproto;
136 const struct nf_conntrack_tuple_hash *h; 135 const struct nf_conntrack_tuple_hash *h;
137 const struct nf_conntrack_zone *zone; 136 const struct nf_conntrack_zone *zone;
137 enum ip_conntrack_info ctinfo;
138 struct nf_conntrack_zone tmp; 138 struct nf_conntrack_zone tmp;
139 139
140 NF_CT_ASSERT(skb->nfct == NULL); 140 NF_CT_ASSERT(!skb_nfct(skb));
141 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); 141 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
142 142
143 /* Are they talking about one of our connections? */ 143 /* Are they talking about one of our connections? */
@@ -160,7 +160,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
160 return -NF_ACCEPT; 160 return -NF_ACCEPT;
161 } 161 }
162 162
163 *ctinfo = IP_CT_RELATED; 163 ctinfo = IP_CT_RELATED;
164 164
165 h = nf_conntrack_find_get(net, zone, &innertuple); 165 h = nf_conntrack_find_get(net, zone, &innertuple);
166 if (!h) { 166 if (!h) {
@@ -169,11 +169,10 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
169 } 169 }
170 170
171 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) 171 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
172 *ctinfo += IP_CT_IS_REPLY; 172 ctinfo += IP_CT_IS_REPLY;
173 173
174 /* Update skb to refer to this connection */ 174 /* Update skb to refer to this connection */
175 skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; 175 nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo);
176 skb->nfctinfo = *ctinfo;
177 return NF_ACCEPT; 176 return NF_ACCEPT;
178} 177}
179 178
@@ -181,7 +180,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
181static int 180static int
182icmp_error(struct net *net, struct nf_conn *tmpl, 181icmp_error(struct net *net, struct nf_conn *tmpl,
183 struct sk_buff *skb, unsigned int dataoff, 182 struct sk_buff *skb, unsigned int dataoff,
184 enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) 183 u8 pf, unsigned int hooknum)
185{ 184{
186 const struct icmphdr *icmph; 185 const struct icmphdr *icmph;
187 struct icmphdr _ih; 186 struct icmphdr _ih;
@@ -225,7 +224,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
225 icmph->type != ICMP_REDIRECT) 224 icmph->type != ICMP_REDIRECT)
226 return NF_ACCEPT; 225 return NF_ACCEPT;
227 226
228 return icmp_error_message(net, tmpl, skb, ctinfo, hooknum); 227 return icmp_error_message(net, tmpl, skb, hooknum);
229} 228}
230 229
231#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 230#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
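
With skb->nfctinfo gone, the conntrack entry and its ctinfo are attached together via nf_ct_set() and recovered together as well. A small consumer sketch using only nf_ct_get() and the existing CTINFO2DIR() macro; the helper name is illustrative, not part of the patch:

static bool example_ct_is_reply(const struct sk_buff *skb)
{
	enum ip_conntrack_info ctinfo;
	const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

	/* one lookup yields both the conntrack entry and its ctinfo */
	return ct && CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY;
}
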
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index d88da36b383c..346bf7ccac08 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -11,6 +11,7 @@
11#include <linux/netfilter.h> 11#include <linux/netfilter.h>
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/skbuff.h> 13#include <linux/skbuff.h>
14#include <net/netns/generic.h>
14#include <net/route.h> 15#include <net/route.h>
15#include <net/ip.h> 16#include <net/ip.h>
16 17
@@ -22,6 +23,8 @@
22#endif 23#endif
23#include <net/netfilter/nf_conntrack_zones.h> 24#include <net/netfilter/nf_conntrack_zones.h>
24 25
26static DEFINE_MUTEX(defrag4_mutex);
27
25static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, 28static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
26 u_int32_t user) 29 u_int32_t user)
27{ 30{
@@ -42,7 +45,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
42{ 45{
43 u16 zone_id = NF_CT_DEFAULT_ZONE_ID; 46 u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
44#if IS_ENABLED(CONFIG_NF_CONNTRACK) 47#if IS_ENABLED(CONFIG_NF_CONNTRACK)
45 if (skb->nfct) { 48 if (skb_nfct(skb)) {
46 enum ip_conntrack_info ctinfo; 49 enum ip_conntrack_info ctinfo;
47 const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); 50 const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
48 51
@@ -72,7 +75,7 @@ static unsigned int ipv4_conntrack_defrag(void *priv,
72#if !IS_ENABLED(CONFIG_NF_NAT) 75#if !IS_ENABLED(CONFIG_NF_NAT)
73 /* Previously seen (loopback)? Ignore. Do this before 76 /* Previously seen (loopback)? Ignore. Do this before
74 fragment check. */ 77 fragment check. */
75 if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) 78 if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
76 return NF_ACCEPT; 79 return NF_ACCEPT;
77#endif 80#endif
78#endif 81#endif
@@ -102,18 +105,50 @@ static struct nf_hook_ops ipv4_defrag_ops[] = {
102 }, 105 },
103}; 106};
104 107
108static void __net_exit defrag4_net_exit(struct net *net)
109{
110 if (net->nf.defrag_ipv4) {
111 nf_unregister_net_hooks(net, ipv4_defrag_ops,
112 ARRAY_SIZE(ipv4_defrag_ops));
113 net->nf.defrag_ipv4 = false;
114 }
115}
116
117static struct pernet_operations defrag4_net_ops = {
118 .exit = defrag4_net_exit,
119};
120
105static int __init nf_defrag_init(void) 121static int __init nf_defrag_init(void)
106{ 122{
107 return nf_register_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); 123 return register_pernet_subsys(&defrag4_net_ops);
108} 124}
109 125
110static void __exit nf_defrag_fini(void) 126static void __exit nf_defrag_fini(void)
111{ 127{
112 nf_unregister_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); 128 unregister_pernet_subsys(&defrag4_net_ops);
113} 129}
114 130
115void nf_defrag_ipv4_enable(void) 131int nf_defrag_ipv4_enable(struct net *net)
116{ 132{
133 int err = 0;
134
135 might_sleep();
136
137 if (net->nf.defrag_ipv4)
138 return 0;
139
140 mutex_lock(&defrag4_mutex);
141 if (net->nf.defrag_ipv4)
142 goto out_unlock;
143
144 err = nf_register_net_hooks(net, ipv4_defrag_ops,
145 ARRAY_SIZE(ipv4_defrag_ops));
146 if (err == 0)
147 net->nf.defrag_ipv4 = true;
148
149 out_unlock:
150 mutex_unlock(&defrag4_mutex);
151 return err;
117} 152}
118EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable); 153EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable);
119 154
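
nf_defrag_ipv4_enable() is now per network namespace and can fail, so callers must propagate the error instead of assuming the hooks are always present. A hypothetical caller sketch (function name and error message are illustrative only):

static int example_enable_defrag(struct net *net)
{
	int err;

	/* registers the defrag hooks for this netns on first use */
	err = nf_defrag_ipv4_enable(net);
	if (err)
		pr_err("cannot enable IPv4 defrag in this netns: %d\n", err);
	return err;
}
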
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index cf986e1c7bbd..f0dbff05fc28 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -68,10 +68,9 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
68 68
69#if IS_ENABLED(CONFIG_NF_CONNTRACK) 69#if IS_ENABLED(CONFIG_NF_CONNTRACK)
70 /* Avoid counting cloned packets towards the original connection. */ 70 /* Avoid counting cloned packets towards the original connection. */
71 nf_conntrack_put(skb->nfct); 71 nf_reset(skb);
72 skb->nfct = &nf_ct_untracked_get()->ct_general; 72 nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
73 skb->nfctinfo = IP_CT_NEW; 73 nf_conntrack_get(skb_nfct(skb));
74 nf_conntrack_get(skb->nfct);
75#endif 74#endif
76 /* 75 /*
77 * If we are in PREROUTING/INPUT, decrease the TTL to mitigate potential 76 * If we are in PREROUTING/INPUT, decrease the TTL to mitigate potential
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index b24795e2ee6d..2f3895ddc275 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -69,7 +69,7 @@ static void dump_arp_packet(struct nf_log_buf *m,
69 69
70 ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp); 70 ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp);
71 if (ap == NULL) { 71 if (ap == NULL) {
72 nf_log_buf_add(m, " INCOMPLETE [%Zu bytes]", 72 nf_log_buf_add(m, " INCOMPLETE [%zu bytes]",
73 skb->len - sizeof(_arph)); 73 skb->len - sizeof(_arph));
74 return; 74 return;
75 } 75 }
@@ -87,7 +87,7 @@ static void nf_log_arp_packet(struct net *net, u_int8_t pf,
87 struct nf_log_buf *m; 87 struct nf_log_buf *m;
88 88
89 /* FIXME: Disabled from containers until syslog ns is supported */ 89 /* FIXME: Disabled from containers until syslog ns is supported */
90 if (!net_eq(net, &init_net)) 90 if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
91 return; 91 return;
92 92
93 m = nf_log_buf_open(); 93 m = nf_log_buf_open();
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 856648966f4c..c83a9963269b 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -319,7 +319,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf,
319 struct nf_log_buf *m; 319 struct nf_log_buf *m;
320 320
321 /* FIXME: Disabled from containers until syslog ns is supported */ 321 /* FIXME: Disabled from containers until syslog ns is supported */
322 if (!net_eq(net, &init_net)) 322 if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
323 return; 323 return;
324 324
325 m = nf_log_buf_open(); 325 m = nf_log_buf_open();
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index f8aad03d674b..6f5e8d01b876 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -255,11 +255,6 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
255 /* maniptype == SRC for postrouting. */ 255 /* maniptype == SRC for postrouting. */
256 enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); 256 enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
257 257
258 /* We never see fragments: conntrack defrags on pre-routing
259 * and local-out, and nf_nat_out protects post-routing.
260 */
261 NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));
262
263 ct = nf_ct_get(skb, &ctinfo); 258 ct = nf_ct_get(skb, &ctinfo);
 264	/* Can't track?  It's not due to stress, or conntrack would 259	/* Can't track?  It's not due to stress, or conntrack would
 265	 * have dropped it.  Hence it's the user's responsibility to 260	 * have dropped it.  Hence it's the user's responsibility to
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index c9b52c361da2..53e49f5011d3 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -1260,16 +1260,6 @@ static const struct nf_conntrack_expect_policy snmp_exp_policy = {
1260 .timeout = 180, 1260 .timeout = 180,
1261}; 1261};
1262 1262
1263static struct nf_conntrack_helper snmp_helper __read_mostly = {
1264 .me = THIS_MODULE,
1265 .help = help,
1266 .expect_policy = &snmp_exp_policy,
1267 .name = "snmp",
1268 .tuple.src.l3num = AF_INET,
1269 .tuple.src.u.udp.port = cpu_to_be16(SNMP_PORT),
1270 .tuple.dst.protonum = IPPROTO_UDP,
1271};
1272
1273static struct nf_conntrack_helper snmp_trap_helper __read_mostly = { 1263static struct nf_conntrack_helper snmp_trap_helper __read_mostly = {
1274 .me = THIS_MODULE, 1264 .me = THIS_MODULE,
1275 .help = help, 1265 .help = help,
@@ -1288,22 +1278,16 @@ static struct nf_conntrack_helper snmp_trap_helper __read_mostly = {
1288 1278
1289static int __init nf_nat_snmp_basic_init(void) 1279static int __init nf_nat_snmp_basic_init(void)
1290{ 1280{
1291 int ret = 0;
1292
1293 BUG_ON(nf_nat_snmp_hook != NULL); 1281 BUG_ON(nf_nat_snmp_hook != NULL);
1294 RCU_INIT_POINTER(nf_nat_snmp_hook, help); 1282 RCU_INIT_POINTER(nf_nat_snmp_hook, help);
1295 1283
1296 ret = nf_conntrack_helper_register(&snmp_trap_helper); 1284 return nf_conntrack_helper_register(&snmp_trap_helper);
1297 if (ret < 0) {
1298 nf_conntrack_helper_unregister(&snmp_helper);
1299 return ret;
1300 }
1301 return ret;
1302} 1285}
1303 1286
1304static void __exit nf_nat_snmp_basic_fini(void) 1287static void __exit nf_nat_snmp_basic_fini(void)
1305{ 1288{
1306 RCU_INIT_POINTER(nf_nat_snmp_hook, NULL); 1289 RCU_INIT_POINTER(nf_nat_snmp_hook, NULL);
1290 synchronize_rcu();
1307 nf_conntrack_helper_unregister(&snmp_trap_helper); 1291 nf_conntrack_helper_unregister(&snmp_trap_helper);
1308} 1292}
1309 1293
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index fd8220213afc..146d86105183 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -126,6 +126,8 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
126 /* ip_route_me_harder expects skb->dst to be set */ 126 /* ip_route_me_harder expects skb->dst to be set */
127 skb_dst_set_noref(nskb, skb_dst(oldskb)); 127 skb_dst_set_noref(nskb, skb_dst(oldskb));
128 128
129 nskb->mark = IP4_REPLY_MARK(net, oldskb->mark);
130
129 skb_reserve(nskb, LL_MAX_HEADER); 131 skb_reserve(nskb, LL_MAX_HEADER);
130 niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, 132 niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
131 ip4_dst_hoplimit(skb_dst(nskb))); 133 ip4_dst_hoplimit(skb_dst(nskb)));
diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c
new file mode 100644
index 000000000000..a83d558e1aae
--- /dev/null
+++ b/net/ipv4/netfilter/nf_socket_ipv4.c
@@ -0,0 +1,163 @@
1/*
2 * Copyright (C) 2007-2008 BalaBit IT Ltd.
3 * Author: Krisztian Kovacs
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 */
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <net/tcp.h>
14#include <net/udp.h>
15#include <net/icmp.h>
16#include <net/sock.h>
17#include <net/inet_sock.h>
18#include <net/netfilter/nf_socket.h>
19#if IS_ENABLED(CONFIG_NF_CONNTRACK)
20#include <net/netfilter/nf_conntrack.h>
21#endif
22
23static int
24extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol,
25 __be32 *raddr, __be32 *laddr,
26 __be16 *rport, __be16 *lport)
27{
28 unsigned int outside_hdrlen = ip_hdrlen(skb);
29 struct iphdr *inside_iph, _inside_iph;
30 struct icmphdr *icmph, _icmph;
31 __be16 *ports, _ports[2];
32
33 icmph = skb_header_pointer(skb, outside_hdrlen,
34 sizeof(_icmph), &_icmph);
35 if (icmph == NULL)
36 return 1;
37
38 switch (icmph->type) {
39 case ICMP_DEST_UNREACH:
40 case ICMP_SOURCE_QUENCH:
41 case ICMP_REDIRECT:
42 case ICMP_TIME_EXCEEDED:
43 case ICMP_PARAMETERPROB:
44 break;
45 default:
46 return 1;
47 }
48
49 inside_iph = skb_header_pointer(skb, outside_hdrlen +
50 sizeof(struct icmphdr),
51 sizeof(_inside_iph), &_inside_iph);
52 if (inside_iph == NULL)
53 return 1;
54
55 if (inside_iph->protocol != IPPROTO_TCP &&
56 inside_iph->protocol != IPPROTO_UDP)
57 return 1;
58
59 ports = skb_header_pointer(skb, outside_hdrlen +
60 sizeof(struct icmphdr) +
61 (inside_iph->ihl << 2),
62 sizeof(_ports), &_ports);
63 if (ports == NULL)
64 return 1;
65
66 /* the inside IP packet is the one quoted from our side, thus
67 * its saddr is the local address */
68 *protocol = inside_iph->protocol;
69 *laddr = inside_iph->saddr;
70 *lport = ports[0];
71 *raddr = inside_iph->daddr;
72 *rport = ports[1];
73
74 return 0;
75}
76
77static struct sock *
78nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
79 const u8 protocol,
80 const __be32 saddr, const __be32 daddr,
81 const __be16 sport, const __be16 dport,
82 const struct net_device *in)
83{
84 switch (protocol) {
85 case IPPROTO_TCP:
86 return inet_lookup(net, &tcp_hashinfo, skb, doff,
87 saddr, sport, daddr, dport,
88 in->ifindex);
89 case IPPROTO_UDP:
90 return udp4_lib_lookup(net, saddr, sport, daddr, dport,
91 in->ifindex);
92 }
93 return NULL;
94}
95
96struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb,
97 const struct net_device *indev)
98{
99 __be32 uninitialized_var(daddr), uninitialized_var(saddr);
100 __be16 uninitialized_var(dport), uninitialized_var(sport);
101 const struct iphdr *iph = ip_hdr(skb);
102 struct sk_buff *data_skb = NULL;
103 u8 uninitialized_var(protocol);
104#if IS_ENABLED(CONFIG_NF_CONNTRACK)
105 enum ip_conntrack_info ctinfo;
106 struct nf_conn const *ct;
107#endif
108 int doff = 0;
109
110 if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) {
111 struct udphdr _hdr, *hp;
112
113 hp = skb_header_pointer(skb, ip_hdrlen(skb),
114 sizeof(_hdr), &_hdr);
115 if (hp == NULL)
116 return NULL;
117
118 protocol = iph->protocol;
119 saddr = iph->saddr;
120 sport = hp->source;
121 daddr = iph->daddr;
122 dport = hp->dest;
123 data_skb = (struct sk_buff *)skb;
124 doff = iph->protocol == IPPROTO_TCP ?
125 ip_hdrlen(skb) + __tcp_hdrlen((struct tcphdr *)hp) :
126 ip_hdrlen(skb) + sizeof(*hp);
127
128 } else if (iph->protocol == IPPROTO_ICMP) {
129 if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
130 &sport, &dport))
131 return NULL;
132 } else {
133 return NULL;
134 }
135
136#if IS_ENABLED(CONFIG_NF_CONNTRACK)
137 /* Do the lookup with the original socket address in
138 * case this is a reply packet of an established
139 * SNAT-ted connection.
140 */
141 ct = nf_ct_get(skb, &ctinfo);
142 if (ct && !nf_ct_is_untracked(ct) &&
143 ((iph->protocol != IPPROTO_ICMP &&
144 ctinfo == IP_CT_ESTABLISHED_REPLY) ||
145 (iph->protocol == IPPROTO_ICMP &&
146 ctinfo == IP_CT_RELATED_REPLY)) &&
147 (ct->status & IPS_SRC_NAT_DONE)) {
148
149 daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip;
150 dport = (iph->protocol == IPPROTO_TCP) ?
151 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port :
152 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
153 }
154#endif
155
156 return nf_socket_get_sock_v4(net, data_skb, doff, protocol, saddr,
157 daddr, sport, dport, indev);
158}
159EXPORT_SYMBOL_GPL(nf_sk_lookup_slow_v4);
160
161MODULE_LICENSE("GPL");
162MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler");
163MODULE_DESCRIPTION("Netfilter IPv4 socket lookup infrastructure");
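
nf_sk_lookup_slow_v4() returns a referenced socket (or NULL) for the packet's transport tuple, optionally redirected to the pre-SNAT address as shown above. A hypothetical caller in the style of a socket match; the function name and the reference handling via sock_gen_put() are assumptions about how consumers are expected to use it:

static bool example_has_local_socket(struct net *net, struct sk_buff *skb,
				     const struct net_device *indev)
{
	struct sock *sk = nf_sk_lookup_slow_v4(net, skb, indev);

	if (!sk)
		return false;

	/* drop the reference the lookup took before returning */
	sock_gen_put(sk);
	return true;
}
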
diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c
index 0c01a270bf9f..0af3d8df70dd 100644
--- a/net/ipv4/netfilter/nft_dup_ipv4.c
+++ b/net/ipv4/netfilter/nft_dup_ipv4.c
@@ -30,7 +30,7 @@ static void nft_dup_ipv4_eval(const struct nft_expr *expr,
30 }; 30 };
31 int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1; 31 int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1;
32 32
33 nf_dup_ipv4(pkt->net, pkt->skb, pkt->hook, &gw, oif); 33 nf_dup_ipv4(nft_net(pkt), pkt->skb, nft_hook(pkt), &gw, oif);
34} 34}
35 35
36static int nft_dup_ipv4_init(const struct nft_ctx *ctx, 36static int nft_dup_ipv4_init(const struct nft_ctx *ctx,
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
new file mode 100644
index 000000000000..2981291910dd
--- /dev/null
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -0,0 +1,236 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License version 2 as
4 * published by the Free Software Foundation.
5 */
6
7#include <linux/kernel.h>
8#include <linux/init.h>
9#include <linux/module.h>
10#include <linux/netlink.h>
11#include <linux/netfilter.h>
12#include <linux/netfilter/nf_tables.h>
13#include <net/netfilter/nf_tables_core.h>
14#include <net/netfilter/nf_tables.h>
15#include <net/netfilter/nft_fib.h>
16
17#include <net/ip_fib.h>
18#include <net/route.h>
19
20/* don't try to find route from mcast/bcast/zeronet */
21static __be32 get_saddr(__be32 addr)
22{
23 if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) ||
24 ipv4_is_zeronet(addr))
25 return 0;
26 return addr;
27}
28
29#define DSCP_BITS 0xfc
30
31void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
32 const struct nft_pktinfo *pkt)
33{
34 const struct nft_fib *priv = nft_expr_priv(expr);
35 u32 *dst = &regs->data[priv->dreg];
36 const struct net_device *dev = NULL;
37 const struct iphdr *iph;
38 __be32 addr;
39
40 if (priv->flags & NFTA_FIB_F_IIF)
41 dev = nft_in(pkt);
42 else if (priv->flags & NFTA_FIB_F_OIF)
43 dev = nft_out(pkt);
44
45 iph = ip_hdr(pkt->skb);
46 if (priv->flags & NFTA_FIB_F_DADDR)
47 addr = iph->daddr;
48 else
49 addr = iph->saddr;
50
51 *dst = inet_dev_addr_type(nft_net(pkt), dev, addr);
52}
53EXPORT_SYMBOL_GPL(nft_fib4_eval_type);
54
55static int get_ifindex(const struct net_device *dev)
56{
57 return dev ? dev->ifindex : 0;
58}
59
60void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
61 const struct nft_pktinfo *pkt)
62{
63 const struct nft_fib *priv = nft_expr_priv(expr);
64 u32 *dest = &regs->data[priv->dreg];
65 const struct iphdr *iph;
66 struct fib_result res;
67 struct flowi4 fl4 = {
68 .flowi4_scope = RT_SCOPE_UNIVERSE,
69 .flowi4_iif = LOOPBACK_IFINDEX,
70 };
71 const struct net_device *oif;
72 struct net_device *found;
73#ifdef CONFIG_IP_ROUTE_MULTIPATH
74 int i;
75#endif
76
77 /*
78 * Do not set flowi4_oif, it restricts results (for example, asking
 79	 * for oif 3 will get a RTN_UNICAST result even if the daddr exists
 80	 * on another interface).
 81	 *
 82	 * Search the results for the desired output interface instead.
83 */
84 if (priv->flags & NFTA_FIB_F_OIF)
85 oif = nft_out(pkt);
86 else if (priv->flags & NFTA_FIB_F_IIF)
87 oif = nft_in(pkt);
88 else
89 oif = NULL;
90
91 if (nft_hook(pkt) == NF_INET_PRE_ROUTING &&
92 nft_fib_is_loopback(pkt->skb, nft_in(pkt))) {
93 nft_fib_store_result(dest, priv->result, pkt,
94 nft_in(pkt)->ifindex);
95 return;
96 }
97
98 iph = ip_hdr(pkt->skb);
99 if (ipv4_is_zeronet(iph->saddr)) {
100 if (ipv4_is_lbcast(iph->daddr) ||
101 ipv4_is_local_multicast(iph->daddr)) {
102 nft_fib_store_result(dest, priv->result, pkt,
103 get_ifindex(pkt->skb->dev));
104 return;
105 }
106 }
107
108 if (priv->flags & NFTA_FIB_F_MARK)
109 fl4.flowi4_mark = pkt->skb->mark;
110
111 fl4.flowi4_tos = iph->tos & DSCP_BITS;
112
113 if (priv->flags & NFTA_FIB_F_DADDR) {
114 fl4.daddr = iph->daddr;
115 fl4.saddr = get_saddr(iph->saddr);
116 } else {
117 fl4.daddr = iph->saddr;
118 fl4.saddr = get_saddr(iph->daddr);
119 }
120
121 *dest = 0;
122
123 if (fib_lookup(nft_net(pkt), &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
124 return;
125
126 switch (res.type) {
127 case RTN_UNICAST:
128 break;
129 case RTN_LOCAL: /* Should not see RTN_LOCAL here */
130 return;
131 default:
132 break;
133 }
134
135 if (!oif) {
136 found = FIB_RES_DEV(res);
137 goto ok;
138 }
139
140#ifdef CONFIG_IP_ROUTE_MULTIPATH
141 for (i = 0; i < res.fi->fib_nhs; i++) {
142 struct fib_nh *nh = &res.fi->fib_nh[i];
143
144 if (nh->nh_dev == oif) {
145 found = nh->nh_dev;
146 goto ok;
147 }
148 }
149 return;
150#else
151 found = FIB_RES_DEV(res);
152 if (found != oif)
153 return;
154#endif
155ok:
156 switch (priv->result) {
157 case NFT_FIB_RESULT_OIF:
158 *dest = found->ifindex;
159 break;
160 case NFT_FIB_RESULT_OIFNAME:
161 strncpy((char *)dest, found->name, IFNAMSIZ);
162 break;
163 default:
164 WARN_ON_ONCE(1);
165 break;
166 }
167}
168EXPORT_SYMBOL_GPL(nft_fib4_eval);
169
170static struct nft_expr_type nft_fib4_type;
171
172static const struct nft_expr_ops nft_fib4_type_ops = {
173 .type = &nft_fib4_type,
174 .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)),
175 .eval = nft_fib4_eval_type,
176 .init = nft_fib_init,
177 .dump = nft_fib_dump,
178 .validate = nft_fib_validate,
179};
180
181static const struct nft_expr_ops nft_fib4_ops = {
182 .type = &nft_fib4_type,
183 .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)),
184 .eval = nft_fib4_eval,
185 .init = nft_fib_init,
186 .dump = nft_fib_dump,
187 .validate = nft_fib_validate,
188};
189
190static const struct nft_expr_ops *
191nft_fib4_select_ops(const struct nft_ctx *ctx,
192 const struct nlattr * const tb[])
193{
194 enum nft_fib_result result;
195
196 if (!tb[NFTA_FIB_RESULT])
197 return ERR_PTR(-EINVAL);
198
199 result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT]));
200
201 switch (result) {
202 case NFT_FIB_RESULT_OIF:
203 return &nft_fib4_ops;
204 case NFT_FIB_RESULT_OIFNAME:
205 return &nft_fib4_ops;
206 case NFT_FIB_RESULT_ADDRTYPE:
207 return &nft_fib4_type_ops;
208 default:
209 return ERR_PTR(-EOPNOTSUPP);
210 }
211}
212
213static struct nft_expr_type nft_fib4_type __read_mostly = {
214 .name = "fib",
215 .select_ops = &nft_fib4_select_ops,
216 .policy = nft_fib_policy,
217 .maxattr = NFTA_FIB_MAX,
218 .family = NFPROTO_IPV4,
219 .owner = THIS_MODULE,
220};
221
222static int __init nft_fib4_module_init(void)
223{
224 return nft_register_expr(&nft_fib4_type);
225}
226
227static void __exit nft_fib4_module_exit(void)
228{
229 nft_unregister_expr(&nft_fib4_type);
230}
231
232module_init(nft_fib4_module_init);
233module_exit(nft_fib4_module_exit);
234MODULE_LICENSE("GPL");
235MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
236MODULE_ALIAS_NFT_AF_EXPR(2, "fib");
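
Stripped of the flag handling, multipath walk and result storing, the core of nft_fib4_eval() above is a reverse FIB lookup: swap the addresses, look the sender up, and report the device it is reachable through. A distilled sketch, assuming it runs from a netfilter hook (and therefore under rcu_read_lock()); the function name is illustrative:

static int example_reverse_path_oif(struct net *net, const struct iphdr *iph)
{
	struct fib_result res;
	struct flowi4 fl4 = {
		.flowi4_scope	= RT_SCOPE_UNIVERSE,
		.flowi4_iif	= LOOPBACK_IFINDEX,
		.flowi4_tos	= iph->tos & DSCP_BITS,
		.daddr		= iph->saddr,	/* look up the sender ...   */
		.saddr		= iph->daddr,	/* ... as seen from our side */
	};

	if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
		return 0;
	if (res.type != RTN_UNICAST)
		return 0;
	return FIB_RES_DEV(res)->ifindex;
}
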
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index 51ced81b616c..f18677277119 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> 2 * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 5 * it under the terms of the GNU General Public License version 2 as
@@ -26,13 +26,19 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
26 memset(&range, 0, sizeof(range)); 26 memset(&range, 0, sizeof(range));
27 range.flags = priv->flags; 27 range.flags = priv->flags;
28 if (priv->sreg_proto_min) { 28 if (priv->sreg_proto_min) {
29 range.min_proto.all = 29 range.min_proto.all = (__force __be16)nft_reg_load16(
30 *(__be16 *)&regs->data[priv->sreg_proto_min]; 30 &regs->data[priv->sreg_proto_min]);
31 range.max_proto.all = 31 range.max_proto.all = (__force __be16)nft_reg_load16(
32 *(__be16 *)&regs->data[priv->sreg_proto_max]; 32 &regs->data[priv->sreg_proto_max]);
33 } 33 }
34 regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook, 34 regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, nft_hook(pkt),
35 &range, pkt->out); 35 &range, nft_out(pkt));
36}
37
38static void
39nft_masq_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
40{
41 nf_ct_netns_put(ctx->net, NFPROTO_IPV4);
36} 42}
37 43
38static struct nft_expr_type nft_masq_ipv4_type; 44static struct nft_expr_type nft_masq_ipv4_type;
@@ -41,6 +47,7 @@ static const struct nft_expr_ops nft_masq_ipv4_ops = {
41 .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), 47 .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
42 .eval = nft_masq_ipv4_eval, 48 .eval = nft_masq_ipv4_eval,
43 .init = nft_masq_init, 49 .init = nft_masq_init,
50 .destroy = nft_masq_ipv4_destroy,
44 .dump = nft_masq_dump, 51 .dump = nft_masq_dump,
45 .validate = nft_masq_validate, 52 .validate = nft_masq_validate,
46}; 53};
@@ -77,5 +84,5 @@ module_init(nft_masq_ipv4_module_init);
77module_exit(nft_masq_ipv4_module_exit); 84module_exit(nft_masq_ipv4_module_exit);
78 85
79MODULE_LICENSE("GPL"); 86MODULE_LICENSE("GPL");
80MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); 87MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
81MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq"); 88MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq");
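
The new ->destroy hook drops a per-netns conntrack reference, which implies the shared init path takes one. Whether nft_masq_init() does exactly this is an assumption; the sketch only illustrates the expected acquire/release pairing:

static int example_masq_ipv4_init(const struct nft_ctx *ctx)
{
	/* paired with the nf_ct_netns_put() in the destroy hook above */
	return nf_ct_netns_get(ctx->net, NFPROTO_IPV4);
}
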
diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c
index c09d4381427e..5120be1d3118 100644
--- a/net/ipv4/netfilter/nft_redir_ipv4.c
+++ b/net/ipv4/netfilter/nft_redir_ipv4.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> 2 * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 5 * it under the terms of the GNU General Public License version 2 as
@@ -26,17 +26,22 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr,
26 26
27 memset(&mr, 0, sizeof(mr)); 27 memset(&mr, 0, sizeof(mr));
28 if (priv->sreg_proto_min) { 28 if (priv->sreg_proto_min) {
29 mr.range[0].min.all = 29 mr.range[0].min.all = (__force __be16)nft_reg_load16(
30 *(__be16 *)&regs->data[priv->sreg_proto_min]; 30 &regs->data[priv->sreg_proto_min]);
31 mr.range[0].max.all = 31 mr.range[0].max.all = (__force __be16)nft_reg_load16(
32 *(__be16 *)&regs->data[priv->sreg_proto_max]; 32 &regs->data[priv->sreg_proto_max]);
33 mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 33 mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
34 } 34 }
35 35
36 mr.range[0].flags |= priv->flags; 36 mr.range[0].flags |= priv->flags;
37 37
38 regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, 38 regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, nft_hook(pkt));
39 pkt->hook); 39}
40
41static void
42nft_redir_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
43{
44 nf_ct_netns_put(ctx->net, NFPROTO_IPV4);
40} 45}
41 46
42static struct nft_expr_type nft_redir_ipv4_type; 47static struct nft_expr_type nft_redir_ipv4_type;
@@ -45,6 +50,7 @@ static const struct nft_expr_ops nft_redir_ipv4_ops = {
45 .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), 50 .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
46 .eval = nft_redir_ipv4_eval, 51 .eval = nft_redir_ipv4_eval,
47 .init = nft_redir_init, 52 .init = nft_redir_init,
53 .destroy = nft_redir_ipv4_destroy,
48 .dump = nft_redir_dump, 54 .dump = nft_redir_dump,
49 .validate = nft_redir_validate, 55 .validate = nft_redir_validate,
50}; 56};
@@ -72,5 +78,5 @@ module_init(nft_redir_ipv4_module_init);
72module_exit(nft_redir_ipv4_module_exit); 78module_exit(nft_redir_ipv4_module_exit);
73 79
74MODULE_LICENSE("GPL"); 80MODULE_LICENSE("GPL");
75MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); 81MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
76MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir"); 82MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir");
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
index 2c2553b9026c..517ce93699de 100644
--- a/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -27,10 +27,10 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr,
27 27
28 switch (priv->type) { 28 switch (priv->type) {
29 case NFT_REJECT_ICMP_UNREACH: 29 case NFT_REJECT_ICMP_UNREACH:
30 nf_send_unreach(pkt->skb, priv->icmp_code, pkt->hook); 30 nf_send_unreach(pkt->skb, priv->icmp_code, nft_hook(pkt));
31 break; 31 break;
32 case NFT_REJECT_TCP_RST: 32 case NFT_REJECT_TCP_RST:
33 nf_send_reset(pkt->net, pkt->skb, pkt->hook); 33 nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt));
34 break; 34 break;
35 default: 35 default:
36 break; 36 break;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 96b8e2b95731..ccfbce13a633 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -156,17 +156,18 @@ int ping_hash(struct sock *sk)
156void ping_unhash(struct sock *sk) 156void ping_unhash(struct sock *sk)
157{ 157{
158 struct inet_sock *isk = inet_sk(sk); 158 struct inet_sock *isk = inet_sk(sk);
159
159 pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); 160 pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num);
161 write_lock_bh(&ping_table.lock);
160 if (sk_hashed(sk)) { 162 if (sk_hashed(sk)) {
161 write_lock_bh(&ping_table.lock);
162 hlist_nulls_del(&sk->sk_nulls_node); 163 hlist_nulls_del(&sk->sk_nulls_node);
163 sk_nulls_node_init(&sk->sk_nulls_node); 164 sk_nulls_node_init(&sk->sk_nulls_node);
164 sock_put(sk); 165 sock_put(sk);
165 isk->inet_num = 0; 166 isk->inet_num = 0;
166 isk->inet_sport = 0; 167 isk->inet_sport = 0;
167 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 168 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
168 write_unlock_bh(&ping_table.lock);
169 } 169 }
170 write_unlock_bh(&ping_table.lock);
170} 171}
171EXPORT_SYMBOL_GPL(ping_unhash); 172EXPORT_SYMBOL_GPL(ping_unhash);
172 173
@@ -433,9 +434,9 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
433 goto out; 434 goto out;
434 } 435 }
435 436
436 pr_debug("after bind(): num = %d, dif = %d\n", 437 pr_debug("after bind(): num = %hu, dif = %d\n",
437 (int)isk->inet_num, 438 isk->inet_num,
438 (int)sk->sk_bound_dev_if); 439 sk->sk_bound_dev_if);
439 440
440 err = 0; 441 err = 0;
441 if (sk->sk_family == AF_INET && isk->inet_rcv_saddr) 442 if (sk->sk_family == AF_INET && isk->inet_rcv_saddr)
@@ -609,15 +610,15 @@ int ping_getfrag(void *from, char *to,
609 fraglen -= sizeof(struct icmphdr); 610 fraglen -= sizeof(struct icmphdr);
610 if (fraglen < 0) 611 if (fraglen < 0)
611 BUG(); 612 BUG();
612 if (csum_and_copy_from_iter(to + sizeof(struct icmphdr), 613 if (!csum_and_copy_from_iter_full(to + sizeof(struct icmphdr),
613 fraglen, &pfh->wcheck, 614 fraglen, &pfh->wcheck,
614 &pfh->msg->msg_iter) != fraglen) 615 &pfh->msg->msg_iter))
615 return -EFAULT; 616 return -EFAULT;
616 } else if (offset < sizeof(struct icmphdr)) { 617 } else if (offset < sizeof(struct icmphdr)) {
617 BUG(); 618 BUG();
618 } else { 619 } else {
619 if (csum_and_copy_from_iter(to, fraglen, &pfh->wcheck, 620 if (!csum_and_copy_from_iter_full(to, fraglen, &pfh->wcheck,
620 &pfh->msg->msg_iter) != fraglen) 621 &pfh->msg->msg_iter))
621 return -EFAULT; 622 return -EFAULT;
622 } 623 }
623 624
@@ -642,6 +643,8 @@ static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
642{ 643{
643 struct sk_buff *skb = skb_peek(&sk->sk_write_queue); 644 struct sk_buff *skb = skb_peek(&sk->sk_write_queue);
644 645
646 if (!skb)
647 return 0;
645 pfh->wcheck = csum_partial((char *)&pfh->icmph, 648 pfh->wcheck = csum_partial((char *)&pfh->icmph,
646 sizeof(struct icmphdr), pfh->wcheck); 649 sizeof(struct icmphdr), pfh->wcheck);
647 pfh->icmph.checksum = csum_fold(pfh->wcheck); 650 pfh->icmph.checksum = csum_fold(pfh->wcheck);
@@ -793,7 +796,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
793 796
794 flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, 797 flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
795 RT_SCOPE_UNIVERSE, sk->sk_protocol, 798 RT_SCOPE_UNIVERSE, sk->sk_protocol,
796 inet_sk_flowi_flags(sk), faddr, saddr, 0, 0); 799 inet_sk_flowi_flags(sk), faddr, saddr, 0, 0,
800 sk->sk_uid);
797 801
798 security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); 802 security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
799 rt = ip_route_output_flow(net, &fl4, sk); 803 rt = ip_route_output_flow(net, &fl4, sk);
@@ -847,7 +851,8 @@ out:
847 return err; 851 return err;
848 852
849do_confirm: 853do_confirm:
850 dst_confirm(&rt->dst); 854 if (msg->msg_flags & MSG_PROBE)
855 dst_confirm_neigh(&rt->dst, &fl4.daddr);
851 if (!(msg->msg_flags & MSG_PROBE) || len) 856 if (!(msg->msg_flags & MSG_PROBE) || len)
852 goto back_from_confirm; 857 goto back_from_confirm;
853 err = 0; 858 err = 0;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 7143ca1a6af9..69cf49e8356d 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -57,15 +57,13 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
57 unsigned int frag_mem; 57 unsigned int frag_mem;
58 int orphans, sockets; 58 int orphans, sockets;
59 59
60 local_bh_disable();
61 orphans = percpu_counter_sum_positive(&tcp_orphan_count); 60 orphans = percpu_counter_sum_positive(&tcp_orphan_count);
62 sockets = proto_sockets_allocated_sum_positive(&tcp_prot); 61 sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
63 local_bh_enable();
64 62
65 socket_seq_show(seq); 63 socket_seq_show(seq);
66 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", 64 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
67 sock_prot_inuse_get(net, &tcp_prot), orphans, 65 sock_prot_inuse_get(net, &tcp_prot), orphans,
68 atomic_read(&tcp_death_row.tw_count), sockets, 66 atomic_read(&net->ipv4.tcp_death_row.tw_count), sockets,
69 proto_memory_allocated(&tcp_prot)); 67 proto_memory_allocated(&tcp_prot));
70 seq_printf(seq, "UDP: inuse %d mem %ld\n", 68 seq_printf(seq, "UDP: inuse %d mem %ld\n",
71 sock_prot_inuse_get(net, &udp_prot), 69 sock_prot_inuse_get(net, &udp_prot),
@@ -264,6 +262,7 @@ static const struct snmp_mib snmp4_net_list[] = {
264 SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED), 262 SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
265 SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK), 263 SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
266 SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP), 264 SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
265 SNMP_MIB_ITEM("PFMemallocDrop", LINUX_MIB_PFMEMALLOCDROP),
267 SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), 266 SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
268 SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), 267 SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
269 SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), 268 SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index ecbe5a7c2d6d..9d943974de2b 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -41,7 +41,7 @@
41#include <linux/atomic.h> 41#include <linux/atomic.h>
42#include <asm/byteorder.h> 42#include <asm/byteorder.h>
43#include <asm/current.h> 43#include <asm/current.h>
44#include <asm/uaccess.h> 44#include <linux/uaccess.h>
45#include <asm/ioctls.h> 45#include <asm/ioctls.h>
46#include <linux/stddef.h> 46#include <linux/stddef.h>
47#include <linux/slab.h> 47#include <linux/slab.h>
@@ -89,9 +89,10 @@ struct raw_frag_vec {
89 int hlen; 89 int hlen;
90}; 90};
91 91
92static struct raw_hashinfo raw_v4_hashinfo = { 92struct raw_hashinfo raw_v4_hashinfo = {
93 .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), 93 .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
94}; 94};
95EXPORT_SYMBOL_GPL(raw_v4_hashinfo);
95 96
96int raw_hash_sk(struct sock *sk) 97int raw_hash_sk(struct sock *sk)
97{ 98{
@@ -120,7 +121,7 @@ void raw_unhash_sk(struct sock *sk)
120} 121}
121EXPORT_SYMBOL_GPL(raw_unhash_sk); 122EXPORT_SYMBOL_GPL(raw_unhash_sk);
122 123
123static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, 124struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
124 unsigned short num, __be32 raddr, __be32 laddr, int dif) 125 unsigned short num, __be32 raddr, __be32 laddr, int dif)
125{ 126{
126 sk_for_each_from(sk) { 127 sk_for_each_from(sk) {
@@ -136,6 +137,7 @@ static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
136found: 137found:
137 return sk; 138 return sk;
138} 139}
140EXPORT_SYMBOL_GPL(__raw_v4_lookup);
139 141
140/* 142/*
141 * 0 - deliver 143 * 0 - deliver
@@ -381,6 +383,9 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
381 383
382 sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); 384 sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
383 385
386 if (flags & MSG_CONFIRM)
387 skb_set_dst_pending_confirm(skb, 1);
388
384 skb->transport_header = skb->network_header; 389 skb->transport_header = skb->network_header;
385 err = -EFAULT; 390 err = -EFAULT;
386 if (memcpy_from_msg(iph, msg, length)) 391 if (memcpy_from_msg(iph, msg, length))
@@ -604,7 +609,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
604 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, 609 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
605 inet_sk_flowi_flags(sk) | 610 inet_sk_flowi_flags(sk) |
606 (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), 611 (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
607 daddr, saddr, 0, 0); 612 daddr, saddr, 0, 0, sk->sk_uid);
608 613
609 if (!inet->hdrincl) { 614 if (!inet->hdrincl) {
610 rfv.msg = msg; 615 rfv.msg = msg;
@@ -664,7 +669,8 @@ out:
664 return len; 669 return len;
665 670
666do_confirm: 671do_confirm:
667 dst_confirm(&rt->dst); 672 if (msg->msg_flags & MSG_PROBE)
673 dst_confirm_neigh(&rt->dst, &fl4.daddr);
668 if (!(msg->msg_flags & MSG_PROBE) || len) 674 if (!(msg->msg_flags & MSG_PROBE) || len)
669 goto back_from_confirm; 675 goto back_from_confirm;
670 err = 0; 676 err = 0;
@@ -676,7 +682,9 @@ static void raw_close(struct sock *sk, long timeout)
676 /* 682 /*
677 * Raw sockets may have direct kernel references. Kill them. 683 * Raw sockets may have direct kernel references. Kill them.
678 */ 684 */
685 rtnl_lock();
679 ip_ra_control(sk, 0, NULL); 686 ip_ra_control(sk, 0, NULL);
687 rtnl_unlock();
680 688
681 sk_common_release(sk); 689 sk_common_release(sk);
682} 690}
@@ -693,12 +701,20 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
693{ 701{
694 struct inet_sock *inet = inet_sk(sk); 702 struct inet_sock *inet = inet_sk(sk);
695 struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; 703 struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
704 u32 tb_id = RT_TABLE_LOCAL;
696 int ret = -EINVAL; 705 int ret = -EINVAL;
697 int chk_addr_ret; 706 int chk_addr_ret;
698 707
699 if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) 708 if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
700 goto out; 709 goto out;
701 chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); 710
711 if (sk->sk_bound_dev_if)
712 tb_id = l3mdev_fib_table_by_index(sock_net(sk),
713 sk->sk_bound_dev_if) ? : tb_id;
714
715 chk_addr_ret = inet_addr_type_table(sock_net(sk), addr->sin_addr.s_addr,
716 tb_id);
717
702 ret = -EADDRNOTAVAIL; 718 ret = -EADDRNOTAVAIL;
703 if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && 719 if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
704 chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) 720 chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
@@ -912,6 +928,20 @@ static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg
912} 928}
913#endif 929#endif
914 930
931int raw_abort(struct sock *sk, int err)
932{
933 lock_sock(sk);
934
935 sk->sk_err = err;
936 sk->sk_error_report(sk);
937 __udp_disconnect(sk, 0);
938
939 release_sock(sk);
940
941 return 0;
942}
943EXPORT_SYMBOL_GPL(raw_abort);
944
915struct proto raw_prot = { 945struct proto raw_prot = {
916 .name = "RAW", 946 .name = "RAW",
917 .owner = THIS_MODULE, 947 .owner = THIS_MODULE,
@@ -937,6 +967,7 @@ struct proto raw_prot = {
937 .compat_getsockopt = compat_raw_getsockopt, 967 .compat_getsockopt = compat_raw_getsockopt,
938 .compat_ioctl = compat_raw_ioctl, 968 .compat_ioctl = compat_raw_ioctl,
939#endif 969#endif
970 .diag_destroy = raw_abort,
940}; 971};
941 972
942#ifdef CONFIG_PROC_FS 973#ifdef CONFIG_PROC_FS
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
new file mode 100644
index 000000000000..e1a51ca68d23
--- /dev/null
+++ b/net/ipv4/raw_diag.c
@@ -0,0 +1,266 @@
1#include <linux/module.h>
2
3#include <linux/inet_diag.h>
4#include <linux/sock_diag.h>
5
6#include <net/inet_sock.h>
7#include <net/raw.h>
8#include <net/rawv6.h>
9
10#ifdef pr_fmt
11# undef pr_fmt
12#endif
13
14#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15
16static struct raw_hashinfo *
17raw_get_hashinfo(const struct inet_diag_req_v2 *r)
18{
19 if (r->sdiag_family == AF_INET) {
20 return &raw_v4_hashinfo;
21#if IS_ENABLED(CONFIG_IPV6)
22 } else if (r->sdiag_family == AF_INET6) {
23 return &raw_v6_hashinfo;
24#endif
25 } else {
26 pr_warn_once("Unexpected inet family %d\n",
27 r->sdiag_family);
28 WARN_ON_ONCE(1);
29 return ERR_PTR(-EINVAL);
30 }
31}
32
33/*
 34 * Due to the requirement of not breaking the user API we can't simply
 35 * rename the @pad field in the inet_diag_req_v2 structure; instead,
 36 * use a helper to figure it out.
37 */
38
39static struct sock *raw_lookup(struct net *net, struct sock *from,
40 const struct inet_diag_req_v2 *req)
41{
42 struct inet_diag_req_raw *r = (void *)req;
43 struct sock *sk = NULL;
44
45 if (r->sdiag_family == AF_INET)
46 sk = __raw_v4_lookup(net, from, r->sdiag_raw_protocol,
47 r->id.idiag_dst[0],
48 r->id.idiag_src[0],
49 r->id.idiag_if);
50#if IS_ENABLED(CONFIG_IPV6)
51 else
52 sk = __raw_v6_lookup(net, from, r->sdiag_raw_protocol,
53 (const struct in6_addr *)r->id.idiag_src,
54 (const struct in6_addr *)r->id.idiag_dst,
55 r->id.idiag_if);
56#endif
57 return sk;
58}
59
60static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r)
61{
62 struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
63 struct sock *sk = NULL, *s;
64 int slot;
65
66 if (IS_ERR(hashinfo))
67 return ERR_CAST(hashinfo);
68
69 read_lock(&hashinfo->lock);
70 for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) {
71 sk_for_each(s, &hashinfo->ht[slot]) {
72 sk = raw_lookup(net, s, r);
73 if (sk) {
74 /*
75 * Grab it and keep until we fill
 76				 * the diag message to be reported, so the
 77				 * caller should call sock_put() afterwards.
 78				 * We can do that because we're holding
 79				 * hashinfo->lock here.
80 */
81 sock_hold(sk);
82 goto out_unlock;
83 }
84 }
85 }
86out_unlock:
87 read_unlock(&hashinfo->lock);
88
89 return sk ? sk : ERR_PTR(-ENOENT);
90}
91
92static int raw_diag_dump_one(struct sk_buff *in_skb,
93 const struct nlmsghdr *nlh,
94 const struct inet_diag_req_v2 *r)
95{
96 struct net *net = sock_net(in_skb->sk);
97 struct sk_buff *rep;
98 struct sock *sk;
99 int err;
100
101 sk = raw_sock_get(net, r);
102 if (IS_ERR(sk))
103 return PTR_ERR(sk);
104
105 rep = nlmsg_new(sizeof(struct inet_diag_msg) +
106 sizeof(struct inet_diag_meminfo) + 64,
107 GFP_KERNEL);
108 if (!rep) {
109 sock_put(sk);
110 return -ENOMEM;
111 }
112
113 err = inet_sk_diag_fill(sk, NULL, rep, r,
114 sk_user_ns(NETLINK_CB(in_skb).sk),
115 NETLINK_CB(in_skb).portid,
116 nlh->nlmsg_seq, 0, nlh,
117 netlink_net_capable(in_skb, CAP_NET_ADMIN));
118 sock_put(sk);
119
120 if (err < 0) {
121 kfree_skb(rep);
122 return err;
123 }
124
125 err = netlink_unicast(net->diag_nlsk, rep,
126 NETLINK_CB(in_skb).portid,
127 MSG_DONTWAIT);
128 if (err > 0)
129 err = 0;
130 return err;
131}
132
133static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
134 struct netlink_callback *cb,
135 const struct inet_diag_req_v2 *r,
136 struct nlattr *bc, bool net_admin)
137{
138 if (!inet_diag_bc_sk(bc, sk))
139 return 0;
140
141 return inet_sk_diag_fill(sk, NULL, skb, r,
142 sk_user_ns(NETLINK_CB(cb->skb).sk),
143 NETLINK_CB(cb->skb).portid,
144 cb->nlh->nlmsg_seq, NLM_F_MULTI,
145 cb->nlh, net_admin);
146}
147
148static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
149 const struct inet_diag_req_v2 *r, struct nlattr *bc)
150{
151 bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
152 struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
153 struct net *net = sock_net(skb->sk);
154 int num, s_num, slot, s_slot;
155 struct sock *sk = NULL;
156
157 if (IS_ERR(hashinfo))
158 return;
159
160 s_slot = cb->args[0];
161 num = s_num = cb->args[1];
162
163 read_lock(&hashinfo->lock);
164 for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) {
165 num = 0;
166
167 sk_for_each(sk, &hashinfo->ht[slot]) {
168 struct inet_sock *inet = inet_sk(sk);
169
170 if (!net_eq(sock_net(sk), net))
171 continue;
172 if (num < s_num)
173 goto next;
174 if (sk->sk_family != r->sdiag_family)
175 goto next;
176 if (r->id.idiag_sport != inet->inet_sport &&
177 r->id.idiag_sport)
178 goto next;
179 if (r->id.idiag_dport != inet->inet_dport &&
180 r->id.idiag_dport)
181 goto next;
182 if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0)
183 goto out_unlock;
184next:
185 num++;
186 }
187 }
188
189out_unlock:
190 read_unlock(&hashinfo->lock);
191
192 cb->args[0] = slot;
193 cb->args[1] = num;
194}
195
196static void raw_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
197 void *info)
198{
199 r->idiag_rqueue = sk_rmem_alloc_get(sk);
200 r->idiag_wqueue = sk_wmem_alloc_get(sk);
201}
202
203#ifdef CONFIG_INET_DIAG_DESTROY
204static int raw_diag_destroy(struct sk_buff *in_skb,
205 const struct inet_diag_req_v2 *r)
206{
207 struct net *net = sock_net(in_skb->sk);
208 struct sock *sk;
209 int err;
210
211 sk = raw_sock_get(net, r);
212 if (IS_ERR(sk))
213 return PTR_ERR(sk);
214 err = sock_diag_destroy(sk, ECONNABORTED);
215 sock_put(sk);
216 return err;
217}
218#endif
219
220static const struct inet_diag_handler raw_diag_handler = {
221 .dump = raw_diag_dump,
222 .dump_one = raw_diag_dump_one,
223 .idiag_get_info = raw_diag_get_info,
224 .idiag_type = IPPROTO_RAW,
225 .idiag_info_size = 0,
226#ifdef CONFIG_INET_DIAG_DESTROY
227 .destroy = raw_diag_destroy,
228#endif
229};
230
231static void __always_unused __check_inet_diag_req_raw(void)
232{
233 /*
234 * Make sure the two structures are identical,
235 * except the @pad field.
236 */
237#define __offset_mismatch(m1, m2) \
238 (offsetof(struct inet_diag_req_v2, m1) != \
239 offsetof(struct inet_diag_req_raw, m2))
240
241 BUILD_BUG_ON(sizeof(struct inet_diag_req_v2) !=
242 sizeof(struct inet_diag_req_raw));
243 BUILD_BUG_ON(__offset_mismatch(sdiag_family, sdiag_family));
244 BUILD_BUG_ON(__offset_mismatch(sdiag_protocol, sdiag_protocol));
245 BUILD_BUG_ON(__offset_mismatch(idiag_ext, idiag_ext));
246 BUILD_BUG_ON(__offset_mismatch(pad, sdiag_raw_protocol));
247 BUILD_BUG_ON(__offset_mismatch(idiag_states, idiag_states));
248 BUILD_BUG_ON(__offset_mismatch(id, id));
249#undef __offset_mismatch
250}
251
252static int __init raw_diag_init(void)
253{
254 return inet_diag_register(&raw_diag_handler);
255}
256
257static void __exit raw_diag_exit(void)
258{
259 inet_diag_unregister(&raw_diag_handler);
260}
261
262module_init(raw_diag_init);
263module_exit(raw_diag_exit);
264MODULE_LICENSE("GPL");
265MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-255 /* AF_INET - IPPROTO_RAW */);
266MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10-255 /* AF_INET6 - IPPROTO_RAW */);
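
The BUILD_BUG_ON() block above pins the layout this module depends on: inet_diag_req_raw must match inet_diag_req_v2 byte for byte, with the former @pad byte reinterpreted as the raw protocol selector. Reconstructed from those offset checks (not quoted from the patch), the request structure looks like this:

struct example_inet_diag_req_raw {
	__u8	sdiag_family;
	__u8	sdiag_protocol;
	__u8	idiag_ext;
	__u8	sdiag_raw_protocol;	/* sits where inet_diag_req_v2.pad is */
	__u32	idiag_states;
	struct inet_diag_sockid id;
};
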
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2a57566e6e91..acd69cfe2951 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -65,7 +65,7 @@
65#define pr_fmt(fmt) "IPv4: " fmt 65#define pr_fmt(fmt) "IPv4: " fmt
66 66
67#include <linux/module.h> 67#include <linux/module.h>
68#include <asm/uaccess.h> 68#include <linux/uaccess.h>
69#include <linux/bitops.h> 69#include <linux/bitops.h>
70#include <linux/types.h> 70#include <linux/types.h>
71#include <linux/kernel.h> 71#include <linux/kernel.h>
@@ -154,6 +154,7 @@ static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, 154static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
155 struct sk_buff *skb, 155 struct sk_buff *skb,
156 const void *daddr); 156 const void *daddr);
157static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
157 158
158static struct dst_ops ipv4_dst_ops = { 159static struct dst_ops ipv4_dst_ops = {
159 .family = AF_INET, 160 .family = AF_INET,
@@ -168,6 +169,7 @@ static struct dst_ops ipv4_dst_ops = {
168 .redirect = ip_do_redirect, 169 .redirect = ip_do_redirect,
169 .local_out = __ip_local_out, 170 .local_out = __ip_local_out,
170 .neigh_lookup = ipv4_neigh_lookup, 171 .neigh_lookup = ipv4_neigh_lookup,
172 .confirm_neigh = ipv4_confirm_neigh,
171}; 173};
172 174
173#define ECN_OR_COST(class) TC_PRIO_##class 175#define ECN_OR_COST(class) TC_PRIO_##class
@@ -461,6 +463,23 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
461 return neigh_create(&arp_tbl, pkey, dev); 463 return neigh_create(&arp_tbl, pkey, dev);
462} 464}
463 465
466static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
467{
468 struct net_device *dev = dst->dev;
469 const __be32 *pkey = daddr;
470 const struct rtable *rt;
471
472 rt = (const struct rtable *)dst;
473 if (rt->rt_gateway)
474 pkey = (const __be32 *)&rt->rt_gateway;
475 else if (!daddr ||
476 (rt->rt_flags &
477 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
478 return;
479
480 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
481}
482
464#define IP_IDENTS_SZ 2048u 483#define IP_IDENTS_SZ 2048u
465 484
466static atomic_t *ip_idents __read_mostly; 485static atomic_t *ip_idents __read_mostly;
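
ipv4_confirm_neigh() wires IPv4 routes into the new confirm_neigh dst operation that the MSG_PROBE paths in ping.c and raw.c now reach via dst_confirm_neigh(). The generic dispatcher presumably amounts to the following; this is a sketch of the assumed helper in net/dst.h, not part of this diff:

static inline void example_dst_confirm_neigh(const struct dst_entry *dst,
					     const void *daddr)
{
	/* routes that registered the op get a chance to confirm the
	 * neighbour entry for the gateway (or daddr) of this dst */
	if (dst->ops->confirm_neigh)
		dst->ops->confirm_neigh(dst, daddr);
}
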
@@ -507,7 +526,8 @@ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
507} 526}
508EXPORT_SYMBOL(__ip_select_ident); 527EXPORT_SYMBOL(__ip_select_ident);
509 528
510static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, 529static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
530 const struct sock *sk,
511 const struct iphdr *iph, 531 const struct iphdr *iph,
512 int oif, u8 tos, 532 int oif, u8 tos,
513 u8 prot, u32 mark, int flow_flags) 533 u8 prot, u32 mark, int flow_flags)
@@ -523,19 +543,21 @@ static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
523 flowi4_init_output(fl4, oif, mark, tos, 543 flowi4_init_output(fl4, oif, mark, tos,
524 RT_SCOPE_UNIVERSE, prot, 544 RT_SCOPE_UNIVERSE, prot,
525 flow_flags, 545 flow_flags,
526 iph->daddr, iph->saddr, 0, 0); 546 iph->daddr, iph->saddr, 0, 0,
547 sock_net_uid(net, sk));
527} 548}
528 549
529static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, 550static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
530 const struct sock *sk) 551 const struct sock *sk)
531{ 552{
553 const struct net *net = dev_net(skb->dev);
532 const struct iphdr *iph = ip_hdr(skb); 554 const struct iphdr *iph = ip_hdr(skb);
533 int oif = skb->dev->ifindex; 555 int oif = skb->dev->ifindex;
534 u8 tos = RT_TOS(iph->tos); 556 u8 tos = RT_TOS(iph->tos);
535 u8 prot = iph->protocol; 557 u8 prot = iph->protocol;
536 u32 mark = skb->mark; 558 u32 mark = skb->mark;
537 559
538 __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0); 560 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
539} 561}
540 562
541static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) 563static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
@@ -552,7 +574,7 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
552 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, 574 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
553 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, 575 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
554 inet_sk_flowi_flags(sk), 576 inet_sk_flowi_flags(sk),
555 daddr, inet->inet_saddr, 0, 0); 577 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
556 rcu_read_unlock(); 578 rcu_read_unlock();
557} 579}
558 580
@@ -795,6 +817,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
795 struct rtable *rt; 817 struct rtable *rt;
796 struct flowi4 fl4; 818 struct flowi4 fl4;
797 const struct iphdr *iph = (const struct iphdr *) skb->data; 819 const struct iphdr *iph = (const struct iphdr *) skb->data;
820 struct net *net = dev_net(skb->dev);
798 int oif = skb->dev->ifindex; 821 int oif = skb->dev->ifindex;
799 u8 tos = RT_TOS(iph->tos); 822 u8 tos = RT_TOS(iph->tos);
800 u8 prot = iph->protocol; 823 u8 prot = iph->protocol;
@@ -802,7 +825,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
802 825
803 rt = (struct rtable *) dst; 826 rt = (struct rtable *) dst;
804 827
805 __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0); 828 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
806 __ip_do_redirect(rt, skb, &fl4, true); 829 __ip_do_redirect(rt, skb, &fl4, true);
807} 830}
808 831
@@ -1020,7 +1043,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1020 if (!mark) 1043 if (!mark)
1021 mark = IP4_REPLY_MARK(net, skb->mark); 1044 mark = IP4_REPLY_MARK(net, skb->mark);
1022 1045
1023 __build_flow_key(&fl4, NULL, iph, oif, 1046 __build_flow_key(net, &fl4, NULL, iph, oif,
1024 RT_TOS(iph->tos), protocol, mark, flow_flags); 1047 RT_TOS(iph->tos), protocol, mark, flow_flags);
1025 rt = __ip_route_output_key(net, &fl4); 1048 rt = __ip_route_output_key(net, &fl4);
1026 if (!IS_ERR(rt)) { 1049 if (!IS_ERR(rt)) {
@@ -1036,7 +1059,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1036 struct flowi4 fl4; 1059 struct flowi4 fl4;
1037 struct rtable *rt; 1060 struct rtable *rt;
1038 1061
1039 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); 1062 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1040 1063
1041 if (!fl4.flowi4_mark) 1064 if (!fl4.flowi4_mark)
1042 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); 1065 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
@@ -1055,6 +1078,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1055 struct rtable *rt; 1078 struct rtable *rt;
1056 struct dst_entry *odst = NULL; 1079 struct dst_entry *odst = NULL;
1057 bool new = false; 1080 bool new = false;
1081 struct net *net = sock_net(sk);
1058 1082
1059 bh_lock_sock(sk); 1083 bh_lock_sock(sk);
1060 1084
@@ -1068,7 +1092,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1068 goto out; 1092 goto out;
1069 } 1093 }
1070 1094
1071 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); 1095 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1072 1096
1073 rt = (struct rtable *)odst; 1097 rt = (struct rtable *)odst;
1074 if (odst->obsolete && !odst->ops->check(odst, 0)) { 1098 if (odst->obsolete && !odst->ops->check(odst, 0)) {
@@ -1108,7 +1132,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net,
1108 struct flowi4 fl4; 1132 struct flowi4 fl4;
1109 struct rtable *rt; 1133 struct rtable *rt;
1110 1134
1111 __build_flow_key(&fl4, NULL, iph, oif, 1135 __build_flow_key(net, &fl4, NULL, iph, oif,
1112 RT_TOS(iph->tos), protocol, mark, flow_flags); 1136 RT_TOS(iph->tos), protocol, mark, flow_flags);
1113 rt = __ip_route_output_key(net, &fl4); 1137 rt = __ip_route_output_key(net, &fl4);
1114 if (!IS_ERR(rt)) { 1138 if (!IS_ERR(rt)) {
@@ -1123,9 +1147,10 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1123 const struct iphdr *iph = (const struct iphdr *) skb->data; 1147 const struct iphdr *iph = (const struct iphdr *) skb->data;
1124 struct flowi4 fl4; 1148 struct flowi4 fl4;
1125 struct rtable *rt; 1149 struct rtable *rt;
1150 struct net *net = sock_net(sk);
1126 1151
1127 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); 1152 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1128 rt = __ip_route_output_key(sock_net(sk), &fl4); 1153 rt = __ip_route_output_key(net, &fl4);
1129 if (!IS_ERR(rt)) { 1154 if (!IS_ERR(rt)) {
1130 __ip_do_redirect(rt, skb, &fl4, false); 1155 __ip_do_redirect(rt, skb, &fl4, false);
1131 ip_rt_put(rt); 1156 ip_rt_put(rt);
@@ -1598,6 +1623,19 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1598 spin_unlock_bh(&fnhe_lock); 1623 spin_unlock_bh(&fnhe_lock);
1599} 1624}
1600 1625
1626static void set_lwt_redirect(struct rtable *rth)
1627{
1628 if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1629 rth->dst.lwtstate->orig_output = rth->dst.output;
1630 rth->dst.output = lwtunnel_output;
1631 }
1632
1633 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1634 rth->dst.lwtstate->orig_input = rth->dst.input;
1635 rth->dst.input = lwtunnel_input;
1636 }
1637}
1638
1601/* called in rcu_read_lock() section */ 1639/* called in rcu_read_lock() section */
1602static int __mkroute_input(struct sk_buff *skb, 1640static int __mkroute_input(struct sk_buff *skb,
1603 const struct fib_result *res, 1641 const struct fib_result *res,
@@ -1687,14 +1725,7 @@ rt_cache:
1687 rth->dst.input = ip_forward; 1725 rth->dst.input = ip_forward;
1688 1726
1689 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); 1727 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1690 if (lwtunnel_output_redirect(rth->dst.lwtstate)) { 1728 set_lwt_redirect(rth);
1691 rth->dst.lwtstate->orig_output = rth->dst.output;
1692 rth->dst.output = lwtunnel_output;
1693 }
1694 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1695 rth->dst.lwtstate->orig_input = rth->dst.input;
1696 rth->dst.input = lwtunnel_input;
1697 }
1698 skb_dst_set(skb, &rth->dst); 1729 skb_dst_set(skb, &rth->dst);
1699out: 1730out:
1700 err = 0; 1731 err = 0;
@@ -1746,7 +1777,6 @@ standard_hash:
1746 1777
1747static int ip_mkroute_input(struct sk_buff *skb, 1778static int ip_mkroute_input(struct sk_buff *skb,
1748 struct fib_result *res, 1779 struct fib_result *res,
1749 const struct flowi4 *fl4,
1750 struct in_device *in_dev, 1780 struct in_device *in_dev,
1751 __be32 daddr, __be32 saddr, u32 tos) 1781 __be32 daddr, __be32 saddr, u32 tos)
1752{ 1782{
@@ -1846,6 +1876,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1846 fl4.flowi4_flags = 0; 1876 fl4.flowi4_flags = 0;
1847 fl4.daddr = daddr; 1877 fl4.daddr = daddr;
1848 fl4.saddr = saddr; 1878 fl4.saddr = saddr;
1879 fl4.flowi4_uid = sock_net_uid(net, NULL);
1849 err = fib_lookup(net, &fl4, &res, 0); 1880 err = fib_lookup(net, &fl4, &res, 0);
1850 if (err != 0) { 1881 if (err != 0) {
1851 if (!IN_DEV_FORWARD(in_dev)) 1882 if (!IN_DEV_FORWARD(in_dev))
@@ -1871,7 +1902,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1871 if (res.type != RTN_UNICAST) 1902 if (res.type != RTN_UNICAST)
1872 goto martian_destination; 1903 goto martian_destination;
1873 1904
1874 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos); 1905 err = ip_mkroute_input(skb, &res, in_dev, daddr, saddr, tos);
1875out: return err; 1906out: return err;
1876 1907
1877brd_input: 1908brd_input:
@@ -1902,7 +1933,8 @@ local_input:
1902 } 1933 }
1903 } 1934 }
1904 1935
1905 rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type, 1936 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
1937 flags | RTCF_LOCAL, res.type,
1906 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); 1938 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1907 if (!rth) 1939 if (!rth)
1908 goto e_nobufs; 1940 goto e_nobufs;
@@ -1921,8 +1953,18 @@ local_input:
1921 rth->dst.error= -err; 1953 rth->dst.error= -err;
1922 rth->rt_flags &= ~RTCF_LOCAL; 1954 rth->rt_flags &= ~RTCF_LOCAL;
1923 } 1955 }
1956
1924 if (do_cache) { 1957 if (do_cache) {
1925 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) { 1958 struct fib_nh *nh = &FIB_RES_NH(res);
1959
1960 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1961 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1962 WARN_ON(rth->dst.input == lwtunnel_input);
1963 rth->dst.lwtstate->orig_input = rth->dst.input;
1964 rth->dst.input = lwtunnel_input;
1965 }
1966
1967 if (unlikely(!rt_cache_route(nh, rth))) {
1926 rth->dst.flags |= DST_NOCACHE; 1968 rth->dst.flags |= DST_NOCACHE;
1927 rt_add_uncached_list(rth); 1969 rt_add_uncached_list(rth);
1928 } 1970 }
@@ -1967,6 +2009,7 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1967{ 2009{
1968 int res; 2010 int res;
1969 2011
2012 tos &= IPTOS_RT_MASK;
1970 rcu_read_lock(); 2013 rcu_read_lock();
1971 2014
1972 /* Multicast recognition logic is moved from route cache to here. 2015 /* Multicast recognition logic is moved from route cache to here.
@@ -1982,25 +2025,35 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1982 */ 2025 */
1983 if (ipv4_is_multicast(daddr)) { 2026 if (ipv4_is_multicast(daddr)) {
1984 struct in_device *in_dev = __in_dev_get_rcu(dev); 2027 struct in_device *in_dev = __in_dev_get_rcu(dev);
2028 int our = 0;
1985 2029
1986 if (in_dev) { 2030 if (in_dev)
1987 int our = ip_check_mc_rcu(in_dev, daddr, saddr, 2031 our = ip_check_mc_rcu(in_dev, daddr, saddr,
1988 ip_hdr(skb)->protocol); 2032 ip_hdr(skb)->protocol);
1989 if (our 2033
2034 /* check l3 master if no match yet */
2035 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2036 struct in_device *l3_in_dev;
2037
2038 l3_in_dev = __in_dev_get_rcu(skb->dev);
2039 if (l3_in_dev)
2040 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2041 ip_hdr(skb)->protocol);
2042 }
2043
2044 res = -EINVAL;
2045 if (our
1990#ifdef CONFIG_IP_MROUTE 2046#ifdef CONFIG_IP_MROUTE
1991 || 2047 ||
1992 (!ipv4_is_local_multicast(daddr) && 2048 (!ipv4_is_local_multicast(daddr) &&
1993 IN_DEV_MFORWARD(in_dev)) 2049 IN_DEV_MFORWARD(in_dev))
1994#endif 2050#endif
1995 ) { 2051 ) {
1996 int res = ip_route_input_mc(skb, daddr, saddr, 2052 res = ip_route_input_mc(skb, daddr, saddr,
1997 tos, dev, our); 2053 tos, dev, our);
1998 rcu_read_unlock();
1999 return res;
2000 }
2001 } 2054 }
2002 rcu_read_unlock(); 2055 rcu_read_unlock();
2003 return -EINVAL; 2056 return res;
2004 } 2057 }
2005 res = ip_route_input_slow(skb, daddr, saddr, tos, dev); 2058 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2006 rcu_read_unlock(); 2059 rcu_read_unlock();
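The reworked multicast check above also consults the l3 master device when the frame arrived on an enslaved port. As an illustration only, a minimal userspace sketch of the scenario this enables, assuming a VRF setup where the membership is recorded against the master device ("vrf-blue" and the group address are placeholders, not part of the patch):

#include <arpa/inet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

/* Join a group with the membership recorded on the VRF (l3 master) device;
 * with the l3mdev check above, packets for that group which arrive on one
 * of the VRF's slave ports are still recognised as local multicast.
 */
static int join_group_on_vrf(int fd)
{
        struct ip_mreqn mreq;

        memset(&mreq, 0, sizeof(mreq));
        mreq.imr_multiaddr.s_addr = inet_addr("239.1.1.1");    /* placeholder group */
        mreq.imr_ifindex = if_nametoindex("vrf-blue");         /* placeholder VRF */

        return setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP,
                          &mreq, sizeof(mreq));
}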
@@ -2140,8 +2193,7 @@ add:
2140 } 2193 }
2141 2194
2142 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); 2195 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2143 if (lwtunnel_output_redirect(rth->dst.lwtstate)) 2196 set_lwt_redirect(rth);
2144 rth->dst.output = lwtunnel_output;
2145 2197
2146 return rth; 2198 return rth;
2147} 2199}
@@ -2268,7 +2320,8 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2268 res.fi = NULL; 2320 res.fi = NULL;
2269 res.table = NULL; 2321 res.table = NULL;
2270 if (fl4->flowi4_oif && 2322 if (fl4->flowi4_oif &&
2271 !netif_index_is_l3_master(net, fl4->flowi4_oif)) { 2323 (ipv4_is_multicast(fl4->daddr) ||
2324 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2272 /* Apparently, routing tables are wrong. Assume, 2325 /* Apparently, routing tables are wrong. Assume,
2273 that the destination is on link. 2326 that the destination is on link.
2274 2327
@@ -2421,7 +2474,7 @@ EXPORT_SYMBOL_GPL(ip_route_output_flow);
2421 2474
2422static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, 2475static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2423 struct flowi4 *fl4, struct sk_buff *skb, u32 portid, 2476 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2424 u32 seq, int event, int nowait, unsigned int flags) 2477 u32 seq, int event)
2425{ 2478{
2426 struct rtable *rt = skb_rtable(skb); 2479 struct rtable *rt = skb_rtable(skb);
2427 struct rtmsg *r; 2480 struct rtmsg *r;
@@ -2430,7 +2483,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2430 u32 error; 2483 u32 error;
2431 u32 metrics[RTAX_MAX]; 2484 u32 metrics[RTAX_MAX];
2432 2485
2433 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags); 2486 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), 0);
2434 if (!nlh) 2487 if (!nlh)
2435 return -EMSGSIZE; 2488 return -EMSGSIZE;
2436 2489
@@ -2439,7 +2492,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2439 r->rtm_dst_len = 32; 2492 r->rtm_dst_len = 32;
2440 r->rtm_src_len = 0; 2493 r->rtm_src_len = 0;
2441 r->rtm_tos = fl4->flowi4_tos; 2494 r->rtm_tos = fl4->flowi4_tos;
2442 r->rtm_table = table_id; 2495 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2443 if (nla_put_u32(skb, RTA_TABLE, table_id)) 2496 if (nla_put_u32(skb, RTA_TABLE, table_id))
2444 goto nla_put_failure; 2497 goto nla_put_failure;
2445 r->rtm_type = rt->rt_type; 2498 r->rtm_type = rt->rt_type;
@@ -2495,6 +2548,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2495 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) 2548 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2496 goto nla_put_failure; 2549 goto nla_put_failure;
2497 2550
2551 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2552 nla_put_u32(skb, RTA_UID,
2553 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2554 goto nla_put_failure;
2555
2498 error = rt->dst.error; 2556 error = rt->dst.error;
2499 2557
2500 if (rt_is_input_route(rt)) { 2558 if (rt_is_input_route(rt)) {
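With the hunk above, RTM_GETROUTE replies carry the UID used for the lookup as an RTA_UID attribute. A hedged sketch of how userspace might read it back, assuming headers that already define RTA_UID; the function name and "nlh" are illustrative:

#include <linux/rtnetlink.h>
#include <stdint.h>
#include <string.h>

/* Walk the rtattrs of an RTM_NEWROUTE reply and copy out RTA_UID if the
 * kernel included it (it is omitted when flowi4_uid was INVALID_UID).
 */
static int route_reply_uid(struct nlmsghdr *nlh, uint32_t *uid)
{
        struct rtmsg *rtm = NLMSG_DATA(nlh);
        struct rtattr *rta = RTM_RTA(rtm);
        int len = RTM_PAYLOAD(nlh);

        for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
                if (rta->rta_type == RTA_UID) {
                        memcpy(uid, RTA_DATA(rta), sizeof(*uid));
                        return 0;
                }
        }
        return -1;
}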
@@ -2503,18 +2561,12 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2503 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { 2561 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2504 int err = ipmr_get_route(net, skb, 2562 int err = ipmr_get_route(net, skb,
2505 fl4->saddr, fl4->daddr, 2563 fl4->saddr, fl4->daddr,
2506 r, nowait, portid); 2564 r, portid);
2507 2565
2508 if (err <= 0) { 2566 if (err <= 0) {
2509 if (!nowait) { 2567 if (err == 0)
2510 if (err == 0) 2568 return 0;
2511 return 0; 2569 goto nla_put_failure;
2512 goto nla_put_failure;
2513 } else {
2514 if (err == -EMSGSIZE)
2515 goto nla_put_failure;
2516 error = err;
2517 }
2518 } 2570 }
2519 } else 2571 } else
2520#endif 2572#endif
@@ -2547,6 +2599,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2547 int mark; 2599 int mark;
2548 struct sk_buff *skb; 2600 struct sk_buff *skb;
2549 u32 table_id = RT_TABLE_MAIN; 2601 u32 table_id = RT_TABLE_MAIN;
2602 kuid_t uid;
2550 2603
2551 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); 2604 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2552 if (err < 0) 2605 if (err < 0)
@@ -2567,13 +2620,17 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2567 skb_reset_network_header(skb); 2620 skb_reset_network_header(skb);
2568 2621
2569 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */ 2622 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2570 ip_hdr(skb)->protocol = IPPROTO_ICMP; 2623 ip_hdr(skb)->protocol = IPPROTO_UDP;
2571 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); 2624 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2572 2625
2573 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; 2626 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2574 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; 2627 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2575 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; 2628 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2576 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; 2629 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2630 if (tb[RTA_UID])
2631 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2632 else
2633 uid = (iif ? INVALID_UID : current_uid());
2577 2634
2578 memset(&fl4, 0, sizeof(fl4)); 2635 memset(&fl4, 0, sizeof(fl4));
2579 fl4.daddr = dst; 2636 fl4.daddr = dst;
@@ -2581,6 +2638,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2581 fl4.flowi4_tos = rtm->rtm_tos; 2638 fl4.flowi4_tos = rtm->rtm_tos;
2582 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; 2639 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2583 fl4.flowi4_mark = mark; 2640 fl4.flowi4_mark = mark;
2641 fl4.flowi4_uid = uid;
2584 2642
2585 if (iif) { 2643 if (iif) {
2586 struct net_device *dev; 2644 struct net_device *dev;
@@ -2594,9 +2652,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2594 skb->protocol = htons(ETH_P_IP); 2652 skb->protocol = htons(ETH_P_IP);
2595 skb->dev = dev; 2653 skb->dev = dev;
2596 skb->mark = mark; 2654 skb->mark = mark;
2597 local_bh_disable();
2598 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); 2655 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2599 local_bh_enable();
2600 2656
2601 rt = skb_rtable(skb); 2657 rt = skb_rtable(skb);
2602 if (err == 0 && rt->dst.error) 2658 if (err == 0 && rt->dst.error)
@@ -2621,7 +2677,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2621 2677
2622 err = rt_fill_info(net, dst, src, table_id, &fl4, skb, 2678 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2623 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 2679 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2624 RTM_NEWROUTE, 0, 0); 2680 RTM_NEWROUTE);
2625 if (err < 0) 2681 if (err < 0)
2626 goto errout_free; 2682 goto errout_free;
2627 2683
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index e3c4043c27de..496b97e17aaf 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -13,13 +13,13 @@
13#include <linux/tcp.h> 13#include <linux/tcp.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/random.h> 15#include <linux/random.h>
16#include <linux/cryptohash.h> 16#include <linux/siphash.h>
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/export.h> 18#include <linux/export.h>
19#include <net/tcp.h> 19#include <net/tcp.h>
20#include <net/route.h> 20#include <net/route.h>
21 21
22static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; 22static siphash_key_t syncookie_secret[2] __read_mostly;
23 23
24#define COOKIEBITS 24 /* Upper bits store count */ 24#define COOKIEBITS 24 /* Upper bits store count */
25#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) 25#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
@@ -48,24 +48,13 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
48#define TSBITS 6 48#define TSBITS 6
49#define TSMASK (((__u32)1 << TSBITS) - 1) 49#define TSMASK (((__u32)1 << TSBITS) - 1)
50 50
51static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch);
52
53static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, 51static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
54 u32 count, int c) 52 u32 count, int c)
55{ 53{
56 __u32 *tmp;
57
58 net_get_random_once(syncookie_secret, sizeof(syncookie_secret)); 54 net_get_random_once(syncookie_secret, sizeof(syncookie_secret));
59 55 return siphash_4u32((__force u32)saddr, (__force u32)daddr,
60 tmp = this_cpu_ptr(ipv4_cookie_scratch); 56 (__force u32)sport << 16 | (__force u32)dport,
61 memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c])); 57 count, &syncookie_secret[c]);
62 tmp[0] = (__force u32)saddr;
63 tmp[1] = (__force u32)daddr;
64 tmp[2] = ((__force u32)sport << 16) + (__force u32)dport;
65 tmp[3] = count;
66 sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
67
68 return tmp[17];
69} 58}
70 59
71 60
@@ -334,6 +323,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
334 treq = tcp_rsk(req); 323 treq = tcp_rsk(req);
335 treq->rcv_isn = ntohl(th->seq) - 1; 324 treq->rcv_isn = ntohl(th->seq) - 1;
336 treq->snt_isn = cookie; 325 treq->snt_isn = cookie;
326 treq->ts_off = 0;
337 req->mss = mss; 327 req->mss = mss;
338 ireq->ir_num = ntohs(th->dest); 328 ireq->ir_num = ntohs(th->dest);
339 ireq->ir_rmt_port = th->source; 329 ireq->ir_rmt_port = th->source;
@@ -372,7 +362,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
372 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, 362 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
373 inet_sk_flowi_flags(sk), 363 inet_sk_flowi_flags(sk),
374 opt->srr ? opt->faddr : ireq->ir_rmt_addr, 364 opt->srr ? opt->faddr : ireq->ir_rmt_addr,
375 ireq->ir_loc_addr, th->source, th->dest); 365 ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid);
376 security_req_classify_flow(req, flowi4_to_flowi(&fl4)); 366 security_req_classify_flow(req, flowi4_to_flowi(&fl4));
377 rt = ip_route_output_key(sock_net(sk), &fl4); 367 rt = ip_route_output_key(sock_net(sk), &fl4);
378 if (IS_ERR(rt)) { 368 if (IS_ERR(rt)) {
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 80bc36b25de2..d6880a6149ee 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -35,6 +35,8 @@ static int ip_local_port_range_min[] = { 1, 1 };
35static int ip_local_port_range_max[] = { 65535, 65535 }; 35static int ip_local_port_range_max[] = { 65535, 65535 };
36static int tcp_adv_win_scale_min = -31; 36static int tcp_adv_win_scale_min = -31;
37static int tcp_adv_win_scale_max = 31; 37static int tcp_adv_win_scale_max = 31;
38static int ip_privileged_port_min;
39static int ip_privileged_port_max = 65535;
38static int ip_ttl_min = 1; 40static int ip_ttl_min = 1;
39static int ip_ttl_max = 255; 41static int ip_ttl_max = 255;
40static int tcp_syn_retries_min = 1; 42static int tcp_syn_retries_min = 1;
@@ -79,7 +81,12 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
79 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); 81 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
80 82
81 if (write && ret == 0) { 83 if (write && ret == 0) {
82 if (range[1] < range[0]) 84 /* Ensure that the upper limit is not smaller than the lower,
85 * and that the lower does not encroach upon the privileged
86 * port limit.
87 */
88 if ((range[1] < range[0]) ||
89 (range[0] < net->ipv4.sysctl_ip_prot_sock))
83 ret = -EINVAL; 90 ret = -EINVAL;
84 else 91 else
85 set_local_port_range(net, range); 92 set_local_port_range(net, range);
@@ -88,6 +95,40 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
88 return ret; 95 return ret;
89} 96}
90 97
98/* Validate changes from /proc interface. */
99static int ipv4_privileged_ports(struct ctl_table *table, int write,
100 void __user *buffer, size_t *lenp, loff_t *ppos)
101{
102 struct net *net = container_of(table->data, struct net,
103 ipv4.sysctl_ip_prot_sock);
104 int ret;
105 int pports;
106 int range[2];
107 struct ctl_table tmp = {
108 .data = &pports,
109 .maxlen = sizeof(pports),
110 .mode = table->mode,
111 .extra1 = &ip_privileged_port_min,
112 .extra2 = &ip_privileged_port_max,
113 };
114
115 pports = net->ipv4.sysctl_ip_prot_sock;
116
117 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
118
119 if (write && ret == 0) {
120 inet_get_local_port_range(net, &range[0], &range[1]);
121 /* Ensure that the local port range doesn't overlap with the
122 * privileged port range.
123 */
124 if (range[0] < pports)
125 ret = -EINVAL;
126 else
127 net->ipv4.sysctl_ip_prot_sock = pports;
128 }
129
130 return ret;
131}
91 132
92static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high) 133static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
93{ 134{
@@ -290,13 +331,6 @@ static struct ctl_table ipv4_table[] = {
290 .proc_handler = proc_dointvec 331 .proc_handler = proc_dointvec
291 }, 332 },
292 { 333 {
293 .procname = "tcp_max_tw_buckets",
294 .data = &tcp_death_row.sysctl_max_tw_buckets,
295 .maxlen = sizeof(int),
296 .mode = 0644,
297 .proc_handler = proc_dointvec
298 },
299 {
300 .procname = "tcp_fastopen", 334 .procname = "tcp_fastopen",
301 .data = &sysctl_tcp_fastopen, 335 .data = &sysctl_tcp_fastopen,
302 .maxlen = sizeof(int), 336 .maxlen = sizeof(int),
@@ -310,13 +344,6 @@ static struct ctl_table ipv4_table[] = {
310 .proc_handler = proc_tcp_fastopen_key, 344 .proc_handler = proc_tcp_fastopen_key,
311 }, 345 },
312 { 346 {
313 .procname = "tcp_tw_recycle",
314 .data = &tcp_death_row.sysctl_tw_recycle,
315 .maxlen = sizeof(int),
316 .mode = 0644,
317 .proc_handler = proc_dointvec
318 },
319 {
320 .procname = "tcp_abort_on_overflow", 347 .procname = "tcp_abort_on_overflow",
321 .data = &sysctl_tcp_abort_on_overflow, 348 .data = &sysctl_tcp_abort_on_overflow,
322 .maxlen = sizeof(int), 349 .maxlen = sizeof(int),
@@ -338,13 +365,6 @@ static struct ctl_table ipv4_table[] = {
338 .proc_handler = proc_dointvec 365 .proc_handler = proc_dointvec
339 }, 366 },
340 { 367 {
341 .procname = "tcp_max_syn_backlog",
342 .data = &sysctl_max_syn_backlog,
343 .maxlen = sizeof(int),
344 .mode = 0644,
345 .proc_handler = proc_dointvec
346 },
347 {
348 .procname = "inet_peer_threshold", 368 .procname = "inet_peer_threshold",
349 .data = &inet_peer_threshold, 369 .data = &inet_peer_threshold,
350 .maxlen = sizeof(int), 370 .maxlen = sizeof(int),
@@ -433,13 +453,6 @@ static struct ctl_table ipv4_table[] = {
433 .extra2 = &tcp_adv_win_scale_max, 453 .extra2 = &tcp_adv_win_scale_max,
434 }, 454 },
435 { 455 {
436 .procname = "tcp_tw_reuse",
437 .data = &sysctl_tcp_tw_reuse,
438 .maxlen = sizeof(int),
439 .mode = 0644,
440 .proc_handler = proc_dointvec
441 },
442 {
443 .procname = "tcp_frto", 456 .procname = "tcp_frto",
444 .data = &sysctl_tcp_frto, 457 .data = &sysctl_tcp_frto,
445 .maxlen = sizeof(int), 458 .maxlen = sizeof(int),
@@ -565,13 +578,6 @@ static struct ctl_table ipv4_table[] = {
565 .proc_handler = proc_dointvec 578 .proc_handler = proc_dointvec
566 }, 579 },
567 { 580 {
568 .procname = "tcp_thin_dupack",
569 .data = &sysctl_tcp_thin_dupack,
570 .maxlen = sizeof(int),
571 .mode = 0644,
572 .proc_handler = proc_dointvec
573 },
574 {
575 .procname = "tcp_early_retrans", 581 .procname = "tcp_early_retrans",
576 .data = &sysctl_tcp_early_retrans, 582 .data = &sysctl_tcp_early_retrans,
577 .maxlen = sizeof(int), 583 .maxlen = sizeof(int),
@@ -958,7 +964,35 @@ static struct ctl_table ipv4_net_table[] = {
958 .data = &init_net.ipv4.sysctl_tcp_notsent_lowat, 964 .data = &init_net.ipv4.sysctl_tcp_notsent_lowat,
959 .maxlen = sizeof(unsigned int), 965 .maxlen = sizeof(unsigned int),
960 .mode = 0644, 966 .mode = 0644,
961 .proc_handler = proc_dointvec, 967 .proc_handler = proc_douintvec,
968 },
969 {
970 .procname = "tcp_tw_reuse",
971 .data = &init_net.ipv4.sysctl_tcp_tw_reuse,
972 .maxlen = sizeof(int),
973 .mode = 0644,
974 .proc_handler = proc_dointvec
975 },
976 {
977 .procname = "tcp_max_tw_buckets",
978 .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets,
979 .maxlen = sizeof(int),
980 .mode = 0644,
981 .proc_handler = proc_dointvec
982 },
983 {
984 .procname = "tcp_tw_recycle",
985 .data = &init_net.ipv4.tcp_death_row.sysctl_tw_recycle,
986 .maxlen = sizeof(int),
987 .mode = 0644,
988 .proc_handler = proc_dointvec
989 },
990 {
991 .procname = "tcp_max_syn_backlog",
992 .data = &init_net.ipv4.sysctl_max_syn_backlog,
993 .maxlen = sizeof(int),
994 .mode = 0644,
995 .proc_handler = proc_dointvec
962 }, 996 },
963#ifdef CONFIG_IP_ROUTE_MULTIPATH 997#ifdef CONFIG_IP_ROUTE_MULTIPATH
964 { 998 {
@@ -971,6 +1005,24 @@ static struct ctl_table ipv4_net_table[] = {
971 .extra2 = &one, 1005 .extra2 = &one,
972 }, 1006 },
973#endif 1007#endif
1008 {
1009 .procname = "ip_unprivileged_port_start",
1010 .maxlen = sizeof(int),
1011 .data = &init_net.ipv4.sysctl_ip_prot_sock,
1012 .mode = 0644,
1013 .proc_handler = ipv4_privileged_ports,
1014 },
1015#ifdef CONFIG_NET_L3_MASTER_DEV
1016 {
1017 .procname = "udp_l3mdev_accept",
1018 .data = &init_net.ipv4.sysctl_udp_l3mdev_accept,
1019 .maxlen = sizeof(int),
1020 .mode = 0644,
1021 .proc_handler = proc_dointvec_minmax,
1022 .extra1 = &zero,
1023 .extra2 = &one,
1024 },
1025#endif
974 { } 1026 { }
975}; 1027};
976 1028
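The new net.ipv4.ip_unprivileged_port_start knob (policed by ipv4_privileged_ports so it never overlaps the local port range) lets an administrator shrink the privileged port range. A minimal sketch of what that permits, assuming the sysctl has been lowered to 80 or below; the port and wildcard address are placeholders:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

/* Bind to a traditionally privileged port without CAP_NET_BIND_SERVICE.
 * This succeeds only when ip_unprivileged_port_start has been lowered so
 * that port 80 is no longer privileged; otherwise bind() fails with EACCES.
 */
static int bind_port_80(void)
{
        struct sockaddr_in addr;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;

        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_port = htons(80);
        addr.sin_addr.s_addr = htonl(INADDR_ANY);

        if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
                perror("bind");
                return -1;
        }
        return fd;
}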
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 814af89c1bd3..40ba4249a586 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -277,9 +277,8 @@
277#include <net/ip.h> 277#include <net/ip.h>
278#include <net/sock.h> 278#include <net/sock.h>
279 279
280#include <asm/uaccess.h> 280#include <linux/uaccess.h>
281#include <asm/ioctls.h> 281#include <asm/ioctls.h>
282#include <asm/unaligned.h>
283#include <net/busy_poll.h> 282#include <net/busy_poll.h>
284 283
285int sysctl_tcp_min_tso_segs __read_mostly = 2; 284int sysctl_tcp_min_tso_segs __read_mostly = 2;
@@ -405,10 +404,8 @@ void tcp_init_sock(struct sock *sk)
405 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 404 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
406 tp->snd_cwnd_clamp = ~0; 405 tp->snd_cwnd_clamp = ~0;
407 tp->mss_cache = TCP_MSS_DEFAULT; 406 tp->mss_cache = TCP_MSS_DEFAULT;
408 u64_stats_init(&tp->syncp);
409 407
410 tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; 408 tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
411 tcp_enable_early_retrans(tp);
412 tcp_assign_congestion_control(sk); 409 tcp_assign_congestion_control(sk);
413 410
414 tp->tsoffset = 0; 411 tp->tsoffset = 0;
@@ -423,15 +420,13 @@ void tcp_init_sock(struct sock *sk)
423 sk->sk_sndbuf = sysctl_tcp_wmem[1]; 420 sk->sk_sndbuf = sysctl_tcp_wmem[1];
424 sk->sk_rcvbuf = sysctl_tcp_rmem[1]; 421 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
425 422
426 local_bh_disable();
427 sk_sockets_allocated_inc(sk); 423 sk_sockets_allocated_inc(sk);
428 local_bh_enable();
429} 424}
430EXPORT_SYMBOL(tcp_init_sock); 425EXPORT_SYMBOL(tcp_init_sock);
431 426
432static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb) 427static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
433{ 428{
434 if (tsflags) { 429 if (tsflags && skb) {
435 struct skb_shared_info *shinfo = skb_shinfo(skb); 430 struct skb_shared_info *shinfo = skb_shinfo(skb);
436 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); 431 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
437 432
@@ -538,6 +533,12 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
538 533
539 if (tp->urg_data & TCP_URG_VALID) 534 if (tp->urg_data & TCP_URG_VALID)
540 mask |= POLLPRI; 535 mask |= POLLPRI;
536 } else if (sk->sk_state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
537 /* Active TCP fastopen socket with defer_connect
538 * Return POLLOUT so application can call write()
539 * in order for kernel to generate SYN+data
540 */
541 mask |= POLLOUT | POLLWRNORM;
541 } 542 }
542 /* This barrier is coupled with smp_wmb() in tcp_reset() */ 543 /* This barrier is coupled with smp_wmb() in tcp_reset() */
543 smp_rmb(); 544 smp_rmb();
@@ -665,9 +666,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
665 if (tcp_should_autocork(sk, skb, size_goal)) { 666 if (tcp_should_autocork(sk, skb, size_goal)) {
666 667
667 /* avoid atomic op if TSQ_THROTTLED bit is already set */ 668 /* avoid atomic op if TSQ_THROTTLED bit is already set */
668 if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) { 669 if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
669 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); 670 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
670 set_bit(TSQ_THROTTLED, &tp->tsq_flags); 671 set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
671 } 672 }
672 /* It is possible TX completion already happened 673 /* It is possible TX completion already happened
673 * before we set TSQ_THROTTLED. 674 * before we set TSQ_THROTTLED.
@@ -772,6 +773,12 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
772 ret = -EAGAIN; 773 ret = -EAGAIN;
773 break; 774 break;
774 } 775 }
776 /* if __tcp_splice_read() got nothing while we have
777 * an skb in receive queue, we do not want to loop.
778 * This might happen with URG data.
779 */
780 if (!skb_queue_empty(&sk->sk_receive_queue))
781 break;
775 sk_wait_data(sk, &timeo, NULL); 782 sk_wait_data(sk, &timeo, NULL);
776 if (signal_pending(current)) { 783 if (signal_pending(current)) {
777 ret = sock_intr_errno(timeo); 784 ret = sock_intr_errno(timeo);
@@ -960,10 +967,8 @@ new_segment:
960 copied += copy; 967 copied += copy;
961 offset += copy; 968 offset += copy;
962 size -= copy; 969 size -= copy;
963 if (!size) { 970 if (!size)
964 tcp_tx_timestamp(sk, sk->sk_tsflags, skb);
965 goto out; 971 goto out;
966 }
967 972
968 if (skb->len < size_goal || (flags & MSG_OOB)) 973 if (skb->len < size_goal || (flags & MSG_OOB))
969 continue; 974 continue;
@@ -989,8 +994,11 @@ wait_for_memory:
989 } 994 }
990 995
991out: 996out:
992 if (copied && !(flags & MSG_SENDPAGE_NOTLAST)) 997 if (copied) {
993 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); 998 tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk));
999 if (!(flags & MSG_SENDPAGE_NOTLAST))
1000 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1001 }
994 return copied; 1002 return copied;
995 1003
996do_error: 1004do_error:
@@ -998,8 +1006,11 @@ do_error:
998 goto out; 1006 goto out;
999out_err: 1007out_err:
1000 /* make sure we wake any epoll edge trigger waiter */ 1008 /* make sure we wake any epoll edge trigger waiter */
1001 if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) 1009 if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
1010 err == -EAGAIN)) {
1002 sk->sk_write_space(sk); 1011 sk->sk_write_space(sk);
1012 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
1013 }
1003 return sk_stream_error(sk, flags, err); 1014 return sk_stream_error(sk, flags, err);
1004} 1015}
1005 1016
@@ -1072,6 +1083,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
1072 int *copied, size_t size) 1083 int *copied, size_t size)
1073{ 1084{
1074 struct tcp_sock *tp = tcp_sk(sk); 1085 struct tcp_sock *tp = tcp_sk(sk);
1086 struct inet_sock *inet = inet_sk(sk);
1075 int err, flags; 1087 int err, flags;
1076 1088
1077 if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) 1089 if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
@@ -1086,11 +1098,26 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
1086 tp->fastopen_req->data = msg; 1098 tp->fastopen_req->data = msg;
1087 tp->fastopen_req->size = size; 1099 tp->fastopen_req->size = size;
1088 1100
1101 if (inet->defer_connect) {
1102 err = tcp_connect(sk);
1103 /* Same failure procedure as in tcp_v4/6_connect */
1104 if (err) {
1105 tcp_set_state(sk, TCP_CLOSE);
1106 inet->inet_dport = 0;
1107 sk->sk_route_caps = 0;
1108 }
1109 }
1089 flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; 1110 flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1090 err = __inet_stream_connect(sk->sk_socket, msg->msg_name, 1111 err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
1091 msg->msg_namelen, flags); 1112 msg->msg_namelen, flags, 1);
1092 *copied = tp->fastopen_req->copied; 1113 /* fastopen_req could already be freed in __inet_stream_connect
1093 tcp_free_fastopen_req(tp); 1114 * if the connection times out or gets rst
1115 */
1116 if (tp->fastopen_req) {
1117 *copied = tp->fastopen_req->copied;
1118 tcp_free_fastopen_req(tp);
1119 inet->defer_connect = 0;
1120 }
1094 return err; 1121 return err;
1095} 1122}
1096 1123
@@ -1108,7 +1135,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
1108 lock_sock(sk); 1135 lock_sock(sk);
1109 1136
1110 flags = msg->msg_flags; 1137 flags = msg->msg_flags;
1111 if (flags & MSG_FASTOPEN) { 1138 if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
1112 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size); 1139 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
1113 if (err == -EINPROGRESS && copied_syn > 0) 1140 if (err == -EINPROGRESS && copied_syn > 0)
1114 goto out; 1141 goto out;
@@ -1266,7 +1293,7 @@ new_segment:
1266 } else { 1293 } else {
1267 skb_fill_page_desc(skb, i, pfrag->page, 1294 skb_fill_page_desc(skb, i, pfrag->page,
1268 pfrag->offset, copy); 1295 pfrag->offset, copy);
1269 get_page(pfrag->page); 1296 page_ref_inc(pfrag->page);
1270 } 1297 }
1271 pfrag->offset += copy; 1298 pfrag->offset += copy;
1272 } 1299 }
@@ -1280,7 +1307,6 @@ new_segment:
1280 1307
1281 copied += copy; 1308 copied += copy;
1282 if (!msg_data_left(msg)) { 1309 if (!msg_data_left(msg)) {
1283 tcp_tx_timestamp(sk, sockc.tsflags, skb);
1284 if (unlikely(flags & MSG_EOR)) 1310 if (unlikely(flags & MSG_EOR))
1285 TCP_SKB_CB(skb)->eor = 1; 1311 TCP_SKB_CB(skb)->eor = 1;
1286 goto out; 1312 goto out;
@@ -1311,8 +1337,10 @@ wait_for_memory:
1311 } 1337 }
1312 1338
1313out: 1339out:
1314 if (copied) 1340 if (copied) {
1341 tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk));
1315 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); 1342 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1343 }
1316out_nopush: 1344out_nopush:
1317 release_sock(sk); 1345 release_sock(sk);
1318 return copied + copied_syn; 1346 return copied + copied_syn;
@@ -1333,8 +1361,11 @@ do_error:
1333out_err: 1361out_err:
1334 err = sk_stream_error(sk, flags, err); 1362 err = sk_stream_error(sk, flags, err);
1335 /* make sure we wake any epoll edge trigger waiter */ 1363 /* make sure we wake any epoll edge trigger waiter */
1336 if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) 1364 if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
1365 err == -EAGAIN)) {
1337 sk->sk_write_space(sk); 1366 sk->sk_write_space(sk);
1367 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
1368 }
1338 release_sock(sk); 1369 release_sock(sk);
1339 return err; 1370 return err;
1340} 1371}
@@ -2291,6 +2322,11 @@ int tcp_disconnect(struct sock *sk, int flags)
2291 tcp_init_send_head(sk); 2322 tcp_init_send_head(sk);
2292 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); 2323 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2293 __sk_dst_reset(sk); 2324 __sk_dst_reset(sk);
2325 tcp_saved_syn_free(tp);
2326
2327 /* Clean up fastopen related fields */
2328 tcp_free_fastopen_req(tp);
2329 inet->defer_connect = 0;
2294 2330
2295 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); 2331 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
2296 2332
@@ -2302,7 +2338,7 @@ EXPORT_SYMBOL(tcp_disconnect);
2302static inline bool tcp_can_repair_sock(const struct sock *sk) 2338static inline bool tcp_can_repair_sock(const struct sock *sk)
2303{ 2339{
2304 return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && 2340 return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
2305 ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED)); 2341 (sk->sk_state != TCP_LISTEN);
2306} 2342}
2307 2343
2308static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len) 2344static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len)
@@ -2469,11 +2505,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2469 case TCP_THIN_DUPACK: 2505 case TCP_THIN_DUPACK:
2470 if (val < 0 || val > 1) 2506 if (val < 0 || val > 1)
2471 err = -EINVAL; 2507 err = -EINVAL;
2472 else {
2473 tp->thin_dupack = val;
2474 if (tp->thin_dupack)
2475 tcp_disable_early_retrans(tp);
2476 }
2477 break; 2508 break;
2478 2509
2479 case TCP_REPAIR: 2510 case TCP_REPAIR:
@@ -2658,6 +2689,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2658 err = -EINVAL; 2689 err = -EINVAL;
2659 } 2690 }
2660 break; 2691 break;
2692 case TCP_FASTOPEN_CONNECT:
2693 if (val > 1 || val < 0) {
2694 err = -EINVAL;
2695 } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
2696 if (sk->sk_state == TCP_CLOSE)
2697 tp->fastopen_connect = val;
2698 else
2699 err = -EINVAL;
2700 } else {
2701 err = -EOPNOTSUPP;
2702 }
2703 break;
2661 case TCP_TIMESTAMP: 2704 case TCP_TIMESTAMP:
2662 if (!tp->repair) 2705 if (!tp->repair)
2663 err = -EPERM; 2706 err = -EPERM;
@@ -2704,15 +2747,33 @@ int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
2704EXPORT_SYMBOL(compat_tcp_setsockopt); 2747EXPORT_SYMBOL(compat_tcp_setsockopt);
2705#endif 2748#endif
2706 2749
2750static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
2751 struct tcp_info *info)
2752{
2753 u64 stats[__TCP_CHRONO_MAX], total = 0;
2754 enum tcp_chrono i;
2755
2756 for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
2757 stats[i] = tp->chrono_stat[i - 1];
2758 if (i == tp->chrono_type)
2759 stats[i] += tcp_time_stamp - tp->chrono_start;
2760 stats[i] *= USEC_PER_SEC / HZ;
2761 total += stats[i];
2762 }
2763
2764 info->tcpi_busy_time = total;
2765 info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
2766 info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
2767}
2768
2707/* Return information about state of tcp endpoint in API format. */ 2769/* Return information about state of tcp endpoint in API format. */
2708void tcp_get_info(struct sock *sk, struct tcp_info *info) 2770void tcp_get_info(struct sock *sk, struct tcp_info *info)
2709{ 2771{
2710 const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ 2772 const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
2711 const struct inet_connection_sock *icsk = inet_csk(sk); 2773 const struct inet_connection_sock *icsk = inet_csk(sk);
2712 u32 now = tcp_time_stamp, intv; 2774 u32 now, intv;
2713 unsigned int start;
2714 int notsent_bytes;
2715 u64 rate64; 2775 u64 rate64;
2776 bool slow;
2716 u32 rate; 2777 u32 rate;
2717 2778
2718 memset(info, 0, sizeof(*info)); 2779 memset(info, 0, sizeof(*info));
@@ -2721,6 +2782,30 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2721 2782
2722 info->tcpi_state = sk_state_load(sk); 2783 info->tcpi_state = sk_state_load(sk);
2723 2784
2785 /* Report meaningful fields for all TCP states, including listeners */
2786 rate = READ_ONCE(sk->sk_pacing_rate);
2787 rate64 = rate != ~0U ? rate : ~0ULL;
2788 info->tcpi_pacing_rate = rate64;
2789
2790 rate = READ_ONCE(sk->sk_max_pacing_rate);
2791 rate64 = rate != ~0U ? rate : ~0ULL;
2792 info->tcpi_max_pacing_rate = rate64;
2793
2794 info->tcpi_reordering = tp->reordering;
2795 info->tcpi_snd_cwnd = tp->snd_cwnd;
2796
2797 if (info->tcpi_state == TCP_LISTEN) {
2798 /* listeners aliased fields :
2799 * tcpi_unacked -> Number of children ready for accept()
2800 * tcpi_sacked -> max backlog
2801 */
2802 info->tcpi_unacked = sk->sk_ack_backlog;
2803 info->tcpi_sacked = sk->sk_max_ack_backlog;
2804 return;
2805 }
2806
2807 slow = lock_sock_fast(sk);
2808
2724 info->tcpi_ca_state = icsk->icsk_ca_state; 2809 info->tcpi_ca_state = icsk->icsk_ca_state;
2725 info->tcpi_retransmits = icsk->icsk_retransmits; 2810 info->tcpi_retransmits = icsk->icsk_retransmits;
2726 info->tcpi_probes = icsk->icsk_probes_out; 2811 info->tcpi_probes = icsk->icsk_probes_out;
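Since listeners now return early with only the aliased fields filled in, a TCP_INFO reader sees the accept-queue occupancy and its limit directly. A hedged userspace sketch, assuming "lfd" is a listening TCP socket and struct tcp_info comes from <linux/tcp.h>:

#include <linux/tcp.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

/* On a listening socket, tcpi_unacked is the number of children ready
 * for accept() and tcpi_sacked is the maximum backlog (see the aliasing
 * comment above).
 */
static void print_listener_backlog(int lfd)
{
        struct tcp_info ti;
        socklen_t len = sizeof(ti);

        if (getsockopt(lfd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
                printf("ready=%u max_backlog=%u\n",
                       ti.tcpi_unacked, ti.tcpi_sacked);
}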
@@ -2748,17 +2833,14 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2748 info->tcpi_snd_mss = tp->mss_cache; 2833 info->tcpi_snd_mss = tp->mss_cache;
2749 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; 2834 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2750 2835
2751 if (info->tcpi_state == TCP_LISTEN) { 2836 info->tcpi_unacked = tp->packets_out;
2752 info->tcpi_unacked = sk->sk_ack_backlog; 2837 info->tcpi_sacked = tp->sacked_out;
2753 info->tcpi_sacked = sk->sk_max_ack_backlog; 2838
2754 } else {
2755 info->tcpi_unacked = tp->packets_out;
2756 info->tcpi_sacked = tp->sacked_out;
2757 }
2758 info->tcpi_lost = tp->lost_out; 2839 info->tcpi_lost = tp->lost_out;
2759 info->tcpi_retrans = tp->retrans_out; 2840 info->tcpi_retrans = tp->retrans_out;
2760 info->tcpi_fackets = tp->fackets_out; 2841 info->tcpi_fackets = tp->fackets_out;
2761 2842
2843 now = tcp_time_stamp;
2762 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); 2844 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2763 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); 2845 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2764 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); 2846 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
@@ -2768,34 +2850,21 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2768 info->tcpi_rtt = tp->srtt_us >> 3; 2850 info->tcpi_rtt = tp->srtt_us >> 3;
2769 info->tcpi_rttvar = tp->mdev_us >> 2; 2851 info->tcpi_rttvar = tp->mdev_us >> 2;
2770 info->tcpi_snd_ssthresh = tp->snd_ssthresh; 2852 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2771 info->tcpi_snd_cwnd = tp->snd_cwnd;
2772 info->tcpi_advmss = tp->advmss; 2853 info->tcpi_advmss = tp->advmss;
2773 info->tcpi_reordering = tp->reordering;
2774 2854
2775 info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3; 2855 info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2776 info->tcpi_rcv_space = tp->rcvq_space.space; 2856 info->tcpi_rcv_space = tp->rcvq_space.space;
2777 2857
2778 info->tcpi_total_retrans = tp->total_retrans; 2858 info->tcpi_total_retrans = tp->total_retrans;
2779 2859
2780 rate = READ_ONCE(sk->sk_pacing_rate); 2860 info->tcpi_bytes_acked = tp->bytes_acked;
2781 rate64 = rate != ~0U ? rate : ~0ULL; 2861 info->tcpi_bytes_received = tp->bytes_received;
2782 put_unaligned(rate64, &info->tcpi_pacing_rate); 2862 info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
2863 tcp_get_info_chrono_stats(tp, info);
2783 2864
2784 rate = READ_ONCE(sk->sk_max_pacing_rate);
2785 rate64 = rate != ~0U ? rate : ~0ULL;
2786 put_unaligned(rate64, &info->tcpi_max_pacing_rate);
2787
2788 do {
2789 start = u64_stats_fetch_begin_irq(&tp->syncp);
2790 put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked);
2791 put_unaligned(tp->bytes_received, &info->tcpi_bytes_received);
2792 } while (u64_stats_fetch_retry_irq(&tp->syncp, start));
2793 info->tcpi_segs_out = tp->segs_out; 2865 info->tcpi_segs_out = tp->segs_out;
2794 info->tcpi_segs_in = tp->segs_in; 2866 info->tcpi_segs_in = tp->segs_in;
2795 2867
2796 notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
2797 info->tcpi_notsent_bytes = max(0, notsent_bytes);
2798
2799 info->tcpi_min_rtt = tcp_min_rtt(tp); 2868 info->tcpi_min_rtt = tcp_min_rtt(tp);
2800 info->tcpi_data_segs_in = tp->data_segs_in; 2869 info->tcpi_data_segs_in = tp->data_segs_in;
2801 info->tcpi_data_segs_out = tp->data_segs_out; 2870 info->tcpi_data_segs_out = tp->data_segs_out;
@@ -2806,11 +2875,36 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2806 if (rate && intv) { 2875 if (rate && intv) {
2807 rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC; 2876 rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
2808 do_div(rate64, intv); 2877 do_div(rate64, intv);
2809 put_unaligned(rate64, &info->tcpi_delivery_rate); 2878 info->tcpi_delivery_rate = rate64;
2810 } 2879 }
2880 unlock_sock_fast(sk, slow);
2811} 2881}
2812EXPORT_SYMBOL_GPL(tcp_get_info); 2882EXPORT_SYMBOL_GPL(tcp_get_info);
2813 2883
2884struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
2885{
2886 const struct tcp_sock *tp = tcp_sk(sk);
2887 struct sk_buff *stats;
2888 struct tcp_info info;
2889
2890 stats = alloc_skb(5 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
2891 if (!stats)
2892 return NULL;
2893
2894 tcp_get_info_chrono_stats(tp, &info);
2895 nla_put_u64_64bit(stats, TCP_NLA_BUSY,
2896 info.tcpi_busy_time, TCP_NLA_PAD);
2897 nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
2898 info.tcpi_rwnd_limited, TCP_NLA_PAD);
2899 nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
2900 info.tcpi_sndbuf_limited, TCP_NLA_PAD);
2901 nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
2902 tp->data_segs_out, TCP_NLA_PAD);
2903 nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
2904 tp->total_retrans, TCP_NLA_PAD);
2905 return stats;
2906}
2907
2814static int do_tcp_getsockopt(struct sock *sk, int level, 2908static int do_tcp_getsockopt(struct sock *sk, int level,
2815 int optname, char __user *optval, int __user *optlen) 2909 int optname, char __user *optval, int __user *optlen)
2816{ 2910{
@@ -2917,8 +3011,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2917 case TCP_THIN_LINEAR_TIMEOUTS: 3011 case TCP_THIN_LINEAR_TIMEOUTS:
2918 val = tp->thin_lto; 3012 val = tp->thin_lto;
2919 break; 3013 break;
3014
2920 case TCP_THIN_DUPACK: 3015 case TCP_THIN_DUPACK:
2921 val = tp->thin_dupack; 3016 val = 0;
2922 break; 3017 break;
2923 3018
2924 case TCP_REPAIR: 3019 case TCP_REPAIR:
@@ -2971,6 +3066,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2971 val = icsk->icsk_accept_queue.fastopenq.max_qlen; 3066 val = icsk->icsk_accept_queue.fastopenq.max_qlen;
2972 break; 3067 break;
2973 3068
3069 case TCP_FASTOPEN_CONNECT:
3070 val = tp->fastopen_connect;
3071 break;
3072
2974 case TCP_TIMESTAMP: 3073 case TCP_TIMESTAMP:
2975 val = tcp_time_stamp + tp->tsoffset; 3074 val = tcp_time_stamp + tp->tsoffset;
2976 break; 3075 break;
@@ -3284,6 +3383,7 @@ void __init tcp_init(void)
3284 3383
3285 percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); 3384 percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
3286 percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); 3385 percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
3386 inet_hashinfo_init(&tcp_hashinfo);
3287 tcp_hashinfo.bind_bucket_cachep = 3387 tcp_hashinfo.bind_bucket_cachep =
3288 kmem_cache_create("tcp_bind_bucket", 3388 kmem_cache_create("tcp_bind_bucket",
3289 sizeof(struct inet_bind_bucket), 0, 3389 sizeof(struct inet_bind_bucket), 0,
@@ -3327,10 +3427,7 @@ void __init tcp_init(void)
3327 3427
3328 3428
3329 cnt = tcp_hashinfo.ehash_mask + 1; 3429 cnt = tcp_hashinfo.ehash_mask + 1;
3330
3331 tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3332 sysctl_tcp_max_orphans = cnt / 2; 3430 sysctl_tcp_max_orphans = cnt / 2;
3333 sysctl_max_syn_backlog = max(128, cnt / 256);
3334 3431
3335 tcp_init_mem(); 3432 tcp_init_mem();
3336 /* Set per-socket limits to no more than 1/128 the pressure threshold */ 3433 /* Set per-socket limits to no more than 1/128 the pressure threshold */
@@ -3349,6 +3446,7 @@ void __init tcp_init(void)
3349 pr_info("Hash tables configured (established %u bind %u)\n", 3446 pr_info("Hash tables configured (established %u bind %u)\n",
3350 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); 3447 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
3351 3448
3449 tcp_v4_init();
3352 tcp_metrics_init(); 3450 tcp_metrics_init();
3353 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); 3451 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
3354 tcp_tasklet_init(); 3452 tcp_tasklet_init();
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 0ea66c2c9344..b89bce4c721e 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -14,6 +14,36 @@
14 * observed, or adjust the sending rate if it estimates there is a 14 * observed, or adjust the sending rate if it estimates there is a
15 * traffic policer, in order to keep the drop rate reasonable. 15 * traffic policer, in order to keep the drop rate reasonable.
16 * 16 *
17 * Here is a state transition diagram for BBR:
18 *
19 * |
20 * V
21 * +---> STARTUP ----+
22 * | | |
23 * | V |
24 * | DRAIN ----+
25 * | | |
26 * | V |
27 * +---> PROBE_BW ----+
28 * | ^ | |
29 * | | | |
30 * | +----+ |
31 * | |
32 * +---- PROBE_RTT <--+
33 *
34 * A BBR flow starts in STARTUP, and ramps up its sending rate quickly.
35 * When it estimates the pipe is full, it enters DRAIN to drain the queue.
36 * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT.
37 * A long-lived BBR flow spends the vast majority of its time remaining
38 * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth
39 * in a fair manner, with a small, bounded queue. *If* a flow has been
40 * continuously sending for the entire min_rtt window, and hasn't seen an RTT
41 * sample that matches or decreases its min_rtt estimate for 10 seconds, then
42 * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe
43 * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if
44 * we estimated that we reached the full bw of the pipe then we enter PROBE_BW;
45 * otherwise we enter STARTUP to try to fill the pipe.
46 *
17 * BBR is described in detail in: 47 * BBR is described in detail in:
18 * "BBR: Congestion-Based Congestion Control", 48 * "BBR: Congestion-Based Congestion Control",
19 * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, 49 * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh,
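As a reading aid for the diagram above, a hedged sketch of the mode transitions it describes; the enum and condition flags are placeholders standing in for the module's real estimators, not its actual internals:

/* Illustrative only: mirrors the state diagram in the comment above. */
enum bbr_mode_sketch { SK_STARTUP, SK_DRAIN, SK_PROBE_BW, SK_PROBE_RTT };

static enum bbr_mode_sketch next_mode(enum bbr_mode_sketch mode,
                                      int pipe_full, int queue_drained,
                                      int min_rtt_expired, int probe_rtt_done)
{
        if (min_rtt_expired)
                return SK_PROBE_RTT;            /* re-probe min_rtt */

        switch (mode) {
        case SK_STARTUP:
                return pipe_full ? SK_DRAIN : SK_STARTUP;
        case SK_DRAIN:
                return queue_drained ? SK_PROBE_BW : SK_DRAIN;
        case SK_PROBE_RTT:
                if (!probe_rtt_done)
                        return SK_PROBE_RTT;
                /* leave PROBE_RTT: PROBE_BW if the pipe looked full,
                 * otherwise go back to filling it in STARTUP */
                return pipe_full ? SK_PROBE_BW : SK_STARTUP;
        default:
                return SK_PROBE_BW;             /* steady state */
        }
}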
@@ -51,7 +81,7 @@ enum bbr_mode {
51 BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ 81 BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */
52 BBR_DRAIN, /* drain any queue created during startup */ 82 BBR_DRAIN, /* drain any queue created during startup */
53 BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ 83 BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */
54 BBR_PROBE_RTT, /* cut cwnd to min to probe min_rtt */ 84 BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */
55}; 85};
56 86
57/* BBR congestion control block */ 87/* BBR congestion control block */
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 35b280361cb2..50a0f3e51d5b 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -27,6 +27,8 @@
27#include <linux/kernel.h> 27#include <linux/kernel.h>
28#include <linux/random.h> 28#include <linux/random.h>
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/sched/clock.h>
31
30#include <net/tcp.h> 32#include <net/tcp.h>
31 33
32#define HYSTART_ACK_TRAIN 1 34#define HYSTART_ACK_TRAIN 1
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index f9038d6b109e..79c4817abc94 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -68,8 +68,9 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
68{ 68{
69 int ret = 0; 69 int ret = 0;
70 70
71 /* all algorithms must implement ssthresh and cong_avoid ops */ 71 /* all algorithms must implement these */
72 if (!ca->ssthresh || !(ca->cong_avoid || ca->cong_control)) { 72 if (!ca->ssthresh || !ca->undo_cwnd ||
73 !(ca->cong_avoid || ca->cong_control)) {
73 pr_err("%s does not implement required ops\n", ca->name); 74 pr_err("%s does not implement required ops\n", ca->name);
74 return -EINVAL; 75 return -EINVAL;
75 } 76 }
@@ -443,10 +444,19 @@ u32 tcp_reno_ssthresh(struct sock *sk)
443} 444}
444EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); 445EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
445 446
447u32 tcp_reno_undo_cwnd(struct sock *sk)
448{
449 const struct tcp_sock *tp = tcp_sk(sk);
450
451 return max(tp->snd_cwnd, tp->snd_ssthresh << 1);
452}
453EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd);
454
446struct tcp_congestion_ops tcp_reno = { 455struct tcp_congestion_ops tcp_reno = {
447 .flags = TCP_CONG_NON_RESTRICTED, 456 .flags = TCP_CONG_NON_RESTRICTED,
448 .name = "reno", 457 .name = "reno",
449 .owner = THIS_MODULE, 458 .owner = THIS_MODULE,
450 .ssthresh = tcp_reno_ssthresh, 459 .ssthresh = tcp_reno_ssthresh,
451 .cong_avoid = tcp_reno_cong_avoid, 460 .cong_avoid = tcp_reno_cong_avoid,
461 .undo_cwnd = tcp_reno_undo_cwnd,
452}; 462};
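With undo_cwnd now mandatory alongside ssthresh and one of cong_avoid/cong_control, the smallest registrable module looks roughly like the sketch below; "example" is a made-up name and the module simply reuses the exported Reno helpers:

#include <linux/module.h>
#include <net/tcp.h>

static struct tcp_congestion_ops example_cc __read_mostly = {
        .name           = "example",
        .owner          = THIS_MODULE,
        .ssthresh       = tcp_reno_ssthresh,
        .cong_avoid     = tcp_reno_cong_avoid,
        .undo_cwnd      = tcp_reno_undo_cwnd,  /* now required for registration */
};

static int __init example_cc_init(void)
{
        return tcp_register_congestion_control(&example_cc);
}

static void __exit example_cc_exit(void)
{
        tcp_unregister_congestion_control(&example_cc);
}

module_init(example_cc_init);
module_exit(example_cc_exit);
MODULE_LICENSE("GPL");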
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index ab37c6775630..5f5e5936760e 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -335,6 +335,7 @@ static struct tcp_congestion_ops dctcp __read_mostly = {
335static struct tcp_congestion_ops dctcp_reno __read_mostly = { 335static struct tcp_congestion_ops dctcp_reno __read_mostly = {
336 .ssthresh = tcp_reno_ssthresh, 336 .ssthresh = tcp_reno_ssthresh,
337 .cong_avoid = tcp_reno_cong_avoid, 337 .cong_avoid = tcp_reno_cong_avoid,
338 .undo_cwnd = tcp_reno_undo_cwnd,
338 .get_info = dctcp_get_info, 339 .get_info = dctcp_get_info,
339 .owner = THIS_MODULE, 340 .owner = THIS_MODULE,
340 .name = "dctcp-reno", 341 .name = "dctcp-reno",
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 4e777a3243f9..8ea4e9787f82 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -113,7 +113,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req,
113 struct tcp_fastopen_cookie tmp; 113 struct tcp_fastopen_cookie tmp;
114 114
115 if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { 115 if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) {
116 struct in6_addr *buf = (struct in6_addr *) tmp.val; 116 struct in6_addr *buf = &tmp.addr;
117 int i; 117 int i;
118 118
119 for (i = 0; i < 4; i++) 119 for (i = 0; i < 4; i++)
@@ -205,6 +205,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
205 * scaled. So correct it appropriately. 205 * scaled. So correct it appropriately.
206 */ 206 */
207 tp->snd_wnd = ntohs(tcp_hdr(skb)->window); 207 tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
208 tp->max_window = tp->snd_wnd;
208 209
209 /* Activate the retrans timer so that SYNACK can be retransmitted. 210 /* Activate the retrans timer so that SYNACK can be retransmitted.
210 * The request socket is not added to the ehash 211 * The request socket is not added to the ehash
@@ -325,3 +326,57 @@ fastopen:
325 *foc = valid_foc; 326 *foc = valid_foc;
326 return NULL; 327 return NULL;
327} 328}
329
330bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
331 struct tcp_fastopen_cookie *cookie)
332{
333 unsigned long last_syn_loss = 0;
334 int syn_loss = 0;
335
336 tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss);
337
338 /* Recurring FO SYN losses: no cookie or data in SYN */
339 if (syn_loss > 1 &&
340 time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
341 cookie->len = -1;
342 return false;
343 }
344 if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
345 cookie->len = -1;
346 return true;
347 }
348 return cookie->len > 0;
349}
350
351/* This function checks if we want to defer sending SYN until the first
352 * write(). We defer under the following conditions:
353 * 1. fastopen_connect sockopt is set
354 * 2. we have a valid cookie
355 * Return value: return true if we want to defer until application writes data
356 * return false if we want to send out SYN immediately
357 */
358bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
359{
360 struct tcp_fastopen_cookie cookie = { .len = 0 };
361 struct tcp_sock *tp = tcp_sk(sk);
362 u16 mss;
363
364 if (tp->fastopen_connect && !tp->fastopen_req) {
365 if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) {
366 inet_sk(sk)->defer_connect = 1;
367 return true;
368 }
369
370 /* Alloc fastopen_req in order for FO option to be included
371 * in SYN
372 */
373 tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req),
374 sk->sk_allocation);
375 if (tp->fastopen_req)
376 tp->fastopen_req->cookie = cookie;
377 else
378 *err = -ENOBUFS;
379 }
380 return false;
381}
382EXPORT_SYMBOL(tcp_fastopen_defer_connect);
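Taken together with the tcp.c hunks above, the new TCP_FASTOPEN_CONNECT option lets a client use Fast Open through the ordinary connect()/write() sequence. A hedged userspace sketch, assuming headers new enough to define the option and the tcp_fastopen sysctl's client bit enabled; destination handling and names are illustrative:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef TCP_FASTOPEN_CONNECT
#define TCP_FASTOPEN_CONNECT 30        /* value from include/uapi/linux/tcp.h */
#endif

static int tfo_client_send(const struct sockaddr_in *dst,
                           const char *payload, size_t len)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int one = 1;

        if (fd < 0)
                return -1;
        /* Ask the kernel to defer the SYN until the first write(). */
        if (setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN_CONNECT,
                       &one, sizeof(one)) < 0)
                goto err;
        /* When a valid Fast Open cookie is cached for this destination,
         * connect() returns immediately and no SYN is sent yet. */
        if (connect(fd, (const struct sockaddr *)dst, sizeof(*dst)) < 0)
                goto err;
        /* The first write() then carries the data in the SYN; on first
         * contact (no cookie) a normal SYN with a cookie request goes out. */
        if (write(fd, payload, len) < 0)
                goto err;
        return fd;
err:
        close(fd);
        return -1;
}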
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index db7842495a64..6d9879e93648 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -94,6 +94,7 @@ static const struct hstcp_aimd_val {
94 94
95struct hstcp { 95struct hstcp {
96 u32 ai; 96 u32 ai;
97 u32 loss_cwnd;
97}; 98};
98 99
99static void hstcp_init(struct sock *sk) 100static void hstcp_init(struct sock *sk)
@@ -150,16 +151,24 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
150static u32 hstcp_ssthresh(struct sock *sk) 151static u32 hstcp_ssthresh(struct sock *sk)
151{ 152{
152 const struct tcp_sock *tp = tcp_sk(sk); 153 const struct tcp_sock *tp = tcp_sk(sk);
153 const struct hstcp *ca = inet_csk_ca(sk); 154 struct hstcp *ca = inet_csk_ca(sk);
154 155
156 ca->loss_cwnd = tp->snd_cwnd;
155 /* Do multiplicative decrease */ 157 /* Do multiplicative decrease */
156 return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); 158 return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
157} 159}
158 160
161static u32 hstcp_cwnd_undo(struct sock *sk)
162{
163 const struct hstcp *ca = inet_csk_ca(sk);
164
165 return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
166}
159 167
160static struct tcp_congestion_ops tcp_highspeed __read_mostly = { 168static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
161 .init = hstcp_init, 169 .init = hstcp_init,
162 .ssthresh = hstcp_ssthresh, 170 .ssthresh = hstcp_ssthresh,
171 .undo_cwnd = hstcp_cwnd_undo,
163 .cong_avoid = hstcp_cong_avoid, 172 .cong_avoid = hstcp_cong_avoid,
164 173
165 .owner = THIS_MODULE, 174 .owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 083831e359df..0f7175c3338e 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -166,6 +166,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
166static struct tcp_congestion_ops tcp_hybla __read_mostly = { 166static struct tcp_congestion_ops tcp_hybla __read_mostly = {
167 .init = hybla_init, 167 .init = hybla_init,
168 .ssthresh = tcp_reno_ssthresh, 168 .ssthresh = tcp_reno_ssthresh,
169 .undo_cwnd = tcp_reno_undo_cwnd,
169 .cong_avoid = hybla_cong_avoid, 170 .cong_avoid = hybla_cong_avoid,
170 .set_state = hybla_state, 171 .set_state = hybla_state,
171 172
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index c8e6d86be114..60352ff4f5a8 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -48,6 +48,7 @@ struct illinois {
48 u32 end_seq; /* right edge of current RTT */ 48 u32 end_seq; /* right edge of current RTT */
49 u32 alpha; /* Additive increase */ 49 u32 alpha; /* Additive increase */
50 u32 beta; /* Multiplicative decrease */ 50 u32 beta; /* Multiplicative decrease */
51 u32 loss_cwnd; /* cwnd on loss */
51 u16 acked; /* # packets acked by current ACK */ 52 u16 acked; /* # packets acked by current ACK */
52 u8 rtt_above; /* average rtt has gone above threshold */ 53 u8 rtt_above; /* average rtt has gone above threshold */
53 u8 rtt_low; /* # of rtts measurements below threshold */ 54 u8 rtt_low; /* # of rtts measurements below threshold */
@@ -296,10 +297,18 @@ static u32 tcp_illinois_ssthresh(struct sock *sk)
296 struct tcp_sock *tp = tcp_sk(sk); 297 struct tcp_sock *tp = tcp_sk(sk);
297 struct illinois *ca = inet_csk_ca(sk); 298 struct illinois *ca = inet_csk_ca(sk);
298 299
300 ca->loss_cwnd = tp->snd_cwnd;
299 /* Multiplicative decrease */ 301 /* Multiplicative decrease */
300 return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U); 302 return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U);
301} 303}
302 304
305static u32 tcp_illinois_cwnd_undo(struct sock *sk)
306{
307 const struct illinois *ca = inet_csk_ca(sk);
308
309 return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
310}
311
303/* Extract info for Tcp socket info provided via netlink. */ 312/* Extract info for Tcp socket info provided via netlink. */
304static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr, 313static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
305 union tcp_cc_info *info) 314 union tcp_cc_info *info)
@@ -327,6 +336,7 @@ static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
327static struct tcp_congestion_ops tcp_illinois __read_mostly = { 336static struct tcp_congestion_ops tcp_illinois __read_mostly = {
328 .init = tcp_illinois_init, 337 .init = tcp_illinois_init,
329 .ssthresh = tcp_illinois_ssthresh, 338 .ssthresh = tcp_illinois_ssthresh,
339 .undo_cwnd = tcp_illinois_cwnd_undo,
330 .cong_avoid = tcp_illinois_cong_avoid, 340 .cong_avoid = tcp_illinois_cong_avoid,
331 .set_state = tcp_illinois_state, 341 .set_state = tcp_illinois_state,
332 .get_info = tcp_illinois_info, 342 .get_info = tcp_illinois_info,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c71d49ce0c93..659d1baefb2b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -79,12 +79,13 @@
79int sysctl_tcp_timestamps __read_mostly = 1; 79int sysctl_tcp_timestamps __read_mostly = 1;
80int sysctl_tcp_window_scaling __read_mostly = 1; 80int sysctl_tcp_window_scaling __read_mostly = 1;
81int sysctl_tcp_sack __read_mostly = 1; 81int sysctl_tcp_sack __read_mostly = 1;
82int sysctl_tcp_fack __read_mostly = 1; 82int sysctl_tcp_fack __read_mostly;
83int sysctl_tcp_max_reordering __read_mostly = 300; 83int sysctl_tcp_max_reordering __read_mostly = 300;
84int sysctl_tcp_dsack __read_mostly = 1; 84int sysctl_tcp_dsack __read_mostly = 1;
85int sysctl_tcp_app_win __read_mostly = 31; 85int sysctl_tcp_app_win __read_mostly = 31;
86int sysctl_tcp_adv_win_scale __read_mostly = 1; 86int sysctl_tcp_adv_win_scale __read_mostly = 1;
87EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); 87EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
88EXPORT_SYMBOL(sysctl_tcp_timestamps);
88 89
89/* rfc5961 challenge ack rate limiting */ 90/* rfc5961 challenge ack rate limiting */
90int sysctl_tcp_challenge_ack_limit = 1000; 91int sysctl_tcp_challenge_ack_limit = 1000;
@@ -94,9 +95,6 @@ int sysctl_tcp_rfc1337 __read_mostly;
94int sysctl_tcp_max_orphans __read_mostly = NR_FILE; 95int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
95int sysctl_tcp_frto __read_mostly = 2; 96int sysctl_tcp_frto __read_mostly = 2;
96int sysctl_tcp_min_rtt_wlen __read_mostly = 300; 97int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
97
98int sysctl_tcp_thin_dupack __read_mostly;
99
100int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; 98int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
101int sysctl_tcp_early_retrans __read_mostly = 3; 99int sysctl_tcp_early_retrans __read_mostly = 3;
102int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; 100int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
@@ -128,7 +126,8 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
128#define REXMIT_LOST 1 /* retransmit packets marked lost */ 126#define REXMIT_LOST 1 /* retransmit packets marked lost */
129#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */ 127#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
130 128
131static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb) 129static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
130 unsigned int len)
132{ 131{
133 static bool __once __read_mostly; 132 static bool __once __read_mostly;
134 133
@@ -139,8 +138,9 @@ static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb)
139 138
140 rcu_read_lock(); 139 rcu_read_lock();
141 dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif); 140 dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
142 pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n", 141 if (!dev || len >= dev->mtu)
143 dev ? dev->name : "Unknown driver"); 142 pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
143 dev ? dev->name : "Unknown driver");
144 rcu_read_unlock(); 144 rcu_read_unlock();
145 } 145 }
146} 146}
@@ -163,8 +163,10 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
163 if (len >= icsk->icsk_ack.rcv_mss) { 163 if (len >= icsk->icsk_ack.rcv_mss) {
164 icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, 164 icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
165 tcp_sk(sk)->advmss); 165 tcp_sk(sk)->advmss);
166 if (unlikely(icsk->icsk_ack.rcv_mss != len)) 166 /* Account for possibly-removed options */
167 tcp_gro_dev_warn(sk, skb); 167 if (unlikely(len > icsk->icsk_ack.rcv_mss +
168 MAX_TCP_OPTION_SPACE))
169 tcp_gro_dev_warn(sk, skb, len);
168 } else { 170 } else {
169 /* Otherwise, we make more careful check taking into account, 171 /* Otherwise, we make more careful check taking into account,
170 * that SACKs block is variable. 172 * that SACKs block is variable.
@@ -876,22 +878,11 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
876 const int ts) 878 const int ts)
877{ 879{
878 struct tcp_sock *tp = tcp_sk(sk); 880 struct tcp_sock *tp = tcp_sk(sk);
879 if (metric > tp->reordering) { 881 int mib_idx;
880 int mib_idx;
881 882
883 if (metric > tp->reordering) {
882 tp->reordering = min(sysctl_tcp_max_reordering, metric); 884 tp->reordering = min(sysctl_tcp_max_reordering, metric);
883 885
884 /* This exciting event is worth to be remembered. 8) */
885 if (ts)
886 mib_idx = LINUX_MIB_TCPTSREORDER;
887 else if (tcp_is_reno(tp))
888 mib_idx = LINUX_MIB_TCPRENOREORDER;
889 else if (tcp_is_fack(tp))
890 mib_idx = LINUX_MIB_TCPFACKREORDER;
891 else
892 mib_idx = LINUX_MIB_TCPSACKREORDER;
893
894 NET_INC_STATS(sock_net(sk), mib_idx);
895#if FASTRETRANS_DEBUG > 1 886#if FASTRETRANS_DEBUG > 1
896 pr_debug("Disorder%d %d %u f%u s%u rr%d\n", 887 pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
897 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, 888 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
@@ -903,9 +894,19 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
903 tcp_disable_fack(tp); 894 tcp_disable_fack(tp);
904 } 895 }
905 896
906 if (metric > 0)
907 tcp_disable_early_retrans(tp);
908 tp->rack.reord = 1; 897 tp->rack.reord = 1;
898
899 /* This exciting event is worth to be remembered. 8) */
900 if (ts)
901 mib_idx = LINUX_MIB_TCPTSREORDER;
902 else if (tcp_is_reno(tp))
903 mib_idx = LINUX_MIB_TCPRENOREORDER;
904 else if (tcp_is_fack(tp))
905 mib_idx = LINUX_MIB_TCPFACKREORDER;
906 else
907 mib_idx = LINUX_MIB_TCPSACKREORDER;
908
909 NET_INC_STATS(sock_net(sk), mib_idx);
909} 910}
910 911
911/* This must be called before lost_out is incremented */ 912/* This must be called before lost_out is incremented */
@@ -915,10 +916,6 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
915 before(TCP_SKB_CB(skb)->seq, 916 before(TCP_SKB_CB(skb)->seq,
916 TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) 917 TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
917 tp->retransmit_skb_hint = skb; 918 tp->retransmit_skb_hint = skb;
918
919 if (!tp->lost_out ||
920 after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
921 tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
922} 919}
923 920
924/* Sum the number of packets on the wire we have marked as lost. 921/* Sum the number of packets on the wire we have marked as lost.
@@ -1134,6 +1131,7 @@ struct tcp_sacktag_state {
1134 */ 1131 */
1135 struct skb_mstamp first_sackt; 1132 struct skb_mstamp first_sackt;
1136 struct skb_mstamp last_sackt; 1133 struct skb_mstamp last_sackt;
1134 struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */
1137 struct rate_sample *rate; 1135 struct rate_sample *rate;
1138 int flag; 1136 int flag;
1139}; 1137};
@@ -1216,7 +1214,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
1216 return sacked; 1214 return sacked;
1217 1215
1218 if (!(sacked & TCPCB_SACKED_ACKED)) { 1216 if (!(sacked & TCPCB_SACKED_ACKED)) {
1219 tcp_rack_advance(tp, xmit_time, sacked); 1217 tcp_rack_advance(tp, sacked, end_seq,
1218 xmit_time, &state->ack_time);
1220 1219
1221 if (sacked & TCPCB_SACKED_RETRANS) { 1220 if (sacked & TCPCB_SACKED_RETRANS) {
1222 /* If the segment is not tagged as lost, 1221 /* If the segment is not tagged as lost,
@@ -1981,7 +1980,6 @@ void tcp_enter_loss(struct sock *sk)
1981 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; 1980 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
1982 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 1981 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1983 tp->lost_out += tcp_skb_pcount(skb); 1982 tp->lost_out += tcp_skb_pcount(skb);
1984 tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
1985 } 1983 }
1986 } 1984 }
1987 tcp_verify_left_out(tp); 1985 tcp_verify_left_out(tp);
@@ -2000,6 +1998,11 @@ void tcp_enter_loss(struct sock *sk)
2000 /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous 1998 /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
2001 * loss recovery is underway except recurring timeout(s) on 1999 * loss recovery is underway except recurring timeout(s) on
2002 * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing 2000 * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
2001 *
2002 * In theory F-RTO can be used repeatedly during loss recovery.
2003 * In practice this interacts badly with broken middle-boxes that
2004 * falsely raise the receive window, which results in repeated
2005 * timeouts and stop-and-go behavior.
2003 */ 2006 */
2004 tp->frto = sysctl_tcp_frto && 2007 tp->frto = sysctl_tcp_frto &&
2005 (new_recovery || icsk->icsk_retransmits) && 2008 (new_recovery || icsk->icsk_retransmits) &&
@@ -2055,30 +2058,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
2055 return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; 2058 return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
2056} 2059}
2057 2060
2058static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
2059{
2060 struct tcp_sock *tp = tcp_sk(sk);
2061 unsigned long delay;
2062
2063 /* Delay early retransmit and entering fast recovery for
2064 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
2065 * available, or RTO is scheduled to fire first.
2066 */
2067 if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
2068 (flag & FLAG_ECE) || !tp->srtt_us)
2069 return false;
2070
2071 delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
2072 msecs_to_jiffies(2));
2073
2074 if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
2075 return false;
2076
2077 inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
2078 TCP_RTO_MAX);
2079 return true;
2080}
2081
2082/* Linux NewReno/SACK/FACK/ECN state machine. 2061/* Linux NewReno/SACK/FACK/ECN state machine.
2083 * -------------------------------------- 2062 * --------------------------------------
2084 * 2063 *
@@ -2126,10 +2105,26 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
2126 * F.e. after RTO, when all the queue is considered as lost, 2105 * F.e. after RTO, when all the queue is considered as lost,
2127 * lost_out = packets_out and in_flight = retrans_out. 2106 * lost_out = packets_out and in_flight = retrans_out.
2128 * 2107 *
2129 * Essentially, we now have two algorithms counting 2108 * Essentially, we now have a few algorithms detecting
2130 * lost packets. 2109 * lost packets.
2131 * 2110 *
2132 * FACK: It is the simplest heuristics. As soon as we decided 2111 * If the receiver supports SACK:
2112 *
2113 * RFC6675/3517: It is the conventional algorithm. A packet is
2114 * considered lost if the number of higher sequence packets
2115 * SACKed is greater than or equal to the DUPACK threshold
2116 * (reordering). This is implemented in tcp_mark_head_lost and
2117 * tcp_update_scoreboard.
2118 *
2119 * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
2120 * (2017-) that checks timing instead of counting DUPACKs.
2121 * Essentially a packet is considered lost if it's not S/ACKed
2122 * after RTT + reordering_window, where both metrics are
2123 * dynamically measured and adjusted. This is implemented in
2124 * tcp_rack_mark_lost.
2125 *
2126 * FACK (Disabled by default. Subsumed by RACK):
2127 * It is the simplest heuristics. As soon as we decided
2133 * that something is lost, we decide that _all_ not SACKed 2128 * that something is lost, we decide that _all_ not SACKed
2134 * packets until the most forward SACK are lost. I.e. 2129 * packets until the most forward SACK are lost. I.e.
2135 * lost_out = fackets_out - sacked_out and left_out = fackets_out. 2130 * lost_out = fackets_out - sacked_out and left_out = fackets_out.
@@ -2138,16 +2133,14 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
2138 * takes place. We use FACK by default until reordering 2133 * takes place. We use FACK by default until reordering
2139 * is suspected on the path to this destination. 2134 * is suspected on the path to this destination.
2140 * 2135 *
2141 * NewReno: when Recovery is entered, we assume that one segment 2136 * If the receiver does not support SACK:
2137 *
2138 * NewReno (RFC6582): in Recovery we assume that one segment
2142 * is lost (classic Reno). While we are in Recovery and 2139 * is lost (classic Reno). While we are in Recovery and
2143 * a partial ACK arrives, we assume that one more packet 2140 * a partial ACK arrives, we assume that one more packet
2144 * is lost (NewReno). These heuristics are the same in NewReno 2141 * is lost (NewReno). These heuristics are the same in NewReno
2145 * and SACK. 2142 * and SACK.
2146 * 2143 *
2147 * Imagine, that's all! Forget about all this shamanism about CWND inflation
2148 * deflation etc. CWND is real congestion window, never inflated, changes
2149 * only according to classic VJ rules.
2150 *
2151 * Really tricky (and requiring careful tuning) part of algorithm 2144 * Really tricky (and requiring careful tuning) part of algorithm
2152 * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue(). 2145 * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
2153 * The first determines the moment _when_ we should reduce CWND and, 2146 * The first determines the moment _when_ we should reduce CWND and,
@@ -2175,8 +2168,6 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
2175static bool tcp_time_to_recover(struct sock *sk, int flag) 2168static bool tcp_time_to_recover(struct sock *sk, int flag)
2176{ 2169{
2177 struct tcp_sock *tp = tcp_sk(sk); 2170 struct tcp_sock *tp = tcp_sk(sk);
2178 __u32 packets_out;
2179 int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
2180 2171
2181 /* Trick#1: The loss is proven. */ 2172 /* Trick#1: The loss is proven. */
2182 if (tp->lost_out) 2173 if (tp->lost_out)
@@ -2186,39 +2177,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
2186 if (tcp_dupack_heuristics(tp) > tp->reordering) 2177 if (tcp_dupack_heuristics(tp) > tp->reordering)
2187 return true; 2178 return true;
2188 2179
2189 /* Trick#4: It is still not OK... But will it be useful to delay
2190 * recovery more?
2191 */
2192 packets_out = tp->packets_out;
2193 if (packets_out <= tp->reordering &&
2194 tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) &&
2195 !tcp_may_send_now(sk)) {
2196 /* We have nothing to send. This connection is limited
2197 * either by receiver window or by application.
2198 */
2199 return true;
2200 }
2201
2202 /* If a thin stream is detected, retransmit after first
2203 * received dupack. Employ only if SACK is supported in order
2204 * to avoid possible corner-case series of spurious retransmissions
2205 * Use only if there are no unsent data.
2206 */
2207 if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
2208 tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
2209 tcp_is_sack(tp) && !tcp_send_head(sk))
2210 return true;
2211
2212 /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious
2213 * retransmissions due to small network reorderings, we implement
2214 * Mitigation A.3 in the RFC and delay the retransmission for a short
2215 * interval if appropriate.
2216 */
2217 if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
2218 (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
2219 !tcp_may_send_now(sk))
2220 return !tcp_pause_early_retransmit(sk, flag);
2221
2222 return false; 2180 return false;
2223} 2181}
2224 2182
@@ -2414,10 +2372,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
2414 if (tp->prior_ssthresh) { 2372 if (tp->prior_ssthresh) {
2415 const struct inet_connection_sock *icsk = inet_csk(sk); 2373 const struct inet_connection_sock *icsk = inet_csk(sk);
2416 2374
2417 if (icsk->icsk_ca_ops->undo_cwnd) 2375 tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
2418 tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
2419 else
2420 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
2421 2376
2422 if (tp->prior_ssthresh > tp->snd_ssthresh) { 2377 if (tp->prior_ssthresh > tp->snd_ssthresh) {
2423 tp->snd_ssthresh = tp->prior_ssthresh; 2378 tp->snd_ssthresh = tp->prior_ssthresh;
@@ -2523,8 +2478,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
2523 tcp_ecn_queue_cwr(tp); 2478 tcp_ecn_queue_cwr(tp);
2524} 2479}
2525 2480
2526static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, 2481void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
2527 int flag)
2528{ 2482{
2529 struct tcp_sock *tp = tcp_sk(sk); 2483 struct tcp_sock *tp = tcp_sk(sk);
2530 int sndcnt = 0; 2484 int sndcnt = 0;
@@ -2692,7 +2646,7 @@ void tcp_simple_retransmit(struct sock *sk)
2692} 2646}
2693EXPORT_SYMBOL(tcp_simple_retransmit); 2647EXPORT_SYMBOL(tcp_simple_retransmit);
2694 2648
2695static void tcp_enter_recovery(struct sock *sk, bool ece_ack) 2649void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2696{ 2650{
2697 struct tcp_sock *tp = tcp_sk(sk); 2651 struct tcp_sock *tp = tcp_sk(sk);
2698 int mib_idx; 2652 int mib_idx;
@@ -2728,14 +2682,18 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
2728 tcp_try_undo_loss(sk, false)) 2682 tcp_try_undo_loss(sk, false))
2729 return; 2683 return;
2730 2684
2731 if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ 2685 /* The ACK (s)acks some never-retransmitted data meaning not all
2732 /* Step 3.b. A timeout is spurious if not all data are 2686 * the data packets before the timeout were lost. Therefore we
2733 * lost, i.e., never-retransmitted data are (s)acked. 2687 * undo the congestion window and state. This is essentially
2734 */ 2688 * the operation in F-RTO (RFC5682 section 3.1 step 3.b). Since
2735 if ((flag & FLAG_ORIG_SACK_ACKED) && 2689 * a retransmitted skb is permanently marked, we can apply such an
2736 tcp_try_undo_loss(sk, true)) 2690 * operation even if F-RTO was not used.
2737 return; 2691 */
2692 if ((flag & FLAG_ORIG_SACK_ACKED) &&
2693 tcp_try_undo_loss(sk, tp->undo_marker))
2694 return;
2738 2695
2696 if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
2739 if (after(tp->snd_nxt, tp->high_seq)) { 2697 if (after(tp->snd_nxt, tp->high_seq)) {
2740 if (flag & FLAG_DATA_SACKED || is_dupack) 2698 if (flag & FLAG_DATA_SACKED || is_dupack)
2741 tp->frto = 0; /* Step 3.a. loss was real */ 2699 tp->frto = 0; /* Step 3.a. loss was real */
@@ -2802,6 +2760,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
2802 return false; 2760 return false;
2803} 2761}
2804 2762
2763static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag,
2764 const struct skb_mstamp *ack_time)
2765{
2766 struct tcp_sock *tp = tcp_sk(sk);
2767
2768 /* Use RACK to detect loss */
2769 if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
2770 u32 prior_retrans = tp->retrans_out;
2771
2772 tcp_rack_mark_lost(sk, ack_time);
2773 if (prior_retrans > tp->retrans_out)
2774 *ack_flag |= FLAG_LOST_RETRANS;
2775 }
2776}
2777
2805/* Process an event, which can update packets-in-flight not trivially. 2778/* Process an event, which can update packets-in-flight not trivially.
2806 * Main goal of this function is to calculate new estimate for left_out, 2779 * Main goal of this function is to calculate new estimate for left_out,
2807 * taking into account both packets sitting in receiver's buffer and 2780 * taking into account both packets sitting in receiver's buffer and
@@ -2815,7 +2788,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
2815 * tcp_xmit_retransmit_queue(). 2788 * tcp_xmit_retransmit_queue().
2816 */ 2789 */
2817static void tcp_fastretrans_alert(struct sock *sk, const int acked, 2790static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2818 bool is_dupack, int *ack_flag, int *rexmit) 2791 bool is_dupack, int *ack_flag, int *rexmit,
2792 const struct skb_mstamp *ack_time)
2819{ 2793{
2820 struct inet_connection_sock *icsk = inet_csk(sk); 2794 struct inet_connection_sock *icsk = inet_csk(sk);
2821 struct tcp_sock *tp = tcp_sk(sk); 2795 struct tcp_sock *tp = tcp_sk(sk);
@@ -2866,13 +2840,6 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2866 } 2840 }
2867 } 2841 }
2868 2842
2869 /* Use RACK to detect loss */
2870 if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
2871 tcp_rack_mark_lost(sk)) {
2872 flag |= FLAG_LOST_RETRANS;
2873 *ack_flag |= FLAG_LOST_RETRANS;
2874 }
2875
2876 /* E. Process state. */ 2843 /* E. Process state. */
2877 switch (icsk->icsk_ca_state) { 2844 switch (icsk->icsk_ca_state) {
2878 case TCP_CA_Recovery: 2845 case TCP_CA_Recovery:
@@ -2890,11 +2857,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2890 tcp_try_keep_open(sk); 2857 tcp_try_keep_open(sk);
2891 return; 2858 return;
2892 } 2859 }
2860 tcp_rack_identify_loss(sk, ack_flag, ack_time);
2893 break; 2861 break;
2894 case TCP_CA_Loss: 2862 case TCP_CA_Loss:
2895 tcp_process_loss(sk, flag, is_dupack, rexmit); 2863 tcp_process_loss(sk, flag, is_dupack, rexmit);
2896 if (icsk->icsk_ca_state != TCP_CA_Open && 2864 tcp_rack_identify_loss(sk, ack_flag, ack_time);
2897 !(flag & FLAG_LOST_RETRANS)) 2865 if (!(icsk->icsk_ca_state == TCP_CA_Open ||
2866 (*ack_flag & FLAG_LOST_RETRANS)))
2898 return; 2867 return;
2899 /* Change state if cwnd is undone or retransmits are lost */ 2868 /* Change state if cwnd is undone or retransmits are lost */
2900 default: 2869 default:
@@ -2908,6 +2877,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2908 if (icsk->icsk_ca_state <= TCP_CA_Disorder) 2877 if (icsk->icsk_ca_state <= TCP_CA_Disorder)
2909 tcp_try_undo_dsack(sk); 2878 tcp_try_undo_dsack(sk);
2910 2879
2880 tcp_rack_identify_loss(sk, ack_flag, ack_time);
2911 if (!tcp_time_to_recover(sk, flag)) { 2881 if (!tcp_time_to_recover(sk, flag)) {
2912 tcp_try_to_open(sk, flag); 2882 tcp_try_to_open(sk, flag);
2913 return; 2883 return;
@@ -3026,7 +2996,7 @@ void tcp_rearm_rto(struct sock *sk)
3026 } else { 2996 } else {
3027 u32 rto = inet_csk(sk)->icsk_rto; 2997 u32 rto = inet_csk(sk)->icsk_rto;
3028 /* Offset the time elapsed after installing regular RTO */ 2998 /* Offset the time elapsed after installing regular RTO */
3029 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 2999 if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
3030 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 3000 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
3031 struct sk_buff *skb = tcp_write_queue_head(sk); 3001 struct sk_buff *skb = tcp_write_queue_head(sk);
3032 const u32 rto_time_stamp = 3002 const u32 rto_time_stamp =
@@ -3043,24 +3013,6 @@ void tcp_rearm_rto(struct sock *sk)
3043 } 3013 }
3044} 3014}
3045 3015
3046/* This function is called when the delayed ER timer fires. TCP enters
3047 * fast recovery and performs fast-retransmit.
3048 */
3049void tcp_resume_early_retransmit(struct sock *sk)
3050{
3051 struct tcp_sock *tp = tcp_sk(sk);
3052
3053 tcp_rearm_rto(sk);
3054
3055 /* Stop if ER is disabled after the delayed ER timer is scheduled */
3056 if (!tp->do_early_retrans)
3057 return;
3058
3059 tcp_enter_recovery(sk, false);
3060 tcp_update_scoreboard(sk, 1);
3061 tcp_xmit_retransmit_queue(sk);
3062}
3063
3064/* If we get here, the whole TSO packet has not been acked. */ 3016/* If we get here, the whole TSO packet has not been acked. */
3065static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) 3017static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
3066{ 3018{
@@ -3103,11 +3055,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
3103 */ 3055 */
3104static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, 3056static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3105 u32 prior_snd_una, int *acked, 3057 u32 prior_snd_una, int *acked,
3106 struct tcp_sacktag_state *sack, 3058 struct tcp_sacktag_state *sack)
3107 struct skb_mstamp *now)
3108{ 3059{
3109 const struct inet_connection_sock *icsk = inet_csk(sk); 3060 const struct inet_connection_sock *icsk = inet_csk(sk);
3110 struct skb_mstamp first_ackt, last_ackt; 3061 struct skb_mstamp first_ackt, last_ackt;
3062 struct skb_mstamp *now = &sack->ack_time;
3111 struct tcp_sock *tp = tcp_sk(sk); 3063 struct tcp_sock *tp = tcp_sk(sk);
3112 u32 prior_sacked = tp->sacked_out; 3064 u32 prior_sacked = tp->sacked_out;
3113 u32 reord = tp->packets_out; 3065 u32 reord = tp->packets_out;
@@ -3167,7 +3119,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3167 } else if (tcp_is_sack(tp)) { 3119 } else if (tcp_is_sack(tp)) {
3168 tp->delivered += acked_pcount; 3120 tp->delivered += acked_pcount;
3169 if (!tcp_skb_spurious_retrans(tp, skb)) 3121 if (!tcp_skb_spurious_retrans(tp, skb))
3170 tcp_rack_advance(tp, &skb->skb_mstamp, sacked); 3122 tcp_rack_advance(tp, sacked, scb->end_seq,
3123 &skb->skb_mstamp,
3124 &sack->ack_time);
3171 } 3125 }
3172 if (sacked & TCPCB_LOST) 3126 if (sacked & TCPCB_LOST)
3173 tp->lost_out -= acked_pcount; 3127 tp->lost_out -= acked_pcount;
@@ -3201,6 +3155,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3201 tp->lost_skb_hint = NULL; 3155 tp->lost_skb_hint = NULL;
3202 } 3156 }
3203 3157
3158 if (!skb)
3159 tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
3160
3204 if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) 3161 if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
3205 tp->snd_up = tp->snd_una; 3162 tp->snd_up = tp->snd_una;
3206 3163
@@ -3371,9 +3328,7 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
3371 u32 delta = ack - tp->snd_una; 3328 u32 delta = ack - tp->snd_una;
3372 3329
3373 sock_owned_by_me((struct sock *)tp); 3330 sock_owned_by_me((struct sock *)tp);
3374 u64_stats_update_begin_raw(&tp->syncp);
3375 tp->bytes_acked += delta; 3331 tp->bytes_acked += delta;
3376 u64_stats_update_end_raw(&tp->syncp);
3377 tp->snd_una = ack; 3332 tp->snd_una = ack;
3378} 3333}
3379 3334
@@ -3383,9 +3338,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
3383 u32 delta = seq - tp->rcv_nxt; 3338 u32 delta = seq - tp->rcv_nxt;
3384 3339
3385 sock_owned_by_me((struct sock *)tp); 3340 sock_owned_by_me((struct sock *)tp);
3386 u64_stats_update_begin_raw(&tp->syncp);
3387 tp->bytes_received += delta; 3341 tp->bytes_received += delta;
3388 u64_stats_update_end_raw(&tp->syncp);
3389 tp->rcv_nxt = seq; 3342 tp->rcv_nxt = seq;
3390} 3343}
3391 3344
@@ -3598,7 +3551,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3598 u32 lost = tp->lost; 3551 u32 lost = tp->lost;
3599 int acked = 0; /* Number of packets newly acked */ 3552 int acked = 0; /* Number of packets newly acked */
3600 int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ 3553 int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
3601 struct skb_mstamp now;
3602 3554
3603 sack_state.first_sackt.v64 = 0; 3555 sack_state.first_sackt.v64 = 0;
3604 sack_state.rate = &rs; 3556 sack_state.rate = &rs;
@@ -3624,10 +3576,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3624 if (after(ack, tp->snd_nxt)) 3576 if (after(ack, tp->snd_nxt))
3625 goto invalid_ack; 3577 goto invalid_ack;
3626 3578
3627 skb_mstamp_get(&now); 3579 skb_mstamp_get(&sack_state.ack_time);
3628 3580
3629 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 3581 if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
3630 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
3631 tcp_rearm_rto(sk); 3582 tcp_rearm_rto(sk);
3632 3583
3633 if (after(ack, prior_snd_una)) { 3584 if (after(ack, prior_snd_una)) {
@@ -3692,34 +3643,34 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3692 3643
3693 /* See if we can take anything off of the retransmit queue. */ 3644 /* See if we can take anything off of the retransmit queue. */
3694 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, 3645 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
3695 &sack_state, &now); 3646 &sack_state);
3696 3647
3697 if (tcp_ack_is_dubious(sk, flag)) { 3648 if (tcp_ack_is_dubious(sk, flag)) {
3698 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); 3649 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3699 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); 3650 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
3651 &sack_state.ack_time);
3700 } 3652 }
3701 if (tp->tlp_high_seq) 3653 if (tp->tlp_high_seq)
3702 tcp_process_tlp_ack(sk, ack, flag); 3654 tcp_process_tlp_ack(sk, ack, flag);
3703 3655
3704 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { 3656 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
3705 struct dst_entry *dst = __sk_dst_get(sk); 3657 sk_dst_confirm(sk);
3706 if (dst)
3707 dst_confirm(dst);
3708 }
3709 3658
3710 if (icsk->icsk_pending == ICSK_TIME_RETRANS) 3659 if (icsk->icsk_pending == ICSK_TIME_RETRANS)
3711 tcp_schedule_loss_probe(sk); 3660 tcp_schedule_loss_probe(sk);
3712 delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ 3661 delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
3713 lost = tp->lost - lost; /* freshly marked lost */ 3662 lost = tp->lost - lost; /* freshly marked lost */
3714 tcp_rate_gen(sk, delivered, lost, &now, &rs); 3663 tcp_rate_gen(sk, delivered, lost, &sack_state.ack_time,
3715 tcp_cong_control(sk, ack, delivered, flag, &rs); 3664 sack_state.rate);
3665 tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
3716 tcp_xmit_recovery(sk, rexmit); 3666 tcp_xmit_recovery(sk, rexmit);
3717 return 1; 3667 return 1;
3718 3668
3719no_queue: 3669no_queue:
3720 /* If data was DSACKed, see if we can undo a cwnd reduction. */ 3670 /* If data was DSACKed, see if we can undo a cwnd reduction. */
3721 if (flag & FLAG_DSACKING_ACK) 3671 if (flag & FLAG_DSACKING_ACK)
3722 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); 3672 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
3673 &sack_state.ack_time);
3723 /* If this ack opens up a zero window, clear backoff. It was 3674 /* If this ack opens up a zero window, clear backoff. It was
3724 * being used to time the probes, and is probably far higher than 3675 * being used to time the probes, and is probably far higher than
3725 * it needs to be for normal retransmission. 3676 * it needs to be for normal retransmission.
@@ -3740,9 +3691,11 @@ old_ack:
3740 * If data was DSACKed, see if we can undo a cwnd reduction. 3691 * If data was DSACKed, see if we can undo a cwnd reduction.
3741 */ 3692 */
3742 if (TCP_SKB_CB(skb)->sacked) { 3693 if (TCP_SKB_CB(skb)->sacked) {
3694 skb_mstamp_get(&sack_state.ack_time);
3743 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, 3695 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3744 &sack_state); 3696 &sack_state);
3745 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); 3697 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
3698 &sack_state.ack_time);
3746 tcp_xmit_recovery(sk, rexmit); 3699 tcp_xmit_recovery(sk, rexmit);
3747 } 3700 }
3748 3701
@@ -4560,6 +4513,7 @@ add_sack:
4560end: 4513end:
4561 if (skb) { 4514 if (skb) {
4562 tcp_grow_window(sk, skb); 4515 tcp_grow_window(sk, skb);
4516 skb_condense(skb);
4563 skb_set_owner_r(skb, sk); 4517 skb_set_owner_r(skb, sk);
4564 } 4518 }
4565} 4519}
@@ -5081,10 +5035,13 @@ static void tcp_check_space(struct sock *sk)
5081 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { 5035 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
5082 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); 5036 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
5083 /* pairs with tcp_poll() */ 5037 /* pairs with tcp_poll() */
5084 smp_mb__after_atomic(); 5038 smp_mb();
5085 if (sk->sk_socket && 5039 if (sk->sk_socket &&
5086 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) 5040 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
5087 tcp_new_space(sk); 5041 tcp_new_space(sk);
5042 if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
5043 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
5044 }
5088 } 5045 }
5089} 5046}
5090 5047
@@ -5249,6 +5206,23 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
5249 return err; 5206 return err;
5250} 5207}
5251 5208
5209/* Accept RST for rcv_nxt - 1 after a FIN.
5210 * When tcp connections are abruptly terminated from Mac OSX (via ^C), a
5211 * FIN is sent followed by a RST packet. The RST is sent with the same
5212 * sequence number as the FIN, and thus according to RFC 5961 a challenge
5213 * ACK should be sent. However, Mac OSX rate limits replies to challenge
5214 * ACKs on the closed socket. In addition middleboxes can drop either the
5215 * challenge ACK or a subsequent RST.
5216 */
5217static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
5218{
5219 struct tcp_sock *tp = tcp_sk(sk);
5220
5221 return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
5222 (1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK |
5223 TCPF_CLOSING));
5224}
5225
5252/* Does PAWS and seqno based validation of an incoming segment, flags will 5226/* Does PAWS and seqno based validation of an incoming segment, flags will
5253 * play significant role here. 5227 * play significant role here.
5254 */ 5228 */
@@ -5287,20 +5261,25 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
5287 LINUX_MIB_TCPACKSKIPPEDSEQ, 5261 LINUX_MIB_TCPACKSKIPPEDSEQ,
5288 &tp->last_oow_ack_time)) 5262 &tp->last_oow_ack_time))
5289 tcp_send_dupack(sk, skb); 5263 tcp_send_dupack(sk, skb);
5264 } else if (tcp_reset_check(sk, skb)) {
5265 tcp_reset(sk);
5290 } 5266 }
5291 goto discard; 5267 goto discard;
5292 } 5268 }
5293 5269
5294 /* Step 2: check RST bit */ 5270 /* Step 2: check RST bit */
5295 if (th->rst) { 5271 if (th->rst) {
5296 /* RFC 5961 3.2 (extend to match against SACK too if available): 5272 /* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a
5297 * If seq num matches RCV.NXT or the right-most SACK block, 5273 * FIN and SACK too if available):
5274 * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
5275 * the right-most SACK block,
5298 * then 5276 * then
5299 * RESET the connection 5277 * RESET the connection
5300 * else 5278 * else
5301 * Send a challenge ACK 5279 * Send a challenge ACK
5302 */ 5280 */
5303 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { 5281 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt ||
5282 tcp_reset_check(sk, skb)) {
5304 rst_seq_match = true; 5283 rst_seq_match = true;
5305 } else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) { 5284 } else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
5306 struct tcp_sack_block *sp = &tp->selective_acks[0]; 5285 struct tcp_sack_block *sp = &tp->selective_acks[0];
@@ -5571,6 +5550,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5571 struct inet_connection_sock *icsk = inet_csk(sk); 5550 struct inet_connection_sock *icsk = inet_csk(sk);
5572 5551
5573 tcp_set_state(sk, TCP_ESTABLISHED); 5552 tcp_set_state(sk, TCP_ESTABLISHED);
5553 icsk->icsk_ack.lrcvtime = tcp_time_stamp;
5574 5554
5575 if (skb) { 5555 if (skb) {
5576 icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); 5556 icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
@@ -5789,7 +5769,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5789 * to stand against the temptation 8) --ANK 5769 * to stand against the temptation 8) --ANK
5790 */ 5770 */
5791 inet_csk_schedule_ack(sk); 5771 inet_csk_schedule_ack(sk);
5792 icsk->icsk_ack.lrcvtime = tcp_time_stamp;
5793 tcp_enter_quickack_mode(sk); 5772 tcp_enter_quickack_mode(sk);
5794 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 5773 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
5795 TCP_DELACK_MAX, TCP_RTO_MAX); 5774 TCP_DELACK_MAX, TCP_RTO_MAX);
@@ -5916,9 +5895,15 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
5916 if (th->syn) { 5895 if (th->syn) {
5917 if (th->fin) 5896 if (th->fin)
5918 goto discard; 5897 goto discard;
5919 if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) 5898 /* It is possible that we process SYN packets from backlog,
5920 return 1; 5899 * so we need to make sure to disable BH right there.
5900 */
5901 local_bh_disable();
5902 acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
5903 local_bh_enable();
5921 5904
5905 if (!acceptable)
5906 return 1;
5922 consume_skb(skb); 5907 consume_skb(skb);
5923 return 0; 5908 return 0;
5924 } 5909 }
@@ -6022,7 +6007,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
6022 break; 6007 break;
6023 6008
6024 case TCP_FIN_WAIT1: { 6009 case TCP_FIN_WAIT1: {
6025 struct dst_entry *dst;
6026 int tmo; 6010 int tmo;
6027 6011
6028 /* If we enter the TCP_FIN_WAIT1 state and we are a 6012 /* If we enter the TCP_FIN_WAIT1 state and we are a
@@ -6049,9 +6033,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
6049 tcp_set_state(sk, TCP_FIN_WAIT2); 6033 tcp_set_state(sk, TCP_FIN_WAIT2);
6050 sk->sk_shutdown |= SEND_SHUTDOWN; 6034 sk->sk_shutdown |= SEND_SHUTDOWN;
6051 6035
6052 dst = __sk_dst_get(sk); 6036 sk_dst_confirm(sk);
6053 if (dst)
6054 dst_confirm(dst);
6055 6037
6056 if (!sock_flag(sk, SOCK_DEAD)) { 6038 if (!sock_flag(sk, SOCK_DEAD)) {
6057 /* Wake up lingering close() */ 6039 /* Wake up lingering close() */
@@ -6318,13 +6300,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
6318 goto drop; 6300 goto drop;
6319 } 6301 }
6320 6302
6321 6303 if (sk_acceptq_is_full(sk)) {
6322 /* Accept backlog is full. If we have already queued enough
6323 * of warm entries in syn queue, drop request. It is better than
6324 * clogging syn queue with openreqs with exponentially increasing
6325 * timeout.
6326 */
6327 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
6328 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 6304 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
6329 goto drop; 6305 goto drop;
6330 } 6306 }
@@ -6334,6 +6310,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
6334 goto drop; 6310 goto drop;
6335 6311
6336 tcp_rsk(req)->af_specific = af_ops; 6312 tcp_rsk(req)->af_specific = af_ops;
6313 tcp_rsk(req)->ts_off = 0;
6337 6314
6338 tcp_clear_options(&tmp_opt); 6315 tcp_clear_options(&tmp_opt);
6339 tmp_opt.mss_clamp = af_ops->mss_clamp; 6316 tmp_opt.mss_clamp = af_ops->mss_clamp;
@@ -6355,6 +6332,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
6355 if (security_inet_conn_request(sk, skb, req)) 6332 if (security_inet_conn_request(sk, skb, req))
6356 goto drop_and_free; 6333 goto drop_and_free;
6357 6334
6335 if (isn && tmp_opt.tstamp_ok)
6336 af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
6337
6358 if (!want_cookie && !isn) { 6338 if (!want_cookie && !isn) {
6359 /* VJ's idea. We save last timestamp seen 6339 /* VJ's idea. We save last timestamp seen
6360 * from the destination in peer table, when entering 6340 * from the destination in peer table, when entering
@@ -6365,7 +6345,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
6365 * timewait bucket, so that all the necessary checks 6345 * timewait bucket, so that all the necessary checks
6366 * are made in the function processing timewait state. 6346 * are made in the function processing timewait state.
6367 */ 6347 */
6368 if (tcp_death_row.sysctl_tw_recycle) { 6348 if (net->ipv4.tcp_death_row.sysctl_tw_recycle) {
6369 bool strict; 6349 bool strict;
6370 6350
6371 dst = af_ops->route_req(sk, &fl, req, &strict); 6351 dst = af_ops->route_req(sk, &fl, req, &strict);
@@ -6379,8 +6359,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
6379 } 6359 }
6380 /* Kill the following clause, if you dislike this way. */ 6360 /* Kill the following clause, if you dislike this way. */
6381 else if (!net->ipv4.sysctl_tcp_syncookies && 6361 else if (!net->ipv4.sysctl_tcp_syncookies &&
6382 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < 6362 (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
6383 (sysctl_max_syn_backlog >> 2)) && 6363 (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
6384 !tcp_peer_is_proven(req, dst, false, 6364 !tcp_peer_is_proven(req, dst, false,
6385 tmp_opt.saw_tstamp)) { 6365 tmp_opt.saw_tstamp)) {
6386 /* Without syncookies last quarter of 6366 /* Without syncookies last quarter of
@@ -6395,7 +6375,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
6395 goto drop_and_release; 6375 goto drop_and_release;
6396 } 6376 }
6397 6377
6398 isn = af_ops->init_seq(skb); 6378 isn = af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
6399 } 6379 }
6400 if (!dst) { 6380 if (!dst) {
6401 dst = af_ops->route_req(sk, &fl, req, NULL); 6381 dst = af_ops->route_req(sk, &fl, req, NULL);
@@ -6407,6 +6387,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
6407 6387
6408 if (want_cookie) { 6388 if (want_cookie) {
6409 isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); 6389 isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
6390 tcp_rsk(req)->ts_off = 0;
6410 req->cookie_ts = tmp_opt.tstamp_ok; 6391 req->cookie_ts = tmp_opt.tstamp_ok;
6411 if (!tmp_opt.tstamp_ok) 6392 if (!tmp_opt.tstamp_ok)
6412 inet_rsk(req)->ecn_ok = 0; 6393 inet_rsk(req)->ecn_ok = 0;
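The rewritten comment block in this file describes RACK only in prose; the fragment below is a self-contained sketch of the timing rule it refers to: a segment is deemed lost when it has not been (s)acked for longer than the measured RTT plus a reordering window, rather than after a fixed count of duplicate ACKs. This is a conceptual illustration of draft-ietf-tcpm-rack, not the kernel's tcp_rack_mark_lost() implementation, and every identifier in it is invented for the example.

#include <stdbool.h>
#include <stdint.h>

struct pkt {
	uint64_t xmit_time_us;	/* when the segment was (re)transmitted */
	bool	 sacked;	/* already selectively acknowledged */
};

static bool rack_considers_lost(const struct pkt *p,
				uint64_t now_us,
				uint64_t min_rtt_us,
				uint64_t reo_wnd_us)
{
	if (p->sacked)
		return false;
	/* Not (s)acked after RTT + reordering window => mark lost */
	return now_us - p->xmit_time_us > min_rtt_us + reo_wnd_us;
}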
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2259114c7242..575e19dcc017 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -84,7 +84,6 @@
84#include <crypto/hash.h> 84#include <crypto/hash.h>
85#include <linux/scatterlist.h> 85#include <linux/scatterlist.h>
86 86
87int sysctl_tcp_tw_reuse __read_mostly;
88int sysctl_tcp_low_latency __read_mostly; 87int sysctl_tcp_low_latency __read_mostly;
89 88
90#ifdef CONFIG_TCP_MD5SIG 89#ifdef CONFIG_TCP_MD5SIG
@@ -95,12 +94,12 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
95struct inet_hashinfo tcp_hashinfo; 94struct inet_hashinfo tcp_hashinfo;
96EXPORT_SYMBOL(tcp_hashinfo); 95EXPORT_SYMBOL(tcp_hashinfo);
97 96
98static __u32 tcp_v4_init_sequence(const struct sk_buff *skb) 97static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff)
99{ 98{
100 return secure_tcp_sequence_number(ip_hdr(skb)->daddr, 99 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
101 ip_hdr(skb)->saddr, 100 ip_hdr(skb)->saddr,
102 tcp_hdr(skb)->dest, 101 tcp_hdr(skb)->dest,
103 tcp_hdr(skb)->source); 102 tcp_hdr(skb)->source, tsoff);
104} 103}
105 104
106int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 105int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
@@ -120,7 +119,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
120 and use initial timestamp retrieved from peer table. 119 and use initial timestamp retrieved from peer table.
121 */ 120 */
122 if (tcptw->tw_ts_recent_stamp && 121 if (tcptw->tw_ts_recent_stamp &&
123 (!twp || (sysctl_tcp_tw_reuse && 122 (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
124 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { 123 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
125 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; 124 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
126 if (tp->write_seq == 0) 125 if (tp->write_seq == 0)
@@ -146,7 +145,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
146 struct flowi4 *fl4; 145 struct flowi4 *fl4;
147 struct rtable *rt; 146 struct rtable *rt;
148 int err; 147 int err;
148 u32 seq;
149 struct ip_options_rcu *inet_opt; 149 struct ip_options_rcu *inet_opt;
150 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
150 151
151 if (addr_len < sizeof(struct sockaddr_in)) 152 if (addr_len < sizeof(struct sockaddr_in))
152 return -EINVAL; 153 return -EINVAL;
@@ -197,7 +198,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
197 tp->write_seq = 0; 198 tp->write_seq = 0;
198 } 199 }
199 200
200 if (tcp_death_row.sysctl_tw_recycle && 201 if (tcp_death_row->sysctl_tw_recycle &&
201 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) 202 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
202 tcp_fetch_timewait_stamp(sk, &rt->dst); 203 tcp_fetch_timewait_stamp(sk, &rt->dst);
203 204
@@ -216,7 +217,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
216 * complete initialization after this. 217 * complete initialization after this.
217 */ 218 */
218 tcp_set_state(sk, TCP_SYN_SENT); 219 tcp_set_state(sk, TCP_SYN_SENT);
219 err = inet_hash_connect(&tcp_death_row, sk); 220 err = inet_hash_connect(tcp_death_row, sk);
220 if (err) 221 if (err)
221 goto failure; 222 goto failure;
222 223
@@ -232,18 +233,27 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
232 /* OK, now commit destination to socket. */ 233 /* OK, now commit destination to socket. */
233 sk->sk_gso_type = SKB_GSO_TCPV4; 234 sk->sk_gso_type = SKB_GSO_TCPV4;
234 sk_setup_caps(sk, &rt->dst); 235 sk_setup_caps(sk, &rt->dst);
236 rt = NULL;
235 237
236 if (!tp->write_seq && likely(!tp->repair)) 238 if (likely(!tp->repair)) {
237 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, 239 seq = secure_tcp_sequence_number(inet->inet_saddr,
238 inet->inet_daddr, 240 inet->inet_daddr,
239 inet->inet_sport, 241 inet->inet_sport,
240 usin->sin_port); 242 usin->sin_port,
243 &tp->tsoffset);
244 if (!tp->write_seq)
245 tp->write_seq = seq;
246 }
241 247
242 inet->inet_id = tp->write_seq ^ jiffies; 248 inet->inet_id = tp->write_seq ^ jiffies;
243 249
250 if (tcp_fastopen_defer_connect(sk, &err))
251 return err;
252 if (err)
253 goto failure;
254
244 err = tcp_connect(sk); 255 err = tcp_connect(sk);
245 256
246 rt = NULL;
247 if (err) 257 if (err)
248 goto failure; 258 goto failure;
249 259
@@ -269,10 +279,13 @@ EXPORT_SYMBOL(tcp_v4_connect);
269 */ 279 */
270void tcp_v4_mtu_reduced(struct sock *sk) 280void tcp_v4_mtu_reduced(struct sock *sk)
271{ 281{
272 struct dst_entry *dst;
273 struct inet_sock *inet = inet_sk(sk); 282 struct inet_sock *inet = inet_sk(sk);
274 u32 mtu = tcp_sk(sk)->mtu_info; 283 struct dst_entry *dst;
284 u32 mtu;
275 285
286 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
287 return;
288 mtu = tcp_sk(sk)->mtu_info;
276 dst = inet_csk_update_pmtu(sk, mtu); 289 dst = inet_csk_update_pmtu(sk, mtu);
277 if (!dst) 290 if (!dst)
278 return; 291 return;
@@ -418,7 +431,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
418 431
419 switch (type) { 432 switch (type) {
420 case ICMP_REDIRECT: 433 case ICMP_REDIRECT:
421 do_redirect(icmp_skb, sk); 434 if (!sock_owned_by_user(sk))
435 do_redirect(icmp_skb, sk);
422 goto out; 436 goto out;
423 case ICMP_SOURCE_QUENCH: 437 case ICMP_SOURCE_QUENCH:
424 /* Just silently ignore these. */ 438 /* Just silently ignore these. */
@@ -442,7 +456,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
442 if (!sock_owned_by_user(sk)) { 456 if (!sock_owned_by_user(sk)) {
443 tcp_v4_mtu_reduced(sk); 457 tcp_v4_mtu_reduced(sk);
444 } else { 458 } else {
445 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) 459 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
446 sock_hold(sk); 460 sock_hold(sk);
447 } 461 }
448 goto out; 462 goto out;
@@ -691,6 +705,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
691 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 705 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
692 706
693 arg.tos = ip_hdr(skb)->tos; 707 arg.tos = ip_hdr(skb)->tos;
708 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
694 local_bh_disable(); 709 local_bh_disable();
695 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), 710 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
696 skb, &TCP_SKB_CB(skb)->header.h4.opt, 711 skb, &TCP_SKB_CB(skb)->header.h4.opt,
@@ -711,7 +726,7 @@ out:
711 outside socket context is ugly, certainly. What can I do? 726 outside socket context is ugly, certainly. What can I do?
712 */ 727 */
713 728
714static void tcp_v4_send_ack(struct net *net, 729static void tcp_v4_send_ack(const struct sock *sk,
715 struct sk_buff *skb, u32 seq, u32 ack, 730 struct sk_buff *skb, u32 seq, u32 ack,
716 u32 win, u32 tsval, u32 tsecr, int oif, 731 u32 win, u32 tsval, u32 tsecr, int oif,
717 struct tcp_md5sig_key *key, 732 struct tcp_md5sig_key *key,
@@ -726,6 +741,7 @@ static void tcp_v4_send_ack(struct net *net,
726#endif 741#endif
727 ]; 742 ];
728 } rep; 743 } rep;
744 struct net *net = sock_net(sk);
729 struct ip_reply_arg arg; 745 struct ip_reply_arg arg;
730 746
731 memset(&rep.th, 0, sizeof(struct tcphdr)); 747 memset(&rep.th, 0, sizeof(struct tcphdr));
@@ -775,6 +791,7 @@ static void tcp_v4_send_ack(struct net *net,
775 if (oif) 791 if (oif)
776 arg.bound_dev_if = oif; 792 arg.bound_dev_if = oif;
777 arg.tos = tos; 793 arg.tos = tos;
794 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
778 local_bh_disable(); 795 local_bh_disable();
779 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), 796 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
780 skb, &TCP_SKB_CB(skb)->header.h4.opt, 797 skb, &TCP_SKB_CB(skb)->header.h4.opt,
@@ -790,7 +807,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
790 struct inet_timewait_sock *tw = inet_twsk(sk); 807 struct inet_timewait_sock *tw = inet_twsk(sk);
791 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 808 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
792 809
793 tcp_v4_send_ack(sock_net(sk), skb, 810 tcp_v4_send_ack(sk, skb,
794 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 811 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
795 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 812 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
796 tcp_time_stamp + tcptw->tw_ts_offset, 813 tcp_time_stamp + tcptw->tw_ts_offset,
@@ -818,10 +835,10 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
818 * exception of <SYN> segments, MUST be right-shifted by 835 * exception of <SYN> segments, MUST be right-shifted by
819 * Rcv.Wind.Shift bits: 836 * Rcv.Wind.Shift bits:
820 */ 837 */
821 tcp_v4_send_ack(sock_net(sk), skb, seq, 838 tcp_v4_send_ack(sk, skb, seq,
822 tcp_rsk(req)->rcv_nxt, 839 tcp_rsk(req)->rcv_nxt,
823 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 840 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
824 tcp_time_stamp, 841 tcp_time_stamp + tcp_rsk(req)->ts_off,
825 req->ts_recent, 842 req->ts_recent,
826 0, 843 0,
827 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, 844 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
@@ -1315,10 +1332,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1315 tcp_ca_openreq_child(newsk, dst); 1332 tcp_ca_openreq_child(newsk, dst);
1316 1333
1317 tcp_sync_mss(newsk, dst_mtu(dst)); 1334 tcp_sync_mss(newsk, dst_mtu(dst));
1318 newtp->advmss = dst_metric_advmss(dst); 1335 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1319 if (tcp_sk(sk)->rx_opt.user_mss &&
1320 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1321 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1322 1336
1323 tcp_initialize_rcv_mss(newsk); 1337 tcp_initialize_rcv_mss(newsk);
1324 1338
@@ -1552,8 +1566,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1552 * It has been noticed pure SACK packets were sometimes dropped 1566 * It has been noticed pure SACK packets were sometimes dropped
1553 * (if cooked by drivers without copybreak feature). 1567 * (if cooked by drivers without copybreak feature).
1554 */ 1568 */
1555 if (!skb->data_len) 1569 skb_condense(skb);
1556 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
1557 1570
1558 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1571 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1559 bh_unlock_sock(sk); 1572 bh_unlock_sock(sk);
@@ -1813,7 +1826,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
1813 .getsockopt = ip_getsockopt, 1826 .getsockopt = ip_getsockopt,
1814 .addr2sockaddr = inet_csk_addr2sockaddr, 1827 .addr2sockaddr = inet_csk_addr2sockaddr,
1815 .sockaddr_len = sizeof(struct sockaddr_in), 1828 .sockaddr_len = sizeof(struct sockaddr_in),
1816 .bind_conflict = inet_csk_bind_conflict,
1817#ifdef CONFIG_COMPAT 1829#ifdef CONFIG_COMPAT
1818 .compat_setsockopt = compat_ip_setsockopt, 1830 .compat_setsockopt = compat_ip_setsockopt,
1819 .compat_getsockopt = compat_ip_getsockopt, 1831 .compat_getsockopt = compat_ip_getsockopt,
@@ -1884,9 +1896,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
1884 tcp_free_fastopen_req(tp); 1896 tcp_free_fastopen_req(tp);
1885 tcp_saved_syn_free(tp); 1897 tcp_saved_syn_free(tp);
1886 1898
1887 local_bh_disable();
1888 sk_sockets_allocated_dec(sk); 1899 sk_sockets_allocated_dec(sk);
1889 local_bh_enable();
1890} 1900}
1891EXPORT_SYMBOL(tcp_v4_destroy_sock); 1901EXPORT_SYMBOL(tcp_v4_destroy_sock);
1892 1902
@@ -1908,7 +1918,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
1908 if (!sk) { 1918 if (!sk) {
1909get_head: 1919get_head:
1910 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 1920 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1911 spin_lock_bh(&ilb->lock); 1921 spin_lock(&ilb->lock);
1912 sk = sk_head(&ilb->head); 1922 sk = sk_head(&ilb->head);
1913 st->offset = 0; 1923 st->offset = 0;
1914 goto get_sk; 1924 goto get_sk;
@@ -1925,7 +1935,7 @@ get_sk:
1925 if (sk->sk_family == st->family) 1935 if (sk->sk_family == st->family)
1926 return sk; 1936 return sk;
1927 } 1937 }
1928 spin_unlock_bh(&ilb->lock); 1938 spin_unlock(&ilb->lock);
1929 st->offset = 0; 1939 st->offset = 0;
1930 if (++st->bucket < INET_LHTABLE_SIZE) 1940 if (++st->bucket < INET_LHTABLE_SIZE)
1931 goto get_head; 1941 goto get_head;
@@ -2133,7 +2143,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
2133 switch (st->state) { 2143 switch (st->state) {
2134 case TCP_SEQ_STATE_LISTENING: 2144 case TCP_SEQ_STATE_LISTENING:
2135 if (v != SEQ_START_TOKEN) 2145 if (v != SEQ_START_TOKEN)
2136 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock); 2146 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2137 break; 2147 break;
2138 case TCP_SEQ_STATE_ESTABLISHED: 2148 case TCP_SEQ_STATE_ESTABLISHED:
2139 if (v) 2149 if (v)
@@ -2225,7 +2235,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2225 int state; 2235 int state;
2226 2236
2227 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2237 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2228 icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 2238 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2229 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2239 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2230 timer_active = 1; 2240 timer_active = 1;
2231 timer_expires = icsk->icsk_timeout; 2241 timer_expires = icsk->icsk_timeout;
@@ -2372,6 +2382,7 @@ struct proto tcp_prot = {
2372 .shutdown = tcp_shutdown, 2382 .shutdown = tcp_shutdown,
2373 .setsockopt = tcp_setsockopt, 2383 .setsockopt = tcp_setsockopt,
2374 .getsockopt = tcp_getsockopt, 2384 .getsockopt = tcp_getsockopt,
2385 .keepalive = tcp_set_keepalive,
2375 .recvmsg = tcp_recvmsg, 2386 .recvmsg = tcp_recvmsg,
2376 .sendmsg = tcp_sendmsg, 2387 .sendmsg = tcp_sendmsg,
2377 .sendpage = tcp_sendpage, 2388 .sendpage = tcp_sendpage,
@@ -2415,7 +2426,7 @@ static void __net_exit tcp_sk_exit(struct net *net)
2415 2426
2416static int __net_init tcp_sk_init(struct net *net) 2427static int __net_init tcp_sk_init(struct net *net)
2417{ 2428{
2418 int res, cpu; 2429 int res, cpu, cnt;
2419 2430
2420 net->ipv4.tcp_sk = alloc_percpu(struct sock *); 2431 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2421 if (!net->ipv4.tcp_sk) 2432 if (!net->ipv4.tcp_sk)
@@ -2452,6 +2463,14 @@ static int __net_init tcp_sk_init(struct net *net)
2452 net->ipv4.sysctl_tcp_orphan_retries = 0; 2463 net->ipv4.sysctl_tcp_orphan_retries = 0;
2453 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 2464 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2454 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 2465 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2466 net->ipv4.sysctl_tcp_tw_reuse = 0;
2467
2468 cnt = tcp_hashinfo.ehash_mask + 1;
2469 net->ipv4.tcp_death_row.sysctl_tw_recycle = 0;
2470 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2471 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2472
2473 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2455 2474
2456 return 0; 2475 return 0;
2457fail: 2476fail:
@@ -2462,7 +2481,7 @@ fail:
2462 2481
2463static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 2482static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2464{ 2483{
2465 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET); 2484 inet_twsk_purge(&tcp_hashinfo, AF_INET);
2466} 2485}
2467 2486
2468static struct pernet_operations __net_initdata tcp_sk_ops = { 2487static struct pernet_operations __net_initdata tcp_sk_ops = {
@@ -2473,7 +2492,6 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
2473 2492
2474void __init tcp_v4_init(void) 2493void __init tcp_v4_init(void)
2475{ 2494{
2476 inet_hashinfo_init(&tcp_hashinfo);
2477 if (register_pernet_subsys(&tcp_sk_ops)) 2495 if (register_pernet_subsys(&tcp_sk_ops))
2478 panic("Failed to create the TCP control socket.\n"); 2496 panic("Failed to create the TCP control socket.\n");
2479} 2497}
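Note on the tcp_sk_init() hunk above: the time-wait machinery now lives per network namespace, and its limits are derived from the established-connection hash size rather than from NR_FILE. A small stand-alone C sketch of that sizing, assuming a 64K-bucket ehash purely for illustration; only the (cnt + 1) / 2 and max(128, cnt / 256) expressions come from the hunk.

#include <stdio.h>

static unsigned int max_u32(unsigned int a, unsigned int b)
{
        return a > b ? a : b;
}

int main(void)
{
        /* ehash_mask is illustrative; the kernel sizes it at boot */
        unsigned int ehash_mask = 65536 - 1;
        unsigned int cnt = ehash_mask + 1;

        unsigned int max_tw_buckets  = (cnt + 1) / 2;            /* per-netns tw limit */
        unsigned int max_syn_backlog = max_u32(128, cnt / 256);  /* per-netns SYN backlog */

        printf("max_tw_buckets=%u max_syn_backlog=%u\n",
               max_tw_buckets, max_syn_backlog);
        return 0;
}

With a 64K ehash this yields 32768 time-wait buckets and a SYN backlog of 256 per namespace.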
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index c67ece1390c2..046fd3910873 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -316,6 +316,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample)
316static struct tcp_congestion_ops tcp_lp __read_mostly = { 316static struct tcp_congestion_ops tcp_lp __read_mostly = {
317 .init = tcp_lp_init, 317 .init = tcp_lp_init,
318 .ssthresh = tcp_reno_ssthresh, 318 .ssthresh = tcp_reno_ssthresh,
319 .undo_cwnd = tcp_reno_undo_cwnd,
319 .cong_avoid = tcp_lp_cong_avoid, 320 .cong_avoid = tcp_lp_cong_avoid,
320 .pkts_acked = tcp_lp_pkts_acked, 321 .pkts_acked = tcp_lp_pkts_acked,
321 322
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index bf1f3b2b29d1..0f46e5fe31ad 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -375,12 +375,10 @@ void tcp_update_metrics(struct sock *sk)
375 u32 val; 375 u32 val;
376 int m; 376 int m;
377 377
378 sk_dst_confirm(sk);
378 if (sysctl_tcp_nometrics_save || !dst) 379 if (sysctl_tcp_nometrics_save || !dst)
379 return; 380 return;
380 381
381 if (dst->flags & DST_HOST)
382 dst_confirm(dst);
383
384 rcu_read_lock(); 382 rcu_read_lock();
385 if (icsk->icsk_backoff || !tp->srtt_us) { 383 if (icsk->icsk_backoff || !tp->srtt_us) {
386 /* This session failed to estimate rtt. Why? 384 /* This session failed to estimate rtt. Why?
@@ -493,11 +491,10 @@ void tcp_init_metrics(struct sock *sk)
493 struct tcp_metrics_block *tm; 491 struct tcp_metrics_block *tm;
494 u32 val, crtt = 0; /* cached RTT scaled by 8 */ 492 u32 val, crtt = 0; /* cached RTT scaled by 8 */
495 493
494 sk_dst_confirm(sk);
496 if (!dst) 495 if (!dst)
497 goto reset; 496 goto reset;
498 497
499 dst_confirm(dst);
500
501 rcu_read_lock(); 498 rcu_read_lock();
502 tm = tcp_get_metrics(sk, dst, true); 499 tm = tcp_get_metrics(sk, dst, true);
503 if (!tm) { 500 if (!tm) {
@@ -522,7 +519,6 @@ void tcp_init_metrics(struct sock *sk)
522 val = tcp_metric_get(tm, TCP_METRIC_REORDERING); 519 val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
523 if (val && tp->reordering != val) { 520 if (val && tp->reordering != val) {
524 tcp_disable_fack(tp); 521 tcp_disable_fack(tp);
525 tcp_disable_early_retrans(tp);
526 tp->reordering = val; 522 tp->reordering = val;
527 } 523 }
528 524
@@ -606,7 +602,6 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
606 602
607 return ret; 603 return ret;
608} 604}
609EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
610 605
611void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst) 606void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
612{ 607{
@@ -742,14 +737,7 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
742 rcu_read_unlock(); 737 rcu_read_unlock();
743} 738}
744 739
745static struct genl_family tcp_metrics_nl_family = { 740static struct genl_family tcp_metrics_nl_family;
746 .id = GENL_ID_GENERATE,
747 .hdrsize = 0,
748 .name = TCP_METRICS_GENL_NAME,
749 .version = TCP_METRICS_GENL_VERSION,
750 .maxattr = TCP_METRICS_ATTR_MAX,
751 .netnsok = true,
752};
753 741
754static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = { 742static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
755 [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, }, 743 [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, },
@@ -1116,6 +1104,17 @@ static const struct genl_ops tcp_metrics_nl_ops[] = {
1116 }, 1104 },
1117}; 1105};
1118 1106
1107static struct genl_family tcp_metrics_nl_family __ro_after_init = {
1108 .hdrsize = 0,
1109 .name = TCP_METRICS_GENL_NAME,
1110 .version = TCP_METRICS_GENL_VERSION,
1111 .maxattr = TCP_METRICS_ATTR_MAX,
1112 .netnsok = true,
1113 .module = THIS_MODULE,
1114 .ops = tcp_metrics_nl_ops,
1115 .n_ops = ARRAY_SIZE(tcp_metrics_nl_ops),
1116};
1117
1119static unsigned int tcpmhash_entries; 1118static unsigned int tcpmhash_entries;
1120static int __init set_tcpmhash_entries(char *str) 1119static int __init set_tcpmhash_entries(char *str)
1121{ 1120{
@@ -1179,8 +1178,7 @@ void __init tcp_metrics_init(void)
1179 if (ret < 0) 1178 if (ret < 0)
1180 panic("Could not allocate the tcp_metrics hash table\n"); 1179 panic("Could not allocate the tcp_metrics hash table\n");
1181 1180
1182 ret = genl_register_family_with_ops(&tcp_metrics_nl_family, 1181 ret = genl_register_family(&tcp_metrics_nl_family);
1183 tcp_metrics_nl_ops);
1184 if (ret < 0) 1182 if (ret < 0)
1185 panic("Could not register tcp_metrics generic netlink\n"); 1183 panic("Could not register tcp_metrics generic netlink\n");
1186} 1184}
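The tcp_metrics hunks above follow the generic netlink API change that removed GENL_ID_GENERATE: the family is a plain static object carrying its ops through .ops/.n_ops, and a single genl_register_family() call registers it and allocates the id. A hedged sketch of that registration pattern follows; the "demo" identifiers are hypothetical, only the field names mirror the hunk.

#include <linux/kernel.h>
#include <linux/module.h>
#include <net/genetlink.h>

/* hypothetical no-op handler, just to have something to register */
static int demo_doit(struct sk_buff *skb, struct genl_info *info)
{
        return 0;
}

static const struct genl_ops demo_nl_ops[] = {
        {
                .cmd  = 1,
                .doit = demo_doit,
        },
};

static struct genl_family demo_nl_family __ro_after_init = {
        .hdrsize = 0,
        .name    = "demo_family",
        .version = 1,
        .maxattr = 0,
        .netnsok = true,
        .module  = THIS_MODULE,
        .ops     = demo_nl_ops,
        .n_ops   = ARRAY_SIZE(demo_nl_ops),
};

static int __init demo_init(void)
{
        /* the family id is allocated here instead of via GENL_ID_GENERATE */
        return genl_register_family(&demo_nl_family);
}

static void __exit demo_exit(void)
{
        genl_unregister_family(&demo_nl_family);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");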
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 6234ebaa7db1..65c0f3d13eca 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -29,12 +29,6 @@
29 29
30int sysctl_tcp_abort_on_overflow __read_mostly; 30int sysctl_tcp_abort_on_overflow __read_mostly;
31 31
32struct inet_timewait_death_row tcp_death_row = {
33 .sysctl_max_tw_buckets = NR_FILE * 2,
34 .hashinfo = &tcp_hashinfo,
35};
36EXPORT_SYMBOL_GPL(tcp_death_row);
37
38static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 32static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
39{ 33{
40 if (seq == s_win) 34 if (seq == s_win)
@@ -100,13 +94,15 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
100 struct tcp_options_received tmp_opt; 94 struct tcp_options_received tmp_opt;
101 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 95 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
102 bool paws_reject = false; 96 bool paws_reject = false;
97 struct inet_timewait_death_row *tcp_death_row = &sock_net((struct sock*)tw)->ipv4.tcp_death_row;
103 98
104 tmp_opt.saw_tstamp = 0; 99 tmp_opt.saw_tstamp = 0;
105 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { 100 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
106 tcp_parse_options(skb, &tmp_opt, 0, NULL); 101 tcp_parse_options(skb, &tmp_opt, 0, NULL);
107 102
108 if (tmp_opt.saw_tstamp) { 103 if (tmp_opt.saw_tstamp) {
109 tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; 104 if (tmp_opt.rcv_tsecr)
105 tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
110 tmp_opt.ts_recent = tcptw->tw_ts_recent; 106 tmp_opt.ts_recent = tcptw->tw_ts_recent;
111 tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 107 tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
112 paws_reject = tcp_paws_reject(&tmp_opt, th->rst); 108 paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
@@ -153,7 +149,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
153 tcptw->tw_ts_recent = tmp_opt.rcv_tsval; 149 tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
154 } 150 }
155 151
156 if (tcp_death_row.sysctl_tw_recycle && 152 if (tcp_death_row->sysctl_tw_recycle &&
157 tcptw->tw_ts_recent_stamp && 153 tcptw->tw_ts_recent_stamp &&
158 tcp_tw_remember_stamp(tw)) 154 tcp_tw_remember_stamp(tw))
159 inet_twsk_reschedule(tw, tw->tw_timeout); 155 inet_twsk_reschedule(tw, tw->tw_timeout);
@@ -264,11 +260,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
264 const struct tcp_sock *tp = tcp_sk(sk); 260 const struct tcp_sock *tp = tcp_sk(sk);
265 struct inet_timewait_sock *tw; 261 struct inet_timewait_sock *tw;
266 bool recycle_ok = false; 262 bool recycle_ok = false;
263 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
267 264
268 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) 265 if (tcp_death_row->sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
269 recycle_ok = tcp_remember_stamp(sk); 266 recycle_ok = tcp_remember_stamp(sk);
270 267
271 tw = inet_twsk_alloc(sk, &tcp_death_row, state); 268 tw = inet_twsk_alloc(sk, tcp_death_row, state);
272 269
273 if (tw) { 270 if (tw) {
274 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 271 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
@@ -364,15 +361,12 @@ void tcp_openreq_init_rwin(struct request_sock *req,
364{ 361{
365 struct inet_request_sock *ireq = inet_rsk(req); 362 struct inet_request_sock *ireq = inet_rsk(req);
366 const struct tcp_sock *tp = tcp_sk(sk_listener); 363 const struct tcp_sock *tp = tcp_sk(sk_listener);
367 u16 user_mss = READ_ONCE(tp->rx_opt.user_mss);
368 int full_space = tcp_full_space(sk_listener); 364 int full_space = tcp_full_space(sk_listener);
369 int mss = dst_metric_advmss(dst);
370 u32 window_clamp; 365 u32 window_clamp;
371 __u8 rcv_wscale; 366 __u8 rcv_wscale;
367 int mss;
372 368
373 if (user_mss && user_mss < mss) 369 mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
374 mss = user_mss;
375
376 window_clamp = READ_ONCE(tp->window_clamp); 370 window_clamp = READ_ONCE(tp->window_clamp);
377 /* Set this up on the first call only */ 371 /* Set this up on the first call only */
378 req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); 372 req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);
@@ -466,13 +460,13 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
466 newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); 460 newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
467 minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U); 461 minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U);
468 newicsk->icsk_rto = TCP_TIMEOUT_INIT; 462 newicsk->icsk_rto = TCP_TIMEOUT_INIT;
463 newicsk->icsk_ack.lrcvtime = tcp_time_stamp;
469 464
470 newtp->packets_out = 0; 465 newtp->packets_out = 0;
471 newtp->retrans_out = 0; 466 newtp->retrans_out = 0;
472 newtp->sacked_out = 0; 467 newtp->sacked_out = 0;
473 newtp->fackets_out = 0; 468 newtp->fackets_out = 0;
474 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 469 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
475 tcp_enable_early_retrans(newtp);
476 newtp->tlp_high_seq = 0; 470 newtp->tlp_high_seq = 0;
477 newtp->lsndtime = treq->snt_synack.stamp_jiffies; 471 newtp->lsndtime = treq->snt_synack.stamp_jiffies;
478 newsk->sk_txhash = treq->txhash; 472 newsk->sk_txhash = treq->txhash;
@@ -532,7 +526,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
532 newtp->rx_opt.ts_recent_stamp = 0; 526 newtp->rx_opt.ts_recent_stamp = 0;
533 newtp->tcp_header_len = sizeof(struct tcphdr); 527 newtp->tcp_header_len = sizeof(struct tcphdr);
534 } 528 }
535 newtp->tsoffset = 0; 529 newtp->tsoffset = treq->ts_off;
536#ifdef CONFIG_TCP_MD5SIG 530#ifdef CONFIG_TCP_MD5SIG
537 newtp->md5sig_info = NULL; /*XXX*/ 531 newtp->md5sig_info = NULL; /*XXX*/
538 if (newtp->af_specific->md5_lookup(sk, newsk)) 532 if (newtp->af_specific->md5_lookup(sk, newsk))
@@ -581,6 +575,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
581 575
582 if (tmp_opt.saw_tstamp) { 576 if (tmp_opt.saw_tstamp) {
583 tmp_opt.ts_recent = req->ts_recent; 577 tmp_opt.ts_recent = req->ts_recent;
578 if (tmp_opt.rcv_tsecr)
579 tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off;
584 /* We do not store true stamp, but it is not required, 580 /* We do not store true stamp, but it is not required,
585 * it can be estimated (approximately) 581 * it can be estimated (approximately)
586 * from another data. 582 * from another data.
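The ts_off changes above introduce a per-connection timestamp offset; the peer echoes the offset values back, so TSecr has to be de-offset before use, and only when an echo is actually present. A minimal stand-alone model of that adjustment (names are illustrative, the arithmetic is the same modulo-2^32 subtraction as in the hunks):

#include <stdint.h>

static uint32_t unoffset_tsecr(uint32_t rcv_tsecr, uint32_t ts_off)
{
        if (!rcv_tsecr)                 /* segment carried no TSecr echo */
                return 0;
        return rcv_tsecr - ts_off;      /* wraps modulo 2^32 by design */
}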
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 896e9dfbdb5c..c3c082ed3879 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -76,16 +76,15 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
76 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; 76 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
77 77
78 tp->packets_out += tcp_skb_pcount(skb); 78 tp->packets_out += tcp_skb_pcount(skb);
79 if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 79 if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
80 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
81 tcp_rearm_rto(sk); 80 tcp_rearm_rto(sk);
82 }
83 81
84 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, 82 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
85 tcp_skb_pcount(skb)); 83 tcp_skb_pcount(skb));
86} 84}
87 85
88/* SND.NXT, if window was not shrunk. 86/* SND.NXT, if window was not shrunk or the amount shrunk was less than one
87 * window scaling factor due to loss of precision.
89 * If window has been shrunk, what should we make? It is not clear at all. 88 * If window has been shrunk, what should we make? It is not clear at all.
90 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-( 89 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
91 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already 90 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
@@ -95,7 +94,9 @@ static inline __u32 tcp_acceptable_seq(const struct sock *sk)
95{ 94{
96 const struct tcp_sock *tp = tcp_sk(sk); 95 const struct tcp_sock *tp = tcp_sk(sk);
97 96
98 if (!before(tcp_wnd_end(tp), tp->snd_nxt)) 97 if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
98 (tp->rx_opt.wscale_ok &&
99 ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
99 return tp->snd_nxt; 100 return tp->snd_nxt;
100 else 101 else
101 return tcp_wnd_end(tp); 102 return tcp_wnd_end(tp);
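Why the extra condition in tcp_acceptable_seq() above: the advertised window travels in units of 1 << wscale, so rounding can make SND.NXT appear to sit just past the window edge even though the receiver never really shrank it. A small worked example with made-up numbers:

#include <stdio.h>

int main(void)
{
        unsigned int rcv_wscale = 7;            /* window carried in 128-byte units */
        unsigned int wnd_end    = 100100;       /* SND.UNA + SND.WND after rounding */
        unsigned int snd_nxt    = 100200;

        unsigned int overshoot = snd_nxt - wnd_end;

        if (overshoot < (1u << rcv_wscale))
                printf("overshoot %u < %u: SND.NXT is still acceptable\n",
                       overshoot, 1u << rcv_wscale);
        return 0;
}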
@@ -640,7 +641,7 @@ static unsigned int tcp_synack_options(struct request_sock *req,
640 } 641 }
641 if (likely(ireq->tstamp_ok)) { 642 if (likely(ireq->tstamp_ok)) {
642 opts->options |= OPTION_TS; 643 opts->options |= OPTION_TS;
643 opts->tsval = tcp_skb_timestamp(skb); 644 opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
644 opts->tsecr = req->ts_recent; 645 opts->tsecr = req->ts_recent;
645 remaining -= TCPOLEN_TSTAMP_ALIGNED; 646 remaining -= TCPOLEN_TSTAMP_ALIGNED;
646 } 647 }
@@ -769,25 +770,27 @@ static void tcp_tasklet_func(unsigned long data)
769 list_del(&tp->tsq_node); 770 list_del(&tp->tsq_node);
770 771
771 sk = (struct sock *)tp; 772 sk = (struct sock *)tp;
772 bh_lock_sock(sk); 773 smp_mb__before_atomic();
773 774 clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
774 if (!sock_owned_by_user(sk)) { 775
775 tcp_tsq_handler(sk); 776 if (!sk->sk_lock.owned &&
776 } else { 777 test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
777 /* defer the work to tcp_release_cb() */ 778 bh_lock_sock(sk);
778 set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); 779 if (!sock_owned_by_user(sk)) {
780 clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
781 tcp_tsq_handler(sk);
782 }
783 bh_unlock_sock(sk);
779 } 784 }
780 bh_unlock_sock(sk);
781 785
782 clear_bit(TSQ_QUEUED, &tp->tsq_flags);
783 sk_free(sk); 786 sk_free(sk);
784 } 787 }
785} 788}
786 789
787#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ 790#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
788 (1UL << TCP_WRITE_TIMER_DEFERRED) | \ 791 TCPF_WRITE_TIMER_DEFERRED | \
789 (1UL << TCP_DELACK_TIMER_DEFERRED) | \ 792 TCPF_DELACK_TIMER_DEFERRED | \
790 (1UL << TCP_MTU_REDUCED_DEFERRED)) 793 TCPF_MTU_REDUCED_DEFERRED)
791/** 794/**
792 * tcp_release_cb - tcp release_sock() callback 795 * tcp_release_cb - tcp release_sock() callback
793 * @sk: socket 796 * @sk: socket
@@ -797,18 +800,17 @@ static void tcp_tasklet_func(unsigned long data)
797 */ 800 */
798void tcp_release_cb(struct sock *sk) 801void tcp_release_cb(struct sock *sk)
799{ 802{
800 struct tcp_sock *tp = tcp_sk(sk);
801 unsigned long flags, nflags; 803 unsigned long flags, nflags;
802 804
803 /* perform an atomic operation only if at least one flag is set */ 805 /* perform an atomic operation only if at least one flag is set */
804 do { 806 do {
805 flags = tp->tsq_flags; 807 flags = sk->sk_tsq_flags;
806 if (!(flags & TCP_DEFERRED_ALL)) 808 if (!(flags & TCP_DEFERRED_ALL))
807 return; 809 return;
808 nflags = flags & ~TCP_DEFERRED_ALL; 810 nflags = flags & ~TCP_DEFERRED_ALL;
809 } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags); 811 } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
810 812
811 if (flags & (1UL << TCP_TSQ_DEFERRED)) 813 if (flags & TCPF_TSQ_DEFERRED)
812 tcp_tsq_handler(sk); 814 tcp_tsq_handler(sk);
813 815
814 /* Here begins the tricky part : 816 /* Here begins the tricky part :
@@ -822,15 +824,15 @@ void tcp_release_cb(struct sock *sk)
822 */ 824 */
823 sock_release_ownership(sk); 825 sock_release_ownership(sk);
824 826
825 if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) { 827 if (flags & TCPF_WRITE_TIMER_DEFERRED) {
826 tcp_write_timer_handler(sk); 828 tcp_write_timer_handler(sk);
827 __sock_put(sk); 829 __sock_put(sk);
828 } 830 }
829 if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) { 831 if (flags & TCPF_DELACK_TIMER_DEFERRED) {
830 tcp_delack_timer_handler(sk); 832 tcp_delack_timer_handler(sk);
831 __sock_put(sk); 833 __sock_put(sk);
832 } 834 }
833 if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { 835 if (flags & TCPF_MTU_REDUCED_DEFERRED) {
834 inet_csk(sk)->icsk_af_ops->mtu_reduced(sk); 836 inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
835 __sock_put(sk); 837 __sock_put(sk);
836 } 838 }
@@ -860,6 +862,7 @@ void tcp_wfree(struct sk_buff *skb)
860{ 862{
861 struct sock *sk = skb->sk; 863 struct sock *sk = skb->sk;
862 struct tcp_sock *tp = tcp_sk(sk); 864 struct tcp_sock *tp = tcp_sk(sk);
865 unsigned long flags, nval, oval;
863 int wmem; 866 int wmem;
864 867
865 /* Keep one reference on sk_wmem_alloc. 868 /* Keep one reference on sk_wmem_alloc.
@@ -877,16 +880,25 @@ void tcp_wfree(struct sk_buff *skb)
877 if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current) 880 if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
878 goto out; 881 goto out;
879 882
880 if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && 883 for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
881 !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
882 unsigned long flags;
883 struct tsq_tasklet *tsq; 884 struct tsq_tasklet *tsq;
885 bool empty;
886
887 if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
888 goto out;
889
890 nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
891 nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
892 if (nval != oval)
893 continue;
884 894
885 /* queue this socket to tasklet queue */ 895 /* queue this socket to tasklet queue */
886 local_irq_save(flags); 896 local_irq_save(flags);
887 tsq = this_cpu_ptr(&tsq_tasklet); 897 tsq = this_cpu_ptr(&tsq_tasklet);
898 empty = list_empty(&tsq->head);
888 list_add(&tp->tsq_node, &tsq->head); 899 list_add(&tp->tsq_node, &tsq->head);
889 tasklet_schedule(&tsq->tasklet); 900 if (empty)
901 tasklet_schedule(&tsq->tasklet);
890 local_irq_restore(flags); 902 local_irq_restore(flags);
891 return; 903 return;
892 } 904 }
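The tcp_wfree() hunk above replaces two separate test_and_{clear,set}_bit() calls with a single cmpxchg() loop over sk->sk_tsq_flags, so THROTTLED is cleared and QUEUED plus TSQ_DEFERRED are set in one atomic step. A user-space model of that loop using C11 atomics; the bit values are illustrative, only the retry pattern mirrors the hunk:

#include <stdatomic.h>
#include <stdbool.h>

#define TSQF_THROTTLED    (1UL << 0)
#define TSQF_QUEUED       (1UL << 1)
#define TCPF_TSQ_DEFERRED (1UL << 2)

static bool try_queue_for_tasklet(_Atomic unsigned long *tsq_flags)
{
        unsigned long oval = atomic_load(tsq_flags);
        unsigned long nval;

        do {
                /* nothing to do unless throttled and not yet queued */
                if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
                        return false;
                nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
                /* on failure, oval is reloaded with the current value */
        } while (!atomic_compare_exchange_weak(tsq_flags, &oval, nval));

        return true;    /* caller would now add the socket to the tasklet list */
}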
@@ -955,6 +967,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
955 */ 967 */
956 skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1); 968 skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
957 969
970 /* If we had to use memory reserve to allocate this skb,
971 * this might cause drops if packet is looped back :
972 * Other socket might not have SOCK_MEMALLOC.
973 * Packets not looped back do not care about pfmemalloc.
974 */
975 skb->pfmemalloc = 0;
976
958 skb_push(skb, tcp_header_size); 977 skb_push(skb, tcp_header_size);
959 skb_reset_transport_header(skb); 978 skb_reset_transport_header(skb);
960 979
@@ -964,6 +983,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
964 skb_set_hash_from_sk(skb, sk); 983 skb_set_hash_from_sk(skb, sk);
965 atomic_add(skb->truesize, &sk->sk_wmem_alloc); 984 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
966 985
986 skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
987
967 /* Build TCP header and checksum it. */ 988 /* Build TCP header and checksum it. */
968 th = (struct tcphdr *)skb->data; 989 th = (struct tcphdr *)skb->data;
969 th->source = inet->inet_sport; 990 th->source = inet->inet_sport;
@@ -1027,7 +1048,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1027 skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); 1048 skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
1028 1049
1029 /* Our usage of tstamp should remain private */ 1050 /* Our usage of tstamp should remain private */
1030 skb->tstamp.tv64 = 0; 1051 skb->tstamp = 0;
1031 1052
1032 /* Cleanup our debris for IP stacks */ 1053 /* Cleanup our debris for IP stacks */
1033 memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), 1054 memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
@@ -1514,6 +1535,18 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
1514 if (sysctl_tcp_slow_start_after_idle && 1535 if (sysctl_tcp_slow_start_after_idle &&
1515 (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto) 1536 (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
1516 tcp_cwnd_application_limited(sk); 1537 tcp_cwnd_application_limited(sk);
1538
1539 /* The following conditions together indicate the starvation
1540 * is caused by insufficient sender buffer:
1541 * 1) just sent some data (see tcp_write_xmit)
1542 * 2) not cwnd limited (this else condition)
1543 * 3) no more data to send (null tcp_send_head)
1544 * 4) application is hitting buffer limit (SOCK_NOSPACE)
1545 */
1546 if (!tcp_send_head(sk) && sk->sk_socket &&
1547 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
1548 (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1549 tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
1517 } 1550 }
1518} 1551}
1519 1552
@@ -1910,26 +1943,26 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk)
1910 */ 1943 */
1911static int tcp_mtu_probe(struct sock *sk) 1944static int tcp_mtu_probe(struct sock *sk)
1912{ 1945{
1913 struct tcp_sock *tp = tcp_sk(sk);
1914 struct inet_connection_sock *icsk = inet_csk(sk); 1946 struct inet_connection_sock *icsk = inet_csk(sk);
1947 struct tcp_sock *tp = tcp_sk(sk);
1915 struct sk_buff *skb, *nskb, *next; 1948 struct sk_buff *skb, *nskb, *next;
1916 struct net *net = sock_net(sk); 1949 struct net *net = sock_net(sk);
1917 int len;
1918 int probe_size; 1950 int probe_size;
1919 int size_needed; 1951 int size_needed;
1920 int copy; 1952 int copy, len;
1921 int mss_now; 1953 int mss_now;
1922 int interval; 1954 int interval;
1923 1955
1924 /* Not currently probing/verifying, 1956 /* Not currently probing/verifying,
1925 * not in recovery, 1957 * not in recovery,
1926 * have enough cwnd, and 1958 * have enough cwnd, and
1927 * not SACKing (the variable headers throw things off) */ 1959 * not SACKing (the variable headers throw things off)
1928 if (!icsk->icsk_mtup.enabled || 1960 */
1929 icsk->icsk_mtup.probe_size || 1961 if (likely(!icsk->icsk_mtup.enabled ||
1930 inet_csk(sk)->icsk_ca_state != TCP_CA_Open || 1962 icsk->icsk_mtup.probe_size ||
1931 tp->snd_cwnd < 11 || 1963 inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
1932 tp->rx_opt.num_sacks || tp->rx_opt.dsack) 1964 tp->snd_cwnd < 11 ||
1965 tp->rx_opt.num_sacks || tp->rx_opt.dsack))
1933 return -1; 1966 return -1;
1934 1967
1935 /* Use binary search for probe_size between tcp_mss_base, 1968 /* Use binary search for probe_size between tcp_mss_base,
@@ -2069,7 +2102,16 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
2069 limit <<= factor; 2102 limit <<= factor;
2070 2103
2071 if (atomic_read(&sk->sk_wmem_alloc) > limit) { 2104 if (atomic_read(&sk->sk_wmem_alloc) > limit) {
2072 set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags); 2105 /* Always send the 1st or 2nd skb in write queue.
2106 * No need to wait for TX completion to call us back,
2107 * after softirq/tasklet schedule.
2108 * This helps when TX completions are delayed too much.
2109 */
2110 if (skb == sk->sk_write_queue.next ||
2111 skb->prev == sk->sk_write_queue.next)
2112 return false;
2113
2114 set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
2073 /* It is possible TX completion already happened 2115 /* It is possible TX completion already happened
2074 * before we set TSQ_THROTTLED, so we must 2116 * before we set TSQ_THROTTLED, so we must
2075 * test again the condition. 2117 * test again the condition.
@@ -2081,6 +2123,47 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
2081 return false; 2123 return false;
2082} 2124}
2083 2125
2126static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
2127{
2128 const u32 now = tcp_time_stamp;
2129
2130 if (tp->chrono_type > TCP_CHRONO_UNSPEC)
2131 tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start;
2132 tp->chrono_start = now;
2133 tp->chrono_type = new;
2134}
2135
2136void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
2137{
2138 struct tcp_sock *tp = tcp_sk(sk);
2139
2140 /* If there are multiple conditions worthy of tracking in a
2141 * chronograph then the highest priority enum takes precedence
2142 * over the other conditions. So that if something "more interesting"
2143 * starts happening, stop the previous chrono and start a new one.
2144 */
2145 if (type > tp->chrono_type)
2146 tcp_chrono_set(tp, type);
2147}
2148
2149void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
2150{
2151 struct tcp_sock *tp = tcp_sk(sk);
2152
2153
2154 /* There are multiple conditions worthy of tracking in a
2155 * chronograph, so that the highest priority enum takes
2156 * precedence over the other conditions (see tcp_chrono_start).
2157 * If a condition stops, we only stop chrono tracking if
2158 * it's the "most interesting" or current chrono we are
2159 * tracking and starts busy chrono if we have pending data.
2160 */
2161 if (tcp_write_queue_empty(sk))
2162 tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
2163 else if (type == tp->chrono_type)
2164 tcp_chrono_set(tp, TCP_CHRONO_BUSY);
2165}
2166
2084/* This routine writes packets to the network. It advances the 2167/* This routine writes packets to the network. It advances the
2085 * send_head. This happens as incoming acks open up the remote 2168 * send_head. This happens as incoming acks open up the remote
2086 * window for us. 2169 * window for us.
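The chronograph helpers added above account the time a flow spends busy, receive-window limited, or send-buffer limited, with a higher-priority state preempting a lower one. A stand-alone model of that bookkeeping; the enum ordering is an assumption (the real definition lives in a header outside this diff) and the clock is a plain counter instead of tcp_time_stamp:

#include <stdbool.h>
#include <stdio.h>

enum chrono { CH_UNSPEC, CH_BUSY, CH_RWND_LIMITED, CH_SNDBUF_LIMITED, CH_MAX };

struct chrono_state {
        enum chrono type;
        unsigned int start;
        unsigned int stat[CH_MAX - 1];  /* accumulated time per state */
};

static void chrono_set(struct chrono_state *c, enum chrono new, unsigned int now)
{
        if (c->type > CH_UNSPEC)
                c->stat[c->type - 1] += now - c->start;
        c->start = now;
        c->type = new;
}

static void chrono_start(struct chrono_state *c, enum chrono t, unsigned int now)
{
        if (t > c->type)                /* higher priority state preempts */
                chrono_set(c, t, now);
}

static void chrono_stop(struct chrono_state *c, enum chrono t,
                        bool queue_empty, unsigned int now)
{
        if (queue_empty)
                chrono_set(c, CH_UNSPEC, now);
        else if (t == c->type)
                chrono_set(c, CH_BUSY, now);
}

int main(void)
{
        struct chrono_state c = { CH_UNSPEC, 0, { 0 } };

        chrono_start(&c, CH_BUSY, 0);                   /* data queued */
        chrono_start(&c, CH_RWND_LIMITED, 10);          /* peer window closed */
        chrono_stop(&c, CH_RWND_LIMITED, false, 25);    /* window opened, still busy */
        chrono_stop(&c, CH_BUSY, true, 40);             /* queue drained */

        printf("busy=%u rwnd=%u sndbuf=%u\n",
               c.stat[CH_BUSY - 1], c.stat[CH_RWND_LIMITED - 1],
               c.stat[CH_SNDBUF_LIMITED - 1]);
        return 0;
}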
@@ -2103,7 +2186,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2103 unsigned int tso_segs, sent_pkts; 2186 unsigned int tso_segs, sent_pkts;
2104 int cwnd_quota; 2187 int cwnd_quota;
2105 int result; 2188 int result;
2106 bool is_cwnd_limited = false; 2189 bool is_cwnd_limited = false, is_rwnd_limited = false;
2107 u32 max_segs; 2190 u32 max_segs;
2108 2191
2109 sent_pkts = 0; 2192 sent_pkts = 0;
@@ -2140,8 +2223,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2140 break; 2223 break;
2141 } 2224 }
2142 2225
2143 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) 2226 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
2227 is_rwnd_limited = true;
2144 break; 2228 break;
2229 }
2145 2230
2146 if (tso_segs == 1) { 2231 if (tso_segs == 1) {
2147 if (unlikely(!tcp_nagle_test(tp, skb, mss_now, 2232 if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
@@ -2167,6 +2252,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2167 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) 2252 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
2168 break; 2253 break;
2169 2254
2255 if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
2256 clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
2170 if (tcp_small_queue_check(sk, skb, 0)) 2257 if (tcp_small_queue_check(sk, skb, 0))
2171 break; 2258 break;
2172 2259
@@ -2186,6 +2273,11 @@ repair:
2186 break; 2273 break;
2187 } 2274 }
2188 2275
2276 if (is_rwnd_limited)
2277 tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
2278 else
2279 tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
2280
2189 if (likely(sent_pkts)) { 2281 if (likely(sent_pkts)) {
2190 if (tcp_in_cwnd_reduction(sk)) 2282 if (tcp_in_cwnd_reduction(sk))
2191 tp->prr_out += sent_pkts; 2283 tp->prr_out += sent_pkts;
@@ -2207,8 +2299,6 @@ bool tcp_schedule_loss_probe(struct sock *sk)
2207 u32 timeout, tlp_time_stamp, rto_time_stamp; 2299 u32 timeout, tlp_time_stamp, rto_time_stamp;
2208 u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); 2300 u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
2209 2301
2210 if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
2211 return false;
2212 /* No consecutive loss probes. */ 2302 /* No consecutive loss probes. */
2213 if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { 2303 if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
2214 tcp_rearm_rto(sk); 2304 tcp_rearm_rto(sk);
@@ -2227,8 +2317,9 @@ bool tcp_schedule_loss_probe(struct sock *sk)
2227 /* Schedule a loss probe in 2*RTT for SACK capable connections 2317 /* Schedule a loss probe in 2*RTT for SACK capable connections
2228 * in Open state, that are either limited by cwnd or application. 2318 * in Open state, that are either limited by cwnd or application.
2229 */ 2319 */
2230 if (sysctl_tcp_early_retrans < 3 || !tp->packets_out || 2320 if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) ||
2231 !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) 2321 !tp->packets_out || !tcp_is_sack(tp) ||
2322 icsk->icsk_ca_state != TCP_CA_Open)
2232 return false; 2323 return false;
2233 2324
2234 if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && 2325 if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
@@ -2436,9 +2527,11 @@ u32 __tcp_select_window(struct sock *sk)
2436 int full_space = min_t(int, tp->window_clamp, allowed_space); 2527 int full_space = min_t(int, tp->window_clamp, allowed_space);
2437 int window; 2528 int window;
2438 2529
2439 if (mss > full_space) 2530 if (unlikely(mss > full_space)) {
2440 mss = full_space; 2531 mss = full_space;
2441 2532 if (mss <= 0)
2533 return 0;
2534 }
2442 if (free_space < (full_space >> 1)) { 2535 if (free_space < (full_space >> 1)) {
2443 icsk->icsk_ack.quick = 0; 2536 icsk->icsk_ack.quick = 0;
2444 2537
@@ -2514,7 +2607,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
2514} 2607}
2515 2608
2516/* Collapses two adjacent SKB's during retransmission. */ 2609/* Collapses two adjacent SKB's during retransmission. */
2517static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) 2610static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
2518{ 2611{
2519 struct tcp_sock *tp = tcp_sk(sk); 2612 struct tcp_sock *tp = tcp_sk(sk);
2520 struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); 2613 struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
@@ -2525,13 +2618,17 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
2525 2618
2526 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); 2619 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
2527 2620
2621 if (next_skb_size) {
2622 if (next_skb_size <= skb_availroom(skb))
2623 skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
2624 next_skb_size);
2625 else if (!skb_shift(skb, next_skb, next_skb_size))
2626 return false;
2627 }
2528 tcp_highest_sack_combine(sk, next_skb, skb); 2628 tcp_highest_sack_combine(sk, next_skb, skb);
2529 2629
2530 tcp_unlink_write_queue(next_skb, sk); 2630 tcp_unlink_write_queue(next_skb, sk);
2531 2631
2532 skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
2533 next_skb_size);
2534
2535 if (next_skb->ip_summed == CHECKSUM_PARTIAL) 2632 if (next_skb->ip_summed == CHECKSUM_PARTIAL)
2536 skb->ip_summed = CHECKSUM_PARTIAL; 2633 skb->ip_summed = CHECKSUM_PARTIAL;
2537 2634
@@ -2560,6 +2657,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
2560 tcp_skb_collapse_tstamp(skb, next_skb); 2657 tcp_skb_collapse_tstamp(skb, next_skb);
2561 2658
2562 sk_wmem_free_skb(sk, next_skb); 2659 sk_wmem_free_skb(sk, next_skb);
2660 return true;
2563} 2661}
2564 2662
2565/* Check if coalescing SKBs is legal. */ 2663/* Check if coalescing SKBs is legal. */
@@ -2567,14 +2665,11 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
2567{ 2665{
2568 if (tcp_skb_pcount(skb) > 1) 2666 if (tcp_skb_pcount(skb) > 1)
2569 return false; 2667 return false;
2570 /* TODO: SACK collapsing could be used to remove this condition */
2571 if (skb_shinfo(skb)->nr_frags != 0)
2572 return false;
2573 if (skb_cloned(skb)) 2668 if (skb_cloned(skb))
2574 return false; 2669 return false;
2575 if (skb == tcp_send_head(sk)) 2670 if (skb == tcp_send_head(sk))
2576 return false; 2671 return false;
2577 /* Some heurestics for collapsing over SACK'd could be invented */ 2672 /* Some heuristics for collapsing over SACK'd could be invented */
2578 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) 2673 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
2579 return false; 2674 return false;
2580 2675
@@ -2612,16 +2707,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
2612 2707
2613 if (space < 0) 2708 if (space < 0)
2614 break; 2709 break;
2615 /* Punt if not enough space exists in the first SKB for
2616 * the data in the second
2617 */
2618 if (skb->len > skb_availroom(to))
2619 break;
2620 2710
2621 if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) 2711 if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
2622 break; 2712 break;
2623 2713
2624 tcp_collapse_retrans(sk, to); 2714 if (!tcp_collapse_retrans(sk, to))
2715 break;
2625 } 2716 }
2626} 2717}
2627 2718
@@ -2694,6 +2785,13 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
2694 if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) 2785 if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
2695 tcp_ecn_clear_syn(sk, skb); 2786 tcp_ecn_clear_syn(sk, skb);
2696 2787
2788 /* Update global and local TCP statistics. */
2789 segs = tcp_skb_pcount(skb);
2790 TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
2791 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
2792 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
2793 tp->total_retrans += segs;
2794
2697 /* make sure skb->data is aligned on arches that require it 2795 /* make sure skb->data is aligned on arches that require it
2698 * and check if ack-trimming & collapsing extended the headroom 2796 * and check if ack-trimming & collapsing extended the headroom
2699 * beyond what csum_start can cover. 2797 * beyond what csum_start can cover.
@@ -2711,14 +2809,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
2711 } 2809 }
2712 2810
2713 if (likely(!err)) { 2811 if (likely(!err)) {
2714 segs = tcp_skb_pcount(skb);
2715
2716 TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; 2812 TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
2717 /* Update global TCP statistics. */ 2813 } else if (err != -EBUSY) {
2718 TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs); 2814 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
2719 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
2720 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
2721 tp->total_retrans += segs;
2722 } 2815 }
2723 return err; 2816 return err;
2724} 2817}
@@ -2741,8 +2834,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
2741 if (!tp->retrans_stamp) 2834 if (!tp->retrans_stamp)
2742 tp->retrans_stamp = tcp_skb_timestamp(skb); 2835 tp->retrans_stamp = tcp_skb_timestamp(skb);
2743 2836
2744 } else if (err != -EBUSY) {
2745 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
2746 } 2837 }
2747 2838
2748 if (tp->undo_retrans < 0) 2839 if (tp->undo_retrans < 0)
@@ -2751,36 +2842,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
2751 return err; 2842 return err;
2752} 2843}
2753 2844
2754/* Check if we forward retransmits are possible in the current
2755 * window/congestion state.
2756 */
2757static bool tcp_can_forward_retransmit(struct sock *sk)
2758{
2759 const struct inet_connection_sock *icsk = inet_csk(sk);
2760 const struct tcp_sock *tp = tcp_sk(sk);
2761
2762 /* Forward retransmissions are possible only during Recovery. */
2763 if (icsk->icsk_ca_state != TCP_CA_Recovery)
2764 return false;
2765
2766 /* No forward retransmissions in Reno are possible. */
2767 if (tcp_is_reno(tp))
2768 return false;
2769
2770 /* Yeah, we have to make difficult choice between forward transmission
2771 * and retransmission... Both ways have their merits...
2772 *
2773 * For now we do not retransmit anything, while we have some new
2774 * segments to send. In the other cases, follow rule 3 for
2775 * NextSeg() specified in RFC3517.
2776 */
2777
2778 if (tcp_may_send_now(sk))
2779 return false;
2780
2781 return true;
2782}
2783
2784/* This gets called after a retransmit timeout, and the initially 2845/* This gets called after a retransmit timeout, and the initially
2785 * retransmitted data is acknowledged. It tries to continue 2846 * retransmitted data is acknowledged. It tries to continue
2786 * resending the rest of the retransmit queue, until either 2847 * resending the rest of the retransmit queue, until either
@@ -2795,24 +2856,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
2795 struct tcp_sock *tp = tcp_sk(sk); 2856 struct tcp_sock *tp = tcp_sk(sk);
2796 struct sk_buff *skb; 2857 struct sk_buff *skb;
2797 struct sk_buff *hole = NULL; 2858 struct sk_buff *hole = NULL;
2798 u32 max_segs, last_lost; 2859 u32 max_segs;
2799 int mib_idx; 2860 int mib_idx;
2800 int fwd_rexmitting = 0;
2801 2861
2802 if (!tp->packets_out) 2862 if (!tp->packets_out)
2803 return; 2863 return;
2804 2864
2805 if (!tp->lost_out)
2806 tp->retransmit_high = tp->snd_una;
2807
2808 if (tp->retransmit_skb_hint) { 2865 if (tp->retransmit_skb_hint) {
2809 skb = tp->retransmit_skb_hint; 2866 skb = tp->retransmit_skb_hint;
2810 last_lost = TCP_SKB_CB(skb)->end_seq;
2811 if (after(last_lost, tp->retransmit_high))
2812 last_lost = tp->retransmit_high;
2813 } else { 2867 } else {
2814 skb = tcp_write_queue_head(sk); 2868 skb = tcp_write_queue_head(sk);
2815 last_lost = tp->snd_una;
2816 } 2869 }
2817 2870
2818 max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); 2871 max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
@@ -2835,31 +2888,14 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
2835 */ 2888 */
2836 segs = min_t(int, segs, max_segs); 2889 segs = min_t(int, segs, max_segs);
2837 2890
2838 if (fwd_rexmitting) { 2891 if (tp->retrans_out >= tp->lost_out) {
2839begin_fwd: 2892 break;
2840 if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
2841 break;
2842 mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
2843
2844 } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
2845 tp->retransmit_high = last_lost;
2846 if (!tcp_can_forward_retransmit(sk))
2847 break;
2848 /* Backtrack if necessary to non-L'ed skb */
2849 if (hole) {
2850 skb = hole;
2851 hole = NULL;
2852 }
2853 fwd_rexmitting = 1;
2854 goto begin_fwd;
2855
2856 } else if (!(sacked & TCPCB_LOST)) { 2893 } else if (!(sacked & TCPCB_LOST)) {
2857 if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) 2894 if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
2858 hole = skb; 2895 hole = skb;
2859 continue; 2896 continue;
2860 2897
2861 } else { 2898 } else {
2862 last_lost = TCP_SKB_CB(skb)->end_seq;
2863 if (icsk->icsk_ca_state != TCP_CA_Loss) 2899 if (icsk->icsk_ca_state != TCP_CA_Loss)
2864 mib_idx = LINUX_MIB_TCPFASTRETRANS; 2900 mib_idx = LINUX_MIB_TCPFASTRETRANS;
2865 else 2901 else
@@ -2880,7 +2916,8 @@ begin_fwd:
2880 if (tcp_in_cwnd_reduction(sk)) 2916 if (tcp_in_cwnd_reduction(sk))
2881 tp->prr_out += tcp_skb_pcount(skb); 2917 tp->prr_out += tcp_skb_pcount(skb);
2882 2918
2883 if (skb == tcp_write_queue_head(sk)) 2919 if (skb == tcp_write_queue_head(sk) &&
2920 icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
2884 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 2921 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2885 inet_csk(sk)->icsk_rto, 2922 inet_csk(sk)->icsk_rto,
2886 TCP_RTO_MAX); 2923 TCP_RTO_MAX);
@@ -2962,6 +2999,8 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
2962{ 2999{
2963 struct sk_buff *skb; 3000 struct sk_buff *skb;
2964 3001
3002 TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
3003
2965 /* NOTE: No TCP options attached and we never retransmit this. */ 3004 /* NOTE: No TCP options attached and we never retransmit this. */
2966 skb = alloc_skb(MAX_TCP_HEADER, priority); 3005 skb = alloc_skb(MAX_TCP_HEADER, priority);
2967 if (!skb) { 3006 if (!skb) {
@@ -2977,8 +3016,6 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
2977 /* Send it off. */ 3016 /* Send it off. */
2978 if (tcp_transmit_skb(sk, skb, 0, priority)) 3017 if (tcp_transmit_skb(sk, skb, 0, priority))
2979 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); 3018 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
2980
2981 TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
2982} 3019}
2983 3020
2984/* Send a crossed SYN-ACK during socket establishment. 3021/* Send a crossed SYN-ACK during socket establishment.
@@ -3037,7 +3074,6 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
3037 struct sk_buff *skb; 3074 struct sk_buff *skb;
3038 int tcp_header_size; 3075 int tcp_header_size;
3039 struct tcphdr *th; 3076 struct tcphdr *th;
3040 u16 user_mss;
3041 int mss; 3077 int mss;
3042 3078
3043 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); 3079 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
@@ -3067,10 +3103,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
3067 } 3103 }
3068 skb_dst_set(skb, dst); 3104 skb_dst_set(skb, dst);
3069 3105
3070 mss = dst_metric_advmss(dst); 3106 mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
3071 user_mss = READ_ONCE(tp->rx_opt.user_mss);
3072 if (user_mss && user_mss < mss)
3073 mss = user_mss;
3074 3107
3075 memset(&opts, 0, sizeof(opts)); 3108 memset(&opts, 0, sizeof(opts));
3076#ifdef CONFIG_SYN_COOKIES 3109#ifdef CONFIG_SYN_COOKIES
@@ -3123,7 +3156,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
3123#endif 3156#endif
3124 3157
3125 /* Do not fool tcpdump (if any), clean our debris */ 3158 /* Do not fool tcpdump (if any), clean our debris */
3126 skb->tstamp.tv64 = 0; 3159 skb->tstamp = 0;
3127 return skb; 3160 return skb;
3128} 3161}
3129EXPORT_SYMBOL(tcp_make_synack); 3162EXPORT_SYMBOL(tcp_make_synack);
@@ -3176,9 +3209,7 @@ static void tcp_connect_init(struct sock *sk)
3176 3209
3177 if (!tp->window_clamp) 3210 if (!tp->window_clamp)
3178 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 3211 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
3179 tp->advmss = dst_metric_advmss(dst); 3212 tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
3180 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
3181 tp->advmss = tp->rx_opt.user_mss;
3182 3213
3183 tcp_initialize_rcv_mss(sk); 3214 tcp_initialize_rcv_mss(sk);
3184 3215
@@ -3244,31 +3275,19 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
3244{ 3275{
3245 struct tcp_sock *tp = tcp_sk(sk); 3276 struct tcp_sock *tp = tcp_sk(sk);
3246 struct tcp_fastopen_request *fo = tp->fastopen_req; 3277 struct tcp_fastopen_request *fo = tp->fastopen_req;
3247 int syn_loss = 0, space, err = 0; 3278 int space, err = 0;
3248 unsigned long last_syn_loss = 0;
3249 struct sk_buff *syn_data; 3279 struct sk_buff *syn_data;
3250 3280
3251 tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ 3281 tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
3252 tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie, 3282 if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
3253 &syn_loss, &last_syn_loss);
3254 /* Recurring FO SYN losses: revert to regular handshake temporarily */
3255 if (syn_loss > 1 &&
3256 time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
3257 fo->cookie.len = -1;
3258 goto fallback;
3259 }
3260
3261 if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
3262 fo->cookie.len = -1;
3263 else if (fo->cookie.len <= 0)
3264 goto fallback; 3283 goto fallback;
3265 3284
3266 /* MSS for SYN-data is based on cached MSS and bounded by PMTU and 3285 /* MSS for SYN-data is based on cached MSS and bounded by PMTU and
3267 * user-MSS. Reserve maximum option space for middleboxes that add 3286 * user-MSS. Reserve maximum option space for middleboxes that add
3268 * private TCP options. The cost is reduced data space in SYN :( 3287 * private TCP options. The cost is reduced data space in SYN :(
3269 */ 3288 */
3270 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp) 3289 tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
3271 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; 3290
3272 space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - 3291 space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
3273 MAX_TCP_OPTION_SPACE; 3292 MAX_TCP_OPTION_SPACE;
3274 3293
@@ -3300,6 +3319,8 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
3300 fo->copied = space; 3319 fo->copied = space;
3301 3320
3302 tcp_connect_queue_skb(sk, syn_data); 3321 tcp_connect_queue_skb(sk, syn_data);
3322 if (syn_data->len)
3323 tcp_chrono_start(sk, TCP_CHRONO_BUSY);
3303 3324
3304 err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); 3325 err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
3305 3326
@@ -3464,8 +3485,6 @@ void tcp_send_ack(struct sock *sk)
3464 /* We do not want pure acks influencing TCP Small Queues or fq/pacing 3485 /* We do not want pure acks influencing TCP Small Queues or fq/pacing
3465 * too much. 3486 * too much.
3466 * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784 3487 * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
3467 * We also avoid tcp_wfree() overhead (cache line miss accessing
3468 * tp->tsq_flags) by using regular sock_wfree()
3469 */ 3488 */
3470 skb_set_tcp_pure_ack(buff); 3489 skb_set_tcp_pure_ack(buff);
3471 3490
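Several hunks in tcp_output.c above (tcp_openreq_init_rwin, tcp_make_synack, tcp_connect_init, tcp_send_syn_data) replace the open-coded user_mss comparison with a tcp_mss_clamp() helper. Its definition lives in a header that is not part of this diff, so treat the following as an assumption: a sketch of what the helper most plausibly reduces to, matching the removed lines.

#include <stdint.h>

/* clamp an advertised MSS by a user-set TCP_MAXSEG, when one exists */
static uint16_t model_tcp_mss_clamp(uint16_t user_mss, uint16_t mss)
{
        if (user_mss && user_mss < mss)
                return user_mss;
        return mss;
}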
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index e36df4fcfeba..d8acbd9f477a 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -1,9 +1,33 @@
1#include <linux/tcp.h> 1#include <linux/tcp.h>
2#include <net/tcp.h> 2#include <net/tcp.h>
3 3
4int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS; 4int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION;
5 5
6/* Marks a packet lost, if some packet sent later has been (s)acked. 6static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
7{
8 struct tcp_sock *tp = tcp_sk(sk);
9
10 tcp_skb_mark_lost_uncond_verify(tp, skb);
11 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
12 /* Account for retransmits that are lost again */
13 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
14 tp->retrans_out -= tcp_skb_pcount(skb);
15 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
16 tcp_skb_pcount(skb));
17 }
18}
19
20static bool tcp_rack_sent_after(const struct skb_mstamp *t1,
21 const struct skb_mstamp *t2,
22 u32 seq1, u32 seq2)
23{
24 return skb_mstamp_after(t1, t2) ||
25 (t1->v64 == t2->v64 && after(seq1, seq2));
26}
27
28/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
29 *
30 * Marks a packet lost, if some packet sent later has been (s)acked.
7 * The underlying idea is similar to the traditional dupthresh and FACK 31 * The underlying idea is similar to the traditional dupthresh and FACK
8 * but they look at different metrics: 32 * but they look at different metrics:
9 * 33 *
@@ -16,31 +40,26 @@ int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
16 * is being more resilient to reordering by simply allowing some 40 * is being more resilient to reordering by simply allowing some
17 * "settling delay", instead of tweaking the dupthresh. 41 * "settling delay", instead of tweaking the dupthresh.
18 * 42 *
19 * The current version is only used after recovery starts but can be 43 * When tcp_rack_detect_loss() detects some packets are lost and we
20 * easily extended to detect the first loss. 44 * are not already in the CA_Recovery state, either tcp_rack_reo_timeout()
45 * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
46 * make us enter the CA_Recovery state.
21 */ 47 */
22int tcp_rack_mark_lost(struct sock *sk) 48static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now,
49 u32 *reo_timeout)
23{ 50{
24 struct tcp_sock *tp = tcp_sk(sk); 51 struct tcp_sock *tp = tcp_sk(sk);
25 struct sk_buff *skb; 52 struct sk_buff *skb;
26 u32 reo_wnd, prior_retrans = tp->retrans_out; 53 u32 reo_wnd;
27
28 if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
29 return 0;
30
31 /* Reset the advanced flag to avoid unnecessary queue scanning */
32 tp->rack.advanced = 0;
33 54
55 *reo_timeout = 0;
34 /* To be more reordering resilient, allow min_rtt/4 settling delay 56 /* To be more reordering resilient, allow min_rtt/4 settling delay
35 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed 57 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
36 * RTT because reordering is often a path property and less related 58 * RTT because reordering is often a path property and less related
37 * to queuing or delayed ACKs. 59 * to queuing or delayed ACKs.
38 *
39 * TODO: measure and adapt to the observed reordering delay, and
40 * use a timer to retransmit like the delayed early retransmit.
41 */ 60 */
42 reo_wnd = 1000; 61 reo_wnd = 1000;
43 if (tp->rack.reord && tcp_min_rtt(tp) != ~0U) 62 if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
44 reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); 63 reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
45 64
46 tcp_for_write_queue(skb, sk) { 65 tcp_for_write_queue(skb, sk) {
@@ -54,20 +73,29 @@ int tcp_rack_mark_lost(struct sock *sk)
54 scb->sacked & TCPCB_SACKED_ACKED) 73 scb->sacked & TCPCB_SACKED_ACKED)
55 continue; 74 continue;
56 75
57 if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) { 76 if (tcp_rack_sent_after(&tp->rack.mstamp, &skb->skb_mstamp,
77 tp->rack.end_seq, scb->end_seq)) {
78 /* Step 3 in draft-cheng-tcpm-rack-00.txt:
79 * A packet is lost if its elapsed time is beyond
80 * the recent RTT plus the reordering window.
81 */
82 u32 elapsed = skb_mstamp_us_delta(now,
83 &skb->skb_mstamp);
84 s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;
58 85
59 if (skb_mstamp_us_delta(&tp->rack.mstamp, 86 if (remaining < 0) {
60 &skb->skb_mstamp) <= reo_wnd) 87 tcp_rack_mark_skb_lost(sk, skb);
61 continue; 88 continue;
62
63 /* skb is lost if packet sent later is sacked */
64 tcp_skb_mark_lost_uncond_verify(tp, skb);
65 if (scb->sacked & TCPCB_SACKED_RETRANS) {
66 scb->sacked &= ~TCPCB_SACKED_RETRANS;
67 tp->retrans_out -= tcp_skb_pcount(skb);
68 NET_INC_STATS(sock_net(sk),
69 LINUX_MIB_TCPLOSTRETRANSMIT);
70 } 89 }
90
91 /* Skip ones marked lost but not yet retransmitted */
92 if ((scb->sacked & TCPCB_LOST) &&
93 !(scb->sacked & TCPCB_SACKED_RETRANS))
94 continue;
95
96 /* Record maximum wait time (+1 to avoid 0) */
97 *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
98
71 } else if (!(scb->sacked & TCPCB_RETRANS)) { 99 } else if (!(scb->sacked & TCPCB_RETRANS)) {
72 /* Original data are sent sequentially so stop early 100 /* Original data are sent sequentially so stop early
73 * b/c the rest are all sent after rack_sent 101 * b/c the rest are all sent after rack_sent
@@ -75,20 +103,43 @@ int tcp_rack_mark_lost(struct sock *sk)
75 break; 103 break;
76 } 104 }
77 } 105 }
78 return prior_retrans - tp->retrans_out;
79} 106}
80 107
81/* Record the most recently (re)sent time among the (s)acked packets */ 108void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now)
82void tcp_rack_advance(struct tcp_sock *tp, 109{
83 const struct skb_mstamp *xmit_time, u8 sacked) 110 struct tcp_sock *tp = tcp_sk(sk);
111 u32 timeout;
112
113 if (!tp->rack.advanced)
114 return;
115
116 /* Reset the advanced flag to avoid unnecessary queue scanning */
117 tp->rack.advanced = 0;
118 tcp_rack_detect_loss(sk, now, &timeout);
119 if (timeout) {
120 timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN);
121 inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
122 timeout, inet_csk(sk)->icsk_rto);
123 }
124}
125
126/* Record the most recently (re)sent time among the (s)acked packets
127 * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
128 * draft-cheng-tcpm-rack-00.txt
129 */
130void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
131 const struct skb_mstamp *xmit_time,
132 const struct skb_mstamp *ack_time)
84{ 133{
134 u32 rtt_us;
135
85 if (tp->rack.mstamp.v64 && 136 if (tp->rack.mstamp.v64 &&
86 !skb_mstamp_after(xmit_time, &tp->rack.mstamp)) 137 !tcp_rack_sent_after(xmit_time, &tp->rack.mstamp,
138 end_seq, tp->rack.end_seq))
87 return; 139 return;
88 140
141 rtt_us = skb_mstamp_us_delta(ack_time, xmit_time);
89 if (sacked & TCPCB_RETRANS) { 142 if (sacked & TCPCB_RETRANS) {
90 struct skb_mstamp now;
91
92 /* If the sacked packet was retransmitted, it's ambiguous 143 /* If the sacked packet was retransmitted, it's ambiguous
93 * whether the retransmission or the original (or the prior 144 * whether the retransmission or the original (or the prior
94 * retransmission) was sacked. 145 * retransmission) was sacked.
@@ -99,11 +150,35 @@ void tcp_rack_advance(struct tcp_sock *tp,
99 * so it's at least one RTT (i.e., retransmission is at least 150 * so it's at least one RTT (i.e., retransmission is at least
100 * an RTT later). 151 * an RTT later).
101 */ 152 */
102 skb_mstamp_get(&now); 153 if (rtt_us < tcp_min_rtt(tp))
103 if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp))
104 return; 154 return;
105 } 155 }
106 156 tp->rack.rtt_us = rtt_us;
107 tp->rack.mstamp = *xmit_time; 157 tp->rack.mstamp = *xmit_time;
158 tp->rack.end_seq = end_seq;
108 tp->rack.advanced = 1; 159 tp->rack.advanced = 1;
109} 160}
161
162/* We have waited long enough to accommodate reordering. Mark the expired
163 * packets lost and retransmit them.
164 */
165void tcp_rack_reo_timeout(struct sock *sk)
166{
167 struct tcp_sock *tp = tcp_sk(sk);
168 struct skb_mstamp now;
169 u32 timeout, prior_inflight;
170
171 skb_mstamp_get(&now);
172 prior_inflight = tcp_packets_in_flight(tp);
173 tcp_rack_detect_loss(sk, &now, &timeout);
174 if (prior_inflight != tcp_packets_in_flight(tp)) {
175 if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
176 tcp_enter_recovery(sk, false);
177 if (!inet_csk(sk)->icsk_ca_ops->cong_control)
178 tcp_cwnd_reduction(sk, 1, 0);
179 }
180 tcp_xmit_retransmit_queue(sk);
181 }
182 if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
183 tcp_rearm_rto(sk);
184}
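The rewritten tcp_rack_advance() above breaks timestamp ties with tcp_rack_sent_after(), whose body is not part of this hunk. A minimal sketch of such a comparator, assuming the skb_mstamp helpers used elsewhere in this file, could look like:

/* Hypothetical sketch, not taken from this hunk: prefer the later
 * transmit timestamp, and fall back to sequence numbers when two
 * packets were sent in the same skb_mstamp tick.
 */
static bool rack_sent_after_sketch(const struct skb_mstamp *t1,
				   const struct skb_mstamp *t2,
				   u32 seq1, u32 seq2)
{
	return skb_mstamp_after(t1, t2) ||
	       (t1->v64 == t2->v64 && after(seq1, seq2));
}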
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index bf5ea9e9bbc1..f2123075ce6e 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -15,6 +15,10 @@
15#define TCP_SCALABLE_AI_CNT 50U 15#define TCP_SCALABLE_AI_CNT 50U
16#define TCP_SCALABLE_MD_SCALE 3 16#define TCP_SCALABLE_MD_SCALE 3
17 17
18struct scalable {
19 u32 loss_cwnd;
20};
21
18static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) 22static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
19{ 23{
20 struct tcp_sock *tp = tcp_sk(sk); 24 struct tcp_sock *tp = tcp_sk(sk);
@@ -32,12 +36,23 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
32static u32 tcp_scalable_ssthresh(struct sock *sk) 36static u32 tcp_scalable_ssthresh(struct sock *sk)
33{ 37{
34 const struct tcp_sock *tp = tcp_sk(sk); 38 const struct tcp_sock *tp = tcp_sk(sk);
39 struct scalable *ca = inet_csk_ca(sk);
40
41 ca->loss_cwnd = tp->snd_cwnd;
35 42
36 return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); 43 return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
37} 44}
38 45
46static u32 tcp_scalable_cwnd_undo(struct sock *sk)
47{
48 const struct scalable *ca = inet_csk_ca(sk);
49
50 return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
51}
52
39static struct tcp_congestion_ops tcp_scalable __read_mostly = { 53static struct tcp_congestion_ops tcp_scalable __read_mostly = {
40 .ssthresh = tcp_scalable_ssthresh, 54 .ssthresh = tcp_scalable_ssthresh,
55 .undo_cwnd = tcp_scalable_cwnd_undo,
41 .cong_avoid = tcp_scalable_cong_avoid, 56 .cong_avoid = tcp_scalable_cong_avoid,
42 57
43 .owner = THIS_MODULE, 58 .owner = THIS_MODULE,
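The loss_cwnd bookkeeping added to tcp_scalable above is the same shape the Veno and YeAH hunks below use: record snd_cwnd in .ssthresh, and never let .undo_cwnd shrink the window. An illustrative sketch of that pattern, with placeholder names:

/* Illustrative only; foo_ca and the halving policy are placeholders. */
struct foo_ca {
	u32 loss_cwnd;		/* snd_cwnd recorded at the last loss */
};

static u32 foo_ssthresh(struct sock *sk)
{
	struct foo_ca *ca = inet_csk_ca(sk);

	ca->loss_cwnd = tcp_sk(sk)->snd_cwnd;	/* remember pre-loss cwnd */
	return max(tcp_sk(sk)->snd_cwnd >> 1, 2U);
}

static u32 foo_undo_cwnd(struct sock *sk)
{
	const struct foo_ca *ca = inet_csk_ca(sk);

	/* undo restores at least the pre-loss cwnd */
	return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
}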
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 3ea1cf804748..b2ab411c6d37 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -249,7 +249,8 @@ void tcp_delack_timer_handler(struct sock *sk)
249 249
250 sk_mem_reclaim_partial(sk); 250 sk_mem_reclaim_partial(sk);
251 251
252 if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) 252 if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
253 !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
253 goto out; 254 goto out;
254 255
255 if (time_after(icsk->icsk_ack.timeout, jiffies)) { 256 if (time_after(icsk->icsk_ack.timeout, jiffies)) {
@@ -310,7 +311,7 @@ static void tcp_delack_timer(unsigned long data)
310 inet_csk(sk)->icsk_ack.blocked = 1; 311 inet_csk(sk)->icsk_ack.blocked = 1;
311 __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); 312 __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
312 /* delegate our work to tcp_release_cb() */ 313

313 if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) 314 if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
314 sock_hold(sk); 315 sock_hold(sk);
315 } 316 }
316 bh_unlock_sock(sk); 317 bh_unlock_sock(sk);
@@ -552,7 +553,8 @@ void tcp_write_timer_handler(struct sock *sk)
552 struct inet_connection_sock *icsk = inet_csk(sk); 553 struct inet_connection_sock *icsk = inet_csk(sk);
553 int event; 554 int event;
554 555
555 if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) 556 if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
557 !icsk->icsk_pending)
556 goto out; 558 goto out;
557 559
558 if (time_after(icsk->icsk_timeout, jiffies)) { 560 if (time_after(icsk->icsk_timeout, jiffies)) {
@@ -563,8 +565,8 @@ void tcp_write_timer_handler(struct sock *sk)
563 event = icsk->icsk_pending; 565 event = icsk->icsk_pending;
564 566
565 switch (event) { 567 switch (event) {
566 case ICSK_TIME_EARLY_RETRANS: 568 case ICSK_TIME_REO_TIMEOUT:
567 tcp_resume_early_retransmit(sk); 569 tcp_rack_reo_timeout(sk);
568 break; 570 break;
569 case ICSK_TIME_LOSS_PROBE: 571 case ICSK_TIME_LOSS_PROBE:
570 tcp_send_loss_probe(sk); 572 tcp_send_loss_probe(sk);
@@ -592,7 +594,7 @@ static void tcp_write_timer(unsigned long data)
592 tcp_write_timer_handler(sk); 594 tcp_write_timer_handler(sk);
593 } else { 595 } else {
594 /* delegate our work to tcp_release_cb() */ 596 /* delegate our work to tcp_release_cb() */
595 if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) 597 if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
596 sock_hold(sk); 598 sock_hold(sk);
597 } 599 }
598 bh_unlock_sock(sk); 600 bh_unlock_sock(sk);
@@ -617,6 +619,7 @@ void tcp_set_keepalive(struct sock *sk, int val)
617 else if (!val) 619 else if (!val)
618 inet_csk_delete_keepalive_timer(sk); 620 inet_csk_delete_keepalive_timer(sk);
619} 621}
622EXPORT_SYMBOL_GPL(tcp_set_keepalive);
620 623
621 624
622static void tcp_keepalive_timer (unsigned long data) 625static void tcp_keepalive_timer (unsigned long data)
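The timer handlers above now test sk_state against a mask of TCPF_* flags instead of comparing with TCP_CLOSE alone. Since each TCPF_* constant is 1 shifted by the corresponding TCP_* state value, a single AND covers both CLOSE and LISTEN; a small sketch of the idiom, assuming the standard TCPF_* definitions:

/* Sketch: membership test for sk_state in a set of states. */
static inline bool sk_state_in_mask(int state, int mask)
{
	/* e.g. mask = TCPF_CLOSE | TCPF_LISTEN */
	return (1 << state) & mask;
}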
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 4c4bac1b5eab..218cfcc77650 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -307,6 +307,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
307static struct tcp_congestion_ops tcp_vegas __read_mostly = { 307static struct tcp_congestion_ops tcp_vegas __read_mostly = {
308 .init = tcp_vegas_init, 308 .init = tcp_vegas_init,
309 .ssthresh = tcp_reno_ssthresh, 309 .ssthresh = tcp_reno_ssthresh,
310 .undo_cwnd = tcp_reno_undo_cwnd,
310 .cong_avoid = tcp_vegas_cong_avoid, 311 .cong_avoid = tcp_vegas_cong_avoid,
311 .pkts_acked = tcp_vegas_pkts_acked, 312 .pkts_acked = tcp_vegas_pkts_acked,
312 .set_state = tcp_vegas_state, 313 .set_state = tcp_vegas_state,
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 40171e163cff..76005d4b8dfc 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -30,6 +30,7 @@ struct veno {
30 u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */ 30 u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */
31 u32 inc; /* decide whether to increase cwnd */ 31 u32 inc; /* decide whether to increase cwnd */
32 u32 diff; /* calculate the diff rate */ 32 u32 diff; /* calculate the diff rate */
 33 u32 loss_cwnd; /* cwnd when loss occurred */
33}; 34};
34 35
35/* There are several situations when we must "re-start" Veno: 36/* There are several situations when we must "re-start" Veno:
@@ -193,6 +194,7 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
193 const struct tcp_sock *tp = tcp_sk(sk); 194 const struct tcp_sock *tp = tcp_sk(sk);
194 struct veno *veno = inet_csk_ca(sk); 195 struct veno *veno = inet_csk_ca(sk);
195 196
197 veno->loss_cwnd = tp->snd_cwnd;
196 if (veno->diff < beta) 198 if (veno->diff < beta)
197 /* in "non-congestive state", cut cwnd by 1/5 */ 199 /* in "non-congestive state", cut cwnd by 1/5 */
198 return max(tp->snd_cwnd * 4 / 5, 2U); 200 return max(tp->snd_cwnd * 4 / 5, 2U);
@@ -201,9 +203,17 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
201 return max(tp->snd_cwnd >> 1U, 2U); 203 return max(tp->snd_cwnd >> 1U, 2U);
202} 204}
203 205
206static u32 tcp_veno_cwnd_undo(struct sock *sk)
207{
208 const struct veno *veno = inet_csk_ca(sk);
209
210 return max(tcp_sk(sk)->snd_cwnd, veno->loss_cwnd);
211}
212
204static struct tcp_congestion_ops tcp_veno __read_mostly = { 213static struct tcp_congestion_ops tcp_veno __read_mostly = {
205 .init = tcp_veno_init, 214 .init = tcp_veno_init,
206 .ssthresh = tcp_veno_ssthresh, 215 .ssthresh = tcp_veno_ssthresh,
216 .undo_cwnd = tcp_veno_cwnd_undo,
207 .cong_avoid = tcp_veno_cong_avoid, 217 .cong_avoid = tcp_veno_cong_avoid,
208 .pkts_acked = tcp_veno_pkts_acked, 218 .pkts_acked = tcp_veno_pkts_acked,
209 .set_state = tcp_veno_state, 219 .set_state = tcp_veno_state,
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 4b03a2e2a050..fed66dc0e0f5 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -278,6 +278,7 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = {
278 .init = tcp_westwood_init, 278 .init = tcp_westwood_init,
279 .ssthresh = tcp_reno_ssthresh, 279 .ssthresh = tcp_reno_ssthresh,
280 .cong_avoid = tcp_reno_cong_avoid, 280 .cong_avoid = tcp_reno_cong_avoid,
281 .undo_cwnd = tcp_reno_undo_cwnd,
281 .cwnd_event = tcp_westwood_event, 282 .cwnd_event = tcp_westwood_event,
282 .in_ack_event = tcp_westwood_ack, 283 .in_ack_event = tcp_westwood_ack,
283 .get_info = tcp_westwood_info, 284 .get_info = tcp_westwood_info,
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 9c5fc973267f..e6ff99c4bd3b 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -37,6 +37,7 @@ struct yeah {
37 u32 fast_count; 37 u32 fast_count;
38 38
39 u32 pkts_acked; 39 u32 pkts_acked;
40 u32 loss_cwnd;
40}; 41};
41 42
42static void tcp_yeah_init(struct sock *sk) 43static void tcp_yeah_init(struct sock *sk)
@@ -219,13 +220,22 @@ static u32 tcp_yeah_ssthresh(struct sock *sk)
219 220
220 yeah->fast_count = 0; 221 yeah->fast_count = 0;
221 yeah->reno_count = max(yeah->reno_count>>1, 2U); 222 yeah->reno_count = max(yeah->reno_count>>1, 2U);
223 yeah->loss_cwnd = tp->snd_cwnd;
222 224
223 return max_t(int, tp->snd_cwnd - reduction, 2); 225 return max_t(int, tp->snd_cwnd - reduction, 2);
224} 226}
225 227
228static u32 tcp_yeah_cwnd_undo(struct sock *sk)
229{
230 const struct yeah *yeah = inet_csk_ca(sk);
231
232 return max(tcp_sk(sk)->snd_cwnd, yeah->loss_cwnd);
233}
234
226static struct tcp_congestion_ops tcp_yeah __read_mostly = { 235static struct tcp_congestion_ops tcp_yeah __read_mostly = {
227 .init = tcp_yeah_init, 236 .init = tcp_yeah_init,
228 .ssthresh = tcp_yeah_ssthresh, 237 .ssthresh = tcp_yeah_ssthresh,
238 .undo_cwnd = tcp_yeah_cwnd_undo,
229 .cong_avoid = tcp_yeah_cong_avoid, 239 .cong_avoid = tcp_yeah_cong_avoid,
230 .set_state = tcp_vegas_state, 240 .set_state = tcp_vegas_state,
231 .cwnd_event = tcp_vegas_cwnd_event, 241 .cwnd_event = tcp_vegas_cwnd_event,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5bab6c3f7a2f..ea6e4cff9faf 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -79,7 +79,7 @@
79 79
80#define pr_fmt(fmt) "UDP: " fmt 80#define pr_fmt(fmt) "UDP: " fmt
81 81
82#include <asm/uaccess.h> 82#include <linux/uaccess.h>
83#include <asm/ioctls.h> 83#include <asm/ioctls.h>
84#include <linux/bootmem.h> 84#include <linux/bootmem.h>
85#include <linux/highmem.h> 85#include <linux/highmem.h>
@@ -134,14 +134,21 @@ EXPORT_SYMBOL(udp_memory_allocated);
134#define MAX_UDP_PORTS 65536 134#define MAX_UDP_PORTS 65536
135#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) 135#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
136 136
 137/* IPCB reference means this cannot be used from early demux */
138static bool udp_lib_exact_dif_match(struct net *net, struct sk_buff *skb)
139{
140#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
141 if (!net->ipv4.sysctl_udp_l3mdev_accept &&
142 skb && ipv4_l3mdev_skb(IPCB(skb)->flags))
143 return true;
144#endif
145 return false;
146}
147
137static int udp_lib_lport_inuse(struct net *net, __u16 num, 148static int udp_lib_lport_inuse(struct net *net, __u16 num,
138 const struct udp_hslot *hslot, 149 const struct udp_hslot *hslot,
139 unsigned long *bitmap, 150 unsigned long *bitmap,
140 struct sock *sk, 151 struct sock *sk, unsigned int log)
141 int (*saddr_comp)(const struct sock *sk1,
142 const struct sock *sk2,
143 bool match_wildcard),
144 unsigned int log)
145{ 152{
146 struct sock *sk2; 153 struct sock *sk2;
147 kuid_t uid = sock_i_uid(sk); 154 kuid_t uid = sock_i_uid(sk);
@@ -153,13 +160,18 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
153 (!sk2->sk_reuse || !sk->sk_reuse) && 160 (!sk2->sk_reuse || !sk->sk_reuse) &&
154 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || 161 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
155 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && 162 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
156 (!sk2->sk_reuseport || !sk->sk_reuseport || 163 inet_rcv_saddr_equal(sk, sk2, true)) {
157 rcu_access_pointer(sk->sk_reuseport_cb) || 164 if (sk2->sk_reuseport && sk->sk_reuseport &&
158 !uid_eq(uid, sock_i_uid(sk2))) && 165 !rcu_access_pointer(sk->sk_reuseport_cb) &&
159 saddr_comp(sk, sk2, true)) { 166 uid_eq(uid, sock_i_uid(sk2))) {
160 if (!bitmap) 167 if (!bitmap)
161 return 1; 168 return 0;
162 __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap); 169 } else {
170 if (!bitmap)
171 return 1;
172 __set_bit(udp_sk(sk2)->udp_port_hash >> log,
173 bitmap);
174 }
163 } 175 }
164 } 176 }
165 return 0; 177 return 0;
@@ -171,10 +183,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
171 */ 183 */
172static int udp_lib_lport_inuse2(struct net *net, __u16 num, 184static int udp_lib_lport_inuse2(struct net *net, __u16 num,
173 struct udp_hslot *hslot2, 185 struct udp_hslot *hslot2,
174 struct sock *sk, 186 struct sock *sk)
175 int (*saddr_comp)(const struct sock *sk1,
176 const struct sock *sk2,
177 bool match_wildcard))
178{ 187{
179 struct sock *sk2; 188 struct sock *sk2;
180 kuid_t uid = sock_i_uid(sk); 189 kuid_t uid = sock_i_uid(sk);
@@ -188,11 +197,14 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
188 (!sk2->sk_reuse || !sk->sk_reuse) && 197 (!sk2->sk_reuse || !sk->sk_reuse) &&
189 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || 198 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
190 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && 199 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
191 (!sk2->sk_reuseport || !sk->sk_reuseport || 200 inet_rcv_saddr_equal(sk, sk2, true)) {
192 rcu_access_pointer(sk->sk_reuseport_cb) || 201 if (sk2->sk_reuseport && sk->sk_reuseport &&
193 !uid_eq(uid, sock_i_uid(sk2))) && 202 !rcu_access_pointer(sk->sk_reuseport_cb) &&
194 saddr_comp(sk, sk2, true)) { 203 uid_eq(uid, sock_i_uid(sk2))) {
195 res = 1; 204 res = 0;
205 } else {
206 res = 1;
207 }
196 break; 208 break;
197 } 209 }
198 } 210 }
@@ -200,10 +212,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
200 return res; 212 return res;
201} 213}
202 214
203static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot, 215static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
204 int (*saddr_same)(const struct sock *sk1,
205 const struct sock *sk2,
206 bool match_wildcard))
207{ 216{
208 struct net *net = sock_net(sk); 217 struct net *net = sock_net(sk);
209 kuid_t uid = sock_i_uid(sk); 218 kuid_t uid = sock_i_uid(sk);
@@ -217,7 +226,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
217 (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) && 226 (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) &&
218 (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && 227 (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
219 sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && 228 sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
220 (*saddr_same)(sk, sk2, false)) { 229 inet_rcv_saddr_equal(sk, sk2, false)) {
221 return reuseport_add_sock(sk, sk2); 230 return reuseport_add_sock(sk, sk2);
222 } 231 }
223 } 232 }
@@ -233,14 +242,10 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
233 * 242 *
234 * @sk: socket struct in question 243 * @sk: socket struct in question
235 * @snum: port number to look up 244 * @snum: port number to look up
236 * @saddr_comp: AF-dependent comparison of bound local IP addresses
237 * @hash2_nulladdr: AF-dependent hash value in secondary hash chains, 245 * @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
238 * with NULL address 246 * with NULL address
239 */ 247 */
240int udp_lib_get_port(struct sock *sk, unsigned short snum, 248int udp_lib_get_port(struct sock *sk, unsigned short snum,
241 int (*saddr_comp)(const struct sock *sk1,
242 const struct sock *sk2,
243 bool match_wildcard),
244 unsigned int hash2_nulladdr) 249 unsigned int hash2_nulladdr)
245{ 250{
246 struct udp_hslot *hslot, *hslot2; 251 struct udp_hslot *hslot, *hslot2;
@@ -269,7 +274,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
269 bitmap_zero(bitmap, PORTS_PER_CHAIN); 274 bitmap_zero(bitmap, PORTS_PER_CHAIN);
270 spin_lock_bh(&hslot->lock); 275 spin_lock_bh(&hslot->lock);
271 udp_lib_lport_inuse(net, snum, hslot, bitmap, sk, 276 udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
272 saddr_comp, udptable->log); 277 udptable->log);
273 278
274 snum = first; 279 snum = first;
275 /* 280 /*
@@ -285,6 +290,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
285 snum += rand; 290 snum += rand;
286 } while (snum != first); 291 } while (snum != first);
287 spin_unlock_bh(&hslot->lock); 292 spin_unlock_bh(&hslot->lock);
293 cond_resched();
288 } while (++first != last); 294 } while (++first != last);
289 goto fail; 295 goto fail;
290 } else { 296 } else {
@@ -301,12 +307,11 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
301 if (hslot->count < hslot2->count) 307 if (hslot->count < hslot2->count)
302 goto scan_primary_hash; 308 goto scan_primary_hash;
303 309
304 exist = udp_lib_lport_inuse2(net, snum, hslot2, 310 exist = udp_lib_lport_inuse2(net, snum, hslot2, sk);
305 sk, saddr_comp);
306 if (!exist && (hash2_nulladdr != slot2)) { 311 if (!exist && (hash2_nulladdr != slot2)) {
307 hslot2 = udp_hashslot2(udptable, hash2_nulladdr); 312 hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
308 exist = udp_lib_lport_inuse2(net, snum, hslot2, 313 exist = udp_lib_lport_inuse2(net, snum, hslot2,
309 sk, saddr_comp); 314 sk);
310 } 315 }
311 if (exist) 316 if (exist)
312 goto fail_unlock; 317 goto fail_unlock;
@@ -314,8 +319,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
314 goto found; 319 goto found;
315 } 320 }
316scan_primary_hash: 321scan_primary_hash:
317 if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 322 if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 0))
318 saddr_comp, 0))
319 goto fail_unlock; 323 goto fail_unlock;
320 } 324 }
321found: 325found:
@@ -324,7 +328,7 @@ found:
324 udp_sk(sk)->udp_portaddr_hash ^= snum; 328 udp_sk(sk)->udp_portaddr_hash ^= snum;
325 if (sk_unhashed(sk)) { 329 if (sk_unhashed(sk)) {
326 if (sk->sk_reuseport && 330 if (sk->sk_reuseport &&
327 udp_reuseport_add_sock(sk, hslot, saddr_comp)) { 331 udp_reuseport_add_sock(sk, hslot)) {
328 inet_sk(sk)->inet_num = 0; 332 inet_sk(sk)->inet_num = 0;
329 udp_sk(sk)->udp_port_hash = 0; 333 udp_sk(sk)->udp_port_hash = 0;
330 udp_sk(sk)->udp_portaddr_hash ^= snum; 334 udp_sk(sk)->udp_portaddr_hash ^= snum;
@@ -356,24 +360,6 @@ fail:
356} 360}
357EXPORT_SYMBOL(udp_lib_get_port); 361EXPORT_SYMBOL(udp_lib_get_port);
358 362
359/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses
360 * match_wildcard == false: addresses must be exactly the same, i.e.
361 * 0.0.0.0 only equals to 0.0.0.0
362 */
363int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2,
364 bool match_wildcard)
365{
366 struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
367
368 if (!ipv6_only_sock(sk2)) {
369 if (inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)
370 return 1;
371 if (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr)
372 return match_wildcard;
373 }
374 return 0;
375}
376
377static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr, 363static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
378 unsigned int port) 364 unsigned int port)
379{ 365{
@@ -389,12 +375,13 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
389 375
390 /* precompute partial secondary hash */ 376 /* precompute partial secondary hash */
391 udp_sk(sk)->udp_portaddr_hash = hash2_partial; 377 udp_sk(sk)->udp_portaddr_hash = hash2_partial;
392 return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr); 378 return udp_lib_get_port(sk, snum, hash2_nulladdr);
393} 379}
394 380
395static int compute_score(struct sock *sk, struct net *net, 381static int compute_score(struct sock *sk, struct net *net,
396 __be32 saddr, __be16 sport, 382 __be32 saddr, __be16 sport,
397 __be32 daddr, unsigned short hnum, int dif) 383 __be32 daddr, unsigned short hnum, int dif,
384 bool exact_dif)
398{ 385{
399 int score; 386 int score;
400 struct inet_sock *inet; 387 struct inet_sock *inet;
@@ -425,7 +412,7 @@ static int compute_score(struct sock *sk, struct net *net,
425 score += 4; 412 score += 4;
426 } 413 }
427 414
428 if (sk->sk_bound_dev_if) { 415 if (sk->sk_bound_dev_if || exact_dif) {
429 if (sk->sk_bound_dev_if != dif) 416 if (sk->sk_bound_dev_if != dif)
430 return -1; 417 return -1;
431 score += 4; 418 score += 4;
@@ -450,7 +437,7 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
450/* called with rcu_read_lock() */ 437/* called with rcu_read_lock() */
451static struct sock *udp4_lib_lookup2(struct net *net, 438static struct sock *udp4_lib_lookup2(struct net *net,
452 __be32 saddr, __be16 sport, 439 __be32 saddr, __be16 sport,
453 __be32 daddr, unsigned int hnum, int dif, 440 __be32 daddr, unsigned int hnum, int dif, bool exact_dif,
454 struct udp_hslot *hslot2, 441 struct udp_hslot *hslot2,
455 struct sk_buff *skb) 442 struct sk_buff *skb)
456{ 443{
@@ -462,7 +449,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
462 badness = 0; 449 badness = 0;
463 udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { 450 udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
464 score = compute_score(sk, net, saddr, sport, 451 score = compute_score(sk, net, saddr, sport,
465 daddr, hnum, dif); 452 daddr, hnum, dif, exact_dif);
466 if (score > badness) { 453 if (score > badness) {
467 reuseport = sk->sk_reuseport; 454 reuseport = sk->sk_reuseport;
468 if (reuseport) { 455 if (reuseport) {
@@ -497,6 +484,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
497 unsigned short hnum = ntohs(dport); 484 unsigned short hnum = ntohs(dport);
498 unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); 485 unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
499 struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; 486 struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
487 bool exact_dif = udp_lib_exact_dif_match(net, skb);
500 int score, badness, matches = 0, reuseport = 0; 488 int score, badness, matches = 0, reuseport = 0;
501 u32 hash = 0; 489 u32 hash = 0;
502 490
@@ -509,7 +497,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
509 497
510 result = udp4_lib_lookup2(net, saddr, sport, 498 result = udp4_lib_lookup2(net, saddr, sport,
511 daddr, hnum, dif, 499 daddr, hnum, dif,
512 hslot2, skb); 500 exact_dif, hslot2, skb);
513 if (!result) { 501 if (!result) {
514 unsigned int old_slot2 = slot2; 502 unsigned int old_slot2 = slot2;
515 hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum); 503 hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
@@ -524,7 +512,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
524 512
525 result = udp4_lib_lookup2(net, saddr, sport, 513 result = udp4_lib_lookup2(net, saddr, sport,
526 daddr, hnum, dif, 514 daddr, hnum, dif,
527 hslot2, skb); 515 exact_dif, hslot2, skb);
528 } 516 }
529 return result; 517 return result;
530 } 518 }
@@ -533,7 +521,7 @@ begin:
533 badness = 0; 521 badness = 0;
534 sk_for_each_rcu(sk, &hslot->head) { 522 sk_for_each_rcu(sk, &hslot->head) {
535 score = compute_score(sk, net, saddr, sport, 523 score = compute_score(sk, net, saddr, sport,
536 daddr, hnum, dif); 524 daddr, hnum, dif, exact_dif);
537 if (score > badness) { 525 if (score > badness) {
538 reuseport = sk->sk_reuseport; 526 reuseport = sk->sk_reuseport;
539 if (reuseport) { 527 if (reuseport) {
@@ -580,7 +568,8 @@ EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb);
580 * Does increment socket refcount. 568 * Does increment socket refcount.
581 */ 569 */
582#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \ 570#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
583 IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) 571 IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) || \
572 IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
584struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, 573struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
585 __be32 daddr, __be16 dport, int dif) 574 __be32 daddr, __be16 dport, int dif)
586{ 575{
@@ -1019,7 +1008,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
1019 flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, 1008 flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
1020 RT_SCOPE_UNIVERSE, sk->sk_protocol, 1009 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1021 flow_flags, 1010 flow_flags,
1022 faddr, saddr, dport, inet->inet_sport); 1011 faddr, saddr, dport, inet->inet_sport,
1012 sk->sk_uid);
1023 1013
1024 security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); 1014 security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
1025 rt = ip_route_output_flow(net, fl4, sk); 1015 rt = ip_route_output_flow(net, fl4, sk);
@@ -1111,7 +1101,8 @@ out:
1111 return err; 1101 return err;
1112 1102
1113do_confirm: 1103do_confirm:
1114 dst_confirm(&rt->dst); 1104 if (msg->msg_flags & MSG_PROBE)
1105 dst_confirm_neigh(&rt->dst, &fl4->daddr);
1115 if (!(msg->msg_flags&MSG_PROBE) || len) 1106 if (!(msg->msg_flags&MSG_PROBE) || len)
1116 goto back_from_confirm; 1107 goto back_from_confirm;
1117 err = 0; 1108 err = 0;
@@ -1172,6 +1163,181 @@ out:
1172 return ret; 1163 return ret;
1173} 1164}
1174 1165
1166/* fully reclaim rmem/fwd memory allocated for skb */
1167static void udp_rmem_release(struct sock *sk, int size, int partial)
1168{
1169 struct udp_sock *up = udp_sk(sk);
1170 int amt;
1171
1172 if (likely(partial)) {
1173 up->forward_deficit += size;
1174 size = up->forward_deficit;
1175 if (size < (sk->sk_rcvbuf >> 2) &&
1176 !skb_queue_empty(&sk->sk_receive_queue))
1177 return;
1178 } else {
1179 size += up->forward_deficit;
1180 }
1181 up->forward_deficit = 0;
1182
1183 sk->sk_forward_alloc += size;
1184 amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
1185 sk->sk_forward_alloc -= amt;
1186
1187 if (amt)
1188 __sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);
1189
1190 atomic_sub(size, &sk->sk_rmem_alloc);
1191}
1192
1193/* Note: called with sk_receive_queue.lock held.
1194 * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch
1195 * This avoids a cache line miss while receive_queue lock is held.
1196 * Look at __udp_enqueue_schedule_skb() to find where this copy is done.
1197 */
1198void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
1199{
1200 udp_rmem_release(sk, skb->dev_scratch, 1);
1201}
1202EXPORT_SYMBOL(udp_skb_destructor);
1203
1204/* Idea of busylocks is to let producers grab an extra spinlock
1205 * to relieve pressure on the receive_queue spinlock shared by consumer.
1206 * Under flood, this means that only one producer can be in line
1207 * trying to acquire the receive_queue spinlock.
 1208 * These busylocks are allocated per cpu, instead of per socket
 1209 * (which would consume a cache line per socket).
1210 */
1211static int udp_busylocks_log __read_mostly;
1212static spinlock_t *udp_busylocks __read_mostly;
1213
1214static spinlock_t *busylock_acquire(void *ptr)
1215{
1216 spinlock_t *busy;
1217
1218 busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log);
1219 spin_lock(busy);
1220 return busy;
1221}
1222
1223static void busylock_release(spinlock_t *busy)
1224{
1225 if (busy)
1226 spin_unlock(busy);
1227}
1228
1229int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
1230{
1231 struct sk_buff_head *list = &sk->sk_receive_queue;
1232 int rmem, delta, amt, err = -ENOMEM;
1233 spinlock_t *busy = NULL;
1234 int size;
1235
1236 /* try to avoid the costly atomic add/sub pair when the receive
1237 * queue is full; always allow at least a packet
1238 */
1239 rmem = atomic_read(&sk->sk_rmem_alloc);
1240 if (rmem > sk->sk_rcvbuf)
1241 goto drop;
1242
 1243 /* Under memory pressure, it helps to hand udp_recvmsg()
 1244 * linear skbs:
1245 * - Reduce memory overhead and thus increase receive queue capacity
1246 * - Less cache line misses at copyout() time
1247 * - Less work at consume_skb() (less alien page frag freeing)
1248 */
1249 if (rmem > (sk->sk_rcvbuf >> 1)) {
1250 skb_condense(skb);
1251
1252 busy = busylock_acquire(sk);
1253 }
1254 size = skb->truesize;
1255 /* Copy skb->truesize into skb->dev_scratch to avoid a cache line miss
1256 * in udp_skb_destructor()
1257 */
1258 skb->dev_scratch = size;
1259
1260 /* we drop only if the receive buf is full and the receive
1261 * queue contains some other skb
1262 */
1263 rmem = atomic_add_return(size, &sk->sk_rmem_alloc);
1264 if (rmem > (size + sk->sk_rcvbuf))
1265 goto uncharge_drop;
1266
1267 spin_lock(&list->lock);
1268 if (size >= sk->sk_forward_alloc) {
1269 amt = sk_mem_pages(size);
1270 delta = amt << SK_MEM_QUANTUM_SHIFT;
1271 if (!__sk_mem_raise_allocated(sk, delta, amt, SK_MEM_RECV)) {
1272 err = -ENOBUFS;
1273 spin_unlock(&list->lock);
1274 goto uncharge_drop;
1275 }
1276
1277 sk->sk_forward_alloc += delta;
1278 }
1279
1280 sk->sk_forward_alloc -= size;
1281
1282 /* no need to setup a destructor, we will explicitly release the
1283 * forward allocated memory on dequeue
1284 */
1285 sock_skb_set_dropcount(sk, skb);
1286
1287 __skb_queue_tail(list, skb);
1288 spin_unlock(&list->lock);
1289
1290 if (!sock_flag(sk, SOCK_DEAD))
1291 sk->sk_data_ready(sk);
1292
1293 busylock_release(busy);
1294 return 0;
1295
1296uncharge_drop:
1297 atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1298
1299drop:
1300 atomic_inc(&sk->sk_drops);
1301 busylock_release(busy);
1302 return err;
1303}
1304EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
1305
1306void udp_destruct_sock(struct sock *sk)
1307{
1308 /* reclaim completely the forward allocated memory */
1309 unsigned int total = 0;
1310 struct sk_buff *skb;
1311
1312 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1313 total += skb->truesize;
1314 kfree_skb(skb);
1315 }
1316 udp_rmem_release(sk, total, 0);
1317
1318 inet_sock_destruct(sk);
1319}
1320EXPORT_SYMBOL_GPL(udp_destruct_sock);
1321
1322int udp_init_sock(struct sock *sk)
1323{
1324 sk->sk_destruct = udp_destruct_sock;
1325 return 0;
1326}
1327EXPORT_SYMBOL_GPL(udp_init_sock);
1328
1329void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
1330{
1331 if (unlikely(READ_ONCE(sk->sk_peek_off) >= 0)) {
1332 bool slow = lock_sock_fast(sk);
1333
1334 sk_peek_offset_bwd(sk, len);
1335 unlock_sock_fast(sk, slow);
1336 }
1337 consume_skb(skb);
1338}
1339EXPORT_SYMBOL_GPL(skb_consume_udp);
1340
1175/** 1341/**
1176 * first_packet_length - return length of first packet in receive queue 1342 * first_packet_length - return length of first packet in receive queue
1177 * @sk: socket 1343 * @sk: socket
@@ -1181,12 +1347,11 @@ out:
1181 */ 1347 */
1182static int first_packet_length(struct sock *sk) 1348static int first_packet_length(struct sock *sk)
1183{ 1349{
1184 struct sk_buff_head list_kill, *rcvq = &sk->sk_receive_queue; 1350 struct sk_buff_head *rcvq = &sk->sk_receive_queue;
1185 struct sk_buff *skb; 1351 struct sk_buff *skb;
1352 int total = 0;
1186 int res; 1353 int res;
1187 1354
1188 __skb_queue_head_init(&list_kill);
1189
1190 spin_lock_bh(&rcvq->lock); 1355 spin_lock_bh(&rcvq->lock);
1191 while ((skb = skb_peek(rcvq)) != NULL && 1356 while ((skb = skb_peek(rcvq)) != NULL &&
1192 udp_lib_checksum_complete(skb)) { 1357 udp_lib_checksum_complete(skb)) {
@@ -1196,18 +1361,13 @@ static int first_packet_length(struct sock *sk)
1196 IS_UDPLITE(sk)); 1361 IS_UDPLITE(sk));
1197 atomic_inc(&sk->sk_drops); 1362 atomic_inc(&sk->sk_drops);
1198 __skb_unlink(skb, rcvq); 1363 __skb_unlink(skb, rcvq);
1199 __skb_queue_tail(&list_kill, skb); 1364 total += skb->truesize;
1365 kfree_skb(skb);
1200 } 1366 }
1201 res = skb ? skb->len : -1; 1367 res = skb ? skb->len : -1;
1368 if (total)
1369 udp_rmem_release(sk, total, 1);
1202 spin_unlock_bh(&rcvq->lock); 1370 spin_unlock_bh(&rcvq->lock);
1203
1204 if (!skb_queue_empty(&list_kill)) {
1205 bool slow = lock_sock_fast(sk);
1206
1207 __skb_queue_purge(&list_kill);
1208 sk_mem_reclaim_partial(sk);
1209 unlock_sock_fast(sk, slow);
1210 }
1211 return res; 1371 return res;
1212} 1372}
1213 1373
@@ -1256,15 +1416,13 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
1256 int err; 1416 int err;
1257 int is_udplite = IS_UDPLITE(sk); 1417 int is_udplite = IS_UDPLITE(sk);
1258 bool checksum_valid = false; 1418 bool checksum_valid = false;
1259 bool slow;
1260 1419
1261 if (flags & MSG_ERRQUEUE) 1420 if (flags & MSG_ERRQUEUE)
1262 return ip_recv_error(sk, msg, len, addr_len); 1421 return ip_recv_error(sk, msg, len, addr_len);
1263 1422
1264try_again: 1423try_again:
1265 peeking = off = sk_peek_offset(sk, flags); 1424 peeking = off = sk_peek_offset(sk, flags);
1266 skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), 1425 skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
1267 &peeked, &off, &err);
1268 if (!skb) 1426 if (!skb)
1269 return err; 1427 return err;
1270 1428
@@ -1281,7 +1439,8 @@ try_again:
1281 * coverage checksum (UDP-Lite), do it before the copy. 1439 * coverage checksum (UDP-Lite), do it before the copy.
1282 */ 1440 */
1283 1441
1284 if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) { 1442 if (copied < ulen || peeking ||
1443 (is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
1285 checksum_valid = !udp_lib_checksum_complete(skb); 1444 checksum_valid = !udp_lib_checksum_complete(skb);
1286 if (!checksum_valid) 1445 if (!checksum_valid)
1287 goto csum_copy_err; 1446 goto csum_copy_err;
@@ -1297,13 +1456,12 @@ try_again:
1297 } 1456 }
1298 1457
1299 if (unlikely(err)) { 1458 if (unlikely(err)) {
1300 trace_kfree_skb(skb, udp_recvmsg);
1301 if (!peeked) { 1459 if (!peeked) {
1302 atomic_inc(&sk->sk_drops); 1460 atomic_inc(&sk->sk_drops);
1303 UDP_INC_STATS(sock_net(sk), 1461 UDP_INC_STATS(sock_net(sk),
1304 UDP_MIB_INERRORS, is_udplite); 1462 UDP_MIB_INERRORS, is_udplite);
1305 } 1463 }
1306 skb_free_datagram_locked(sk, skb); 1464 kfree_skb(skb);
1307 return err; 1465 return err;
1308 } 1466 }
1309 1467
@@ -1322,22 +1480,21 @@ try_again:
1322 *addr_len = sizeof(*sin); 1480 *addr_len = sizeof(*sin);
1323 } 1481 }
1324 if (inet->cmsg_flags) 1482 if (inet->cmsg_flags)
1325 ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr), off); 1483 ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off);
1326 1484
1327 err = copied; 1485 err = copied;
1328 if (flags & MSG_TRUNC) 1486 if (flags & MSG_TRUNC)
1329 err = ulen; 1487 err = ulen;
1330 1488
1331 __skb_free_datagram_locked(sk, skb, peeking ? -err : err); 1489 skb_consume_udp(sk, skb, peeking ? -err : err);
1332 return err; 1490 return err;
1333 1491
1334csum_copy_err: 1492csum_copy_err:
1335 slow = lock_sock_fast(sk); 1493 if (!__sk_queue_drop_skb(sk, skb, flags, udp_skb_destructor)) {
1336 if (!skb_kill_datagram(sk, skb, flags)) {
1337 UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); 1494 UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
1338 UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); 1495 UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
1339 } 1496 }
1340 unlock_sock_fast(sk, slow); 1497 kfree_skb(skb);
1341 1498
1342 /* starting over for a new packet, but check if we need to yield */ 1499 /* starting over for a new packet, but check if we need to yield */
1343 cond_resched(); 1500 cond_resched();
@@ -1463,9 +1620,11 @@ int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1463 sock_rps_save_rxhash(sk, skb); 1620 sock_rps_save_rxhash(sk, skb);
1464 sk_mark_napi_id(sk, skb); 1621 sk_mark_napi_id(sk, skb);
1465 sk_incoming_cpu_update(sk); 1622 sk_incoming_cpu_update(sk);
1623 } else {
1624 sk_mark_napi_id_once(sk, skb);
1466 } 1625 }
1467 1626
1468 rc = __sock_queue_rcv_skb(sk, skb); 1627 rc = __udp_enqueue_schedule_skb(sk, skb);
1469 if (rc < 0) { 1628 if (rc < 0) {
1470 int is_udplite = IS_UDPLITE(sk); 1629 int is_udplite = IS_UDPLITE(sk);
1471 1630
@@ -1480,7 +1639,6 @@ int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1480 } 1639 }
1481 1640
1482 return 0; 1641 return 0;
1483
1484} 1642}
1485 1643
1486static struct static_key udp_encap_needed __read_mostly; 1644static struct static_key udp_encap_needed __read_mostly;
@@ -1502,7 +1660,6 @@ EXPORT_SYMBOL(udp_encap_enable);
1502int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 1660int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1503{ 1661{
1504 struct udp_sock *up = udp_sk(sk); 1662 struct udp_sock *up = udp_sk(sk);
1505 int rc;
1506 int is_udplite = IS_UDPLITE(sk); 1663 int is_udplite = IS_UDPLITE(sk);
1507 1664
1508 /* 1665 /*
@@ -1589,25 +1746,9 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1589 goto drop; 1746 goto drop;
1590 1747
1591 udp_csum_pull_header(skb); 1748 udp_csum_pull_header(skb);
1592 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
1593 __UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
1594 is_udplite);
1595 goto drop;
1596 }
1597
1598 rc = 0;
1599 1749
1600 ipv4_pktinfo_prepare(sk, skb); 1750 ipv4_pktinfo_prepare(sk, skb);
1601 bh_lock_sock(sk); 1751 return __udp_queue_rcv_skb(sk, skb);
1602 if (!sock_owned_by_user(sk))
1603 rc = __udp_queue_rcv_skb(sk, skb);
1604 else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
1605 bh_unlock_sock(sk);
1606 goto drop;
1607 }
1608 bh_unlock_sock(sk);
1609
1610 return rc;
1611 1752
1612csum_error: 1753csum_error:
1613 __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); 1754 __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
@@ -2217,13 +2358,13 @@ struct proto udp_prot = {
2217 .connect = ip4_datagram_connect, 2358 .connect = ip4_datagram_connect,
2218 .disconnect = udp_disconnect, 2359 .disconnect = udp_disconnect,
2219 .ioctl = udp_ioctl, 2360 .ioctl = udp_ioctl,
2361 .init = udp_init_sock,
2220 .destroy = udp_destroy_sock, 2362 .destroy = udp_destroy_sock,
2221 .setsockopt = udp_setsockopt, 2363 .setsockopt = udp_setsockopt,
2222 .getsockopt = udp_getsockopt, 2364 .getsockopt = udp_getsockopt,
2223 .sendmsg = udp_sendmsg, 2365 .sendmsg = udp_sendmsg,
2224 .recvmsg = udp_recvmsg, 2366 .recvmsg = udp_recvmsg,
2225 .sendpage = udp_sendpage, 2367 .sendpage = udp_sendpage,
2226 .backlog_rcv = __udp_queue_rcv_skb,
2227 .release_cb = ip4_datagram_release_cb, 2368 .release_cb = ip4_datagram_release_cb,
2228 .hash = udp_lib_hash, 2369 .hash = udp_lib_hash,
2229 .unhash = udp_lib_unhash, 2370 .unhash = udp_lib_unhash,
@@ -2512,6 +2653,7 @@ EXPORT_SYMBOL(udp_flow_hashrnd);
2512void __init udp_init(void) 2653void __init udp_init(void)
2513{ 2654{
2514 unsigned long limit; 2655 unsigned long limit;
2656 unsigned int i;
2515 2657
2516 udp_table_init(&udp_table, "UDP"); 2658 udp_table_init(&udp_table, "UDP");
2517 limit = nr_free_buffer_pages() / 8; 2659 limit = nr_free_buffer_pages() / 8;
@@ -2522,4 +2664,13 @@ void __init udp_init(void)
2522 2664
2523 sysctl_udp_rmem_min = SK_MEM_QUANTUM; 2665 sysctl_udp_rmem_min = SK_MEM_QUANTUM;
2524 sysctl_udp_wmem_min = SK_MEM_QUANTUM; 2666 sysctl_udp_wmem_min = SK_MEM_QUANTUM;
2667
2668 /* 16 spinlocks per cpu */
2669 udp_busylocks_log = ilog2(nr_cpu_ids) + 4;
2670 udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log,
2671 GFP_KERNEL);
2672 if (!udp_busylocks)
2673 panic("UDP: failed to alloc udp_busylocks\n");
2674 for (i = 0; i < (1U << udp_busylocks_log); i++)
2675 spin_lock_init(udp_busylocks + i);
2525} 2676}
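udp_init() above sizes the busylock array as 1 << (ilog2(nr_cpu_ids) + 4), i.e. roughly 16 spinlocks per CPU rounded to a power of two (for example nr_cpu_ids = 8 gives log = 7, so 128 locks), and __udp_enqueue_schedule_skb() picks one by hashing the socket pointer. A minimal sketch of that selection, assuming hash_ptr() as used in the patch:

/* Sketch of busylock selection; locks/log mirror udp_busylocks and
 * udp_busylocks_log from the hunk above.
 */
static spinlock_t *pick_busylock(spinlock_t *locks, unsigned int log,
				 void *sk)
{
	return locks + hash_ptr(sk, log);
}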
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index ff450c2aad9b..59f10fe9782e 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -50,10 +50,11 @@ struct proto udplite_prot = {
50 .sendmsg = udp_sendmsg, 50 .sendmsg = udp_sendmsg,
51 .recvmsg = udp_recvmsg, 51 .recvmsg = udp_recvmsg,
52 .sendpage = udp_sendpage, 52 .sendpage = udp_sendpage,
53 .backlog_rcv = __udp_queue_rcv_skb,
54 .hash = udp_lib_hash, 53 .hash = udp_lib_hash,
55 .unhash = udp_lib_unhash, 54 .unhash = udp_lib_unhash,
56 .get_port = udp_v4_get_port, 55 .get_port = udp_v4_get_port,
56 .memory_allocated = &udp_memory_allocated,
57 .sysctl_mem = sysctl_udp_mem,
57 .obj_size = sizeof(struct udp_sock), 58 .obj_size = sizeof(struct udp_sock),
58 .h.udp_table = &udplite_table, 59 .h.udp_table = &udplite_table,
59#ifdef CONFIG_COMPAT 60#ifdef CONFIG_COMPAT
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 62e1e72db461..1fc684111ce6 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -40,6 +40,7 @@ drop:
40 40
41int xfrm4_transport_finish(struct sk_buff *skb, int async) 41int xfrm4_transport_finish(struct sk_buff *skb, int async)
42{ 42{
43 struct xfrm_offload *xo = xfrm_offload(skb);
43 struct iphdr *iph = ip_hdr(skb); 44 struct iphdr *iph = ip_hdr(skb);
44 45
45 iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol; 46 iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
@@ -53,6 +54,11 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
53 iph->tot_len = htons(skb->len); 54 iph->tot_len = htons(skb->len);
54 ip_send_check(iph); 55 ip_send_check(iph);
55 56
57 if (xo && (xo->flags & XFRM_GRO)) {
58 skb_mac_header_rebuild(skb);
59 return 0;
60 }
61
56 NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, 62 NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
57 dev_net(skb->dev), NULL, skb, skb->dev, NULL, 63 dev_net(skb->dev), NULL, skb, skb->dev, NULL,
58 xfrm4_rcv_encap_finish); 64 xfrm4_rcv_encap_finish);
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
index fd840c7d75ea..4acc0508c5eb 100644
--- a/net/ipv4/xfrm4_mode_transport.c
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -43,6 +43,7 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
43static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) 43static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
44{ 44{
45 int ihl = skb->data - skb_transport_header(skb); 45 int ihl = skb->data - skb_transport_header(skb);
46 struct xfrm_offload *xo = xfrm_offload(skb);
46 47
47 if (skb->transport_header != skb->network_header) { 48 if (skb->transport_header != skb->network_header) {
48 memmove(skb_transport_header(skb), 49 memmove(skb_transport_header(skb),
@@ -50,7 +51,8 @@ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
50 skb->network_header = skb->transport_header; 51 skb->network_header = skb->transport_header;
51 } 52 }
52 ip_hdr(skb)->tot_len = htons(skb->len + ihl); 53 ip_hdr(skb)->tot_len = htons(skb->len + ihl);
53 skb_reset_transport_header(skb); 54 if (!xo || !(xo->flags & XFRM_GRO))
55 skb_reset_transport_header(skb);
54 return 0; 56 return 0;
55} 57}
56 58
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 6a7ff6957535..71b4ecc195c7 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -17,8 +17,6 @@
17#include <net/ip.h> 17#include <net/ip.h>
18#include <net/l3mdev.h> 18#include <net/l3mdev.h>
19 19
20static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
21
22static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, 20static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
23 int tos, int oif, 21 int tos, int oif,
24 const xfrm_address_t *saddr, 22 const xfrm_address_t *saddr,
@@ -219,7 +217,7 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
219{ 217{
220 struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops); 218 struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
221 219
222 xfrm4_policy_afinfo.garbage_collect(net); 220 xfrm_garbage_collect_deferred(net);
223 return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); 221 return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
224} 222}
225 223
@@ -271,8 +269,7 @@ static struct dst_ops xfrm4_dst_ops_template = {
271 .gc_thresh = INT_MAX, 269 .gc_thresh = INT_MAX,
272}; 270};
273 271
274static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { 272static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
275 .family = AF_INET,
276 .dst_ops = &xfrm4_dst_ops_template, 273 .dst_ops = &xfrm4_dst_ops_template,
277 .dst_lookup = xfrm4_dst_lookup, 274 .dst_lookup = xfrm4_dst_lookup,
278 .get_saddr = xfrm4_get_saddr, 275 .get_saddr = xfrm4_get_saddr,
@@ -376,7 +373,7 @@ static struct pernet_operations __net_initdata xfrm4_net_ops = {
376 373
377static void __init xfrm4_policy_init(void) 374static void __init xfrm4_policy_init(void)
378{ 375{
379 xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); 376 xfrm_policy_register_afinfo(&xfrm4_policy_afinfo, AF_INET);
380} 377}
381 378
382void __init xfrm4_init(void) 379void __init xfrm4_init(void)
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index dccefa9d84cf..8dd0e6ab8606 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -188,9 +188,8 @@ static const struct net_protocol ipcomp4_protocol = {
188 .netns_ok = 1, 188 .netns_ok = 1,
189}; 189};
190 190
191static struct xfrm_input_afinfo xfrm4_input_afinfo = { 191static const struct xfrm_input_afinfo xfrm4_input_afinfo = {
192 .family = AF_INET, 192 .family = AF_INET,
193 .owner = THIS_MODULE,
194 .callback = xfrm4_rcv_cb, 193 .callback = xfrm4_rcv_cb,
195}; 194};
196 195
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 542074c00c78..d6660a8c0ea5 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -90,11 +90,3 @@ void __init xfrm4_state_init(void)
90{ 90{
91 xfrm_state_register_afinfo(&xfrm4_state_afinfo); 91 xfrm_state_register_afinfo(&xfrm4_state_afinfo);
92} 92}
93
94#if 0
95void __exit xfrm4_state_fini(void)
96{
97 xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
98}
99#endif /* 0 */
100
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 2343e4f2e0bf..e2afe677a9d9 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -75,6 +75,19 @@ config INET6_ESP
75 75
76 If unsure, say Y. 76 If unsure, say Y.
77 77
78config INET6_ESP_OFFLOAD
79 tristate "IPv6: ESP transformation offload"
80 depends on INET6_ESP
81 select XFRM_OFFLOAD
82 default n
83 ---help---
84 Support for ESP transformation offload. This makes sense
85 only if this system really does IPsec and want to do it
86 with high throughput. A typical desktop system does not
87 need it, even if it does IPsec.
88
89 If unsure, say N.
90
78config INET6_IPCOMP 91config INET6_IPCOMP
79 tristate "IPv6: IPComp transformation" 92 tristate "IPv6: IPComp transformation"
80 select INET6_XFRM_TUNNEL 93 select INET6_XFRM_TUNNEL
@@ -208,6 +221,7 @@ config IPV6_TUNNEL
208 tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)" 221 tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)"
209 select INET6_TUNNEL 222 select INET6_TUNNEL
210 select DST_CACHE 223 select DST_CACHE
224 select GRO_CELLS
211 ---help--- 225 ---help---
212 Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in 226 Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in
213 RFC 2473. 227 RFC 2473.
@@ -289,4 +303,39 @@ config IPV6_PIMSM_V2
289 Support for IPv6 PIM multicast routing protocol PIM-SMv2. 303 Support for IPv6 PIM multicast routing protocol PIM-SMv2.
290 If unsure, say N. 304 If unsure, say N.
291 305
306config IPV6_SEG6_LWTUNNEL
307 bool "IPv6: Segment Routing Header encapsulation support"
308 depends on IPV6
309 select LWTUNNEL
310 ---help---
311 Support for encapsulation of packets within an outer IPv6
312 header and a Segment Routing Header using the lightweight
313 tunnels mechanism.
314
315 If unsure, say N.
316
317config IPV6_SEG6_INLINE
 318 bool "IPv6: direct Segment Routing Header insertion"
319 depends on IPV6_SEG6_LWTUNNEL
320 ---help---
321 Support for direct insertion of the Segment Routing Header,
322 also known as inline mode. Be aware that direct insertion of
323 extension headers (as opposed to encapsulation) may break
324 multiple mechanisms such as PMTUD or IPSec AH. Use this feature
325 only if you know exactly what you are doing.
326
327 If unsure, say N.
328
329config IPV6_SEG6_HMAC
330 bool "IPv6: Segment Routing HMAC support"
331 depends on IPV6
332 select CRYPTO_HMAC
333 select CRYPTO_SHA1
334 select CRYPTO_SHA256
335 ---help---
336 Support for HMAC signature generation and verification
337 of SR-enabled packets.
338
339 If unsure, say N.
340
292endif # IPV6 341endif # IPV6
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index c174ccb340a1..217e9ff0e24b 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -9,7 +9,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
9 route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ 9 route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
10 raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \ 10 raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \
11 exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \ 11 exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \
12 udp_offload.o 12 udp_offload.o seg6.o
13 13
14ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o 14ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o
15 15
@@ -23,11 +23,14 @@ ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o
23ipv6-$(CONFIG_PROC_FS) += proc.o 23ipv6-$(CONFIG_PROC_FS) += proc.o
24ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o 24ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o
25ipv6-$(CONFIG_NETLABEL) += calipso.o 25ipv6-$(CONFIG_NETLABEL) += calipso.o
26ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o
27ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o
26 28
27ipv6-objs += $(ipv6-y) 29ipv6-objs += $(ipv6-y)
28 30
29obj-$(CONFIG_INET6_AH) += ah6.o 31obj-$(CONFIG_INET6_AH) += ah6.o
30obj-$(CONFIG_INET6_ESP) += esp6.o 32obj-$(CONFIG_INET6_ESP) += esp6.o
33obj-$(CONFIG_INET6_ESP_OFFLOAD) += esp6_offload.o
31obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o 34obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o
32obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o 35obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o
33obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o 36obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 4bc5ba3ae452..80ce478c4851 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -43,6 +43,7 @@
43#include <linux/errno.h> 43#include <linux/errno.h>
44#include <linux/types.h> 44#include <linux/types.h>
45#include <linux/kernel.h> 45#include <linux/kernel.h>
46#include <linux/sched/signal.h>
46#include <linux/socket.h> 47#include <linux/socket.h>
47#include <linux/sockios.h> 48#include <linux/sockios.h>
48#include <linux/net.h> 49#include <linux/net.h>
@@ -238,6 +239,12 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
238 .use_oif_addrs_only = 0, 239 .use_oif_addrs_only = 0,
239 .ignore_routes_with_linkdown = 0, 240 .ignore_routes_with_linkdown = 0,
240 .keep_addr_on_down = 0, 241 .keep_addr_on_down = 0,
242 .seg6_enabled = 0,
243#ifdef CONFIG_IPV6_SEG6_HMAC
244 .seg6_require_hmac = 0,
245#endif
246 .enhanced_dad = 1,
247 .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64,
241}; 248};
242 249
243static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { 250static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -284,6 +291,12 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
284 .use_oif_addrs_only = 0, 291 .use_oif_addrs_only = 0,
285 .ignore_routes_with_linkdown = 0, 292 .ignore_routes_with_linkdown = 0,
286 .keep_addr_on_down = 0, 293 .keep_addr_on_down = 0,
294 .seg6_enabled = 0,
295#ifdef CONFIG_IPV6_SEG6_HMAC
296 .seg6_require_hmac = 0,
297#endif
298 .enhanced_dad = 1,
299 .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64,
287}; 300};
288 301
289/* Check if a valid qdisc is available */ 302/* Check if a valid qdisc is available */
@@ -376,9 +389,9 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
376 memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf)); 389 memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf));
377 390
378 if (ndev->cnf.stable_secret.initialized) 391 if (ndev->cnf.stable_secret.initialized)
379 ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; 392 ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
380 else 393 else
381 ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64; 394 ndev->cnf.addr_gen_mode = ipv6_devconf_dflt.addr_gen_mode;
382 395
383 ndev->cnf.mtu6 = dev->mtu; 396 ndev->cnf.mtu6 = dev->mtu;
384 ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); 397 ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
@@ -2134,12 +2147,14 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev)
2134 case ARPHRD_SIT: 2147 case ARPHRD_SIT:
2135 return addrconf_ifid_sit(eui, dev); 2148 return addrconf_ifid_sit(eui, dev);
2136 case ARPHRD_IPGRE: 2149 case ARPHRD_IPGRE:
2150 case ARPHRD_TUNNEL:
2137 return addrconf_ifid_gre(eui, dev); 2151 return addrconf_ifid_gre(eui, dev);
2138 case ARPHRD_6LOWPAN: 2152 case ARPHRD_6LOWPAN:
2139 return addrconf_ifid_eui64(eui, dev); 2153 return addrconf_ifid_eui64(eui, dev);
2140 case ARPHRD_IEEE1394: 2154 case ARPHRD_IEEE1394:
2141 return addrconf_ifid_ieee1394(eui, dev); 2155 return addrconf_ifid_ieee1394(eui, dev);
2142 case ARPHRD_TUNNEL6: 2156 case ARPHRD_TUNNEL6:
2157 case ARPHRD_IP6GRE:
2143 return addrconf_ifid_ip6tnl(eui, dev); 2158 return addrconf_ifid_ip6tnl(eui, dev);
2144 } 2159 }
2145 return -1; 2160 return -1;
@@ -2377,8 +2392,8 @@ static void manage_tempaddrs(struct inet6_dev *idev,
2377 2392
2378static bool is_addr_mode_generate_stable(struct inet6_dev *idev) 2393static bool is_addr_mode_generate_stable(struct inet6_dev *idev)
2379{ 2394{
2380 return idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY || 2395 return idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY ||
2381 idev->addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM; 2396 idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM;
2382} 2397}
2383 2398
2384int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, 2399int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
@@ -3142,7 +3157,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
3142 3157
3143 ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); 3158 ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);
3144 3159
3145 switch (idev->addr_gen_mode) { 3160 switch (idev->cnf.addr_gen_mode) {
3146 case IN6_ADDR_GEN_MODE_RANDOM: 3161 case IN6_ADDR_GEN_MODE_RANDOM:
3147 ipv6_gen_mode_random_init(idev); 3162 ipv6_gen_mode_random_init(idev);
3148 /* fallthrough */ 3163 /* fallthrough */
@@ -3183,6 +3198,9 @@ static void addrconf_dev_config(struct net_device *dev)
3183 (dev->type != ARPHRD_IEEE1394) && 3198 (dev->type != ARPHRD_IEEE1394) &&
3184 (dev->type != ARPHRD_TUNNEL6) && 3199 (dev->type != ARPHRD_TUNNEL6) &&
3185 (dev->type != ARPHRD_6LOWPAN) && 3200 (dev->type != ARPHRD_6LOWPAN) &&
3201 (dev->type != ARPHRD_IP6GRE) &&
3202 (dev->type != ARPHRD_IPGRE) &&
3203 (dev->type != ARPHRD_TUNNEL) &&
3186 (dev->type != ARPHRD_NONE)) { 3204 (dev->type != ARPHRD_NONE)) {
3187 /* Alas, we support only Ethernet autoconfiguration. */ 3205 /* Alas, we support only Ethernet autoconfiguration. */
3188 return; 3206 return;
@@ -3194,8 +3212,8 @@ static void addrconf_dev_config(struct net_device *dev)
3194 3212
3195 /* this device type has no EUI support */ 3213 /* this device type has no EUI support */
3196 if (dev->type == ARPHRD_NONE && 3214 if (dev->type == ARPHRD_NONE &&
3197 idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) 3215 idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)
3198 idev->addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM; 3216 idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM;
3199 3217
3200 addrconf_addr_gen(idev, false); 3218 addrconf_addr_gen(idev, false);
3201} 3219}
@@ -3376,9 +3394,15 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
3376 } 3394 }
3377 3395
3378 if (idev) { 3396 if (idev) {
3379 if (idev->if_flags & IF_READY) 3397 if (idev->if_flags & IF_READY) {
3380 /* device is already configured. */ 3398 /* device is already configured -
3399 * but resend MLD reports, we might
3400 * have roamed and need to update
3401 * multicast snooping switches
3402 */
3403 ipv6_mc_up(idev);
3381 break; 3404 break;
3405 }
3382 idev->if_flags |= IF_READY; 3406 idev->if_flags |= IF_READY;
3383 } 3407 }
3384 3408
@@ -3602,14 +3626,19 @@ restart:
3602 INIT_LIST_HEAD(&del_list); 3626 INIT_LIST_HEAD(&del_list);
3603 list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) { 3627 list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
3604 struct rt6_info *rt = NULL; 3628 struct rt6_info *rt = NULL;
3629 bool keep;
3605 3630
3606 addrconf_del_dad_work(ifa); 3631 addrconf_del_dad_work(ifa);
3607 3632
3633 keep = keep_addr && (ifa->flags & IFA_F_PERMANENT) &&
3634 !addr_is_local(&ifa->addr);
3635 if (!keep)
3636 list_move(&ifa->if_list, &del_list);
3637
3608 write_unlock_bh(&idev->lock); 3638 write_unlock_bh(&idev->lock);
3609 spin_lock_bh(&ifa->lock); 3639 spin_lock_bh(&ifa->lock);
3610 3640
3611 if (keep_addr && (ifa->flags & IFA_F_PERMANENT) && 3641 if (keep) {
3612 !addr_is_local(&ifa->addr)) {
3613 /* set state to skip the notifier below */ 3642 /* set state to skip the notifier below */
3614 state = INET6_IFADDR_STATE_DEAD; 3643 state = INET6_IFADDR_STATE_DEAD;
3615 ifa->state = 0; 3644 ifa->state = 0;
@@ -3621,8 +3650,6 @@ restart:
3621 } else { 3650 } else {
3622 state = ifa->state; 3651 state = ifa->state;
3623 ifa->state = INET6_IFADDR_STATE_DEAD; 3652 ifa->state = INET6_IFADDR_STATE_DEAD;
3624
3625 list_move(&ifa->if_list, &del_list);
3626 } 3653 }
3627 3654
3628 spin_unlock_bh(&ifa->lock); 3655 spin_unlock_bh(&ifa->lock);
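Note on the hunk above: the keep-or-drop decision is now computed once, while idev->lock is still held, and the address is moved to del_list before the lock is dropped. A minimal standalone sketch of that predicate, with IFA_F_PERMANENT spelled out and addr_is_local() reduced to a boolean; illustrative only, not the kernel helpers:

#include <stdbool.h>
#include <stdio.h>

#define IFA_F_PERMANENT 0x80    /* assumed value, from <linux/if_addr.h> */

/* keep an address across link-down only if the sysctl asks for it,
 * the address is permanent, and it is not link-local/loopback */
static bool keep_on_down(bool keep_addr, unsigned int flags, bool is_local)
{
        return keep_addr && (flags & IFA_F_PERMANENT) && !is_local;
}

int main(void)
{
        printf("%d\n", keep_on_down(true, IFA_F_PERMANENT, false));    /* 1 */
        return 0;
}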
@@ -3727,12 +3754,21 @@ static void addrconf_dad_kick(struct inet6_ifaddr *ifp)
3727{ 3754{
3728 unsigned long rand_num; 3755 unsigned long rand_num;
3729 struct inet6_dev *idev = ifp->idev; 3756 struct inet6_dev *idev = ifp->idev;
3757 u64 nonce;
3730 3758
3731 if (ifp->flags & IFA_F_OPTIMISTIC) 3759 if (ifp->flags & IFA_F_OPTIMISTIC)
3732 rand_num = 0; 3760 rand_num = 0;
3733 else 3761 else
3734 rand_num = prandom_u32() % (idev->cnf.rtr_solicit_delay ? : 1); 3762 rand_num = prandom_u32() % (idev->cnf.rtr_solicit_delay ? : 1);
3735 3763
3764 nonce = 0;
3765 if (idev->cnf.enhanced_dad ||
3766 dev_net(idev->dev)->ipv6.devconf_all->enhanced_dad) {
3767 do
3768 get_random_bytes(&nonce, 6);
3769 while (nonce == 0);
3770 }
3771 ifp->dad_nonce = nonce;
3736 ifp->dad_probes = idev->cnf.dad_transmits; 3772 ifp->dad_probes = idev->cnf.dad_transmits;
3737 addrconf_mod_dad_work(ifp, rand_num); 3773 addrconf_mod_dad_work(ifp, rand_num);
3738} 3774}
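addrconf_dad_kick() now draws a nonzero 48-bit nonce whenever enhanced DAD is enabled, so the neighbour solicitation sent below can carry a nonce option and looped-back probes can be recognised. A rough userspace sketch of the same draw, with getrandom() standing in for get_random_bytes(); illustrative, not the kernel path:

#include <stdint.h>
#include <stdio.h>
#include <sys/random.h>

static uint64_t pick_dad_nonce(void)
{
        uint64_t nonce = 0;

        do {
                /* only six bytes are used, mirroring the hunk above */
                if (getrandom(&nonce, 6, 0) != 6)
                        return 0;       /* caller treats 0 as "no nonce" */
        } while (nonce == 0);

        return nonce;
}

int main(void)
{
        printf("dad nonce: 0x%012llx\n", (unsigned long long)pick_dad_nonce());
        return 0;
}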
@@ -3910,7 +3946,8 @@ static void addrconf_dad_work(struct work_struct *w)
3910 3946
3911 /* send a neighbour solicitation for our addr */ 3947 /* send a neighbour solicitation for our addr */
3912 addrconf_addr_solict_mult(&ifp->addr, &mcaddr); 3948 addrconf_addr_solict_mult(&ifp->addr, &mcaddr);
3913 ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any); 3949 ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any,
3950 ifp->dad_nonce);
3914out: 3951out:
3915 in6_ifa_put(ifp); 3952 in6_ifa_put(ifp);
3916 rtnl_unlock(); 3953 rtnl_unlock();
@@ -3989,6 +4026,12 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id)
3989 4026
3990 if (bump_id) 4027 if (bump_id)
3991 rt_genid_bump_ipv6(dev_net(dev)); 4028 rt_genid_bump_ipv6(dev_net(dev));
4029
4030 /* Make sure that a new temporary address will be created
4031 * before this temporary address becomes deprecated.
4032 */
4033 if (ifp->flags & IFA_F_TEMPORARY)
4034 addrconf_verify_rtnl();
3992} 4035}
3993 4036
3994static void addrconf_dad_run(struct inet6_dev *idev) 4037static void addrconf_dad_run(struct inet6_dev *idev)
@@ -4868,6 +4911,13 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
4868 struct net *net = dev_net(ifa->idev->dev); 4911 struct net *net = dev_net(ifa->idev->dev);
4869 int err = -ENOBUFS; 4912 int err = -ENOBUFS;
4870 4913
4914 /* Don't send DELADDR notification for TENTATIVE address,
4915 * since NEWADDR notification is sent only after removing
4916 * TENTATIVE flag.
4917 */
4918 if (ifa->flags & IFA_F_TENTATIVE && event == RTM_DELADDR)
4919 return;
4920
4871 skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); 4921 skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC);
4872 if (!skb) 4922 if (!skb)
4873 goto errout; 4923 goto errout;
@@ -4950,6 +5000,12 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
4950 array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast; 5000 array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast;
4951 array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na; 5001 array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na;
4952 array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down; 5002 array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down;
5003 array[DEVCONF_SEG6_ENABLED] = cnf->seg6_enabled;
5004#ifdef CONFIG_IPV6_SEG6_HMAC
5005 array[DEVCONF_SEG6_REQUIRE_HMAC] = cnf->seg6_require_hmac;
5006#endif
5007 array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad;
5008 array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode;
4953} 5009}
4954 5010
4955static inline size_t inet6_ifla6_size(void) 5011static inline size_t inet6_ifla6_size(void)
@@ -5061,7 +5117,7 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev,
5061 if (!nla) 5117 if (!nla)
5062 goto nla_put_failure; 5118 goto nla_put_failure;
5063 5119
5064 if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->addr_gen_mode)) 5120 if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode))
5065 goto nla_put_failure; 5121 goto nla_put_failure;
5066 5122
5067 read_lock_bh(&idev->lock); 5123 read_lock_bh(&idev->lock);
@@ -5179,6 +5235,26 @@ static int inet6_validate_link_af(const struct net_device *dev,
5179 return nla_parse_nested(tb, IFLA_INET6_MAX, nla, inet6_af_policy); 5235 return nla_parse_nested(tb, IFLA_INET6_MAX, nla, inet6_af_policy);
5180} 5236}
5181 5237
5238static int check_addr_gen_mode(int mode)
5239{
5240 if (mode != IN6_ADDR_GEN_MODE_EUI64 &&
5241 mode != IN6_ADDR_GEN_MODE_NONE &&
5242 mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
5243 mode != IN6_ADDR_GEN_MODE_RANDOM)
5244 return -EINVAL;
5245 return 1;
5246}
5247
5248static int check_stable_privacy(struct inet6_dev *idev, struct net *net,
5249 int mode)
5250{
5251 if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
5252 !idev->cnf.stable_secret.initialized &&
5253 !net->ipv6.devconf_dflt->stable_secret.initialized)
5254 return -EINVAL;
5255 return 1;
5256}
5257
5182static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) 5258static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla)
5183{ 5259{
5184 int err = -EINVAL; 5260 int err = -EINVAL;
@@ -5200,18 +5276,11 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla)
5200 if (tb[IFLA_INET6_ADDR_GEN_MODE]) { 5276 if (tb[IFLA_INET6_ADDR_GEN_MODE]) {
5201 u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]); 5277 u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]);
5202 5278
5203 if (mode != IN6_ADDR_GEN_MODE_EUI64 && 5279 if (check_addr_gen_mode(mode) < 0 ||
5204 mode != IN6_ADDR_GEN_MODE_NONE && 5280 check_stable_privacy(idev, dev_net(dev), mode) < 0)
5205 mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
5206 mode != IN6_ADDR_GEN_MODE_RANDOM)
5207 return -EINVAL;
5208
5209 if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
5210 !idev->cnf.stable_secret.initialized &&
5211 !dev_net(dev)->ipv6.devconf_dflt->stable_secret.initialized)
5212 return -EINVAL; 5281 return -EINVAL;
5213 5282
5214 idev->addr_gen_mode = mode; 5283 idev->cnf.addr_gen_mode = mode;
5215 err = 0; 5284 err = 0;
5216 } 5285 }
5217 5286
@@ -5515,8 +5584,7 @@ static void addrconf_disable_change(struct net *net, __s32 newf)
5515 struct net_device *dev; 5584 struct net_device *dev;
5516 struct inet6_dev *idev; 5585 struct inet6_dev *idev;
5517 5586
5518 rcu_read_lock(); 5587 for_each_netdev(net, dev) {
5519 for_each_netdev_rcu(net, dev) {
5520 idev = __in6_dev_get(dev); 5588 idev = __in6_dev_get(dev);
5521 if (idev) { 5589 if (idev) {
5522 int changed = (!idev->cnf.disable_ipv6) ^ (!newf); 5590 int changed = (!idev->cnf.disable_ipv6) ^ (!newf);
@@ -5525,7 +5593,6 @@ static void addrconf_disable_change(struct net *net, __s32 newf)
5525 dev_disable_change(idev); 5593 dev_disable_change(idev);
5526 } 5594 }
5527 } 5595 }
5528 rcu_read_unlock();
5529} 5596}
5530 5597
5531static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf) 5598static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf)
@@ -5620,6 +5687,55 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write,
5620 return ret; 5687 return ret;
5621} 5688}
5622 5689
5690static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write,
5691 void __user *buffer, size_t *lenp,
5692 loff_t *ppos)
5693{
5694 int ret = 0;
5695 int new_val;
5696 struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1;
5697 struct net *net = (struct net *)ctl->extra2;
5698
5699 if (!rtnl_trylock())
5700 return restart_syscall();
5701
5702 ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5703
5704 if (write) {
5705 new_val = *((int *)ctl->data);
5706
5707 if (check_addr_gen_mode(new_val) < 0) {
5708 ret = -EINVAL;
5709 goto out;
5710 }
5711
5712 /* request for default */
5713 if (&net->ipv6.devconf_dflt->addr_gen_mode == ctl->data) {
5714 ipv6_devconf_dflt.addr_gen_mode = new_val;
5715
5716 /* request for individual net device */
5717 } else {
5718 if (!idev)
5719 goto out;
5720
5721 if (check_stable_privacy(idev, net, new_val) < 0) {
5722 ret = -EINVAL;
5723 goto out;
5724 }
5725
5726 if (idev->cnf.addr_gen_mode != new_val) {
5727 idev->cnf.addr_gen_mode = new_val;
5728 addrconf_dev_config(idev->dev);
5729 }
5730 }
5731 }
5732
5733out:
5734 rtnl_unlock();
5735
5736 return ret;
5737}
5738
5623static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, 5739static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write,
5624 void __user *buffer, size_t *lenp, 5740 void __user *buffer, size_t *lenp,
5625 loff_t *ppos) 5741 loff_t *ppos)
@@ -5670,14 +5786,14 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write,
5670 struct inet6_dev *idev = __in6_dev_get(dev); 5786 struct inet6_dev *idev = __in6_dev_get(dev);
5671 5787
5672 if (idev) { 5788 if (idev) {
5673 idev->addr_gen_mode = 5789 idev->cnf.addr_gen_mode =
5674 IN6_ADDR_GEN_MODE_STABLE_PRIVACY; 5790 IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
5675 } 5791 }
5676 } 5792 }
5677 } else { 5793 } else {
5678 struct inet6_dev *idev = ctl->extra1; 5794 struct inet6_dev *idev = ctl->extra1;
5679 5795
5680 idev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; 5796 idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
5681 } 5797 }
5682 5798
5683out: 5799out:
@@ -6042,6 +6158,36 @@ static const struct ctl_table addrconf_sysctl[] = {
6042 6158
6043 }, 6159 },
6044 { 6160 {
6161 .procname = "seg6_enabled",
6162 .data = &ipv6_devconf.seg6_enabled,
6163 .maxlen = sizeof(int),
6164 .mode = 0644,
6165 .proc_handler = proc_dointvec,
6166 },
6167#ifdef CONFIG_IPV6_SEG6_HMAC
6168 {
6169 .procname = "seg6_require_hmac",
6170 .data = &ipv6_devconf.seg6_require_hmac,
6171 .maxlen = sizeof(int),
6172 .mode = 0644,
6173 .proc_handler = proc_dointvec,
6174 },
6175#endif
6176 {
6177 .procname = "enhanced_dad",
6178 .data = &ipv6_devconf.enhanced_dad,
6179 .maxlen = sizeof(int),
6180 .mode = 0644,
6181 .proc_handler = proc_dointvec,
6182 },
6183 {
6184 .procname = "addr_gen_mode",
6185 .data = &ipv6_devconf.addr_gen_mode,
6186 .maxlen = sizeof(int),
6187 .mode = 0644,
6188 .proc_handler = addrconf_sysctl_addr_gen_mode,
6189 },
6190 {
6045 /* sentinel */ 6191 /* sentinel */
6046 } 6192 }
6047}; 6193};
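The new per-interface knobs added to the table above (seg6_enabled, enhanced_dad, addr_gen_mode, plus seg6_require_hmac when HMAC support is built in) are plain integer sysctls. A hedged userspace example of flipping one of them; the path layout follows the usual /proc/sys/net/ipv6/conf/<dev>/ scheme and the value 3 is assumed to be IN6_ADDR_GEN_MODE_RANDOM, as in the uapi enum:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char *path = "/proc/sys/net/ipv6/conf/eth0/addr_gen_mode";
        const char *mode = "3";                 /* random link-local address */
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror(path);
                return 1;
        }
        if (write(fd, mode, strlen(mode)) < 0) {
                perror("write");
                close(fd);
                return 1;
        }
        close(fd);
        return 0;
}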
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 46ad699937fd..a9a9553ee63d 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -61,8 +61,9 @@
61#include <net/ip6_tunnel.h> 61#include <net/ip6_tunnel.h>
62#endif 62#endif
63#include <net/calipso.h> 63#include <net/calipso.h>
64#include <net/seg6.h>
64 65
65#include <asm/uaccess.h> 66#include <linux/uaccess.h>
66#include <linux/mroute6.h> 67#include <linux/mroute6.h>
67 68
68#include "ip6_offload.h" 69#include "ip6_offload.h"
@@ -257,6 +258,14 @@ lookup_protocol:
257 goto out; 258 goto out;
258 } 259 }
259 } 260 }
261
262 if (!kern) {
263 err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
264 if (err) {
265 sk_common_release(sk);
266 goto out;
267 }
268 }
260out: 269out:
261 return err; 270 return err;
262out_rcu_unlock: 271out_rcu_unlock:
@@ -293,7 +302,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
293 return -EINVAL; 302 return -EINVAL;
294 303
295 snum = ntohs(addr->sin6_port); 304 snum = ntohs(addr->sin6_port);
296 if (snum && snum < PROT_SOCK && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) 305 if (snum && snum < inet_prot_sock(net) &&
306 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
297 return -EACCES; 307 return -EACCES;
298 308
299 lock_sock(sk); 309 lock_sock(sk);
@@ -678,6 +688,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
678 fl6.flowi6_mark = sk->sk_mark; 688 fl6.flowi6_mark = sk->sk_mark;
679 fl6.fl6_dport = inet->inet_dport; 689 fl6.fl6_dport = inet->inet_dport;
680 fl6.fl6_sport = inet->inet_sport; 690 fl6.fl6_sport = inet->inet_sport;
691 fl6.flowi6_uid = sk->sk_uid;
681 security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); 692 security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
682 693
683 rcu_read_lock(); 694 rcu_read_lock();
@@ -909,12 +920,12 @@ static int __init inet6_init(void)
909 err = register_pernet_subsys(&inet6_net_ops); 920 err = register_pernet_subsys(&inet6_net_ops);
910 if (err) 921 if (err)
911 goto register_pernet_fail; 922 goto register_pernet_fail;
912 err = icmpv6_init();
913 if (err)
914 goto icmp_fail;
915 err = ip6_mr_init(); 923 err = ip6_mr_init();
916 if (err) 924 if (err)
917 goto ipmr_fail; 925 goto ipmr_fail;
926 err = icmpv6_init();
927 if (err)
928 goto icmp_fail;
918 err = ndisc_init(); 929 err = ndisc_init();
919 if (err) 930 if (err)
920 goto ndisc_fail; 931 goto ndisc_fail;
@@ -990,6 +1001,10 @@ static int __init inet6_init(void)
990 if (err) 1001 if (err)
991 goto calipso_fail; 1002 goto calipso_fail;
992 1003
1004 err = seg6_init();
1005 if (err)
1006 goto seg6_fail;
1007
993#ifdef CONFIG_SYSCTL 1008#ifdef CONFIG_SYSCTL
994 err = ipv6_sysctl_register(); 1009 err = ipv6_sysctl_register();
995 if (err) 1010 if (err)
@@ -1000,8 +1015,10 @@ out:
1000 1015
1001#ifdef CONFIG_SYSCTL 1016#ifdef CONFIG_SYSCTL
1002sysctl_fail: 1017sysctl_fail:
1003 calipso_exit(); 1018 seg6_exit();
1004#endif 1019#endif
1020seg6_fail:
1021 calipso_exit();
1005calipso_fail: 1022calipso_fail:
1006 pingv6_exit(); 1023 pingv6_exit();
1007pingv6_fail: 1024pingv6_fail:
@@ -1044,10 +1061,10 @@ igmp_fail:
1044 ndisc_cleanup(); 1061 ndisc_cleanup();
1045ndisc_fail: 1062ndisc_fail:
1046 ip6_mr_cleanup(); 1063 ip6_mr_cleanup();
1047ipmr_fail:
1048 icmpv6_cleanup();
1049icmp_fail: 1064icmp_fail:
1050 unregister_pernet_subsys(&inet6_net_ops); 1065 unregister_pernet_subsys(&inet6_net_ops);
1066ipmr_fail:
1067 icmpv6_cleanup();
1051register_pernet_fail: 1068register_pernet_fail:
1052 sock_unregister(PF_INET6); 1069 sock_unregister(PF_INET6);
1053 rtnl_unregister_all(PF_INET6); 1070 rtnl_unregister_all(PF_INET6);
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 0630a4d5daaa..dda6035e3b84 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -474,6 +474,9 @@ static void ah6_input_done(struct crypto_async_request *base, int err)
474 int hdr_len = skb_network_header_len(skb); 474 int hdr_len = skb_network_header_len(skb);
475 int ah_hlen = (ah->hdrlen + 2) << 2; 475 int ah_hlen = (ah->hdrlen + 2) << 2;
476 476
477 if (err)
478 goto out;
479
477 work_iph = AH_SKB_CB(skb)->tmp; 480 work_iph = AH_SKB_CB(skb)->tmp;
478 auth_data = ah_tmp_auth(work_iph, hdr_len); 481 auth_data = ah_tmp_auth(work_iph, hdr_len);
479 icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len); 482 icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
@@ -662,9 +665,10 @@ static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
662 return 0; 665 return 0;
663 666
664 if (type == NDISC_REDIRECT) 667 if (type == NDISC_REDIRECT)
665 ip6_redirect(skb, net, skb->dev->ifindex, 0); 668 ip6_redirect(skb, net, skb->dev->ifindex, 0,
669 sock_net_uid(net, NULL));
666 else 670 else
667 ip6_update_pmtu(skb, net, info, 0, 0); 671 ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
668 xfrm_state_put(x); 672 xfrm_state_put(x);
669 673
670 return 0; 674 return 0;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index ccf40550c475..e011122ebd43 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -33,7 +33,7 @@
33#include <net/dsfield.h> 33#include <net/dsfield.h>
34 34
35#include <linux/errqueue.h> 35#include <linux/errqueue.h>
36#include <asm/uaccess.h> 36#include <linux/uaccess.h>
37 37
38static bool ipv6_mapped_addr_any(const struct in6_addr *a) 38static bool ipv6_mapped_addr_any(const struct in6_addr *a)
39{ 39{
@@ -54,6 +54,7 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk)
54 fl6->fl6_dport = inet->inet_dport; 54 fl6->fl6_dport = inet->inet_dport;
55 fl6->fl6_sport = inet->inet_sport; 55 fl6->fl6_sport = inet->inet_sport;
56 fl6->flowlabel = np->flow_label; 56 fl6->flowlabel = np->flow_label;
57 fl6->flowi6_uid = sk->sk_uid;
57 58
58 if (!fl6->flowi6_oif) 59 if (!fl6->flowi6_oif)
59 fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; 60 fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
@@ -166,18 +167,22 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
166 if (np->sndflow) 167 if (np->sndflow)
167 fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; 168 fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
168 169
169 addr_type = ipv6_addr_type(&usin->sin6_addr); 170 if (ipv6_addr_any(&usin->sin6_addr)) {
170
171 if (addr_type == IPV6_ADDR_ANY) {
172 /* 171 /*
173 * connect to self 172 * connect to self
174 */ 173 */
175 usin->sin6_addr.s6_addr[15] = 0x01; 174 if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
175 ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
176 &usin->sin6_addr);
177 else
178 usin->sin6_addr = in6addr_loopback;
176 } 179 }
177 180
181 addr_type = ipv6_addr_type(&usin->sin6_addr);
182
178 daddr = &usin->sin6_addr; 183 daddr = &usin->sin6_addr;
179 184
180 if (addr_type == IPV6_ADDR_MAPPED) { 185 if (addr_type & IPV6_ADDR_MAPPED) {
181 struct sockaddr_in sin; 186 struct sockaddr_in sin;
182 187
183 if (__ipv6_only_sock(sk)) { 188 if (__ipv6_only_sock(sk)) {
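With this hunk a connect() to the unspecified address picks the loopback of the matching family: ::ffff:127.0.0.1 when the socket is already bound to a v4-mapped address, ::1 otherwise. A small standalone sketch of that destination choice, using ordinary userspace types rather than the kernel helpers:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

static struct in6_addr self_dest(const struct in6_addr *bound)
{
        struct in6_addr d;

        if (IN6_IS_ADDR_V4MAPPED(bound)) {
                /* ::ffff:127.0.0.1 keeps the socket in v4-mapped space */
                memset(&d, 0, sizeof(d));
                d.s6_addr[10] = 0xff;
                d.s6_addr[11] = 0xff;
                d.s6_addr[12] = 127;
                d.s6_addr[15] = 1;
        } else {
                d = in6addr_loopback;   /* ::1 */
        }
        return d;
}

int main(void)
{
        struct in6_addr bound = IN6ADDR_ANY_INIT;
        struct in6_addr d = self_dest(&bound);
        char buf[INET6_ADDRSTRLEN];

        printf("%s\n", inet_ntop(AF_INET6, &d, buf, sizeof(buf)));
        return 0;
}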
@@ -400,9 +405,6 @@ static inline bool ipv6_datagram_support_addr(struct sock_exterr_skb *serr)
400 * At one point, excluding local errors was a quick test to identify icmp/icmp6 405 * At one point, excluding local errors was a quick test to identify icmp/icmp6
401 * errors. This is no longer true, but the test remained, so the v6 stack, 406 * errors. This is no longer true, but the test remained, so the v6 stack,
402 * unlike v4, also honors cmsg requests on all wifi and timestamp errors. 407 * unlike v4, also honors cmsg requests on all wifi and timestamp errors.
403 *
404 * Timestamp code paths do not initialize the fields expected by cmsg:
405 * the PKTINFO fields in skb->cb[]. Fill those in here.
406 */ 408 */
407static bool ip6_datagram_support_cmsg(struct sk_buff *skb, 409static bool ip6_datagram_support_cmsg(struct sk_buff *skb,
408 struct sock_exterr_skb *serr) 410 struct sock_exterr_skb *serr)
@@ -414,14 +416,9 @@ static bool ip6_datagram_support_cmsg(struct sk_buff *skb,
414 if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL) 416 if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL)
415 return false; 417 return false;
416 418
417 if (!skb->dev) 419 if (!IP6CB(skb)->iif)
418 return false; 420 return false;
419 421
420 if (skb->protocol == htons(ETH_P_IPV6))
421 IP6CB(skb)->iif = skb->dev->ifindex;
422 else
423 PKTINFO_SKB_CB(skb)->ipi_ifindex = skb->dev->ifindex;
424
425 return true; 422 return true;
426} 423}
427 424
@@ -700,7 +697,7 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
700 struct sockaddr_in6 sin6; 697 struct sockaddr_in6 sin6;
701 __be16 *ports = (__be16 *) skb_transport_header(skb); 698 __be16 *ports = (__be16 *) skb_transport_header(skb);
702 699
703 if (skb_transport_offset(skb) + 4 <= skb->len) { 700 if (skb_transport_offset(skb) + 4 <= (int)skb->len) {
704 /* All current transport protocols have the port numbers in the 701 /* All current transport protocols have the port numbers in the
705 * first four bytes of the transport header and this function is 702 * first four bytes of the transport header and this function is
706 * written with this assumption in mind. 703 * written with this assumption in mind.
@@ -717,6 +714,11 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
717 put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6); 714 put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6);
718 } 715 }
719 } 716 }
717 if (np->rxopt.bits.recvfragsize && opt->frag_max_size) {
718 int val = opt->frag_max_size;
719
720 put_cmsg(msg, SOL_IPV6, IPV6_RECVFRAGSIZE, sizeof(val), &val);
721 }
720} 722}
721 723
722void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, 724void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
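ip6_datagram_recv_specific_ctl() now reports the maximum fragment size of a reassembled datagram as an IPV6_RECVFRAGSIZE cmsg when the matching rxopt bit is set. A hedged userspace sketch of a receiver that requests it and walks the ancillary data; the option value 77 comes from the new uapi definition and is defined locally in case libc headers do not have it yet:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef IPV6_RECVFRAGSIZE
#define IPV6_RECVFRAGSIZE 77            /* assumed, from the new uapi header */
#endif

int main(void)
{
        int on = 1, fd = socket(AF_INET6, SOCK_DGRAM, 0);
        struct sockaddr_in6 sa = { .sin6_family = AF_INET6,
                                   .sin6_port = htons(5000) };
        char data[65536], cbuf[256];
        struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
        struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
                              .msg_control = cbuf,
                              .msg_controllen = sizeof(cbuf) };
        struct cmsghdr *c;

        if (fd < 0 || bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
                return 1;
        setsockopt(fd, SOL_IPV6, IPV6_RECVFRAGSIZE, &on, sizeof(on));

        if (recvmsg(fd, &msg, 0) < 0)
                return 1;
        for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c)) {
                if (c->cmsg_level == SOL_IPV6 &&
                    c->cmsg_type == IPV6_RECVFRAGSIZE) {
                        int val;

                        memcpy(&val, CMSG_DATA(c), sizeof(val));
                        printf("largest fragment seen: %d bytes\n", val);
                }
        }
        close(fd);
        return 0;
}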
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 111ba55fd512..ff54faa75631 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -44,6 +44,8 @@
44#include <net/protocol.h> 44#include <net/protocol.h>
45#include <linux/icmpv6.h> 45#include <linux/icmpv6.h>
46 46
47#include <linux/highmem.h>
48
47struct esp_skb_cb { 49struct esp_skb_cb {
48 struct xfrm_skb_cb xfrm; 50 struct xfrm_skb_cb xfrm;
49 void *tmp; 51 void *tmp;
@@ -114,11 +116,40 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
114 __alignof__(struct scatterlist)); 116 __alignof__(struct scatterlist));
115} 117}
116 118
119static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
120{
121 __be32 *seqhi;
122 struct crypto_aead *aead = x->data;
123 int seqhilen = 0;
124 u8 *iv;
125 struct aead_request *req;
126 struct scatterlist *sg;
127
128 if (x->props.flags & XFRM_STATE_ESN)
129 seqhilen += sizeof(__be32);
130
131 seqhi = esp_tmp_seqhi(tmp);
132 iv = esp_tmp_iv(aead, tmp, seqhilen);
133 req = esp_tmp_req(aead, iv);
134
135 /* Unref skb_frag_pages in the src scatterlist if necessary.
136 * Skip the first sg which comes from skb->data.
137 */
138 if (req->src != req->dst)
139 for (sg = sg_next(req->src); sg; sg = sg_next(sg))
140 put_page(sg_page(sg));
141}
142
117static void esp_output_done(struct crypto_async_request *base, int err) 143static void esp_output_done(struct crypto_async_request *base, int err)
118{ 144{
119 struct sk_buff *skb = base->data; 145 struct sk_buff *skb = base->data;
146 void *tmp;
147 struct dst_entry *dst = skb_dst(skb);
148 struct xfrm_state *x = dst->xfrm;
120 149
121 kfree(ESP_SKB_CB(skb)->tmp); 150 tmp = ESP_SKB_CB(skb)->tmp;
151 esp_ssg_unref(x, tmp);
152 kfree(tmp);
122 xfrm_output_resume(skb, err); 153 xfrm_output_resume(skb, err);
123} 154}
124 155
@@ -138,6 +169,27 @@ static void esp_output_restore_header(struct sk_buff *skb)
138 esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32)); 169 esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32));
139} 170}
140 171
172static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb,
173 struct ip_esp_hdr *esph,
174 __be32 *seqhi)
175{
176 struct xfrm_state *x = skb_dst(skb)->xfrm;
177
178 /* For ESN we move the header forward by 4 bytes to
 179 * accommodate the high bits. We will move it back after
180 * encryption.
181 */
182 if ((x->props.flags & XFRM_STATE_ESN)) {
183 esph = (void *)(skb_transport_header(skb) - sizeof(__be32));
184 *seqhi = esph->spi;
185 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
186 }
187
188 esph->spi = x->id.spi;
189
190 return esph;
191}
192
141static void esp_output_done_esn(struct crypto_async_request *base, int err) 193static void esp_output_done_esn(struct crypto_async_request *base, int err)
142{ 194{
143 struct sk_buff *skb = base->data; 195 struct sk_buff *skb = base->data;
@@ -146,14 +198,31 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
146 esp_output_done(base, err); 198 esp_output_done(base, err);
147} 199}
148 200
201static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
202{
203 /* Fill padding... */
204 if (tfclen) {
205 memset(tail, 0, tfclen);
206 tail += tfclen;
207 }
208 do {
209 int i;
210 for (i = 0; i < plen - 2; i++)
211 tail[i] = i + 1;
212 } while (0);
213 tail[plen - 2] = plen - 2;
214 tail[plen - 1] = proto;
215}
216
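esp_output_fill_trailer() above is plain byte bashing: optional TFC padding, the self-describing pad bytes 1..n, the pad-length byte, and the next-header byte, i.e. the RFC 4303 ESP trailer. A standalone rendering of the same layout for a 6-byte trailer:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void fill_trailer(uint8_t *tail, int tfclen, int plen, uint8_t proto)
{
        int i;

        memset(tail, 0, tfclen);                /* TFC padding, if any */
        for (i = 0; i < plen - 2; i++)
                tail[tfclen + i] = i + 1;       /* pad bytes 1, 2, 3, ... */
        tail[tfclen + plen - 2] = plen - 2;     /* pad length */
        tail[tfclen + plen - 1] = proto;        /* next header */
}

int main(void)
{
        uint8_t tail[16];
        int i;

        fill_trailer(tail, 0, 6, 41 /* IPPROTO_IPV6 */);
        for (i = 0; i < 6; i++)
                printf("%02x ", tail[i]);
        printf("\n");                           /* 01 02 03 04 04 29 */
        return 0;
}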
149static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) 217static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
150{ 218{
151 int err; 219 int err;
152 struct ip_esp_hdr *esph; 220 struct ip_esp_hdr *esph;
153 struct crypto_aead *aead; 221 struct crypto_aead *aead;
154 struct aead_request *req; 222 struct aead_request *req;
155 struct scatterlist *sg; 223 struct scatterlist *sg, *dsg;
156 struct sk_buff *trailer; 224 struct sk_buff *trailer;
225 struct page *page;
157 void *tmp; 226 void *tmp;
158 int blksize; 227 int blksize;
159 int clen; 228 int clen;
@@ -164,10 +233,13 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
164 int nfrags; 233 int nfrags;
165 int assoclen; 234 int assoclen;
166 int seqhilen; 235 int seqhilen;
236 int tailen;
167 u8 *iv; 237 u8 *iv;
168 u8 *tail; 238 u8 *tail;
239 u8 *vaddr;
169 __be32 *seqhi; 240 __be32 *seqhi;
170 __be64 seqno; 241 __be64 seqno;
242 __u8 proto = *skb_mac_header(skb);
171 243
172 /* skb is pure payload to encrypt */ 244 /* skb is pure payload to encrypt */
173 aead = x->data; 245 aead = x->data;
@@ -186,11 +258,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
186 blksize = ALIGN(crypto_aead_blocksize(aead), 4); 258 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
187 clen = ALIGN(skb->len + 2 + tfclen, blksize); 259 clen = ALIGN(skb->len + 2 + tfclen, blksize);
188 plen = clen - skb->len - tfclen; 260 plen = clen - skb->len - tfclen;
189 261 tailen = tfclen + plen + alen;
190 err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
191 if (err < 0)
192 goto error;
193 nfrags = err;
194 262
195 assoclen = sizeof(*esph); 263 assoclen = sizeof(*esph);
196 seqhilen = 0; 264 seqhilen = 0;
@@ -200,59 +268,152 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
200 assoclen += seqhilen; 268 assoclen += seqhilen;
201 } 269 }
202 270
203 tmp = esp_alloc_tmp(aead, nfrags, seqhilen); 271 *skb_mac_header(skb) = IPPROTO_ESP;
204 if (!tmp) { 272 esph = ip_esp_hdr(skb);
205 err = -ENOMEM; 273
206 goto error; 274 if (!skb_cloned(skb)) {
275 if (tailen <= skb_availroom(skb)) {
276 nfrags = 1;
277 trailer = skb;
278 tail = skb_tail_pointer(trailer);
279
280 goto skip_cow;
281 } else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS)
282 && !skb_has_frag_list(skb)) {
283 int allocsize;
284 struct sock *sk = skb->sk;
285 struct page_frag *pfrag = &x->xfrag;
286
287 allocsize = ALIGN(tailen, L1_CACHE_BYTES);
288
289 spin_lock_bh(&x->lock);
290
291 if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
292 spin_unlock_bh(&x->lock);
293 goto cow;
294 }
295
296 page = pfrag->page;
297 get_page(page);
298
299 vaddr = kmap_atomic(page);
300
301 tail = vaddr + pfrag->offset;
302
303 esp_output_fill_trailer(tail, tfclen, plen, proto);
304
305 kunmap_atomic(vaddr);
306
307 nfrags = skb_shinfo(skb)->nr_frags;
308
309 __skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
310 tailen);
311 skb_shinfo(skb)->nr_frags = ++nfrags;
312
313 pfrag->offset = pfrag->offset + allocsize;
314 nfrags++;
315
316 skb->len += tailen;
317 skb->data_len += tailen;
318 skb->truesize += tailen;
319 if (sk)
320 atomic_add(tailen, &sk->sk_wmem_alloc);
321
322 skb_push(skb, -skb_network_offset(skb));
323
324 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
325 esph->spi = x->id.spi;
326
327 tmp = esp_alloc_tmp(aead, nfrags + 2, seqhilen);
328 if (!tmp) {
329 spin_unlock_bh(&x->lock);
330 err = -ENOMEM;
331 goto error;
332 }
333 seqhi = esp_tmp_seqhi(tmp);
334 iv = esp_tmp_iv(aead, tmp, seqhilen);
335 req = esp_tmp_req(aead, iv);
336 sg = esp_req_sg(aead, req);
337 dsg = &sg[nfrags];
338
339 esph = esp_output_set_esn(skb, esph, seqhi);
340
341 sg_init_table(sg, nfrags);
342 skb_to_sgvec(skb, sg,
343 (unsigned char *)esph - skb->data,
344 assoclen + ivlen + clen + alen);
345
346 allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
347
348 if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
349 spin_unlock_bh(&x->lock);
350 err = -ENOMEM;
351 goto error;
352 }
353
354 skb_shinfo(skb)->nr_frags = 1;
355
356 page = pfrag->page;
357 get_page(page);
358 /* replace page frags in skb with new page */
359 __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
360 pfrag->offset = pfrag->offset + allocsize;
361
362 sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
363 skb_to_sgvec(skb, dsg,
364 (unsigned char *)esph - skb->data,
365 assoclen + ivlen + clen + alen);
366
367 spin_unlock_bh(&x->lock);
368
369 goto skip_cow2;
370 }
207 } 371 }
208 372
209 seqhi = esp_tmp_seqhi(tmp); 373cow:
210 iv = esp_tmp_iv(aead, tmp, seqhilen); 374 err = skb_cow_data(skb, tailen, &trailer);
211 req = esp_tmp_req(aead, iv); 375 if (err < 0)
212 sg = esp_req_sg(aead, req); 376 goto error;
377 nfrags = err;
213 378
214 /* Fill padding... */
215 tail = skb_tail_pointer(trailer); 379 tail = skb_tail_pointer(trailer);
216 if (tfclen) { 380 esph = ip_esp_hdr(skb);
217 memset(tail, 0, tfclen);
218 tail += tfclen;
219 }
220 do {
221 int i;
222 for (i = 0; i < plen - 2; i++)
223 tail[i] = i + 1;
224 } while (0);
225 tail[plen - 2] = plen - 2;
226 tail[plen - 1] = *skb_mac_header(skb);
227 pskb_put(skb, trailer, clen - skb->len + alen);
228 381
382skip_cow:
383 esp_output_fill_trailer(tail, tfclen, plen, proto);
384
385 pskb_put(skb, trailer, clen - skb->len + alen);
229 skb_push(skb, -skb_network_offset(skb)); 386 skb_push(skb, -skb_network_offset(skb));
230 esph = ip_esp_hdr(skb);
231 *skb_mac_header(skb) = IPPROTO_ESP;
232 387
233 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); 388 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
389 esph->spi = x->id.spi;
234 390
235 aead_request_set_callback(req, 0, esp_output_done, skb); 391 tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
236 392 if (!tmp) {
237 /* For ESN we move the header forward by 4 bytes to 393 err = -ENOMEM;
238 * accomodate the high bits. We will move it back after 394 goto error;
239 * encryption.
240 */
241 if ((x->props.flags & XFRM_STATE_ESN)) {
242 esph = (void *)(skb_transport_header(skb) - sizeof(__be32));
243 *seqhi = esph->spi;
244 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
245 aead_request_set_callback(req, 0, esp_output_done_esn, skb);
246 } 395 }
247 396
248 esph->spi = x->id.spi; 397 seqhi = esp_tmp_seqhi(tmp);
398 iv = esp_tmp_iv(aead, tmp, seqhilen);
399 req = esp_tmp_req(aead, iv);
400 sg = esp_req_sg(aead, req);
401 dsg = sg;
402
403 esph = esp_output_set_esn(skb, esph, seqhi);
249 404
250 sg_init_table(sg, nfrags); 405 sg_init_table(sg, nfrags);
251 skb_to_sgvec(skb, sg, 406 skb_to_sgvec(skb, sg,
252 (unsigned char *)esph - skb->data, 407 (unsigned char *)esph - skb->data,
253 assoclen + ivlen + clen + alen); 408 assoclen + ivlen + clen + alen);
254 409
255 aead_request_set_crypt(req, sg, sg, ivlen + clen, iv); 410skip_cow2:
411 if ((x->props.flags & XFRM_STATE_ESN))
412 aead_request_set_callback(req, 0, esp_output_done_esn, skb);
413 else
414 aead_request_set_callback(req, 0, esp_output_done, skb);
415
416 aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv);
256 aead_request_set_ad(req, assoclen); 417 aead_request_set_ad(req, assoclen);
257 418
258 seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low + 419 seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
@@ -278,6 +439,8 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
278 esp_output_restore_header(skb); 439 esp_output_restore_header(skb);
279 } 440 }
280 441
442 if (sg != dsg)
443 esp_ssg_unref(x, tmp);
281 kfree(tmp); 444 kfree(tmp);
282 445
283error: 446error:
@@ -343,6 +506,23 @@ static void esp_input_restore_header(struct sk_buff *skb)
343 __skb_pull(skb, 4); 506 __skb_pull(skb, 4);
344} 507}
345 508
509static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
510{
511 struct xfrm_state *x = xfrm_input_state(skb);
512 struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data;
513
514 /* For ESN we move the header forward by 4 bytes to
 515 * accommodate the high bits. We will move it back after
516 * decryption.
517 */
518 if ((x->props.flags & XFRM_STATE_ESN)) {
519 esph = (void *)skb_push(skb, 4);
520 *seqhi = esph->spi;
521 esph->spi = esph->seq_no;
522 esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
523 }
524}
525
346static void esp_input_done_esn(struct crypto_async_request *base, int err) 526static void esp_input_done_esn(struct crypto_async_request *base, int err)
347{ 527{
348 struct sk_buff *skb = base->data; 528 struct sk_buff *skb = base->data;
@@ -378,14 +558,6 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
378 goto out; 558 goto out;
379 } 559 }
380 560
381 nfrags = skb_cow_data(skb, 0, &trailer);
382 if (nfrags < 0) {
383 ret = -EINVAL;
384 goto out;
385 }
386
387 ret = -ENOMEM;
388
389 assoclen = sizeof(*esph); 561 assoclen = sizeof(*esph);
390 seqhilen = 0; 562 seqhilen = 0;
391 563
@@ -394,6 +566,27 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
394 assoclen += seqhilen; 566 assoclen += seqhilen;
395 } 567 }
396 568
569 if (!skb_cloned(skb)) {
570 if (!skb_is_nonlinear(skb)) {
571 nfrags = 1;
572
573 goto skip_cow;
574 } else if (!skb_has_frag_list(skb)) {
575 nfrags = skb_shinfo(skb)->nr_frags;
576 nfrags++;
577
578 goto skip_cow;
579 }
580 }
581
582 nfrags = skb_cow_data(skb, 0, &trailer);
583 if (nfrags < 0) {
584 ret = -EINVAL;
585 goto out;
586 }
587
588skip_cow:
589 ret = -ENOMEM;
397 tmp = esp_alloc_tmp(aead, nfrags, seqhilen); 590 tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
398 if (!tmp) 591 if (!tmp)
399 goto out; 592 goto out;
@@ -404,26 +597,17 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
404 req = esp_tmp_req(aead, iv); 597 req = esp_tmp_req(aead, iv);
405 sg = esp_req_sg(aead, req); 598 sg = esp_req_sg(aead, req);
406 599
407 skb->ip_summed = CHECKSUM_NONE; 600 esp_input_set_header(skb, seqhi);
408 601
409 esph = (struct ip_esp_hdr *)skb->data; 602 sg_init_table(sg, nfrags);
603 skb_to_sgvec(skb, sg, 0, skb->len);
410 604
411 aead_request_set_callback(req, 0, esp_input_done, skb); 605 skb->ip_summed = CHECKSUM_NONE;
412 606
413 /* For ESN we move the header forward by 4 bytes to 607 if ((x->props.flags & XFRM_STATE_ESN))
414 * accomodate the high bits. We will move it back after
415 * decryption.
416 */
417 if ((x->props.flags & XFRM_STATE_ESN)) {
418 esph = (void *)skb_push(skb, 4);
419 *seqhi = esph->spi;
420 esph->spi = esph->seq_no;
421 esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
422 aead_request_set_callback(req, 0, esp_input_done_esn, skb); 608 aead_request_set_callback(req, 0, esp_input_done_esn, skb);
423 } 609 else
424 610 aead_request_set_callback(req, 0, esp_input_done, skb);
425 sg_init_table(sg, nfrags);
426 skb_to_sgvec(skb, sg, 0, skb->len);
427 611
428 aead_request_set_crypt(req, sg, sg, elen + ivlen, iv); 612 aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
429 aead_request_set_ad(req, assoclen); 613 aead_request_set_ad(req, assoclen);
@@ -474,9 +658,10 @@ static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
474 return 0; 658 return 0;
475 659
476 if (type == NDISC_REDIRECT) 660 if (type == NDISC_REDIRECT)
477 ip6_redirect(skb, net, skb->dev->ifindex, 0); 661 ip6_redirect(skb, net, skb->dev->ifindex, 0,
662 sock_net_uid(net, NULL));
478 else 663 else
479 ip6_update_pmtu(skb, net, info, 0, 0); 664 ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
480 xfrm_state_put(x); 665 xfrm_state_put(x);
481 666
482 return 0; 667 return 0;
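The reworked esp6_output() above tries hard to avoid the skb_cow_data() copy: when the skb is not cloned it either writes the trailer in place if there is enough available room, or hangs it off a freshly refilled page fragment; only otherwise does it fall back to the old copy-on-write path. A compact sketch of that three-way decision; the names are illustrative, not kernel API:

#include <stdbool.h>
#include <stdio.h>

enum esp_out_path { ESP_INPLACE, ESP_PAGE_FRAG, ESP_COW };

static enum esp_out_path pick_path(bool cloned, int avail, int tailen,
                                   int nr_frags, int max_frags, bool frag_list)
{
        if (!cloned) {
                if (tailen <= avail)
                        return ESP_INPLACE;     /* write trailer in place */
                if (nr_frags < max_frags && !frag_list)
                        return ESP_PAGE_FRAG;   /* append trailer as a page frag;
                                                 * the real code still falls back
                                                 * to COW if the refill fails */
        }
        return ESP_COW;                         /* fall back to skb_cow_data() */
}

int main(void)
{
        /* 17 is the usual MAX_SKB_FRAGS, assumed here */
        printf("%d\n", pick_path(false, 16, 64, 1, 17, false));  /* 1: page frag */
        return 0;
}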
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
new file mode 100644
index 000000000000..d914eb93204a
--- /dev/null
+++ b/net/ipv6/esp6_offload.c
@@ -0,0 +1,108 @@
1/*
2 * IPV6 GSO/GRO offload support
3 * Linux INET implementation
4 *
5 * Copyright (C) 2016 secunet Security Networks AG
6 * Author: Steffen Klassert <steffen.klassert@secunet.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * ESP GRO support
13 */
14
15#include <linux/skbuff.h>
16#include <linux/init.h>
17#include <net/protocol.h>
18#include <crypto/aead.h>
19#include <crypto/authenc.h>
20#include <linux/err.h>
21#include <linux/module.h>
22#include <net/ip.h>
23#include <net/xfrm.h>
24#include <net/esp.h>
25#include <linux/scatterlist.h>
26#include <linux/kernel.h>
27#include <linux/slab.h>
28#include <linux/spinlock.h>
29#include <net/ip6_route.h>
30#include <net/ipv6.h>
31#include <linux/icmpv6.h>
32
33static struct sk_buff **esp6_gro_receive(struct sk_buff **head,
34 struct sk_buff *skb)
35{
36 int offset = skb_gro_offset(skb);
37 struct xfrm_offload *xo;
38 struct xfrm_state *x;
39 __be32 seq;
40 __be32 spi;
41 int err;
42
43 skb_pull(skb, offset);
44
45 if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0)
46 goto out;
47
48 err = secpath_set(skb);
49 if (err)
50 goto out;
51
52 if (skb->sp->len == XFRM_MAX_DEPTH)
53 goto out;
54
55 x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
56 (xfrm_address_t *)&ipv6_hdr(skb)->daddr,
57 spi, IPPROTO_ESP, AF_INET6);
58 if (!x)
59 goto out;
60
61 skb->sp->xvec[skb->sp->len++] = x;
62 skb->sp->olen++;
63
64 xo = xfrm_offload(skb);
65 if (!xo) {
66 xfrm_state_put(x);
67 goto out;
68 }
69 xo->flags |= XFRM_GRO;
70
71 XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
72 XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
73 XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
74 XFRM_SPI_SKB_CB(skb)->seq = seq;
75
76 /* We don't need to handle errors from xfrm_input, it does all
77 * the error handling and frees the resources on error. */
78 xfrm_input(skb, IPPROTO_ESP, spi, -2);
79
80 return ERR_PTR(-EINPROGRESS);
81out:
82 skb_push(skb, offset);
83 NAPI_GRO_CB(skb)->same_flow = 0;
84 NAPI_GRO_CB(skb)->flush = 1;
85
86 return NULL;
87}
88
89static const struct net_offload esp6_offload = {
90 .callbacks = {
91 .gro_receive = esp6_gro_receive,
92 },
93};
94
95static int __init esp6_offload_init(void)
96{
97 return inet6_add_offload(&esp6_offload, IPPROTO_ESP);
98}
99
100static void __exit esp6_offload_exit(void)
101{
102 inet6_del_offload(&esp6_offload, IPPROTO_ESP);
103}
104
105module_init(esp6_offload_init);
106module_exit(esp6_offload_exit);
107MODULE_LICENSE("GPL");
108MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 139ceb68bd37..25192a3b0cd7 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -47,6 +47,11 @@
47#if IS_ENABLED(CONFIG_IPV6_MIP6) 47#if IS_ENABLED(CONFIG_IPV6_MIP6)
48#include <net/xfrm.h> 48#include <net/xfrm.h>
49#endif 49#endif
50#include <linux/seg6.h>
51#include <net/seg6.h>
52#ifdef CONFIG_IPV6_SEG6_HMAC
53#include <net/seg6_hmac.h>
54#endif
50 55
51#include <linux/uaccess.h> 56#include <linux/uaccess.h>
52 57
@@ -227,7 +232,7 @@ static bool ipv6_dest_hao(struct sk_buff *skb, int optoff)
227 ipv6h->saddr = hao->addr; 232 ipv6h->saddr = hao->addr;
228 hao->addr = tmp_addr; 233 hao->addr = tmp_addr;
229 234
230 if (skb->tstamp.tv64 == 0) 235 if (skb->tstamp == 0)
231 __net_timestamp(skb); 236 __net_timestamp(skb);
232 237
233 return true; 238 return true;
@@ -286,6 +291,156 @@ static int ipv6_destopt_rcv(struct sk_buff *skb)
286 return -1; 291 return -1;
287} 292}
288 293
294static void seg6_update_csum(struct sk_buff *skb)
295{
296 struct ipv6_sr_hdr *hdr;
297 struct in6_addr *addr;
298 __be32 from, to;
299
300 /* srh is at transport offset and seg_left is already decremented
301 * but daddr is not yet updated with next segment
302 */
303
304 hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);
305 addr = hdr->segments + hdr->segments_left;
306
307 hdr->segments_left++;
308 from = *(__be32 *)hdr;
309
310 hdr->segments_left--;
311 to = *(__be32 *)hdr;
312
313 /* update skb csum with diff resulting from seg_left decrement */
314
315 update_csum_diff4(skb, from, to);
316
317 /* compute csum diff between current and next segment and update */
318
319 update_csum_diff16(skb, (__be32 *)(&ipv6_hdr(skb)->daddr),
320 (__be32 *)addr);
321}
322
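seg6_update_csum() patches the CHECKSUM_COMPLETE value incrementally instead of recomputing it: once for the segments_left decrement (a 4-byte diff) and once for the swap of the destination address (a 16-byte diff). The arithmetic behind update_csum_diff4() is ordinary RFC 1624 incremental checksum update; a standalone demonstration that the patched value matches a full recomputation:

#include <stdint.h>
#include <stdio.h>

/* patch a complemented checksum when a 32-bit word changes from `from` to `to` */
static uint16_t csum_patch(uint16_t old_csum, uint32_t from, uint32_t to)
{
        uint64_t sum = (uint16_t)~old_csum;     /* back to the one's-complement sum */

        sum += (uint32_t)~from;                 /* subtract the old word */
        sum += to;                              /* add the new word */
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return ~(uint16_t)sum;
}

static uint16_t csum_full(const uint16_t *w, int n)
{
        uint64_t sum = 0;
        int i;

        for (i = 0; i < n; i++)
                sum += w[i];
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return ~(uint16_t)sum;
}

int main(void)
{
        uint16_t words[4] = { 0x1234, 0x5678, 0x9abc, 0xdef0 };
        uint16_t before = csum_full(words, 4);
        uint32_t from = ((uint32_t)words[0] << 16) | words[1];
        uint32_t to = 0xcafebabe;

        words[0] = 0xcafe;
        words[1] = 0xbabe;
        printf("patched %04x, recomputed %04x\n",
               csum_patch(before, from, to), csum_full(words, 4));
        return 0;
}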
323static int ipv6_srh_rcv(struct sk_buff *skb)
324{
325 struct inet6_skb_parm *opt = IP6CB(skb);
326 struct net *net = dev_net(skb->dev);
327 struct ipv6_sr_hdr *hdr;
328 struct inet6_dev *idev;
329 struct in6_addr *addr;
330 int accept_seg6;
331
332 hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);
333
334 idev = __in6_dev_get(skb->dev);
335
336 accept_seg6 = net->ipv6.devconf_all->seg6_enabled;
337 if (accept_seg6 > idev->cnf.seg6_enabled)
338 accept_seg6 = idev->cnf.seg6_enabled;
339
340 if (!accept_seg6) {
341 kfree_skb(skb);
342 return -1;
343 }
344
345#ifdef CONFIG_IPV6_SEG6_HMAC
346 if (!seg6_hmac_validate_skb(skb)) {
347 kfree_skb(skb);
348 return -1;
349 }
350#endif
351
352looped_back:
353 if (hdr->segments_left == 0) {
354 if (hdr->nexthdr == NEXTHDR_IPV6) {
355 int offset = (hdr->hdrlen + 1) << 3;
356
357 skb_postpull_rcsum(skb, skb_network_header(skb),
358 skb_network_header_len(skb));
359
360 if (!pskb_pull(skb, offset)) {
361 kfree_skb(skb);
362 return -1;
363 }
364 skb_postpull_rcsum(skb, skb_transport_header(skb),
365 offset);
366
367 skb_reset_network_header(skb);
368 skb_reset_transport_header(skb);
369 skb->encapsulation = 0;
370
371 __skb_tunnel_rx(skb, skb->dev, net);
372
373 netif_rx(skb);
374 return -1;
375 }
376
377 opt->srcrt = skb_network_header_len(skb);
378 opt->lastopt = opt->srcrt;
379 skb->transport_header += (hdr->hdrlen + 1) << 3;
380 opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb);
381
382 return 1;
383 }
384
385 if (hdr->segments_left >= (hdr->hdrlen >> 1)) {
386 __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
387 IPSTATS_MIB_INHDRERRORS);
388 icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
389 ((&hdr->segments_left) -
390 skb_network_header(skb)));
391 return -1;
392 }
393
394 if (skb_cloned(skb)) {
395 if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
396 __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
397 IPSTATS_MIB_OUTDISCARDS);
398 kfree_skb(skb);
399 return -1;
400 }
401 }
402
403 hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);
404
405 hdr->segments_left--;
406 addr = hdr->segments + hdr->segments_left;
407
408 skb_push(skb, sizeof(struct ipv6hdr));
409
410 if (skb->ip_summed == CHECKSUM_COMPLETE)
411 seg6_update_csum(skb);
412
413 ipv6_hdr(skb)->daddr = *addr;
414
415 skb_dst_drop(skb);
416
417 ip6_route_input(skb);
418
419 if (skb_dst(skb)->error) {
420 dst_input(skb);
421 return -1;
422 }
423
424 if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) {
425 if (ipv6_hdr(skb)->hop_limit <= 1) {
426 __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
427 IPSTATS_MIB_INHDRERRORS);
428 icmpv6_send(skb, ICMPV6_TIME_EXCEED,
429 ICMPV6_EXC_HOPLIMIT, 0);
430 kfree_skb(skb);
431 return -1;
432 }
433 ipv6_hdr(skb)->hop_limit--;
434
435 skb_pull(skb, sizeof(struct ipv6hdr));
436 goto looped_back;
437 }
438
439 dst_input(skb);
440
441 return -1;
442}
443
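For the non-final-segment case, ipv6_srh_rcv() above decrements segments_left, copies segments[segments_left] into the IPv6 destination, and re-runs the route lookup; packets routed to a loopback device loop back through the header with the hop limit decremented. A toy model of just the segment-advance step, using toy types rather than the kernel structures:

#include <netinet/in.h>
#include <stddef.h>
#include <stdio.h>

struct toy_srh {
        unsigned char segments_left;
        struct in6_addr segments[8];    /* stored last-hop-first, as in the SRH */
};

/* returns the next destination, or NULL once the last segment is active */
static const struct in6_addr *srh_advance(struct toy_srh *srh)
{
        if (srh->segments_left == 0)
                return NULL;
        srh->segments_left--;
        return &srh->segments[srh->segments_left];
}

int main(void)
{
        struct toy_srh srh = { .segments_left = 2 };

        while (srh_advance(&srh))
                printf("segments_left now %u\n", srh.segments_left);
        return 0;
}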
289/******************************** 444/********************************
290 Routing header. 445 Routing header.
291 ********************************/ 446 ********************************/
@@ -326,6 +481,10 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
326 return -1; 481 return -1;
327 } 482 }
328 483
484 /* segment routing */
485 if (hdr->type == IPV6_SRCRT_TYPE_4)
486 return ipv6_srh_rcv(skb);
487
329looped_back: 488looped_back:
330 if (hdr->segments_left == 0) { 489 if (hdr->segments_left == 0) {
331 switch (hdr->type) { 490 switch (hdr->type) {
@@ -679,9 +838,9 @@ int ipv6_parse_hopopts(struct sk_buff *skb)
679 * for headers. 838 * for headers.
680 */ 839 */
681 840
682static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto, 841static void ipv6_push_rthdr0(struct sk_buff *skb, u8 *proto,
683 struct ipv6_rt_hdr *opt, 842 struct ipv6_rt_hdr *opt,
684 struct in6_addr **addr_p) 843 struct in6_addr **addr_p, struct in6_addr *saddr)
685{ 844{
686 struct rt0_hdr *phdr, *ihdr; 845 struct rt0_hdr *phdr, *ihdr;
687 int hops; 846 int hops;
@@ -704,6 +863,62 @@ static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto,
704 *proto = NEXTHDR_ROUTING; 863 *proto = NEXTHDR_ROUTING;
705} 864}
706 865
866static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto,
867 struct ipv6_rt_hdr *opt,
868 struct in6_addr **addr_p, struct in6_addr *saddr)
869{
870 struct ipv6_sr_hdr *sr_phdr, *sr_ihdr;
871 int plen, hops;
872
873 sr_ihdr = (struct ipv6_sr_hdr *)opt;
874 plen = (sr_ihdr->hdrlen + 1) << 3;
875
876 sr_phdr = (struct ipv6_sr_hdr *)skb_push(skb, plen);
877 memcpy(sr_phdr, sr_ihdr, sizeof(struct ipv6_sr_hdr));
878
879 hops = sr_ihdr->first_segment + 1;
880 memcpy(sr_phdr->segments + 1, sr_ihdr->segments + 1,
881 (hops - 1) * sizeof(struct in6_addr));
882
883 sr_phdr->segments[0] = **addr_p;
884 *addr_p = &sr_ihdr->segments[hops - 1];
885
886#ifdef CONFIG_IPV6_SEG6_HMAC
887 if (sr_has_hmac(sr_phdr)) {
888 struct net *net = NULL;
889
890 if (skb->dev)
891 net = dev_net(skb->dev);
892 else if (skb->sk)
893 net = sock_net(skb->sk);
894
895 WARN_ON(!net);
896
897 if (net)
898 seg6_push_hmac(net, saddr, sr_phdr);
899 }
900#endif
901
902 sr_phdr->nexthdr = *proto;
903 *proto = NEXTHDR_ROUTING;
904}
905
906static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto,
907 struct ipv6_rt_hdr *opt,
908 struct in6_addr **addr_p, struct in6_addr *saddr)
909{
910 switch (opt->type) {
911 case IPV6_SRCRT_TYPE_0:
912 ipv6_push_rthdr0(skb, proto, opt, addr_p, saddr);
913 break;
914 case IPV6_SRCRT_TYPE_4:
915 ipv6_push_rthdr4(skb, proto, opt, addr_p, saddr);
916 break;
917 default:
918 break;
919 }
920}
921
707static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt) 922static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt)
708{ 923{
709 struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_push(skb, ipv6_optlen(opt)); 924 struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_push(skb, ipv6_optlen(opt));
@@ -715,10 +930,10 @@ static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv
715 930
716void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, 931void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
717 u8 *proto, 932 u8 *proto,
718 struct in6_addr **daddr) 933 struct in6_addr **daddr, struct in6_addr *saddr)
719{ 934{
720 if (opt->srcrt) { 935 if (opt->srcrt) {
721 ipv6_push_rthdr(skb, proto, opt->srcrt, daddr); 936 ipv6_push_rthdr(skb, proto, opt->srcrt, daddr, saddr);
722 /* 937 /*
723 * IPV6_RTHDRDSTOPTS is ignored 938 * IPV6_RTHDRDSTOPTS is ignored
724 * unless IPV6_RTHDR is set (RFC3542). 939 * unless IPV6_RTHDR is set (RFC3542).
@@ -945,7 +1160,22 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
945 return NULL; 1160 return NULL;
946 1161
947 *orig = fl6->daddr; 1162 *orig = fl6->daddr;
948 fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr; 1163
1164 switch (opt->srcrt->type) {
1165 case IPV6_SRCRT_TYPE_0:
1166 fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr;
1167 break;
1168 case IPV6_SRCRT_TYPE_4:
1169 {
1170 struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)opt->srcrt;
1171
1172 fl6->daddr = srh->segments[srh->first_segment];
1173 break;
1174 }
1175 default:
1176 return NULL;
1177 }
1178
949 return orig; 1179 return orig;
950} 1180}
951EXPORT_SYMBOL_GPL(fl6_update_dst); 1181EXPORT_SYMBOL_GPL(fl6_update_dst);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 2772004ba5a1..230b5aac9f03 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -70,7 +70,7 @@
70#include <net/dsfield.h> 70#include <net/dsfield.h>
71#include <net/l3mdev.h> 71#include <net/l3mdev.h>
72 72
73#include <asm/uaccess.h> 73#include <linux/uaccess.h>
74 74
75/* 75/*
76 * The ICMP socket(s). This is the most convenient way to flow control 76 * The ICMP socket(s). This is the most convenient way to flow control
@@ -92,9 +92,10 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
92 struct net *net = dev_net(skb->dev); 92 struct net *net = dev_net(skb->dev);
93 93
94 if (type == ICMPV6_PKT_TOOBIG) 94 if (type == ICMPV6_PKT_TOOBIG)
95 ip6_update_pmtu(skb, net, info, 0, 0); 95 ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
96 else if (type == NDISC_REDIRECT) 96 else if (type == NDISC_REDIRECT)
97 ip6_redirect(skb, net, skb->dev->ifindex, 0); 97 ip6_redirect(skb, net, skb->dev->ifindex, 0,
98 sock_net_uid(net, NULL));
98 99
99 if (!(type & ICMPV6_INFOMSG_MASK)) 100 if (!(type & ICMPV6_INFOMSG_MASK))
100 if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST) 101 if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST)
@@ -109,19 +110,17 @@ static const struct inet6_protocol icmpv6_protocol = {
109 .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, 110 .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
110}; 111};
111 112
113/* Called with BH disabled */
112static __inline__ struct sock *icmpv6_xmit_lock(struct net *net) 114static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
113{ 115{
114 struct sock *sk; 116 struct sock *sk;
115 117
116 local_bh_disable();
117
118 sk = icmpv6_sk(net); 118 sk = icmpv6_sk(net);
119 if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { 119 if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
120 /* This can happen if the output path (f.e. SIT or 120 /* This can happen if the output path (f.e. SIT or
121 * ip6ip6 tunnel) signals dst_link_failure() for an 121 * ip6ip6 tunnel) signals dst_link_failure() for an
122 * outgoing ICMP6 packet. 122 * outgoing ICMP6 packet.
123 */ 123 */
124 local_bh_enable();
125 return NULL; 124 return NULL;
126 } 125 }
127 return sk; 126 return sk;
@@ -129,7 +128,7 @@ static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
129 128
130static __inline__ void icmpv6_xmit_unlock(struct sock *sk) 129static __inline__ void icmpv6_xmit_unlock(struct sock *sk)
131{ 130{
132 spin_unlock_bh(&sk->sk_lock.slock); 131 spin_unlock(&sk->sk_lock.slock);
133} 132}
134 133
135/* 134/*
@@ -167,6 +166,30 @@ static bool is_ineligible(const struct sk_buff *skb)
167 return false; 166 return false;
168} 167}
169 168
169static bool icmpv6_mask_allow(int type)
170{
171 /* Informational messages are not limited. */
172 if (type & ICMPV6_INFOMSG_MASK)
173 return true;
174
175 /* Do not limit pmtu discovery, it would break it. */
176 if (type == ICMPV6_PKT_TOOBIG)
177 return true;
178
179 return false;
180}
181
182static bool icmpv6_global_allow(int type)
183{
184 if (icmpv6_mask_allow(type))
185 return true;
186
187 if (icmp_global_allow())
188 return true;
189
190 return false;
191}
192
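With the hunks below, icmp6_send() applies the cheap global sysctl_icmp_msgs_per_sec check (icmpv6_global_allow) with BH disabled before doing any other work, and only then consults the per-destination inet_peer limit in icmpv6_xrlim_allow(); informational messages and PKT_TOOBIG bypass both. A condensed sketch of the combined gate, with the two allow flags standing in for the real token buckets:

#include <linux/icmpv6.h>       /* ICMPV6_INFOMSG_MASK, ICMPV6_PKT_TOOBIG */
#include <stdbool.h>
#include <stdio.h>

static bool may_send_icmpv6(int type, bool global_ok, bool peer_ok)
{
        /* informational and packet-too-big messages are never limited */
        if ((type & ICMPV6_INFOMSG_MASK) || type == ICMPV6_PKT_TOOBIG)
                return true;
        return global_ok && peer_ok;
}

int main(void)
{
        printf("%d %d\n",
               may_send_icmpv6(ICMPV6_DEST_UNREACH, false, true),      /* 0 */
               may_send_icmpv6(ICMPV6_ECHO_REPLY, false, false));      /* 1 */
        return 0;
}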
170/* 193/*
171 * Check the ICMP output rate limit 194 * Check the ICMP output rate limit
172 */ 195 */
@@ -177,12 +200,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
177 struct dst_entry *dst; 200 struct dst_entry *dst;
178 bool res = false; 201 bool res = false;
179 202
180 /* Informational messages are not limited. */ 203 if (icmpv6_mask_allow(type))
181 if (type & ICMPV6_INFOMSG_MASK)
182 return true;
183
184 /* Do not limit pmtu discovery, it would break it. */
185 if (type == ICMPV6_PKT_TOOBIG)
186 return true; 204 return true;
187 205
188 /* 206 /*
@@ -199,20 +217,16 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
199 } else { 217 } else {
200 struct rt6_info *rt = (struct rt6_info *)dst; 218 struct rt6_info *rt = (struct rt6_info *)dst;
201 int tmo = net->ipv6.sysctl.icmpv6_time; 219 int tmo = net->ipv6.sysctl.icmpv6_time;
220 struct inet_peer *peer;
202 221
203 /* Give more bandwidth to wider prefixes. */ 222 /* Give more bandwidth to wider prefixes. */
204 if (rt->rt6i_dst.plen < 128) 223 if (rt->rt6i_dst.plen < 128)
205 tmo >>= ((128 - rt->rt6i_dst.plen)>>5); 224 tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
206 225
207 if (icmp_global_allow()) { 226 peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1);
208 struct inet_peer *peer; 227 res = inet_peer_xrlim_allow(peer, tmo);
209 228 if (peer)
210 peer = inet_getpeer_v6(net->ipv6.peers, 229 inet_putpeer(peer);
211 &fl6->daddr, 1);
212 res = inet_peer_xrlim_allow(peer, tmo);
213 if (peer)
214 inet_putpeer(peer);
215 }
216 } 230 }
217 dst_release(dst); 231 dst_release(dst);
218 return res; 232 return res;
@@ -473,6 +487,13 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
473 return; 487 return;
474 } 488 }
475 489
490 /* Needed by both icmp_global_allow and icmpv6_xmit_lock */
491 local_bh_disable();
492
493 /* Check global sysctl_icmp_msgs_per_sec ratelimit */
494 if (!icmpv6_global_allow(type))
495 goto out_bh_enable;
496
476 mip6_addr_swap(skb); 497 mip6_addr_swap(skb);
477 498
478 memset(&fl6, 0, sizeof(fl6)); 499 memset(&fl6, 0, sizeof(fl6));
@@ -486,11 +507,13 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
486 fl6.flowi6_oif = iif; 507 fl6.flowi6_oif = iif;
487 fl6.fl6_icmp_type = type; 508 fl6.fl6_icmp_type = type;
488 fl6.fl6_icmp_code = code; 509 fl6.fl6_icmp_code = code;
510 fl6.flowi6_uid = sock_net_uid(net, NULL);
489 security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); 511 security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
490 512
491 sk = icmpv6_xmit_lock(net); 513 sk = icmpv6_xmit_lock(net);
492 if (!sk) 514 if (!sk)
493 return; 515 goto out_bh_enable;
516
494 sk->sk_mark = mark; 517 sk->sk_mark = mark;
495 np = inet6_sk(sk); 518 np = inet6_sk(sk);
496 519
@@ -550,6 +573,8 @@ out_dst_release:
550 dst_release(dst); 573 dst_release(dst);
551out: 574out:
552 icmpv6_xmit_unlock(sk); 575 icmpv6_xmit_unlock(sk);
576out_bh_enable:
577 local_bh_enable();
553} 578}
554 579
555/* Slightly more convenient version of icmp6_send. 580/* Slightly more convenient version of icmp6_send.
@@ -660,11 +685,13 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
660 fl6.flowi6_oif = skb->dev->ifindex; 685 fl6.flowi6_oif = skb->dev->ifindex;
661 fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; 686 fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
662 fl6.flowi6_mark = mark; 687 fl6.flowi6_mark = mark;
688 fl6.flowi6_uid = sock_net_uid(net, NULL);
663 security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); 689 security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
664 690
691 local_bh_disable();
665 sk = icmpv6_xmit_lock(net); 692 sk = icmpv6_xmit_lock(net);
666 if (!sk) 693 if (!sk)
667 return; 694 goto out_bh_enable;
668 sk->sk_mark = mark; 695 sk->sk_mark = mark;
669 np = inet6_sk(sk); 696 np = inet6_sk(sk);
670 697
@@ -706,6 +733,8 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
706 dst_release(dst); 733 dst_release(dst);
707out: 734out:
708 icmpv6_xmit_unlock(sk); 735 icmpv6_xmit_unlock(sk);
736out_bh_enable:
737 local_bh_enable();
709} 738}
710 739
711void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) 740void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index e50c27a93e17..ce1aae4a7fc8 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -6,29 +6,88 @@
6#include <linux/socket.h> 6#include <linux/socket.h>
7#include <linux/types.h> 7#include <linux/types.h>
8#include <net/checksum.h> 8#include <net/checksum.h>
9#include <net/dst_cache.h>
9#include <net/ip.h> 10#include <net/ip.h>
10#include <net/ip6_fib.h> 11#include <net/ip6_fib.h>
12#include <net/ip6_route.h>
11#include <net/lwtunnel.h> 13#include <net/lwtunnel.h>
12#include <net/protocol.h> 14#include <net/protocol.h>
13#include <uapi/linux/ila.h> 15#include <uapi/linux/ila.h>
14#include "ila.h" 16#include "ila.h"
15 17
18struct ila_lwt {
19 struct ila_params p;
20 struct dst_cache dst_cache;
21 u32 connected : 1;
22};
23
24static inline struct ila_lwt *ila_lwt_lwtunnel(
25 struct lwtunnel_state *lwt)
26{
27 return (struct ila_lwt *)lwt->data;
28}
29
16static inline struct ila_params *ila_params_lwtunnel( 30static inline struct ila_params *ila_params_lwtunnel(
17 struct lwtunnel_state *lwstate) 31 struct lwtunnel_state *lwt)
18{ 32{
19 return (struct ila_params *)lwstate->data; 33 return &ila_lwt_lwtunnel(lwt)->p;
20} 34}
21 35
22static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) 36static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb)
23{ 37{
24 struct dst_entry *dst = skb_dst(skb); 38 struct dst_entry *orig_dst = skb_dst(skb);
39 struct rt6_info *rt = (struct rt6_info *)orig_dst;
40 struct ila_lwt *ilwt = ila_lwt_lwtunnel(orig_dst->lwtstate);
41 struct dst_entry *dst;
42 int err = -EINVAL;
25 43
26 if (skb->protocol != htons(ETH_P_IPV6)) 44 if (skb->protocol != htons(ETH_P_IPV6))
27 goto drop; 45 goto drop;
28 46
29 ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate), true); 47 ila_update_ipv6_locator(skb, ila_params_lwtunnel(orig_dst->lwtstate),
48 true);
30 49
31 return dst->lwtstate->orig_output(net, sk, skb); 50 if (rt->rt6i_flags & (RTF_GATEWAY | RTF_CACHE)) {
51 /* Already have a next hop address in route, no need for
52 * dest cache route.
53 */
54 return orig_dst->lwtstate->orig_output(net, sk, skb);
55 }
56
57 dst = dst_cache_get(&ilwt->dst_cache);
58 if (unlikely(!dst)) {
59 struct ipv6hdr *ip6h = ipv6_hdr(skb);
60 struct flowi6 fl6;
61
62 /* Lookup a route for the new destination. Take into
63 * account that the base route may already have a gateway.
64 */
65
66 memset(&fl6, 0, sizeof(fl6));
67 fl6.flowi6_oif = orig_dst->dev->ifindex;
68 fl6.flowi6_iif = LOOPBACK_IFINDEX;
69 fl6.daddr = *rt6_nexthop((struct rt6_info *)orig_dst,
70 &ip6h->daddr);
71
72 dst = ip6_route_output(net, NULL, &fl6);
73 if (dst->error) {
74 err = -EHOSTUNREACH;
75 dst_release(dst);
76 goto drop;
77 }
78
79 dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
80 if (IS_ERR(dst)) {
81 err = PTR_ERR(dst);
82 goto drop;
83 }
84
85 if (ilwt->connected)
86 dst_cache_set_ip6(&ilwt->dst_cache, dst, &fl6.saddr);
87 }
88
89 skb_dst_set(skb, dst);
90 return dst_output(net, sk, skb);
32 91
33drop: 92drop:
34 kfree_skb(skb); 93 kfree_skb(skb);
@@ -56,13 +115,13 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
56 [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, 115 [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
57}; 116};
58 117
59static int ila_build_state(struct net_device *dev, struct nlattr *nla, 118static int ila_build_state(struct nlattr *nla,
60 unsigned int family, const void *cfg, 119 unsigned int family, const void *cfg,
61 struct lwtunnel_state **ts) 120 struct lwtunnel_state **ts)
62{ 121{
122 struct ila_lwt *ilwt;
63 struct ila_params *p; 123 struct ila_params *p;
64 struct nlattr *tb[ILA_ATTR_MAX + 1]; 124 struct nlattr *tb[ILA_ATTR_MAX + 1];
65 size_t encap_len = sizeof(*p);
66 struct lwtunnel_state *newts; 125 struct lwtunnel_state *newts;
67 const struct fib6_config *cfg6 = cfg; 126 const struct fib6_config *cfg6 = cfg;
68 struct ila_addr *iaddr; 127 struct ila_addr *iaddr;
@@ -71,7 +130,7 @@ static int ila_build_state(struct net_device *dev, struct nlattr *nla,
71 if (family != AF_INET6) 130 if (family != AF_INET6)
72 return -EINVAL; 131 return -EINVAL;
73 132
74 if (cfg6->fc_dst_len < sizeof(struct ila_locator) + 1) { 133 if (cfg6->fc_dst_len < 8 * sizeof(struct ila_locator) + 3) {
75 /* Need to have full locator and at least type field 134 /* Need to have full locator and at least type field
76 * included in destination 135 * included in destination
77 */ 136 */
@@ -95,11 +154,17 @@ static int ila_build_state(struct net_device *dev, struct nlattr *nla,
95 if (!tb[ILA_ATTR_LOCATOR]) 154 if (!tb[ILA_ATTR_LOCATOR])
96 return -EINVAL; 155 return -EINVAL;
97 156
98 newts = lwtunnel_state_alloc(encap_len); 157 newts = lwtunnel_state_alloc(sizeof(*ilwt));
99 if (!newts) 158 if (!newts)
100 return -ENOMEM; 159 return -ENOMEM;
101 160
102 newts->len = encap_len; 161 ilwt = ila_lwt_lwtunnel(newts);
162 ret = dst_cache_init(&ilwt->dst_cache, GFP_ATOMIC);
163 if (ret) {
164 kfree(newts);
165 return ret;
166 }
167
103 p = ila_params_lwtunnel(newts); 168 p = ila_params_lwtunnel(newts);
104 169
105 p->locator.v64 = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]); 170 p->locator.v64 = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]);
@@ -120,11 +185,19 @@ static int ila_build_state(struct net_device *dev, struct nlattr *nla,
120 newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT | 185 newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT |
121 LWTUNNEL_STATE_INPUT_REDIRECT; 186 LWTUNNEL_STATE_INPUT_REDIRECT;
122 187
188 if (cfg6->fc_dst_len == 8 * sizeof(struct in6_addr))
189 ilwt->connected = 1;
190
123 *ts = newts; 191 *ts = newts;
124 192
125 return 0; 193 return 0;
126} 194}
127 195
196static void ila_destroy_state(struct lwtunnel_state *lwt)
197{
198 dst_cache_destroy(&ila_lwt_lwtunnel(lwt)->dst_cache);
199}
200
128static int ila_fill_encap_info(struct sk_buff *skb, 201static int ila_fill_encap_info(struct sk_buff *skb,
129 struct lwtunnel_state *lwtstate) 202 struct lwtunnel_state *lwtstate)
130{ 203{
@@ -159,11 +232,13 @@ static int ila_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
159 232
160static const struct lwtunnel_encap_ops ila_encap_ops = { 233static const struct lwtunnel_encap_ops ila_encap_ops = {
161 .build_state = ila_build_state, 234 .build_state = ila_build_state,
235 .destroy_state = ila_destroy_state,
162 .output = ila_output, 236 .output = ila_output,
163 .input = ila_input, 237 .input = ila_input,
164 .fill_encap = ila_fill_encap_info, 238 .fill_encap = ila_fill_encap_info,
165 .get_encap_size = ila_encap_nlsize, 239 .get_encap_size = ila_encap_nlsize,
166 .cmp_encap = ila_encap_cmp, 240 .cmp_encap = ila_encap_cmp,
241 .owner = THIS_MODULE,
167}; 242};
168 243
169int ila_lwt_init(void) 244int ila_lwt_init(void)
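The ila_lwt.c changes above embed a dst_cache in the per-route state so that, when the base route does not already carry a next hop (no RTF_GATEWAY/RTF_CACHE), the post-translation route lookup is done once and reused. A sketch of that cache-or-lookup pattern, assuming <net/dst_cache.h> and <net/ip6_route.h>; the example_* names are illustrative:

struct example_lwt {
	struct dst_cache dst_cache;
	u32 connected : 1;
};

static struct dst_entry *example_get_dst(struct net *net,
					 struct example_lwt *lwt,
					 struct flowi6 *fl6)
{
	struct dst_entry *dst;

	dst = dst_cache_get(&lwt->dst_cache);
	if (dst)
		return dst;

	dst = ip6_route_output(net, NULL, fl6);
	if (dst->error) {
		dst_release(dst);
		return NULL;
	}

	/* only worth caching when the route pins down a single destination */
	if (lwt->connected)
		dst_cache_set_ip6(&lwt->dst_cache, dst, &fl6->saddr);

	return dst;
}

dst_cache_init() must have succeeded when the state was built and dst_cache_destroy() must run on teardown, which is what the new ila_build_state()/ila_destroy_state() pair above takes care of.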
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index e604013dd814..af8f52ee7180 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -118,15 +118,7 @@ static const struct rhashtable_params rht_params = {
118 .obj_cmpfn = ila_cmpfn, 118 .obj_cmpfn = ila_cmpfn,
119}; 119};
120 120
121static struct genl_family ila_nl_family = { 121static struct genl_family ila_nl_family;
122 .id = GENL_ID_GENERATE,
123 .hdrsize = 0,
124 .name = ILA_GENL_NAME,
125 .version = ILA_GENL_VERSION,
126 .maxattr = ILA_ATTR_MAX,
127 .netnsok = true,
128 .parallel_ops = true,
129};
130 122
131static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { 123static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
132 [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, 124 [ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
@@ -482,7 +474,15 @@ static int ila_nl_dump_start(struct netlink_callback *cb)
482{ 474{
483 struct net *net = sock_net(cb->skb->sk); 475 struct net *net = sock_net(cb->skb->sk);
484 struct ila_net *ilan = net_generic(net, ila_net_id); 476 struct ila_net *ilan = net_generic(net, ila_net_id);
485 struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args; 477 struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
478
479 if (!iter) {
480 iter = kmalloc(sizeof(*iter), GFP_KERNEL);
481 if (!iter)
482 return -ENOMEM;
483
484 cb->args[0] = (long)iter;
485 }
486 486
487 return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter, 487 return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter,
488 GFP_KERNEL); 488 GFP_KERNEL);
@@ -490,16 +490,18 @@ static int ila_nl_dump_start(struct netlink_callback *cb)
490 490
491static int ila_nl_dump_done(struct netlink_callback *cb) 491static int ila_nl_dump_done(struct netlink_callback *cb)
492{ 492{
493 struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args; 493 struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
494 494
495 rhashtable_walk_exit(&iter->rhiter); 495 rhashtable_walk_exit(&iter->rhiter);
496 496
497 kfree(iter);
498
497 return 0; 499 return 0;
498} 500}
499 501
500static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) 502static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
501{ 503{
502 struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args; 504 struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
503 struct rhashtable_iter *rhiter = &iter->rhiter; 505 struct rhashtable_iter *rhiter = &iter->rhiter;
504 struct ila_map *ila; 506 struct ila_map *ila;
505 int ret; 507 int ret;
@@ -561,6 +563,18 @@ static const struct genl_ops ila_nl_ops[] = {
561 }, 563 },
562}; 564};
563 565
566static struct genl_family ila_nl_family __ro_after_init = {
567 .hdrsize = 0,
568 .name = ILA_GENL_NAME,
569 .version = ILA_GENL_VERSION,
570 .maxattr = ILA_ATTR_MAX,
571 .netnsok = true,
572 .parallel_ops = true,
573 .module = THIS_MODULE,
574 .ops = ila_nl_ops,
575 .n_ops = ARRAY_SIZE(ila_nl_ops),
576};
577
564#define ILA_HASH_TABLE_SIZE 1024 578#define ILA_HASH_TABLE_SIZE 1024
565 579
566static __net_init int ila_init_net(struct net *net) 580static __net_init int ila_init_net(struct net *net)
@@ -623,7 +637,7 @@ static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral)
623 return 0; 637 return 0;
624} 638}
625 639
626int ila_xlat_init(void) 640int __init ila_xlat_init(void)
627{ 641{
628 int ret; 642 int ret;
629 643
@@ -631,8 +645,7 @@ int ila_xlat_init(void)
631 if (ret) 645 if (ret)
632 goto exit; 646 goto exit;
633 647
634 ret = genl_register_family_with_ops(&ila_nl_family, 648 ret = genl_register_family(&ila_nl_family);
635 ila_nl_ops);
636 if (ret < 0) 649 if (ret < 0)
637 goto unregister; 650 goto unregister;
638 651
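The ila_xlat.c hunks above follow the removal of GENL_ID_GENERATE: the family is now declared after its ops array, carries .ops/.n_ops/.module itself, and is registered with genl_register_family(), which assigns the id. The dump iterator likewise moves into a kmalloc'd object stashed in cb->args[0] and freed in the dump_done callback instead of overlaying cb->args directly. A sketch of the registration side only, with illustrative names:

static const struct genl_ops example_nl_ops[] = {
	/* .cmd / .doit / .dumpit entries as before */
};

static struct genl_family example_nl_family __ro_after_init = {
	.hdrsize	= 0,
	.name		= "example",
	.version	= 1,
	.maxattr	= 3,
	.netnsok	= true,
	.module		= THIS_MODULE,
	.ops		= example_nl_ops,
	.n_ops		= ARRAY_SIZE(example_nl_ops),
};

static int __init example_nl_init(void)
{
	/* the generic netlink id is picked at registration time */
	return genl_register_family(&example_nl_family);
}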
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 532c3ef282c5..9a31d13bf180 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -28,45 +28,6 @@
28#include <net/inet6_connection_sock.h> 28#include <net/inet6_connection_sock.h>
29#include <net/sock_reuseport.h> 29#include <net/sock_reuseport.h>
30 30
31int inet6_csk_bind_conflict(const struct sock *sk,
32 const struct inet_bind_bucket *tb, bool relax)
33{
34 const struct sock *sk2;
35 int reuse = sk->sk_reuse;
36 int reuseport = sk->sk_reuseport;
37 kuid_t uid = sock_i_uid((struct sock *)sk);
38
39 /* We must walk the whole port owner list in this case. -DaveM */
40 /*
41 * See comment in inet_csk_bind_conflict about sock lookup
42 * vs net namespaces issues.
43 */
44 sk_for_each_bound(sk2, &tb->owners) {
45 if (sk != sk2 &&
46 (!sk->sk_bound_dev_if ||
47 !sk2->sk_bound_dev_if ||
48 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
49 if ((!reuse || !sk2->sk_reuse ||
50 sk2->sk_state == TCP_LISTEN) &&
51 (!reuseport || !sk2->sk_reuseport ||
52 rcu_access_pointer(sk->sk_reuseport_cb) ||
53 (sk2->sk_state != TCP_TIME_WAIT &&
54 !uid_eq(uid,
55 sock_i_uid((struct sock *)sk2))))) {
56 if (ipv6_rcv_saddr_equal(sk, sk2, true))
57 break;
58 }
59 if (!relax && reuse && sk2->sk_reuse &&
60 sk2->sk_state != TCP_LISTEN &&
61 ipv6_rcv_saddr_equal(sk, sk2, true))
62 break;
63 }
64 }
65
66 return sk2 != NULL;
67}
68EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict);
69
70struct dst_entry *inet6_csk_route_req(const struct sock *sk, 31struct dst_entry *inet6_csk_route_req(const struct sock *sk,
71 struct flowi6 *fl6, 32 struct flowi6 *fl6,
72 const struct request_sock *req, 33 const struct request_sock *req,
@@ -88,6 +49,7 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk,
88 fl6->flowi6_mark = ireq->ir_mark; 49 fl6->flowi6_mark = ireq->ir_mark;
89 fl6->fl6_dport = ireq->ir_rmt_port; 50 fl6->fl6_dport = ireq->ir_rmt_port;
90 fl6->fl6_sport = htons(ireq->ir_num); 51 fl6->fl6_sport = htons(ireq->ir_num);
52 fl6->flowi6_uid = sk->sk_uid;
91 security_req_classify_flow(req, flowi6_to_flowi(fl6)); 53 security_req_classify_flow(req, flowi6_to_flowi(fl6));
92 54
93 dst = ip6_dst_lookup_flow(sk, fl6, final_p); 55 dst = ip6_dst_lookup_flow(sk, fl6, final_p);
@@ -136,6 +98,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
136 fl6->flowi6_mark = sk->sk_mark; 98 fl6->flowi6_mark = sk->sk_mark;
137 fl6->fl6_sport = inet->inet_sport; 99 fl6->fl6_sport = inet->inet_sport;
138 fl6->fl6_dport = inet->inet_dport; 100 fl6->fl6_dport = inet->inet_dport;
101 fl6->flowi6_uid = sk->sk_uid;
139 security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); 102 security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
140 103
141 rcu_read_lock(); 104 rcu_read_lock();
@@ -173,7 +136,7 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused
173 /* Restore final destination back after routing done */ 136 /* Restore final destination back after routing done */
174 fl6.daddr = sk->sk_v6_daddr; 137 fl6.daddr = sk->sk_v6_daddr;
175 138
176 res = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt), 139 res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt),
177 np->tclass); 140 np->tclass);
178 rcu_read_unlock(); 141 rcu_read_unlock();
179 return res; 142 return res;
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 02761c9fe43e..d0900918a19e 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -268,54 +268,10 @@ int inet6_hash(struct sock *sk)
268 268
269 if (sk->sk_state != TCP_CLOSE) { 269 if (sk->sk_state != TCP_CLOSE) {
270 local_bh_disable(); 270 local_bh_disable();
271 err = __inet_hash(sk, NULL, ipv6_rcv_saddr_equal); 271 err = __inet_hash(sk, NULL);
272 local_bh_enable(); 272 local_bh_enable();
273 } 273 }
274 274
275 return err; 275 return err;
276} 276}
277EXPORT_SYMBOL_GPL(inet6_hash); 277EXPORT_SYMBOL_GPL(inet6_hash);
278
279/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
280 * only, and any IPv4 addresses if not IPv6 only
281 * match_wildcard == false: addresses must be exactly the same, i.e.
282 * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
283 * and 0.0.0.0 equals to 0.0.0.0 only
284 */
285int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
286 bool match_wildcard)
287{
288 const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
289 int sk2_ipv6only = inet_v6_ipv6only(sk2);
290 int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
291 int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
292
293 /* if both are mapped, treat as IPv4 */
294 if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
295 if (!sk2_ipv6only) {
296 if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
297 return 1;
298 if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr)
299 return match_wildcard;
300 }
301 return 0;
302 }
303
304 if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
305 return 1;
306
307 if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
308 !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
309 return 1;
310
311 if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
312 !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED))
313 return 1;
314
315 if (sk2_rcv_saddr6 &&
316 ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6))
317 return 1;
318
319 return 0;
320}
321EXPORT_SYMBOL_GPL(ipv6_rcv_saddr_equal);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index ef5485204522..d4bf2c68a545 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -318,6 +318,16 @@ static int fib6_dump_node(struct fib6_walker *w)
318 w->leaf = rt; 318 w->leaf = rt;
319 return 1; 319 return 1;
320 } 320 }
321
322 /* Multipath routes are dumped in one route with the
323 * RTA_MULTIPATH attribute. Jump 'rt' to point to the
324 * last sibling of this route (no need to dump the
325 * sibling routes again)
326 */
327 if (rt->rt6i_nsiblings)
328 rt = list_last_entry(&rt->rt6i_siblings,
329 struct rt6_info,
330 rt6i_siblings);
321 } 331 }
322 w->leaf = NULL; 332 w->leaf = NULL;
323 return 0; 333 return 0;
@@ -746,6 +756,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
746 u16 nlflags = NLM_F_EXCL; 756 u16 nlflags = NLM_F_EXCL;
747 int err; 757 int err;
748 758
759 if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
760 nlflags |= NLM_F_APPEND;
761
749 ins = &fn->leaf; 762 ins = &fn->leaf;
750 763
751 for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) { 764 for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) {
@@ -868,7 +881,8 @@ add:
868 *ins = rt; 881 *ins = rt;
869 rt->rt6i_node = fn; 882 rt->rt6i_node = fn;
870 atomic_inc(&rt->rt6i_ref); 883 atomic_inc(&rt->rt6i_ref);
871 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 884 if (!info->skip_notify)
885 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
872 info->nl_net->ipv6.rt6_stats->fib_rt_entries++; 886 info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
873 887
874 if (!(fn->fn_flags & RTN_RTINFO)) { 888 if (!(fn->fn_flags & RTN_RTINFO)) {
@@ -894,7 +908,8 @@ add:
894 rt->rt6i_node = fn; 908 rt->rt6i_node = fn;
895 rt->dst.rt6_next = iter->dst.rt6_next; 909 rt->dst.rt6_next = iter->dst.rt6_next;
896 atomic_inc(&rt->rt6i_ref); 910 atomic_inc(&rt->rt6i_ref);
897 inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); 911 if (!info->skip_notify)
912 inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
898 if (!(fn->fn_flags & RTN_RTINFO)) { 913 if (!(fn->fn_flags & RTN_RTINFO)) {
899 info->nl_net->ipv6.rt6_stats->fib_route_nodes++; 914 info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
900 fn->fn_flags |= RTN_RTINFO; 915 fn->fn_flags |= RTN_RTINFO;
@@ -908,6 +923,8 @@ add:
908 ins = &rt->dst.rt6_next; 923 ins = &rt->dst.rt6_next;
909 iter = *ins; 924 iter = *ins;
910 while (iter) { 925 while (iter) {
926 if (iter->rt6i_metric > rt->rt6i_metric)
927 break;
911 if (rt6_qualify_for_ecmp(iter)) { 928 if (rt6_qualify_for_ecmp(iter)) {
912 *ins = iter->dst.rt6_next; 929 *ins = iter->dst.rt6_next;
913 fib6_purge_rt(iter, fn, info->nl_net); 930 fib6_purge_rt(iter, fn, info->nl_net);
@@ -1439,7 +1456,8 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
1439 1456
1440 fib6_purge_rt(rt, fn, net); 1457 fib6_purge_rt(rt, fn, net);
1441 1458
1442 inet6_rt_notify(RTM_DELROUTE, rt, info, 0); 1459 if (!info->skip_notify)
1460 inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
1443 rt6_release(rt); 1461 rt6_release(rt);
1444} 1462}
1445 1463
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index b912f0dbaf72..8081bafe441b 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -29,7 +29,7 @@
29#include <net/rawv6.h> 29#include <net/rawv6.h>
30#include <net/transp_v6.h> 30#include <net/transp_v6.h>
31 31
32#include <asm/uaccess.h> 32#include <linux/uaccess.h>
33 33
34#define FL_MIN_LINGER 6 /* Minimal linger. It is set to 6sec specified 34#define FL_MIN_LINGER 6 /* Minimal linger. It is set to 6sec specified
35 in old IPv6 RFC. Well, it was reasonable value. 35 in old IPv6 RFC. Well, it was reasonable value.
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index d7d6d3ae0b3b..6fcb7cb49bb2 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -64,7 +64,7 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
64#define IP6_GRE_HASH_SIZE_SHIFT 5 64#define IP6_GRE_HASH_SIZE_SHIFT 5
65#define IP6_GRE_HASH_SIZE (1 << IP6_GRE_HASH_SIZE_SHIFT) 65#define IP6_GRE_HASH_SIZE (1 << IP6_GRE_HASH_SIZE_SHIFT)
66 66
67static int ip6gre_net_id __read_mostly; 67static unsigned int ip6gre_net_id __read_mostly;
68struct ip6gre_net { 68struct ip6gre_net {
69 struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE]; 69 struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE];
70 70
@@ -367,35 +367,37 @@ static void ip6gre_tunnel_uninit(struct net_device *dev)
367 367
368 368
369static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, 369static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
370 u8 type, u8 code, int offset, __be32 info) 370 u8 type, u8 code, int offset, __be32 info)
371{ 371{
372 const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data; 372 const struct gre_base_hdr *greh;
373 __be16 *p = (__be16 *)(skb->data + offset); 373 const struct ipv6hdr *ipv6h;
374 int grehlen = offset + 4; 374 int grehlen = sizeof(*greh);
375 struct ip6_tnl *t; 375 struct ip6_tnl *t;
376 int key_off = 0;
376 __be16 flags; 377 __be16 flags;
378 __be32 key;
377 379
378 flags = p[0]; 380 if (!pskb_may_pull(skb, offset + grehlen))
379 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { 381 return;
380 if (flags&(GRE_VERSION|GRE_ROUTING)) 382 greh = (const struct gre_base_hdr *)(skb->data + offset);
381 return; 383 flags = greh->flags;
382 if (flags&GRE_KEY) { 384 if (flags & (GRE_VERSION | GRE_ROUTING))
383 grehlen += 4; 385 return;
384 if (flags&GRE_CSUM) 386 if (flags & GRE_CSUM)
385 grehlen += 4; 387 grehlen += 4;
386 } 388 if (flags & GRE_KEY) {
389 key_off = grehlen + offset;
390 grehlen += 4;
387 } 391 }
388 392
389 /* If only 8 bytes returned, keyed message will be dropped here */ 393 if (!pskb_may_pull(skb, offset + grehlen))
390 if (!pskb_may_pull(skb, grehlen))
391 return; 394 return;
392 ipv6h = (const struct ipv6hdr *)skb->data; 395 ipv6h = (const struct ipv6hdr *)skb->data;
393 p = (__be16 *)(skb->data + offset); 396 greh = (const struct gre_base_hdr *)(skb->data + offset);
397 key = key_off ? *(__be32 *)(skb->data + key_off) : 0;
394 398
395 t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr, 399 t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr,
396 flags & GRE_KEY ? 400 key, greh->protocol);
397 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
398 p[1]);
399 if (!t) 401 if (!t)
400 return; 402 return;
401 403
@@ -484,11 +486,6 @@ drop:
484 return 0; 486 return 0;
485} 487}
486 488
487struct ipv6_tel_txoption {
488 struct ipv6_txoptions ops;
489 __u8 dst_opt[8];
490};
491
492static int gre_handle_offloads(struct sk_buff *skb, bool csum) 489static int gre_handle_offloads(struct sk_buff *skb, bool csum)
493{ 490{
494 return iptunnel_handle_offloads(skb, 491 return iptunnel_handle_offloads(skb,
@@ -548,6 +545,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
548 if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) 545 if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
549 fl6.flowi6_mark = skb->mark; 546 fl6.flowi6_mark = skb->mark;
550 547
548 fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
549
551 err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); 550 err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
552 if (err) 551 if (err)
553 return -1; 552 return -1;
@@ -580,6 +579,9 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
580 return -1; 579 return -1;
581 580
582 offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); 581 offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
582 /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
583 ipv6h = ipv6_hdr(skb);
584
583 if (offset > 0) { 585 if (offset > 0) {
584 struct ipv6_tlv_tnl_enc_lim *tel; 586 struct ipv6_tlv_tnl_enc_lim *tel;
585 tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; 587 tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
@@ -602,6 +604,8 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
602 if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) 604 if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
603 fl6.flowi6_mark = skb->mark; 605 fl6.flowi6_mark = skb->mark;
604 606
607 fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
608
605 if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM))) 609 if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)))
606 return -1; 610 return -1;
607 611
@@ -994,6 +998,9 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
994 dev->flags |= IFF_NOARP; 998 dev->flags |= IFF_NOARP;
995 dev->addr_len = sizeof(struct in6_addr); 999 dev->addr_len = sizeof(struct in6_addr);
996 netif_keep_dst(dev); 1000 netif_keep_dst(dev);
1001 /* This perm addr will be used as interface identifier by IPv6 */
1002 dev->addr_assign_type = NET_ADDR_RANDOM;
1003 eth_random_addr(dev->perm_addr);
997} 1004}
998 1005
999static int ip6gre_tunnel_init_common(struct net_device *dev) 1006static int ip6gre_tunnel_init_common(struct net_device *dev)
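The ip6gre_err() rewrite above stops trusting that the ICMP payload contains a complete GRE header: it pulls the base header before reading the flags, recomputes the required length once GRE_CSUM/GRE_KEY are known, pulls again, and only then reads the key through skb->data. A condensed sketch of that bounds-checked parse (function name and return convention are illustrative):

static int example_parse_gre(struct sk_buff *skb, int offset, __be32 *key)
{
	const struct gre_base_hdr *greh;
	int grehlen = sizeof(*greh);
	int key_off = 0;

	if (!pskb_may_pull(skb, offset + grehlen))
		return -EINVAL;

	greh = (const struct gre_base_hdr *)(skb->data + offset);
	if (greh->flags & (GRE_VERSION | GRE_ROUTING))
		return -EINVAL;
	if (greh->flags & GRE_CSUM)
		grehlen += 4;
	if (greh->flags & GRE_KEY) {
		key_off = offset + grehlen;
		grehlen += 4;
	}

	if (!pskb_may_pull(skb, offset + grehlen))
		return -EINVAL;

	/* re-derive pointers from skb->data: the pull may move skb->head */
	*key = key_off ? *(__be32 *)(skb->data + key_off) : 0;
	return 0;
}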
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index aacfb4bce153..c45b12b4431c 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -122,11 +122,14 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
122 max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); 122 max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
123 /* 123 /*
124 * RFC4291 2.5.3 124 * RFC4291 2.5.3
125 * The loopback address must not be used as the source address in IPv6
126 * packets that are sent outside of a single node. [..]
125 * A packet received on an interface with a destination address 127 * A packet received on an interface with a destination address
126 * of loopback must be dropped. 128 * of loopback must be dropped.
127 */ 129 */
128 if (!(dev->flags & IFF_LOOPBACK) && 130 if ((ipv6_addr_loopback(&hdr->saddr) ||
129 ipv6_addr_loopback(&hdr->daddr)) 131 ipv6_addr_loopback(&hdr->daddr)) &&
132 !(dev->flags & IFF_LOOPBACK))
130 goto err; 133 goto err;
131 134
132 /* RFC4291 Errata ID: 3480 135 /* RFC4291 Errata ID: 3480
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 89c59e656f44..93e58a5e1837 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -191,6 +191,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
191 ops = rcu_dereference(inet6_offloads[proto]); 191 ops = rcu_dereference(inet6_offloads[proto]);
192 if (!ops || !ops->callbacks.gro_receive) { 192 if (!ops || !ops->callbacks.gro_receive) {
193 __pskb_pull(skb, skb_gro_offset(skb)); 193 __pskb_pull(skb, skb_gro_offset(skb));
194 skb_gro_frag0_invalidate(skb);
194 proto = ipv6_gso_pull_exthdrs(skb, proto); 195 proto = ipv6_gso_pull_exthdrs(skb, proto);
195 skb_gro_pull(skb, -skb_transport_offset(skb)); 196 skb_gro_pull(skb, -skb_transport_offset(skb));
196 skb_reset_transport_header(skb); 197 skb_reset_transport_header(skb);
@@ -252,7 +253,7 @@ out_unlock:
252 rcu_read_unlock(); 253 rcu_read_unlock();
253 254
254out: 255out:
255 NAPI_GRO_CB(skb)->flush |= flush; 256 skb_gro_flush_final(skb, pp, flush);
256 257
257 return pp; 258 return pp;
258} 259}
@@ -293,8 +294,10 @@ static int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
293 struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff); 294 struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff);
294 int err = -ENOSYS; 295 int err = -ENOSYS;
295 296
296 if (skb->encapsulation) 297 if (skb->encapsulation) {
298 skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IPV6));
297 skb_set_inner_network_header(skb, nhoff); 299 skb_set_inner_network_header(skb, nhoff);
300 }
298 301
299 iph->payload_len = htons(skb->len - nhoff - sizeof(*iph)); 302 iph->payload_len = htons(skb->len - nhoff - sizeof(*iph));
300 303
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 59eb4ed99ce8..58f6288e9ba5 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -39,6 +39,7 @@
39#include <linux/module.h> 39#include <linux/module.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41 41
42#include <linux/bpf-cgroup.h>
42#include <linux/netfilter.h> 43#include <linux/netfilter.h>
43#include <linux/netfilter_ipv6.h> 44#include <linux/netfilter_ipv6.h>
44 45
@@ -118,7 +119,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
118 if (unlikely(!neigh)) 119 if (unlikely(!neigh))
119 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); 120 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
120 if (!IS_ERR(neigh)) { 121 if (!IS_ERR(neigh)) {
121 ret = dst_neigh_output(dst, neigh, skb); 122 sock_confirm_neigh(skb, neigh);
123 ret = neigh_output(neigh, skb);
122 rcu_read_unlock_bh(); 124 rcu_read_unlock_bh();
123 return ret; 125 return ret;
124 } 126 }
@@ -131,6 +133,14 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
131 133
132static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 134static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
133{ 135{
136 int ret;
137
138 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
139 if (ret) {
140 kfree_skb(skb);
141 return ret;
142 }
143
134 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || 144 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
135 dst_allfrag(skb_dst(skb)) || 145 dst_allfrag(skb_dst(skb)) ||
136 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) 146 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
@@ -163,7 +173,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
163 * which are using proper atomic operations or spinlocks. 173 * which are using proper atomic operations or spinlocks.
164 */ 174 */
165int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, 175int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
166 struct ipv6_txoptions *opt, int tclass) 176 __u32 mark, struct ipv6_txoptions *opt, int tclass)
167{ 177{
168 struct net *net = sock_net(sk); 178 struct net *net = sock_net(sk);
169 const struct ipv6_pinfo *np = inet6_sk(sk); 179 const struct ipv6_pinfo *np = inet6_sk(sk);
@@ -203,7 +213,8 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
203 if (opt->opt_flen) 213 if (opt->opt_flen)
204 ipv6_push_frag_opts(skb, opt, &proto); 214 ipv6_push_frag_opts(skb, opt, &proto);
205 if (opt->opt_nflen) 215 if (opt->opt_nflen)
206 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop); 216 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
217 &fl6->saddr);
207 } 218 }
208 219
209 skb_push(skb, sizeof(struct ipv6hdr)); 220 skb_push(skb, sizeof(struct ipv6hdr));
@@ -230,7 +241,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
230 241
231 skb->protocol = htons(ETH_P_IPV6); 242 skb->protocol = htons(ETH_P_IPV6);
232 skb->priority = sk->sk_priority; 243 skb->priority = sk->sk_priority;
233 skb->mark = sk->sk_mark; 244 skb->mark = mark;
234 245
235 mtu = dst_mtu(dst); 246 mtu = dst_mtu(dst);
236 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { 247 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
@@ -624,7 +635,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
624 635
625 hroom = LL_RESERVED_SPACE(rt->dst.dev); 636 hroom = LL_RESERVED_SPACE(rt->dst.dev);
626 if (skb_has_frag_list(skb)) { 637 if (skb_has_frag_list(skb)) {
627 int first_len = skb_pagelen(skb); 638 unsigned int first_len = skb_pagelen(skb);
628 struct sk_buff *frag2; 639 struct sk_buff *frag2;
629 640
630 if (first_len - hlen > mtu || 641 if (first_len - hlen > mtu ||
@@ -757,13 +768,14 @@ slow_path:
757 * Fragment the datagram. 768 * Fragment the datagram.
758 */ 769 */
759 770
760 *prevhdr = NEXTHDR_FRAGMENT;
761 troom = rt->dst.dev->needed_tailroom; 771 troom = rt->dst.dev->needed_tailroom;
762 772
763 /* 773 /*
764 * Keep copying data until we run out. 774 * Keep copying data until we run out.
765 */ 775 */
766 while (left > 0) { 776 while (left > 0) {
777 u8 *fragnexthdr_offset;
778
767 len = left; 779 len = left;
768 /* IF: it doesn't fit, use 'mtu' - the data space left */ 780 /* IF: it doesn't fit, use 'mtu' - the data space left */
769 if (len > mtu) 781 if (len > mtu)
@@ -808,6 +820,10 @@ slow_path:
808 */ 820 */
809 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen); 821 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
810 822
823 fragnexthdr_offset = skb_network_header(frag);
824 fragnexthdr_offset += prevhdr - skb_network_header(skb);
825 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
826
811 /* 827 /*
812 * Build fragment header. 828 * Build fragment header.
813 */ 829 */
@@ -1011,6 +1027,11 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1011 } 1027 }
1012 } 1028 }
1013#endif 1029#endif
1030 if (ipv6_addr_v4mapped(&fl6->saddr) &&
1031 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1032 err = -EAFNOSUPPORT;
1033 goto out_err_release;
1034 }
1014 1035
1015 return 0; 1036 return 0;
1016 1037
@@ -1134,6 +1155,9 @@ static inline int ip6_ufo_append_data(struct sock *sk,
1134 skb->protocol = htons(ETH_P_IPV6); 1155 skb->protocol = htons(ETH_P_IPV6);
1135 skb->csum = 0; 1156 skb->csum = 0;
1136 1157
1158 if (flags & MSG_CONFIRM)
1159 skb_set_dst_pending_confirm(skb, 1);
1160
1137 __skb_queue_tail(queue, skb); 1161 __skb_queue_tail(queue, skb);
1138 } else if (skb_is_gso(skb)) { 1162 } else if (skb_is_gso(skb)) {
1139 goto append; 1163 goto append;
@@ -1334,7 +1358,7 @@ emsgsize:
1334 */ 1358 */
1335 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && 1359 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1336 headersize == sizeof(struct ipv6hdr) && 1360 headersize == sizeof(struct ipv6hdr) &&
1337 length < mtu - headersize && 1361 length <= mtu - headersize &&
1338 !(flags & MSG_MORE) && 1362 !(flags & MSG_MORE) &&
1339 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) 1363 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1340 csummode = CHECKSUM_PARTIAL; 1364 csummode = CHECKSUM_PARTIAL;
@@ -1363,10 +1387,10 @@ emsgsize:
1363 */ 1387 */
1364 1388
1365 cork->length += length; 1389 cork->length += length;
1366 if (((length > mtu) || 1390 if ((((length + fragheaderlen) > mtu) ||
1367 (skb && skb_is_gso(skb))) && 1391 (skb && skb_is_gso(skb))) &&
1368 (sk->sk_protocol == IPPROTO_UDP) && 1392 (sk->sk_protocol == IPPROTO_UDP) &&
1369 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && 1393 (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
1370 (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) { 1394 (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
1371 err = ip6_ufo_append_data(sk, queue, getfrag, from, length, 1395 err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1372 hh_len, fragheaderlen, exthdrlen, 1396 hh_len, fragheaderlen, exthdrlen,
@@ -1506,6 +1530,9 @@ alloc_new_skb:
1506 exthdrlen = 0; 1530 exthdrlen = 0;
1507 dst_exthdrlen = 0; 1531 dst_exthdrlen = 0;
1508 1532
1533 if ((flags & MSG_CONFIRM) && !skb_prev)
1534 skb_set_dst_pending_confirm(skb, 1);
1535
1509 /* 1536 /*
1510 * Put the packet on the pending queue 1537 * Put the packet on the pending queue
1511 */ 1538 */
@@ -1672,7 +1699,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
1672 if (opt && opt->opt_flen) 1699 if (opt && opt->opt_flen)
1673 ipv6_push_frag_opts(skb, opt, &proto); 1700 ipv6_push_frag_opts(skb, opt, &proto);
1674 if (opt && opt->opt_nflen) 1701 if (opt && opt->opt_nflen)
1675 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst); 1702 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1676 1703
1677 skb_push(skb, sizeof(struct ipv6hdr)); 1704 skb_push(skb, sizeof(struct ipv6hdr));
1678 skb_reset_network_header(skb); 1705 skb_reset_network_header(skb);
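The ip6_output.c portion above threads a cgroup-bpf egress hook into ip6_finish_output(): the attached program runs before fragmentation, and a non-zero return means policy rejected the packet, so it is freed rather than transmitted. A sketch of just that gate, assuming <linux/bpf-cgroup.h> as in the hunk (the rest of the function is elided):

static int example_finish_output(struct net *net, struct sock *sk,
				 struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		/* rejected by the attached cgroup egress program */
		kfree_skb(skb);
		return ret;
	}

	/* ... fragment or hand off to ip6_finish_output2() as before ... */
	return 0;
}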
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index d76674efe523..75fac933c209 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -42,7 +42,7 @@
42#include <linux/hash.h> 42#include <linux/hash.h>
43#include <linux/etherdevice.h> 43#include <linux/etherdevice.h>
44 44
45#include <asm/uaccess.h> 45#include <linux/uaccess.h>
46#include <linux/atomic.h> 46#include <linux/atomic.h>
47 47
48#include <net/icmp.h> 48#include <net/icmp.h>
@@ -83,7 +83,7 @@ static int ip6_tnl_dev_init(struct net_device *dev);
83static void ip6_tnl_dev_setup(struct net_device *dev); 83static void ip6_tnl_dev_setup(struct net_device *dev);
84static struct rtnl_link_ops ip6_link_ops __read_mostly; 84static struct rtnl_link_ops ip6_link_ops __read_mostly;
85 85
86static int ip6_tnl_net_id __read_mostly; 86static unsigned int ip6_tnl_net_id __read_mostly;
87struct ip6_tnl_net { 87struct ip6_tnl_net {
88 /* the IPv6 tunnel fallback device */ 88 /* the IPv6 tunnel fallback device */
89 struct net_device *fb_tnl_dev; 89 struct net_device *fb_tnl_dev;
@@ -400,18 +400,19 @@ ip6_tnl_dev_uninit(struct net_device *dev)
400 400
401__u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) 401__u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
402{ 402{
403 const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) raw; 403 const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)raw;
404 __u8 nexthdr = ipv6h->nexthdr; 404 unsigned int nhoff = raw - skb->data;
405 __u16 off = sizeof(*ipv6h); 405 unsigned int off = nhoff + sizeof(*ipv6h);
406 u8 next, nexthdr = ipv6h->nexthdr;
406 407
407 while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { 408 while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) {
408 __u16 optlen = 0;
409 struct ipv6_opt_hdr *hdr; 409 struct ipv6_opt_hdr *hdr;
410 if (raw + off + sizeof(*hdr) > skb->data && 410 u16 optlen;
411 !pskb_may_pull(skb, raw - skb->data + off + sizeof (*hdr))) 411
412 if (!pskb_may_pull(skb, off + sizeof(*hdr)))
412 break; 413 break;
413 414
414 hdr = (struct ipv6_opt_hdr *) (raw + off); 415 hdr = (struct ipv6_opt_hdr *)(skb->data + off);
415 if (nexthdr == NEXTHDR_FRAGMENT) { 416 if (nexthdr == NEXTHDR_FRAGMENT) {
416 struct frag_hdr *frag_hdr = (struct frag_hdr *) hdr; 417 struct frag_hdr *frag_hdr = (struct frag_hdr *) hdr;
417 if (frag_hdr->frag_off) 418 if (frag_hdr->frag_off)
@@ -422,20 +423,29 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
422 } else { 423 } else {
423 optlen = ipv6_optlen(hdr); 424 optlen = ipv6_optlen(hdr);
424 } 425 }
426 /* cache hdr->nexthdr, since pskb_may_pull() might
427 * invalidate hdr
428 */
429 next = hdr->nexthdr;
425 if (nexthdr == NEXTHDR_DEST) { 430 if (nexthdr == NEXTHDR_DEST) {
426 __u16 i = off + 2; 431 u16 i = 2;
432
433 /* Remember : hdr is no longer valid at this point. */
434 if (!pskb_may_pull(skb, off + optlen))
435 break;
436
427 while (1) { 437 while (1) {
428 struct ipv6_tlv_tnl_enc_lim *tel; 438 struct ipv6_tlv_tnl_enc_lim *tel;
429 439
430 /* No more room for encapsulation limit */ 440 /* No more room for encapsulation limit */
431 if (i + sizeof (*tel) > off + optlen) 441 if (i + sizeof(*tel) > optlen)
432 break; 442 break;
433 443
434 tel = (struct ipv6_tlv_tnl_enc_lim *) &raw[i]; 444 tel = (struct ipv6_tlv_tnl_enc_lim *)(skb->data + off + i);
435 /* return index of option if found and valid */ 445 /* return index of option if found and valid */
436 if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT && 446 if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT &&
437 tel->length == 1) 447 tel->length == 1)
438 return i; 448 return i + off - nhoff;
439 /* else jump to next option */ 449 /* else jump to next option */
440 if (tel->type) 450 if (tel->type)
441 i += tel->length + 2; 451 i += tel->length + 2;
@@ -443,7 +453,7 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
443 i++; 453 i++;
444 } 454 }
445 } 455 }
446 nexthdr = hdr->nexthdr; 456 nexthdr = next;
447 off += optlen; 457 off += optlen;
448 } 458 }
449 return 0; 459 return 0;
@@ -1108,7 +1118,7 @@ route_lookup:
1108 t->parms.name); 1118 t->parms.name);
1109 goto tx_err_dst_release; 1119 goto tx_err_dst_release;
1110 } 1120 }
1111 mtu = dst_mtu(dst) - psh_hlen; 1121 mtu = dst_mtu(dst) - psh_hlen - t->tun_hlen;
1112 if (encap_limit >= 0) { 1122 if (encap_limit >= 0) {
1113 max_headroom += 8; 1123 max_headroom += 8;
1114 mtu -= 8; 1124 mtu -= 8;
@@ -1117,7 +1127,7 @@ route_lookup:
1117 mtu = IPV6_MIN_MTU; 1127 mtu = IPV6_MIN_MTU;
1118 if (skb_dst(skb) && !t->parms.collect_md) 1128 if (skb_dst(skb) && !t->parms.collect_md)
1119 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); 1129 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
1120 if (skb->len > mtu && !skb_is_gso(skb)) { 1130 if (skb->len - t->tun_hlen > mtu && !skb_is_gso(skb)) {
1121 *pmtu = mtu; 1131 *pmtu = mtu;
1122 err = -EMSGSIZE; 1132 err = -EMSGSIZE;
1123 goto tx_err_dst_release; 1133 goto tx_err_dst_release;
@@ -1166,7 +1176,7 @@ route_lookup:
1166 1176
1167 if (encap_limit >= 0) { 1177 if (encap_limit >= 0) {
1168 init_tel_txopt(&opt, encap_limit); 1178 init_tel_txopt(&opt, encap_limit);
1169 ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL); 1179 ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL, NULL);
1170 } 1180 }
1171 1181
1172 /* Calculate max headroom for all the headers and adjust 1182 /* Calculate max headroom for all the headers and adjust
@@ -1248,6 +1258,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
1248 fl6.flowi6_mark = skb->mark; 1258 fl6.flowi6_mark = skb->mark;
1249 } 1259 }
1250 1260
1261 fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
1262
1251 if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) 1263 if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
1252 return -1; 1264 return -1;
1253 1265
@@ -1301,6 +1313,8 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
1301 fl6.flowlabel = key->label; 1313 fl6.flowlabel = key->label;
1302 } else { 1314 } else {
1303 offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); 1315 offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
1316 /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
1317 ipv6h = ipv6_hdr(skb);
1304 if (offset > 0) { 1318 if (offset > 0) {
1305 struct ipv6_tlv_tnl_enc_lim *tel; 1319 struct ipv6_tlv_tnl_enc_lim *tel;
1306 1320
@@ -1326,6 +1340,8 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
1326 fl6.flowi6_mark = skb->mark; 1340 fl6.flowi6_mark = skb->mark;
1327 } 1341 }
1328 1342
1343 fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
1344
1329 if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) 1345 if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
1330 return -1; 1346 return -1;
1331 1347
@@ -1645,7 +1661,7 @@ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
1645 struct ip6_tnl *tnl = netdev_priv(dev); 1661 struct ip6_tnl *tnl = netdev_priv(dev);
1646 1662
1647 if (tnl->parms.proto == IPPROTO_IPIP) { 1663 if (tnl->parms.proto == IPPROTO_IPIP) {
1648 if (new_mtu < 68) 1664 if (new_mtu < ETH_MIN_MTU)
1649 return -EINVAL; 1665 return -EINVAL;
1650 } else { 1666 } else {
1651 if (new_mtu < IPV6_MIN_MTU) 1667 if (new_mtu < IPV6_MIN_MTU)
@@ -1798,6 +1814,8 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
1798 dev->mtu = ETH_DATA_LEN - t_hlen; 1814 dev->mtu = ETH_DATA_LEN - t_hlen;
1799 if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) 1815 if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
1800 dev->mtu -= 8; 1816 dev->mtu -= 8;
1817 dev->min_mtu = ETH_MIN_MTU;
1818 dev->max_mtu = 0xFFF8 - dev->hard_header_len;
1801 1819
1802 return 0; 1820 return 0;
1803 1821
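The ip6_tnl_parse_tlv_enc_lim() rewrite above keeps every offset relative to skb->data and caches hdr->nexthdr before the next pskb_may_pull(), because a successful pull may reallocate skb->head and leave earlier pointers dangling; the callers in ip6_gre.c and ip6_tunnel.c likewise reload ipv6_hdr(skb) after the call. A small sketch of that revalidation rule (names illustrative):

static int example_walk_opt(struct sk_buff *skb, unsigned int off)
{
	struct ipv6_opt_hdr *hdr;
	u8 next;

	if (!pskb_may_pull(skb, off + sizeof(*hdr)))
		return -EINVAL;

	hdr = (struct ipv6_opt_hdr *)(skb->data + off);
	next = hdr->nexthdr;	/* cache before the next pull */

	if (!pskb_may_pull(skb, off + ipv6_optlen(hdr)))
		return -EINVAL;

	/* hdr may be stale here; recompute from skb->data before reuse */
	return next;
}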
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index c299c1e2bbf0..3d8a3b63b4fd 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -49,6 +49,7 @@
49#include <net/xfrm.h> 49#include <net/xfrm.h>
50#include <net/net_namespace.h> 50#include <net/net_namespace.h>
51#include <net/netns/generic.h> 51#include <net/netns/generic.h>
52#include <linux/etherdevice.h>
52 53
53#define IP6_VTI_HASH_SIZE_SHIFT 5 54#define IP6_VTI_HASH_SIZE_SHIFT 5
54#define IP6_VTI_HASH_SIZE (1 << IP6_VTI_HASH_SIZE_SHIFT) 55#define IP6_VTI_HASH_SIZE (1 << IP6_VTI_HASH_SIZE_SHIFT)
@@ -64,7 +65,7 @@ static int vti6_dev_init(struct net_device *dev);
64static void vti6_dev_setup(struct net_device *dev); 65static void vti6_dev_setup(struct net_device *dev);
65static struct rtnl_link_ops vti6_link_ops __read_mostly; 66static struct rtnl_link_ops vti6_link_ops __read_mostly;
66 67
67static int vti6_net_id __read_mostly; 68static unsigned int vti6_net_id __read_mostly;
68struct vti6_net { 69struct vti6_net {
69 /* the vti6 tunnel fallback device */ 70 /* the vti6 tunnel fallback device */
70 struct net_device *fb_tnl_dev; 71 struct net_device *fb_tnl_dev;
@@ -189,12 +190,12 @@ static int vti6_tnl_create2(struct net_device *dev)
189 struct vti6_net *ip6n = net_generic(net, vti6_net_id); 190 struct vti6_net *ip6n = net_generic(net, vti6_net_id);
190 int err; 191 int err;
191 192
193 dev->rtnl_link_ops = &vti6_link_ops;
192 err = register_netdevice(dev); 194 err = register_netdevice(dev);
193 if (err < 0) 195 if (err < 0)
194 goto out; 196 goto out;
195 197
196 strcpy(t->parms.name, dev->name); 198 strcpy(t->parms.name, dev->name);
197 dev->rtnl_link_ops = &vti6_link_ops;
198 199
199 dev_hold(dev); 200 dev_hold(dev);
200 vti6_tnl_link(ip6n, t); 201 vti6_tnl_link(ip6n, t);
@@ -484,11 +485,15 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
484 if (!skb->ignore_df && skb->len > mtu) { 485 if (!skb->ignore_df && skb->len > mtu) {
485 skb_dst(skb)->ops->update_pmtu(dst, NULL, skb, mtu); 486 skb_dst(skb)->ops->update_pmtu(dst, NULL, skb, mtu);
486 487
487 if (skb->protocol == htons(ETH_P_IPV6)) 488 if (skb->protocol == htons(ETH_P_IPV6)) {
489 if (mtu < IPV6_MIN_MTU)
490 mtu = IPV6_MIN_MTU;
491
488 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 492 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
489 else 493 } else {
490 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, 494 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
491 htonl(mtu)); 495 htonl(mtu));
496 }
492 497
493 return -EMSGSIZE; 498 return -EMSGSIZE;
494 } 499 }
@@ -608,9 +613,10 @@ static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
608 return 0; 613 return 0;
609 614
610 if (type == NDISC_REDIRECT) 615 if (type == NDISC_REDIRECT)
611 ip6_redirect(skb, net, skb->dev->ifindex, 0); 616 ip6_redirect(skb, net, skb->dev->ifindex, 0,
617 sock_net_uid(net, NULL));
612 else 618 else
613 ip6_update_pmtu(skb, net, info, 0, 0); 619 ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
614 xfrm_state_put(x); 620 xfrm_state_put(x);
615 621
616 return 0; 622 return 0;
@@ -691,6 +697,10 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p)
691 u->link = p->link; 697 u->link = p->link;
692 u->i_key = p->i_key; 698 u->i_key = p->i_key;
693 u->o_key = p->o_key; 699 u->o_key = p->o_key;
700 if (u->i_key)
701 u->i_flags |= GRE_KEY;
702 if (u->o_key)
703 u->o_flags |= GRE_KEY;
694 u->proto = p->proto; 704 u->proto = p->proto;
695 705
696 memcpy(u->name, p->name, sizeof(u->name)); 706 memcpy(u->name, p->name, sizeof(u->name));
@@ -812,30 +822,11 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
812 return err; 822 return err;
813} 823}
814 824
815/**
816 * vti6_tnl_change_mtu - change mtu manually for tunnel device
817 * @dev: virtual device associated with tunnel
818 * @new_mtu: the new mtu
819 *
820 * Return:
821 * 0 on success,
822 * %-EINVAL if mtu too small
823 **/
824static int vti6_change_mtu(struct net_device *dev, int new_mtu)
825{
826 if (new_mtu < IPV6_MIN_MTU)
827 return -EINVAL;
828
829 dev->mtu = new_mtu;
830 return 0;
831}
832
833static const struct net_device_ops vti6_netdev_ops = { 825static const struct net_device_ops vti6_netdev_ops = {
834 .ndo_init = vti6_dev_init, 826 .ndo_init = vti6_dev_init,
835 .ndo_uninit = vti6_dev_uninit, 827 .ndo_uninit = vti6_dev_uninit,
836 .ndo_start_xmit = vti6_tnl_xmit, 828 .ndo_start_xmit = vti6_tnl_xmit,
837 .ndo_do_ioctl = vti6_ioctl, 829 .ndo_do_ioctl = vti6_ioctl,
838 .ndo_change_mtu = vti6_change_mtu,
839 .ndo_get_stats64 = ip_tunnel_get_stats64, 830 .ndo_get_stats64 = ip_tunnel_get_stats64,
840 .ndo_get_iflink = ip6_tnl_get_iflink, 831 .ndo_get_iflink = ip6_tnl_get_iflink,
841}; 832};
@@ -855,9 +846,14 @@ static void vti6_dev_setup(struct net_device *dev)
855 dev->type = ARPHRD_TUNNEL6; 846 dev->type = ARPHRD_TUNNEL6;
856 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr); 847 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr);
857 dev->mtu = ETH_DATA_LEN; 848 dev->mtu = ETH_DATA_LEN;
849 dev->min_mtu = IPV6_MIN_MTU;
850 dev->max_mtu = IP_MAX_MTU;
858 dev->flags |= IFF_NOARP; 851 dev->flags |= IFF_NOARP;
859 dev->addr_len = sizeof(struct in6_addr); 852 dev->addr_len = sizeof(struct in6_addr);
860 netif_keep_dst(dev); 853 netif_keep_dst(dev);
854 /* This perm addr will be used as interface identifier by IPv6 */
855 dev->addr_assign_type = NET_ADDR_RANDOM;
856 eth_random_addr(dev->perm_addr);
861} 857}
862 858
863/** 859/**
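In the vti6 hunks above the driver-private vti6_change_mtu() goes away: the core now enforces the dev->min_mtu/dev->max_mtu range in dev_set_mtu(), so the bounds are simply declared in the setup routine. A sketch of the replacement pattern (setup function name illustrative, bounds copied from the hunk):

static void example_tnl_setup(struct net_device *dev)
{
	dev->mtu     = ETH_DATA_LEN;
	dev->min_mtu = IPV6_MIN_MTU;	/* 1280 */
	dev->max_mtu = IP_MAX_MTU;
	/* no ndo_change_mtu needed: the core rejects out-of-range values */
}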
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 7f4265b1649b..bf34d0950752 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -16,7 +16,7 @@
16 * 16 *
17 */ 17 */
18 18
19#include <asm/uaccess.h> 19#include <linux/uaccess.h>
20#include <linux/types.h> 20#include <linux/types.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
@@ -636,7 +636,7 @@ static int pim6_rcv(struct sk_buff *skb)
636 goto drop; 636 goto drop;
637 637
638 pim = (struct pimreghdr *)skb_transport_header(skb); 638 pim = (struct pimreghdr *)skb_transport_header(skb);
639 if (pim->type != ((PIM_VERSION << 4) | PIM_REGISTER) || 639 if (pim->type != ((PIM_VERSION << 4) | PIM_TYPE_REGISTER) ||
640 (pim->flags & PIM_NULL_REGISTER) || 640 (pim->flags & PIM_NULL_REGISTER) ||
641 (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, 641 (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
642 sizeof(*pim), IPPROTO_PIM, 642 sizeof(*pim), IPPROTO_PIM,
@@ -774,7 +774,8 @@ failure:
774 * Delete a VIF entry 774 * Delete a VIF entry
775 */ 775 */
776 776
777static int mif6_delete(struct mr6_table *mrt, int vifi, struct list_head *head) 777static int mif6_delete(struct mr6_table *mrt, int vifi, int notify,
778 struct list_head *head)
778{ 779{
779 struct mif_device *v; 780 struct mif_device *v;
780 struct net_device *dev; 781 struct net_device *dev;
@@ -820,7 +821,7 @@ static int mif6_delete(struct mr6_table *mrt, int vifi, struct list_head *head)
820 dev->ifindex, &in6_dev->cnf); 821 dev->ifindex, &in6_dev->cnf);
821 } 822 }
822 823
823 if (v->flags & MIFF_REGISTER) 824 if ((v->flags & MIFF_REGISTER) && !notify)
824 unregister_netdevice_queue(dev, head); 825 unregister_netdevice_queue(dev, head);
825 826
826 dev_put(dev); 827 dev_put(dev);
@@ -1331,7 +1332,6 @@ static int ip6mr_device_event(struct notifier_block *this,
1331 struct mr6_table *mrt; 1332 struct mr6_table *mrt;
1332 struct mif_device *v; 1333 struct mif_device *v;
1333 int ct; 1334 int ct;
1334 LIST_HEAD(list);
1335 1335
1336 if (event != NETDEV_UNREGISTER) 1336 if (event != NETDEV_UNREGISTER)
1337 return NOTIFY_DONE; 1337 return NOTIFY_DONE;
@@ -1340,10 +1340,9 @@ static int ip6mr_device_event(struct notifier_block *this,
1340 v = &mrt->vif6_table[0]; 1340 v = &mrt->vif6_table[0];
1341 for (ct = 0; ct < mrt->maxvif; ct++, v++) { 1341 for (ct = 0; ct < mrt->maxvif; ct++, v++) {
1342 if (v->dev == dev) 1342 if (v->dev == dev)
1343 mif6_delete(mrt, ct, &list); 1343 mif6_delete(mrt, ct, 1, NULL);
1344 } 1344 }
1345 } 1345 }
1346 unregister_netdevice_many(&list);
1347 1346
1348 return NOTIFY_DONE; 1347 return NOTIFY_DONE;
1349} 1348}
@@ -1552,7 +1551,7 @@ static void mroute_clean_tables(struct mr6_table *mrt, bool all)
1552 for (i = 0; i < mrt->maxvif; i++) { 1551 for (i = 0; i < mrt->maxvif; i++) {
1553 if (!all && (mrt->vif6_table[i].flags & VIFF_STATIC)) 1552 if (!all && (mrt->vif6_table[i].flags & VIFF_STATIC))
1554 continue; 1553 continue;
1555 mif6_delete(mrt, i, &list); 1554 mif6_delete(mrt, i, 0, &list);
1556 } 1555 }
1557 unregister_netdevice_many(&list); 1556 unregister_netdevice_many(&list);
1558 1557
@@ -1666,6 +1665,10 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
1666 struct net *net = sock_net(sk); 1665 struct net *net = sock_net(sk);
1667 struct mr6_table *mrt; 1666 struct mr6_table *mrt;
1668 1667
1668 if (sk->sk_type != SOCK_RAW ||
1669 inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
1670 return -EOPNOTSUPP;
1671
1669 mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); 1672 mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
1670 if (!mrt) 1673 if (!mrt)
1671 return -ENOENT; 1674 return -ENOENT;
@@ -1677,9 +1680,6 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
1677 1680
1678 switch (optname) { 1681 switch (optname) {
1679 case MRT6_INIT: 1682 case MRT6_INIT:
1680 if (sk->sk_type != SOCK_RAW ||
1681 inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
1682 return -EOPNOTSUPP;
1683 if (optlen < sizeof(int)) 1683 if (optlen < sizeof(int))
1684 return -EINVAL; 1684 return -EINVAL;
1685 1685
@@ -1706,7 +1706,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
1706 if (copy_from_user(&mifi, optval, sizeof(mifi_t))) 1706 if (copy_from_user(&mifi, optval, sizeof(mifi_t)))
1707 return -EFAULT; 1707 return -EFAULT;
1708 rtnl_lock(); 1708 rtnl_lock();
1709 ret = mif6_delete(mrt, mifi, NULL); 1709 ret = mif6_delete(mrt, mifi, 0, NULL);
1710 rtnl_unlock(); 1710 rtnl_unlock();
1711 return ret; 1711 return ret;
1712 1712
@@ -1815,6 +1815,10 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
1815 struct net *net = sock_net(sk); 1815 struct net *net = sock_net(sk);
1816 struct mr6_table *mrt; 1816 struct mr6_table *mrt;
1817 1817
1818 if (sk->sk_type != SOCK_RAW ||
1819 inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
1820 return -EOPNOTSUPP;
1821
1818 mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); 1822 mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
1819 if (!mrt) 1823 if (!mrt)
1820 return -ENOENT; 1824 return -ENOENT;
@@ -2243,8 +2247,10 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
2243 int ct; 2247 int ct;
2244 2248
2245 /* If cache is unresolved, don't try to parse IIF and OIF */ 2249 /* If cache is unresolved, don't try to parse IIF and OIF */
2246 if (c->mf6c_parent >= MAXMIFS) 2250 if (c->mf6c_parent >= MAXMIFS) {
2251 rtm->rtm_flags |= RTNH_F_UNRESOLVED;
2247 return -ENOENT; 2252 return -ENOENT;
2253 }
2248 2254
2249 if (MIF_EXISTS(mrt, c->mf6c_parent) && 2255 if (MIF_EXISTS(mrt, c->mf6c_parent) &&
2250 nla_put_u32(skb, RTA_IIF, mrt->vif6_table[c->mf6c_parent].dev->ifindex) < 0) 2256 nla_put_u32(skb, RTA_IIF, mrt->vif6_table[c->mf6c_parent].dev->ifindex) < 0)
@@ -2286,7 +2292,7 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
2286} 2292}
2287 2293
2288int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, 2294int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
2289 int nowait, u32 portid) 2295 u32 portid)
2290{ 2296{
2291 int err; 2297 int err;
2292 struct mr6_table *mrt; 2298 struct mr6_table *mrt;
@@ -2313,11 +2319,6 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
2313 struct net_device *dev; 2319 struct net_device *dev;
2314 int vif; 2320 int vif;
2315 2321
2316 if (nowait) {
2317 read_unlock(&mrt_lock);
2318 return -EAGAIN;
2319 }
2320
2321 dev = skb->dev; 2322 dev = skb->dev;
2322 if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) { 2323 if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) {
2323 read_unlock(&mrt_lock); 2324 read_unlock(&mrt_lock);
@@ -2355,7 +2356,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
2355 return err; 2356 return err;
2356 } 2357 }
2357 2358
2358 if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) 2359 if (rtm->rtm_flags & RTM_F_NOTIFY)
2359 cache->mfc_flags |= MFC_NOTIFY; 2360 cache->mfc_flags |= MFC_NOTIFY;
2360 2361
2361 err = __ip6mr_fill_mroute(mrt, skb, cache, rtm); 2362 err = __ip6mr_fill_mroute(mrt, skb, cache, rtm);
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 1b9316e1386a..54d165b9845a 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -74,9 +74,10 @@ static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
74 return 0; 74 return 0;
75 75
76 if (type == NDISC_REDIRECT) 76 if (type == NDISC_REDIRECT)
77 ip6_redirect(skb, net, skb->dev->ifindex, 0); 77 ip6_redirect(skb, net, skb->dev->ifindex, 0,
78 sock_net_uid(net, NULL));
78 else 79 else
79 ip6_update_pmtu(skb, net, info, 0, 0); 80 ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
80 xfrm_state_put(x); 81 xfrm_state_put(x);
81 82
82 return 0; 83 return 0;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 636ec56f5f50..a531ba032b85 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -52,8 +52,9 @@
52#include <net/udplite.h> 52#include <net/udplite.h>
53#include <net/xfrm.h> 53#include <net/xfrm.h>
54#include <net/compat.h> 54#include <net/compat.h>
55#include <net/seg6.h>
55 56
56#include <asm/uaccess.h> 57#include <linux/uaccess.h>
57 58
58struct ip6_ra_chain *ip6_ra_chain; 59struct ip6_ra_chain *ip6_ra_chain;
59DEFINE_RWLOCK(ip6_ra_lock); 60DEFINE_RWLOCK(ip6_ra_lock);
@@ -430,6 +431,15 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
430 431
431 break; 432 break;
432#endif 433#endif
434 case IPV6_SRCRT_TYPE_4:
435 {
436 struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)
437 opt->srcrt;
438
439 if (!seg6_validate_srh(srh, optlen))
440 goto sticky_done;
441 break;
442 }
433 default: 443 default:
434 goto sticky_done; 444 goto sticky_done;
435 } 445 }
@@ -585,16 +595,24 @@ done:
585 595
586 if (val) { 596 if (val) {
587 struct net_device *dev; 597 struct net_device *dev;
598 int midx;
588 599
589 if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val) 600 rcu_read_lock();
590 goto e_inval;
591 601
592 dev = dev_get_by_index(net, val); 602 dev = dev_get_by_index_rcu(net, val);
593 if (!dev) { 603 if (!dev) {
604 rcu_read_unlock();
594 retv = -ENODEV; 605 retv = -ENODEV;
595 break; 606 break;
596 } 607 }
597 dev_put(dev); 608 midx = l3mdev_master_ifindex_rcu(dev);
609
610 rcu_read_unlock();
611
612 if (sk->sk_bound_dev_if &&
613 sk->sk_bound_dev_if != val &&
614 (!midx || midx != sk->sk_bound_dev_if))
615 goto e_inval;
598 } 616 }
599 np->mcast_oif = val; 617 np->mcast_oif = val;
600 retv = 0; 618 retv = 0;
@@ -868,6 +886,10 @@ pref_skip_coa:
868 np->autoflowlabel = valbool; 886 np->autoflowlabel = valbool;
869 retv = 0; 887 retv = 0;
870 break; 888 break;
889 case IPV6_RECVFRAGSIZE:
890 np->rxopt.bits.recvfragsize = valbool;
891 retv = 0;
892 break;
871 } 893 }
872 894
873 release_sock(sk); 895 release_sock(sk);
@@ -1310,6 +1332,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
1310 val = np->autoflowlabel; 1332 val = np->autoflowlabel;
1311 break; 1333 break;
1312 1334
1335 case IPV6_RECVFRAGSIZE:
1336 val = np->rxopt.bits.recvfragsize;
1337 break;
1338
1313 default: 1339 default:
1314 return -ENOPROTOOPT; 1340 return -ENOPROTOOPT;
1315 } 1341 }
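The ipv6_sockglue.c hunks above wire up three things: seg6_validate_srh() for IPV6_SRCRT_TYPE_4 sticky headers, an l3mdev master-index check so mcast_oif may name a device enslaved to the socket's bound L3 master device, and a new boolean IPV6_RECVFRAGSIZE rxopt. From userspace the last one behaves like any other IPV6-level boolean sockopt; a minimal sketch follows, assuming the uapi value 77 for IPV6_RECVFRAGSIZE where libc headers do not define it yet.

/* Hypothetical userspace sketch: setting the new boolean option and reading
 * it back, mirroring the rxopt.bits.recvfragsize hunks above. The fallback
 * value 77 is assumed to match include/uapi/linux/in6.h on kernels carrying
 * this change. */
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

#ifndef IPV6_RECVFRAGSIZE
#define IPV6_RECVFRAGSIZE 77
#endif

int main(void)
{
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
	int on = 1, val = 0;
	socklen_t len = sizeof(val);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (setsockopt(fd, IPPROTO_IPV6, IPV6_RECVFRAGSIZE, &on, sizeof(on)) < 0)
		perror("setsockopt IPV6_RECVFRAGSIZE (kernel may be too old)");
	else if (getsockopt(fd, IPPROTO_IPV6, IPV6_RECVFRAGSIZE, &val, &len) == 0)
		printf("IPV6_RECVFRAGSIZE = %d\n", val);
	close(fd);
	return 0;
}

On kernels without the option, setsockopt() simply fails with ENOPROTOOPT, which the sketch reports rather than treating as fatal.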
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 14a3903f1c82..1bdc703cb966 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -81,7 +81,7 @@ static void mld_gq_timer_expire(unsigned long data);
81static void mld_ifc_timer_expire(unsigned long data); 81static void mld_ifc_timer_expire(unsigned long data);
82static void mld_ifc_event(struct inet6_dev *idev); 82static void mld_ifc_event(struct inet6_dev *idev);
83static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc); 83static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc);
84static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *addr); 84static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc);
85static void mld_clear_delrec(struct inet6_dev *idev); 85static void mld_clear_delrec(struct inet6_dev *idev);
86static bool mld_in_v1_mode(const struct inet6_dev *idev); 86static bool mld_in_v1_mode(const struct inet6_dev *idev);
87static int sf_setstate(struct ifmcaddr6 *pmc); 87static int sf_setstate(struct ifmcaddr6 *pmc);
@@ -692,9 +692,9 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc)
692 dev_mc_del(dev, buf); 692 dev_mc_del(dev, buf);
693 } 693 }
694 694
695 if (mc->mca_flags & MAF_NOREPORT)
696 goto done;
697 spin_unlock_bh(&mc->mca_lock); 695 spin_unlock_bh(&mc->mca_lock);
696 if (mc->mca_flags & MAF_NOREPORT)
697 return;
698 698
699 if (!mc->idev->dead) 699 if (!mc->idev->dead)
700 igmp6_leave_group(mc); 700 igmp6_leave_group(mc);
@@ -702,8 +702,6 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc)
702 spin_lock_bh(&mc->mca_lock); 702 spin_lock_bh(&mc->mca_lock);
703 if (del_timer(&mc->mca_timer)) 703 if (del_timer(&mc->mca_timer))
704 atomic_dec(&mc->mca_refcnt); 704 atomic_dec(&mc->mca_refcnt);
705done:
706 ip6_mc_clear_src(mc);
707 spin_unlock_bh(&mc->mca_lock); 705 spin_unlock_bh(&mc->mca_lock);
708} 706}
709 707
@@ -748,10 +746,11 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
748 spin_unlock_bh(&idev->mc_lock); 746 spin_unlock_bh(&idev->mc_lock);
749} 747}
750 748
751static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *pmca) 749static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
752{ 750{
753 struct ifmcaddr6 *pmc, *pmc_prev; 751 struct ifmcaddr6 *pmc, *pmc_prev;
754 struct ip6_sf_list *psf, *psf_next; 752 struct ip6_sf_list *psf;
753 struct in6_addr *pmca = &im->mca_addr;
755 754
756 spin_lock_bh(&idev->mc_lock); 755 spin_lock_bh(&idev->mc_lock);
757 pmc_prev = NULL; 756 pmc_prev = NULL;
@@ -768,14 +767,21 @@ static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *pmca)
768 } 767 }
769 spin_unlock_bh(&idev->mc_lock); 768 spin_unlock_bh(&idev->mc_lock);
770 769
770 spin_lock_bh(&im->mca_lock);
771 if (pmc) { 771 if (pmc) {
772 for (psf = pmc->mca_tomb; psf; psf = psf_next) { 772 im->idev = pmc->idev;
773 psf_next = psf->sf_next; 773 im->mca_crcount = idev->mc_qrv;
774 kfree(psf); 774 im->mca_sfmode = pmc->mca_sfmode;
775 if (pmc->mca_sfmode == MCAST_INCLUDE) {
776 im->mca_tomb = pmc->mca_tomb;
777 im->mca_sources = pmc->mca_sources;
778 for (psf = im->mca_sources; psf; psf = psf->sf_next)
779 psf->sf_crcount = im->mca_crcount;
775 } 780 }
776 in6_dev_put(pmc->idev); 781 in6_dev_put(pmc->idev);
777 kfree(pmc); 782 kfree(pmc);
778 } 783 }
784 spin_unlock_bh(&im->mca_lock);
779} 785}
780 786
781static void mld_clear_delrec(struct inet6_dev *idev) 787static void mld_clear_delrec(struct inet6_dev *idev)
@@ -904,7 +910,7 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
904 mca_get(mc); 910 mca_get(mc);
905 write_unlock_bh(&idev->lock); 911 write_unlock_bh(&idev->lock);
906 912
907 mld_del_delrec(idev, &mc->mca_addr); 913 mld_del_delrec(idev, mc);
908 igmp6_group_added(mc); 914 igmp6_group_added(mc);
909 ma_put(mc); 915 ma_put(mc);
910 return 0; 916 return 0;
@@ -927,6 +933,7 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr)
927 write_unlock_bh(&idev->lock); 933 write_unlock_bh(&idev->lock);
928 934
929 igmp6_group_dropped(ma); 935 igmp6_group_dropped(ma);
936 ip6_mc_clear_src(ma);
930 937
931 ma_put(ma); 938 ma_put(ma);
932 return 0; 939 return 0;
@@ -2501,15 +2508,17 @@ void ipv6_mc_down(struct inet6_dev *idev)
2501 /* Withdraw multicast list */ 2508 /* Withdraw multicast list */
2502 2509
2503 read_lock_bh(&idev->lock); 2510 read_lock_bh(&idev->lock);
2504 mld_ifc_stop_timer(idev);
2505 mld_gq_stop_timer(idev);
2506 mld_dad_stop_timer(idev);
2507 2511
2508 for (i = idev->mc_list; i; i = i->next) 2512 for (i = idev->mc_list; i; i = i->next)
2509 igmp6_group_dropped(i); 2513 igmp6_group_dropped(i);
2510 read_unlock_bh(&idev->lock);
2511 2514
2512 mld_clear_delrec(idev); 2515 /* Should stop timer after group drop. or we will
2516 * start timer again in mld_ifc_event()
2517 */
2518 mld_ifc_stop_timer(idev);
2519 mld_gq_stop_timer(idev);
2520 mld_dad_stop_timer(idev);
2521 read_unlock_bh(&idev->lock);
2513} 2522}
2514 2523
2515static void ipv6_mc_reset(struct inet6_dev *idev) 2524static void ipv6_mc_reset(struct inet6_dev *idev)
@@ -2531,8 +2540,10 @@ void ipv6_mc_up(struct inet6_dev *idev)
2531 2540
2532 read_lock_bh(&idev->lock); 2541 read_lock_bh(&idev->lock);
2533 ipv6_mc_reset(idev); 2542 ipv6_mc_reset(idev);
2534 for (i = idev->mc_list; i; i = i->next) 2543 for (i = idev->mc_list; i; i = i->next) {
2544 mld_del_delrec(idev, i);
2535 igmp6_group_added(i); 2545 igmp6_group_added(i);
2546 }
2536 read_unlock_bh(&idev->lock); 2547 read_unlock_bh(&idev->lock);
2537} 2548}
2538 2549
@@ -2565,6 +2576,7 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev)
2565 2576
2566 /* Deactivate timers */ 2577 /* Deactivate timers */
2567 ipv6_mc_down(idev); 2578 ipv6_mc_down(idev);
2579 mld_clear_delrec(idev);
2568 2580
2569 /* Delete all-nodes address. */ 2581 /* Delete all-nodes address. */
2570 /* We cannot call ipv6_dev_mc_dec() directly, our caller in 2582 /* We cannot call ipv6_dev_mc_dec() directly, our caller in
@@ -2579,11 +2591,9 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev)
2579 write_lock_bh(&idev->lock); 2591 write_lock_bh(&idev->lock);
2580 while ((i = idev->mc_list) != NULL) { 2592 while ((i = idev->mc_list) != NULL) {
2581 idev->mc_list = i->next; 2593 idev->mc_list = i->next;
2582 write_unlock_bh(&idev->lock);
2583 2594
2584 igmp6_group_dropped(i); 2595 write_unlock_bh(&idev->lock);
2585 ma_put(i); 2596 ma_put(i);
2586
2587 write_lock_bh(&idev->lock); 2597 write_lock_bh(&idev->lock);
2588 } 2598 }
2589 write_unlock_bh(&idev->lock); 2599 write_unlock_bh(&idev->lock);
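The reworked mld_del_delrec() above keeps the same pmc/pmc_prev walk over the tomb list but, instead of freeing the record's source lists, hands them back to the live ifmcaddr6 so pending reports survive a down/up cycle. The unlink itself is the standard singly-linked-list pattern; a minimal userspace analogue (struct rec and its key are hypothetical stand-ins):

#include <stdio.h>

/* Simplified analogue of the pmc/pmc_prev walk in mld_del_delrec(): find the
 * matching node, unlink it from the singly linked list, and hand it back to
 * the caller instead of freeing it outright. */
struct rec {
	int key;
	struct rec *next;
};

static struct rec *unlink_rec(struct rec **head, int key)
{
	struct rec *cur, *prev = NULL;

	for (cur = *head; cur; cur = cur->next) {
		if (cur->key == key)
			break;
		prev = cur;
	}
	if (!cur)
		return NULL;
	if (prev)
		prev->next = cur->next;	/* middle or tail of the list */
	else
		*head = cur->next;	/* removing the head */
	cur->next = NULL;
	return cur;
}

int main(void)
{
	struct rec c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct rec *head = &a;
	struct rec *got = unlink_rec(&head, 2);

	printf("unlinked %d, new head %d\n", got ? got->key : -1, head->key);
	return 0;
}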
diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c
index 60c79a08e14a..64f0f7be9e5e 100644
--- a/net/ipv6/mip6.c
+++ b/net/ipv6/mip6.c
@@ -191,7 +191,7 @@ static inline int mip6_report_rl_allow(ktime_t stamp,
191 int allow = 0; 191 int allow = 0;
192 192
193 spin_lock_bh(&mip6_report_rl.lock); 193 spin_lock_bh(&mip6_report_rl.lock);
194 if (!ktime_equal(mip6_report_rl.stamp, stamp) || 194 if (mip6_report_rl.stamp != stamp ||
195 mip6_report_rl.iif != iif || 195 mip6_report_rl.iif != iif ||
196 !ipv6_addr_equal(&mip6_report_rl.src, src) || 196 !ipv6_addr_equal(&mip6_report_rl.src, src) ||
197 !ipv6_addr_equal(&mip6_report_rl.dst, dst)) { 197 !ipv6_addr_equal(&mip6_report_rl.dst, dst)) {
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index d8e671457d10..7ebac630d3c6 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -233,6 +233,7 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
233 case ND_OPT_SOURCE_LL_ADDR: 233 case ND_OPT_SOURCE_LL_ADDR:
234 case ND_OPT_TARGET_LL_ADDR: 234 case ND_OPT_TARGET_LL_ADDR:
235 case ND_OPT_MTU: 235 case ND_OPT_MTU:
236 case ND_OPT_NONCE:
236 case ND_OPT_REDIRECT_HDR: 237 case ND_OPT_REDIRECT_HDR:
237 if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { 238 if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
238 ND_PRINTK(2, warn, 239 ND_PRINTK(2, warn,
@@ -568,7 +569,8 @@ static void ndisc_send_unsol_na(struct net_device *dev)
568} 569}
569 570
570void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, 571void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
571 const struct in6_addr *daddr, const struct in6_addr *saddr) 572 const struct in6_addr *daddr, const struct in6_addr *saddr,
573 u64 nonce)
572{ 574{
573 struct sk_buff *skb; 575 struct sk_buff *skb;
574 struct in6_addr addr_buf; 576 struct in6_addr addr_buf;
@@ -588,6 +590,8 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
588 if (inc_opt) 590 if (inc_opt)
589 optlen += ndisc_opt_addr_space(dev, 591 optlen += ndisc_opt_addr_space(dev,
590 NDISC_NEIGHBOUR_SOLICITATION); 592 NDISC_NEIGHBOUR_SOLICITATION);
593 if (nonce != 0)
594 optlen += 8;
591 595
592 skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); 596 skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
593 if (!skb) 597 if (!skb)
@@ -605,6 +609,13 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
605 ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, 609 ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
606 dev->dev_addr, 610 dev->dev_addr,
607 NDISC_NEIGHBOUR_SOLICITATION); 611 NDISC_NEIGHBOUR_SOLICITATION);
612 if (nonce != 0) {
613 u8 *opt = skb_put(skb, 8);
614
615 opt[0] = ND_OPT_NONCE;
616 opt[1] = 8 >> 3;
617 memcpy(opt + 2, &nonce, 6);
618 }
608 619
609 ndisc_send_skb(skb, daddr, saddr); 620 ndisc_send_skb(skb, daddr, saddr);
610} 621}
@@ -693,12 +704,12 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
693 "%s: trying to ucast probe in NUD_INVALID: %pI6\n", 704 "%s: trying to ucast probe in NUD_INVALID: %pI6\n",
694 __func__, target); 705 __func__, target);
695 } 706 }
696 ndisc_send_ns(dev, target, target, saddr); 707 ndisc_send_ns(dev, target, target, saddr, 0);
697 } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) { 708 } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) {
698 neigh_app_ns(neigh); 709 neigh_app_ns(neigh);
699 } else { 710 } else {
700 addrconf_addr_solict_mult(target, &mcaddr); 711 addrconf_addr_solict_mult(target, &mcaddr);
701 ndisc_send_ns(dev, target, &mcaddr, saddr); 712 ndisc_send_ns(dev, target, &mcaddr, saddr, 0);
702 } 713 }
703} 714}
704 715
@@ -742,6 +753,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
742 int dad = ipv6_addr_any(saddr); 753 int dad = ipv6_addr_any(saddr);
743 bool inc; 754 bool inc;
744 int is_router = -1; 755 int is_router = -1;
756 u64 nonce = 0;
745 757
746 if (skb->len < sizeof(struct nd_msg)) { 758 if (skb->len < sizeof(struct nd_msg)) {
747 ND_PRINTK(2, warn, "NS: packet too short\n"); 759 ND_PRINTK(2, warn, "NS: packet too short\n");
@@ -786,6 +798,8 @@ static void ndisc_recv_ns(struct sk_buff *skb)
786 return; 798 return;
787 } 799 }
788 } 800 }
801 if (ndopts.nd_opts_nonce)
802 memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6);
789 803
790 inc = ipv6_addr_is_multicast(daddr); 804 inc = ipv6_addr_is_multicast(daddr);
791 805
@@ -794,6 +808,15 @@ static void ndisc_recv_ns(struct sk_buff *skb)
794have_ifp: 808have_ifp:
795 if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) { 809 if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) {
796 if (dad) { 810 if (dad) {
811 if (nonce != 0 && ifp->dad_nonce == nonce) {
812 u8 *np = (u8 *)&nonce;
813 /* Matching nonce if looped back */
814 ND_PRINTK(2, notice,
815 "%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n",
816 ifp->idev->dev->name,
817 &ifp->addr, np);
818 goto out;
819 }
797 /* 820 /*
798 * We are colliding with another node 821 * We are colliding with another node
799 * who is doing DAD 822 * who is doing DAD
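The nonce handling added above is the loop-back detection half of Enhanced DAD: the sender stashes a 6-byte per-address nonce in an ND option on its own neighbour solicitations and ignores any NS that echoes the same value back. A standalone sketch of the option layout built in the ndisc_send_ns() hunk; the type value 14 is the RFC 3971 Nonce option, which is what ND_OPT_NONCE is assumed to expand to here.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Standalone sketch of the 8-byte nonce option: type, length in 8-octet
 * units, then 6 bytes of nonce, exactly as the NS-sending hunk writes it. */
#define ND_OPT_NONCE 14	/* RFC 3971 Nonce option; assumed kernel value */

static void build_nonce_opt(uint8_t opt[8], uint64_t nonce)
{
	opt[0] = ND_OPT_NONCE;
	opt[1] = 8 >> 3;		/* option length: 8 bytes = 1 unit */
	memcpy(opt + 2, &nonce, 6);	/* first 6 bytes of the nonce */
}

static int nonce_matches(const uint8_t opt[8], uint64_t own_nonce)
{
	/* a looped-back DAD probe carries our own nonce back to us */
	return memcmp(opt + 2, &own_nonce, 6) == 0;
}

int main(void)
{
	uint8_t opt[8];
	uint64_t nonce = 0x0000112233445566ULL;

	build_nonce_opt(opt, nonce);
	printf("type=%d len=%d matches=%d\n",
	       opt[0], opt[1], nonce_matches(opt, nonce));
	return 0;
}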
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index d11c46833d61..39970e212ad5 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -26,6 +26,7 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
26 struct flowi6 fl6 = { 26 struct flowi6 fl6 = {
27 .flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, 27 .flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
28 .flowi6_mark = skb->mark, 28 .flowi6_mark = skb->mark,
29 .flowi6_uid = sock_net_uid(net, skb->sk),
29 .daddr = iph->daddr, 30 .daddr = iph->daddr,
30 .saddr = iph->saddr, 31 .saddr = iph->saddr,
31 }; 32 };
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index e10a04c9cdc7..6acb2eecd986 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -25,6 +25,12 @@ config NF_CONNTRACK_IPV6
25 25
26 To compile it as a module, choose M here. If unsure, say N. 26 To compile it as a module, choose M here. If unsure, say N.
27 27
28config NF_SOCKET_IPV6
29 tristate "IPv6 socket lookup support"
30 help
31 This option enables the IPv6 socket lookup infrastructure. This
32 is used by the ip6tables socket match.
33
28if NF_TABLES 34if NF_TABLES
29 35
30config NF_TABLES_IPV6 36config NF_TABLES_IPV6
@@ -54,6 +60,14 @@ config NFT_DUP_IPV6
54 help 60 help
55 This module enables IPv6 packet duplication support for nf_tables. 61 This module enables IPv6 packet duplication support for nf_tables.
56 62
63config NFT_FIB_IPV6
64 tristate "nf_tables fib / ipv6 route lookup support"
65 select NFT_FIB
66 help
67 This module enables IPv6 FIB lookups, e.g. for reverse path filtering.
68 It also allows query of the FIB for the route type, e.g. local, unicast,
69 multicast or blackhole.
70
57endif # NF_TABLES_IPV6 71endif # NF_TABLES_IPV6
58endif # NF_TABLES 72endif # NF_TABLES
59 73
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index b4f7d0b4e2af..fe180c96040e 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -24,6 +24,8 @@ obj-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
24nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o 24nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o
25obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o 25obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
26 26
27obj-$(CONFIG_NF_SOCKET_IPV6) += nf_socket_ipv6.o
28
27# logging 29# logging
28obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o 30obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o
29 31
@@ -40,6 +42,7 @@ obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o
40obj-$(CONFIG_NFT_MASQ_IPV6) += nft_masq_ipv6.o 42obj-$(CONFIG_NFT_MASQ_IPV6) += nft_masq_ipv6.o
41obj-$(CONFIG_NFT_REDIR_IPV6) += nft_redir_ipv6.o 43obj-$(CONFIG_NFT_REDIR_IPV6) += nft_redir_ipv6.o
42obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o 44obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o
45obj-$(CONFIG_NFT_FIB_IPV6) += nft_fib_ipv6.o
43 46
44# matches 47# matches
45obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o 48obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 55aacea24396..1e15c54fd5e2 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -24,7 +24,7 @@
24#include <linux/icmpv6.h> 24#include <linux/icmpv6.h>
25#include <net/ipv6.h> 25#include <net/ipv6.h>
26#include <net/compat.h> 26#include <net/compat.h>
27#include <asm/uaccess.h> 27#include <linux/uaccess.h>
28#include <linux/mutex.h> 28#include <linux/mutex.h>
29#include <linux/proc_fs.h> 29#include <linux/proc_fs.h>
30#include <linux/err.h> 30#include <linux/err.h>
@@ -291,11 +291,7 @@ ip6t_do_table(struct sk_buff *skb,
291 * rule is also a fragment-specific rule, non-fragments won't 291 * rule is also a fragment-specific rule, non-fragments won't
292 * match it. */ 292 * match it. */
293 acpar.hotdrop = false; 293 acpar.hotdrop = false;
294 acpar.net = state->net; 294 acpar.state = state;
295 acpar.in = state->in;
296 acpar.out = state->out;
297 acpar.family = NFPROTO_IPV6;
298 acpar.hooknum = hook;
299 295
300 IP_NF_ASSERT(table->valid_hooks & (1 << hook)); 296 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
301 297
@@ -566,7 +562,8 @@ static int check_target(struct ip6t_entry *e, struct net *net, const char *name)
566 562
567static int 563static int
568find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, 564find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
569 unsigned int size) 565 unsigned int size,
566 struct xt_percpu_counter_alloc_state *alloc_state)
570{ 567{
571 struct xt_entry_target *t; 568 struct xt_entry_target *t;
572 struct xt_target *target; 569 struct xt_target *target;
@@ -574,12 +571,9 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
574 unsigned int j; 571 unsigned int j;
575 struct xt_mtchk_param mtpar; 572 struct xt_mtchk_param mtpar;
576 struct xt_entry_match *ematch; 573 struct xt_entry_match *ematch;
577 unsigned long pcnt;
578 574
579 pcnt = xt_percpu_counter_alloc(); 575 if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
580 if (IS_ERR_VALUE(pcnt))
581 return -ENOMEM; 576 return -ENOMEM;
582 e->counters.pcnt = pcnt;
583 577
584 j = 0; 578 j = 0;
585 mtpar.net = net; 579 mtpar.net = net;
@@ -616,7 +610,7 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
616 cleanup_match(ematch, net); 610 cleanup_match(ematch, net);
617 } 611 }
618 612
619 xt_percpu_counter_free(e->counters.pcnt); 613 xt_percpu_counter_free(&e->counters);
620 614
621 return ret; 615 return ret;
622} 616}
@@ -703,8 +697,7 @@ static void cleanup_entry(struct ip6t_entry *e, struct net *net)
703 if (par.target->destroy != NULL) 697 if (par.target->destroy != NULL)
704 par.target->destroy(&par); 698 par.target->destroy(&par);
705 module_put(par.target->me); 699 module_put(par.target->me);
706 700 xt_percpu_counter_free(&e->counters);
707 xt_percpu_counter_free(e->counters.pcnt);
708} 701}
709 702
710/* Checks and translates the user-supplied table segment (held in 703/* Checks and translates the user-supplied table segment (held in
@@ -713,6 +706,7 @@ static int
713translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, 706translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
714 const struct ip6t_replace *repl) 707 const struct ip6t_replace *repl)
715{ 708{
709 struct xt_percpu_counter_alloc_state alloc_state = { 0 };
716 struct ip6t_entry *iter; 710 struct ip6t_entry *iter;
717 unsigned int *offsets; 711 unsigned int *offsets;
718 unsigned int i; 712 unsigned int i;
@@ -772,7 +766,8 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
772 /* Finally, each sanity check must pass */ 766 /* Finally, each sanity check must pass */
773 i = 0; 767 i = 0;
774 xt_entry_foreach(iter, entry0, newinfo->size) { 768 xt_entry_foreach(iter, entry0, newinfo->size) {
775 ret = find_check_entry(iter, net, repl->name, repl->size); 769 ret = find_check_entry(iter, net, repl->name, repl->size,
770 &alloc_state);
776 if (ret != 0) 771 if (ret != 0)
777 break; 772 break;
778 ++i; 773 ++i;
@@ -860,10 +855,6 @@ copy_entries_to_user(unsigned int total_size,
860 return PTR_ERR(counters); 855 return PTR_ERR(counters);
861 856
862 loc_cpu_entry = private->entries; 857 loc_cpu_entry = private->entries;
863 if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
864 ret = -EFAULT;
865 goto free_counters;
866 }
867 858
868 /* FIXME: use iterator macros --RR */ 859 /* FIXME: use iterator macros --RR */
869 /* ... then go back and fix counters and names */ 860 /* ... then go back and fix counters and names */
@@ -873,6 +864,10 @@ copy_entries_to_user(unsigned int total_size,
873 const struct xt_entry_target *t; 864 const struct xt_entry_target *t;
874 865
875 e = (struct ip6t_entry *)(loc_cpu_entry + off); 866 e = (struct ip6t_entry *)(loc_cpu_entry + off);
867 if (copy_to_user(userptr + off, e, sizeof(*e))) {
868 ret = -EFAULT;
869 goto free_counters;
870 }
876 if (copy_to_user(userptr + off 871 if (copy_to_user(userptr + off
877 + offsetof(struct ip6t_entry, counters), 872 + offsetof(struct ip6t_entry, counters),
878 &counters[num], 873 &counters[num],
@@ -886,23 +881,14 @@ copy_entries_to_user(unsigned int total_size,
886 i += m->u.match_size) { 881 i += m->u.match_size) {
887 m = (void *)e + i; 882 m = (void *)e + i;
888 883
889 if (copy_to_user(userptr + off + i 884 if (xt_match_to_user(m, userptr + off + i)) {
890 + offsetof(struct xt_entry_match,
891 u.user.name),
892 m->u.kernel.match->name,
893 strlen(m->u.kernel.match->name)+1)
894 != 0) {
895 ret = -EFAULT; 885 ret = -EFAULT;
896 goto free_counters; 886 goto free_counters;
897 } 887 }
898 } 888 }
899 889
900 t = ip6t_get_target_c(e); 890 t = ip6t_get_target_c(e);
901 if (copy_to_user(userptr + off + e->target_offset 891 if (xt_target_to_user(t, userptr + off + e->target_offset)) {
902 + offsetof(struct xt_entry_target,
903 u.user.name),
904 t->u.kernel.target->name,
905 strlen(t->u.kernel.target->name)+1) != 0) {
906 ret = -EFAULT; 892 ret = -EFAULT;
907 goto free_counters; 893 goto free_counters;
908 } 894 }
@@ -1007,7 +993,7 @@ static int get_info(struct net *net, void __user *user,
1007#endif 993#endif
1008 t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name), 994 t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name),
1009 "ip6table_%s", name); 995 "ip6table_%s", name);
1010 if (!IS_ERR_OR_NULL(t)) { 996 if (t) {
1011 struct ip6t_getinfo info; 997 struct ip6t_getinfo info;
1012 const struct xt_table_info *private = t->private; 998 const struct xt_table_info *private = t->private;
1013#ifdef CONFIG_COMPAT 999#ifdef CONFIG_COMPAT
@@ -1037,7 +1023,7 @@ static int get_info(struct net *net, void __user *user,
1037 xt_table_unlock(t); 1023 xt_table_unlock(t);
1038 module_put(t->me); 1024 module_put(t->me);
1039 } else 1025 } else
1040 ret = t ? PTR_ERR(t) : -ENOENT; 1026 ret = -ENOENT;
1041#ifdef CONFIG_COMPAT 1027#ifdef CONFIG_COMPAT
1042 if (compat) 1028 if (compat)
1043 xt_compat_unlock(AF_INET6); 1029 xt_compat_unlock(AF_INET6);
@@ -1063,7 +1049,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr,
1063 get.name[sizeof(get.name) - 1] = '\0'; 1049 get.name[sizeof(get.name) - 1] = '\0';
1064 1050
1065 t = xt_find_table_lock(net, AF_INET6, get.name); 1051 t = xt_find_table_lock(net, AF_INET6, get.name);
1066 if (!IS_ERR_OR_NULL(t)) { 1052 if (t) {
1067 struct xt_table_info *private = t->private; 1053 struct xt_table_info *private = t->private;
1068 if (get.size == private->size) 1054 if (get.size == private->size)
1069 ret = copy_entries_to_user(private->size, 1055 ret = copy_entries_to_user(private->size,
@@ -1074,7 +1060,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr,
1074 module_put(t->me); 1060 module_put(t->me);
1075 xt_table_unlock(t); 1061 xt_table_unlock(t);
1076 } else 1062 } else
1077 ret = t ? PTR_ERR(t) : -ENOENT; 1063 ret = -ENOENT;
1078 1064
1079 return ret; 1065 return ret;
1080} 1066}
@@ -1099,8 +1085,8 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
1099 1085
1100 t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name), 1086 t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name),
1101 "ip6table_%s", name); 1087 "ip6table_%s", name);
1102 if (IS_ERR_OR_NULL(t)) { 1088 if (!t) {
1103 ret = t ? PTR_ERR(t) : -ENOENT; 1089 ret = -ENOENT;
1104 goto free_newinfo_counters_untrans; 1090 goto free_newinfo_counters_untrans;
1105 } 1091 }
1106 1092
@@ -1214,8 +1200,8 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
1214 if (IS_ERR(paddc)) 1200 if (IS_ERR(paddc))
1215 return PTR_ERR(paddc); 1201 return PTR_ERR(paddc);
1216 t = xt_find_table_lock(net, AF_INET6, tmp.name); 1202 t = xt_find_table_lock(net, AF_INET6, tmp.name);
1217 if (IS_ERR_OR_NULL(t)) { 1203 if (!t) {
1218 ret = t ? PTR_ERR(t) : -ENOENT; 1204 ret = -ENOENT;
1219 goto free; 1205 goto free;
1220 } 1206 }
1221 1207
@@ -1651,7 +1637,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr,
1651 1637
1652 xt_compat_lock(AF_INET6); 1638 xt_compat_lock(AF_INET6);
1653 t = xt_find_table_lock(net, AF_INET6, get.name); 1639 t = xt_find_table_lock(net, AF_INET6, get.name);
1654 if (!IS_ERR_OR_NULL(t)) { 1640 if (t) {
1655 const struct xt_table_info *private = t->private; 1641 const struct xt_table_info *private = t->private;
1656 struct xt_table_info info; 1642 struct xt_table_info info;
1657 ret = compat_table_info(private, &info); 1643 ret = compat_table_info(private, &info);
@@ -1665,7 +1651,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr,
1665 module_put(t->me); 1651 module_put(t->me);
1666 xt_table_unlock(t); 1652 xt_table_unlock(t);
1667 } else 1653 } else
1668 ret = t ? PTR_ERR(t) : -ENOENT; 1654 ret = -ENOENT;
1669 1655
1670 xt_compat_unlock(AF_INET6); 1656 xt_compat_unlock(AF_INET6);
1671 return ret; 1657 return ret;
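Several hunks above replace IS_ERR_OR_NULL(t) / PTR_ERR(t) with a plain NULL test and a fixed -ENOENT; that only works because xt_find_table_lock() no longer encodes errno values into its return pointer. A userspace re-creation of the two conventions, with ERR_PTR()/IS_ERR() mirroring include/linux/err.h for illustration:

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Old-style lookup: failure comes back as an errno hidden in the pointer,
 * so callers need IS_ERR_OR_NULL plus PTR_ERR to untangle it. */
static void *lookup_old(int exists)
{
	return exists ? (void *)0x1000 : ERR_PTR(-ENOENT);
}

/* New-style lookup: NULL simply means "not found". */
static void *lookup_new(int exists)
{
	return exists ? (void *)0x1000 : NULL;
}

int main(void)
{
	void *t = lookup_old(0);

	if (IS_ERR(t) || !t)
		printf("old style: err=%ld\n", t ? PTR_ERR(t) : -ENOENT);

	t = lookup_new(0);
	if (!t)
		printf("new style: err=%d\n", -ENOENT);
	return 0;
}

The simplification removes the easy-to-miss case where t looks like a pointer but is really a poisoned error value.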
diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c
index 7f9f45d829d2..2b1a15846f9a 100644
--- a/net/ipv6/netfilter/ip6t_MASQUERADE.c
+++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c
@@ -24,7 +24,7 @@
24static unsigned int 24static unsigned int
25masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par) 25masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par)
26{ 26{
27 return nf_nat_masquerade_ipv6(skb, par->targinfo, par->out); 27 return nf_nat_masquerade_ipv6(skb, par->targinfo, xt_out(par));
28} 28}
29 29
30static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) 30static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par)
diff --git a/net/ipv6/netfilter/ip6t_NPT.c b/net/ipv6/netfilter/ip6t_NPT.c
index 590f767db5d4..a379d2f79b19 100644
--- a/net/ipv6/netfilter/ip6t_NPT.c
+++ b/net/ipv6/netfilter/ip6t_NPT.c
@@ -112,6 +112,7 @@ static struct xt_target ip6t_npt_target_reg[] __read_mostly = {
112 .table = "mangle", 112 .table = "mangle",
113 .target = ip6t_snpt_tg, 113 .target = ip6t_snpt_tg,
114 .targetsize = sizeof(struct ip6t_npt_tginfo), 114 .targetsize = sizeof(struct ip6t_npt_tginfo),
115 .usersize = offsetof(struct ip6t_npt_tginfo, adjustment),
115 .checkentry = ip6t_npt_checkentry, 116 .checkentry = ip6t_npt_checkentry,
116 .family = NFPROTO_IPV6, 117 .family = NFPROTO_IPV6,
117 .hooks = (1 << NF_INET_LOCAL_IN) | 118 .hooks = (1 << NF_INET_LOCAL_IN) |
@@ -123,6 +124,7 @@ static struct xt_target ip6t_npt_target_reg[] __read_mostly = {
123 .table = "mangle", 124 .table = "mangle",
124 .target = ip6t_dnpt_tg, 125 .target = ip6t_dnpt_tg,
125 .targetsize = sizeof(struct ip6t_npt_tginfo), 126 .targetsize = sizeof(struct ip6t_npt_tginfo),
127 .usersize = offsetof(struct ip6t_npt_tginfo, adjustment),
126 .checkentry = ip6t_npt_checkentry, 128 .checkentry = ip6t_npt_checkentry,
127 .family = NFPROTO_IPV6, 129 .family = NFPROTO_IPV6,
128 .hooks = (1 << NF_INET_PRE_ROUTING) | 130 .hooks = (1 << NF_INET_PRE_ROUTING) |
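The new .usersize = offsetof(struct ip6t_npt_tginfo, adjustment) lines are assumed here to bound how much of the target info counts as user-visible, so the kernel-computed adjustment field at the tail is not copied back out with the rule. A generic sketch of that offsetof-as-boundary idea (struct tginfo and its fields are hypothetical stand-ins):

#include <stdio.h>
#include <stddef.h>
#include <string.h>

struct tginfo {
	unsigned char prefix_len;	/* supplied by userspace */
	unsigned short flags;		/* supplied by userspace */
	unsigned int adjustment;	/* filled in by the kernel */
};

/* Copy only the user-visible prefix of the blob; zero the rest so the
 * kernel-private tail never leaks back to userspace. */
static void copy_user_prefix(void *dst, const struct tginfo *src, size_t usersize)
{
	memset(dst, 0, sizeof(*src));
	memcpy(dst, src, usersize);
}

int main(void)
{
	struct tginfo kern = { 64, 0x3, 0xdeadbeef };
	struct tginfo out;
	size_t usersize = offsetof(struct tginfo, adjustment);

	copy_user_prefix(&out, &kern, usersize);
	printf("usersize=%zu, adjustment seen by user=%#x\n",
	       usersize, out.adjustment);
	return 0;
}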
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index db29bbf41b59..fa51a205918d 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -39,35 +39,40 @@ static unsigned int
39reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) 39reject_tg6(struct sk_buff *skb, const struct xt_action_param *par)
40{ 40{
41 const struct ip6t_reject_info *reject = par->targinfo; 41 const struct ip6t_reject_info *reject = par->targinfo;
42 struct net *net = par->net; 42 struct net *net = xt_net(par);
43 43
44 switch (reject->with) { 44 switch (reject->with) {
45 case IP6T_ICMP6_NO_ROUTE: 45 case IP6T_ICMP6_NO_ROUTE:
46 nf_send_unreach6(net, skb, ICMPV6_NOROUTE, par->hooknum); 46 nf_send_unreach6(net, skb, ICMPV6_NOROUTE, xt_hooknum(par));
47 break; 47 break;
48 case IP6T_ICMP6_ADM_PROHIBITED: 48 case IP6T_ICMP6_ADM_PROHIBITED:
49 nf_send_unreach6(net, skb, ICMPV6_ADM_PROHIBITED, par->hooknum); 49 nf_send_unreach6(net, skb, ICMPV6_ADM_PROHIBITED,
50 xt_hooknum(par));
50 break; 51 break;
51 case IP6T_ICMP6_NOT_NEIGHBOUR: 52 case IP6T_ICMP6_NOT_NEIGHBOUR:
52 nf_send_unreach6(net, skb, ICMPV6_NOT_NEIGHBOUR, par->hooknum); 53 nf_send_unreach6(net, skb, ICMPV6_NOT_NEIGHBOUR,
54 xt_hooknum(par));
53 break; 55 break;
54 case IP6T_ICMP6_ADDR_UNREACH: 56 case IP6T_ICMP6_ADDR_UNREACH:
55 nf_send_unreach6(net, skb, ICMPV6_ADDR_UNREACH, par->hooknum); 57 nf_send_unreach6(net, skb, ICMPV6_ADDR_UNREACH,
58 xt_hooknum(par));
56 break; 59 break;
57 case IP6T_ICMP6_PORT_UNREACH: 60 case IP6T_ICMP6_PORT_UNREACH:
58 nf_send_unreach6(net, skb, ICMPV6_PORT_UNREACH, par->hooknum); 61 nf_send_unreach6(net, skb, ICMPV6_PORT_UNREACH,
62 xt_hooknum(par));
59 break; 63 break;
60 case IP6T_ICMP6_ECHOREPLY: 64 case IP6T_ICMP6_ECHOREPLY:
61 /* Do nothing */ 65 /* Do nothing */
62 break; 66 break;
63 case IP6T_TCP_RESET: 67 case IP6T_TCP_RESET:
64 nf_send_reset6(net, skb, par->hooknum); 68 nf_send_reset6(net, skb, xt_hooknum(par));
65 break; 69 break;
66 case IP6T_ICMP6_POLICY_FAIL: 70 case IP6T_ICMP6_POLICY_FAIL:
67 nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, par->hooknum); 71 nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, xt_hooknum(par));
68 break; 72 break;
69 case IP6T_ICMP6_REJECT_ROUTE: 73 case IP6T_ICMP6_REJECT_ROUTE:
70 nf_send_unreach6(net, skb, ICMPV6_REJECT_ROUTE, par->hooknum); 74 nf_send_unreach6(net, skb, ICMPV6_REJECT_ROUTE,
75 xt_hooknum(par));
71 break; 76 break;
72 } 77 }
73 78
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 06bed74cf5ee..4ef1ddd4bbbd 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -71,8 +71,7 @@ synproxy_send_tcp(struct net *net,
71 skb_dst_set(nskb, dst); 71 skb_dst_set(nskb, dst);
72 72
73 if (nfct) { 73 if (nfct) {
74 nskb->nfct = nfct; 74 nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
75 nskb->nfctinfo = ctinfo;
76 nf_conntrack_get(nfct); 75 nf_conntrack_get(nfct);
77 } 76 }
78 77
@@ -121,8 +120,8 @@ synproxy_send_client_synack(struct net *net,
121 120
122 synproxy_build_options(nth, opts); 121 synproxy_build_options(nth, opts);
123 122
124 synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, 123 synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
125 niph, nth, tcp_hdr_size); 124 IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
126} 125}
127 126
128static void 127static void
@@ -244,8 +243,8 @@ synproxy_send_client_ack(struct net *net,
244 243
245 synproxy_build_options(nth, opts); 244 synproxy_build_options(nth, opts);
246 245
247 synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, 246 synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
248 niph, nth, tcp_hdr_size); 247 IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
249} 248}
250 249
251static bool 250static bool
@@ -277,12 +276,12 @@ static unsigned int
277synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) 276synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
278{ 277{
279 const struct xt_synproxy_info *info = par->targinfo; 278 const struct xt_synproxy_info *info = par->targinfo;
280 struct net *net = par->net; 279 struct net *net = xt_net(par);
281 struct synproxy_net *snet = synproxy_pernet(net); 280 struct synproxy_net *snet = synproxy_pernet(net);
282 struct synproxy_options opts = {}; 281 struct synproxy_options opts = {};
283 struct tcphdr *th, _th; 282 struct tcphdr *th, _th;
284 283
285 if (nf_ip6_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP)) 284 if (nf_ip6_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP))
286 return NF_DROP; 285 return NF_DROP;
287 286
288 th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); 287 th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
@@ -440,12 +439,12 @@ static int synproxy_tg6_check(const struct xt_tgchk_param *par)
440 e->ipv6.invflags & XT_INV_PROTO) 439 e->ipv6.invflags & XT_INV_PROTO)
441 return -EINVAL; 440 return -EINVAL;
442 441
443 return nf_ct_l3proto_try_module_get(par->family); 442 return nf_ct_netns_get(par->net, par->family);
444} 443}
445 444
446static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par) 445static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par)
447{ 446{
448 nf_ct_l3proto_module_put(par->family); 447 nf_ct_netns_put(par->net, par->family);
449} 448}
450 449
451static struct xt_target synproxy_tg6_reg __read_mostly = { 450static struct xt_target synproxy_tg6_reg __read_mostly = {
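nf_ct_set() above collapses the old skb->nfct / skb->nfctinfo pair into a single association set in one call. One common way to make a pointer and a tiny enum share one word is to tag the low bits of an aligned pointer; the sketch below shows that general technique, with the 3-bit packing being an assumption for illustration rather than a statement of how skb->_nfct is laid out.

#include <stdio.h>
#include <stdint.h>

enum ct_state { CT_NEW = 0, CT_RELATED = 1, CT_ESTABLISHED = 2 };

struct conn { int id; } __attribute__((aligned(8)));

/* Pack an 8-byte-aligned pointer and a small state value into one word;
 * the low 3 bits of the pointer are guaranteed to be zero. */
static uintptr_t ct_set(struct conn *c, enum ct_state s)
{
	return (uintptr_t)c | (uintptr_t)s;
}

static struct conn *ct_ptr(uintptr_t word)
{
	return (struct conn *)(word & ~(uintptr_t)7);
}

static enum ct_state ct_info(uintptr_t word)
{
	return (enum ct_state)(word & 7);
}

int main(void)
{
	struct conn c = { 42 };
	uintptr_t word = ct_set(&c, CT_RELATED);

	printf("id=%d state=%d\n", ct_ptr(word)->id, ct_info(word));
	return 0;
}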
diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c
index 1ee1b25df096..b12e61b7b16c 100644
--- a/net/ipv6/netfilter/ip6t_rpfilter.c
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -72,10 +72,10 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
72 return ret; 72 return ret;
73} 73}
74 74
75static bool rpfilter_is_local(const struct sk_buff *skb) 75static bool
76rpfilter_is_loopback(const struct sk_buff *skb, const struct net_device *in)
76{ 77{
77 const struct rt6_info *rt = (const void *) skb_dst(skb); 78 return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK;
78 return rt && (rt->rt6i_flags & RTF_LOCAL);
79} 79}
80 80
81static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) 81static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
@@ -85,7 +85,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
85 struct ipv6hdr *iph; 85 struct ipv6hdr *iph;
86 bool invert = info->flags & XT_RPFILTER_INVERT; 86 bool invert = info->flags & XT_RPFILTER_INVERT;
87 87
88 if (rpfilter_is_local(skb)) 88 if (rpfilter_is_loopback(skb, xt_in(par)))
89 return true ^ invert; 89 return true ^ invert;
90 90
91 iph = ipv6_hdr(skb); 91 iph = ipv6_hdr(skb);
@@ -93,7 +93,8 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
93 if (unlikely(saddrtype == IPV6_ADDR_ANY)) 93 if (unlikely(saddrtype == IPV6_ADDR_ANY))
94 return true ^ invert; /* not routable: forward path will drop it */ 94 return true ^ invert; /* not routable: forward path will drop it */
95 95
96 return rpfilter_lookup_reverse6(par->net, skb, par->in, info->flags) ^ invert; 96 return rpfilter_lookup_reverse6(xt_net(par), skb, xt_in(par),
97 info->flags) ^ invert;
97} 98}
98 99
99static int rpfilter_check(const struct xt_mtchk_param *par) 100static int rpfilter_check(const struct xt_mtchk_param *par)
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 963ee3848675..4e3402486833 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -34,6 +34,13 @@
34#include <net/netfilter/ipv6/nf_defrag_ipv6.h> 34#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
35#include <net/netfilter/nf_log.h> 35#include <net/netfilter/nf_log.h>
36 36
37static int conntrack6_net_id;
38static DEFINE_MUTEX(register_ipv6_hooks);
39
40struct conntrack6_net {
41 unsigned int users;
42};
43
37static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, 44static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
38 struct nf_conntrack_tuple *tuple) 45 struct nf_conntrack_tuple *tuple)
39{ 46{
@@ -308,6 +315,42 @@ static int ipv6_nlattr_tuple_size(void)
308} 315}
309#endif 316#endif
310 317
318static int ipv6_hooks_register(struct net *net)
319{
320 struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id);
321 int err = 0;
322
323 mutex_lock(&register_ipv6_hooks);
324 cnet->users++;
325 if (cnet->users > 1)
326 goto out_unlock;
327
328 err = nf_defrag_ipv6_enable(net);
329 if (err < 0) {
330 cnet->users = 0;
331 goto out_unlock;
332 }
333
334 err = nf_register_net_hooks(net, ipv6_conntrack_ops,
335 ARRAY_SIZE(ipv6_conntrack_ops));
336 if (err)
337 cnet->users = 0;
338 out_unlock:
339 mutex_unlock(&register_ipv6_hooks);
340 return err;
341}
342
343static void ipv6_hooks_unregister(struct net *net)
344{
345 struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id);
346
347 mutex_lock(&register_ipv6_hooks);
348 if (cnet->users && (--cnet->users == 0))
349 nf_unregister_net_hooks(net, ipv6_conntrack_ops,
350 ARRAY_SIZE(ipv6_conntrack_ops));
351 mutex_unlock(&register_ipv6_hooks);
352}
353
311struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = { 354struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = {
312 .l3proto = PF_INET6, 355 .l3proto = PF_INET6,
313 .name = "ipv6", 356 .name = "ipv6",
@@ -321,6 +364,8 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = {
321 .nlattr_to_tuple = ipv6_nlattr_to_tuple, 364 .nlattr_to_tuple = ipv6_nlattr_to_tuple,
322 .nla_policy = ipv6_nla_policy, 365 .nla_policy = ipv6_nla_policy,
323#endif 366#endif
367 .net_ns_get = ipv6_hooks_register,
368 .net_ns_put = ipv6_hooks_unregister,
324 .me = THIS_MODULE, 369 .me = THIS_MODULE,
325}; 370};
326 371
@@ -336,52 +381,51 @@ static struct nf_sockopt_ops so_getorigdst6 = {
336 .owner = THIS_MODULE, 381 .owner = THIS_MODULE,
337}; 382};
338 383
384static struct nf_conntrack_l4proto *builtin_l4proto6[] = {
385 &nf_conntrack_l4proto_tcp6,
386 &nf_conntrack_l4proto_udp6,
387 &nf_conntrack_l4proto_icmpv6,
388#ifdef CONFIG_NF_CT_PROTO_DCCP
389 &nf_conntrack_l4proto_dccp6,
390#endif
391#ifdef CONFIG_NF_CT_PROTO_SCTP
392 &nf_conntrack_l4proto_sctp6,
393#endif
394#ifdef CONFIG_NF_CT_PROTO_UDPLITE
395 &nf_conntrack_l4proto_udplite6,
396#endif
397};
398
339static int ipv6_net_init(struct net *net) 399static int ipv6_net_init(struct net *net)
340{ 400{
341 int ret = 0; 401 int ret = 0;
342 402
343 ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp6); 403 ret = nf_ct_l4proto_pernet_register(net, builtin_l4proto6,
344 if (ret < 0) { 404 ARRAY_SIZE(builtin_l4proto6));
345 pr_err("nf_conntrack_tcp6: pernet registration failed\n"); 405 if (ret < 0)
346 goto out; 406 return ret;
347 } 407
348 ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp6);
349 if (ret < 0) {
350 pr_err("nf_conntrack_udp6: pernet registration failed\n");
351 goto cleanup_tcp6;
352 }
353 ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmpv6);
354 if (ret < 0) {
355 pr_err("nf_conntrack_icmp6: pernet registration failed\n");
356 goto cleanup_udp6;
357 }
358 ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv6); 408 ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv6);
359 if (ret < 0) { 409 if (ret < 0) {
360 pr_err("nf_conntrack_ipv6: pernet registration failed.\n"); 410 pr_err("nf_conntrack_ipv6: pernet registration failed.\n");
361 goto cleanup_icmpv6; 411 nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6,
412 ARRAY_SIZE(builtin_l4proto6));
362 } 413 }
363 return 0;
364 cleanup_icmpv6:
365 nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmpv6);
366 cleanup_udp6:
367 nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp6);
368 cleanup_tcp6:
369 nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp6);
370 out:
371 return ret; 414 return ret;
372} 415}
373 416
374static void ipv6_net_exit(struct net *net) 417static void ipv6_net_exit(struct net *net)
375{ 418{
376 nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv6); 419 nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv6);
377 nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmpv6); 420 nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6,
378 nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp6); 421 ARRAY_SIZE(builtin_l4proto6));
379 nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp6);
380} 422}
381 423
382static struct pernet_operations ipv6_net_ops = { 424static struct pernet_operations ipv6_net_ops = {
383 .init = ipv6_net_init, 425 .init = ipv6_net_init,
384 .exit = ipv6_net_exit, 426 .exit = ipv6_net_exit,
427 .id = &conntrack6_net_id,
428 .size = sizeof(struct conntrack6_net),
385}; 429};
386 430
387static int __init nf_conntrack_l3proto_ipv6_init(void) 431static int __init nf_conntrack_l3proto_ipv6_init(void)
@@ -389,7 +433,6 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
389 int ret = 0; 433 int ret = 0;
390 434
391 need_conntrack(); 435 need_conntrack();
392 nf_defrag_ipv6_enable();
393 436
394 ret = nf_register_sockopt(&so_getorigdst6); 437 ret = nf_register_sockopt(&so_getorigdst6);
395 if (ret < 0) { 438 if (ret < 0) {
@@ -401,47 +444,20 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
401 if (ret < 0) 444 if (ret < 0)
402 goto cleanup_sockopt; 445 goto cleanup_sockopt;
403 446
404 ret = nf_register_hooks(ipv6_conntrack_ops, 447 ret = nf_ct_l4proto_register(builtin_l4proto6,
405 ARRAY_SIZE(ipv6_conntrack_ops)); 448 ARRAY_SIZE(builtin_l4proto6));
406 if (ret < 0) { 449 if (ret < 0)
407 pr_err("nf_conntrack_ipv6: can't register pre-routing defrag "
408 "hook.\n");
409 goto cleanup_pernet; 450 goto cleanup_pernet;
410 }
411
412 ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp6);
413 if (ret < 0) {
414 pr_err("nf_conntrack_ipv6: can't register tcp6 proto.\n");
415 goto cleanup_hooks;
416 }
417
418 ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp6);
419 if (ret < 0) {
420 pr_err("nf_conntrack_ipv6: can't register udp6 proto.\n");
421 goto cleanup_tcp6;
422 }
423
424 ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmpv6);
425 if (ret < 0) {
426 pr_err("nf_conntrack_ipv6: can't register icmpv6 proto.\n");
427 goto cleanup_udp6;
428 }
429 451
430 ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv6); 452 ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv6);
431 if (ret < 0) { 453 if (ret < 0) {
432 pr_err("nf_conntrack_ipv6: can't register ipv6 proto.\n"); 454 pr_err("nf_conntrack_ipv6: can't register ipv6 proto.\n");
433 goto cleanup_icmpv6; 455 goto cleanup_l4proto;
434 } 456 }
435 return ret; 457 return ret;
436 458cleanup_l4proto:
437 cleanup_icmpv6: 459 nf_ct_l4proto_unregister(builtin_l4proto6,
438 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); 460 ARRAY_SIZE(builtin_l4proto6));
439 cleanup_udp6:
440 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp6);
441 cleanup_tcp6:
442 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
443 cleanup_hooks:
444 nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops));
445 cleanup_pernet: 461 cleanup_pernet:
446 unregister_pernet_subsys(&ipv6_net_ops); 462 unregister_pernet_subsys(&ipv6_net_ops);
447 cleanup_sockopt: 463 cleanup_sockopt:
@@ -453,10 +469,8 @@ static void __exit nf_conntrack_l3proto_ipv6_fini(void)
453{ 469{
454 synchronize_net(); 470 synchronize_net();
455 nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv6); 471 nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv6);
456 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp6); 472 nf_ct_l4proto_unregister(builtin_l4proto6,
457 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp6); 473 ARRAY_SIZE(builtin_l4proto6));
458 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmpv6);
459 nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops));
460 unregister_pernet_subsys(&ipv6_net_ops); 474 unregister_pernet_subsys(&ipv6_net_ops);
461 nf_unregister_sockopt(&so_getorigdst6); 475 nf_unregister_sockopt(&so_getorigdst6);
462} 476}
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index f5a61bc3ec2b..d2c2ccbfbe72 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -145,15 +145,15 @@ static int
145icmpv6_error_message(struct net *net, struct nf_conn *tmpl, 145icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
146 struct sk_buff *skb, 146 struct sk_buff *skb,
147 unsigned int icmp6off, 147 unsigned int icmp6off,
148 enum ip_conntrack_info *ctinfo,
149 unsigned int hooknum) 148 unsigned int hooknum)
150{ 149{
151 struct nf_conntrack_tuple intuple, origtuple; 150 struct nf_conntrack_tuple intuple, origtuple;
152 const struct nf_conntrack_tuple_hash *h; 151 const struct nf_conntrack_tuple_hash *h;
153 const struct nf_conntrack_l4proto *inproto; 152 const struct nf_conntrack_l4proto *inproto;
153 enum ip_conntrack_info ctinfo;
154 struct nf_conntrack_zone tmp; 154 struct nf_conntrack_zone tmp;
155 155
156 NF_CT_ASSERT(skb->nfct == NULL); 156 NF_CT_ASSERT(!skb_nfct(skb));
157 157
158 /* Are they talking about one of our connections? */ 158 /* Are they talking about one of our connections? */
159 if (!nf_ct_get_tuplepr(skb, 159 if (!nf_ct_get_tuplepr(skb,
@@ -176,7 +176,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
176 return -NF_ACCEPT; 176 return -NF_ACCEPT;
177 } 177 }
178 178
179 *ctinfo = IP_CT_RELATED; 179 ctinfo = IP_CT_RELATED;
180 180
181 h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp), 181 h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp),
182 &intuple); 182 &intuple);
@@ -185,19 +185,18 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
185 return -NF_ACCEPT; 185 return -NF_ACCEPT;
186 } else { 186 } else {
187 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) 187 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
188 *ctinfo += IP_CT_IS_REPLY; 188 ctinfo += IP_CT_IS_REPLY;
189 } 189 }
190 190
191 /* Update skb to refer to this connection */ 191 /* Update skb to refer to this connection */
192 skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; 192 nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo);
193 skb->nfctinfo = *ctinfo;
194 return NF_ACCEPT; 193 return NF_ACCEPT;
195} 194}
196 195
197static int 196static int
198icmpv6_error(struct net *net, struct nf_conn *tmpl, 197icmpv6_error(struct net *net, struct nf_conn *tmpl,
199 struct sk_buff *skb, unsigned int dataoff, 198 struct sk_buff *skb, unsigned int dataoff,
200 enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) 199 u8 pf, unsigned int hooknum)
201{ 200{
202 const struct icmp6hdr *icmp6h; 201 const struct icmp6hdr *icmp6h;
203 struct icmp6hdr _ih; 202 struct icmp6hdr _ih;
@@ -222,9 +221,8 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
222 type = icmp6h->icmp6_type - 130; 221 type = icmp6h->icmp6_type - 130;
223 if (type >= 0 && type < sizeof(noct_valid_new) && 222 if (type >= 0 && type < sizeof(noct_valid_new) &&
224 noct_valid_new[type]) { 223 noct_valid_new[type]) {
225 skb->nfct = &nf_ct_untracked_get()->ct_general; 224 nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
226 skb->nfctinfo = IP_CT_NEW; 225 nf_conntrack_get(skb_nfct(skb));
227 nf_conntrack_get(skb->nfct);
228 return NF_ACCEPT; 226 return NF_ACCEPT;
229 } 227 }
230 228
@@ -232,7 +230,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
232 if (icmp6h->icmp6_type >= 128) 230 if (icmp6h->icmp6_type >= 128)
233 return NF_ACCEPT; 231 return NF_ACCEPT;
234 232
235 return icmpv6_error_message(net, tmpl, skb, dataoff, ctinfo, hooknum); 233 return icmpv6_error_message(net, tmpl, skb, dataoff, hooknum);
236} 234}
237 235
238#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 236#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 9948b5ce52da..986d4ca38832 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -589,6 +589,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
589 hdr = ipv6_hdr(skb); 589 hdr = ipv6_hdr(skb);
590 fhdr = (struct frag_hdr *)skb_transport_header(skb); 590 fhdr = (struct frag_hdr *)skb_transport_header(skb);
591 591
592 skb_orphan(skb);
592 fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, 593 fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
593 skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); 594 skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
594 if (fq == NULL) { 595 if (fq == NULL) {
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index f06b0471f39f..ada60d1a991b 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -30,12 +30,14 @@
30#include <net/netfilter/nf_conntrack_zones.h> 30#include <net/netfilter/nf_conntrack_zones.h>
31#include <net/netfilter/ipv6/nf_defrag_ipv6.h> 31#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
32 32
33static DEFINE_MUTEX(defrag6_mutex);
34
33static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, 35static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
34 struct sk_buff *skb) 36 struct sk_buff *skb)
35{ 37{
36 u16 zone_id = NF_CT_DEFAULT_ZONE_ID; 38 u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
37#if IS_ENABLED(CONFIG_NF_CONNTRACK) 39#if IS_ENABLED(CONFIG_NF_CONNTRACK)
38 if (skb->nfct) { 40 if (skb_nfct(skb)) {
39 enum ip_conntrack_info ctinfo; 41 enum ip_conntrack_info ctinfo;
40 const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); 42 const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
41 43
@@ -59,7 +61,7 @@ static unsigned int ipv6_defrag(void *priv,
59 61
60#if IS_ENABLED(CONFIG_NF_CONNTRACK) 62#if IS_ENABLED(CONFIG_NF_CONNTRACK)
61 /* Previously seen (loopback)? */ 63 /* Previously seen (loopback)? */
62 if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) 64 if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
63 return NF_ACCEPT; 65 return NF_ACCEPT;
64#endif 66#endif
65 67
@@ -87,6 +89,19 @@ static struct nf_hook_ops ipv6_defrag_ops[] = {
87 }, 89 },
88}; 90};
89 91
92static void __net_exit defrag6_net_exit(struct net *net)
93{
94 if (net->nf.defrag_ipv6) {
95 nf_unregister_net_hooks(net, ipv6_defrag_ops,
96 ARRAY_SIZE(ipv6_defrag_ops));
97 net->nf.defrag_ipv6 = false;
98 }
99}
100
101static struct pernet_operations defrag6_net_ops = {
102 .exit = defrag6_net_exit,
103};
104
90static int __init nf_defrag_init(void) 105static int __init nf_defrag_init(void)
91{ 106{
92 int ret = 0; 107 int ret = 0;
@@ -96,9 +111,9 @@ static int __init nf_defrag_init(void)
96 pr_err("nf_defrag_ipv6: can't initialize frag6.\n"); 111 pr_err("nf_defrag_ipv6: can't initialize frag6.\n");
97 return ret; 112 return ret;
98 } 113 }
99 ret = nf_register_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); 114 ret = register_pernet_subsys(&defrag6_net_ops);
100 if (ret < 0) { 115 if (ret < 0) {
101 pr_err("nf_defrag_ipv6: can't register hooks\n"); 116 pr_err("nf_defrag_ipv6: can't register pernet ops\n");
102 goto cleanup_frag6; 117 goto cleanup_frag6;
103 } 118 }
104 return ret; 119 return ret;
@@ -111,12 +126,31 @@ cleanup_frag6:
111 126
112static void __exit nf_defrag_fini(void) 127static void __exit nf_defrag_fini(void)
113{ 128{
114 nf_unregister_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); 129 unregister_pernet_subsys(&defrag6_net_ops);
115 nf_ct_frag6_cleanup(); 130 nf_ct_frag6_cleanup();
116} 131}
117 132
118void nf_defrag_ipv6_enable(void) 133int nf_defrag_ipv6_enable(struct net *net)
119{ 134{
135 int err = 0;
136
137 might_sleep();
138
139 if (net->nf.defrag_ipv6)
140 return 0;
141
142 mutex_lock(&defrag6_mutex);
143 if (net->nf.defrag_ipv6)
144 goto out_unlock;
145
146 err = nf_register_net_hooks(net, ipv6_defrag_ops,
147 ARRAY_SIZE(ipv6_defrag_ops));
148 if (err == 0)
149 net->nf.defrag_ipv6 = true;
150
151 out_unlock:
152 mutex_unlock(&defrag6_mutex);
153 return err;
120} 154}
121EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable); 155EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable);
122 156
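The reworked nf_defrag_ipv6_enable() is idempotent per namespace: a cheap check of net->nf.defrag_ipv6, then a re-check under defrag6_mutex before registering the hooks, so concurrent enablers cannot double-register. A minimal single-threaded sketch of that check/lock/re-check shape (names are hypothetical):

#include <stdio.h>
#include <stdbool.h>
#include <pthread.h>

struct netns {
	bool defrag_enabled;
};

static pthread_mutex_t defrag_lock = PTHREAD_MUTEX_INITIALIZER;

static int register_defrag_hooks(void)
{
	printf("defrag hooks on\n");
	return 0;
}

static int defrag_enable(struct netns *net)
{
	int err = 0;

	if (net->defrag_enabled)	/* fast path: already enabled */
		return 0;

	pthread_mutex_lock(&defrag_lock);
	if (net->defrag_enabled)	/* raced with another enabler */
		goto out;
	err = register_defrag_hooks();
	if (err == 0)
		net->defrag_enabled = true;
out:
	pthread_mutex_unlock(&defrag_lock);
	return err;
}

int main(void)
{
	struct netns ns = { false };

	defrag_enable(&ns);
	defrag_enable(&ns);	/* second call is a no-op */
	return 0;
}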
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
index 4a84b5ad9ecb..888ecd106e5f 100644
--- a/net/ipv6/netfilter/nf_dup_ipv6.c
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -57,10 +57,9 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,
57 return; 57 return;
58 58
59#if IS_ENABLED(CONFIG_NF_CONNTRACK) 59#if IS_ENABLED(CONFIG_NF_CONNTRACK)
60 nf_conntrack_put(skb->nfct); 60 nf_reset(skb);
61 skb->nfct = &nf_ct_untracked_get()->ct_general; 61 nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
62 skb->nfctinfo = IP_CT_NEW; 62 nf_conntrack_get(skb_nfct(skb));
63 nf_conntrack_get(skb->nfct);
64#endif 63#endif
65 if (hooknum == NF_INET_PRE_ROUTING || 64 if (hooknum == NF_INET_PRE_ROUTING ||
66 hooknum == NF_INET_LOCAL_IN) { 65 hooknum == NF_INET_LOCAL_IN) {
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index 57d86066a13b..97c724224da7 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -64,7 +64,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
64 nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); 64 nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
65 65
66 /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ 66 /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
67 nf_log_buf_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", 67 nf_log_buf_add(m, "LEN=%zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
68 ntohs(ih->payload_len) + sizeof(struct ipv6hdr), 68 ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
69 (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, 69 (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
70 ih->hop_limit, 70 ih->hop_limit,
@@ -351,7 +351,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
351 struct nf_log_buf *m; 351 struct nf_log_buf *m;
352 352
353 /* FIXME: Disabled from containers until syslog ns is supported */ 353 /* FIXME: Disabled from containers until syslog ns is supported */
354 if (!net_eq(net, &init_net)) 354 if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
355 return; 355 return;
356 356
357 m = nf_log_buf_open(); 357 m = nf_log_buf_open();
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index 10090400c72f..eedee5d108d9 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -157,6 +157,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
157 fl6.fl6_sport = otcph->dest; 157 fl6.fl6_sport = otcph->dest;
158 fl6.fl6_dport = otcph->source; 158 fl6.fl6_dport = otcph->source;
159 fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev); 159 fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev);
160 fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark);
160 security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); 161 security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6));
161 dst = ip6_route_output(net, NULL, &fl6); 162 dst = ip6_route_output(net, NULL, &fl6);
162 if (dst->error) { 163 if (dst->error) {
@@ -180,6 +181,8 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
180 181
181 skb_dst_set(nskb, dst); 182 skb_dst_set(nskb, dst);
182 183
184 nskb->mark = fl6.flowi6_mark;
185
183 skb_reserve(nskb, hh_len + dst->header_len); 186 skb_reserve(nskb, hh_len + dst->header_len);
184 ip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, 187 ip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP,
185 ip6_dst_hoplimit(dst)); 188 ip6_dst_hoplimit(dst));
diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c
new file mode 100644
index 000000000000..ebb2bf84232a
--- /dev/null
+++ b/net/ipv6/netfilter/nf_socket_ipv6.c
@@ -0,0 +1,151 @@
1/*
2 * Copyright (C) 2007-2008 BalaBit IT Ltd.
3 * Author: Krisztian Kovacs
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 */
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <net/tcp.h>
14#include <net/udp.h>
15#include <net/icmp.h>
16#include <net/sock.h>
17#include <net/inet_sock.h>
18#include <net/inet6_hashtables.h>
19#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
20#include <net/netfilter/nf_socket.h>
21#if IS_ENABLED(CONFIG_NF_CONNTRACK)
22#include <net/netfilter/nf_conntrack.h>
23#endif
24
25static int
26extract_icmp6_fields(const struct sk_buff *skb,
27 unsigned int outside_hdrlen,
28 int *protocol,
29 const struct in6_addr **raddr,
30 const struct in6_addr **laddr,
31 __be16 *rport,
32 __be16 *lport,
33 struct ipv6hdr *ipv6_var)
34{
35 const struct ipv6hdr *inside_iph;
36 struct icmp6hdr *icmph, _icmph;
37 __be16 *ports, _ports[2];
38 u8 inside_nexthdr;
39 __be16 inside_fragoff;
40 int inside_hdrlen;
41
42 icmph = skb_header_pointer(skb, outside_hdrlen,
43 sizeof(_icmph), &_icmph);
44 if (icmph == NULL)
45 return 1;
46
47 if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK)
48 return 1;
49
50 inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph),
51 sizeof(*ipv6_var), ipv6_var);
52 if (inside_iph == NULL)
53 return 1;
54 inside_nexthdr = inside_iph->nexthdr;
55
56 inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) +
57 sizeof(*ipv6_var),
58 &inside_nexthdr, &inside_fragoff);
59 if (inside_hdrlen < 0)
60 return 1; /* hjm: Packet has no/incomplete transport layer headers. */
61
62 if (inside_nexthdr != IPPROTO_TCP &&
63 inside_nexthdr != IPPROTO_UDP)
64 return 1;
65
66 ports = skb_header_pointer(skb, inside_hdrlen,
67 sizeof(_ports), &_ports);
68 if (ports == NULL)
69 return 1;
70
71 /* the inside IP packet is the one quoted from our side, thus
72 * its saddr is the local address */
73 *protocol = inside_nexthdr;
74 *laddr = &inside_iph->saddr;
75 *lport = ports[0];
76 *raddr = &inside_iph->daddr;
77 *rport = ports[1];
78
79 return 0;
80}
81
82static struct sock *
83nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff,
84 const u8 protocol,
85 const struct in6_addr *saddr, const struct in6_addr *daddr,
86 const __be16 sport, const __be16 dport,
87 const struct net_device *in)
88{
89 switch (protocol) {
90 case IPPROTO_TCP:
91 return inet6_lookup(net, &tcp_hashinfo, skb, doff,
92 saddr, sport, daddr, dport,
93 in->ifindex);
94 case IPPROTO_UDP:
95 return udp6_lib_lookup(net, saddr, sport, daddr, dport,
96 in->ifindex);
97 }
98
99 return NULL;
100}
101
102struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
103 const struct net_device *indev)
104{
105 __be16 uninitialized_var(dport), uninitialized_var(sport);
106 const struct in6_addr *daddr = NULL, *saddr = NULL;
107 struct ipv6hdr *iph = ipv6_hdr(skb);
108 struct sk_buff *data_skb = NULL;
109 int doff = 0;
110 int thoff = 0, tproto;
111
112 tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
113 if (tproto < 0) {
114 pr_debug("unable to find transport header in IPv6 packet, dropping\n");
115 return NULL;
116 }
117
118 if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) {
119 struct udphdr _hdr, *hp;
120
121 hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
122 if (hp == NULL)
123 return NULL;
124
125 saddr = &iph->saddr;
126 sport = hp->source;
127 daddr = &iph->daddr;
128 dport = hp->dest;
129 data_skb = (struct sk_buff *)skb;
130 doff = tproto == IPPROTO_TCP ?
131 thoff + __tcp_hdrlen((struct tcphdr *)hp) :
132 thoff + sizeof(*hp);
133
134 } else if (tproto == IPPROTO_ICMPV6) {
135 struct ipv6hdr ipv6_var;
136
137 if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
138 &sport, &dport, &ipv6_var))
139 return NULL;
140 } else {
141 return NULL;
142 }
143
144 return nf_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr,
145 sport, dport, indev);
146}
147EXPORT_SYMBOL_GPL(nf_sk_lookup_slow_v6);
148
149MODULE_LICENSE("GPL");
150MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler");
151MODULE_DESCRIPTION("Netfilter IPv6 socket lookup infrastructure");
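
nf_sk_lookup_slow_v6() above recovers the socket tuple either from the transport header or, for ICMPv6 errors, from the packet quoted inside the error via extract_icmp6_fields(). A minimal, self-contained userspace sketch of that second case follows, assuming the quoted packet carries no extension headers (the kernel walks them with ipv6_skip_exthdr()); struct and function names here are illustrative only:

#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <stddef.h>
#include <stdint.h>

struct sk_tuple {
	int proto;
	struct in6_addr laddr, raddr;
	uint16_t lport, rport;	/* network byte order */
};

/* buf points at the ICMPv6 header of an error message, len is what we have */
static int parse_icmp6_error(const unsigned char *buf, size_t len,
			     struct sk_tuple *t)
{
	const struct icmp6_hdr *icmp6 = (const struct icmp6_hdr *)buf;
	const struct ip6_hdr *inner;
	const uint16_t *ports;

	if (len < sizeof(*icmp6) + sizeof(*inner) + 4)
		return -1;
	if (icmp6->icmp6_type & ICMP6_INFOMSG_MASK)
		return -1;		/* only error messages quote a packet */

	inner = (const struct ip6_hdr *)(buf + sizeof(*icmp6));
	if (inner->ip6_nxt != IPPROTO_TCP && inner->ip6_nxt != IPPROTO_UDP)
		return -1;

	ports = (const uint16_t *)(buf + sizeof(*icmp6) + sizeof(*inner));

	/* the quoted packet was sent by us, so its source is the local side */
	t->proto = inner->ip6_nxt;
	t->laddr = inner->ip6_src;
	t->lport = ports[0];
	t->raddr = inner->ip6_dst;
	t->rport = ports[1];
	return 0;
}
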
diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c
index 831f86e1ec08..d8b5b60b7d53 100644
--- a/net/ipv6/netfilter/nft_dup_ipv6.c
+++ b/net/ipv6/netfilter/nft_dup_ipv6.c
@@ -28,7 +28,7 @@ static void nft_dup_ipv6_eval(const struct nft_expr *expr,
28 struct in6_addr *gw = (struct in6_addr *)&regs->data[priv->sreg_addr]; 28 struct in6_addr *gw = (struct in6_addr *)&regs->data[priv->sreg_addr];
29 int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1; 29 int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1;
30 30
31 nf_dup_ipv6(pkt->net, pkt->skb, pkt->hook, gw, oif); 31 nf_dup_ipv6(nft_net(pkt), pkt->skb, nft_hook(pkt), gw, oif);
32} 32}
33 33
34static int nft_dup_ipv6_init(const struct nft_ctx *ctx, 34static int nft_dup_ipv6_init(const struct nft_ctx *ctx,
diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c
new file mode 100644
index 000000000000..765facf03d45
--- /dev/null
+++ b/net/ipv6/netfilter/nft_fib_ipv6.c
@@ -0,0 +1,270 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License version 2 as
4 * published by the Free Software Foundation.
5 */
6
7#include <linux/kernel.h>
8#include <linux/init.h>
9#include <linux/module.h>
10#include <linux/netlink.h>
11#include <linux/netfilter.h>
12#include <linux/netfilter/nf_tables.h>
13#include <linux/netfilter_ipv6.h>
14#include <net/netfilter/nf_tables_core.h>
15#include <net/netfilter/nf_tables.h>
16#include <net/netfilter/nft_fib.h>
17
18#include <net/ip6_fib.h>
19#include <net/ip6_route.h>
20
21static int get_ifindex(const struct net_device *dev)
22{
23 return dev ? dev->ifindex : 0;
24}
25
26static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv,
27 const struct nft_pktinfo *pkt,
28 const struct net_device *dev)
29{
30 const struct ipv6hdr *iph = ipv6_hdr(pkt->skb);
31 int lookup_flags = 0;
32
33 if (priv->flags & NFTA_FIB_F_DADDR) {
34 fl6->daddr = iph->daddr;
35 fl6->saddr = iph->saddr;
36 } else {
37 fl6->daddr = iph->saddr;
38 fl6->saddr = iph->daddr;
39 }
40
41 if (ipv6_addr_type(&fl6->daddr) & IPV6_ADDR_LINKLOCAL) {
42 lookup_flags |= RT6_LOOKUP_F_IFACE;
43 fl6->flowi6_oif = get_ifindex(dev ? dev : pkt->skb->dev);
44 }
45
46 if (ipv6_addr_type(&fl6->saddr) & IPV6_ADDR_UNICAST)
47 lookup_flags |= RT6_LOOKUP_F_HAS_SADDR;
48
49 if (priv->flags & NFTA_FIB_F_MARK)
50 fl6->flowi6_mark = pkt->skb->mark;
51
52 fl6->flowlabel = (*(__be32 *)iph) & IPV6_FLOWINFO_MASK;
53
54 return lookup_flags;
55}
56
57static u32 __nft_fib6_eval_type(const struct nft_fib *priv,
58 const struct nft_pktinfo *pkt)
59{
60 const struct net_device *dev = NULL;
61 const struct nf_ipv6_ops *v6ops;
62 const struct nf_afinfo *afinfo;
63 int route_err, addrtype;
64 struct rt6_info *rt;
65 struct flowi6 fl6 = {
66 .flowi6_iif = LOOPBACK_IFINDEX,
67 .flowi6_proto = pkt->tprot,
68 };
69 u32 ret = 0;
70
71 afinfo = nf_get_afinfo(NFPROTO_IPV6);
72 if (!afinfo)
73 return RTN_UNREACHABLE;
74
75 if (priv->flags & NFTA_FIB_F_IIF)
76 dev = nft_in(pkt);
77 else if (priv->flags & NFTA_FIB_F_OIF)
78 dev = nft_out(pkt);
79
80 nft_fib6_flowi_init(&fl6, priv, pkt, dev);
81
82 v6ops = nf_get_ipv6_ops();
83 if (dev && v6ops && v6ops->chk_addr(nft_net(pkt), &fl6.daddr, dev, true))
84 ret = RTN_LOCAL;
85
86 route_err = afinfo->route(nft_net(pkt), (struct dst_entry **)&rt,
87 flowi6_to_flowi(&fl6), false);
88 if (route_err)
89 goto err;
90
91 if (rt->rt6i_flags & RTF_REJECT) {
92 route_err = rt->dst.error;
93 dst_release(&rt->dst);
94 goto err;
95 }
96
97 if (ipv6_anycast_destination((struct dst_entry *)rt, &fl6.daddr))
98 ret = RTN_ANYCAST;
99 else if (!dev && rt->rt6i_flags & RTF_LOCAL)
100 ret = RTN_LOCAL;
101
102 dst_release(&rt->dst);
103
104 if (ret)
105 return ret;
106
107 addrtype = ipv6_addr_type(&fl6.daddr);
108
109 if (addrtype & IPV6_ADDR_MULTICAST)
110 return RTN_MULTICAST;
111 if (addrtype & IPV6_ADDR_UNICAST)
112 return RTN_UNICAST;
113
114 return RTN_UNSPEC;
115 err:
116 switch (route_err) {
117 case -EINVAL:
118 return RTN_BLACKHOLE;
119 case -EACCES:
120 return RTN_PROHIBIT;
121 case -EAGAIN:
122 return RTN_THROW;
123 default:
124 break;
125 }
126
127 return RTN_UNREACHABLE;
128}
129
130void nft_fib6_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
131 const struct nft_pktinfo *pkt)
132{
133 const struct nft_fib *priv = nft_expr_priv(expr);
134 u32 *dest = &regs->data[priv->dreg];
135
136 *dest = __nft_fib6_eval_type(priv, pkt);
137}
138EXPORT_SYMBOL_GPL(nft_fib6_eval_type);
139
140void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
141 const struct nft_pktinfo *pkt)
142{
143 const struct nft_fib *priv = nft_expr_priv(expr);
144 const struct net_device *oif = NULL;
145 u32 *dest = &regs->data[priv->dreg];
146 struct flowi6 fl6 = {
147 .flowi6_iif = LOOPBACK_IFINDEX,
148 .flowi6_proto = pkt->tprot,
149 };
150 struct rt6_info *rt;
151 int lookup_flags;
152
153 if (priv->flags & NFTA_FIB_F_IIF)
154 oif = nft_in(pkt);
155 else if (priv->flags & NFTA_FIB_F_OIF)
156 oif = nft_out(pkt);
157
158 lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif);
159
160 if (nft_hook(pkt) == NF_INET_PRE_ROUTING &&
161 nft_fib_is_loopback(pkt->skb, nft_in(pkt))) {
162 nft_fib_store_result(dest, priv->result, pkt,
163 nft_in(pkt)->ifindex);
164 return;
165 }
166
167 *dest = 0;
168 again:
169 rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, lookup_flags);
170 if (rt->dst.error)
171 goto put_rt_err;
172
173 /* Should not see RTF_LOCAL here */
174 if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL))
175 goto put_rt_err;
176
177 if (oif && oif != rt->rt6i_idev->dev) {
178 /* multipath route? Try again with F_IFACE */
179 if ((lookup_flags & RT6_LOOKUP_F_IFACE) == 0) {
180 lookup_flags |= RT6_LOOKUP_F_IFACE;
181 fl6.flowi6_oif = oif->ifindex;
182 ip6_rt_put(rt);
183 goto again;
184 }
185 }
186
187 switch (priv->result) {
188 case NFT_FIB_RESULT_OIF:
189 *dest = rt->rt6i_idev->dev->ifindex;
190 break;
191 case NFT_FIB_RESULT_OIFNAME:
192 strncpy((char *)dest, rt->rt6i_idev->dev->name, IFNAMSIZ);
193 break;
194 default:
195 WARN_ON_ONCE(1);
196 break;
197 }
198
199 put_rt_err:
200 ip6_rt_put(rt);
201}
202EXPORT_SYMBOL_GPL(nft_fib6_eval);
203
204static struct nft_expr_type nft_fib6_type;
205
206static const struct nft_expr_ops nft_fib6_type_ops = {
207 .type = &nft_fib6_type,
208 .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)),
209 .eval = nft_fib6_eval_type,
210 .init = nft_fib_init,
211 .dump = nft_fib_dump,
212 .validate = nft_fib_validate,
213};
214
215static const struct nft_expr_ops nft_fib6_ops = {
216 .type = &nft_fib6_type,
217 .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)),
218 .eval = nft_fib6_eval,
219 .init = nft_fib_init,
220 .dump = nft_fib_dump,
221 .validate = nft_fib_validate,
222};
223
224static const struct nft_expr_ops *
225nft_fib6_select_ops(const struct nft_ctx *ctx,
226 const struct nlattr * const tb[])
227{
228 enum nft_fib_result result;
229
230 if (!tb[NFTA_FIB_RESULT])
231 return ERR_PTR(-EINVAL);
232
233 result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT]));
234
235 switch (result) {
236 case NFT_FIB_RESULT_OIF:
237 return &nft_fib6_ops;
238 case NFT_FIB_RESULT_OIFNAME:
239 return &nft_fib6_ops;
240 case NFT_FIB_RESULT_ADDRTYPE:
241 return &nft_fib6_type_ops;
242 default:
243 return ERR_PTR(-EOPNOTSUPP);
244 }
245}
246
247static struct nft_expr_type nft_fib6_type __read_mostly = {
248 .name = "fib",
249 .select_ops = &nft_fib6_select_ops,
250 .policy = nft_fib_policy,
251 .maxattr = NFTA_FIB_MAX,
252 .family = NFPROTO_IPV6,
253 .owner = THIS_MODULE,
254};
255
256static int __init nft_fib6_module_init(void)
257{
258 return nft_register_expr(&nft_fib6_type);
259}
260
261static void __exit nft_fib6_module_exit(void)
262{
263 nft_unregister_expr(&nft_fib6_type);
264}
265module_init(nft_fib6_module_init);
266module_exit(nft_fib6_module_exit);
267
268MODULE_LICENSE("GPL");
269MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
270MODULE_ALIAS_NFT_AF_EXPR(10, "fib");
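
When the FIB lookup in __nft_fib6_eval_type() fails, the error code is folded into an address-type verdict for the fib expression. The mapping is small enough to restate as a stand-alone sketch (constants come from <linux/rtnetlink.h> and <errno.h>; the helper name is invented):

#include <errno.h>
#include <linux/rtnetlink.h>

static unsigned int fib6_route_err_to_rtn(int route_err)
{
	switch (route_err) {
	case -EINVAL:
		return RTN_BLACKHOLE;	/* blackhole route: drop silently */
	case -EACCES:
		return RTN_PROHIBIT;	/* prohibit route: admin prohibited */
	case -EAGAIN:
		return RTN_THROW;	/* throw route: continue lookup elsewhere */
	default:
		return RTN_UNREACHABLE;	/* no usable route */
	}
}
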
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c
index 9597ffb74077..4146536e9c15 100644
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> 2 * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 5 * it under the terms of the GNU General Public License version 2 as
@@ -27,12 +27,19 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr,
27 memset(&range, 0, sizeof(range)); 27 memset(&range, 0, sizeof(range));
28 range.flags = priv->flags; 28 range.flags = priv->flags;
29 if (priv->sreg_proto_min) { 29 if (priv->sreg_proto_min) {
30 range.min_proto.all = 30 range.min_proto.all = (__force __be16)nft_reg_load16(
31 *(__be16 *)&regs->data[priv->sreg_proto_min]; 31 &regs->data[priv->sreg_proto_min]);
32 range.max_proto.all = 32 range.max_proto.all = (__force __be16)nft_reg_load16(
33 *(__be16 *)&regs->data[priv->sreg_proto_max]; 33 &regs->data[priv->sreg_proto_max]);
34 } 34 }
35 regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out); 35 regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range,
36 nft_out(pkt));
37}
38
39static void
40nft_masq_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
41{
42 nf_ct_netns_put(ctx->net, NFPROTO_IPV6);
36} 43}
37 44
38static struct nft_expr_type nft_masq_ipv6_type; 45static struct nft_expr_type nft_masq_ipv6_type;
@@ -41,6 +48,7 @@ static const struct nft_expr_ops nft_masq_ipv6_ops = {
41 .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), 48 .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
42 .eval = nft_masq_ipv6_eval, 49 .eval = nft_masq_ipv6_eval,
43 .init = nft_masq_init, 50 .init = nft_masq_init,
51 .destroy = nft_masq_ipv6_destroy,
44 .dump = nft_masq_dump, 52 .dump = nft_masq_dump,
45 .validate = nft_masq_validate, 53 .validate = nft_masq_validate,
46}; 54};
@@ -77,5 +85,5 @@ module_init(nft_masq_ipv6_module_init);
77module_exit(nft_masq_ipv6_module_exit); 85module_exit(nft_masq_ipv6_module_exit);
78 86
79MODULE_LICENSE("GPL"); 87MODULE_LICENSE("GPL");
80MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); 88MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
81MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "masq"); 89MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "masq");
diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c
index aca44e89a881..a27e424f690d 100644
--- a/net/ipv6/netfilter/nft_redir_ipv6.c
+++ b/net/ipv6/netfilter/nft_redir_ipv6.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> 2 * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 5 * it under the terms of the GNU General Public License version 2 as
@@ -26,16 +26,23 @@ static void nft_redir_ipv6_eval(const struct nft_expr *expr,
26 26
27 memset(&range, 0, sizeof(range)); 27 memset(&range, 0, sizeof(range));
28 if (priv->sreg_proto_min) { 28 if (priv->sreg_proto_min) {
29 range.min_proto.all = 29 range.min_proto.all = (__force __be16)nft_reg_load16(
30 *(__be16 *)&regs->data[priv->sreg_proto_min], 30 &regs->data[priv->sreg_proto_min]);
31 range.max_proto.all = 31 range.max_proto.all = (__force __be16)nft_reg_load16(
32 *(__be16 *)&regs->data[priv->sreg_proto_max], 32 &regs->data[priv->sreg_proto_max]);
33 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 33 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
34 } 34 }
35 35
36 range.flags |= priv->flags; 36 range.flags |= priv->flags;
37 37
38 regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range, pkt->hook); 38 regs->verdict.code =
39 nf_nat_redirect_ipv6(pkt->skb, &range, nft_hook(pkt));
40}
41
42static void
43nft_redir_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
44{
45 nf_ct_netns_put(ctx->net, NFPROTO_IPV6);
39} 46}
40 47
41static struct nft_expr_type nft_redir_ipv6_type; 48static struct nft_expr_type nft_redir_ipv6_type;
@@ -44,6 +51,7 @@ static const struct nft_expr_ops nft_redir_ipv6_ops = {
44 .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), 51 .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
45 .eval = nft_redir_ipv6_eval, 52 .eval = nft_redir_ipv6_eval,
46 .init = nft_redir_init, 53 .init = nft_redir_init,
54 .destroy = nft_redir_ipv6_destroy,
47 .dump = nft_redir_dump, 55 .dump = nft_redir_dump,
48 .validate = nft_redir_validate, 56 .validate = nft_redir_validate,
49}; 57};
@@ -71,5 +79,5 @@ module_init(nft_redir_ipv6_module_init);
71module_exit(nft_redir_ipv6_module_exit); 79module_exit(nft_redir_ipv6_module_exit);
72 80
73MODULE_LICENSE("GPL"); 81MODULE_LICENSE("GPL");
74MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); 82MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
75MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "redir"); 83MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "redir");
diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c
index 92bda9908bb9..057deeaff1cb 100644
--- a/net/ipv6/netfilter/nft_reject_ipv6.c
+++ b/net/ipv6/netfilter/nft_reject_ipv6.c
@@ -27,11 +27,11 @@ static void nft_reject_ipv6_eval(const struct nft_expr *expr,
27 27
28 switch (priv->type) { 28 switch (priv->type) {
29 case NFT_REJECT_ICMP_UNREACH: 29 case NFT_REJECT_ICMP_UNREACH:
30 nf_send_unreach6(pkt->net, pkt->skb, priv->icmp_code, 30 nf_send_unreach6(nft_net(pkt), pkt->skb, priv->icmp_code,
31 pkt->hook); 31 nft_hook(pkt));
32 break; 32 break;
33 case NFT_REJECT_TCP_RST: 33 case NFT_REJECT_TCP_RST:
34 nf_send_reset6(pkt->net, pkt->skb, pkt->hook); 34 nf_send_reset6(nft_net(pkt), pkt->skb, nft_hook(pkt));
35 break; 35 break;
36 default: 36 default:
37 break; 37 break;
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 66e2d9dfc43a..9b522fa90e6d 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -113,6 +113,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
113 fl6.daddr = *daddr; 113 fl6.daddr = *daddr;
114 fl6.flowi6_oif = oif; 114 fl6.flowi6_oif = oif;
115 fl6.flowi6_mark = sk->sk_mark; 115 fl6.flowi6_mark = sk->sk_mark;
116 fl6.flowi6_uid = sk->sk_uid;
116 fl6.fl6_icmp_type = user_icmph.icmp6_type; 117 fl6.fl6_icmp_type = user_icmph.icmp6_type;
117 fl6.fl6_icmp_code = user_icmph.icmp6_code; 118 fl6.fl6_icmp_code = user_icmph.icmp6_code;
118 security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); 119 security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
@@ -125,12 +126,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
125 return PTR_ERR(dst); 126 return PTR_ERR(dst);
126 rt = (struct rt6_info *) dst; 127 rt = (struct rt6_info *) dst;
127 128
128 np = inet6_sk(sk);
129 if (!np) {
130 err = -EBADF;
131 goto dst_err_out;
132 }
133
134 if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) 129 if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
135 fl6.flowi6_oif = np->mcast_oif; 130 fl6.flowi6_oif = np->mcast_oif;
136 else if (!fl6.flowi6_oif) 131 else if (!fl6.flowi6_oif)
@@ -165,7 +160,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
165 } 160 }
166 release_sock(sk); 161 release_sock(sk);
167 162
168dst_err_out:
169 dst_release(dst); 163 dst_release(dst);
170 164
171 if (err) 165 if (err)
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 054a1d84fc5e..f174e76e6505 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -65,11 +65,12 @@
65 65
66#define ICMPV6_HDRLEN 4 /* ICMPv6 header, RFC 4443 Section 2.1 */ 66#define ICMPV6_HDRLEN 4 /* ICMPv6 header, RFC 4443 Section 2.1 */
67 67
68static struct raw_hashinfo raw_v6_hashinfo = { 68struct raw_hashinfo raw_v6_hashinfo = {
69 .lock = __RW_LOCK_UNLOCKED(raw_v6_hashinfo.lock), 69 .lock = __RW_LOCK_UNLOCKED(raw_v6_hashinfo.lock),
70}; 70};
71EXPORT_SYMBOL_GPL(raw_v6_hashinfo);
71 72
72static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, 73struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
73 unsigned short num, const struct in6_addr *loc_addr, 74 unsigned short num, const struct in6_addr *loc_addr,
74 const struct in6_addr *rmt_addr, int dif) 75 const struct in6_addr *rmt_addr, int dif)
75{ 76{
@@ -102,6 +103,7 @@ static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
102found: 103found:
103 return sk; 104 return sk;
104} 105}
106EXPORT_SYMBOL_GPL(__raw_v6_lookup);
105 107
106/* 108/*
107 * 0 - deliver 109 * 0 - deliver
@@ -589,7 +591,11 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
589 } 591 }
590 592
591 offset += skb_transport_offset(skb); 593 offset += skb_transport_offset(skb);
592 BUG_ON(skb_copy_bits(skb, offset, &csum, 2)); 594 err = skb_copy_bits(skb, offset, &csum, 2);
595 if (err < 0) {
596 ip6_flush_pending_frames(sk);
597 goto out;
598 }
593 599
594 /* in case cksum was not initialized */ 600 /* in case cksum was not initialized */
595 if (unlikely(csum)) 601 if (unlikely(csum))
@@ -648,6 +654,9 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
648 654
649 skb->ip_summed = CHECKSUM_NONE; 655 skb->ip_summed = CHECKSUM_NONE;
650 656
657 if (flags & MSG_CONFIRM)
658 skb_set_dst_pending_confirm(skb, 1);
659
651 skb->transport_header = skb->network_header; 660 skb->transport_header = skb->network_header;
652 err = memcpy_from_msg(iph, msg, length); 661 err = memcpy_from_msg(iph, msg, length);
653 if (err) 662 if (err)
@@ -774,6 +783,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
774 memset(&fl6, 0, sizeof(fl6)); 783 memset(&fl6, 0, sizeof(fl6));
775 784
776 fl6.flowi6_mark = sk->sk_mark; 785 fl6.flowi6_mark = sk->sk_mark;
786 fl6.flowi6_uid = sk->sk_uid;
777 787
778 ipc6.hlimit = -1; 788 ipc6.hlimit = -1;
779 ipc6.tclass = -1; 789 ipc6.tclass = -1;
@@ -927,7 +937,8 @@ out:
927 txopt_put(opt_to_free); 937 txopt_put(opt_to_free);
928 return err < 0 ? err : len; 938 return err < 0 ? err : len;
929do_confirm: 939do_confirm:
930 dst_confirm(dst); 940 if (msg->msg_flags & MSG_PROBE)
941 dst_confirm_neigh(dst, &fl6.daddr);
931 if (!(msg->msg_flags & MSG_PROBE) || len) 942 if (!(msg->msg_flags & MSG_PROBE) || len)
932 goto back_from_confirm; 943 goto back_from_confirm;
933 err = 0; 944 err = 0;
@@ -1259,6 +1270,7 @@ struct proto rawv6_prot = {
1259 .compat_getsockopt = compat_rawv6_getsockopt, 1270 .compat_getsockopt = compat_rawv6_getsockopt,
1260 .compat_ioctl = compat_rawv6_ioctl, 1271 .compat_ioctl = compat_rawv6_ioctl,
1261#endif 1272#endif
1273 .diag_destroy = raw_abort,
1262}; 1274};
1263 1275
1264#ifdef CONFIG_PROC_FS 1276#ifdef CONFIG_PROC_FS
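
The raw socket changes above switch the old dst_confirm() call over to neighbour-level confirmation: a zero-length MSG_PROBE send confirms the neighbour entry directly, while a normal send with MSG_CONFIRM marks the skb so the entry is confirmed on transmit. A minimal sketch of the (unchanged) userspace side, sending an ICMPv6 echo request with MSG_CONFIRM — destination address is a placeholder and CAP_NET_RAW is required:

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/icmp6.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);
	struct sockaddr_in6 dst = { .sin6_family = AF_INET6 };
	struct icmp6_hdr echo = {
		.icmp6_type = ICMP6_ECHO_REQUEST,
		.icmp6_code = 0,
	};

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	inet_pton(AF_INET6, "2001:db8::1", &dst.sin6_addr); /* example address */

	/* MSG_CONFIRM tells the stack the peer is known to be reachable */
	if (sendto(fd, &echo, sizeof(echo), MSG_CONFIRM,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("sendto");

	close(fd);
	return 0;
}
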
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 3815e8505ed2..e1da5b888cc4 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -211,7 +211,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
211{ 211{
212 struct sk_buff *prev, *next; 212 struct sk_buff *prev, *next;
213 struct net_device *dev; 213 struct net_device *dev;
214 int offset, end; 214 int offset, end, fragsize;
215 struct net *net = dev_net(skb_dst(skb)->dev); 215 struct net *net = dev_net(skb_dst(skb)->dev);
216 u8 ecn; 216 u8 ecn;
217 217
@@ -336,6 +336,10 @@ found:
336 fq->ecn |= ecn; 336 fq->ecn |= ecn;
337 add_frag_mem_limit(fq->q.net, skb->truesize); 337 add_frag_mem_limit(fq->q.net, skb->truesize);
338 338
339 fragsize = -skb_network_offset(skb) + skb->len;
340 if (fragsize > fq->q.max_size)
341 fq->q.max_size = fragsize;
342
339 /* The first fragment. 343 /* The first fragment.
340 * nhoffset is obtained from the first fragment, of course. 344 * nhoffset is obtained from the first fragment, of course.
341 */ 345 */
@@ -495,6 +499,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
495 ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn); 499 ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn);
496 IP6CB(head)->nhoff = nhoff; 500 IP6CB(head)->nhoff = nhoff;
497 IP6CB(head)->flags |= IP6SKB_FRAGMENTED; 501 IP6CB(head)->flags |= IP6SKB_FRAGMENTED;
502 IP6CB(head)->frag_max_size = fq->q.max_size;
498 503
499 /* Yes, and fold redundant checksum back. 8) */ 504 /* Yes, and fold redundant checksum back. 8) */
500 skb_postpush_rcsum(head, skb_network_header(head), 505 skb_postpush_rcsum(head, skb_network_header(head),
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1b57e11e6e0d..fb174b590fd3 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -64,7 +64,7 @@
64#include <net/l3mdev.h> 64#include <net/l3mdev.h>
65#include <trace/events/fib6.h> 65#include <trace/events/fib6.h>
66 66
67#include <asm/uaccess.h> 67#include <linux/uaccess.h>
68 68
69#ifdef CONFIG_SYSCTL 69#ifdef CONFIG_SYSCTL
70#include <linux/sysctl.h> 70#include <linux/sysctl.h>
@@ -98,6 +98,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98 struct sk_buff *skb); 98 struct sk_buff *skb);
99static void rt6_dst_from_metrics_check(struct rt6_info *rt); 99static void rt6_dst_from_metrics_check(struct rt6_info *rt);
100static int rt6_score_route(struct rt6_info *rt, int oif, int strict); 100static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101static size_t rt6_nlmsg_size(struct rt6_info *rt);
102static int rt6_fill_node(struct net *net,
103 struct sk_buff *skb, struct rt6_info *rt,
104 struct in6_addr *dst, struct in6_addr *src,
105 int iif, int type, u32 portid, u32 seq,
106 unsigned int flags);
101 107
102#ifdef CONFIG_IPV6_ROUTE_INFO 108#ifdef CONFIG_IPV6_ROUTE_INFO
103static struct rt6_info *rt6_add_route_info(struct net *net, 109static struct rt6_info *rt6_add_route_info(struct net *net,
@@ -217,6 +223,21 @@ static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
217 return neigh_create(&nd_tbl, daddr, dst->dev); 223 return neigh_create(&nd_tbl, daddr, dst->dev);
218} 224}
219 225
226static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
227{
228 struct net_device *dev = dst->dev;
229 struct rt6_info *rt = (struct rt6_info *)dst;
230
231 daddr = choose_neigh_daddr(rt, NULL, daddr);
232 if (!daddr)
233 return;
234 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
235 return;
236 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
237 return;
238 __ipv6_confirm_neigh(dev, daddr);
239}
240
220static struct dst_ops ip6_dst_ops_template = { 241static struct dst_ops ip6_dst_ops_template = {
221 .family = AF_INET6, 242 .family = AF_INET6,
222 .gc = ip6_dst_gc, 243 .gc = ip6_dst_gc,
@@ -233,6 +254,7 @@ static struct dst_ops ip6_dst_ops_template = {
233 .redirect = rt6_do_redirect, 254 .redirect = rt6_do_redirect,
234 .local_out = __ip6_local_out, 255 .local_out = __ip6_local_out,
235 .neigh_lookup = ip6_neigh_lookup, 256 .neigh_lookup = ip6_neigh_lookup,
257 .confirm_neigh = ip6_confirm_neigh,
236}; 258};
237 259
238static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) 260static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
@@ -527,7 +549,7 @@ static void rt6_probe_deferred(struct work_struct *w)
527 container_of(w, struct __rt6_probe_work, work); 549 container_of(w, struct __rt6_probe_work, work);
528 550
529 addrconf_addr_solict_mult(&work->target, &mcaddr); 551 addrconf_addr_solict_mult(&work->target, &mcaddr);
530 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL); 552 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
531 dev_put(work->dev); 553 dev_put(work->dev);
532 kfree(work); 554 kfree(work);
533} 555}
@@ -1359,6 +1381,7 @@ static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1359static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, 1381static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1360 const struct ipv6hdr *iph, u32 mtu) 1382 const struct ipv6hdr *iph, u32 mtu)
1361{ 1383{
1384 const struct in6_addr *daddr, *saddr;
1362 struct rt6_info *rt6 = (struct rt6_info *)dst; 1385 struct rt6_info *rt6 = (struct rt6_info *)dst;
1363 1386
1364 if (rt6->rt6i_flags & RTF_LOCAL) 1387 if (rt6->rt6i_flags & RTF_LOCAL)
@@ -1367,26 +1390,26 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1367 if (dst_metric_locked(dst, RTAX_MTU)) 1390 if (dst_metric_locked(dst, RTAX_MTU))
1368 return; 1391 return;
1369 1392
1370 dst_confirm(dst); 1393 if (iph) {
1394 daddr = &iph->daddr;
1395 saddr = &iph->saddr;
1396 } else if (sk) {
1397 daddr = &sk->sk_v6_daddr;
1398 saddr = &inet6_sk(sk)->saddr;
1399 } else {
1400 daddr = NULL;
1401 saddr = NULL;
1402 }
1403 dst_confirm_neigh(dst, daddr);
1371 mtu = max_t(u32, mtu, IPV6_MIN_MTU); 1404 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1372 if (mtu >= dst_mtu(dst)) 1405 if (mtu >= dst_mtu(dst))
1373 return; 1406 return;
1374 1407
1375 if (!rt6_cache_allowed_for_pmtu(rt6)) { 1408 if (!rt6_cache_allowed_for_pmtu(rt6)) {
1376 rt6_do_update_pmtu(rt6, mtu); 1409 rt6_do_update_pmtu(rt6, mtu);
1377 } else { 1410 } else if (daddr) {
1378 const struct in6_addr *daddr, *saddr;
1379 struct rt6_info *nrt6; 1411 struct rt6_info *nrt6;
1380 1412
1381 if (iph) {
1382 daddr = &iph->daddr;
1383 saddr = &iph->saddr;
1384 } else if (sk) {
1385 daddr = &sk->sk_v6_daddr;
1386 saddr = &inet6_sk(sk)->saddr;
1387 } else {
1388 return;
1389 }
1390 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr); 1413 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1391 if (nrt6) { 1414 if (nrt6) {
1392 rt6_do_update_pmtu(nrt6, mtu); 1415 rt6_do_update_pmtu(nrt6, mtu);
@@ -1408,7 +1431,7 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1408} 1431}
1409 1432
1410void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, 1433void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1411 int oif, u32 mark) 1434 int oif, u32 mark, kuid_t uid)
1412{ 1435{
1413 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 1436 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1414 struct dst_entry *dst; 1437 struct dst_entry *dst;
@@ -1420,6 +1443,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1420 fl6.daddr = iph->daddr; 1443 fl6.daddr = iph->daddr;
1421 fl6.saddr = iph->saddr; 1444 fl6.saddr = iph->saddr;
1422 fl6.flowlabel = ip6_flowinfo(iph); 1445 fl6.flowlabel = ip6_flowinfo(iph);
1446 fl6.flowi6_uid = uid;
1423 1447
1424 dst = ip6_route_output(net, NULL, &fl6); 1448 dst = ip6_route_output(net, NULL, &fl6);
1425 if (!dst->error) 1449 if (!dst->error)
@@ -1433,7 +1457,7 @@ void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1433 struct dst_entry *dst; 1457 struct dst_entry *dst;
1434 1458
1435 ip6_update_pmtu(skb, sock_net(sk), mtu, 1459 ip6_update_pmtu(skb, sock_net(sk), mtu,
1436 sk->sk_bound_dev_if, sk->sk_mark); 1460 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
1437 1461
1438 dst = __sk_dst_get(sk); 1462 dst = __sk_dst_get(sk);
1439 if (!dst || !dst->obsolete || 1463 if (!dst || !dst->obsolete ||
@@ -1463,7 +1487,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
1463 struct fib6_node *fn; 1487 struct fib6_node *fn;
1464 1488
1465 /* Get the "current" route for this destination and 1489 /* Get the "current" route for this destination and
1466 * check if the redirect has come from approriate router. 1490 * check if the redirect has come from appropriate router.
1467 * 1491 *
1468 * RFC 4861 specifies that redirects should only be 1492 * RFC 4861 specifies that redirects should only be
1469 * accepted if they come from the nexthop to the target. 1493 * accepted if they come from the nexthop to the target.
@@ -1525,7 +1549,8 @@ static struct dst_entry *ip6_route_redirect(struct net *net,
1525 flags, __ip6_route_redirect); 1549 flags, __ip6_route_redirect);
1526} 1550}
1527 1551
1528void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark) 1552void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
1553 kuid_t uid)
1529{ 1554{
1530 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 1555 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1531 struct dst_entry *dst; 1556 struct dst_entry *dst;
@@ -1538,6 +1563,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1538 fl6.daddr = iph->daddr; 1563 fl6.daddr = iph->daddr;
1539 fl6.saddr = iph->saddr; 1564 fl6.saddr = iph->saddr;
1540 fl6.flowlabel = ip6_flowinfo(iph); 1565 fl6.flowlabel = ip6_flowinfo(iph);
1566 fl6.flowi6_uid = uid;
1541 1567
1542 dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr); 1568 dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1543 rt6_do_redirect(dst, NULL, skb); 1569 rt6_do_redirect(dst, NULL, skb);
@@ -1559,6 +1585,7 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1559 fl6.flowi6_mark = mark; 1585 fl6.flowi6_mark = mark;
1560 fl6.daddr = msg->dest; 1586 fl6.daddr = msg->dest;
1561 fl6.saddr = iph->daddr; 1587 fl6.saddr = iph->daddr;
1588 fl6.flowi6_uid = sock_net_uid(net, NULL);
1562 1589
1563 dst = ip6_route_redirect(net, &fl6, &iph->saddr); 1590 dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1564 rt6_do_redirect(dst, NULL, skb); 1591 rt6_do_redirect(dst, NULL, skb);
@@ -1567,7 +1594,8 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1567 1594
1568void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 1595void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1569{ 1596{
1570 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark); 1597 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
1598 sk->sk_uid);
1571} 1599}
1572EXPORT_SYMBOL_GPL(ip6_sk_redirect); 1600EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1573 1601
@@ -1826,6 +1854,10 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
1826 int addr_type; 1854 int addr_type;
1827 int err = -EINVAL; 1855 int err = -EINVAL;
1828 1856
1857 /* RTF_PCPU is an internal flag; can not be set by userspace */
1858 if (cfg->fc_flags & RTF_PCPU)
1859 goto out;
1860
1829 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128) 1861 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1830 goto out; 1862 goto out;
1831#ifndef CONFIG_IPV6_SUBTREES 1863#ifndef CONFIG_IPV6_SUBTREES
@@ -1892,7 +1924,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
1892 if (cfg->fc_encap) { 1924 if (cfg->fc_encap) {
1893 struct lwtunnel_state *lwtstate; 1925 struct lwtunnel_state *lwtstate;
1894 1926
1895 err = lwtunnel_build_state(dev, cfg->fc_encap_type, 1927 err = lwtunnel_build_state(cfg->fc_encap_type,
1896 cfg->fc_encap, AF_INET6, cfg, 1928 cfg->fc_encap, AF_INET6, cfg,
1897 &lwtstate); 1929 &lwtstate);
1898 if (err) 1930 if (err)
@@ -1995,8 +2027,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
1995 It is very good, but in some (rare!) circumstances 2027 It is very good, but in some (rare!) circumstances
1996 (SIT, PtP, NBMA NOARP links) it is handy to allow 2028 (SIT, PtP, NBMA NOARP links) it is handy to allow
1997 some exceptions. --ANK 2029 some exceptions. --ANK
2030 We allow IPv4-mapped nexthops to support RFC4798-type
2031 addressing
1998 */ 2032 */
1999 if (!(gwa_type & IPV6_ADDR_UNICAST)) 2033 if (!(gwa_type & (IPV6_ADDR_UNICAST |
2034 IPV6_ADDR_MAPPED)))
2000 goto out; 2035 goto out;
2001 2036
2002 if (cfg->fc_table) { 2037 if (cfg->fc_table) {
@@ -2135,6 +2170,58 @@ int ip6_del_rt(struct rt6_info *rt)
2135 return __ip6_del_rt(rt, &info); 2170 return __ip6_del_rt(rt, &info);
2136} 2171}
2137 2172
2173static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2174{
2175 struct nl_info *info = &cfg->fc_nlinfo;
2176 struct net *net = info->nl_net;
2177 struct sk_buff *skb = NULL;
2178 struct fib6_table *table;
2179 int err = -ENOENT;
2180
2181 if (rt == net->ipv6.ip6_null_entry)
2182 goto out_put;
2183 table = rt->rt6i_table;
2184 write_lock_bh(&table->tb6_lock);
2185
2186 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2187 struct rt6_info *sibling, *next_sibling;
2188
2189 /* prefer to send a single notification with all hops */
2190 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2191 if (skb) {
2192 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2193
2194 if (rt6_fill_node(net, skb, rt,
2195 NULL, NULL, 0, RTM_DELROUTE,
2196 info->portid, seq, 0) < 0) {
2197 kfree_skb(skb);
2198 skb = NULL;
2199 } else
2200 info->skip_notify = 1;
2201 }
2202
2203 list_for_each_entry_safe(sibling, next_sibling,
2204 &rt->rt6i_siblings,
2205 rt6i_siblings) {
2206 err = fib6_del(sibling, info);
2207 if (err)
2208 goto out_unlock;
2209 }
2210 }
2211
2212 err = fib6_del(rt, info);
2213out_unlock:
2214 write_unlock_bh(&table->tb6_lock);
2215out_put:
2216 ip6_rt_put(rt);
2217
2218 if (skb) {
2219 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2220 info->nlh, gfp_any());
2221 }
2222 return err;
2223}
2224
2138static int ip6_route_del(struct fib6_config *cfg) 2225static int ip6_route_del(struct fib6_config *cfg)
2139{ 2226{
2140 struct fib6_table *table; 2227 struct fib6_table *table;
@@ -2166,10 +2253,16 @@ static int ip6_route_del(struct fib6_config *cfg)
2166 continue; 2253 continue;
2167 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric) 2254 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2168 continue; 2255 continue;
2256 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2257 continue;
2169 dst_hold(&rt->dst); 2258 dst_hold(&rt->dst);
2170 read_unlock_bh(&table->tb6_lock); 2259 read_unlock_bh(&table->tb6_lock);
2171 2260
2172 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 2261 /* if gateway was specified only delete the one hop */
2262 if (cfg->fc_flags & RTF_GATEWAY)
2263 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2264
2265 return __ip6_del_rt_siblings(rt, cfg);
2173 } 2266 }
2174 } 2267 }
2175 read_unlock_bh(&table->tb6_lock); 2268 read_unlock_bh(&table->tb6_lock);
@@ -2248,7 +2341,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
2248 * Look, redirects are sent only in response to data packets, 2341 * Look, redirects are sent only in response to data packets,
2249 * so that this nexthop apparently is reachable. --ANK 2342 * so that this nexthop apparently is reachable. --ANK
2250 */ 2343 */
2251 dst_confirm(&rt->dst); 2344 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
2252 2345
2253 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 2346 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2254 if (!neigh) 2347 if (!neigh)
@@ -2624,6 +2717,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2624 rt->dst.output = ip6_output; 2717 rt->dst.output = ip6_output;
2625 rt->rt6i_idev = idev; 2718 rt->rt6i_idev = idev;
2626 2719
2720 rt->rt6i_protocol = RTPROT_KERNEL;
2627 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; 2721 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2628 if (anycast) 2722 if (anycast)
2629 rt->rt6i_flags |= RTF_ANYCAST; 2723 rt->rt6i_flags |= RTF_ANYCAST;
@@ -2701,13 +2795,16 @@ struct arg_dev_net {
2701 struct net *net; 2795 struct net *net;
2702}; 2796};
2703 2797
2798/* called with write lock held for table with rt */
2704static int fib6_ifdown(struct rt6_info *rt, void *arg) 2799static int fib6_ifdown(struct rt6_info *rt, void *arg)
2705{ 2800{
2706 const struct arg_dev_net *adn = arg; 2801 const struct arg_dev_net *adn = arg;
2707 const struct net_device *dev = adn->dev; 2802 const struct net_device *dev = adn->dev;
2708 2803
2709 if ((rt->dst.dev == dev || !dev) && 2804 if ((rt->dst.dev == dev || !dev) &&
2710 rt != adn->net->ipv6.ip6_null_entry) 2805 rt != adn->net->ipv6.ip6_null_entry &&
2806 (rt->rt6i_nsiblings == 0 ||
2807 !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
2711 return -1; 2808 return -1;
2712 2809
2713 return 0; 2810 return 0;
@@ -2758,7 +2855,7 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2758 old MTU is the lowest MTU in the path, update the route PMTU 2855 old MTU is the lowest MTU in the path, update the route PMTU
2759 to reflect the increase. In this case if the other nodes' MTU 2856 to reflect the increase. In this case if the other nodes' MTU
2760 also have the lowest MTU, TOO BIG MESSAGE will be lead to 2857 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2761 PMTU discouvery. 2858 PMTU discovery.
2762 */ 2859 */
2763 if (rt->dst.dev == arg->dev && 2860 if (rt->dst.dev == arg->dev &&
2764 dst_metric_raw(&rt->dst, RTAX_MTU) && 2861 dst_metric_raw(&rt->dst, RTAX_MTU) &&
@@ -2801,6 +2898,8 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2801 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 2898 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
2802 [RTA_ENCAP] = { .type = NLA_NESTED }, 2899 [RTA_ENCAP] = { .type = NLA_NESTED },
2803 [RTA_EXPIRES] = { .type = NLA_U32 }, 2900 [RTA_EXPIRES] = { .type = NLA_U32 },
2901 [RTA_UID] = { .type = NLA_U32 },
2902 [RTA_MARK] = { .type = NLA_U32 },
2804}; 2903};
2805 2904
2806static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 2905static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2885,6 +2984,11 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2885 if (tb[RTA_MULTIPATH]) { 2984 if (tb[RTA_MULTIPATH]) {
2886 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 2985 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2887 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 2986 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2987
2988 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
2989 cfg->fc_mp_len);
2990 if (err < 0)
2991 goto errout;
2888 } 2992 }
2889 2993
2890 if (tb[RTA_PREF]) { 2994 if (tb[RTA_PREF]) {
@@ -2898,9 +3002,14 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2898 if (tb[RTA_ENCAP]) 3002 if (tb[RTA_ENCAP])
2899 cfg->fc_encap = tb[RTA_ENCAP]; 3003 cfg->fc_encap = tb[RTA_ENCAP];
2900 3004
2901 if (tb[RTA_ENCAP_TYPE]) 3005 if (tb[RTA_ENCAP_TYPE]) {
2902 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 3006 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
2903 3007
3008 err = lwtunnel_valid_encap_type(cfg->fc_encap_type);
3009 if (err < 0)
3010 goto errout;
3011 }
3012
2904 if (tb[RTA_EXPIRES]) { 3013 if (tb[RTA_EXPIRES]) {
2905 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 3014 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
2906 3015
@@ -2927,7 +3036,7 @@ static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
2927 struct rt6_nh *nh; 3036 struct rt6_nh *nh;
2928 3037
2929 list_for_each_entry(nh, rt6_nh_list, next) { 3038 list_for_each_entry(nh, rt6_nh_list, next) {
2930 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n", 3039 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
2931 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, 3040 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
2932 nh->r_cfg.fc_ifindex); 3041 nh->r_cfg.fc_ifindex);
2933 } 3042 }
@@ -2966,13 +3075,37 @@ static int ip6_route_info_append(struct list_head *rt6_nh_list,
2966 return 0; 3075 return 0;
2967} 3076}
2968 3077
3078static void ip6_route_mpath_notify(struct rt6_info *rt,
3079 struct rt6_info *rt_last,
3080 struct nl_info *info,
3081 __u16 nlflags)
3082{
3083 /* if this is an APPEND route, then rt points to the first route
3084 * inserted and rt_last points to last route inserted. Userspace
3085 * wants a consistent dump of the route which starts at the first
3086 * nexthop. Since sibling routes are always added at the end of
3087 * the list, find the first sibling of the last route appended
3088 */
3089 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3090 rt = list_first_entry(&rt_last->rt6i_siblings,
3091 struct rt6_info,
3092 rt6i_siblings);
3093 }
3094
3095 if (rt)
3096 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3097}
3098
2969static int ip6_route_multipath_add(struct fib6_config *cfg) 3099static int ip6_route_multipath_add(struct fib6_config *cfg)
2970{ 3100{
3101 struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3102 struct nl_info *info = &cfg->fc_nlinfo;
2971 struct fib6_config r_cfg; 3103 struct fib6_config r_cfg;
2972 struct rtnexthop *rtnh; 3104 struct rtnexthop *rtnh;
2973 struct rt6_info *rt; 3105 struct rt6_info *rt;
2974 struct rt6_nh *err_nh; 3106 struct rt6_nh *err_nh;
2975 struct rt6_nh *nh, *nh_safe; 3107 struct rt6_nh *nh, *nh_safe;
3108 __u16 nlflags;
2976 int remaining; 3109 int remaining;
2977 int attrlen; 3110 int attrlen;
2978 int err = 1; 3111 int err = 1;
@@ -2981,6 +3114,10 @@ static int ip6_route_multipath_add(struct fib6_config *cfg)
2981 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 3114 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
2982 LIST_HEAD(rt6_nh_list); 3115 LIST_HEAD(rt6_nh_list);
2983 3116
3117 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3118 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3119 nlflags |= NLM_F_APPEND;
3120
2984 remaining = cfg->fc_mp_len; 3121 remaining = cfg->fc_mp_len;
2985 rtnh = (struct rtnexthop *)cfg->fc_mp; 3122 rtnh = (struct rtnexthop *)cfg->fc_mp;
2986 3123
@@ -3023,9 +3160,20 @@ static int ip6_route_multipath_add(struct fib6_config *cfg)
3023 rtnh = rtnh_next(rtnh, &remaining); 3160 rtnh = rtnh_next(rtnh, &remaining);
3024 } 3161 }
3025 3162
3163 /* for add and replace send one notification with all nexthops.
3164 * Skip the notification in fib6_add_rt2node and send one with
3165 * the full route when done
3166 */
3167 info->skip_notify = 1;
3168
3026 err_nh = NULL; 3169 err_nh = NULL;
3027 list_for_each_entry(nh, &rt6_nh_list, next) { 3170 list_for_each_entry(nh, &rt6_nh_list, next) {
3028 err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc); 3171 rt_last = nh->rt6_info;
3172 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc);
3173 /* save reference to first route for notification */
3174 if (!rt_notif && !err)
3175 rt_notif = nh->rt6_info;
3176
3029 /* nh->rt6_info is used or freed at this point, reset to NULL*/ 3177 /* nh->rt6_info is used or freed at this point, reset to NULL*/
3030 nh->rt6_info = NULL; 3178 nh->rt6_info = NULL;
3031 if (err) { 3179 if (err) {
@@ -3047,9 +3195,18 @@ static int ip6_route_multipath_add(struct fib6_config *cfg)
3047 nhn++; 3195 nhn++;
3048 } 3196 }
3049 3197
3198 /* success ... tell user about new route */
3199 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3050 goto cleanup; 3200 goto cleanup;
3051 3201
3052add_errout: 3202add_errout:
3203 /* send notification for routes that were added so that
3204 * the delete notifications sent by ip6_route_del are
3205 * coherent
3206 */
3207 if (rt_notif)
3208 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3209
3053 /* Delete routes that were already added */ 3210 /* Delete routes that were already added */
3054 list_for_each_entry(nh, &rt6_nh_list, next) { 3211 list_for_each_entry(nh, &rt6_nh_list, next) {
3055 if (err_nh == nh) 3212 if (err_nh == nh)
@@ -3117,8 +3274,10 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3117 3274
3118 if (cfg.fc_mp) 3275 if (cfg.fc_mp)
3119 return ip6_route_multipath_del(&cfg); 3276 return ip6_route_multipath_del(&cfg);
3120 else 3277 else {
3278 cfg.fc_delete_all_nh = 1;
3121 return ip6_route_del(&cfg); 3279 return ip6_route_del(&cfg);
3280 }
3122} 3281}
3123 3282
3124static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) 3283static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
@@ -3136,8 +3295,19 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3136 return ip6_route_add(&cfg); 3295 return ip6_route_add(&cfg);
3137} 3296}
3138 3297
3139static inline size_t rt6_nlmsg_size(struct rt6_info *rt) 3298static size_t rt6_nlmsg_size(struct rt6_info *rt)
3140{ 3299{
3300 int nexthop_len = 0;
3301
3302 if (rt->rt6i_nsiblings) {
3303 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
3304 + NLA_ALIGN(sizeof(struct rtnexthop))
3305 + nla_total_size(16) /* RTA_GATEWAY */
3306 + lwtunnel_get_encap_size(rt->dst.lwtstate);
3307
3308 nexthop_len *= rt->rt6i_nsiblings;
3309 }
3310
3141 return NLMSG_ALIGN(sizeof(struct rtmsg)) 3311 return NLMSG_ALIGN(sizeof(struct rtmsg))
3142 + nla_total_size(16) /* RTA_SRC */ 3312 + nla_total_size(16) /* RTA_SRC */
3143 + nla_total_size(16) /* RTA_DST */ 3313 + nla_total_size(16) /* RTA_DST */
@@ -3151,14 +3321,71 @@ static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
3151 + nla_total_size(sizeof(struct rta_cacheinfo)) 3321 + nla_total_size(sizeof(struct rta_cacheinfo))
3152 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 3322 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3153 + nla_total_size(1) /* RTA_PREF */ 3323 + nla_total_size(1) /* RTA_PREF */
3154 + lwtunnel_get_encap_size(rt->dst.lwtstate); 3324 + lwtunnel_get_encap_size(rt->dst.lwtstate)
3325 + nexthop_len;
3326}
3327
3328static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3329 unsigned int *flags, bool skip_oif)
3330{
3331 if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3332 *flags |= RTNH_F_LINKDOWN;
3333 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3334 *flags |= RTNH_F_DEAD;
3335 }
3336
3337 if (rt->rt6i_flags & RTF_GATEWAY) {
3338 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3339 goto nla_put_failure;
3340 }
3341
3342 /* not needed for multipath encoding b/c it has a rtnexthop struct */
3343 if (!skip_oif && rt->dst.dev &&
3344 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3345 goto nla_put_failure;
3346
3347 if (rt->dst.lwtstate &&
3348 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3349 goto nla_put_failure;
3350
3351 return 0;
3352
3353nla_put_failure:
3354 return -EMSGSIZE;
3355}
3356
3357/* add multipath next hop */
3358static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
3359{
3360 struct rtnexthop *rtnh;
3361 unsigned int flags = 0;
3362
3363 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
3364 if (!rtnh)
3365 goto nla_put_failure;
3366
3367 rtnh->rtnh_hops = 0;
3368 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
3369
3370 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
3371 goto nla_put_failure;
3372
3373 rtnh->rtnh_flags = flags;
3374
3375 /* length of rtnetlink header + attributes */
3376 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
3377
3378 return 0;
3379
3380nla_put_failure:
3381 return -EMSGSIZE;
3155} 3382}
3156 3383
3157static int rt6_fill_node(struct net *net, 3384static int rt6_fill_node(struct net *net,
3158 struct sk_buff *skb, struct rt6_info *rt, 3385 struct sk_buff *skb, struct rt6_info *rt,
3159 struct in6_addr *dst, struct in6_addr *src, 3386 struct in6_addr *dst, struct in6_addr *src,
3160 int iif, int type, u32 portid, u32 seq, 3387 int iif, int type, u32 portid, u32 seq,
3161 int prefix, int nowait, unsigned int flags) 3388 unsigned int flags)
3162{ 3389{
3163 u32 metrics[RTAX_MAX]; 3390 u32 metrics[RTAX_MAX];
3164 struct rtmsg *rtm; 3391 struct rtmsg *rtm;
@@ -3166,13 +3393,6 @@ static int rt6_fill_node(struct net *net,
3166 long expires; 3393 long expires;
3167 u32 table; 3394 u32 table;
3168 3395
3169 if (prefix) { /* user wants prefix routes only */
3170 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
3171 /* success since this is not a prefix route */
3172 return 1;
3173 }
3174 }
3175
3176 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 3396 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3177 if (!nlh) 3397 if (!nlh)
3178 return -EMSGSIZE; 3398 return -EMSGSIZE;
@@ -3207,16 +3427,13 @@ static int rt6_fill_node(struct net *net,
3207 } 3427 }
3208 else if (rt->rt6i_flags & RTF_LOCAL) 3428 else if (rt->rt6i_flags & RTF_LOCAL)
3209 rtm->rtm_type = RTN_LOCAL; 3429 rtm->rtm_type = RTN_LOCAL;
3430 else if (rt->rt6i_flags & RTF_ANYCAST)
3431 rtm->rtm_type = RTN_ANYCAST;
3210 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)) 3432 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3211 rtm->rtm_type = RTN_LOCAL; 3433 rtm->rtm_type = RTN_LOCAL;
3212 else 3434 else
3213 rtm->rtm_type = RTN_UNICAST; 3435 rtm->rtm_type = RTN_UNICAST;
3214 rtm->rtm_flags = 0; 3436 rtm->rtm_flags = 0;
3215 if (!netif_carrier_ok(rt->dst.dev)) {
3216 rtm->rtm_flags |= RTNH_F_LINKDOWN;
3217 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3218 rtm->rtm_flags |= RTNH_F_DEAD;
3219 }
3220 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 3437 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3221 rtm->rtm_protocol = rt->rt6i_protocol; 3438 rtm->rtm_protocol = rt->rt6i_protocol;
3222 if (rt->rt6i_flags & RTF_DYNAMIC) 3439 if (rt->rt6i_flags & RTF_DYNAMIC)
@@ -3250,19 +3467,12 @@ static int rt6_fill_node(struct net *net,
3250 if (iif) { 3467 if (iif) {
3251#ifdef CONFIG_IPV6_MROUTE 3468#ifdef CONFIG_IPV6_MROUTE
3252 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) { 3469 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3253 int err = ip6mr_get_route(net, skb, rtm, nowait, 3470 int err = ip6mr_get_route(net, skb, rtm, portid);
3254 portid); 3471
3255 3472 if (err == 0)
3256 if (err <= 0) { 3473 return 0;
3257 if (!nowait) { 3474 if (err < 0)
3258 if (err == 0) 3475 goto nla_put_failure;
3259 return 0;
3260 goto nla_put_failure;
3261 } else {
3262 if (err == -EMSGSIZE)
3263 goto nla_put_failure;
3264 }
3265 }
3266 } else 3476 } else
3267#endif 3477#endif
3268 if (nla_put_u32(skb, RTA_IIF, iif)) 3478 if (nla_put_u32(skb, RTA_IIF, iif))
@@ -3287,17 +3497,35 @@ static int rt6_fill_node(struct net *net,
3287 if (rtnetlink_put_metrics(skb, metrics) < 0) 3497 if (rtnetlink_put_metrics(skb, metrics) < 0)
3288 goto nla_put_failure; 3498 goto nla_put_failure;
3289 3499
3290 if (rt->rt6i_flags & RTF_GATEWAY) {
3291 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3292 goto nla_put_failure;
3293 }
3294
3295 if (rt->dst.dev &&
3296 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3297 goto nla_put_failure;
3298 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) 3500 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3299 goto nla_put_failure; 3501 goto nla_put_failure;
3300 3502
3503 /* For multipath routes, walk the siblings list and add
3504 * each as a nexthop within RTA_MULTIPATH.
3505 */
3506 if (rt->rt6i_nsiblings) {
3507 struct rt6_info *sibling, *next_sibling;
3508 struct nlattr *mp;
3509
3510 mp = nla_nest_start(skb, RTA_MULTIPATH);
3511 if (!mp)
3512 goto nla_put_failure;
3513
3514 if (rt6_add_nexthop(skb, rt) < 0)
3515 goto nla_put_failure;
3516
3517 list_for_each_entry_safe(sibling, next_sibling,
3518 &rt->rt6i_siblings, rt6i_siblings) {
3519 if (rt6_add_nexthop(skb, sibling) < 0)
3520 goto nla_put_failure;
3521 }
3522
3523 nla_nest_end(skb, mp);
3524 } else {
3525 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
3526 goto nla_put_failure;
3527 }
3528
3301 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0; 3529 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3302 3530
3303 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) 3531 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
@@ -3306,7 +3534,6 @@ static int rt6_fill_node(struct net *net,
3306 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) 3534 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3307 goto nla_put_failure; 3535 goto nla_put_failure;
3308 3536
3309 lwtunnel_fill_encap(skb, rt->dst.lwtstate);
3310 3537
3311 nlmsg_end(skb, nlh); 3538 nlmsg_end(skb, nlh);
3312 return 0; 3539 return 0;
@@ -3319,18 +3546,26 @@ nla_put_failure:
3319int rt6_dump_route(struct rt6_info *rt, void *p_arg) 3546int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3320{ 3547{
3321 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 3548 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3322 int prefix; 3549 struct net *net = arg->net;
3550
3551 if (rt == net->ipv6.ip6_null_entry)
3552 return 0;
3323 3553
3324 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { 3554 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3325 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); 3555 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3326 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
3327 } else
3328 prefix = 0;
3329 3556
3330 return rt6_fill_node(arg->net, 3557 /* user wants prefix routes only */
3558 if (rtm->rtm_flags & RTM_F_PREFIX &&
3559 !(rt->rt6i_flags & RTF_PREFIX_RT)) {
3560 /* success since this is not a prefix route */
3561 return 1;
3562 }
3563 }
3564
3565 return rt6_fill_node(net,
3331 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, 3566 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3332 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq, 3567 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3333 prefix, 0, NLM_F_MULTI); 3568 NLM_F_MULTI);
3334} 3569}
3335 3570
3336static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) 3571static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
@@ -3375,6 +3610,12 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3375 if (tb[RTA_MARK]) 3610 if (tb[RTA_MARK])
3376 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 3611 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3377 3612
3613 if (tb[RTA_UID])
3614 fl6.flowi6_uid = make_kuid(current_user_ns(),
3615 nla_get_u32(tb[RTA_UID]));
3616 else
3617 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
3618
3378 if (iif) { 3619 if (iif) {
3379 struct net_device *dev; 3620 struct net_device *dev;
3380 int flags = 0; 3621 int flags = 0;
@@ -3398,6 +3639,12 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3398 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6); 3639 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3399 } 3640 }
3400 3641
3642 if (rt == net->ipv6.ip6_null_entry) {
3643 err = rt->dst.error;
3644 ip6_rt_put(rt);
3645 goto errout;
3646 }
3647
3401 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 3648 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3402 if (!skb) { 3649 if (!skb) {
3403 ip6_rt_put(rt); 3650 ip6_rt_put(rt);
@@ -3405,17 +3652,11 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3405 goto errout; 3652 goto errout;
3406 } 3653 }
3407 3654
3408 /* Reserve room for dummy headers, this skb can pass
3409 through good chunk of routing engine.
3410 */
3411 skb_reset_mac_header(skb);
3412 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3413
3414 skb_dst_set(skb, &rt->dst); 3655 skb_dst_set(skb, &rt->dst);
3415 3656
3416 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif, 3657 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3417 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 3658 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3418 nlh->nlmsg_seq, 0, 0, 0); 3659 nlh->nlmsg_seq, 0);
3419 if (err < 0) { 3660 if (err < 0) {
3420 kfree_skb(skb); 3661 kfree_skb(skb);
3421 goto errout; 3662 goto errout;
@@ -3442,7 +3683,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3442 goto errout; 3683 goto errout;
3443 3684
3444 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, 3685 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3445 event, info->portid, seq, 0, 0, nlm_flags); 3686 event, info->portid, seq, nlm_flags);
3446 if (err < 0) { 3687 if (err < 0) {
3447 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 3688 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3448 WARN_ON(err == -EMSGSIZE); 3689 WARN_ON(err == -EMSGSIZE);
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
new file mode 100644
index 000000000000..5f44ffed2576
--- /dev/null
+++ b/net/ipv6/seg6.c
@@ -0,0 +1,500 @@
1/*
2 * SR-IPv6 implementation
3 *
4 * Author:
5 * David Lebrun <david.lebrun@uclouvain.be>
6 *
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 */
13
14#include <linux/errno.h>
15#include <linux/types.h>
16#include <linux/socket.h>
17#include <linux/net.h>
18#include <linux/in6.h>
19#include <linux/slab.h>
20
21#include <net/ipv6.h>
22#include <net/protocol.h>
23
24#include <net/seg6.h>
25#include <net/genetlink.h>
26#include <linux/seg6.h>
27#include <linux/seg6_genl.h>
28#ifdef CONFIG_IPV6_SEG6_HMAC
29#include <net/seg6_hmac.h>
30#endif
31
32bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len)
33{
34 int trailing;
35 unsigned int tlv_offset;
36
37 if (srh->type != IPV6_SRCRT_TYPE_4)
38 return false;
39
40 if (((srh->hdrlen + 1) << 3) != len)
41 return false;
42
43 if (srh->segments_left != srh->first_segment)
44 return false;
45
46 tlv_offset = sizeof(*srh) + ((srh->first_segment + 1) << 4);
47
48 trailing = len - tlv_offset;
49 if (trailing < 0)
50 return false;
51
52 while (trailing) {
53 struct sr6_tlv *tlv;
54 unsigned int tlv_len;
55
56 if (trailing < sizeof(*tlv))
57 return false;
58
59 tlv = (struct sr6_tlv *)((unsigned char *)srh + tlv_offset);
60 tlv_len = sizeof(*tlv) + tlv->len;
61
62 trailing -= tlv_len;
63 if (trailing < 0)
64 return false;
65
66 tlv_offset += tlv_len;
67 }
68
69 return true;
70}
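
The length checks above all derive from the wire format: the whole SRH spans (hdrlen + 1) * 8 bytes, the segment list takes (first_segment + 1) * 16 bytes right after the 8-byte fixed header, and whatever remains must split cleanly into TLVs. A minimal user-space sketch of the same walk, using simplified stand-ins for the UAPI structures (struct names and field layout here are illustrative, not the kernel definitions):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    /* simplified stand-ins for the UAPI structures (illustrative only) */
    struct srh_fixed {
        uint8_t  nexthdr, hdrlen, type, segments_left;
        uint8_t  first_segment, flags;
        uint16_t tag;
        /* followed by (first_segment + 1) * 16 bytes of segments, then TLVs */
    };

    struct tlv_hdr {
        uint8_t type;
        uint8_t len;    /* payload length, excluding this 2-byte header */
    };

    static bool srh_len_ok(const struct srh_fixed *srh, size_t len)
    {
        size_t tlv_off, trailing;

        /* hdrlen counts 8-byte units beyond the first one */
        if (((size_t)srh->hdrlen + 1) * 8 != len)
            return false;

        /* segments start right after the 8-byte fixed part */
        tlv_off = sizeof(*srh) + ((size_t)srh->first_segment + 1) * 16;
        if (tlv_off > len)
            return false;

        /* whatever remains must decompose into whole TLVs */
        trailing = len - tlv_off;
        while (trailing) {
            const struct tlv_hdr *tlv =
                (const struct tlv_hdr *)((const uint8_t *)srh + tlv_off);

            if (trailing < sizeof(*tlv) ||
                trailing < sizeof(*tlv) + tlv->len)
                return false;
            trailing -= sizeof(*tlv) + tlv->len;
            tlv_off  += sizeof(*tlv) + tlv->len;
        }
        return true;
    }

    int main(void)
    {
        /* an SRH with one segment and no TLVs: hdrlen 2 -> 24 bytes total */
        uint8_t buf[24] = { 0 };
        struct srh_fixed *srh = (struct srh_fixed *)buf;

        srh->hdrlen = 2;
        srh->type = 4;          /* IPV6_SRCRT_TYPE_4 */
        srh->first_segment = 0;

        return srh_len_ok(srh, sizeof(buf)) ? 0 : 1;
    }
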
71
72static struct genl_family seg6_genl_family;
73
74static const struct nla_policy seg6_genl_policy[SEG6_ATTR_MAX + 1] = {
75 [SEG6_ATTR_DST] = { .type = NLA_BINARY,
76 .len = sizeof(struct in6_addr) },
77 [SEG6_ATTR_DSTLEN] = { .type = NLA_S32, },
78 [SEG6_ATTR_HMACKEYID] = { .type = NLA_U32, },
79 [SEG6_ATTR_SECRET] = { .type = NLA_BINARY, },
80 [SEG6_ATTR_SECRETLEN] = { .type = NLA_U8, },
81 [SEG6_ATTR_ALGID] = { .type = NLA_U8, },
82 [SEG6_ATTR_HMACINFO] = { .type = NLA_NESTED, },
83};
84
85#ifdef CONFIG_IPV6_SEG6_HMAC
86
87static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info)
88{
89 struct net *net = genl_info_net(info);
90 struct seg6_pernet_data *sdata;
91 struct seg6_hmac_info *hinfo;
92 u32 hmackeyid;
93 char *secret;
94 int err = 0;
95 u8 algid;
96 u8 slen;
97
98 sdata = seg6_pernet(net);
99
100 if (!info->attrs[SEG6_ATTR_HMACKEYID] ||
101 !info->attrs[SEG6_ATTR_SECRETLEN] ||
102 !info->attrs[SEG6_ATTR_ALGID])
103 return -EINVAL;
104
105 hmackeyid = nla_get_u32(info->attrs[SEG6_ATTR_HMACKEYID]);
106 slen = nla_get_u8(info->attrs[SEG6_ATTR_SECRETLEN]);
107 algid = nla_get_u8(info->attrs[SEG6_ATTR_ALGID]);
108
109 if (hmackeyid == 0)
110 return -EINVAL;
111
112 if (slen > SEG6_HMAC_SECRET_LEN)
113 return -EINVAL;
114
115 mutex_lock(&sdata->lock);
116 hinfo = seg6_hmac_info_lookup(net, hmackeyid);
117
118 if (!slen) {
119 if (!hinfo)
120 err = -ENOENT;
121
122 err = seg6_hmac_info_del(net, hmackeyid);
123
124 goto out_unlock;
125 }
126
127 if (!info->attrs[SEG6_ATTR_SECRET]) {
128 err = -EINVAL;
129 goto out_unlock;
130 }
131
132 if (hinfo) {
133 err = seg6_hmac_info_del(net, hmackeyid);
134 if (err)
135 goto out_unlock;
136 }
137
138 secret = (char *)nla_data(info->attrs[SEG6_ATTR_SECRET]);
139
140 hinfo = kzalloc(sizeof(*hinfo), GFP_KERNEL);
141 if (!hinfo) {
142 err = -ENOMEM;
143 goto out_unlock;
144 }
145
146 memcpy(hinfo->secret, secret, slen);
147 hinfo->slen = slen;
148 hinfo->alg_id = algid;
149 hinfo->hmackeyid = hmackeyid;
150
151 err = seg6_hmac_info_add(net, hmackeyid, hinfo);
152 if (err)
153 kfree(hinfo);
154
155out_unlock:
156 mutex_unlock(&sdata->lock);
157 return err;
158}
159
160#else
161
162static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info)
163{
164 return -ENOTSUPP;
165}
166
167#endif
168
169static int seg6_genl_set_tunsrc(struct sk_buff *skb, struct genl_info *info)
170{
171 struct net *net = genl_info_net(info);
172 struct in6_addr *val, *t_old, *t_new;
173 struct seg6_pernet_data *sdata;
174
175 sdata = seg6_pernet(net);
176
177 if (!info->attrs[SEG6_ATTR_DST])
178 return -EINVAL;
179
180 val = nla_data(info->attrs[SEG6_ATTR_DST]);
181 t_new = kmemdup(val, sizeof(*val), GFP_KERNEL);
182 if (!t_new)
183 return -ENOMEM;
184
185 mutex_lock(&sdata->lock);
186
187 t_old = sdata->tun_src;
188 rcu_assign_pointer(sdata->tun_src, t_new);
189
190 mutex_unlock(&sdata->lock);
191
192 synchronize_net();
193 kfree(t_old);
194
195 return 0;
196}
197
198static int seg6_genl_get_tunsrc(struct sk_buff *skb, struct genl_info *info)
199{
200 struct net *net = genl_info_net(info);
201 struct in6_addr *tun_src;
202 struct sk_buff *msg;
203 void *hdr;
204
205 msg = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
206 if (!msg)
207 return -ENOMEM;
208
209 hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
210 &seg6_genl_family, 0, SEG6_CMD_GET_TUNSRC);
211 if (!hdr)
212 goto free_msg;
213
214 rcu_read_lock();
215 tun_src = rcu_dereference(seg6_pernet(net)->tun_src);
216
217 if (nla_put(msg, SEG6_ATTR_DST, sizeof(struct in6_addr), tun_src))
218 goto nla_put_failure;
219
220 rcu_read_unlock();
221
222 genlmsg_end(msg, hdr);
223 genlmsg_reply(msg, info);
224
225 return 0;
226
227nla_put_failure:
228 rcu_read_unlock();
229 genlmsg_cancel(msg, hdr);
230free_msg:
231 nlmsg_free(msg);
232 return -ENOMEM;
233}
234
235#ifdef CONFIG_IPV6_SEG6_HMAC
236
237static int __seg6_hmac_fill_info(struct seg6_hmac_info *hinfo,
238 struct sk_buff *msg)
239{
240 if (nla_put_u32(msg, SEG6_ATTR_HMACKEYID, hinfo->hmackeyid) ||
241 nla_put_u8(msg, SEG6_ATTR_SECRETLEN, hinfo->slen) ||
242 nla_put(msg, SEG6_ATTR_SECRET, hinfo->slen, hinfo->secret) ||
243 nla_put_u8(msg, SEG6_ATTR_ALGID, hinfo->alg_id))
244 return -1;
245
246 return 0;
247}
248
249static int __seg6_genl_dumphmac_element(struct seg6_hmac_info *hinfo,
250 u32 portid, u32 seq, u32 flags,
251 struct sk_buff *skb, u8 cmd)
252{
253 void *hdr;
254
255 hdr = genlmsg_put(skb, portid, seq, &seg6_genl_family, flags, cmd);
256 if (!hdr)
257 return -ENOMEM;
258
259 if (__seg6_hmac_fill_info(hinfo, skb) < 0)
260 goto nla_put_failure;
261
262 genlmsg_end(skb, hdr);
263 return 0;
264
265nla_put_failure:
266 genlmsg_cancel(skb, hdr);
267 return -EMSGSIZE;
268}
269
270static int seg6_genl_dumphmac_start(struct netlink_callback *cb)
271{
272 struct net *net = sock_net(cb->skb->sk);
273 struct seg6_pernet_data *sdata;
274 struct rhashtable_iter *iter;
275
276 sdata = seg6_pernet(net);
277 iter = (struct rhashtable_iter *)cb->args[0];
278
279 if (!iter) {
280 iter = kmalloc(sizeof(*iter), GFP_KERNEL);
281 if (!iter)
282 return -ENOMEM;
283
284 cb->args[0] = (long)iter;
285 }
286
287 rhashtable_walk_enter(&sdata->hmac_infos, iter);
288
289 return 0;
290}
291
292static int seg6_genl_dumphmac_done(struct netlink_callback *cb)
293{
294 struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
295
296 rhashtable_walk_exit(iter);
297
298 kfree(iter);
299
300 return 0;
301}
302
303static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb)
304{
305 struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
306 struct net *net = sock_net(skb->sk);
307 struct seg6_pernet_data *sdata;
308 struct seg6_hmac_info *hinfo;
309 int ret;
310
311 sdata = seg6_pernet(net);
312
313 ret = rhashtable_walk_start(iter);
314 if (ret && ret != -EAGAIN)
315 goto done;
316
317 for (;;) {
318 hinfo = rhashtable_walk_next(iter);
319
320 if (IS_ERR(hinfo)) {
321 if (PTR_ERR(hinfo) == -EAGAIN)
322 continue;
323 ret = PTR_ERR(hinfo);
324 goto done;
325 } else if (!hinfo) {
326 break;
327 }
328
329 ret = __seg6_genl_dumphmac_element(hinfo,
330 NETLINK_CB(cb->skb).portid,
331 cb->nlh->nlmsg_seq,
332 NLM_F_MULTI,
333 skb, SEG6_CMD_DUMPHMAC);
334 if (ret)
335 goto done;
336 }
337
338 ret = skb->len;
339
340done:
341 rhashtable_walk_stop(iter);
342 return ret;
343}
344
345#else
346
347static int seg6_genl_dumphmac_start(struct netlink_callback *cb)
348{
349 return 0;
350}
351
352static int seg6_genl_dumphmac_done(struct netlink_callback *cb)
353{
354 return 0;
355}
356
357static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb)
358{
359 return -ENOTSUPP;
360}
361
362#endif
363
364static int __net_init seg6_net_init(struct net *net)
365{
366 struct seg6_pernet_data *sdata;
367
368 sdata = kzalloc(sizeof(*sdata), GFP_KERNEL);
369 if (!sdata)
370 return -ENOMEM;
371
372 mutex_init(&sdata->lock);
373
374 sdata->tun_src = kzalloc(sizeof(*sdata->tun_src), GFP_KERNEL);
375 if (!sdata->tun_src) {
376 kfree(sdata);
377 return -ENOMEM;
378 }
379
380 net->ipv6.seg6_data = sdata;
381
382#ifdef CONFIG_IPV6_SEG6_HMAC
383 seg6_hmac_net_init(net);
384#endif
385
386 return 0;
387}
388
389static void __net_exit seg6_net_exit(struct net *net)
390{
391 struct seg6_pernet_data *sdata = seg6_pernet(net);
392
393#ifdef CONFIG_IPV6_SEG6_HMAC
394 seg6_hmac_net_exit(net);
395#endif
396
397 kfree(sdata->tun_src);
398 kfree(sdata);
399}
400
401static struct pernet_operations ip6_segments_ops = {
402 .init = seg6_net_init,
403 .exit = seg6_net_exit,
404};
405
406static const struct genl_ops seg6_genl_ops[] = {
407 {
408 .cmd = SEG6_CMD_SETHMAC,
409 .doit = seg6_genl_sethmac,
410 .policy = seg6_genl_policy,
411 .flags = GENL_ADMIN_PERM,
412 },
413 {
414 .cmd = SEG6_CMD_DUMPHMAC,
415 .start = seg6_genl_dumphmac_start,
416 .dumpit = seg6_genl_dumphmac,
417 .done = seg6_genl_dumphmac_done,
418 .policy = seg6_genl_policy,
419 .flags = GENL_ADMIN_PERM,
420 },
421 {
422 .cmd = SEG6_CMD_SET_TUNSRC,
423 .doit = seg6_genl_set_tunsrc,
424 .policy = seg6_genl_policy,
425 .flags = GENL_ADMIN_PERM,
426 },
427 {
428 .cmd = SEG6_CMD_GET_TUNSRC,
429 .doit = seg6_genl_get_tunsrc,
430 .policy = seg6_genl_policy,
431 .flags = GENL_ADMIN_PERM,
432 },
433};
434
435static struct genl_family seg6_genl_family __ro_after_init = {
436 .hdrsize = 0,
437 .name = SEG6_GENL_NAME,
438 .version = SEG6_GENL_VERSION,
439 .maxattr = SEG6_ATTR_MAX,
440 .netnsok = true,
441 .parallel_ops = true,
442 .ops = seg6_genl_ops,
443 .n_ops = ARRAY_SIZE(seg6_genl_ops),
444 .module = THIS_MODULE,
445};
446
447int __init seg6_init(void)
448{
449 int err = -ENOMEM;
450
451 err = genl_register_family(&seg6_genl_family);
452 if (err)
453 goto out;
454
455 err = register_pernet_subsys(&ip6_segments_ops);
456 if (err)
457 goto out_unregister_genl;
458
459#ifdef CONFIG_IPV6_SEG6_LWTUNNEL
460 err = seg6_iptunnel_init();
461 if (err)
462 goto out_unregister_pernet;
463#endif
464
465#ifdef CONFIG_IPV6_SEG6_HMAC
466 err = seg6_hmac_init();
467 if (err)
468 goto out_unregister_iptun;
469#endif
470
471 pr_info("Segment Routing with IPv6\n");
472
473out:
474 return err;
475#ifdef CONFIG_IPV6_SEG6_HMAC
476out_unregister_iptun:
477#ifdef CONFIG_IPV6_SEG6_LWTUNNEL
478 seg6_iptunnel_exit();
479#endif
480#endif
481#ifdef CONFIG_IPV6_SEG6_LWTUNNEL
482out_unregister_pernet:
483 unregister_pernet_subsys(&ip6_segments_ops);
484#endif
485out_unregister_genl:
486 genl_unregister_family(&seg6_genl_family);
487 goto out;
488}
489
490void seg6_exit(void)
491{
492#ifdef CONFIG_IPV6_SEG6_HMAC
493 seg6_hmac_exit();
494#endif
495#ifdef CONFIG_IPV6_SEG6_LWTUNNEL
496 seg6_iptunnel_exit();
497#endif
498 unregister_pernet_subsys(&ip6_segments_ops);
499 genl_unregister_family(&seg6_genl_family);
500}
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
new file mode 100644
index 000000000000..f950cb53d5e3
--- /dev/null
+++ b/net/ipv6/seg6_hmac.c
@@ -0,0 +1,448 @@
1/*
2 * SR-IPv6 implementation -- HMAC functions
3 *
4 * Author:
5 * David Lebrun <david.lebrun@uclouvain.be>
6 *
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 */
13
14#include <linux/errno.h>
15#include <linux/types.h>
16#include <linux/socket.h>
17#include <linux/sockios.h>
18#include <linux/net.h>
19#include <linux/netdevice.h>
20#include <linux/in6.h>
21#include <linux/icmpv6.h>
22#include <linux/mroute6.h>
23#include <linux/slab.h>
24
25#include <linux/netfilter.h>
26#include <linux/netfilter_ipv6.h>
27
28#include <net/sock.h>
29#include <net/snmp.h>
30
31#include <net/ipv6.h>
32#include <net/protocol.h>
33#include <net/transp_v6.h>
34#include <net/rawv6.h>
35#include <net/ndisc.h>
36#include <net/ip6_route.h>
37#include <net/addrconf.h>
38#include <net/xfrm.h>
39
40#include <linux/cryptohash.h>
41#include <crypto/hash.h>
42#include <crypto/sha.h>
43#include <net/seg6.h>
44#include <net/genetlink.h>
45#include <net/seg6_hmac.h>
46#include <linux/random.h>
47
48static DEFINE_PER_CPU(char [SEG6_HMAC_RING_SIZE], hmac_ring);
49
50static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
51{
52 const struct seg6_hmac_info *hinfo = obj;
53
54 return (hinfo->hmackeyid != *(__u32 *)arg->key);
55}
56
57static inline void seg6_hinfo_release(struct seg6_hmac_info *hinfo)
58{
59 kfree_rcu(hinfo, rcu);
60}
61
62static void seg6_free_hi(void *ptr, void *arg)
63{
64 struct seg6_hmac_info *hinfo = (struct seg6_hmac_info *)ptr;
65
66 if (hinfo)
67 seg6_hinfo_release(hinfo);
68}
69
70static const struct rhashtable_params rht_params = {
71 .head_offset = offsetof(struct seg6_hmac_info, node),
72 .key_offset = offsetof(struct seg6_hmac_info, hmackeyid),
73 .key_len = sizeof(u32),
74 .automatic_shrinking = true,
75 .obj_cmpfn = seg6_hmac_cmpfn,
76};
77
78static struct seg6_hmac_algo hmac_algos[] = {
79 {
80 .alg_id = SEG6_HMAC_ALGO_SHA1,
81 .name = "hmac(sha1)",
82 },
83 {
84 .alg_id = SEG6_HMAC_ALGO_SHA256,
85 .name = "hmac(sha256)",
86 },
87};
88
89static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh)
90{
91 struct sr6_tlv_hmac *tlv;
92
93 if (srh->hdrlen < (srh->first_segment + 1) * 2 + 5)
94 return NULL;
95
96 if (!sr_has_hmac(srh))
97 return NULL;
98
99 tlv = (struct sr6_tlv_hmac *)
100 ((char *)srh + ((srh->hdrlen + 1) << 3) - 40);
101
102 if (tlv->tlvhdr.type != SR6_TLV_HMAC || tlv->tlvhdr.len != 38)
103 return NULL;
104
105 return tlv;
106}
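
The constants in seg6_get_tlv_hmac() come straight from the HMAC TLV layout: a 2-byte TLV header, 2 reserved bytes, a 4-byte key id and a 32-byte HMAC field, i.e. 40 bytes in total, hence tlvhdr.len == 38 and the TLV sitting in the last 40 bytes of the SRH. Expressed in hdrlen units of 8 bytes that is 5, and every 16-byte segment adds 2, which gives the (first_segment + 1) * 2 + 5 lower bound. A tiny sanity check of that arithmetic, with an illustrative struct that mirrors (but is not) the kernel's struct sr6_tlv_hmac:

    #include <assert.h>
    #include <stdint.h>

    /* illustrative layout of the HMAC TLV (assumed to match sr6_tlv_hmac) */
    struct hmac_tlv {
        uint8_t  type;      /* SR6_TLV_HMAC */
        uint8_t  len;       /* 38: everything after this 2-byte header */
        uint16_t reserved;
        uint32_t hmackeyid;
        uint8_t  hmac[32];
    };

    int main(void)
    {
        /* 2 + 2 + 4 + 32 = 40 bytes total, i.e. 5 units of 8 bytes */
        assert(sizeof(struct hmac_tlv) == 40);

        /* minimal SRH carrying n segments plus the HMAC TLV, in hdrlen
         * units: (16 * n + 40) / 8 = 2 * n + 5, matching the lower bound
         * checked in the code above
         */
        for (unsigned n = 1; n <= 14; n++)
            assert((16 * n + sizeof(struct hmac_tlv)) / 8 == 2 * n + 5);
        return 0;
    }
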
107
108static struct seg6_hmac_algo *__hmac_get_algo(u8 alg_id)
109{
110 struct seg6_hmac_algo *algo;
111 int i, alg_count;
112
113 alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo);
114 for (i = 0; i < alg_count; i++) {
115 algo = &hmac_algos[i];
116 if (algo->alg_id == alg_id)
117 return algo;
118 }
119
120 return NULL;
121}
122
123static int __do_hmac(struct seg6_hmac_info *hinfo, const char *text, u8 psize,
124 u8 *output, int outlen)
125{
126 struct seg6_hmac_algo *algo;
127 struct crypto_shash *tfm;
128 struct shash_desc *shash;
129 int ret, dgsize;
130
131 algo = __hmac_get_algo(hinfo->alg_id);
132 if (!algo)
133 return -ENOENT;
134
135 tfm = *this_cpu_ptr(algo->tfms);
136
137 dgsize = crypto_shash_digestsize(tfm);
138 if (dgsize > outlen) {
139 pr_debug("sr-ipv6: __do_hmac: digest size too big (%d / %d)\n",
140 dgsize, outlen);
141 return -ENOMEM;
142 }
143
144 ret = crypto_shash_setkey(tfm, hinfo->secret, hinfo->slen);
145 if (ret < 0) {
146 pr_debug("sr-ipv6: crypto_shash_setkey failed: err %d\n", ret);
147 goto failed;
148 }
149
150 shash = *this_cpu_ptr(algo->shashs);
151 shash->tfm = tfm;
152
153 ret = crypto_shash_digest(shash, text, psize, output);
154 if (ret < 0) {
155 pr_debug("sr-ipv6: crypto_shash_digest failed: err %d\n", ret);
156 goto failed;
157 }
158
159 return dgsize;
160
161failed:
162 return ret;
163}
164
165int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
166 struct in6_addr *saddr, u8 *output)
167{
168 __be32 hmackeyid = cpu_to_be32(hinfo->hmackeyid);
169 u8 tmp_out[SEG6_HMAC_MAX_DIGESTSIZE];
170 int plen, i, dgsize, wrsize;
171 char *ring, *off;
172
173 /* a 160-byte buffer for digest output is enough to hold the largest
174 * known hash output (RadioGatun, up to 1216 bits)
175 */
176
177 /* saddr(16) + first_seg(1) + flags(1) + keyid(4) + seglist(16n) */
178 plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16;
179
180 /* this limit allows for 14 segments */
181 if (plen >= SEG6_HMAC_RING_SIZE)
182 return -EMSGSIZE;
183
184 /* Let's build the HMAC text on the ring buffer. The text is composed
185 * as follows, in order:
186 *
187 * 1. Source IPv6 address (128 bits)
188 * 2. first_segment value (8 bits)
189 * 3. Flags (8 bits)
190 * 4. HMAC Key ID (32 bits)
191 * 5. All segments in the segments list (n * 128 bits)
192 */
193
194 local_bh_disable();
195 ring = this_cpu_ptr(hmac_ring);
196 off = ring;
197
198 /* source address */
199 memcpy(off, saddr, 16);
200 off += 16;
201
202 /* first_segment value */
203 *off++ = hdr->first_segment;
204
205 /* flags */
206 *off++ = hdr->flags;
207
208 /* HMAC Key ID */
209 memcpy(off, &hmackeyid, 4);
210 off += 4;
211
212 /* all segments in the list */
213 for (i = 0; i < hdr->first_segment + 1; i++) {
214 memcpy(off, hdr->segments + i, 16);
215 off += 16;
216 }
217
218 dgsize = __do_hmac(hinfo, ring, plen, tmp_out,
219 SEG6_HMAC_MAX_DIGESTSIZE);
220 local_bh_enable();
221
222 if (dgsize < 0)
223 return dgsize;
224
225 wrsize = SEG6_HMAC_FIELD_LEN;
226 if (wrsize > dgsize)
227 wrsize = dgsize;
228
229 memset(output, 0, SEG6_HMAC_FIELD_LEN);
230 memcpy(output, tmp_out, wrsize);
231
232 return 0;
233}
234EXPORT_SYMBOL(seg6_hmac_compute);
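
The -EMSGSIZE limit above follows directly from the per-CPU scratch buffer: the HMAC input is saddr (16) + first_segment (1) + flags (1) + key id (4) + 16 bytes per listed segment, so with a 256-byte ring (the assumed value of SEG6_HMAC_RING_SIZE) at most 14 segments fit. A quick stand-alone check of that bound:

    #include <stdio.h>

    /* assumed per-CPU scratch size, mirroring SEG6_HMAC_RING_SIZE */
    #define RING_SIZE 256

    /* saddr(16) + first_segment(1) + flags(1) + keyid(4) + 16 per segment */
    static int hmac_text_len(int nsegs)
    {
        return 16 + 1 + 1 + 4 + 16 * nsegs;
    }

    int main(void)
    {
        for (int n = 1; n <= 16; n++)
            printf("%2d segments -> %3d bytes (%s)\n", n, hmac_text_len(n),
                   hmac_text_len(n) < RING_SIZE ? "fits" : "too big");
        /* 14 segments -> 246 bytes fits; 15 -> 262 exceeds the ring,
         * which is where the -EMSGSIZE above comes from
         */
        return 0;
    }
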
235
236/* checks if an incoming SR-enabled packet's HMAC status matches
237 * the incoming policy.
238 *
239 * called with rcu_read_lock()
240 */
241bool seg6_hmac_validate_skb(struct sk_buff *skb)
242{
243 u8 hmac_output[SEG6_HMAC_FIELD_LEN];
244 struct net *net = dev_net(skb->dev);
245 struct seg6_hmac_info *hinfo;
246 struct sr6_tlv_hmac *tlv;
247 struct ipv6_sr_hdr *srh;
248 struct inet6_dev *idev;
249
250 idev = __in6_dev_get(skb->dev);
251
252 srh = (struct ipv6_sr_hdr *)skb_transport_header(skb);
253
254 tlv = seg6_get_tlv_hmac(srh);
255
256 /* mandatory check but no tlv */
257 if (idev->cnf.seg6_require_hmac > 0 && !tlv)
258 return false;
259
260 /* no check */
261 if (idev->cnf.seg6_require_hmac < 0)
262 return true;
263
264 /* check only if present */
265 if (idev->cnf.seg6_require_hmac == 0 && !tlv)
266 return true;
267
268 /* now, seg6_require_hmac >= 0 && tlv */
269
270 hinfo = seg6_hmac_info_lookup(net, be32_to_cpu(tlv->hmackeyid));
271 if (!hinfo)
272 return false;
273
274 if (seg6_hmac_compute(hinfo, srh, &ipv6_hdr(skb)->saddr, hmac_output))
275 return false;
276
277 if (memcmp(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN) != 0)
278 return false;
279
280 return true;
281}
282EXPORT_SYMBOL(seg6_hmac_validate_skb);
283
284/* called with rcu_read_lock() */
285struct seg6_hmac_info *seg6_hmac_info_lookup(struct net *net, u32 key)
286{
287 struct seg6_pernet_data *sdata = seg6_pernet(net);
288 struct seg6_hmac_info *hinfo;
289
290 hinfo = rhashtable_lookup_fast(&sdata->hmac_infos, &key, rht_params);
291
292 return hinfo;
293}
294EXPORT_SYMBOL(seg6_hmac_info_lookup);
295
296int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo)
297{
298 struct seg6_pernet_data *sdata = seg6_pernet(net);
299 int err;
300
301 err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node,
302 rht_params);
303
304 return err;
305}
306EXPORT_SYMBOL(seg6_hmac_info_add);
307
308int seg6_hmac_info_del(struct net *net, u32 key)
309{
310 struct seg6_pernet_data *sdata = seg6_pernet(net);
311 struct seg6_hmac_info *hinfo;
312 int err = -ENOENT;
313
314 hinfo = rhashtable_lookup_fast(&sdata->hmac_infos, &key, rht_params);
315 if (!hinfo)
316 goto out;
317
318 err = rhashtable_remove_fast(&sdata->hmac_infos, &hinfo->node,
319 rht_params);
320 if (err)
321 goto out;
322
323 seg6_hinfo_release(hinfo);
324
325out:
326 return err;
327}
328EXPORT_SYMBOL(seg6_hmac_info_del);
329
330int seg6_push_hmac(struct net *net, struct in6_addr *saddr,
331 struct ipv6_sr_hdr *srh)
332{
333 struct seg6_hmac_info *hinfo;
334 struct sr6_tlv_hmac *tlv;
335 int err = -ENOENT;
336
337 tlv = seg6_get_tlv_hmac(srh);
338 if (!tlv)
339 return -EINVAL;
340
341 rcu_read_lock();
342
343 hinfo = seg6_hmac_info_lookup(net, be32_to_cpu(tlv->hmackeyid));
344 if (!hinfo)
345 goto out;
346
347 memset(tlv->hmac, 0, SEG6_HMAC_FIELD_LEN);
348 err = seg6_hmac_compute(hinfo, srh, saddr, tlv->hmac);
349
350out:
351 rcu_read_unlock();
352 return err;
353}
354EXPORT_SYMBOL(seg6_push_hmac);
355
356static int seg6_hmac_init_algo(void)
357{
358 struct seg6_hmac_algo *algo;
359 struct crypto_shash *tfm;
360 struct shash_desc *shash;
361 int i, alg_count, cpu;
362
363 alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo);
364
365 for (i = 0; i < alg_count; i++) {
366 struct crypto_shash **p_tfm;
367 int shsize;
368
369 algo = &hmac_algos[i];
370 algo->tfms = alloc_percpu(struct crypto_shash *);
371 if (!algo->tfms)
372 return -ENOMEM;
373
374 for_each_possible_cpu(cpu) {
375 tfm = crypto_alloc_shash(algo->name, 0, GFP_KERNEL);
376 if (IS_ERR(tfm))
377 return PTR_ERR(tfm);
378 p_tfm = per_cpu_ptr(algo->tfms, cpu);
379 *p_tfm = tfm;
380 }
381
382 p_tfm = raw_cpu_ptr(algo->tfms);
383 tfm = *p_tfm;
384
385 shsize = sizeof(*shash) + crypto_shash_descsize(tfm);
386
387 algo->shashs = alloc_percpu(struct shash_desc *);
388 if (!algo->shashs)
389 return -ENOMEM;
390
391 for_each_possible_cpu(cpu) {
392 shash = kzalloc_node(shsize, GFP_KERNEL,
393 cpu_to_node(cpu));
394 if (!shash)
395 return -ENOMEM;
396 *per_cpu_ptr(algo->shashs, cpu) = shash;
397 }
398 }
399
400 return 0;
401}
402
403int __init seg6_hmac_init(void)
404{
405 return seg6_hmac_init_algo();
406}
407EXPORT_SYMBOL(seg6_hmac_init);
408
409int __net_init seg6_hmac_net_init(struct net *net)
410{
411 struct seg6_pernet_data *sdata = seg6_pernet(net);
412
413 rhashtable_init(&sdata->hmac_infos, &rht_params);
414
415 return 0;
416}
417EXPORT_SYMBOL(seg6_hmac_net_init);
418
419void seg6_hmac_exit(void)
420{
421 struct seg6_hmac_algo *algo = NULL;
422 int i, alg_count, cpu;
423
424 alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo);
425 for (i = 0; i < alg_count; i++) {
426 algo = &hmac_algos[i];
427 for_each_possible_cpu(cpu) {
428 struct crypto_shash *tfm;
429 struct shash_desc *shash;
430
431 shash = *per_cpu_ptr(algo->shashs, cpu);
432 kfree(shash);
433 tfm = *per_cpu_ptr(algo->tfms, cpu);
434 crypto_free_shash(tfm);
435 }
436 free_percpu(algo->tfms);
437 free_percpu(algo->shashs);
438 }
439}
440EXPORT_SYMBOL(seg6_hmac_exit);
441
442void __net_exit seg6_hmac_net_exit(struct net *net)
443{
444 struct seg6_pernet_data *sdata = seg6_pernet(net);
445
446 rhashtable_free_and_destroy(&sdata->hmac_infos, seg6_free_hi, NULL);
447}
448EXPORT_SYMBOL(seg6_hmac_net_exit);
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
new file mode 100644
index 000000000000..85582257d3af
--- /dev/null
+++ b/net/ipv6/seg6_iptunnel.c
@@ -0,0 +1,436 @@
1/*
2 * SR-IPv6 implementation
3 *
4 * Author:
5 * David Lebrun <david.lebrun@uclouvain.be>
6 *
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 */
13
14#include <linux/types.h>
15#include <linux/skbuff.h>
16#include <linux/net.h>
17#include <linux/module.h>
18#include <net/ip.h>
19#include <net/lwtunnel.h>
20#include <net/netevent.h>
21#include <net/netns/generic.h>
22#include <net/ip6_fib.h>
23#include <net/route.h>
24#include <net/seg6.h>
25#include <linux/seg6.h>
26#include <linux/seg6_iptunnel.h>
27#include <net/addrconf.h>
28#include <net/ip6_route.h>
29#ifdef CONFIG_DST_CACHE
30#include <net/dst_cache.h>
31#endif
32#ifdef CONFIG_IPV6_SEG6_HMAC
33#include <net/seg6_hmac.h>
34#endif
35
36struct seg6_lwt {
37#ifdef CONFIG_DST_CACHE
38 struct dst_cache cache;
39#endif
40 struct seg6_iptunnel_encap tuninfo[0];
41};
42
43static inline struct seg6_lwt *seg6_lwt_lwtunnel(struct lwtunnel_state *lwt)
44{
45 return (struct seg6_lwt *)lwt->data;
46}
47
48static inline struct seg6_iptunnel_encap *
49seg6_encap_lwtunnel(struct lwtunnel_state *lwt)
50{
51 return seg6_lwt_lwtunnel(lwt)->tuninfo;
52}
53
54static const struct nla_policy seg6_iptunnel_policy[SEG6_IPTUNNEL_MAX + 1] = {
55 [SEG6_IPTUNNEL_SRH] = { .type = NLA_BINARY },
56};
57
58static int nla_put_srh(struct sk_buff *skb, int attrtype,
59 struct seg6_iptunnel_encap *tuninfo)
60{
61 struct seg6_iptunnel_encap *data;
62 struct nlattr *nla;
63 int len;
64
65 len = SEG6_IPTUN_ENCAP_SIZE(tuninfo);
66
67 nla = nla_reserve(skb, attrtype, len);
68 if (!nla)
69 return -EMSGSIZE;
70
71 data = nla_data(nla);
72 memcpy(data, tuninfo, len);
73
74 return 0;
75}
76
77static void set_tun_src(struct net *net, struct net_device *dev,
78 struct in6_addr *daddr, struct in6_addr *saddr)
79{
80 struct seg6_pernet_data *sdata = seg6_pernet(net);
81 struct in6_addr *tun_src;
82
83 rcu_read_lock();
84
85 tun_src = rcu_dereference(sdata->tun_src);
86
87 if (!ipv6_addr_any(tun_src)) {
88 memcpy(saddr, tun_src, sizeof(struct in6_addr));
89 } else {
90 ipv6_dev_get_saddr(net, dev, daddr, IPV6_PREFER_SRC_PUBLIC,
91 saddr);
92 }
93
94 rcu_read_unlock();
95}
96
97/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */
98static int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
99{
100 struct net *net = dev_net(skb_dst(skb)->dev);
101 struct ipv6hdr *hdr, *inner_hdr;
102 struct ipv6_sr_hdr *isrh;
103 int hdrlen, tot_len, err;
104
105 hdrlen = (osrh->hdrlen + 1) << 3;
106 tot_len = hdrlen + sizeof(*hdr);
107
108 err = pskb_expand_head(skb, tot_len, 0, GFP_ATOMIC);
109 if (unlikely(err))
110 return err;
111
112 inner_hdr = ipv6_hdr(skb);
113
114 skb_push(skb, tot_len);
115 skb_reset_network_header(skb);
116 skb_mac_header_rebuild(skb);
117 hdr = ipv6_hdr(skb);
118
119 /* inherit tc, flowlabel and hlim
120 * hlim will be decremented in ip6_forward() afterwards and
121 * decapsulation will overwrite inner hlim with outer hlim
122 */
123 ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)),
124 ip6_flowlabel(inner_hdr));
125 hdr->hop_limit = inner_hdr->hop_limit;
126 hdr->nexthdr = NEXTHDR_ROUTING;
127
128 isrh = (void *)hdr + sizeof(*hdr);
129 memcpy(isrh, osrh, hdrlen);
130
131 isrh->nexthdr = NEXTHDR_IPV6;
132
133 hdr->daddr = isrh->segments[isrh->first_segment];
134 set_tun_src(net, skb->dev, &hdr->daddr, &hdr->saddr);
135
136#ifdef CONFIG_IPV6_SEG6_HMAC
137 if (sr_has_hmac(isrh)) {
138 err = seg6_push_hmac(net, &hdr->saddr, isrh);
139 if (unlikely(err))
140 return err;
141 }
142#endif
143
144 skb_postpush_rcsum(skb, hdr, tot_len);
145
146 return 0;
147}
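
In encap mode the packet grows by a full outer IPv6 header plus the SRH, and the outer destination becomes segments[first_segment] (the first segment to visit, since the list is encoded in reverse order), while the outer source comes from the configured tunnel source or from normal source-address selection on the egress device. A rough sketch of the added overhead, ignoring optional TLVs such as HMAC:

    #include <stdio.h>

    /* rough estimate of the encap-mode overhead added by seg6_do_srh_encap():
     * a full outer IPv6 header plus an SRH carrying every segment
     */
    static int seg6_encap_overhead(int nsegs)
    {
        int srh_len = 8 + 16 * nsegs;   /* fixed SRH part + 128-bit segments */
        return 40 + srh_len;            /* outer IPv6 header + SRH */
    }

    int main(void)
    {
        for (int n = 1; n <= 4; n++)
            printf("%d segment(s): +%d bytes of encapsulation\n",
                   n, seg6_encap_overhead(n));
        return 0;
    }
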
148
149/* insert an SRH within an IPv6 packet, just after the IPv6 header */
150#ifdef CONFIG_IPV6_SEG6_INLINE
151static int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
152{
153 struct ipv6hdr *hdr, *oldhdr;
154 struct ipv6_sr_hdr *isrh;
155 int hdrlen, err;
156
157 hdrlen = (osrh->hdrlen + 1) << 3;
158
159 err = pskb_expand_head(skb, hdrlen, 0, GFP_ATOMIC);
160 if (unlikely(err))
161 return err;
162
163 oldhdr = ipv6_hdr(skb);
164
165 skb_pull(skb, sizeof(struct ipv6hdr));
166 skb_postpull_rcsum(skb, skb_network_header(skb),
167 sizeof(struct ipv6hdr));
168
169 skb_push(skb, sizeof(struct ipv6hdr) + hdrlen);
170 skb_reset_network_header(skb);
171 skb_mac_header_rebuild(skb);
172
173 hdr = ipv6_hdr(skb);
174
175 memmove(hdr, oldhdr, sizeof(*hdr));
176
177 isrh = (void *)hdr + sizeof(*hdr);
178 memcpy(isrh, osrh, hdrlen);
179
180 isrh->nexthdr = hdr->nexthdr;
181 hdr->nexthdr = NEXTHDR_ROUTING;
182
183 isrh->segments[0] = hdr->daddr;
184 hdr->daddr = isrh->segments[isrh->first_segment];
185
186#ifdef CONFIG_IPV6_SEG6_HMAC
187 if (sr_has_hmac(isrh)) {
188 struct net *net = dev_net(skb_dst(skb)->dev);
189
190 err = seg6_push_hmac(net, &hdr->saddr, isrh);
191 if (unlikely(err))
192 return err;
193 }
194#endif
195
196 skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen);
197
198 return 0;
199}
200#endif
201
202static int seg6_do_srh(struct sk_buff *skb)
203{
204 struct dst_entry *dst = skb_dst(skb);
205 struct seg6_iptunnel_encap *tinfo;
206 int err = 0;
207
208 tinfo = seg6_encap_lwtunnel(dst->lwtstate);
209
210 if (likely(!skb->encapsulation)) {
211 skb_reset_inner_headers(skb);
212 skb->encapsulation = 1;
213 }
214
215 switch (tinfo->mode) {
216#ifdef CONFIG_IPV6_SEG6_INLINE
217 case SEG6_IPTUN_MODE_INLINE:
218 err = seg6_do_srh_inline(skb, tinfo->srh);
219 skb_reset_inner_headers(skb);
220 break;
221#endif
222 case SEG6_IPTUN_MODE_ENCAP:
223 err = seg6_do_srh_encap(skb, tinfo->srh);
224 break;
225 }
226
227 if (err)
228 return err;
229
230 ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
231 skb_set_transport_header(skb, sizeof(struct ipv6hdr));
232
233 skb_set_inner_protocol(skb, skb->protocol);
234
235 return 0;
236}
237
238static int seg6_input(struct sk_buff *skb)
239{
240 int err;
241
242 err = seg6_do_srh(skb);
243 if (unlikely(err)) {
244 kfree_skb(skb);
245 return err;
246 }
247
248 skb_dst_drop(skb);
249 ip6_route_input(skb);
250
251 return dst_input(skb);
252}
253
254static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
255{
256 struct dst_entry *orig_dst = skb_dst(skb);
257 struct dst_entry *dst = NULL;
258 struct seg6_lwt *slwt;
259 int err = -EINVAL;
260
261 err = seg6_do_srh(skb);
262 if (unlikely(err))
263 goto drop;
264
265 slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate);
266
267#ifdef CONFIG_DST_CACHE
268 preempt_disable();
269 dst = dst_cache_get(&slwt->cache);
270 preempt_enable();
271#endif
272
273 if (unlikely(!dst)) {
274 struct ipv6hdr *hdr = ipv6_hdr(skb);
275 struct flowi6 fl6;
276
277 fl6.daddr = hdr->daddr;
278 fl6.saddr = hdr->saddr;
279 fl6.flowlabel = ip6_flowinfo(hdr);
280 fl6.flowi6_mark = skb->mark;
281 fl6.flowi6_proto = hdr->nexthdr;
282
283 dst = ip6_route_output(net, NULL, &fl6);
284 if (dst->error) {
285 err = dst->error;
286 dst_release(dst);
287 goto drop;
288 }
289
290#ifdef CONFIG_DST_CACHE
291 preempt_disable();
292 dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr);
293 preempt_enable();
294#endif
295 }
296
297 skb_dst_drop(skb);
298 skb_dst_set(skb, dst);
299
300 return dst_output(net, sk, skb);
301drop:
302 kfree_skb(skb);
303 return err;
304}
305
306static int seg6_build_state(struct nlattr *nla,
307 unsigned int family, const void *cfg,
308 struct lwtunnel_state **ts)
309{
310 struct nlattr *tb[SEG6_IPTUNNEL_MAX + 1];
311 struct seg6_iptunnel_encap *tuninfo;
312 struct lwtunnel_state *newts;
313 int tuninfo_len, min_size;
314 struct seg6_lwt *slwt;
315 int err;
316
317 err = nla_parse_nested(tb, SEG6_IPTUNNEL_MAX, nla,
318 seg6_iptunnel_policy);
319
320 if (err < 0)
321 return err;
322
323 if (!tb[SEG6_IPTUNNEL_SRH])
324 return -EINVAL;
325
326 tuninfo = nla_data(tb[SEG6_IPTUNNEL_SRH]);
327 tuninfo_len = nla_len(tb[SEG6_IPTUNNEL_SRH]);
328
329 /* tuninfo must contain at least the iptunnel encap structure,
330 * the SRH and one segment
331 */
332 min_size = sizeof(*tuninfo) + sizeof(struct ipv6_sr_hdr) +
333 sizeof(struct in6_addr);
334 if (tuninfo_len < min_size)
335 return -EINVAL;
336
337 switch (tuninfo->mode) {
338#ifdef CONFIG_IPV6_SEG6_INLINE
339 case SEG6_IPTUN_MODE_INLINE:
340 break;
341#endif
342 case SEG6_IPTUN_MODE_ENCAP:
343 break;
344 default:
345 return -EINVAL;
346 }
347
348 /* verify that SRH is consistent */
349 if (!seg6_validate_srh(tuninfo->srh, tuninfo_len - sizeof(*tuninfo)))
350 return -EINVAL;
351
352 newts = lwtunnel_state_alloc(tuninfo_len + sizeof(*slwt));
353 if (!newts)
354 return -ENOMEM;
355
356 slwt = seg6_lwt_lwtunnel(newts);
357
358#ifdef CONFIG_DST_CACHE
359 err = dst_cache_init(&slwt->cache, GFP_KERNEL);
360 if (err) {
361 kfree(newts);
362 return err;
363 }
364#endif
365
366 memcpy(&slwt->tuninfo, tuninfo, tuninfo_len);
367
368 newts->type = LWTUNNEL_ENCAP_SEG6;
369 newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT |
370 LWTUNNEL_STATE_INPUT_REDIRECT;
371 newts->headroom = seg6_lwt_headroom(tuninfo);
372
373 *ts = newts;
374
375 return 0;
376}
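
On common ABIs the min_size check above works out to 4 (mode) + 8 (fixed SRH part) + 16 (one segment) = 28 bytes. A hypothetical user-space helper that lays out such a SEG6_IPTUNNEL_SRH payload; field names follow linux/seg6.h and linux/seg6_iptunnel.h, but the structs here are simplified stand-ins and the buffer is assumed large enough:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* simplified stand-in for the SRH layout (not the UAPI definition) */
    struct srh {
        uint8_t  nexthdr, hdrlen, type, segments_left;
        uint8_t  first_segment, flags;
        uint16_t tag;
        uint8_t  segments[][16];
    };

    /* 4-byte mode followed by an SRH listing the segments in reverse order */
    static size_t build_seg6_encap(uint8_t *buf, const uint8_t segs[][16], int n)
    {
        uint32_t mode = 1;                      /* SEG6_IPTUN_MODE_ENCAP */
        struct srh *srh = (struct srh *)(buf + sizeof(mode));
        size_t srh_len = sizeof(*srh) + (size_t)n * 16;

        memcpy(buf, &mode, sizeof(mode));
        memset(srh, 0, srh_len);
        srh->type = 4;                          /* IPV6_SRCRT_TYPE_4 */
        srh->hdrlen = (srh_len >> 3) - 1;
        srh->first_segment = n - 1;
        srh->segments_left = n - 1;             /* must equal first_segment */
        for (int i = 0; i < n; i++)
            memcpy(srh->segments[i], segs[i], 16);

        return sizeof(mode) + srh_len;          /* 28 bytes for one segment */
    }

With an iproute2 build that understands SRv6, a payload of this shape is roughly what "ip -6 route add ... encap seg6 mode encap segs ..." generates under the hood.
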
377
378#ifdef CONFIG_DST_CACHE
379static void seg6_destroy_state(struct lwtunnel_state *lwt)
380{
381 dst_cache_destroy(&seg6_lwt_lwtunnel(lwt)->cache);
382}
383#endif
384
385static int seg6_fill_encap_info(struct sk_buff *skb,
386 struct lwtunnel_state *lwtstate)
387{
388 struct seg6_iptunnel_encap *tuninfo = seg6_encap_lwtunnel(lwtstate);
389
390 if (nla_put_srh(skb, SEG6_IPTUNNEL_SRH, tuninfo))
391 return -EMSGSIZE;
392
393 return 0;
394}
395
396static int seg6_encap_nlsize(struct lwtunnel_state *lwtstate)
397{
398 struct seg6_iptunnel_encap *tuninfo = seg6_encap_lwtunnel(lwtstate);
399
400 return nla_total_size(SEG6_IPTUN_ENCAP_SIZE(tuninfo));
401}
402
403static int seg6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
404{
405 struct seg6_iptunnel_encap *a_hdr = seg6_encap_lwtunnel(a);
406 struct seg6_iptunnel_encap *b_hdr = seg6_encap_lwtunnel(b);
407 int len = SEG6_IPTUN_ENCAP_SIZE(a_hdr);
408
409 if (len != SEG6_IPTUN_ENCAP_SIZE(b_hdr))
410 return 1;
411
412 return memcmp(a_hdr, b_hdr, len);
413}
414
415static const struct lwtunnel_encap_ops seg6_iptun_ops = {
416 .build_state = seg6_build_state,
417#ifdef CONFIG_DST_CACHE
418 .destroy_state = seg6_destroy_state,
419#endif
420 .output = seg6_output,
421 .input = seg6_input,
422 .fill_encap = seg6_fill_encap_info,
423 .get_encap_size = seg6_encap_nlsize,
424 .cmp_encap = seg6_encap_cmp,
425 .owner = THIS_MODULE,
426};
427
428int __init seg6_iptunnel_init(void)
429{
430 return lwtunnel_encap_add_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6);
431}
432
433void seg6_iptunnel_exit(void)
434{
435 lwtunnel_encap_del_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6);
436}
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index b1cdf8009d29..99853c6e33a8 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -31,7 +31,7 @@
31#include <linux/if_arp.h> 31#include <linux/if_arp.h>
32#include <linux/icmp.h> 32#include <linux/icmp.h>
33#include <linux/slab.h> 33#include <linux/slab.h>
34#include <asm/uaccess.h> 34#include <linux/uaccess.h>
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/netfilter_ipv4.h> 36#include <linux/netfilter_ipv4.h>
37#include <linux/if_ether.h> 37#include <linux/if_ether.h>
@@ -76,7 +76,7 @@ static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst,
76 __be32 *v4dst); 76 __be32 *v4dst);
77static struct rtnl_link_ops sit_link_ops __read_mostly; 77static struct rtnl_link_ops sit_link_ops __read_mostly;
78 78
79static int sit_net_id __read_mostly; 79static unsigned int sit_net_id __read_mostly;
80struct sit_net { 80struct sit_net {
81 struct ip_tunnel __rcu *tunnels_r_l[IP6_SIT_HASH_SIZE]; 81 struct ip_tunnel __rcu *tunnels_r_l[IP6_SIT_HASH_SIZE];
82 struct ip_tunnel __rcu *tunnels_r[IP6_SIT_HASH_SIZE]; 82 struct ip_tunnel __rcu *tunnels_r[IP6_SIT_HASH_SIZE];
@@ -1318,23 +1318,11 @@ done:
1318 return err; 1318 return err;
1319} 1319}
1320 1320
1321static int ipip6_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1322{
1323 struct ip_tunnel *tunnel = netdev_priv(dev);
1324 int t_hlen = tunnel->hlen + sizeof(struct iphdr);
1325
1326 if (new_mtu < IPV6_MIN_MTU || new_mtu > 0xFFF8 - t_hlen)
1327 return -EINVAL;
1328 dev->mtu = new_mtu;
1329 return 0;
1330}
1331
1332static const struct net_device_ops ipip6_netdev_ops = { 1321static const struct net_device_ops ipip6_netdev_ops = {
1333 .ndo_init = ipip6_tunnel_init, 1322 .ndo_init = ipip6_tunnel_init,
1334 .ndo_uninit = ipip6_tunnel_uninit, 1323 .ndo_uninit = ipip6_tunnel_uninit,
1335 .ndo_start_xmit = sit_tunnel_xmit, 1324 .ndo_start_xmit = sit_tunnel_xmit,
1336 .ndo_do_ioctl = ipip6_tunnel_ioctl, 1325 .ndo_do_ioctl = ipip6_tunnel_ioctl,
1337 .ndo_change_mtu = ipip6_tunnel_change_mtu,
1338 .ndo_get_stats64 = ip_tunnel_get_stats64, 1326 .ndo_get_stats64 = ip_tunnel_get_stats64,
1339 .ndo_get_iflink = ip_tunnel_get_iflink, 1327 .ndo_get_iflink = ip_tunnel_get_iflink,
1340}; 1328};
@@ -1365,6 +1353,8 @@ static void ipip6_tunnel_setup(struct net_device *dev)
1365 dev->type = ARPHRD_SIT; 1353 dev->type = ARPHRD_SIT;
1366 dev->hard_header_len = LL_MAX_HEADER + t_hlen; 1354 dev->hard_header_len = LL_MAX_HEADER + t_hlen;
1367 dev->mtu = ETH_DATA_LEN - t_hlen; 1355 dev->mtu = ETH_DATA_LEN - t_hlen;
1356 dev->min_mtu = IPV6_MIN_MTU;
1357 dev->max_mtu = 0xFFF8 - t_hlen;
1368 dev->flags = IFF_NOARP; 1358 dev->flags = IFF_NOARP;
1369 netif_keep_dst(dev); 1359 netif_keep_dst(dev);
1370 dev->addr_len = 4; 1360 dev->addr_len = 4;
@@ -1390,6 +1380,7 @@ static int ipip6_tunnel_init(struct net_device *dev)
1390 err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); 1380 err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1391 if (err) { 1381 if (err) {
1392 free_percpu(dev->tstats); 1382 free_percpu(dev->tstats);
1383 dev->tstats = NULL;
1393 return err; 1384 return err;
1394 } 1385 }
1395 1386
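
The removed ipip6_tunnel_change_mtu() is not lost functionality: once a driver publishes min_mtu/max_mtu, the core performs the equivalent range check itself when the MTU is changed. Roughly the logic below; the real check lives in dev_set_mtu() in net/core/dev.c, this is only a sketch:

    #include <errno.h>

    /* stand-in for the few net_device fields involved (illustrative only) */
    struct net_device_lite {
        unsigned int mtu, min_mtu, max_mtu;
    };

    /* core-side range check replacing per-driver ndo_change_mtu() bounds;
     * a max_mtu of 0 is treated as "no upper bound"
     */
    static int change_mtu(struct net_device_lite *dev, unsigned int new_mtu)
    {
        if (new_mtu < dev->min_mtu)
            return -EINVAL;
        if (dev->max_mtu > 0 && new_mtu > dev->max_mtu)
            return -EINVAL;
        dev->mtu = new_mtu;
        return 0;
    }
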
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 59c483937aec..895ff650db43 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -16,7 +16,7 @@
16 16
17#include <linux/tcp.h> 17#include <linux/tcp.h>
18#include <linux/random.h> 18#include <linux/random.h>
19#include <linux/cryptohash.h> 19#include <linux/siphash.h>
20#include <linux/kernel.h> 20#include <linux/kernel.h>
21#include <net/ipv6.h> 21#include <net/ipv6.h>
22#include <net/tcp.h> 22#include <net/tcp.h>
@@ -24,7 +24,7 @@
24#define COOKIEBITS 24 /* Upper bits store count */ 24#define COOKIEBITS 24 /* Upper bits store count */
25#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) 25#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
26 26
27static u32 syncookie6_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; 27static siphash_key_t syncookie6_secret[2] __read_mostly;
28 28
29/* RFC 2460, Section 8.3: 29/* RFC 2460, Section 8.3:
30 * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..] 30 * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..]
@@ -41,30 +41,27 @@ static __u16 const msstab[] = {
41 9000 - 60, 41 9000 - 60,
42}; 42};
43 43
44static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv6_cookie_scratch); 44static u32 cookie_hash(const struct in6_addr *saddr,
45 45 const struct in6_addr *daddr,
46static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr,
47 __be16 sport, __be16 dport, u32 count, int c) 46 __be16 sport, __be16 dport, u32 count, int c)
48{ 47{
49 __u32 *tmp; 48 const struct {
49 struct in6_addr saddr;
50 struct in6_addr daddr;
51 u32 count;
52 __be16 sport;
53 __be16 dport;
54 } __aligned(SIPHASH_ALIGNMENT) combined = {
55 .saddr = *saddr,
56 .daddr = *daddr,
57 .count = count,
58 .sport = sport,
59 .dport = dport
60 };
50 61
51 net_get_random_once(syncookie6_secret, sizeof(syncookie6_secret)); 62 net_get_random_once(syncookie6_secret, sizeof(syncookie6_secret));
52 63 return siphash(&combined, offsetofend(typeof(combined), dport),
53 tmp = this_cpu_ptr(ipv6_cookie_scratch); 64 &syncookie6_secret[c]);
54
55 /*
56 * we have 320 bits of information to hash, copy in the remaining
57 * 192 bits required for sha_transform, from the syncookie6_secret
58 * and overwrite the digest with the secret
59 */
60 memcpy(tmp + 10, syncookie6_secret[c], 44);
61 memcpy(tmp, saddr, 16);
62 memcpy(tmp + 4, daddr, 16);
63 tmp[8] = ((__force u32)sport << 16) + (__force u32)dport;
64 tmp[9] = count;
65 sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
66
67 return tmp[17];
68} 65}
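
The rewrite above replaces the per-CPU SHA-1 scratch buffer with a single keyed SipHash call over an on-stack struct: 16 + 16 + 4 + 2 + 2 = 40 bytes are hashed, and using offsetofend(..., dport) as the length keeps any trailing struct padding out of the hash. A small illustration of that length computation; offsetofend is spelled out locally since it is a kernel helper:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* stand-in for the on-stack struct hashed above; in6_addr replaced by a
     * 16-byte array for illustration
     */
    struct cookie_input {
        uint8_t  saddr[16];
        uint8_t  daddr[16];
        uint32_t count;
        uint16_t sport;
        uint16_t dport;
    };

    #define offsetofend(T, m) (offsetof(T, m) + sizeof(((T *)0)->m))

    int main(void)
    {
        /* 16 + 16 + 4 + 2 + 2 = 40 bytes are fed to siphash; any padding the
         * compiler adds after 'dport' is excluded by using offsetofend()
         */
        printf("hashed bytes: %zu\n", offsetofend(struct cookie_input, dport));
        return 0;
    }
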
69 66
70static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr, 67static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr,
@@ -209,6 +206,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
209 treq->snt_synack.v64 = 0; 206 treq->snt_synack.v64 = 0;
210 treq->rcv_isn = ntohl(th->seq) - 1; 207 treq->rcv_isn = ntohl(th->seq) - 1;
211 treq->snt_isn = cookie; 208 treq->snt_isn = cookie;
209 treq->ts_off = 0;
212 210
213 /* 211 /*
214 * We need to lookup the dst_entry to get the correct window size. 212 * We need to lookup the dst_entry to get the correct window size.
@@ -227,6 +225,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
227 fl6.flowi6_mark = ireq->ir_mark; 225 fl6.flowi6_mark = ireq->ir_mark;
228 fl6.fl6_dport = ireq->ir_rmt_port; 226 fl6.fl6_dport = ireq->ir_rmt_port;
229 fl6.fl6_sport = inet_sk(sk)->inet_sport; 227 fl6.fl6_sport = inet_sk(sk)->inet_sport;
228 fl6.flowi6_uid = sk->sk_uid;
230 security_req_classify_flow(req, flowi6_to_flowi(&fl6)); 229 security_req_classify_flow(req, flowi6_to_flowi(&fl6));
231 230
232 dst = ip6_dst_lookup_flow(sk, &fl6, final_p); 231 dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b9f1fee9a886..49fa2e8c3fa9 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -101,12 +101,12 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
101 } 101 }
102} 102}
103 103
104static __u32 tcp_v6_init_sequence(const struct sk_buff *skb) 104static u32 tcp_v6_init_sequence(const struct sk_buff *skb, u32 *tsoff)
105{ 105{
106 return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32, 106 return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
107 ipv6_hdr(skb)->saddr.s6_addr32, 107 ipv6_hdr(skb)->saddr.s6_addr32,
108 tcp_hdr(skb)->dest, 108 tcp_hdr(skb)->dest,
109 tcp_hdr(skb)->source); 109 tcp_hdr(skb)->source, tsoff);
110} 110}
111 111
112static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, 112static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
@@ -122,7 +122,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
122 struct flowi6 fl6; 122 struct flowi6 fl6;
123 struct dst_entry *dst; 123 struct dst_entry *dst;
124 int addr_type; 124 int addr_type;
125 u32 seq;
125 int err; 126 int err;
127 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
126 128
127 if (addr_len < SIN6_LEN_RFC2133) 129 if (addr_len < SIN6_LEN_RFC2133)
128 return -EINVAL; 130 return -EINVAL;
@@ -148,8 +150,13 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
148 * connect() to INADDR_ANY means loopback (BSD'ism). 150 * connect() to INADDR_ANY means loopback (BSD'ism).
149 */ 151 */
150 152
151 if (ipv6_addr_any(&usin->sin6_addr)) 153 if (ipv6_addr_any(&usin->sin6_addr)) {
152 usin->sin6_addr.s6_addr[15] = 0x1; 154 if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
155 ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
156 &usin->sin6_addr);
157 else
158 usin->sin6_addr = in6addr_loopback;
159 }
153 160
154 addr_type = ipv6_addr_type(&usin->sin6_addr); 161 addr_type = ipv6_addr_type(&usin->sin6_addr);
155 162
@@ -188,7 +195,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
188 * TCP over IPv4 195 * TCP over IPv4
189 */ 196 */
190 197
191 if (addr_type == IPV6_ADDR_MAPPED) { 198 if (addr_type & IPV6_ADDR_MAPPED) {
192 u32 exthdrlen = icsk->icsk_ext_hdr_len; 199 u32 exthdrlen = icsk->icsk_ext_hdr_len;
193 struct sockaddr_in sin; 200 struct sockaddr_in sin;
194 201
@@ -233,6 +240,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
233 fl6.flowi6_mark = sk->sk_mark; 240 fl6.flowi6_mark = sk->sk_mark;
234 fl6.fl6_dport = usin->sin6_port; 241 fl6.fl6_dport = usin->sin6_port;
235 fl6.fl6_sport = inet->inet_sport; 242 fl6.fl6_sport = inet->inet_sport;
243 fl6.flowi6_uid = sk->sk_uid;
236 244
237 opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); 245 opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
238 final_p = fl6_update_dst(&fl6, opt, &final); 246 final_p = fl6_update_dst(&fl6, opt, &final);
@@ -257,7 +265,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
257 sk->sk_gso_type = SKB_GSO_TCPV6; 265 sk->sk_gso_type = SKB_GSO_TCPV6;
258 ip6_dst_store(sk, dst, NULL, NULL); 266 ip6_dst_store(sk, dst, NULL, NULL);
259 267
260 if (tcp_death_row.sysctl_tw_recycle && 268 if (tcp_death_row->sysctl_tw_recycle &&
261 !tp->rx_opt.ts_recent_stamp && 269 !tp->rx_opt.ts_recent_stamp &&
262 ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr)) 270 ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr))
263 tcp_fetch_timewait_stamp(sk, dst); 271 tcp_fetch_timewait_stamp(sk, dst);
@@ -272,17 +280,26 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
272 inet->inet_dport = usin->sin6_port; 280 inet->inet_dport = usin->sin6_port;
273 281
274 tcp_set_state(sk, TCP_SYN_SENT); 282 tcp_set_state(sk, TCP_SYN_SENT);
275 err = inet6_hash_connect(&tcp_death_row, sk); 283 err = inet6_hash_connect(tcp_death_row, sk);
276 if (err) 284 if (err)
277 goto late_failure; 285 goto late_failure;
278 286
279 sk_set_txhash(sk); 287 sk_set_txhash(sk);
280 288
281 if (!tp->write_seq && likely(!tp->repair)) 289 if (likely(!tp->repair)) {
282 tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32, 290 seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
283 sk->sk_v6_daddr.s6_addr32, 291 sk->sk_v6_daddr.s6_addr32,
284 inet->inet_sport, 292 inet->inet_sport,
285 inet->inet_dport); 293 inet->inet_dport,
294 &tp->tsoffset);
295 if (!tp->write_seq)
296 tp->write_seq = seq;
297 }
298
299 if (tcp_fastopen_defer_connect(sk, &err))
300 return err;
301 if (err)
302 goto late_failure;
286 303
287 err = tcp_connect(sk); 304 err = tcp_connect(sk);
288 if (err) 305 if (err)
@@ -292,7 +309,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
292 309
293late_failure: 310late_failure:
294 tcp_set_state(sk, TCP_CLOSE); 311 tcp_set_state(sk, TCP_CLOSE);
295 __sk_dst_reset(sk);
296failure: 312failure:
297 inet->inet_dport = 0; 313 inet->inet_dport = 0;
298 sk->sk_route_caps = 0; 314 sk->sk_route_caps = 0;
@@ -375,10 +391,12 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
375 np = inet6_sk(sk); 391 np = inet6_sk(sk);
376 392
377 if (type == NDISC_REDIRECT) { 393 if (type == NDISC_REDIRECT) {
378 struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); 394 if (!sock_owned_by_user(sk)) {
395 struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
379 396
380 if (dst) 397 if (dst)
381 dst->ops->redirect(dst, sk, skb); 398 dst->ops->redirect(dst, sk, skb);
399 }
382 goto out; 400 goto out;
383 } 401 }
384 402
@@ -397,7 +415,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
397 if (!sock_owned_by_user(sk)) 415 if (!sock_owned_by_user(sk))
398 tcp_v6_mtu_reduced(sk); 416 tcp_v6_mtu_reduced(sk);
399 else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, 417 else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
400 &tp->tsq_flags)) 418 &sk->sk_tsq_flags))
401 sock_hold(sk); 419 sock_hold(sk);
402 goto out; 420 goto out;
403 } 421 }
@@ -467,7 +485,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
467 opt = ireq->ipv6_opt; 485 opt = ireq->ipv6_opt;
468 if (!opt) 486 if (!opt)
469 opt = rcu_dereference(np->opt); 487 opt = rcu_dereference(np->opt);
470 err = ip6_xmit(sk, skb, fl6, opt, np->tclass); 488 err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass);
471 rcu_read_unlock(); 489 rcu_read_unlock();
472 err = net_xmit_eval(err); 490 err = net_xmit_eval(err);
473 } 491 }
@@ -828,6 +846,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
828 fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); 846 fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark);
829 fl6.fl6_dport = t1->dest; 847 fl6.fl6_dport = t1->dest;
830 fl6.fl6_sport = t1->source; 848 fl6.fl6_sport = t1->source;
849 fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
831 security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); 850 security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
832 851
833 /* Pass a socket to ip6_dst_lookup either it is for RST 852 /* Pass a socket to ip6_dst_lookup either it is for RST
@@ -837,7 +856,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
837 dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); 856 dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
838 if (!IS_ERR(dst)) { 857 if (!IS_ERR(dst)) {
839 skb_dst_set(buff, dst); 858 skb_dst_set(buff, dst);
840 ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass); 859 ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass);
841 TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 860 TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
842 if (rst) 861 if (rst)
843 TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 862 TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
@@ -954,7 +973,8 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
954 tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, 973 tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
955 tcp_rsk(req)->rcv_nxt, 974 tcp_rsk(req)->rcv_nxt,
956 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 975 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
957 tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if, 976 tcp_time_stamp + tcp_rsk(req)->ts_off,
977 req->ts_recent, sk->sk_bound_dev_if,
958 tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), 978 tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
959 0, 0); 979 0, 0);
960} 980}
@@ -987,6 +1007,16 @@ drop:
987 return 0; /* don't send reset */ 1007 return 0; /* don't send reset */
988} 1008}
989 1009
1010static void tcp_v6_restore_cb(struct sk_buff *skb)
1011{
1012 /* We need to move header back to the beginning if xfrm6_policy_check()
1013 * and tcp_v6_fill_cb() are going to be called again.
1014 * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there.
1015 */
1016 memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6,
1017 sizeof(struct inet6_skb_parm));
1018}
1019
990static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1020static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
991 struct request_sock *req, 1021 struct request_sock *req,
992 struct dst_entry *dst, 1022 struct dst_entry *dst,
@@ -1138,10 +1168,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
1138 tcp_ca_openreq_child(newsk, dst); 1168 tcp_ca_openreq_child(newsk, dst);
1139 1169
1140 tcp_sync_mss(newsk, dst_mtu(dst)); 1170 tcp_sync_mss(newsk, dst_mtu(dst));
1141 newtp->advmss = dst_metric_advmss(dst); 1171 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1142 if (tcp_sk(sk)->rx_opt.user_mss &&
1143 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1144 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1145 1172
1146 tcp_initialize_rcv_mss(newsk); 1173 tcp_initialize_rcv_mss(newsk);
1147 1174
@@ -1178,8 +1205,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
1178 sk_gfp_mask(sk, GFP_ATOMIC)); 1205 sk_gfp_mask(sk, GFP_ATOMIC));
1179 consume_skb(ireq->pktopts); 1206 consume_skb(ireq->pktopts);
1180 ireq->pktopts = NULL; 1207 ireq->pktopts = NULL;
1181 if (newnp->pktoptions) 1208 if (newnp->pktoptions) {
1209 tcp_v6_restore_cb(newnp->pktoptions);
1182 skb_set_owner_r(newnp->pktoptions, newsk); 1210 skb_set_owner_r(newnp->pktoptions, newsk);
1211 }
1183 } 1212 }
1184 } 1213 }
1185 1214
@@ -1194,16 +1223,6 @@ out:
1194 return NULL; 1223 return NULL;
1195} 1224}
1196 1225
1197static void tcp_v6_restore_cb(struct sk_buff *skb)
1198{
1199 /* We need to move header back to the beginning if xfrm6_policy_check()
1200 * and tcp_v6_fill_cb() are going to be called again.
1201 * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there.
1202 */
1203 memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6,
1204 sizeof(struct inet6_skb_parm));
1205}
1206
1207/* The socket must have its spinlock held when we get 1226
1208 * here, unless it is a TCP_LISTEN socket. 1227 * here, unless it is a TCP_LISTEN socket.
1209 * 1228 *
@@ -1616,7 +1635,6 @@ static const struct inet_connection_sock_af_ops ipv6_specific = {
1616 .getsockopt = ipv6_getsockopt, 1635 .getsockopt = ipv6_getsockopt,
1617 .addr2sockaddr = inet6_csk_addr2sockaddr, 1636 .addr2sockaddr = inet6_csk_addr2sockaddr,
1618 .sockaddr_len = sizeof(struct sockaddr_in6), 1637 .sockaddr_len = sizeof(struct sockaddr_in6),
1619 .bind_conflict = inet6_csk_bind_conflict,
1620#ifdef CONFIG_COMPAT 1638#ifdef CONFIG_COMPAT
1621 .compat_setsockopt = compat_ipv6_setsockopt, 1639 .compat_setsockopt = compat_ipv6_setsockopt,
1622 .compat_getsockopt = compat_ipv6_getsockopt, 1640 .compat_getsockopt = compat_ipv6_getsockopt,
@@ -1647,7 +1665,6 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = {
1647 .getsockopt = ipv6_getsockopt, 1665 .getsockopt = ipv6_getsockopt,
1648 .addr2sockaddr = inet6_csk_addr2sockaddr, 1666 .addr2sockaddr = inet6_csk_addr2sockaddr,
1649 .sockaddr_len = sizeof(struct sockaddr_in6), 1667 .sockaddr_len = sizeof(struct sockaddr_in6),
1650 .bind_conflict = inet6_csk_bind_conflict,
1651#ifdef CONFIG_COMPAT 1668#ifdef CONFIG_COMPAT
1652 .compat_setsockopt = compat_ipv6_setsockopt, 1669 .compat_setsockopt = compat_ipv6_setsockopt,
1653 .compat_getsockopt = compat_ipv6_getsockopt, 1670 .compat_getsockopt = compat_ipv6_getsockopt,
@@ -1740,7 +1757,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
1740 srcp = ntohs(inet->inet_sport); 1757 srcp = ntohs(inet->inet_sport);
1741 1758
1742 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 1759 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
1743 icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 1760 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
1744 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 1761 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
1745 timer_active = 1; 1762 timer_active = 1;
1746 timer_expires = icsk->icsk_timeout; 1763 timer_expires = icsk->icsk_timeout;
@@ -1884,6 +1901,7 @@ struct proto tcpv6_prot = {
1884 .shutdown = tcp_shutdown, 1901 .shutdown = tcp_shutdown,
1885 .setsockopt = tcp_setsockopt, 1902 .setsockopt = tcp_setsockopt,
1886 .getsockopt = tcp_getsockopt, 1903 .getsockopt = tcp_getsockopt,
1904 .keepalive = tcp_set_keepalive,
1887 .recvmsg = tcp_recvmsg, 1905 .recvmsg = tcp_recvmsg,
1888 .sendmsg = tcp_sendmsg, 1906 .sendmsg = tcp_sendmsg,
1889 .sendpage = tcp_sendpage, 1907 .sendpage = tcp_sendpage,
@@ -1944,7 +1962,7 @@ static void __net_exit tcpv6_net_exit(struct net *net)
1944 1962
1945static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list) 1963static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list)
1946{ 1964{
1947 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET6); 1965 inet_twsk_purge(&tcp_hashinfo, AF_INET6);
1948} 1966}
1949 1967
1950static struct pernet_operations tcpv6_net_ops = { 1968static struct pernet_operations tcpv6_net_ops = {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e4a8000d59ad..e28082f0a307 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -35,7 +35,7 @@
35#include <linux/module.h> 35#include <linux/module.h>
36#include <linux/skbuff.h> 36#include <linux/skbuff.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <asm/uaccess.h> 38#include <linux/uaccess.h>
39 39
40#include <net/addrconf.h> 40#include <net/addrconf.h>
41#include <net/ndisc.h> 41#include <net/ndisc.h>
@@ -55,6 +55,16 @@
55#include <trace/events/skb.h> 55#include <trace/events/skb.h>
56#include "udp_impl.h" 56#include "udp_impl.h"
57 57
58static bool udp6_lib_exact_dif_match(struct net *net, struct sk_buff *skb)
59{
60#if defined(CONFIG_NET_L3_MASTER_DEV)
61 if (!net->ipv4.sysctl_udp_l3mdev_accept &&
62 skb && ipv6_l3mdev_skb(IP6CB(skb)->flags))
63 return true;
64#endif
65 return false;
66}
67
58static u32 udp6_ehashfn(const struct net *net, 68static u32 udp6_ehashfn(const struct net *net,
59 const struct in6_addr *laddr, 69 const struct in6_addr *laddr,
60 const u16 lport, 70 const u16 lport,
@@ -103,7 +113,7 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
103 113
104 /* precompute partial secondary hash */ 114 /* precompute partial secondary hash */
105 udp_sk(sk)->udp_portaddr_hash = hash2_partial; 115 udp_sk(sk)->udp_portaddr_hash = hash2_partial;
106 return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, hash2_nulladdr); 116 return udp_lib_get_port(sk, snum, hash2_nulladdr);
107} 117}
108 118
109static void udp_v6_rehash(struct sock *sk) 119static void udp_v6_rehash(struct sock *sk)
@@ -118,7 +128,7 @@ static void udp_v6_rehash(struct sock *sk)
118static int compute_score(struct sock *sk, struct net *net, 128static int compute_score(struct sock *sk, struct net *net,
119 const struct in6_addr *saddr, __be16 sport, 129 const struct in6_addr *saddr, __be16 sport,
120 const struct in6_addr *daddr, unsigned short hnum, 130 const struct in6_addr *daddr, unsigned short hnum,
121 int dif) 131 int dif, bool exact_dif)
122{ 132{
123 int score; 133 int score;
124 struct inet_sock *inet; 134 struct inet_sock *inet;
@@ -149,7 +159,7 @@ static int compute_score(struct sock *sk, struct net *net,
149 score++; 159 score++;
150 } 160 }
151 161
152 if (sk->sk_bound_dev_if) { 162 if (sk->sk_bound_dev_if || exact_dif) {
153 if (sk->sk_bound_dev_if != dif) 163 if (sk->sk_bound_dev_if != dif)
154 return -1; 164 return -1;
155 score++; 165 score++;
@@ -165,7 +175,7 @@ static int compute_score(struct sock *sk, struct net *net,
165static struct sock *udp6_lib_lookup2(struct net *net, 175static struct sock *udp6_lib_lookup2(struct net *net,
166 const struct in6_addr *saddr, __be16 sport, 176 const struct in6_addr *saddr, __be16 sport,
167 const struct in6_addr *daddr, unsigned int hnum, int dif, 177 const struct in6_addr *daddr, unsigned int hnum, int dif,
168 struct udp_hslot *hslot2, 178 bool exact_dif, struct udp_hslot *hslot2,
169 struct sk_buff *skb) 179 struct sk_buff *skb)
170{ 180{
171 struct sock *sk, *result; 181 struct sock *sk, *result;
@@ -176,7 +186,7 @@ static struct sock *udp6_lib_lookup2(struct net *net,
176 badness = -1; 186 badness = -1;
177 udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { 187 udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
178 score = compute_score(sk, net, saddr, sport, 188 score = compute_score(sk, net, saddr, sport,
179 daddr, hnum, dif); 189 daddr, hnum, dif, exact_dif);
180 if (score > badness) { 190 if (score > badness) {
181 reuseport = sk->sk_reuseport; 191 reuseport = sk->sk_reuseport;
182 if (reuseport) { 192 if (reuseport) {
@@ -212,6 +222,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
212 unsigned short hnum = ntohs(dport); 222 unsigned short hnum = ntohs(dport);
213 unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); 223 unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
214 struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; 224 struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
225 bool exact_dif = udp6_lib_exact_dif_match(net, skb);
215 int score, badness, matches = 0, reuseport = 0; 226 int score, badness, matches = 0, reuseport = 0;
216 u32 hash = 0; 227 u32 hash = 0;
217 228
@@ -223,7 +234,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
223 goto begin; 234 goto begin;
224 235
225 result = udp6_lib_lookup2(net, saddr, sport, 236 result = udp6_lib_lookup2(net, saddr, sport,
226 daddr, hnum, dif, 237 daddr, hnum, dif, exact_dif,
227 hslot2, skb); 238 hslot2, skb);
228 if (!result) { 239 if (!result) {
229 unsigned int old_slot2 = slot2; 240 unsigned int old_slot2 = slot2;
@@ -239,7 +250,8 @@ struct sock *__udp6_lib_lookup(struct net *net,
239 250
240 result = udp6_lib_lookup2(net, saddr, sport, 251 result = udp6_lib_lookup2(net, saddr, sport,
241 daddr, hnum, dif, 252 daddr, hnum, dif,
242 hslot2, skb); 253 exact_dif, hslot2,
254 skb);
243 } 255 }
244 return result; 256 return result;
245 } 257 }
@@ -247,7 +259,8 @@ begin:
247 result = NULL; 259 result = NULL;
248 badness = -1; 260 badness = -1;
249 sk_for_each_rcu(sk, &hslot->head) { 261 sk_for_each_rcu(sk, &hslot->head) {
250 score = compute_score(sk, net, saddr, sport, daddr, hnum, dif); 262 score = compute_score(sk, net, saddr, sport, daddr, hnum, dif,
263 exact_dif);
251 if (score > badness) { 264 if (score > badness) {
252 reuseport = sk->sk_reuseport; 265 reuseport = sk->sk_reuseport;
253 if (reuseport) { 266 if (reuseport) {
@@ -302,7 +315,8 @@ EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb);
302 * Does increment socket refcount. 315 * Does increment socket refcount.
303 */ 316 */
304#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \ 317#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
305 IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) 318 IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) || \
319 IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
306struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, 320struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
307 const struct in6_addr *daddr, __be16 dport, int dif) 321 const struct in6_addr *daddr, __be16 dport, int dif)
308{ 322{
@@ -334,7 +348,6 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
334 int is_udplite = IS_UDPLITE(sk); 348 int is_udplite = IS_UDPLITE(sk);
335 bool checksum_valid = false; 349 bool checksum_valid = false;
336 int is_udp4; 350 int is_udp4;
337 bool slow;
338 351
339 if (flags & MSG_ERRQUEUE) 352 if (flags & MSG_ERRQUEUE)
340 return ipv6_recv_error(sk, msg, len, addr_len); 353 return ipv6_recv_error(sk, msg, len, addr_len);
@@ -344,8 +357,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
344 357
345try_again: 358try_again:
346 peeking = off = sk_peek_offset(sk, flags); 359 peeking = off = sk_peek_offset(sk, flags);
347 skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), 360 skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
348 &peeked, &off, &err);
349 if (!skb) 361 if (!skb)
350 return err; 362 return err;
351 363
@@ -364,7 +376,8 @@ try_again:
364 * coverage checksum (UDP-Lite), do it before the copy. 376 * coverage checksum (UDP-Lite), do it before the copy.
365 */ 377 */
366 378
367 if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) { 379 if (copied < ulen || peeking ||
380 (is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
368 checksum_valid = !udp_lib_checksum_complete(skb); 381 checksum_valid = !udp_lib_checksum_complete(skb);
369 if (!checksum_valid) 382 if (!checksum_valid)
370 goto csum_copy_err; 383 goto csum_copy_err;
@@ -378,7 +391,6 @@ try_again:
378 goto csum_copy_err; 391 goto csum_copy_err;
379 } 392 }
380 if (unlikely(err)) { 393 if (unlikely(err)) {
381 trace_kfree_skb(skb, udpv6_recvmsg);
382 if (!peeked) { 394 if (!peeked) {
383 atomic_inc(&sk->sk_drops); 395 atomic_inc(&sk->sk_drops);
384 if (is_udp4) 396 if (is_udp4)
@@ -388,7 +400,7 @@ try_again:
388 UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, 400 UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
389 is_udplite); 401 is_udplite);
390 } 402 }
391 skb_free_datagram_locked(sk, skb); 403 kfree_skb(skb);
392 return err; 404 return err;
393 } 405 }
394 if (!peeked) { 406 if (!peeked) {
@@ -427,7 +439,7 @@ try_again:
427 439
428 if (is_udp4) { 440 if (is_udp4) {
429 if (inet->cmsg_flags) 441 if (inet->cmsg_flags)
430 ip_cmsg_recv_offset(msg, skb, 442 ip_cmsg_recv_offset(msg, sk, skb,
431 sizeof(struct udphdr), off); 443 sizeof(struct udphdr), off);
432 } else { 444 } else {
433 if (np->rxopt.all) 445 if (np->rxopt.all)
@@ -438,12 +450,11 @@ try_again:
438 if (flags & MSG_TRUNC) 450 if (flags & MSG_TRUNC)
439 err = ulen; 451 err = ulen;
440 452
441 __skb_free_datagram_locked(sk, skb, peeking ? -err : err); 453 skb_consume_udp(sk, skb, peeking ? -err : err);
442 return err; 454 return err;
443 455
444csum_copy_err: 456csum_copy_err:
445 slow = lock_sock_fast(sk); 457 if (!__sk_queue_drop_skb(sk, skb, flags, udp_skb_destructor)) {
446 if (!skb_kill_datagram(sk, skb, flags)) {
447 if (is_udp4) { 458 if (is_udp4) {
448 UDP_INC_STATS(sock_net(sk), 459 UDP_INC_STATS(sock_net(sk),
449 UDP_MIB_CSUMERRORS, is_udplite); 460 UDP_MIB_CSUMERRORS, is_udplite);
@@ -456,7 +467,7 @@ csum_copy_err:
456 UDP_MIB_INERRORS, is_udplite); 467 UDP_MIB_INERRORS, is_udplite);
457 } 468 }
458 } 469 }
459 unlock_sock_fast(sk, slow); 470 kfree_skb(skb);
460 471
461 /* starting over for a new packet, but check if we need to yield */ 472 /* starting over for a new packet, but check if we need to yield */
462 cond_resched(); 473 cond_resched();
@@ -522,9 +533,11 @@ int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
522 sock_rps_save_rxhash(sk, skb); 533 sock_rps_save_rxhash(sk, skb);
523 sk_mark_napi_id(sk, skb); 534 sk_mark_napi_id(sk, skb);
524 sk_incoming_cpu_update(sk); 535 sk_incoming_cpu_update(sk);
536 } else {
537 sk_mark_napi_id_once(sk, skb);
525 } 538 }
526 539
527 rc = __sock_queue_rcv_skb(sk, skb); 540 rc = __udp_enqueue_schedule_skb(sk, skb);
528 if (rc < 0) { 541 if (rc < 0) {
529 int is_udplite = IS_UDPLITE(sk); 542 int is_udplite = IS_UDPLITE(sk);
530 543
@@ -536,6 +549,7 @@ int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
536 kfree_skb(skb); 549 kfree_skb(skb);
537 return -1; 550 return -1;
538 } 551 }
552
539 return 0; 553 return 0;
540} 554}
541 555
@@ -557,7 +571,6 @@ EXPORT_SYMBOL(udpv6_encap_enable);
557int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 571int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
558{ 572{
559 struct udp_sock *up = udp_sk(sk); 573 struct udp_sock *up = udp_sk(sk);
560 int rc;
561 int is_udplite = IS_UDPLITE(sk); 574 int is_udplite = IS_UDPLITE(sk);
562 575
563 if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) 576 if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
@@ -623,25 +636,10 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
623 goto drop; 636 goto drop;
624 637
625 udp_csum_pull_header(skb); 638 udp_csum_pull_header(skb);
626 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
627 __UDP6_INC_STATS(sock_net(sk),
628 UDP_MIB_RCVBUFERRORS, is_udplite);
629 goto drop;
630 }
631 639
632 skb_dst_drop(skb); 640 skb_dst_drop(skb);
633 641
634 bh_lock_sock(sk); 642 return __udpv6_queue_rcv_skb(sk, skb);
635 rc = 0;
636 if (!sock_owned_by_user(sk))
637 rc = __udpv6_queue_rcv_skb(sk, skb);
638 else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
639 bh_unlock_sock(sk);
640 goto drop;
641 }
642 bh_unlock_sock(sk);
643
644 return rc;
645 643
646csum_error: 644csum_error:
647 __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); 645 __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
@@ -1037,6 +1035,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
1037 ipc6.hlimit = -1; 1035 ipc6.hlimit = -1;
1038 ipc6.tclass = -1; 1036 ipc6.tclass = -1;
1039 ipc6.dontfrag = -1; 1037 ipc6.dontfrag = -1;
1038 sockc.tsflags = sk->sk_tsflags;
1040 1039
1041 /* destination address check */ 1040 /* destination address check */
1042 if (sin6) { 1041 if (sin6) {
@@ -1048,6 +1047,10 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
1048 if (addr_len < SIN6_LEN_RFC2133) 1047 if (addr_len < SIN6_LEN_RFC2133)
1049 return -EINVAL; 1048 return -EINVAL;
1050 daddr = &sin6->sin6_addr; 1049 daddr = &sin6->sin6_addr;
1050 if (ipv6_addr_any(daddr) &&
1051 ipv6_addr_v4mapped(&np->saddr))
1052 ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
1053 daddr);
1051 break; 1054 break;
1052 case AF_INET: 1055 case AF_INET:
1053 goto do_udp_sendmsg; 1056 goto do_udp_sendmsg;
@@ -1156,7 +1159,7 @@ do_udp_sendmsg:
1156 fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; 1159 fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
1157 1160
1158 fl6.flowi6_mark = sk->sk_mark; 1161 fl6.flowi6_mark = sk->sk_mark;
1159 sockc.tsflags = sk->sk_tsflags; 1162 fl6.flowi6_uid = sk->sk_uid;
1160 1163
1161 if (msg->msg_controllen) { 1164 if (msg->msg_controllen) {
1162 opt = &opt_space; 1165 opt = &opt_space;
@@ -1309,7 +1312,8 @@ out:
1309 return err; 1312 return err;
1310 1313
1311do_confirm: 1314do_confirm:
1312 dst_confirm(dst); 1315 if (msg->msg_flags & MSG_PROBE)
1316 dst_confirm_neigh(dst, &fl6.daddr);
1313 if (!(msg->msg_flags&MSG_PROBE) || len) 1317 if (!(msg->msg_flags&MSG_PROBE) || len)
1314 goto back_from_confirm; 1318 goto back_from_confirm;
1315 err = 0; 1319 err = 0;
@@ -1434,12 +1438,12 @@ struct proto udpv6_prot = {
1434 .connect = ip6_datagram_connect, 1438 .connect = ip6_datagram_connect,
1435 .disconnect = udp_disconnect, 1439 .disconnect = udp_disconnect,
1436 .ioctl = udp_ioctl, 1440 .ioctl = udp_ioctl,
1441 .init = udp_init_sock,
1437 .destroy = udpv6_destroy_sock, 1442 .destroy = udpv6_destroy_sock,
1438 .setsockopt = udpv6_setsockopt, 1443 .setsockopt = udpv6_setsockopt,
1439 .getsockopt = udpv6_getsockopt, 1444 .getsockopt = udpv6_getsockopt,
1440 .sendmsg = udpv6_sendmsg, 1445 .sendmsg = udpv6_sendmsg,
1441 .recvmsg = udpv6_recvmsg, 1446 .recvmsg = udpv6_recvmsg,
1442 .backlog_rcv = __udpv6_queue_rcv_skb,
1443 .release_cb = ip6_datagram_release_cb, 1447 .release_cb = ip6_datagram_release_cb,
1444 .hash = udp_lib_hash, 1448 .hash = udp_lib_hash,
1445 .unhash = udp_lib_unhash, 1449 .unhash = udp_lib_unhash,
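Note: the exact_dif plumbing added above tightens the device check in compute_score(): when a packet arrives through an L3 master device and sysctl_udp_l3mdev_accept is off, an unbound socket must not match. A standalone sketch of that rule, with hypothetical names, to make the cases explicit:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical helper mirroring the new compute_score() device rule:
 * -1 means "no match", 0 "match without a device bonus",
 * 1 "match with a device bonus".
 */
static int score_device_match(int bound_dev_if, int dif, bool exact_dif)
{
	if (bound_dev_if || exact_dif) {
		if (bound_dev_if != dif)
			return -1;	/* wrong (or missing) binding: reject */
		return 1;		/* bound to the arrival device */
	}
	return 0;			/* unbound socket, no VRF constraint */
}

int main(void)
{
	printf("%d\n", score_device_match(0, 4, true));	/* -1: strict l3mdev, unbound */
	printf("%d\n", score_device_match(4, 4, false));	/*  1: bound to arrival device */
	printf("%d\n", score_device_match(0, 4, false));	/*  0: unbound, no constraint */
	return 0;
}
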
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 2f5101a12283..2784cc363f2b 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -45,10 +45,11 @@ struct proto udplitev6_prot = {
45 .getsockopt = udpv6_getsockopt, 45 .getsockopt = udpv6_getsockopt,
46 .sendmsg = udpv6_sendmsg, 46 .sendmsg = udpv6_sendmsg,
47 .recvmsg = udpv6_recvmsg, 47 .recvmsg = udpv6_recvmsg,
48 .backlog_rcv = __udpv6_queue_rcv_skb,
49 .hash = udp_lib_hash, 48 .hash = udp_lib_hash,
50 .unhash = udp_lib_unhash, 49 .unhash = udp_lib_unhash,
51 .get_port = udp_v6_get_port, 50 .get_port = udp_v6_get_port,
51 .memory_allocated = &udp_memory_allocated,
52 .sysctl_mem = sysctl_udp_mem,
52 .obj_size = sizeof(struct udp6_sock), 53 .obj_size = sizeof(struct udp6_sock),
53 .h.udp_table = &udplite_table, 54 .h.udp_table = &udplite_table,
54#ifdef CONFIG_COMPAT 55#ifdef CONFIG_COMPAT
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index b5789562aded..08a807b29298 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -33,6 +33,8 @@ EXPORT_SYMBOL(xfrm6_rcv_spi);
33 33
34int xfrm6_transport_finish(struct sk_buff *skb, int async) 34int xfrm6_transport_finish(struct sk_buff *skb, int async)
35{ 35{
36 struct xfrm_offload *xo = xfrm_offload(skb);
37
36 skb_network_header(skb)[IP6CB(skb)->nhoff] = 38 skb_network_header(skb)[IP6CB(skb)->nhoff] =
37 XFRM_MODE_SKB_CB(skb)->protocol; 39 XFRM_MODE_SKB_CB(skb)->protocol;
38 40
@@ -44,6 +46,11 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
44 ipv6_hdr(skb)->payload_len = htons(skb->len); 46 ipv6_hdr(skb)->payload_len = htons(skb->len);
45 __skb_push(skb, skb->data - skb_network_header(skb)); 47 __skb_push(skb, skb->data - skb_network_header(skb));
46 48
49 if (xo && (xo->flags & XFRM_GRO)) {
50 skb_mac_header_rebuild(skb);
51 return -1;
52 }
53
47 NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, 54 NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
48 dev_net(skb->dev), NULL, skb, skb->dev, NULL, 55 dev_net(skb->dev), NULL, skb, skb->dev, NULL,
49 ip6_rcv_finish); 56 ip6_rcv_finish);
@@ -69,18 +76,9 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
69 struct xfrm_state *x = NULL; 76 struct xfrm_state *x = NULL;
70 int i = 0; 77 int i = 0;
71 78
72 /* Allocate new secpath or COW existing one. */ 79 if (secpath_set(skb)) {
73 if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { 80 XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
74 struct sec_path *sp; 81 goto drop;
75
76 sp = secpath_dup(skb->sp);
77 if (!sp) {
78 XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
79 goto drop;
80 }
81 if (skb->sp)
82 secpath_put(skb->sp);
83 skb->sp = sp;
84 } 82 }
85 83
86 if (1 + skb->sp->len == XFRM_MAX_DEPTH) { 84 if (1 + skb->sp->len == XFRM_MAX_DEPTH) {
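Note: secpath_set() replaces the open-coded allocate-or-COW sequence deleted in the hunk above. A sketch of what the consolidated helper has to do, assuming the same semantics as the removed code (reuse a privately owned secpath, otherwise duplicate and swap it in):

/* Sketch only: mirrors the logic removed from xfrm6_input_addr() above.
 * Returns 0 when skb->sp is usable and privately owned, -ENOMEM otherwise.
 */
static int secpath_set_sketch(struct sk_buff *skb)
{
	struct sec_path *sp;

	/* Nothing to do if we already own the only reference. */
	if (skb->sp && atomic_read(&skb->sp->refcnt) == 1)
		return 0;

	/* Copy-on-write: duplicate, drop the shared one, install the copy. */
	sp = secpath_dup(skb->sp);
	if (!sp)
		return -ENOMEM;
	if (skb->sp)
		secpath_put(skb->sp);
	skb->sp = sp;
	return 0;
}
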
diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c
index 4e344105b3fd..4439ee44c8b0 100644
--- a/net/ipv6/xfrm6_mode_transport.c
+++ b/net/ipv6/xfrm6_mode_transport.c
@@ -47,6 +47,7 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb)
47static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) 47static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
48{ 48{
49 int ihl = skb->data - skb_transport_header(skb); 49 int ihl = skb->data - skb_transport_header(skb);
50 struct xfrm_offload *xo = xfrm_offload(skb);
50 51
51 if (skb->transport_header != skb->network_header) { 52 if (skb->transport_header != skb->network_header) {
52 memmove(skb_transport_header(skb), 53 memmove(skb_transport_header(skb),
@@ -55,7 +56,8 @@ static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
55 } 56 }
56 ipv6_hdr(skb)->payload_len = htons(skb->len + ihl - 57 ipv6_hdr(skb)->payload_len = htons(skb->len + ihl -
57 sizeof(struct ipv6hdr)); 58 sizeof(struct ipv6hdr));
58 skb_reset_transport_header(skb); 59 if (!xo || !(xo->flags & XFRM_GRO))
60 skb_reset_transport_header(skb);
59 return 0; 61 return 0;
60} 62}
61 63
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index e0f71c01d728..79651bc71bf0 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -25,8 +25,6 @@
25#include <net/mip6.h> 25#include <net/mip6.h>
26#endif 26#endif
27 27
28static struct xfrm_policy_afinfo xfrm6_policy_afinfo;
29
30static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, 28static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,
31 const xfrm_address_t *saddr, 29 const xfrm_address_t *saddr,
32 const xfrm_address_t *daddr) 30 const xfrm_address_t *daddr)
@@ -220,7 +218,7 @@ static inline int xfrm6_garbage_collect(struct dst_ops *ops)
220{ 218{
221 struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops); 219 struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops);
222 220
223 xfrm6_policy_afinfo.garbage_collect(net); 221 xfrm_garbage_collect_deferred(net);
224 return dst_entries_get_fast(ops) > ops->gc_thresh * 2; 222 return dst_entries_get_fast(ops) > ops->gc_thresh * 2;
225} 223}
226 224
@@ -291,8 +289,7 @@ static struct dst_ops xfrm6_dst_ops_template = {
291 .gc_thresh = INT_MAX, 289 .gc_thresh = INT_MAX,
292}; 290};
293 291
294static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { 292static const struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
295 .family = AF_INET6,
296 .dst_ops = &xfrm6_dst_ops_template, 293 .dst_ops = &xfrm6_dst_ops_template,
297 .dst_lookup = xfrm6_dst_lookup, 294 .dst_lookup = xfrm6_dst_lookup,
298 .get_saddr = xfrm6_get_saddr, 295 .get_saddr = xfrm6_get_saddr,
@@ -305,7 +302,7 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
305 302
306static int __init xfrm6_policy_init(void) 303static int __init xfrm6_policy_init(void)
307{ 304{
308 return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo); 305 return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo, AF_INET6);
309} 306}
310 307
311static void xfrm6_policy_fini(void) 308static void xfrm6_policy_fini(void)
diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c
index 54d13f8dbbae..b2dc8ce49378 100644
--- a/net/ipv6/xfrm6_protocol.c
+++ b/net/ipv6/xfrm6_protocol.c
@@ -162,9 +162,8 @@ static const struct inet6_protocol ipcomp6_protocol = {
162 .flags = INET6_PROTO_NOPOLICY, 162 .flags = INET6_PROTO_NOPOLICY,
163}; 163};
164 164
165static struct xfrm_input_afinfo xfrm6_input_afinfo = { 165static const struct xfrm_input_afinfo xfrm6_input_afinfo = {
166 .family = AF_INET6, 166 .family = AF_INET6,
167 .owner = THIS_MODULE,
168 .callback = xfrm6_rcv_cb, 167 .callback = xfrm6_rcv_cb,
169}; 168};
170 169
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index e1c0bbe7996c..d7b731a78d09 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -44,7 +44,7 @@ struct xfrm6_tunnel_net {
44 u32 spi; 44 u32 spi;
45}; 45};
46 46
47static int xfrm6_tunnel_net_id __read_mostly; 47static unsigned int xfrm6_tunnel_net_id __read_mostly;
48static inline struct xfrm6_tunnel_net *xfrm6_tunnel_pernet(struct net *net) 48static inline struct xfrm6_tunnel_net *xfrm6_tunnel_pernet(struct net *net)
49{ 49{
50 return net_generic(net, xfrm6_tunnel_net_id); 50 return net_generic(net, xfrm6_tunnel_net_id);
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 48d0dc89b58d..8a9219ff2e77 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -56,7 +56,7 @@
56#include <net/tcp_states.h> 56#include <net/tcp_states.h>
57#include <net/net_namespace.h> 57#include <net/net_namespace.h>
58 58
59#include <asm/uaccess.h> 59#include <linux/uaccess.h>
60 60
61/* Configuration Variables */ 61/* Configuration Variables */
62static unsigned char ipxcfg_max_hops = 16; 62static unsigned char ipxcfg_max_hops = 16;
@@ -1809,7 +1809,7 @@ static int ipx_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
1809 rc = skb_copy_datagram_msg(skb, sizeof(struct ipxhdr), msg, copied); 1809 rc = skb_copy_datagram_msg(skb, sizeof(struct ipxhdr), msg, copied);
1810 if (rc) 1810 if (rc)
1811 goto out_free; 1811 goto out_free;
1812 if (skb->tstamp.tv64) 1812 if (skb->tstamp)
1813 sk->sk_stamp = skb->tstamp; 1813 sk->sk_stamp = skb->tstamp;
1814 1814
1815 if (sipx) { 1815 if (sipx) {
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 391c3cbd2eed..8d77ad5cadaf 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -46,13 +46,14 @@
46#include <linux/socket.h> 46#include <linux/socket.h>
47#include <linux/sockios.h> 47#include <linux/sockios.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <linux/sched/signal.h>
49#include <linux/init.h> 50#include <linux/init.h>
50#include <linux/net.h> 51#include <linux/net.h>
51#include <linux/irda.h> 52#include <linux/irda.h>
52#include <linux/poll.h> 53#include <linux/poll.h>
53 54
54#include <asm/ioctls.h> /* TIOCOUTQ, TIOCINQ */ 55#include <asm/ioctls.h> /* TIOCOUTQ, TIOCINQ */
55#include <asm/uaccess.h> 56#include <linux/uaccess.h>
56 57
57#include <net/sock.h> 58#include <net/sock.h>
58#include <net/tcp_states.h> 59#include <net/tcp_states.h>
@@ -827,7 +828,8 @@ out:
827 * Wait for incoming connection 828 * Wait for incoming connection
828 * 829 *
829 */ 830 */
830static int irda_accept(struct socket *sock, struct socket *newsock, int flags) 831static int irda_accept(struct socket *sock, struct socket *newsock, int flags,
832 bool kern)
831{ 833{
832 struct sock *sk = sock->sk; 834 struct sock *sk = sock->sk;
833 struct irda_sock *new, *self = irda_sk(sk); 835 struct irda_sock *new, *self = irda_sk(sk);
@@ -835,7 +837,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
835 struct sk_buff *skb = NULL; 837 struct sk_buff *skb = NULL;
836 int err; 838 int err;
837 839
838 err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0); 840 err = irda_create(sock_net(sk), newsock, sk->sk_protocol, kern);
839 if (err) 841 if (err)
840 return err; 842 return err;
841 843
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index 873c4b707d6a..f6061c4bb0a8 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -32,7 +32,7 @@
32#include <linux/module.h> 32#include <linux/module.h>
33#include <linux/fs.h> 33#include <linux/fs.h>
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <linux/sched.h> 35#include <linux/sched/signal.h>
36#include <linux/seq_file.h> 36#include <linux/seq_file.h>
37#include <linux/termios.h> 37#include <linux/termios.h>
38#include <linux/tty.h> 38#include <linux/tty.h>
@@ -40,7 +40,7 @@
40#include <linux/interrupt.h> 40#include <linux/interrupt.h>
41#include <linux/device.h> /* for MODULE_ALIAS_CHARDEV_MAJOR */ 41#include <linux/device.h> /* for MODULE_ALIAS_CHARDEV_MAJOR */
42 42
43#include <asm/uaccess.h> 43#include <linux/uaccess.h>
44 44
45#include <net/irda/irda.h> 45#include <net/irda/irda.h>
46#include <net/irda/irmod.h> 46#include <net/irda/irmod.h>
diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c
index 8f5678cb6263..f18070118d05 100644
--- a/net/irda/ircomm/ircomm_tty_ioctl.c
+++ b/net/irda/ircomm/ircomm_tty_ioctl.c
@@ -32,7 +32,7 @@
32#include <linux/tty.h> 32#include <linux/tty.h>
33#include <linux/serial.h> 33#include <linux/serial.h>
34 34
35#include <asm/uaccess.h> 35#include <linux/uaccess.h>
36 36
37#include <net/irda/irda.h> 37#include <net/irda/irda.h>
38#include <net/irda/irmod.h> 38#include <net/irda/irmod.h>
diff --git a/net/irda/irda_device.c b/net/irda/irda_device.c
index 856736656a30..890b90d055d5 100644
--- a/net/irda/irda_device.c
+++ b/net/irda/irda_device.c
@@ -43,7 +43,7 @@
43#include <linux/export.h> 43#include <linux/export.h>
44 44
45#include <asm/ioctls.h> 45#include <asm/ioctls.h>
46#include <asm/uaccess.h> 46#include <linux/uaccess.h>
47#include <asm/dma.h> 47#include <asm/dma.h>
48#include <asm/io.h> 48#include <asm/io.h>
49 49
diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c
index d8b7267280c3..74d09f91709e 100644
--- a/net/irda/irlan/irlan_eth.c
+++ b/net/irda/irlan/irlan_eth.c
@@ -51,7 +51,6 @@ static const struct net_device_ops irlan_eth_netdev_ops = {
51 .ndo_stop = irlan_eth_close, 51 .ndo_stop = irlan_eth_close,
52 .ndo_start_xmit = irlan_eth_xmit, 52 .ndo_start_xmit = irlan_eth_xmit,
53 .ndo_set_rx_mode = irlan_eth_set_multicast_list, 53 .ndo_set_rx_mode = irlan_eth_set_multicast_list,
54 .ndo_change_mtu = eth_change_mtu,
55 .ndo_validate_addr = eth_validate_addr, 54 .ndo_validate_addr = eth_validate_addr,
56}; 55};
57 56
@@ -67,7 +66,8 @@ static void irlan_eth_setup(struct net_device *dev)
67 66
68 dev->netdev_ops = &irlan_eth_netdev_ops; 67 dev->netdev_ops = &irlan_eth_netdev_ops;
69 dev->destructor = free_netdev; 68 dev->destructor = free_netdev;
70 69 dev->min_mtu = 0;
70 dev->max_mtu = ETH_MAX_MTU;
71 71
72 /* 72 /*
73 * Lets do all queueing in IrTTP instead of this device driver. 73 * Lets do all queueing in IrTTP instead of this device driver.
diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h
index 8d65bb9477fc..9d451f8ed47a 100644
--- a/net/irda/irnet/irnet.h
+++ b/net/irda/irnet/irnet.h
@@ -245,12 +245,11 @@
245#include <linux/tty.h> 245#include <linux/tty.h>
246#include <linux/proc_fs.h> 246#include <linux/proc_fs.h>
247#include <linux/netdevice.h> 247#include <linux/netdevice.h>
248#include <linux/miscdevice.h>
249#include <linux/poll.h> 248#include <linux/poll.h>
250#include <linux/capability.h> 249#include <linux/capability.h>
251#include <linux/ctype.h> /* isspace() */ 250#include <linux/ctype.h> /* isspace() */
252#include <linux/string.h> /* skip_spaces() */ 251#include <linux/string.h> /* skip_spaces() */
253#include <asm/uaccess.h> 252#include <linux/uaccess.h>
254#include <linux/init.h> 253#include <linux/init.h>
255 254
256#include <linux/ppp_defs.h> 255#include <linux/ppp_defs.h>
diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c
index 1215693fdd22..7025dcb853d0 100644
--- a/net/irda/irnet/irnet_ppp.c
+++ b/net/irda/irnet/irnet_ppp.c
@@ -13,8 +13,9 @@
13 * 2) as a control channel (write commands, read events) 13 * 2) as a control channel (write commands, read events)
14 */ 14 */
15 15
16#include <linux/sched.h> 16#include <linux/sched/signal.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18
18#include "irnet_ppp.h" /* Private header */ 19#include "irnet_ppp.h" /* Private header */
19/* Please put other headers in irnet.h - Thanks */ 20/* Please put other headers in irnet.h - Thanks */
20 21
@@ -51,7 +52,7 @@ irnet_ctrl_write(irnet_socket * ap,
51 char * next; /* Next command to process */ 52 char * next; /* Next command to process */
52 int length; /* Length of current command */ 53 int length; /* Length of current command */
53 54
54 DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count); 55 DENTER(CTRL_TRACE, "(ap=0x%p, count=%zd)\n", ap, count);
55 56
56 /* Check for overflow... */ 57 /* Check for overflow... */
57 DABORT(count >= IRNET_MAX_COMMAND, -ENOMEM, 58 DABORT(count >= IRNET_MAX_COMMAND, -ENOMEM,
@@ -66,7 +67,7 @@ irnet_ctrl_write(irnet_socket * ap,
66 67
67 /* Safe terminate the string */ 68 /* Safe terminate the string */
68 command[count] = '\0'; 69 command[count] = '\0';
69 DEBUG(CTRL_INFO, "Command line received is ``%s'' (%Zd).\n", 70 DEBUG(CTRL_INFO, "Command line received is ``%s'' (%zd).\n",
70 command, count); 71 command, count);
71 72
72 /* Check every commands in the command line */ 73 /* Check every commands in the command line */
@@ -285,7 +286,7 @@ irnet_ctrl_read(irnet_socket * ap,
285 char event[75]; 286 char event[75];
286 ssize_t ret = 0; 287 ssize_t ret = 0;
287 288
288 DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count); 289 DENTER(CTRL_TRACE, "(ap=0x%p, count=%zd)\n", ap, count);
289 290
290#ifdef INITIAL_DISCOVERY 291#ifdef INITIAL_DISCOVERY
291 /* Check if we have read the log */ 292 /* Check if we have read the log */
@@ -328,7 +329,7 @@ irnet_ctrl_read(irnet_socket * ap,
328 if(ret != 0) 329 if(ret != 0)
329 { 330 {
330 /* No, return the error code */ 331 /* No, return the error code */
331 DEXIT(CTRL_TRACE, " - ret %Zd\n", ret); 332 DEXIT(CTRL_TRACE, " - ret %zd\n", ret);
332 return ret; 333 return ret;
333 } 334 }
334 335
@@ -568,7 +569,7 @@ dev_irnet_write(struct file * file,
568{ 569{
569 irnet_socket * ap = file->private_data; 570 irnet_socket * ap = file->private_data;
570 571
571 DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n", 572 DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%zd)\n",
572 file, ap, count); 573 file, ap, count);
573 DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n"); 574 DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n");
574 575
@@ -592,7 +593,7 @@ dev_irnet_read(struct file * file,
592{ 593{
593 irnet_socket * ap = file->private_data; 594 irnet_socket * ap = file->private_data;
594 595
595 DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n", 596 DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%zd)\n",
596 file, ap, count); 597 file, ap, count);
597 DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n"); 598 DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n");
598 599
diff --git a/net/irda/irnet/irnet_ppp.h b/net/irda/irnet/irnet_ppp.h
index 940225866da0..32061442cc8e 100644
--- a/net/irda/irnet/irnet_ppp.h
+++ b/net/irda/irnet/irnet_ppp.h
@@ -15,13 +15,10 @@
15/***************************** INCLUDES *****************************/ 15/***************************** INCLUDES *****************************/
16 16
17#include "irnet.h" /* Module global include */ 17#include "irnet.h" /* Module global include */
18#include <linux/miscdevice.h>
18 19
19/************************ CONSTANTS & MACROS ************************/ 20/************************ CONSTANTS & MACROS ************************/
20 21
21/* /dev/irnet file constants */
22#define IRNET_MAJOR 10 /* Misc range */
23#define IRNET_MINOR 187 /* Official allocation */
24
25/* IrNET control channel stuff */ 22/* IrNET control channel stuff */
26#define IRNET_MAX_COMMAND 256 /* Max length of a command line */ 23#define IRNET_MAX_COMMAND 256 /* Max length of a command line */
27 24
@@ -111,9 +108,9 @@ static const struct file_operations irnet_device_fops =
111/* Structure so that the misc major (drivers/char/misc.c) take care of us... */ 108/* Structure so that the misc major (drivers/char/misc.c) take care of us... */
112static struct miscdevice irnet_misc_device = 109static struct miscdevice irnet_misc_device =
113{ 110{
114 IRNET_MINOR, 111 .minor = IRNET_MINOR,
115 "irnet", 112 .name = "irnet",
116 &irnet_device_fops 113 .fops = &irnet_device_fops
117}; 114};
118 115
119#endif /* IRNET_PPP_H */ 116#endif /* IRNET_PPP_H */
diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c
index e15c40e86660..7fc340e574cf 100644
--- a/net/irda/irnetlink.c
+++ b/net/irda/irnetlink.c
@@ -24,13 +24,7 @@
24 24
25 25
26 26
27static struct genl_family irda_nl_family = { 27static struct genl_family irda_nl_family;
28 .id = GENL_ID_GENERATE,
29 .name = IRDA_NL_NAME,
30 .hdrsize = 0,
31 .version = IRDA_NL_VERSION,
32 .maxattr = IRDA_NL_CMD_MAX,
33};
34 28
35static struct net_device * ifname_to_netdev(struct net *net, struct genl_info *info) 29static struct net_device * ifname_to_netdev(struct net *net, struct genl_info *info)
36{ 30{
@@ -147,9 +141,19 @@ static const struct genl_ops irda_nl_ops[] = {
147 141
148}; 142};
149 143
150int irda_nl_register(void) 144static struct genl_family irda_nl_family __ro_after_init = {
145 .name = IRDA_NL_NAME,
146 .hdrsize = 0,
147 .version = IRDA_NL_VERSION,
148 .maxattr = IRDA_NL_CMD_MAX,
149 .module = THIS_MODULE,
150 .ops = irda_nl_ops,
151 .n_ops = ARRAY_SIZE(irda_nl_ops),
152};
153
154int __init irda_nl_register(void)
151{ 155{
152 return genl_register_family_with_ops(&irda_nl_family, irda_nl_ops); 156 return genl_register_family(&irda_nl_family);
153} 157}
154 158
155void irda_nl_unregister(void) 159void irda_nl_unregister(void)
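Note: the conversion above moves irda from genl_register_family_with_ops() and GENL_ID_GENERATE to a fully initialised, __ro_after_init family object registered in one call. The same shape for a hypothetical family (names invented for illustration, ops elided):

static const struct genl_ops demo_nl_ops[] = {
	/* { .cmd = ..., .doit = ..., .policy = ... }, */
};

/* The family carries its ops directly; registration no longer takes a
 * separate ops array and the family id is always allocated dynamically.
 */
static struct genl_family demo_nl_family __ro_after_init = {
	.name		= "DEMO_NL",
	.hdrsize	= 0,
	.version	= 1,
	.maxattr	= 0,
	.module		= THIS_MODULE,
	.ops		= demo_nl_ops,
	.n_ops		= ARRAY_SIZE(demo_nl_ops),
};

static int __init demo_nl_register(void)
{
	return genl_register_family(&demo_nl_family);
}
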
diff --git a/net/irda/irproc.c b/net/irda/irproc.c
index b9ac598e2116..77cfdde9d82f 100644
--- a/net/irda/irproc.c
+++ b/net/irda/irproc.c
@@ -23,7 +23,6 @@
23 * 23 *
24 ********************************************************************/ 24 ********************************************************************/
25 25
26#include <linux/miscdevice.h>
27#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
28#include <linux/seq_file.h> 27#include <linux/seq_file.h>
29#include <linux/module.h> 28#include <linux/module.h>
diff --git a/net/irda/irqueue.c b/net/irda/irqueue.c
index acbe61c7e683..160dc89335e2 100644
--- a/net/irda/irqueue.c
+++ b/net/irda/irqueue.c
@@ -383,9 +383,6 @@ EXPORT_SYMBOL(hashbin_new);
383 * for deallocating this structure if it's complex. If not the user can 383 * for deallocating this structure if it's complex. If not the user can
384 * just supply kfree, which should take care of the job. 384 * just supply kfree, which should take care of the job.
385 */ 385 */
386#ifdef CONFIG_LOCKDEP
387static int hashbin_lock_depth = 0;
388#endif
389int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func) 386int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func)
390{ 387{
391 irda_queue_t* queue; 388 irda_queue_t* queue;
@@ -396,22 +393,27 @@ int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func)
396 IRDA_ASSERT(hashbin->magic == HB_MAGIC, return -1;); 393 IRDA_ASSERT(hashbin->magic == HB_MAGIC, return -1;);
397 394
398 /* Synchronize */ 395 /* Synchronize */
399 if ( hashbin->hb_type & HB_LOCK ) { 396 if (hashbin->hb_type & HB_LOCK)
400 spin_lock_irqsave_nested(&hashbin->hb_spinlock, flags, 397 spin_lock_irqsave(&hashbin->hb_spinlock, flags);
401 hashbin_lock_depth++);
402 }
403 398
404 /* 399 /*
405 * Free the entries in the hashbin, TODO: use hashbin_clear when 400 * Free the entries in the hashbin, TODO: use hashbin_clear when
406 * it has been shown to work 401 * it has been shown to work
407 */ 402 */
408 for (i = 0; i < HASHBIN_SIZE; i ++ ) { 403 for (i = 0; i < HASHBIN_SIZE; i ++ ) {
409 queue = dequeue_first((irda_queue_t**) &hashbin->hb_queue[i]); 404 while (1) {
410 while (queue ) { 405 queue = dequeue_first((irda_queue_t**) &hashbin->hb_queue[i]);
411 if (free_func) 406
412 (*free_func)(queue); 407 if (!queue)
413 queue = dequeue_first( 408 break;
414 (irda_queue_t**) &hashbin->hb_queue[i]); 409
410 if (free_func) {
411 if (hashbin->hb_type & HB_LOCK)
412 spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
413 free_func(queue);
414 if (hashbin->hb_type & HB_LOCK)
415 spin_lock_irqsave(&hashbin->hb_spinlock, flags);
416 }
415 } 417 }
416 } 418 }
417 419
@@ -420,12 +422,8 @@ int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func)
420 hashbin->magic = ~HB_MAGIC; 422 hashbin->magic = ~HB_MAGIC;
421 423
422 /* Release lock */ 424 /* Release lock */
423 if ( hashbin->hb_type & HB_LOCK) { 425 if (hashbin->hb_type & HB_LOCK)
424 spin_unlock_irqrestore(&hashbin->hb_spinlock, flags); 426 spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
425#ifdef CONFIG_LOCKDEP
426 hashbin_lock_depth--;
427#endif
428 }
429 427
430 /* 428 /*
431 * Free the hashbin structure 429 * Free the hashbin structure
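Note: the hashbin_delete() rework above drops the lockdep-nested spin_lock in favour of a simpler rule: never call the caller-supplied free_func with the bucket spinlock held. A condensed sketch of the reworked loop, assuming an HB_LOCK bin and a free_func that may take its own locks:

/* Sketch only: entries are dequeued under the lock, but the callback
 * runs with the lock released and it is retaken before the next pass.
 */
for (i = 0; i < HASHBIN_SIZE; i++) {
	while ((queue = dequeue_first((irda_queue_t **)&hashbin->hb_queue[i]))) {
		if (!free_func)
			continue;	/* nothing to free, just drain */
		spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
		free_func(queue);	/* foreign code: no lock held */
		spin_lock_irqsave(&hashbin->hb_spinlock, flags);
	}
}
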
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 02b45a8e8b35..84de7b6326dc 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -17,7 +17,7 @@
17#include <linux/list.h> 17#include <linux/list.h>
18#include <linux/errno.h> 18#include <linux/errno.h>
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/sched.h> 20#include <linux/sched/signal.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/skbuff.h> 22#include <linux/skbuff.h>
23#include <linux/init.h> 23#include <linux/init.h>
@@ -453,19 +453,27 @@ static void iucv_sever_path(struct sock *sk, int with_user_data)
453 } 453 }
454} 454}
455 455
456/* Send FIN through an IUCV socket for HIPER transport */ 456/* Send controlling flags through an IUCV socket for HIPER transport */
457static int iucv_send_ctrl(struct sock *sk, u8 flags) 457static int iucv_send_ctrl(struct sock *sk, u8 flags)
458{ 458{
459 int err = 0; 459 int err = 0;
460 int blen; 460 int blen;
461 struct sk_buff *skb; 461 struct sk_buff *skb;
462 u8 shutdown = 0;
462 463
463 blen = sizeof(struct af_iucv_trans_hdr) + ETH_HLEN; 464 blen = sizeof(struct af_iucv_trans_hdr) + ETH_HLEN;
465 if (sk->sk_shutdown & SEND_SHUTDOWN) {
466 /* controlling flags should be sent anyway */
467 shutdown = sk->sk_shutdown;
468 sk->sk_shutdown &= RCV_SHUTDOWN;
469 }
464 skb = sock_alloc_send_skb(sk, blen, 1, &err); 470 skb = sock_alloc_send_skb(sk, blen, 1, &err);
465 if (skb) { 471 if (skb) {
466 skb_reserve(skb, blen); 472 skb_reserve(skb, blen);
467 err = afiucv_hs_send(NULL, sk, skb, flags); 473 err = afiucv_hs_send(NULL, sk, skb, flags);
468 } 474 }
475 if (shutdown)
476 sk->sk_shutdown = shutdown;
469 return err; 477 return err;
470} 478}
471 479
@@ -930,7 +938,7 @@ done:
930 938
931/* Accept a pending connection */ 939/* Accept a pending connection */
932static int iucv_sock_accept(struct socket *sock, struct socket *newsock, 940static int iucv_sock_accept(struct socket *sock, struct socket *newsock,
933 int flags) 941 int flags, bool kern)
934{ 942{
935 DECLARE_WAITQUEUE(wait, current); 943 DECLARE_WAITQUEUE(wait, current);
936 struct sock *sk = sock->sk, *nsk; 944 struct sock *sk = sock->sk, *nsk;
@@ -1036,7 +1044,8 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
1036{ 1044{
1037 struct sock *sk = sock->sk; 1045 struct sock *sk = sock->sk;
1038 struct iucv_sock *iucv = iucv_sk(sk); 1046 struct iucv_sock *iucv = iucv_sk(sk);
1039 size_t headroom, linear; 1047 size_t headroom = 0;
1048 size_t linear;
1040 struct sk_buff *skb; 1049 struct sk_buff *skb;
1041 struct iucv_message txmsg = {0}; 1050 struct iucv_message txmsg = {0};
1042 struct cmsghdr *cmsg; 1051 struct cmsghdr *cmsg;
@@ -1114,18 +1123,20 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
1114 * this is fine for SOCK_SEQPACKET (unless we want to support 1123 * this is fine for SOCK_SEQPACKET (unless we want to support
1115 * segmented records using the MSG_EOR flag), but 1124 * segmented records using the MSG_EOR flag), but
1116 * for SOCK_STREAM we might want to improve it in future */ 1125 * for SOCK_STREAM we might want to improve it in future */
1117 headroom = (iucv->transport == AF_IUCV_TRANS_HIPER) 1126 if (iucv->transport == AF_IUCV_TRANS_HIPER) {
1118 ? sizeof(struct af_iucv_trans_hdr) + ETH_HLEN : 0; 1127 headroom = sizeof(struct af_iucv_trans_hdr) + ETH_HLEN;
1119 if (headroom + len < PAGE_SIZE) {
1120 linear = len; 1128 linear = len;
1121 } else { 1129 } else {
1122 /* In nonlinear "classic" iucv skb, 1130 if (len < PAGE_SIZE) {
1123 * reserve space for iucv_array 1131 linear = len;
1124 */ 1132 } else {
1125 if (iucv->transport != AF_IUCV_TRANS_HIPER) 1133 /* In nonlinear "classic" iucv skb,
1126 headroom += sizeof(struct iucv_array) * 1134 * reserve space for iucv_array
1127 (MAX_SKB_FRAGS + 1); 1135 */
1128 linear = PAGE_SIZE - headroom; 1136 headroom = sizeof(struct iucv_array) *
1137 (MAX_SKB_FRAGS + 1);
1138 linear = PAGE_SIZE - headroom;
1139 }
1129 } 1140 }
1130 skb = sock_alloc_send_pskb(sk, headroom + linear, len - linear, 1141 skb = sock_alloc_send_pskb(sk, headroom + linear, len - linear,
1131 noblock, &err, 0); 1142 noblock, &err, 0);
@@ -1315,8 +1326,13 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb,
1315 } 1326 }
1316 1327
1317 IUCV_SKB_CB(skb)->offset = 0; 1328 IUCV_SKB_CB(skb)->offset = 0;
1318 if (sock_queue_rcv_skb(sk, skb)) 1329 if (sk_filter(sk, skb)) {
1319 skb_queue_head(&iucv_sk(sk)->backlog_skb_q, skb); 1330 atomic_inc(&sk->sk_drops); /* skb rejected by filter */
1331 kfree_skb(skb);
1332 return;
1333 }
1334 if (__sock_queue_rcv_skb(sk, skb)) /* handle rcv queue full */
1335 skb_queue_tail(&iucv_sk(sk)->backlog_skb_q, skb);
1320} 1336}
1321 1337
1322/* iucv_process_message_q() - Process outstanding IUCV messages 1338/* iucv_process_message_q() - Process outstanding IUCV messages
@@ -1430,13 +1446,13 @@ static int iucv_sock_recvmsg(struct socket *sock, struct msghdr *msg,
1430 rskb = skb_dequeue(&iucv->backlog_skb_q); 1446 rskb = skb_dequeue(&iucv->backlog_skb_q);
1431 while (rskb) { 1447 while (rskb) {
1432 IUCV_SKB_CB(rskb)->offset = 0; 1448 IUCV_SKB_CB(rskb)->offset = 0;
1433 if (sock_queue_rcv_skb(sk, rskb)) { 1449 if (__sock_queue_rcv_skb(sk, rskb)) {
1450 /* handle rcv queue full */
1434 skb_queue_head(&iucv->backlog_skb_q, 1451 skb_queue_head(&iucv->backlog_skb_q,
1435 rskb); 1452 rskb);
1436 break; 1453 break;
1437 } else {
1438 rskb = skb_dequeue(&iucv->backlog_skb_q);
1439 } 1454 }
1455 rskb = skb_dequeue(&iucv->backlog_skb_q);
1440 } 1456 }
1441 if (skb_queue_empty(&iucv->backlog_skb_q)) { 1457 if (skb_queue_empty(&iucv->backlog_skb_q)) {
1442 if (!list_empty(&iucv->message_q.list)) 1458 if (!list_empty(&iucv->message_q.list))
@@ -2116,12 +2132,17 @@ static int afiucv_hs_callback_rx(struct sock *sk, struct sk_buff *skb)
2116 skb_reset_transport_header(skb); 2132 skb_reset_transport_header(skb);
2117 skb_reset_network_header(skb); 2133 skb_reset_network_header(skb);
2118 IUCV_SKB_CB(skb)->offset = 0; 2134 IUCV_SKB_CB(skb)->offset = 0;
2135 if (sk_filter(sk, skb)) {
2136 atomic_inc(&sk->sk_drops); /* skb rejected by filter */
2137 kfree_skb(skb);
2138 return NET_RX_SUCCESS;
2139 }
2140
2119 spin_lock(&iucv->message_q.lock); 2141 spin_lock(&iucv->message_q.lock);
2120 if (skb_queue_empty(&iucv->backlog_skb_q)) { 2142 if (skb_queue_empty(&iucv->backlog_skb_q)) {
2121 if (sock_queue_rcv_skb(sk, skb)) { 2143 if (__sock_queue_rcv_skb(sk, skb))
2122 /* handle rcv queue full */ 2144 /* handle rcv queue full */
2123 skb_queue_tail(&iucv->backlog_skb_q, skb); 2145 skb_queue_tail(&iucv->backlog_skb_q, skb);
2124 }
2125 } else 2146 } else
2126 skb_queue_tail(&iucv_sk(sk)->backlog_skb_q, skb); 2147 skb_queue_tail(&iucv_sk(sk)->backlog_skb_q, skb);
2127 spin_unlock(&iucv->message_q.lock); 2148 spin_unlock(&iucv->message_q.lock);
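Note: the af_iucv receive hunks above replace sock_queue_rcv_skb() with an explicit sk_filter() check followed by __sock_queue_rcv_skb(), so a filter rejection is counted and dropped once, while a full receive queue still parks the skb on the protocol backlog. In sketch form, with a generic backlog queue standing in for iucv's:

static void demo_queue_rx(struct sock *sk, struct sk_buff *skb,
			  struct sk_buff_head *backlog)
{
	if (sk_filter(sk, skb)) {
		/* Rejected by the attached socket filter: count and drop. */
		atomic_inc(&sk->sk_drops);
		kfree_skb(skb);
		return;
	}

	/* The filter already ran, so use the __ variant; if the receive
	 * queue is full, keep the skb on the backlog instead of losing it.
	 */
	if (__sock_queue_rcv_skb(sk, skb))
		skb_queue_tail(backlog, skb);
}
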
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index 88a2a3ba4212..8f7ef167c45a 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -639,7 +639,7 @@ static void iucv_disable(void)
639 put_online_cpus(); 639 put_online_cpus();
640} 640}
641 641
642static void free_iucv_data(int cpu) 642static int iucv_cpu_dead(unsigned int cpu)
643{ 643{
644 kfree(iucv_param_irq[cpu]); 644 kfree(iucv_param_irq[cpu]);
645 iucv_param_irq[cpu] = NULL; 645 iucv_param_irq[cpu] = NULL;
@@ -647,9 +647,10 @@ static void free_iucv_data(int cpu)
647 iucv_param[cpu] = NULL; 647 iucv_param[cpu] = NULL;
648 kfree(iucv_irq_data[cpu]); 648 kfree(iucv_irq_data[cpu]);
649 iucv_irq_data[cpu] = NULL; 649 iucv_irq_data[cpu] = NULL;
650 return 0;
650} 651}
651 652
652static int alloc_iucv_data(int cpu) 653static int iucv_cpu_prepare(unsigned int cpu)
653{ 654{
654 /* Note: GFP_DMA used to get memory below 2G */ 655 /* Note: GFP_DMA used to get memory below 2G */
655 iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data), 656 iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data),
@@ -671,58 +672,38 @@ static int alloc_iucv_data(int cpu)
671 return 0; 672 return 0;
672 673
673out_free: 674out_free:
674 free_iucv_data(cpu); 675 iucv_cpu_dead(cpu);
675 return -ENOMEM; 676 return -ENOMEM;
676} 677}
677 678
678static int iucv_cpu_notify(struct notifier_block *self, 679static int iucv_cpu_online(unsigned int cpu)
679 unsigned long action, void *hcpu)
680{ 680{
681 cpumask_t cpumask; 681 if (!iucv_path_table)
682 long cpu = (long) hcpu; 682 return 0;
683 683 iucv_declare_cpu(NULL);
684 switch (action) { 684 return 0;
685 case CPU_UP_PREPARE:
686 case CPU_UP_PREPARE_FROZEN:
687 if (alloc_iucv_data(cpu))
688 return notifier_from_errno(-ENOMEM);
689 break;
690 case CPU_UP_CANCELED:
691 case CPU_UP_CANCELED_FROZEN:
692 case CPU_DEAD:
693 case CPU_DEAD_FROZEN:
694 free_iucv_data(cpu);
695 break;
696 case CPU_ONLINE:
697 case CPU_ONLINE_FROZEN:
698 case CPU_DOWN_FAILED:
699 case CPU_DOWN_FAILED_FROZEN:
700 if (!iucv_path_table)
701 break;
702 smp_call_function_single(cpu, iucv_declare_cpu, NULL, 1);
703 break;
704 case CPU_DOWN_PREPARE:
705 case CPU_DOWN_PREPARE_FROZEN:
706 if (!iucv_path_table)
707 break;
708 cpumask_copy(&cpumask, &iucv_buffer_cpumask);
709 cpumask_clear_cpu(cpu, &cpumask);
710 if (cpumask_empty(&cpumask))
711 /* Can't offline last IUCV enabled cpu. */
712 return notifier_from_errno(-EINVAL);
713 smp_call_function_single(cpu, iucv_retrieve_cpu, NULL, 1);
714 if (cpumask_empty(&iucv_irq_cpumask))
715 smp_call_function_single(
716 cpumask_first(&iucv_buffer_cpumask),
717 iucv_allow_cpu, NULL, 1);
718 break;
719 }
720 return NOTIFY_OK;
721} 685}
722 686
723static struct notifier_block __refdata iucv_cpu_notifier = { 687static int iucv_cpu_down_prep(unsigned int cpu)
724 .notifier_call = iucv_cpu_notify, 688{
725}; 689 cpumask_t cpumask;
690
691 if (!iucv_path_table)
692 return 0;
693
694 cpumask_copy(&cpumask, &iucv_buffer_cpumask);
695 cpumask_clear_cpu(cpu, &cpumask);
696 if (cpumask_empty(&cpumask))
697 /* Can't offline last IUCV enabled cpu. */
698 return -EINVAL;
699
700 iucv_retrieve_cpu(NULL);
701 if (!cpumask_empty(&iucv_irq_cpumask))
702 return 0;
703 smp_call_function_single(cpumask_first(&iucv_buffer_cpumask),
704 iucv_allow_cpu, NULL, 1);
705 return 0;
706}
726 707
727/** 708/**
728 * iucv_sever_pathid 709 * iucv_sever_pathid
@@ -2027,6 +2008,7 @@ struct iucv_interface iucv_if = {
2027}; 2008};
2028EXPORT_SYMBOL(iucv_if); 2009EXPORT_SYMBOL(iucv_if);
2029 2010
2011static enum cpuhp_state iucv_online;
2030/** 2012/**
2031 * iucv_init 2013 * iucv_init
2032 * 2014 *
@@ -2035,7 +2017,6 @@ EXPORT_SYMBOL(iucv_if);
2035static int __init iucv_init(void) 2017static int __init iucv_init(void)
2036{ 2018{
2037 int rc; 2019 int rc;
2038 int cpu;
2039 2020
2040 if (!MACHINE_IS_VM) { 2021 if (!MACHINE_IS_VM) {
2041 rc = -EPROTONOSUPPORT; 2022 rc = -EPROTONOSUPPORT;
@@ -2054,23 +2035,19 @@ static int __init iucv_init(void)
2054 goto out_int; 2035 goto out_int;
2055 } 2036 }
2056 2037
2057 cpu_notifier_register_begin(); 2038 rc = cpuhp_setup_state(CPUHP_NET_IUCV_PREPARE, "net/iucv:prepare",
2058 2039 iucv_cpu_prepare, iucv_cpu_dead);
2059 for_each_online_cpu(cpu) {
2060 if (alloc_iucv_data(cpu)) {
2061 rc = -ENOMEM;
2062 goto out_free;
2063 }
2064 }
2065 rc = __register_hotcpu_notifier(&iucv_cpu_notifier);
2066 if (rc) 2040 if (rc)
2067 goto out_free; 2041 goto out_dev;
2068 2042 rc = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "net/iucv:online",
2069 cpu_notifier_register_done(); 2043 iucv_cpu_online, iucv_cpu_down_prep);
2044 if (rc < 0)
2045 goto out_prep;
2046 iucv_online = rc;
2070 2047
2071 rc = register_reboot_notifier(&iucv_reboot_notifier); 2048 rc = register_reboot_notifier(&iucv_reboot_notifier);
2072 if (rc) 2049 if (rc)
2073 goto out_cpu; 2050 goto out_remove_hp;
2074 ASCEBC(iucv_error_no_listener, 16); 2051 ASCEBC(iucv_error_no_listener, 16);
2075 ASCEBC(iucv_error_no_memory, 16); 2052 ASCEBC(iucv_error_no_memory, 16);
2076 ASCEBC(iucv_error_pathid, 16); 2053 ASCEBC(iucv_error_pathid, 16);
@@ -2084,15 +2061,11 @@ static int __init iucv_init(void)
2084 2061
2085out_reboot: 2062out_reboot:
2086 unregister_reboot_notifier(&iucv_reboot_notifier); 2063 unregister_reboot_notifier(&iucv_reboot_notifier);
2087out_cpu: 2064out_remove_hp:
2088 cpu_notifier_register_begin(); 2065 cpuhp_remove_state(iucv_online);
2089 __unregister_hotcpu_notifier(&iucv_cpu_notifier); 2066out_prep:
2090out_free: 2067 cpuhp_remove_state(CPUHP_NET_IUCV_PREPARE);
2091 for_each_possible_cpu(cpu) 2068out_dev:
2092 free_iucv_data(cpu);
2093
2094 cpu_notifier_register_done();
2095
2096 root_device_unregister(iucv_root); 2069 root_device_unregister(iucv_root);
2097out_int: 2070out_int:
2098 unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt); 2071 unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt);
@@ -2110,7 +2083,6 @@ out:
2110static void __exit iucv_exit(void) 2083static void __exit iucv_exit(void)
2111{ 2084{
2112 struct iucv_irq_list *p, *n; 2085 struct iucv_irq_list *p, *n;
2113 int cpu;
2114 2086
2115 spin_lock_irq(&iucv_queue_lock); 2087 spin_lock_irq(&iucv_queue_lock);
2116 list_for_each_entry_safe(p, n, &iucv_task_queue, list) 2088 list_for_each_entry_safe(p, n, &iucv_task_queue, list)
@@ -2119,11 +2091,9 @@ static void __exit iucv_exit(void)
2119 kfree(p); 2091 kfree(p);
2120 spin_unlock_irq(&iucv_queue_lock); 2092 spin_unlock_irq(&iucv_queue_lock);
2121 unregister_reboot_notifier(&iucv_reboot_notifier); 2093 unregister_reboot_notifier(&iucv_reboot_notifier);
2122 cpu_notifier_register_begin(); 2094
2123 __unregister_hotcpu_notifier(&iucv_cpu_notifier); 2095 cpuhp_remove_state_nocalls(iucv_online);
2124 for_each_possible_cpu(cpu) 2096 cpuhp_remove_state(CPUHP_NET_IUCV_PREPARE);
2125 free_iucv_data(cpu);
2126 cpu_notifier_register_done();
2127 root_device_unregister(iucv_root); 2097 root_device_unregister(iucv_root);
2128 bus_unregister(&iucv_bus); 2098 bus_unregister(&iucv_bus);
2129 unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt); 2099 unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt);
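Note: the iucv.c conversion above is the standard cpuhp pattern: one PREPARE-level state for per-CPU allocation and free, one dynamic AP state for online and down-prepare, set up in that order and torn down in reverse. A condensed sketch with hypothetical callback names (the PREPARE slot is borrowed from the hunk purely for illustration; a real subsystem adds its own enum entry):

static int demo_cpu_prepare(unsigned int cpu);	/* allocate per-CPU buffers */
static int demo_cpu_dead(unsigned int cpu);	/* free them again */
static int demo_cpu_online(unsigned int cpu);	/* enable the CPU for the facility */
static int demo_cpu_down_prep(unsigned int cpu);/* refuse to lose the last CPU */

static enum cpuhp_state demo_online;

static int __init demo_init(void)
{
	int ret;

	ret = cpuhp_setup_state(CPUHP_NET_IUCV_PREPARE, "net/demo:prepare",
				demo_cpu_prepare, demo_cpu_dead);
	if (ret)
		return ret;

	/* Dynamic AP state: a positive return is the allocated state number. */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "net/demo:online",
				demo_cpu_online, demo_cpu_down_prep);
	if (ret < 0) {
		cpuhp_remove_state(CPUHP_NET_IUCV_PREPARE);
		return ret;
	}
	demo_online = ret;
	return 0;
}

static void __exit demo_exit(void)
{
	cpuhp_remove_state_nocalls(demo_online);
	cpuhp_remove_state(CPUHP_NET_IUCV_PREPARE);
}
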
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 7e08a4d3d77d..31762f76cdb5 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -24,6 +24,8 @@
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/workqueue.h> 25#include <linux/workqueue.h>
26#include <linux/syscalls.h> 26#include <linux/syscalls.h>
27#include <linux/sched/signal.h>
28
27#include <net/kcm.h> 29#include <net/kcm.h>
28#include <net/netns/generic.h> 30#include <net/netns/generic.h>
29#include <net/sock.h> 31#include <net/sock.h>
@@ -929,23 +931,25 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
929 goto out_error; 931 goto out_error;
930 } 932 }
931 933
932 /* New message, alloc head skb */ 934 if (msg_data_left(msg)) {
933 head = alloc_skb(0, sk->sk_allocation); 935 /* New message, alloc head skb */
934 while (!head) {
935 kcm_push(kcm);
936 err = sk_stream_wait_memory(sk, &timeo);
937 if (err)
938 goto out_error;
939
940 head = alloc_skb(0, sk->sk_allocation); 936 head = alloc_skb(0, sk->sk_allocation);
941 } 937 while (!head) {
938 kcm_push(kcm);
939 err = sk_stream_wait_memory(sk, &timeo);
940 if (err)
941 goto out_error;
942
943 head = alloc_skb(0, sk->sk_allocation);
944 }
942 945
943 skb = head; 946 skb = head;
944 947
945 /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling 948 /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
946 * csum_and_copy_from_iter from skb_do_copy_data_nocache. 949 * csum_and_copy_from_iter from skb_do_copy_data_nocache.
947 */ 950 */
948 skb->ip_summed = CHECKSUM_UNNECESSARY; 951 skb->ip_summed = CHECKSUM_UNNECESSARY;
952 }
949 953
950start: 954start:
951 while (msg_data_left(msg)) { 955 while (msg_data_left(msg)) {
@@ -1018,10 +1022,12 @@ wait_for_memory:
1018 if (eor) { 1022 if (eor) {
1019 bool not_busy = skb_queue_empty(&sk->sk_write_queue); 1023 bool not_busy = skb_queue_empty(&sk->sk_write_queue);
1020 1024
1021 /* Message complete, queue it on send buffer */ 1025 if (head) {
1022 __skb_queue_tail(&sk->sk_write_queue, head); 1026 /* Message complete, queue it on send buffer */
1023 kcm->seq_skb = NULL; 1027 __skb_queue_tail(&sk->sk_write_queue, head);
1024 KCM_STATS_INCR(kcm->stats.tx_msgs); 1028 kcm->seq_skb = NULL;
1029 KCM_STATS_INCR(kcm->stats.tx_msgs);
1030 }
1025 1031
1026 if (msg->msg_flags & MSG_BATCH) { 1032 if (msg->msg_flags & MSG_BATCH) {
1027 kcm->tx_wait_more = true; 1033 kcm->tx_wait_more = true;
@@ -1040,8 +1046,10 @@ wait_for_memory:
1040 } else { 1046 } else {
1041 /* Message not complete, save state */ 1047 /* Message not complete, save state */
1042partial_message: 1048partial_message:
1043 kcm->seq_skb = head; 1049 if (head) {
1044 kcm_tx_msg(head)->last_skb = skb; 1050 kcm->seq_skb = head;
1051 kcm_tx_msg(head)->last_skb = skb;
1052 }
1045 } 1053 }
1046 1054
1047 KCM_STATS_ADD(kcm->stats.tx_bytes, copied); 1055 KCM_STATS_ADD(kcm->stats.tx_bytes, copied);
@@ -1679,7 +1687,7 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1679 struct kcm_attach info; 1687 struct kcm_attach info;
1680 1688
1681 if (copy_from_user(&info, (void __user *)arg, sizeof(info))) 1689 if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
1682 err = -EFAULT; 1690 return -EFAULT;
1683 1691
1684 err = kcm_attach_ioctl(sock, &info); 1692 err = kcm_attach_ioctl(sock, &info);
1685 1693
@@ -1689,7 +1697,7 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1689 struct kcm_unattach info; 1697 struct kcm_unattach info;
1690 1698
1691 if (copy_from_user(&info, (void __user *)arg, sizeof(info))) 1699 if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
1692 err = -EFAULT; 1700 return -EFAULT;
1693 1701
1694 err = kcm_unattach_ioctl(sock, &info); 1702 err = kcm_unattach_ioctl(sock, &info);
1695 1703
@@ -1700,7 +1708,7 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1700 struct socket *newsock = NULL; 1708 struct socket *newsock = NULL;
1701 1709
1702 if (copy_from_user(&info, (void __user *)arg, sizeof(info))) 1710 if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
1703 err = -EFAULT; 1711 return -EFAULT;
1704 1712
1705 err = kcm_clone(sock, &info, &newsock); 1713 err = kcm_clone(sock, &info, &newsock);
1706 1714
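Note: the three kcm_ioctl() hunks above fix the same pattern bug: a copy_from_user() failure was only recorded in err and the ioctl went on to use an uninitialised structure. The corrected shape as a standalone sketch, with a hypothetical command and handler:

/* Sketch (demo_attach_ioctl and struct demo_attach are hypothetical):
 * bail out before touching 'info' if the copy failed; continuing with
 * partially written stack data was the original bug.
 */
static int demo_ioctl_attach(struct socket *sock, unsigned long arg)
{
	struct demo_attach info;

	if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
		return -EFAULT;

	return demo_attach_ioctl(sock, &info);
}
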
diff --git a/net/key/af_key.c b/net/key/af_key.c
index f9c9ecb0cdd3..be8cecc65002 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -36,7 +36,7 @@
36#define _X2KEY(x) ((x) == XFRM_INF ? 0 : (x)) 36#define _X2KEY(x) ((x) == XFRM_INF ? 0 : (x))
37#define _KEY2X(x) ((x) == 0 ? XFRM_INF : (x)) 37#define _KEY2X(x) ((x) == 0 ? XFRM_INF : (x))
38 38
39static int pfkey_net_id __read_mostly; 39static unsigned int pfkey_net_id __read_mostly;
40struct netns_pfkey { 40struct netns_pfkey {
41 /* List of all pfkey sockets. */ 41 /* List of all pfkey sockets. */
42 struct hlist_head table; 42 struct hlist_head table;
@@ -63,8 +63,13 @@ struct pfkey_sock {
63 } u; 63 } u;
64 struct sk_buff *skb; 64 struct sk_buff *skb;
65 } dump; 65 } dump;
66 struct mutex dump_lock;
66}; 67};
67 68
69static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len,
70 xfrm_address_t *saddr, xfrm_address_t *daddr,
71 u16 *family);
72
68static inline struct pfkey_sock *pfkey_sk(struct sock *sk) 73static inline struct pfkey_sock *pfkey_sk(struct sock *sk)
69{ 74{
70 return (struct pfkey_sock *)sk; 75 return (struct pfkey_sock *)sk;
@@ -139,6 +144,7 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol,
139{ 144{
140 struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); 145 struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
141 struct sock *sk; 146 struct sock *sk;
147 struct pfkey_sock *pfk;
142 int err; 148 int err;
143 149
144 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 150 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
@@ -153,6 +159,9 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol,
153 if (sk == NULL) 159 if (sk == NULL)
154 goto out; 160 goto out;
155 161
162 pfk = pfkey_sk(sk);
163 mutex_init(&pfk->dump_lock);
164
156 sock->ops = &pfkey_ops; 165 sock->ops = &pfkey_ops;
157 sock_init_data(sock, sk); 166 sock_init_data(sock, sk);
158 167
@@ -281,13 +290,23 @@ static int pfkey_do_dump(struct pfkey_sock *pfk)
281 struct sadb_msg *hdr; 290 struct sadb_msg *hdr;
282 int rc; 291 int rc;
283 292
293 mutex_lock(&pfk->dump_lock);
294 if (!pfk->dump.dump) {
295 rc = 0;
296 goto out;
297 }
298
284 rc = pfk->dump.dump(pfk); 299 rc = pfk->dump.dump(pfk);
285 if (rc == -ENOBUFS) 300 if (rc == -ENOBUFS) {
286 return 0; 301 rc = 0;
302 goto out;
303 }
287 304
288 if (pfk->dump.skb) { 305 if (pfk->dump.skb) {
289 if (!pfkey_can_dump(&pfk->sk)) 306 if (!pfkey_can_dump(&pfk->sk)) {
290 return 0; 307 rc = 0;
308 goto out;
309 }
291 310
292 hdr = (struct sadb_msg *) pfk->dump.skb->data; 311 hdr = (struct sadb_msg *) pfk->dump.skb->data;
293 hdr->sadb_msg_seq = 0; 312 hdr->sadb_msg_seq = 0;
@@ -298,6 +317,9 @@ static int pfkey_do_dump(struct pfkey_sock *pfk)
298 } 317 }
299 318
300 pfkey_terminate_dump(pfk); 319 pfkey_terminate_dump(pfk);
320
321out:
322 mutex_unlock(&pfk->dump_lock);
301 return rc; 323 return rc;
302} 324}
303 325
@@ -1793,19 +1815,26 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_ms
1793 struct xfrm_address_filter *filter = NULL; 1815 struct xfrm_address_filter *filter = NULL;
1794 struct pfkey_sock *pfk = pfkey_sk(sk); 1816 struct pfkey_sock *pfk = pfkey_sk(sk);
1795 1817
1796 if (pfk->dump.dump != NULL) 1818 mutex_lock(&pfk->dump_lock);
1819 if (pfk->dump.dump != NULL) {
1820 mutex_unlock(&pfk->dump_lock);
1797 return -EBUSY; 1821 return -EBUSY;
1822 }
1798 1823
1799 proto = pfkey_satype2proto(hdr->sadb_msg_satype); 1824 proto = pfkey_satype2proto(hdr->sadb_msg_satype);
1800 if (proto == 0) 1825 if (proto == 0) {
1826 mutex_unlock(&pfk->dump_lock);
1801 return -EINVAL; 1827 return -EINVAL;
1828 }
1802 1829
1803 if (ext_hdrs[SADB_X_EXT_FILTER - 1]) { 1830 if (ext_hdrs[SADB_X_EXT_FILTER - 1]) {
1804 struct sadb_x_filter *xfilter = ext_hdrs[SADB_X_EXT_FILTER - 1]; 1831 struct sadb_x_filter *xfilter = ext_hdrs[SADB_X_EXT_FILTER - 1];
1805 1832
1806 filter = kmalloc(sizeof(*filter), GFP_KERNEL); 1833 filter = kmalloc(sizeof(*filter), GFP_KERNEL);
1807 if (filter == NULL) 1834 if (filter == NULL) {
1835 mutex_unlock(&pfk->dump_lock);
1808 return -ENOMEM; 1836 return -ENOMEM;
1837 }
1809 1838
1810 memcpy(&filter->saddr, &xfilter->sadb_x_filter_saddr, 1839 memcpy(&filter->saddr, &xfilter->sadb_x_filter_saddr,
1811 sizeof(xfrm_address_t)); 1840 sizeof(xfrm_address_t));
@@ -1821,6 +1850,7 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_ms
1821 pfk->dump.dump = pfkey_dump_sa; 1850 pfk->dump.dump = pfkey_dump_sa;
1822 pfk->dump.done = pfkey_dump_sa_done; 1851 pfk->dump.done = pfkey_dump_sa_done;
1823 xfrm_state_walk_init(&pfk->dump.u.state, proto, filter); 1852 xfrm_state_walk_init(&pfk->dump.u.state, proto, filter);
1853 mutex_unlock(&pfk->dump_lock);
1824 1854
1825 return pfkey_do_dump(pfk); 1855 return pfkey_do_dump(pfk);
1826} 1856}
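
The new pfk->dump_lock makes "is a dump already running?" (the -EBUSY check) and "install the dump callbacks" one atomic step, and pfkey_do_dump() re-checks the state under the same mutex before running a step, unlocking through a single out label on every path. The following is a hedged userspace sketch of that claim-under-lock pattern using pthreads; the struct and function names are invented for illustration and deliberately ignore the skb/ENOBUFS details.

```c
/* Hedged sketch of the dump_lock pattern: claim shared dump state under a
 * mutex so two concurrent dump requests cannot both install callbacks. */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct dumper {
	pthread_mutex_t lock;
	int (*dump)(void *ctx);   /* non-NULL while a dump is in progress */
	void *ctx;
};

static int start_dump(struct dumper *d, int (*fn)(void *), void *ctx)
{
	pthread_mutex_lock(&d->lock);
	if (d->dump) {                      /* someone already owns the dump */
		pthread_mutex_unlock(&d->lock);
		return -EBUSY;
	}
	d->dump = fn;                       /* claim happens under the lock */
	d->ctx = ctx;
	pthread_mutex_unlock(&d->lock);
	return 0;
}

static int run_dump_step(struct dumper *d)
{
	int rc = 0;

	pthread_mutex_lock(&d->lock);
	if (!d->dump)                       /* dump may have finished meanwhile */
		goto out;
	rc = d->dump(d->ctx);
	if (rc <= 0)                        /* done or error: release the slot */
		d->dump = NULL;
out:
	pthread_mutex_unlock(&d->lock);     /* single unlock point, like the patch */
	return rc;
}

static int count_down(void *ctx) { return --*(int *)ctx; }

int main(void)
{
	struct dumper d = { .lock = PTHREAD_MUTEX_INITIALIZER };
	int remaining = 3;

	printf("first start:  %d\n", start_dump(&d, count_down, &remaining));
	printf("second start: %d (expect -EBUSY)\n",
	       start_dump(&d, count_down, &remaining));
	while (run_dump_step(&d) > 0)
		;
	printf("restart after completion: %d\n",
	       start_dump(&d, count_down, &remaining));
	return 0;
}
```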
@@ -1913,19 +1943,14 @@ parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq)
1913 1943
1914 /* addresses present only in tunnel mode */ 1944 /* addresses present only in tunnel mode */
1915 if (t->mode == XFRM_MODE_TUNNEL) { 1945 if (t->mode == XFRM_MODE_TUNNEL) {
1916 u8 *sa = (u8 *) (rq + 1); 1946 int err;
1917 int family, socklen;
1918 1947
1919 family = pfkey_sockaddr_extract((struct sockaddr *)sa, 1948 err = parse_sockaddr_pair(
1920 &t->saddr); 1949 (struct sockaddr *)(rq + 1),
1921 if (!family) 1950 rq->sadb_x_ipsecrequest_len - sizeof(*rq),
1922 return -EINVAL; 1951 &t->saddr, &t->id.daddr, &t->encap_family);
1923 1952 if (err)
1924 socklen = pfkey_sockaddr_len(family); 1953 return err;
1925 if (pfkey_sockaddr_extract((struct sockaddr *)(sa + socklen),
1926 &t->id.daddr) != family)
1927 return -EINVAL;
1928 t->encap_family = family;
1929 } else 1954 } else
1930 t->encap_family = xp->family; 1955 t->encap_family = xp->family;
1931 1956
@@ -1945,7 +1970,11 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol)
1945 if (pol->sadb_x_policy_len * 8 < sizeof(struct sadb_x_policy)) 1970 if (pol->sadb_x_policy_len * 8 < sizeof(struct sadb_x_policy))
1946 return -EINVAL; 1971 return -EINVAL;
1947 1972
1948 while (len >= sizeof(struct sadb_x_ipsecrequest)) { 1973 while (len >= sizeof(*rq)) {
1974 if (len < rq->sadb_x_ipsecrequest_len ||
1975 rq->sadb_x_ipsecrequest_len < sizeof(*rq))
1976 return -EINVAL;
1977
1949 if ((err = parse_ipsecrequest(xp, rq)) < 0) 1978 if ((err = parse_ipsecrequest(xp, rq)) < 0)
1950 return err; 1979 return err;
1951 len -= rq->sadb_x_ipsecrequest_len; 1980 len -= rq->sadb_x_ipsecrequest_len;
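
The check added to parse_ipsecrequests() requires each record's self-declared sadb_x_ipsecrequest_len to be at least the header size and no larger than the bytes remaining before the record is parsed or used to advance the cursor: a length of zero would otherwise loop forever, and an oversized one would walk past the extension. A minimal sketch of that bounds-checked walk over length-prefixed records follows; rec_hdr and the 4-byte layout are invented for illustration.

```c
/* Hedged sketch: walking variable-length, length-prefixed records with the
 * same two checks the patch adds (header fits in len, header <= reclen <= len). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct rec_hdr {              /* stand-in for struct sadb_x_ipsecrequest */
	uint16_t len;         /* total record length, header included */
	uint16_t type;
};

static int walk_records(const uint8_t *buf, size_t len)
{
	while (len >= sizeof(struct rec_hdr)) {
		struct rec_hdr hdr;

		memcpy(&hdr, buf, sizeof(hdr));  /* copy out to avoid alignment issues */

		/* Reject records claiming to be shorter than their own header
		 * (the walk would never advance) or longer than what remains
		 * (the walk would read past the buffer). */
		if (hdr.len < sizeof(struct rec_hdr) || hdr.len > len)
			return -1;

		printf("record type=%u len=%u\n", hdr.type, hdr.len);
		buf += hdr.len;
		len -= hdr.len;
	}
	return 0;
}

int main(void)
{
	uint8_t buf[12];
	struct rec_hdr r1 = { .len = 8, .type = 1 };   /* header + 4 payload bytes */
	struct rec_hdr r2 = { .len = 4, .type = 2 };   /* header only */
	struct rec_hdr bad = { .len = 0, .type = 3 };  /* malformed: would never advance */

	memset(buf, 0, sizeof(buf));
	memcpy(buf, &r1, sizeof(r1));
	memcpy(buf + 8, &r2, sizeof(r2));
	printf("well-formed: %d\n", walk_records(buf, sizeof(buf)));

	memcpy(buf, &bad, sizeof(bad));
	printf("malformed:   %d\n", walk_records(buf, 4));
	return 0;
}
```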
@@ -2408,7 +2437,6 @@ out:
2408 return err; 2437 return err;
2409} 2438}
2410 2439
2411#ifdef CONFIG_NET_KEY_MIGRATE
2412static int pfkey_sockaddr_pair_size(sa_family_t family) 2440static int pfkey_sockaddr_pair_size(sa_family_t family)
2413{ 2441{
2414 return PFKEY_ALIGN8(pfkey_sockaddr_len(family) * 2); 2442 return PFKEY_ALIGN8(pfkey_sockaddr_len(family) * 2);
@@ -2420,7 +2448,7 @@ static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len,
2420{ 2448{
2421 int af, socklen; 2449 int af, socklen;
2422 2450
2423 if (ext_len < pfkey_sockaddr_pair_size(sa->sa_family)) 2451 if (ext_len < 2 || ext_len < pfkey_sockaddr_pair_size(sa->sa_family))
2424 return -EINVAL; 2452 return -EINVAL;
2425 2453
2426 af = pfkey_sockaddr_extract(sa, saddr); 2454 af = pfkey_sockaddr_extract(sa, saddr);
@@ -2436,6 +2464,7 @@ static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len,
2436 return 0; 2464 return 0;
2437} 2465}
2438 2466
2467#ifdef CONFIG_NET_KEY_MIGRATE
2439static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len, 2468static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len,
2440 struct xfrm_migrate *m) 2469 struct xfrm_migrate *m)
2441{ 2470{
@@ -2443,13 +2472,14 @@ static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len,
2443 struct sadb_x_ipsecrequest *rq2; 2472 struct sadb_x_ipsecrequest *rq2;
2444 int mode; 2473 int mode;
2445 2474
2446 if (len <= sizeof(struct sadb_x_ipsecrequest) || 2475 if (len < sizeof(*rq1) ||
2447 len < rq1->sadb_x_ipsecrequest_len) 2476 len < rq1->sadb_x_ipsecrequest_len ||
2477 rq1->sadb_x_ipsecrequest_len < sizeof(*rq1))
2448 return -EINVAL; 2478 return -EINVAL;
2449 2479
2450 /* old endoints */ 2480 /* old endoints */
2451 err = parse_sockaddr_pair((struct sockaddr *)(rq1 + 1), 2481 err = parse_sockaddr_pair((struct sockaddr *)(rq1 + 1),
2452 rq1->sadb_x_ipsecrequest_len, 2482 rq1->sadb_x_ipsecrequest_len - sizeof(*rq1),
2453 &m->old_saddr, &m->old_daddr, 2483 &m->old_saddr, &m->old_daddr,
2454 &m->old_family); 2484 &m->old_family);
2455 if (err) 2485 if (err)
@@ -2458,13 +2488,14 @@ static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len,
2458 rq2 = (struct sadb_x_ipsecrequest *)((u8 *)rq1 + rq1->sadb_x_ipsecrequest_len); 2488 rq2 = (struct sadb_x_ipsecrequest *)((u8 *)rq1 + rq1->sadb_x_ipsecrequest_len);
2459 len -= rq1->sadb_x_ipsecrequest_len; 2489 len -= rq1->sadb_x_ipsecrequest_len;
2460 2490
2461 if (len <= sizeof(struct sadb_x_ipsecrequest) || 2491 if (len <= sizeof(*rq2) ||
2462 len < rq2->sadb_x_ipsecrequest_len) 2492 len < rq2->sadb_x_ipsecrequest_len ||
2493 rq2->sadb_x_ipsecrequest_len < sizeof(*rq2))
2463 return -EINVAL; 2494 return -EINVAL;
2464 2495
2465 /* new endpoints */ 2496 /* new endpoints */
2466 err = parse_sockaddr_pair((struct sockaddr *)(rq2 + 1), 2497 err = parse_sockaddr_pair((struct sockaddr *)(rq2 + 1),
2467 rq2->sadb_x_ipsecrequest_len, 2498 rq2->sadb_x_ipsecrequest_len - sizeof(*rq2),
2468 &m->new_saddr, &m->new_daddr, 2499 &m->new_saddr, &m->new_daddr,
2469 &m->new_family); 2500 &m->new_family);
2470 if (err) 2501 if (err)
@@ -2679,14 +2710,18 @@ static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, const struct sadb
2679{ 2710{
2680 struct pfkey_sock *pfk = pfkey_sk(sk); 2711 struct pfkey_sock *pfk = pfkey_sk(sk);
2681 2712
2682 if (pfk->dump.dump != NULL) 2713 mutex_lock(&pfk->dump_lock);
2714 if (pfk->dump.dump != NULL) {
2715 mutex_unlock(&pfk->dump_lock);
2683 return -EBUSY; 2716 return -EBUSY;
2717 }
2684 2718
2685 pfk->dump.msg_version = hdr->sadb_msg_version; 2719 pfk->dump.msg_version = hdr->sadb_msg_version;
2686 pfk->dump.msg_portid = hdr->sadb_msg_pid; 2720 pfk->dump.msg_portid = hdr->sadb_msg_pid;
2687 pfk->dump.dump = pfkey_dump_sp; 2721 pfk->dump.dump = pfkey_dump_sp;
2688 pfk->dump.done = pfkey_dump_sp_done; 2722 pfk->dump.done = pfkey_dump_sp_done;
2689 xfrm_policy_walk_init(&pfk->dump.u.policy, XFRM_POLICY_TYPE_MAIN); 2723 xfrm_policy_walk_init(&pfk->dump.u.policy, XFRM_POLICY_TYPE_MAIN);
2724 mutex_unlock(&pfk->dump_lock);
2690 2725
2691 return pfkey_do_dump(pfk); 2726 return pfkey_do_dump(pfk);
2692} 2727}
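
Two related details in the af_key hunks above: parse_sockaddr_pair() now receives the payload length that actually follows the header (sadb_x_ipsecrequest_len minus sizeof(*rq)) and rejects buffers too short to even hold a sa_family field, and it moves out from under `#ifdef CONFIG_NET_KEY_MIGRATE` because parse_ipsecrequest() now calls it unconditionally. The second point is purely a build-structure one; here is a toy sketch of why a helper shared by conditional and unconditional callers has to sit outside the guard (the config macro name is invented).

```c
/* Hedged sketch: a helper called from both conditional and unconditional
 * code must be compiled unconditionally, or builds without the option
 * fail to link. CONFIG_DEMO_MIGRATE is a made-up stand-in. */
#include <stdio.h>

/* #define CONFIG_DEMO_MIGRATE 1 */   /* toggle to mimic the Kconfig option */

static int parse_pair(int a, int b)    /* always built: both callers need it */
{
	return a + b;
}

#ifdef CONFIG_DEMO_MIGRATE
static int migrate_only(void)          /* built only with the option enabled */
{
	return parse_pair(10, 20);
}
#endif

int main(void)
{
	printf("unconditional caller: %d\n", parse_pair(1, 2));
#ifdef CONFIG_DEMO_MIGRATE
	printf("migrate-only caller:  %d\n", migrate_only());
#endif
	return 0;
}
```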
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index a2ed3bda4ddc..e37d9554da7b 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -278,7 +278,57 @@ struct l2tp_session *l2tp_session_find(struct net *net, struct l2tp_tunnel *tunn
278} 278}
279EXPORT_SYMBOL_GPL(l2tp_session_find); 279EXPORT_SYMBOL_GPL(l2tp_session_find);
280 280
281struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth) 281/* Like l2tp_session_find() but takes a reference on the returned session.
282 * Optionally calls session->ref() too if do_ref is true.
283 */
284struct l2tp_session *l2tp_session_get(struct net *net,
285 struct l2tp_tunnel *tunnel,
286 u32 session_id, bool do_ref)
287{
288 struct hlist_head *session_list;
289 struct l2tp_session *session;
290
291 if (!tunnel) {
292 struct l2tp_net *pn = l2tp_pernet(net);
293
294 session_list = l2tp_session_id_hash_2(pn, session_id);
295
296 rcu_read_lock_bh();
297 hlist_for_each_entry_rcu(session, session_list, global_hlist) {
298 if (session->session_id == session_id) {
299 l2tp_session_inc_refcount(session);
300 if (do_ref && session->ref)
301 session->ref(session);
302 rcu_read_unlock_bh();
303
304 return session;
305 }
306 }
307 rcu_read_unlock_bh();
308
309 return NULL;
310 }
311
312 session_list = l2tp_session_id_hash(tunnel, session_id);
313 read_lock_bh(&tunnel->hlist_lock);
314 hlist_for_each_entry(session, session_list, hlist) {
315 if (session->session_id == session_id) {
316 l2tp_session_inc_refcount(session);
317 if (do_ref && session->ref)
318 session->ref(session);
319 read_unlock_bh(&tunnel->hlist_lock);
320
321 return session;
322 }
323 }
324 read_unlock_bh(&tunnel->hlist_lock);
325
326 return NULL;
327}
328EXPORT_SYMBOL_GPL(l2tp_session_get);
329
330struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth,
331 bool do_ref)
282{ 332{
283 int hash; 333 int hash;
284 struct l2tp_session *session; 334 struct l2tp_session *session;
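
The new l2tp_session_get() differs from l2tp_session_find() in exactly one respect: it bumps the session refcount (and optionally calls ->ref()) while still inside the RCU read section or the tunnel's hlist read lock, so the returned pointer cannot be freed underneath the caller. A hedged userspace sketch of "take the reference before dropping the lookup lock" follows; a plain mutex and counter stand in for the kernel's RCU and refcounting, and all names are illustrative.

```c
/* Hedged sketch of a refcounted lookup: the reference is taken while the
 * table lock is held, so the object cannot vanish between lookup and use. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct session {
	unsigned int id;
	int refcnt;
	struct session *next;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct session *table;          /* singly linked "hash bucket" */

static struct session *session_get(unsigned int id)
{
	struct session *s;

	pthread_mutex_lock(&table_lock);
	for (s = table; s; s = s->next) {
		if (s->id == id) {
			s->refcnt++;   /* taken under the lock, as in the patch */
			break;
		}
	}
	pthread_mutex_unlock(&table_lock);
	return s;                      /* NULL if not found */
}

static void session_put(struct session *s)
{
	int free_it;

	pthread_mutex_lock(&table_lock);
	free_it = (--s->refcnt == 0);
	pthread_mutex_unlock(&table_lock);
	if (free_it)
		free(s);
}

int main(void)
{
	struct session *s = calloc(1, sizeof(*s));

	s->id = 42;
	s->refcnt = 1;                 /* reference held by the table itself */
	table = s;

	struct session *found = session_get(42);
	printf("found id=%u refcnt=%d\n", found->id, found->refcnt);
	session_put(found);            /* drop the lookup reference when done */

	printf("lookup of missing id: %p\n", (void *)session_get(7));
	return 0;
}
```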
@@ -288,6 +338,9 @@ struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth)
288 for (hash = 0; hash < L2TP_HASH_SIZE; hash++) { 338 for (hash = 0; hash < L2TP_HASH_SIZE; hash++) {
289 hlist_for_each_entry(session, &tunnel->session_hlist[hash], hlist) { 339 hlist_for_each_entry(session, &tunnel->session_hlist[hash], hlist) {
290 if (++count > nth) { 340 if (++count > nth) {
341 l2tp_session_inc_refcount(session);
342 if (do_ref && session->ref)
343 session->ref(session);
291 read_unlock_bh(&tunnel->hlist_lock); 344 read_unlock_bh(&tunnel->hlist_lock);
292 return session; 345 return session;
293 } 346 }
@@ -298,12 +351,13 @@ struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth)
298 351
299 return NULL; 352 return NULL;
300} 353}
301EXPORT_SYMBOL_GPL(l2tp_session_find_nth); 354EXPORT_SYMBOL_GPL(l2tp_session_get_nth);
302 355
303/* Lookup a session by interface name. 356/* Lookup a session by interface name.
304 * This is very inefficient but is only used by management interfaces. 357 * This is very inefficient but is only used by management interfaces.
305 */ 358 */
306struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname) 359struct l2tp_session *l2tp_session_get_by_ifname(struct net *net, char *ifname,
360 bool do_ref)
307{ 361{
308 struct l2tp_net *pn = l2tp_pernet(net); 362 struct l2tp_net *pn = l2tp_pernet(net);
309 int hash; 363 int hash;
@@ -313,7 +367,11 @@ struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname)
313 for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) { 367 for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) {
314 hlist_for_each_entry_rcu(session, &pn->l2tp_session_hlist[hash], global_hlist) { 368 hlist_for_each_entry_rcu(session, &pn->l2tp_session_hlist[hash], global_hlist) {
315 if (!strcmp(session->ifname, ifname)) { 369 if (!strcmp(session->ifname, ifname)) {
370 l2tp_session_inc_refcount(session);
371 if (do_ref && session->ref)
372 session->ref(session);
316 rcu_read_unlock_bh(); 373 rcu_read_unlock_bh();
374
317 return session; 375 return session;
318 } 376 }
319 } 377 }
@@ -323,7 +381,49 @@ struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname)
323 381
324 return NULL; 382 return NULL;
325} 383}
326EXPORT_SYMBOL_GPL(l2tp_session_find_by_ifname); 384EXPORT_SYMBOL_GPL(l2tp_session_get_by_ifname);
385
386static int l2tp_session_add_to_tunnel(struct l2tp_tunnel *tunnel,
387 struct l2tp_session *session)
388{
389 struct l2tp_session *session_walk;
390 struct hlist_head *g_head;
391 struct hlist_head *head;
392 struct l2tp_net *pn;
393
394 head = l2tp_session_id_hash(tunnel, session->session_id);
395
396 write_lock_bh(&tunnel->hlist_lock);
397 hlist_for_each_entry(session_walk, head, hlist)
398 if (session_walk->session_id == session->session_id)
399 goto exist;
400
401 if (tunnel->version == L2TP_HDR_VER_3) {
402 pn = l2tp_pernet(tunnel->l2tp_net);
403 g_head = l2tp_session_id_hash_2(l2tp_pernet(tunnel->l2tp_net),
404 session->session_id);
405
406 spin_lock_bh(&pn->l2tp_session_hlist_lock);
407 hlist_for_each_entry(session_walk, g_head, global_hlist)
408 if (session_walk->session_id == session->session_id)
409 goto exist_glob;
410
411 hlist_add_head_rcu(&session->global_hlist, g_head);
412 spin_unlock_bh(&pn->l2tp_session_hlist_lock);
413 }
414
415 hlist_add_head(&session->hlist, head);
416 write_unlock_bh(&tunnel->hlist_lock);
417
418 return 0;
419
420exist_glob:
421 spin_unlock_bh(&pn->l2tp_session_hlist_lock);
422exist:
423 write_unlock_bh(&tunnel->hlist_lock);
424
425 return -EEXIST;
426}
327 427
328/* Lookup a tunnel by id 428/* Lookup a tunnel by id
329 */ 429 */
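
l2tp_session_add_to_tunnel() performs the duplicate check and the hash insertion under the same locks (the tunnel's hlist write lock, plus the per-net spinlock for the L2TPv3 global list), so two racing creators of the same session_id cannot both succeed; the loser gets -EEXIST, and the error paths unwind the locks in reverse order through the exist_glob/exist labels. Below is a condensed sketch of that check-and-insert-under-one-lock shape with the same goto-unwind structure; the arrays, lock names and NSLOTS limit are all invented.

```c
/* Hedged sketch: duplicate check and insertion happen under the same lock(s),
 * and the error path unwinds them in reverse order via goto labels. */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

#define NSLOTS 8

static pthread_mutex_t local_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t global_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int local_ids[NSLOTS], global_ids[NSLOTS];
static int nlocal, nglobal;

static int id_present(const unsigned int *ids, int n, unsigned int id)
{
	for (int i = 0; i < n; i++)
		if (ids[i] == id)
			return 1;
	return 0;
}

static int add_session(unsigned int id, int also_global)
{
	int err = -EEXIST;

	pthread_mutex_lock(&local_lock);
	if (id_present(local_ids, nlocal, id))
		goto out_local;

	if (also_global) {              /* mirrors the L2TPv3-only global list */
		pthread_mutex_lock(&global_lock);
		if (id_present(global_ids, nglobal, id))
			goto out_global;
		global_ids[nglobal++] = id;
		pthread_mutex_unlock(&global_lock);
	}

	local_ids[nlocal++] = id;
	err = 0;
	goto out_local;

out_global:
	pthread_mutex_unlock(&global_lock);
out_local:
	pthread_mutex_unlock(&local_lock);
	return err;
}

int main(void)
{
	printf("first add:  %d\n", add_session(5, 1));
	printf("second add: %d (expect -EEXIST)\n", add_session(5, 1));
	return 0;
}
```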
@@ -633,6 +733,9 @@ discard:
633 * a data (not control) frame before coming here. Fields up to the 733 * a data (not control) frame before coming here. Fields up to the
634 * session-id have already been parsed and ptr points to the data 734 * session-id have already been parsed and ptr points to the data
635 * after the session-id. 735 * after the session-id.
736 *
737 * session->ref() must have been called prior to l2tp_recv_common().
738 * session->deref() will be called automatically after skb is processed.
636 */ 739 */
637void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, 740void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
638 unsigned char *ptr, unsigned char *optr, u16 hdrflags, 741 unsigned char *ptr, unsigned char *optr, u16 hdrflags,
@@ -642,14 +745,6 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
642 int offset; 745 int offset;
643 u32 ns, nr; 746 u32 ns, nr;
644 747
645 /* The ref count is increased since we now hold a pointer to
646 * the session. Take care to decrement the refcnt when exiting
647 * this function from now on...
648 */
649 l2tp_session_inc_refcount(session);
650 if (session->ref)
651 (*session->ref)(session);
652
653 /* Parse and check optional cookie */ 748 /* Parse and check optional cookie */
654 if (session->peer_cookie_len > 0) { 749 if (session->peer_cookie_len > 0) {
655 if (memcmp(ptr, &session->peer_cookie[0], session->peer_cookie_len)) { 750 if (memcmp(ptr, &session->peer_cookie[0], session->peer_cookie_len)) {
@@ -715,7 +810,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
715 l2tp_info(session, L2TP_MSG_SEQ, 810 l2tp_info(session, L2TP_MSG_SEQ,
716 "%s: requested to enable seq numbers by LNS\n", 811 "%s: requested to enable seq numbers by LNS\n",
717 session->name); 812 session->name);
718 session->send_seq = -1; 813 session->send_seq = 1;
719 l2tp_session_set_header_len(session, tunnel->version); 814 l2tp_session_set_header_len(session, tunnel->version);
720 } 815 }
721 } else { 816 } else {
@@ -802,8 +897,6 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
802 /* Try to dequeue as many skbs from reorder_q as we can. */ 897 /* Try to dequeue as many skbs from reorder_q as we can. */
803 l2tp_recv_dequeue(session); 898 l2tp_recv_dequeue(session);
804 899
805 l2tp_session_dec_refcount(session);
806
807 return; 900 return;
808 901
809discard: 902discard:
@@ -812,8 +905,6 @@ discard:
812 905
813 if (session->deref) 906 if (session->deref)
814 (*session->deref)(session); 907 (*session->deref)(session);
815
816 l2tp_session_dec_refcount(session);
817} 908}
818EXPORT_SYMBOL(l2tp_recv_common); 909EXPORT_SYMBOL(l2tp_recv_common);
819 910
@@ -920,8 +1011,14 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
920 } 1011 }
921 1012
922 /* Find the session context */ 1013 /* Find the session context */
923 session = l2tp_session_find(tunnel->l2tp_net, tunnel, session_id); 1014 session = l2tp_session_get(tunnel->l2tp_net, tunnel, session_id, true);
924 if (!session || !session->recv_skb) { 1015 if (!session || !session->recv_skb) {
1016 if (session) {
1017 if (session->deref)
1018 session->deref(session);
1019 l2tp_session_dec_refcount(session);
1020 }
1021
925 /* Not found? Pass to userspace to deal with */ 1022 /* Not found? Pass to userspace to deal with */
926 l2tp_info(tunnel, L2TP_MSG_DATA, 1023 l2tp_info(tunnel, L2TP_MSG_DATA,
927 "%s: no session found (%u/%u). Passing up.\n", 1024 "%s: no session found (%u/%u). Passing up.\n",
@@ -930,6 +1027,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
930 } 1027 }
931 1028
932 l2tp_recv_common(session, skb, ptr, optr, hdrflags, length, payload_hook); 1029 l2tp_recv_common(session, skb, ptr, optr, hdrflags, length, payload_hook);
1030 l2tp_session_dec_refcount(session);
933 1031
934 return 0; 1032 return 0;
935 1033
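
With l2tp_recv_common() no longer taking its own reference, every receive path (UDP encap here, l2tp_ip and l2tp_ip6 below) follows the same shape: look the session up with a reference and an optional ->ref(), hand the skb over, then drop the reference; if the session turns out to be unusable, the same drop plus ->deref() happens on the discard path. The sketch below pairs ref/deref symmetrically for simplicity; in the kernel the ->deref() for delivered packets happens later, when the queued skb is actually consumed. All names and the `usable` flag are illustrative.

```c
/* Hedged sketch of the do_ref/deref convention: the lookup optionally calls
 * an extra ->ref() hook, and the caller issues the matching ->deref() and
 * refcount drop on every exit path, including discards. */
#include <stdio.h>

struct session {
	int refcnt;
	int pw_users;                    /* touched by the ref()/deref() hooks */
	void (*ref)(struct session *);
	void (*deref)(struct session *);
	int usable;                      /* e.g. a recv callback is installed */
};

static void pw_ref(struct session *s)   { s->pw_users++; }
static void pw_deref(struct session *s) { s->pw_users--; }

static struct session *session_get(struct session *s, int do_ref)
{
	s->refcnt++;
	if (do_ref && s->ref)
		s->ref(s);
	return s;
}

static void session_put(struct session *s, int did_ref)
{
	if (did_ref && s->deref)
		s->deref(s);
	s->refcnt--;
}

static void recv_packet(struct session *s, const char *payload)
{
	struct session *sess = session_get(s, 1);

	if (!sess->usable) {             /* mirrors the "no recv_skb" branch */
		printf("discard: %s\n", payload);
		goto put;
	}
	printf("deliver: %s\n", payload);
put:
	session_put(sess, 1);            /* balanced on every path */
}

int main(void)
{
	struct session s = { .refcnt = 1, .ref = pw_ref, .deref = pw_deref,
			     .usable = 1 };

	recv_packet(&s, "hello");
	s.usable = 0;
	recv_packet(&s, "dropped");
	printf("refcnt=%d pw_users=%d (back to initial values)\n",
	       s.refcnt, s.pw_users);
	return 0;
}
```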
@@ -1058,10 +1156,10 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
1058 1156
1059 /* Debug */ 1157 /* Debug */
1060 if (session->send_seq) 1158 if (session->send_seq)
1061 l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %Zd bytes, ns=%u\n", 1159 l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes, ns=%u\n",
1062 session->name, data_len, session->ns - 1); 1160 session->name, data_len, session->ns - 1);
1063 else 1161 else
1064 l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %Zd bytes\n", 1162 l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes\n",
1065 session->name, data_len); 1163 session->name, data_len);
1066 1164
1067 if (session->debug & L2TP_MSG_DATA) { 1165 if (session->debug & L2TP_MSG_DATA) {
@@ -1317,6 +1415,9 @@ static void l2tp_tunnel_del_work(struct work_struct *work)
1317 struct sock *sk = NULL; 1415 struct sock *sk = NULL;
1318 1416
1319 tunnel = container_of(work, struct l2tp_tunnel, del_work); 1417 tunnel = container_of(work, struct l2tp_tunnel, del_work);
1418
1419 l2tp_tunnel_closeall(tunnel);
1420
1320 sk = l2tp_tunnel_sock_lookup(tunnel); 1421 sk = l2tp_tunnel_sock_lookup(tunnel);
1321 if (!sk) 1422 if (!sk)
1322 goto out; 1423 goto out;
@@ -1639,7 +1740,6 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create);
1639int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) 1740int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel)
1640{ 1741{
1641 l2tp_tunnel_inc_refcount(tunnel); 1742 l2tp_tunnel_inc_refcount(tunnel);
1642 l2tp_tunnel_closeall(tunnel);
1643 if (false == queue_work(l2tp_wq, &tunnel->del_work)) { 1743 if (false == queue_work(l2tp_wq, &tunnel->del_work)) {
1644 l2tp_tunnel_dec_refcount(tunnel); 1744 l2tp_tunnel_dec_refcount(tunnel);
1645 return 1; 1745 return 1;
@@ -1736,6 +1836,7 @@ EXPORT_SYMBOL_GPL(l2tp_session_set_header_len);
1736struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg) 1836struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg)
1737{ 1837{
1738 struct l2tp_session *session; 1838 struct l2tp_session *session;
1839 int err;
1739 1840
1740 session = kzalloc(sizeof(struct l2tp_session) + priv_size, GFP_KERNEL); 1841 session = kzalloc(sizeof(struct l2tp_session) + priv_size, GFP_KERNEL);
1741 if (session != NULL) { 1842 if (session != NULL) {
@@ -1791,6 +1892,13 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
1791 1892
1792 l2tp_session_set_header_len(session, tunnel->version); 1893 l2tp_session_set_header_len(session, tunnel->version);
1793 1894
1895 err = l2tp_session_add_to_tunnel(tunnel, session);
1896 if (err) {
1897 kfree(session);
1898
1899 return ERR_PTR(err);
1900 }
1901
1794 /* Bump the reference count. The session context is deleted 1902 /* Bump the reference count. The session context is deleted
1795 * only when this drops to zero. 1903 * only when this drops to zero.
1796 */ 1904 */
@@ -1800,28 +1908,14 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
1800 /* Ensure tunnel socket isn't deleted */ 1908 /* Ensure tunnel socket isn't deleted */
1801 sock_hold(tunnel->sock); 1909 sock_hold(tunnel->sock);
1802 1910
1803 /* Add session to the tunnel's hash list */
1804 write_lock_bh(&tunnel->hlist_lock);
1805 hlist_add_head(&session->hlist,
1806 l2tp_session_id_hash(tunnel, session_id));
1807 write_unlock_bh(&tunnel->hlist_lock);
1808
1809 /* And to the global session list if L2TPv3 */
1810 if (tunnel->version != L2TP_HDR_VER_2) {
1811 struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net);
1812
1813 spin_lock_bh(&pn->l2tp_session_hlist_lock);
1814 hlist_add_head_rcu(&session->global_hlist,
1815 l2tp_session_id_hash_2(pn, session_id));
1816 spin_unlock_bh(&pn->l2tp_session_hlist_lock);
1817 }
1818
1819 /* Ignore management session in session count value */ 1911 /* Ignore management session in session count value */
1820 if (session->session_id != 0) 1912 if (session->session_id != 0)
1821 atomic_inc(&l2tp_session_count); 1913 atomic_inc(&l2tp_session_count);
1914
1915 return session;
1822 } 1916 }
1823 1917
1824 return session; 1918 return ERR_PTR(-ENOMEM);
1825} 1919}
1826EXPORT_SYMBOL_GPL(l2tp_session_create); 1920EXPORT_SYMBOL_GPL(l2tp_session_create);
1827 1921
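
l2tp_session_create() now reports failures through ERR_PTR(), so callers such as l2tp_eth_create() and pppol2tp_connect() can distinguish -EEXIST from the duplicate check from -ENOMEM, testing IS_ERR()/PTR_ERR() instead of comparing against NULL. The sketch below imitates the convention in userspace; the real macros live in include/linux/err.h and these are simplified stand-ins.

```c
/* Hedged sketch of the ERR_PTR/IS_ERR/PTR_ERR convention: the error code is
 * encoded in the pointer itself, so a constructor can say *why* it failed. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long err)     { return (void *)err; }
static inline long PTR_ERR(const void *p) { return (long)p; }
static inline int IS_ERR(const void *p)
{
	return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
}

struct session { unsigned int id; };

static struct session *session_create(unsigned int id, int already_exists)
{
	struct session *s;

	if (already_exists)
		return ERR_PTR(-EEXIST);   /* caller learns the exact reason */
	s = malloc(sizeof(*s));
	if (!s)
		return ERR_PTR(-ENOMEM);
	s->id = id;
	return s;
}

int main(void)
{
	struct session *s = session_create(1, 0);

	if (IS_ERR(s)) {
		printf("create failed: %ld\n", PTR_ERR(s));
	} else {
		printf("created session %u\n", s->id);
		free(s);
	}

	s = session_create(1, 1);
	if (IS_ERR(s))
		printf("duplicate create failed: %ld (expect %d)\n",
		       PTR_ERR(s), -EEXIST);
	return 0;
}
```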
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 2599af6378e4..8ce7818c7a9d 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -23,16 +23,6 @@
23#define L2TP_HASH_BITS_2 8 23#define L2TP_HASH_BITS_2 8
24#define L2TP_HASH_SIZE_2 (1 << L2TP_HASH_BITS_2) 24#define L2TP_HASH_SIZE_2 (1 << L2TP_HASH_BITS_2)
25 25
26/* Debug message categories for the DEBUG socket option */
27enum {
28 L2TP_MSG_DEBUG = (1 << 0), /* verbose debug (if
29 * compiled in) */
30 L2TP_MSG_CONTROL = (1 << 1), /* userspace - kernel
31 * interface */
32 L2TP_MSG_SEQ = (1 << 2), /* sequence numbers */
33 L2TP_MSG_DATA = (1 << 3), /* data packets */
34};
35
36struct sk_buff; 26struct sk_buff;
37 27
38struct l2tp_stats { 28struct l2tp_stats {
@@ -240,11 +230,16 @@ out:
240 return tunnel; 230 return tunnel;
241} 231}
242 232
233struct l2tp_session *l2tp_session_get(struct net *net,
234 struct l2tp_tunnel *tunnel,
235 u32 session_id, bool do_ref);
243struct l2tp_session *l2tp_session_find(struct net *net, 236struct l2tp_session *l2tp_session_find(struct net *net,
244 struct l2tp_tunnel *tunnel, 237 struct l2tp_tunnel *tunnel,
245 u32 session_id); 238 u32 session_id);
246struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth); 239struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth,
247struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname); 240 bool do_ref);
241struct l2tp_session *l2tp_session_get_by_ifname(struct net *net, char *ifname,
242 bool do_ref);
248struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id); 243struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id);
249struct l2tp_tunnel *l2tp_tunnel_find_nth(struct net *net, int nth); 244struct l2tp_tunnel *l2tp_tunnel_find_nth(struct net *net, int nth);
250 245
@@ -273,6 +268,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb,
273int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, 268int l2tp_nl_register_ops(enum l2tp_pwtype pw_type,
274 const struct l2tp_nl_cmd_ops *ops); 269 const struct l2tp_nl_cmd_ops *ops);
275void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); 270void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type);
271int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg);
276 272
277/* Session reference counts. Incremented when code obtains a reference 273/* Session reference counts. Incremented when code obtains a reference
278 * to a session. 274 * to a session.
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index 2d6760a2ae34..d100aed3d06f 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -53,7 +53,7 @@ static void l2tp_dfs_next_tunnel(struct l2tp_dfs_seq_data *pd)
53 53
54static void l2tp_dfs_next_session(struct l2tp_dfs_seq_data *pd) 54static void l2tp_dfs_next_session(struct l2tp_dfs_seq_data *pd)
55{ 55{
56 pd->session = l2tp_session_find_nth(pd->tunnel, pd->session_idx); 56 pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx, true);
57 pd->session_idx++; 57 pd->session_idx++;
58 58
59 if (pd->session == NULL) { 59 if (pd->session == NULL) {
@@ -238,10 +238,14 @@ static int l2tp_dfs_seq_show(struct seq_file *m, void *v)
238 } 238 }
239 239
240 /* Show the tunnel or session context */ 240 /* Show the tunnel or session context */
241 if (pd->session == NULL) 241 if (!pd->session) {
242 l2tp_dfs_seq_tunnel_show(m, pd->tunnel); 242 l2tp_dfs_seq_tunnel_show(m, pd->tunnel);
243 else 243 } else {
244 l2tp_dfs_seq_session_show(m, pd->session); 244 l2tp_dfs_seq_session_show(m, pd->session);
245 if (pd->session->deref)
246 pd->session->deref(pd->session);
247 l2tp_session_dec_refcount(pd->session);
248 }
245 249
246out: 250out:
247 return 0; 251 return 0;
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index 965f7e344cef..6fd41d7afe1e 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -106,8 +106,8 @@ static int l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev)
106 return NETDEV_TX_OK; 106 return NETDEV_TX_OK;
107} 107}
108 108
109static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev, 109static void l2tp_eth_get_stats64(struct net_device *dev,
110 struct rtnl_link_stats64 *stats) 110 struct rtnl_link_stats64 *stats)
111{ 111{
112 struct l2tp_eth *priv = netdev_priv(dev); 112 struct l2tp_eth *priv = netdev_priv(dev);
113 113
@@ -117,10 +117,8 @@ static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev,
117 stats->rx_bytes = atomic_long_read(&priv->rx_bytes); 117 stats->rx_bytes = atomic_long_read(&priv->rx_bytes);
118 stats->rx_packets = atomic_long_read(&priv->rx_packets); 118 stats->rx_packets = atomic_long_read(&priv->rx_packets);
119 stats->rx_errors = atomic_long_read(&priv->rx_errors); 119 stats->rx_errors = atomic_long_read(&priv->rx_errors);
120 return stats;
121} 120}
122 121
123
124static const struct net_device_ops l2tp_eth_netdev_ops = { 122static const struct net_device_ops l2tp_eth_netdev_ops = {
125 .ndo_init = l2tp_eth_dev_init, 123 .ndo_init = l2tp_eth_dev_init,
126 .ndo_uninit = l2tp_eth_dev_uninit, 124 .ndo_uninit = l2tp_eth_dev_uninit,
@@ -223,12 +221,6 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p
223 goto out; 221 goto out;
224 } 222 }
225 223
226 session = l2tp_session_find(net, tunnel, session_id);
227 if (session) {
228 rc = -EEXIST;
229 goto out;
230 }
231
232 if (cfg->ifname) { 224 if (cfg->ifname) {
233 dev = dev_get_by_name(net, cfg->ifname); 225 dev = dev_get_by_name(net, cfg->ifname);
234 if (dev) { 226 if (dev) {
@@ -242,8 +234,8 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p
242 234
243 session = l2tp_session_create(sizeof(*spriv), tunnel, session_id, 235 session = l2tp_session_create(sizeof(*spriv), tunnel, session_id,
244 peer_session_id, cfg); 236 peer_session_id, cfg);
245 if (!session) { 237 if (IS_ERR(session)) {
246 rc = -ENOMEM; 238 rc = PTR_ERR(session);
247 goto out; 239 goto out;
248 } 240 }
249 241
@@ -259,6 +251,8 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p
259 session->mtu = dev->mtu - session->hdr_len; 251 session->mtu = dev->mtu - session->hdr_len;
260 dev->mtu = session->mtu; 252 dev->mtu = session->mtu;
261 dev->needed_headroom += session->hdr_len; 253 dev->needed_headroom += session->hdr_len;
254 dev->min_mtu = 0;
255 dev->max_mtu = ETH_MAX_MTU;
262 256
263 priv = netdev_priv(dev); 257 priv = netdev_priv(dev);
264 priv->dev = dev; 258 priv->dev = dev;
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 8938b6ba57a0..4d322c1b7233 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -11,6 +11,7 @@
11 11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13 13
14#include <asm/ioctls.h>
14#include <linux/icmp.h> 15#include <linux/icmp.h>
15#include <linux/module.h> 16#include <linux/module.h>
16#include <linux/skbuff.h> 17#include <linux/skbuff.h>
@@ -47,23 +48,32 @@ static inline struct l2tp_ip_sock *l2tp_ip_sk(const struct sock *sk)
47 return (struct l2tp_ip_sock *)sk; 48 return (struct l2tp_ip_sock *)sk;
48} 49}
49 50
50static struct sock *__l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif, u32 tunnel_id) 51static struct sock *__l2tp_ip_bind_lookup(const struct net *net, __be32 laddr,
52 __be32 raddr, int dif, u32 tunnel_id)
51{ 53{
52 struct sock *sk; 54 struct sock *sk;
53 55
54 sk_for_each_bound(sk, &l2tp_ip_bind_table) { 56 sk_for_each_bound(sk, &l2tp_ip_bind_table) {
55 struct inet_sock *inet = inet_sk(sk); 57 const struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk);
56 struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk); 58 const struct inet_sock *inet = inet_sk(sk);
57 59
58 if (l2tp == NULL) 60 if (!net_eq(sock_net(sk), net))
59 continue; 61 continue;
60 62
61 if ((l2tp->conn_id == tunnel_id) && 63 if (sk->sk_bound_dev_if && dif && sk->sk_bound_dev_if != dif)
62 net_eq(sock_net(sk), net) && 64 continue;
63 !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && 65
64 (!sk->sk_bound_dev_if || !dif || 66 if (inet->inet_rcv_saddr && laddr &&
65 sk->sk_bound_dev_if == dif)) 67 inet->inet_rcv_saddr != laddr)
66 goto found; 68 continue;
69
70 if (inet->inet_daddr && raddr && inet->inet_daddr != raddr)
71 continue;
72
73 if (l2tp->conn_id != tunnel_id)
74 continue;
75
76 goto found;
67 } 77 }
68 78
69 sk = NULL; 79 sk = NULL;
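
The rewritten __l2tp_ip_bind_lookup() (and its IPv6 counterpart further down) adds the remote address to the match and expresses each criterion as "skip unless it conflicts", with a zero/unset bound address or device acting as a wildcard; a socket connected to one peer is therefore no longer picked for traffic arriving from another. A small sketch of that wildcard-match chain follows, with invented field names and plain integers in place of IP addresses.

```c
/* Hedged sketch of the bind-table match: each criterion is a "continue
 * unless it conflicts" test, and a zero value acts as a wildcard. */
#include <stdio.h>

struct bound_sock {
	unsigned int laddr;      /* 0 = bound to any local address */
	unsigned int raddr;      /* 0 = not connected to a peer */
	int dev;                 /* 0 = not bound to a device */
	unsigned int tunnel_id;
};

static const struct bound_sock *lookup(const struct bound_sock *tbl, int n,
				       unsigned int laddr, unsigned int raddr,
				       int dev, unsigned int tunnel_id)
{
	for (int i = 0; i < n; i++) {
		const struct bound_sock *s = &tbl[i];

		if (s->dev && dev && s->dev != dev)
			continue;
		if (s->laddr && laddr && s->laddr != laddr)
			continue;
		if (s->raddr && raddr && s->raddr != raddr)
			continue;
		if (s->tunnel_id != tunnel_id)     /* tunnel id must match exactly */
			continue;
		return s;        /* first socket with no conflicting criterion */
	}
	return NULL;
}

int main(void)
{
	struct bound_sock tbl[] = {
		{ .laddr = 0x0a000001, .raddr = 0x0a000002, .dev = 0, .tunnel_id = 1 },
		{ .laddr = 0,          .raddr = 0,          .dev = 0, .tunnel_id = 1 },
	};

	/* An exact peer match hits the connected socket (index 0); anything
	 * else falls back to the wildcard-bound one (index 1). */
	printf("connected: %td\n",
	       lookup(tbl, 2, 0x0a000001, 0x0a000002, 3, 1) - tbl);
	printf("wildcard:  %td\n",
	       lookup(tbl, 2, 0x0b000001, 0x0b000002, 3, 1) - tbl);
	return 0;
}
```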
@@ -71,15 +81,6 @@ found:
71 return sk; 81 return sk;
72} 82}
73 83
74static inline struct sock *l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif, u32 tunnel_id)
75{
76 struct sock *sk = __l2tp_ip_bind_lookup(net, laddr, dif, tunnel_id);
77 if (sk)
78 sock_hold(sk);
79
80 return sk;
81}
82
83/* When processing receive frames, there are two cases to 84/* When processing receive frames, there are two cases to
84 * consider. Data frames consist of a non-zero session-id and an 85 * consider. Data frames consist of a non-zero session-id and an
85 * optional cookie. Control frames consist of a regular L2TP header 86 * optional cookie. Control frames consist of a regular L2TP header
@@ -142,19 +143,19 @@ static int l2tp_ip_recv(struct sk_buff *skb)
142 } 143 }
143 144
144 /* Ok, this is a data packet. Lookup the session. */ 145 /* Ok, this is a data packet. Lookup the session. */
145 session = l2tp_session_find(net, NULL, session_id); 146 session = l2tp_session_get(net, NULL, session_id, true);
146 if (session == NULL) 147 if (!session)
147 goto discard; 148 goto discard;
148 149
149 tunnel = session->tunnel; 150 tunnel = session->tunnel;
150 if (tunnel == NULL) 151 if (!tunnel)
151 goto discard; 152 goto discard_sess;
152 153
153 /* Trace packet contents, if enabled */ 154 /* Trace packet contents, if enabled */
154 if (tunnel->debug & L2TP_MSG_DATA) { 155 if (tunnel->debug & L2TP_MSG_DATA) {
155 length = min(32u, skb->len); 156 length = min(32u, skb->len);
156 if (!pskb_may_pull(skb, length)) 157 if (!pskb_may_pull(skb, length))
157 goto discard; 158 goto discard_sess;
158 159
159 /* Point to L2TP header */ 160 /* Point to L2TP header */
160 optr = ptr = skb->data; 161 optr = ptr = skb->data;
@@ -164,6 +165,7 @@ static int l2tp_ip_recv(struct sk_buff *skb)
164 } 165 }
165 166
166 l2tp_recv_common(session, skb, ptr, optr, 0, skb->len, tunnel->recv_payload_hook); 167 l2tp_recv_common(session, skb, ptr, optr, 0, skb->len, tunnel->recv_payload_hook);
168 l2tp_session_dec_refcount(session);
167 169
168 return 0; 170 return 0;
169 171
@@ -177,14 +179,15 @@ pass_up:
177 179
178 tunnel_id = ntohl(*(__be32 *) &skb->data[4]); 180 tunnel_id = ntohl(*(__be32 *) &skb->data[4]);
179 tunnel = l2tp_tunnel_find(net, tunnel_id); 181 tunnel = l2tp_tunnel_find(net, tunnel_id);
180 if (tunnel != NULL) 182 if (tunnel) {
181 sk = tunnel->sock; 183 sk = tunnel->sock;
182 else { 184 sock_hold(sk);
185 } else {
183 struct iphdr *iph = (struct iphdr *) skb_network_header(skb); 186 struct iphdr *iph = (struct iphdr *) skb_network_header(skb);
184 187
185 read_lock_bh(&l2tp_ip_lock); 188 read_lock_bh(&l2tp_ip_lock);
186 sk = __l2tp_ip_bind_lookup(net, iph->daddr, inet_iif(skb), 189 sk = __l2tp_ip_bind_lookup(net, iph->daddr, iph->saddr,
187 tunnel_id); 190 inet_iif(skb), tunnel_id);
188 if (!sk) { 191 if (!sk) {
189 read_unlock_bh(&l2tp_ip_lock); 192 read_unlock_bh(&l2tp_ip_lock);
190 goto discard; 193 goto discard;
@@ -201,6 +204,12 @@ pass_up:
201 204
202 return sk_receive_skb(sk, skb, 1); 205 return sk_receive_skb(sk, skb, 1);
203 206
207discard_sess:
208 if (session->deref)
209 session->deref(session);
210 l2tp_session_dec_refcount(session);
211 goto discard;
212
204discard_put: 213discard_put:
205 sock_put(sk); 214 sock_put(sk);
206 215
@@ -265,7 +274,7 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
265 if (!sock_flag(sk, SOCK_ZAPPED)) 274 if (!sock_flag(sk, SOCK_ZAPPED))
266 goto out; 275 goto out;
267 276
268 if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_l2tpip)) 277 if (sk->sk_state != TCP_CLOSE)
269 goto out; 278 goto out;
270 279
271 chk_addr_ret = inet_addr_type(net, addr->l2tp_addr.s_addr); 280 chk_addr_ret = inet_addr_type(net, addr->l2tp_addr.s_addr);
@@ -280,7 +289,7 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
280 inet->inet_saddr = 0; /* Use device */ 289 inet->inet_saddr = 0; /* Use device */
281 290
282 write_lock_bh(&l2tp_ip_lock); 291 write_lock_bh(&l2tp_ip_lock);
283 if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr, 292 if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr, 0,
284 sk->sk_bound_dev_if, addr->l2tp_conn_id)) { 293 sk->sk_bound_dev_if, addr->l2tp_conn_id)) {
285 write_unlock_bh(&l2tp_ip_lock); 294 write_unlock_bh(&l2tp_ip_lock);
286 ret = -EADDRINUSE; 295 ret = -EADDRINUSE;
@@ -387,7 +396,7 @@ static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb)
387drop: 396drop:
388 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS); 397 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS);
389 kfree_skb(skb); 398 kfree_skb(skb);
390 return -1; 399 return 0;
391} 400}
392 401
393/* Userspace will call sendmsg() on the tunnel socket to send L2TP 402/* Userspace will call sendmsg() on the tunnel socket to send L2TP
@@ -560,6 +569,30 @@ out:
560 return err ? err : copied; 569 return err ? err : copied;
561} 570}
562 571
572int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg)
573{
574 struct sk_buff *skb;
575 int amount;
576
577 switch (cmd) {
578 case SIOCOUTQ:
579 amount = sk_wmem_alloc_get(sk);
580 break;
581 case SIOCINQ:
582 spin_lock_bh(&sk->sk_receive_queue.lock);
583 skb = skb_peek(&sk->sk_receive_queue);
584 amount = skb ? skb->len : 0;
585 spin_unlock_bh(&sk->sk_receive_queue.lock);
586 break;
587
588 default:
589 return -ENOIOCTLCMD;
590 }
591
592 return put_user(amount, (int __user *)arg);
593}
594EXPORT_SYMBOL(l2tp_ioctl);
595
563static struct proto l2tp_ip_prot = { 596static struct proto l2tp_ip_prot = {
564 .name = "L2TP/IP", 597 .name = "L2TP/IP",
565 .owner = THIS_MODULE, 598 .owner = THIS_MODULE,
@@ -568,7 +601,7 @@ static struct proto l2tp_ip_prot = {
568 .bind = l2tp_ip_bind, 601 .bind = l2tp_ip_bind,
569 .connect = l2tp_ip_connect, 602 .connect = l2tp_ip_connect,
570 .disconnect = l2tp_ip_disconnect, 603 .disconnect = l2tp_ip_disconnect,
571 .ioctl = udp_ioctl, 604 .ioctl = l2tp_ioctl,
572 .destroy = l2tp_ip_destroy_sock, 605 .destroy = l2tp_ip_destroy_sock,
573 .setsockopt = ip_setsockopt, 606 .setsockopt = ip_setsockopt,
574 .getsockopt = ip_getsockopt, 607 .getsockopt = ip_getsockopt,
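
L2TP/IP sockets previously routed .ioctl to udp_ioctl(), which interprets the socket as UDP; the new l2tp_ioctl() answers SIOCOUTQ from sk_wmem_alloc and SIOCINQ from the skb at the head of the receive queue. The program below only illustrates what those two ioctls report, using an ordinary Linux UDP socket rather than an L2TP/IP one; the printed values depend on the running system.

```c
/* Hedged illustration of SIOCINQ/SIOCOUTQ semantics on a plain UDP socket:
 * SIOCINQ reports the size of the next pending datagram, SIOCOUTQ the bytes
 * queued for transmit. The patch gives L2TP/IP sockets the same semantics. */
#include <arpa/inet.h>
#include <linux/sockios.h>   /* SIOCINQ, SIOCOUTQ */
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in addr = { .sin_family = AF_INET };
	socklen_t alen = sizeof(addr);
	const char msg[] = "ping";
	int inq = 0, outq = 0;

	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));   /* ephemeral port */
	getsockname(fd, (struct sockaddr *)&addr, &alen);

	/* Send a datagram to ourselves so something sits in the receive queue. */
	sendto(fd, msg, sizeof(msg), 0, (struct sockaddr *)&addr, sizeof(addr));
	usleep(10000);                   /* give loopback delivery a moment */

	ioctl(fd, SIOCINQ, &inq);        /* size of the next pending datagram */
	ioctl(fd, SIOCOUTQ, &outq);      /* bytes not yet pushed to the wire */
	printf("SIOCINQ=%d SIOCOUTQ=%d\n", inq, outq);

	close(fd);
	return 0;
}
```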
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index aa821cb639e5..88b397c30d86 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -57,25 +57,36 @@ static inline struct l2tp_ip6_sock *l2tp_ip6_sk(const struct sock *sk)
57 return (struct l2tp_ip6_sock *)sk; 57 return (struct l2tp_ip6_sock *)sk;
58} 58}
59 59
60static struct sock *__l2tp_ip6_bind_lookup(struct net *net, 60static struct sock *__l2tp_ip6_bind_lookup(const struct net *net,
61 struct in6_addr *laddr, 61 const struct in6_addr *laddr,
62 const struct in6_addr *raddr,
62 int dif, u32 tunnel_id) 63 int dif, u32 tunnel_id)
63{ 64{
64 struct sock *sk; 65 struct sock *sk;
65 66
66 sk_for_each_bound(sk, &l2tp_ip6_bind_table) { 67 sk_for_each_bound(sk, &l2tp_ip6_bind_table) {
67 const struct in6_addr *addr = inet6_rcv_saddr(sk); 68 const struct in6_addr *sk_laddr = inet6_rcv_saddr(sk);
68 struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk); 69 const struct in6_addr *sk_raddr = &sk->sk_v6_daddr;
70 const struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk);
69 71
70 if (l2tp == NULL) 72 if (!net_eq(sock_net(sk), net))
71 continue; 73 continue;
72 74
73 if ((l2tp->conn_id == tunnel_id) && 75 if (sk->sk_bound_dev_if && dif && sk->sk_bound_dev_if != dif)
74 net_eq(sock_net(sk), net) && 76 continue;
75 (!addr || ipv6_addr_equal(addr, laddr)) && 77
76 (!sk->sk_bound_dev_if || !dif || 78 if (sk_laddr && !ipv6_addr_any(sk_laddr) &&
77 sk->sk_bound_dev_if == dif)) 79 !ipv6_addr_any(laddr) && !ipv6_addr_equal(sk_laddr, laddr))
78 goto found; 80 continue;
81
82 if (!ipv6_addr_any(sk_raddr) && raddr &&
83 !ipv6_addr_any(raddr) && !ipv6_addr_equal(sk_raddr, raddr))
84 continue;
85
86 if (l2tp->conn_id != tunnel_id)
87 continue;
88
89 goto found;
79 } 90 }
80 91
81 sk = NULL; 92 sk = NULL;
@@ -83,17 +94,6 @@ found:
83 return sk; 94 return sk;
84} 95}
85 96
86static inline struct sock *l2tp_ip6_bind_lookup(struct net *net,
87 struct in6_addr *laddr,
88 int dif, u32 tunnel_id)
89{
90 struct sock *sk = __l2tp_ip6_bind_lookup(net, laddr, dif, tunnel_id);
91 if (sk)
92 sock_hold(sk);
93
94 return sk;
95}
96
97/* When processing receive frames, there are two cases to 97/* When processing receive frames, there are two cases to
98 * consider. Data frames consist of a non-zero session-id and an 98 * consider. Data frames consist of a non-zero session-id and an
99 * optional cookie. Control frames consist of a regular L2TP header 99 * optional cookie. Control frames consist of a regular L2TP header
@@ -156,19 +156,19 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
156 } 156 }
157 157
158 /* Ok, this is a data packet. Lookup the session. */ 158 /* Ok, this is a data packet. Lookup the session. */
159 session = l2tp_session_find(net, NULL, session_id); 159 session = l2tp_session_get(net, NULL, session_id, true);
160 if (session == NULL) 160 if (!session)
161 goto discard; 161 goto discard;
162 162
163 tunnel = session->tunnel; 163 tunnel = session->tunnel;
164 if (tunnel == NULL) 164 if (!tunnel)
165 goto discard; 165 goto discard_sess;
166 166
167 /* Trace packet contents, if enabled */ 167 /* Trace packet contents, if enabled */
168 if (tunnel->debug & L2TP_MSG_DATA) { 168 if (tunnel->debug & L2TP_MSG_DATA) {
169 length = min(32u, skb->len); 169 length = min(32u, skb->len);
170 if (!pskb_may_pull(skb, length)) 170 if (!pskb_may_pull(skb, length))
171 goto discard; 171 goto discard_sess;
172 172
173 /* Point to L2TP header */ 173 /* Point to L2TP header */
174 optr = ptr = skb->data; 174 optr = ptr = skb->data;
@@ -179,6 +179,8 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
179 179
180 l2tp_recv_common(session, skb, ptr, optr, 0, skb->len, 180 l2tp_recv_common(session, skb, ptr, optr, 0, skb->len,
181 tunnel->recv_payload_hook); 181 tunnel->recv_payload_hook);
182 l2tp_session_dec_refcount(session);
183
182 return 0; 184 return 0;
183 185
184pass_up: 186pass_up:
@@ -191,14 +193,15 @@ pass_up:
191 193
192 tunnel_id = ntohl(*(__be32 *) &skb->data[4]); 194 tunnel_id = ntohl(*(__be32 *) &skb->data[4]);
193 tunnel = l2tp_tunnel_find(net, tunnel_id); 195 tunnel = l2tp_tunnel_find(net, tunnel_id);
194 if (tunnel != NULL) 196 if (tunnel) {
195 sk = tunnel->sock; 197 sk = tunnel->sock;
196 else { 198 sock_hold(sk);
199 } else {
197 struct ipv6hdr *iph = ipv6_hdr(skb); 200 struct ipv6hdr *iph = ipv6_hdr(skb);
198 201
199 read_lock_bh(&l2tp_ip6_lock); 202 read_lock_bh(&l2tp_ip6_lock);
200 sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, inet6_iif(skb), 203 sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, &iph->saddr,
201 tunnel_id); 204 inet6_iif(skb), tunnel_id);
202 if (!sk) { 205 if (!sk) {
203 read_unlock_bh(&l2tp_ip6_lock); 206 read_unlock_bh(&l2tp_ip6_lock);
204 goto discard; 207 goto discard;
@@ -215,6 +218,12 @@ pass_up:
215 218
216 return sk_receive_skb(sk, skb, 1); 219 return sk_receive_skb(sk, skb, 1);
217 220
221discard_sess:
222 if (session->deref)
223 session->deref(session);
224 l2tp_session_dec_refcount(session);
225 goto discard;
226
218discard_put: 227discard_put:
219 sock_put(sk); 228 sock_put(sk);
220 229
@@ -330,7 +339,7 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
330 rcu_read_unlock(); 339 rcu_read_unlock();
331 340
332 write_lock_bh(&l2tp_ip6_lock); 341 write_lock_bh(&l2tp_ip6_lock);
333 if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, bound_dev_if, 342 if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, NULL, bound_dev_if,
334 addr->l2tp_conn_id)) { 343 addr->l2tp_conn_id)) {
335 write_unlock_bh(&l2tp_ip6_lock); 344 write_unlock_bh(&l2tp_ip6_lock);
336 err = -EADDRINUSE; 345 err = -EADDRINUSE;
@@ -525,6 +534,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
525 memset(&fl6, 0, sizeof(fl6)); 534 memset(&fl6, 0, sizeof(fl6));
526 535
527 fl6.flowi6_mark = sk->sk_mark; 536 fl6.flowi6_mark = sk->sk_mark;
537 fl6.flowi6_uid = sk->sk_uid;
528 538
529 ipc6.hlimit = -1; 539 ipc6.hlimit = -1;
530 ipc6.tclass = -1; 540 ipc6.tclass = -1;
@@ -657,7 +667,8 @@ out:
657 return err < 0 ? err : len; 667 return err < 0 ? err : len;
658 668
659do_confirm: 669do_confirm:
660 dst_confirm(dst); 670 if (msg->msg_flags & MSG_PROBE)
671 dst_confirm_neigh(dst, &fl6.daddr);
661 if (!(msg->msg_flags & MSG_PROBE) || len) 672 if (!(msg->msg_flags & MSG_PROBE) || len)
662 goto back_from_confirm; 673 goto back_from_confirm;
663 err = 0; 674 err = 0;
@@ -729,7 +740,7 @@ static struct proto l2tp_ip6_prot = {
729 .bind = l2tp_ip6_bind, 740 .bind = l2tp_ip6_bind,
730 .connect = l2tp_ip6_connect, 741 .connect = l2tp_ip6_connect,
731 .disconnect = l2tp_ip6_disconnect, 742 .disconnect = l2tp_ip6_disconnect,
732 .ioctl = udp_ioctl, 743 .ioctl = l2tp_ioctl,
733 .destroy = l2tp_ip6_destroy_sock, 744 .destroy = l2tp_ip6_destroy_sock,
734 .setsockopt = ipv6_setsockopt, 745 .setsockopt = ipv6_setsockopt,
735 .getsockopt = ipv6_getsockopt, 746 .getsockopt = ipv6_getsockopt,
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index bf3117771822..7e3e669baac4 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -31,14 +31,7 @@
31#include "l2tp_core.h" 31#include "l2tp_core.h"
32 32
33 33
34static struct genl_family l2tp_nl_family = { 34static struct genl_family l2tp_nl_family;
35 .id = GENL_ID_GENERATE,
36 .name = L2TP_GENL_NAME,
37 .version = L2TP_GENL_VERSION,
38 .hdrsize = 0,
39 .maxattr = L2TP_ATTR_MAX,
40 .netnsok = true,
41};
42 35
43static const struct genl_multicast_group l2tp_multicast_group[] = { 36static const struct genl_multicast_group l2tp_multicast_group[] = {
44 { 37 {
@@ -55,7 +48,8 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq,
55/* Accessed under genl lock */ 48/* Accessed under genl lock */
56static const struct l2tp_nl_cmd_ops *l2tp_nl_cmd_ops[__L2TP_PWTYPE_MAX]; 49static const struct l2tp_nl_cmd_ops *l2tp_nl_cmd_ops[__L2TP_PWTYPE_MAX];
57 50
58static struct l2tp_session *l2tp_nl_session_find(struct genl_info *info) 51static struct l2tp_session *l2tp_nl_session_get(struct genl_info *info,
52 bool do_ref)
59{ 53{
60 u32 tunnel_id; 54 u32 tunnel_id;
61 u32 session_id; 55 u32 session_id;
@@ -66,14 +60,15 @@ static struct l2tp_session *l2tp_nl_session_find(struct genl_info *info)
66 60
67 if (info->attrs[L2TP_ATTR_IFNAME]) { 61 if (info->attrs[L2TP_ATTR_IFNAME]) {
68 ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]); 62 ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]);
69 session = l2tp_session_find_by_ifname(net, ifname); 63 session = l2tp_session_get_by_ifname(net, ifname, do_ref);
70 } else if ((info->attrs[L2TP_ATTR_SESSION_ID]) && 64 } else if ((info->attrs[L2TP_ATTR_SESSION_ID]) &&
71 (info->attrs[L2TP_ATTR_CONN_ID])) { 65 (info->attrs[L2TP_ATTR_CONN_ID])) {
72 tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); 66 tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]);
73 session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]); 67 session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]);
74 tunnel = l2tp_tunnel_find(net, tunnel_id); 68 tunnel = l2tp_tunnel_find(net, tunnel_id);
75 if (tunnel) 69 if (tunnel)
76 session = l2tp_session_find(net, tunnel, session_id); 70 session = l2tp_session_get(net, tunnel, session_id,
71 do_ref);
77 } 72 }
78 73
79 return session; 74 return session;
@@ -227,14 +222,14 @@ static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info
227 cfg.local_udp_port = nla_get_u16(info->attrs[L2TP_ATTR_UDP_SPORT]); 222 cfg.local_udp_port = nla_get_u16(info->attrs[L2TP_ATTR_UDP_SPORT]);
228 if (info->attrs[L2TP_ATTR_UDP_DPORT]) 223 if (info->attrs[L2TP_ATTR_UDP_DPORT])
229 cfg.peer_udp_port = nla_get_u16(info->attrs[L2TP_ATTR_UDP_DPORT]); 224 cfg.peer_udp_port = nla_get_u16(info->attrs[L2TP_ATTR_UDP_DPORT]);
230 if (info->attrs[L2TP_ATTR_UDP_CSUM]) 225 cfg.use_udp_checksums = nla_get_flag(
231 cfg.use_udp_checksums = nla_get_flag(info->attrs[L2TP_ATTR_UDP_CSUM]); 226 info->attrs[L2TP_ATTR_UDP_CSUM]);
232 227
233#if IS_ENABLED(CONFIG_IPV6) 228#if IS_ENABLED(CONFIG_IPV6)
234 if (info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX]) 229 cfg.udp6_zero_tx_checksums = nla_get_flag(
235 cfg.udp6_zero_tx_checksums = nla_get_flag(info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX]); 230 info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX]);
236 if (info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX]) 231 cfg.udp6_zero_rx_checksums = nla_get_flag(
237 cfg.udp6_zero_rx_checksums = nla_get_flag(info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX]); 232 info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX]);
238#endif 233#endif
239 } 234 }
240 235
@@ -386,9 +381,24 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla
386 381
387 switch (tunnel->encap) { 382 switch (tunnel->encap) {
388 case L2TP_ENCAPTYPE_UDP: 383 case L2TP_ENCAPTYPE_UDP:
384 switch (sk->sk_family) {
385 case AF_INET:
386 if (nla_put_u8(skb, L2TP_ATTR_UDP_CSUM, !sk->sk_no_check_tx))
387 goto nla_put_failure;
388 break;
389#if IS_ENABLED(CONFIG_IPV6)
390 case AF_INET6:
391 if (udp_get_no_check6_tx(sk) &&
392 nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_TX))
393 goto nla_put_failure;
394 if (udp_get_no_check6_rx(sk) &&
395 nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_RX))
396 goto nla_put_failure;
397 break;
398#endif
399 }
389 if (nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) || 400 if (nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) ||
390 nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport)) || 401 nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport)))
391 nla_put_u8(skb, L2TP_ATTR_UDP_CSUM, !sk->sk_no_check_tx))
392 goto nla_put_failure; 402 goto nla_put_failure;
393 /* NOBREAK */ 403 /* NOBREAK */
394 case L2TP_ENCAPTYPE_IP: 404 case L2TP_ENCAPTYPE_IP:
@@ -634,10 +644,12 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
634 session_id, peer_session_id, &cfg); 644 session_id, peer_session_id, &cfg);
635 645
636 if (ret >= 0) { 646 if (ret >= 0) {
637 session = l2tp_session_find(net, tunnel, session_id); 647 session = l2tp_session_get(net, tunnel, session_id, false);
638 if (session) 648 if (session) {
639 ret = l2tp_session_notify(&l2tp_nl_family, info, session, 649 ret = l2tp_session_notify(&l2tp_nl_family, info, session,
640 L2TP_CMD_SESSION_CREATE); 650 L2TP_CMD_SESSION_CREATE);
651 l2tp_session_dec_refcount(session);
652 }
641 } 653 }
642 654
643out: 655out:
@@ -650,7 +662,7 @@ static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *inf
650 struct l2tp_session *session; 662 struct l2tp_session *session;
651 u16 pw_type; 663 u16 pw_type;
652 664
653 session = l2tp_nl_session_find(info); 665 session = l2tp_nl_session_get(info, true);
654 if (session == NULL) { 666 if (session == NULL) {
655 ret = -ENODEV; 667 ret = -ENODEV;
656 goto out; 668 goto out;
@@ -664,6 +676,10 @@ static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *inf
664 if (l2tp_nl_cmd_ops[pw_type] && l2tp_nl_cmd_ops[pw_type]->session_delete) 676 if (l2tp_nl_cmd_ops[pw_type] && l2tp_nl_cmd_ops[pw_type]->session_delete)
665 ret = (*l2tp_nl_cmd_ops[pw_type]->session_delete)(session); 677 ret = (*l2tp_nl_cmd_ops[pw_type]->session_delete)(session);
666 678
679 if (session->deref)
680 session->deref(session);
681 l2tp_session_dec_refcount(session);
682
667out: 683out:
668 return ret; 684 return ret;
669} 685}
@@ -673,7 +689,7 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf
673 int ret = 0; 689 int ret = 0;
674 struct l2tp_session *session; 690 struct l2tp_session *session;
675 691
676 session = l2tp_nl_session_find(info); 692 session = l2tp_nl_session_get(info, false);
677 if (session == NULL) { 693 if (session == NULL) {
678 ret = -ENODEV; 694 ret = -ENODEV;
679 goto out; 695 goto out;
@@ -708,6 +724,8 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf
708 ret = l2tp_session_notify(&l2tp_nl_family, info, 724 ret = l2tp_session_notify(&l2tp_nl_family, info,
709 session, L2TP_CMD_SESSION_MODIFY); 725 session, L2TP_CMD_SESSION_MODIFY);
710 726
727 l2tp_session_dec_refcount(session);
728
711out: 729out:
712 return ret; 730 return ret;
713} 731}
@@ -803,29 +821,34 @@ static int l2tp_nl_cmd_session_get(struct sk_buff *skb, struct genl_info *info)
803 struct sk_buff *msg; 821 struct sk_buff *msg;
804 int ret; 822 int ret;
805 823
806 session = l2tp_nl_session_find(info); 824 session = l2tp_nl_session_get(info, false);
807 if (session == NULL) { 825 if (session == NULL) {
808 ret = -ENODEV; 826 ret = -ENODEV;
809 goto out; 827 goto err;
810 } 828 }
811 829
812 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 830 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
813 if (!msg) { 831 if (!msg) {
814 ret = -ENOMEM; 832 ret = -ENOMEM;
815 goto out; 833 goto err_ref;
816 } 834 }
817 835
818 ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq, 836 ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq,
819 0, session, L2TP_CMD_SESSION_GET); 837 0, session, L2TP_CMD_SESSION_GET);
820 if (ret < 0) 838 if (ret < 0)
821 goto err_out; 839 goto err_ref_msg;
822 840
823 return genlmsg_unicast(genl_info_net(info), msg, info->snd_portid); 841 ret = genlmsg_unicast(genl_info_net(info), msg, info->snd_portid);
824 842
825err_out: 843 l2tp_session_dec_refcount(session);
826 nlmsg_free(msg);
827 844
828out: 845 return ret;
846
847err_ref_msg:
848 nlmsg_free(msg);
849err_ref:
850 l2tp_session_dec_refcount(session);
851err:
829 return ret; 852 return ret;
830} 853}
831 854
@@ -844,7 +867,7 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback
844 goto out; 867 goto out;
845 } 868 }
846 869
847 session = l2tp_session_find_nth(tunnel, si); 870 session = l2tp_session_get_nth(tunnel, si, false);
848 if (session == NULL) { 871 if (session == NULL) {
849 ti++; 872 ti++;
850 tunnel = NULL; 873 tunnel = NULL;
@@ -854,8 +877,11 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback
854 877
855 if (l2tp_nl_session_send(skb, NETLINK_CB(cb->skb).portid, 878 if (l2tp_nl_session_send(skb, NETLINK_CB(cb->skb).portid,
856 cb->nlh->nlmsg_seq, NLM_F_MULTI, 879 cb->nlh->nlmsg_seq, NLM_F_MULTI,
857 session, L2TP_CMD_SESSION_GET) < 0) 880 session, L2TP_CMD_SESSION_GET) < 0) {
881 l2tp_session_dec_refcount(session);
858 break; 882 break;
883 }
884 l2tp_session_dec_refcount(session);
859 885
860 si++; 886 si++;
861 } 887 }
@@ -977,6 +1003,19 @@ static const struct genl_ops l2tp_nl_ops[] = {
977 }, 1003 },
978}; 1004};
979 1005
1006static struct genl_family l2tp_nl_family __ro_after_init = {
1007 .name = L2TP_GENL_NAME,
1008 .version = L2TP_GENL_VERSION,
1009 .hdrsize = 0,
1010 .maxattr = L2TP_ATTR_MAX,
1011 .netnsok = true,
1012 .module = THIS_MODULE,
1013 .ops = l2tp_nl_ops,
1014 .n_ops = ARRAY_SIZE(l2tp_nl_ops),
1015 .mcgrps = l2tp_multicast_group,
1016 .n_mcgrps = ARRAY_SIZE(l2tp_multicast_group),
1017};
1018
980int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops) 1019int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops)
981{ 1020{
982 int ret; 1021 int ret;
@@ -1010,12 +1049,10 @@ void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type)
1010} 1049}
1011EXPORT_SYMBOL_GPL(l2tp_nl_unregister_ops); 1050EXPORT_SYMBOL_GPL(l2tp_nl_unregister_ops);
1012 1051
1013static int l2tp_nl_init(void) 1052static int __init l2tp_nl_init(void)
1014{ 1053{
1015 pr_info("L2TP netlink interface\n"); 1054 pr_info("L2TP netlink interface\n");
1016 return genl_register_family_with_ops_groups(&l2tp_nl_family, 1055 return genl_register_family(&l2tp_nl_family);
1017 l2tp_nl_ops,
1018 l2tp_multicast_group);
1019} 1056}
1020 1057
1021static void l2tp_nl_cleanup(void) 1058static void l2tp_nl_cleanup(void)
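
The l2tp_netlink.c hunks are the 4.10-era generic netlink conversion: the family no longer uses GENL_ID_GENERATE and is no longer registered together with separate ops/group arrays; instead the ops, multicast groups and module owner are members of one statically defined, __ro_after_init genl_family, and l2tp_nl_init() simply calls genl_register_family(). The kernel API itself cannot be exercised from userspace, so the sketch below only shows the shape of the design choice: designated initializers plus ARRAY_SIZE keep an ops table and its count from drifting apart. All names here are invented.

```c
/* Hedged toy sketch: a statically initialized "family" carries its own ops
 * table and count, mirroring the shape of the converted genl_family. */
#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

struct demo_op {
	const char *name;
	int (*doit)(int arg);
};

struct demo_family {
	const char *name;
	const struct demo_op *ops;
	unsigned int n_ops;
};

static int op_get(int arg)    { return arg; }
static int op_create(int arg) { return arg * 2; }

static const struct demo_op demo_ops[] = {
	{ "get",    op_get },
	{ "create", op_create },
};

/* Everything the registration needs lives in one read-only object. */
static const struct demo_family demo_nl_family = {
	.name  = "demo",
	.ops   = demo_ops,
	.n_ops = ARRAY_SIZE(demo_ops),   /* count can never drift from the table */
};

static int register_family(const struct demo_family *fam)
{
	printf("registering %s with %u ops\n", fam->name, fam->n_ops);
	for (unsigned int i = 0; i < fam->n_ops; i++)
		printf("  op %-6s -> %d\n", fam->ops[i].name, fam->ops[i].doit(21));
	return 0;
}

int main(void)
{
	return register_family(&demo_nl_family);
}
```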
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 41d47bfda15c..32ea0f3d868c 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -231,14 +231,14 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int
231 if (sk->sk_state & PPPOX_BOUND) { 231 if (sk->sk_state & PPPOX_BOUND) {
232 struct pppox_sock *po; 232 struct pppox_sock *po;
233 233
234 l2tp_dbg(session, PPPOL2TP_MSG_DATA, 234 l2tp_dbg(session, L2TP_MSG_DATA,
235 "%s: recv %d byte data frame, passing to ppp\n", 235 "%s: recv %d byte data frame, passing to ppp\n",
236 session->name, data_len); 236 session->name, data_len);
237 237
238 po = pppox_sk(sk); 238 po = pppox_sk(sk);
239 ppp_input(&po->chan, skb); 239 ppp_input(&po->chan, skb);
240 } else { 240 } else {
241 l2tp_dbg(session, PPPOL2TP_MSG_DATA, 241 l2tp_dbg(session, L2TP_MSG_DATA,
242 "%s: recv %d byte data frame, passing to L2TP socket\n", 242 "%s: recv %d byte data frame, passing to L2TP socket\n",
243 session->name, data_len); 243 session->name, data_len);
244 244
@@ -251,7 +251,7 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int
251 return; 251 return;
252 252
253no_sock: 253no_sock:
254 l2tp_info(session, PPPOL2TP_MSG_DATA, "%s: no socket\n", session->name); 254 l2tp_info(session, L2TP_MSG_DATA, "%s: no socket\n", session->name);
255 kfree_skb(skb); 255 kfree_skb(skb);
256} 256}
257 257
@@ -450,6 +450,10 @@ static void pppol2tp_session_close(struct l2tp_session *session)
450static void pppol2tp_session_destruct(struct sock *sk) 450static void pppol2tp_session_destruct(struct sock *sk)
451{ 451{
452 struct l2tp_session *session = sk->sk_user_data; 452 struct l2tp_session *session = sk->sk_user_data;
453
454 skb_queue_purge(&sk->sk_receive_queue);
455 skb_queue_purge(&sk->sk_write_queue);
456
453 if (session) { 457 if (session) {
454 sk->sk_user_data = NULL; 458 sk->sk_user_data = NULL;
455 BUG_ON(session->magic != L2TP_SESSION_MAGIC); 459 BUG_ON(session->magic != L2TP_SESSION_MAGIC);
@@ -488,9 +492,6 @@ static int pppol2tp_release(struct socket *sock)
488 l2tp_session_queue_purge(session); 492 l2tp_session_queue_purge(session);
489 sock_put(sk); 493 sock_put(sk);
490 } 494 }
491 skb_queue_purge(&sk->sk_receive_queue);
492 skb_queue_purge(&sk->sk_write_queue);
493
494 release_sock(sk); 495 release_sock(sk);
495 496
496 /* This will delete the session context via 497 /* This will delete the session context via
@@ -582,6 +583,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
582 int error = 0; 583 int error = 0;
583 u32 tunnel_id, peer_tunnel_id; 584 u32 tunnel_id, peer_tunnel_id;
584 u32 session_id, peer_session_id; 585 u32 session_id, peer_session_id;
586 bool drop_refcnt = false;
585 int ver = 2; 587 int ver = 2;
586 int fd; 588 int fd;
587 589
@@ -683,36 +685,36 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
683 if (tunnel->peer_tunnel_id == 0) 685 if (tunnel->peer_tunnel_id == 0)
684 tunnel->peer_tunnel_id = peer_tunnel_id; 686 tunnel->peer_tunnel_id = peer_tunnel_id;
685 687
686 /* Create session if it doesn't already exist. We handle the 688 session = l2tp_session_get(sock_net(sk), tunnel, session_id, false);
687 * case where a session was previously created by the netlink 689 if (session) {
688 * interface by checking that the session doesn't already have 690 drop_refcnt = true;
689 * a socket and its tunnel socket are what we expect. If any 691 ps = l2tp_session_priv(session);
690 * of those checks fail, return EEXIST to the caller. 692
691 */ 693 /* Using a pre-existing session is fine as long as it hasn't
692 session = l2tp_session_find(sock_net(sk), tunnel, session_id); 694 * been connected yet.
693 if (session == NULL) {
694 /* Default MTU must allow space for UDP/L2TP/PPP
695 * headers.
696 */ 695 */
697 cfg.mtu = cfg.mru = 1500 - PPPOL2TP_HEADER_OVERHEAD; 696 if (ps->sock) {
697 error = -EEXIST;
698 goto end;
699 }
698 700
699 /* Allocate and initialize a new session context. */ 701 /* consistency checks */
700 session = l2tp_session_create(sizeof(struct pppol2tp_session), 702 if (ps->tunnel_sock != tunnel->sock) {
701 tunnel, session_id, 703 error = -EEXIST;
702 peer_session_id, &cfg);
703 if (session == NULL) {
704 error = -ENOMEM;
705 goto end; 704 goto end;
706 } 705 }
707 } else { 706 } else {
708 ps = l2tp_session_priv(session); 707 /* Default MTU must allow space for UDP/L2TP/PPP headers */
709 error = -EEXIST; 708 cfg.mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD;
710 if (ps->sock != NULL) 709 cfg.mru = cfg.mtu;
711 goto end;
712 710
713 /* consistency checks */ 711 session = l2tp_session_create(sizeof(struct pppol2tp_session),
714 if (ps->tunnel_sock != tunnel->sock) 712 tunnel, session_id,
713 peer_session_id, &cfg);
714 if (IS_ERR(session)) {
715 error = PTR_ERR(session);
715 goto end; 716 goto end;
717 }
716 } 718 }
717 719
718 /* Associate session with its PPPoL2TP socket */ 720 /* Associate session with its PPPoL2TP socket */
@@ -773,10 +775,12 @@ out_no_ppp:
773 /* This is how we get the session context from the socket. */ 775 /* This is how we get the session context from the socket. */
774 sk->sk_user_data = session; 776 sk->sk_user_data = session;
775 sk->sk_state = PPPOX_CONNECTED; 777 sk->sk_state = PPPOX_CONNECTED;
776 l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: created\n", 778 l2tp_info(session, L2TP_MSG_CONTROL, "%s: created\n",
777 session->name); 779 session->name);
778 780
779end: 781end:
782 if (drop_refcnt)
783 l2tp_session_dec_refcount(session);
780 release_sock(sk); 784 release_sock(sk);
781 785
782 return error; 786 return error;
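
The pppol2tp_connect() rework above replaces the unreferenced l2tp_session_find() lookup with l2tp_session_get(), which returns the session with a reference held; drop_refcnt records whether a reference was taken so the end label can release it. A small sketch of the resulting get/put discipline, with a hypothetical helper that is not in the patch (assumes the declarations from net/l2tp/l2tp_core.h):

static int example_with_session(struct net *net, struct l2tp_tunnel *tunnel,
				u32 session_id)
{
	struct l2tp_session *session;

	/* lookup takes a reference; NULL means no such session */
	session = l2tp_session_get(net, tunnel, session_id, false);
	if (!session)
		return -ENOENT;

	/* ... work on the session while the reference pins it ... */

	l2tp_session_dec_refcount(session);
	return 0;
}
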
@@ -804,12 +808,6 @@ static int pppol2tp_session_create(struct net *net, u32 tunnel_id, u32 session_i
804 if (tunnel->sock == NULL) 808 if (tunnel->sock == NULL)
805 goto out; 809 goto out;
806 810
807 /* Check that this session doesn't already exist */
808 error = -EEXIST;
809 session = l2tp_session_find(net, tunnel, session_id);
810 if (session != NULL)
811 goto out;
812
813 /* Default MTU values. */ 811 /* Default MTU values. */
814 if (cfg->mtu == 0) 812 if (cfg->mtu == 0)
815 cfg->mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD; 813 cfg->mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD;
@@ -817,17 +815,18 @@ static int pppol2tp_session_create(struct net *net, u32 tunnel_id, u32 session_i
817 cfg->mru = cfg->mtu; 815 cfg->mru = cfg->mtu;
818 816
819 /* Allocate and initialize a new session context. */ 817 /* Allocate and initialize a new session context. */
820 error = -ENOMEM;
821 session = l2tp_session_create(sizeof(struct pppol2tp_session), 818 session = l2tp_session_create(sizeof(struct pppol2tp_session),
822 tunnel, session_id, 819 tunnel, session_id,
823 peer_session_id, cfg); 820 peer_session_id, cfg);
824 if (session == NULL) 821 if (IS_ERR(session)) {
822 error = PTR_ERR(session);
825 goto out; 823 goto out;
824 }
826 825
827 ps = l2tp_session_priv(session); 826 ps = l2tp_session_priv(session);
828 ps->tunnel_sock = tunnel->sock; 827 ps->tunnel_sock = tunnel->sock;
829 828
830 l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: created\n", 829 l2tp_info(session, L2TP_MSG_CONTROL, "%s: created\n",
831 session->name); 830 session->name);
832 831
833 error = 0; 832 error = 0;
@@ -989,7 +988,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
989 struct l2tp_tunnel *tunnel = session->tunnel; 988 struct l2tp_tunnel *tunnel = session->tunnel;
990 struct pppol2tp_ioc_stats stats; 989 struct pppol2tp_ioc_stats stats;
991 990
992 l2tp_dbg(session, PPPOL2TP_MSG_CONTROL, 991 l2tp_dbg(session, L2TP_MSG_CONTROL,
993 "%s: pppol2tp_session_ioctl(cmd=%#x, arg=%#lx)\n", 992 "%s: pppol2tp_session_ioctl(cmd=%#x, arg=%#lx)\n",
994 session->name, cmd, arg); 993 session->name, cmd, arg);
995 994
@@ -1009,7 +1008,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
1009 if (copy_to_user((void __user *) arg, &ifr, sizeof(struct ifreq))) 1008 if (copy_to_user((void __user *) arg, &ifr, sizeof(struct ifreq)))
1010 break; 1009 break;
1011 1010
1012 l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get mtu=%d\n", 1011 l2tp_info(session, L2TP_MSG_CONTROL, "%s: get mtu=%d\n",
1013 session->name, session->mtu); 1012 session->name, session->mtu);
1014 err = 0; 1013 err = 0;
1015 break; 1014 break;
@@ -1025,7 +1024,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
1025 1024
1026 session->mtu = ifr.ifr_mtu; 1025 session->mtu = ifr.ifr_mtu;
1027 1026
1028 l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: set mtu=%d\n", 1027 l2tp_info(session, L2TP_MSG_CONTROL, "%s: set mtu=%d\n",
1029 session->name, session->mtu); 1028 session->name, session->mtu);
1030 err = 0; 1029 err = 0;
1031 break; 1030 break;
@@ -1039,7 +1038,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
1039 if (put_user(session->mru, (int __user *) arg)) 1038 if (put_user(session->mru, (int __user *) arg))
1040 break; 1039 break;
1041 1040
1042 l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get mru=%d\n", 1041 l2tp_info(session, L2TP_MSG_CONTROL, "%s: get mru=%d\n",
1043 session->name, session->mru); 1042 session->name, session->mru);
1044 err = 0; 1043 err = 0;
1045 break; 1044 break;
@@ -1054,7 +1053,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
1054 break; 1053 break;
1055 1054
1056 session->mru = val; 1055 session->mru = val;
1057 l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: set mru=%d\n", 1056 l2tp_info(session, L2TP_MSG_CONTROL, "%s: set mru=%d\n",
1058 session->name, session->mru); 1057 session->name, session->mru);
1059 err = 0; 1058 err = 0;
1060 break; 1059 break;
@@ -1064,7 +1063,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
1064 if (put_user(ps->flags, (int __user *) arg)) 1063 if (put_user(ps->flags, (int __user *) arg))
1065 break; 1064 break;
1066 1065
1067 l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get flags=%d\n", 1066 l2tp_info(session, L2TP_MSG_CONTROL, "%s: get flags=%d\n",
1068 session->name, ps->flags); 1067 session->name, ps->flags);
1069 err = 0; 1068 err = 0;
1070 break; 1069 break;
@@ -1074,7 +1073,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
1074 if (get_user(val, (int __user *) arg)) 1073 if (get_user(val, (int __user *) arg))
1075 break; 1074 break;
1076 ps->flags = val; 1075 ps->flags = val;
1077 l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: set flags=%d\n", 1076 l2tp_info(session, L2TP_MSG_CONTROL, "%s: set flags=%d\n",
1078 session->name, ps->flags); 1077 session->name, ps->flags);
1079 err = 0; 1078 err = 0;
1080 break; 1079 break;
@@ -1091,7 +1090,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
1091 if (copy_to_user((void __user *) arg, &stats, 1090 if (copy_to_user((void __user *) arg, &stats,
1092 sizeof(stats))) 1091 sizeof(stats)))
1093 break; 1092 break;
1094 l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get L2TP stats\n", 1093 l2tp_info(session, L2TP_MSG_CONTROL, "%s: get L2TP stats\n",
1095 session->name); 1094 session->name);
1096 err = 0; 1095 err = 0;
1097 break; 1096 break;
@@ -1119,7 +1118,7 @@ static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel,
1119 struct sock *sk; 1118 struct sock *sk;
1120 struct pppol2tp_ioc_stats stats; 1119 struct pppol2tp_ioc_stats stats;
1121 1120
1122 l2tp_dbg(tunnel, PPPOL2TP_MSG_CONTROL, 1121 l2tp_dbg(tunnel, L2TP_MSG_CONTROL,
1123 "%s: pppol2tp_tunnel_ioctl(cmd=%#x, arg=%#lx)\n", 1122 "%s: pppol2tp_tunnel_ioctl(cmd=%#x, arg=%#lx)\n",
1124 tunnel->name, cmd, arg); 1123 tunnel->name, cmd, arg);
1125 1124
@@ -1140,11 +1139,18 @@ static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel,
1140 if (stats.session_id != 0) { 1139 if (stats.session_id != 0) {
1141 /* resend to session ioctl handler */ 1140 /* resend to session ioctl handler */
1142 struct l2tp_session *session = 1141 struct l2tp_session *session =
1143 l2tp_session_find(sock_net(sk), tunnel, stats.session_id); 1142 l2tp_session_get(sock_net(sk), tunnel,
1144 if (session != NULL) 1143 stats.session_id, true);
1145 err = pppol2tp_session_ioctl(session, cmd, arg); 1144
1146 else 1145 if (session) {
1146 err = pppol2tp_session_ioctl(session, cmd,
1147 arg);
1148 if (session->deref)
1149 session->deref(session);
1150 l2tp_session_dec_refcount(session);
1151 } else {
1147 err = -EBADR; 1152 err = -EBADR;
1153 }
1148 break; 1154 break;
1149 } 1155 }
1150#ifdef CONFIG_XFRM 1156#ifdef CONFIG_XFRM
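
When l2tp_session_get() is called with do_ref set, as in the tunnel ioctl hunk above, the session's ref() hook has also run, so the caller undoes both before letting go. A sketch of that release pairing, mirroring the cleanup done after pppol2tp_session_ioctl() (the helper name is hypothetical):

static void example_put_session(struct l2tp_session *session)
{
	/* undo the optional ref() taken by l2tp_session_get(..., true) */
	if (session->deref)
		session->deref(session);
	l2tp_session_dec_refcount(session);
}
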
@@ -1155,7 +1161,7 @@ static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel,
1155 err = -EFAULT; 1161 err = -EFAULT;
1156 break; 1162 break;
1157 } 1163 }
1158 l2tp_info(tunnel, PPPOL2TP_MSG_CONTROL, "%s: get L2TP stats\n", 1164 l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: get L2TP stats\n",
1159 tunnel->name); 1165 tunnel->name);
1160 err = 0; 1166 err = 0;
1161 break; 1167 break;
@@ -1245,7 +1251,7 @@ static int pppol2tp_tunnel_setsockopt(struct sock *sk,
1245 switch (optname) { 1251 switch (optname) {
1246 case PPPOL2TP_SO_DEBUG: 1252 case PPPOL2TP_SO_DEBUG:
1247 tunnel->debug = val; 1253 tunnel->debug = val;
1248 l2tp_info(tunnel, PPPOL2TP_MSG_CONTROL, "%s: set debug=%x\n", 1254 l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: set debug=%x\n",
1249 tunnel->name, tunnel->debug); 1255 tunnel->name, tunnel->debug);
1250 break; 1256 break;
1251 1257
@@ -1272,8 +1278,8 @@ static int pppol2tp_session_setsockopt(struct sock *sk,
1272 err = -EINVAL; 1278 err = -EINVAL;
1273 break; 1279 break;
1274 } 1280 }
1275 session->recv_seq = val ? -1 : 0; 1281 session->recv_seq = !!val;
1276 l2tp_info(session, PPPOL2TP_MSG_CONTROL, 1282 l2tp_info(session, L2TP_MSG_CONTROL,
1277 "%s: set recv_seq=%d\n", 1283 "%s: set recv_seq=%d\n",
1278 session->name, session->recv_seq); 1284 session->name, session->recv_seq);
1279 break; 1285 break;
@@ -1283,7 +1289,7 @@ static int pppol2tp_session_setsockopt(struct sock *sk,
1283 err = -EINVAL; 1289 err = -EINVAL;
1284 break; 1290 break;
1285 } 1291 }
1286 session->send_seq = val ? -1 : 0; 1292 session->send_seq = !!val;
1287 { 1293 {
1288 struct sock *ssk = ps->sock; 1294 struct sock *ssk = ps->sock;
1289 struct pppox_sock *po = pppox_sk(ssk); 1295 struct pppox_sock *po = pppox_sk(ssk);
@@ -1291,7 +1297,7 @@ static int pppol2tp_session_setsockopt(struct sock *sk,
1291 PPPOL2TP_L2TP_HDR_SIZE_NOSEQ; 1297 PPPOL2TP_L2TP_HDR_SIZE_NOSEQ;
1292 } 1298 }
1293 l2tp_session_set_header_len(session, session->tunnel->version); 1299 l2tp_session_set_header_len(session, session->tunnel->version);
1294 l2tp_info(session, PPPOL2TP_MSG_CONTROL, 1300 l2tp_info(session, L2TP_MSG_CONTROL,
1295 "%s: set send_seq=%d\n", 1301 "%s: set send_seq=%d\n",
1296 session->name, session->send_seq); 1302 session->name, session->send_seq);
1297 break; 1303 break;
@@ -1301,21 +1307,21 @@ static int pppol2tp_session_setsockopt(struct sock *sk,
1301 err = -EINVAL; 1307 err = -EINVAL;
1302 break; 1308 break;
1303 } 1309 }
1304 session->lns_mode = val ? -1 : 0; 1310 session->lns_mode = !!val;
1305 l2tp_info(session, PPPOL2TP_MSG_CONTROL, 1311 l2tp_info(session, L2TP_MSG_CONTROL,
1306 "%s: set lns_mode=%d\n", 1312 "%s: set lns_mode=%d\n",
1307 session->name, session->lns_mode); 1313 session->name, session->lns_mode);
1308 break; 1314 break;
1309 1315
1310 case PPPOL2TP_SO_DEBUG: 1316 case PPPOL2TP_SO_DEBUG:
1311 session->debug = val; 1317 session->debug = val;
1312 l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: set debug=%x\n", 1318 l2tp_info(session, L2TP_MSG_CONTROL, "%s: set debug=%x\n",
1313 session->name, session->debug); 1319 session->name, session->debug);
1314 break; 1320 break;
1315 1321
1316 case PPPOL2TP_SO_REORDERTO: 1322 case PPPOL2TP_SO_REORDERTO:
1317 session->reorder_timeout = msecs_to_jiffies(val); 1323 session->reorder_timeout = msecs_to_jiffies(val);
1318 l2tp_info(session, PPPOL2TP_MSG_CONTROL, 1324 l2tp_info(session, L2TP_MSG_CONTROL,
1319 "%s: set reorder_timeout=%d\n", 1325 "%s: set reorder_timeout=%d\n",
1320 session->name, session->reorder_timeout); 1326 session->name, session->reorder_timeout);
1321 break; 1327 break;
@@ -1377,8 +1383,6 @@ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname,
1377 } else 1383 } else
1378 err = pppol2tp_session_setsockopt(sk, session, optname, val); 1384 err = pppol2tp_session_setsockopt(sk, session, optname, val);
1379 1385
1380 err = 0;
1381
1382end_put_sess: 1386end_put_sess:
1383 sock_put(sk); 1387 sock_put(sk);
1384end: 1388end:
@@ -1396,7 +1400,7 @@ static int pppol2tp_tunnel_getsockopt(struct sock *sk,
1396 switch (optname) { 1400 switch (optname) {
1397 case PPPOL2TP_SO_DEBUG: 1401 case PPPOL2TP_SO_DEBUG:
1398 *val = tunnel->debug; 1402 *val = tunnel->debug;
1399 l2tp_info(tunnel, PPPOL2TP_MSG_CONTROL, "%s: get debug=%x\n", 1403 l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: get debug=%x\n",
1400 tunnel->name, tunnel->debug); 1404 tunnel->name, tunnel->debug);
1401 break; 1405 break;
1402 1406
@@ -1419,31 +1423,31 @@ static int pppol2tp_session_getsockopt(struct sock *sk,
1419 switch (optname) { 1423 switch (optname) {
1420 case PPPOL2TP_SO_RECVSEQ: 1424 case PPPOL2TP_SO_RECVSEQ:
1421 *val = session->recv_seq; 1425 *val = session->recv_seq;
1422 l2tp_info(session, PPPOL2TP_MSG_CONTROL, 1426 l2tp_info(session, L2TP_MSG_CONTROL,
1423 "%s: get recv_seq=%d\n", session->name, *val); 1427 "%s: get recv_seq=%d\n", session->name, *val);
1424 break; 1428 break;
1425 1429
1426 case PPPOL2TP_SO_SENDSEQ: 1430 case PPPOL2TP_SO_SENDSEQ:
1427 *val = session->send_seq; 1431 *val = session->send_seq;
1428 l2tp_info(session, PPPOL2TP_MSG_CONTROL, 1432 l2tp_info(session, L2TP_MSG_CONTROL,
1429 "%s: get send_seq=%d\n", session->name, *val); 1433 "%s: get send_seq=%d\n", session->name, *val);
1430 break; 1434 break;
1431 1435
1432 case PPPOL2TP_SO_LNSMODE: 1436 case PPPOL2TP_SO_LNSMODE:
1433 *val = session->lns_mode; 1437 *val = session->lns_mode;
1434 l2tp_info(session, PPPOL2TP_MSG_CONTROL, 1438 l2tp_info(session, L2TP_MSG_CONTROL,
1435 "%s: get lns_mode=%d\n", session->name, *val); 1439 "%s: get lns_mode=%d\n", session->name, *val);
1436 break; 1440 break;
1437 1441
1438 case PPPOL2TP_SO_DEBUG: 1442 case PPPOL2TP_SO_DEBUG:
1439 *val = session->debug; 1443 *val = session->debug;
1440 l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get debug=%d\n", 1444 l2tp_info(session, L2TP_MSG_CONTROL, "%s: get debug=%d\n",
1441 session->name, *val); 1445 session->name, *val);
1442 break; 1446 break;
1443 1447
1444 case PPPOL2TP_SO_REORDERTO: 1448 case PPPOL2TP_SO_REORDERTO:
1445 *val = (int) jiffies_to_msecs(session->reorder_timeout); 1449 *val = (int) jiffies_to_msecs(session->reorder_timeout);
1446 l2tp_info(session, PPPOL2TP_MSG_CONTROL, 1450 l2tp_info(session, L2TP_MSG_CONTROL,
1447 "%s: get reorder_timeout=%d\n", session->name, *val); 1451 "%s: get reorder_timeout=%d\n", session->name, *val);
1448 break; 1452 break;
1449 1453
@@ -1501,8 +1505,13 @@ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname,
1501 1505
1502 err = pppol2tp_tunnel_getsockopt(sk, tunnel, optname, &val); 1506 err = pppol2tp_tunnel_getsockopt(sk, tunnel, optname, &val);
1503 sock_put(ps->tunnel_sock); 1507 sock_put(ps->tunnel_sock);
1504 } else 1508 if (err)
1509 goto end_put_sess;
1510 } else {
1505 err = pppol2tp_session_getsockopt(sk, session, optname, &val); 1511 err = pppol2tp_session_getsockopt(sk, session, optname, &val);
1512 if (err)
1513 goto end_put_sess;
1514 }
1506 1515
1507 err = -EFAULT; 1516 err = -EFAULT;
1508 if (put_user(len, optlen)) 1517 if (put_user(len, optlen))
@@ -1554,7 +1563,7 @@ static void pppol2tp_next_tunnel(struct net *net, struct pppol2tp_seq_data *pd)
1554 1563
1555static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd) 1564static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd)
1556{ 1565{
1557 pd->session = l2tp_session_find_nth(pd->tunnel, pd->session_idx); 1566 pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx, true);
1558 pd->session_idx++; 1567 pd->session_idx++;
1559 1568
1560 if (pd->session == NULL) { 1569 if (pd->session == NULL) {
@@ -1681,10 +1690,14 @@ static int pppol2tp_seq_show(struct seq_file *m, void *v)
1681 1690
1682 /* Show the tunnel or session context. 1691 /* Show the tunnel or session context.
1683 */ 1692 */
1684 if (pd->session == NULL) 1693 if (!pd->session) {
1685 pppol2tp_seq_tunnel_show(m, pd->tunnel); 1694 pppol2tp_seq_tunnel_show(m, pd->tunnel);
1686 else 1695 } else {
1687 pppol2tp_seq_session_show(m, pd->session); 1696 pppol2tp_seq_session_show(m, pd->session);
1697 if (pd->session->deref)
1698 pd->session->deref(pd->session);
1699 l2tp_session_dec_refcount(pd->session);
1700 }
1688 1701
1689out: 1702out:
1690 return 0; 1703 return 0;
@@ -1843,4 +1856,4 @@ MODULE_DESCRIPTION("PPP over L2TP over UDP");
1843MODULE_LICENSE("GPL"); 1856MODULE_LICENSE("GPL");
1844MODULE_VERSION(PPPOL2TP_DRV_VERSION); 1857MODULE_VERSION(PPPOL2TP_DRV_VERSION);
1845MODULE_ALIAS_NET_PF_PROTO(PF_PPPOX, PX_PROTO_OL2TP); 1858MODULE_ALIAS_NET_PF_PROTO(PF_PPPOX, PX_PROTO_OL2TP);
1846MODULE_ALIAS_L2TP_PWTYPE(11); 1859MODULE_ALIAS_L2TP_PWTYPE(7);
diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c
index fc60d9d738b5..b50b64ac8815 100644
--- a/net/lapb/lapb_iface.c
+++ b/net/lapb/lapb_iface.c
@@ -33,7 +33,7 @@
33#include <linux/skbuff.h> 33#include <linux/skbuff.h>
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <net/sock.h> 35#include <net/sock.h>
36#include <asm/uaccess.h> 36#include <linux/uaccess.h>
37#include <linux/fcntl.h> 37#include <linux/fcntl.h>
38#include <linux/mm.h> 38#include <linux/mm.h>
39#include <linux/interrupt.h> 39#include <linux/interrupt.h>
diff --git a/net/lapb/lapb_in.c b/net/lapb/lapb_in.c
index 182470847fcf..d5d2110eb717 100644
--- a/net/lapb/lapb_in.c
+++ b/net/lapb/lapb_in.c
@@ -31,7 +31,7 @@
31#include <linux/skbuff.h> 31#include <linux/skbuff.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <net/sock.h> 33#include <net/sock.h>
34#include <asm/uaccess.h> 34#include <linux/uaccess.h>
35#include <linux/fcntl.h> 35#include <linux/fcntl.h>
36#include <linux/mm.h> 36#include <linux/mm.h>
37#include <linux/interrupt.h> 37#include <linux/interrupt.h>
diff --git a/net/lapb/lapb_out.c b/net/lapb/lapb_out.c
index 482c94d9d958..eda726e22f64 100644
--- a/net/lapb/lapb_out.c
+++ b/net/lapb/lapb_out.c
@@ -29,7 +29,7 @@
29#include <linux/skbuff.h> 29#include <linux/skbuff.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <net/sock.h> 31#include <net/sock.h>
32#include <asm/uaccess.h> 32#include <linux/uaccess.h>
33#include <linux/fcntl.h> 33#include <linux/fcntl.h>
34#include <linux/mm.h> 34#include <linux/mm.h>
35#include <linux/interrupt.h> 35#include <linux/interrupt.h>
diff --git a/net/lapb/lapb_subr.c b/net/lapb/lapb_subr.c
index 3c1914df641f..75efde3e616c 100644
--- a/net/lapb/lapb_subr.c
+++ b/net/lapb/lapb_subr.c
@@ -28,7 +28,7 @@
28#include <linux/skbuff.h> 28#include <linux/skbuff.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <net/sock.h> 30#include <net/sock.h>
31#include <asm/uaccess.h> 31#include <linux/uaccess.h>
32#include <linux/fcntl.h> 32#include <linux/fcntl.h>
33#include <linux/mm.h> 33#include <linux/mm.h>
34#include <linux/interrupt.h> 34#include <linux/interrupt.h>
diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c
index 355cc3b6fa4d..1a5535bc3b8d 100644
--- a/net/lapb/lapb_timer.c
+++ b/net/lapb/lapb_timer.c
@@ -29,7 +29,7 @@
29#include <linux/inet.h> 29#include <linux/inet.h>
30#include <linux/skbuff.h> 30#include <linux/skbuff.h>
31#include <net/sock.h> 31#include <net/sock.h>
32#include <asm/uaccess.h> 32#include <linux/uaccess.h>
33#include <linux/fcntl.h> 33#include <linux/fcntl.h>
34#include <linux/mm.h> 34#include <linux/mm.h>
35#include <linux/interrupt.h> 35#include <linux/interrupt.h>
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index db916cf51ffe..cb4fff785cbf 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -26,6 +26,8 @@
26#include <linux/rtnetlink.h> 26#include <linux/rtnetlink.h>
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/sched/signal.h>
30
29#include <net/llc.h> 31#include <net/llc.h>
30#include <net/llc_sap.h> 32#include <net/llc_sap.h>
31#include <net/llc_pdu.h> 33#include <net/llc_pdu.h>
@@ -532,12 +534,12 @@ out:
532 534
533static int llc_ui_wait_for_disc(struct sock *sk, long timeout) 535static int llc_ui_wait_for_disc(struct sock *sk, long timeout)
534{ 536{
535 DEFINE_WAIT(wait); 537 DEFINE_WAIT_FUNC(wait, woken_wake_function);
536 int rc = 0; 538 int rc = 0;
537 539
540 add_wait_queue(sk_sleep(sk), &wait);
538 while (1) { 541 while (1) {
539 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 542 if (sk_wait_event(sk, &timeout, sk->sk_state == TCP_CLOSE, &wait))
540 if (sk_wait_event(sk, &timeout, sk->sk_state == TCP_CLOSE))
541 break; 543 break;
542 rc = -ERESTARTSYS; 544 rc = -ERESTARTSYS;
543 if (signal_pending(current)) 545 if (signal_pending(current))
@@ -547,39 +549,39 @@ static int llc_ui_wait_for_disc(struct sock *sk, long timeout)
547 break; 549 break;
548 rc = 0; 550 rc = 0;
549 } 551 }
550 finish_wait(sk_sleep(sk), &wait); 552 remove_wait_queue(sk_sleep(sk), &wait);
551 return rc; 553 return rc;
552} 554}
553 555
554static bool llc_ui_wait_for_conn(struct sock *sk, long timeout) 556static bool llc_ui_wait_for_conn(struct sock *sk, long timeout)
555{ 557{
556 DEFINE_WAIT(wait); 558 DEFINE_WAIT_FUNC(wait, woken_wake_function);
557 559
560 add_wait_queue(sk_sleep(sk), &wait);
558 while (1) { 561 while (1) {
559 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 562 if (sk_wait_event(sk, &timeout, sk->sk_state != TCP_SYN_SENT, &wait))
560 if (sk_wait_event(sk, &timeout, sk->sk_state != TCP_SYN_SENT))
561 break; 563 break;
562 if (signal_pending(current) || !timeout) 564 if (signal_pending(current) || !timeout)
563 break; 565 break;
564 } 566 }
565 finish_wait(sk_sleep(sk), &wait); 567 remove_wait_queue(sk_sleep(sk), &wait);
566 return timeout; 568 return timeout;
567} 569}
568 570
569static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout) 571static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout)
570{ 572{
571 DEFINE_WAIT(wait); 573 DEFINE_WAIT_FUNC(wait, woken_wake_function);
572 struct llc_sock *llc = llc_sk(sk); 574 struct llc_sock *llc = llc_sk(sk);
573 int rc; 575 int rc;
574 576
577 add_wait_queue(sk_sleep(sk), &wait);
575 while (1) { 578 while (1) {
576 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
577 rc = 0; 579 rc = 0;
578 if (sk_wait_event(sk, &timeout, 580 if (sk_wait_event(sk, &timeout,
579 (sk->sk_shutdown & RCV_SHUTDOWN) || 581 (sk->sk_shutdown & RCV_SHUTDOWN) ||
580 (!llc_data_accept_state(llc->state) && 582 (!llc_data_accept_state(llc->state) &&
581 !llc->remote_busy_flag && 583 !llc->remote_busy_flag &&
582 !llc->p_flag))) 584 !llc->p_flag), &wait))
583 break; 585 break;
584 rc = -ERESTARTSYS; 586 rc = -ERESTARTSYS;
585 if (signal_pending(current)) 587 if (signal_pending(current))
@@ -588,7 +590,7 @@ static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout)
588 if (!timeout) 590 if (!timeout)
589 break; 591 break;
590 } 592 }
591 finish_wait(sk_sleep(sk), &wait); 593 remove_wait_queue(sk_sleep(sk), &wait);
592 return rc; 594 return rc;
593} 595}
594 596
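
The af_llc.c hunks above convert the three wait loops from prepare_to_wait()/finish_wait() to an on-stack wait entry built with DEFINE_WAIT_FUNC(wait, woken_wake_function); the entry is queued once outside the loop and passed to sk_wait_event(), which now sleeps via the woken mechanism internally. A condensed sketch of the pattern (illustrative function, not from the patch):

#include <linux/sched/signal.h>
#include <linux/wait.h>
#include <net/sock.h>
#include <net/tcp_states.h>

static int example_wait_for_close(struct sock *sk, long timeout)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int rc = 0;

	add_wait_queue(sk_sleep(sk), &wait);
	while (1) {
		/* sk_wait_event() now takes the wait entry as its 4th arg */
		if (sk_wait_event(sk, &timeout, sk->sk_state == TCP_CLOSE, &wait))
			break;
		rc = -ERESTARTSYS;
		if (signal_pending(current))
			break;
		rc = -EAGAIN;
		if (!timeout)
			break;
		rc = 0;
	}
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}
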
@@ -639,11 +641,13 @@ static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb)
639 * @sock: Socket which connections arrive on. 641 * @sock: Socket which connections arrive on.
640 * @newsock: Socket to move incoming connection to. 642 * @newsock: Socket to move incoming connection to.
641 * @flags: User specified operational flags. 643 * @flags: User specified operational flags.
644 * @kern: If the socket is kernel internal
642 * 645 *
643 * Accept a new incoming connection. 646 * Accept a new incoming connection.
644 * Returns 0 upon success, negative otherwise. 647 * Returns 0 upon success, negative otherwise.
645 */ 648 */
646static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags) 649static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags,
650 bool kern)
647{ 651{
648 struct sock *sk = sock->sk, *newsk; 652 struct sock *sk = sock->sk, *newsk;
649 struct llc_sock *llc, *newllc; 653 struct llc_sock *llc, *newllc;
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 3e821daf9dd4..8bc5a1bd2d45 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -821,7 +821,10 @@ void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb)
821 * another trick required to cope with how the PROCOM state 821 * another trick required to cope with how the PROCOM state
822 * machine works. -acme 822 * machine works. -acme
823 */ 823 */
824 skb_orphan(skb);
825 sock_hold(sk);
824 skb->sk = sk; 826 skb->sk = sk;
827 skb->destructor = sock_efree;
825 } 828 }
826 if (!sock_owned_by_user(sk)) 829 if (!sock_owned_by_user(sk))
827 llc_conn_rcv(sk, skb); 830 llc_conn_rcv(sk, skb);
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index d0e1e804ebd7..5404d0d195cc 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -290,7 +290,10 @@ static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb,
290 290
291 ev->type = LLC_SAP_EV_TYPE_PDU; 291 ev->type = LLC_SAP_EV_TYPE_PDU;
292 ev->reason = 0; 292 ev->reason = 0;
293 skb_orphan(skb);
294 sock_hold(sk);
293 skb->sk = sk; 295 skb->sk = sk;
296 skb->destructor = sock_efree;
294 llc_sap_state_process(sap, skb); 297 llc_sap_state_process(sap, skb);
295} 298}
296 299
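
Both llc_conn.c and llc_sap.c now make the LLC receive path take ownership of the skb properly before pointing skb->sk at a socket: orphan any previous owner, hold the socket, and install sock_efree() as the destructor so the hold is dropped when the skb is freed. A sketch of that trio in isolation (hypothetical helper; assumes <linux/skbuff.h> and <net/sock.h>):

static void example_take_skb_ownership(struct sock *sk, struct sk_buff *skb)
{
	skb_orphan(skb);		/* release any previous owner */
	sock_hold(sk);			/* pin the socket for the skb's lifetime */
	skb->sk = sk;
	skb->destructor = sock_efree;	/* sock_efree() does the sock_put() on free */
}
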
diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig
index 3891cbd2adea..76e30f4797fb 100644
--- a/net/mac80211/Kconfig
+++ b/net/mac80211/Kconfig
@@ -6,6 +6,7 @@ config MAC80211
6 select CRYPTO_AES 6 select CRYPTO_AES
7 select CRYPTO_CCM 7 select CRYPTO_CCM
8 select CRYPTO_GCM 8 select CRYPTO_GCM
9 select CRYPTO_CMAC
9 select CRC32 10 select CRC32
10 ---help--- 11 ---help---
11 This option enables the hardware independent IEEE 802.11 12 This option enables the hardware independent IEEE 802.11
diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
index f9137a8341f4..282912245938 100644
--- a/net/mac80211/Makefile
+++ b/net/mac80211/Makefile
@@ -19,6 +19,7 @@ mac80211-y := \
19 aes_gcm.o \ 19 aes_gcm.o \
20 aes_cmac.o \ 20 aes_cmac.o \
21 aes_gmac.o \ 21 aes_gmac.o \
22 fils_aead.o \
22 cfg.o \ 23 cfg.o \
23 ethtool.o \ 24 ethtool.o \
24 rx.o \ 25 rx.o \
@@ -60,4 +61,4 @@ rc80211_minstrel_ht-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_ht_debugfs.o
60mac80211-$(CONFIG_MAC80211_RC_MINSTREL) += $(rc80211_minstrel-y) 61mac80211-$(CONFIG_MAC80211_RC_MINSTREL) += $(rc80211_minstrel-y)
61mac80211-$(CONFIG_MAC80211_RC_MINSTREL_HT) += $(rc80211_minstrel_ht-y) 62mac80211-$(CONFIG_MAC80211_RC_MINSTREL_HT) += $(rc80211_minstrel_ht-y)
62 63
63ccflags-y += -D__CHECK_ENDIAN__ -DDEBUG 64ccflags-y += -DDEBUG
diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c
index bdf0790d89cc..2fb65588490c 100644
--- a/net/mac80211/aes_cmac.c
+++ b/net/mac80211/aes_cmac.c
@@ -22,126 +22,50 @@
22#define CMAC_TLEN_256 16 /* CMAC TLen = 128 bits (16 octets) */ 22#define CMAC_TLEN_256 16 /* CMAC TLen = 128 bits (16 octets) */
23#define AAD_LEN 20 23#define AAD_LEN 20
24 24
25static const u8 zero[CMAC_TLEN_256];
25 26
26static void gf_mulx(u8 *pad) 27void ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad,
27{
28 int i, carry;
29
30 carry = pad[0] & 0x80;
31 for (i = 0; i < AES_BLOCK_SIZE - 1; i++)
32 pad[i] = (pad[i] << 1) | (pad[i + 1] >> 7);
33 pad[AES_BLOCK_SIZE - 1] <<= 1;
34 if (carry)
35 pad[AES_BLOCK_SIZE - 1] ^= 0x87;
36}
37
38static void aes_cmac_vector(struct crypto_cipher *tfm, size_t num_elem,
39 const u8 *addr[], const size_t *len, u8 *mac,
40 size_t mac_len)
41{
42 u8 cbc[AES_BLOCK_SIZE], pad[AES_BLOCK_SIZE];
43 const u8 *pos, *end;
44 size_t i, e, left, total_len;
45
46 memset(cbc, 0, AES_BLOCK_SIZE);
47
48 total_len = 0;
49 for (e = 0; e < num_elem; e++)
50 total_len += len[e];
51 left = total_len;
52
53 e = 0;
54 pos = addr[0];
55 end = pos + len[0];
56
57 while (left >= AES_BLOCK_SIZE) {
58 for (i = 0; i < AES_BLOCK_SIZE; i++) {
59 cbc[i] ^= *pos++;
60 if (pos >= end) {
61 e++;
62 pos = addr[e];
63 end = pos + len[e];
64 }
65 }
66 if (left > AES_BLOCK_SIZE)
67 crypto_cipher_encrypt_one(tfm, cbc, cbc);
68 left -= AES_BLOCK_SIZE;
69 }
70
71 memset(pad, 0, AES_BLOCK_SIZE);
72 crypto_cipher_encrypt_one(tfm, pad, pad);
73 gf_mulx(pad);
74
75 if (left || total_len == 0) {
76 for (i = 0; i < left; i++) {
77 cbc[i] ^= *pos++;
78 if (pos >= end) {
79 e++;
80 pos = addr[e];
81 end = pos + len[e];
82 }
83 }
84 cbc[left] ^= 0x80;
85 gf_mulx(pad);
86 }
87
88 for (i = 0; i < AES_BLOCK_SIZE; i++)
89 pad[i] ^= cbc[i];
90 crypto_cipher_encrypt_one(tfm, pad, pad);
91 memcpy(mac, pad, mac_len);
92}
93
94
95void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad,
96 const u8 *data, size_t data_len, u8 *mic) 28 const u8 *data, size_t data_len, u8 *mic)
97{ 29{
98 const u8 *addr[3]; 30 SHASH_DESC_ON_STACK(desc, tfm);
99 size_t len[3]; 31 u8 out[AES_BLOCK_SIZE];
100 u8 zero[CMAC_TLEN];
101 32
102 memset(zero, 0, CMAC_TLEN); 33 desc->tfm = tfm;
103 addr[0] = aad;
104 len[0] = AAD_LEN;
105 addr[1] = data;
106 len[1] = data_len - CMAC_TLEN;
107 addr[2] = zero;
108 len[2] = CMAC_TLEN;
109 34
110 aes_cmac_vector(tfm, 3, addr, len, mic, CMAC_TLEN); 35 crypto_shash_init(desc);
36 crypto_shash_update(desc, aad, AAD_LEN);
37 crypto_shash_update(desc, data, data_len - CMAC_TLEN);
38 crypto_shash_finup(desc, zero, CMAC_TLEN, out);
39
40 memcpy(mic, out, CMAC_TLEN);
111} 41}
112 42
113void ieee80211_aes_cmac_256(struct crypto_cipher *tfm, const u8 *aad, 43void ieee80211_aes_cmac_256(struct crypto_shash *tfm, const u8 *aad,
114 const u8 *data, size_t data_len, u8 *mic) 44 const u8 *data, size_t data_len, u8 *mic)
115{ 45{
116 const u8 *addr[3]; 46 SHASH_DESC_ON_STACK(desc, tfm);
117 size_t len[3];
118 u8 zero[CMAC_TLEN_256];
119 47
120 memset(zero, 0, CMAC_TLEN_256); 48 desc->tfm = tfm;
121 addr[0] = aad;
122 len[0] = AAD_LEN;
123 addr[1] = data;
124 len[1] = data_len - CMAC_TLEN_256;
125 addr[2] = zero;
126 len[2] = CMAC_TLEN_256;
127 49
128 aes_cmac_vector(tfm, 3, addr, len, mic, CMAC_TLEN_256); 50 crypto_shash_init(desc);
51 crypto_shash_update(desc, aad, AAD_LEN);
52 crypto_shash_update(desc, data, data_len - CMAC_TLEN_256);
53 crypto_shash_finup(desc, zero, CMAC_TLEN_256, mic);
129} 54}
130 55
131struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[], 56struct crypto_shash *ieee80211_aes_cmac_key_setup(const u8 key[],
132 size_t key_len) 57 size_t key_len)
133{ 58{
134 struct crypto_cipher *tfm; 59 struct crypto_shash *tfm;
135 60
136 tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); 61 tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
137 if (!IS_ERR(tfm)) 62 if (!IS_ERR(tfm))
138 crypto_cipher_setkey(tfm, key, key_len); 63 crypto_shash_setkey(tfm, key, key_len);
139 64
140 return tfm; 65 return tfm;
141} 66}
142 67
143 68void ieee80211_aes_cmac_key_free(struct crypto_shash *tfm)
144void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm)
145{ 69{
146 crypto_free_cipher(tfm); 70 crypto_free_shash(tfm);
147} 71}
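
The aes_cmac.c rewrite above drops the open-coded gf_mulx()/CBC-MAC vector walk and instead drives the kernel crypto API's "cmac(aes)" shash, which is why the mac80211 Kconfig hunk earlier adds select CRYPTO_CMAC. A self-contained sketch of using that transform for a one-shot MAC (illustrative function, not from the patch):

#include <linux/err.h>
#include <crypto/aes.h>
#include <crypto/hash.h>

static int example_cmac_aes(const u8 *key, size_t key_len,
			    const u8 *data, size_t data_len,
			    u8 mac[AES_BLOCK_SIZE])
{
	struct crypto_shash *tfm;
	int err;

	tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_shash_setkey(tfm, key, key_len);
	if (!err) {
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		/* one-shot digest; the patch uses init/update/finup instead */
		err = crypto_shash_digest(desc, data, data_len, mac);
	}

	crypto_free_shash(tfm);
	return err;
}
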
diff --git a/net/mac80211/aes_cmac.h b/net/mac80211/aes_cmac.h
index 3702041f44fd..fef531f42003 100644
--- a/net/mac80211/aes_cmac.h
+++ b/net/mac80211/aes_cmac.h
@@ -10,13 +10,14 @@
10#define AES_CMAC_H 10#define AES_CMAC_H
11 11
12#include <linux/crypto.h> 12#include <linux/crypto.h>
13#include <crypto/hash.h>
13 14
14struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[], 15struct crypto_shash *ieee80211_aes_cmac_key_setup(const u8 key[],
15 size_t key_len); 16 size_t key_len);
16void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad, 17void ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad,
17 const u8 *data, size_t data_len, u8 *mic); 18 const u8 *data, size_t data_len, u8 *mic);
18void ieee80211_aes_cmac_256(struct crypto_cipher *tfm, const u8 *aad, 19void ieee80211_aes_cmac_256(struct crypto_shash *tfm, const u8 *aad,
19 const u8 *data, size_t data_len, u8 *mic); 20 const u8 *data, size_t data_len, u8 *mic);
20void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm); 21void ieee80211_aes_cmac_key_free(struct crypto_shash *tfm);
21 22
22#endif /* AES_CMAC_H */ 23#endif /* AES_CMAC_H */
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index f6749dced021..4456559cb056 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -85,7 +85,7 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
85 ht_dbg(sta->sdata, 85 ht_dbg(sta->sdata,
86 "Rx BA session stop requested for %pM tid %u %s reason: %d\n", 86 "Rx BA session stop requested for %pM tid %u %s reason: %d\n",
87 sta->sta.addr, tid, 87 sta->sta.addr, tid,
88 initiator == WLAN_BACK_RECIPIENT ? "recipient" : "inititator", 88 initiator == WLAN_BACK_RECIPIENT ? "recipient" : "initiator",
89 (int)reason); 89 (int)reason);
90 90
91 if (drv_ampdu_action(local, sta->sdata, &params)) 91 if (drv_ampdu_action(local, sta->sdata, &params))
@@ -315,11 +315,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
315 mutex_lock(&sta->ampdu_mlme.mtx); 315 mutex_lock(&sta->ampdu_mlme.mtx);
316 316
317 if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) { 317 if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) {
318 tid_agg_rx = rcu_dereference_protected( 318 if (sta->ampdu_mlme.tid_rx_token[tid] == dialog_token) {
319 sta->ampdu_mlme.tid_rx[tid],
320 lockdep_is_held(&sta->ampdu_mlme.mtx));
321
322 if (tid_agg_rx->dialog_token == dialog_token) {
323 ht_dbg_ratelimited(sta->sdata, 319 ht_dbg_ratelimited(sta->sdata,
324 "updated AddBA Req from %pM on tid %u\n", 320 "updated AddBA Req from %pM on tid %u\n",
325 sta->sta.addr, tid); 321 sta->sta.addr, tid);
@@ -396,13 +392,13 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
396 } 392 }
397 393
398 /* update data */ 394 /* update data */
399 tid_agg_rx->dialog_token = dialog_token;
400 tid_agg_rx->ssn = start_seq_num; 395 tid_agg_rx->ssn = start_seq_num;
401 tid_agg_rx->head_seq_num = start_seq_num; 396 tid_agg_rx->head_seq_num = start_seq_num;
402 tid_agg_rx->buf_size = buf_size; 397 tid_agg_rx->buf_size = buf_size;
403 tid_agg_rx->timeout = timeout; 398 tid_agg_rx->timeout = timeout;
404 tid_agg_rx->stored_mpdu_num = 0; 399 tid_agg_rx->stored_mpdu_num = 0;
405 tid_agg_rx->auto_seq = auto_seq; 400 tid_agg_rx->auto_seq = auto_seq;
401 tid_agg_rx->started = false;
406 tid_agg_rx->reorder_buf_filtered = 0; 402 tid_agg_rx->reorder_buf_filtered = 0;
407 status = WLAN_STATUS_SUCCESS; 403 status = WLAN_STATUS_SUCCESS;
408 404
@@ -418,6 +414,7 @@ end:
418 if (status == WLAN_STATUS_SUCCESS) { 414 if (status == WLAN_STATUS_SUCCESS) {
419 __set_bit(tid, sta->ampdu_mlme.agg_session_valid); 415 __set_bit(tid, sta->ampdu_mlme.agg_session_valid);
420 __clear_bit(tid, sta->ampdu_mlme.unexpected_agg); 416 __clear_bit(tid, sta->ampdu_mlme.unexpected_agg);
417 sta->ampdu_mlme.tid_rx_token[tid] = dialog_token;
421 } 418 }
422 mutex_unlock(&sta->ampdu_mlme.mtx); 419 mutex_unlock(&sta->ampdu_mlme.mtx);
423 420
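
The agg-rx.c change above stops keeping the ADDBA dialog token inside the tid_agg_rx reorder structure and records it per TID in ampdu_mlme instead, next to the agg_session_valid bitmap. A sketch of the duplicate-request check as it looks after the change (hypothetical helper; assumes mac80211's internal sta_info.h):

static bool example_is_addba_update(struct sta_info *sta, int tid,
				    u8 dialog_token)
{
	/* a session is known for this TID and the peer reused its token */
	return test_bit(tid, sta->ampdu_mlme.agg_session_valid) &&
	       sta->ampdu_mlme.tid_rx_token[tid] == dialog_token;
}
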
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index fd6541f3ade3..ac879bb17870 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -208,8 +208,8 @@ static int ieee80211_nan_change_conf(struct wiphy *wiphy,
208 if (changes & CFG80211_NAN_CONF_CHANGED_PREF) 208 if (changes & CFG80211_NAN_CONF_CHANGED_PREF)
209 new_conf.master_pref = conf->master_pref; 209 new_conf.master_pref = conf->master_pref;
210 210
211 if (changes & CFG80211_NAN_CONF_CHANGED_DUAL) 211 if (changes & CFG80211_NAN_CONF_CHANGED_BANDS)
212 new_conf.dual = conf->dual; 212 new_conf.bands = conf->bands;
213 213
214 ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes); 214 ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes);
215 if (!ret) 215 if (!ret)
@@ -357,10 +357,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
357 mutex_lock(&local->sta_mtx); 357 mutex_lock(&local->sta_mtx);
358 358
359 if (mac_addr) { 359 if (mac_addr) {
360 if (ieee80211_vif_is_mesh(&sdata->vif)) 360 sta = sta_info_get_bss(sdata, mac_addr);
361 sta = sta_info_get(sdata, mac_addr);
362 else
363 sta = sta_info_get_bss(sdata, mac_addr);
364 /* 361 /*
365 * The ASSOC test makes sure the driver is ready to 362 * The ASSOC test makes sure the driver is ready to
366 * receive the key. When wpa_supplicant has roamed 363 * receive the key. When wpa_supplicant has roamed
@@ -867,6 +864,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
867 } 864 }
868 sdata->needed_rx_chains = sdata->local->rx_chains; 865 sdata->needed_rx_chains = sdata->local->rx_chains;
869 866
867 sdata->vif.bss_conf.beacon_int = params->beacon_interval;
868
870 mutex_lock(&local->mtx); 869 mutex_lock(&local->mtx);
871 err = ieee80211_vif_use_channel(sdata, &params->chandef, 870 err = ieee80211_vif_use_channel(sdata, &params->chandef,
872 IEEE80211_CHANCTX_SHARED); 871 IEEE80211_CHANCTX_SHARED);
@@ -897,7 +896,6 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
897 vlan->vif.type); 896 vlan->vif.type);
898 } 897 }
899 898
900 sdata->vif.bss_conf.beacon_int = params->beacon_interval;
901 sdata->vif.bss_conf.dtim_period = params->dtim_period; 899 sdata->vif.bss_conf.dtim_period = params->dtim_period;
902 sdata->vif.bss_conf.enable_beacon = true; 900 sdata->vif.bss_conf.enable_beacon = true;
903 sdata->vif.bss_conf.allow_p2p_go_ps = sdata->vif.p2p; 901 sdata->vif.bss_conf.allow_p2p_go_ps = sdata->vif.p2p;
@@ -1523,9 +1521,6 @@ static int ieee80211_change_station(struct wiphy *wiphy,
1523 goto out_err; 1521 goto out_err;
1524 1522
1525 if (params->vlan && params->vlan != sta->sdata->dev) { 1523 if (params->vlan && params->vlan != sta->sdata->dev) {
1526 bool prev_4addr = false;
1527 bool new_4addr = false;
1528
1529 vlansdata = IEEE80211_DEV_TO_SUB_IF(params->vlan); 1524 vlansdata = IEEE80211_DEV_TO_SUB_IF(params->vlan);
1530 1525
1531 if (params->vlan->ieee80211_ptr->use_4addr) { 1526 if (params->vlan->ieee80211_ptr->use_4addr) {
@@ -1535,26 +1530,21 @@ static int ieee80211_change_station(struct wiphy *wiphy,
1535 } 1530 }
1536 1531
1537 rcu_assign_pointer(vlansdata->u.vlan.sta, sta); 1532 rcu_assign_pointer(vlansdata->u.vlan.sta, sta);
1538 new_4addr = true;
1539 __ieee80211_check_fast_rx_iface(vlansdata); 1533 __ieee80211_check_fast_rx_iface(vlansdata);
1540 } 1534 }
1541 1535
1542 if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN && 1536 if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
1543 sta->sdata->u.vlan.sta) { 1537 sta->sdata->u.vlan.sta)
1544 RCU_INIT_POINTER(sta->sdata->u.vlan.sta, NULL); 1538 RCU_INIT_POINTER(sta->sdata->u.vlan.sta, NULL);
1545 prev_4addr = true; 1539
1546 } 1540 if (test_sta_flag(sta, WLAN_STA_AUTHORIZED))
1541 ieee80211_vif_dec_num_mcast(sta->sdata);
1547 1542
1548 sta->sdata = vlansdata; 1543 sta->sdata = vlansdata;
1549 ieee80211_check_fast_xmit(sta); 1544 ieee80211_check_fast_xmit(sta);
1550 1545
1551 if (sta->sta_state == IEEE80211_STA_AUTHORIZED && 1546 if (test_sta_flag(sta, WLAN_STA_AUTHORIZED))
1552 prev_4addr != new_4addr) { 1547 ieee80211_vif_inc_num_mcast(sta->sdata);
1553 if (new_4addr)
1554 atomic_dec(&sta->sdata->bss->num_mcast_sta);
1555 else
1556 atomic_inc(&sta->sdata->bss->num_mcast_sta);
1557 }
1558 1548
1559 ieee80211_send_layer2_update(sta); 1549 ieee80211_send_layer2_update(sta);
1560 } 1550 }
@@ -2480,13 +2470,6 @@ int __ieee80211_request_smps_ap(struct ieee80211_sub_if_data *sdata,
2480 smps_mode == IEEE80211_SMPS_AUTOMATIC) 2470 smps_mode == IEEE80211_SMPS_AUTOMATIC)
2481 return 0; 2471 return 0;
2482 2472
2483 /* If no associated stations, there's no need to do anything */
2484 if (!atomic_read(&sdata->u.ap.num_mcast_sta)) {
2485 sdata->smps_mode = smps_mode;
2486 ieee80211_queue_work(&sdata->local->hw, &sdata->recalc_smps);
2487 return 0;
2488 }
2489
2490 ht_dbg(sdata, 2473 ht_dbg(sdata,
2491 "SMPS %d requested in AP mode, sending Action frame to %d stations\n", 2474 "SMPS %d requested in AP mode, sending Action frame to %d stations\n",
2492 smps_mode, atomic_read(&sdata->u.ap.num_mcast_sta)); 2475 smps_mode, atomic_read(&sdata->u.ap.num_mcast_sta));
@@ -3580,6 +3563,17 @@ void ieee80211_nan_func_match(struct ieee80211_vif *vif,
3580} 3563}
3581EXPORT_SYMBOL(ieee80211_nan_func_match); 3564EXPORT_SYMBOL(ieee80211_nan_func_match);
3582 3565
3566static int ieee80211_set_multicast_to_unicast(struct wiphy *wiphy,
3567 struct net_device *dev,
3568 const bool enabled)
3569{
3570 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
3571
3572 sdata->u.ap.multicast_to_unicast = enabled;
3573
3574 return 0;
3575}
3576
3583const struct cfg80211_ops mac80211_config_ops = { 3577const struct cfg80211_ops mac80211_config_ops = {
3584 .add_virtual_intf = ieee80211_add_iface, 3578 .add_virtual_intf = ieee80211_add_iface,
3585 .del_virtual_intf = ieee80211_del_iface, 3579 .del_virtual_intf = ieee80211_del_iface,
@@ -3670,4 +3664,5 @@ const struct cfg80211_ops mac80211_config_ops = {
3670 .nan_change_conf = ieee80211_nan_change_conf, 3664 .nan_change_conf = ieee80211_nan_change_conf,
3671 .add_nan_func = ieee80211_add_nan_func, 3665 .add_nan_func = ieee80211_add_nan_func,
3672 .del_nan_func = ieee80211_del_nan_func, 3666 .del_nan_func = ieee80211_del_nan_func,
3667 .set_multicast_to_unicast = ieee80211_set_multicast_to_unicast,
3673}; 3668};
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index e75cbf6ecc26..89178b46b32f 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -231,9 +231,6 @@ ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata)
231 !(sta->sdata->bss && sta->sdata->bss == sdata->bss)) 231 !(sta->sdata->bss && sta->sdata->bss == sdata->bss))
232 continue; 232 continue;
233 233
234 if (!sta->uploaded || !test_sta_flag(sta, WLAN_STA_ASSOC))
235 continue;
236
237 max_bw = max(max_bw, ieee80211_get_sta_bw(&sta->sta)); 234 max_bw = max(max_bw, ieee80211_get_sta_bw(&sta->sta));
238 } 235 }
239 rcu_read_unlock(); 236 rcu_read_unlock();
@@ -1270,7 +1267,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
1270 struct ieee80211_sub_if_data *sdata, *sdata_tmp; 1267 struct ieee80211_sub_if_data *sdata, *sdata_tmp;
1271 struct ieee80211_chanctx *ctx, *ctx_tmp, *old_ctx; 1268 struct ieee80211_chanctx *ctx, *ctx_tmp, *old_ctx;
1272 struct ieee80211_chanctx *new_ctx = NULL; 1269 struct ieee80211_chanctx *new_ctx = NULL;
1273 int i, err, n_assigned, n_reserved, n_ready; 1270 int err, n_assigned, n_reserved, n_ready;
1274 int n_ctx = 0, n_vifs_switch = 0, n_vifs_assign = 0, n_vifs_ctxless = 0; 1271 int n_ctx = 0, n_vifs_switch = 0, n_vifs_assign = 0, n_vifs_ctxless = 0;
1275 1272
1276 lockdep_assert_held(&local->mtx); 1273 lockdep_assert_held(&local->mtx);
@@ -1391,8 +1388,6 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
1391 * Update all structures, values and pointers to point to new channel 1388 * Update all structures, values and pointers to point to new channel
1392 * context(s). 1389 * context(s).
1393 */ 1390 */
1394
1395 i = 0;
1396 list_for_each_entry(ctx, &local->chanctx_list, list) { 1391 list_for_each_entry(ctx, &local->chanctx_list, list) {
1397 if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) 1392 if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
1398 continue; 1393 continue;
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index f56e2f487d09..5fae001f286c 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -210,6 +210,7 @@ static const char *hw_flag_names[] = {
210 FLAG(TX_AMSDU), 210 FLAG(TX_AMSDU),
211 FLAG(TX_FRAG_LIST), 211 FLAG(TX_FRAG_LIST),
212 FLAG(REPORTS_LOW_ACK), 212 FLAG(REPORTS_LOW_ACK),
213 FLAG(SUPPORTS_TX_FRAG),
213#undef FLAG 214#undef FLAG
214}; 215};
215 216
@@ -242,6 +243,38 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf,
242 return rv; 243 return rv;
243} 244}
244 245
246static ssize_t misc_read(struct file *file, char __user *user_buf,
247 size_t count, loff_t *ppos)
248{
249 struct ieee80211_local *local = file->private_data;
250 /* Max len of each line is 16 characters, plus 9 for 'pending:\n' */
251 size_t bufsz = IEEE80211_MAX_QUEUES * 16 + 9;
252 char *buf;
253 char *pos, *end;
254 ssize_t rv;
255 int i;
256 int ln;
257
258 buf = kzalloc(bufsz, GFP_KERNEL);
259 if (!buf)
260 return -ENOMEM;
261
262 pos = buf;
263 end = buf + bufsz - 1;
264
265 pos += scnprintf(pos, end - pos, "pending:\n");
266
267 for (i = 0; i < IEEE80211_MAX_QUEUES; i++) {
268 ln = skb_queue_len(&local->pending[i]);
269 pos += scnprintf(pos, end - pos, "[%i] %d\n",
270 i, ln);
271 }
272
273 rv = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf));
274 kfree(buf);
275 return rv;
276}
277
245static ssize_t queues_read(struct file *file, char __user *user_buf, 278static ssize_t queues_read(struct file *file, char __user *user_buf,
246 size_t count, loff_t *ppos) 279 size_t count, loff_t *ppos)
247{ 280{
@@ -262,6 +295,7 @@ static ssize_t queues_read(struct file *file, char __user *user_buf,
262 295
263DEBUGFS_READONLY_FILE_OPS(hwflags); 296DEBUGFS_READONLY_FILE_OPS(hwflags);
264DEBUGFS_READONLY_FILE_OPS(queues); 297DEBUGFS_READONLY_FILE_OPS(queues);
298DEBUGFS_READONLY_FILE_OPS(misc);
265 299
266/* statistics stuff */ 300/* statistics stuff */
267 301
@@ -329,7 +363,9 @@ void debugfs_hw_add(struct ieee80211_local *local)
329 363
330 DEBUGFS_ADD(total_ps_buffered); 364 DEBUGFS_ADD(total_ps_buffered);
331 DEBUGFS_ADD(wep_iv); 365 DEBUGFS_ADD(wep_iv);
366 DEBUGFS_ADD(rate_ctrl_alg);
332 DEBUGFS_ADD(queues); 367 DEBUGFS_ADD(queues);
368 DEBUGFS_ADD(misc);
333#ifdef CONFIG_PM 369#ifdef CONFIG_PM
334 DEBUGFS_ADD_MODE(reset, 0200); 370 DEBUGFS_ADD_MODE(reset, 0200);
335#endif 371#endif
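
The new misc debugfs file above follows the usual read-only debugfs recipe: format the state into a kernel buffer with scnprintf() and hand it to simple_read_from_buffer(). A minimal sketch of that recipe with a hypothetical value, not tied to this file (assumes <linux/fs.h> and <linux/uaccess.h>):

static ssize_t example_read(struct file *file, char __user *user_buf,
			    size_t count, loff_t *ppos)
{
	char buf[32];
	int len;

	/* format into a kernel buffer first, then copy out to userspace */
	len = scnprintf(buf, sizeof(buf), "pending: %d\n", 0);
	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}
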
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index bcec1240f41d..8f5fff8b2040 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -477,6 +477,7 @@ IEEE80211_IF_FILE_RW(tdls_wider_bw);
477IEEE80211_IF_FILE(num_mcast_sta, u.ap.num_mcast_sta, ATOMIC); 477IEEE80211_IF_FILE(num_mcast_sta, u.ap.num_mcast_sta, ATOMIC);
478IEEE80211_IF_FILE(num_sta_ps, u.ap.ps.num_sta_ps, ATOMIC); 478IEEE80211_IF_FILE(num_sta_ps, u.ap.ps.num_sta_ps, ATOMIC);
479IEEE80211_IF_FILE(dtim_count, u.ap.ps.dtim_count, DEC); 479IEEE80211_IF_FILE(dtim_count, u.ap.ps.dtim_count, DEC);
480IEEE80211_IF_FILE(num_mcast_sta_vlan, u.vlan.num_mcast_sta, ATOMIC);
480 481
481static ssize_t ieee80211_if_fmt_num_buffered_multicast( 482static ssize_t ieee80211_if_fmt_num_buffered_multicast(
482 const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) 483 const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
@@ -518,6 +519,8 @@ static ssize_t ieee80211_if_fmt_aqm(
518} 519}
519IEEE80211_IF_FILE_R(aqm); 520IEEE80211_IF_FILE_R(aqm);
520 521
522IEEE80211_IF_FILE(multicast_to_unicast, u.ap.multicast_to_unicast, HEX);
523
521/* IBSS attributes */ 524/* IBSS attributes */
522static ssize_t ieee80211_if_fmt_tsf( 525static ssize_t ieee80211_if_fmt_tsf(
523 const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) 526 const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
@@ -682,6 +685,14 @@ static void add_ap_files(struct ieee80211_sub_if_data *sdata)
682 DEBUGFS_ADD(dtim_count); 685 DEBUGFS_ADD(dtim_count);
683 DEBUGFS_ADD(num_buffered_multicast); 686 DEBUGFS_ADD(num_buffered_multicast);
684 DEBUGFS_ADD_MODE(tkip_mic_test, 0200); 687 DEBUGFS_ADD_MODE(tkip_mic_test, 0200);
688 DEBUGFS_ADD_MODE(multicast_to_unicast, 0600);
689}
690
691static void add_vlan_files(struct ieee80211_sub_if_data *sdata)
692{
693 /* add num_mcast_sta_vlan using name num_mcast_sta */
694 debugfs_create_file("num_mcast_sta", 0400, sdata->vif.debugfs_dir,
695 sdata, &num_mcast_sta_vlan_ops);
685} 696}
686 697
687static void add_ibss_files(struct ieee80211_sub_if_data *sdata) 698static void add_ibss_files(struct ieee80211_sub_if_data *sdata)
@@ -787,6 +798,9 @@ static void add_files(struct ieee80211_sub_if_data *sdata)
787 case NL80211_IFTYPE_AP: 798 case NL80211_IFTYPE_AP:
788 add_ap_files(sdata); 799 add_ap_files(sdata);
789 break; 800 break;
801 case NL80211_IFTYPE_AP_VLAN:
802 add_vlan_files(sdata);
803 break;
790 case NL80211_IFTYPE_WDS: 804 case NL80211_IFTYPE_WDS:
791 add_wds_files(sdata); 805 add_wds_files(sdata);
792 break; 806 break;
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index a2fcdb47a0e6..42601820db20 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -199,13 +199,18 @@ static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf,
199 "TID\t\tRX\tDTKN\tSSN\t\tTX\tDTKN\tpending\n"); 199 "TID\t\tRX\tDTKN\tSSN\t\tTX\tDTKN\tpending\n");
200 200
201 for (i = 0; i < IEEE80211_NUM_TIDS; i++) { 201 for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
202 bool tid_rx_valid;
203
202 tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[i]); 204 tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[i]);
203 tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[i]); 205 tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[i]);
206 tid_rx_valid = test_bit(i, sta->ampdu_mlme.agg_session_valid);
204 207
205 p += scnprintf(p, sizeof(buf) + buf - p, "%02d", i); 208 p += scnprintf(p, sizeof(buf) + buf - p, "%02d", i);
206 p += scnprintf(p, sizeof(buf) + buf - p, "\t\t%x", !!tid_rx); 209 p += scnprintf(p, sizeof(buf) + buf - p, "\t\t%x",
210 tid_rx_valid);
207 p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.2x", 211 p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.2x",
208 tid_rx ? tid_rx->dialog_token : 0); 212 tid_rx_valid ?
213 sta->ampdu_mlme.tid_rx_token[i] : 0);
209 p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.3x", 214 p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.3x",
210 tid_rx ? tid_rx->ssn : 0); 215 tid_rx ? tid_rx->ssn : 0);
211 216
@@ -517,6 +522,7 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
517 return; 522 return;
518 523
519 DEBUGFS_ADD(flags); 524 DEBUGFS_ADD(flags);
525 DEBUGFS_ADD(aid);
520 DEBUGFS_ADD(num_ps_buf_frames); 526 DEBUGFS_ADD(num_ps_buf_frames);
521 DEBUGFS_ADD(last_seq_ctrl); 527 DEBUGFS_ADD(last_seq_ctrl);
522 DEBUGFS_ADD(agg_status); 528 DEBUGFS_ADD(agg_status);
diff --git a/net/mac80211/fils_aead.c b/net/mac80211/fils_aead.c
new file mode 100644
index 000000000000..3cfb1e2ab7ac
--- /dev/null
+++ b/net/mac80211/fils_aead.c
@@ -0,0 +1,334 @@
1/*
2 * FILS AEAD for (Re)Association Request/Response frames
3 * Copyright 2016, Qualcomm Atheros, Inc.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <crypto/aes.h>
11#include <crypto/algapi.h>
12#include <crypto/hash.h>
13#include <crypto/skcipher.h>
14
15#include "ieee80211_i.h"
16#include "aes_cmac.h"
17#include "fils_aead.h"
18
19static void gf_mulx(u8 *pad)
20{
21 u64 a = get_unaligned_be64(pad);
22 u64 b = get_unaligned_be64(pad + 8);
23
24 put_unaligned_be64((a << 1) | (b >> 63), pad);
25 put_unaligned_be64((b << 1) ^ ((a >> 63) ? 0x87 : 0), pad + 8);
26}
27
28static int aes_s2v(struct crypto_shash *tfm,
29 size_t num_elem, const u8 *addr[], size_t len[], u8 *v)
30{
31 u8 d[AES_BLOCK_SIZE], tmp[AES_BLOCK_SIZE] = {};
32 SHASH_DESC_ON_STACK(desc, tfm);
33 size_t i;
34
35 desc->tfm = tfm;
36
37 /* D = AES-CMAC(K, <zero>) */
38 crypto_shash_digest(desc, tmp, AES_BLOCK_SIZE, d);
39
40 for (i = 0; i < num_elem - 1; i++) {
41 /* D = dbl(D) xor AES_CMAC(K, Si) */
42 gf_mulx(d); /* dbl */
43 crypto_shash_digest(desc, addr[i], len[i], tmp);
44 crypto_xor(d, tmp, AES_BLOCK_SIZE);
45 }
46
47 crypto_shash_init(desc);
48
49 if (len[i] >= AES_BLOCK_SIZE) {
50 /* len(Sn) >= 128 */
51 /* T = Sn xorend D */
52 crypto_shash_update(desc, addr[i], len[i] - AES_BLOCK_SIZE);
53 crypto_xor(d, addr[i] + len[i] - AES_BLOCK_SIZE,
54 AES_BLOCK_SIZE);
55 } else {
56 /* len(Sn) < 128 */
57 /* T = dbl(D) xor pad(Sn) */
58 gf_mulx(d); /* dbl */
59 crypto_xor(d, addr[i], len[i]);
60 d[len[i]] ^= 0x80;
61 }
62 /* V = AES-CMAC(K, T) */
63 crypto_shash_finup(desc, d, AES_BLOCK_SIZE, v);
64
65 return 0;
66}
67
68/* Note: addr[] and len[] needs to have one extra slot at the end. */
69static int aes_siv_encrypt(const u8 *key, size_t key_len,
70 const u8 *plain, size_t plain_len,
71 size_t num_elem, const u8 *addr[],
72 size_t len[], u8 *out)
73{
74 u8 v[AES_BLOCK_SIZE];
75 struct crypto_shash *tfm;
76 struct crypto_skcipher *tfm2;
77 struct skcipher_request *req;
78 int res;
79 struct scatterlist src[1], dst[1];
80 u8 *tmp;
81
82 key_len /= 2; /* S2V key || CTR key */
83
84 addr[num_elem] = plain;
85 len[num_elem] = plain_len;
86 num_elem++;
87
88 /* S2V */
89
90 tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
91 if (IS_ERR(tfm))
92 return PTR_ERR(tfm);
93 /* K1 for S2V */
94 res = crypto_shash_setkey(tfm, key, key_len);
95 if (!res)
96 res = aes_s2v(tfm, num_elem, addr, len, v);
97 crypto_free_shash(tfm);
98 if (res)
99 return res;
100
101 /* Use a temporary buffer of the plaintext to handle need for
102 * overwriting this during AES-CTR.
103 */
104 tmp = kmemdup(plain, plain_len, GFP_KERNEL);
105 if (!tmp)
106 return -ENOMEM;
107
108 /* IV for CTR before encrypted data */
109 memcpy(out, v, AES_BLOCK_SIZE);
110
111 /* Synthetic IV to be used as the initial counter in CTR:
112 * Q = V bitand (1^64 || 0^1 || 1^31 || 0^1 || 1^31)
113 */
114 v[8] &= 0x7f;
115 v[12] &= 0x7f;
116
117 /* CTR */
118
119 tfm2 = crypto_alloc_skcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC);
120 if (IS_ERR(tfm2)) {
121 kfree(tmp);
122 return PTR_ERR(tfm2);
123 }
124 /* K2 for CTR */
125 res = crypto_skcipher_setkey(tfm2, key + key_len, key_len);
126 if (res)
127 goto fail;
128
129 req = skcipher_request_alloc(tfm2, GFP_KERNEL);
130 if (!req) {
131 res = -ENOMEM;
132 goto fail;
133 }
134
135 sg_init_one(src, tmp, plain_len);
136 sg_init_one(dst, out + AES_BLOCK_SIZE, plain_len);
137 skcipher_request_set_crypt(req, src, dst, plain_len, v);
138 res = crypto_skcipher_encrypt(req);
139 skcipher_request_free(req);
140fail:
141 kfree(tmp);
142 crypto_free_skcipher(tfm2);
143 return res;
144}
145
146/* Note: addr[] and len[] needs to have one extra slot at the end. */
147static int aes_siv_decrypt(const u8 *key, size_t key_len,
148 const u8 *iv_crypt, size_t iv_c_len,
149 size_t num_elem, const u8 *addr[], size_t len[],
150 u8 *out)
151{
152 struct crypto_shash *tfm;
153 struct crypto_skcipher *tfm2;
154 struct skcipher_request *req;
155 struct scatterlist src[1], dst[1];
156 size_t crypt_len;
157 int res;
158 u8 frame_iv[AES_BLOCK_SIZE], iv[AES_BLOCK_SIZE];
159 u8 check[AES_BLOCK_SIZE];
160
161 crypt_len = iv_c_len - AES_BLOCK_SIZE;
162 key_len /= 2; /* S2V key || CTR key */
163 addr[num_elem] = out;
164 len[num_elem] = crypt_len;
165 num_elem++;
166
167 memcpy(iv, iv_crypt, AES_BLOCK_SIZE);
168 memcpy(frame_iv, iv_crypt, AES_BLOCK_SIZE);
169
170 /* Synthetic IV to be used as the initial counter in CTR:
171 * Q = V bitand (1^64 || 0^1 || 1^31 || 0^1 || 1^31)
172 */
173 iv[8] &= 0x7f;
174 iv[12] &= 0x7f;
175
176 /* CTR */
177
178 tfm2 = crypto_alloc_skcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC);
179 if (IS_ERR(tfm2))
180 return PTR_ERR(tfm2);
181 /* K2 for CTR */
182 res = crypto_skcipher_setkey(tfm2, key + key_len, key_len);
183 if (res) {
184 crypto_free_skcipher(tfm2);
185 return res;
186 }
187
188 req = skcipher_request_alloc(tfm2, GFP_KERNEL);
189 if (!req) {
190 crypto_free_skcipher(tfm2);
191 return -ENOMEM;
192 }
193
194 sg_init_one(src, iv_crypt + AES_BLOCK_SIZE, crypt_len);
195 sg_init_one(dst, out, crypt_len);
196 skcipher_request_set_crypt(req, src, dst, crypt_len, iv);
197 res = crypto_skcipher_decrypt(req);
198 skcipher_request_free(req);
199 crypto_free_skcipher(tfm2);
200 if (res)
201 return res;
202
203 /* S2V */
204
205 tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
206 if (IS_ERR(tfm))
207 return PTR_ERR(tfm);
208 /* K1 for S2V */
209 res = crypto_shash_setkey(tfm, key, key_len);
210 if (!res)
211 res = aes_s2v(tfm, num_elem, addr, len, check);
212 crypto_free_shash(tfm);
213 if (res)
214 return res;
215 if (memcmp(check, frame_iv, AES_BLOCK_SIZE) != 0)
216 return -EINVAL;
217 return 0;
218}
219
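Editorial note: the two & 0x7f masks in both functions clear the top bit of bytes 8 and 12, i.e. bits 63 and 31 of the 128-bit synthetic IV, before it is used as the initial CTR counter; this matches the Q = V bitand (1^64 || 0^1 || 1^31 || 0^1 || 1^31) comment and mirrors RFC 5297, which zeroes those bits so that simple 32-bit or 64-bit counter increments cannot carry into the fixed part of the IV. The final memcmp() against the transmitted SIV is the AEAD integrity check; any mismatch makes aes_siv_decrypt() return -EINVAL.
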
220int fils_encrypt_assoc_req(struct sk_buff *skb,
221 struct ieee80211_mgd_assoc_data *assoc_data)
222{
223 struct ieee80211_mgmt *mgmt = (void *)skb->data;
224 u8 *capab, *ies, *encr;
225 const u8 *addr[5 + 1], *session;
226 size_t len[5 + 1];
227 size_t crypt_len;
228
229 if (ieee80211_is_reassoc_req(mgmt->frame_control)) {
230 capab = (u8 *)&mgmt->u.reassoc_req.capab_info;
231 ies = mgmt->u.reassoc_req.variable;
232 } else {
233 capab = (u8 *)&mgmt->u.assoc_req.capab_info;
234 ies = mgmt->u.assoc_req.variable;
235 }
236
237 session = cfg80211_find_ext_ie(WLAN_EID_EXT_FILS_SESSION,
238 ies, skb->data + skb->len - ies);
239 if (!session || session[1] != 1 + 8)
240 return -EINVAL;
241 /* encrypt after FILS Session element */
242 encr = (u8 *)session + 2 + 1 + 8;
243
244 /* AES-SIV AAD vectors */
245
246 /* The STA's MAC address */
247 addr[0] = mgmt->sa;
248 len[0] = ETH_ALEN;
249 /* The AP's BSSID */
250 addr[1] = mgmt->da;
251 len[1] = ETH_ALEN;
252 /* The STA's nonce */
253 addr[2] = assoc_data->fils_nonces;
254 len[2] = FILS_NONCE_LEN;
255 /* The AP's nonce */
256 addr[3] = &assoc_data->fils_nonces[FILS_NONCE_LEN];
257 len[3] = FILS_NONCE_LEN;
258 /* The (Re)Association Request frame from the Capability Information
259 * field to the FILS Session element (both inclusive).
260 */
261 addr[4] = capab;
262 len[4] = encr - capab;
263
264 crypt_len = skb->data + skb->len - encr;
265 skb_put(skb, AES_BLOCK_SIZE);
266 return aes_siv_encrypt(assoc_data->fils_kek, assoc_data->fils_kek_len,
267 encr, crypt_len, 5, addr, len, encr);
268}
269
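The AES_BLOCK_SIZE claimed by skb_put() just above accounts for the 16-byte synthetic IV that aes_siv_encrypt() writes in front of the encrypted tail; the matching tailroom is reserved in ieee80211_send_assoc() in the mlme.c hunk further down, which adds 16 bytes to the skb allocation whenever assoc_data->fils_kek_len is set.
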
270int fils_decrypt_assoc_resp(struct ieee80211_sub_if_data *sdata,
271 u8 *frame, size_t *frame_len,
272 struct ieee80211_mgd_assoc_data *assoc_data)
273{
274 struct ieee80211_mgmt *mgmt = (void *)frame;
275 u8 *capab, *ies, *encr;
276 const u8 *addr[5 + 1], *session;
277 size_t len[5 + 1];
278 int res;
279 size_t crypt_len;
280
281 if (*frame_len < 24 + 6)
282 return -EINVAL;
283
284 capab = (u8 *)&mgmt->u.assoc_resp.capab_info;
285 ies = mgmt->u.assoc_resp.variable;
286 session = cfg80211_find_ext_ie(WLAN_EID_EXT_FILS_SESSION,
287 ies, frame + *frame_len - ies);
288 if (!session || session[1] != 1 + 8) {
289 mlme_dbg(sdata,
290 "No (valid) FILS Session element in (Re)Association Response frame from %pM",
291 mgmt->sa);
292 return -EINVAL;
293 }
294 /* decrypt after FILS Session element */
295 encr = (u8 *)session + 2 + 1 + 8;
296
297 /* AES-SIV AAD vectors */
298
299 /* The AP's BSSID */
300 addr[0] = mgmt->sa;
301 len[0] = ETH_ALEN;
302 /* The STA's MAC address */
303 addr[1] = mgmt->da;
304 len[1] = ETH_ALEN;
305 /* The AP's nonce */
306 addr[2] = &assoc_data->fils_nonces[FILS_NONCE_LEN];
307 len[2] = FILS_NONCE_LEN;
308 /* The STA's nonce */
309 addr[3] = assoc_data->fils_nonces;
310 len[3] = FILS_NONCE_LEN;
311 /* The (Re)Association Response frame from the Capability Information
312 * field to the FILS Session element (both inclusive).
313 */
314 addr[4] = capab;
315 len[4] = encr - capab;
316
317 crypt_len = frame + *frame_len - encr;
318 if (crypt_len < AES_BLOCK_SIZE) {
319 mlme_dbg(sdata,
320 "Not enough room for AES-SIV data after FILS Session element in (Re)Association Response frame from %pM",
321 mgmt->sa);
322 return -EINVAL;
323 }
324 res = aes_siv_decrypt(assoc_data->fils_kek, assoc_data->fils_kek_len,
325 encr, crypt_len, 5, addr, len, encr);
326 if (res != 0) {
327 mlme_dbg(sdata,
328 "AES-SIV decryption of (Re)Association Response frame from %pM failed",
329 mgmt->sa);
330 return res;
331 }
332 *frame_len -= AES_BLOCK_SIZE;
333 return 0;
334}
diff --git a/net/mac80211/fils_aead.h b/net/mac80211/fils_aead.h
new file mode 100644
index 000000000000..fbc65232f0b3
--- /dev/null
+++ b/net/mac80211/fils_aead.h
@@ -0,0 +1,19 @@
1/*
2 * FILS AEAD for (Re)Association Request/Response frames
3 * Copyright 2016, Qualcomm Atheros, Inc.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#ifndef FILS_AEAD_H
11#define FILS_AEAD_H
12
13int fils_encrypt_assoc_req(struct sk_buff *skb,
14 struct ieee80211_mgd_assoc_data *assoc_data);
15int fils_decrypt_assoc_resp(struct ieee80211_sub_if_data *sdata,
16 u8 *frame, size_t *frame_len,
17 struct ieee80211_mgd_assoc_data *assoc_data);
18
19#endif /* FILS_AEAD_H */
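These two prototypes are the only entry points into the new fils_aead.c above; the mlme.c hunk later in this diff includes this header and calls fils_encrypt_assoc_req() when building the (Re)Association Request and fils_decrypt_assoc_resp() before parsing the (Re)Association Response, in both cases only when assoc_data->fils_kek_len is non-zero.
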
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index a31d30713d08..98999d3d5262 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -487,14 +487,14 @@ int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata,
487 struct beacon_data *presp, *old_presp; 487 struct beacon_data *presp, *old_presp;
488 struct cfg80211_bss *cbss; 488 struct cfg80211_bss *cbss;
489 const struct cfg80211_bss_ies *ies; 489 const struct cfg80211_bss_ies *ies;
490 u16 capability = 0; 490 u16 capability = WLAN_CAPABILITY_IBSS;
491 u64 tsf; 491 u64 tsf;
492 int ret = 0; 492 int ret = 0;
493 493
494 sdata_assert_lock(sdata); 494 sdata_assert_lock(sdata);
495 495
496 if (ifibss->privacy) 496 if (ifibss->privacy)
497 capability = WLAN_CAPABILITY_PRIVACY; 497 capability |= WLAN_CAPABILITY_PRIVACY;
498 498
499 cbss = cfg80211_get_bss(sdata->local->hw.wiphy, ifibss->chandef.chan, 499 cbss = cfg80211_get_bss(sdata->local->hw.wiphy, ifibss->chandef.chan,
500 ifibss->bssid, ifibss->ssid, 500 ifibss->bssid, ifibss->ssid,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 34c2add2c455..0e718437d080 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -84,6 +84,8 @@ struct ieee80211_local;
84#define IEEE80211_DEFAULT_MAX_SP_LEN \ 84#define IEEE80211_DEFAULT_MAX_SP_LEN \
85 IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL 85 IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL
86 86
87extern const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS];
88
87#define IEEE80211_DEAUTH_FRAME_LEN (24 /* hdr */ + 2 /* reason */) 89#define IEEE80211_DEAUTH_FRAME_LEN (24 /* hdr */ + 2 /* reason */)
88 90
89#define IEEE80211_MAX_NAN_INSTANCE_ID 255 91#define IEEE80211_MAX_NAN_INSTANCE_ID 255
@@ -157,7 +159,7 @@ enum ieee80211_bss_valid_data_flags {
157 IEEE80211_BSS_VALID_ERP = BIT(3) 159 IEEE80211_BSS_VALID_ERP = BIT(3)
158}; 160};
159 161
160typedef unsigned __bitwise__ ieee80211_tx_result; 162typedef unsigned __bitwise ieee80211_tx_result;
161#define TX_CONTINUE ((__force ieee80211_tx_result) 0u) 163#define TX_CONTINUE ((__force ieee80211_tx_result) 0u)
162#define TX_DROP ((__force ieee80211_tx_result) 1u) 164#define TX_DROP ((__force ieee80211_tx_result) 1u)
163#define TX_QUEUED ((__force ieee80211_tx_result) 2u) 165#define TX_QUEUED ((__force ieee80211_tx_result) 2u)
@@ -178,7 +180,7 @@ struct ieee80211_tx_data {
178}; 180};
179 181
180 182
181typedef unsigned __bitwise__ ieee80211_rx_result; 183typedef unsigned __bitwise ieee80211_rx_result;
182#define RX_CONTINUE ((__force ieee80211_rx_result) 0u) 184#define RX_CONTINUE ((__force ieee80211_rx_result) 0u)
183#define RX_DROP_UNUSABLE ((__force ieee80211_rx_result) 1u) 185#define RX_DROP_UNUSABLE ((__force ieee80211_rx_result) 1u)
184#define RX_DROP_MONITOR ((__force ieee80211_rx_result) 2u) 186#define RX_DROP_MONITOR ((__force ieee80211_rx_result) 2u)
@@ -295,6 +297,7 @@ struct ieee80211_if_ap {
295 driver_smps_mode; /* smps mode request */ 297 driver_smps_mode; /* smps mode request */
296 298
297 struct work_struct request_smps_work; 299 struct work_struct request_smps_work;
300 bool multicast_to_unicast;
298}; 301};
299 302
300struct ieee80211_if_wds { 303struct ieee80211_if_wds {
@@ -307,6 +310,7 @@ struct ieee80211_if_vlan {
307 310
308 /* used for all tx if the VLAN is configured to 4-addr mode */ 311 /* used for all tx if the VLAN is configured to 4-addr mode */
309 struct sta_info __rcu *sta; 312 struct sta_info __rcu *sta;
313 atomic_t num_mcast_sta; /* number of stations receiving multicast */
310}; 314};
311 315
312struct mesh_stats { 316struct mesh_stats {
@@ -398,6 +402,10 @@ struct ieee80211_mgd_assoc_data {
398 402
399 struct ieee80211_vht_cap ap_vht_cap; 403 struct ieee80211_vht_cap ap_vht_cap;
400 404
405 u8 fils_nonces[2 * FILS_NONCE_LEN];
406 u8 fils_kek[FILS_MAX_KEK_LEN];
407 size_t fils_kek_len;
408
401 size_t ie_len; 409 size_t ie_len;
402 u8 ie[]; 410 u8 ie[];
403}; 411};
@@ -420,7 +428,7 @@ struct ieee80211_sta_tx_tspec {
420 bool downgraded; 428 bool downgraded;
421}; 429};
422 430
423DECLARE_EWMA(beacon_signal, 16, 4) 431DECLARE_EWMA(beacon_signal, 4, 4)
424 432
425struct ieee80211_if_managed { 433struct ieee80211_if_managed {
426 struct timer_list timer; 434 struct timer_list timer;
@@ -442,7 +450,7 @@ struct ieee80211_if_managed {
442 struct ieee80211_mgd_auth_data *auth_data; 450 struct ieee80211_mgd_auth_data *auth_data;
443 struct ieee80211_mgd_assoc_data *assoc_data; 451 struct ieee80211_mgd_assoc_data *assoc_data;
444 452
445 u8 bssid[ETH_ALEN]; 453 u8 bssid[ETH_ALEN] __aligned(2);
446 454
447 u16 aid; 455 u16 aid;
448 456
@@ -617,8 +625,8 @@ struct ieee80211_mesh_sync_ops {
617 struct ieee80211_rx_status *rx_status); 625 struct ieee80211_rx_status *rx_status);
618 626
619 /* should be called with beacon_data under RCU read lock */ 627 /* should be called with beacon_data under RCU read lock */
620 void (*adjust_tbtt)(struct ieee80211_sub_if_data *sdata, 628 void (*adjust_tsf)(struct ieee80211_sub_if_data *sdata,
621 struct beacon_data *beacon); 629 struct beacon_data *beacon);
622 /* add other framework functions here */ 630 /* add other framework functions here */
623}; 631};
624 632
@@ -681,7 +689,6 @@ struct ieee80211_if_mesh {
681 const struct ieee80211_mesh_sync_ops *sync_ops; 689 const struct ieee80211_mesh_sync_ops *sync_ops;
682 s64 sync_offset_clockdrift_max; 690 s64 sync_offset_clockdrift_max;
683 spinlock_t sync_offset_lock; 691 spinlock_t sync_offset_lock;
684 bool adjusting_tbtt;
685 /* mesh power save */ 692 /* mesh power save */
686 enum nl80211_mesh_power_mode nonpeer_pm; 693 enum nl80211_mesh_power_mode nonpeer_pm;
687 int ps_peers_light_sleep; 694 int ps_peers_light_sleep;
@@ -1527,6 +1534,23 @@ ieee80211_have_rx_timestamp(struct ieee80211_rx_status *status)
1527 return false; 1534 return false;
1528} 1535}
1529 1536
1537void ieee80211_vif_inc_num_mcast(struct ieee80211_sub_if_data *sdata);
1538void ieee80211_vif_dec_num_mcast(struct ieee80211_sub_if_data *sdata);
1539
1540/* This function returns the number of multicast stations connected to this
1541 * interface. It returns -1 if that number is not tracked, that is for netdevs
1542 * not in AP or AP_VLAN mode or when using 4addr.
1543 */
1544static inline int
1545ieee80211_vif_get_num_mcast_if(struct ieee80211_sub_if_data *sdata)
1546{
1547 if (sdata->vif.type == NL80211_IFTYPE_AP)
1548 return atomic_read(&sdata->u.ap.num_mcast_sta);
1549 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && !sdata->u.vlan.sta)
1550 return atomic_read(&sdata->u.vlan.num_mcast_sta);
1551 return -1;
1552}
1553
1530u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, 1554u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
1531 struct ieee80211_rx_status *status, 1555 struct ieee80211_rx_status *status,
1532 unsigned int mpdu_len, 1556 unsigned int mpdu_len,
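A hypothetical use of the new helper (editorial; vif_has_mcast_receivers() is not part of this hunk), illustrating the return-value convention spelled out in the comment: 0 means no multicast receivers, -1 means the count is simply not tracked for this interface type:

	static bool vif_has_mcast_receivers(struct ieee80211_sub_if_data *sdata)
	{
		int n = ieee80211_vif_get_num_mcast_if(sdata);

		/* -1: not tracked (not AP/AP_VLAN, or a 4-addr VLAN) -
		 * conservatively assume there may be receivers
		 */
		return n != 0;
	}
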
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 638ec0759078..5bb0c5012819 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -6,6 +6,7 @@
6 * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> 6 * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
7 * Copyright 2008, Johannes Berg <johannes@sipsolutions.net> 7 * Copyright 2008, Johannes Berg <johannes@sipsolutions.net>
8 * Copyright 2013-2014 Intel Mobile Communications GmbH 8 * Copyright 2013-2014 Intel Mobile Communications GmbH
9 * Copyright (c) 2016 Intel Deutschland GmbH
9 * 10 *
10 * This program is free software; you can redistribute it and/or modify 11 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as 12 * it under the terms of the GNU General Public License version 2 as
@@ -150,15 +151,6 @@ void ieee80211_recalc_idle(struct ieee80211_local *local)
150 ieee80211_hw_config(local, change); 151 ieee80211_hw_config(local, change);
151} 152}
152 153
153static int ieee80211_change_mtu(struct net_device *dev, int new_mtu)
154{
155 if (new_mtu < 256 || new_mtu > IEEE80211_MAX_DATA_LEN)
156 return -EINVAL;
157
158 dev->mtu = new_mtu;
159 return 0;
160}
161
162static int ieee80211_verify_mac(struct ieee80211_sub_if_data *sdata, u8 *addr, 154static int ieee80211_verify_mac(struct ieee80211_sub_if_data *sdata, u8 *addr,
163 bool check_dup) 155 bool check_dup)
164{ 156{
@@ -726,7 +718,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
726 ieee80211_recalc_ps(local); 718 ieee80211_recalc_ps(local);
727 719
728 if (sdata->vif.type == NL80211_IFTYPE_MONITOR || 720 if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
729 sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { 721 sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
722 local->ops->wake_tx_queue) {
730 /* XXX: for AP_VLAN, actually track AP queues */ 723 /* XXX: for AP_VLAN, actually track AP queues */
731 netif_tx_start_all_queues(dev); 724 netif_tx_start_all_queues(dev);
732 } else if (dev) { 725 } else if (dev) {
@@ -1131,7 +1124,7 @@ static u16 ieee80211_netdev_select_queue(struct net_device *dev,
1131 return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); 1124 return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb);
1132} 1125}
1133 1126
1134static struct rtnl_link_stats64 * 1127static void
1135ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) 1128ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
1136{ 1129{
1137 int i; 1130 int i;
@@ -1156,8 +1149,6 @@ ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
1156 stats->rx_bytes += rx_bytes; 1149 stats->rx_bytes += rx_bytes;
1157 stats->tx_bytes += tx_bytes; 1150 stats->tx_bytes += tx_bytes;
1158 } 1151 }
1159
1160 return stats;
1161} 1152}
1162 1153
1163static const struct net_device_ops ieee80211_dataif_ops = { 1154static const struct net_device_ops ieee80211_dataif_ops = {
@@ -1166,7 +1157,6 @@ static const struct net_device_ops ieee80211_dataif_ops = {
1166 .ndo_uninit = ieee80211_uninit, 1157 .ndo_uninit = ieee80211_uninit,
1167 .ndo_start_xmit = ieee80211_subif_start_xmit, 1158 .ndo_start_xmit = ieee80211_subif_start_xmit,
1168 .ndo_set_rx_mode = ieee80211_set_multicast_list, 1159 .ndo_set_rx_mode = ieee80211_set_multicast_list,
1169 .ndo_change_mtu = ieee80211_change_mtu,
1170 .ndo_set_mac_address = ieee80211_change_mac, 1160 .ndo_set_mac_address = ieee80211_change_mac,
1171 .ndo_select_queue = ieee80211_netdev_select_queue, 1161 .ndo_select_queue = ieee80211_netdev_select_queue,
1172 .ndo_get_stats64 = ieee80211_get_stats64, 1162 .ndo_get_stats64 = ieee80211_get_stats64,
@@ -1200,7 +1190,6 @@ static const struct net_device_ops ieee80211_monitorif_ops = {
1200 .ndo_uninit = ieee80211_uninit, 1190 .ndo_uninit = ieee80211_uninit,
1201 .ndo_start_xmit = ieee80211_monitor_start_xmit, 1191 .ndo_start_xmit = ieee80211_monitor_start_xmit,
1202 .ndo_set_rx_mode = ieee80211_set_multicast_list, 1192 .ndo_set_rx_mode = ieee80211_set_multicast_list,
1203 .ndo_change_mtu = ieee80211_change_mtu,
1204 .ndo_set_mac_address = ieee80211_change_mac, 1193 .ndo_set_mac_address = ieee80211_change_mac,
1205 .ndo_select_queue = ieee80211_monitor_select_queue, 1194 .ndo_select_queue = ieee80211_monitor_select_queue,
1206 .ndo_get_stats64 = ieee80211_get_stats64, 1195 .ndo_get_stats64 = ieee80211_get_stats64,
@@ -1306,6 +1295,26 @@ static void ieee80211_iface_work(struct work_struct *work)
1306 } else if (ieee80211_is_action(mgmt->frame_control) && 1295 } else if (ieee80211_is_action(mgmt->frame_control) &&
1307 mgmt->u.action.category == WLAN_CATEGORY_VHT) { 1296 mgmt->u.action.category == WLAN_CATEGORY_VHT) {
1308 switch (mgmt->u.action.u.vht_group_notif.action_code) { 1297 switch (mgmt->u.action.u.vht_group_notif.action_code) {
1298 case WLAN_VHT_ACTION_OPMODE_NOTIF: {
1299 struct ieee80211_rx_status *status;
1300 enum nl80211_band band;
1301 u8 opmode;
1302
1303 status = IEEE80211_SKB_RXCB(skb);
1304 band = status->band;
1305 opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode;
1306
1307 mutex_lock(&local->sta_mtx);
1308 sta = sta_info_get_bss(sdata, mgmt->sa);
1309
1310 if (sta)
1311 ieee80211_vht_handle_opmode(sdata, sta,
1312 opmode,
1313 band);
1314
1315 mutex_unlock(&local->sta_mtx);
1316 break;
1317 }
1309 case WLAN_VHT_ACTION_GROUPID_MGMT: 1318 case WLAN_VHT_ACTION_GROUPID_MGMT:
1310 ieee80211_process_mu_groups(sdata, mgmt); 1319 ieee80211_process_mu_groups(sdata, mgmt);
1311 break; 1320 break;
@@ -1884,6 +1893,10 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
1884 1893
1885 netdev_set_default_ethtool_ops(ndev, &ieee80211_ethtool_ops); 1894 netdev_set_default_ethtool_ops(ndev, &ieee80211_ethtool_ops);
1886 1895
1896 /* MTU range: 256 - 2304 */
1897 ndev->min_mtu = 256;
1898 ndev->max_mtu = IEEE80211_MAX_DATA_LEN;
1899
1887 ret = register_netdevice(ndev); 1900 ret = register_netdevice(ndev);
1888 if (ret) { 1901 if (ret) {
1889 ieee80211_if_free(ndev); 1902 ieee80211_if_free(ndev);
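The min_mtu/max_mtu range set here takes over the bounds check previously done in the ieee80211_change_mtu() handler removed earlier in this file; the limits themselves (256 to IEEE80211_MAX_DATA_LEN, i.e. 2304 per the comment) are unchanged, and the networking core now enforces them instead of a per-interface ndo_change_mtu callback.
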
@@ -2005,3 +2018,19 @@ void ieee80211_iface_exit(void)
2005{ 2018{
2006 unregister_netdevice_notifier(&mac80211_netdev_notifier); 2019 unregister_netdevice_notifier(&mac80211_netdev_notifier);
2007} 2020}
2021
2022void ieee80211_vif_inc_num_mcast(struct ieee80211_sub_if_data *sdata)
2023{
2024 if (sdata->vif.type == NL80211_IFTYPE_AP)
2025 atomic_inc(&sdata->u.ap.num_mcast_sta);
2026 else if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
2027 atomic_inc(&sdata->u.vlan.num_mcast_sta);
2028}
2029
2030void ieee80211_vif_dec_num_mcast(struct ieee80211_sub_if_data *sdata)
2031{
2032 if (sdata->vif.type == NL80211_IFTYPE_AP)
2033 atomic_dec(&sdata->u.ap.num_mcast_sta);
2034 else if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
2035 atomic_dec(&sdata->u.vlan.num_mcast_sta);
2036}
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index edd6f2945f69..a98fc2b5e0dc 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -265,7 +265,8 @@ static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata,
265 if (uni) { 265 if (uni) {
266 rcu_assign_pointer(sdata->default_unicast_key, key); 266 rcu_assign_pointer(sdata->default_unicast_key, key);
267 ieee80211_check_fast_xmit_iface(sdata); 267 ieee80211_check_fast_xmit_iface(sdata);
268 drv_set_default_unicast_key(sdata->local, sdata, idx); 268 if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN)
269 drv_set_default_unicast_key(sdata->local, sdata, idx);
269 } 270 }
270 271
271 if (multi) 272 if (multi)
diff --git a/net/mac80211/key.h b/net/mac80211/key.h
index 4aa20cef0859..ebdb80b85dc3 100644
--- a/net/mac80211/key.h
+++ b/net/mac80211/key.h
@@ -93,7 +93,7 @@ struct ieee80211_key {
93 } ccmp; 93 } ccmp;
94 struct { 94 struct {
95 u8 rx_pn[IEEE80211_CMAC_PN_LEN]; 95 u8 rx_pn[IEEE80211_CMAC_PN_LEN];
96 struct crypto_cipher *tfm; 96 struct crypto_shash *tfm;
97 u32 replays; /* dot11RSNAStatsCMACReplays */ 97 u32 replays; /* dot11RSNAStatsCMACReplays */
98 u32 icverrors; /* dot11RSNAStatsCMACICVErrors */ 98 u32 icverrors; /* dot11RSNAStatsCMACICVErrors */
99 } aes_cmac; 99 } aes_cmac;
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 1075ac24c8c5..56fb47953b72 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -549,6 +549,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
549 NL80211_FEATURE_MAC_ON_CREATE | 549 NL80211_FEATURE_MAC_ON_CREATE |
550 NL80211_FEATURE_USERSPACE_MPM | 550 NL80211_FEATURE_USERSPACE_MPM |
551 NL80211_FEATURE_FULL_AP_CLIENT_STATE; 551 NL80211_FEATURE_FULL_AP_CLIENT_STATE;
552 wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_FILS_STA);
552 553
553 if (!ops->hw_scan) 554 if (!ops->hw_scan)
554 wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN | 555 wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN |
@@ -821,6 +822,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
821 !local->ops->tdls_recv_channel_switch)) 822 !local->ops->tdls_recv_channel_switch))
822 return -EOPNOTSUPP; 823 return -EOPNOTSUPP;
823 824
825 if (WARN_ON(ieee80211_hw_check(hw, SUPPORTS_TX_FRAG) &&
826 !local->ops->set_frag_threshold))
827 return -EINVAL;
828
824 if (WARN_ON(local->hw.wiphy->interface_modes & 829 if (WARN_ON(local->hw.wiphy->interface_modes &
825 BIT(NL80211_IFTYPE_NAN) && 830 BIT(NL80211_IFTYPE_NAN) &&
826 (!local->ops->start_nan || !local->ops->stop_nan))) 831 (!local->ops->start_nan || !local->ops->stop_nan)))
@@ -908,12 +913,17 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
908 supp_ht = supp_ht || sband->ht_cap.ht_supported; 913 supp_ht = supp_ht || sband->ht_cap.ht_supported;
909 supp_vht = supp_vht || sband->vht_cap.vht_supported; 914 supp_vht = supp_vht || sband->vht_cap.vht_supported;
910 915
911 if (sband->ht_cap.ht_supported) 916 if (!sband->ht_cap.ht_supported)
912 local->rx_chains = 917 continue;
913 max(ieee80211_mcs_to_chains(&sband->ht_cap.mcs),
914 local->rx_chains);
915 918
916 /* TODO: consider VHT for RX chains, hopefully it's the same */ 919 /* TODO: consider VHT for RX chains, hopefully it's the same */
920 local->rx_chains =
921 max(ieee80211_mcs_to_chains(&sband->ht_cap.mcs),
922 local->rx_chains);
923
924 /* no need to mask, SM_PS_DISABLED has all bits set */
925 sband->ht_cap.cap |= WLAN_HT_CAP_SM_PS_DISABLED <<
926 IEEE80211_HT_CAP_SM_PS_SHIFT;
917 } 927 }
918 928
919 /* if low-level driver supports AP, we also support VLAN */ 929 /* if low-level driver supports AP, we also support VLAN */
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 42120d965263..6e7b6a07b7d5 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -279,10 +279,6 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata,
279 /* Mesh PS mode. See IEEE802.11-2012 8.4.2.100.8 */ 279 /* Mesh PS mode. See IEEE802.11-2012 8.4.2.100.8 */
280 *pos |= ifmsh->ps_peers_deep_sleep ? 280 *pos |= ifmsh->ps_peers_deep_sleep ?
281 IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL : 0x00; 281 IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL : 0x00;
282 *pos++ |= ifmsh->adjusting_tbtt ?
283 IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING : 0x00;
284 *pos++ = 0x00;
285
286 return 0; 282 return 0;
287} 283}
288 284
@@ -339,7 +335,7 @@ int mesh_add_vendor_ies(struct ieee80211_sub_if_data *sdata,
339 /* fast-forward to vendor IEs */ 335 /* fast-forward to vendor IEs */
340 offset = ieee80211_ie_split_vendor(ifmsh->ie, ifmsh->ie_len, 0); 336 offset = ieee80211_ie_split_vendor(ifmsh->ie, ifmsh->ie_len, 0);
341 337
342 if (offset) { 338 if (offset < ifmsh->ie_len) {
343 len = ifmsh->ie_len - offset; 339 len = ifmsh->ie_len - offset;
344 data = ifmsh->ie + offset; 340 data = ifmsh->ie + offset;
345 if (skb_tailroom(skb) < len) 341 if (skb_tailroom(skb) < len)
@@ -685,7 +681,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh)
685 2 + /* NULL SSID */ 681 2 + /* NULL SSID */
686 /* Channel Switch Announcement */ 682 /* Channel Switch Announcement */
687 2 + sizeof(struct ieee80211_channel_sw_ie) + 683 2 + sizeof(struct ieee80211_channel_sw_ie) +
688 /* Mesh Channel Swith Parameters */ 684 /* Mesh Channel Switch Parameters */
689 2 + sizeof(struct ieee80211_mesh_chansw_params_ie) + 685 2 + sizeof(struct ieee80211_mesh_chansw_params_ie) +
690 2 + 8 + /* supported rates */ 686 2 + 8 + /* supported rates */
691 2 + 3; /* DS params */ 687 2 + 3; /* DS params */
@@ -850,7 +846,6 @@ int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata)
850 ifmsh->mesh_cc_id = 0; /* Disabled */ 846 ifmsh->mesh_cc_id = 0; /* Disabled */
851 /* register sync ops from extensible synchronization framework */ 847 /* register sync ops from extensible synchronization framework */
852 ifmsh->sync_ops = ieee80211_mesh_sync_ops_get(ifmsh->mesh_sp_id); 848 ifmsh->sync_ops = ieee80211_mesh_sync_ops_get(ifmsh->mesh_sp_id);
853 ifmsh->adjusting_tbtt = false;
854 ifmsh->sync_offset_clockdrift_max = 0; 849 ifmsh->sync_offset_clockdrift_max = 0;
855 set_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags); 850 set_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags);
856 ieee80211_mesh_root_setup(ifmsh); 851 ieee80211_mesh_root_setup(ifmsh);
@@ -1349,7 +1344,7 @@ void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata)
1349 ieee80211_mesh_rootpath(sdata); 1344 ieee80211_mesh_rootpath(sdata);
1350 1345
1351 if (test_and_clear_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags)) 1346 if (test_and_clear_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags))
1352 mesh_sync_adjust_tbtt(sdata); 1347 mesh_sync_adjust_tsf(sdata);
1353 1348
1354 if (test_and_clear_bit(MESH_WORK_MBSS_CHANGED, &ifmsh->wrkq_flags)) 1349 if (test_and_clear_bit(MESH_WORK_MBSS_CHANGED, &ifmsh->wrkq_flags))
1355 mesh_bss_info_changed(sdata); 1350 mesh_bss_info_changed(sdata);
diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
index 26b9ccbe1fce..7e5f271e3c30 100644
--- a/net/mac80211/mesh.h
+++ b/net/mac80211/mesh.h
@@ -341,7 +341,7 @@ static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
341} 341}
342 342
343void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata); 343void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata);
344void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata); 344void mesh_sync_adjust_tsf(struct ieee80211_sub_if_data *sdata);
345void ieee80211s_stop(void); 345void ieee80211s_stop(void);
346#else 346#else
347static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata) 347static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 7fcdcf622655..953d71e784a9 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -9,6 +9,8 @@
9#include <linux/gfp.h> 9#include <linux/gfp.h>
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/random.h> 11#include <linux/random.h>
12#include <linux/rculist.h>
13
12#include "ieee80211_i.h" 14#include "ieee80211_i.h"
13#include "rate.h" 15#include "rate.h"
14#include "mesh.h" 16#include "mesh.h"
@@ -505,12 +507,14 @@ mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *addr,
505 507
506 /* Userspace handles station allocation */ 508 /* Userspace handles station allocation */
507 if (sdata->u.mesh.user_mpm || 509 if (sdata->u.mesh.user_mpm ||
508 sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED) 510 sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED) {
509 cfg80211_notify_new_peer_candidate(sdata->dev, addr, 511 if (mesh_peer_accepts_plinks(elems) &&
510 elems->ie_start, 512 mesh_plink_availables(sdata))
511 elems->total_len, 513 cfg80211_notify_new_peer_candidate(sdata->dev, addr,
512 GFP_KERNEL); 514 elems->ie_start,
513 else 515 elems->total_len,
516 GFP_KERNEL);
517 } else
514 sta = __mesh_sta_info_alloc(sdata, addr); 518 sta = __mesh_sta_info_alloc(sdata, addr);
515 519
516 return sta; 520 return sta;
diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c
index faca22cd02b5..a435f094a82e 100644
--- a/net/mac80211/mesh_sync.c
+++ b/net/mac80211/mesh_sync.c
@@ -12,7 +12,7 @@
12#include "mesh.h" 12#include "mesh.h"
13#include "driver-ops.h" 13#include "driver-ops.h"
14 14
15/* This is not in the standard. It represents a tolerable tbtt drift below 15/* This is not in the standard. It represents a tolerable tsf drift below
16 * which we do no TSF adjustment. 16 * which we do no TSF adjustment.
17 */ 17 */
18#define TOFFSET_MINIMUM_ADJUSTMENT 10 18#define TOFFSET_MINIMUM_ADJUSTMENT 10
@@ -46,7 +46,7 @@ static bool mesh_peer_tbtt_adjusting(struct ieee802_11_elems *ie)
46 IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING) != 0; 46 IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING) != 0;
47} 47}
48 48
49void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata) 49void mesh_sync_adjust_tsf(struct ieee80211_sub_if_data *sdata)
50{ 50{
51 struct ieee80211_local *local = sdata->local; 51 struct ieee80211_local *local = sdata->local;
52 struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; 52 struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
@@ -57,12 +57,12 @@ void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata)
57 57
58 spin_lock_bh(&ifmsh->sync_offset_lock); 58 spin_lock_bh(&ifmsh->sync_offset_lock);
59 if (ifmsh->sync_offset_clockdrift_max < beacon_int_fraction) { 59 if (ifmsh->sync_offset_clockdrift_max < beacon_int_fraction) {
60 msync_dbg(sdata, "TBTT : max clockdrift=%lld; adjusting\n", 60 msync_dbg(sdata, "TSF : max clockdrift=%lld; adjusting\n",
61 (long long) ifmsh->sync_offset_clockdrift_max); 61 (long long) ifmsh->sync_offset_clockdrift_max);
62 tsfdelta = -ifmsh->sync_offset_clockdrift_max; 62 tsfdelta = -ifmsh->sync_offset_clockdrift_max;
63 ifmsh->sync_offset_clockdrift_max = 0; 63 ifmsh->sync_offset_clockdrift_max = 0;
64 } else { 64 } else {
65 msync_dbg(sdata, "TBTT : max clockdrift=%lld; adjusting by %llu\n", 65 msync_dbg(sdata, "TSF : max clockdrift=%lld; adjusting by %llu\n",
66 (long long) ifmsh->sync_offset_clockdrift_max, 66 (long long) ifmsh->sync_offset_clockdrift_max,
67 (unsigned long long) beacon_int_fraction); 67 (unsigned long long) beacon_int_fraction);
68 tsfdelta = -beacon_int_fraction; 68 tsfdelta = -beacon_int_fraction;
@@ -123,7 +123,6 @@ static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
123 */ 123 */
124 124
125 if (elems->mesh_config && mesh_peer_tbtt_adjusting(elems)) { 125 if (elems->mesh_config && mesh_peer_tbtt_adjusting(elems)) {
126 clear_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN);
127 msync_dbg(sdata, "STA %pM : is adjusting TBTT\n", 126 msync_dbg(sdata, "STA %pM : is adjusting TBTT\n",
128 sta->sta.addr); 127 sta->sta.addr);
129 goto no_sync; 128 goto no_sync;
@@ -168,15 +167,13 @@ no_sync:
168 rcu_read_unlock(); 167 rcu_read_unlock();
169} 168}
170 169
171static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata, 170static void mesh_sync_offset_adjust_tsf(struct ieee80211_sub_if_data *sdata,
172 struct beacon_data *beacon) 171 struct beacon_data *beacon)
173{ 172{
174 struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; 173 struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
175 u8 cap;
176 174
177 WARN_ON(ifmsh->mesh_sp_id != IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET); 175 WARN_ON(ifmsh->mesh_sp_id != IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET);
178 WARN_ON(!rcu_read_lock_held()); 176 WARN_ON(!rcu_read_lock_held());
179 cap = beacon->meshconf->meshconf_cap;
180 177
181 spin_lock_bh(&ifmsh->sync_offset_lock); 178 spin_lock_bh(&ifmsh->sync_offset_lock);
182 179
@@ -187,24 +184,16 @@ static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata,
187 * the tsf adjustment to the mesh tasklet 184 * the tsf adjustment to the mesh tasklet
188 */ 185 */
189 msync_dbg(sdata, 186 msync_dbg(sdata,
190 "TBTT : kicking off TBTT adjustment with clockdrift_max=%lld\n", 187 "TSF : kicking off TSF adjustment with clockdrift_max=%lld\n",
191 ifmsh->sync_offset_clockdrift_max); 188 ifmsh->sync_offset_clockdrift_max);
192 set_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags); 189 set_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags);
193
194 ifmsh->adjusting_tbtt = true;
195 } else { 190 } else {
196 msync_dbg(sdata, 191 msync_dbg(sdata,
197 "TBTT : max clockdrift=%lld; too small to adjust\n", 192 "TSF : max clockdrift=%lld; too small to adjust\n",
198 (long long)ifmsh->sync_offset_clockdrift_max); 193 (long long)ifmsh->sync_offset_clockdrift_max);
199 ifmsh->sync_offset_clockdrift_max = 0; 194 ifmsh->sync_offset_clockdrift_max = 0;
200
201 ifmsh->adjusting_tbtt = false;
202 } 195 }
203 spin_unlock_bh(&ifmsh->sync_offset_lock); 196 spin_unlock_bh(&ifmsh->sync_offset_lock);
204
205 beacon->meshconf->meshconf_cap = ifmsh->adjusting_tbtt ?
206 IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING | cap :
207 ~IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING & cap;
208} 197}
209 198
210static const struct sync_method sync_methods[] = { 199static const struct sync_method sync_methods[] = {
@@ -212,7 +201,7 @@ static const struct sync_method sync_methods[] = {
212 .method = IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET, 201 .method = IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET,
213 .ops = { 202 .ops = {
214 .rx_bcn_presp = &mesh_sync_offset_rx_bcn_presp, 203 .rx_bcn_presp = &mesh_sync_offset_rx_bcn_presp,
215 .adjust_tbtt = &mesh_sync_offset_adjust_tbtt, 204 .adjust_tsf = &mesh_sync_offset_adjust_tsf,
216 } 205 }
217 }, 206 },
218}; 207};
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 7486f2dab4ba..6e90301154d5 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -30,6 +30,7 @@
30#include "driver-ops.h" 30#include "driver-ops.h"
31#include "rate.h" 31#include "rate.h"
32#include "led.h" 32#include "led.h"
33#include "fils_aead.h"
33 34
34#define IEEE80211_AUTH_TIMEOUT (HZ / 5) 35#define IEEE80211_AUTH_TIMEOUT (HZ / 5)
35#define IEEE80211_AUTH_TIMEOUT_LONG (HZ / 2) 36#define IEEE80211_AUTH_TIMEOUT_LONG (HZ / 2)
@@ -652,6 +653,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
652 2 + sizeof(struct ieee80211_ht_cap) + /* HT */ 653 2 + sizeof(struct ieee80211_ht_cap) + /* HT */
653 2 + sizeof(struct ieee80211_vht_cap) + /* VHT */ 654 2 + sizeof(struct ieee80211_vht_cap) + /* VHT */
654 assoc_data->ie_len + /* extra IEs */ 655 assoc_data->ie_len + /* extra IEs */
656 (assoc_data->fils_kek_len ? 16 /* AES-SIV */ : 0) +
655 9, /* WMM */ 657 9, /* WMM */
656 GFP_KERNEL); 658 GFP_KERNEL);
657 if (!skb) 659 if (!skb)
@@ -875,6 +877,12 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
875 memcpy(pos, assoc_data->ie + offset, noffset - offset); 877 memcpy(pos, assoc_data->ie + offset, noffset - offset);
876 } 878 }
877 879
880 if (assoc_data->fils_kek_len &&
881 fils_encrypt_assoc_req(skb, assoc_data) < 0) {
882 dev_kfree_skb(skb);
883 return;
884 }
885
878 drv_mgd_prepare_tx(local, sdata); 886 drv_mgd_prepare_tx(local, sdata);
879 887
880 IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; 888 IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
@@ -1478,10 +1486,6 @@ void ieee80211_recalc_ps(struct ieee80211_local *local)
1478 1486
1479 if (count == 1 && ieee80211_powersave_allowed(found)) { 1487 if (count == 1 && ieee80211_powersave_allowed(found)) {
1480 u8 dtimper = found->u.mgd.dtim_period; 1488 u8 dtimper = found->u.mgd.dtim_period;
1481 s32 beaconint_us;
1482
1483 beaconint_us = ieee80211_tu_to_usec(
1484 found->vif.bss_conf.beacon_int);
1485 1489
1486 timeout = local->dynamic_ps_forced_timeout; 1490 timeout = local->dynamic_ps_forced_timeout;
1487 if (timeout < 0) 1491 if (timeout < 0)
@@ -2510,7 +2514,7 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata,
2510} 2514}
2511 2515
2512static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, 2516static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata,
2513 bool assoc) 2517 bool assoc, bool abandon)
2514{ 2518{
2515 struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data; 2519 struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data;
2516 2520
@@ -2533,6 +2537,9 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata,
2533 mutex_lock(&sdata->local->mtx); 2537 mutex_lock(&sdata->local->mtx);
2534 ieee80211_vif_release_channel(sdata); 2538 ieee80211_vif_release_channel(sdata);
2535 mutex_unlock(&sdata->local->mtx); 2539 mutex_unlock(&sdata->local->mtx);
2540
2541 if (abandon)
2542 cfg80211_abandon_assoc(sdata->dev, assoc_data->bss);
2536 } 2543 }
2537 2544
2538 kfree(assoc_data); 2545 kfree(assoc_data);
@@ -2618,6 +2625,9 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
2618 case WLAN_AUTH_LEAP: 2625 case WLAN_AUTH_LEAP:
2619 case WLAN_AUTH_FT: 2626 case WLAN_AUTH_FT:
2620 case WLAN_AUTH_SAE: 2627 case WLAN_AUTH_SAE:
2628 case WLAN_AUTH_FILS_SK:
2629 case WLAN_AUTH_FILS_SK_PFS:
2630 case WLAN_AUTH_FILS_PK:
2621 break; 2631 break;
2622 case WLAN_AUTH_SHARED_KEY: 2632 case WLAN_AUTH_SHARED_KEY:
2623 if (ifmgd->auth_data->expected_transaction != 4) { 2633 if (ifmgd->auth_data->expected_transaction != 4) {
@@ -2762,7 +2772,7 @@ static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata,
2762 bssid, reason_code, 2772 bssid, reason_code,
2763 ieee80211_get_reason_code_string(reason_code)); 2773 ieee80211_get_reason_code_string(reason_code));
2764 2774
2765 ieee80211_destroy_assoc_data(sdata, false); 2775 ieee80211_destroy_assoc_data(sdata, false, true);
2766 2776
2767 cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); 2777 cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len);
2768 return; 2778 return;
@@ -3143,6 +3153,10 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
3143 reassoc ? "Rea" : "A", mgmt->sa, 3153 reassoc ? "Rea" : "A", mgmt->sa,
3144 capab_info, status_code, (u16)(aid & ~(BIT(15) | BIT(14)))); 3154 capab_info, status_code, (u16)(aid & ~(BIT(15) | BIT(14))));
3145 3155
3156 if (assoc_data->fils_kek_len &&
3157 fils_decrypt_assoc_resp(sdata, (u8 *)mgmt, &len, assoc_data) < 0)
3158 return;
3159
3146 pos = mgmt->u.assoc_resp.variable; 3160 pos = mgmt->u.assoc_resp.variable;
3147 ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), false, &elems); 3161 ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), false, &elems);
3148 3162
@@ -3167,14 +3181,14 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
3167 if (status_code != WLAN_STATUS_SUCCESS) { 3181 if (status_code != WLAN_STATUS_SUCCESS) {
3168 sdata_info(sdata, "%pM denied association (code=%d)\n", 3182 sdata_info(sdata, "%pM denied association (code=%d)\n",
3169 mgmt->sa, status_code); 3183 mgmt->sa, status_code);
3170 ieee80211_destroy_assoc_data(sdata, false); 3184 ieee80211_destroy_assoc_data(sdata, false, false);
3171 event.u.mlme.status = MLME_DENIED; 3185 event.u.mlme.status = MLME_DENIED;
3172 event.u.mlme.reason = status_code; 3186 event.u.mlme.reason = status_code;
3173 drv_event_callback(sdata->local, sdata, &event); 3187 drv_event_callback(sdata->local, sdata, &event);
3174 } else { 3188 } else {
3175 if (!ieee80211_assoc_success(sdata, bss, mgmt, len)) { 3189 if (!ieee80211_assoc_success(sdata, bss, mgmt, len)) {
3176 /* oops -- internal error -- send timeout for now */ 3190 /* oops -- internal error -- send timeout for now */
3177 ieee80211_destroy_assoc_data(sdata, false); 3191 ieee80211_destroy_assoc_data(sdata, false, false);
3178 cfg80211_assoc_timeout(sdata->dev, bss); 3192 cfg80211_assoc_timeout(sdata->dev, bss);
3179 return; 3193 return;
3180 } 3194 }
@@ -3187,13 +3201,13 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
3187 * recalc after assoc_data is NULL but before associated 3201 * recalc after assoc_data is NULL but before associated
3188 * is set can cause the interface to go idle 3202 * is set can cause the interface to go idle
3189 */ 3203 */
3190 ieee80211_destroy_assoc_data(sdata, true); 3204 ieee80211_destroy_assoc_data(sdata, true, false);
3191 3205
3192 /* get uapsd queues configuration */ 3206 /* get uapsd queues configuration */
3193 uapsd_queues = 0; 3207 uapsd_queues = 0;
3194 for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) 3208 for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
3195 if (sdata->tx_conf[ac].uapsd) 3209 if (sdata->tx_conf[ac].uapsd)
3196 uapsd_queues |= BIT(ac); 3210 uapsd_queues |= ieee80211_ac_to_qos_mask[ac];
3197 } 3211 }
3198 3212
3199 cfg80211_rx_assoc_resp(sdata->dev, bss, (u8 *)mgmt, len, uapsd_queues); 3213 cfg80211_rx_assoc_resp(sdata->dev, bss, (u8 *)mgmt, len, uapsd_queues);
@@ -3405,14 +3419,14 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
3405 ieee80211_cqm_rssi_notify( 3419 ieee80211_cqm_rssi_notify(
3406 &sdata->vif, 3420 &sdata->vif,
3407 NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW, 3421 NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW,
3408 GFP_KERNEL); 3422 sig, GFP_KERNEL);
3409 } else if (sig > thold && 3423 } else if (sig > thold &&
3410 (last_event == 0 || sig > last_event + hyst)) { 3424 (last_event == 0 || sig > last_event + hyst)) {
3411 ifmgd->last_cqm_event_signal = sig; 3425 ifmgd->last_cqm_event_signal = sig;
3412 ieee80211_cqm_rssi_notify( 3426 ieee80211_cqm_rssi_notify(
3413 &sdata->vif, 3427 &sdata->vif,
3414 NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH, 3428 NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH,
3415 GFP_KERNEL); 3429 sig, GFP_KERNEL);
3416 } 3430 }
3417 } 3431 }
3418 3432
@@ -3886,7 +3900,7 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata)
3886 .u.mlme.status = MLME_TIMEOUT, 3900 .u.mlme.status = MLME_TIMEOUT,
3887 }; 3901 };
3888 3902
3889 ieee80211_destroy_assoc_data(sdata, false); 3903 ieee80211_destroy_assoc_data(sdata, false, false);
3890 cfg80211_assoc_timeout(sdata->dev, bss); 3904 cfg80211_assoc_timeout(sdata->dev, bss);
3891 drv_event_callback(sdata->local, sdata, &event); 3905 drv_event_callback(sdata->local, sdata, &event);
3892 } 3906 }
@@ -4025,7 +4039,7 @@ void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata)
4025 WLAN_REASON_DEAUTH_LEAVING, 4039 WLAN_REASON_DEAUTH_LEAVING,
4026 false, frame_buf); 4040 false, frame_buf);
4027 if (ifmgd->assoc_data) 4041 if (ifmgd->assoc_data)
4028 ieee80211_destroy_assoc_data(sdata, false); 4042 ieee80211_destroy_assoc_data(sdata, false, true);
4029 if (ifmgd->auth_data) 4043 if (ifmgd->auth_data)
4030 ieee80211_destroy_auth_data(sdata, false); 4044 ieee80211_destroy_auth_data(sdata, false);
4031 cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf, 4045 cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf,
@@ -4479,24 +4493,36 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
4479 case NL80211_AUTHTYPE_SAE: 4493 case NL80211_AUTHTYPE_SAE:
4480 auth_alg = WLAN_AUTH_SAE; 4494 auth_alg = WLAN_AUTH_SAE;
4481 break; 4495 break;
4496 case NL80211_AUTHTYPE_FILS_SK:
4497 auth_alg = WLAN_AUTH_FILS_SK;
4498 break;
4499 case NL80211_AUTHTYPE_FILS_SK_PFS:
4500 auth_alg = WLAN_AUTH_FILS_SK_PFS;
4501 break;
4502 case NL80211_AUTHTYPE_FILS_PK:
4503 auth_alg = WLAN_AUTH_FILS_PK;
4504 break;
4482 default: 4505 default:
4483 return -EOPNOTSUPP; 4506 return -EOPNOTSUPP;
4484 } 4507 }
4485 4508
4486 auth_data = kzalloc(sizeof(*auth_data) + req->sae_data_len + 4509 auth_data = kzalloc(sizeof(*auth_data) + req->auth_data_len +
4487 req->ie_len, GFP_KERNEL); 4510 req->ie_len, GFP_KERNEL);
4488 if (!auth_data) 4511 if (!auth_data)
4489 return -ENOMEM; 4512 return -ENOMEM;
4490 4513
4491 auth_data->bss = req->bss; 4514 auth_data->bss = req->bss;
4492 4515
4493 if (req->sae_data_len >= 4) { 4516 if (req->auth_data_len >= 4) {
4494 __le16 *pos = (__le16 *) req->sae_data; 4517 if (req->auth_type == NL80211_AUTHTYPE_SAE) {
4495 auth_data->sae_trans = le16_to_cpu(pos[0]); 4518 __le16 *pos = (__le16 *) req->auth_data;
4496 auth_data->sae_status = le16_to_cpu(pos[1]); 4519
4497 memcpy(auth_data->data, req->sae_data + 4, 4520 auth_data->sae_trans = le16_to_cpu(pos[0]);
4498 req->sae_data_len - 4); 4521 auth_data->sae_status = le16_to_cpu(pos[1]);
4499 auth_data->data_len += req->sae_data_len - 4; 4522 }
4523 memcpy(auth_data->data, req->auth_data + 4,
4524 req->auth_data_len - 4);
4525 auth_data->data_len += req->auth_data_len - 4;
4500 } 4526 }
4501 4527
4502 if (req->ie && req->ie_len) { 4528 if (req->ie && req->ie_len) {
@@ -4692,6 +4718,21 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
4692 assoc_data->ie_len = req->ie_len; 4718 assoc_data->ie_len = req->ie_len;
4693 } 4719 }
4694 4720
4721 if (req->fils_kek) {
4722 /* should already be checked in cfg80211 - so warn */
4723 if (WARN_ON(req->fils_kek_len > FILS_MAX_KEK_LEN)) {
4724 err = -EINVAL;
4725 goto err_free;
4726 }
4727 memcpy(assoc_data->fils_kek, req->fils_kek,
4728 req->fils_kek_len);
4729 assoc_data->fils_kek_len = req->fils_kek_len;
4730 }
4731
4732 if (req->fils_nonces)
4733 memcpy(assoc_data->fils_nonces, req->fils_nonces,
4734 2 * FILS_NONCE_LEN);
4735
4695 assoc_data->bss = req->bss; 4736 assoc_data->bss = req->bss;
4696 4737
4697 if (ifmgd->req_smps == IEEE80211_SMPS_AUTOMATIC) { 4738 if (ifmgd->req_smps == IEEE80211_SMPS_AUTOMATIC) {
@@ -4907,7 +4948,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
4907 IEEE80211_STYPE_DEAUTH, 4948 IEEE80211_STYPE_DEAUTH,
4908 req->reason_code, tx, 4949 req->reason_code, tx,
4909 frame_buf); 4950 frame_buf);
4910 ieee80211_destroy_assoc_data(sdata, false); 4951 ieee80211_destroy_assoc_data(sdata, false, true);
4911 ieee80211_report_disconnect(sdata, frame_buf, 4952 ieee80211_report_disconnect(sdata, frame_buf,
4912 sizeof(frame_buf), true, 4953 sizeof(frame_buf), true,
4913 req->reason_code); 4954 req->reason_code);
@@ -4982,7 +5023,7 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata)
4982 sdata_lock(sdata); 5023 sdata_lock(sdata);
4983 if (ifmgd->assoc_data) { 5024 if (ifmgd->assoc_data) {
4984 struct cfg80211_bss *bss = ifmgd->assoc_data->bss; 5025 struct cfg80211_bss *bss = ifmgd->assoc_data->bss;
4985 ieee80211_destroy_assoc_data(sdata, false); 5026 ieee80211_destroy_assoc_data(sdata, false, false);
4986 cfg80211_assoc_timeout(sdata->dev, bss); 5027 cfg80211_assoc_timeout(sdata->dev, bss);
4987 } 5028 }
4988 if (ifmgd->auth_data) 5029 if (ifmgd->auth_data)
@@ -5000,13 +5041,14 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata)
5000 5041
5001void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif, 5042void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif,
5002 enum nl80211_cqm_rssi_threshold_event rssi_event, 5043 enum nl80211_cqm_rssi_threshold_event rssi_event,
5044 s32 rssi_level,
5003 gfp_t gfp) 5045 gfp_t gfp)
5004{ 5046{
5005 struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); 5047 struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
5006 5048
5007 trace_api_cqm_rssi_notify(sdata, rssi_event); 5049 trace_api_cqm_rssi_notify(sdata, rssi_event, rssi_level);
5008 5050
5009 cfg80211_cqm_rssi_notify(sdata->dev, rssi_event, gfp); 5051 cfg80211_cqm_rssi_notify(sdata->dev, rssi_event, rssi_level, gfp);
5010} 5052}
5011EXPORT_SYMBOL(ieee80211_cqm_rssi_notify); 5053EXPORT_SYMBOL(ieee80211_cqm_rssi_notify);
5012 5054
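A hypothetical driver-side call after this API change (editorial sketch; example_report_low_rssi() is not part of the patch), showing the new rssi_level argument that is now forwarded to cfg80211 alongside the threshold event:

	static void example_report_low_rssi(struct ieee80211_vif *vif,
					    s32 level_dbm)
	{
		ieee80211_cqm_rssi_notify(vif,
					  NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW,
					  level_dbm, GFP_ATOMIC);
	}
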
diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c
index 28a3a0957c9e..76a8bcd8ef11 100644
--- a/net/mac80211/pm.c
+++ b/net/mac80211/pm.c
@@ -168,6 +168,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan)
168 break; 168 break;
169 } 169 }
170 170
171 flush_delayed_work(&sdata->dec_tailroom_needed_wk);
171 drv_remove_interface(local, sdata); 172 drv_remove_interface(local, sdata);
172 } 173 }
173 174
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index 14c5ba3a1b1c..3ebe4405a2d4 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -159,21 +159,23 @@ minstrel_update_rates(struct minstrel_priv *mp, struct minstrel_sta_info *mi)
159void 159void
160minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs) 160minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs)
161{ 161{
162 unsigned int cur_prob;
163
162 if (unlikely(mrs->attempts > 0)) { 164 if (unlikely(mrs->attempts > 0)) {
163 mrs->sample_skipped = 0; 165 mrs->sample_skipped = 0;
164 mrs->cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts); 166 cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts);
165 if (unlikely(!mrs->att_hist)) { 167 if (unlikely(!mrs->att_hist)) {
166 mrs->prob_ewma = mrs->cur_prob; 168 mrs->prob_ewma = cur_prob;
167 } else { 169 } else {
168 /* update exponential weighted moving variance */ 170 /* update exponential weighted moving variance */
169 mrs->prob_ewmsd = minstrel_ewmsd(mrs->prob_ewmsd, 171 mrs->prob_ewmv = minstrel_ewmv(mrs->prob_ewmv,
170 mrs->cur_prob, 172 cur_prob,
171 mrs->prob_ewma, 173 mrs->prob_ewma,
172 EWMA_LEVEL); 174 EWMA_LEVEL);
173 175
174 /* update exponential weighted moving average */ 176 /* update exponential weighted moving average */
175 mrs->prob_ewma = minstrel_ewma(mrs->prob_ewma, 177 mrs->prob_ewma = minstrel_ewma(mrs->prob_ewma,
176 mrs->cur_prob, 178 cur_prob,
177 EWMA_LEVEL); 179 EWMA_LEVEL);
178 } 180 }
179 mrs->att_hist += mrs->attempts; 181 mrs->att_hist += mrs->attempts;
@@ -365,6 +367,11 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta,
365 return; 367 return;
366#endif 368#endif
367 369
370 /* Don't use EAPOL frames for sampling on non-mrr hw */
371 if (mp->hw->max_rates == 1 &&
372 (info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO))
373 return;
374
368 delta = (mi->total_packets * sampling_ratio / 100) - 375 delta = (mi->total_packets * sampling_ratio / 100) -
369 (mi->sample_packets + mi->sample_deferred / 2); 376 (mi->sample_packets + mi->sample_deferred / 2);
370 377
diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h
index c230bbe93262..be6c3f35f48b 100644
--- a/net/mac80211/rc80211_minstrel.h
+++ b/net/mac80211/rc80211_minstrel.h
@@ -14,7 +14,7 @@
14#define SAMPLE_COLUMNS 10 /* number of columns in sample table */ 14#define SAMPLE_COLUMNS 10 /* number of columns in sample table */
15 15
16/* scaled fraction values */ 16/* scaled fraction values */
17#define MINSTREL_SCALE 16 17#define MINSTREL_SCALE 12
18#define MINSTREL_FRAC(val, div) (((val) << MINSTREL_SCALE) / div) 18#define MINSTREL_FRAC(val, div) (((val) << MINSTREL_SCALE) / div)
19#define MINSTREL_TRUNC(val) ((val) >> MINSTREL_SCALE) 19#define MINSTREL_TRUNC(val) ((val) >> MINSTREL_SCALE)
20 20
@@ -36,21 +36,16 @@ minstrel_ewma(int old, int new, int weight)
36} 36}
37 37
38/* 38/*
39 * Perform EWMSD (Exponentially Weighted Moving Standard Deviation) calculation 39 * Perform EWMV (Exponentially Weighted Moving Variance) calculation
40 */ 40 */
41static inline int 41static inline int
42minstrel_ewmsd(int old_ewmsd, int cur_prob, int prob_ewma, int weight) 42minstrel_ewmv(int old_ewmv, int cur_prob, int prob_ewma, int weight)
43{ 43{
44 int diff, incr, tmp_var; 44 int diff, incr;
45 45
46 /* calculate exponential weighted moving variance */ 46 diff = cur_prob - prob_ewma;
47 diff = MINSTREL_TRUNC((cur_prob - prob_ewma) * 1000000);
48 incr = (EWMA_DIV - weight) * diff / EWMA_DIV; 47 incr = (EWMA_DIV - weight) * diff / EWMA_DIV;
49 tmp_var = old_ewmsd * old_ewmsd; 48 return weight * (old_ewmv + MINSTREL_TRUNC(diff * incr)) / EWMA_DIV;
50 tmp_var = weight * (tmp_var + diff * incr / 1000000) / EWMA_DIV;
51
52 /* return standard deviation */
53 return (u16) int_sqrt(tmp_var);
54} 49}
55 50
56struct minstrel_rate_stats { 51struct minstrel_rate_stats {
@@ -59,15 +54,13 @@ struct minstrel_rate_stats {
59 u16 success, last_success; 54 u16 success, last_success;
60 55
61 /* total attempts/success counters */ 56 /* total attempts/success counters */
62 u64 att_hist, succ_hist; 57 u32 att_hist, succ_hist;
63 58
64 /* statistics of packet delivery probability 59 /* statistics of packet delivery probability
65 * cur_prob - current prob within last update intervall
66 * prob_ewma - exponential weighted moving average of prob 60 * prob_ewma - exponential weighted moving average of prob
67 * prob_ewmsd - exp. weighted moving standard deviation of prob */ 61 * prob_ewmsd - exp. weighted moving standard deviation of prob */
68 unsigned int cur_prob; 62 u16 prob_ewma;
69 unsigned int prob_ewma; 63 u16 prob_ewmv;
70 u16 prob_ewmsd;
71 64
72 /* maximum retry counts */ 65 /* maximum retry counts */
73 u8 retry_count; 66 u8 retry_count;
@@ -153,6 +146,14 @@ struct minstrel_debugfs_info {
153 char buf[]; 146 char buf[];
154}; 147};
155 148
149/* Get EWMSD (Exponentially Weighted Moving Standard Deviation) * 10 */
150static inline int
151minstrel_get_ewmsd10(struct minstrel_rate_stats *mrs)
152{
153 unsigned int ewmv = mrs->prob_ewmv;
154 return int_sqrt(MINSTREL_TRUNC(ewmv * 1000 * 1000));
155}
156
156extern const struct rate_control_ops mac80211_minstrel; 157extern const struct rate_control_ops mac80211_minstrel;
157void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); 158void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir);
158void minstrel_remove_sta_debugfs(void *priv, void *priv_sta); 159void minstrel_remove_sta_debugfs(void *priv, void *priv_sta);
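A quick worked check of the new fixed-point bookkeeping (editorial, assuming MINSTREL_SCALE = 12 as set above): probabilities and prob_ewmv are stored so that 1.0 corresponds to 4096, so a true delivery-probability variance of 0.01 is kept as prob_ewmv ~= 41. minstrel_get_ewmsd10() then returns int_sqrt((41 * 1000 * 1000) >> 12) = int_sqrt(10009) ~= 100, which the debugfs readers below print as a standard deviation of 10.0 (percent).
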
diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c
index 820b0abc9c0d..36fc971deb86 100644
--- a/net/mac80211/rc80211_minstrel_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_debugfs.c
@@ -75,7 +75,7 @@ minstrel_stats_open(struct inode *inode, struct file *file)
75{ 75{
76 struct minstrel_sta_info *mi = inode->i_private; 76 struct minstrel_sta_info *mi = inode->i_private;
77 struct minstrel_debugfs_info *ms; 77 struct minstrel_debugfs_info *ms;
78 unsigned int i, tp_max, tp_avg, prob, eprob; 78 unsigned int i, tp_max, tp_avg, eprob;
79 char *p; 79 char *p;
80 80
81 ms = kmalloc(2048, GFP_KERNEL); 81 ms = kmalloc(2048, GFP_KERNEL);
@@ -86,13 +86,14 @@ minstrel_stats_open(struct inode *inode, struct file *file)
86 p = ms->buf; 86 p = ms->buf;
87 p += sprintf(p, "\n"); 87 p += sprintf(p, "\n");
88 p += sprintf(p, 88 p += sprintf(p,
89 "best __________rate_________ ________statistics________ ________last_______ ______sum-of________\n"); 89 "best __________rate_________ ________statistics________ ____last_____ ______sum-of________\n");
90 p += sprintf(p, 90 p += sprintf(p,
91 "rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [prob.|retry|suc|att] [#success | #attempts]\n"); 91 "rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [retry|suc|att] [#success | #attempts]\n");
92 92
93 for (i = 0; i < mi->n_rates; i++) { 93 for (i = 0; i < mi->n_rates; i++) {
94 struct minstrel_rate *mr = &mi->r[i]; 94 struct minstrel_rate *mr = &mi->r[i];
95 struct minstrel_rate_stats *mrs = &mi->r[i].stats; 95 struct minstrel_rate_stats *mrs = &mi->r[i].stats;
96 unsigned int prob_ewmsd;
96 97
97 *(p++) = (i == mi->max_tp_rate[0]) ? 'A' : ' '; 98 *(p++) = (i == mi->max_tp_rate[0]) ? 'A' : ' ';
98 *(p++) = (i == mi->max_tp_rate[1]) ? 'B' : ' '; 99 *(p++) = (i == mi->max_tp_rate[1]) ? 'B' : ' ';
@@ -107,17 +108,16 @@ minstrel_stats_open(struct inode *inode, struct file *file)
107 108
108 tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); 109 tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100));
109 tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); 110 tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma);
110 prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
111 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); 111 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
112 prob_ewmsd = minstrel_get_ewmsd10(mrs);
112 113
113 p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u" 114 p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u"
114 " %3u.%1u %3u %3u %-3u " 115 " %3u %3u %-3u "
115 "%9llu %-9llu\n", 116 "%9llu %-9llu\n",
116 tp_max / 10, tp_max % 10, 117 tp_max / 10, tp_max % 10,
117 tp_avg / 10, tp_avg % 10, 118 tp_avg / 10, tp_avg % 10,
118 eprob / 10, eprob % 10, 119 eprob / 10, eprob % 10,
119 mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10, 120 prob_ewmsd / 10, prob_ewmsd % 10,
120 prob / 10, prob % 10,
121 mrs->retry_count, 121 mrs->retry_count,
122 mrs->last_success, 122 mrs->last_success,
123 mrs->last_attempts, 123 mrs->last_attempts,
@@ -148,7 +148,7 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
148{ 148{
149 struct minstrel_sta_info *mi = inode->i_private; 149 struct minstrel_sta_info *mi = inode->i_private;
150 struct minstrel_debugfs_info *ms; 150 struct minstrel_debugfs_info *ms;
151 unsigned int i, tp_max, tp_avg, prob, eprob; 151 unsigned int i, tp_max, tp_avg, eprob;
152 char *p; 152 char *p;
153 153
154 ms = kmalloc(2048, GFP_KERNEL); 154 ms = kmalloc(2048, GFP_KERNEL);
@@ -161,6 +161,7 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
161 for (i = 0; i < mi->n_rates; i++) { 161 for (i = 0; i < mi->n_rates; i++) {
162 struct minstrel_rate *mr = &mi->r[i]; 162 struct minstrel_rate *mr = &mi->r[i];
163 struct minstrel_rate_stats *mrs = &mi->r[i].stats; 163 struct minstrel_rate_stats *mrs = &mi->r[i].stats;
164 unsigned int prob_ewmsd;
164 165
165 p += sprintf(p, "%s" ,((i == mi->max_tp_rate[0]) ? "A" : "")); 166 p += sprintf(p, "%s" ,((i == mi->max_tp_rate[0]) ? "A" : ""));
166 p += sprintf(p, "%s" ,((i == mi->max_tp_rate[1]) ? "B" : "")); 167 p += sprintf(p, "%s" ,((i == mi->max_tp_rate[1]) ? "B" : ""));
@@ -175,16 +176,15 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
175 176
176 tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); 177 tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100));
177 tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); 178 tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma);
178 prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
179 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); 179 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
180 prob_ewmsd = minstrel_get_ewmsd10(mrs);
180 181
181 p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,%u," 182 p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,%u,"
182 "%llu,%llu,%d,%d\n", 183 "%llu,%llu,%d,%d\n",
183 tp_max / 10, tp_max % 10, 184 tp_max / 10, tp_max % 10,
184 tp_avg / 10, tp_avg % 10, 185 tp_avg / 10, tp_avg % 10,
185 eprob / 10, eprob % 10, 186 eprob / 10, eprob % 10,
186 mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10, 187 prob_ewmsd / 10, prob_ewmsd % 10,
187 prob / 10, prob % 10,
188 mrs->retry_count, 188 mrs->retry_count,
189 mrs->last_success, 189 mrs->last_success,
190 mrs->last_attempts, 190 mrs->last_attempts,
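Editor's note on the debugfs hunks above: they stop printing the instantaneous cur_prob column and read the probability standard deviation through a minstrel_get_ewmsd10() helper instead of the cached mrs->prob_ewmsd field. As a rough, standalone illustration of that idea (floating point, not the kernel's fixed-point code; the 1/8 smoothing weight is an assumption for the example), the sketch below keeps an exponentially weighted mean and variance of a success probability and derives the deviation on demand, scaled by 10 as in the tables printed above.

/* Illustrative sketch only: EWMA/EWMV bookkeeping with the standard
 * deviation computed on demand (scaled by 10); not the mac80211 code. */
#include <math.h>
#include <stdio.h>

#define EWMA_WEIGHT 0.125	/* assumed smoothing weight for this example */

struct prob_stats {
	double ewma;	/* smoothed success probability, 0..1 */
	double ewmv;	/* smoothed variance of that probability */
};

static void prob_stats_update(struct prob_stats *ps, double sample)
{
	double diff = sample - ps->ewma;

	ps->ewma += EWMA_WEIGHT * diff;
	ps->ewmv = (1.0 - EWMA_WEIGHT) * (ps->ewmv + EWMA_WEIGHT * diff * diff);
}

/* standard deviation in tenths of a percent, like the sd(prob) column */
static unsigned int prob_stats_ewmsd10(const struct prob_stats *ps)
{
	return (unsigned int)(sqrt(ps->ewmv) * 1000.0);
}

int main(void)
{
	struct prob_stats ps = { .ewma = 0.5, .ewmv = 0.0 };
	double samples[] = { 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0 };
	unsigned int i, sd10;

	for (i = 0; i < 8; i++)
		prob_stats_update(&ps, samples[i]);

	sd10 = prob_stats_ewmsd10(&ps);
	printf("avg(prob) %u.%u%%  sd(prob) %u.%u%%\n",
	       (unsigned int)(ps.ewma * 1000) / 10,
	       (unsigned int)(ps.ewma * 1000) % 10,
	       sd10 / 10, sd10 % 10);
	return 0;
}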
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 30fbabf4bcbc..8e783e197e93 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -14,6 +14,7 @@
14#include <linux/ieee80211.h> 14#include <linux/ieee80211.h>
15#include <net/mac80211.h> 15#include <net/mac80211.h>
16#include "rate.h" 16#include "rate.h"
17#include "sta_info.h"
17#include "rc80211_minstrel.h" 18#include "rc80211_minstrel.h"
18#include "rc80211_minstrel_ht.h" 19#include "rc80211_minstrel_ht.h"
19 20
@@ -154,67 +155,47 @@ MODULE_PARM_DESC(minstrel_vht_only,
154const struct mcs_group minstrel_mcs_groups[] = { 155const struct mcs_group minstrel_mcs_groups[] = {
155 MCS_GROUP(1, 0, BW_20), 156 MCS_GROUP(1, 0, BW_20),
156 MCS_GROUP(2, 0, BW_20), 157 MCS_GROUP(2, 0, BW_20),
157#if MINSTREL_MAX_STREAMS >= 3
158 MCS_GROUP(3, 0, BW_20), 158 MCS_GROUP(3, 0, BW_20),
159#endif
160 159
161 MCS_GROUP(1, 1, BW_20), 160 MCS_GROUP(1, 1, BW_20),
162 MCS_GROUP(2, 1, BW_20), 161 MCS_GROUP(2, 1, BW_20),
163#if MINSTREL_MAX_STREAMS >= 3
164 MCS_GROUP(3, 1, BW_20), 162 MCS_GROUP(3, 1, BW_20),
165#endif
166 163
167 MCS_GROUP(1, 0, BW_40), 164 MCS_GROUP(1, 0, BW_40),
168 MCS_GROUP(2, 0, BW_40), 165 MCS_GROUP(2, 0, BW_40),
169#if MINSTREL_MAX_STREAMS >= 3
170 MCS_GROUP(3, 0, BW_40), 166 MCS_GROUP(3, 0, BW_40),
171#endif
172 167
173 MCS_GROUP(1, 1, BW_40), 168 MCS_GROUP(1, 1, BW_40),
174 MCS_GROUP(2, 1, BW_40), 169 MCS_GROUP(2, 1, BW_40),
175#if MINSTREL_MAX_STREAMS >= 3
176 MCS_GROUP(3, 1, BW_40), 170 MCS_GROUP(3, 1, BW_40),
177#endif
178 171
179 CCK_GROUP, 172 CCK_GROUP,
180 173
181#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT 174#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
182 VHT_GROUP(1, 0, BW_20), 175 VHT_GROUP(1, 0, BW_20),
183 VHT_GROUP(2, 0, BW_20), 176 VHT_GROUP(2, 0, BW_20),
184#if MINSTREL_MAX_STREAMS >= 3
185 VHT_GROUP(3, 0, BW_20), 177 VHT_GROUP(3, 0, BW_20),
186#endif
187 178
188 VHT_GROUP(1, 1, BW_20), 179 VHT_GROUP(1, 1, BW_20),
189 VHT_GROUP(2, 1, BW_20), 180 VHT_GROUP(2, 1, BW_20),
190#if MINSTREL_MAX_STREAMS >= 3
191 VHT_GROUP(3, 1, BW_20), 181 VHT_GROUP(3, 1, BW_20),
192#endif
193 182
194 VHT_GROUP(1, 0, BW_40), 183 VHT_GROUP(1, 0, BW_40),
195 VHT_GROUP(2, 0, BW_40), 184 VHT_GROUP(2, 0, BW_40),
196#if MINSTREL_MAX_STREAMS >= 3
197 VHT_GROUP(3, 0, BW_40), 185 VHT_GROUP(3, 0, BW_40),
198#endif
199 186
200 VHT_GROUP(1, 1, BW_40), 187 VHT_GROUP(1, 1, BW_40),
201 VHT_GROUP(2, 1, BW_40), 188 VHT_GROUP(2, 1, BW_40),
202#if MINSTREL_MAX_STREAMS >= 3
203 VHT_GROUP(3, 1, BW_40), 189 VHT_GROUP(3, 1, BW_40),
204#endif
205 190
206 VHT_GROUP(1, 0, BW_80), 191 VHT_GROUP(1, 0, BW_80),
207 VHT_GROUP(2, 0, BW_80), 192 VHT_GROUP(2, 0, BW_80),
208#if MINSTREL_MAX_STREAMS >= 3
209 VHT_GROUP(3, 0, BW_80), 193 VHT_GROUP(3, 0, BW_80),
210#endif
211 194
212 VHT_GROUP(1, 1, BW_80), 195 VHT_GROUP(1, 1, BW_80),
213 VHT_GROUP(2, 1, BW_80), 196 VHT_GROUP(2, 1, BW_80),
214#if MINSTREL_MAX_STREAMS >= 3
215 VHT_GROUP(3, 1, BW_80), 197 VHT_GROUP(3, 1, BW_80),
216#endif 198#endif
217#endif
218}; 199};
219 200
220static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES] __read_mostly; 201static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES] __read_mostly;
@@ -301,7 +282,7 @@ minstrel_ht_get_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
301 break; 282 break;
302 283
303 /* short preamble */ 284 /* short preamble */
304 if (!(mi->groups[group].supported & BIT(idx))) 285 if (!(mi->supported[group] & BIT(idx)))
305 idx += 4; 286 idx += 4;
306 } 287 }
307 return &mi->groups[group].rates[idx]; 288 return &mi->groups[group].rates[idx];
@@ -486,7 +467,7 @@ minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi)
486 MCS_GROUP_RATES].streams; 467 MCS_GROUP_RATES].streams;
487 for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { 468 for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) {
488 mg = &mi->groups[group]; 469 mg = &mi->groups[group];
489 if (!mg->supported || group == MINSTREL_CCK_GROUP) 470 if (!mi->supported[group] || group == MINSTREL_CCK_GROUP)
490 continue; 471 continue;
491 472
492 tmp_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; 473 tmp_idx = mg->max_group_prob_rate % MCS_GROUP_RATES;
@@ -540,7 +521,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
540 for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { 521 for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) {
541 522
542 mg = &mi->groups[group]; 523 mg = &mi->groups[group];
543 if (!mg->supported) 524 if (!mi->supported[group])
544 continue; 525 continue;
545 526
546 mi->sample_count++; 527 mi->sample_count++;
@@ -550,7 +531,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
550 tmp_group_tp_rate[j] = group; 531 tmp_group_tp_rate[j] = group;
551 532
552 for (i = 0; i < MCS_GROUP_RATES; i++) { 533 for (i = 0; i < MCS_GROUP_RATES; i++) {
553 if (!(mg->supported & BIT(i))) 534 if (!(mi->supported[group] & BIT(i)))
554 continue; 535 continue;
555 536
556 index = MCS_GROUP_RATES * group + i; 537 index = MCS_GROUP_RATES * group + i;
@@ -636,7 +617,7 @@ minstrel_set_next_sample_idx(struct minstrel_ht_sta *mi)
636 mi->sample_group %= ARRAY_SIZE(minstrel_mcs_groups); 617 mi->sample_group %= ARRAY_SIZE(minstrel_mcs_groups);
637 mg = &mi->groups[mi->sample_group]; 618 mg = &mi->groups[mi->sample_group];
638 619
639 if (!mg->supported) 620 if (!mi->supported[mi->sample_group])
640 continue; 621 continue;
641 622
642 if (++mg->index >= MCS_GROUP_RATES) { 623 if (++mg->index >= MCS_GROUP_RATES) {
@@ -657,7 +638,7 @@ minstrel_downgrade_rate(struct minstrel_ht_sta *mi, u16 *idx, bool primary)
657 while (group > 0) { 638 while (group > 0) {
658 group--; 639 group--;
659 640
660 if (!mi->groups[group].supported) 641 if (!mi->supported[group])
661 continue; 642 continue;
662 643
663 if (minstrel_mcs_groups[group].streams > 644 if (minstrel_mcs_groups[group].streams >
@@ -994,7 +975,7 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
994 sample_idx = sample_table[mg->column][mg->index]; 975 sample_idx = sample_table[mg->column][mg->index];
995 minstrel_set_next_sample_idx(mi); 976 minstrel_set_next_sample_idx(mi);
996 977
997 if (!(mg->supported & BIT(sample_idx))) 978 if (!(mi->supported[sample_group] & BIT(sample_idx)))
998 return -1; 979 return -1;
999 980
1000 mrs = &mg->rates[sample_idx]; 981 mrs = &mg->rates[sample_idx];
@@ -1049,22 +1030,6 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
1049} 1030}
1050 1031
1051static void 1032static void
1052minstrel_ht_check_cck_shortpreamble(struct minstrel_priv *mp,
1053 struct minstrel_ht_sta *mi, bool val)
1054{
1055 u8 supported = mi->groups[MINSTREL_CCK_GROUP].supported;
1056
1057 if (!supported || !mi->cck_supported_short)
1058 return;
1059
1060 if (supported & (mi->cck_supported_short << (val * 4)))
1061 return;
1062
1063 supported ^= mi->cck_supported_short | (mi->cck_supported_short << 4);
1064 mi->groups[MINSTREL_CCK_GROUP].supported = supported;
1065}
1066
1067static void
1068minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, 1033minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta,
1069 struct ieee80211_tx_rate_control *txrc) 1034 struct ieee80211_tx_rate_control *txrc)
1070{ 1035{
@@ -1087,7 +1052,6 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta,
1087 minstrel_aggr_check(sta, txrc->skb); 1052 minstrel_aggr_check(sta, txrc->skb);
1088 1053
1089 info->flags |= mi->tx_flags; 1054 info->flags |= mi->tx_flags;
1090 minstrel_ht_check_cck_shortpreamble(mp, mi, txrc->short_preamble);
1091 1055
1092#ifdef CONFIG_MAC80211_DEBUGFS 1056#ifdef CONFIG_MAC80211_DEBUGFS
1093 if (mp->fixed_rate_idx != -1) 1057 if (mp->fixed_rate_idx != -1)
@@ -1154,7 +1118,7 @@ minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
1154 mi->cck_supported_short |= BIT(i); 1118 mi->cck_supported_short |= BIT(i);
1155 } 1119 }
1156 1120
1157 mi->groups[MINSTREL_CCK_GROUP].supported = mi->cck_supported; 1121 mi->supported[MINSTREL_CCK_GROUP] = mi->cck_supported;
1158} 1122}
1159 1123
1160static void 1124static void
@@ -1168,6 +1132,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
1168 struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs; 1132 struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs;
1169 u16 sta_cap = sta->ht_cap.cap; 1133 u16 sta_cap = sta->ht_cap.cap;
1170 struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap; 1134 struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap;
1135 struct sta_info *sinfo = container_of(sta, struct sta_info, sta);
1171 int use_vht; 1136 int use_vht;
1172 int n_supported = 0; 1137 int n_supported = 0;
1173 int ack_dur; 1138 int ack_dur;
@@ -1224,7 +1189,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
1224 u32 gflags = minstrel_mcs_groups[i].flags; 1189 u32 gflags = minstrel_mcs_groups[i].flags;
1225 int bw, nss; 1190 int bw, nss;
1226 1191
1227 mi->groups[i].supported = 0; 1192 mi->supported[i] = 0;
1228 if (i == MINSTREL_CCK_GROUP) { 1193 if (i == MINSTREL_CCK_GROUP) {
1229 minstrel_ht_update_cck(mp, mi, sband, sta); 1194 minstrel_ht_update_cck(mp, mi, sband, sta);
1230 continue; 1195 continue;
@@ -1256,8 +1221,8 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
1256 if (use_vht && minstrel_vht_only) 1221 if (use_vht && minstrel_vht_only)
1257 continue; 1222 continue;
1258#endif 1223#endif
1259 mi->groups[i].supported = mcs->rx_mask[nss - 1]; 1224 mi->supported[i] = mcs->rx_mask[nss - 1];
1260 if (mi->groups[i].supported) 1225 if (mi->supported[i])
1261 n_supported++; 1226 n_supported++;
1262 continue; 1227 continue;
1263 } 1228 }
@@ -1283,16 +1248,19 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
1283 else 1248 else
1284 bw = BW_20; 1249 bw = BW_20;
1285 1250
1286 mi->groups[i].supported = minstrel_get_valid_vht_rates(bw, nss, 1251 mi->supported[i] = minstrel_get_valid_vht_rates(bw, nss,
1287 vht_cap->vht_mcs.tx_mcs_map); 1252 vht_cap->vht_mcs.tx_mcs_map);
1288 1253
1289 if (mi->groups[i].supported) 1254 if (mi->supported[i])
1290 n_supported++; 1255 n_supported++;
1291 } 1256 }
1292 1257
1293 if (!n_supported) 1258 if (!n_supported)
1294 goto use_legacy; 1259 goto use_legacy;
1295 1260
1261 if (test_sta_flag(sinfo, WLAN_STA_SHORT_PREAMBLE))
1262 mi->cck_supported_short |= mi->cck_supported_short << 4;
1263
1296 /* create an initial rate table with the lowest supported rates */ 1264 /* create an initial rate table with the lowest supported rates */
1297 minstrel_ht_update_stats(mp, mi); 1265 minstrel_ht_update_stats(mp, mi);
1298 minstrel_ht_update_rates(mp, mi); 1266 minstrel_ht_update_rates(mp, mi);
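Editor's note on the rc80211_minstrel_ht.c hunks: they consistently replace mi->groups[group].supported with a flat per-station array mi->supported[group], so the supported-rate masks for all MCS groups live in one contiguous block. A minimal sketch of the access pattern those hunks rely on (struct name, group and rate counts are placeholders, not mac80211's real values):

/* Sketch of a per-station array of per-group rate masks. */
#include <stdint.h>
#include <stdio.h>

#define N_GROUPS	8
#define RATES_PER_GROUP	8

struct example_sta {
	uint16_t supported[N_GROUPS];	/* bit i set => rate i usable in that group */
};

static void walk_supported(const struct example_sta *sta)
{
	unsigned int group, i;

	for (group = 0; group < N_GROUPS; group++) {
		if (!sta->supported[group])	/* whole group unusable, skip it */
			continue;

		for (i = 0; i < RATES_PER_GROUP; i++) {
			if (!(sta->supported[group] & (1u << i)))
				continue;

			printf("group %u rate %u usable\n", group, i);
		}
	}
}

int main(void)
{
	struct example_sta sta = { .supported = { 0x00ff, 0, 0x000f } };

	walk_supported(&sta);
	return 0;
}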
diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h
index e8b52a94d24b..de1646c42e82 100644
--- a/net/mac80211/rc80211_minstrel_ht.h
+++ b/net/mac80211/rc80211_minstrel_ht.h
@@ -52,9 +52,6 @@ struct minstrel_mcs_group_data {
52 u8 index; 52 u8 index;
53 u8 column; 53 u8 column;
54 54
55 /* bitfield of supported MCS rates of this group */
56 u16 supported;
57
58 /* sorted rate set within a MCS group*/ 55 /* sorted rate set within a MCS group*/
59 u16 max_group_tp_rate[MAX_THR_RATES]; 56 u16 max_group_tp_rate[MAX_THR_RATES];
60 u16 max_group_prob_rate; 57 u16 max_group_prob_rate;
@@ -101,6 +98,9 @@ struct minstrel_ht_sta {
101 u8 cck_supported; 98 u8 cck_supported;
102 u8 cck_supported_short; 99 u8 cck_supported_short;
103 100
101 /* Bitfield of supported MCS rates of all groups */
102 u16 supported[MINSTREL_GROUPS_NB];
103
104 /* MCS rate group info and statistics */ 104 /* MCS rate group info and statistics */
105 struct minstrel_mcs_group_data groups[MINSTREL_GROUPS_NB]; 105 struct minstrel_mcs_group_data groups[MINSTREL_GROUPS_NB];
106}; 106};
diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c
index 5320e35ed3d0..7d969e300fb3 100644
--- a/net/mac80211/rc80211_minstrel_ht_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c
@@ -19,12 +19,12 @@ static char *
19minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) 19minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
20{ 20{
21 const struct mcs_group *mg; 21 const struct mcs_group *mg;
22 unsigned int j, tp_max, tp_avg, prob, eprob, tx_time; 22 unsigned int j, tp_max, tp_avg, eprob, tx_time;
23 char htmode = '2'; 23 char htmode = '2';
24 char gimode = 'L'; 24 char gimode = 'L';
25 u32 gflags; 25 u32 gflags;
26 26
27 if (!mi->groups[i].supported) 27 if (!mi->supported[i])
28 return p; 28 return p;
29 29
30 mg = &minstrel_mcs_groups[i]; 30 mg = &minstrel_mcs_groups[i];
@@ -41,8 +41,9 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
41 struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j]; 41 struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j];
42 static const int bitrates[4] = { 10, 20, 55, 110 }; 42 static const int bitrates[4] = { 10, 20, 55, 110 };
43 int idx = i * MCS_GROUP_RATES + j; 43 int idx = i * MCS_GROUP_RATES + j;
44 unsigned int prob_ewmsd;
44 45
45 if (!(mi->groups[i].supported & BIT(j))) 46 if (!(mi->supported[i] & BIT(j)))
46 continue; 47 continue;
47 48
48 if (gflags & IEEE80211_TX_RC_MCS) { 49 if (gflags & IEEE80211_TX_RC_MCS) {
@@ -83,17 +84,16 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
83 84
84 tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); 85 tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100));
85 tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); 86 tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma);
86 prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
87 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); 87 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
88 prob_ewmsd = minstrel_get_ewmsd10(mrs);
88 89
89 p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u" 90 p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u"
90 " %3u.%1u %3u %3u %-3u " 91 " %3u %3u %-3u "
91 "%9llu %-9llu\n", 92 "%9llu %-9llu\n",
92 tp_max / 10, tp_max % 10, 93 tp_max / 10, tp_max % 10,
93 tp_avg / 10, tp_avg % 10, 94 tp_avg / 10, tp_avg % 10,
94 eprob / 10, eprob % 10, 95 eprob / 10, eprob % 10,
95 mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10, 96 prob_ewmsd / 10, prob_ewmsd % 10,
96 prob / 10, prob % 10,
97 mrs->retry_count, 97 mrs->retry_count,
98 mrs->last_success, 98 mrs->last_success,
99 mrs->last_attempts, 99 mrs->last_attempts,
@@ -130,9 +130,9 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file)
130 130
131 p += sprintf(p, "\n"); 131 p += sprintf(p, "\n");
132 p += sprintf(p, 132 p += sprintf(p,
133 " best ____________rate__________ ________statistics________ ________last_______ ______sum-of________\n"); 133 " best ____________rate__________ ________statistics________ _____last____ ______sum-of________\n");
134 p += sprintf(p, 134 p += sprintf(p,
135 "mode guard # rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [prob.|retry|suc|att] [#success | #attempts]\n"); 135 "mode guard # rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [retry|suc|att] [#success | #attempts]\n");
136 136
137 p = minstrel_ht_stats_dump(mi, MINSTREL_CCK_GROUP, p); 137 p = minstrel_ht_stats_dump(mi, MINSTREL_CCK_GROUP, p);
138 for (i = 0; i < MINSTREL_CCK_GROUP; i++) 138 for (i = 0; i < MINSTREL_CCK_GROUP; i++)
@@ -165,12 +165,12 @@ static char *
165minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p) 165minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
166{ 166{
167 const struct mcs_group *mg; 167 const struct mcs_group *mg;
168 unsigned int j, tp_max, tp_avg, prob, eprob, tx_time; 168 unsigned int j, tp_max, tp_avg, eprob, tx_time;
169 char htmode = '2'; 169 char htmode = '2';
170 char gimode = 'L'; 170 char gimode = 'L';
171 u32 gflags; 171 u32 gflags;
172 172
173 if (!mi->groups[i].supported) 173 if (!mi->supported[i])
174 return p; 174 return p;
175 175
176 mg = &minstrel_mcs_groups[i]; 176 mg = &minstrel_mcs_groups[i];
@@ -187,8 +187,9 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
187 struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j]; 187 struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j];
188 static const int bitrates[4] = { 10, 20, 55, 110 }; 188 static const int bitrates[4] = { 10, 20, 55, 110 };
189 int idx = i * MCS_GROUP_RATES + j; 189 int idx = i * MCS_GROUP_RATES + j;
190 unsigned int prob_ewmsd;
190 191
191 if (!(mi->groups[i].supported & BIT(j))) 192 if (!(mi->supported[i] & BIT(j)))
192 continue; 193 continue;
193 194
194 if (gflags & IEEE80211_TX_RC_MCS) { 195 if (gflags & IEEE80211_TX_RC_MCS) {
@@ -226,16 +227,15 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
226 227
227 tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); 228 tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100));
228 tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); 229 tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma);
229 prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
230 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); 230 eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
231 prob_ewmsd = minstrel_get_ewmsd10(mrs);
231 232
232 p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u.%u,%u,%u," 233 p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,"
233 "%u,%llu,%llu,", 234 "%u,%llu,%llu,",
234 tp_max / 10, tp_max % 10, 235 tp_max / 10, tp_max % 10,
235 tp_avg / 10, tp_avg % 10, 236 tp_avg / 10, tp_avg % 10,
236 eprob / 10, eprob % 10, 237 eprob / 10, eprob % 10,
237 mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10, 238 prob_ewmsd / 10, prob_ewmsd % 10,
238 prob / 10, prob % 10,
239 mrs->retry_count, 239 mrs->retry_count,
240 mrs->last_success, 240 mrs->last_success,
241 mrs->last_attempts, 241 mrs->last_attempts,
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index a47bbc973f2d..4d7543d1a62c 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -4,7 +4,7 @@
4 * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> 4 * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
5 * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> 5 * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net>
6 * Copyright 2013-2014 Intel Mobile Communications GmbH 6 * Copyright 2013-2014 Intel Mobile Communications GmbH
7 * Copyright(c) 2015 - 2016 Intel Deutschland GmbH 7 * Copyright(c) 2015 - 2017 Intel Deutschland GmbH
8 * 8 *
9 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as 10 * it under the terms of the GNU General Public License version 2 as
@@ -208,6 +208,51 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
208 return len; 208 return len;
209} 209}
210 210
211static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata,
212 struct sk_buff *skb,
213 int rtap_vendor_space)
214{
215 struct {
216 struct ieee80211_hdr_3addr hdr;
217 u8 category;
218 u8 action_code;
219 } __packed action;
220
221 if (!sdata)
222 return;
223
224 BUILD_BUG_ON(sizeof(action) != IEEE80211_MIN_ACTION_SIZE + 1);
225
226 if (skb->len < rtap_vendor_space + sizeof(action) +
227 VHT_MUMIMO_GROUPS_DATA_LEN)
228 return;
229
230 if (!is_valid_ether_addr(sdata->u.mntr.mu_follow_addr))
231 return;
232
233 skb_copy_bits(skb, rtap_vendor_space, &action, sizeof(action));
234
235 if (!ieee80211_is_action(action.hdr.frame_control))
236 return;
237
238 if (action.category != WLAN_CATEGORY_VHT)
239 return;
240
241 if (action.action_code != WLAN_VHT_ACTION_GROUPID_MGMT)
242 return;
243
244 if (!ether_addr_equal(action.hdr.addr1, sdata->u.mntr.mu_follow_addr))
245 return;
246
247 skb = skb_copy(skb, GFP_ATOMIC);
248 if (!skb)
249 return;
250
251 skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
252 skb_queue_tail(&sdata->skb_queue, skb);
253 ieee80211_queue_work(&sdata->local->hw, &sdata->work);
254}
255
211/* 256/*
212 * ieee80211_add_rx_radiotap_header - add radiotap header 257 * ieee80211_add_rx_radiotap_header - add radiotap header
213 * 258 *
@@ -515,7 +560,6 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
515 struct net_device *prev_dev = NULL; 560 struct net_device *prev_dev = NULL;
516 int present_fcs_len = 0; 561 int present_fcs_len = 0;
517 unsigned int rtap_vendor_space = 0; 562 unsigned int rtap_vendor_space = 0;
518 struct ieee80211_mgmt *mgmt;
519 struct ieee80211_sub_if_data *monitor_sdata = 563 struct ieee80211_sub_if_data *monitor_sdata =
520 rcu_dereference(local->monitor_sdata); 564 rcu_dereference(local->monitor_sdata);
521 565
@@ -553,6 +597,8 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
553 return remove_monitor_info(local, origskb, rtap_vendor_space); 597 return remove_monitor_info(local, origskb, rtap_vendor_space);
554 } 598 }
555 599
600 ieee80211_handle_mu_mimo_mon(monitor_sdata, origskb, rtap_vendor_space);
601
556 /* room for the radiotap header based on driver features */ 602 /* room for the radiotap header based on driver features */
557 rt_hdrlen = ieee80211_rx_radiotap_hdrlen(local, status, origskb); 603 rt_hdrlen = ieee80211_rx_radiotap_hdrlen(local, status, origskb);
558 needed_headroom = rt_hdrlen - rtap_vendor_space; 604 needed_headroom = rt_hdrlen - rtap_vendor_space;
@@ -618,23 +664,6 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
618 ieee80211_rx_stats(sdata->dev, skb->len); 664 ieee80211_rx_stats(sdata->dev, skb->len);
619 } 665 }
620 666
621 mgmt = (void *)skb->data;
622 if (monitor_sdata &&
623 skb->len >= IEEE80211_MIN_ACTION_SIZE + 1 + VHT_MUMIMO_GROUPS_DATA_LEN &&
624 ieee80211_is_action(mgmt->frame_control) &&
625 mgmt->u.action.category == WLAN_CATEGORY_VHT &&
626 mgmt->u.action.u.vht_group_notif.action_code == WLAN_VHT_ACTION_GROUPID_MGMT &&
627 is_valid_ether_addr(monitor_sdata->u.mntr.mu_follow_addr) &&
628 ether_addr_equal(mgmt->da, monitor_sdata->u.mntr.mu_follow_addr)) {
629 struct sk_buff *mu_skb = skb_copy(skb, GFP_ATOMIC);
630
631 if (mu_skb) {
632 mu_skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
633 skb_queue_tail(&monitor_sdata->skb_queue, mu_skb);
634 ieee80211_queue_work(&local->hw, &monitor_sdata->work);
635 }
636 }
637
638 if (prev_dev) { 667 if (prev_dev) {
639 skb->dev = prev_dev; 668 skb->dev = prev_dev;
640 netif_receive_skb(skb); 669 netif_receive_skb(skb);
@@ -1034,6 +1063,18 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata
1034 buf_size = tid_agg_rx->buf_size; 1063 buf_size = tid_agg_rx->buf_size;
1035 head_seq_num = tid_agg_rx->head_seq_num; 1064 head_seq_num = tid_agg_rx->head_seq_num;
1036 1065
1066 /*
1067 * If the current MPDU's SN is smaller than the SSN, it shouldn't
1068 * be reordered.
1069 */
1070 if (unlikely(!tid_agg_rx->started)) {
1071 if (ieee80211_sn_less(mpdu_seq_num, head_seq_num)) {
1072 ret = false;
1073 goto out;
1074 }
1075 tid_agg_rx->started = true;
1076 }
1077
1037 /* frame with out of date sequence number */ 1078 /* frame with out of date sequence number */
1038 if (ieee80211_sn_less(mpdu_seq_num, head_seq_num)) { 1079 if (ieee80211_sn_less(mpdu_seq_num, head_seq_num)) {
1039 dev_kfree_skb(skb); 1080 dev_kfree_skb(skb);
@@ -1391,16 +1432,18 @@ EXPORT_SYMBOL(ieee80211_sta_pspoll);
1391void ieee80211_sta_uapsd_trigger(struct ieee80211_sta *pubsta, u8 tid) 1432void ieee80211_sta_uapsd_trigger(struct ieee80211_sta *pubsta, u8 tid)
1392{ 1433{
1393 struct sta_info *sta = container_of(pubsta, struct sta_info, sta); 1434 struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
1394 u8 ac = ieee802_1d_to_ac[tid & 7]; 1435 int ac = ieee80211_ac_from_tid(tid);
1395 1436
1396 /* 1437 /*
1397 * If this AC is not trigger-enabled do nothing. 1438 * If this AC is not trigger-enabled do nothing unless the
1439 * driver is calling us after it already checked.
1398 * 1440 *
1399 * NB: This could/should check a separate bitmap of trigger- 1441 * NB: This could/should check a separate bitmap of trigger-
1400 * enabled queues, but for now we only implement uAPSD w/o 1442 * enabled queues, but for now we only implement uAPSD w/o
1401 * TSPEC changes to the ACs, so they're always the same. 1443 * TSPEC changes to the ACs, so they're always the same.
1402 */ 1444 */
1403 if (!(sta->sta.uapsd_queues & BIT(ac))) 1445 if (!(sta->sta.uapsd_queues & ieee80211_ac_to_qos_mask[ac]) &&
1446 tid != IEEE80211_NUM_TIDS)
1404 return; 1447 return;
1405 1448
1406 /* if we are in a service period, do nothing */ 1449 /* if we are in a service period, do nothing */
@@ -1906,7 +1949,6 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
1906 unsigned int frag, seq; 1949 unsigned int frag, seq;
1907 struct ieee80211_fragment_entry *entry; 1950 struct ieee80211_fragment_entry *entry;
1908 struct sk_buff *skb; 1951 struct sk_buff *skb;
1909 struct ieee80211_rx_status *status;
1910 1952
1911 hdr = (struct ieee80211_hdr *)rx->skb->data; 1953 hdr = (struct ieee80211_hdr *)rx->skb->data;
1912 fc = hdr->frame_control; 1954 fc = hdr->frame_control;
@@ -2032,9 +2074,6 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
2032 dev_kfree_skb(skb); 2074 dev_kfree_skb(skb);
2033 } 2075 }
2034 2076
2035 /* Complete frame has been reassembled - process it now */
2036 status = IEEE80211_SKB_RXCB(rx->skb);
2037
2038 out: 2077 out:
2039 ieee80211_led_rx(rx->local); 2078 ieee80211_led_rx(rx->local);
2040 out_no_led: 2079 out_no_led:
@@ -2215,7 +2254,8 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
2215 sdata->vif.type == NL80211_IFTYPE_AP_VLAN) && 2254 sdata->vif.type == NL80211_IFTYPE_AP_VLAN) &&
2216 !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) && 2255 !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) &&
2217 (sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->u.vlan.sta)) { 2256 (sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->u.vlan.sta)) {
2218 if (is_multicast_ether_addr(ehdr->h_dest)) { 2257 if (is_multicast_ether_addr(ehdr->h_dest) &&
2258 ieee80211_vif_get_num_mcast_if(sdata) != 0) {
2219 /* 2259 /*
2220 * send multicast frames both to higher layers in 2260 * send multicast frames both to higher layers in
2221 * local net stack and back to the wireless medium 2261 * local net stack and back to the wireless medium
@@ -2224,7 +2264,7 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
2224 if (!xmit_skb) 2264 if (!xmit_skb)
2225 net_info_ratelimited("%s: failed to clone multicast frame\n", 2265 net_info_ratelimited("%s: failed to clone multicast frame\n",
2226 dev->name); 2266 dev->name);
2227 } else { 2267 } else if (!is_multicast_ether_addr(ehdr->h_dest)) {
2228 dsta = sta_info_get(sdata, skb->data); 2268 dsta = sta_info_get(sdata, skb->data);
2229 if (dsta) { 2269 if (dsta) {
2230 /* 2270 /*
@@ -2469,7 +2509,8 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
2469 if (!ifmsh->mshcfg.dot11MeshForwarding) 2509 if (!ifmsh->mshcfg.dot11MeshForwarding)
2470 goto out; 2510 goto out;
2471 2511
2472 fwd_skb = skb_copy(skb, GFP_ATOMIC); 2512 fwd_skb = skb_copy_expand(skb, local->tx_headroom +
2513 sdata->encrypt_headroom, 0, GFP_ATOMIC);
2473 if (!fwd_skb) { 2514 if (!fwd_skb) {
2474 net_info_ratelimited("%s: failed to clone mesh frame\n", 2515 net_info_ratelimited("%s: failed to clone mesh frame\n",
2475 sdata->name); 2516 sdata->name);
@@ -2877,17 +2918,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
2877 2918
2878 switch (mgmt->u.action.u.vht_opmode_notif.action_code) { 2919 switch (mgmt->u.action.u.vht_opmode_notif.action_code) {
2879 case WLAN_VHT_ACTION_OPMODE_NOTIF: { 2920 case WLAN_VHT_ACTION_OPMODE_NOTIF: {
2880 u8 opmode;
2881
2882 /* verify opmode is present */ 2921 /* verify opmode is present */
2883 if (len < IEEE80211_MIN_ACTION_SIZE + 2) 2922 if (len < IEEE80211_MIN_ACTION_SIZE + 2)
2884 goto invalid; 2923 goto invalid;
2885 2924 goto queue;
2886 opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode;
2887
2888 ieee80211_vht_handle_opmode(rx->sdata, rx->sta,
2889 opmode, status->band);
2890 goto handled;
2891 } 2925 }
2892 case WLAN_VHT_ACTION_GROUPID_MGMT: { 2926 case WLAN_VHT_ACTION_GROUPID_MGMT: {
2893 if (len < IEEE80211_MIN_ACTION_SIZE + 25) 2927 if (len < IEEE80211_MIN_ACTION_SIZE + 25)
@@ -3605,6 +3639,27 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
3605 !ether_addr_equal(bssid, hdr->addr1)) 3639 !ether_addr_equal(bssid, hdr->addr1))
3606 return false; 3640 return false;
3607 } 3641 }
3642
3643 /*
3644 * 802.11-2016 Table 9-26 says that for data frames, A1 must be
3645 * the BSSID - we've checked that already but may have accepted
3646 * the wildcard (ff:ff:ff:ff:ff:ff).
3647 *
3648 * It also says:
3649 * The BSSID of the Data frame is determined as follows:
3650 * a) If the STA is contained within an AP or is associated
3651 * with an AP, the BSSID is the address currently in use
3652 * by the STA contained in the AP.
3653 *
3654 * So we should not accept data frames with an address that's
3655 * multicast.
3656 *
3657 * Accepting it also opens a security problem because stations
3658 * could encrypt it with the GTK and inject traffic that way.
3659 */
3660 if (ieee80211_is_data(hdr->frame_control) && multicast)
3661 return false;
3662
3608 return true; 3663 return true;
3609 case NL80211_IFTYPE_WDS: 3664 case NL80211_IFTYPE_WDS:
3610 if (bssid || !ieee80211_is_data(hdr->frame_control)) 3665 if (bssid || !ieee80211_is_data(hdr->frame_control))
@@ -3887,6 +3942,7 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
3887 stats->last_rate = sta_stats_encode_rate(status); 3942 stats->last_rate = sta_stats_encode_rate(status);
3888 3943
3889 stats->fragments++; 3944 stats->fragments++;
3945 stats->packets++;
3890 3946
3891 if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { 3947 if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) {
3892 stats->last_signal = status->signal; 3948 stats->last_signal = status->signal;
@@ -3939,21 +3995,31 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
3939 u64_stats_update_end(&stats->syncp); 3995 u64_stats_update_end(&stats->syncp);
3940 3996
3941 if (fast_rx->internal_forward) { 3997 if (fast_rx->internal_forward) {
3942 struct sta_info *dsta = sta_info_get(rx->sdata, skb->data); 3998 struct sk_buff *xmit_skb = NULL;
3999 bool multicast = is_multicast_ether_addr(skb->data);
3943 4000
3944 if (dsta) { 4001 if (multicast) {
4002 xmit_skb = skb_copy(skb, GFP_ATOMIC);
4003 } else if (sta_info_get(rx->sdata, skb->data)) {
4004 xmit_skb = skb;
4005 skb = NULL;
4006 }
4007
4008 if (xmit_skb) {
3945 /* 4009 /*
3946 * Send to wireless media and increase priority by 256 4010 * Send to wireless media and increase priority by 256
3947 * to keep the received priority instead of 4011 * to keep the received priority instead of
3948 * reclassifying the frame (see cfg80211_classify8021d). 4012 * reclassifying the frame (see cfg80211_classify8021d).
3949 */ 4013 */
3950 skb->priority += 256; 4014 xmit_skb->priority += 256;
3951 skb->protocol = htons(ETH_P_802_3); 4015 xmit_skb->protocol = htons(ETH_P_802_3);
3952 skb_reset_network_header(skb); 4016 skb_reset_network_header(xmit_skb);
3953 skb_reset_mac_header(skb); 4017 skb_reset_mac_header(xmit_skb);
3954 dev_queue_xmit(skb); 4018 dev_queue_xmit(xmit_skb);
3955 return true;
3956 } 4019 }
4020
4021 if (!skb)
4022 return true;
3957 } 4023 }
3958 4024
3959 /* deliver to local stack */ 4025 /* deliver to local stack */
@@ -4070,15 +4136,17 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
4070 ieee80211_is_beacon(hdr->frame_control))) 4136 ieee80211_is_beacon(hdr->frame_control)))
4071 ieee80211_scan_rx(local, skb); 4137 ieee80211_scan_rx(local, skb);
4072 4138
4073 if (pubsta) { 4139 if (ieee80211_is_data(fc)) {
4074 rx.sta = container_of(pubsta, struct sta_info, sta);
4075 rx.sdata = rx.sta->sdata;
4076 if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
4077 return;
4078 goto out;
4079 } else if (ieee80211_is_data(fc)) {
4080 struct sta_info *sta, *prev_sta; 4140 struct sta_info *sta, *prev_sta;
4081 4141
4142 if (pubsta) {
4143 rx.sta = container_of(pubsta, struct sta_info, sta);
4144 rx.sdata = rx.sta->sdata;
4145 if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
4146 return;
4147 goto out;
4148 }
4149
4082 prev_sta = NULL; 4150 prev_sta = NULL;
4083 4151
4084 for_each_sta_info(local, hdr->addr2, sta, tmp) { 4152 for_each_sta_info(local, hdr->addr2, sta, tmp) {
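Editor's note on the rx.c reorder-buffer hunk: the new started flag keeps MPDUs whose sequence numbers are below the starting sequence number from being dropped as stale before the session has seen its first in-window frame, using the usual modulo-4096 comparison (ieee80211_sn_less). A self-contained sketch of that kind of comparison, with my own helper names (802.11 sequence numbers are 12 bits wide):

/* Illustrative 12-bit sequence-number arithmetic, similar in spirit to
 * the kernel's ieee80211_sn_sub()/ieee80211_sn_less() helpers. */
#include <stdbool.h>
#include <stdio.h>

#define SN_MODULO	0x1000		/* 12-bit sequence space */
#define SN_MASK		(SN_MODULO - 1)

static unsigned int sn_sub(unsigned int a, unsigned int b)
{
	return (a - b) & SN_MASK;
}

/* true when a precedes b in the circular 12-bit sequence space */
static bool sn_less(unsigned int a, unsigned int b)
{
	return sn_sub(a, b) > (SN_MODULO >> 1);
}

int main(void)
{
	/* 4090 precedes 5 because the counter wrapped around */
	printf("%d\n", sn_less(4090, 5));	/* 1 */
	printf("%d\n", sn_less(5, 4090));	/* 0 */
	printf("%d\n", sn_less(10, 5));		/* 0 */
	return 0;
}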
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 23d8ac829279..faab3c490d2b 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -1120,7 +1120,6 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
1120 u32 rate_masks[NUM_NL80211_BANDS] = {}; 1120 u32 rate_masks[NUM_NL80211_BANDS] = {};
1121 u8 bands_used = 0; 1121 u8 bands_used = 0;
1122 u8 *ie; 1122 u8 *ie;
1123 size_t len;
1124 1123
1125 iebufsz = local->scan_ies_len + req->ie_len; 1124 iebufsz = local->scan_ies_len + req->ie_len;
1126 1125
@@ -1145,10 +1144,9 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
1145 1144
1146 ieee80211_prepare_scan_chandef(&chandef, req->scan_width); 1145 ieee80211_prepare_scan_chandef(&chandef, req->scan_width);
1147 1146
1148 len = ieee80211_build_preq_ies(local, ie, num_bands * iebufsz, 1147 ieee80211_build_preq_ies(local, ie, num_bands * iebufsz,
1149 &sched_scan_ies, req->ie, 1148 &sched_scan_ies, req->ie,
1150 req->ie_len, bands_used, 1149 req->ie_len, bands_used, rate_masks, &chandef);
1151 rate_masks, &chandef);
1152 1150
1153 ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies); 1151 ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies);
1154 if (ret == 0) { 1152 if (ret == 0) {
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 8e05032689f0..3323a2fb289b 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -513,23 +513,23 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU)
513{ 513{
514 struct ieee80211_local *local = sta->local; 514 struct ieee80211_local *local = sta->local;
515 struct ieee80211_sub_if_data *sdata = sta->sdata; 515 struct ieee80211_sub_if_data *sdata = sta->sdata;
516 struct station_info *sinfo; 516 struct station_info *sinfo = NULL;
517 int err = 0; 517 int err = 0;
518 518
519 lockdep_assert_held(&local->sta_mtx); 519 lockdep_assert_held(&local->sta_mtx);
520 520
521 sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL);
522 if (!sinfo) {
523 err = -ENOMEM;
524 goto out_err;
525 }
526
527 /* check if STA exists already */ 521 /* check if STA exists already */
528 if (sta_info_get_bss(sdata, sta->sta.addr)) { 522 if (sta_info_get_bss(sdata, sta->sta.addr)) {
529 err = -EEXIST; 523 err = -EEXIST;
530 goto out_err; 524 goto out_err;
531 } 525 }
532 526
527 sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL);
528 if (!sinfo) {
529 err = -ENOMEM;
530 goto out_err;
531 }
532
533 local->num_sta++; 533 local->num_sta++;
534 local->sta_generation++; 534 local->sta_generation++;
535 smp_mb(); 535 smp_mb();
@@ -688,7 +688,7 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending)
688 } 688 }
689 689
690 /* No need to do anything if the driver does all */ 690 /* No need to do anything if the driver does all */
691 if (ieee80211_hw_check(&local->hw, AP_LINK_PS)) 691 if (ieee80211_hw_check(&local->hw, AP_LINK_PS) && !local->ops->set_tim)
692 return; 692 return;
693 693
694 if (sta->dead) 694 if (sta->dead)
@@ -709,7 +709,7 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending)
709 for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { 709 for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
710 unsigned long tids; 710 unsigned long tids;
711 711
712 if (ignore_for_tim & BIT(ac)) 712 if (ignore_for_tim & ieee80211_ac_to_qos_mask[ac])
713 continue; 713 continue;
714 714
715 indicate_tim |= !skb_queue_empty(&sta->tx_filtered[ac]) || 715 indicate_tim |= !skb_queue_empty(&sta->tx_filtered[ac]) ||
@@ -1264,7 +1264,7 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
1264 sta_info_recalc_tim(sta); 1264 sta_info_recalc_tim(sta);
1265 1265
1266 ps_dbg(sdata, 1266 ps_dbg(sdata,
1267 "STA %pM aid %d sending %d filtered/%d PS frames since STA not sleeping anymore\n", 1267 "STA %pM aid %d sending %d filtered/%d PS frames since STA woke up\n",
1268 sta->sta.addr, sta->sta.aid, filtered, buffered); 1268 sta->sta.addr, sta->sta.aid, filtered, buffered);
1269 1269
1270 ieee80211_check_fast_xmit(sta); 1270 ieee80211_check_fast_xmit(sta);
@@ -1389,7 +1389,7 @@ ieee80211_sta_ps_more_data(struct sta_info *sta, u8 ignored_acs,
1389 return true; 1389 return true;
1390 1390
1391 for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { 1391 for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
1392 if (ignored_acs & BIT(ac)) 1392 if (ignored_acs & ieee80211_ac_to_qos_mask[ac])
1393 continue; 1393 continue;
1394 1394
1395 if (!skb_queue_empty(&sta->tx_filtered[ac]) || 1395 if (!skb_queue_empty(&sta->tx_filtered[ac]) ||
@@ -1414,7 +1414,7 @@ ieee80211_sta_ps_get_frames(struct sta_info *sta, int n_frames, u8 ignored_acs,
1414 for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { 1414 for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
1415 unsigned long tids; 1415 unsigned long tids;
1416 1416
1417 if (ignored_acs & BIT(ac)) 1417 if (ignored_acs & ieee80211_ac_to_qos_mask[ac])
1418 continue; 1418 continue;
1419 1419
1420 tids = ieee80211_tids_for_ac(ac); 1420 tids = ieee80211_tids_for_ac(ac);
@@ -1482,7 +1482,7 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
1482 BIT(find_highest_prio_tid(driver_release_tids)); 1482 BIT(find_highest_prio_tid(driver_release_tids));
1483 1483
1484 if (skb_queue_empty(&frames) && !driver_release_tids) { 1484 if (skb_queue_empty(&frames) && !driver_release_tids) {
1485 int tid; 1485 int tid, ac;
1486 1486
1487 /* 1487 /*
1488 * For PS-Poll, this can only happen due to a race condition 1488 * For PS-Poll, this can only happen due to a race condition
@@ -1500,7 +1500,10 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
1500 */ 1500 */
1501 1501
1502 /* This will evaluate to 1, 3, 5 or 7. */ 1502 /* This will evaluate to 1, 3, 5 or 7. */
1503 tid = 7 - ((ffs(~ignored_acs) - 1) << 1); 1503 for (ac = IEEE80211_AC_VO; ac < IEEE80211_NUM_ACS; ac++)
1504 if (!(ignored_acs & ieee80211_ac_to_qos_mask[ac]))
1505 break;
1506 tid = 7 - 2 * ac;
1504 1507
1505 ieee80211_send_null_response(sta, tid, reason, true, false); 1508 ieee80211_send_null_response(sta, tid, reason, true, false);
1506 } else if (!driver_release_tids) { 1509 } else if (!driver_release_tids) {
@@ -1871,10 +1874,7 @@ int sta_info_move_state(struct sta_info *sta,
1871 if (!sta->sta.support_p2p_ps) 1874 if (!sta->sta.support_p2p_ps)
1872 ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); 1875 ieee80211_recalc_p2p_go_ps_allowed(sta->sdata);
1873 } else if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { 1876 } else if (sta->sta_state == IEEE80211_STA_AUTHORIZED) {
1874 if (sta->sdata->vif.type == NL80211_IFTYPE_AP || 1877 ieee80211_vif_dec_num_mcast(sta->sdata);
1875 (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
1876 !sta->sdata->u.vlan.sta))
1877 atomic_dec(&sta->sdata->bss->num_mcast_sta);
1878 clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags); 1878 clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags);
1879 ieee80211_clear_fast_xmit(sta); 1879 ieee80211_clear_fast_xmit(sta);
1880 ieee80211_clear_fast_rx(sta); 1880 ieee80211_clear_fast_rx(sta);
@@ -1882,10 +1882,7 @@ int sta_info_move_state(struct sta_info *sta,
1882 break; 1882 break;
1883 case IEEE80211_STA_AUTHORIZED: 1883 case IEEE80211_STA_AUTHORIZED:
1884 if (sta->sta_state == IEEE80211_STA_ASSOC) { 1884 if (sta->sta_state == IEEE80211_STA_ASSOC) {
1885 if (sta->sdata->vif.type == NL80211_IFTYPE_AP || 1885 ieee80211_vif_inc_num_mcast(sta->sdata);
1886 (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
1887 !sta->sdata->u.vlan.sta))
1888 atomic_inc(&sta->sdata->bss->num_mcast_sta);
1889 set_bit(WLAN_STA_AUTHORIZED, &sta->_flags); 1886 set_bit(WLAN_STA_AUTHORIZED, &sta->_flags);
1890 ieee80211_check_fast_xmit(sta); 1887 ieee80211_check_fast_xmit(sta);
1891 ieee80211_check_fast_rx(sta); 1888 ieee80211_check_fast_rx(sta);
@@ -1975,6 +1972,7 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate,
1975 u16 brate; 1972 u16 brate;
1976 unsigned int shift; 1973 unsigned int shift;
1977 1974
1975 rinfo->flags = 0;
1978 sband = local->hw.wiphy->bands[(rate >> 4) & 0xf]; 1976 sband = local->hw.wiphy->bands[(rate >> 4) & 0xf];
1979 brate = sband->bitrates[rate & 0xf].bitrate; 1977 brate = sband->bitrates[rate & 0xf].bitrate;
1980 if (rinfo->bw == RATE_INFO_BW_5) 1978 if (rinfo->bw == RATE_INFO_BW_5)
@@ -1990,14 +1988,15 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate,
1990 rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; 1988 rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI;
1991} 1989}
1992 1990
1993static void sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo) 1991static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo)
1994{ 1992{
1995 u16 rate = ACCESS_ONCE(sta_get_last_rx_stats(sta)->last_rate); 1993 u16 rate = ACCESS_ONCE(sta_get_last_rx_stats(sta)->last_rate);
1996 1994
1997 if (rate == STA_STATS_RATE_INVALID) 1995 if (rate == STA_STATS_RATE_INVALID)
1998 rinfo->flags = 0; 1996 return -EINVAL;
1999 else 1997
2000 sta_stats_decode_rate(sta->local, rate, rinfo); 1998 sta_stats_decode_rate(sta->local, rate, rinfo);
1999 return 0;
2001} 2000}
2002 2001
2003static void sta_set_tidstats(struct sta_info *sta, 2002static void sta_set_tidstats(struct sta_info *sta,
@@ -2052,16 +2051,12 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
2052{ 2051{
2053 struct ieee80211_sub_if_data *sdata = sta->sdata; 2052 struct ieee80211_sub_if_data *sdata = sta->sdata;
2054 struct ieee80211_local *local = sdata->local; 2053 struct ieee80211_local *local = sdata->local;
2055 struct rate_control_ref *ref = NULL;
2056 u32 thr = 0; 2054 u32 thr = 0;
2057 int i, ac, cpu; 2055 int i, ac, cpu;
2058 struct ieee80211_sta_rx_stats *last_rxstats; 2056 struct ieee80211_sta_rx_stats *last_rxstats;
2059 2057
2060 last_rxstats = sta_get_last_rx_stats(sta); 2058 last_rxstats = sta_get_last_rx_stats(sta);
2061 2059
2062 if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL))
2063 ref = local->rate_ctrl;
2064
2065 sinfo->generation = sdata->local->sta_generation; 2060 sinfo->generation = sdata->local->sta_generation;
2066 2061
2067 /* do before driver, so beacon filtering drivers have a 2062 /* do before driver, so beacon filtering drivers have a
@@ -2202,8 +2197,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
2202 } 2197 }
2203 2198
2204 if (!(sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE))) { 2199 if (!(sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE))) {
2205 sta_set_rate_info_rx(sta, &sinfo->rxrate); 2200 if (sta_set_rate_info_rx(sta, &sinfo->rxrate) == 0)
2206 sinfo->filled |= BIT(NL80211_STA_INFO_RX_BITRATE); 2201 sinfo->filled |= BIT(NL80211_STA_INFO_RX_BITRATE);
2207 } 2202 }
2208 2203
2209 sinfo->filled |= BIT(NL80211_STA_INFO_TID_STATS); 2204 sinfo->filled |= BIT(NL80211_STA_INFO_TID_STATS);
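Editor's note on the sta_info.c hunks: several of them stop indexing the uAPSD/ignored-AC bitmaps with BIT(ac) and use an ieee80211_ac_to_qos_mask[] lookup instead, because the access-category enumeration (VO, VI, BE, BK) and the WMM QoS-info bit layout do not line up for BE and BK, so a plain shift would pick the wrong bit for those two classes. A standalone sketch of that mapping follows; the bit positions are my reading of the usual QoS-info layout and should be treated as an assumption, not a quotation of the header.

/* Sketch: mapping access-category indices to WMM QoS-info bits.
 * Assumed layout: VO=bit 0, VI=bit 1, BK=bit 2, BE=bit 3; verify against
 * include/linux/ieee80211.h before relying on these values. */
#include <stdio.h>

enum ac { AC_VO, AC_VI, AC_BE, AC_BK, NUM_ACS };

static const unsigned char ac_to_qos_mask[NUM_ACS] = {
	[AC_VO] = 1 << 0,
	[AC_VI] = 1 << 1,
	[AC_BE] = 1 << 3,	/* note the swap relative to the enum order */
	[AC_BK] = 1 << 2,
};

int main(void)
{
	unsigned char uapsd_queues = (1 << 0) | (1 << 3);	/* VO + BE enabled */
	int ac;

	for (ac = 0; ac < NUM_ACS; ac++)
		printf("ac %d trigger-enabled: %d\n",
		       ac, !!(uapsd_queues & ac_to_qos_mask[ac]));
	return 0;
}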
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index ed5fcb984a01..e65cda34d2bc 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -184,12 +184,12 @@ struct tid_ampdu_tx {
184 * @ssn: Starting Sequence Number expected to be aggregated. 184 * @ssn: Starting Sequence Number expected to be aggregated.
185 * @buf_size: buffer size for incoming A-MPDUs 185 * @buf_size: buffer size for incoming A-MPDUs
186 * @timeout: reset timer value (in TUs). 186 * @timeout: reset timer value (in TUs).
187 * @dialog_token: dialog token for aggregation session
188 * @rcu_head: RCU head used for freeing this struct 187 * @rcu_head: RCU head used for freeing this struct
189 * @reorder_lock: serializes access to reorder buffer, see below. 188 * @reorder_lock: serializes access to reorder buffer, see below.
190 * @auto_seq: used for offloaded BA sessions to automatically pick head_seq_and 189 * @auto_seq: used for offloaded BA sessions to automatically pick head_seq_and
191 * and ssn. 190 * and ssn.
192 * @removed: this session is removed (but might have been found due to RCU) 191 * @removed: this session is removed (but might have been found due to RCU)
192 * @started: this session has started (head ssn or higher was received)
193 * 193 *
194 * This structure's lifetime is managed by RCU, assignments to 194 * This structure's lifetime is managed by RCU, assignments to
195 * the array holding it must hold the aggregation mutex. 195 * the array holding it must hold the aggregation mutex.
@@ -213,9 +213,9 @@ struct tid_ampdu_rx {
213 u16 ssn; 213 u16 ssn;
214 u16 buf_size; 214 u16 buf_size;
215 u16 timeout; 215 u16 timeout;
216 u8 dialog_token; 216 u8 auto_seq:1,
217 bool auto_seq; 217 removed:1,
218 bool removed; 218 started:1;
219}; 219};
220 220
221/** 221/**
@@ -225,6 +225,7 @@ struct tid_ampdu_rx {
225 * to tid_tx[idx], which are protected by the sta spinlock) 225 * to tid_tx[idx], which are protected by the sta spinlock)
226 * tid_start_tx is also protected by sta->lock. 226 * tid_start_tx is also protected by sta->lock.
227 * @tid_rx: aggregation info for Rx per TID -- RCU protected 227 * @tid_rx: aggregation info for Rx per TID -- RCU protected
228 * @tid_rx_token: dialog tokens for valid aggregation sessions
228 * @tid_rx_timer_expired: bitmap indicating on which TIDs the 229 * @tid_rx_timer_expired: bitmap indicating on which TIDs the
229 * RX timer expired until the work for it runs 230 * RX timer expired until the work for it runs
230 * @tid_rx_stop_requested: bitmap indicating which BA sessions per TID the 231 * @tid_rx_stop_requested: bitmap indicating which BA sessions per TID the
@@ -243,6 +244,7 @@ struct sta_ampdu_mlme {
243 struct mutex mtx; 244 struct mutex mtx;
244 /* rx */ 245 /* rx */
245 struct tid_ampdu_rx __rcu *tid_rx[IEEE80211_NUM_TIDS]; 246 struct tid_ampdu_rx __rcu *tid_rx[IEEE80211_NUM_TIDS];
247 u8 tid_rx_token[IEEE80211_NUM_TIDS];
246 unsigned long tid_rx_timer_expired[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; 248 unsigned long tid_rx_timer_expired[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
247 unsigned long tid_rx_stop_requested[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; 249 unsigned long tid_rx_stop_requested[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
248 unsigned long agg_session_valid[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; 250 unsigned long agg_session_valid[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
@@ -370,7 +372,7 @@ struct mesh_sta {
370 unsigned int fail_avg; 372 unsigned int fail_avg;
371}; 373};
372 374
373DECLARE_EWMA(signal, 1024, 8) 375DECLARE_EWMA(signal, 10, 8)
374 376
375struct ieee80211_sta_rx_stats { 377struct ieee80211_sta_rx_stats {
376 unsigned long packets; 378 unsigned long packets;
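Editor's note on the sta_info.h hunk: the two existing booleans and the new started flag become single-bit fields of one u8 (the dialog token moves to a per-TID array in sta_ampdu_mlme), so struct tid_ampdu_rx does not grow. A trivial sketch of the packing pattern:

/* Sketch: three flags packed into one byte instead of separate bools. */
#include <stdio.h>

struct rx_session_flags {
	unsigned char auto_seq:1,
		      removed:1,
		      started:1;
};

int main(void)
{
	struct rx_session_flags f = { .auto_seq = 1 };

	f.started = 1;
	printf("auto_seq=%u removed=%u started=%u size=%zu\n",
	       f.auto_seq, f.removed, f.started, sizeof(f));
	return 0;
}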
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index ddf71c648cab..83b8b11f24ea 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -51,7 +51,8 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
51 struct ieee80211_hdr *hdr = (void *)skb->data; 51 struct ieee80211_hdr *hdr = (void *)skb->data;
52 int ac; 52 int ac;
53 53
54 if (info->flags & IEEE80211_TX_CTL_NO_PS_BUFFER) { 54 if (info->flags & (IEEE80211_TX_CTL_NO_PS_BUFFER |
55 IEEE80211_TX_CTL_AMPDU)) {
55 ieee80211_free_txskb(&local->hw, skb); 56 ieee80211_free_txskb(&local->hw, skb);
56 return; 57 return;
57 } 58 }
@@ -95,7 +96,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
95 */ 96 */
96 if (*p & IEEE80211_QOS_CTL_EOSP) 97 if (*p & IEEE80211_QOS_CTL_EOSP)
97 *p &= ~IEEE80211_QOS_CTL_EOSP; 98 *p &= ~IEEE80211_QOS_CTL_EOSP;
98 ac = ieee802_1d_to_ac[tid & 7]; 99 ac = ieee80211_ac_from_tid(tid);
99 } else { 100 } else {
100 ac = IEEE80211_AC_BE; 101 ac = IEEE80211_AC_BE;
101 } 102 }
@@ -462,9 +463,7 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local,
462 unsigned long flags; 463 unsigned long flags;
463 464
464 spin_lock_irqsave(&local->ack_status_lock, flags); 465 spin_lock_irqsave(&local->ack_status_lock, flags);
465 skb = idr_find(&local->ack_status_frames, info->ack_frame_id); 466 skb = idr_remove(&local->ack_status_frames, info->ack_frame_id);
466 if (skb)
467 idr_remove(&local->ack_status_frames, info->ack_frame_id);
468 spin_unlock_irqrestore(&local->ack_status_lock, flags); 467 spin_unlock_irqrestore(&local->ack_status_lock, flags);
469 468
470 if (!skb) 469 if (!skb)
@@ -541,6 +540,11 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
541 } else if (info->ack_frame_id) { 540 } else if (info->ack_frame_id) {
542 ieee80211_report_ack_skb(local, info, acked, dropped); 541 ieee80211_report_ack_skb(local, info, acked, dropped);
543 } 542 }
543
544 if (!dropped && skb->destructor) {
545 skb->wifi_acked_valid = 1;
546 skb->wifi_acked = acked;
547 }
544} 548}
545 549
546/* 550/*
@@ -633,10 +637,9 @@ void ieee80211_tx_status_noskb(struct ieee80211_hw *hw,
633 struct ieee80211_local *local = hw_to_local(hw); 637 struct ieee80211_local *local = hw_to_local(hw);
634 struct ieee80211_supported_band *sband; 638 struct ieee80211_supported_band *sband;
635 int retry_count; 639 int retry_count;
636 int rates_idx;
637 bool acked, noack_success; 640 bool acked, noack_success;
638 641
639 rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); 642 ieee80211_tx_get_rates(hw, info, &retry_count);
640 643
641 sband = hw->wiphy->bands[info->band]; 644 sband = hw->wiphy->bands[info->band];
642 645
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 92a47afaa989..0d645bc148d0 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -1736,21 +1736,21 @@ TRACE_EVENT(drv_start_nan,
1736 LOCAL_ENTRY 1736 LOCAL_ENTRY
1737 VIF_ENTRY 1737 VIF_ENTRY
1738 __field(u8, master_pref) 1738 __field(u8, master_pref)
1739 __field(u8, dual) 1739 __field(u8, bands)
1740 ), 1740 ),
1741 1741
1742 TP_fast_assign( 1742 TP_fast_assign(
1743 LOCAL_ASSIGN; 1743 LOCAL_ASSIGN;
1744 VIF_ASSIGN; 1744 VIF_ASSIGN;
1745 __entry->master_pref = conf->master_pref; 1745 __entry->master_pref = conf->master_pref;
1746 __entry->dual = conf->dual; 1746 __entry->bands = conf->bands;
1747 ), 1747 ),
1748 1748
1749 TP_printk( 1749 TP_printk(
1750 LOCAL_PR_FMT VIF_PR_FMT 1750 LOCAL_PR_FMT VIF_PR_FMT
1751 ", master preference: %u, dual: %d", 1751 ", master preference: %u, bands: 0x%0x",
1752 LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref, 1752 LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref,
1753 __entry->dual 1753 __entry->bands
1754 ) 1754 )
1755); 1755);
1756 1756
@@ -1787,7 +1787,7 @@ TRACE_EVENT(drv_nan_change_conf,
1787 LOCAL_ENTRY 1787 LOCAL_ENTRY
1788 VIF_ENTRY 1788 VIF_ENTRY
1789 __field(u8, master_pref) 1789 __field(u8, master_pref)
1790 __field(u8, dual) 1790 __field(u8, bands)
1791 __field(u32, changes) 1791 __field(u32, changes)
1792 ), 1792 ),
1793 1793
@@ -1795,15 +1795,15 @@ TRACE_EVENT(drv_nan_change_conf,
1795 LOCAL_ASSIGN; 1795 LOCAL_ASSIGN;
1796 VIF_ASSIGN; 1796 VIF_ASSIGN;
1797 __entry->master_pref = conf->master_pref; 1797 __entry->master_pref = conf->master_pref;
1798 __entry->dual = conf->dual; 1798 __entry->bands = conf->bands;
1799 __entry->changes = changes; 1799 __entry->changes = changes;
1800 ), 1800 ),
1801 1801
1802 TP_printk( 1802 TP_printk(
1803 LOCAL_PR_FMT VIF_PR_FMT 1803 LOCAL_PR_FMT VIF_PR_FMT
1804 ", master preference: %u, dual: %d, changes: 0x%x", 1804 ", master preference: %u, bands: 0x%0x, changes: 0x%x",
1805 LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref, 1805 LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref,
1806 __entry->dual, __entry->changes 1806 __entry->bands, __entry->changes
1807 ) 1807 )
1808); 1808);
1809 1809
@@ -1996,23 +1996,26 @@ TRACE_EVENT(api_connection_loss,
1996 1996
1997TRACE_EVENT(api_cqm_rssi_notify, 1997TRACE_EVENT(api_cqm_rssi_notify,
1998 TP_PROTO(struct ieee80211_sub_if_data *sdata, 1998 TP_PROTO(struct ieee80211_sub_if_data *sdata,
1999 enum nl80211_cqm_rssi_threshold_event rssi_event), 1999 enum nl80211_cqm_rssi_threshold_event rssi_event,
2000 s32 rssi_level),
2000 2001
2001 TP_ARGS(sdata, rssi_event), 2002 TP_ARGS(sdata, rssi_event, rssi_level),
2002 2003
2003 TP_STRUCT__entry( 2004 TP_STRUCT__entry(
2004 VIF_ENTRY 2005 VIF_ENTRY
2005 __field(u32, rssi_event) 2006 __field(u32, rssi_event)
2007 __field(s32, rssi_level)
2006 ), 2008 ),
2007 2009
2008 TP_fast_assign( 2010 TP_fast_assign(
2009 VIF_ASSIGN; 2011 VIF_ASSIGN;
2010 __entry->rssi_event = rssi_event; 2012 __entry->rssi_event = rssi_event;
2013 __entry->rssi_level = rssi_level;
2011 ), 2014 ),
2012 2015
2013 TP_printk( 2016 TP_printk(
2014 VIF_PR_FMT " event:%d", 2017 VIF_PR_FMT " event:%d rssi:%d",
2015 VIF_PR_ARG, __entry->rssi_event 2018 VIF_PR_ARG, __entry->rssi_event, __entry->rssi_level
2016 ) 2019 )
2017); 2020);
2018 2021
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index bd5f4be89435..ba8d7db0a071 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -16,6 +16,7 @@
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/skbuff.h> 18#include <linux/skbuff.h>
19#include <linux/if_vlan.h>
19#include <linux/etherdevice.h> 20#include <linux/etherdevice.h>
20#include <linux/bitmap.h> 21#include <linux/bitmap.h>
21#include <linux/rcupdate.h> 22#include <linux/rcupdate.h>
@@ -63,6 +64,10 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
63 struct ieee80211_chanctx_conf *chanctx_conf; 64 struct ieee80211_chanctx_conf *chanctx_conf;
64 u32 rate_flags = 0; 65 u32 rate_flags = 0;
65 66
67 /* assume HW handles this */
68 if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))
69 return 0;
70
66 rcu_read_lock(); 71 rcu_read_lock();
67 chanctx_conf = rcu_dereference(tx->sdata->vif.chanctx_conf); 72 chanctx_conf = rcu_dereference(tx->sdata->vif.chanctx_conf);
68 if (chanctx_conf) { 73 if (chanctx_conf) {
@@ -71,10 +76,6 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
71 } 76 }
72 rcu_read_unlock(); 77 rcu_read_unlock();
73 78
74 /* assume HW handles this */
75 if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))
76 return 0;
77
78 /* uh huh? */ 79 /* uh huh? */
79 if (WARN_ON_ONCE(tx->rate.idx < 0)) 80 if (WARN_ON_ONCE(tx->rate.idx < 0))
80 return 0; 81 return 0;
@@ -331,9 +332,8 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx)
331 I802_DEBUG_INC(tx->local->tx_handlers_drop_not_assoc); 332 I802_DEBUG_INC(tx->local->tx_handlers_drop_not_assoc);
332 return TX_DROP; 333 return TX_DROP;
333 } 334 }
334 } else if (unlikely(tx->sdata->vif.type == NL80211_IFTYPE_AP && 335 } else if (unlikely(ieee80211_is_data(hdr->frame_control) &&
335 ieee80211_is_data(hdr->frame_control) && 336 ieee80211_vif_get_num_mcast_if(tx->sdata) == 0)) {
336 !atomic_read(&tx->sdata->u.ap.num_mcast_sta))) {
337 /* 337 /*
338 * No associated STAs - no need to send multicast 338 * No associated STAs - no need to send multicast
339 * frames. 339 * frames.
@@ -935,7 +935,7 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
935 if (info->flags & IEEE80211_TX_CTL_DONTFRAG) 935 if (info->flags & IEEE80211_TX_CTL_DONTFRAG)
936 return TX_CONTINUE; 936 return TX_CONTINUE;
937 937
938 if (tx->local->ops->set_frag_threshold) 938 if (ieee80211_hw_check(&tx->local->hw, SUPPORTS_TX_FRAG))
939 return TX_CONTINUE; 939 return TX_CONTINUE;
940 940
941 /* 941 /*
@@ -1244,7 +1244,7 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata,
1244 1244
1245static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, 1245static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
1246 struct ieee80211_vif *vif, 1246 struct ieee80211_vif *vif,
1247 struct ieee80211_sta *pubsta, 1247 struct sta_info *sta,
1248 struct sk_buff *skb) 1248 struct sk_buff *skb)
1249{ 1249{
1250 struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; 1250 struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
@@ -1258,10 +1258,13 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
1258 if (!ieee80211_is_data(hdr->frame_control)) 1258 if (!ieee80211_is_data(hdr->frame_control))
1259 return NULL; 1259 return NULL;
1260 1260
1261 if (pubsta) { 1261 if (sta) {
1262 u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; 1262 u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK;
1263 1263
1264 txq = pubsta->txq[tid]; 1264 if (!sta->uploaded)
1265 return NULL;
1266
1267 txq = sta->sta.txq[tid];
1265 } else if (vif) { 1268 } else if (vif) {
1266 txq = vif->txq; 1269 txq = vif->txq;
1267 } 1270 }
@@ -1411,7 +1414,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
1411 txqi->txq.sta = &sta->sta; 1414 txqi->txq.sta = &sta->sta;
1412 sta->sta.txq[tid] = &txqi->txq; 1415 sta->sta.txq[tid] = &txqi->txq;
1413 txqi->txq.tid = tid; 1416 txqi->txq.tid = tid;
1414 txqi->txq.ac = ieee802_1d_to_ac[tid & 7]; 1417 txqi->txq.ac = ieee80211_ac_from_tid(tid);
1415 } else { 1418 } else {
1416 sdata->vif.txq = &txqi->txq; 1419 sdata->vif.txq = &txqi->txq;
1417 txqi->txq.tid = 0; 1420 txqi->txq.tid = 0;
@@ -1504,23 +1507,17 @@ static bool ieee80211_queue_skb(struct ieee80211_local *local,
1504 struct fq *fq = &local->fq; 1507 struct fq *fq = &local->fq;
1505 struct ieee80211_vif *vif; 1508 struct ieee80211_vif *vif;
1506 struct txq_info *txqi; 1509 struct txq_info *txqi;
1507 struct ieee80211_sta *pubsta;
1508 1510
1509 if (!local->ops->wake_tx_queue || 1511 if (!local->ops->wake_tx_queue ||
1510 sdata->vif.type == NL80211_IFTYPE_MONITOR) 1512 sdata->vif.type == NL80211_IFTYPE_MONITOR)
1511 return false; 1513 return false;
1512 1514
1513 if (sta && sta->uploaded)
1514 pubsta = &sta->sta;
1515 else
1516 pubsta = NULL;
1517
1518 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) 1515 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
1519 sdata = container_of(sdata->bss, 1516 sdata = container_of(sdata->bss,
1520 struct ieee80211_sub_if_data, u.ap); 1517 struct ieee80211_sub_if_data, u.ap);
1521 1518
1522 vif = &sdata->vif; 1519 vif = &sdata->vif;
1523 txqi = ieee80211_get_txq(local, vif, pubsta, skb); 1520 txqi = ieee80211_get_txq(local, vif, sta, skb);
1524 1521
1525 if (!txqi) 1522 if (!txqi)
1526 return false; 1523 return false;
@@ -2798,7 +2795,7 @@ void ieee80211_check_fast_xmit(struct sta_info *sta)
2798 2795
2799 /* fast-xmit doesn't handle fragmentation at all */ 2796 /* fast-xmit doesn't handle fragmentation at all */
2800 if (local->hw.wiphy->frag_threshold != (u32)-1 && 2797 if (local->hw.wiphy->frag_threshold != (u32)-1 &&
2801 !local->ops->set_frag_threshold) 2798 !ieee80211_hw_check(&local->hw, SUPPORTS_TX_FRAG))
2802 goto out; 2799 goto out;
2803 2800
2804 rcu_read_lock(); 2801 rcu_read_lock();
@@ -3057,11 +3054,12 @@ static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata,
3057 struct ieee80211_local *local = sdata->local; 3054 struct ieee80211_local *local = sdata->local;
3058 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); 3055 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
3059 struct ieee80211_hdr *hdr; 3056 struct ieee80211_hdr *hdr;
3060 struct ethhdr amsdu_hdr; 3057 struct ethhdr *amsdu_hdr;
3061 int hdr_len = fast_tx->hdr_len - sizeof(rfc1042_header); 3058 int hdr_len = fast_tx->hdr_len - sizeof(rfc1042_header);
3062 int subframe_len = skb->len - hdr_len; 3059 int subframe_len = skb->len - hdr_len;
3063 void *data; 3060 void *data;
3064 u8 *qc; 3061 u8 *qc, *h_80211_src, *h_80211_dst;
3062 const u8 *bssid;
3065 3063
3066 if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) 3064 if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE)
3067 return false; 3065 return false;
@@ -3069,19 +3067,44 @@ static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata,
3069 if (info->control.flags & IEEE80211_TX_CTRL_AMSDU) 3067 if (info->control.flags & IEEE80211_TX_CTRL_AMSDU)
3070 return true; 3068 return true;
3071 3069
3072 if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(amsdu_hdr), 3070 if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(*amsdu_hdr),
3073 &subframe_len)) 3071 &subframe_len))
3074 return false; 3072 return false;
3075 3073
3076 amsdu_hdr.h_proto = cpu_to_be16(subframe_len); 3074 data = skb_push(skb, sizeof(*amsdu_hdr));
3077 memcpy(amsdu_hdr.h_source, skb->data + fast_tx->sa_offs, ETH_ALEN); 3075 memmove(data, data + sizeof(*amsdu_hdr), hdr_len);
3078 memcpy(amsdu_hdr.h_dest, skb->data + fast_tx->da_offs, ETH_ALEN); 3076 hdr = data;
3077 amsdu_hdr = data + hdr_len;
3078 /* h_80211_src/dst is addr* field within hdr */
3079 h_80211_src = data + fast_tx->sa_offs;
3080 h_80211_dst = data + fast_tx->da_offs;
3081
3082 amsdu_hdr->h_proto = cpu_to_be16(subframe_len);
3083 ether_addr_copy(amsdu_hdr->h_source, h_80211_src);
3084 ether_addr_copy(amsdu_hdr->h_dest, h_80211_dst);
3085
3086 /* according to IEEE 802.11-2012 8.3.2 table 8-19, the outer SA/DA
 3087 * fields need to be changed to BSSID for A-MSDU frames depending
3088 * on FromDS/ToDS values.
3089 */
3090 switch (sdata->vif.type) {
3091 case NL80211_IFTYPE_STATION:
3092 bssid = sdata->u.mgd.bssid;
3093 break;
3094 case NL80211_IFTYPE_AP:
3095 case NL80211_IFTYPE_AP_VLAN:
3096 bssid = sdata->vif.addr;
3097 break;
3098 default:
3099 bssid = NULL;
3100 }
3101
3102 if (bssid && ieee80211_has_fromds(hdr->frame_control))
3103 ether_addr_copy(h_80211_src, bssid);
3079 3104
3080 data = skb_push(skb, sizeof(amsdu_hdr)); 3105 if (bssid && ieee80211_has_tods(hdr->frame_control))
3081 memmove(data, data + sizeof(amsdu_hdr), hdr_len); 3106 ether_addr_copy(h_80211_dst, bssid);
3082 memcpy(data + hdr_len, &amsdu_hdr, sizeof(amsdu_hdr));
3083 3107
3084 hdr = data;
3085 qc = ieee80211_get_qos_ctl(hdr); 3108 qc = ieee80211_get_qos_ctl(hdr);
3086 *qc |= IEEE80211_QOS_CTL_A_MSDU_PRESENT; 3109 *qc |= IEEE80211_QOS_CTL_A_MSDU_PRESENT;
3087 3110
@@ -3262,7 +3285,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
3262 int extra_head = fast_tx->hdr_len - (ETH_HLEN - 2); 3285 int extra_head = fast_tx->hdr_len - (ETH_HLEN - 2);
3263 int hw_headroom = sdata->local->hw.extra_tx_headroom; 3286 int hw_headroom = sdata->local->hw.extra_tx_headroom;
3264 struct ethhdr eth; 3287 struct ethhdr eth;
3265 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); 3288 struct ieee80211_tx_info *info;
3266 struct ieee80211_hdr *hdr = (void *)fast_tx->hdr; 3289 struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
3267 struct ieee80211_tx_data tx; 3290 struct ieee80211_tx_data tx;
3268 ieee80211_tx_result r; 3291 ieee80211_tx_result r;
@@ -3326,6 +3349,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
3326 memcpy(skb->data + fast_tx->da_offs, eth.h_dest, ETH_ALEN); 3349 memcpy(skb->data + fast_tx->da_offs, eth.h_dest, ETH_ALEN);
3327 memcpy(skb->data + fast_tx->sa_offs, eth.h_source, ETH_ALEN); 3350 memcpy(skb->data + fast_tx->sa_offs, eth.h_source, ETH_ALEN);
3328 3351
3352 info = IEEE80211_SKB_CB(skb);
3329 memset(info, 0, sizeof(*info)); 3353 memset(info, 0, sizeof(*info));
3330 info->band = fast_tx->band; 3354 info->band = fast_tx->band;
3331 info->control.vif = &sdata->vif; 3355 info->control.vif = &sdata->vif;
@@ -3548,6 +3572,115 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
3548 rcu_read_unlock(); 3572 rcu_read_unlock();
3549} 3573}
3550 3574
3575static int ieee80211_change_da(struct sk_buff *skb, struct sta_info *sta)
3576{
3577 struct ethhdr *eth;
3578 int err;
3579
3580 err = skb_ensure_writable(skb, ETH_HLEN);
3581 if (unlikely(err))
3582 return err;
3583
3584 eth = (void *)skb->data;
3585 ether_addr_copy(eth->h_dest, sta->sta.addr);
3586
3587 return 0;
3588}
3589
3590static bool ieee80211_multicast_to_unicast(struct sk_buff *skb,
3591 struct net_device *dev)
3592{
3593 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
3594 const struct ethhdr *eth = (void *)skb->data;
3595 const struct vlan_ethhdr *ethvlan = (void *)skb->data;
3596 __be16 ethertype;
3597
3598 if (likely(!is_multicast_ether_addr(eth->h_dest)))
3599 return false;
3600
3601 switch (sdata->vif.type) {
3602 case NL80211_IFTYPE_AP_VLAN:
3603 if (sdata->u.vlan.sta)
3604 return false;
3605 if (sdata->wdev.use_4addr)
3606 return false;
3607 /* fall through */
3608 case NL80211_IFTYPE_AP:
3609 /* check runtime toggle for this bss */
3610 if (!sdata->bss->multicast_to_unicast)
3611 return false;
3612 break;
3613 default:
3614 return false;
3615 }
3616
3617 /* multicast to unicast conversion only for some payload */
3618 ethertype = eth->h_proto;
3619 if (ethertype == htons(ETH_P_8021Q) && skb->len >= VLAN_ETH_HLEN)
3620 ethertype = ethvlan->h_vlan_encapsulated_proto;
3621 switch (ethertype) {
3622 case htons(ETH_P_ARP):
3623 case htons(ETH_P_IP):
3624 case htons(ETH_P_IPV6):
3625 break;
3626 default:
3627 return false;
3628 }
3629
3630 return true;
3631}
3632
3633static void
3634ieee80211_convert_to_unicast(struct sk_buff *skb, struct net_device *dev,
3635 struct sk_buff_head *queue)
3636{
3637 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
3638 struct ieee80211_local *local = sdata->local;
3639 const struct ethhdr *eth = (struct ethhdr *)skb->data;
3640 struct sta_info *sta, *first = NULL;
3641 struct sk_buff *cloned_skb;
3642
3643 rcu_read_lock();
3644
3645 list_for_each_entry_rcu(sta, &local->sta_list, list) {
3646 if (sdata != sta->sdata)
3647 /* AP-VLAN mismatch */
3648 continue;
3649 if (unlikely(ether_addr_equal(eth->h_source, sta->sta.addr)))
3650 /* do not send back to source */
3651 continue;
3652 if (!first) {
3653 first = sta;
3654 continue;
3655 }
3656 cloned_skb = skb_clone(skb, GFP_ATOMIC);
3657 if (!cloned_skb)
3658 goto multicast;
3659 if (unlikely(ieee80211_change_da(cloned_skb, sta))) {
3660 dev_kfree_skb(cloned_skb);
3661 goto multicast;
3662 }
3663 __skb_queue_tail(queue, cloned_skb);
3664 }
3665
3666 if (likely(first)) {
3667 if (unlikely(ieee80211_change_da(skb, first)))
3668 goto multicast;
3669 __skb_queue_tail(queue, skb);
3670 } else {
3671 /* no STA connected, drop */
3672 kfree_skb(skb);
3673 skb = NULL;
3674 }
3675
3676 goto out;
3677multicast:
3678 __skb_queue_purge(queue);
3679 __skb_queue_tail(queue, skb);
3680out:
3681 rcu_read_unlock();
3682}
3683
3551/** 3684/**
3552 * ieee80211_subif_start_xmit - netif start_xmit function for 802.3 vifs 3685 * ieee80211_subif_start_xmit - netif start_xmit function for 802.3 vifs
3553 * @skb: packet to be sent 3686 * @skb: packet to be sent
@@ -3558,7 +3691,17 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
3558netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, 3691netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
3559 struct net_device *dev) 3692 struct net_device *dev)
3560{ 3693{
3561 __ieee80211_subif_start_xmit(skb, dev, 0); 3694 if (unlikely(ieee80211_multicast_to_unicast(skb, dev))) {
3695 struct sk_buff_head queue;
3696
3697 __skb_queue_head_init(&queue);
3698 ieee80211_convert_to_unicast(skb, dev, &queue);
3699 while ((skb = __skb_dequeue(&queue)))
3700 __ieee80211_subif_start_xmit(skb, dev, 0);
3701 } else {
3702 __ieee80211_subif_start_xmit(skb, dev, 0);
3703 }
3704
3562 return NETDEV_TX_OK; 3705 return NETDEV_TX_OK;
3563} 3706}
3564 3707
@@ -4051,7 +4194,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
4051 } 4194 }
4052 4195
4053 if (ifmsh->sync_ops) 4196 if (ifmsh->sync_ops)
4054 ifmsh->sync_ops->adjust_tbtt(sdata, beacon); 4197 ifmsh->sync_ops->adjust_tsf(sdata, beacon);
4055 4198
4056 skb = dev_alloc_skb(local->tx_headroom + 4199 skb = dev_alloc_skb(local->tx_headroom +
4057 beacon->head_len + 4200 beacon->head_len +
@@ -4516,7 +4659,7 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
4516 struct sk_buff *skb, int tid, 4659 struct sk_buff *skb, int tid,
4517 enum nl80211_band band) 4660 enum nl80211_band band)
4518{ 4661{
4519 int ac = ieee802_1d_to_ac[tid & 7]; 4662 int ac = ieee80211_ac_from_tid(tid);
4520 4663
4521 skb_reset_mac_header(skb); 4664 skb_reset_mac_header(skb);
4522 skb_set_queue_mapping(skb, ac); 4665 skb_set_queue_mapping(skb, ac);
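The tx.c changes above add multicast-to-unicast conversion: when the new per-BSS toggle is enabled, the frame is cloned once per associated station and each clone's Ethernet destination is rewritten to that station's unicast address (ieee80211_change_da()). A minimal userspace sketch of just the address rewrite follows; the struct and addresses are illustrative stand-ins, not mac80211's sta_info handling.

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    #define ETH_ALEN 6

    /* simplified stand-in for struct ethhdr */
    struct eth_hdr {
            uint8_t h_dest[ETH_ALEN];
            uint8_t h_source[ETH_ALEN];
            uint16_t h_proto;
    };

    /* rewrite the destination of one frame copy, as ieee80211_change_da()
     * does on each cloned skb */
    static void change_da(struct eth_hdr *eth, const uint8_t *sta_addr)
    {
            memcpy(eth->h_dest, sta_addr, ETH_ALEN);
    }

    int main(void)
    {
            struct eth_hdr frame = {
                    .h_dest = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 }, /* multicast */
            };
            const uint8_t sta[ETH_ALEN] = { 0x02, 0x11, 0x22, 0x33, 0x44, 0x55 };

            /* a multicast destination has the low bit of octet 0 set */
            printf("multicast before rewrite: %d\n", frame.h_dest[0] & 0x01);
            change_da(&frame, sta);
            printf("multicast after rewrite:  %d\n", frame.h_dest[0] & 0x01);
            return 0;
    }

Converting to per-station unicast lets each copy go out at that station's negotiated rate instead of the lowest multicast rate, at the cost of duplicating the payload once per receiver.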
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 545c79a42a77..ac59fbd280df 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -3308,10 +3308,11 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata,
3308 struct ieee80211_local *local = sdata->local; 3308 struct ieee80211_local *local = sdata->local;
3309 struct ieee80211_sub_if_data *sdata_iter; 3309 struct ieee80211_sub_if_data *sdata_iter;
3310 enum nl80211_iftype iftype = sdata->wdev.iftype; 3310 enum nl80211_iftype iftype = sdata->wdev.iftype;
3311 int num[NUM_NL80211_IFTYPES];
3312 struct ieee80211_chanctx *ctx; 3311 struct ieee80211_chanctx *ctx;
3313 int num_different_channels = 0;
3314 int total = 1; 3312 int total = 1;
3313 struct iface_combination_params params = {
3314 .radar_detect = radar_detect,
3315 };
3315 3316
3316 lockdep_assert_held(&local->chanctx_mtx); 3317 lockdep_assert_held(&local->chanctx_mtx);
3317 3318
@@ -3322,12 +3323,19 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata,
3322 !chandef->chan)) 3323 !chandef->chan))
3323 return -EINVAL; 3324 return -EINVAL;
3324 3325
3325 if (chandef)
3326 num_different_channels = 1;
3327
3328 if (WARN_ON(iftype >= NUM_NL80211_IFTYPES)) 3326 if (WARN_ON(iftype >= NUM_NL80211_IFTYPES))
3329 return -EINVAL; 3327 return -EINVAL;
3330 3328
3329 if (sdata->vif.type == NL80211_IFTYPE_AP ||
3330 sdata->vif.type == NL80211_IFTYPE_MESH_POINT) {
3331 /*
3332 * always passing this is harmless, since it'll be the
3333 * same value that cfg80211 finds if it finds the same
3334 * interface ... and that's always allowed
3335 */
3336 params.new_beacon_int = sdata->vif.bss_conf.beacon_int;
3337 }
3338
3331 /* Always allow software iftypes */ 3339 /* Always allow software iftypes */
3332 if (local->hw.wiphy->software_iftypes & BIT(iftype)) { 3340 if (local->hw.wiphy->software_iftypes & BIT(iftype)) {
3333 if (radar_detect) 3341 if (radar_detect)
@@ -3335,24 +3343,26 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata,
3335 return 0; 3343 return 0;
3336 } 3344 }
3337 3345
3338 memset(num, 0, sizeof(num)); 3346 if (chandef)
3347 params.num_different_channels = 1;
3339 3348
3340 if (iftype != NL80211_IFTYPE_UNSPECIFIED) 3349 if (iftype != NL80211_IFTYPE_UNSPECIFIED)
3341 num[iftype] = 1; 3350 params.iftype_num[iftype] = 1;
3342 3351
3343 list_for_each_entry(ctx, &local->chanctx_list, list) { 3352 list_for_each_entry(ctx, &local->chanctx_list, list) {
3344 if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) 3353 if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED)
3345 continue; 3354 continue;
3346 radar_detect |= ieee80211_chanctx_radar_detect(local, ctx); 3355 params.radar_detect |=
3356 ieee80211_chanctx_radar_detect(local, ctx);
3347 if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE) { 3357 if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE) {
3348 num_different_channels++; 3358 params.num_different_channels++;
3349 continue; 3359 continue;
3350 } 3360 }
3351 if (chandef && chanmode == IEEE80211_CHANCTX_SHARED && 3361 if (chandef && chanmode == IEEE80211_CHANCTX_SHARED &&
3352 cfg80211_chandef_compatible(chandef, 3362 cfg80211_chandef_compatible(chandef,
3353 &ctx->conf.def)) 3363 &ctx->conf.def))
3354 continue; 3364 continue;
3355 num_different_channels++; 3365 params.num_different_channels++;
3356 } 3366 }
3357 3367
3358 list_for_each_entry_rcu(sdata_iter, &local->interfaces, list) { 3368 list_for_each_entry_rcu(sdata_iter, &local->interfaces, list) {
@@ -3365,16 +3375,14 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata,
3365 local->hw.wiphy->software_iftypes & BIT(wdev_iter->iftype)) 3375 local->hw.wiphy->software_iftypes & BIT(wdev_iter->iftype))
3366 continue; 3376 continue;
3367 3377
3368 num[wdev_iter->iftype]++; 3378 params.iftype_num[wdev_iter->iftype]++;
3369 total++; 3379 total++;
3370 } 3380 }
3371 3381
3372 if (total == 1 && !radar_detect) 3382 if (total == 1 && !params.radar_detect)
3373 return 0; 3383 return 0;
3374 3384
3375 return cfg80211_check_combinations(local->hw.wiphy, 3385 return cfg80211_check_combinations(local->hw.wiphy, &params);
3376 num_different_channels,
3377 radar_detect, num);
3378} 3386}
3379 3387
3380static void 3388static void
@@ -3390,12 +3398,10 @@ ieee80211_iter_max_chans(const struct ieee80211_iface_combination *c,
3390int ieee80211_max_num_channels(struct ieee80211_local *local) 3398int ieee80211_max_num_channels(struct ieee80211_local *local)
3391{ 3399{
3392 struct ieee80211_sub_if_data *sdata; 3400 struct ieee80211_sub_if_data *sdata;
3393 int num[NUM_NL80211_IFTYPES] = {};
3394 struct ieee80211_chanctx *ctx; 3401 struct ieee80211_chanctx *ctx;
3395 int num_different_channels = 0;
3396 u8 radar_detect = 0;
3397 u32 max_num_different_channels = 1; 3402 u32 max_num_different_channels = 1;
3398 int err; 3403 int err;
3404 struct iface_combination_params params = {0};
3399 3405
3400 lockdep_assert_held(&local->chanctx_mtx); 3406 lockdep_assert_held(&local->chanctx_mtx);
3401 3407
@@ -3403,17 +3409,17 @@ int ieee80211_max_num_channels(struct ieee80211_local *local)
3403 if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) 3409 if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED)
3404 continue; 3410 continue;
3405 3411
3406 num_different_channels++; 3412 params.num_different_channels++;
3407 3413
3408 radar_detect |= ieee80211_chanctx_radar_detect(local, ctx); 3414 params.radar_detect |=
3415 ieee80211_chanctx_radar_detect(local, ctx);
3409 } 3416 }
3410 3417
3411 list_for_each_entry_rcu(sdata, &local->interfaces, list) 3418 list_for_each_entry_rcu(sdata, &local->interfaces, list)
3412 num[sdata->wdev.iftype]++; 3419 params.iftype_num[sdata->wdev.iftype]++;
3413 3420
3414 err = cfg80211_iter_combinations(local->hw.wiphy, 3421 err = cfg80211_iter_combinations(local->hw.wiphy, &params,
3415 num_different_channels, radar_detect, 3422 ieee80211_iter_max_chans,
3416 num, ieee80211_iter_max_chans,
3417 &max_num_different_channels); 3423 &max_num_different_channels);
3418 if (err < 0) 3424 if (err < 0)
3419 return err; 3425 return err;
@@ -3456,3 +3462,10 @@ void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
3456 *byte_cnt = txqi->tin.backlog_bytes + frag_bytes; 3462 *byte_cnt = txqi->tin.backlog_bytes + frag_bytes;
3457} 3463}
3458EXPORT_SYMBOL(ieee80211_txq_get_depth); 3464EXPORT_SYMBOL(ieee80211_txq_get_depth);
3465
3466const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS] = {
3467 IEEE80211_WMM_IE_STA_QOSINFO_AC_VO,
3468 IEEE80211_WMM_IE_STA_QOSINFO_AC_VI,
3469 IEEE80211_WMM_IE_STA_QOSINFO_AC_BE,
3470 IEEE80211_WMM_IE_STA_QOSINFO_AC_BK
3471};
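The util.c hunks fold the separate num[], num_different_channels and radar_detect arguments into a single struct iface_combination_params that is handed to cfg80211_check_combinations()/cfg80211_iter_combinations(). A rough sketch of the counting pattern, using a hypothetical mirror of only the fields visible in this diff rather than the real cfg80211 structure:

    #include <stdio.h>

    #define NUM_IFTYPES 8   /* placeholder; the kernel uses NUM_NL80211_IFTYPES */

    /* hypothetical mirror of the fields this diff fills in */
    struct iface_combination_params {
            int num_different_channels;
            unsigned char radar_detect;
            int iftype_num[NUM_IFTYPES];
    };

    int main(void)
    {
            int iftypes[] = { 2, 2, 3 };    /* pretend interface list, index = iftype */
            struct iface_combination_params params = { 0 };
            unsigned int i;

            params.num_different_channels = 1;
            for (i = 0; i < sizeof(iftypes) / sizeof(iftypes[0]); i++)
                    params.iftype_num[iftypes[i]]++;

            /* a real caller would now pass &params to one validation helper
             * instead of three separate arguments */
            for (i = 0; i < NUM_IFTYPES; i++)
                    if (params.iftype_num[i])
                            printf("iftype %u: %d interface(s)\n", i, params.iftype_num[i]);
            return 0;
    }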
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index 6832bf6ab69f..19ec2189d3ac 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -436,14 +436,10 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
436 struct sta_info *sta, u8 opmode, 436 struct sta_info *sta, u8 opmode,
437 enum nl80211_band band) 437 enum nl80211_band band)
438{ 438{
439 struct ieee80211_local *local = sdata->local;
440 struct ieee80211_supported_band *sband;
441 enum ieee80211_sta_rx_bandwidth new_bw; 439 enum ieee80211_sta_rx_bandwidth new_bw;
442 u32 changed = 0; 440 u32 changed = 0;
443 u8 nss; 441 u8 nss;
444 442
445 sband = local->hw.wiphy->bands[band];
446
447 /* ignore - no support for BF yet */ 443 /* ignore - no support for BF yet */
448 if (opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF) 444 if (opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF)
449 return 0; 445 return 0;
@@ -527,8 +523,10 @@ void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
527 523
528 u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, opmode, band); 524 u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, opmode, band);
529 525
530 if (changed > 0) 526 if (changed > 0) {
527 ieee80211_recalc_min_chandef(sdata);
531 rate_control_rate_update(local, sband, sta, changed); 528 rate_control_rate_update(local, sband, sta, changed);
529 }
532} 530}
533 531
534void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, 532void ieee80211_get_vht_mask_from_cap(__le16 vht_cap,
diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c
index efa3f48f1ec5..73e8f347802e 100644
--- a/net/mac80211/wep.c
+++ b/net/mac80211/wep.c
@@ -293,7 +293,8 @@ ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx)
293 return RX_DROP_UNUSABLE; 293 return RX_DROP_UNUSABLE;
294 ieee80211_wep_remove_iv(rx->local, rx->skb, rx->key); 294 ieee80211_wep_remove_iv(rx->local, rx->skb, rx->key);
295 /* remove ICV */ 295 /* remove ICV */
296 if (pskb_trim(rx->skb, rx->skb->len - IEEE80211_WEP_ICV_LEN)) 296 if (!(status->flag & RX_FLAG_ICV_STRIPPED) &&
297 pskb_trim(rx->skb, rx->skb->len - IEEE80211_WEP_ICV_LEN))
297 return RX_DROP_UNUSABLE; 298 return RX_DROP_UNUSABLE;
298 } 299 }
299 300
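Both this wep.c hunk and the matching wpa.c TKIP hunk further below stop trimming the trailing ICV when the driver reports RX_FLAG_ICV_STRIPPED, i.e. the hardware already removed it. A minimal sketch of the conditional trim, using a placeholder flag value rather than the mac80211 definition:

    #include <stdio.h>

    #define RX_FLAG_ICV_STRIPPED 0x1   /* placeholder bit, not the kernel value */
    #define WEP_ICV_LEN 4              /* the WEP/TKIP ICV is 4 bytes */

    /* drop the trailing ICV from the frame unless the driver stripped it */
    static unsigned int rx_payload_len(unsigned int frame_len, unsigned int rx_flags)
    {
            if (!(rx_flags & RX_FLAG_ICV_STRIPPED))
                    frame_len -= WEP_ICV_LEN;
            return frame_len;
    }

    int main(void)
    {
            printf("software-stripped ICV: %u bytes left\n", rx_payload_len(100, 0));
            printf("hardware-stripped ICV: %u bytes left\n",
                   rx_payload_len(100, RX_FLAG_ICV_STRIPPED));
            return 0;
    }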
diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c
index 9eb0aee9105b..3e3d3014e9ab 100644
--- a/net/mac80211/wme.c
+++ b/net/mac80211/wme.c
@@ -236,26 +236,35 @@ void ieee80211_set_qos_hdr(struct ieee80211_sub_if_data *sdata,
236{ 236{
237 struct ieee80211_hdr *hdr = (void *)skb->data; 237 struct ieee80211_hdr *hdr = (void *)skb->data;
238 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); 238 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
239 u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
240 u8 flags;
239 u8 *p; 241 u8 *p;
240 u8 ack_policy, tid;
241 242
242 if (!ieee80211_is_data_qos(hdr->frame_control)) 243 if (!ieee80211_is_data_qos(hdr->frame_control))
243 return; 244 return;
244 245
245 p = ieee80211_get_qos_ctl(hdr); 246 p = ieee80211_get_qos_ctl(hdr);
246 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
247 247
248 /* preserve EOSP bit */ 248 /* set up the first byte */
249 ack_policy = *p & IEEE80211_QOS_CTL_EOSP; 249
250 /*
251 * preserve everything but the TID and ACK policy
252 * (which we both write here)
253 */
254 flags = *p & ~(IEEE80211_QOS_CTL_TID_MASK |
255 IEEE80211_QOS_CTL_ACK_POLICY_MASK);
250 256
251 if (is_multicast_ether_addr(hdr->addr1) || 257 if (is_multicast_ether_addr(hdr->addr1) ||
252 sdata->noack_map & BIT(tid)) { 258 sdata->noack_map & BIT(tid)) {
253 ack_policy |= IEEE80211_QOS_CTL_ACK_POLICY_NOACK; 259 flags |= IEEE80211_QOS_CTL_ACK_POLICY_NOACK;
254 info->flags |= IEEE80211_TX_CTL_NO_ACK; 260 info->flags |= IEEE80211_TX_CTL_NO_ACK;
255 } 261 }
256 262
257 /* qos header is 2 bytes */ 263 *p = flags | tid;
258 *p++ = ack_policy | tid; 264
265 /* set up the second byte */
266 p++;
267
259 if (ieee80211_vif_is_mesh(&sdata->vif)) { 268 if (ieee80211_vif_is_mesh(&sdata->vif)) {
260 /* preserve RSPI and Mesh PS Level bit */ 269 /* preserve RSPI and Mesh PS Level bit */
261 *p &= ((IEEE80211_QOS_CTL_RSPI | 270 *p &= ((IEEE80211_QOS_CTL_RSPI |
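The wme.c change makes ieee80211_set_qos_hdr() rewrite only the TID and ACK-policy fields of the first QoS control byte while preserving every other bit (previously only EOSP was kept). The masks below follow the 802.11 QoS control layout (TID in bits 0-3, EOSP in bit 4, ACK policy in bits 5-6); the rest is a standalone illustration of the masking, not kernel code.

    #include <stdio.h>
    #include <stdint.h>

    #define QOS_CTL_TID_MASK         0x0f  /* bits 0-3 */
    #define QOS_CTL_EOSP             0x10  /* bit 4 */
    #define QOS_CTL_ACK_POLICY_MASK  0x60  /* bits 5-6 */
    #define QOS_CTL_ACK_POLICY_NOACK 0x20

    int main(void)
    {
            uint8_t qc = 0x75;          /* arbitrary existing QoS control byte */
            uint8_t tid = 5;            /* new TID to write */
            uint8_t flags;

            /* preserve everything but TID and ACK policy, then rewrite them */
            flags = qc & ~(QOS_CTL_TID_MASK | QOS_CTL_ACK_POLICY_MASK);
            flags |= QOS_CTL_ACK_POLICY_NOACK;   /* e.g. multicast destination */
            qc = flags | tid;

            printf("qos ctl: 0x%02x (eosp preserved: %d)\n",
                   qc, !!(qc & QOS_CTL_EOSP));
            return 0;
    }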
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 42ce9bd4426f..c1ef22df865f 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -57,7 +57,7 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
57 57
58 if (info->control.hw_key && 58 if (info->control.hw_key &&
59 (info->flags & IEEE80211_TX_CTL_DONTFRAG || 59 (info->flags & IEEE80211_TX_CTL_DONTFRAG ||
60 tx->local->ops->set_frag_threshold) && 60 ieee80211_hw_check(&tx->local->hw, SUPPORTS_TX_FRAG)) &&
61 !(tx->key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC)) { 61 !(tx->key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC)) {
62 /* hwaccel - with no need for SW-generated MMIC */ 62 /* hwaccel - with no need for SW-generated MMIC */
63 return TX_CONTINUE; 63 return TX_CONTINUE;
@@ -294,7 +294,8 @@ ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx)
294 return RX_DROP_UNUSABLE; 294 return RX_DROP_UNUSABLE;
295 295
296 /* Trim ICV */ 296 /* Trim ICV */
297 skb_trim(skb, skb->len - IEEE80211_TKIP_ICV_LEN); 297 if (!(status->flag & RX_FLAG_ICV_STRIPPED))
298 skb_trim(skb, skb->len - IEEE80211_TKIP_ICV_LEN);
298 299
299 /* Remove IV */ 300 /* Remove IV */
300 memmove(skb->data + IEEE80211_TKIP_IV_LEN, skb->data, hdrlen); 301 memmove(skb->data + IEEE80211_TKIP_IV_LEN, skb->data, hdrlen);
diff --git a/net/mac802154/Makefile b/net/mac802154/Makefile
index 17a51e8389e2..5857bb1e1695 100644
--- a/net/mac802154/Makefile
+++ b/net/mac802154/Makefile
@@ -3,5 +3,3 @@ mac802154-objs := main.o rx.o tx.o mac_cmd.o mib.o \
3 iface.o llsec.o util.o cfg.o trace.o 3 iface.o llsec.o util.o cfg.o trace.o
4 4
5CFLAGS_trace.o := -I$(src) 5CFLAGS_trace.o := -I$(src)
6
7ccflags-y += -D__CHECK_ENDIAN__
diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c
index 6a3e1c2181d3..1e1c9b20bab7 100644
--- a/net/mac802154/llsec.c
+++ b/net/mac802154/llsec.c
@@ -18,6 +18,8 @@
18#include <linux/bug.h> 18#include <linux/bug.h>
19#include <linux/completion.h> 19#include <linux/completion.h>
20#include <linux/ieee802154.h> 20#include <linux/ieee802154.h>
21#include <linux/rculist.h>
22
21#include <crypto/aead.h> 23#include <crypto/aead.h>
22#include <crypto/skcipher.h> 24#include <crypto/skcipher.h>
23 25
diff --git a/net/mac802154/util.c b/net/mac802154/util.c
index f9fd0957ab67..7c03fb0ea34c 100644
--- a/net/mac802154/util.c
+++ b/net/mac802154/util.c
@@ -80,11 +80,11 @@ void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb,
80 80
81 if (skb->len > max_sifs_size) 81 if (skb->len > max_sifs_size)
82 hrtimer_start(&local->ifs_timer, 82 hrtimer_start(&local->ifs_timer,
83 ktime_set(0, hw->phy->lifs_period * NSEC_PER_USEC), 83 hw->phy->lifs_period * NSEC_PER_USEC,
84 HRTIMER_MODE_REL); 84 HRTIMER_MODE_REL);
85 else 85 else
86 hrtimer_start(&local->ifs_timer, 86 hrtimer_start(&local->ifs_timer,
87 ktime_set(0, hw->phy->sifs_period * NSEC_PER_USEC), 87 hw->phy->sifs_period * NSEC_PER_USEC,
88 HRTIMER_MODE_REL); 88 HRTIMER_MODE_REL);
89 } else { 89 } else {
90 ieee802154_wake_queue(hw); 90 ieee802154_wake_queue(hw);
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 15fe97644ffe..6414079aa729 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -7,7 +7,9 @@
7#include <linux/if_arp.h> 7#include <linux/if_arp.h>
8#include <linux/ipv6.h> 8#include <linux/ipv6.h>
9#include <linux/mpls.h> 9#include <linux/mpls.h>
10#include <linux/netconf.h>
10#include <linux/vmalloc.h> 11#include <linux/vmalloc.h>
12#include <linux/percpu.h>
11#include <net/ip.h> 13#include <net/ip.h>
12#include <net/dst.h> 14#include <net/dst.h>
13#include <net/sock.h> 15#include <net/sock.h>
@@ -17,8 +19,8 @@
17#include <net/netns/generic.h> 19#include <net/netns/generic.h>
18#if IS_ENABLED(CONFIG_IPV6) 20#if IS_ENABLED(CONFIG_IPV6)
19#include <net/ipv6.h> 21#include <net/ipv6.h>
20#include <net/addrconf.h>
21#endif 22#endif
23#include <net/addrconf.h>
22#include <net/nexthop.h> 24#include <net/nexthop.h>
23#include "internal.h" 25#include "internal.h"
24 26
@@ -48,11 +50,6 @@ static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index)
48 return rt; 50 return rt;
49} 51}
50 52
51static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev)
52{
53 return rcu_dereference_rtnl(dev->mpls_ptr);
54}
55
56bool mpls_output_possible(const struct net_device *dev) 53bool mpls_output_possible(const struct net_device *dev)
57{ 54{
58 return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev); 55 return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev);
@@ -98,18 +95,44 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
98} 95}
99EXPORT_SYMBOL_GPL(mpls_pkt_too_big); 96EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
100 97
101static u32 mpls_multipath_hash(struct mpls_route *rt, 98void mpls_stats_inc_outucastpkts(struct net_device *dev,
102 struct sk_buff *skb, bool bos) 99 const struct sk_buff *skb)
100{
101 struct mpls_dev *mdev;
102
103 if (skb->protocol == htons(ETH_P_MPLS_UC)) {
104 mdev = mpls_dev_get(dev);
105 if (mdev)
106 MPLS_INC_STATS_LEN(mdev, skb->len,
107 tx_packets,
108 tx_bytes);
109 } else if (skb->protocol == htons(ETH_P_IP)) {
110 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
111#if IS_ENABLED(CONFIG_IPV6)
112 } else if (skb->protocol == htons(ETH_P_IPV6)) {
113 struct inet6_dev *in6dev = __in6_dev_get(dev);
114
115 if (in6dev)
116 IP6_UPD_PO_STATS(dev_net(dev), in6dev,
117 IPSTATS_MIB_OUT, skb->len);
118#endif
119 }
120}
121EXPORT_SYMBOL_GPL(mpls_stats_inc_outucastpkts);
122
123static u32 mpls_multipath_hash(struct mpls_route *rt, struct sk_buff *skb)
103{ 124{
104 struct mpls_entry_decoded dec; 125 struct mpls_entry_decoded dec;
126 unsigned int mpls_hdr_len = 0;
105 struct mpls_shim_hdr *hdr; 127 struct mpls_shim_hdr *hdr;
106 bool eli_seen = false; 128 bool eli_seen = false;
107 int label_index; 129 int label_index;
108 u32 hash = 0; 130 u32 hash = 0;
109 131
110 for (label_index = 0; label_index < MAX_MP_SELECT_LABELS && !bos; 132 for (label_index = 0; label_index < MAX_MP_SELECT_LABELS;
111 label_index++) { 133 label_index++) {
112 if (!pskb_may_pull(skb, sizeof(*hdr) * label_index)) 134 mpls_hdr_len += sizeof(*hdr);
135 if (!pskb_may_pull(skb, mpls_hdr_len))
113 break; 136 break;
114 137
115 /* Read and decode the current label */ 138 /* Read and decode the current label */
@@ -134,37 +157,38 @@ static u32 mpls_multipath_hash(struct mpls_route *rt,
134 eli_seen = true; 157 eli_seen = true;
135 } 158 }
136 159
137 bos = dec.bos; 160 if (!dec.bos)
138 if (bos && pskb_may_pull(skb, sizeof(*hdr) * label_index + 161 continue;
139 sizeof(struct iphdr))) { 162
163 /* found bottom label; does skb have room for a header? */
164 if (pskb_may_pull(skb, mpls_hdr_len + sizeof(struct iphdr))) {
140 const struct iphdr *v4hdr; 165 const struct iphdr *v4hdr;
141 166
142 v4hdr = (const struct iphdr *)(mpls_hdr(skb) + 167 v4hdr = (const struct iphdr *)(hdr + 1);
143 label_index);
144 if (v4hdr->version == 4) { 168 if (v4hdr->version == 4) {
145 hash = jhash_3words(ntohl(v4hdr->saddr), 169 hash = jhash_3words(ntohl(v4hdr->saddr),
146 ntohl(v4hdr->daddr), 170 ntohl(v4hdr->daddr),
147 v4hdr->protocol, hash); 171 v4hdr->protocol, hash);
148 } else if (v4hdr->version == 6 && 172 } else if (v4hdr->version == 6 &&
149 pskb_may_pull(skb, sizeof(*hdr) * label_index + 173 pskb_may_pull(skb, mpls_hdr_len +
150 sizeof(struct ipv6hdr))) { 174 sizeof(struct ipv6hdr))) {
151 const struct ipv6hdr *v6hdr; 175 const struct ipv6hdr *v6hdr;
152 176
153 v6hdr = (const struct ipv6hdr *)(mpls_hdr(skb) + 177 v6hdr = (const struct ipv6hdr *)(hdr + 1);
154 label_index);
155
156 hash = __ipv6_addr_jhash(&v6hdr->saddr, hash); 178 hash = __ipv6_addr_jhash(&v6hdr->saddr, hash);
157 hash = __ipv6_addr_jhash(&v6hdr->daddr, hash); 179 hash = __ipv6_addr_jhash(&v6hdr->daddr, hash);
158 hash = jhash_1word(v6hdr->nexthdr, hash); 180 hash = jhash_1word(v6hdr->nexthdr, hash);
159 } 181 }
160 } 182 }
183
184 break;
161 } 185 }
162 186
163 return hash; 187 return hash;
164} 188}
165 189
166static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt, 190static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
167 struct sk_buff *skb, bool bos) 191 struct sk_buff *skb)
168{ 192{
169 int alive = ACCESS_ONCE(rt->rt_nhn_alive); 193 int alive = ACCESS_ONCE(rt->rt_nhn_alive);
170 u32 hash = 0; 194 u32 hash = 0;
@@ -180,7 +204,7 @@ static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
180 if (alive <= 0) 204 if (alive <= 0)
181 return NULL; 205 return NULL;
182 206
183 hash = mpls_multipath_hash(rt, skb, bos); 207 hash = mpls_multipath_hash(rt, skb);
184 nh_index = hash % alive; 208 nh_index = hash % alive;
185 if (alive == rt->rt_nhn) 209 if (alive == rt->rt_nhn)
186 goto out; 210 goto out;
@@ -253,6 +277,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
253 struct mpls_nh *nh; 277 struct mpls_nh *nh;
254 struct mpls_entry_decoded dec; 278 struct mpls_entry_decoded dec;
255 struct net_device *out_dev; 279 struct net_device *out_dev;
280 struct mpls_dev *out_mdev;
256 struct mpls_dev *mdev; 281 struct mpls_dev *mdev;
257 unsigned int hh_len; 282 unsigned int hh_len;
258 unsigned int new_header_size; 283 unsigned int new_header_size;
@@ -262,56 +287,66 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
262 /* Careful this entire function runs inside of an rcu critical section */ 287 /* Careful this entire function runs inside of an rcu critical section */
263 288
264 mdev = mpls_dev_get(dev); 289 mdev = mpls_dev_get(dev);
265 if (!mdev || !mdev->input_enabled) 290 if (!mdev)
266 goto drop; 291 goto drop;
267 292
268 if (skb->pkt_type != PACKET_HOST) 293 MPLS_INC_STATS_LEN(mdev, skb->len, rx_packets,
294 rx_bytes);
295
296 if (!mdev->input_enabled) {
297 MPLS_INC_STATS(mdev, rx_dropped);
269 goto drop; 298 goto drop;
299 }
300
301 if (skb->pkt_type != PACKET_HOST)
302 goto err;
270 303
271 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) 304 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
272 goto drop; 305 goto err;
273 306
274 if (!pskb_may_pull(skb, sizeof(*hdr))) 307 if (!pskb_may_pull(skb, sizeof(*hdr)))
275 goto drop; 308 goto err;
276 309
277 /* Read and decode the label */ 310 /* Read and decode the label */
278 hdr = mpls_hdr(skb); 311 hdr = mpls_hdr(skb);
279 dec = mpls_entry_decode(hdr); 312 dec = mpls_entry_decode(hdr);
280 313
281 /* Pop the label */
282 skb_pull(skb, sizeof(*hdr));
283 skb_reset_network_header(skb);
284
285 skb_orphan(skb);
286
287 rt = mpls_route_input_rcu(net, dec.label); 314 rt = mpls_route_input_rcu(net, dec.label);
288 if (!rt) 315 if (!rt) {
316 MPLS_INC_STATS(mdev, rx_noroute);
289 goto drop; 317 goto drop;
318 }
290 319
291 nh = mpls_select_multipath(rt, skb, dec.bos); 320 nh = mpls_select_multipath(rt, skb);
292 if (!nh) 321 if (!nh)
293 goto drop; 322 goto err;
294 323
295 /* Find the output device */ 324 /* Pop the label */
296 out_dev = rcu_dereference(nh->nh_dev); 325 skb_pull(skb, sizeof(*hdr));
297 if (!mpls_output_possible(out_dev)) 326 skb_reset_network_header(skb);
298 goto drop; 327
328 skb_orphan(skb);
299 329
300 if (skb_warn_if_lro(skb)) 330 if (skb_warn_if_lro(skb))
301 goto drop; 331 goto err;
302 332
303 skb_forward_csum(skb); 333 skb_forward_csum(skb);
304 334
305 /* Verify ttl is valid */ 335 /* Verify ttl is valid */
306 if (dec.ttl <= 1) 336 if (dec.ttl <= 1)
307 goto drop; 337 goto err;
308 dec.ttl -= 1; 338 dec.ttl -= 1;
309 339
340 /* Find the output device */
341 out_dev = rcu_dereference(nh->nh_dev);
342 if (!mpls_output_possible(out_dev))
343 goto tx_err;
344
310 /* Verify the destination can hold the packet */ 345 /* Verify the destination can hold the packet */
311 new_header_size = mpls_nh_header_size(nh); 346 new_header_size = mpls_nh_header_size(nh);
312 mtu = mpls_dev_mtu(out_dev); 347 mtu = mpls_dev_mtu(out_dev);
313 if (mpls_pkt_too_big(skb, mtu - new_header_size)) 348 if (mpls_pkt_too_big(skb, mtu - new_header_size))
314 goto drop; 349 goto tx_err;
315 350
316 hh_len = LL_RESERVED_SPACE(out_dev); 351 hh_len = LL_RESERVED_SPACE(out_dev);
317 if (!out_dev->header_ops) 352 if (!out_dev->header_ops)
@@ -319,7 +354,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
319 354
320 /* Ensure there is enough space for the headers in the skb */ 355 /* Ensure there is enough space for the headers in the skb */
321 if (skb_cow(skb, hh_len + new_header_size)) 356 if (skb_cow(skb, hh_len + new_header_size))
322 goto drop; 357 goto tx_err;
323 358
324 skb->dev = out_dev; 359 skb->dev = out_dev;
325 skb->protocol = htons(ETH_P_MPLS_UC); 360 skb->protocol = htons(ETH_P_MPLS_UC);
@@ -327,7 +362,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
327 if (unlikely(!new_header_size && dec.bos)) { 362 if (unlikely(!new_header_size && dec.bos)) {
328 /* Penultimate hop popping */ 363 /* Penultimate hop popping */
329 if (!mpls_egress(rt, skb, dec)) 364 if (!mpls_egress(rt, skb, dec))
330 goto drop; 365 goto err;
331 } else { 366 } else {
332 bool bos; 367 bool bos;
333 int i; 368 int i;
@@ -343,6 +378,8 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
343 } 378 }
344 } 379 }
345 380
381 mpls_stats_inc_outucastpkts(out_dev, skb);
382
346 /* If via wasn't specified then send out using device address */ 383 /* If via wasn't specified then send out using device address */
347 if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC) 384 if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC)
348 err = neigh_xmit(NEIGH_LINK_TABLE, out_dev, 385 err = neigh_xmit(NEIGH_LINK_TABLE, out_dev,
@@ -355,6 +392,13 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
355 __func__, err); 392 __func__, err);
356 return 0; 393 return 0;
357 394
395tx_err:
396 out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL;
397 if (out_mdev)
398 MPLS_INC_STATS(out_mdev, tx_errors);
399 goto drop;
400err:
401 MPLS_INC_STATS(mdev, rx_errors);
358drop: 402drop:
359 kfree_skb(skb); 403 kfree_skb(skb);
360 return NET_RX_DROP; 404 return NET_RX_DROP;
@@ -853,15 +897,279 @@ errout:
853 return err; 897 return err;
854} 898}
855 899
900static void mpls_get_stats(struct mpls_dev *mdev,
901 struct mpls_link_stats *stats)
902{
903 struct mpls_pcpu_stats *p;
904 int i;
905
906 memset(stats, 0, sizeof(*stats));
907
908 for_each_possible_cpu(i) {
909 struct mpls_link_stats local;
910 unsigned int start;
911
912 p = per_cpu_ptr(mdev->stats, i);
913 do {
914 start = u64_stats_fetch_begin(&p->syncp);
915 local = p->stats;
916 } while (u64_stats_fetch_retry(&p->syncp, start));
917
918 stats->rx_packets += local.rx_packets;
919 stats->rx_bytes += local.rx_bytes;
920 stats->tx_packets += local.tx_packets;
921 stats->tx_bytes += local.tx_bytes;
922 stats->rx_errors += local.rx_errors;
923 stats->tx_errors += local.tx_errors;
924 stats->rx_dropped += local.rx_dropped;
925 stats->tx_dropped += local.tx_dropped;
926 stats->rx_noroute += local.rx_noroute;
927 }
928}
929
930static int mpls_fill_stats_af(struct sk_buff *skb,
931 const struct net_device *dev)
932{
933 struct mpls_link_stats *stats;
934 struct mpls_dev *mdev;
935 struct nlattr *nla;
936
937 mdev = mpls_dev_get(dev);
938 if (!mdev)
939 return -ENODATA;
940
941 nla = nla_reserve_64bit(skb, MPLS_STATS_LINK,
942 sizeof(struct mpls_link_stats),
943 MPLS_STATS_UNSPEC);
944 if (!nla)
945 return -EMSGSIZE;
946
947 stats = nla_data(nla);
948 mpls_get_stats(mdev, stats);
949
950 return 0;
951}
952
953static size_t mpls_get_stats_af_size(const struct net_device *dev)
954{
955 struct mpls_dev *mdev;
956
957 mdev = mpls_dev_get(dev);
958 if (!mdev)
959 return 0;
960
961 return nla_total_size_64bit(sizeof(struct mpls_link_stats));
962}
963
964static int mpls_netconf_fill_devconf(struct sk_buff *skb, struct mpls_dev *mdev,
965 u32 portid, u32 seq, int event,
966 unsigned int flags, int type)
967{
968 struct nlmsghdr *nlh;
969 struct netconfmsg *ncm;
970 bool all = false;
971
972 nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
973 flags);
974 if (!nlh)
975 return -EMSGSIZE;
976
977 if (type == NETCONFA_ALL)
978 all = true;
979
980 ncm = nlmsg_data(nlh);
981 ncm->ncm_family = AF_MPLS;
982
983 if (nla_put_s32(skb, NETCONFA_IFINDEX, mdev->dev->ifindex) < 0)
984 goto nla_put_failure;
985
986 if ((all || type == NETCONFA_INPUT) &&
987 nla_put_s32(skb, NETCONFA_INPUT,
988 mdev->input_enabled) < 0)
989 goto nla_put_failure;
990
991 nlmsg_end(skb, nlh);
992 return 0;
993
994nla_put_failure:
995 nlmsg_cancel(skb, nlh);
996 return -EMSGSIZE;
997}
998
999static int mpls_netconf_msgsize_devconf(int type)
1000{
1001 int size = NLMSG_ALIGN(sizeof(struct netconfmsg))
1002 + nla_total_size(4); /* NETCONFA_IFINDEX */
1003 bool all = false;
1004
1005 if (type == NETCONFA_ALL)
1006 all = true;
1007
1008 if (all || type == NETCONFA_INPUT)
1009 size += nla_total_size(4);
1010
1011 return size;
1012}
1013
1014static void mpls_netconf_notify_devconf(struct net *net, int type,
1015 struct mpls_dev *mdev)
1016{
1017 struct sk_buff *skb;
1018 int err = -ENOBUFS;
1019
1020 skb = nlmsg_new(mpls_netconf_msgsize_devconf(type), GFP_KERNEL);
1021 if (!skb)
1022 goto errout;
1023
1024 err = mpls_netconf_fill_devconf(skb, mdev, 0, 0, RTM_NEWNETCONF,
1025 0, type);
1026 if (err < 0) {
1027 /* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */
1028 WARN_ON(err == -EMSGSIZE);
1029 kfree_skb(skb);
1030 goto errout;
1031 }
1032
1033 rtnl_notify(skb, net, 0, RTNLGRP_MPLS_NETCONF, NULL, GFP_KERNEL);
1034 return;
1035errout:
1036 if (err < 0)
1037 rtnl_set_sk_err(net, RTNLGRP_MPLS_NETCONF, err);
1038}
1039
1040static const struct nla_policy devconf_mpls_policy[NETCONFA_MAX + 1] = {
1041 [NETCONFA_IFINDEX] = { .len = sizeof(int) },
1042};
1043
1044static int mpls_netconf_get_devconf(struct sk_buff *in_skb,
1045 struct nlmsghdr *nlh)
1046{
1047 struct net *net = sock_net(in_skb->sk);
1048 struct nlattr *tb[NETCONFA_MAX + 1];
1049 struct netconfmsg *ncm;
1050 struct net_device *dev;
1051 struct mpls_dev *mdev;
1052 struct sk_buff *skb;
1053 int ifindex;
1054 int err;
1055
1056 err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX,
1057 devconf_mpls_policy);
1058 if (err < 0)
1059 goto errout;
1060
1061 err = -EINVAL;
1062 if (!tb[NETCONFA_IFINDEX])
1063 goto errout;
1064
1065 ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
1066 dev = __dev_get_by_index(net, ifindex);
1067 if (!dev)
1068 goto errout;
1069
1070 mdev = mpls_dev_get(dev);
1071 if (!mdev)
1072 goto errout;
1073
1074 err = -ENOBUFS;
1075 skb = nlmsg_new(mpls_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL);
1076 if (!skb)
1077 goto errout;
1078
1079 err = mpls_netconf_fill_devconf(skb, mdev,
1080 NETLINK_CB(in_skb).portid,
1081 nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
1082 NETCONFA_ALL);
1083 if (err < 0) {
1084 /* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */
1085 WARN_ON(err == -EMSGSIZE);
1086 kfree_skb(skb);
1087 goto errout;
1088 }
1089 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
1090errout:
1091 return err;
1092}
1093
1094static int mpls_netconf_dump_devconf(struct sk_buff *skb,
1095 struct netlink_callback *cb)
1096{
1097 struct net *net = sock_net(skb->sk);
1098 struct hlist_head *head;
1099 struct net_device *dev;
1100 struct mpls_dev *mdev;
1101 int idx, s_idx;
1102 int h, s_h;
1103
1104 s_h = cb->args[0];
1105 s_idx = idx = cb->args[1];
1106
1107 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
1108 idx = 0;
1109 head = &net->dev_index_head[h];
1110 rcu_read_lock();
1111 cb->seq = net->dev_base_seq;
1112 hlist_for_each_entry_rcu(dev, head, index_hlist) {
1113 if (idx < s_idx)
1114 goto cont;
1115 mdev = mpls_dev_get(dev);
1116 if (!mdev)
1117 goto cont;
1118 if (mpls_netconf_fill_devconf(skb, mdev,
1119 NETLINK_CB(cb->skb).portid,
1120 cb->nlh->nlmsg_seq,
1121 RTM_NEWNETCONF,
1122 NLM_F_MULTI,
1123 NETCONFA_ALL) < 0) {
1124 rcu_read_unlock();
1125 goto done;
1126 }
1127 nl_dump_check_consistent(cb, nlmsg_hdr(skb));
1128cont:
1129 idx++;
1130 }
1131 rcu_read_unlock();
1132 }
1133done:
1134 cb->args[0] = h;
1135 cb->args[1] = idx;
1136
1137 return skb->len;
1138}
1139
856#define MPLS_PERDEV_SYSCTL_OFFSET(field) \ 1140#define MPLS_PERDEV_SYSCTL_OFFSET(field) \
857 (&((struct mpls_dev *)0)->field) 1141 (&((struct mpls_dev *)0)->field)
858 1142
1143static int mpls_conf_proc(struct ctl_table *ctl, int write,
1144 void __user *buffer,
1145 size_t *lenp, loff_t *ppos)
1146{
1147 int oval = *(int *)ctl->data;
1148 int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
1149
1150 if (write) {
1151 struct mpls_dev *mdev = ctl->extra1;
1152 int i = (int *)ctl->data - (int *)mdev;
1153 struct net *net = ctl->extra2;
1154 int val = *(int *)ctl->data;
1155
1156 if (i == offsetof(struct mpls_dev, input_enabled) &&
1157 val != oval) {
1158 mpls_netconf_notify_devconf(net,
1159 NETCONFA_INPUT,
1160 mdev);
1161 }
1162 }
1163
1164 return ret;
1165}
1166
859static const struct ctl_table mpls_dev_table[] = { 1167static const struct ctl_table mpls_dev_table[] = {
860 { 1168 {
861 .procname = "input", 1169 .procname = "input",
862 .maxlen = sizeof(int), 1170 .maxlen = sizeof(int),
863 .mode = 0644, 1171 .mode = 0644,
864 .proc_handler = proc_dointvec, 1172 .proc_handler = mpls_conf_proc,
865 .data = MPLS_PERDEV_SYSCTL_OFFSET(input_enabled), 1173 .data = MPLS_PERDEV_SYSCTL_OFFSET(input_enabled),
866 }, 1174 },
867 { } 1175 { }
@@ -871,6 +1179,7 @@ static int mpls_dev_sysctl_register(struct net_device *dev,
871 struct mpls_dev *mdev) 1179 struct mpls_dev *mdev)
872{ 1180{
873 char path[sizeof("net/mpls/conf/") + IFNAMSIZ]; 1181 char path[sizeof("net/mpls/conf/") + IFNAMSIZ];
1182 struct net *net = dev_net(dev);
874 struct ctl_table *table; 1183 struct ctl_table *table;
875 int i; 1184 int i;
876 1185
@@ -881,8 +1190,11 @@ static int mpls_dev_sysctl_register(struct net_device *dev,
881 /* Table data contains only offsets relative to the base of 1190 /* Table data contains only offsets relative to the base of
882 * the mdev at this point, so make them absolute. 1191 * the mdev at this point, so make them absolute.
883 */ 1192 */
884 for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++) 1193 for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++) {
885 table[i].data = (char *)mdev + (uintptr_t)table[i].data; 1194 table[i].data = (char *)mdev + (uintptr_t)table[i].data;
1195 table[i].extra1 = mdev;
1196 table[i].extra2 = net;
1197 }
886 1198
887 snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name); 1199 snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name);
888 1200
@@ -911,6 +1223,7 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev)
911{ 1223{
912 struct mpls_dev *mdev; 1224 struct mpls_dev *mdev;
913 int err = -ENOMEM; 1225 int err = -ENOMEM;
1226 int i;
914 1227
915 ASSERT_RTNL(); 1228 ASSERT_RTNL();
916 1229
@@ -918,23 +1231,46 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev)
918 if (!mdev) 1231 if (!mdev)
919 return ERR_PTR(err); 1232 return ERR_PTR(err);
920 1233
1234 mdev->stats = alloc_percpu(struct mpls_pcpu_stats);
1235 if (!mdev->stats)
1236 goto free;
1237
1238 for_each_possible_cpu(i) {
1239 struct mpls_pcpu_stats *mpls_stats;
1240
1241 mpls_stats = per_cpu_ptr(mdev->stats, i);
1242 u64_stats_init(&mpls_stats->syncp);
1243 }
1244
921 err = mpls_dev_sysctl_register(dev, mdev); 1245 err = mpls_dev_sysctl_register(dev, mdev);
922 if (err) 1246 if (err)
923 goto free; 1247 goto free;
924 1248
1249 mdev->dev = dev;
925 rcu_assign_pointer(dev->mpls_ptr, mdev); 1250 rcu_assign_pointer(dev->mpls_ptr, mdev);
926 1251
927 return mdev; 1252 return mdev;
928 1253
929free: 1254free:
1255 free_percpu(mdev->stats);
930 kfree(mdev); 1256 kfree(mdev);
931 return ERR_PTR(err); 1257 return ERR_PTR(err);
932} 1258}
933 1259
1260static void mpls_dev_destroy_rcu(struct rcu_head *head)
1261{
1262 struct mpls_dev *mdev = container_of(head, struct mpls_dev, rcu);
1263
1264 free_percpu(mdev->stats);
1265 kfree(mdev);
1266}
1267
934static void mpls_ifdown(struct net_device *dev, int event) 1268static void mpls_ifdown(struct net_device *dev, int event)
935{ 1269{
936 struct mpls_route __rcu **platform_label; 1270 struct mpls_route __rcu **platform_label;
937 struct net *net = dev_net(dev); 1271 struct net *net = dev_net(dev);
1272 unsigned int nh_flags = RTNH_F_DEAD | RTNH_F_LINKDOWN;
1273 unsigned int alive;
938 unsigned index; 1274 unsigned index;
939 1275
940 platform_label = rtnl_dereference(net->mpls.platform_label); 1276 platform_label = rtnl_dereference(net->mpls.platform_label);
@@ -944,9 +1280,11 @@ static void mpls_ifdown(struct net_device *dev, int event)
944 if (!rt) 1280 if (!rt)
945 continue; 1281 continue;
946 1282
1283 alive = 0;
947 change_nexthops(rt) { 1284 change_nexthops(rt) {
948 if (rtnl_dereference(nh->nh_dev) != dev) 1285 if (rtnl_dereference(nh->nh_dev) != dev)
949 continue; 1286 goto next;
1287
950 switch (event) { 1288 switch (event) {
951 case NETDEV_DOWN: 1289 case NETDEV_DOWN:
952 case NETDEV_UNREGISTER: 1290 case NETDEV_UNREGISTER:
@@ -954,12 +1292,16 @@ static void mpls_ifdown(struct net_device *dev, int event)
954 /* fall through */ 1292 /* fall through */
955 case NETDEV_CHANGE: 1293 case NETDEV_CHANGE:
956 nh->nh_flags |= RTNH_F_LINKDOWN; 1294 nh->nh_flags |= RTNH_F_LINKDOWN;
957 ACCESS_ONCE(rt->rt_nhn_alive) = rt->rt_nhn_alive - 1;
958 break; 1295 break;
959 } 1296 }
960 if (event == NETDEV_UNREGISTER) 1297 if (event == NETDEV_UNREGISTER)
961 RCU_INIT_POINTER(nh->nh_dev, NULL); 1298 RCU_INIT_POINTER(nh->nh_dev, NULL);
1299next:
1300 if (!(nh->nh_flags & nh_flags))
1301 alive++;
962 } endfor_nexthops(rt); 1302 } endfor_nexthops(rt);
1303
1304 WRITE_ONCE(rt->rt_nhn_alive, alive);
963 } 1305 }
964} 1306}
965 1307
@@ -1045,7 +1387,7 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
1045 if (mdev) { 1387 if (mdev) {
1046 mpls_dev_sysctl_unregister(mdev); 1388 mpls_dev_sysctl_unregister(mdev);
1047 RCU_INIT_POINTER(dev->mpls_ptr, NULL); 1389 RCU_INIT_POINTER(dev->mpls_ptr, NULL);
1048 kfree_rcu(mdev, rcu); 1390 call_rcu(&mdev->rcu, mpls_dev_destroy_rcu);
1049 } 1391 }
1050 break; 1392 break;
1051 case NETDEV_CHANGENAME: 1393 case NETDEV_CHANGENAME:
@@ -1694,6 +2036,7 @@ static void mpls_net_exit(struct net *net)
1694 for (index = 0; index < platform_labels; index++) { 2036 for (index = 0; index < platform_labels; index++) {
1695 struct mpls_route *rt = rtnl_dereference(platform_label[index]); 2037 struct mpls_route *rt = rtnl_dereference(platform_label[index]);
1696 RCU_INIT_POINTER(platform_label[index], NULL); 2038 RCU_INIT_POINTER(platform_label[index], NULL);
2039 mpls_notify_route(net, index, rt, NULL, NULL);
1697 mpls_rt_free(rt); 2040 mpls_rt_free(rt);
1698 } 2041 }
1699 rtnl_unlock(); 2042 rtnl_unlock();
@@ -1706,6 +2049,12 @@ static struct pernet_operations mpls_net_ops = {
1706 .exit = mpls_net_exit, 2049 .exit = mpls_net_exit,
1707}; 2050};
1708 2051
2052static struct rtnl_af_ops mpls_af_ops __read_mostly = {
2053 .family = AF_MPLS,
2054 .fill_stats_af = mpls_fill_stats_af,
2055 .get_stats_af_size = mpls_get_stats_af_size,
2056};
2057
1709static int __init mpls_init(void) 2058static int __init mpls_init(void)
1710{ 2059{
1711 int err; 2060 int err;
@@ -1722,9 +2071,13 @@ static int __init mpls_init(void)
1722 2071
1723 dev_add_pack(&mpls_packet_type); 2072 dev_add_pack(&mpls_packet_type);
1724 2073
2074 rtnl_af_register(&mpls_af_ops);
2075
1725 rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, NULL); 2076 rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, NULL);
1726 rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, NULL); 2077 rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, NULL);
1727 rtnl_register(PF_MPLS, RTM_GETROUTE, NULL, mpls_dump_routes, NULL); 2078 rtnl_register(PF_MPLS, RTM_GETROUTE, NULL, mpls_dump_routes, NULL);
2079 rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf,
2080 mpls_netconf_dump_devconf, NULL);
1728 err = 0; 2081 err = 0;
1729out: 2082out:
1730 return err; 2083 return err;
@@ -1738,6 +2091,7 @@ module_init(mpls_init);
1738static void __exit mpls_exit(void) 2091static void __exit mpls_exit(void)
1739{ 2092{
1740 rtnl_unregister_all(PF_MPLS); 2093 rtnl_unregister_all(PF_MPLS);
2094 rtnl_af_unregister(&mpls_af_ops);
1741 dev_remove_pack(&mpls_packet_type); 2095 dev_remove_pack(&mpls_packet_type);
1742 unregister_netdevice_notifier(&mpls_dev_notifier); 2096 unregister_netdevice_notifier(&mpls_dev_notifier);
1743 unregister_pernet_subsys(&mpls_net_ops); 2097 unregister_pernet_subsys(&mpls_net_ops);
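The reworked mpls_multipath_hash() above walks 4-byte shim headers, accumulating the validated header length as it goes, and only looks at the inner IP header once it reaches the bottom-of-stack entry. A self-contained sketch of the shim decoding it relies on (label in bits 31-12, S bit in bit 8, TTL in bits 7-0); the entropy-label handling and jhash calls are omitted.

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    struct mpls_entry {
            uint32_t label;
            uint8_t ttl;
            uint8_t bos;
    };

    /* decode one big-endian MPLS shim header, as mpls_entry_decode() does */
    static struct mpls_entry decode(uint32_t shim_be)
    {
            uint32_t v = ntohl(shim_be);
            struct mpls_entry e = {
                    .label = v >> 12,
                    .bos = (v >> 8) & 1,
                    .ttl = v & 0xff,
            };
            return e;
    }

    int main(void)
    {
            /* label stack: 100 (not bottom of stack), 200 (bottom of stack) */
            uint32_t stack[] = { htonl(100 << 12),
                                 htonl((200 << 12) | 0x100 | 64) };
            unsigned int hdr_len = 0, i;

            for (i = 0; i < 2; i++) {
                    struct mpls_entry e = decode(stack[i]);

                    hdr_len += 4;
                    printf("label %u bos %u (validated %u bytes)\n",
                           e.label, e.bos, hdr_len);
                    if (e.bos)
                            break;    /* inner IP header starts here */
            }
            return 0;
    }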
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index bdfef6c3271a..76360d8b9579 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -9,13 +9,58 @@ struct mpls_entry_decoded {
9 u8 bos; 9 u8 bos;
10}; 10};
11 11
12struct mpls_pcpu_stats {
13 struct mpls_link_stats stats;
14 struct u64_stats_sync syncp;
15};
16
12struct mpls_dev { 17struct mpls_dev {
13 int input_enabled; 18 int input_enabled;
19 struct net_device *dev;
20 struct mpls_pcpu_stats __percpu *stats;
14 21
15 struct ctl_table_header *sysctl; 22 struct ctl_table_header *sysctl;
16 struct rcu_head rcu; 23 struct rcu_head rcu;
17}; 24};
18 25
26#if BITS_PER_LONG == 32
27
28#define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field) \
29 do { \
30 __typeof__(*(mdev)->stats) *ptr = \
31 raw_cpu_ptr((mdev)->stats); \
32 local_bh_disable(); \
33 u64_stats_update_begin(&ptr->syncp); \
34 ptr->stats.pkts_field++; \
35 ptr->stats.bytes_field += (len); \
36 u64_stats_update_end(&ptr->syncp); \
37 local_bh_enable(); \
38 } while (0)
39
40#define MPLS_INC_STATS(mdev, field) \
41 do { \
42 __typeof__(*(mdev)->stats) *ptr = \
43 raw_cpu_ptr((mdev)->stats); \
44 local_bh_disable(); \
45 u64_stats_update_begin(&ptr->syncp); \
46 ptr->stats.field++; \
47 u64_stats_update_end(&ptr->syncp); \
48 local_bh_enable(); \
49 } while (0)
50
51#else
52
53#define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field) \
54 do { \
55 this_cpu_inc((mdev)->stats->stats.pkts_field); \
56 this_cpu_add((mdev)->stats->stats.bytes_field, (len)); \
57 } while (0)
58
59#define MPLS_INC_STATS(mdev, field) \
60 this_cpu_inc((mdev)->stats->stats.field)
61
62#endif
63
19struct sk_buff; 64struct sk_buff;
20 65
21#define LABEL_NOT_SPECIFIED (1 << 20) 66#define LABEL_NOT_SPECIFIED (1 << 20)
@@ -114,6 +159,11 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *
114 return result; 159 return result;
115} 160}
116 161
162static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev)
163{
164 return rcu_dereference_rtnl(dev->mpls_ptr);
165}
166
117int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, 167int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels,
118 const u32 label[]); 168 const u32 label[]);
119int nla_get_labels(const struct nlattr *nla, u32 max_labels, u8 *labels, 169int nla_get_labels(const struct nlattr *nla, u32 max_labels, u8 *labels,
@@ -123,5 +173,7 @@ int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table,
123bool mpls_output_possible(const struct net_device *dev); 173bool mpls_output_possible(const struct net_device *dev);
124unsigned int mpls_dev_mtu(const struct net_device *dev); 174unsigned int mpls_dev_mtu(const struct net_device *dev);
125bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu); 175bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu);
176void mpls_stats_inc_outucastpkts(struct net_device *dev,
177 const struct sk_buff *skb);
126 178
127#endif /* MPLS_INTERNAL_H */ 179#endif /* MPLS_INTERNAL_H */
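internal.h introduces per-CPU MPLS counters: the MPLS_INC_STATS*() macros bump only the local CPU's copy (with u64_stats_sync protection on 32-bit), and mpls_get_stats() in the af_mpls.c hunk above sums every CPU's copy into the reported totals. A simplified single-threaded sketch of that split, with a fixed CPU count standing in for the possible-CPU iteration:

    #include <stdio.h>
    #include <stdint.h>

    #define NR_CPUS 4   /* placeholder; the kernel iterates all possible CPUs */

    struct pcpu_stats {
            uint64_t rx_packets;
            uint64_t rx_bytes;
    };

    static struct pcpu_stats stats[NR_CPUS];

    /* analogue of MPLS_INC_STATS_LEN(): bump only the local CPU's copy */
    static void inc_rx(int cpu, unsigned int len)
    {
            stats[cpu].rx_packets++;
            stats[cpu].rx_bytes += len;
    }

    int main(void)
    {
            struct pcpu_stats total = { 0 };
            int cpu;

            inc_rx(0, 100);
            inc_rx(2, 60);

            /* analogue of mpls_get_stats(): sum all per-CPU copies */
            for (cpu = 0; cpu < NR_CPUS; cpu++) {
                    total.rx_packets += stats[cpu].rx_packets;
                    total.rx_bytes += stats[cpu].rx_bytes;
            }
            printf("rx %llu pkts, %llu bytes\n",
                   (unsigned long long)total.rx_packets,
                   (unsigned long long)total.rx_bytes);
            return 0;
    }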
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index cf52cf30ac4b..e4e4424f9eb1 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -48,11 +48,15 @@ static int mpls_xmit(struct sk_buff *skb)
48 struct dst_entry *dst = skb_dst(skb); 48 struct dst_entry *dst = skb_dst(skb);
49 struct rtable *rt = NULL; 49 struct rtable *rt = NULL;
50 struct rt6_info *rt6 = NULL; 50 struct rt6_info *rt6 = NULL;
51 struct mpls_dev *out_mdev;
51 int err = 0; 52 int err = 0;
52 bool bos; 53 bool bos;
53 int i; 54 int i;
54 unsigned int ttl; 55 unsigned int ttl;
55 56
57 /* Find the output device */
58 out_dev = dst->dev;
59
56 /* Obtain the ttl */ 60 /* Obtain the ttl */
57 if (dst->ops->family == AF_INET) { 61 if (dst->ops->family == AF_INET) {
58 ttl = ip_hdr(skb)->ttl; 62 ttl = ip_hdr(skb)->ttl;
@@ -66,8 +70,6 @@ static int mpls_xmit(struct sk_buff *skb)
66 70
67 skb_orphan(skb); 71 skb_orphan(skb);
68 72
69 /* Find the output device */
70 out_dev = dst->dev;
71 if (!mpls_output_possible(out_dev) || 73 if (!mpls_output_possible(out_dev) ||
72 !dst->lwtstate || skb_warn_if_lro(skb)) 74 !dst->lwtstate || skb_warn_if_lro(skb))
73 goto drop; 75 goto drop;
@@ -109,6 +111,8 @@ static int mpls_xmit(struct sk_buff *skb)
109 bos = false; 111 bos = false;
110 } 112 }
111 113
114 mpls_stats_inc_outucastpkts(out_dev, skb);
115
112 if (rt) 116 if (rt)
113 err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway, 117 err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway,
114 skb); 118 skb);
@@ -122,18 +126,20 @@ static int mpls_xmit(struct sk_buff *skb)
122 return LWTUNNEL_XMIT_DONE; 126 return LWTUNNEL_XMIT_DONE;
123 127
124drop: 128drop:
129 out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL;
130 if (out_mdev)
131 MPLS_INC_STATS(out_mdev, tx_errors);
125 kfree_skb(skb); 132 kfree_skb(skb);
126 return -EINVAL; 133 return -EINVAL;
127} 134}
128 135
129static int mpls_build_state(struct net_device *dev, struct nlattr *nla, 136static int mpls_build_state(struct nlattr *nla,
130 unsigned int family, const void *cfg, 137 unsigned int family, const void *cfg,
131 struct lwtunnel_state **ts) 138 struct lwtunnel_state **ts)
132{ 139{
133 struct mpls_iptunnel_encap *tun_encap_info; 140 struct mpls_iptunnel_encap *tun_encap_info;
134 struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1]; 141 struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1];
135 struct lwtunnel_state *newts; 142 struct lwtunnel_state *newts;
136 int tun_encap_info_len;
137 int ret; 143 int ret;
138 144
139 ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla, 145 ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla,
@@ -144,13 +150,11 @@ static int mpls_build_state(struct net_device *dev, struct nlattr *nla,
144 if (!tb[MPLS_IPTUNNEL_DST]) 150 if (!tb[MPLS_IPTUNNEL_DST])
145 return -EINVAL; 151 return -EINVAL;
146 152
147 tun_encap_info_len = sizeof(*tun_encap_info);
148 153
149 newts = lwtunnel_state_alloc(tun_encap_info_len); 154 newts = lwtunnel_state_alloc(sizeof(*tun_encap_info));
150 if (!newts) 155 if (!newts)
151 return -ENOMEM; 156 return -ENOMEM;
152 157
153 newts->len = tun_encap_info_len;
154 tun_encap_info = mpls_lwtunnel_encap(newts); 158 tun_encap_info = mpls_lwtunnel_encap(newts);
155 ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS, 159 ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS,
156 &tun_encap_info->labels, tun_encap_info->label); 160 &tun_encap_info->labels, tun_encap_info->label);
@@ -218,6 +222,7 @@ static const struct lwtunnel_encap_ops mpls_iptun_ops = {
218 .fill_encap = mpls_fill_encap_info, 222 .fill_encap = mpls_fill_encap_info,
219 .get_encap_size = mpls_encap_nlsize, 223 .get_encap_size = mpls_encap_nlsize,
220 .cmp_encap = mpls_encap_cmp, 224 .cmp_encap = mpls_encap_cmp,
225 .owner = THIS_MODULE,
221}; 226};
222 227
223static int __init mpls_iptunnel_init(void) 228static int __init mpls_iptunnel_init(void)
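mpls_xmit() now resolves the output device before reading the TTL and charges any drop to that device's tx_errors counter, while the success path calls mpls_stats_inc_outucastpkts(). A plausible, simplified shape of that helper, shown only for the MPLS ethertype case (the in-tree version may also update the IPv4/IPv6 MIB counters):

    /* Sketch: count a transmitted unicast MPLS packet on the egress device.
     * Caller is assumed to hold rcu_read_lock(). */
    void mpls_stats_inc_outucastpkts(struct net_device *dev,
                                     const struct sk_buff *skb)
    {
            if (skb->protocol == htons(ETH_P_MPLS_UC)) {
                    struct mpls_dev *mdev = mpls_dev_get(dev);

                    if (mdev)
                            MPLS_INC_STATS(mdev, tx_packets);
            }
    }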
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index e8d56d9a4df2..9b28864cc36a 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -57,6 +57,10 @@ config NF_CONNTRACK
57config NF_LOG_COMMON 57config NF_LOG_COMMON
58 tristate 58 tristate
59 59
60config NF_LOG_NETDEV
61 tristate "Netdev packet logging"
62 select NF_LOG_COMMON
63
60if NF_CONNTRACK 64if NF_CONNTRACK
61 65
62config NF_CONNTRACK_MARK 66config NF_CONNTRACK_MARK
@@ -142,38 +146,39 @@ config NF_CONNTRACK_LABELS
142 to connection tracking entries. It is selected by the connlabel match. 146 to connection tracking entries. It is selected by the connlabel match.
143 147
144config NF_CT_PROTO_DCCP 148config NF_CT_PROTO_DCCP
145 tristate 'DCCP protocol connection tracking support' 149 bool 'DCCP protocol connection tracking support'
146 depends on NETFILTER_ADVANCED 150 depends on NETFILTER_ADVANCED
147 default IP_DCCP 151 default y
148 help 152 help
149 With this option enabled, the layer 3 independent connection 153 With this option enabled, the layer 3 independent connection
150 tracking code will be able to do state tracking on DCCP connections. 154 tracking code will be able to do state tracking on DCCP connections.
151 155
152 If unsure, say 'N'. 156 If unsure, say Y.
153 157
154config NF_CT_PROTO_GRE 158config NF_CT_PROTO_GRE
155 tristate 159 tristate
156 160
157config NF_CT_PROTO_SCTP 161config NF_CT_PROTO_SCTP
158 tristate 'SCTP protocol connection tracking support' 162 bool 'SCTP protocol connection tracking support'
159 depends on NETFILTER_ADVANCED 163 depends on NETFILTER_ADVANCED
160 default IP_SCTP 164 default y
165 select LIBCRC32C
161 help 166 help
162 With this option enabled, the layer 3 independent connection 167 With this option enabled, the layer 3 independent connection
163 tracking code will be able to do state tracking on SCTP connections. 168 tracking code will be able to do state tracking on SCTP connections.
164 169
165 If you want to compile it as a module, say M here and read 170 If unsure, say Y.
166 <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
167 171
168config NF_CT_PROTO_UDPLITE 172config NF_CT_PROTO_UDPLITE
169 tristate 'UDP-Lite protocol connection tracking support' 173 bool 'UDP-Lite protocol connection tracking support'
170 depends on NETFILTER_ADVANCED 174 depends on NETFILTER_ADVANCED
175 default y
171 help 176 help
172 With this option enabled, the layer 3 independent connection 177 With this option enabled, the layer 3 independent connection
173 tracking code will be able to do state tracking on UDP-Lite 178 tracking code will be able to do state tracking on UDP-Lite
174 connections. 179 connections.
175 180
176 To compile it as a module, choose M here. If unsure, say N. 181 If unsure, say Y.
177 182
178config NF_CONNTRACK_AMANDA 183config NF_CONNTRACK_AMANDA
179 tristate "Amanda backup protocol support" 184 tristate "Amanda backup protocol support"
@@ -380,20 +385,19 @@ config NF_NAT_NEEDED
380 default y 385 default y
381 386
382config NF_NAT_PROTO_DCCP 387config NF_NAT_PROTO_DCCP
383 tristate 388 bool
384 depends on NF_NAT && NF_CT_PROTO_DCCP 389 depends on NF_NAT && NF_CT_PROTO_DCCP
385 default NF_NAT && NF_CT_PROTO_DCCP 390 default NF_NAT && NF_CT_PROTO_DCCP
386 391
387config NF_NAT_PROTO_UDPLITE 392config NF_NAT_PROTO_UDPLITE
388 tristate 393 bool
389 depends on NF_NAT && NF_CT_PROTO_UDPLITE 394 depends on NF_NAT && NF_CT_PROTO_UDPLITE
390 default NF_NAT && NF_CT_PROTO_UDPLITE 395 default NF_NAT && NF_CT_PROTO_UDPLITE
391 396
392config NF_NAT_PROTO_SCTP 397config NF_NAT_PROTO_SCTP
393 tristate 398 bool
394 default NF_NAT && NF_CT_PROTO_SCTP 399 default NF_NAT && NF_CT_PROTO_SCTP
395 depends on NF_NAT && NF_CT_PROTO_SCTP 400 depends on NF_NAT && NF_CT_PROTO_SCTP
396 select LIBCRC32C
397 401
398config NF_NAT_AMANDA 402config NF_NAT_AMANDA
399 tristate 403 tristate
@@ -463,10 +467,10 @@ config NF_TABLES_NETDEV
463 This option enables support for the "netdev" table. 467 This option enables support for the "netdev" table.
464 468
465config NFT_EXTHDR 469config NFT_EXTHDR
466 tristate "Netfilter nf_tables IPv6 exthdr module" 470 tristate "Netfilter nf_tables exthdr module"
467 help 471 help
468 This option adds the "exthdr" expression that you can use to match 472 This option adds the "exthdr" expression that you can use to match
469 IPv6 extension headers. 473 IPv6 extension headers and tcp options.
470 474
471config NFT_META 475config NFT_META
472 tristate "Netfilter nf_tables meta module" 476 tristate "Netfilter nf_tables meta module"
@@ -474,6 +478,12 @@ config NFT_META
474 This option adds the "meta" expression that you can use to match and 478 This option adds the "meta" expression that you can use to match and
475 to set packet metainformation such as the packet mark. 479 to set packet metainformation such as the packet mark.
476 480
481config NFT_RT
482 tristate "Netfilter nf_tables routing module"
483 help
484 This option adds the "rt" expression that you can use to match
485 packet routing information such as the packet nexthop.
486
477config NFT_NUMGEN 487config NFT_NUMGEN
478 tristate "Netfilter nf_tables number generator module" 488 tristate "Netfilter nf_tables number generator module"
479 help 489 help
@@ -484,7 +494,7 @@ config NFT_CT
484 depends on NF_CONNTRACK 494 depends on NF_CONNTRACK
485 tristate "Netfilter nf_tables conntrack module" 495 tristate "Netfilter nf_tables conntrack module"
486 help 496 help
487 This option adds the "meta" expression that you can use to match 497 This option adds the "ct" expression that you can use to match
488 connection tracking information such as the flow state. 498 connection tracking information such as the flow state.
489 499
490config NFT_SET_RBTREE 500config NFT_SET_RBTREE
@@ -499,6 +509,12 @@ config NFT_SET_HASH
499 This option adds the "hash" set type that is used to build one-way 509 This option adds the "hash" set type that is used to build one-way
500 mappings between matchings and actions. 510 mappings between matchings and actions.
501 511
512config NFT_SET_BITMAP
513 tristate "Netfilter nf_tables bitmap set module"
514 help
515 This option adds the "bitmap" set type that is used to build sets
516 whose keys are smaller or equal to 16 bits.
517
502config NFT_COUNTER 518config NFT_COUNTER
503 tristate "Netfilter nf_tables counter module" 519 tristate "Netfilter nf_tables counter module"
504 help 520 help
@@ -541,6 +557,12 @@ config NFT_NAT
541 This option adds the "nat" expression that you can use to perform 557 This option adds the "nat" expression that you can use to perform
542 typical Network Address Translation (NAT) packet transformations. 558 typical Network Address Translation (NAT) packet transformations.
543 559
560config NFT_OBJREF
561 tristate "Netfilter nf_tables stateful object reference module"
562 help
563 This option adds the "objref" expression that allows you to refer to
564 stateful objects, such as counters and quotas.
565
544config NFT_QUEUE 566config NFT_QUEUE
545 depends on NETFILTER_NETLINK_QUEUE 567 depends on NETFILTER_NETLINK_QUEUE
546 tristate "Netfilter nf_tables queue module" 568 tristate "Netfilter nf_tables queue module"
@@ -581,6 +603,19 @@ config NFT_HASH
581 This option adds the "hash" expression that you can use to perform 603 This option adds the "hash" expression that you can use to perform
582 a hash operation on registers. 604 a hash operation on registers.
583 605
606config NFT_FIB
607 tristate
608
609config NFT_FIB_INET
610 depends on NF_TABLES_INET
611 depends on NFT_FIB_IPV4
612 depends on NFT_FIB_IPV6
613 tristate "Netfilter nf_tables fib inet support"
614 help
615 This option allows using the FIB expression from the inet table.
616 The lookup will be delegated to the IPv4 or IPv6 FIB depending
617 on the protocol of the packet.
618
584if NF_TABLES_NETDEV 619if NF_TABLES_NETDEV
585 620
586config NF_DUP_NETDEV 621config NF_DUP_NETDEV
@@ -1409,9 +1444,10 @@ config NETFILTER_XT_MATCH_SOCKET
1409 tristate '"socket" match support' 1444 tristate '"socket" match support'
1410 depends on NETFILTER_XTABLES 1445 depends on NETFILTER_XTABLES
1411 depends on NETFILTER_ADVANCED 1446 depends on NETFILTER_ADVANCED
1412 depends on !NF_CONNTRACK || NF_CONNTRACK
1413 depends on IPV6 || IPV6=n 1447 depends on IPV6 || IPV6=n
1414 depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n 1448 depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
1449 depends on NF_SOCKET_IPV4
1450 depends on NF_SOCKET_IPV6
1415 select NF_DEFRAG_IPV4 1451 select NF_DEFRAG_IPV4
1416 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n 1452 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n
1417 help 1453 help
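The conntrack protocol options for DCCP, SCTP and UDP-Lite switch from tristate to bool, so each tracker is either built into nf_conntrack or absent; there is no longer a standalone module to load. Code that needs to test such an option normally uses IS_ENABLED(), which for a bool option collapses to a compile-time constant. A small illustrative sketch (the helper itself is hypothetical):

    #include <linux/kconfig.h>

    /* Hypothetical helper: true when DCCP tracking was built in. */
    static inline bool dccp_tracking_built_in(void)
    {
            /* IS_ENABLED() covers both =y and =m; with a bool option it is
             * simply a 0/1 constant resolved at compile time. */
            return IS_ENABLED(CONFIG_NF_CT_PROTO_DCCP);
    }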
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index c23c3c84416f..c9b78e7b342f 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -5,6 +5,8 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
5nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o 5nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
6nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o 6nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
7nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o 7nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
8nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
9nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
8 10
9obj-$(CONFIG_NETFILTER) = netfilter.o 11obj-$(CONFIG_NETFILTER) = netfilter.o
10 12
@@ -16,11 +18,7 @@ obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
16# connection tracking 18# connection tracking
17obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o 19obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o
18 20
19# SCTP protocol connection tracking
20obj-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
21obj-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o 21obj-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
22obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
23obj-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o
24 22
25# netlink interface for nf_conntrack 23# netlink interface for nf_conntrack
26obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o 24obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o
@@ -45,17 +43,19 @@ obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o
45nf_nat-y := nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \ 43nf_nat-y := nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \
46 nf_nat_proto_udp.o nf_nat_proto_tcp.o nf_nat_helper.o 44 nf_nat_proto_udp.o nf_nat_proto_tcp.o nf_nat_helper.o
47 45
46# NAT protocols (nf_nat)
47nf_nat-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
48nf_nat-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
49
48# generic transport layer logging 50# generic transport layer logging
49obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o 51obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o
50 52
53# packet logging for netdev family
54obj-$(CONFIG_NF_LOG_NETDEV) += nf_log_netdev.o
55
51obj-$(CONFIG_NF_NAT) += nf_nat.o 56obj-$(CONFIG_NF_NAT) += nf_nat.o
52obj-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o 57obj-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
53 58
54# NAT protocols (nf_nat)
55obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
56obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
57obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
58
59# NAT helpers 59# NAT helpers
60obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o 60obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
61obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o 61obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
@@ -81,21 +81,26 @@ obj-$(CONFIG_NF_TABLES_NETDEV) += nf_tables_netdev.o
81obj-$(CONFIG_NFT_COMPAT) += nft_compat.o 81obj-$(CONFIG_NFT_COMPAT) += nft_compat.o
82obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o 82obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o
83obj-$(CONFIG_NFT_META) += nft_meta.o 83obj-$(CONFIG_NFT_META) += nft_meta.o
84obj-$(CONFIG_NFT_RT) += nft_rt.o
84obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o 85obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o
85obj-$(CONFIG_NFT_CT) += nft_ct.o 86obj-$(CONFIG_NFT_CT) += nft_ct.o
86obj-$(CONFIG_NFT_LIMIT) += nft_limit.o 87obj-$(CONFIG_NFT_LIMIT) += nft_limit.o
87obj-$(CONFIG_NFT_NAT) += nft_nat.o 88obj-$(CONFIG_NFT_NAT) += nft_nat.o
89obj-$(CONFIG_NFT_OBJREF) += nft_objref.o
88obj-$(CONFIG_NFT_QUEUE) += nft_queue.o 90obj-$(CONFIG_NFT_QUEUE) += nft_queue.o
89obj-$(CONFIG_NFT_QUOTA) += nft_quota.o 91obj-$(CONFIG_NFT_QUOTA) += nft_quota.o
90obj-$(CONFIG_NFT_REJECT) += nft_reject.o 92obj-$(CONFIG_NFT_REJECT) += nft_reject.o
91obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o 93obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o
92obj-$(CONFIG_NFT_SET_RBTREE) += nft_set_rbtree.o 94obj-$(CONFIG_NFT_SET_RBTREE) += nft_set_rbtree.o
93obj-$(CONFIG_NFT_SET_HASH) += nft_set_hash.o 95obj-$(CONFIG_NFT_SET_HASH) += nft_set_hash.o
96obj-$(CONFIG_NFT_SET_BITMAP) += nft_set_bitmap.o
94obj-$(CONFIG_NFT_COUNTER) += nft_counter.o 97obj-$(CONFIG_NFT_COUNTER) += nft_counter.o
95obj-$(CONFIG_NFT_LOG) += nft_log.o 98obj-$(CONFIG_NFT_LOG) += nft_log.o
96obj-$(CONFIG_NFT_MASQ) += nft_masq.o 99obj-$(CONFIG_NFT_MASQ) += nft_masq.o
97obj-$(CONFIG_NFT_REDIR) += nft_redir.o 100obj-$(CONFIG_NFT_REDIR) += nft_redir.o
98obj-$(CONFIG_NFT_HASH) += nft_hash.o 101obj-$(CONFIG_NFT_HASH) += nft_hash.o
102obj-$(CONFIG_NFT_FIB) += nft_fib.o
103obj-$(CONFIG_NFT_FIB_INET) += nft_fib_inet.o
99 104
100# nf_tables netdev 105# nf_tables netdev
101obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o 106obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 004af030ef1a..a87a6f8a74d8 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -102,17 +102,14 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
102 if (!entry) 102 if (!entry)
103 return -ENOMEM; 103 return -ENOMEM;
104 104
105 entry->orig_ops = reg; 105 nf_hook_entry_init(entry, reg);
106 entry->ops = *reg;
107 entry->next = NULL;
108 106
109 mutex_lock(&nf_hook_mutex); 107 mutex_lock(&nf_hook_mutex);
110 108
111 /* Find the spot in the list */ 109 /* Find the spot in the list */
112 while ((p = nf_entry_dereference(*pp)) != NULL) { 110 for (; (p = nf_entry_dereference(*pp)) != NULL; pp = &p->next) {
113 if (reg->priority < p->orig_ops->priority) 111 if (reg->priority < nf_hook_entry_priority(p))
114 break; 112 break;
115 pp = &p->next;
116 } 113 }
117 rcu_assign_pointer(entry->next, p); 114 rcu_assign_pointer(entry->next, p);
118 rcu_assign_pointer(*pp, entry); 115 rcu_assign_pointer(*pp, entry);
@@ -139,12 +136,11 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
139 return; 136 return;
140 137
141 mutex_lock(&nf_hook_mutex); 138 mutex_lock(&nf_hook_mutex);
142 while ((p = nf_entry_dereference(*pp)) != NULL) { 139 for (; (p = nf_entry_dereference(*pp)) != NULL; pp = &p->next) {
143 if (p->orig_ops == reg) { 140 if (nf_hook_entry_ops(p) == reg) {
144 rcu_assign_pointer(*pp, p->next); 141 rcu_assign_pointer(*pp, p->next);
145 break; 142 break;
146 } 143 }
147 pp = &p->next;
148 } 144 }
149 mutex_unlock(&nf_hook_mutex); 145 mutex_unlock(&nf_hook_mutex);
150 if (!p) { 146 if (!p) {
@@ -302,70 +298,40 @@ void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
302} 298}
303EXPORT_SYMBOL(_nf_unregister_hooks); 299EXPORT_SYMBOL(_nf_unregister_hooks);
304 300
305unsigned int nf_iterate(struct sk_buff *skb, 301/* Returns 1 if okfn() needs to be executed by the caller,
306 struct nf_hook_state *state, 302 * -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock. */
307 struct nf_hook_entry **entryp) 303int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
304 struct nf_hook_entry *entry)
308{ 305{
309 unsigned int verdict; 306 unsigned int verdict;
307 int ret;
310 308
311 /* 309 do {
312 * The caller must not block between calls to this 310 verdict = nf_hook_entry_hookfn(entry, skb, state);
313 * function because of risk of continuing from deleted element. 311 switch (verdict & NF_VERDICT_MASK) {
314 */ 312 case NF_ACCEPT:
315 while (*entryp) { 313 entry = rcu_dereference(entry->next);
316 if (state->thresh > (*entryp)->ops.priority) { 314 break;
317 *entryp = rcu_dereference((*entryp)->next); 315 case NF_DROP:
318 continue; 316 kfree_skb(skb);
319 } 317 ret = NF_DROP_GETERR(verdict);
320 318 if (ret == 0)
321 /* Optimization: we don't need to hold module 319 ret = -EPERM;
322 reference here, since function can't sleep. --RR */ 320 return ret;
323repeat: 321 case NF_QUEUE:
324 verdict = (*entryp)->ops.hook((*entryp)->ops.priv, skb, state); 322 ret = nf_queue(skb, state, &entry, verdict);
325 if (verdict != NF_ACCEPT) { 323 if (ret == 1 && entry)
326#ifdef CONFIG_NETFILTER_DEBUG
327 if (unlikely((verdict & NF_VERDICT_MASK)
328 > NF_MAX_VERDICT)) {
329 NFDEBUG("Evil return from %p(%u).\n",
330 (*entryp)->ops.hook, state->hook);
331 *entryp = rcu_dereference((*entryp)->next);
332 continue; 324 continue;
333 } 325 return ret;
334#endif 326 default:
335 if (verdict != NF_REPEAT) 327 /* Implicit handling for NF_STOLEN, as well as any other
336 return verdict; 328 * non conventional verdicts.
337 goto repeat; 329 */
330 return 0;
338 } 331 }
339 *entryp = rcu_dereference((*entryp)->next); 332 } while (entry);
340 }
341 return NF_ACCEPT;
342}
343
344 333
345/* Returns 1 if okfn() needs to be executed by the caller, 334 return 1;
346 * -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock. */
347int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state)
348{
349 struct nf_hook_entry *entry;
350 unsigned int verdict;
351 int ret = 0;
352
353 entry = rcu_dereference(state->hook_entries);
354next_hook:
355 verdict = nf_iterate(skb, state, &entry);
356 if (verdict == NF_ACCEPT || verdict == NF_STOP) {
357 ret = 1;
358 } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
359 kfree_skb(skb);
360 ret = NF_DROP_GETERR(verdict);
361 if (ret == 0)
362 ret = -EPERM;
363 } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
364 ret = nf_queue(skb, state, &entry, verdict);
365 if (ret == 1 && entry)
366 goto next_hook;
367 }
368 return ret;
369} 335}
370EXPORT_SYMBOL(nf_hook_slow); 336EXPORT_SYMBOL(nf_hook_slow);
371 337
@@ -409,7 +375,7 @@ void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
409{ 375{
410 void (*attach)(struct sk_buff *, const struct sk_buff *); 376 void (*attach)(struct sk_buff *, const struct sk_buff *);
411 377
412 if (skb->nfct) { 378 if (skb->_nfct) {
413 rcu_read_lock(); 379 rcu_read_lock();
414 attach = rcu_dereference(ip_ct_attach); 380 attach = rcu_dereference(ip_ct_attach);
415 if (attach) 381 if (attach)
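With nf_iterate() folded into it, nf_hook_slow() becomes a single loop over the nf_hook_entry list that dispatches on each hook's verdict: NF_ACCEPT advances to the next entry, NF_DROP frees the skb and returns an error, NF_QUEUE hands off to nf_queue(), and anything else (e.g. NF_STOLEN) returns 0. Per the comment above the function, a return of 1 means the caller still has to run okfn(). A hypothetical caller, shown only to make that contract concrete:

    /* Sketch: interpret nf_hook_slow()'s return value.
     *   1  -> hooks accepted the skb, caller invokes okfn()
     *   0  -> skb was queued or stolen, nothing left to do
     *  <0  -> skb was dropped and freed
     */
    static int run_hooks(struct sk_buff *skb, struct nf_hook_state *state,
                         struct nf_hook_entry *head,
                         int (*okfn)(struct net *, struct sock *, struct sk_buff *))
    {
            int ret;

            rcu_read_lock();
            ret = nf_hook_slow(skb, state, head);
            rcu_read_unlock();

            if (ret == 1)
                    ret = okfn(state->net, state->sk, skb);
            return ret;
    }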
diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig
index 234a8ec82076..4083a8051f0f 100644
--- a/net/netfilter/ipset/Kconfig
+++ b/net/netfilter/ipset/Kconfig
@@ -99,6 +99,15 @@ config IP_SET_HASH_IPPORTNET
99 99
100 To compile it as a module, choose M here. If unsure, say N. 100 To compile it as a module, choose M here. If unsure, say N.
101 101
102config IP_SET_HASH_IPMAC
103 tristate "hash:ip,mac set support"
104 depends on IP_SET
105 help
106 This option adds the hash:ip,mac set type support, by which
107 one can store IPv4/IPv6 address and MAC (ethernet address) pairs in a set.
108
109 To compile it as a module, choose M here. If unsure, say N.
110
102config IP_SET_HASH_MAC 111config IP_SET_HASH_MAC
103 tristate "hash:mac set support" 112 tristate "hash:mac set support"
104 depends on IP_SET 113 depends on IP_SET
diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile
index 3dbd5e958489..28ec148df02d 100644
--- a/net/netfilter/ipset/Makefile
+++ b/net/netfilter/ipset/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_IP_SET_BITMAP_PORT) += ip_set_bitmap_port.o
14 14
15# hash types 15# hash types
16obj-$(CONFIG_IP_SET_HASH_IP) += ip_set_hash_ip.o 16obj-$(CONFIG_IP_SET_HASH_IP) += ip_set_hash_ip.o
17obj-$(CONFIG_IP_SET_HASH_IPMAC) += ip_set_hash_ipmac.o
17obj-$(CONFIG_IP_SET_HASH_IPMARK) += ip_set_hash_ipmark.o 18obj-$(CONFIG_IP_SET_HASH_IPMARK) += ip_set_hash_ipmark.o
18obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o 19obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o
19obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o 20obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 2e8e7e5fb4a6..6f09a99298cd 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -22,6 +22,7 @@
22#define mtype_kadt IPSET_TOKEN(MTYPE, _kadt) 22#define mtype_kadt IPSET_TOKEN(MTYPE, _kadt)
23#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt) 23#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt)
24#define mtype_destroy IPSET_TOKEN(MTYPE, _destroy) 24#define mtype_destroy IPSET_TOKEN(MTYPE, _destroy)
25#define mtype_memsize IPSET_TOKEN(MTYPE, _memsize)
25#define mtype_flush IPSET_TOKEN(MTYPE, _flush) 26#define mtype_flush IPSET_TOKEN(MTYPE, _flush)
26#define mtype_head IPSET_TOKEN(MTYPE, _head) 27#define mtype_head IPSET_TOKEN(MTYPE, _head)
27#define mtype_same_set IPSET_TOKEN(MTYPE, _same_set) 28#define mtype_same_set IPSET_TOKEN(MTYPE, _same_set)
@@ -40,11 +41,8 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
40{ 41{
41 struct mtype *map = set->data; 42 struct mtype *map = set->data;
42 43
43 init_timer(&map->gc); 44 setup_timer(&map->gc, gc, (unsigned long)set);
44 map->gc.data = (unsigned long)set; 45 mod_timer(&map->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ);
45 map->gc.function = gc;
46 map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
47 add_timer(&map->gc);
48} 46}
49 47
50static void 48static void
@@ -82,6 +80,16 @@ mtype_flush(struct ip_set *set)
82 if (set->extensions & IPSET_EXT_DESTROY) 80 if (set->extensions & IPSET_EXT_DESTROY)
83 mtype_ext_cleanup(set); 81 mtype_ext_cleanup(set);
84 memset(map->members, 0, map->memsize); 82 memset(map->members, 0, map->memsize);
83 set->elements = 0;
84 set->ext_size = 0;
85}
86
87/* Calculate the actual memory size of the set data */
88static size_t
89mtype_memsize(const struct mtype *map, size_t dsize)
90{
91 return sizeof(*map) + map->memsize +
92 map->elements * dsize;
85} 93}
86 94
87static int 95static int
@@ -89,14 +97,15 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
89{ 97{
90 const struct mtype *map = set->data; 98 const struct mtype *map = set->data;
91 struct nlattr *nested; 99 struct nlattr *nested;
92 size_t memsize = sizeof(*map) + map->memsize; 100 size_t memsize = mtype_memsize(map, set->dsize) + set->ext_size;
93 101
94 nested = ipset_nest_start(skb, IPSET_ATTR_DATA); 102 nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
95 if (!nested) 103 if (!nested)
96 goto nla_put_failure; 104 goto nla_put_failure;
97 if (mtype_do_head(skb, map) || 105 if (mtype_do_head(skb, map) ||
98 nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || 106 nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
99 nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize))) 107 nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
108 nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements)))
100 goto nla_put_failure; 109 goto nla_put_failure;
101 if (unlikely(ip_set_put_flags(skb, set))) 110 if (unlikely(ip_set_put_flags(skb, set)))
102 goto nla_put_failure; 111 goto nla_put_failure;
@@ -140,6 +149,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
140 if (ret == IPSET_ADD_FAILED) { 149 if (ret == IPSET_ADD_FAILED) {
141 if (SET_WITH_TIMEOUT(set) && 150 if (SET_WITH_TIMEOUT(set) &&
142 ip_set_timeout_expired(ext_timeout(x, set))) { 151 ip_set_timeout_expired(ext_timeout(x, set))) {
152 set->elements--;
143 ret = 0; 153 ret = 0;
144 } else if (!(flags & IPSET_FLAG_EXIST)) { 154 } else if (!(flags & IPSET_FLAG_EXIST)) {
145 set_bit(e->id, map->members); 155 set_bit(e->id, map->members);
@@ -148,6 +158,8 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
148 /* Element is re-added, cleanup extensions */ 158 /* Element is re-added, cleanup extensions */
149 ip_set_ext_destroy(set, x); 159 ip_set_ext_destroy(set, x);
150 } 160 }
161 if (ret > 0)
162 set->elements--;
151 163
152 if (SET_WITH_TIMEOUT(set)) 164 if (SET_WITH_TIMEOUT(set))
153#ifdef IP_SET_BITMAP_STORED_TIMEOUT 165#ifdef IP_SET_BITMAP_STORED_TIMEOUT
@@ -159,12 +171,13 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
159 if (SET_WITH_COUNTER(set)) 171 if (SET_WITH_COUNTER(set))
160 ip_set_init_counter(ext_counter(x, set), ext); 172 ip_set_init_counter(ext_counter(x, set), ext);
161 if (SET_WITH_COMMENT(set)) 173 if (SET_WITH_COMMENT(set))
162 ip_set_init_comment(ext_comment(x, set), ext); 174 ip_set_init_comment(set, ext_comment(x, set), ext);
163 if (SET_WITH_SKBINFO(set)) 175 if (SET_WITH_SKBINFO(set))
164 ip_set_init_skbinfo(ext_skbinfo(x, set), ext); 176 ip_set_init_skbinfo(ext_skbinfo(x, set), ext);
165 177
166 /* Activate element */ 178 /* Activate element */
167 set_bit(e->id, map->members); 179 set_bit(e->id, map->members);
180 set->elements++;
168 181
169 return 0; 182 return 0;
170} 183}
@@ -181,6 +194,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
181 return -IPSET_ERR_EXIST; 194 return -IPSET_ERR_EXIST;
182 195
183 ip_set_ext_destroy(set, x); 196 ip_set_ext_destroy(set, x);
197 set->elements--;
184 if (SET_WITH_TIMEOUT(set) && 198 if (SET_WITH_TIMEOUT(set) &&
185 ip_set_timeout_expired(ext_timeout(x, set))) 199 ip_set_timeout_expired(ext_timeout(x, set)))
186 return -IPSET_ERR_EXIST; 200 return -IPSET_ERR_EXIST;
@@ -276,6 +290,7 @@ mtype_gc(unsigned long ul_set)
276 if (ip_set_timeout_expired(ext_timeout(x, set))) { 290 if (ip_set_timeout_expired(ext_timeout(x, set))) {
277 clear_bit(id, map->members); 291 clear_bit(id, map->members);
278 ip_set_ext_destroy(set, x); 292 ip_set_ext_destroy(set, x);
293 set->elements--;
279 } 294 }
280 } 295 }
281 spin_unlock_bh(&set->lock); 296 spin_unlock_bh(&set->lock);
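Two independent cleanups run through the bitmap type: the gc timer is armed with setup_timer() plus mod_timer() instead of open-coded init_timer() field assignments, and element/extension accounting moves into the generic set (set->elements, set->ext_size) so the netlink header can report IPSET_ATTR_ELEMENTS uniformly. The timer conversion pattern, sketched on a hypothetical structure (this is the pre-timer_setup(), unsigned-long-callback API used here):

    #include <linux/timer.h>
    #include <linux/jiffies.h>

    struct my_set { struct timer_list gc; /* ... */ };

    static void my_gc_start(struct my_set *s, void (*fn)(unsigned long),
                            unsigned long period)
    {
            /* Replaces: init_timer(); gc.data = ...; gc.function = fn;
             * gc.expires = ...; add_timer(); */
            setup_timer(&s->gc, fn, (unsigned long)s);
            mod_timer(&s->gc, jiffies + period);
    }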
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index a748b0c2c981..c296f9b606d4 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -36,7 +36,7 @@ struct ip_set_net {
36 bool is_destroyed; /* all sets are destroyed */ 36 bool is_destroyed; /* all sets are destroyed */
37}; 37};
38 38
39static int ip_set_net_id __read_mostly; 39static unsigned int ip_set_net_id __read_mostly;
40 40
41static inline struct ip_set_net *ip_set_pernet(struct net *net) 41static inline struct ip_set_net *ip_set_pernet(struct net *net)
42{ 42{
@@ -324,7 +324,7 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
324} 324}
325EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); 325EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
326 326
327typedef void (*destroyer)(void *); 327typedef void (*destroyer)(struct ip_set *, void *);
328/* ipset data extension types, in size order */ 328/* ipset data extension types, in size order */
329 329
330const struct ip_set_ext_type ip_set_extensions[] = { 330const struct ip_set_ext_type ip_set_extensions[] = {
@@ -426,20 +426,20 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
426 if (!SET_WITH_SKBINFO(set)) 426 if (!SET_WITH_SKBINFO(set))
427 return -IPSET_ERR_SKBINFO; 427 return -IPSET_ERR_SKBINFO;
428 fullmark = be64_to_cpu(nla_get_be64(tb[IPSET_ATTR_SKBMARK])); 428 fullmark = be64_to_cpu(nla_get_be64(tb[IPSET_ATTR_SKBMARK]));
429 ext->skbmark = fullmark >> 32; 429 ext->skbinfo.skbmark = fullmark >> 32;
430 ext->skbmarkmask = fullmark & 0xffffffff; 430 ext->skbinfo.skbmarkmask = fullmark & 0xffffffff;
431 } 431 }
432 if (tb[IPSET_ATTR_SKBPRIO]) { 432 if (tb[IPSET_ATTR_SKBPRIO]) {
433 if (!SET_WITH_SKBINFO(set)) 433 if (!SET_WITH_SKBINFO(set))
434 return -IPSET_ERR_SKBINFO; 434 return -IPSET_ERR_SKBINFO;
435 ext->skbprio = be32_to_cpu(nla_get_be32( 435 ext->skbinfo.skbprio =
436 tb[IPSET_ATTR_SKBPRIO])); 436 be32_to_cpu(nla_get_be32(tb[IPSET_ATTR_SKBPRIO]));
437 } 437 }
438 if (tb[IPSET_ATTR_SKBQUEUE]) { 438 if (tb[IPSET_ATTR_SKBQUEUE]) {
439 if (!SET_WITH_SKBINFO(set)) 439 if (!SET_WITH_SKBINFO(set))
440 return -IPSET_ERR_SKBINFO; 440 return -IPSET_ERR_SKBINFO;
441 ext->skbqueue = be16_to_cpu(nla_get_be16( 441 ext->skbinfo.skbqueue =
442 tb[IPSET_ATTR_SKBQUEUE])); 442 be16_to_cpu(nla_get_be16(tb[IPSET_ATTR_SKBQUEUE]));
443 } 443 }
444 return 0; 444 return 0;
445} 445}
@@ -541,7 +541,7 @@ int
541ip_set_test(ip_set_id_t index, const struct sk_buff *skb, 541ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
542 const struct xt_action_param *par, struct ip_set_adt_opt *opt) 542 const struct xt_action_param *par, struct ip_set_adt_opt *opt)
543{ 543{
544 struct ip_set *set = ip_set_rcu_get(par->net, index); 544 struct ip_set *set = ip_set_rcu_get(xt_net(par), index);
545 int ret = 0; 545 int ret = 0;
546 546
547 BUG_ON(!set); 547 BUG_ON(!set);
@@ -579,7 +579,7 @@ int
579ip_set_add(ip_set_id_t index, const struct sk_buff *skb, 579ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
580 const struct xt_action_param *par, struct ip_set_adt_opt *opt) 580 const struct xt_action_param *par, struct ip_set_adt_opt *opt)
581{ 581{
582 struct ip_set *set = ip_set_rcu_get(par->net, index); 582 struct ip_set *set = ip_set_rcu_get(xt_net(par), index);
583 int ret; 583 int ret;
584 584
585 BUG_ON(!set); 585 BUG_ON(!set);
@@ -601,7 +601,7 @@ int
601ip_set_del(ip_set_id_t index, const struct sk_buff *skb, 601ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
602 const struct xt_action_param *par, struct ip_set_adt_opt *opt) 602 const struct xt_action_param *par, struct ip_set_adt_opt *opt)
603{ 603{
604 struct ip_set *set = ip_set_rcu_get(par->net, index); 604 struct ip_set *set = ip_set_rcu_get(xt_net(par), index);
605 int ret = 0; 605 int ret = 0;
606 606
607 BUG_ON(!set); 607 BUG_ON(!set);
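In ip_set_core.c the skbmark/skbmarkmask/skbprio/skbqueue values parsed from netlink now live under ext->skbinfo, and the xtables callers go through the xt_net(par) accessor instead of dereferencing par->net directly. The grouping implied by the hunks above looks roughly like this (layout stated as an assumption, not the exact in-tree definition):

    /* Assumed shape of the grouped skbinfo extension. */
    struct ip_set_skbinfo {
            u32 skbmark;
            u32 skbmarkmask;
            u32 skbprio;
            u16 skbqueue;
    };

so the parser writes ext->skbinfo.skbmark and friends in one place, and the packet path can copy the whole struct when it applies those values to a matching skb.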
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index d32fd6b036bf..f236c0bc7b3f 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -85,6 +85,8 @@ struct htable {
85}; 85};
86 86
87#define hbucket(h, i) ((h)->bucket[i]) 87#define hbucket(h, i) ((h)->bucket[i])
88#define ext_size(n, dsize) \
89 (sizeof(struct hbucket) + (n) * (dsize))
88 90
89#ifndef IPSET_NET_COUNT 91#ifndef IPSET_NET_COUNT
90#define IPSET_NET_COUNT 1 92#define IPSET_NET_COUNT 1
@@ -150,24 +152,34 @@ htable_bits(u32 hashsize)
150#define INIT_CIDR(cidr, host_mask) \ 152#define INIT_CIDR(cidr, host_mask) \
151 DCIDR_PUT(((cidr) ? NCIDR_GET(cidr) : host_mask)) 153 DCIDR_PUT(((cidr) ? NCIDR_GET(cidr) : host_mask))
152 154
153#define SET_HOST_MASK(family) (family == AF_INET ? 32 : 128)
154
155#ifdef IP_SET_HASH_WITH_NET0 155#ifdef IP_SET_HASH_WITH_NET0
156/* cidr from 0 to SET_HOST_MASK() value and c = cidr + 1 */ 156/* cidr from 0 to HOST_MASK value and c = cidr + 1 */
157#define NLEN(family) (SET_HOST_MASK(family) + 1) 157#define NLEN (HOST_MASK + 1)
158#define CIDR_POS(c) ((c) - 1) 158#define CIDR_POS(c) ((c) - 1)
159#else 159#else
160/* cidr from 1 to SET_HOST_MASK() value and c = cidr + 1 */ 160/* cidr from 1 to HOST_MASK value and c = cidr + 1 */
161#define NLEN(family) SET_HOST_MASK(family) 161#define NLEN HOST_MASK
162#define CIDR_POS(c) ((c) - 2) 162#define CIDR_POS(c) ((c) - 2)
163#endif 163#endif
164 164
165#else 165#else
166#define NLEN(family) 0 166#define NLEN 0
167#endif /* IP_SET_HASH_WITH_NETS */ 167#endif /* IP_SET_HASH_WITH_NETS */
168 168
169#endif /* _IP_SET_HASH_GEN_H */ 169#endif /* _IP_SET_HASH_GEN_H */
170 170
171#ifndef MTYPE
172#error "MTYPE is not defined!"
173#endif
174
175#ifndef HTYPE
176#error "HTYPE is not defined!"
177#endif
178
179#ifndef HOST_MASK
180#error "HOST_MASK is not defined!"
181#endif
182
171/* Family dependent templates */ 183/* Family dependent templates */
172 184
173#undef ahash_data 185#undef ahash_data
@@ -191,7 +203,6 @@ htable_bits(u32 hashsize)
191#undef mtype_same_set 203#undef mtype_same_set
192#undef mtype_kadt 204#undef mtype_kadt
193#undef mtype_uadt 205#undef mtype_uadt
194#undef mtype
195 206
196#undef mtype_add 207#undef mtype_add
197#undef mtype_del 208#undef mtype_del
@@ -207,6 +218,7 @@ htable_bits(u32 hashsize)
207#undef mtype_variant 218#undef mtype_variant
208#undef mtype_data_match 219#undef mtype_data_match
209 220
221#undef htype
210#undef HKEY 222#undef HKEY
211 223
212#define mtype_data_equal IPSET_TOKEN(MTYPE, _data_equal) 224#define mtype_data_equal IPSET_TOKEN(MTYPE, _data_equal)
@@ -233,7 +245,6 @@ htable_bits(u32 hashsize)
233#define mtype_same_set IPSET_TOKEN(MTYPE, _same_set) 245#define mtype_same_set IPSET_TOKEN(MTYPE, _same_set)
234#define mtype_kadt IPSET_TOKEN(MTYPE, _kadt) 246#define mtype_kadt IPSET_TOKEN(MTYPE, _kadt)
235#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt) 247#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt)
236#define mtype MTYPE
237 248
238#define mtype_add IPSET_TOKEN(MTYPE, _add) 249#define mtype_add IPSET_TOKEN(MTYPE, _add)
239#define mtype_del IPSET_TOKEN(MTYPE, _del) 250#define mtype_del IPSET_TOKEN(MTYPE, _del)
@@ -249,62 +260,54 @@ htable_bits(u32 hashsize)
249#define mtype_variant IPSET_TOKEN(MTYPE, _variant) 260#define mtype_variant IPSET_TOKEN(MTYPE, _variant)
250#define mtype_data_match IPSET_TOKEN(MTYPE, _data_match) 261#define mtype_data_match IPSET_TOKEN(MTYPE, _data_match)
251 262
252#ifndef MTYPE
253#error "MTYPE is not defined!"
254#endif
255
256#ifndef HOST_MASK
257#error "HOST_MASK is not defined!"
258#endif
259
260#ifndef HKEY_DATALEN 263#ifndef HKEY_DATALEN
261#define HKEY_DATALEN sizeof(struct mtype_elem) 264#define HKEY_DATALEN sizeof(struct mtype_elem)
262#endif 265#endif
263 266
264#define HKEY(data, initval, htable_bits) \ 267#define htype MTYPE
265(jhash2((u32 *)(data), HKEY_DATALEN / sizeof(u32), initval) \
266 & jhash_mask(htable_bits))
267 268
268#ifndef htype 269#define HKEY(data, initval, htable_bits) \
269#ifndef HTYPE 270({ \
270#error "HTYPE is not defined!" 271 const u32 *__k = (const u32 *)data; \
271#endif /* HTYPE */ 272 u32 __l = HKEY_DATALEN / sizeof(u32); \
272#define htype HTYPE 273 \
274 BUILD_BUG_ON(HKEY_DATALEN % sizeof(u32) != 0); \
275 \
276 jhash2(__k, __l, initval) & jhash_mask(htable_bits); \
277})
273 278
274/* The generic hash structure */ 279/* The generic hash structure */
275struct htype { 280struct htype {
276 struct htable __rcu *table; /* the hash table */ 281 struct htable __rcu *table; /* the hash table */
282 struct timer_list gc; /* garbage collection when timeout enabled */
277 u32 maxelem; /* max elements in the hash */ 283 u32 maxelem; /* max elements in the hash */
278 u32 elements; /* current element (vs timeout) */
279 u32 initval; /* random jhash init value */ 284 u32 initval; /* random jhash init value */
280#ifdef IP_SET_HASH_WITH_MARKMASK 285#ifdef IP_SET_HASH_WITH_MARKMASK
281 u32 markmask; /* markmask value for mark mask to store */ 286 u32 markmask; /* markmask value for mark mask to store */
282#endif 287#endif
283 struct timer_list gc; /* garbage collection when timeout enabled */
284 struct mtype_elem next; /* temporary storage for uadd */
285#ifdef IP_SET_HASH_WITH_MULTI 288#ifdef IP_SET_HASH_WITH_MULTI
286 u8 ahash_max; /* max elements in an array block */ 289 u8 ahash_max; /* max elements in an array block */
287#endif 290#endif
288#ifdef IP_SET_HASH_WITH_NETMASK 291#ifdef IP_SET_HASH_WITH_NETMASK
289 u8 netmask; /* netmask value for subnets to store */ 292 u8 netmask; /* netmask value for subnets to store */
290#endif 293#endif
294 struct mtype_elem next; /* temporary storage for uadd */
291#ifdef IP_SET_HASH_WITH_NETS 295#ifdef IP_SET_HASH_WITH_NETS
292 struct net_prefixes nets[0]; /* book-keeping of prefixes */ 296 struct net_prefixes nets[NLEN]; /* book-keeping of prefixes */
293#endif 297#endif
294}; 298};
295#endif /* htype */
296 299
297#ifdef IP_SET_HASH_WITH_NETS 300#ifdef IP_SET_HASH_WITH_NETS
298/* Network cidr size book keeping when the hash stores different 301/* Network cidr size book keeping when the hash stores different
299 * sized networks. cidr == real cidr + 1 to support /0. 302 * sized networks. cidr == real cidr + 1 to support /0.
300 */ 303 */
301static void 304static void
302mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) 305mtype_add_cidr(struct htype *h, u8 cidr, u8 n)
303{ 306{
304 int i, j; 307 int i, j;
305 308
306 /* Add in increasing prefix order, so larger cidr first */ 309 /* Add in increasing prefix order, so larger cidr first */
307 for (i = 0, j = -1; i < nets_length && h->nets[i].cidr[n]; i++) { 310 for (i = 0, j = -1; i < NLEN && h->nets[i].cidr[n]; i++) {
308 if (j != -1) { 311 if (j != -1) {
309 continue; 312 continue;
310 } else if (h->nets[i].cidr[n] < cidr) { 313 } else if (h->nets[i].cidr[n] < cidr) {
@@ -323,11 +326,11 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
323} 326}
324 327
325static void 328static void
326mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) 329mtype_del_cidr(struct htype *h, u8 cidr, u8 n)
327{ 330{
328 u8 i, j, net_end = nets_length - 1; 331 u8 i, j, net_end = NLEN - 1;
329 332
330 for (i = 0; i < nets_length; i++) { 333 for (i = 0; i < NLEN; i++) {
331 if (h->nets[i].cidr[n] != cidr) 334 if (h->nets[i].cidr[n] != cidr)
332 continue; 335 continue;
333 h->nets[CIDR_POS(cidr)].nets[n]--; 336 h->nets[CIDR_POS(cidr)].nets[n]--;
@@ -343,24 +346,9 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
343 346
344/* Calculate the actual memory size of the set data */ 347/* Calculate the actual memory size of the set data */
345static size_t 348static size_t
346mtype_ahash_memsize(const struct htype *h, const struct htable *t, 349mtype_ahash_memsize(const struct htype *h, const struct htable *t)
347 u8 nets_length, size_t dsize)
348{ 350{
349 u32 i; 351 return sizeof(*h) + sizeof(*t);
350 struct hbucket *n;
351 size_t memsize = sizeof(*h) + sizeof(*t);
352
353#ifdef IP_SET_HASH_WITH_NETS
354 memsize += sizeof(struct net_prefixes) * nets_length;
355#endif
356 for (i = 0; i < jhash_size(t->htable_bits); i++) {
357 n = rcu_dereference_bh(hbucket(t, i));
358 if (!n)
359 continue;
360 memsize += sizeof(struct hbucket) + n->size * dsize;
361 }
362
363 return memsize;
364} 352}
365 353
366/* Get the ith element from the array block n */ 354/* Get the ith element from the array block n */
@@ -398,9 +386,10 @@ mtype_flush(struct ip_set *set)
398 kfree_rcu(n, rcu); 386 kfree_rcu(n, rcu);
399 } 387 }
400#ifdef IP_SET_HASH_WITH_NETS 388#ifdef IP_SET_HASH_WITH_NETS
401 memset(h->nets, 0, sizeof(struct net_prefixes) * NLEN(set->family)); 389 memset(h->nets, 0, sizeof(h->nets));
402#endif 390#endif
403 h->elements = 0; 391 set->elements = 0;
392 set->ext_size = 0;
404} 393}
405 394
406/* Destroy the hashtable part of the set */ 395/* Destroy the hashtable part of the set */
@@ -444,11 +433,8 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
444{ 433{
445 struct htype *h = set->data; 434 struct htype *h = set->data;
446 435
447 init_timer(&h->gc); 436 setup_timer(&h->gc, gc, (unsigned long)set);
448 h->gc.data = (unsigned long)set; 437 mod_timer(&h->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ);
449 h->gc.function = gc;
450 h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
451 add_timer(&h->gc);
452 pr_debug("gc initialized, run in every %u\n", 438 pr_debug("gc initialized, run in every %u\n",
453 IPSET_GC_PERIOD(set->timeout)); 439 IPSET_GC_PERIOD(set->timeout));
454} 440}
@@ -473,12 +459,13 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b)
473 459
474/* Delete expired elements from the hashtable */ 460/* Delete expired elements from the hashtable */
475static void 461static void
476mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize) 462mtype_expire(struct ip_set *set, struct htype *h)
477{ 463{
478 struct htable *t; 464 struct htable *t;
479 struct hbucket *n, *tmp; 465 struct hbucket *n, *tmp;
480 struct mtype_elem *data; 466 struct mtype_elem *data;
481 u32 i, j, d; 467 u32 i, j, d;
468 size_t dsize = set->dsize;
482#ifdef IP_SET_HASH_WITH_NETS 469#ifdef IP_SET_HASH_WITH_NETS
483 u8 k; 470 u8 k;
484#endif 471#endif
@@ -494,21 +481,20 @@ mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
494 continue; 481 continue;
495 } 482 }
496 data = ahash_data(n, j, dsize); 483 data = ahash_data(n, j, dsize);
497 if (ip_set_timeout_expired(ext_timeout(data, set))) { 484 if (!ip_set_timeout_expired(ext_timeout(data, set)))
498 pr_debug("expired %u/%u\n", i, j); 485 continue;
499 clear_bit(j, n->used); 486 pr_debug("expired %u/%u\n", i, j);
500 smp_mb__after_atomic(); 487 clear_bit(j, n->used);
488 smp_mb__after_atomic();
501#ifdef IP_SET_HASH_WITH_NETS 489#ifdef IP_SET_HASH_WITH_NETS
502 for (k = 0; k < IPSET_NET_COUNT; k++) 490 for (k = 0; k < IPSET_NET_COUNT; k++)
503 mtype_del_cidr(h, 491 mtype_del_cidr(h,
504 NCIDR_PUT(DCIDR_GET(data->cidr, 492 NCIDR_PUT(DCIDR_GET(data->cidr, k)),
505 k)), 493 k);
506 nets_length, k);
507#endif 494#endif
508 ip_set_ext_destroy(set, data); 495 ip_set_ext_destroy(set, data);
509 h->elements--; 496 set->elements--;
510 d++; 497 d++;
511 }
512 } 498 }
513 if (d >= AHASH_INIT_SIZE) { 499 if (d >= AHASH_INIT_SIZE) {
514 if (d >= n->size) { 500 if (d >= n->size) {
@@ -532,6 +518,7 @@ mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
532 d++; 518 d++;
533 } 519 }
534 tmp->pos = d; 520 tmp->pos = d;
521 set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize);
535 rcu_assign_pointer(hbucket(t, i), tmp); 522 rcu_assign_pointer(hbucket(t, i), tmp);
536 kfree_rcu(n, rcu); 523 kfree_rcu(n, rcu);
537 } 524 }
@@ -546,7 +533,7 @@ mtype_gc(unsigned long ul_set)
546 533
547 pr_debug("called\n"); 534 pr_debug("called\n");
548 spin_lock_bh(&set->lock); 535 spin_lock_bh(&set->lock);
549 mtype_expire(set, h, NLEN(set->family), set->dsize); 536 mtype_expire(set, h);
550 spin_unlock_bh(&set->lock); 537 spin_unlock_bh(&set->lock);
551 538
552 h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; 539 h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
@@ -563,7 +550,7 @@ mtype_resize(struct ip_set *set, bool retried)
563 struct htype *h = set->data; 550 struct htype *h = set->data;
564 struct htable *t, *orig; 551 struct htable *t, *orig;
565 u8 htable_bits; 552 u8 htable_bits;
566 size_t dsize = set->dsize; 553 size_t extsize, dsize = set->dsize;
567#ifdef IP_SET_HASH_WITH_NETS 554#ifdef IP_SET_HASH_WITH_NETS
568 u8 flags; 555 u8 flags;
569 struct mtype_elem *tmp; 556 struct mtype_elem *tmp;
@@ -606,6 +593,7 @@ retry:
606 /* There can't be another parallel resizing, but dumping is possible */ 593 /* There can't be another parallel resizing, but dumping is possible */
607 atomic_set(&orig->ref, 1); 594 atomic_set(&orig->ref, 1);
608 atomic_inc(&orig->uref); 595 atomic_inc(&orig->uref);
596 extsize = 0;
609 pr_debug("attempt to resize set %s from %u to %u, t %p\n", 597 pr_debug("attempt to resize set %s from %u to %u, t %p\n",
610 set->name, orig->htable_bits, htable_bits, orig); 598 set->name, orig->htable_bits, htable_bits, orig);
611 for (i = 0; i < jhash_size(orig->htable_bits); i++) { 599 for (i = 0; i < jhash_size(orig->htable_bits); i++) {
@@ -636,6 +624,7 @@ retry:
636 goto cleanup; 624 goto cleanup;
637 } 625 }
638 m->size = AHASH_INIT_SIZE; 626 m->size = AHASH_INIT_SIZE;
627 extsize = ext_size(AHASH_INIT_SIZE, dsize);
639 RCU_INIT_POINTER(hbucket(t, key), m); 628 RCU_INIT_POINTER(hbucket(t, key), m);
640 } else if (m->pos >= m->size) { 629 } else if (m->pos >= m->size) {
641 struct hbucket *ht; 630 struct hbucket *ht;
@@ -655,6 +644,7 @@ retry:
655 memcpy(ht, m, sizeof(struct hbucket) + 644 memcpy(ht, m, sizeof(struct hbucket) +
656 m->size * dsize); 645 m->size * dsize);
657 ht->size = m->size + AHASH_INIT_SIZE; 646 ht->size = m->size + AHASH_INIT_SIZE;
647 extsize += ext_size(AHASH_INIT_SIZE, dsize);
658 kfree(m); 648 kfree(m);
659 m = ht; 649 m = ht;
660 RCU_INIT_POINTER(hbucket(t, key), ht); 650 RCU_INIT_POINTER(hbucket(t, key), ht);
@@ -668,6 +658,7 @@ retry:
668 } 658 }
669 } 659 }
670 rcu_assign_pointer(h->table, t); 660 rcu_assign_pointer(h->table, t);
661 set->ext_size = extsize;
671 662
672 spin_unlock_bh(&set->lock); 663 spin_unlock_bh(&set->lock);
673 664
@@ -715,11 +706,11 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
715 bool deleted = false, forceadd = false, reuse = false; 706 bool deleted = false, forceadd = false, reuse = false;
716 u32 key, multi = 0; 707 u32 key, multi = 0;
717 708
718 if (h->elements >= h->maxelem) { 709 if (set->elements >= h->maxelem) {
719 if (SET_WITH_TIMEOUT(set)) 710 if (SET_WITH_TIMEOUT(set))
720 /* FIXME: when set is full, we slow down here */ 711 /* FIXME: when set is full, we slow down here */
721 mtype_expire(set, h, NLEN(set->family), set->dsize); 712 mtype_expire(set, h);
722 if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set)) 713 if (set->elements >= h->maxelem && SET_WITH_FORCEADD(set))
723 forceadd = true; 714 forceadd = true;
724 } 715 }
725 716
@@ -727,20 +718,15 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
727 key = HKEY(value, h->initval, t->htable_bits); 718 key = HKEY(value, h->initval, t->htable_bits);
728 n = __ipset_dereference_protected(hbucket(t, key), 1); 719 n = __ipset_dereference_protected(hbucket(t, key), 1);
729 if (!n) { 720 if (!n) {
730 if (forceadd) { 721 if (forceadd || set->elements >= h->maxelem)
731 if (net_ratelimit())
732 pr_warn("Set %s is full, maxelem %u reached\n",
733 set->name, h->maxelem);
734 return -IPSET_ERR_HASH_FULL;
735 } else if (h->elements >= h->maxelem) {
736 goto set_full; 722 goto set_full;
737 }
738 old = NULL; 723 old = NULL;
739 n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize, 724 n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize,
740 GFP_ATOMIC); 725 GFP_ATOMIC);
741 if (!n) 726 if (!n)
742 return -ENOMEM; 727 return -ENOMEM;
743 n->size = AHASH_INIT_SIZE; 728 n->size = AHASH_INIT_SIZE;
729 set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize);
744 goto copy_elem; 730 goto copy_elem;
745 } 731 }
746 for (i = 0; i < n->pos; i++) { 732 for (i = 0; i < n->pos; i++) {
@@ -778,14 +764,14 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
778 for (i = 0; i < IPSET_NET_COUNT; i++) 764 for (i = 0; i < IPSET_NET_COUNT; i++)
779 mtype_del_cidr(h, 765 mtype_del_cidr(h,
780 NCIDR_PUT(DCIDR_GET(data->cidr, i)), 766 NCIDR_PUT(DCIDR_GET(data->cidr, i)),
781 NLEN(set->family), i); 767 i);
782#endif 768#endif
783 ip_set_ext_destroy(set, data); 769 ip_set_ext_destroy(set, data);
784 h->elements--; 770 set->elements--;
785 } 771 }
786 goto copy_data; 772 goto copy_data;
787 } 773 }
788 if (h->elements >= h->maxelem) 774 if (set->elements >= h->maxelem)
789 goto set_full; 775 goto set_full;
790 /* Create a new slot */ 776 /* Create a new slot */
791 if (n->pos >= n->size) { 777 if (n->pos >= n->size) {
@@ -804,17 +790,17 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
804 memcpy(n, old, sizeof(struct hbucket) + 790 memcpy(n, old, sizeof(struct hbucket) +
805 old->size * set->dsize); 791 old->size * set->dsize);
806 n->size = old->size + AHASH_INIT_SIZE; 792 n->size = old->size + AHASH_INIT_SIZE;
793 set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize);
807 } 794 }
808 795
809copy_elem: 796copy_elem:
810 j = n->pos++; 797 j = n->pos++;
811 data = ahash_data(n, j, set->dsize); 798 data = ahash_data(n, j, set->dsize);
812copy_data: 799copy_data:
813 h->elements++; 800 set->elements++;
814#ifdef IP_SET_HASH_WITH_NETS 801#ifdef IP_SET_HASH_WITH_NETS
815 for (i = 0; i < IPSET_NET_COUNT; i++) 802 for (i = 0; i < IPSET_NET_COUNT; i++)
816 mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), 803 mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i);
817 NLEN(set->family), i);
818#endif 804#endif
819 memcpy(data, d, sizeof(struct mtype_elem)); 805 memcpy(data, d, sizeof(struct mtype_elem));
820overwrite_extensions: 806overwrite_extensions:
@@ -824,7 +810,7 @@ overwrite_extensions:
824 if (SET_WITH_COUNTER(set)) 810 if (SET_WITH_COUNTER(set))
825 ip_set_init_counter(ext_counter(data, set), ext); 811 ip_set_init_counter(ext_counter(data, set), ext);
826 if (SET_WITH_COMMENT(set)) 812 if (SET_WITH_COMMENT(set))
827 ip_set_init_comment(ext_comment(data, set), ext); 813 ip_set_init_comment(set, ext_comment(data, set), ext);
828 if (SET_WITH_SKBINFO(set)) 814 if (SET_WITH_SKBINFO(set))
829 ip_set_init_skbinfo(ext_skbinfo(data, set), ext); 815 ip_set_init_skbinfo(ext_skbinfo(data, set), ext);
830 /* Must come last for the case when timed out entry is reused */ 816 /* Must come last for the case when timed out entry is reused */
@@ -883,11 +869,11 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
883 smp_mb__after_atomic(); 869 smp_mb__after_atomic();
884 if (i + 1 == n->pos) 870 if (i + 1 == n->pos)
885 n->pos--; 871 n->pos--;
886 h->elements--; 872 set->elements--;
887#ifdef IP_SET_HASH_WITH_NETS 873#ifdef IP_SET_HASH_WITH_NETS
888 for (j = 0; j < IPSET_NET_COUNT; j++) 874 for (j = 0; j < IPSET_NET_COUNT; j++)
889 mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)), 875 mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)),
890 NLEN(set->family), j); 876 j);
891#endif 877#endif
892 ip_set_ext_destroy(set, data); 878 ip_set_ext_destroy(set, data);
893 879
@@ -896,6 +882,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
896 k++; 882 k++;
897 } 883 }
898 if (n->pos == 0 && k == 0) { 884 if (n->pos == 0 && k == 0) {
885 set->ext_size -= ext_size(n->size, dsize);
899 rcu_assign_pointer(hbucket(t, key), NULL); 886 rcu_assign_pointer(hbucket(t, key), NULL);
900 kfree_rcu(n, rcu); 887 kfree_rcu(n, rcu);
901 } else if (k >= AHASH_INIT_SIZE) { 888 } else if (k >= AHASH_INIT_SIZE) {
@@ -910,10 +897,11 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
910 continue; 897 continue;
911 data = ahash_data(n, j, dsize); 898 data = ahash_data(n, j, dsize);
912 memcpy(tmp->value + k * dsize, data, dsize); 899 memcpy(tmp->value + k * dsize, data, dsize);
913 set_bit(j, tmp->used); 900 set_bit(k, tmp->used);
914 k++; 901 k++;
915 } 902 }
916 tmp->pos = k; 903 tmp->pos = k;
904 set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize);
917 rcu_assign_pointer(hbucket(t, key), tmp); 905 rcu_assign_pointer(hbucket(t, key), tmp);
918 kfree_rcu(n, rcu); 906 kfree_rcu(n, rcu);
919 } 907 }
@@ -957,14 +945,13 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
957 int i, j = 0; 945 int i, j = 0;
958#endif 946#endif
959 u32 key, multi = 0; 947 u32 key, multi = 0;
960 u8 nets_length = NLEN(set->family);
961 948
962 pr_debug("test by nets\n"); 949 pr_debug("test by nets\n");
963 for (; j < nets_length && h->nets[j].cidr[0] && !multi; j++) { 950 for (; j < NLEN && h->nets[j].cidr[0] && !multi; j++) {
964#if IPSET_NET_COUNT == 2 951#if IPSET_NET_COUNT == 2
965 mtype_data_reset_elem(d, &orig); 952 mtype_data_reset_elem(d, &orig);
966 mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0]), false); 953 mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0]), false);
967 for (k = 0; k < nets_length && h->nets[k].cidr[1] && !multi; 954 for (k = 0; k < NLEN && h->nets[k].cidr[1] && !multi;
968 k++) { 955 k++) {
969 mtype_data_netmask(d, NCIDR_GET(h->nets[k].cidr[1]), 956 mtype_data_netmask(d, NCIDR_GET(h->nets[k].cidr[1]),
970 true); 957 true);
@@ -1021,7 +1008,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
1021 * try all possible network sizes 1008 * try all possible network sizes
1022 */ 1009 */
1023 for (i = 0; i < IPSET_NET_COUNT; i++) 1010 for (i = 0; i < IPSET_NET_COUNT; i++)
1024 if (DCIDR_GET(d->cidr, i) != SET_HOST_MASK(set->family)) 1011 if (DCIDR_GET(d->cidr, i) != HOST_MASK)
1025 break; 1012 break;
1026 if (i == IPSET_NET_COUNT) { 1013 if (i == IPSET_NET_COUNT) {
1027 ret = mtype_test_cidrs(set, d, ext, mext, flags); 1014 ret = mtype_test_cidrs(set, d, ext, mext, flags);
@@ -1062,7 +1049,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
1062 1049
1063 rcu_read_lock_bh(); 1050 rcu_read_lock_bh();
1064 t = rcu_dereference_bh_nfnl(h->table); 1051 t = rcu_dereference_bh_nfnl(h->table);
1065 memsize = mtype_ahash_memsize(h, t, NLEN(set->family), set->dsize); 1052 memsize = mtype_ahash_memsize(h, t) + set->ext_size;
1066 htable_bits = t->htable_bits; 1053 htable_bits = t->htable_bits;
1067 rcu_read_unlock_bh(); 1054 rcu_read_unlock_bh();
1068 1055
@@ -1083,7 +1070,8 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
1083 goto nla_put_failure; 1070 goto nla_put_failure;
1084#endif 1071#endif
1085 if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || 1072 if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
1086 nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize))) 1073 nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
1074 nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements)))
1087 goto nla_put_failure; 1075 goto nla_put_failure;
1088 if (unlikely(ip_set_put_flags(skb, set))) 1076 if (unlikely(ip_set_put_flags(skb, set)))
1089 goto nla_put_failure; 1077 goto nla_put_failure;
@@ -1238,41 +1226,35 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
1238 struct htype *h; 1226 struct htype *h;
1239 struct htable *t; 1227 struct htable *t;
1240 1228
1229 pr_debug("Create set %s with family %s\n",
1230 set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6");
1231
1241#ifndef IP_SET_PROTO_UNDEF 1232#ifndef IP_SET_PROTO_UNDEF
1242 if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) 1233 if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6))
1243 return -IPSET_ERR_INVALID_FAMILY; 1234 return -IPSET_ERR_INVALID_FAMILY;
1244#endif 1235#endif
1245 1236
1246#ifdef IP_SET_HASH_WITH_MARKMASK
1247 markmask = 0xffffffff;
1248#endif
1249#ifdef IP_SET_HASH_WITH_NETMASK
1250 netmask = set->family == NFPROTO_IPV4 ? 32 : 128;
1251 pr_debug("Create set %s with family %s\n",
1252 set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6");
1253#endif
1254
1255 if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || 1237 if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
1256 !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || 1238 !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
1257 !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || 1239 !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
1258 !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) 1240 !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
1259 return -IPSET_ERR_PROTOCOL; 1241 return -IPSET_ERR_PROTOCOL;
1242
1260#ifdef IP_SET_HASH_WITH_MARKMASK 1243#ifdef IP_SET_HASH_WITH_MARKMASK
1261 /* Separated condition in order to avoid directive in argument list */ 1244 /* Separated condition in order to avoid directive in argument list */
1262 if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK))) 1245 if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK)))
1263 return -IPSET_ERR_PROTOCOL; 1246 return -IPSET_ERR_PROTOCOL;
1264#endif
1265 1247
1266 if (tb[IPSET_ATTR_HASHSIZE]) { 1248 markmask = 0xffffffff;
1267 hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); 1249 if (tb[IPSET_ATTR_MARKMASK]) {
1268 if (hashsize < IPSET_MIMINAL_HASHSIZE) 1250 markmask = ntohl(nla_get_be32(tb[IPSET_ATTR_MARKMASK]));
1269 hashsize = IPSET_MIMINAL_HASHSIZE; 1251 if (markmask == 0)
1252 return -IPSET_ERR_INVALID_MARKMASK;
1270 } 1253 }
1271 1254#endif
1272 if (tb[IPSET_ATTR_MAXELEM])
1273 maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
1274 1255
1275#ifdef IP_SET_HASH_WITH_NETMASK 1256#ifdef IP_SET_HASH_WITH_NETMASK
1257 netmask = set->family == NFPROTO_IPV4 ? 32 : 128;
1276 if (tb[IPSET_ATTR_NETMASK]) { 1258 if (tb[IPSET_ATTR_NETMASK]) {
1277 netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); 1259 netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
1278 1260
@@ -1282,33 +1264,21 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
1282 return -IPSET_ERR_INVALID_NETMASK; 1264 return -IPSET_ERR_INVALID_NETMASK;
1283 } 1265 }
1284#endif 1266#endif
1285#ifdef IP_SET_HASH_WITH_MARKMASK
1286 if (tb[IPSET_ATTR_MARKMASK]) {
1287 markmask = ntohl(nla_get_be32(tb[IPSET_ATTR_MARKMASK]));
1288 1267
1289 if (markmask == 0) 1268 if (tb[IPSET_ATTR_HASHSIZE]) {
1290 return -IPSET_ERR_INVALID_MARKMASK; 1269 hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
1270 if (hashsize < IPSET_MIMINAL_HASHSIZE)
1271 hashsize = IPSET_MIMINAL_HASHSIZE;
1291 } 1272 }
1292#endif 1273
1274 if (tb[IPSET_ATTR_MAXELEM])
1275 maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
1293 1276
1294 hsize = sizeof(*h); 1277 hsize = sizeof(*h);
1295#ifdef IP_SET_HASH_WITH_NETS
1296 hsize += sizeof(struct net_prefixes) * NLEN(set->family);
1297#endif
1298 h = kzalloc(hsize, GFP_KERNEL); 1278 h = kzalloc(hsize, GFP_KERNEL);
1299 if (!h) 1279 if (!h)
1300 return -ENOMEM; 1280 return -ENOMEM;
1301 1281
1302 h->maxelem = maxelem;
1303#ifdef IP_SET_HASH_WITH_NETMASK
1304 h->netmask = netmask;
1305#endif
1306#ifdef IP_SET_HASH_WITH_MARKMASK
1307 h->markmask = markmask;
1308#endif
1309 get_random_bytes(&h->initval, sizeof(h->initval));
1310 set->timeout = IPSET_NO_TIMEOUT;
1311
1312 hbits = htable_bits(hashsize); 1282 hbits = htable_bits(hashsize);
1313 hsize = htable_size(hbits); 1283 hsize = htable_size(hbits);
1314 if (hsize == 0) { 1284 if (hsize == 0) {
@@ -1320,8 +1290,17 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
1320 kfree(h); 1290 kfree(h);
1321 return -ENOMEM; 1291 return -ENOMEM;
1322 } 1292 }
1293 h->maxelem = maxelem;
1294#ifdef IP_SET_HASH_WITH_NETMASK
1295 h->netmask = netmask;
1296#endif
1297#ifdef IP_SET_HASH_WITH_MARKMASK
1298 h->markmask = markmask;
1299#endif
1300 get_random_bytes(&h->initval, sizeof(h->initval));
1301
1323 t->htable_bits = hbits; 1302 t->htable_bits = hbits;
1324 rcu_assign_pointer(h->table, t); 1303 RCU_INIT_POINTER(h->table, t);
1325 1304
1326 set->data = h; 1305 set->data = h;
1327#ifndef IP_SET_PROTO_UNDEF 1306#ifndef IP_SET_PROTO_UNDEF
@@ -1339,6 +1318,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
1339 __alignof__(struct IPSET_TOKEN(HTYPE, 6_elem))); 1318 __alignof__(struct IPSET_TOKEN(HTYPE, 6_elem)));
1340 } 1319 }
1341#endif 1320#endif
1321 set->timeout = IPSET_NO_TIMEOUT;
1342 if (tb[IPSET_ATTR_TIMEOUT]) { 1322 if (tb[IPSET_ATTR_TIMEOUT]) {
1343 set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); 1323 set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
1344#ifndef IP_SET_PROTO_UNDEF 1324#ifndef IP_SET_PROTO_UNDEF
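The reshuffled create routine above parses every netlink attribute (markmask, netmask, hashsize, maxelem) up front, fills in the private fields of the hash only after both allocations have succeeded, and publishes the initial table with RCU_INIT_POINTER() rather than rcu_assign_pointer(), since no reader can see the set yet. Below is a minimal sketch of that allocate-then-publish ordering; my_set, my_table and my_create are placeholder names, not part of ip_set_hash_gen.h.

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>

struct my_table {
	unsigned int htable_bits;
};

struct my_set {
	struct my_table __rcu *table;
	u32 maxelem;
};

static int my_create(struct my_set *s, u32 maxelem, unsigned int bits)
{
	struct my_table *t = kzalloc(sizeof(*t), GFP_KERNEL);

	if (!t)
		return -ENOMEM;
	/* Set the private fields only once all allocations have succeeded. */
	s->maxelem = maxelem;
	t->htable_bits = bits;
	/* The set is not yet reachable by any reader, so no memory barrier
	 * is needed: RCU_INIT_POINTER() is enough here.  rcu_assign_pointer()
	 * is only required once concurrent readers may dereference the slot.
	 */
	RCU_INIT_POINTER(s->table, t);
	return 0;
}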
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index 9d6bf19f7b78..20bfbd315f61 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -82,7 +82,7 @@ hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb,
82 const struct xt_action_param *par, 82 const struct xt_action_param *par,
83 enum ipset_adt adt, struct ip_set_adt_opt *opt) 83 enum ipset_adt adt, struct ip_set_adt_opt *opt)
84{ 84{
85 const struct hash_ip *h = set->data; 85 const struct hash_ip4 *h = set->data;
86 ipset_adtfn adtfn = set->variant->adt[adt]; 86 ipset_adtfn adtfn = set->variant->adt[adt];
87 struct hash_ip4_elem e = { 0 }; 87 struct hash_ip4_elem e = { 0 };
88 struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); 88 struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -101,7 +101,7 @@ static int
101hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], 101hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
102 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) 102 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
103{ 103{
104 const struct hash_ip *h = set->data; 104 const struct hash_ip4 *h = set->data;
105 ipset_adtfn adtfn = set->variant->adt[adt]; 105 ipset_adtfn adtfn = set->variant->adt[adt];
106 struct hash_ip4_elem e = { 0 }; 106 struct hash_ip4_elem e = { 0 };
107 struct ip_set_ext ext = IP_SET_INIT_UEXT(set); 107 struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -199,7 +199,7 @@ nla_put_failure:
199} 199}
200 200
201static inline void 201static inline void
202hash_ip6_data_next(struct hash_ip4_elem *next, const struct hash_ip6_elem *e) 202hash_ip6_data_next(struct hash_ip6_elem *next, const struct hash_ip6_elem *e)
203{ 203{
204} 204}
205 205
@@ -217,7 +217,7 @@ hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb,
217 const struct xt_action_param *par, 217 const struct xt_action_param *par,
218 enum ipset_adt adt, struct ip_set_adt_opt *opt) 218 enum ipset_adt adt, struct ip_set_adt_opt *opt)
219{ 219{
220 const struct hash_ip *h = set->data; 220 const struct hash_ip6 *h = set->data;
221 ipset_adtfn adtfn = set->variant->adt[adt]; 221 ipset_adtfn adtfn = set->variant->adt[adt];
222 struct hash_ip6_elem e = { { .all = { 0 } } }; 222 struct hash_ip6_elem e = { { .all = { 0 } } };
223 struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); 223 struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -234,7 +234,7 @@ static int
234hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], 234hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[],
235 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) 235 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
236{ 236{
237 const struct hash_ip *h = set->data; 237 const struct hash_ip6 *h = set->data;
238 ipset_adtfn adtfn = set->variant->adt[adt]; 238 ipset_adtfn adtfn = set->variant->adt[adt];
239 struct hash_ip6_elem e = { { .all = { 0 } } }; 239 struct hash_ip6_elem e = { { .all = { 0 } } };
240 struct ip_set_ext ext = IP_SET_INIT_UEXT(set); 240 struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
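The substitutions above (hash_ip becoming hash_ip4/hash_ip6) follow from ip_set_hash_gen.h now generating a separate, fixed-size structure per address family — the nets[] array is sized at compile time instead of being appended to hsize at create time — so each family's kadt/uadt helper has to read set->data through the matching generated type. The hash_ip6_data_next() prototype is corrected for the same reason, even though its empty body made the old mismatch harmless. A rough sketch of the token-pasting scheme that produces those per-family names follows; IPSET_TOKEN is re-defined here only for illustration, the real definition lives in the ipset headers.

#include <linux/types.h>
#include <linux/in6.h>

#define IPSET_TOKEN2(a, b)	a##b
#define IPSET_TOKEN(a, b)	IPSET_TOKEN2(a, b)

#define HTYPE hash_ip

/* Expands to struct hash_ip4_elem and struct hash_ip6_elem. */
struct IPSET_TOKEN(HTYPE, 4_elem) { __be32 ip; };
struct IPSET_TOKEN(HTYPE, 6_elem) { struct in6_addr ip; };

/* Each family gets its own "next element" helper; naming the IPv4 element
 * in the IPv6 prototype only went unnoticed because the body is empty for
 * this set type.
 */
static inline void
IPSET_TOKEN(HTYPE, 6_data_next)(struct IPSET_TOKEN(HTYPE, 6_elem) *next,
				const struct IPSET_TOKEN(HTYPE, 6_elem) *e)
{
}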
diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c
new file mode 100644
index 000000000000..1ab5ed2f6839
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipmac.c
@@ -0,0 +1,315 @@
1/* Copyright (C) 2016 Tomasz Chilinski <tomasz.chilinski@chilan.com>
2 *
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License version 2 as
5 * published by the Free Software Foundation.
6 */
7
8/* Kernel module implementing an IP set type: the hash:ip,mac type */
9
10#include <linux/jhash.h>
11#include <linux/module.h>
12#include <linux/ip.h>
13#include <linux/etherdevice.h>
14#include <linux/skbuff.h>
15#include <linux/errno.h>
16#include <linux/random.h>
17#include <linux/if_ether.h>
18#include <net/ip.h>
19#include <net/ipv6.h>
20#include <net/netlink.h>
21#include <net/tcp.h>
22
23#include <linux/netfilter.h>
24#include <linux/netfilter/ipset/pfxlen.h>
25#include <linux/netfilter/ipset/ip_set.h>
26#include <linux/netfilter/ipset/ip_set_hash.h>
27
28#define IPSET_TYPE_REV_MIN 0
29#define IPSET_TYPE_REV_MAX 0
30
31MODULE_LICENSE("GPL");
32MODULE_AUTHOR("Tomasz Chilinski <tomasz.chilinski@chilan.com>");
33IP_SET_MODULE_DESC("hash:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
34MODULE_ALIAS("ip_set_hash:ip,mac");
35
36/* Type specific function prefix */
37#define HTYPE hash_ipmac
38
39/* Zero valued element is not supported */
40static const unsigned char invalid_ether[ETH_ALEN] = { 0 };
41
42/* IPv4 variant */
43
44/* Member elements */
45struct hash_ipmac4_elem {
46 /* Zero valued IP addresses cannot be stored */
47 __be32 ip;
48 union {
49 unsigned char ether[ETH_ALEN];
50 __be32 foo[2];
51 };
52};
53
54/* Common functions */
55
56static inline bool
57hash_ipmac4_data_equal(const struct hash_ipmac4_elem *e1,
58 const struct hash_ipmac4_elem *e2,
59 u32 *multi)
60{
61 return e1->ip == e2->ip && ether_addr_equal(e1->ether, e2->ether);
62}
63
64static bool
65hash_ipmac4_data_list(struct sk_buff *skb, const struct hash_ipmac4_elem *e)
66{
67 if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip) ||
68 nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether))
69 goto nla_put_failure;
70 return false;
71
72nla_put_failure:
73 return true;
74}
75
76static inline void
77hash_ipmac4_data_next(struct hash_ipmac4_elem *next,
78 const struct hash_ipmac4_elem *e)
79{
80 next->ip = e->ip;
81}
82
83#define MTYPE hash_ipmac4
84#define PF 4
85#define HOST_MASK 32
86#define HKEY_DATALEN sizeof(struct hash_ipmac4_elem)
87#include "ip_set_hash_gen.h"
88
89static int
90hash_ipmac4_kadt(struct ip_set *set, const struct sk_buff *skb,
91 const struct xt_action_param *par,
92 enum ipset_adt adt, struct ip_set_adt_opt *opt)
93{
94 ipset_adtfn adtfn = set->variant->adt[adt];
95 struct hash_ipmac4_elem e = { .ip = 0, { .foo[0] = 0, .foo[1] = 0 } };
96 struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
97
98 /* MAC can be src only */
99 if (!(opt->flags & IPSET_DIM_TWO_SRC))
100 return 0;
101
102 if (skb_mac_header(skb) < skb->head ||
103 (skb_mac_header(skb) + ETH_HLEN) > skb->data)
104 return -EINVAL;
105
106 memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN);
107 if (ether_addr_equal(e.ether, invalid_ether))
108 return -EINVAL;
109
110 ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
111
112 return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
113}
114
115static int
116hash_ipmac4_uadt(struct ip_set *set, struct nlattr *tb[],
117 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
118{
119 ipset_adtfn adtfn = set->variant->adt[adt];
120 struct hash_ipmac4_elem e = { .ip = 0, { .foo[0] = 0, .foo[1] = 0 } };
121 struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
122 int ret;
123
124 if (unlikely(!tb[IPSET_ATTR_IP] ||
125 !tb[IPSET_ATTR_ETHER] ||
126 nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN ||
127 !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
128 !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
129 !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
130 !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
131 !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
132 !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
133 return -IPSET_ERR_PROTOCOL;
134
135 if (tb[IPSET_ATTR_LINENO])
136 *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
137
138 ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) ||
139 ip_set_get_extensions(set, tb, &ext);
140 if (ret)
141 return ret;
142 memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN);
143 if (ether_addr_equal(e.ether, invalid_ether))
144 return -IPSET_ERR_HASH_ELEM;
145
146 return adtfn(set, &e, &ext, &ext, flags);
147}
148
149/* IPv6 variant */
150
151/* Member elements */
152struct hash_ipmac6_elem {
153 /* Zero valued IP addresses cannot be stored */
154 union nf_inet_addr ip;
155 union {
156 unsigned char ether[ETH_ALEN];
157 __be32 foo[2];
158 };
159};
160
161/* Common functions */
162
163static inline bool
164hash_ipmac6_data_equal(const struct hash_ipmac6_elem *e1,
165 const struct hash_ipmac6_elem *e2,
166 u32 *multi)
167{
168 return ipv6_addr_equal(&e1->ip.in6, &e2->ip.in6) &&
169 ether_addr_equal(e1->ether, e2->ether);
170}
171
172static bool
173hash_ipmac6_data_list(struct sk_buff *skb, const struct hash_ipmac6_elem *e)
174{
175 if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) ||
176 nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether))
177 goto nla_put_failure;
178 return false;
179
180nla_put_failure:
181 return true;
182}
183
184static inline void
185hash_ipmac6_data_next(struct hash_ipmac6_elem *next,
186 const struct hash_ipmac6_elem *e)
187{
188}
189
190#undef MTYPE
191#undef PF
192#undef HOST_MASK
193#undef HKEY_DATALEN
194
195#define MTYPE hash_ipmac6
196#define PF 6
197#define HOST_MASK 128
198#define HKEY_DATALEN sizeof(struct hash_ipmac6_elem)
199#define IP_SET_EMIT_CREATE
200#include "ip_set_hash_gen.h"
201
202static int
203hash_ipmac6_kadt(struct ip_set *set, const struct sk_buff *skb,
204 const struct xt_action_param *par,
205 enum ipset_adt adt, struct ip_set_adt_opt *opt)
206{
207 ipset_adtfn adtfn = set->variant->adt[adt];
208 struct hash_ipmac6_elem e = {
209 { .all = { 0 } },
210 { .foo[0] = 0, .foo[1] = 0 }
211 };
212 struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
213
214 /* MAC can be src only */
215 if (!(opt->flags & IPSET_DIM_TWO_SRC))
216 return 0;
217
218 if (skb_mac_header(skb) < skb->head ||
219 (skb_mac_header(skb) + ETH_HLEN) > skb->data)
220 return -EINVAL;
221
222 memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN);
223 if (ether_addr_equal(e.ether, invalid_ether))
224 return -EINVAL;
225
226 ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
227
228 return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
229}
230
231static int
232hash_ipmac6_uadt(struct ip_set *set, struct nlattr *tb[],
233 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
234{
235 ipset_adtfn adtfn = set->variant->adt[adt];
236 struct hash_ipmac6_elem e = {
237 { .all = { 0 } },
238 { .foo[0] = 0, .foo[1] = 0 }
239 };
240 struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
241 int ret;
242
243 if (unlikely(!tb[IPSET_ATTR_IP] ||
244 !tb[IPSET_ATTR_ETHER] ||
245 nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN ||
246 !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
247 !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
248 !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
249 !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
250 !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
251 !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
252 return -IPSET_ERR_PROTOCOL;
253
254 if (tb[IPSET_ATTR_LINENO])
255 *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
256
257 ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
258 ip_set_get_extensions(set, tb, &ext);
259 if (ret)
260 return ret;
261
262 memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN);
263 if (ether_addr_equal(e.ether, invalid_ether))
264 return -IPSET_ERR_HASH_ELEM;
265
266 return adtfn(set, &e, &ext, &ext, flags);
267}
268
269static struct ip_set_type hash_ipmac_type __read_mostly = {
270 .name = "hash:ip,mac",
271 .protocol = IPSET_PROTOCOL,
272 .features = IPSET_TYPE_IP | IPSET_TYPE_MAC,
273 .dimension = IPSET_DIM_TWO,
274 .family = NFPROTO_UNSPEC,
275 .revision_min = IPSET_TYPE_REV_MIN,
276 .revision_max = IPSET_TYPE_REV_MAX,
277 .create = hash_ipmac_create,
278 .create_policy = {
279 [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
280 [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
281 [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
282 [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
283 [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
284 [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
285 },
286 .adt_policy = {
287 [IPSET_ATTR_IP] = { .type = NLA_NESTED },
288 [IPSET_ATTR_ETHER] = { .type = NLA_BINARY,
289 .len = ETH_ALEN },
290 [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
291 [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
292 [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
293 [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
294 [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
295 [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
296 [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
297 [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
298 },
299 .me = THIS_MODULE,
300};
301
302static int __init
303hash_ipmac_init(void)
304{
305 return ip_set_type_register(&hash_ipmac_type);
306}
307
308static void __exit
309hash_ipmac_fini(void)
310{
311 ip_set_type_unregister(&hash_ipmac_type);
312}
313
314module_init(hash_ipmac_init);
315module_exit(hash_ipmac_fini);
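In the new hash:ip,mac type above, the 6-byte MAC shares a union with two __be32 words, which pads the field to eight bytes and lets the initializers zero it completely before hashing, and the all-zero address is rejected in both the kernel (kadt) and userspace (uadt) paths. The fragment below sketches just that layout and validity check outside the ipset framework; mac_elem, mac_elem_valid and mac_elem_hash are illustrative names only.

#include <linux/types.h>
#include <linux/etherdevice.h>
#include <linux/jhash.h>

struct mac_elem {
	union {
		unsigned char ether[ETH_ALEN];
		__be32 pad[2];	/* pads 6 bytes up to 8, kept zeroed */
	};
};

static const unsigned char zero_mac[ETH_ALEN] = { 0 };

static bool mac_elem_valid(const struct mac_elem *e)
{
	/* A zero-valued MAC cannot be stored, as with invalid_ether above. */
	return !ether_addr_equal(e->ether, zero_mac);
}

static u32 mac_elem_hash(const struct mac_elem *e, u32 initval)
{
	/* Hash the whole 8-byte union so no uninitialized padding leaks in. */
	return jhash(e, sizeof(*e), initval);
}

From userspace the new type would be exercised with something like "ipset create foo hash:ip,mac" followed by "ipset add foo 192.168.1.1,de:ad:be:ef:00:01".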
diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c
index a0695a2ab585..b64cf14e8352 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmark.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -85,7 +85,7 @@ hash_ipmark4_kadt(struct ip_set *set, const struct sk_buff *skb,
85 const struct xt_action_param *par, 85 const struct xt_action_param *par,
86 enum ipset_adt adt, struct ip_set_adt_opt *opt) 86 enum ipset_adt adt, struct ip_set_adt_opt *opt)
87{ 87{
88 const struct hash_ipmark *h = set->data; 88 const struct hash_ipmark4 *h = set->data;
89 ipset_adtfn adtfn = set->variant->adt[adt]; 89 ipset_adtfn adtfn = set->variant->adt[adt];
90 struct hash_ipmark4_elem e = { }; 90 struct hash_ipmark4_elem e = { };
91 struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); 91 struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -101,7 +101,7 @@ static int
101hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], 101hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
102 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) 102 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
103{ 103{
104 const struct hash_ipmark *h = set->data; 104 const struct hash_ipmark4 *h = set->data;
105 ipset_adtfn adtfn = set->variant->adt[adt]; 105 ipset_adtfn adtfn = set->variant->adt[adt];
106 struct hash_ipmark4_elem e = { }; 106 struct hash_ipmark4_elem e = { };
107 struct ip_set_ext ext = IP_SET_INIT_UEXT(set); 107 struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -193,7 +193,7 @@ nla_put_failure:
193} 193}
194 194
195static inline void 195static inline void
196hash_ipmark6_data_next(struct hash_ipmark4_elem *next, 196hash_ipmark6_data_next(struct hash_ipmark6_elem *next,
197 const struct hash_ipmark6_elem *d) 197 const struct hash_ipmark6_elem *d)
198{ 198{
199} 199}
@@ -211,7 +211,7 @@ hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb,
211 const struct xt_action_param *par, 211 const struct xt_action_param *par,
212 enum ipset_adt adt, struct ip_set_adt_opt *opt) 212 enum ipset_adt adt, struct ip_set_adt_opt *opt)
213{ 213{
214 const struct hash_ipmark *h = set->data; 214 const struct hash_ipmark6 *h = set->data;
215 ipset_adtfn adtfn = set->variant->adt[adt]; 215 ipset_adtfn adtfn = set->variant->adt[adt];
216 struct hash_ipmark6_elem e = { }; 216 struct hash_ipmark6_elem e = { };
217 struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); 217 struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -227,7 +227,7 @@ static int
227hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], 227hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[],
228 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) 228 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
229{ 229{
230 const struct hash_ipmark *h = set->data; 230 const struct hash_ipmark6 *h = set->data;
231 ipset_adtfn adtfn = set->variant->adt[adt]; 231 ipset_adtfn adtfn = set->variant->adt[adt];
232 struct hash_ipmark6_elem e = { }; 232 struct hash_ipmark6_elem e = { };
233 struct ip_set_ext ext = IP_SET_INIT_UEXT(set); 233 struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index 9d84b3dff603..f438740e6c6a 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -108,7 +108,7 @@ static int
108hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], 108hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
109 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) 109 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
110{ 110{
111 const struct hash_ipport *h = set->data; 111 const struct hash_ipport4 *h = set->data;
112 ipset_adtfn adtfn = set->variant->adt[adt]; 112 ipset_adtfn adtfn = set->variant->adt[adt];
113 struct hash_ipport4_elem e = { .ip = 0 }; 113 struct hash_ipport4_elem e = { .ip = 0 };
114 struct ip_set_ext ext = IP_SET_INIT_UEXT(set); 114 struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -231,7 +231,7 @@ nla_put_failure:
231} 231}
232 232
233static inline void 233static inline void
234hash_ipport6_data_next(struct hash_ipport4_elem *next, 234hash_ipport6_data_next(struct hash_ipport6_elem *next,
235 const struct hash_ipport6_elem *d) 235 const struct hash_ipport6_elem *d)
236{ 236{
237 next->port = d->port; 237 next->port = d->port;
@@ -266,7 +266,7 @@ static int
266hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], 266hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
267 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) 267 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
268{ 268{
269 const struct hash_ipport *h = set->data; 269 const struct hash_ipport6 *h = set->data;
270 ipset_adtfn adtfn = set->variant->adt[adt]; 270 ipset_adtfn adtfn = set->variant->adt[adt];
271 struct hash_ipport6_elem e = { .ip = { .all = { 0 } } }; 271 struct hash_ipport6_elem e = { .ip = { .all = { 0 } } };
272 struct ip_set_ext ext = IP_SET_INIT_UEXT(set); 272 struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index 215b7b942038..6215fb898c50 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -111,7 +111,7 @@ static int
111hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], 111hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
112 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) 112 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
113{ 113{
114 const struct hash_ipportip *h = set->data; 114 const struct hash_ipportip4 *h = set->data;
115 ipset_adtfn adtfn = set->variant->adt[adt]; 115 ipset_adtfn adtfn = set->variant->adt[adt];
116 struct hash_ipportip4_elem e = { .ip = 0 }; 116 struct hash_ipportip4_elem e = { .ip = 0 };
117 struct ip_set_ext ext = IP_SET_INIT_UEXT(set); 117 struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -241,7 +241,7 @@ nla_put_failure:
241} 241}
242 242
243static inline void 243static inline void
244hash_ipportip6_data_next(struct hash_ipportip4_elem *next, 244hash_ipportip6_data_next(struct hash_ipportip6_elem *next,
245 const struct hash_ipportip6_elem *d) 245 const struct hash_ipportip6_elem *d)
246{ 246{
247 next->port = d->port; 247 next->port = d->port;
@@ -277,7 +277,7 @@ static int
277hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], 277hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
278 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) 278 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
279{ 279{
280 const struct hash_ipportip *h = set->data; 280 const struct hash_ipportip6 *h = set->data;
281 ipset_adtfn adtfn = set->variant->adt[adt]; 281 ipset_adtfn adtfn = set->variant->adt[adt];
282 struct hash_ipportip6_elem e = { .ip = { .all = { 0 } } }; 282 struct hash_ipportip6_elem e = { .ip = { .all = { 0 } } };
283 struct ip_set_ext ext = IP_SET_INIT_UEXT(set); 283 struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 9ca719625ea3..5ab1b99a53c2 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -138,7 +138,7 @@ hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
138 const struct xt_action_param *par, 138 const struct xt_action_param *par,
139 enum ipset_adt adt, struct ip_set_adt_opt *opt) 139 enum ipset_adt adt, struct ip_set_adt_opt *opt)
140{ 140{
141 const struct hash_ipportnet *h = set->data; 141 const struct hash_ipportnet4 *h = set->data;
142 ipset_adtfn adtfn = set->variant->adt[adt]; 142 ipset_adtfn adtfn = set->variant->adt[adt];
143 struct hash_ipportnet4_elem e = { 143 struct hash_ipportnet4_elem e = {
144 .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), 144 .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
@@ -163,7 +163,7 @@ static int
163hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], 163hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
164 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) 164 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
165{ 165{
166 const struct hash_ipportnet *h = set->data; 166 const struct hash_ipportnet4 *h = set->data;
167 ipset_adtfn adtfn = set->variant->adt[adt]; 167 ipset_adtfn adtfn = set->variant->adt[adt];
168 struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 }; 168 struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 };
169 struct ip_set_ext ext = IP_SET_INIT_UEXT(set); 169 struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -370,7 +370,7 @@ nla_put_failure:
370} 370}
371 371
372static inline void 372static inline void
373hash_ipportnet6_data_next(struct hash_ipportnet4_elem *next, 373hash_ipportnet6_data_next(struct hash_ipportnet6_elem *next,
374 const struct hash_ipportnet6_elem *d) 374 const struct hash_ipportnet6_elem *d)
375{ 375{
376 next->port = d->port; 376 next->port = d->port;
@@ -389,7 +389,7 @@ hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
389 const struct xt_action_param *par, 389 const struct xt_action_param *par,
390 enum ipset_adt adt, struct ip_set_adt_opt *opt) 390 enum ipset_adt adt, struct ip_set_adt_opt *opt)
391{ 391{
392 const struct hash_ipportnet *h = set->data; 392 const struct hash_ipportnet6 *h = set->data;
393 ipset_adtfn adtfn = set->variant->adt[adt]; 393 ipset_adtfn adtfn = set->variant->adt[adt];
394 struct hash_ipportnet6_elem e = { 394 struct hash_ipportnet6_elem e = {
395 .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), 395 .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
@@ -414,7 +414,7 @@ static int
414hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], 414hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
415 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) 415 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
416{ 416{
417 const struct hash_ipportnet *h = set->data; 417 const struct hash_ipportnet6 *h = set->data;
418 ipset_adtfn adtfn = set->variant->adt[adt]; 418 ipset_adtfn adtfn = set->variant->adt[adt];
419 struct hash_ipportnet6_elem e = { .cidr = HOST_MASK - 1 }; 419 struct hash_ipportnet6_elem e = { .cidr = HOST_MASK - 1 };
420 struct ip_set_ext ext = IP_SET_INIT_UEXT(set); 420 struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index 3e4bffdc1cc0..5d9e895452e7 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -117,7 +117,7 @@ hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb,
117 const struct xt_action_param *par, 117 const struct xt_action_param *par,
118 enum ipset_adt adt, struct ip_set_adt_opt *opt) 118 enum ipset_adt adt, struct ip_set_adt_opt *opt)
119{ 119{
120 const struct hash_net *h = set->data; 120 const struct hash_net4 *h = set->data;
121 ipset_adtfn adtfn = set->variant->adt[adt]; 121 ipset_adtfn adtfn = set->variant->adt[adt];
122 struct hash_net4_elem e = { 122 struct hash_net4_elem e = {
123 .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), 123 .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
@@ -139,7 +139,7 @@ static int
139hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], 139hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
140 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) 140 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
141{ 141{
142 const struct hash_net *h = set->data; 142 const struct hash_net4 *h = set->data;
143 ipset_adtfn adtfn = set->variant->adt[adt]; 143 ipset_adtfn adtfn = set->variant->adt[adt];
144 struct hash_net4_elem e = { .cidr = HOST_MASK }; 144 struct hash_net4_elem e = { .cidr = HOST_MASK };
145 struct ip_set_ext ext = IP_SET_INIT_UEXT(set); 145 struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -268,7 +268,7 @@ nla_put_failure:
268} 268}
269 269
270static inline void 270static inline void
271hash_net6_data_next(struct hash_net4_elem *next, 271hash_net6_data_next(struct hash_net6_elem *next,
272 const struct hash_net6_elem *d) 272 const struct hash_net6_elem *d)
273{ 273{
274} 274}
@@ -286,7 +286,7 @@ hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb,
286 const struct xt_action_param *par, 286 const struct xt_action_param *par,
287 enum ipset_adt adt, struct ip_set_adt_opt *opt) 287 enum ipset_adt adt, struct ip_set_adt_opt *opt)
288{ 288{
289 const struct hash_net *h = set->data; 289 const struct hash_net6 *h = set->data;
290 ipset_adtfn adtfn = set->variant->adt[adt]; 290 ipset_adtfn adtfn = set->variant->adt[adt];
291 struct hash_net6_elem e = { 291 struct hash_net6_elem e = {
292 .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), 292 .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index f0f688db6213..44cf11939c91 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -156,7 +156,7 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
156 const struct xt_action_param *par, 156 const struct xt_action_param *par,
157 enum ipset_adt adt, struct ip_set_adt_opt *opt) 157 enum ipset_adt adt, struct ip_set_adt_opt *opt)
158{ 158{
159 struct hash_netiface *h = set->data; 159 struct hash_netiface4 *h = set->data;
160 ipset_adtfn adtfn = set->variant->adt[adt]; 160 ipset_adtfn adtfn = set->variant->adt[adt];
161 struct hash_netiface4_elem e = { 161 struct hash_netiface4_elem e = {
162 .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), 162 .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
@@ -170,7 +170,7 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
170 ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); 170 ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
171 e.ip &= ip_set_netmask(e.cidr); 171 e.ip &= ip_set_netmask(e.cidr);
172 172
173#define IFACE(dir) (par->dir ? par->dir->name : "") 173#define IFACE(dir) (par->state->dir ? par->state->dir->name : "")
174#define SRCDIR (opt->flags & IPSET_DIM_TWO_SRC) 174#define SRCDIR (opt->flags & IPSET_DIM_TWO_SRC)
175 175
176 if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { 176 if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
@@ -196,7 +196,7 @@ static int
196hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], 196hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
197 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) 197 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
198{ 198{
199 struct hash_netiface *h = set->data; 199 struct hash_netiface4 *h = set->data;
200 ipset_adtfn adtfn = set->variant->adt[adt]; 200 ipset_adtfn adtfn = set->variant->adt[adt];
201 struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 }; 201 struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 };
202 struct ip_set_ext ext = IP_SET_INIT_UEXT(set); 202 struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -348,7 +348,7 @@ nla_put_failure:
348} 348}
349 349
350static inline void 350static inline void
351hash_netiface6_data_next(struct hash_netiface4_elem *next, 351hash_netiface6_data_next(struct hash_netiface6_elem *next,
352 const struct hash_netiface6_elem *d) 352 const struct hash_netiface6_elem *d)
353{ 353{
354} 354}
@@ -367,7 +367,7 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
367 const struct xt_action_param *par, 367 const struct xt_action_param *par,
368 enum ipset_adt adt, struct ip_set_adt_opt *opt) 368 enum ipset_adt adt, struct ip_set_adt_opt *opt)
369{ 369{
370 struct hash_netiface *h = set->data; 370 struct hash_netiface6 *h = set->data;
371 ipset_adtfn adtfn = set->variant->adt[adt]; 371 ipset_adtfn adtfn = set->variant->adt[adt];
372 struct hash_netiface6_elem e = { 372 struct hash_netiface6_elem e = {
373 .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), 373 .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
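The IFACE() change above reflects the xt_action_param rework in this tree: the hook state (ingress/egress devices and so on) is reached through par->state instead of fields on the parameter block itself. A hedged sketch of a match-style function reading the devices through the state pointer follows; my_match is a placeholder, and the exact helper set available depends on the x_tables headers of this tree.

#include <linux/skbuff.h>
#include <linux/netfilter/x_tables.h>

static bool my_match(const struct sk_buff *skb, struct xt_action_param *par)
{
	/* The in/out devices now hang off par->state rather than directly
	 * off xt_action_param.
	 */
	const struct net_device *indev = par->state->in;
	const struct net_device *outdev = par->state->out;

	return indev && outdev && indev != outdev;
}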
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
index a93dfebffa81..db614e13b193 100644
--- a/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -143,7 +143,7 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
143 const struct xt_action_param *par, 143 const struct xt_action_param *par,
144 enum ipset_adt adt, struct ip_set_adt_opt *opt) 144 enum ipset_adt adt, struct ip_set_adt_opt *opt)
145{ 145{
146 const struct hash_netnet *h = set->data; 146 const struct hash_netnet4 *h = set->data;
147 ipset_adtfn adtfn = set->variant->adt[adt]; 147 ipset_adtfn adtfn = set->variant->adt[adt];
148 struct hash_netnet4_elem e = { }; 148 struct hash_netnet4_elem e = { };
149 struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); 149 struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -165,7 +165,7 @@ static int
165hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], 165hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
166 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) 166 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
167{ 167{
168 const struct hash_netnet *h = set->data; 168 const struct hash_netnet4 *h = set->data;
169 ipset_adtfn adtfn = set->variant->adt[adt]; 169 ipset_adtfn adtfn = set->variant->adt[adt];
170 struct hash_netnet4_elem e = { }; 170 struct hash_netnet4_elem e = { };
171 struct ip_set_ext ext = IP_SET_INIT_UEXT(set); 171 struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -352,7 +352,7 @@ nla_put_failure:
352} 352}
353 353
354static inline void 354static inline void
355hash_netnet6_data_next(struct hash_netnet4_elem *next, 355hash_netnet6_data_next(struct hash_netnet6_elem *next,
356 const struct hash_netnet6_elem *d) 356 const struct hash_netnet6_elem *d)
357{ 357{
358} 358}
@@ -377,7 +377,7 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
377 const struct xt_action_param *par, 377 const struct xt_action_param *par,
378 enum ipset_adt adt, struct ip_set_adt_opt *opt) 378 enum ipset_adt adt, struct ip_set_adt_opt *opt)
379{ 379{
380 const struct hash_netnet *h = set->data; 380 const struct hash_netnet6 *h = set->data;
381 ipset_adtfn adtfn = set->variant->adt[adt]; 381 ipset_adtfn adtfn = set->variant->adt[adt];
382 struct hash_netnet6_elem e = { }; 382 struct hash_netnet6_elem e = { };
383 struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); 383 struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 731813e0f08c..54b64b6cd0cd 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -133,7 +133,7 @@ hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb,
133 const struct xt_action_param *par, 133 const struct xt_action_param *par,
134 enum ipset_adt adt, struct ip_set_adt_opt *opt) 134 enum ipset_adt adt, struct ip_set_adt_opt *opt)
135{ 135{
136 const struct hash_netport *h = set->data; 136 const struct hash_netport4 *h = set->data;
137 ipset_adtfn adtfn = set->variant->adt[adt]; 137 ipset_adtfn adtfn = set->variant->adt[adt];
138 struct hash_netport4_elem e = { 138 struct hash_netport4_elem e = {
139 .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), 139 .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
@@ -157,7 +157,7 @@ static int
157hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], 157hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
158 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) 158 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
159{ 159{
160 const struct hash_netport *h = set->data; 160 const struct hash_netport4 *h = set->data;
161 ipset_adtfn adtfn = set->variant->adt[adt]; 161 ipset_adtfn adtfn = set->variant->adt[adt];
162 struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 }; 162 struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 };
163 struct ip_set_ext ext = IP_SET_INIT_UEXT(set); 163 struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -329,7 +329,7 @@ nla_put_failure:
329} 329}
330 330
331static inline void 331static inline void
332hash_netport6_data_next(struct hash_netport4_elem *next, 332hash_netport6_data_next(struct hash_netport6_elem *next,
333 const struct hash_netport6_elem *d) 333 const struct hash_netport6_elem *d)
334{ 334{
335 next->port = d->port; 335 next->port = d->port;
@@ -348,7 +348,7 @@ hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb,
348 const struct xt_action_param *par, 348 const struct xt_action_param *par,
349 enum ipset_adt adt, struct ip_set_adt_opt *opt) 349 enum ipset_adt adt, struct ip_set_adt_opt *opt)
350{ 350{
351 const struct hash_netport *h = set->data; 351 const struct hash_netport6 *h = set->data;
352 ipset_adtfn adtfn = set->variant->adt[adt]; 352 ipset_adtfn adtfn = set->variant->adt[adt];
353 struct hash_netport6_elem e = { 353 struct hash_netport6_elem e = {
354 .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), 354 .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
@@ -372,7 +372,7 @@ static int
372hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], 372hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
373 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) 373 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
374{ 374{
375 const struct hash_netport *h = set->data; 375 const struct hash_netport6 *h = set->data;
376 ipset_adtfn adtfn = set->variant->adt[adt]; 376 ipset_adtfn adtfn = set->variant->adt[adt];
377 struct hash_netport6_elem e = { .cidr = HOST_MASK - 1 }; 377 struct hash_netport6_elem e = { .cidr = HOST_MASK - 1 };
378 struct ip_set_ext ext = IP_SET_INIT_UEXT(set); 378 struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index 9a14c237830f..aff846960ac4 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -154,7 +154,7 @@ hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
154 const struct xt_action_param *par, 154 const struct xt_action_param *par,
155 enum ipset_adt adt, struct ip_set_adt_opt *opt) 155 enum ipset_adt adt, struct ip_set_adt_opt *opt)
156{ 156{
157 const struct hash_netportnet *h = set->data; 157 const struct hash_netportnet4 *h = set->data;
158 ipset_adtfn adtfn = set->variant->adt[adt]; 158 ipset_adtfn adtfn = set->variant->adt[adt];
159 struct hash_netportnet4_elem e = { }; 159 struct hash_netportnet4_elem e = { };
160 struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); 160 struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -180,7 +180,7 @@ static int
180hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], 180hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
181 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) 181 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
182{ 182{
183 const struct hash_netportnet *h = set->data; 183 const struct hash_netportnet4 *h = set->data;
184 ipset_adtfn adtfn = set->variant->adt[adt]; 184 ipset_adtfn adtfn = set->variant->adt[adt];
185 struct hash_netportnet4_elem e = { }; 185 struct hash_netportnet4_elem e = { };
186 struct ip_set_ext ext = IP_SET_INIT_UEXT(set); 186 struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
@@ -406,7 +406,7 @@ nla_put_failure:
406} 406}
407 407
408static inline void 408static inline void
409hash_netportnet6_data_next(struct hash_netportnet4_elem *next, 409hash_netportnet6_data_next(struct hash_netportnet6_elem *next,
410 const struct hash_netportnet6_elem *d) 410 const struct hash_netportnet6_elem *d)
411{ 411{
412 next->port = d->port; 412 next->port = d->port;
@@ -432,7 +432,7 @@ hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
432 const struct xt_action_param *par, 432 const struct xt_action_param *par,
433 enum ipset_adt adt, struct ip_set_adt_opt *opt) 433 enum ipset_adt adt, struct ip_set_adt_opt *opt)
434{ 434{
435 const struct hash_netportnet *h = set->data; 435 const struct hash_netportnet6 *h = set->data;
436 ipset_adtfn adtfn = set->variant->adt[adt]; 436 ipset_adtfn adtfn = set->variant->adt[adt];
437 struct hash_netportnet6_elem e = { }; 437 struct hash_netportnet6_elem e = { };
438 struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); 438 struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -458,7 +458,7 @@ static int
458hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], 458hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
459 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) 459 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
460{ 460{
461 const struct hash_netportnet *h = set->data; 461 const struct hash_netportnet6 *h = set->data;
462 ipset_adtfn adtfn = set->variant->adt[adt]; 462 ipset_adtfn adtfn = set->variant->adt[adt];
463 struct hash_netportnet6_elem e = { }; 463 struct hash_netportnet6_elem e = { };
464 struct ip_set_ext ext = IP_SET_INIT_UEXT(set); 464 struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index a2a89e4e0a14..178d4eba013b 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -166,6 +166,7 @@ __list_set_del_rcu(struct rcu_head * rcu)
166static inline void 166static inline void
167list_set_del(struct ip_set *set, struct set_elem *e) 167list_set_del(struct ip_set *set, struct set_elem *e)
168{ 168{
169 set->elements--;
169 list_del_rcu(&e->list); 170 list_del_rcu(&e->list);
170 call_rcu(&e->rcu, __list_set_del_rcu); 171 call_rcu(&e->rcu, __list_set_del_rcu);
171} 172}
@@ -227,7 +228,7 @@ list_set_init_extensions(struct ip_set *set, const struct ip_set_ext *ext,
227 if (SET_WITH_COUNTER(set)) 228 if (SET_WITH_COUNTER(set))
228 ip_set_init_counter(ext_counter(e, set), ext); 229 ip_set_init_counter(ext_counter(e, set), ext);
229 if (SET_WITH_COMMENT(set)) 230 if (SET_WITH_COMMENT(set))
230 ip_set_init_comment(ext_comment(e, set), ext); 231 ip_set_init_comment(set, ext_comment(e, set), ext);
231 if (SET_WITH_SKBINFO(set)) 232 if (SET_WITH_SKBINFO(set))
232 ip_set_init_skbinfo(ext_skbinfo(e, set), ext); 233 ip_set_init_skbinfo(ext_skbinfo(e, set), ext);
233 /* Update timeout last */ 234 /* Update timeout last */
@@ -259,11 +260,14 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
259 else 260 else
260 prev = e; 261 prev = e;
261 } 262 }
263
264 /* If before/after is used on an empty set */
265 if ((d->before > 0 && !next) ||
266 (d->before < 0 && !prev))
267 return -IPSET_ERR_REF_EXIST;
268
262 /* Re-add already existing element */ 269 /* Re-add already existing element */
263 if (n) { 270 if (n) {
264 if ((d->before > 0 && !next) ||
265 (d->before < 0 && !prev))
266 return -IPSET_ERR_REF_EXIST;
267 if (!flag_exist) 271 if (!flag_exist)
268 return -IPSET_ERR_EXIST; 272 return -IPSET_ERR_EXIST;
269 /* Update extensions */ 273 /* Update extensions */
@@ -309,6 +313,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
309 list_add_rcu(&e->list, &prev->list); 313 list_add_rcu(&e->list, &prev->list);
310 else 314 else
311 list_add_tail_rcu(&e->list, &map->members); 315 list_add_tail_rcu(&e->list, &map->members);
316 set->elements++;
312 317
313 return 0; 318 return 0;
314} 319}
@@ -419,6 +424,8 @@ list_set_flush(struct ip_set *set)
419 424
420 list_for_each_entry_safe(e, n, &map->members, list) 425 list_for_each_entry_safe(e, n, &map->members, list)
421 list_set_del(set, e); 426 list_set_del(set, e);
427 set->elements = 0;
428 set->ext_size = 0;
422} 429}
423 430
424static void 431static void
@@ -441,12 +448,12 @@ list_set_destroy(struct ip_set *set)
441 set->data = NULL; 448 set->data = NULL;
442} 449}
443 450
444static int 451/* Calculate the actual memory size of the set data */
445list_set_head(struct ip_set *set, struct sk_buff *skb) 452static size_t
453list_set_memsize(const struct list_set *map, size_t dsize)
446{ 454{
447 const struct list_set *map = set->data;
448 struct nlattr *nested;
449 struct set_elem *e; 455 struct set_elem *e;
456 size_t memsize;
450 u32 n = 0; 457 u32 n = 0;
451 458
452 rcu_read_lock(); 459 rcu_read_lock();
@@ -454,13 +461,25 @@ list_set_head(struct ip_set *set, struct sk_buff *skb)
454 n++; 461 n++;
455 rcu_read_unlock(); 462 rcu_read_unlock();
456 463
464 memsize = sizeof(*map) + n * dsize;
465
466 return memsize;
467}
468
469static int
470list_set_head(struct ip_set *set, struct sk_buff *skb)
471{
472 const struct list_set *map = set->data;
473 struct nlattr *nested;
474 size_t memsize = list_set_memsize(map, set->dsize) + set->ext_size;
475
457 nested = ipset_nest_start(skb, IPSET_ATTR_DATA); 476 nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
458 if (!nested) 477 if (!nested)
459 goto nla_put_failure; 478 goto nla_put_failure;
460 if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) || 479 if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) ||
461 nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || 480 nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
462 nla_put_net32(skb, IPSET_ATTR_MEMSIZE, 481 nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
463 htonl(sizeof(*map) + n * set->dsize))) 482 nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements)))
464 goto nla_put_failure; 483 goto nla_put_failure;
465 if (unlikely(ip_set_put_flags(skb, set))) 484 if (unlikely(ip_set_put_flags(skb, set)))
466 goto nla_put_failure; 485 goto nla_put_failure;
@@ -570,11 +589,8 @@ list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
570{ 589{
571 struct list_set *map = set->data; 590 struct list_set *map = set->data;
572 591
573 init_timer(&map->gc); 592 setup_timer(&map->gc, gc, (unsigned long)set);
574 map->gc.data = (unsigned long)set; 593 mod_timer(&map->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ);
575 map->gc.function = gc;
576 map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
577 add_timer(&map->gc);
578} 594}
579 595
580/* Create list:set type of sets */ 596/* Create list:set type of sets */
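Besides tracking set->elements and set->ext_size so the header can report IPSET_ATTR_ELEMENTS and an accurate IPSET_ATTR_MEMSIZE, the list:set changes above collapse the open-coded garbage-collector timer setup into setup_timer() plus mod_timer(). A minimal sketch of that idiom, with placeholder names (my_map, my_gc):

#include <linux/timer.h>
#include <linux/jiffies.h>

struct my_map {
	struct timer_list gc;
};

static void my_gc(unsigned long data)
{
	/* A real callback would expire stale entries here and re-arm
	 * itself with mod_timer() as needed.
	 */
}

static void my_gc_init(struct my_map *map, unsigned long period)
{
	/* setup_timer() replaces init_timer() plus the manual .function and
	 * .data assignments; mod_timer() both sets the expiry and activates
	 * the timer, replacing the .expires assignment plus add_timer().
	 */
	setup_timer(&map->gc, my_gc, (unsigned long)map);
	mod_timer(&map->gc, jiffies + period);
}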
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 096a45103f14..e6a2753dff9e 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1429,7 +1429,7 @@ int __init ip_vs_conn_init(void)
1429 "(size=%d, memory=%ldKbytes)\n", 1429 "(size=%d, memory=%ldKbytes)\n",
1430 ip_vs_conn_tab_size, 1430 ip_vs_conn_tab_size,
1431 (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024); 1431 (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
1432 IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", 1432 IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n",
1433 sizeof(struct ip_vs_conn)); 1433 sizeof(struct ip_vs_conn));
1434 1434
1435 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) 1435 for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
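The %Zd to %zd conversions in this and the following IPVS files are a pure format-string cleanup: %Z was a kernel-only length modifier for size_t/ssize_t that newer gcc warns about, while %zu/%zd are the standard C99 modifiers. A fragment illustrating the preferred form (assuming the usual printk headers are in scope):

#include <linux/printk.h>

	/* %zu / %zd are the C99 modifiers for size_t / ssize_t; the old
	 * kernel-only %Zd spelling triggers format warnings with newer gcc.
	 */
	pr_info("Each connection entry needs %zu bytes at least\n",
		sizeof(struct ip_vs_conn));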
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 2c1b498a7a27..db40050f8785 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -70,7 +70,7 @@ EXPORT_SYMBOL(ip_vs_get_debug_level);
70#endif 70#endif
71EXPORT_SYMBOL(ip_vs_new_conn_out); 71EXPORT_SYMBOL(ip_vs_new_conn_out);
72 72
73static int ip_vs_net_id __read_mostly; 73static unsigned int ip_vs_net_id __read_mostly;
74/* netns cnt used for uniqueness */ 74/* netns cnt used for uniqueness */
75static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); 75static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
76 76
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index a6e44ef2ec9a..5aeb0dde6ccc 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -48,7 +48,7 @@
48#include <net/sock.h> 48#include <net/sock.h>
49#include <net/genetlink.h> 49#include <net/genetlink.h>
50 50
51#include <asm/uaccess.h> 51#include <linux/uaccess.h>
52 52
53#include <net/ip_vs.h> 53#include <net/ip_vs.h>
54 54
@@ -426,10 +426,9 @@ ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol
426 */ 426 */
427 svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport); 427 svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport);
428 428
429 if (svc == NULL 429 if (!svc && protocol == IPPROTO_TCP &&
430 && protocol == IPPROTO_TCP 430 atomic_read(&ipvs->ftpsvc_counter) &&
431 && atomic_read(&ipvs->ftpsvc_counter) 431 (vport == FTPDATA || ntohs(vport) >= inet_prot_sock(ipvs->net))) {
432 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
433 /* 432 /*
434 * Check if ftp service entry exists, the packet 433 * Check if ftp service entry exists, the packet
435 * might belong to FTP data connections. 434 * might belong to FTP data connections.
@@ -711,7 +710,6 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af,
711 dest->vport == svc->port))) { 710 dest->vport == svc->port))) {
712 /* HIT */ 711 /* HIT */
713 list_del(&dest->t_list); 712 list_del(&dest->t_list);
714 ip_vs_dest_hold(dest);
715 goto out; 713 goto out;
716 } 714 }
717 } 715 }
@@ -741,7 +739,7 @@ static void ip_vs_dest_free(struct ip_vs_dest *dest)
741 * When the ip_vs_control_clearup is activated by ipvs module exit, 739 * When the ip_vs_control_clearup is activated by ipvs module exit,
742 * the service tables must have been flushed and all the connections 740 * the service tables must have been flushed and all the connections
743 * are expired, and the refcnt of each destination in the trash must 741 * are expired, and the refcnt of each destination in the trash must
744 * be 0, so we simply release them here. 742 * be 1, so we simply release them here.
745 */ 743 */
746static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) 744static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
747{ 745{
@@ -1080,11 +1078,10 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
1080 if (list_empty(&ipvs->dest_trash) && !cleanup) 1078 if (list_empty(&ipvs->dest_trash) && !cleanup)
1081 mod_timer(&ipvs->dest_trash_timer, 1079 mod_timer(&ipvs->dest_trash_timer,
1082 jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); 1080 jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
1083 /* dest lives in trash without reference */ 1081 /* dest lives in trash with reference */
1084 list_add(&dest->t_list, &ipvs->dest_trash); 1082 list_add(&dest->t_list, &ipvs->dest_trash);
1085 dest->idle_start = 0; 1083 dest->idle_start = 0;
1086 spin_unlock_bh(&ipvs->dest_trash_lock); 1084 spin_unlock_bh(&ipvs->dest_trash_lock);
1087 ip_vs_dest_put(dest);
1088} 1085}
1089 1086
1090 1087
@@ -1160,7 +1157,7 @@ static void ip_vs_dest_trash_expire(unsigned long data)
1160 1157
1161 spin_lock(&ipvs->dest_trash_lock); 1158 spin_lock(&ipvs->dest_trash_lock);
1162 list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { 1159 list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
1163 if (atomic_read(&dest->refcnt) > 0) 1160 if (atomic_read(&dest->refcnt) > 1)
1164 continue; 1161 continue;
1165 if (dest->idle_start) { 1162 if (dest->idle_start) {
1166 if (time_before(now, dest->idle_start + 1163 if (time_before(now, dest->idle_start +
@@ -2840,14 +2837,7 @@ static struct nf_sockopt_ops ip_vs_sockopts = {
2840 */ 2837 */
2841 2838
2842/* IPVS genetlink family */ 2839/* IPVS genetlink family */
2843static struct genl_family ip_vs_genl_family = { 2840static struct genl_family ip_vs_genl_family;
2844 .id = GENL_ID_GENERATE,
2845 .hdrsize = 0,
2846 .name = IPVS_GENL_NAME,
2847 .version = IPVS_GENL_VERSION,
2848 .maxattr = IPVS_CMD_ATTR_MAX,
2849 .netnsok = true, /* Make ipvsadm to work on netns */
2850};
2851 2841
2852/* Policy used for first-level command attributes */ 2842/* Policy used for first-level command attributes */
2853static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { 2843static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
@@ -3267,7 +3257,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
3267 3257
3268 3258
3269 svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]); 3259 svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]);
3270 if (IS_ERR(svc) || svc == NULL) 3260 if (IS_ERR_OR_NULL(svc))
3271 goto out_err; 3261 goto out_err;
3272 3262
3273 /* Dump the destinations */ 3263 /* Dump the destinations */
@@ -3872,10 +3862,20 @@ static const struct genl_ops ip_vs_genl_ops[] = {
3872 }, 3862 },
3873}; 3863};
3874 3864
3865static struct genl_family ip_vs_genl_family __ro_after_init = {
3866 .hdrsize = 0,
3867 .name = IPVS_GENL_NAME,
3868 .version = IPVS_GENL_VERSION,
3869 .maxattr = IPVS_CMD_ATTR_MAX,
3870 .netnsok = true, /* Make ipvsadm to work on netns */
3871 .module = THIS_MODULE,
3872 .ops = ip_vs_genl_ops,
3873 .n_ops = ARRAY_SIZE(ip_vs_genl_ops),
3874};
3875
3875static int __init ip_vs_genl_register(void) 3876static int __init ip_vs_genl_register(void)
3876{ 3877{
3877 return genl_register_family_with_ops(&ip_vs_genl_family, 3878 return genl_register_family(&ip_vs_genl_family);
3878 ip_vs_genl_ops);
3879} 3879}
3880 3880
3881static void ip_vs_genl_unregister(void) 3881static void ip_vs_genl_unregister(void)
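The ip_vs_ctl.c hunks above also show the genetlink API shift in this tree: GENL_ID_GENERATE is gone (the family id is always auto-assigned), the family definition carries its ops table, op count and owning module, and registration is a single genl_register_family() call. A condensed sketch of that registration style with placeholder names (my_doit, my_genl_ops, my_family):

#include <net/genetlink.h>

static int my_doit(struct sk_buff *skb, struct genl_info *info)
{
	return 0;	/* placeholder command handler */
}

static const struct genl_ops my_genl_ops[] = {
	{
		.cmd  = 1,	/* placeholder command number */
		.doit = my_doit,
	},
};

static struct genl_family my_family __ro_after_init = {
	.hdrsize	= 0,
	.name		= "my_family",
	.version	= 1,
	.maxattr	= 0,
	.netnsok	= true,		/* commands usable inside a netns */
	.module		= THIS_MODULE,
	.ops		= my_genl_ops,
	.n_ops		= ARRAY_SIZE(my_genl_ops),
};

static int __init my_genl_register(void)
{
	/* Replaces genl_register_family_with_ops(); the ops now travel
	 * with the family definition itself.
	 */
	return genl_register_family(&my_family);
}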
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 6be5c538b71e..75f798f8e83b 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -163,7 +163,7 @@ static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
163 return -ENOMEM; 163 return -ENOMEM;
164 164
165 svc->sched_data = s; 165 svc->sched_data = s;
166 IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " 166 IP_VS_DBG(6, "DH hash table (memory=%zdbytes) allocated for "
167 "current service\n", 167 "current service\n",
168 sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); 168 sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
169 169
@@ -183,7 +183,7 @@ static void ip_vs_dh_done_svc(struct ip_vs_service *svc)
183 183
184 /* release the table itself */ 184 /* release the table itself */
185 kfree_rcu(s, rcu_head); 185 kfree_rcu(s, rcu_head);
186 IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", 186 IP_VS_DBG(6, "DH hash table (memory=%zdbytes) released\n",
187 sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); 187 sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
188} 188}
189 189
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index cccf4d637412..5824927cf8e0 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -356,7 +356,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
356 return -ENOMEM; 356 return -ENOMEM;
357 357
358 svc->sched_data = tbl; 358 svc->sched_data = tbl;
359 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " 359 IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) allocated for "
360 "current service\n", sizeof(*tbl)); 360 "current service\n", sizeof(*tbl));
361 361
362 /* 362 /*
@@ -393,7 +393,7 @@ static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)
393 393
394 /* release the table itself */ 394 /* release the table itself */
395 kfree_rcu(tbl, rcu_head); 395 kfree_rcu(tbl, rcu_head);
396 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", 396 IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) released\n",
397 sizeof(*tbl)); 397 sizeof(*tbl));
398} 398}
399 399
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 796d70e47ddd..703f11877bee 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -519,7 +519,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
519 return -ENOMEM; 519 return -ENOMEM;
520 520
521 svc->sched_data = tbl; 521 svc->sched_data = tbl;
522 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " 522 IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) allocated for "
523 "current service\n", sizeof(*tbl)); 523 "current service\n", sizeof(*tbl));
524 524
525 /* 525 /*
@@ -556,7 +556,7 @@ static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
 
 	/* release the table itself */
 	kfree_rcu(tbl, rcu_head);
-	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
+	IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) released\n",
 		  sizeof(*tbl));
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 1e373a5e44e3..16aaac6eedc9 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -239,7 +239,7 @@ static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
 		return -ENOMEM;
 
 	svc->sched_data = s;
-	IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
+	IP_VS_DBG(6, "SH hash table (memory=%zdbytes) allocated for "
 		  "current service\n",
 		  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
 
@@ -259,7 +259,7 @@ static void ip_vs_sh_done_svc(struct ip_vs_service *svc)
 
 	/* release the table itself */
 	kfree_rcu(s, rcu_head);
-	IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
+	IP_VS_DBG(6, "SH hash table (memory=%zdbytes) released\n",
 		  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 9350530c16c1..b03c28084f81 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1791,7 +1791,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
 	u16 mtu, min_mtu;
 
 	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
-	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
+	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n",
 		  sizeof(struct ip_vs_sync_conn_v0));
 
 	if (!ipvs->sync_state) {
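The hunks above replace the kernel's old non-standard %Z length modifier with the standard C99 %z for size_t/ssize_t values. A tiny self-contained sketch of the preferred usage (the table size below is made up for illustration):

#include <linux/printk.h>

static void example_report_table_size(void)
{
	/* sizeof() yields a size_t, so the 'z' length modifier matches it */
	pr_debug("hash table needs %zu bytes\n", sizeof(unsigned long) * 256);
}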
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 01d3d894de46..4e1a98fcc8c3 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -254,6 +254,54 @@ static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
 	return true;
 }
 
+static inline bool decrement_ttl(struct netns_ipvs *ipvs,
+				 int skb_af,
+				 struct sk_buff *skb)
+{
+	struct net *net = ipvs->net;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (skb_af == AF_INET6) {
+		struct dst_entry *dst = skb_dst(skb);
+
+		/* check and decrement ttl */
+		if (ipv6_hdr(skb)->hop_limit <= 1) {
+			/* Force OUTPUT device used as source address */
+			skb->dev = dst->dev;
+			icmpv6_send(skb, ICMPV6_TIME_EXCEED,
+				    ICMPV6_EXC_HOPLIMIT, 0);
+			__IP6_INC_STATS(net, ip6_dst_idev(dst),
+					IPSTATS_MIB_INHDRERRORS);
+
+			return false;
+		}
+
+		/* don't propagate ttl change to cloned packets */
+		if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+			return false;
+
+		ipv6_hdr(skb)->hop_limit--;
+	} else
+#endif
+	{
+		if (ip_hdr(skb)->ttl <= 1) {
+			/* Tell the sender its packet died... */
+			__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
+			icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
+			return false;
+		}
+
+		/* don't propagate ttl change to cloned packets */
+		if (!skb_make_writable(skb, sizeof(struct iphdr)))
+			return false;
+
+		/* Decrease ttl */
+		ip_decrease_ttl(ip_hdr(skb));
+	}
+
+	return true;
+}
+
 /* Get route to destination or remote server */
 static int
 __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
@@ -326,6 +374,9 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
 		return local;
 	}
 
+	if (!decrement_ttl(ipvs, skb_af, skb))
+		goto err_put;
+
 	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
 		mtu = dst_mtu(&rt->dst);
 	} else {
@@ -473,6 +524,9 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
 		return local;
 	}
 
+	if (!decrement_ttl(ipvs, skb_af, skb))
+		goto err_put;
+
 	/* MTU checking */
 	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
 		mtu = dst_mtu(&rt->dst);
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 0f87e5d21be7..ffb78e5f7b70 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -85,11 +85,11 @@ static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
 static __read_mostly bool nf_conntrack_locks_all;
 
 /* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */
-#define GC_MAX_BUCKETS_DIV	64u
-/* upper bound of scan intervals */
-#define GC_INTERVAL_MAX	(2 * HZ)
-/* maximum conntracks to evict per gc run */
-#define GC_MAX_EVICTS		256u
+#define GC_MAX_BUCKETS_DIV	128u
+/* upper bound of full table scan */
+#define GC_MAX_SCAN_JIFFIES	(16u * HZ)
+/* desired ratio of entries found to be expired */
+#define GC_EVICT_RATIO	50u
 
 static struct conntrack_gc_work conntrack_gc_work;
 
@@ -181,7 +181,11 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 unsigned int nf_conntrack_max __read_mostly;
 seqcount_t nf_conntrack_generation __read_mostly;
 
-DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
+/* nf_conn must be 8 bytes aligned, as the 3 LSB bits are used
+ * for the nfctinfo. We cheat by (ab)using the PER CPU cache line
+ * alignment to enforce this.
+ */
+DEFINE_PER_CPU_ALIGNED(struct nf_conn, nf_conntrack_untracked);
 EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
 
 static unsigned int nf_conntrack_hash_rnd __read_mostly;
@@ -350,16 +354,31 @@ static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
350 spin_unlock(&pcpu->lock); 354 spin_unlock(&pcpu->lock);
351} 355}
352 356
357#define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)
358
353/* Released via destroy_conntrack() */ 359/* Released via destroy_conntrack() */
354struct nf_conn *nf_ct_tmpl_alloc(struct net *net, 360struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
355 const struct nf_conntrack_zone *zone, 361 const struct nf_conntrack_zone *zone,
356 gfp_t flags) 362 gfp_t flags)
357{ 363{
358 struct nf_conn *tmpl; 364 struct nf_conn *tmpl, *p;
359 365
360 tmpl = kzalloc(sizeof(*tmpl), flags); 366 if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
361 if (tmpl == NULL) 367 tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
362 return NULL; 368 if (!tmpl)
369 return NULL;
370
371 p = tmpl;
372 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
373 if (tmpl != p) {
374 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
375 tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
376 }
377 } else {
378 tmpl = kzalloc(sizeof(*tmpl), flags);
379 if (!tmpl)
380 return NULL;
381 }
363 382
364 tmpl->status = IPS_TEMPLATE; 383 tmpl->status = IPS_TEMPLATE;
365 write_pnet(&tmpl->ct_net, net); 384 write_pnet(&tmpl->ct_net, net);
@@ -374,7 +393,11 @@ void nf_ct_tmpl_free(struct nf_conn *tmpl)
 {
 	nf_ct_ext_destroy(tmpl);
 	nf_ct_ext_free(tmpl);
-	kfree(tmpl);
+
+	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
+		kfree((char *)tmpl - tmpl->proto.tmpl_padto);
+	else
+		kfree(tmpl);
 }
 EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
 
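The over-allocation in nf_ct_tmpl_alloc() above exists so the returned pointer can be rounded up until its three low bits are clear; those bits are what later carries the ctinfo value packed into skb->_nfct, and tmpl_padto remembers the shift so nf_ct_tmpl_free() can recover the original allocation. A small sketch of the rounding arithmetic, with illustrative names and assuming the mask is 7 (3 reserved bits):

#include <linux/slab.h>

#define EXAMPLE_INFOMASK	7UL
#define EXAMPLE_ALIGN(len)	(((len) + EXAMPLE_INFOMASK) & ~EXAMPLE_INFOMASK)

/* Return a pointer whose 3 low bits are zero; *padto records how far the
 * pointer was advanced so the caller can kfree() the original allocation.
 */
static void *example_alloc_ctinfo_aligned(size_t size, u8 *padto, gfp_t flags)
{
	char *p, *aligned;

	p = kzalloc(size + EXAMPLE_INFOMASK, flags);
	if (!p)
		return NULL;

	aligned = (char *)EXAMPLE_ALIGN((unsigned long)p);
	*padto = aligned - p;		/* 0..7 */
	return aligned;
}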
@@ -686,12 +709,12 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
 	    !nfct_nat(ct) &&
 	    !nf_ct_is_dying(ct) &&
 	    atomic_inc_not_zero(&ct->ct_general.use)) {
-		nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct);
-		nf_conntrack_put(skb->nfct);
-		/* Assign conntrack already in hashes to this skbuff. Don't
-		 * modify skb->nfctinfo to ensure consistent stateful filtering.
-		 */
-		skb->nfct = &ct->ct_general;
+		enum ip_conntrack_info oldinfo;
+		struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
+
+		nf_ct_acct_merge(ct, ctinfo, loser_ct);
+		nf_conntrack_put(&loser_ct->ct_general);
+		nf_ct_set(skb, ct, oldinfo);
 		return NF_ACCEPT;
 	}
 	NF_CT_STAT_INC(net, drop);
@@ -783,7 +806,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	/* set conntrack timestamp, if enabled. */
 	tstamp = nf_conn_tstamp_find(ct);
 	if (tstamp) {
-		if (skb->tstamp.tv64 == 0)
+		if (skb->tstamp == 0)
 			__net_timestamp(skb);
 
 		tstamp->start = ktime_to_ns(skb->tstamp);
@@ -938,6 +961,7 @@ static noinline int early_drop(struct net *net, unsigned int _hash)
 
 static void gc_worker(struct work_struct *work)
 {
+	unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
 	unsigned int i, goal, buckets = 0, expired_count = 0;
 	struct conntrack_gc_work *gc_work;
 	unsigned int ratio, scanned = 0;
@@ -979,8 +1003,7 @@ static void gc_worker(struct work_struct *work)
 		 */
 		rcu_read_unlock();
 		cond_resched_rcu_qs();
-	} while (++buckets < goal &&
-		 expired_count < GC_MAX_EVICTS);
+	} while (++buckets < goal);
 
 	if (gc_work->exiting)
 		return;
@@ -997,27 +1020,25 @@ static void gc_worker(struct work_struct *work)
 	 * 1. Minimize time until we notice a stale entry
 	 * 2. Maximize scan intervals to not waste cycles
 	 *
-	 * Normally, expired_count will be 0, this increases the next_run time
-	 * to priorize 2) above.
+	 * Normally, expire ratio will be close to 0.
 	 *
-	 * As soon as a timed-out entry is found, move towards 1) and increase
-	 * the scan frequency.
-	 * In case we have lots of evictions next scan is done immediately.
+	 * As soon as a sizeable fraction of the entries have expired
+	 * increase scan frequency.
 	 */
 	ratio = scanned ? expired_count * 100 / scanned : 0;
-	if (ratio >= 90 || expired_count == GC_MAX_EVICTS) {
-		gc_work->next_gc_run = 0;
-		next_run = 0;
-	} else if (expired_count) {
-		gc_work->next_gc_run /= 2U;
-		next_run = msecs_to_jiffies(1);
+	if (ratio > GC_EVICT_RATIO) {
+		gc_work->next_gc_run = min_interval;
 	} else {
-		if (gc_work->next_gc_run < GC_INTERVAL_MAX)
-			gc_work->next_gc_run += msecs_to_jiffies(1);
+		unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV;
 
-		next_run = gc_work->next_gc_run;
+		BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0);
+
+		gc_work->next_gc_run += min_interval;
+		if (gc_work->next_gc_run > max)
+			gc_work->next_gc_run = max;
 	}
 
+	next_run = gc_work->next_gc_run;
 	gc_work->last_bucket = i;
 	queue_delayed_work(system_long_wq, &gc_work->dwork, next_run);
 }
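The rescheduling policy above boils down to a small calculation: while more than GC_EVICT_RATIO percent of the scanned entries were expired, rescan at the per-bucket minimum interval; otherwise grow the interval by that minimum each run, capped so a full table scan still completes within GC_MAX_SCAN_JIFFIES. A standalone sketch of that arithmetic using the constants from this hunk (illustrative only, not part of the patch):

#include <linux/kernel.h>
#include <linux/jiffies.h>

/* Mirror of the interval update above: 128 buckets/div, 16s full scan,
 * 50% expired ratio threshold.
 */
static unsigned int example_next_gc_run(unsigned int prev_run,
					unsigned int scanned,
					unsigned int expired)
{
	unsigned int min_interval = max(HZ / 128u, 1u);
	unsigned int max_interval = (16u * HZ) / 128u;
	unsigned int ratio = scanned ? expired * 100 / scanned : 0;

	if (ratio > 50u)
		return min_interval;	/* table is full of stale entries, rescan soon */

	return min(prev_run + min_interval, max_interval);
}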
@@ -1025,7 +1046,7 @@ static void gc_worker(struct work_struct *work)
 static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
 {
 	INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
-	gc_work->next_gc_run = GC_INTERVAL_MAX;
+	gc_work->next_gc_run = HZ;
 	gc_work->exiting = false;
 }
 
@@ -1220,7 +1241,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
1220 return &ct->tuplehash[IP_CT_DIR_ORIGINAL]; 1241 return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
1221} 1242}
1222 1243
1223/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ 1244/* On success, returns conntrack ptr, sets skb->_nfct | ctinfo */
1224static inline struct nf_conn * 1245static inline struct nf_conn *
1225resolve_normal_ct(struct net *net, struct nf_conn *tmpl, 1246resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
1226 struct sk_buff *skb, 1247 struct sk_buff *skb,
@@ -1279,8 +1300,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
1279 } 1300 }
1280 *set_reply = 0; 1301 *set_reply = 0;
1281 } 1302 }
1282 skb->nfct = &ct->ct_general; 1303 nf_ct_set(skb, ct, *ctinfo);
1283 skb->nfctinfo = *ctinfo;
1284 return ct; 1304 return ct;
1285} 1305}
1286 1306
@@ -1288,7 +1308,7 @@ unsigned int
1288nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, 1308nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1289 struct sk_buff *skb) 1309 struct sk_buff *skb)
1290{ 1310{
1291 struct nf_conn *ct, *tmpl = NULL; 1311 struct nf_conn *ct, *tmpl;
1292 enum ip_conntrack_info ctinfo; 1312 enum ip_conntrack_info ctinfo;
1293 struct nf_conntrack_l3proto *l3proto; 1313 struct nf_conntrack_l3proto *l3proto;
1294 struct nf_conntrack_l4proto *l4proto; 1314 struct nf_conntrack_l4proto *l4proto;
@@ -1298,14 +1318,14 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1298 int set_reply = 0; 1318 int set_reply = 0;
1299 int ret; 1319 int ret;
1300 1320
1301 if (skb->nfct) { 1321 tmpl = nf_ct_get(skb, &ctinfo);
1322 if (tmpl) {
1302 /* Previously seen (loopback or untracked)? Ignore. */ 1323 /* Previously seen (loopback or untracked)? Ignore. */
1303 tmpl = (struct nf_conn *)skb->nfct;
1304 if (!nf_ct_is_template(tmpl)) { 1324 if (!nf_ct_is_template(tmpl)) {
1305 NF_CT_STAT_INC_ATOMIC(net, ignore); 1325 NF_CT_STAT_INC_ATOMIC(net, ignore);
1306 return NF_ACCEPT; 1326 return NF_ACCEPT;
1307 } 1327 }
1308 skb->nfct = NULL; 1328 skb->_nfct = 0;
1309 } 1329 }
1310 1330
1311 /* rcu_read_lock()ed by nf_hook_thresh */ 1331 /* rcu_read_lock()ed by nf_hook_thresh */
@@ -1326,8 +1346,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1326 * inverse of the return code tells to the netfilter 1346 * inverse of the return code tells to the netfilter
1327 * core what to do with the packet. */ 1347 * core what to do with the packet. */
1328 if (l4proto->error != NULL) { 1348 if (l4proto->error != NULL) {
1329 ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo, 1349 ret = l4proto->error(net, tmpl, skb, dataoff, pf, hooknum);
1330 pf, hooknum);
1331 if (ret <= 0) { 1350 if (ret <= 0) {
1332 NF_CT_STAT_INC_ATOMIC(net, error); 1351 NF_CT_STAT_INC_ATOMIC(net, error);
1333 NF_CT_STAT_INC_ATOMIC(net, invalid); 1352 NF_CT_STAT_INC_ATOMIC(net, invalid);
@@ -1335,10 +1354,10 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1335 goto out; 1354 goto out;
1336 } 1355 }
1337 /* ICMP[v6] protocol trackers may assign one conntrack. */ 1356 /* ICMP[v6] protocol trackers may assign one conntrack. */
1338 if (skb->nfct) 1357 if (skb->_nfct)
1339 goto out; 1358 goto out;
1340 } 1359 }
1341 1360repeat:
1342 ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, 1361 ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
1343 l3proto, l4proto, &set_reply, &ctinfo); 1362 l3proto, l4proto, &set_reply, &ctinfo);
1344 if (!ct) { 1363 if (!ct) {
@@ -1355,7 +1374,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1355 goto out; 1374 goto out;
1356 } 1375 }
1357 1376
1358 NF_CT_ASSERT(skb->nfct); 1377 NF_CT_ASSERT(skb_nfct(skb));
1359 1378
1360 /* Decide what timeout policy we want to apply to this flow. */ 1379 /* Decide what timeout policy we want to apply to this flow. */
1361 timeouts = nf_ct_timeout_lookup(net, ct, l4proto); 1380 timeouts = nf_ct_timeout_lookup(net, ct, l4proto);
@@ -1365,11 +1384,17 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1365 /* Invalid: inverse of the return code tells 1384 /* Invalid: inverse of the return code tells
1366 * the netfilter core what to do */ 1385 * the netfilter core what to do */
1367 pr_debug("nf_conntrack_in: Can't track with proto module\n"); 1386 pr_debug("nf_conntrack_in: Can't track with proto module\n");
1368 nf_conntrack_put(skb->nfct); 1387 nf_conntrack_put(&ct->ct_general);
1369 skb->nfct = NULL; 1388 skb->_nfct = 0;
1370 NF_CT_STAT_INC_ATOMIC(net, invalid); 1389 NF_CT_STAT_INC_ATOMIC(net, invalid);
1371 if (ret == -NF_DROP) 1390 if (ret == -NF_DROP)
1372 NF_CT_STAT_INC_ATOMIC(net, drop); 1391 NF_CT_STAT_INC_ATOMIC(net, drop);
1392 /* Special case: TCP tracker reports an attempt to reopen a
1393 * closed/aborted connection. We have to go back and create a
1394 * fresh conntrack.
1395 */
1396 if (ret == -NF_REPEAT)
1397 goto repeat;
1373 ret = -ret; 1398 ret = -ret;
1374 goto out; 1399 goto out;
1375 } 1400 }
@@ -1377,15 +1402,8 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1377 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) 1402 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
1378 nf_conntrack_event_cache(IPCT_REPLY, ct); 1403 nf_conntrack_event_cache(IPCT_REPLY, ct);
1379out: 1404out:
1380 if (tmpl) { 1405 if (tmpl)
1381 /* Special case: we have to repeat this hook, assign the 1406 nf_ct_put(tmpl);
1382 * template again to this packet. We assume that this packet
1383 * has no conntrack assigned. This is used by nf_ct_tcp. */
1384 if (ret == NF_REPEAT)
1385 skb->nfct = (struct nf_conntrack *)tmpl;
1386 else
1387 nf_ct_put(tmpl);
1388 }
1389 1407
1390 return ret; 1408 return ret;
1391} 1409}
@@ -1525,9 +1543,8 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
1525 ctinfo = IP_CT_RELATED; 1543 ctinfo = IP_CT_RELATED;
1526 1544
1527 /* Attach to new skbuff, and increment count */ 1545 /* Attach to new skbuff, and increment count */
1528 nskb->nfct = &ct->ct_general; 1546 nf_ct_set(nskb, ct, ctinfo);
1529 nskb->nfctinfo = ctinfo; 1547 nf_conntrack_get(skb_nfct(nskb));
1530 nf_conntrack_get(nskb->nfct);
1531} 1548}
1532 1549
1533/* Bring out ya dead! */ 1550/* Bring out ya dead! */
@@ -1863,7 +1880,8 @@ int nf_conntrack_init_start(void)
1863 nf_conntrack_max = max_factor * nf_conntrack_htable_size; 1880 nf_conntrack_max = max_factor * nf_conntrack_htable_size;
1864 1881
1865 nf_conntrack_cachep = kmem_cache_create("nf_conntrack", 1882 nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
1866 sizeof(struct nf_conn), 0, 1883 sizeof(struct nf_conn),
1884 NFCT_INFOMASK + 1,
1867 SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); 1885 SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
1868 if (!nf_conntrack_cachep) 1886 if (!nf_conntrack_cachep)
1869 goto err_cachep; 1887 goto err_cachep;
@@ -1918,7 +1936,7 @@ int nf_conntrack_init_start(void)
 	nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
 
 	conntrack_gc_work_init(&conntrack_gc_work);
-	queue_delayed_work(system_long_wq, &conntrack_gc_work.dwork, GC_INTERVAL_MAX);
+	queue_delayed_work(system_long_wq, &conntrack_gc_work.dwork, HZ);
 
 	return 0;
 
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index da9df2d56e66..22fc32143e9c 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -290,6 +290,7 @@ void nf_conntrack_unregister_notifier(struct net *net,
 	BUG_ON(notify != new);
 	RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL);
 	mutex_unlock(&nf_ct_ecache_mutex);
+	/* synchronize_rcu() is called from ctnetlink_exit. */
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
 
@@ -326,6 +327,7 @@ void nf_ct_expect_unregister_notifier(struct net *net,
 	BUG_ON(notify != new);
 	RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL);
 	mutex_unlock(&nf_ct_ecache_mutex);
+	/* synchronize_rcu() is called from ctnetlink_exit. */
 }
 EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
 
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index f8dbacf66795..d80073037856 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -57,7 +57,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
 	hlist_del_rcu(&exp->hnode);
 	net->ct.expect_count--;
 
-	hlist_del(&exp->lnode);
+	hlist_del_rcu(&exp->lnode);
 	master_help->expecting[exp->class]--;
 
 	nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report);
@@ -353,7 +353,7 @@ void nf_ct_expect_put(struct nf_conntrack_expect *exp)
 }
 EXPORT_SYMBOL_GPL(nf_ct_expect_put);
 
-static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
+static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
 {
 	struct nf_conn_help *master_help = nfct_help(exp->master);
 	struct nf_conntrack_helper *helper;
@@ -363,7 +363,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
 	/* two references : one for hash insert, one for the timer */
 	atomic_add(2, &exp->use);
 
-	hlist_add_head(&exp->lnode, &master_help->expectations);
+	hlist_add_head_rcu(&exp->lnode, &master_help->expectations);
 	master_help->expecting[exp->class]++;
 
 	hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]);
@@ -380,7 +380,6 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
 	add_timer(&exp->timeout);
 
 	NF_CT_STAT_INC(net, expect_create);
-	return 0;
 }
 
 /* Race with expectations being used means we could have none to find; OK. */
@@ -411,7 +410,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
 	struct net *net = nf_ct_exp_net(expect);
 	struct hlist_node *next;
 	unsigned int h;
-	int ret = 1;
+	int ret = 0;
 
 	if (!master_help) {
 		ret = -ESHUTDOWN;
@@ -461,15 +460,14 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
461 460
462 spin_lock_bh(&nf_conntrack_expect_lock); 461 spin_lock_bh(&nf_conntrack_expect_lock);
463 ret = __nf_ct_expect_check(expect); 462 ret = __nf_ct_expect_check(expect);
464 if (ret <= 0)
465 goto out;
466
467 ret = nf_ct_expect_insert(expect);
468 if (ret < 0) 463 if (ret < 0)
469 goto out; 464 goto out;
465
466 nf_ct_expect_insert(expect);
467
470 spin_unlock_bh(&nf_conntrack_expect_lock); 468 spin_unlock_bh(&nf_conntrack_expect_lock);
471 nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); 469 nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
472 return ret; 470 return 0;
473out: 471out:
474 spin_unlock_bh(&nf_conntrack_expect_lock); 472 spin_unlock_bh(&nf_conntrack_expect_lock);
475 return ret; 473 return ret;
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index 02bcf00c2492..008299b7f78f 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -53,7 +53,11 @@ nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id,
 
 	rcu_read_lock();
 	t = rcu_dereference(nf_ct_ext_types[id]);
-	BUG_ON(t == NULL);
+	if (!t) {
+		rcu_read_unlock();
+		return NULL;
+	}
+
 	off = ALIGN(sizeof(struct nf_ct_ext), t->align);
 	len = off + t->len + var_alloc_len;
 	alloc_size = t->alloc_size + var_alloc_len;
@@ -88,7 +92,10 @@ void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id,
 
 	rcu_read_lock();
 	t = rcu_dereference(nf_ct_ext_types[id]);
-	BUG_ON(t == NULL);
+	if (!t) {
+		rcu_read_unlock();
+		return NULL;
+	}
 
 	newoff = ALIGN(old->len, t->align);
 	newlen = newoff + t->len + var_alloc_len;
@@ -175,6 +182,6 @@ void nf_ct_extend_unregister(struct nf_ct_ext_type *type)
 	RCU_INIT_POINTER(nf_ct_ext_types[type->id], NULL);
 	update_alloc_size(type);
 	mutex_unlock(&nf_ct_ext_type_mutex);
-	rcu_barrier(); /* Wait for completion of call_rcu()'s */
+	synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(nf_ct_extend_unregister);
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index e3ed20060878..4aecef4a89fb 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -300,7 +300,7 @@ static int find_pattern(const char *data, size_t dlen,
 {
 	size_t i = plen;
 
-	pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen);
+	pr_debug("find_pattern `%s': dlen = %zu\n", pattern, dlen);
 
 	if (dlen <= plen) {
 		/* Short packet: try for partial? */
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 7341adf7059d..4eeb3418366a 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -158,16 +158,25 @@ nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum)
158{ 158{
159 struct nf_conntrack_helper *h; 159 struct nf_conntrack_helper *h;
160 160
161 rcu_read_lock();
162
161 h = __nf_conntrack_helper_find(name, l3num, protonum); 163 h = __nf_conntrack_helper_find(name, l3num, protonum);
162#ifdef CONFIG_MODULES 164#ifdef CONFIG_MODULES
163 if (h == NULL) { 165 if (h == NULL) {
164 if (request_module("nfct-helper-%s", name) == 0) 166 rcu_read_unlock();
167 if (request_module("nfct-helper-%s", name) == 0) {
168 rcu_read_lock();
165 h = __nf_conntrack_helper_find(name, l3num, protonum); 169 h = __nf_conntrack_helper_find(name, l3num, protonum);
170 } else {
171 return h;
172 }
166 } 173 }
167#endif 174#endif
168 if (h != NULL && !try_module_get(h->me)) 175 if (h != NULL && !try_module_get(h->me))
169 h = NULL; 176 h = NULL;
170 177
178 rcu_read_unlock();
179
171 return h; 180 return h;
172} 181}
173EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get); 182EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get);
@@ -188,6 +197,26 @@ nf_ct_helper_ext_add(struct nf_conn *ct,
188} 197}
189EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add); 198EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add);
190 199
200static struct nf_conntrack_helper *
201nf_ct_lookup_helper(struct nf_conn *ct, struct net *net)
202{
203 if (!net->ct.sysctl_auto_assign_helper) {
204 if (net->ct.auto_assign_helper_warned)
205 return NULL;
206 if (!__nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple))
207 return NULL;
208 pr_info("nf_conntrack: default automatic helper assignment "
209 "has been turned off for security reasons and CT-based "
210 " firewall rule not found. Use the iptables CT target "
211 "to attach helpers instead.\n");
212 net->ct.auto_assign_helper_warned = 1;
213 return NULL;
214 }
215
216 return __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
217}
218
219
191int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, 220int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
192 gfp_t flags) 221 gfp_t flags)
193{ 222{
@@ -213,21 +242,14 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
213 } 242 }
214 243
215 help = nfct_help(ct); 244 help = nfct_help(ct);
216 if (net->ct.sysctl_auto_assign_helper && helper == NULL) {
217 helper = __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
218 if (unlikely(!net->ct.auto_assign_helper_warned && helper)) {
219 pr_info("nf_conntrack: automatic helper "
220 "assignment is deprecated and it will "
221 "be removed soon. Use the iptables CT target "
222 "to attach helpers instead.\n");
223 net->ct.auto_assign_helper_warned = true;
224 }
225 }
226 245
227 if (helper == NULL) { 246 if (helper == NULL) {
228 if (help) 247 helper = nf_ct_lookup_helper(ct, net);
229 RCU_INIT_POINTER(help->helper, NULL); 248 if (helper == NULL) {
230 return 0; 249 if (help)
250 RCU_INIT_POINTER(help->helper, NULL);
251 return 0;
252 }
231 } 253 }
232 254
233 if (help == NULL) { 255 if (help == NULL) {
@@ -298,38 +320,36 @@ void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n)
298} 320}
299EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister); 321EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister);
300 322
323/* Caller should hold the rcu lock */
301struct nf_ct_helper_expectfn * 324struct nf_ct_helper_expectfn *
302nf_ct_helper_expectfn_find_by_name(const char *name) 325nf_ct_helper_expectfn_find_by_name(const char *name)
303{ 326{
304 struct nf_ct_helper_expectfn *cur; 327 struct nf_ct_helper_expectfn *cur;
305 bool found = false; 328 bool found = false;
306 329
307 rcu_read_lock();
308 list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) { 330 list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) {
309 if (!strcmp(cur->name, name)) { 331 if (!strcmp(cur->name, name)) {
310 found = true; 332 found = true;
311 break; 333 break;
312 } 334 }
313 } 335 }
314 rcu_read_unlock();
315 return found ? cur : NULL; 336 return found ? cur : NULL;
316} 337}
317EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_name); 338EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_name);
318 339
340/* Caller should hold the rcu lock */
319struct nf_ct_helper_expectfn * 341struct nf_ct_helper_expectfn *
320nf_ct_helper_expectfn_find_by_symbol(const void *symbol) 342nf_ct_helper_expectfn_find_by_symbol(const void *symbol)
321{ 343{
322 struct nf_ct_helper_expectfn *cur; 344 struct nf_ct_helper_expectfn *cur;
323 bool found = false; 345 bool found = false;
324 346
325 rcu_read_lock();
326 list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) { 347 list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) {
327 if (cur->expectfn == symbol) { 348 if (cur->expectfn == symbol) {
328 found = true; 349 found = true;
329 break; 350 break;
330 } 351 }
331 } 352 }
332 rcu_read_unlock();
333 return found ? cur : NULL; 353 return found ? cur : NULL;
334} 354}
335EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_symbol); 355EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_symbol);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 27540455dc62..dc7dfd68fafe 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1478,14 +1478,28 @@ static int ctnetlink_change_helper(struct nf_conn *ct,
1478 struct nlattr *helpinfo = NULL; 1478 struct nlattr *helpinfo = NULL;
1479 int err; 1479 int err;
1480 1480
1481 /* don't change helper of sibling connections */
1482 if (ct->master)
1483 return -EBUSY;
1484
1485 err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo); 1481 err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo);
1486 if (err < 0) 1482 if (err < 0)
1487 return err; 1483 return err;
1488 1484
1485 /* don't change helper of sibling connections */
1486 if (ct->master) {
1487 /* If we try to change the helper to the same thing twice,
1488 * treat the second attempt as a no-op instead of returning
1489 * an error.
1490 */
1491 err = -EBUSY;
1492 if (help) {
1493 rcu_read_lock();
1494 helper = rcu_dereference(help->helper);
1495 if (helper && !strcmp(helper->name, helpname))
1496 err = 0;
1497 rcu_read_unlock();
1498 }
1499
1500 return err;
1501 }
1502
1489 if (!strcmp(helpname, "")) { 1503 if (!strcmp(helpname, "")) {
1490 if (help && help->helper) { 1504 if (help && help->helper) {
1491 /* we had a helper before ... */ 1505 /* we had a helper before ... */
@@ -1920,9 +1934,9 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
 
 	err = 0;
 	if (test_bit(IPS_EXPECTED_BIT, &ct->status))
-		events = IPCT_RELATED;
+		events = 1 << IPCT_RELATED;
 	else
-		events = IPCT_NEW;
+		events = 1 << IPCT_NEW;
 
 	if (cda[CTA_LABELS] &&
 	    ctnetlink_attach_labels(ct, cda) == 0)
@@ -2270,6 +2284,30 @@ nla_put_failure:
2270} 2284}
2271 2285
2272static int 2286static int
2287ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[])
2288{
2289 unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS]));
2290 unsigned long d = ct->status ^ status;
2291
2292 if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
2293 /* SEEN_REPLY bit can only be set */
2294 return -EBUSY;
2295
2296 if (d & IPS_ASSURED && !(status & IPS_ASSURED))
2297 /* ASSURED bit can only be set */
2298 return -EBUSY;
2299
2300 /* This check is less strict than ctnetlink_change_status()
2301 * because callers often flip IPS_EXPECTED bits when sending
2302 * an NFQA_CT attribute to the kernel. So ignore the
2303 * unchangeable bits but do not error out.
2304 */
2305 ct->status = (status & ~IPS_UNCHANGEABLE_MASK) |
2306 (ct->status & IPS_UNCHANGEABLE_MASK);
2307 return 0;
2308}
2309
2310static int
2273ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct) 2311ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
2274{ 2312{
2275 int err; 2313 int err;
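The checks in the new ctnetlink_update_status() above rest on one bit argument: d = ct->status ^ status has a bit set exactly where the request differs from the current status, so a differing bit that is clear in the request is an attempt to clear a set-only flag. The same check in isolation, with a hypothetical helper name:

#include <linux/errno.h>
#include <linux/types.h>

/* Reject a request that tries to clear a bit which may only be set. */
static int example_check_set_only_bit(unsigned long cur, unsigned long req,
				      unsigned long bit)
{
	unsigned long d = cur ^ req;	/* positions where cur and req differ */

	if ((d & bit) && !(req & bit))
		return -EBUSY;		/* bit is set now, request clears it */

	return 0;
}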
@@ -2280,7 +2318,7 @@ ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
 		return err;
 	}
 	if (cda[CTA_STATUS]) {
-		err = ctnetlink_change_status(ct, cda);
+		err = ctnetlink_update_status(ct, cda);
 		if (err < 0)
 			return err;
 	}
@@ -2642,8 +2680,8 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
2642 last = (struct nf_conntrack_expect *)cb->args[1]; 2680 last = (struct nf_conntrack_expect *)cb->args[1];
2643 for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { 2681 for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
2644restart: 2682restart:
2645 hlist_for_each_entry(exp, &nf_ct_expect_hash[cb->args[0]], 2683 hlist_for_each_entry_rcu(exp, &nf_ct_expect_hash[cb->args[0]],
2646 hnode) { 2684 hnode) {
2647 if (l3proto && exp->tuple.src.l3num != l3proto) 2685 if (l3proto && exp->tuple.src.l3num != l3proto)
2648 continue; 2686 continue;
2649 2687
@@ -2694,7 +2732,7 @@ ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
2694 rcu_read_lock(); 2732 rcu_read_lock();
2695 last = (struct nf_conntrack_expect *)cb->args[1]; 2733 last = (struct nf_conntrack_expect *)cb->args[1];
2696restart: 2734restart:
2697 hlist_for_each_entry(exp, &help->expectations, lnode) { 2735 hlist_for_each_entry_rcu(exp, &help->expectations, lnode) {
2698 if (l3proto && exp->tuple.src.l3num != l3proto) 2736 if (l3proto && exp->tuple.src.l3num != l3proto)
2699 continue; 2737 continue;
2700 if (cb->args[1]) { 2738 if (cb->args[1]) {
@@ -2756,6 +2794,12 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl,
2756 return -ENOENT; 2794 return -ENOENT;
2757 2795
2758 ct = nf_ct_tuplehash_to_ctrack(h); 2796 ct = nf_ct_tuplehash_to_ctrack(h);
2797 /* No expectation linked to this connection tracking. */
2798 if (!nfct_help(ct)) {
2799 nf_ct_put(ct);
2800 return 0;
2801 }
2802
2759 c.data = ct; 2803 c.data = ct;
2760 2804
2761 err = netlink_dump_start(ctnl, skb, nlh, &c); 2805 err = netlink_dump_start(ctnl, skb, nlh, &c);
@@ -3100,23 +3144,27 @@ ctnetlink_create_expect(struct net *net,
3100 return -ENOENT; 3144 return -ENOENT;
3101 ct = nf_ct_tuplehash_to_ctrack(h); 3145 ct = nf_ct_tuplehash_to_ctrack(h);
3102 3146
3147 rcu_read_lock();
3103 if (cda[CTA_EXPECT_HELP_NAME]) { 3148 if (cda[CTA_EXPECT_HELP_NAME]) {
3104 const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); 3149 const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]);
3105 3150
3106 helper = __nf_conntrack_helper_find(helpname, u3, 3151 helper = __nf_conntrack_helper_find(helpname, u3,
3107 nf_ct_protonum(ct)); 3152 nf_ct_protonum(ct));
3108 if (helper == NULL) { 3153 if (helper == NULL) {
3154 rcu_read_unlock();
3109#ifdef CONFIG_MODULES 3155#ifdef CONFIG_MODULES
3110 if (request_module("nfct-helper-%s", helpname) < 0) { 3156 if (request_module("nfct-helper-%s", helpname) < 0) {
3111 err = -EOPNOTSUPP; 3157 err = -EOPNOTSUPP;
3112 goto err_ct; 3158 goto err_ct;
3113 } 3159 }
3160 rcu_read_lock();
3114 helper = __nf_conntrack_helper_find(helpname, u3, 3161 helper = __nf_conntrack_helper_find(helpname, u3,
3115 nf_ct_protonum(ct)); 3162 nf_ct_protonum(ct));
3116 if (helper) { 3163 if (helper) {
3117 err = -EAGAIN; 3164 err = -EAGAIN;
3118 goto err_ct; 3165 goto err_rcu;
3119 } 3166 }
3167 rcu_read_unlock();
3120#endif 3168#endif
3121 err = -EOPNOTSUPP; 3169 err = -EOPNOTSUPP;
3122 goto err_ct; 3170 goto err_ct;
@@ -3126,11 +3174,13 @@ ctnetlink_create_expect(struct net *net,
3126 exp = ctnetlink_alloc_expect(cda, ct, helper, &tuple, &mask); 3174 exp = ctnetlink_alloc_expect(cda, ct, helper, &tuple, &mask);
3127 if (IS_ERR(exp)) { 3175 if (IS_ERR(exp)) {
3128 err = PTR_ERR(exp); 3176 err = PTR_ERR(exp);
3129 goto err_ct; 3177 goto err_rcu;
3130 } 3178 }
3131 3179
3132 err = nf_ct_expect_related_report(exp, portid, report); 3180 err = nf_ct_expect_related_report(exp, portid, report);
3133 nf_ct_expect_put(exp); 3181 nf_ct_expect_put(exp);
3182err_rcu:
3183 rcu_read_unlock();
3134err_ct: 3184err_ct:
3135 nf_ct_put(ct); 3185 nf_ct_put(ct);
3136 return err; 3186 return err;
@@ -3409,6 +3459,7 @@ static void __exit ctnetlink_exit(void)
 #ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT
 	RCU_INIT_POINTER(nfnl_ct_hook, NULL);
 #endif
+	synchronize_rcu();
 }
 
 module_init(ctnetlink_init);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 8d2c7d8c666a..2d6ee1803415 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -125,6 +125,54 @@ void nf_ct_l3proto_module_put(unsigned short l3proto)
125} 125}
126EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put); 126EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put);
127 127
128int nf_ct_netns_get(struct net *net, u8 nfproto)
129{
130 const struct nf_conntrack_l3proto *l3proto;
131 int ret;
132
133 might_sleep();
134
135 ret = nf_ct_l3proto_try_module_get(nfproto);
136 if (ret < 0)
137 return ret;
138
139 /* we already have a reference, can't fail */
140 rcu_read_lock();
141 l3proto = __nf_ct_l3proto_find(nfproto);
142 rcu_read_unlock();
143
144 if (!l3proto->net_ns_get)
145 return 0;
146
147 ret = l3proto->net_ns_get(net);
148 if (ret < 0)
149 nf_ct_l3proto_module_put(nfproto);
150
151 return ret;
152}
153EXPORT_SYMBOL_GPL(nf_ct_netns_get);
154
155void nf_ct_netns_put(struct net *net, u8 nfproto)
156{
157 const struct nf_conntrack_l3proto *l3proto;
158
159 might_sleep();
160
161 /* same as nf_conntrack_netns_get(), reference assumed */
162 rcu_read_lock();
163 l3proto = __nf_ct_l3proto_find(nfproto);
164 rcu_read_unlock();
165
166 if (WARN_ON(!l3proto))
167 return;
168
169 if (l3proto->net_ns_put)
170 l3proto->net_ns_put(net);
171
172 nf_ct_l3proto_module_put(nfproto);
173}
174EXPORT_SYMBOL_GPL(nf_ct_netns_put);
175
128struct nf_conntrack_l4proto * 176struct nf_conntrack_l4proto *
129nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num) 177nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num)
130{ 178{
@@ -190,20 +238,19 @@ out_unlock:
190} 238}
191EXPORT_SYMBOL_GPL(nf_ct_l3proto_register); 239EXPORT_SYMBOL_GPL(nf_ct_l3proto_register);
192 240
241#ifdef CONFIG_SYSCTL
242extern unsigned int nf_conntrack_default_on;
243
193int nf_ct_l3proto_pernet_register(struct net *net, 244int nf_ct_l3proto_pernet_register(struct net *net,
194 struct nf_conntrack_l3proto *proto) 245 struct nf_conntrack_l3proto *proto)
195{ 246{
196 int ret; 247 if (nf_conntrack_default_on == 0)
197 248 return 0;
198 if (proto->init_net) {
199 ret = proto->init_net(net);
200 if (ret < 0)
201 return ret;
202 }
203 249
204 return 0; 250 return proto->net_ns_get ? proto->net_ns_get(net) : 0;
205} 251}
206EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register); 252EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register);
253#endif
207 254
208void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto) 255void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto)
209{ 256{
@@ -224,6 +271,16 @@ EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister);
224void nf_ct_l3proto_pernet_unregister(struct net *net, 271void nf_ct_l3proto_pernet_unregister(struct net *net,
225 struct nf_conntrack_l3proto *proto) 272 struct nf_conntrack_l3proto *proto)
226{ 273{
274 /*
275 * nf_conntrack_default_on *might* have registered hooks.
276 * ->net_ns_put must cope with more puts() than get(), i.e.
277 * if nf_conntrack_default_on was 0 at time of
278 * nf_ct_l3proto_pernet_register invocation this net_ns_put()
279 * should be a noop.
280 */
281 if (proto->net_ns_put)
282 proto->net_ns_put(net);
283
227 /* Remove all contrack entries for this protocol */ 284 /* Remove all contrack entries for this protocol */
228 nf_ct_iterate_cleanup(net, kill_l3proto, proto, 0, 0); 285 nf_ct_iterate_cleanup(net, kill_l3proto, proto, 0, 0);
229} 286}
@@ -281,15 +338,15 @@ void nf_ct_l4proto_unregister_sysctl(struct net *net,
281 338
282/* FIXME: Allow NULL functions and sub in pointers to generic for 339/* FIXME: Allow NULL functions and sub in pointers to generic for
283 them. --RR */ 340 them. --RR */
284int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto) 341int nf_ct_l4proto_register_one(struct nf_conntrack_l4proto *l4proto)
285{ 342{
286 int ret = 0; 343 int ret = 0;
287 344
288 if (l4proto->l3proto >= PF_MAX) 345 if (l4proto->l3proto >= PF_MAX)
289 return -EBUSY; 346 return -EBUSY;
290 347
291 if ((l4proto->to_nlattr && !l4proto->nlattr_size) 348 if ((l4proto->to_nlattr && !l4proto->nlattr_size) ||
292 || (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size)) 349 (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size))
293 return -EINVAL; 350 return -EINVAL;
294 351
295 mutex_lock(&nf_ct_proto_mutex); 352 mutex_lock(&nf_ct_proto_mutex);
@@ -307,7 +364,8 @@ int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto)
307 } 364 }
308 365
309 for (i = 0; i < MAX_NF_CT_PROTO; i++) 366 for (i = 0; i < MAX_NF_CT_PROTO; i++)
310 RCU_INIT_POINTER(proto_array[i], &nf_conntrack_l4proto_generic); 367 RCU_INIT_POINTER(proto_array[i],
368 &nf_conntrack_l4proto_generic);
311 369
312 /* Before making proto_array visible to lockless readers, 370 /* Before making proto_array visible to lockless readers,
313 * we must make sure its content is committed to memory. 371 * we must make sure its content is committed to memory.
@@ -335,10 +393,10 @@ out_unlock:
335 mutex_unlock(&nf_ct_proto_mutex); 393 mutex_unlock(&nf_ct_proto_mutex);
336 return ret; 394 return ret;
337} 395}
338EXPORT_SYMBOL_GPL(nf_ct_l4proto_register); 396EXPORT_SYMBOL_GPL(nf_ct_l4proto_register_one);
339 397
340int nf_ct_l4proto_pernet_register(struct net *net, 398int nf_ct_l4proto_pernet_register_one(struct net *net,
341 struct nf_conntrack_l4proto *l4proto) 399 struct nf_conntrack_l4proto *l4proto)
342{ 400{
343 int ret = 0; 401 int ret = 0;
344 struct nf_proto_net *pn = NULL; 402 struct nf_proto_net *pn = NULL;
@@ -361,9 +419,9 @@ int nf_ct_l4proto_pernet_register(struct net *net,
361out: 419out:
362 return ret; 420 return ret;
363} 421}
364EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register); 422EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register_one);
365 423
366void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto) 424void nf_ct_l4proto_unregister_one(struct nf_conntrack_l4proto *l4proto)
367{ 425{
368 BUG_ON(l4proto->l3proto >= PF_MAX); 426 BUG_ON(l4proto->l3proto >= PF_MAX);
369 427
@@ -378,10 +436,10 @@ void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto)
378 436
379 synchronize_rcu(); 437 synchronize_rcu();
380} 438}
381EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister); 439EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister_one);
382 440
383void nf_ct_l4proto_pernet_unregister(struct net *net, 441void nf_ct_l4proto_pernet_unregister_one(struct net *net,
384 struct nf_conntrack_l4proto *l4proto) 442 struct nf_conntrack_l4proto *l4proto)
385{ 443{
386 struct nf_proto_net *pn = NULL; 444 struct nf_proto_net *pn = NULL;
387 445
@@ -395,6 +453,66 @@ void nf_ct_l4proto_pernet_unregister(struct net *net,
395 /* Remove all contrack entries for this protocol */ 453 /* Remove all contrack entries for this protocol */
396 nf_ct_iterate_cleanup(net, kill_l4proto, l4proto, 0, 0); 454 nf_ct_iterate_cleanup(net, kill_l4proto, l4proto, 0, 0);
397} 455}
456EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister_one);
457
458int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto[],
459 unsigned int num_proto)
460{
461 int ret = -EINVAL, ver;
462 unsigned int i;
463
464 for (i = 0; i < num_proto; i++) {
465 ret = nf_ct_l4proto_register_one(l4proto[i]);
466 if (ret < 0)
467 break;
468 }
469 if (i != num_proto) {
470 ver = l4proto[i]->l3proto == PF_INET6 ? 6 : 4;
471 pr_err("nf_conntrack_ipv%d: can't register %s%d proto.\n",
472 ver, l4proto[i]->name, ver);
473 nf_ct_l4proto_unregister(l4proto, i);
474 }
475 return ret;
476}
477EXPORT_SYMBOL_GPL(nf_ct_l4proto_register);
478
479int nf_ct_l4proto_pernet_register(struct net *net,
480 struct nf_conntrack_l4proto *l4proto[],
481 unsigned int num_proto)
482{
483 int ret = -EINVAL;
484 unsigned int i;
485
486 for (i = 0; i < num_proto; i++) {
487 ret = nf_ct_l4proto_pernet_register_one(net, l4proto[i]);
488 if (ret < 0)
489 break;
490 }
491 if (i != num_proto) {
492 pr_err("nf_conntrack_%s%d: pernet registration failed\n",
493 l4proto[i]->name,
494 l4proto[i]->l3proto == PF_INET6 ? 6 : 4);
495 nf_ct_l4proto_pernet_unregister(net, l4proto, i);
496 }
497 return ret;
498}
499EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register);
500
501void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto[],
502 unsigned int num_proto)
503{
504 while (num_proto-- != 0)
505 nf_ct_l4proto_unregister_one(l4proto[num_proto]);
506}
507EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister);
508
509void nf_ct_l4proto_pernet_unregister(struct net *net,
510 struct nf_conntrack_l4proto *l4proto[],
511 unsigned int num_proto)
512{
513 while (num_proto-- != 0)
514 nf_ct_l4proto_pernet_unregister_one(net, l4proto[num_proto]);
515}
398EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister); 516EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister);
399 517
400int nf_conntrack_proto_pernet_init(struct net *net) 518int nf_conntrack_proto_pernet_init(struct net *net)
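With the array-based helpers introduced above, a tracker that ships both an IPv4 and an IPv6 flavour can register them in one call and rely on the helper to unregister the already-registered entries if a later one fails. A hypothetical caller, reusing the dccp proto objects exported elsewhere in this patch (the module init shown here is illustrative only, without the matching exit path):

static struct nf_conntrack_l4proto *example_l4protos[] = {
	&nf_conntrack_l4proto_dccp4,
	&nf_conntrack_l4proto_dccp6,
};

static int __init example_tracker_init(void)
{
	/* registers every entry; on error the ones already registered
	 * are unregistered before the error code is returned
	 */
	return nf_ct_l4proto_register(example_l4protos,
				      ARRAY_SIZE(example_l4protos));
}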
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index a45bee52dccc..93dd1c5b7bff 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -9,7 +9,6 @@
9 * 9 *
10 */ 10 */
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/init.h> 12#include <linux/init.h>
14#include <linux/sysctl.h> 13#include <linux/sysctl.h>
15#include <linux/spinlock.h> 14#include <linux/spinlock.h>
@@ -384,17 +383,9 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] =
384 }, 383 },
385}; 384};
386 385
387/* this module per-net specifics */ 386static inline struct nf_dccp_net *dccp_pernet(struct net *net)
388static int dccp_net_id __read_mostly;
389struct dccp_net {
390 struct nf_proto_net pn;
391 int dccp_loose;
392 unsigned int dccp_timeout[CT_DCCP_MAX + 1];
393};
394
395static inline struct dccp_net *dccp_pernet(struct net *net)
396{ 387{
397 return net_generic(net, dccp_net_id); 388 return &net->ct.nf_ct_proto.dccp;
398} 389}
399 390
400static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, 391static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
@@ -424,7 +415,7 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
424 unsigned int dataoff, unsigned int *timeouts) 415 unsigned int dataoff, unsigned int *timeouts)
425{ 416{
426 struct net *net = nf_ct_net(ct); 417 struct net *net = nf_ct_net(ct);
427 struct dccp_net *dn; 418 struct nf_dccp_net *dn;
428 struct dccp_hdr _dh, *dh; 419 struct dccp_hdr _dh, *dh;
429 const char *msg; 420 const char *msg;
430 u_int8_t state; 421 u_int8_t state;
@@ -570,7 +561,6 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
570 561
571static int dccp_error(struct net *net, struct nf_conn *tmpl, 562static int dccp_error(struct net *net, struct nf_conn *tmpl,
572 struct sk_buff *skb, unsigned int dataoff, 563 struct sk_buff *skb, unsigned int dataoff,
573 enum ip_conntrack_info *ctinfo,
574 u_int8_t pf, unsigned int hooknum) 564 u_int8_t pf, unsigned int hooknum)
575{ 565{
576 struct dccp_hdr _dh, *dh; 566 struct dccp_hdr _dh, *dh;
@@ -719,7 +709,7 @@ static int dccp_nlattr_size(void)
719static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], 709static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[],
720 struct net *net, void *data) 710 struct net *net, void *data)
721{ 711{
722 struct dccp_net *dn = dccp_pernet(net); 712 struct nf_dccp_net *dn = dccp_pernet(net);
723 unsigned int *timeouts = data; 713 unsigned int *timeouts = data;
724 int i; 714 int i;
725 715
@@ -820,7 +810,7 @@ static struct ctl_table dccp_sysctl_table[] = {
820#endif /* CONFIG_SYSCTL */ 810#endif /* CONFIG_SYSCTL */
821 811
822static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn, 812static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn,
823 struct dccp_net *dn) 813 struct nf_dccp_net *dn)
824{ 814{
825#ifdef CONFIG_SYSCTL 815#ifdef CONFIG_SYSCTL
826 if (pn->ctl_table) 816 if (pn->ctl_table)
@@ -850,7 +840,7 @@ static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn,
850 840
851static int dccp_init_net(struct net *net, u_int16_t proto) 841static int dccp_init_net(struct net *net, u_int16_t proto)
852{ 842{
853 struct dccp_net *dn = dccp_pernet(net); 843 struct nf_dccp_net *dn = dccp_pernet(net);
854 struct nf_proto_net *pn = &dn->pn; 844 struct nf_proto_net *pn = &dn->pn;
855 845
856 if (!pn->users) { 846 if (!pn->users) {
@@ -868,7 +858,7 @@ static int dccp_init_net(struct net *net, u_int16_t proto)
868 return dccp_kmemdup_sysctl_table(net, pn, dn); 858 return dccp_kmemdup_sysctl_table(net, pn, dn);
869} 859}
870 860
871static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = { 861struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 __read_mostly = {
872 .l3proto = AF_INET, 862 .l3proto = AF_INET,
873 .l4proto = IPPROTO_DCCP, 863 .l4proto = IPPROTO_DCCP,
874 .name = "dccp", 864 .name = "dccp",
@@ -898,11 +888,11 @@ static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = {
898 .nla_policy = dccp_timeout_nla_policy, 888 .nla_policy = dccp_timeout_nla_policy,
899 }, 889 },
900#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ 890#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
901 .net_id = &dccp_net_id,
902 .init_net = dccp_init_net, 891 .init_net = dccp_init_net,
903}; 892};
893EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4);
904 894
905static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = { 895struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 __read_mostly = {
906 .l3proto = AF_INET6, 896 .l3proto = AF_INET6,
907 .l4proto = IPPROTO_DCCP, 897 .l4proto = IPPROTO_DCCP,
908 .name = "dccp", 898 .name = "dccp",
@@ -932,78 +922,6 @@ static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = {
932 .nla_policy = dccp_timeout_nla_policy, 922 .nla_policy = dccp_timeout_nla_policy,
933 }, 923 },
934#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ 924#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
935 .net_id = &dccp_net_id,
936 .init_net = dccp_init_net, 925 .init_net = dccp_init_net,
937}; 926};
938 927EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp6);
939static __net_init int dccp_net_init(struct net *net)
940{
941 int ret = 0;
942 ret = nf_ct_l4proto_pernet_register(net, &dccp_proto4);
943 if (ret < 0) {
944 pr_err("nf_conntrack_dccp4: pernet registration failed.\n");
945 goto out;
946 }
947 ret = nf_ct_l4proto_pernet_register(net, &dccp_proto6);
948 if (ret < 0) {
949 pr_err("nf_conntrack_dccp6: pernet registration failed.\n");
950 goto cleanup_dccp4;
951 }
952 return 0;
953cleanup_dccp4:
954 nf_ct_l4proto_pernet_unregister(net, &dccp_proto4);
955out:
956 return ret;
957}
958
959static __net_exit void dccp_net_exit(struct net *net)
960{
961 nf_ct_l4proto_pernet_unregister(net, &dccp_proto6);
962 nf_ct_l4proto_pernet_unregister(net, &dccp_proto4);
963}
964
965static struct pernet_operations dccp_net_ops = {
966 .init = dccp_net_init,
967 .exit = dccp_net_exit,
968 .id = &dccp_net_id,
969 .size = sizeof(struct dccp_net),
970};
971
972static int __init nf_conntrack_proto_dccp_init(void)
973{
974 int ret;
975
976 ret = register_pernet_subsys(&dccp_net_ops);
977 if (ret < 0)
978 goto out_pernet;
979
980 ret = nf_ct_l4proto_register(&dccp_proto4);
981 if (ret < 0)
982 goto out_dccp4;
983
984 ret = nf_ct_l4proto_register(&dccp_proto6);
985 if (ret < 0)
986 goto out_dccp6;
987
988 return 0;
989out_dccp6:
990 nf_ct_l4proto_unregister(&dccp_proto4);
991out_dccp4:
992 unregister_pernet_subsys(&dccp_net_ops);
993out_pernet:
994 return ret;
995}
996
997static void __exit nf_conntrack_proto_dccp_fini(void)
998{
999 nf_ct_l4proto_unregister(&dccp_proto6);
1000 nf_ct_l4proto_unregister(&dccp_proto4);
1001 unregister_pernet_subsys(&dccp_net_ops);
1002}
1003
1004module_init(nf_conntrack_proto_dccp_init);
1005module_exit(nf_conntrack_proto_dccp_fini);
1006
1007MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
1008MODULE_DESCRIPTION("DCCP connection tracking protocol helper");
1009MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 9a715f88b2f1..87bb40a3feb5 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -53,7 +53,7 @@ static unsigned int gre_timeouts[GRE_CT_MAX] = {
 	[GRE_CT_REPLIED]	= 180*HZ,
 };
 
-static int proto_gre_net_id __read_mostly;
+static unsigned int proto_gre_net_id __read_mostly;
 struct netns_proto_gre {
 	struct nf_proto_net	nf;
 	rwlock_t		keymap_lock;
@@ -396,7 +396,9 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = {
396static int proto_gre_net_init(struct net *net) 396static int proto_gre_net_init(struct net *net)
397{ 397{
398 int ret = 0; 398 int ret = 0;
399 ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_gre4); 399
400 ret = nf_ct_l4proto_pernet_register_one(net,
401 &nf_conntrack_l4proto_gre4);
400 if (ret < 0) 402 if (ret < 0)
401 pr_err("nf_conntrack_gre4: pernet registration failed.\n"); 403 pr_err("nf_conntrack_gre4: pernet registration failed.\n");
402 return ret; 404 return ret;
@@ -404,7 +406,7 @@ static int proto_gre_net_init(struct net *net)
404 406
405static void proto_gre_net_exit(struct net *net) 407static void proto_gre_net_exit(struct net *net)
406{ 408{
407 nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_gre4); 409 nf_ct_l4proto_pernet_unregister_one(net, &nf_conntrack_l4proto_gre4);
408 nf_ct_gre_keymap_flush(net); 410 nf_ct_gre_keymap_flush(net);
409} 411}
410 412
@@ -422,8 +424,7 @@ static int __init nf_ct_proto_gre_init(void)
422 ret = register_pernet_subsys(&proto_gre_net_ops); 424 ret = register_pernet_subsys(&proto_gre_net_ops);
423 if (ret < 0) 425 if (ret < 0)
424 goto out_pernet; 426 goto out_pernet;
425 427 ret = nf_ct_l4proto_register_one(&nf_conntrack_l4proto_gre4);
426 ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_gre4);
427 if (ret < 0) 428 if (ret < 0)
428 goto out_gre4; 429 goto out_gre4;
429 430
@@ -436,7 +437,7 @@ out_pernet:
436 437
437static void __exit nf_ct_proto_gre_fini(void) 438static void __exit nf_ct_proto_gre_fini(void)
438{ 439{
439 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_gre4); 440 nf_ct_l4proto_unregister_one(&nf_conntrack_l4proto_gre4);
440 unregister_pernet_subsys(&proto_gre_net_ops); 441 unregister_pernet_subsys(&proto_gre_net_ops);
441} 442}
442 443
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 982ea62606c7..33279aab583d 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -15,7 +15,6 @@
15#include <linux/types.h> 15#include <linux/types.h>
16#include <linux/timer.h> 16#include <linux/timer.h>
17#include <linux/netfilter.h> 17#include <linux/netfilter.h>
18#include <linux/module.h>
19#include <linux/in.h> 18#include <linux/in.h>
20#include <linux/ip.h> 19#include <linux/ip.h>
21#include <linux/sctp.h> 20#include <linux/sctp.h>
@@ -23,7 +22,9 @@
23#include <linux/seq_file.h> 22#include <linux/seq_file.h>
24#include <linux/spinlock.h> 23#include <linux/spinlock.h>
25#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <net/sctp/checksum.h>
26 26
27#include <net/netfilter/nf_log.h>
27#include <net/netfilter/nf_conntrack.h> 28#include <net/netfilter/nf_conntrack.h>
28#include <net/netfilter/nf_conntrack_l4proto.h> 29#include <net/netfilter/nf_conntrack_l4proto.h>
29#include <net/netfilter/nf_conntrack_ecache.h> 30#include <net/netfilter/nf_conntrack_ecache.h>
@@ -144,15 +145,9 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
144 } 145 }
145}; 146};
146 147
147static int sctp_net_id __read_mostly; 148static inline struct nf_sctp_net *sctp_pernet(struct net *net)
148struct sctp_net {
149 struct nf_proto_net pn;
150 unsigned int timeouts[SCTP_CONNTRACK_MAX];
151};
152
153static inline struct sctp_net *sctp_pernet(struct net *net)
154{ 149{
155 return net_generic(net, sctp_net_id); 150 return &net->ct.nf_ct_proto.sctp;
156} 151}
157 152
158static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, 153static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
@@ -512,6 +507,34 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
512 return true; 507 return true;
513} 508}
514 509
510static int sctp_error(struct net *net, struct nf_conn *tpl, struct sk_buff *skb,
511 unsigned int dataoff,
512 u8 pf, unsigned int hooknum)
513{
514 const struct sctphdr *sh;
515 struct sctphdr _sctph;
516 const char *logmsg;
517
518 sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
519 if (!sh) {
520 logmsg = "nf_ct_sctp: short packet ";
521 goto out_invalid;
522 }
523 if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
524 skb->ip_summed == CHECKSUM_NONE) {
525 if (sh->checksum != sctp_compute_cksum(skb, dataoff)) {
526 logmsg = "nf_ct_sctp: bad CRC ";
527 goto out_invalid;
528 }
529 skb->ip_summed = CHECKSUM_UNNECESSARY;
530 }
531 return NF_ACCEPT;
532out_invalid:
533 if (LOG_INVALID(net, IPPROTO_SCTP))
534 nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", logmsg);
535 return -NF_ACCEPT;
536}
537
515#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 538#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
516 539
517#include <linux/netfilter/nfnetlink.h> 540#include <linux/netfilter/nfnetlink.h>
@@ -600,7 +623,7 @@ static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[],
600 struct net *net, void *data) 623 struct net *net, void *data)
601{ 624{
602 unsigned int *timeouts = data; 625 unsigned int *timeouts = data;
603 struct sctp_net *sn = sctp_pernet(net); 626 struct nf_sctp_net *sn = sctp_pernet(net);
604 int i; 627 int i;
605 628
606 /* set default SCTP timeouts. */ 629 /* set default SCTP timeouts. */
@@ -708,7 +731,7 @@ static struct ctl_table sctp_sysctl_table[] = {
708#endif 731#endif
709 732
710static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, 733static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
711 struct sctp_net *sn) 734 struct nf_sctp_net *sn)
712{ 735{
713#ifdef CONFIG_SYSCTL 736#ifdef CONFIG_SYSCTL
714 if (pn->ctl_table) 737 if (pn->ctl_table)
@@ -735,7 +758,7 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
735 758
736static int sctp_init_net(struct net *net, u_int16_t proto) 759static int sctp_init_net(struct net *net, u_int16_t proto)
737{ 760{
738 struct sctp_net *sn = sctp_pernet(net); 761 struct nf_sctp_net *sn = sctp_pernet(net);
739 struct nf_proto_net *pn = &sn->pn; 762 struct nf_proto_net *pn = &sn->pn;
740 763
741 if (!pn->users) { 764 if (!pn->users) {
@@ -748,7 +771,7 @@ static int sctp_init_net(struct net *net, u_int16_t proto)
748 return sctp_kmemdup_sysctl_table(pn, sn); 771 return sctp_kmemdup_sysctl_table(pn, sn);
749} 772}
750 773
751static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { 774struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
752 .l3proto = PF_INET, 775 .l3proto = PF_INET,
753 .l4proto = IPPROTO_SCTP, 776 .l4proto = IPPROTO_SCTP,
754 .name = "sctp", 777 .name = "sctp",
@@ -759,6 +782,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
759 .packet = sctp_packet, 782 .packet = sctp_packet,
760 .get_timeouts = sctp_get_timeouts, 783 .get_timeouts = sctp_get_timeouts,
761 .new = sctp_new, 784 .new = sctp_new,
785 .error = sctp_error,
762 .me = THIS_MODULE, 786 .me = THIS_MODULE,
763#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 787#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
764 .to_nlattr = sctp_to_nlattr, 788 .to_nlattr = sctp_to_nlattr,
@@ -778,11 +802,11 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
778 .nla_policy = sctp_timeout_nla_policy, 802 .nla_policy = sctp_timeout_nla_policy,
779 }, 803 },
780#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ 804#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
781 .net_id = &sctp_net_id,
782 .init_net = sctp_init_net, 805 .init_net = sctp_init_net,
783}; 806};
807EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp4);
784 808
785static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = { 809struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
786 .l3proto = PF_INET6, 810 .l3proto = PF_INET6,
787 .l4proto = IPPROTO_SCTP, 811 .l4proto = IPPROTO_SCTP,
788 .name = "sctp", 812 .name = "sctp",
@@ -793,6 +817,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
793 .packet = sctp_packet, 817 .packet = sctp_packet,
794 .get_timeouts = sctp_get_timeouts, 818 .get_timeouts = sctp_get_timeouts,
795 .new = sctp_new, 819 .new = sctp_new,
820 .error = sctp_error,
796 .me = THIS_MODULE, 821 .me = THIS_MODULE,
797#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 822#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
798 .to_nlattr = sctp_to_nlattr, 823 .to_nlattr = sctp_to_nlattr,
@@ -812,81 +837,6 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
812 }, 837 },
813#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ 838#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
814#endif 839#endif
815 .net_id = &sctp_net_id,
816 .init_net = sctp_init_net, 840 .init_net = sctp_init_net,
817}; 841};
818 842EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp6);
819static int sctp_net_init(struct net *net)
820{
821 int ret = 0;
822
823 ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_sctp4);
824 if (ret < 0) {
825 pr_err("nf_conntrack_sctp4: pernet registration failed.\n");
826 goto out;
827 }
828 ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_sctp6);
829 if (ret < 0) {
830 pr_err("nf_conntrack_sctp6: pernet registration failed.\n");
831 goto cleanup_sctp4;
832 }
833 return 0;
834
835cleanup_sctp4:
836 nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp4);
837out:
838 return ret;
839}
840
841static void sctp_net_exit(struct net *net)
842{
843 nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp6);
844 nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp4);
845}
846
847static struct pernet_operations sctp_net_ops = {
848 .init = sctp_net_init,
849 .exit = sctp_net_exit,
850 .id = &sctp_net_id,
851 .size = sizeof(struct sctp_net),
852};
853
854static int __init nf_conntrack_proto_sctp_init(void)
855{
856 int ret;
857
858 ret = register_pernet_subsys(&sctp_net_ops);
859 if (ret < 0)
860 goto out_pernet;
861
862 ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp4);
863 if (ret < 0)
864 goto out_sctp4;
865
866 ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp6);
867 if (ret < 0)
868 goto out_sctp6;
869
870 return 0;
871out_sctp6:
872 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4);
873out_sctp4:
874 unregister_pernet_subsys(&sctp_net_ops);
875out_pernet:
876 return ret;
877}
878
879static void __exit nf_conntrack_proto_sctp_fini(void)
880{
881 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp6);
882 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4);
883 unregister_pernet_subsys(&sctp_net_ops);
884}
885
886module_init(nf_conntrack_proto_sctp_init);
887module_exit(nf_conntrack_proto_sctp_fini);
888
889MODULE_LICENSE("GPL");
890MODULE_AUTHOR("Kiran Kumar Immidi");
891MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP");
892MODULE_ALIAS("ip_conntrack_proto_sctp");
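Two independent changes land in the SCTP tracker. First, its per-netns state stops living in a net_generic() slot keyed by sctp_net_id and is read from the embedded net->ct.nf_ct_proto.sctp instead, which is why the .net_id field, the pernet_operations and the module glue go away and the l4proto structs are exported. Second, a new .error hook verifies the CRC32c before any state handling; the check only runs for packets that reach PRE_ROUTING unverified and only when the conntrack checksum sysctl is on, condensed:

    if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
        skb->ip_summed == CHECKSUM_NONE) {
            if (sh->checksum != sctp_compute_cksum(skb, dataoff))
                    return -NF_ACCEPT;              /* bad CRC: treat as invalid */
            skb->ip_summed = CHECKSUM_UNNECESSARY;  /* avoid re-checking later */
    }
    return NF_ACCEPT;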
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 69f687740c76..b122e9dacfed 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -750,7 +750,6 @@ static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
750static int tcp_error(struct net *net, struct nf_conn *tmpl, 750static int tcp_error(struct net *net, struct nf_conn *tmpl,
751 struct sk_buff *skb, 751 struct sk_buff *skb,
752 unsigned int dataoff, 752 unsigned int dataoff,
753 enum ip_conntrack_info *ctinfo,
754 u_int8_t pf, 753 u_int8_t pf,
755 unsigned int hooknum) 754 unsigned int hooknum)
756{ 755{
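For TCP the only visible change is dropping the unused ctinfo argument from tcp_error(), which aligns it with the .error signature the SCTP and UDP-Lite handlers in this series already use:

    int (*error)(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
                 unsigned int dataoff, u8 pf, unsigned int hooknum);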
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 20f35ed68030..f6ebce6178ca 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -108,8 +108,60 @@ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb,
108 return true; 108 return true;
109} 109}
110 110
111#ifdef CONFIG_NF_CT_PROTO_UDPLITE
112static int udplite_error(struct net *net, struct nf_conn *tmpl,
113 struct sk_buff *skb,
114 unsigned int dataoff,
115 u8 pf, unsigned int hooknum)
116{
117 unsigned int udplen = skb->len - dataoff;
118 const struct udphdr *hdr;
119 struct udphdr _hdr;
120 unsigned int cscov;
121
122 /* Header is too small? */
123 hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
124 if (!hdr) {
125 if (LOG_INVALID(net, IPPROTO_UDPLITE))
126 nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
127 "nf_ct_udplite: short packet ");
128 return -NF_ACCEPT;
129 }
130
131 cscov = ntohs(hdr->len);
132 if (cscov == 0) {
133 cscov = udplen;
134 } else if (cscov < sizeof(*hdr) || cscov > udplen) {
135 if (LOG_INVALID(net, IPPROTO_UDPLITE))
136 nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
137 "nf_ct_udplite: invalid checksum coverage ");
138 return -NF_ACCEPT;
139 }
140
141 /* UDPLITE mandates checksums */
142 if (!hdr->check) {
143 if (LOG_INVALID(net, IPPROTO_UDPLITE))
144 nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
145 "nf_ct_udplite: checksum missing ");
146 return -NF_ACCEPT;
147 }
148
149 /* Checksum invalid? Ignore. */
150 if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
151 nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,
152 pf)) {
153 if (LOG_INVALID(net, IPPROTO_UDPLITE))
154 nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
155 "nf_ct_udplite: bad UDPLite checksum ");
156 return -NF_ACCEPT;
157 }
158
159 return NF_ACCEPT;
160}
161#endif
162
111static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, 163static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
112 unsigned int dataoff, enum ip_conntrack_info *ctinfo, 164 unsigned int dataoff,
113 u_int8_t pf, 165 u_int8_t pf,
114 unsigned int hooknum) 166 unsigned int hooknum)
115{ 167{
@@ -290,6 +342,41 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
290}; 342};
291EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4); 343EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4);
292 344
345#ifdef CONFIG_NF_CT_PROTO_UDPLITE
346struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
347{
348 .l3proto = PF_INET,
349 .l4proto = IPPROTO_UDPLITE,
350 .name = "udplite",
351 .allow_clash = true,
352 .pkt_to_tuple = udp_pkt_to_tuple,
353 .invert_tuple = udp_invert_tuple,
354 .print_tuple = udp_print_tuple,
355 .packet = udp_packet,
356 .get_timeouts = udp_get_timeouts,
357 .new = udp_new,
358 .error = udplite_error,
359#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
360 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
361 .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
362 .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
363 .nla_policy = nf_ct_port_nla_policy,
364#endif
365#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
366 .ctnl_timeout = {
367 .nlattr_to_obj = udp_timeout_nlattr_to_obj,
368 .obj_to_nlattr = udp_timeout_obj_to_nlattr,
369 .nlattr_max = CTA_TIMEOUT_UDP_MAX,
370 .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
371 .nla_policy = udp_timeout_nla_policy,
372 },
373#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
374 .init_net = udp_init_net,
375 .get_net_proto = udp_get_net_proto,
376};
377EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite4);
378#endif
379
293struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = 380struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
294{ 381{
295 .l3proto = PF_INET6, 382 .l3proto = PF_INET6,
@@ -322,3 +409,38 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
322 .get_net_proto = udp_get_net_proto, 409 .get_net_proto = udp_get_net_proto,
323}; 410};
324EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6); 411EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6);
412
413#ifdef CONFIG_NF_CT_PROTO_UDPLITE
414struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
415{
416 .l3proto = PF_INET6,
417 .l4proto = IPPROTO_UDPLITE,
418 .name = "udplite",
419 .allow_clash = true,
420 .pkt_to_tuple = udp_pkt_to_tuple,
421 .invert_tuple = udp_invert_tuple,
422 .print_tuple = udp_print_tuple,
423 .packet = udp_packet,
424 .get_timeouts = udp_get_timeouts,
425 .new = udp_new,
426 .error = udplite_error,
427#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
428 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
429 .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
430 .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
431 .nla_policy = nf_ct_port_nla_policy,
432#endif
433#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
434 .ctnl_timeout = {
435 .nlattr_to_obj = udp_timeout_nlattr_to_obj,
436 .obj_to_nlattr = udp_timeout_obj_to_nlattr,
437 .nlattr_max = CTA_TIMEOUT_UDP_MAX,
438 .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
439 .nla_policy = udp_timeout_nla_policy,
440 },
441#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
442 .init_net = udp_init_net,
443 .get_net_proto = udp_get_net_proto,
444};
445EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6);
446#endif
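UDP-Lite tracking is folded into nf_conntrack_proto_udp.c: the udplite4/udplite6 trackers reuse the existing UDP callbacks and only add a dedicated udplite_error(), all under CONFIG_NF_CT_PROTO_UDPLITE, which lets the standalone nf_conntrack_proto_udplite.c below be deleted. udplite_error() enforces the UDP-Lite specifics before normal UDP state handling takes over; the coverage field it validates works as follows:

    cscov = ntohs(hdr->len);            /* checksum coverage, not a length */
    if (cscov == 0)
            cscov = udplen;             /* 0 means "cover the whole packet" */
    else if (cscov < sizeof(*hdr) || cscov > udplen)
            return -NF_ACCEPT;          /* coverage below 8 bytes or past the end */
    if (!hdr->check)
            return -NF_ACCEPT;          /* the checksum is mandatory in UDP-Lite */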
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
deleted file mode 100644
index 029206e8dec4..000000000000
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ /dev/null
@@ -1,409 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 * (C) 2007 Patrick McHardy <kaber@trash.net>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/types.h>
11#include <linux/timer.h>
12#include <linux/module.h>
13#include <linux/udp.h>
14#include <linux/seq_file.h>
15#include <linux/skbuff.h>
16#include <linux/ipv6.h>
17#include <net/ip6_checksum.h>
18#include <net/checksum.h>
19
20#include <linux/netfilter.h>
21#include <linux/netfilter_ipv4.h>
22#include <linux/netfilter_ipv6.h>
23#include <net/netfilter/nf_conntrack_l4proto.h>
24#include <net/netfilter/nf_conntrack_ecache.h>
25#include <net/netfilter/nf_log.h>
26
27enum udplite_conntrack {
28 UDPLITE_CT_UNREPLIED,
29 UDPLITE_CT_REPLIED,
30 UDPLITE_CT_MAX
31};
32
33static unsigned int udplite_timeouts[UDPLITE_CT_MAX] = {
34 [UDPLITE_CT_UNREPLIED] = 30*HZ,
35 [UDPLITE_CT_REPLIED] = 180*HZ,
36};
37
38static int udplite_net_id __read_mostly;
39struct udplite_net {
40 struct nf_proto_net pn;
41 unsigned int timeouts[UDPLITE_CT_MAX];
42};
43
44static inline struct udplite_net *udplite_pernet(struct net *net)
45{
46 return net_generic(net, udplite_net_id);
47}
48
49static bool udplite_pkt_to_tuple(const struct sk_buff *skb,
50 unsigned int dataoff,
51 struct net *net,
52 struct nf_conntrack_tuple *tuple)
53{
54 const struct udphdr *hp;
55 struct udphdr _hdr;
56
57 /* Actually only need first 4 bytes to get ports. */
58 hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
59 if (hp == NULL)
60 return false;
61
62 tuple->src.u.udp.port = hp->source;
63 tuple->dst.u.udp.port = hp->dest;
64 return true;
65}
66
67static bool udplite_invert_tuple(struct nf_conntrack_tuple *tuple,
68 const struct nf_conntrack_tuple *orig)
69{
70 tuple->src.u.udp.port = orig->dst.u.udp.port;
71 tuple->dst.u.udp.port = orig->src.u.udp.port;
72 return true;
73}
74
75/* Print out the per-protocol part of the tuple. */
76static void udplite_print_tuple(struct seq_file *s,
77 const struct nf_conntrack_tuple *tuple)
78{
79 seq_printf(s, "sport=%hu dport=%hu ",
80 ntohs(tuple->src.u.udp.port),
81 ntohs(tuple->dst.u.udp.port));
82}
83
84static unsigned int *udplite_get_timeouts(struct net *net)
85{
86 return udplite_pernet(net)->timeouts;
87}
88
89/* Returns verdict for packet, and may modify conntracktype */
90static int udplite_packet(struct nf_conn *ct,
91 const struct sk_buff *skb,
92 unsigned int dataoff,
93 enum ip_conntrack_info ctinfo,
94 u_int8_t pf,
95 unsigned int hooknum,
96 unsigned int *timeouts)
97{
98 /* If we've seen traffic both ways, this is some kind of UDP
99 stream. Extend timeout. */
100 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
101 nf_ct_refresh_acct(ct, ctinfo, skb,
102 timeouts[UDPLITE_CT_REPLIED]);
103 /* Also, more likely to be important, and not a probe */
104 if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
105 nf_conntrack_event_cache(IPCT_ASSURED, ct);
106 } else {
107 nf_ct_refresh_acct(ct, ctinfo, skb,
108 timeouts[UDPLITE_CT_UNREPLIED]);
109 }
110 return NF_ACCEPT;
111}
112
113/* Called when a new connection for this protocol found. */
114static bool udplite_new(struct nf_conn *ct, const struct sk_buff *skb,
115 unsigned int dataoff, unsigned int *timeouts)
116{
117 return true;
118}
119
120static int udplite_error(struct net *net, struct nf_conn *tmpl,
121 struct sk_buff *skb,
122 unsigned int dataoff,
123 enum ip_conntrack_info *ctinfo,
124 u_int8_t pf,
125 unsigned int hooknum)
126{
127 unsigned int udplen = skb->len - dataoff;
128 const struct udphdr *hdr;
129 struct udphdr _hdr;
130 unsigned int cscov;
131
132 /* Header is too small? */
133 hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
134 if (hdr == NULL) {
135 if (LOG_INVALID(net, IPPROTO_UDPLITE))
136 nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
137 "nf_ct_udplite: short packet ");
138 return -NF_ACCEPT;
139 }
140
141 cscov = ntohs(hdr->len);
142 if (cscov == 0)
143 cscov = udplen;
144 else if (cscov < sizeof(*hdr) || cscov > udplen) {
145 if (LOG_INVALID(net, IPPROTO_UDPLITE))
146 nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
147 "nf_ct_udplite: invalid checksum coverage ");
148 return -NF_ACCEPT;
149 }
150
151 /* UDPLITE mandates checksums */
152 if (!hdr->check) {
153 if (LOG_INVALID(net, IPPROTO_UDPLITE))
154 nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
155 "nf_ct_udplite: checksum missing ");
156 return -NF_ACCEPT;
157 }
158
159 /* Checksum invalid? Ignore. */
160 if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
161 nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,
162 pf)) {
163 if (LOG_INVALID(net, IPPROTO_UDPLITE))
164 nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
165 "nf_ct_udplite: bad UDPLite checksum ");
166 return -NF_ACCEPT;
167 }
168
169 return NF_ACCEPT;
170}
171
172#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
173
174#include <linux/netfilter/nfnetlink.h>
175#include <linux/netfilter/nfnetlink_cttimeout.h>
176
177static int udplite_timeout_nlattr_to_obj(struct nlattr *tb[],
178 struct net *net, void *data)
179{
180 unsigned int *timeouts = data;
181 struct udplite_net *un = udplite_pernet(net);
182
183 /* set default timeouts for UDPlite. */
184 timeouts[UDPLITE_CT_UNREPLIED] = un->timeouts[UDPLITE_CT_UNREPLIED];
185 timeouts[UDPLITE_CT_REPLIED] = un->timeouts[UDPLITE_CT_REPLIED];
186
187 if (tb[CTA_TIMEOUT_UDPLITE_UNREPLIED]) {
188 timeouts[UDPLITE_CT_UNREPLIED] =
189 ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_UNREPLIED])) * HZ;
190 }
191 if (tb[CTA_TIMEOUT_UDPLITE_REPLIED]) {
192 timeouts[UDPLITE_CT_REPLIED] =
193 ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_REPLIED])) * HZ;
194 }
195 return 0;
196}
197
198static int
199udplite_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
200{
201 const unsigned int *timeouts = data;
202
203 if (nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_UNREPLIED,
204 htonl(timeouts[UDPLITE_CT_UNREPLIED] / HZ)) ||
205 nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_REPLIED,
206 htonl(timeouts[UDPLITE_CT_REPLIED] / HZ)))
207 goto nla_put_failure;
208 return 0;
209
210nla_put_failure:
211 return -ENOSPC;
212}
213
214static const struct nla_policy
215udplite_timeout_nla_policy[CTA_TIMEOUT_UDPLITE_MAX+1] = {
216 [CTA_TIMEOUT_UDPLITE_UNREPLIED] = { .type = NLA_U32 },
217 [CTA_TIMEOUT_UDPLITE_REPLIED] = { .type = NLA_U32 },
218};
219#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
220
221#ifdef CONFIG_SYSCTL
222static struct ctl_table udplite_sysctl_table[] = {
223 {
224 .procname = "nf_conntrack_udplite_timeout",
225 .maxlen = sizeof(unsigned int),
226 .mode = 0644,
227 .proc_handler = proc_dointvec_jiffies,
228 },
229 {
230 .procname = "nf_conntrack_udplite_timeout_stream",
231 .maxlen = sizeof(unsigned int),
232 .mode = 0644,
233 .proc_handler = proc_dointvec_jiffies,
234 },
235 { }
236};
237#endif /* CONFIG_SYSCTL */
238
239static int udplite_kmemdup_sysctl_table(struct nf_proto_net *pn,
240 struct udplite_net *un)
241{
242#ifdef CONFIG_SYSCTL
243 if (pn->ctl_table)
244 return 0;
245
246 pn->ctl_table = kmemdup(udplite_sysctl_table,
247 sizeof(udplite_sysctl_table),
248 GFP_KERNEL);
249 if (!pn->ctl_table)
250 return -ENOMEM;
251
252 pn->ctl_table[0].data = &un->timeouts[UDPLITE_CT_UNREPLIED];
253 pn->ctl_table[1].data = &un->timeouts[UDPLITE_CT_REPLIED];
254#endif
255 return 0;
256}
257
258static int udplite_init_net(struct net *net, u_int16_t proto)
259{
260 struct udplite_net *un = udplite_pernet(net);
261 struct nf_proto_net *pn = &un->pn;
262
263 if (!pn->users) {
264 int i;
265
266 for (i = 0 ; i < UDPLITE_CT_MAX; i++)
267 un->timeouts[i] = udplite_timeouts[i];
268 }
269
270 return udplite_kmemdup_sysctl_table(pn, un);
271}
272
273static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
274{
275 .l3proto = PF_INET,
276 .l4proto = IPPROTO_UDPLITE,
277 .name = "udplite",
278 .allow_clash = true,
279 .pkt_to_tuple = udplite_pkt_to_tuple,
280 .invert_tuple = udplite_invert_tuple,
281 .print_tuple = udplite_print_tuple,
282 .packet = udplite_packet,
283 .get_timeouts = udplite_get_timeouts,
284 .new = udplite_new,
285 .error = udplite_error,
286#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
287 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
288 .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
289 .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
290 .nla_policy = nf_ct_port_nla_policy,
291#endif
292#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
293 .ctnl_timeout = {
294 .nlattr_to_obj = udplite_timeout_nlattr_to_obj,
295 .obj_to_nlattr = udplite_timeout_obj_to_nlattr,
296 .nlattr_max = CTA_TIMEOUT_UDPLITE_MAX,
297 .obj_size = sizeof(unsigned int) *
298 CTA_TIMEOUT_UDPLITE_MAX,
299 .nla_policy = udplite_timeout_nla_policy,
300 },
301#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
302 .net_id = &udplite_net_id,
303 .init_net = udplite_init_net,
304};
305
306static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
307{
308 .l3proto = PF_INET6,
309 .l4proto = IPPROTO_UDPLITE,
310 .name = "udplite",
311 .allow_clash = true,
312 .pkt_to_tuple = udplite_pkt_to_tuple,
313 .invert_tuple = udplite_invert_tuple,
314 .print_tuple = udplite_print_tuple,
315 .packet = udplite_packet,
316 .get_timeouts = udplite_get_timeouts,
317 .new = udplite_new,
318 .error = udplite_error,
319#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
320 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
321 .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
322 .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
323 .nla_policy = nf_ct_port_nla_policy,
324#endif
325#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
326 .ctnl_timeout = {
327 .nlattr_to_obj = udplite_timeout_nlattr_to_obj,
328 .obj_to_nlattr = udplite_timeout_obj_to_nlattr,
329 .nlattr_max = CTA_TIMEOUT_UDPLITE_MAX,
330 .obj_size = sizeof(unsigned int) *
331 CTA_TIMEOUT_UDPLITE_MAX,
332 .nla_policy = udplite_timeout_nla_policy,
333 },
334#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
335 .net_id = &udplite_net_id,
336 .init_net = udplite_init_net,
337};
338
339static int udplite_net_init(struct net *net)
340{
341 int ret = 0;
342
343 ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udplite4);
344 if (ret < 0) {
345 pr_err("nf_conntrack_udplite4: pernet registration failed.\n");
346 goto out;
347 }
348 ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udplite6);
349 if (ret < 0) {
350 pr_err("nf_conntrack_udplite6: pernet registration failed.\n");
351 goto cleanup_udplite4;
352 }
353 return 0;
354
355cleanup_udplite4:
356 nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite4);
357out:
358 return ret;
359}
360
361static void udplite_net_exit(struct net *net)
362{
363 nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite6);
364 nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite4);
365}
366
367static struct pernet_operations udplite_net_ops = {
368 .init = udplite_net_init,
369 .exit = udplite_net_exit,
370 .id = &udplite_net_id,
371 .size = sizeof(struct udplite_net),
372};
373
374static int __init nf_conntrack_proto_udplite_init(void)
375{
376 int ret;
377
378 ret = register_pernet_subsys(&udplite_net_ops);
379 if (ret < 0)
380 goto out_pernet;
381
382 ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite4);
383 if (ret < 0)
384 goto out_udplite4;
385
386 ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite6);
387 if (ret < 0)
388 goto out_udplite6;
389
390 return 0;
391out_udplite6:
392 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4);
393out_udplite4:
394 unregister_pernet_subsys(&udplite_net_ops);
395out_pernet:
396 return ret;
397}
398
399static void __exit nf_conntrack_proto_udplite_exit(void)
400{
401 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite6);
402 nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4);
403 unregister_pernet_subsys(&udplite_net_ops);
404}
405
406module_init(nf_conntrack_proto_udplite_init);
407module_exit(nf_conntrack_proto_udplite_exit);
408
409MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index c3fc14e021ec..0d17894798b5 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -809,13 +809,11 @@ static int refresh_signalling_expectation(struct nf_conn *ct,
809 exp->tuple.dst.protonum != proto || 809 exp->tuple.dst.protonum != proto ||
810 exp->tuple.dst.u.udp.port != port) 810 exp->tuple.dst.u.udp.port != port)
811 continue; 811 continue;
812 if (!del_timer(&exp->timeout)) 812 if (mod_timer_pending(&exp->timeout, jiffies + expires * HZ)) {
813 continue; 813 exp->flags &= ~NF_CT_EXPECT_INACTIVE;
814 exp->flags &= ~NF_CT_EXPECT_INACTIVE; 814 found = 1;
815 exp->timeout.expires = jiffies + expires * HZ; 815 break;
816 add_timer(&exp->timeout); 816 }
817 found = 1;
818 break;
819 } 817 }
820 spin_unlock_bh(&nf_conntrack_expect_lock); 818 spin_unlock_bh(&nf_conntrack_expect_lock);
821 return found; 819 return found;
@@ -1630,8 +1628,6 @@ static int __init nf_conntrack_sip_init(void)
1630 ports[ports_c++] = SIP_PORT; 1628 ports[ports_c++] = SIP_PORT;
1631 1629
1632 for (i = 0; i < ports_c; i++) { 1630 for (i = 0; i < ports_c; i++) {
1633 memset(&sip[i], 0, sizeof(sip[i]));
1634
1635 nf_ct_helper_init(&sip[4 * i], AF_INET, IPPROTO_UDP, "sip", 1631 nf_ct_helper_init(&sip[4 * i], AF_INET, IPPROTO_UDP, "sip",
1636 SIP_PORT, ports[i], i, sip_exp_policy, 1632 SIP_PORT, ports[i], i, sip_exp_policy,
1637 SIP_EXPECT_MAX, 1633 SIP_EXPECT_MAX,
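In the SIP helper, refreshing a signalling expectation no longer deletes and re-adds its timer by hand. mod_timer_pending() re-arms the timeout only if the timer has not fired yet, so an expectation that is already expiring cannot be resurrected, and the found/break bookkeeping collapses into the success branch:

    /* extend a timeout without reviving an already-expired timer */
    if (mod_timer_pending(&exp->timeout, jiffies + expires * HZ)) {
            exp->flags &= ~NF_CT_EXPECT_INACTIVE;
            found = 1;
    }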
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 5f446cd9f3fd..2256147dcaad 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -452,6 +452,9 @@ static int log_invalid_proto_max __read_mostly = 255;
452/* size the user *wants to set */ 452/* size the user *wants to set */
453static unsigned int nf_conntrack_htable_size_user __read_mostly; 453static unsigned int nf_conntrack_htable_size_user __read_mostly;
454 454
455extern unsigned int nf_conntrack_default_on;
456unsigned int nf_conntrack_default_on __read_mostly = 1;
457
455static int 458static int
456nf_conntrack_hash_sysctl(struct ctl_table *table, int write, 459nf_conntrack_hash_sysctl(struct ctl_table *table, int write,
457 void __user *buffer, size_t *lenp, loff_t *ppos) 460 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -517,6 +520,13 @@ static struct ctl_table nf_ct_sysctl_table[] = {
517 .mode = 0644, 520 .mode = 0644,
518 .proc_handler = proc_dointvec, 521 .proc_handler = proc_dointvec,
519 }, 522 },
523 {
524 .procname = "nf_conntrack_default_on",
525 .data = &nf_conntrack_default_on,
526 .maxlen = sizeof(unsigned int),
527 .mode = 0644,
528 .proc_handler = proc_dointvec,
529 },
520 { } 530 { }
521}; 531};
522 532
@@ -632,6 +642,9 @@ static int __init nf_conntrack_standalone_init(void)
632 if (ret < 0) 642 if (ret < 0)
633 goto out_start; 643 goto out_start;
634 644
645 BUILD_BUG_ON(SKB_NFCT_PTRMASK != NFCT_PTRMASK);
646 BUILD_BUG_ON(NFCT_INFOMASK <= IP_CT_NUMBER);
647
635#ifdef CONFIG_SYSCTL 648#ifdef CONFIG_SYSCTL
636 nf_ct_netfilter_header = 649 nf_ct_netfilter_header =
637 register_net_sysctl(&init_net, "net", nf_ct_netfilter_table); 650 register_net_sysctl(&init_net, "net", nf_ct_netfilter_table);
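nf_conntrack_standalone.c gains a global nf_conntrack_default_on knob, registered alongside the other nf_conntrack sysctls, plus two BUILD_BUG_ON checks on the skb->nfct pointer/ctinfo packing constants. How the knob is consumed is not part of this hunk; a plausible (assumed, not shown) consumer would skip implicit per-netns conntrack setup when it is cleared:

    /* hypothetical call site, for illustration only */
    if (!nf_conntrack_default_on)
            return 0;       /* leave conntrack off until something enables it */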
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index 7ec69723940f..c9d7f95768ab 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -14,24 +14,41 @@
14#include <linux/netfilter/nf_tables.h> 14#include <linux/netfilter/nf_tables.h>
15#include <net/netfilter/nf_tables.h> 15#include <net/netfilter/nf_tables.h>
16 16
17static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev)
18{
19 if (skb_mac_header_was_set(skb))
20 skb_push(skb, skb->mac_len);
21
22 skb->dev = dev;
23 dev_queue_xmit(skb);
24}
25
26void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif)
27{
28 struct net_device *dev;
29
30 dev = dev_get_by_index_rcu(nft_net(pkt), oif);
31 if (!dev) {
32 kfree_skb(pkt->skb);
33 return;
34 }
35
36 nf_do_netdev_egress(pkt->skb, dev);
37}
38EXPORT_SYMBOL_GPL(nf_fwd_netdev_egress);
39
17void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif) 40void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif)
18{ 41{
19 struct net_device *dev; 42 struct net_device *dev;
20 struct sk_buff *skb; 43 struct sk_buff *skb;
21 44
22 dev = dev_get_by_index_rcu(pkt->net, oif); 45 dev = dev_get_by_index_rcu(nft_net(pkt), oif);
23 if (dev == NULL) 46 if (dev == NULL)
24 return; 47 return;
25 48
26 skb = skb_clone(pkt->skb, GFP_ATOMIC); 49 skb = skb_clone(pkt->skb, GFP_ATOMIC);
27 if (skb == NULL) 50 if (skb)
28 return; 51 nf_do_netdev_egress(skb, dev);
29
30 if (skb_mac_header_was_set(skb))
31 skb_push(skb, skb->mac_len);
32
33 skb->dev = dev;
34 dev_queue_xmit(skb);
35} 52}
36EXPORT_SYMBOL_GPL(nf_dup_netdev_egress); 53EXPORT_SYMBOL_GPL(nf_dup_netdev_egress);
37 54
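nf_dup_netdev.c factors the actual transmit into nf_do_netdev_egress() and exports a second entry point, nf_fwd_netdev_egress(): forwarding hands the original skb to the target device (and frees it if the device is gone), while duplication keeps cloning first. A hedged sketch of how an nft expression could use the forward variant; the verdict handling is an assumption, not shown in this hunk:

    nf_fwd_netdev_egress(pkt, oif);     /* consumes pkt->skb */
    regs->verdict.code = NF_STOLEN;     /* the skb has left the hook chain */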
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index 9fdb655f85bc..c46d214d5323 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -11,11 +11,6 @@
11#define NFDEBUG(format, args...) 11#define NFDEBUG(format, args...)
12#endif 12#endif
13 13
14
15/* core.c */
16unsigned int nf_iterate(struct sk_buff *skb, struct nf_hook_state *state,
17 struct nf_hook_entry **entryp);
18
19/* nf_queue.c */ 14/* nf_queue.c */
20int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, 15int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
21 struct nf_hook_entry **entryp, unsigned int verdict); 16 struct nf_hook_entry **entryp, unsigned int verdict);
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 3dca90dc24ad..8d85a0598b60 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -13,9 +13,11 @@
13/* Internal logging interface, which relies on the real 13/* Internal logging interface, which relies on the real
14 LOG target modules */ 14 LOG target modules */
15 15
16#define NF_LOG_PREFIXLEN 128
17#define NFLOGGER_NAME_LEN 64 16#define NFLOGGER_NAME_LEN 64
18 17
18int sysctl_nf_log_all_netns __read_mostly;
19EXPORT_SYMBOL(sysctl_nf_log_all_netns);
20
19static struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly; 21static struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly;
20static DEFINE_MUTEX(nf_log_mutex); 22static DEFINE_MUTEX(nf_log_mutex);
21 23
@@ -414,6 +416,18 @@ static const struct file_operations nflog_file_ops = {
414#ifdef CONFIG_SYSCTL 416#ifdef CONFIG_SYSCTL
415static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3]; 417static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3];
416static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1]; 418static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1];
419static struct ctl_table_header *nf_log_sysctl_fhdr;
420
421static struct ctl_table nf_log_sysctl_ftable[] = {
422 {
423 .procname = "nf_log_all_netns",
424 .data = &sysctl_nf_log_all_netns,
425 .maxlen = sizeof(sysctl_nf_log_all_netns),
426 .mode = 0644,
427 .proc_handler = proc_dointvec,
428 },
429 { }
430};
417 431
418static int nf_log_proc_dostring(struct ctl_table *table, int write, 432static int nf_log_proc_dostring(struct ctl_table *table, int write,
419 void __user *buffer, size_t *lenp, loff_t *ppos) 433 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -483,6 +497,10 @@ static int netfilter_log_sysctl_init(struct net *net)
483 nf_log_sysctl_table[i].extra1 = 497 nf_log_sysctl_table[i].extra1 =
484 (void *)(unsigned long) i; 498 (void *)(unsigned long) i;
485 } 499 }
500 nf_log_sysctl_fhdr = register_net_sysctl(net, "net/netfilter",
501 nf_log_sysctl_ftable);
502 if (!nf_log_sysctl_fhdr)
503 goto err_freg;
486 } 504 }
487 505
488 for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) 506 for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)
@@ -499,6 +517,9 @@ static int netfilter_log_sysctl_init(struct net *net)
499err_reg: 517err_reg:
500 if (!net_eq(net, &init_net)) 518 if (!net_eq(net, &init_net))
501 kfree(table); 519 kfree(table);
520 else
521 unregister_net_sysctl_table(nf_log_sysctl_fhdr);
522err_freg:
502err_alloc: 523err_alloc:
503 return -ENOMEM; 524 return -ENOMEM;
504} 525}
@@ -511,6 +532,8 @@ static void netfilter_log_sysctl_exit(struct net *net)
511 unregister_net_sysctl_table(net->nf.nf_log_dir_header); 532 unregister_net_sysctl_table(net->nf.nf_log_dir_header);
512 if (!net_eq(net, &init_net)) 533 if (!net_eq(net, &init_net))
513 kfree(table); 534 kfree(table);
535 else
536 unregister_net_sysctl_table(nf_log_sysctl_fhdr);
514} 537}
515#else 538#else
516static int netfilter_log_sysctl_init(struct net *net) 539static int netfilter_log_sysctl_init(struct net *net)
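nf_log gains a sysctl_nf_log_all_netns flag, registered as net/netfilter/nf_log_all_netns from the init namespace only (note the matching unregister added to the error and exit paths). Only the declaration and the sysctl plumbing are in this hunk; the expected effect, which individual logger backends would have to implement, is roughly:

    /* assumed consumer-side check in a logger backend */
    if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
            return;         /* keep kernel-log output limited to init_net */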
diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c
index 119fe1cb1ea9..dc61399e30be 100644
--- a/net/netfilter/nf_log_common.c
+++ b/net/netfilter/nf_log_common.c
@@ -175,6 +175,34 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf,
175} 175}
176EXPORT_SYMBOL_GPL(nf_log_dump_packet_common); 176EXPORT_SYMBOL_GPL(nf_log_dump_packet_common);
177 177
178/* bridge and netdev logging families share this code. */
179void nf_log_l2packet(struct net *net, u_int8_t pf,
180 __be16 protocol,
181 unsigned int hooknum,
182 const struct sk_buff *skb,
183 const struct net_device *in,
184 const struct net_device *out,
185 const struct nf_loginfo *loginfo,
186 const char *prefix)
187{
188 switch (protocol) {
189 case htons(ETH_P_IP):
190 nf_log_packet(net, NFPROTO_IPV4, hooknum, skb, in, out,
191 loginfo, "%s", prefix);
192 break;
193 case htons(ETH_P_IPV6):
194 nf_log_packet(net, NFPROTO_IPV6, hooknum, skb, in, out,
195 loginfo, "%s", prefix);
196 break;
197 case htons(ETH_P_ARP):
198 case htons(ETH_P_RARP):
199 nf_log_packet(net, NFPROTO_ARP, hooknum, skb, in, out,
200 loginfo, "%s", prefix);
201 break;
202 }
203}
204EXPORT_SYMBOL_GPL(nf_log_l2packet);
205
178static int __init nf_log_common_init(void) 206static int __init nf_log_common_init(void)
179{ 207{
180 return 0; 208 return 0;
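nf_log_l2packet() centralises ethertype-based dispatch for layer-2 logging families: IPv4, IPv6 and (R)ARP frames are forwarded to the matching NFPROTO logger and anything else is silently ignored. The new netdev logger below is one thin wrapper around it; a bridge-family logger could shrink to the same shape (sketch, exact signature assumed):

    static void nf_log_bridge_packet(struct net *net, u_int8_t pf,
                                     unsigned int hooknum,
                                     const struct sk_buff *skb,
                                     const struct net_device *in,
                                     const struct net_device *out,
                                     const struct nf_loginfo *loginfo,
                                     const char *prefix)
    {
            nf_log_l2packet(net, pf, eth_hdr(skb)->h_proto, hooknum,
                            skb, in, out, loginfo, prefix);
    }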
diff --git a/net/netfilter/nf_log_netdev.c b/net/netfilter/nf_log_netdev.c
new file mode 100644
index 000000000000..350eb147754d
--- /dev/null
+++ b/net/netfilter/nf_log_netdev.c
@@ -0,0 +1,81 @@
1/*
2 * (C) 2016 by Pablo Neira Ayuso <pablo@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/module.h>
10#include <linux/spinlock.h>
11#include <linux/skbuff.h>
12#include <linux/ip.h>
13#include <net/route.h>
14
15#include <linux/netfilter.h>
16#include <net/netfilter/nf_log.h>
17
18static void nf_log_netdev_packet(struct net *net, u_int8_t pf,
19 unsigned int hooknum,
20 const struct sk_buff *skb,
21 const struct net_device *in,
22 const struct net_device *out,
23 const struct nf_loginfo *loginfo,
24 const char *prefix)
25{
26 nf_log_l2packet(net, pf, skb->protocol, hooknum, skb, in, out,
27 loginfo, prefix);
28}
29
30static struct nf_logger nf_netdev_logger __read_mostly = {
31 .name = "nf_log_netdev",
32 .type = NF_LOG_TYPE_LOG,
33 .logfn = nf_log_netdev_packet,
34 .me = THIS_MODULE,
35};
36
37static int __net_init nf_log_netdev_net_init(struct net *net)
38{
39 return nf_log_set(net, NFPROTO_NETDEV, &nf_netdev_logger);
40}
41
42static void __net_exit nf_log_netdev_net_exit(struct net *net)
43{
44 nf_log_unset(net, &nf_netdev_logger);
45}
46
47static struct pernet_operations nf_log_netdev_net_ops = {
48 .init = nf_log_netdev_net_init,
49 .exit = nf_log_netdev_net_exit,
50};
51
52static int __init nf_log_netdev_init(void)
53{
54 int ret;
55
56 /* Request to load the real packet loggers. */
57 nf_logger_request_module(NFPROTO_IPV4, NF_LOG_TYPE_LOG);
58 nf_logger_request_module(NFPROTO_IPV6, NF_LOG_TYPE_LOG);
59 nf_logger_request_module(NFPROTO_ARP, NF_LOG_TYPE_LOG);
60
61 ret = register_pernet_subsys(&nf_log_netdev_net_ops);
62 if (ret < 0)
63 return ret;
64
65 nf_log_register(NFPROTO_NETDEV, &nf_netdev_logger);
66 return 0;
67}
68
69static void __exit nf_log_netdev_exit(void)
70{
71 unregister_pernet_subsys(&nf_log_netdev_net_ops);
72 nf_log_unregister(&nf_netdev_logger);
73}
74
75module_init(nf_log_netdev_init);
76module_exit(nf_log_netdev_exit);
77
78MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
79MODULE_DESCRIPTION("Netfilter netdev packet logging");
80MODULE_LICENSE("GPL");
81MODULE_ALIAS_NF_LOGGER(5, 0); /* NFPROTO_NETDEV */
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 5b9c884a452e..82802e4a6640 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -682,6 +682,18 @@ int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto)
682 &nf_nat_l4proto_tcp); 682 &nf_nat_l4proto_tcp);
683 RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDP], 683 RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDP],
684 &nf_nat_l4proto_udp); 684 &nf_nat_l4proto_udp);
685#ifdef CONFIG_NF_NAT_PROTO_DCCP
686 RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_DCCP],
687 &nf_nat_l4proto_dccp);
688#endif
689#ifdef CONFIG_NF_NAT_PROTO_SCTP
690 RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_SCTP],
691 &nf_nat_l4proto_sctp);
692#endif
693#ifdef CONFIG_NF_NAT_PROTO_UDPLITE
694 RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDPLITE],
695 &nf_nat_l4proto_udplite);
696#endif
685 mutex_unlock(&nf_nat_proto_mutex); 697 mutex_unlock(&nf_nat_proto_mutex);
686 698
687 RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto); 699 RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto);
@@ -891,6 +903,8 @@ static void __exit nf_nat_cleanup(void)
891#ifdef CONFIG_XFRM 903#ifdef CONFIG_XFRM
892 RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL); 904 RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL);
893#endif 905#endif
906 synchronize_rcu();
907
894 for (i = 0; i < NFPROTO_NUMPROTO; i++) 908 for (i = 0; i < NFPROTO_NUMPROTO; i++)
895 kfree(nf_nat_l4protos[i]); 909 kfree(nf_nat_l4protos[i]);
896 910
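nf_nat_core.c wires the now built-in DCCP, SCTP and UDP-Lite NAT protocols into nf_nat_l3proto_register(), guarded by their CONFIG_NF_NAT_PROTO_* options (the corresponding nf_nat_proto_* files below lose their module code). The cleanup path also gains a synchronize_rcu() so the RCU-published pointers become unreachable before the backing arrays are freed; the pattern being enforced is the usual unpublish-then-free sequence:

    RCU_INIT_POINTER(shared_ptr, NULL);     /* unpublish */
    synchronize_rcu();                      /* wait out in-flight readers */
    kfree(old_object);                      /* no reader can still see it */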
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index 2840abb5bb99..211661cb2c90 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -60,7 +60,7 @@ static void mangle_contents(struct sk_buff *skb,
60 __skb_trim(skb, skb->len + rep_len - match_len); 60 __skb_trim(skb, skb->len + rep_len - match_len);
61 } 61 }
62 62
63 if (nf_ct_l3num((struct nf_conn *)skb->nfct) == NFPROTO_IPV4) { 63 if (nf_ct_l3num((struct nf_conn *)skb_nfct(skb)) == NFPROTO_IPV4) {
64 /* fix IP hdr checksum information */ 64 /* fix IP hdr checksum information */
65 ip_hdr(skb)->tot_len = htons(skb->len); 65 ip_hdr(skb)->tot_len = htons(skb->len);
66 ip_send_check(ip_hdr(skb)); 66 ip_send_check(ip_hdr(skb));
diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c
index 15c47b246d0d..269fcd5dc34c 100644
--- a/net/netfilter/nf_nat_proto_dccp.c
+++ b/net/netfilter/nf_nat_proto_dccp.c
@@ -10,8 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/skbuff.h> 13#include <linux/skbuff.h>
16#include <linux/dccp.h> 14#include <linux/dccp.h>
17 15
@@ -73,7 +71,7 @@ dccp_manip_pkt(struct sk_buff *skb,
73 return true; 71 return true;
74} 72}
75 73
76static const struct nf_nat_l4proto nf_nat_l4proto_dccp = { 74const struct nf_nat_l4proto nf_nat_l4proto_dccp = {
77 .l4proto = IPPROTO_DCCP, 75 .l4proto = IPPROTO_DCCP,
78 .manip_pkt = dccp_manip_pkt, 76 .manip_pkt = dccp_manip_pkt,
79 .in_range = nf_nat_l4proto_in_range, 77 .in_range = nf_nat_l4proto_in_range,
@@ -82,35 +80,3 @@ static const struct nf_nat_l4proto nf_nat_l4proto_dccp = {
82 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, 80 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
83#endif 81#endif
84}; 82};
85
86static int __init nf_nat_proto_dccp_init(void)
87{
88 int err;
89
90 err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
91 if (err < 0)
92 goto err1;
93 err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_dccp);
94 if (err < 0)
95 goto err2;
96 return 0;
97
98err2:
99 nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
100err1:
101 return err;
102}
103
104static void __exit nf_nat_proto_dccp_fini(void)
105{
106 nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_dccp);
107 nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
108
109}
110
111module_init(nf_nat_proto_dccp_init);
112module_exit(nf_nat_proto_dccp_fini);
113
114MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
115MODULE_DESCRIPTION("DCCP NAT protocol helper");
116MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c
index cbc7ade1487b..804e8a0ab36e 100644
--- a/net/netfilter/nf_nat_proto_sctp.c
+++ b/net/netfilter/nf_nat_proto_sctp.c
@@ -7,9 +7,7 @@
7 */ 7 */
8 8
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/init.h>
11#include <linux/sctp.h> 10#include <linux/sctp.h>
12#include <linux/module.h>
13#include <net/sctp/checksum.h> 11#include <net/sctp/checksum.h>
14 12
15#include <net/netfilter/nf_nat_l4proto.h> 13#include <net/netfilter/nf_nat_l4proto.h>
@@ -35,8 +33,16 @@ sctp_manip_pkt(struct sk_buff *skb,
35 enum nf_nat_manip_type maniptype) 33 enum nf_nat_manip_type maniptype)
36{ 34{
37 sctp_sctphdr_t *hdr; 35 sctp_sctphdr_t *hdr;
36 int hdrsize = 8;
38 37
 39 if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) 38 /* This could be an inner header returned in icmp packet; in such
39 * cases we cannot update the checksum field since it is outside
40 * of the 8 bytes of transport layer headers we are guaranteed.
41 */
42 if (skb->len >= hdroff + sizeof(*hdr))
43 hdrsize = sizeof(*hdr);
44
45 if (!skb_make_writable(skb, hdroff + hdrsize))
40 return false; 46 return false;
41 47
42 hdr = (struct sctphdr *)(skb->data + hdroff); 48 hdr = (struct sctphdr *)(skb->data + hdroff);
@@ -49,12 +55,18 @@ sctp_manip_pkt(struct sk_buff *skb,
49 hdr->dest = tuple->dst.u.sctp.port; 55 hdr->dest = tuple->dst.u.sctp.port;
50 } 56 }
51 57
52 hdr->checksum = sctp_compute_cksum(skb, hdroff); 58 if (hdrsize < sizeof(*hdr))
59 return true;
60
61 if (skb->ip_summed != CHECKSUM_PARTIAL) {
62 hdr->checksum = sctp_compute_cksum(skb, hdroff);
63 skb->ip_summed = CHECKSUM_NONE;
64 }
53 65
54 return true; 66 return true;
55} 67}
56 68
57static const struct nf_nat_l4proto nf_nat_l4proto_sctp = { 69const struct nf_nat_l4proto nf_nat_l4proto_sctp = {
58 .l4proto = IPPROTO_SCTP, 70 .l4proto = IPPROTO_SCTP,
59 .manip_pkt = sctp_manip_pkt, 71 .manip_pkt = sctp_manip_pkt,
60 .in_range = nf_nat_l4proto_in_range, 72 .in_range = nf_nat_l4proto_in_range,
@@ -63,34 +75,3 @@ static const struct nf_nat_l4proto nf_nat_l4proto_sctp = {
63 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, 75 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
64#endif 76#endif
65}; 77};
66
67static int __init nf_nat_proto_sctp_init(void)
68{
69 int err;
70
71 err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
72 if (err < 0)
73 goto err1;
74 err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_sctp);
75 if (err < 0)
76 goto err2;
77 return 0;
78
79err2:
80 nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
81err1:
82 return err;
83}
84
85static void __exit nf_nat_proto_sctp_exit(void)
86{
87 nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_sctp);
88 nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
89}
90
91module_init(nf_nat_proto_sctp_init);
92module_exit(nf_nat_proto_sctp_exit);
93
94MODULE_LICENSE("GPL");
95MODULE_DESCRIPTION("SCTP NAT protocol helper");
96MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
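sctp_manip_pkt() now handles SCTP headers quoted inside ICMP errors, where only the first 8 bytes of the transport header are guaranteed to be present: ports are still rewritten, but the checksum is left alone when the full header is not there, and the software CRC is also skipped for CHECKSUM_PARTIAL (offloaded) skbs. The 8-byte rule follows directly from the header layout:

    /* struct sctphdr layout behind the hdrsize = 8 fallback:
     *   offset 0: __be16 source;   offset 2: __be16 dest;
     *   offset 4: __be32 vtag;     offset 8: __le32 checksum;
     * an ICMP error quotes at least the first 8 bytes, so ports and vtag
     * are usable while the checksum field may be missing entirely. */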
diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c
index b1e627227b6e..edd4a77dc09a 100644
--- a/net/netfilter/nf_nat_proto_udp.c
+++ b/net/netfilter/nf_nat_proto_udp.c
@@ -30,20 +30,15 @@ udp_unique_tuple(const struct nf_nat_l3proto *l3proto,
30 &udp_port_rover); 30 &udp_port_rover);
31} 31}
32 32
33static bool 33static void
34udp_manip_pkt(struct sk_buff *skb, 34__udp_manip_pkt(struct sk_buff *skb,
35 const struct nf_nat_l3proto *l3proto, 35 const struct nf_nat_l3proto *l3proto,
36 unsigned int iphdroff, unsigned int hdroff, 36 unsigned int iphdroff, struct udphdr *hdr,
37 const struct nf_conntrack_tuple *tuple, 37 const struct nf_conntrack_tuple *tuple,
38 enum nf_nat_manip_type maniptype) 38 enum nf_nat_manip_type maniptype, bool do_csum)
39{ 39{
40 struct udphdr *hdr;
41 __be16 *portptr, newport; 40 __be16 *portptr, newport;
42 41
43 if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
44 return false;
45 hdr = (struct udphdr *)(skb->data + hdroff);
46
47 if (maniptype == NF_NAT_MANIP_SRC) { 42 if (maniptype == NF_NAT_MANIP_SRC) {
48 /* Get rid of src port */ 43 /* Get rid of src port */
49 newport = tuple->src.u.udp.port; 44 newport = tuple->src.u.udp.port;
@@ -53,7 +48,7 @@ udp_manip_pkt(struct sk_buff *skb,
53 newport = tuple->dst.u.udp.port; 48 newport = tuple->dst.u.udp.port;
54 portptr = &hdr->dest; 49 portptr = &hdr->dest;
55 } 50 }
56 if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) { 51 if (do_csum) {
57 l3proto->csum_update(skb, iphdroff, &hdr->check, 52 l3proto->csum_update(skb, iphdroff, &hdr->check,
58 tuple, maniptype); 53 tuple, maniptype);
59 inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 54 inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
@@ -62,9 +57,68 @@ udp_manip_pkt(struct sk_buff *skb,
62 hdr->check = CSUM_MANGLED_0; 57 hdr->check = CSUM_MANGLED_0;
63 } 58 }
64 *portptr = newport; 59 *portptr = newport;
60}
61
62static bool udp_manip_pkt(struct sk_buff *skb,
63 const struct nf_nat_l3proto *l3proto,
64 unsigned int iphdroff, unsigned int hdroff,
65 const struct nf_conntrack_tuple *tuple,
66 enum nf_nat_manip_type maniptype)
67{
68 struct udphdr *hdr;
69 bool do_csum;
70
71 if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
72 return false;
73
74 hdr = (struct udphdr *)(skb->data + hdroff);
75 do_csum = hdr->check || skb->ip_summed == CHECKSUM_PARTIAL;
76
77 __udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, do_csum);
78 return true;
79}
80
81#ifdef CONFIG_NF_NAT_PROTO_UDPLITE
82static u16 udplite_port_rover;
83
84static bool udplite_manip_pkt(struct sk_buff *skb,
85 const struct nf_nat_l3proto *l3proto,
86 unsigned int iphdroff, unsigned int hdroff,
87 const struct nf_conntrack_tuple *tuple,
88 enum nf_nat_manip_type maniptype)
89{
90 struct udphdr *hdr;
91
92 if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
93 return false;
94
95 hdr = (struct udphdr *)(skb->data + hdroff);
96 __udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, true);
65 return true; 97 return true;
66} 98}
67 99
100static void
101udplite_unique_tuple(const struct nf_nat_l3proto *l3proto,
102 struct nf_conntrack_tuple *tuple,
103 const struct nf_nat_range *range,
104 enum nf_nat_manip_type maniptype,
105 const struct nf_conn *ct)
106{
107 nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
108 &udplite_port_rover);
109}
110
111const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
112 .l4proto = IPPROTO_UDPLITE,
113 .manip_pkt = udplite_manip_pkt,
114 .in_range = nf_nat_l4proto_in_range,
115 .unique_tuple = udplite_unique_tuple,
116#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
117 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
118#endif
119};
120#endif /* CONFIG_NF_NAT_PROTO_UDPLITE */
121
68const struct nf_nat_l4proto nf_nat_l4proto_udp = { 122const struct nf_nat_l4proto nf_nat_l4proto_udp = {
69 .l4proto = IPPROTO_UDP, 123 .l4proto = IPPROTO_UDP,
70 .manip_pkt = udp_manip_pkt, 124 .manip_pkt = udp_manip_pkt,
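udp_manip_pkt() is split so that __udp_manip_pkt() carries the shared port rewrite and checksum fixup, with a do_csum flag capturing the one behavioural difference between the two protocols; this is what lets nf_nat_l4proto_udplite live here under CONFIG_NF_NAT_PROTO_UDPLITE and the standalone file below be removed:

    /* plain UDP: checksum is optional over IPv4, fix it only when in use */
    do_csum = hdr->check || skb->ip_summed == CHECKSUM_PARTIAL;
    /* UDP-Lite: checksum is mandatory, always pass do_csum = true */
    __udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, true);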
diff --git a/net/netfilter/nf_nat_proto_udplite.c b/net/netfilter/nf_nat_proto_udplite.c
deleted file mode 100644
index 58340c97bd83..000000000000
--- a/net/netfilter/nf_nat_proto_udplite.c
+++ /dev/null
@@ -1,106 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 * (C) 2008 Patrick McHardy <kaber@trash.net>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/types.h>
11#include <linux/init.h>
12#include <linux/udp.h>
13
14#include <linux/netfilter.h>
15#include <linux/module.h>
16#include <net/netfilter/nf_nat.h>
17#include <net/netfilter/nf_nat_l3proto.h>
18#include <net/netfilter/nf_nat_l4proto.h>
19
20static u16 udplite_port_rover;
21
22static void
23udplite_unique_tuple(const struct nf_nat_l3proto *l3proto,
24 struct nf_conntrack_tuple *tuple,
25 const struct nf_nat_range *range,
26 enum nf_nat_manip_type maniptype,
27 const struct nf_conn *ct)
28{
29 nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
30 &udplite_port_rover);
31}
32
33static bool
34udplite_manip_pkt(struct sk_buff *skb,
35 const struct nf_nat_l3proto *l3proto,
36 unsigned int iphdroff, unsigned int hdroff,
37 const struct nf_conntrack_tuple *tuple,
38 enum nf_nat_manip_type maniptype)
39{
40 struct udphdr *hdr;
41 __be16 *portptr, newport;
42
43 if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
44 return false;
45
46 hdr = (struct udphdr *)(skb->data + hdroff);
47
48 if (maniptype == NF_NAT_MANIP_SRC) {
49 /* Get rid of source port */
50 newport = tuple->src.u.udp.port;
51 portptr = &hdr->source;
52 } else {
53 /* Get rid of dst port */
54 newport = tuple->dst.u.udp.port;
55 portptr = &hdr->dest;
56 }
57
58 l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
59 inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, false);
60 if (!hdr->check)
61 hdr->check = CSUM_MANGLED_0;
62
63 *portptr = newport;
64 return true;
65}
66
67static const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
68 .l4proto = IPPROTO_UDPLITE,
69 .manip_pkt = udplite_manip_pkt,
70 .in_range = nf_nat_l4proto_in_range,
71 .unique_tuple = udplite_unique_tuple,
72#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
73 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
74#endif
75};
76
77static int __init nf_nat_proto_udplite_init(void)
78{
79 int err;
80
81 err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
82 if (err < 0)
83 goto err1;
84 err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_udplite);
85 if (err < 0)
86 goto err2;
87 return 0;
88
89err2:
90 nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
91err1:
92 return err;
93}
94
95static void __exit nf_nat_proto_udplite_fini(void)
96{
97 nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_udplite);
98 nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
99}
100
101module_init(nf_nat_proto_udplite_init);
102module_exit(nf_nat_proto_udplite_fini);
103
104MODULE_LICENSE("GPL");
105MODULE_DESCRIPTION("UDP-Lite NAT protocol helper");
106MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index d43869879fcf..86067560a318 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -101,11 +101,13 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
 	rcu_read_lock();
 	idev = __in6_dev_get(skb->dev);
 	if (idev != NULL) {
+		read_lock_bh(&idev->lock);
 		list_for_each_entry(ifa, &idev->addr_list, if_list) {
 			newdst = ifa->addr;
 			addr = true;
 			break;
 		}
+		read_unlock_bh(&idev->lock);
 	}
 	rcu_read_unlock();
 
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 8f08d759844a..4a7662486f44 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -108,7 +108,7 @@ void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *entry)
 }
 
 static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
-		      unsigned int queuenum)
+		      struct nf_hook_entry *hook_entry, unsigned int queuenum)
 {
 	int status = -ENOENT;
 	struct nf_queue_entry *entry = NULL;
@@ -136,6 +136,7 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
 	*entry = (struct nf_queue_entry) {
 		.skb = skb,
 		.state = *state,
+		.hook = hook_entry,
 		.size = sizeof(*entry) + afinfo->route_key_size,
 	};
 
@@ -163,8 +164,7 @@ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
 	struct nf_hook_entry *entry = *entryp;
 	int ret;
 
-	RCU_INIT_POINTER(state->hook_entries, entry);
-	ret = __nf_queue(skb, state, verdict >> NF_VERDICT_QBITS);
+	ret = __nf_queue(skb, state, entry, verdict >> NF_VERDICT_QBITS);
 	if (ret < 0) {
 		if (ret == -ESRCH &&
 		    (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) {
@@ -177,22 +177,38 @@ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
 	return 0;
 }
 
+static unsigned int nf_iterate(struct sk_buff *skb,
+			       struct nf_hook_state *state,
+			       struct nf_hook_entry **entryp)
+{
+	unsigned int verdict;
+
+	do {
+repeat:
+		verdict = nf_hook_entry_hookfn((*entryp), skb, state);
+		if (verdict != NF_ACCEPT) {
+			if (verdict != NF_REPEAT)
+				return verdict;
+			goto repeat;
+		}
+		*entryp = rcu_dereference((*entryp)->next);
+	} while (*entryp);
+
+	return NF_ACCEPT;
+}
+
 void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
 {
-	struct nf_hook_entry *hook_entry;
+	struct nf_hook_entry *hook_entry = entry->hook;
 	struct sk_buff *skb = entry->skb;
 	const struct nf_afinfo *afinfo;
-	struct nf_hook_ops *elem;
 	int err;
 
-	hook_entry = rcu_dereference(entry->state.hook_entries);
-	elem = &hook_entry->ops;
-
 	nf_queue_entry_release_refs(entry);
 
 	/* Continue traversal iff userspace said ok... */
 	if (verdict == NF_REPEAT)
-		verdict = elem->hook(elem->priv, skb, &entry->state);
+		verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state);
 
 	if (verdict == NF_ACCEPT) {
 		afinfo = nf_get_afinfo(entry->state.pf);
@@ -200,8 +216,6 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
 			verdict = NF_DROP;
 	}
 
-	entry->state.thresh = INT_MIN;
-
 	if (verdict == NF_ACCEPT) {
 		hook_entry = rcu_dereference(hook_entry->next);
 		if (hook_entry)
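
With the nf_queue.c changes above, the hook entry that queued the packet is stored directly in struct nf_queue_entry (the new .hook field), so nf_reinject() can re-run or resume traversal from that node instead of recovering it from entry->state.hook_entries. A stripped-down model of that resume-from-saved-node pattern follows; the types and names are illustrative only, not the kernel's.

#include <stddef.h>

/* Illustrative stand-ins for struct nf_hook_entry and struct nf_queue_entry. */
struct hook {
	struct hook *next;
	int (*fn)(void *pkt);	/* 0 means "accept", anything else stops traversal */
};

struct queued_pkt {
	void *pkt;
	struct hook *hook;	/* hook that handed the packet to the queue */
};

/* Once userspace accepts a queued packet, continue from the successor of the
 * recorded hook rather than restarting the whole chain. */
static int resume_after_accept(struct queued_pkt *q)
{
	struct hook *h;
	int verdict = 0;

	for (h = q->hook->next; h != NULL; h = h->next) {
		verdict = h->fn(q->pkt);
		if (verdict != 0)
			break;
	}
	return verdict;
}
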
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index c8a4a48bced9..7c6d1fbe38b9 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -24,7 +24,7 @@
 #include <net/netfilter/nf_conntrack_synproxy.h>
 #include <net/netfilter/nf_conntrack_zones.h>
 
-int synproxy_net_id;
+unsigned int synproxy_net_id;
 EXPORT_SYMBOL_GPL(synproxy_net_id);
 
 bool
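
synproxy_net_id is the per-netns slot handed to net_generic() to reach the SYNPROXY state of a given namespace; the hunk above only changes its type to unsigned int, matching the unsigned ids used for pernet data. As a reference point, the usual accessor pattern looks like the sketch below (it mirrors the synproxy_pernet() helper; treat it as an illustration rather than part of this patch).

#include <net/netns/generic.h>
#include <net/netfilter/nf_conntrack_synproxy.h>	/* struct synproxy_net, synproxy_net_id */

/* Fetch the per-namespace SYNPROXY state registered under synproxy_net_id. */
static struct synproxy_net *get_synproxy_state(struct net *net)
{
	return net_generic(net, synproxy_net_id);
}
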
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index e5194f6f906c..434c739dfeca 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -22,6 +22,7 @@
22#include <net/sock.h> 22#include <net/sock.h>
23 23
24static LIST_HEAD(nf_tables_expressions); 24static LIST_HEAD(nf_tables_expressions);
25static LIST_HEAD(nf_tables_objects);
25 26
26/** 27/**
27 * nft_register_afinfo - register nf_tables address family info 28 * nft_register_afinfo - register nf_tables address family info
@@ -110,12 +111,12 @@ static void nft_ctx_init(struct nft_ctx *ctx,
110 ctx->seq = nlh->nlmsg_seq; 111 ctx->seq = nlh->nlmsg_seq;
111} 112}
112 113
113static struct nft_trans *nft_trans_alloc(struct nft_ctx *ctx, int msg_type, 114static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx,
114 u32 size) 115 int msg_type, u32 size, gfp_t gfp)
115{ 116{
116 struct nft_trans *trans; 117 struct nft_trans *trans;
117 118
118 trans = kzalloc(sizeof(struct nft_trans) + size, GFP_KERNEL); 119 trans = kzalloc(sizeof(struct nft_trans) + size, gfp);
119 if (trans == NULL) 120 if (trans == NULL)
120 return NULL; 121 return NULL;
121 122
@@ -125,6 +126,12 @@ static struct nft_trans *nft_trans_alloc(struct nft_ctx *ctx, int msg_type,
125 return trans; 126 return trans;
126} 127}
127 128
129static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx,
130 int msg_type, u32 size)
131{
132 return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL);
133}
134
128static void nft_trans_destroy(struct nft_trans *trans) 135static void nft_trans_destroy(struct nft_trans *trans)
129{ 136{
130 list_del(&trans->list); 137 list_del(&trans->list);
@@ -233,6 +240,10 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type,
233 if (trans == NULL) 240 if (trans == NULL)
234 return NULL; 241 return NULL;
235 242
243 if (msg_type == NFT_MSG_NEWRULE && ctx->nla[NFTA_RULE_ID] != NULL) {
244 nft_trans_rule_id(trans) =
245 ntohl(nla_get_be32(ctx->nla[NFTA_RULE_ID]));
246 }
236 nft_trans_rule(trans) = rule; 247 nft_trans_rule(trans) = rule;
237 list_add_tail(&trans->list, &ctx->net->nft.commit_list); 248 list_add_tail(&trans->list, &ctx->net->nft.commit_list);
238 249
@@ -304,6 +315,38 @@ static int nft_delset(struct nft_ctx *ctx, struct nft_set *set)
304 return err; 315 return err;
305} 316}
306 317
318static int nft_trans_obj_add(struct nft_ctx *ctx, int msg_type,
319 struct nft_object *obj)
320{
321 struct nft_trans *trans;
322
323 trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_obj));
324 if (trans == NULL)
325 return -ENOMEM;
326
327 if (msg_type == NFT_MSG_NEWOBJ)
328 nft_activate_next(ctx->net, obj);
329
330 nft_trans_obj(trans) = obj;
331 list_add_tail(&trans->list, &ctx->net->nft.commit_list);
332
333 return 0;
334}
335
336static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj)
337{
338 int err;
339
340 err = nft_trans_obj_add(ctx, NFT_MSG_DELOBJ, obj);
341 if (err < 0)
342 return err;
343
344 nft_deactivate_next(ctx->net, obj);
345 ctx->table->use--;
346
347 return err;
348}
349
307/* 350/*
308 * Tables 351 * Tables
309 */ 352 */
@@ -418,16 +461,15 @@ nla_put_failure:
418 return -1; 461 return -1;
419} 462}
420 463
421static int nf_tables_table_notify(const struct nft_ctx *ctx, int event) 464static void nf_tables_table_notify(const struct nft_ctx *ctx, int event)
422{ 465{
423 struct sk_buff *skb; 466 struct sk_buff *skb;
424 int err; 467 int err;
425 468
426 if (!ctx->report && 469 if (!ctx->report &&
427 !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) 470 !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
428 return 0; 471 return;
429 472
430 err = -ENOBUFS;
431 skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); 473 skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
432 if (skb == NULL) 474 if (skb == NULL)
433 goto err; 475 goto err;
@@ -439,14 +481,11 @@ static int nf_tables_table_notify(const struct nft_ctx *ctx, int event)
439 goto err; 481 goto err;
440 } 482 }
441 483
442 err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, 484 nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
443 ctx->report, GFP_KERNEL); 485 ctx->report, GFP_KERNEL);
486 return;
444err: 487err:
445 if (err < 0) { 488 nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
446 nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
447 err);
448 }
449 return err;
450} 489}
451 490
452static int nf_tables_dump_tables(struct sk_buff *skb, 491static int nf_tables_dump_tables(struct sk_buff *skb,
@@ -537,6 +576,28 @@ err:
537 return err; 576 return err;
538} 577}
539 578
579static void _nf_tables_table_disable(struct net *net,
580 const struct nft_af_info *afi,
581 struct nft_table *table,
582 u32 cnt)
583{
584 struct nft_chain *chain;
585 u32 i = 0;
586
587 list_for_each_entry(chain, &table->chains, list) {
588 if (!nft_is_active_next(net, chain))
589 continue;
590 if (!(chain->flags & NFT_BASE_CHAIN))
591 continue;
592
593 if (cnt && i++ == cnt)
594 break;
595
596 nf_unregister_net_hooks(net, nft_base_chain(chain)->ops,
597 afi->nops);
598 }
599}
600
540static int nf_tables_table_enable(struct net *net, 601static int nf_tables_table_enable(struct net *net,
541 const struct nft_af_info *afi, 602 const struct nft_af_info *afi,
542 struct nft_table *table) 603 struct nft_table *table)
@@ -559,18 +620,8 @@ static int nf_tables_table_enable(struct net *net,
559 } 620 }
560 return 0; 621 return 0;
561err: 622err:
562 list_for_each_entry(chain, &table->chains, list) { 623 if (i)
563 if (!nft_is_active_next(net, chain)) 624 _nf_tables_table_disable(net, afi, table, i);
564 continue;
565 if (!(chain->flags & NFT_BASE_CHAIN))
566 continue;
567
568 if (i-- <= 0)
569 break;
570
571 nf_unregister_net_hooks(net, nft_base_chain(chain)->ops,
572 afi->nops);
573 }
574 return err; 625 return err;
575} 626}
576 627
@@ -578,17 +629,7 @@ static void nf_tables_table_disable(struct net *net,
578 const struct nft_af_info *afi, 629 const struct nft_af_info *afi,
579 struct nft_table *table) 630 struct nft_table *table)
580{ 631{
581 struct nft_chain *chain; 632 _nf_tables_table_disable(net, afi, table, 0);
582
583 list_for_each_entry(chain, &table->chains, list) {
584 if (!nft_is_active_next(net, chain))
585 continue;
586 if (!(chain->flags & NFT_BASE_CHAIN))
587 continue;
588
589 nf_unregister_net_hooks(net, nft_base_chain(chain)->ops,
590 afi->nops);
591 }
592} 633}
593 634
594static int nf_tables_updtable(struct nft_ctx *ctx) 635static int nf_tables_updtable(struct nft_ctx *ctx)
@@ -657,10 +698,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
657 if (IS_ERR(table)) { 698 if (IS_ERR(table)) {
658 if (PTR_ERR(table) != -ENOENT) 699 if (PTR_ERR(table) != -ENOENT)
659 return PTR_ERR(table); 700 return PTR_ERR(table);
660 table = NULL; 701 } else {
661 }
662
663 if (table != NULL) {
664 if (nlh->nlmsg_flags & NLM_F_EXCL) 702 if (nlh->nlmsg_flags & NLM_F_EXCL)
665 return -EEXIST; 703 return -EEXIST;
666 if (nlh->nlmsg_flags & NLM_F_REPLACE) 704 if (nlh->nlmsg_flags & NLM_F_REPLACE)
@@ -688,6 +726,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
688 nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN); 726 nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN);
689 INIT_LIST_HEAD(&table->chains); 727 INIT_LIST_HEAD(&table->chains);
690 INIT_LIST_HEAD(&table->sets); 728 INIT_LIST_HEAD(&table->sets);
729 INIT_LIST_HEAD(&table->objects);
691 table->flags = flags; 730 table->flags = flags;
692 731
693 nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); 732 nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
@@ -709,6 +748,7 @@ static int nft_flush_table(struct nft_ctx *ctx)
709{ 748{
710 int err; 749 int err;
711 struct nft_chain *chain, *nc; 750 struct nft_chain *chain, *nc;
751 struct nft_object *obj, *ne;
712 struct nft_set *set, *ns; 752 struct nft_set *set, *ns;
713 753
714 list_for_each_entry(chain, &ctx->table->chains, list) { 754 list_for_each_entry(chain, &ctx->table->chains, list) {
@@ -735,6 +775,12 @@ static int nft_flush_table(struct nft_ctx *ctx)
735 goto out; 775 goto out;
736 } 776 }
737 777
778 list_for_each_entry_safe(obj, ne, &ctx->table->objects, list) {
779 err = nft_delobj(ctx, obj);
780 if (err < 0)
781 goto out;
782 }
783
738 list_for_each_entry_safe(chain, nc, &ctx->table->chains, list) { 784 list_for_each_entry_safe(chain, nc, &ctx->table->chains, list) {
739 if (!nft_is_active_next(ctx->net, chain)) 785 if (!nft_is_active_next(ctx->net, chain))
740 continue; 786 continue;
@@ -881,7 +927,8 @@ static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table,
881} 927}
882 928
883static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { 929static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
884 [NFTA_CHAIN_TABLE] = { .type = NLA_STRING }, 930 [NFTA_CHAIN_TABLE] = { .type = NLA_STRING,
931 .len = NFT_TABLE_MAXNAMELEN - 1 },
885 [NFTA_CHAIN_HANDLE] = { .type = NLA_U64 }, 932 [NFTA_CHAIN_HANDLE] = { .type = NLA_U64 },
886 [NFTA_CHAIN_NAME] = { .type = NLA_STRING, 933 [NFTA_CHAIN_NAME] = { .type = NLA_STRING,
887 .len = NFT_CHAIN_MAXNAMELEN - 1 }, 934 .len = NFT_CHAIN_MAXNAMELEN - 1 },
@@ -999,16 +1046,15 @@ nla_put_failure:
999 return -1; 1046 return -1;
1000} 1047}
1001 1048
1002static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event) 1049static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
1003{ 1050{
1004 struct sk_buff *skb; 1051 struct sk_buff *skb;
1005 int err; 1052 int err;
1006 1053
1007 if (!ctx->report && 1054 if (!ctx->report &&
1008 !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) 1055 !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
1009 return 0; 1056 return;
1010 1057
1011 err = -ENOBUFS;
1012 skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); 1058 skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
1013 if (skb == NULL) 1059 if (skb == NULL)
1014 goto err; 1060 goto err;
@@ -1021,14 +1067,11 @@ static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
1021 goto err; 1067 goto err;
1022 } 1068 }
1023 1069
1024 err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, 1070 nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
1025 ctx->report, GFP_KERNEL); 1071 ctx->report, GFP_KERNEL);
1072 return;
1026err: 1073err:
1027 if (err < 0) { 1074 nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
1028 nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
1029 err);
1030 }
1031 return err;
1032} 1075}
1033 1076
1034static int nf_tables_dump_chains(struct sk_buff *skb, 1077static int nf_tables_dump_chains(struct sk_buff *skb,
@@ -1807,7 +1850,8 @@ static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain,
1807} 1850}
1808 1851
1809static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { 1852static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
1810 [NFTA_RULE_TABLE] = { .type = NLA_STRING }, 1853 [NFTA_RULE_TABLE] = { .type = NLA_STRING,
1854 .len = NFT_TABLE_MAXNAMELEN - 1 },
1811 [NFTA_RULE_CHAIN] = { .type = NLA_STRING, 1855 [NFTA_RULE_CHAIN] = { .type = NLA_STRING,
1812 .len = NFT_CHAIN_MAXNAMELEN - 1 }, 1856 .len = NFT_CHAIN_MAXNAMELEN - 1 },
1813 [NFTA_RULE_HANDLE] = { .type = NLA_U64 }, 1857 [NFTA_RULE_HANDLE] = { .type = NLA_U64 },
@@ -1882,18 +1926,16 @@ nla_put_failure:
1882 return -1; 1926 return -1;
1883} 1927}
1884 1928
1885static int nf_tables_rule_notify(const struct nft_ctx *ctx, 1929static void nf_tables_rule_notify(const struct nft_ctx *ctx,
1886 const struct nft_rule *rule, 1930 const struct nft_rule *rule, int event)
1887 int event)
1888{ 1931{
1889 struct sk_buff *skb; 1932 struct sk_buff *skb;
1890 int err; 1933 int err;
1891 1934
1892 if (!ctx->report && 1935 if (!ctx->report &&
1893 !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) 1936 !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
1894 return 0; 1937 return;
1895 1938
1896 err = -ENOBUFS;
1897 skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); 1939 skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
1898 if (skb == NULL) 1940 if (skb == NULL)
1899 goto err; 1941 goto err;
@@ -1906,14 +1948,11 @@ static int nf_tables_rule_notify(const struct nft_ctx *ctx,
1906 goto err; 1948 goto err;
1907 } 1949 }
1908 1950
1909 err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, 1951 nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
1910 ctx->report, GFP_KERNEL); 1952 ctx->report, GFP_KERNEL);
1953 return;
1911err: 1954err:
1912 if (err < 0) { 1955 nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
1913 nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
1914 err);
1915 }
1916 return err;
1917} 1956}
1918 1957
1919struct nft_rule_dump_ctx { 1958struct nft_rule_dump_ctx {
@@ -2068,7 +2107,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
2068 * is called on error from nf_tables_newrule(). 2107 * is called on error from nf_tables_newrule().
2069 */ 2108 */
2070 expr = nft_expr_first(rule); 2109 expr = nft_expr_first(rule);
2071 while (expr->ops && expr != nft_expr_last(rule)) { 2110 while (expr != nft_expr_last(rule) && expr->ops) {
2072 nf_tables_expr_destroy(ctx, expr); 2111 nf_tables_expr_destroy(ctx, expr);
2073 expr = nft_expr_next(expr); 2112 expr = nft_expr_next(expr);
2074 } 2113 }
@@ -2245,6 +2284,22 @@ err1:
2245 return err; 2284 return err;
2246} 2285}
2247 2286
2287static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
2288 const struct nlattr *nla)
2289{
2290 u32 id = ntohl(nla_get_be32(nla));
2291 struct nft_trans *trans;
2292
2293 list_for_each_entry(trans, &net->nft.commit_list, list) {
2294 struct nft_rule *rule = nft_trans_rule(trans);
2295
2296 if (trans->msg_type == NFT_MSG_NEWRULE &&
2297 id == nft_trans_rule_id(trans))
2298 return rule;
2299 }
2300 return ERR_PTR(-ENOENT);
2301}
2302
2248static int nf_tables_delrule(struct net *net, struct sock *nlsk, 2303static int nf_tables_delrule(struct net *net, struct sock *nlsk,
2249 struct sk_buff *skb, const struct nlmsghdr *nlh, 2304 struct sk_buff *skb, const struct nlmsghdr *nlh,
2250 const struct nlattr * const nla[]) 2305 const struct nlattr * const nla[])
@@ -2283,6 +2338,12 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
2283 return PTR_ERR(rule); 2338 return PTR_ERR(rule);
2284 2339
2285 err = nft_delrule(&ctx, rule); 2340 err = nft_delrule(&ctx, rule);
2341 } else if (nla[NFTA_RULE_ID]) {
2342 rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]);
2343 if (IS_ERR(rule))
2344 return PTR_ERR(rule);
2345
2346 err = nft_delrule(&ctx, rule);
2286 } else { 2347 } else {
2287 err = nft_delrule_by_chain(&ctx); 2348 err = nft_delrule_by_chain(&ctx);
2288 } 2349 }
@@ -2350,12 +2411,14 @@ nft_select_set_ops(const struct nlattr * const nla[],
2350 features = 0; 2411 features = 0;
2351 if (nla[NFTA_SET_FLAGS] != NULL) { 2412 if (nla[NFTA_SET_FLAGS] != NULL) {
2352 features = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); 2413 features = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
2353 features &= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_TIMEOUT; 2414 features &= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_TIMEOUT |
2415 NFT_SET_OBJECT;
2354 } 2416 }
2355 2417
2356 bops = NULL; 2418 bops = NULL;
2357 best.size = ~0; 2419 best.size = ~0;
2358 best.class = ~0; 2420 best.lookup = ~0;
2421 best.space = ~0;
2359 2422
2360 list_for_each_entry(ops, &nf_tables_set_ops, list) { 2423 list_for_each_entry(ops, &nf_tables_set_ops, list) {
2361 if ((ops->features & features) != features) 2424 if ((ops->features & features) != features)
@@ -2365,16 +2428,27 @@ nft_select_set_ops(const struct nlattr * const nla[],
2365 2428
2366 switch (policy) { 2429 switch (policy) {
2367 case NFT_SET_POL_PERFORMANCE: 2430 case NFT_SET_POL_PERFORMANCE:
2368 if (est.class < best.class) 2431 if (est.lookup < best.lookup)
2369 break;
2370 if (est.class == best.class && est.size < best.size)
2371 break; 2432 break;
2433 if (est.lookup == best.lookup) {
2434 if (!desc->size) {
2435 if (est.space < best.space)
2436 break;
2437 } else if (est.size < best.size) {
2438 break;
2439 }
2440 }
2372 continue; 2441 continue;
2373 case NFT_SET_POL_MEMORY: 2442 case NFT_SET_POL_MEMORY:
2374 if (est.size < best.size) 2443 if (!desc->size) {
2375 break; 2444 if (est.space < best.space)
2376 if (est.size == best.size && est.class < best.class) 2445 break;
2446 if (est.space == best.space &&
2447 est.lookup < best.lookup)
2448 break;
2449 } else if (est.size < best.size) {
2377 break; 2450 break;
2451 }
2378 continue; 2452 continue;
2379 default: 2453 default:
2380 break; 2454 break;
@@ -2396,7 +2470,8 @@ nft_select_set_ops(const struct nlattr * const nla[],
2396} 2470}
2397 2471
2398static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { 2472static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
2399 [NFTA_SET_TABLE] = { .type = NLA_STRING }, 2473 [NFTA_SET_TABLE] = { .type = NLA_STRING,
2474 .len = NFT_TABLE_MAXNAMELEN - 1 },
2400 [NFTA_SET_NAME] = { .type = NLA_STRING, 2475 [NFTA_SET_NAME] = { .type = NLA_STRING,
2401 .len = NFT_SET_MAXNAMELEN - 1 }, 2476 .len = NFT_SET_MAXNAMELEN - 1 },
2402 [NFTA_SET_FLAGS] = { .type = NLA_U32 }, 2477 [NFTA_SET_FLAGS] = { .type = NLA_U32 },
@@ -2411,6 +2486,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
2411 [NFTA_SET_GC_INTERVAL] = { .type = NLA_U32 }, 2486 [NFTA_SET_GC_INTERVAL] = { .type = NLA_U32 },
2412 [NFTA_SET_USERDATA] = { .type = NLA_BINARY, 2487 [NFTA_SET_USERDATA] = { .type = NLA_BINARY,
2413 .len = NFT_USERDATA_MAXLEN }, 2488 .len = NFT_USERDATA_MAXLEN },
2489 [NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 },
2414}; 2490};
2415 2491
2416static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { 2492static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
@@ -2462,6 +2538,7 @@ struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
2462 } 2538 }
2463 return ERR_PTR(-ENOENT); 2539 return ERR_PTR(-ENOENT);
2464} 2540}
2541EXPORT_SYMBOL_GPL(nf_tables_set_lookup);
2465 2542
2466struct nft_set *nf_tables_set_lookup_byid(const struct net *net, 2543struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
2467 const struct nlattr *nla, 2544 const struct nlattr *nla,
@@ -2480,6 +2557,7 @@ struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
2480 } 2557 }
2481 return ERR_PTR(-ENOENT); 2558 return ERR_PTR(-ENOENT);
2482} 2559}
2560EXPORT_SYMBOL_GPL(nf_tables_set_lookup_byid);
2483 2561
2484static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, 2562static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
2485 const char *name) 2563 const char *name)
@@ -2568,6 +2646,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
2568 if (nla_put_be32(skb, NFTA_SET_DATA_LEN, htonl(set->dlen))) 2646 if (nla_put_be32(skb, NFTA_SET_DATA_LEN, htonl(set->dlen)))
2569 goto nla_put_failure; 2647 goto nla_put_failure;
2570 } 2648 }
2649 if (set->flags & NFT_SET_OBJECT &&
2650 nla_put_be32(skb, NFTA_SET_OBJ_TYPE, htonl(set->objtype)))
2651 goto nla_put_failure;
2571 2652
2572 if (set->timeout && 2653 if (set->timeout &&
2573 nla_put_be64(skb, NFTA_SET_TIMEOUT, 2654 nla_put_be64(skb, NFTA_SET_TIMEOUT,
@@ -2602,9 +2683,9 @@ nla_put_failure:
2602 return -1; 2683 return -1;
2603} 2684}
2604 2685
2605static int nf_tables_set_notify(const struct nft_ctx *ctx, 2686static void nf_tables_set_notify(const struct nft_ctx *ctx,
2606 const struct nft_set *set, 2687 const struct nft_set *set, int event,
2607 int event, gfp_t gfp_flags) 2688 gfp_t gfp_flags)
2608{ 2689{
2609 struct sk_buff *skb; 2690 struct sk_buff *skb;
2610 u32 portid = ctx->portid; 2691 u32 portid = ctx->portid;
@@ -2612,9 +2693,8 @@ static int nf_tables_set_notify(const struct nft_ctx *ctx,
2612 2693
2613 if (!ctx->report && 2694 if (!ctx->report &&
2614 !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) 2695 !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
2615 return 0; 2696 return;
2616 2697
2617 err = -ENOBUFS;
2618 skb = nlmsg_new(NLMSG_GOODSIZE, gfp_flags); 2698 skb = nlmsg_new(NLMSG_GOODSIZE, gfp_flags);
2619 if (skb == NULL) 2699 if (skb == NULL)
2620 goto err; 2700 goto err;
@@ -2625,12 +2705,11 @@ static int nf_tables_set_notify(const struct nft_ctx *ctx,
2625 goto err; 2705 goto err;
2626 } 2706 }
2627 2707
2628 err = nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, 2708 nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, ctx->report,
2629 ctx->report, gfp_flags); 2709 gfp_flags);
2710 return;
2630err: 2711err:
2631 if (err < 0) 2712 nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
2632 nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, err);
2633 return err;
2634} 2713}
2635 2714
2636static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) 2715static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
@@ -2797,7 +2876,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
2797 unsigned int size; 2876 unsigned int size;
2798 bool create; 2877 bool create;
2799 u64 timeout; 2878 u64 timeout;
2800 u32 ktype, dtype, flags, policy, gc_int; 2879 u32 ktype, dtype, flags, policy, gc_int, objtype;
2801 struct nft_set_desc desc; 2880 struct nft_set_desc desc;
2802 unsigned char *udata; 2881 unsigned char *udata;
2803 u16 udlen; 2882 u16 udlen;
@@ -2827,11 +2906,12 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
2827 flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); 2906 flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
2828 if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT | 2907 if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT |
2829 NFT_SET_INTERVAL | NFT_SET_TIMEOUT | 2908 NFT_SET_INTERVAL | NFT_SET_TIMEOUT |
2830 NFT_SET_MAP | NFT_SET_EVAL)) 2909 NFT_SET_MAP | NFT_SET_EVAL |
2910 NFT_SET_OBJECT))
2831 return -EINVAL; 2911 return -EINVAL;
2832 /* Only one of both operations is supported */ 2912 /* Only one of these operations is supported */
2833 if ((flags & (NFT_SET_MAP | NFT_SET_EVAL)) == 2913 if ((flags & (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) ==
2834 (NFT_SET_MAP | NFT_SET_EVAL)) 2914 (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT))
2835 return -EOPNOTSUPP; 2915 return -EOPNOTSUPP;
2836 } 2916 }
2837 2917
@@ -2856,6 +2936,19 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
2856 } else if (flags & NFT_SET_MAP) 2936 } else if (flags & NFT_SET_MAP)
2857 return -EINVAL; 2937 return -EINVAL;
2858 2938
2939 if (nla[NFTA_SET_OBJ_TYPE] != NULL) {
2940 if (!(flags & NFT_SET_OBJECT))
2941 return -EINVAL;
2942
2943 objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE]));
2944 if (objtype == NFT_OBJECT_UNSPEC ||
2945 objtype > NFT_OBJECT_MAX)
2946 return -EINVAL;
2947 } else if (flags & NFT_SET_OBJECT)
2948 return -EINVAL;
2949 else
2950 objtype = NFT_OBJECT_UNSPEC;
2951
2859 timeout = 0; 2952 timeout = 0;
2860 if (nla[NFTA_SET_TIMEOUT] != NULL) { 2953 if (nla[NFTA_SET_TIMEOUT] != NULL) {
2861 if (!(flags & NFT_SET_TIMEOUT)) 2954 if (!(flags & NFT_SET_TIMEOUT))
@@ -2896,10 +2989,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
2896 if (IS_ERR(set)) { 2989 if (IS_ERR(set)) {
2897 if (PTR_ERR(set) != -ENOENT) 2990 if (PTR_ERR(set) != -ENOENT)
2898 return PTR_ERR(set); 2991 return PTR_ERR(set);
2899 set = NULL; 2992 } else {
2900 }
2901
2902 if (set != NULL) {
2903 if (nlh->nlmsg_flags & NLM_F_EXCL) 2993 if (nlh->nlmsg_flags & NLM_F_EXCL)
2904 return -EEXIST; 2994 return -EEXIST;
2905 if (nlh->nlmsg_flags & NLM_F_REPLACE) 2995 if (nlh->nlmsg_flags & NLM_F_REPLACE)
@@ -2943,6 +3033,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
2943 set->ktype = ktype; 3033 set->ktype = ktype;
2944 set->klen = desc.klen; 3034 set->klen = desc.klen;
2945 set->dtype = dtype; 3035 set->dtype = dtype;
3036 set->objtype = objtype;
2946 set->dlen = desc.dlen; 3037 set->dlen = desc.dlen;
2947 set->flags = flags; 3038 set->flags = flags;
2948 set->size = desc.size; 3039 set->size = desc.size;
@@ -3016,9 +3107,9 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
3016} 3107}
3017 3108
3018static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, 3109static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
3019 const struct nft_set *set, 3110 struct nft_set *set,
3020 const struct nft_set_iter *iter, 3111 const struct nft_set_iter *iter,
3021 const struct nft_set_elem *elem) 3112 struct nft_set_elem *elem)
3022{ 3113{
3023 const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); 3114 const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
3024 enum nft_registers dreg; 3115 enum nft_registers dreg;
@@ -3064,6 +3155,7 @@ bind:
3064 list_add_tail_rcu(&binding->list, &set->bindings); 3155 list_add_tail_rcu(&binding->list, &set->bindings);
3065 return 0; 3156 return 0;
3066} 3157}
3158EXPORT_SYMBOL_GPL(nf_tables_bind_set);
3067 3159
3068void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, 3160void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
3069 struct nft_set_binding *binding) 3161 struct nft_set_binding *binding)
@@ -3074,6 +3166,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
3074 nft_is_active(ctx->net, set)) 3166 nft_is_active(ctx->net, set))
3075 nf_tables_set_destroy(ctx, set); 3167 nf_tables_set_destroy(ctx, set);
3076} 3168}
3169EXPORT_SYMBOL_GPL(nf_tables_unbind_set);
3077 3170
3078const struct nft_set_ext_type nft_set_ext_types[] = { 3171const struct nft_set_ext_type nft_set_ext_types[] = {
3079 [NFT_SET_EXT_KEY] = { 3172 [NFT_SET_EXT_KEY] = {
@@ -3085,6 +3178,10 @@ const struct nft_set_ext_type nft_set_ext_types[] = {
3085 [NFT_SET_EXT_EXPR] = { 3178 [NFT_SET_EXT_EXPR] = {
3086 .align = __alignof__(struct nft_expr), 3179 .align = __alignof__(struct nft_expr),
3087 }, 3180 },
3181 [NFT_SET_EXT_OBJREF] = {
3182 .len = sizeof(struct nft_object *),
3183 .align = __alignof__(struct nft_object *),
3184 },
3088 [NFT_SET_EXT_FLAGS] = { 3185 [NFT_SET_EXT_FLAGS] = {
3089 .len = sizeof(u8), 3186 .len = sizeof(u8),
3090 .align = __alignof__(u8), 3187 .align = __alignof__(u8),
@@ -3118,8 +3215,10 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
3118}; 3215};
3119 3216
3120static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { 3217static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = {
3121 [NFTA_SET_ELEM_LIST_TABLE] = { .type = NLA_STRING }, 3218 [NFTA_SET_ELEM_LIST_TABLE] = { .type = NLA_STRING,
3122 [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING }, 3219 .len = NFT_TABLE_MAXNAMELEN - 1 },
3220 [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING,
3221 .len = NFT_SET_MAXNAMELEN - 1 },
3123 [NFTA_SET_ELEM_LIST_ELEMENTS] = { .type = NLA_NESTED }, 3222 [NFTA_SET_ELEM_LIST_ELEMENTS] = { .type = NLA_NESTED },
3124 [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 }, 3223 [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 },
3125}; 3224};
@@ -3173,6 +3272,11 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
3173 nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, nft_set_ext_expr(ext)) < 0) 3272 nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, nft_set_ext_expr(ext)) < 0)
3174 goto nla_put_failure; 3273 goto nla_put_failure;
3175 3274
3275 if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) &&
3276 nla_put_string(skb, NFTA_SET_ELEM_OBJREF,
3277 (*nft_set_ext_obj(ext))->name) < 0)
3278 goto nla_put_failure;
3279
3176 if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && 3280 if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
3177 nla_put_be32(skb, NFTA_SET_ELEM_FLAGS, 3281 nla_put_be32(skb, NFTA_SET_ELEM_FLAGS,
3178 htonl(*nft_set_ext_flags(ext)))) 3282 htonl(*nft_set_ext_flags(ext))))
@@ -3224,9 +3328,9 @@ struct nft_set_dump_args {
3224}; 3328};
3225 3329
3226static int nf_tables_dump_setelem(const struct nft_ctx *ctx, 3330static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
3227 const struct nft_set *set, 3331 struct nft_set *set,
3228 const struct nft_set_iter *iter, 3332 const struct nft_set_iter *iter,
3229 const struct nft_set_elem *elem) 3333 struct nft_set_elem *elem)
3230{ 3334{
3231 struct nft_set_dump_args *args; 3335 struct nft_set_dump_args *args;
3232 3336
@@ -3238,7 +3342,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
3238{ 3342{
3239 struct net *net = sock_net(skb->sk); 3343 struct net *net = sock_net(skb->sk);
3240 u8 genmask = nft_genmask_cur(net); 3344 u8 genmask = nft_genmask_cur(net);
3241 const struct nft_set *set; 3345 struct nft_set *set;
3242 struct nft_set_dump_args args; 3346 struct nft_set_dump_args args;
3243 struct nft_ctx ctx; 3347 struct nft_ctx ctx;
3244 struct nlattr *nla[NFTA_SET_ELEM_LIST_MAX + 1]; 3348 struct nlattr *nla[NFTA_SET_ELEM_LIST_MAX + 1];
@@ -3383,10 +3487,10 @@ nla_put_failure:
3383 return -1; 3487 return -1;
3384} 3488}
3385 3489
3386static int nf_tables_setelem_notify(const struct nft_ctx *ctx, 3490static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
3387 const struct nft_set *set, 3491 const struct nft_set *set,
3388 const struct nft_set_elem *elem, 3492 const struct nft_set_elem *elem,
3389 int event, u16 flags) 3493 int event, u16 flags)
3390{ 3494{
3391 struct net *net = ctx->net; 3495 struct net *net = ctx->net;
3392 u32 portid = ctx->portid; 3496 u32 portid = ctx->portid;
@@ -3394,9 +3498,8 @@ static int nf_tables_setelem_notify(const struct nft_ctx *ctx,
3394 int err; 3498 int err;
3395 3499
3396 if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) 3500 if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
3397 return 0; 3501 return;
3398 3502
3399 err = -ENOBUFS;
3400 skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); 3503 skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
3401 if (skb == NULL) 3504 if (skb == NULL)
3402 goto err; 3505 goto err;
@@ -3408,12 +3511,11 @@ static int nf_tables_setelem_notify(const struct nft_ctx *ctx,
3408 goto err; 3511 goto err;
3409 } 3512 }
3410 3513
3411 err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report, 3514 nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report,
3412 GFP_KERNEL); 3515 GFP_KERNEL);
3516 return;
3413err: 3517err:
3414 if (err < 0) 3518 nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
3415 nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err);
3416 return err;
3417} 3519}
3418 3520
3419static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, 3521static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx,
@@ -3467,7 +3569,8 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
3467 nft_data_uninit(nft_set_ext_data(ext), set->dtype); 3569 nft_data_uninit(nft_set_ext_data(ext), set->dtype);
3468 if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) 3570 if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
3469 nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext)); 3571 nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext));
3470 3572 if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
3573 (*nft_set_ext_obj(ext))->use--;
3471 kfree(elem); 3574 kfree(elem);
3472} 3575}
3473EXPORT_SYMBOL_GPL(nft_set_elem_destroy); 3576EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
@@ -3492,11 +3595,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
3492 const struct nlattr *attr, u32 nlmsg_flags) 3595 const struct nlattr *attr, u32 nlmsg_flags)
3493{ 3596{
3494 struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; 3597 struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
3598 u8 genmask = nft_genmask_next(ctx->net);
3495 struct nft_data_desc d1, d2; 3599 struct nft_data_desc d1, d2;
3496 struct nft_set_ext_tmpl tmpl; 3600 struct nft_set_ext_tmpl tmpl;
3497 struct nft_set_ext *ext, *ext2; 3601 struct nft_set_ext *ext, *ext2;
3498 struct nft_set_elem elem; 3602 struct nft_set_elem elem;
3499 struct nft_set_binding *binding; 3603 struct nft_set_binding *binding;
3604 struct nft_object *obj = NULL;
3500 struct nft_userdata *udata; 3605 struct nft_userdata *udata;
3501 struct nft_data data; 3606 struct nft_data data;
3502 enum nft_registers dreg; 3607 enum nft_registers dreg;
@@ -3559,6 +3664,20 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
3559 nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); 3664 nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
3560 } 3665 }
3561 3666
3667 if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
3668 if (!(set->flags & NFT_SET_OBJECT)) {
3669 err = -EINVAL;
3670 goto err2;
3671 }
3672 obj = nf_tables_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF],
3673 set->objtype, genmask);
3674 if (IS_ERR(obj)) {
3675 err = PTR_ERR(obj);
3676 goto err2;
3677 }
3678 nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
3679 }
3680
3562 if (nla[NFTA_SET_ELEM_DATA] != NULL) { 3681 if (nla[NFTA_SET_ELEM_DATA] != NULL) {
3563 err = nft_data_init(ctx, &data, sizeof(data), &d2, 3682 err = nft_data_init(ctx, &data, sizeof(data), &d2,
3564 nla[NFTA_SET_ELEM_DATA]); 3683 nla[NFTA_SET_ELEM_DATA]);
@@ -3617,6 +3736,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
3617 udata->len = ulen - 1; 3736 udata->len = ulen - 1;
3618 nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen); 3737 nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen);
3619 } 3738 }
3739 if (obj) {
3740 *nft_set_ext_obj(ext) = obj;
3741 obj->use++;
3742 }
3620 3743
3621 trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); 3744 trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
3622 if (trans == NULL) 3745 if (trans == NULL)
@@ -3626,10 +3749,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
3626 err = set->ops->insert(ctx->net, set, &elem, &ext2); 3749 err = set->ops->insert(ctx->net, set, &elem, &ext2);
3627 if (err) { 3750 if (err) {
3628 if (err == -EEXIST) { 3751 if (err == -EEXIST) {
3629 if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && 3752 if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
3630 nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && 3753 nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) &&
3631 memcmp(nft_set_ext_data(ext), 3754 memcmp(nft_set_ext_data(ext),
3632 nft_set_ext_data(ext2), set->dlen) != 0) 3755 nft_set_ext_data(ext2), set->dlen) != 0) ||
3756 (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) &&
3757 nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF) &&
3758 *nft_set_ext_obj(ext) != *nft_set_ext_obj(ext2)))
3633 err = -EBUSY; 3759 err = -EBUSY;
3634 else if (!(nlmsg_flags & NLM_F_EXCL)) 3760 else if (!(nlmsg_flags & NLM_F_EXCL))
3635 err = 0; 3761 err = 0;
@@ -3637,10 +3763,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
3637 goto err5; 3763 goto err5;
3638 } 3764 }
3639 3765
3766 if (set->size &&
3767 !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) {
3768 err = -ENFILE;
3769 goto err6;
3770 }
3771
3640 nft_trans_elem(trans) = elem; 3772 nft_trans_elem(trans) = elem;
3641 list_add_tail(&trans->list, &ctx->net->nft.commit_list); 3773 list_add_tail(&trans->list, &ctx->net->nft.commit_list);
3642 return 0; 3774 return 0;
3643 3775
3776err6:
3777 set->ops->remove(ctx->net, set, &elem);
3644err5: 3778err5:
3645 kfree(trans); 3779 kfree(trans);
3646err4: 3780err4:
@@ -3687,15 +3821,9 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
3687 return -EBUSY; 3821 return -EBUSY;
3688 3822
3689 nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { 3823 nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
3690 if (set->size &&
3691 !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact))
3692 return -ENFILE;
3693
3694 err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags); 3824 err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags);
3695 if (err < 0) { 3825 if (err < 0)
3696 atomic_dec(&set->nelems);
3697 break; 3826 break;
3698 }
3699 } 3827 }
3700 return err; 3828 return err;
3701} 3829}
@@ -3779,6 +3907,35 @@ err1:
3779 return err; 3907 return err;
3780} 3908}
3781 3909
3910static int nft_flush_set(const struct nft_ctx *ctx,
3911 struct nft_set *set,
3912 const struct nft_set_iter *iter,
3913 struct nft_set_elem *elem)
3914{
3915 struct nft_trans *trans;
3916 int err;
3917
3918 trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM,
3919 sizeof(struct nft_trans_elem), GFP_ATOMIC);
3920 if (!trans)
3921 return -ENOMEM;
3922
3923 if (!set->ops->flush(ctx->net, set, elem->priv)) {
3924 err = -ENOENT;
3925 goto err1;
3926 }
3927 set->ndeact++;
3928
3929 nft_trans_elem_set(trans) = set;
3930 nft_trans_elem(trans) = *elem;
3931 list_add_tail(&trans->list, &ctx->net->nft.commit_list);
3932
3933 return 0;
3934err1:
3935 kfree(trans);
3936 return err;
3937}
3938
3782static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, 3939static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
3783 struct sk_buff *skb, const struct nlmsghdr *nlh, 3940 struct sk_buff *skb, const struct nlmsghdr *nlh,
3784 const struct nlattr * const nla[]) 3941 const struct nlattr * const nla[])
@@ -3789,9 +3946,6 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
3789 struct nft_ctx ctx; 3946 struct nft_ctx ctx;
3790 int rem, err = 0; 3947 int rem, err = 0;
3791 3948
3792 if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
3793 return -EINVAL;
3794
3795 err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); 3949 err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask);
3796 if (err < 0) 3950 if (err < 0)
3797 return err; 3951 return err;
@@ -3803,6 +3957,16 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
3803 if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) 3957 if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
3804 return -EBUSY; 3958 return -EBUSY;
3805 3959
3960 if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) {
3961 struct nft_set_iter iter = {
3962 .genmask = genmask,
3963 .fn = nft_flush_set,
3964 };
3965 set->ops->walk(&ctx, set, &iter);
3966
3967 return iter.err;
3968 }
3969
3806 nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { 3970 nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
3807 err = nft_del_setelem(&ctx, set, attr); 3971 err = nft_del_setelem(&ctx, set, attr);
3808 if (err < 0) 3972 if (err < 0)
@@ -3838,6 +4002,496 @@ struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
3838} 4002}
3839EXPORT_SYMBOL_GPL(nft_set_gc_batch_alloc); 4003EXPORT_SYMBOL_GPL(nft_set_gc_batch_alloc);
3840 4004
4005/*
4006 * Stateful objects
4007 */
4008
4009/**
4010 * nft_register_obj - register nf_tables stateful object type
4011 * @obj_type: object type
4012 *
4013 * Registers the object type for use with nf_tables. Returns zero on
4014 * success or a negative errno code otherwise.
4015 */
4016int nft_register_obj(struct nft_object_type *obj_type)
4017{
4018 if (obj_type->type == NFT_OBJECT_UNSPEC)
4019 return -EINVAL;
4020
4021 nfnl_lock(NFNL_SUBSYS_NFTABLES);
4022 list_add_rcu(&obj_type->list, &nf_tables_objects);
4023 nfnl_unlock(NFNL_SUBSYS_NFTABLES);
4024 return 0;
4025}
4026EXPORT_SYMBOL_GPL(nft_register_obj);
4027
4028/**
4029 * nft_unregister_obj - unregister nf_tables object type
4030 * @obj_type: object type
4031 *
4032 * Unregisters the object type for use with nf_tables.
4033 */
4034void nft_unregister_obj(struct nft_object_type *obj_type)
4035{
4036 nfnl_lock(NFNL_SUBSYS_NFTABLES);
4037 list_del_rcu(&obj_type->list);
4038 nfnl_unlock(NFNL_SUBSYS_NFTABLES);
4039}
4040EXPORT_SYMBOL_GPL(nft_unregister_obj);
4041
4042struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
4043 const struct nlattr *nla,
4044 u32 objtype, u8 genmask)
4045{
4046 struct nft_object *obj;
4047
4048 list_for_each_entry(obj, &table->objects, list) {
4049 if (!nla_strcmp(nla, obj->name) &&
4050 objtype == obj->type->type &&
4051 nft_active_genmask(obj, genmask))
4052 return obj;
4053 }
4054 return ERR_PTR(-ENOENT);
4055}
4056EXPORT_SYMBOL_GPL(nf_tables_obj_lookup);
4057
4058static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = {
4059 [NFTA_OBJ_TABLE] = { .type = NLA_STRING,
4060 .len = NFT_TABLE_MAXNAMELEN - 1 },
4061 [NFTA_OBJ_NAME] = { .type = NLA_STRING,
4062 .len = NFT_OBJ_MAXNAMELEN - 1 },
4063 [NFTA_OBJ_TYPE] = { .type = NLA_U32 },
4064 [NFTA_OBJ_DATA] = { .type = NLA_NESTED },
4065};
4066
4067static struct nft_object *nft_obj_init(const struct nft_object_type *type,
4068 const struct nlattr *attr)
4069{
4070 struct nlattr *tb[type->maxattr + 1];
4071 struct nft_object *obj;
4072 int err;
4073
4074 if (attr) {
4075 err = nla_parse_nested(tb, type->maxattr, attr, type->policy);
4076 if (err < 0)
4077 goto err1;
4078 } else {
4079 memset(tb, 0, sizeof(tb[0]) * (type->maxattr + 1));
4080 }
4081
4082 err = -ENOMEM;
4083 obj = kzalloc(sizeof(struct nft_object) + type->size, GFP_KERNEL);
4084 if (obj == NULL)
4085 goto err1;
4086
4087 err = type->init((const struct nlattr * const *)tb, obj);
4088 if (err < 0)
4089 goto err2;
4090
4091 obj->type = type;
4092 return obj;
4093err2:
4094 kfree(obj);
4095err1:
4096 return ERR_PTR(err);
4097}
4098
4099static int nft_object_dump(struct sk_buff *skb, unsigned int attr,
4100 struct nft_object *obj, bool reset)
4101{
4102 struct nlattr *nest;
4103
4104 nest = nla_nest_start(skb, attr);
4105 if (!nest)
4106 goto nla_put_failure;
4107 if (obj->type->dump(skb, obj, reset) < 0)
4108 goto nla_put_failure;
4109 nla_nest_end(skb, nest);
4110 return 0;
4111
4112nla_put_failure:
4113 return -1;
4114}
4115
4116static const struct nft_object_type *__nft_obj_type_get(u32 objtype)
4117{
4118 const struct nft_object_type *type;
4119
4120 list_for_each_entry(type, &nf_tables_objects, list) {
4121 if (objtype == type->type)
4122 return type;
4123 }
4124 return NULL;
4125}
4126
4127static const struct nft_object_type *nft_obj_type_get(u32 objtype)
4128{
4129 const struct nft_object_type *type;
4130
4131 type = __nft_obj_type_get(objtype);
4132 if (type != NULL && try_module_get(type->owner))
4133 return type;
4134
4135#ifdef CONFIG_MODULES
4136 if (type == NULL) {
4137 nfnl_unlock(NFNL_SUBSYS_NFTABLES);
4138 request_module("nft-obj-%u", objtype);
4139 nfnl_lock(NFNL_SUBSYS_NFTABLES);
4140 if (__nft_obj_type_get(objtype))
4141 return ERR_PTR(-EAGAIN);
4142 }
4143#endif
4144 return ERR_PTR(-ENOENT);
4145}
4146
4147static int nf_tables_newobj(struct net *net, struct sock *nlsk,
4148 struct sk_buff *skb, const struct nlmsghdr *nlh,
4149 const struct nlattr * const nla[])
4150{
4151 const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
4152 const struct nft_object_type *type;
4153 u8 genmask = nft_genmask_next(net);
4154 int family = nfmsg->nfgen_family;
4155 struct nft_af_info *afi;
4156 struct nft_table *table;
4157 struct nft_object *obj;
4158 struct nft_ctx ctx;
4159 u32 objtype;
4160 int err;
4161
4162 if (!nla[NFTA_OBJ_TYPE] ||
4163 !nla[NFTA_OBJ_NAME] ||
4164 !nla[NFTA_OBJ_DATA])
4165 return -EINVAL;
4166
4167 afi = nf_tables_afinfo_lookup(net, family, true);
4168 if (IS_ERR(afi))
4169 return PTR_ERR(afi);
4170
4171 table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask);
4172 if (IS_ERR(table))
4173 return PTR_ERR(table);
4174
4175 objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
4176 obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
4177 if (IS_ERR(obj)) {
4178 err = PTR_ERR(obj);
4179 if (err != -ENOENT)
4180 return err;
4181
4182 } else {
4183 if (nlh->nlmsg_flags & NLM_F_EXCL)
4184 return -EEXIST;
4185
4186 return 0;
4187 }
4188
4189 nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
4190
4191 type = nft_obj_type_get(objtype);
4192 if (IS_ERR(type))
4193 return PTR_ERR(type);
4194
4195 obj = nft_obj_init(type, nla[NFTA_OBJ_DATA]);
4196 if (IS_ERR(obj)) {
4197 err = PTR_ERR(obj);
4198 goto err1;
4199 }
4200 obj->table = table;
4201 nla_strlcpy(obj->name, nla[NFTA_OBJ_NAME], NFT_OBJ_MAXNAMELEN);
4202
4203 err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj);
4204 if (err < 0)
4205 goto err2;
4206
4207 list_add_tail_rcu(&obj->list, &table->objects);
4208 table->use++;
4209 return 0;
4210err2:
4211 if (obj->type->destroy)
4212 obj->type->destroy(obj);
4213 kfree(obj);
4214err1:
4215 module_put(type->owner);
4216 return err;
4217}
4218
4219static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,
4220 u32 portid, u32 seq, int event, u32 flags,
4221 int family, const struct nft_table *table,
4222 struct nft_object *obj, bool reset)
4223{
4224 struct nfgenmsg *nfmsg;
4225 struct nlmsghdr *nlh;
4226
4227 event |= NFNL_SUBSYS_NFTABLES << 8;
4228 nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
4229 if (nlh == NULL)
4230 goto nla_put_failure;
4231
4232 nfmsg = nlmsg_data(nlh);
4233 nfmsg->nfgen_family = family;
4234 nfmsg->version = NFNETLINK_V0;
4235 nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
4236
4237 if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) ||
4238 nla_put_string(skb, NFTA_OBJ_NAME, obj->name) ||
4239 nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->type->type)) ||
4240 nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) ||
4241 nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset))
4242 goto nla_put_failure;
4243
4244 nlmsg_end(skb, nlh);
4245 return 0;
4246
4247nla_put_failure:
4248 nlmsg_trim(skb, nlh);
4249 return -1;
4250}
4251
4252struct nft_obj_filter {
4253 char table[NFT_OBJ_MAXNAMELEN];
4254 u32 type;
4255};
4256
4257static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
4258{
4259 const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
4260 const struct nft_af_info *afi;
4261 const struct nft_table *table;
4262 unsigned int idx = 0, s_idx = cb->args[0];
4263 struct nft_obj_filter *filter = cb->data;
4264 struct net *net = sock_net(skb->sk);
4265 int family = nfmsg->nfgen_family;
4266 struct nft_object *obj;
4267 bool reset = false;
4268
4269 if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
4270 reset = true;
4271
4272 rcu_read_lock();
4273 cb->seq = net->nft.base_seq;
4274
4275 list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
4276 if (family != NFPROTO_UNSPEC && family != afi->family)
4277 continue;
4278
4279 list_for_each_entry_rcu(table, &afi->tables, list) {
4280 list_for_each_entry_rcu(obj, &table->objects, list) {
4281 if (!nft_is_active(net, obj))
4282 goto cont;
4283 if (idx < s_idx)
4284 goto cont;
4285 if (idx > s_idx)
4286 memset(&cb->args[1], 0,
4287 sizeof(cb->args) - sizeof(cb->args[0]));
4288 if (filter && filter->table[0] &&
4289 strcmp(filter->table, table->name))
4290 goto cont;
4291 if (filter &&
4292 filter->type != NFT_OBJECT_UNSPEC &&
4293 obj->type->type != filter->type)
4294 goto cont;
4295
4296 if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid,
4297 cb->nlh->nlmsg_seq,
4298 NFT_MSG_NEWOBJ,
4299 NLM_F_MULTI | NLM_F_APPEND,
4300 afi->family, table, obj, reset) < 0)
4301 goto done;
4302
4303 nl_dump_check_consistent(cb, nlmsg_hdr(skb));
4304cont:
4305 idx++;
4306 }
4307 }
4308 }
4309done:
4310 rcu_read_unlock();
4311
4312 cb->args[0] = idx;
4313 return skb->len;
4314}
4315
4316static int nf_tables_dump_obj_done(struct netlink_callback *cb)
4317{
4318 kfree(cb->data);
4319
4320 return 0;
4321}
4322
4323static struct nft_obj_filter *
4324nft_obj_filter_alloc(const struct nlattr * const nla[])
4325{
4326 struct nft_obj_filter *filter;
4327
4328 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
4329 if (!filter)
4330 return ERR_PTR(-ENOMEM);
4331
4332 if (nla[NFTA_OBJ_TABLE])
4333 nla_strlcpy(filter->table, nla[NFTA_OBJ_TABLE],
4334 NFT_TABLE_MAXNAMELEN);
4335 if (nla[NFTA_OBJ_TYPE])
4336 filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
4337
4338 return filter;
4339}
4340
4341static int nf_tables_getobj(struct net *net, struct sock *nlsk,
4342 struct sk_buff *skb, const struct nlmsghdr *nlh,
4343 const struct nlattr * const nla[])
4344{
4345 const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
4346 u8 genmask = nft_genmask_cur(net);
4347 int family = nfmsg->nfgen_family;
4348 const struct nft_af_info *afi;
4349 const struct nft_table *table;
4350 struct nft_object *obj;
4351 struct sk_buff *skb2;
4352 bool reset = false;
4353 u32 objtype;
4354 int err;
4355
4356 if (nlh->nlmsg_flags & NLM_F_DUMP) {
4357 struct netlink_dump_control c = {
4358 .dump = nf_tables_dump_obj,
4359 .done = nf_tables_dump_obj_done,
4360 };
4361
4362 if (nla[NFTA_OBJ_TABLE] ||
4363 nla[NFTA_OBJ_TYPE]) {
4364 struct nft_obj_filter *filter;
4365
4366 filter = nft_obj_filter_alloc(nla);
4367 if (IS_ERR(filter))
4368 return -ENOMEM;
4369
4370 c.data = filter;
4371 }
4372 return netlink_dump_start(nlsk, skb, nlh, &c);
4373 }
4374
4375 if (!nla[NFTA_OBJ_NAME] ||
4376 !nla[NFTA_OBJ_TYPE])
4377 return -EINVAL;
4378
4379 afi = nf_tables_afinfo_lookup(net, family, false);
4380 if (IS_ERR(afi))
4381 return PTR_ERR(afi);
4382
4383 table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask);
4384 if (IS_ERR(table))
4385 return PTR_ERR(table);
4386
4387 objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
4388 obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
4389 if (IS_ERR(obj))
4390 return PTR_ERR(obj);
4391
4392 skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4393 if (!skb2)
4394 return -ENOMEM;
4395
4396 if (NFNL_MSG_TYPE(nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
4397 reset = true;
4398
4399 err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid,
4400 nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
4401 family, table, obj, reset);
4402 if (err < 0)
4403 goto err;
4404
4405 return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
4406err:
4407 kfree_skb(skb2);
4408 return err;
4409
4410 return 0;
4411}
4412
4413static void nft_obj_destroy(struct nft_object *obj)
4414{
4415 if (obj->type->destroy)
4416 obj->type->destroy(obj);
4417
4418 module_put(obj->type->owner);
4419 kfree(obj);
4420}
4421
4422static int nf_tables_delobj(struct net *net, struct sock *nlsk,
4423 struct sk_buff *skb, const struct nlmsghdr *nlh,
4424 const struct nlattr * const nla[])
4425{
4426 const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
4427 u8 genmask = nft_genmask_next(net);
4428 int family = nfmsg->nfgen_family;
4429 struct nft_af_info *afi;
4430 struct nft_table *table;
4431 struct nft_object *obj;
4432 struct nft_ctx ctx;
4433 u32 objtype;
4434
4435 if (!nla[NFTA_OBJ_TYPE] ||
4436 !nla[NFTA_OBJ_NAME])
4437 return -EINVAL;
4438
4439 afi = nf_tables_afinfo_lookup(net, family, true);
4440 if (IS_ERR(afi))
4441 return PTR_ERR(afi);
4442
4443 table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask);
4444 if (IS_ERR(table))
4445 return PTR_ERR(table);
4446
4447 objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
4448 obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
4449 if (IS_ERR(obj))
4450 return PTR_ERR(obj);
4451 if (obj->use > 0)
4452 return -EBUSY;
4453
4454 nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
4455
4456 return nft_delobj(&ctx, obj);
4457}
4458
4459void nft_obj_notify(struct net *net, struct nft_table *table,
4460 struct nft_object *obj, u32 portid, u32 seq, int event,
4461 int family, int report, gfp_t gfp)
4462{
4463 struct sk_buff *skb;
4464 int err;
4465
4466 if (!report &&
4467 !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
4468 return;
4469
4470 skb = nlmsg_new(NLMSG_GOODSIZE, gfp);
4471 if (skb == NULL)
4472 goto err;
4473
4474 err = nf_tables_fill_obj_info(skb, net, portid, seq, event, 0, family,
4475 table, obj, false);
4476 if (err < 0) {
4477 kfree_skb(skb);
4478 goto err;
4479 }
4480
4481 nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp);
4482 return;
4483err:
4484 nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
4485}
4486EXPORT_SYMBOL_GPL(nft_obj_notify);
4487
4488static void nf_tables_obj_notify(const struct nft_ctx *ctx,
4489 struct nft_object *obj, int event)
4490{
4491 nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event,
4492 ctx->afi->family, ctx->report, GFP_KERNEL);
4493}
4494
3841static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, 4495static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
3842 u32 portid, u32 seq) 4496 u32 portid, u32 seq)
3843{ 4497{
@@ -3865,7 +4519,8 @@ nla_put_failure:
3865 return -EMSGSIZE; 4519 return -EMSGSIZE;
3866} 4520}
3867 4521
3868static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event) 4522static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb,
4523 int event)
3869{ 4524{
3870 struct nlmsghdr *nlh = nlmsg_hdr(skb); 4525 struct nlmsghdr *nlh = nlmsg_hdr(skb);
3871 struct sk_buff *skb2; 4526 struct sk_buff *skb2;
@@ -3873,9 +4528,8 @@ static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event)
3873 4528
3874 	if (!nlmsg_report(nlh) && 4529 	if (!nlmsg_report(nlh) &&
3875 !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) 4530 !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
3876 return 0; 4531 return;
3877 4532
3878 err = -ENOBUFS;
3879 skb2 = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); 4533 skb2 = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
3880 if (skb2 == NULL) 4534 if (skb2 == NULL)
3881 goto err; 4535 goto err;
@@ -3887,14 +4541,12 @@ static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event)
3887 goto err; 4541 goto err;
3888 } 4542 }
3889 4543
3890 err = nfnetlink_send(skb2, net, NETLINK_CB(skb).portid, 4544 nfnetlink_send(skb2, net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES,
3891 NFNLGRP_NFTABLES, nlmsg_report(nlh), GFP_KERNEL); 4545 nlmsg_report(nlh), GFP_KERNEL);
4546 return;
3892err: 4547err:
3893 if (err < 0) { 4548 nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES,
3894 nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES, 4549 -ENOBUFS);
3895 err);
3896 }
3897 return err;
3898} 4550}
3899 4551
3900static int nf_tables_getgen(struct net *net, struct sock *nlsk, 4552static int nf_tables_getgen(struct net *net, struct sock *nlsk,
@@ -3998,6 +4650,26 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
3998 [NFT_MSG_GETGEN] = { 4650 [NFT_MSG_GETGEN] = {
3999 .call = nf_tables_getgen, 4651 .call = nf_tables_getgen,
4000 }, 4652 },
4653 [NFT_MSG_NEWOBJ] = {
4654 .call_batch = nf_tables_newobj,
4655 .attr_count = NFTA_OBJ_MAX,
4656 .policy = nft_obj_policy,
4657 },
4658 [NFT_MSG_GETOBJ] = {
4659 .call = nf_tables_getobj,
4660 .attr_count = NFTA_OBJ_MAX,
4661 .policy = nft_obj_policy,
4662 },
4663 [NFT_MSG_DELOBJ] = {
4664 .call_batch = nf_tables_delobj,
4665 .attr_count = NFTA_OBJ_MAX,
4666 .policy = nft_obj_policy,
4667 },
4668 [NFT_MSG_GETOBJ_RESET] = {
4669 .call = nf_tables_getobj,
4670 .attr_count = NFTA_OBJ_MAX,
4671 .policy = nft_obj_policy,
4672 },
4001}; 4673};
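The four new entries above wire the object messages into nfnetlink dispatch. For reference, the callback structure they fill looks roughly like the sketch below (reconstructed from the nfnetlink.h of this era, so treat it as approximate): GETOBJ and GETOBJ_RESET use .call and are served immediately under the subsystem mutex, while NEWOBJ and DELOBJ use .call_batch and therefore only take effect when the enclosing batch is committed.

/* Approximate shape of a dispatch entry (include/linux/netfilter/nfnetlink.h). */
struct nfnl_callback {
	int (*call)(struct net *net, struct sock *nl, struct sk_buff *skb,
		    const struct nlmsghdr *nlh,
		    const struct nlattr * const cda[]);
	int (*call_rcu)(struct net *net, struct sock *nl, struct sk_buff *skb,
			const struct nlmsghdr *nlh,
			const struct nlattr * const cda[]);
	int (*call_batch)(struct net *net, struct sock *nl, struct sk_buff *skb,
			  const struct nlmsghdr *nlh,
			  const struct nlattr * const cda[]);
	const struct nla_policy *policy;	/* netlink attribute policy */
	const u_int16_t attr_count;		/* number of nlattr's */
};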
4002 4674
4003static void nft_chain_commit_update(struct nft_trans *trans) 4675static void nft_chain_commit_update(struct nft_trans *trans)
@@ -4040,6 +4712,9 @@ static void nf_tables_commit_release(struct nft_trans *trans)
4040 nft_set_elem_destroy(nft_trans_elem_set(trans), 4712 nft_set_elem_destroy(nft_trans_elem_set(trans),
4041 nft_trans_elem(trans).priv, true); 4713 nft_trans_elem(trans).priv, true);
4042 break; 4714 break;
4715 case NFT_MSG_DELOBJ:
4716 nft_obj_destroy(nft_trans_obj(trans));
4717 break;
4043 } 4718 }
4044 kfree(trans); 4719 kfree(trans);
4045} 4720}
@@ -4143,10 +4818,21 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
4143 nf_tables_setelem_notify(&trans->ctx, te->set, 4818 nf_tables_setelem_notify(&trans->ctx, te->set,
4144 &te->elem, 4819 &te->elem,
4145 NFT_MSG_DELSETELEM, 0); 4820 NFT_MSG_DELSETELEM, 0);
4146 te->set->ops->remove(te->set, &te->elem); 4821 te->set->ops->remove(net, te->set, &te->elem);
4147 atomic_dec(&te->set->nelems); 4822 atomic_dec(&te->set->nelems);
4148 te->set->ndeact--; 4823 te->set->ndeact--;
4149 break; 4824 break;
4825 case NFT_MSG_NEWOBJ:
4826 nft_clear(net, nft_trans_obj(trans));
4827 nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans),
4828 NFT_MSG_NEWOBJ);
4829 nft_trans_destroy(trans);
4830 break;
4831 case NFT_MSG_DELOBJ:
4832 list_del_rcu(&nft_trans_obj(trans)->list);
4833 nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans),
4834 NFT_MSG_DELOBJ);
4835 break;
4150 } 4836 }
4151 } 4837 }
4152 4838
@@ -4181,6 +4867,9 @@ static void nf_tables_abort_release(struct nft_trans *trans)
4181 nft_set_elem_destroy(nft_trans_elem_set(trans), 4867 nft_set_elem_destroy(nft_trans_elem_set(trans),
4182 nft_trans_elem(trans).priv, true); 4868 nft_trans_elem(trans).priv, true);
4183 break; 4869 break;
4870 case NFT_MSG_NEWOBJ:
4871 nft_obj_destroy(nft_trans_obj(trans));
4872 break;
4184 } 4873 }
4185 kfree(trans); 4874 kfree(trans);
4186} 4875}
@@ -4250,7 +4939,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
4250 case NFT_MSG_NEWSETELEM: 4939 case NFT_MSG_NEWSETELEM:
4251 te = (struct nft_trans_elem *)trans->data; 4940 te = (struct nft_trans_elem *)trans->data;
4252 4941
4253 te->set->ops->remove(te->set, &te->elem); 4942 te->set->ops->remove(net, te->set, &te->elem);
4254 atomic_dec(&te->set->nelems); 4943 atomic_dec(&te->set->nelems);
4255 break; 4944 break;
4256 case NFT_MSG_DELSETELEM: 4945 case NFT_MSG_DELSETELEM:
@@ -4261,6 +4950,15 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
4261 4950
4262 nft_trans_destroy(trans); 4951 nft_trans_destroy(trans);
4263 break; 4952 break;
4953 case NFT_MSG_NEWOBJ:
4954 trans->ctx.table->use--;
4955 list_del_rcu(&nft_trans_obj(trans)->list);
4956 break;
4957 case NFT_MSG_DELOBJ:
4958 trans->ctx.table->use++;
4959 nft_clear(trans->ctx.net, nft_trans_obj(trans));
4960 nft_trans_destroy(trans);
4961 break;
4264 } 4962 }
4265 } 4963 }
4266 4964
@@ -4275,6 +4973,11 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
4275 return 0; 4973 return 0;
4276} 4974}
4277 4975
4976static bool nf_tables_valid_genid(struct net *net, u32 genid)
4977{
4978 return net->nft.base_seq == genid;
4979}
4980
4278static const struct nfnetlink_subsystem nf_tables_subsys = { 4981static const struct nfnetlink_subsystem nf_tables_subsys = {
4279 .name = "nf_tables", 4982 .name = "nf_tables",
4280 .subsys_id = NFNL_SUBSYS_NFTABLES, 4983 .subsys_id = NFNL_SUBSYS_NFTABLES,
@@ -4282,6 +4985,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
4282 .cb = nf_tables_cb, 4985 .cb = nf_tables_cb,
4283 .commit = nf_tables_commit, 4986 .commit = nf_tables_commit,
4284 .abort = nf_tables_abort, 4987 .abort = nf_tables_abort,
4988 .valid_genid = nf_tables_valid_genid,
4285}; 4989};
4286 4990
4287int nft_chain_validate_dependency(const struct nft_chain *chain, 4991int nft_chain_validate_dependency(const struct nft_chain *chain,
@@ -4329,9 +5033,9 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
4329 const struct nft_chain *chain); 5033 const struct nft_chain *chain);
4330 5034
4331static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, 5035static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx,
4332 const struct nft_set *set, 5036 struct nft_set *set,
4333 const struct nft_set_iter *iter, 5037 const struct nft_set_iter *iter,
4334 const struct nft_set_elem *elem) 5038 struct nft_set_elem *elem)
4335{ 5039{
4336 const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); 5040 const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
4337 const struct nft_data *data; 5041 const struct nft_data *data;
@@ -4355,7 +5059,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
4355{ 5059{
4356 const struct nft_rule *rule; 5060 const struct nft_rule *rule;
4357 const struct nft_expr *expr, *last; 5061 const struct nft_expr *expr, *last;
4358 const struct nft_set *set; 5062 struct nft_set *set;
4359 struct nft_set_binding *binding; 5063 struct nft_set_binding *binding;
4360 struct nft_set_iter iter; 5064 struct nft_set_iter iter;
4361 5065
@@ -4807,6 +5511,7 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
4807{ 5511{
4808 struct nft_table *table, *nt; 5512 struct nft_table *table, *nt;
4809 struct nft_chain *chain, *nc; 5513 struct nft_chain *chain, *nc;
5514 struct nft_object *obj, *ne;
4810 struct nft_rule *rule, *nr; 5515 struct nft_rule *rule, *nr;
4811 struct nft_set *set, *ns; 5516 struct nft_set *set, *ns;
4812 struct nft_ctx ctx = { 5517 struct nft_ctx ctx = {
@@ -4833,6 +5538,11 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
4833 table->use--; 5538 table->use--;
4834 nft_set_destroy(set); 5539 nft_set_destroy(set);
4835 } 5540 }
5541 list_for_each_entry_safe(obj, ne, &table->objects, list) {
5542 list_del(&obj->list);
5543 table->use--;
5544 nft_obj_destroy(obj);
5545 }
4836 list_for_each_entry_safe(chain, nc, &table->chains, list) { 5546 list_for_each_entry_safe(chain, nc, &table->chains, list) {
4837 list_del(&chain->list); 5547 list_del(&chain->list);
4838 table->use--; 5548 table->use--;
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 0dd5c695482f..65dbeadcb118 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -53,10 +53,10 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info,
53 53
54 nft_trace_notify(info); 54 nft_trace_notify(info);
55 55
56 nf_log_trace(pkt->net, pkt->pf, pkt->hook, pkt->skb, pkt->in, 56 nf_log_trace(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb,
57 pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", 57 nft_in(pkt), nft_out(pkt), &trace_loginfo,
58 chain->table->name, chain->name, comments[type], 58 "TRACE: %s:%s:%s:%u ",
59 rulenum); 59 chain->table->name, chain->name, comments[type], rulenum);
60} 60}
61 61
62static inline void nft_trace_packet(struct nft_traceinfo *info, 62static inline void nft_trace_packet(struct nft_traceinfo *info,
@@ -124,7 +124,7 @@ unsigned int
124nft_do_chain(struct nft_pktinfo *pkt, void *priv) 124nft_do_chain(struct nft_pktinfo *pkt, void *priv)
125{ 125{
126 const struct nft_chain *chain = priv, *basechain = chain; 126 const struct nft_chain *chain = priv, *basechain = chain;
127 const struct net *net = pkt->net; 127 const struct net *net = nft_net(pkt);
128 const struct nft_rule *rule; 128 const struct nft_rule *rule;
129 const struct nft_expr *expr, *last; 129 const struct nft_expr *expr, *last;
130 struct nft_regs regs; 130 struct nft_regs regs;
@@ -178,6 +178,7 @@ next_rule:
178 case NF_ACCEPT: 178 case NF_ACCEPT:
179 case NF_DROP: 179 case NF_DROP:
180 case NF_QUEUE: 180 case NF_QUEUE:
181 case NF_STOLEN:
181 nft_trace_packet(&info, chain, rule, 182 nft_trace_packet(&info, chain, rule,
182 rulenum, NFT_TRACETYPE_RULE); 183 rulenum, NFT_TRACETYPE_RULE);
183 return regs.verdict.code; 184 return regs.verdict.code;
@@ -231,68 +232,40 @@ next_rule:
231} 232}
232EXPORT_SYMBOL_GPL(nft_do_chain); 233EXPORT_SYMBOL_GPL(nft_do_chain);
233 234
235static struct nft_expr_type *nft_basic_types[] = {
236 &nft_imm_type,
237 &nft_cmp_type,
238 &nft_lookup_type,
239 &nft_bitwise_type,
240 &nft_byteorder_type,
241 &nft_payload_type,
242 &nft_dynset_type,
243 &nft_range_type,
244};
245
234int __init nf_tables_core_module_init(void) 246int __init nf_tables_core_module_init(void)
235{ 247{
236 int err; 248 int err, i;
237
238 err = nft_immediate_module_init();
239 if (err < 0)
240 goto err1;
241
242 err = nft_cmp_module_init();
243 if (err < 0)
244 goto err2;
245
246 err = nft_lookup_module_init();
247 if (err < 0)
248 goto err3;
249
250 err = nft_bitwise_module_init();
251 if (err < 0)
252 goto err4;
253 249
254 err = nft_byteorder_module_init(); 250 for (i = 0; i < ARRAY_SIZE(nft_basic_types); i++) {
255 if (err < 0) 251 err = nft_register_expr(nft_basic_types[i]);
256 goto err5; 252 if (err)
257 253 goto err;
258 err = nft_payload_module_init(); 254 }
259 if (err < 0)
260 goto err6;
261
262 err = nft_dynset_module_init();
263 if (err < 0)
264 goto err7;
265
266 err = nft_range_module_init();
267 if (err < 0)
268 goto err8;
269 255
270 return 0; 256 return 0;
271err8: 257
272 nft_dynset_module_exit(); 258err:
273err7: 259 while (i-- > 0)
274 nft_payload_module_exit(); 260 nft_unregister_expr(nft_basic_types[i]);
275err6:
276 nft_byteorder_module_exit();
277err5:
278 nft_bitwise_module_exit();
279err4:
280 nft_lookup_module_exit();
281err3:
282 nft_cmp_module_exit();
283err2:
284 nft_immediate_module_exit();
285err1:
286 return err; 261 return err;
287} 262}
288 263
289void nf_tables_core_module_exit(void) 264void nf_tables_core_module_exit(void)
290{ 265{
291 nft_dynset_module_exit(); 266 int i;
292 nft_payload_module_exit(); 267
293 nft_byteorder_module_exit(); 268 i = ARRAY_SIZE(nft_basic_types);
294 nft_bitwise_module_exit(); 269 while (i-- > 0)
295 nft_lookup_module_exit(); 270 nft_unregister_expr(nft_basic_types[i]);
296 nft_cmp_module_exit();
297 nft_immediate_module_exit();
298} 271}
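With the table-driven registration above, adding another built-in expression no longer needs its own init/exit pair and error-unwinding ladder. A hedged sketch of what that would look like; nft_example_type is hypothetical, and the existing type symbols are assumed to be declared extern in nf_tables_core.h as part of this series.

/* Hypothetical: wiring an extra core expression into the shared table. */
extern struct nft_expr_type nft_example_type;	/* would live in nf_tables_core.h */

static struct nft_expr_type *nft_basic_types_example[] = {
	&nft_imm_type,
	&nft_cmp_type,
	&nft_lookup_type,
	/* ... remaining existing entries ... */
	&nft_example_type,	/* registered/unregistered by the loops above */
};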
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index ab695f8e2d29..12eb9041dca2 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -171,7 +171,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
171 unsigned int size; 171 unsigned int size;
172 int event = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_TRACE; 172 int event = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_TRACE;
173 173
174 if (!nfnetlink_has_listeners(pkt->net, NFNLGRP_NFTRACE)) 174 if (!nfnetlink_has_listeners(nft_net(pkt), NFNLGRP_NFTRACE))
175 return; 175 return;
176 176
177 size = nlmsg_total_size(sizeof(struct nfgenmsg)) + 177 size = nlmsg_total_size(sizeof(struct nfgenmsg)) +
@@ -207,7 +207,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
207 nfmsg->version = NFNETLINK_V0; 207 nfmsg->version = NFNETLINK_V0;
208 nfmsg->res_id = 0; 208 nfmsg->res_id = 0;
209 209
210 if (nla_put_be32(skb, NFTA_TRACE_NFPROTO, htonl(pkt->pf))) 210 if (nla_put_be32(skb, NFTA_TRACE_NFPROTO, htonl(nft_pf(pkt))))
211 goto nla_put_failure; 211 goto nla_put_failure;
212 212
213 if (nla_put_be32(skb, NFTA_TRACE_TYPE, htonl(info->type))) 213 if (nla_put_be32(skb, NFTA_TRACE_TYPE, htonl(info->type)))
@@ -249,7 +249,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
249 goto nla_put_failure; 249 goto nla_put_failure;
250 250
251 if (!info->packet_dumped) { 251 if (!info->packet_dumped) {
252 if (nf_trace_fill_dev_info(skb, pkt->in, pkt->out)) 252 if (nf_trace_fill_dev_info(skb, nft_in(pkt), nft_out(pkt)))
253 goto nla_put_failure; 253 goto nla_put_failure;
254 254
255 if (nf_trace_fill_pkt_info(skb, pkt)) 255 if (nf_trace_fill_pkt_info(skb, pkt))
@@ -258,7 +258,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
258 } 258 }
259 259
260 nlmsg_end(skb, nlh); 260 nlmsg_end(skb, nlh);
261 nfnetlink_send(skb, pkt->net, 0, NFNLGRP_NFTRACE, 0, GFP_ATOMIC); 261 nfnetlink_send(skb, nft_net(pkt), 0, NFNLGRP_NFTRACE, 0, GFP_ATOMIC);
262 return; 262 return;
263 263
264 nla_put_failure: 264 nla_put_failure:
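The conversions in this file and in nf_tables_core.c above replace direct nft_pktinfo fields with accessor helpers. A sketch of what those helpers resolve to is shown below, assuming the nf_hook_state-backed layout this series uses in include/net/netfilter/nf_tables.h; field names are from memory, so verify against the header.

/* Hedged sketch of the accessors used by the converted call sites. */
static inline struct net *nft_net(const struct nft_pktinfo *pkt)
{
	return pkt->xt.state->net;
}

static inline unsigned int nft_hook(const struct nft_pktinfo *pkt)
{
	return pkt->xt.state->hook;
}

static inline u8 nft_pf(const struct nft_pktinfo *pkt)
{
	return pkt->xt.state->pf;
}

static inline const struct net_device *nft_in(const struct nft_pktinfo *pkt)
{
	return pkt->xt.state->in;
}

static inline const struct net_device *nft_out(const struct nft_pktinfo *pkt)
{
	return pkt->xt.state->out;
}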
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 2278d9ab723b..68eda920160e 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * (C) 2001 by Jay Schulist <jschlst@samba.org>, 4 * (C) 2001 by Jay Schulist <jschlst@samba.org>,
5 * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org> 5 * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
6 * (C) 2005,2007 by Pablo Neira Ayuso <pablo@netfilter.org> 6 * (C) 2005-2017 by Pablo Neira Ayuso <pablo@netfilter.org>
7 * 7 *
8 * Initial netfilter messages via netlink development funded and 8 * Initial netfilter messages via netlink development funded and
9 * generally made possible by Network Robots, Inc. (www.networkrobots.com) 9 * generally made possible by Network Robots, Inc. (www.networkrobots.com)
@@ -22,7 +22,7 @@
22#include <linux/sockios.h> 22#include <linux/sockios.h>
23#include <linux/net.h> 23#include <linux/net.h>
24#include <linux/skbuff.h> 24#include <linux/skbuff.h>
25#include <asm/uaccess.h> 25#include <linux/uaccess.h>
26#include <net/sock.h> 26#include <net/sock.h>
27#include <linux/init.h> 27#include <linux/init.h>
28 28
@@ -100,9 +100,9 @@ int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n)
100} 100}
101EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister); 101EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister);
102 102
103static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type) 103static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u16 type)
104{ 104{
105 u_int8_t subsys_id = NFNL_SUBSYS_ID(type); 105 u8 subsys_id = NFNL_SUBSYS_ID(type);
106 106
107 if (subsys_id >= NFNL_SUBSYS_COUNT) 107 if (subsys_id >= NFNL_SUBSYS_COUNT)
108 return NULL; 108 return NULL;
@@ -111,9 +111,9 @@ static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t t
111} 111}
112 112
113static inline const struct nfnl_callback * 113static inline const struct nfnl_callback *
114nfnetlink_find_client(u_int16_t type, const struct nfnetlink_subsystem *ss) 114nfnetlink_find_client(u16 type, const struct nfnetlink_subsystem *ss)
115{ 115{
116 u_int8_t cb_id = NFNL_MSG_TYPE(type); 116 u8 cb_id = NFNL_MSG_TYPE(type);
117 117
118 if (cb_id >= ss->cb_count) 118 if (cb_id >= ss->cb_count)
119 return NULL; 119 return NULL;
@@ -185,7 +185,7 @@ replay:
185 185
186 { 186 {
187 int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); 187 int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
188 u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); 188 u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
189 struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; 189 struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
190 struct nlattr *attr = (void *)nlh + min_len; 190 struct nlattr *attr = (void *)nlh + min_len;
191 int attrlen = nlh->nlmsg_len - min_len; 191 int attrlen = nlh->nlmsg_len - min_len;
@@ -273,13 +273,13 @@ enum {
273}; 273};
274 274
275static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, 275static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
276 u_int16_t subsys_id) 276 u16 subsys_id, u32 genid)
277{ 277{
278 struct sk_buff *oskb = skb; 278 struct sk_buff *oskb = skb;
279 struct net *net = sock_net(skb->sk); 279 struct net *net = sock_net(skb->sk);
280 const struct nfnetlink_subsystem *ss; 280 const struct nfnetlink_subsystem *ss;
281 const struct nfnl_callback *nc; 281 const struct nfnl_callback *nc;
282 static LIST_HEAD(err_list); 282 LIST_HEAD(err_list);
283 u32 status; 283 u32 status;
284 int err; 284 int err;
285 285
@@ -315,6 +315,12 @@ replay:
315 return kfree_skb(skb); 315 return kfree_skb(skb);
316 } 316 }
317 317
318 if (genid && ss->valid_genid && !ss->valid_genid(net, genid)) {
319 nfnl_unlock(subsys_id);
320 netlink_ack(oskb, nlh, -ERESTART);
321 return kfree_skb(skb);
322 }
323
318 while (skb->len >= nlmsg_total_size(0)) { 324 while (skb->len >= nlmsg_total_size(0)) {
319 int msglen, type; 325 int msglen, type;
320 326
@@ -365,7 +371,7 @@ replay:
365 371
366 { 372 {
367 int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); 373 int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
368 u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); 374 u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
369 struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; 375 struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
370 struct nlattr *attr = (void *)nlh + min_len; 376 struct nlattr *attr = (void *)nlh + min_len;
371 int attrlen = nlh->nlmsg_len - min_len; 377 int attrlen = nlh->nlmsg_len - min_len;
@@ -436,11 +442,51 @@ done:
436 kfree_skb(skb); 442 kfree_skb(skb);
437} 443}
438 444
445static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = {
446 [NFNL_BATCH_GENID] = { .type = NLA_U32 },
447};
448
449static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh)
450{
451 int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
452 struct nlattr *attr = (void *)nlh + min_len;
453 struct nlattr *cda[NFNL_BATCH_MAX + 1];
454 int attrlen = nlh->nlmsg_len - min_len;
455 struct nfgenmsg *nfgenmsg;
456 int msglen, err;
457 u32 gen_id = 0;
458 u16 res_id;
459
460 msglen = NLMSG_ALIGN(nlh->nlmsg_len);
461 if (msglen > skb->len)
462 msglen = skb->len;
463
464 if (nlh->nlmsg_len < NLMSG_HDRLEN ||
465 skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg))
466 return;
467
468 err = nla_parse(cda, NFNL_BATCH_MAX, attr, attrlen, nfnl_batch_policy);
469 if (err < 0) {
470 netlink_ack(skb, nlh, err);
471 return;
472 }
473 if (cda[NFNL_BATCH_GENID])
474 gen_id = ntohl(nla_get_be32(cda[NFNL_BATCH_GENID]));
475
476 nfgenmsg = nlmsg_data(nlh);
477 skb_pull(skb, msglen);
478 /* Work around old nft using host byte order */
479 if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES)
480 res_id = NFNL_SUBSYS_NFTABLES;
481 else
482 res_id = ntohs(nfgenmsg->res_id);
483
484 nfnetlink_rcv_batch(skb, nlh, res_id, gen_id);
485}
486
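Together with the valid_genid hook added to nf_tables above, this lets userspace make a batch conditional on the ruleset generation it was computed against; if the generation moved in the meantime, the kernel acks the batch with -ERESTART and the client can refresh and retry. A minimal userspace sketch using libmnl follows; the helper name and buffer handling are illustrative, while NFNL_MSG_BATCH_BEGIN, NFNL_BATCH_GENID and struct nfgenmsg come from the uapi headers.

/* Hedged userspace sketch: batch-begin message carrying the generation id. */
#include <arpa/inet.h>
#include <stdint.h>
#include <sys/socket.h>
#include <libmnl/libmnl.h>
#include <linux/netfilter/nfnetlink.h>

static struct nlmsghdr *example_batch_begin(char *buf, uint32_t seq, uint32_t genid)
{
	struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
	struct nfgenmsg *nfg;

	nlh->nlmsg_type  = NFNL_MSG_BATCH_BEGIN;
	nlh->nlmsg_flags = NLM_F_REQUEST;
	nlh->nlmsg_seq   = seq;

	nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg));
	nfg->nfgen_family = AF_UNSPEC;
	nfg->version      = NFNETLINK_V0;
	nfg->res_id       = htons(NFNL_SUBSYS_NFTABLES);

	/* The kernel parses this as be32, so put it in network byte order. */
	mnl_attr_put_u32(nlh, NFNL_BATCH_GENID, htonl(genid));
	return nlh;
}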
439static void nfnetlink_rcv(struct sk_buff *skb) 487static void nfnetlink_rcv(struct sk_buff *skb)
440{ 488{
441 struct nlmsghdr *nlh = nlmsg_hdr(skb); 489 struct nlmsghdr *nlh = nlmsg_hdr(skb);
442 u_int16_t res_id;
443 int msglen;
444 490
445 if (nlh->nlmsg_len < NLMSG_HDRLEN || 491 if (nlh->nlmsg_len < NLMSG_HDRLEN ||
446 skb->len < nlh->nlmsg_len) 492 skb->len < nlh->nlmsg_len)
@@ -451,28 +497,10 @@ static void nfnetlink_rcv(struct sk_buff *skb)
451 return; 497 return;
452 } 498 }
453 499
454 if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN) { 500 if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN)
455 struct nfgenmsg *nfgenmsg; 501 nfnetlink_rcv_skb_batch(skb, nlh);
456 502 else
457 msglen = NLMSG_ALIGN(nlh->nlmsg_len);
458 if (msglen > skb->len)
459 msglen = skb->len;
460
461 if (nlh->nlmsg_len < NLMSG_HDRLEN ||
462 skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg))
463 return;
464
465 nfgenmsg = nlmsg_data(nlh);
466 skb_pull(skb, msglen);
467 /* Work around old nft using host byte order */
468 if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES)
469 res_id = NFNL_SUBSYS_NFTABLES;
470 else
471 res_id = ntohs(nfgenmsg->res_id);
472 nfnetlink_rcv_batch(skb, nlh, res_id);
473 } else {
474 netlink_rcv_skb(skb, &nfnetlink_rcv_msg); 503 netlink_rcv_skb(skb, &nfnetlink_rcv_msg);
475 }
476} 504}
477 505
478#ifdef CONFIG_MODULES 506#ifdef CONFIG_MODULES
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index 3b79f34b5095..d45558178da5 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -32,6 +32,13 @@ MODULE_LICENSE("GPL");
32MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); 32MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
33MODULE_DESCRIPTION("nfnl_cthelper: User-space connection tracking helpers"); 33MODULE_DESCRIPTION("nfnl_cthelper: User-space connection tracking helpers");
34 34
35struct nfnl_cthelper {
36 struct list_head list;
37 struct nf_conntrack_helper helper;
38};
39
40static LIST_HEAD(nfnl_cthelper_list);
41
35static int 42static int
36nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff, 43nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff,
37 struct nf_conn *ct, enum ip_conntrack_info ctinfo) 44 struct nf_conn *ct, enum ip_conntrack_info ctinfo)
@@ -48,7 +55,7 @@ nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff,
48 if (helper == NULL) 55 if (helper == NULL)
49 return NF_DROP; 56 return NF_DROP;
50 57
51 /* This is an user-space helper not yet configured, skip. */ 58 /* This is a user-space helper not yet configured, skip. */
52 if ((helper->flags & 59 if ((helper->flags &
53 (NF_CT_HELPER_F_USERSPACE | NF_CT_HELPER_F_CONFIGURED)) == 60 (NF_CT_HELPER_F_USERSPACE | NF_CT_HELPER_F_CONFIGURED)) ==
54 NF_CT_HELPER_F_USERSPACE) 61 NF_CT_HELPER_F_USERSPACE)
@@ -161,6 +168,7 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper,
161 int i, ret; 168 int i, ret;
162 struct nf_conntrack_expect_policy *expect_policy; 169 struct nf_conntrack_expect_policy *expect_policy;
163 struct nlattr *tb[NFCTH_POLICY_SET_MAX+1]; 170 struct nlattr *tb[NFCTH_POLICY_SET_MAX+1];
171 unsigned int class_max;
164 172
165 ret = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr, 173 ret = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr,
166 nfnl_cthelper_expect_policy_set); 174 nfnl_cthelper_expect_policy_set);
@@ -170,19 +178,18 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper,
170 if (!tb[NFCTH_POLICY_SET_NUM]) 178 if (!tb[NFCTH_POLICY_SET_NUM])
171 return -EINVAL; 179 return -EINVAL;
172 180
173 helper->expect_class_max = 181 class_max = ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM]));
174 ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM])); 182 if (class_max == 0)
175 183 return -EINVAL;
176 if (helper->expect_class_max != 0 && 184 if (class_max > NF_CT_MAX_EXPECT_CLASSES)
177 helper->expect_class_max > NF_CT_MAX_EXPECT_CLASSES)
178 return -EOVERFLOW; 185 return -EOVERFLOW;
179 186
180 expect_policy = kzalloc(sizeof(struct nf_conntrack_expect_policy) * 187 expect_policy = kzalloc(sizeof(struct nf_conntrack_expect_policy) *
181 helper->expect_class_max, GFP_KERNEL); 188 class_max, GFP_KERNEL);
182 if (expect_policy == NULL) 189 if (expect_policy == NULL)
183 return -ENOMEM; 190 return -ENOMEM;
184 191
185 for (i=0; i<helper->expect_class_max; i++) { 192 for (i = 0; i < class_max; i++) {
186 if (!tb[NFCTH_POLICY_SET+i]) 193 if (!tb[NFCTH_POLICY_SET+i])
187 goto err; 194 goto err;
188 195
@@ -191,6 +198,8 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper,
191 if (ret < 0) 198 if (ret < 0)
192 goto err; 199 goto err;
193 } 200 }
201
202 helper->expect_class_max = class_max - 1;
194 helper->expect_policy = expect_policy; 203 helper->expect_policy = expect_policy;
195 return 0; 204 return 0;
196err: 205err:
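The stored value changes meaning here: userspace still sends a class count in NFCTH_POLICY_SET_NUM, but expect_class_max now holds the highest valid class index (count minus one), which is why the dump path further down emits expect_class_max + 1 and loops run up to expect_class_max + 1. A tiny standalone illustration of that convention, with made-up values, in plain user-space C.

#include <stdio.h>

/* count as sent by userspace -> index stored in expect_class_max */
static unsigned int stored_class_max(unsigned int set_num)
{
	return set_num - 1;		/* highest valid class index */
}

/* value reported back in NFCTH_POLICY_SET_NUM on dump */
static unsigned int dumped_set_num(unsigned int class_max)
{
	return class_max + 1;		/* original count */
}

int main(void)
{
	unsigned int set_num = 2;	/* hypothetical: two expect classes */

	printf("stored=%u dumped=%u\n",
	       stored_class_max(set_num),
	       dumped_set_num(stored_class_max(set_num)));
	return 0;
}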
@@ -203,18 +212,20 @@ nfnl_cthelper_create(const struct nlattr * const tb[],
203 struct nf_conntrack_tuple *tuple) 212 struct nf_conntrack_tuple *tuple)
204{ 213{
205 struct nf_conntrack_helper *helper; 214 struct nf_conntrack_helper *helper;
215 struct nfnl_cthelper *nfcth;
206 int ret; 216 int ret;
207 217
208 if (!tb[NFCTH_TUPLE] || !tb[NFCTH_POLICY] || !tb[NFCTH_PRIV_DATA_LEN]) 218 if (!tb[NFCTH_TUPLE] || !tb[NFCTH_POLICY] || !tb[NFCTH_PRIV_DATA_LEN])
209 return -EINVAL; 219 return -EINVAL;
210 220
211 helper = kzalloc(sizeof(struct nf_conntrack_helper), GFP_KERNEL); 221 nfcth = kzalloc(sizeof(*nfcth), GFP_KERNEL);
212 if (helper == NULL) 222 if (nfcth == NULL)
213 return -ENOMEM; 223 return -ENOMEM;
224 helper = &nfcth->helper;
214 225
215 ret = nfnl_cthelper_parse_expect_policy(helper, tb[NFCTH_POLICY]); 226 ret = nfnl_cthelper_parse_expect_policy(helper, tb[NFCTH_POLICY]);
216 if (ret < 0) 227 if (ret < 0)
217 goto err; 228 goto err1;
218 229
219 strncpy(helper->name, nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN); 230 strncpy(helper->name, nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN);
220 helper->data_len = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); 231 helper->data_len = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN]));
@@ -245,15 +256,101 @@ nfnl_cthelper_create(const struct nlattr * const tb[],
245 256
246 ret = nf_conntrack_helper_register(helper); 257 ret = nf_conntrack_helper_register(helper);
247 if (ret < 0) 258 if (ret < 0)
248 goto err; 259 goto err2;
249 260
261 list_add_tail(&nfcth->list, &nfnl_cthelper_list);
250 return 0; 262 return 0;
251err: 263err2:
252 kfree(helper); 264 kfree(helper->expect_policy);
265err1:
266 kfree(nfcth);
253 return ret; 267 return ret;
254} 268}
255 269
256static int 270static int
271nfnl_cthelper_update_policy_one(const struct nf_conntrack_expect_policy *policy,
272 struct nf_conntrack_expect_policy *new_policy,
273 const struct nlattr *attr)
274{
275 struct nlattr *tb[NFCTH_POLICY_MAX + 1];
276 int err;
277
278 err = nla_parse_nested(tb, NFCTH_POLICY_MAX, attr,
279 nfnl_cthelper_expect_pol);
280 if (err < 0)
281 return err;
282
283 if (!tb[NFCTH_POLICY_NAME] ||
284 !tb[NFCTH_POLICY_EXPECT_MAX] ||
285 !tb[NFCTH_POLICY_EXPECT_TIMEOUT])
286 return -EINVAL;
287
288 if (nla_strcmp(tb[NFCTH_POLICY_NAME], policy->name))
289 return -EBUSY;
290
291 new_policy->max_expected =
292 ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX]));
293 new_policy->timeout =
294 ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_TIMEOUT]));
295
296 return 0;
297}
298
299static int nfnl_cthelper_update_policy_all(struct nlattr *tb[],
300 struct nf_conntrack_helper *helper)
301{
302 struct nf_conntrack_expect_policy new_policy[helper->expect_class_max + 1];
303 struct nf_conntrack_expect_policy *policy;
304 int i, err;
305
306 /* Check first that all policy attributes are well-formed, so we don't
307 * leave things in inconsistent state on errors.
308 */
309 for (i = 0; i < helper->expect_class_max + 1; i++) {
310
311 if (!tb[NFCTH_POLICY_SET + i])
312 return -EINVAL;
313
314 err = nfnl_cthelper_update_policy_one(&helper->expect_policy[i],
315 &new_policy[i],
316 tb[NFCTH_POLICY_SET + i]);
317 if (err < 0)
318 return err;
319 }
320 /* Now we can safely update them. */
321 for (i = 0; i < helper->expect_class_max + 1; i++) {
322 policy = (struct nf_conntrack_expect_policy *)
323 &helper->expect_policy[i];
324 policy->max_expected = new_policy->max_expected;
325 policy->timeout = new_policy->timeout;
326 }
327
328 return 0;
329}
330
331static int nfnl_cthelper_update_policy(struct nf_conntrack_helper *helper,
332 const struct nlattr *attr)
333{
334 struct nlattr *tb[NFCTH_POLICY_SET_MAX + 1];
335 unsigned int class_max;
336 int err;
337
338 err = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr,
339 nfnl_cthelper_expect_policy_set);
340 if (err < 0)
341 return err;
342
343 if (!tb[NFCTH_POLICY_SET_NUM])
344 return -EINVAL;
345
346 class_max = ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM]));
347 if (helper->expect_class_max + 1 != class_max)
348 return -EBUSY;
349
350 return nfnl_cthelper_update_policy_all(tb, helper);
351}
352
353static int
257nfnl_cthelper_update(const struct nlattr * const tb[], 354nfnl_cthelper_update(const struct nlattr * const tb[],
258 struct nf_conntrack_helper *helper) 355 struct nf_conntrack_helper *helper)
259{ 356{
@@ -263,8 +360,7 @@ nfnl_cthelper_update(const struct nlattr * const tb[],
263 return -EBUSY; 360 return -EBUSY;
264 361
265 if (tb[NFCTH_POLICY]) { 362 if (tb[NFCTH_POLICY]) {
266 ret = nfnl_cthelper_parse_expect_policy(helper, 363 ret = nfnl_cthelper_update_policy(helper, tb[NFCTH_POLICY]);
267 tb[NFCTH_POLICY]);
268 if (ret < 0) 364 if (ret < 0)
269 return ret; 365 return ret;
270 } 366 }
@@ -293,7 +389,8 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
293 const char *helper_name; 389 const char *helper_name;
294 struct nf_conntrack_helper *cur, *helper = NULL; 390 struct nf_conntrack_helper *cur, *helper = NULL;
295 struct nf_conntrack_tuple tuple; 391 struct nf_conntrack_tuple tuple;
296 int ret = 0, i; 392 struct nfnl_cthelper *nlcth;
393 int ret = 0;
297 394
298 if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE]) 395 if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE])
299 return -EINVAL; 396 return -EINVAL;
@@ -304,31 +401,22 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
304 if (ret < 0) 401 if (ret < 0)
305 return ret; 402 return ret;
306 403
307 rcu_read_lock(); 404 list_for_each_entry(nlcth, &nfnl_cthelper_list, list) {
308 for (i = 0; i < nf_ct_helper_hsize && !helper; i++) { 405 cur = &nlcth->helper;
309 hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) {
310 406
311 /* skip non-userspace conntrack helpers. */ 407 if (strncmp(cur->name, helper_name, NF_CT_HELPER_NAME_LEN))
312 if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) 408 continue;
313 continue;
314 409
315 if (strncmp(cur->name, helper_name, 410 if ((tuple.src.l3num != cur->tuple.src.l3num ||
316 NF_CT_HELPER_NAME_LEN) != 0) 411 tuple.dst.protonum != cur->tuple.dst.protonum))
317 continue; 412 continue;
318 413
319 if ((tuple.src.l3num != cur->tuple.src.l3num || 414 if (nlh->nlmsg_flags & NLM_F_EXCL)
320 tuple.dst.protonum != cur->tuple.dst.protonum)) 415 return -EEXIST;
321 continue;
322 416
323 if (nlh->nlmsg_flags & NLM_F_EXCL) { 417 helper = cur;
324 ret = -EEXIST; 418 break;
325 goto err;
326 }
327 helper = cur;
328 break;
329 }
330 } 419 }
331 rcu_read_unlock();
332 420
333 if (helper == NULL) 421 if (helper == NULL)
334 ret = nfnl_cthelper_create(tb, &tuple); 422 ret = nfnl_cthelper_create(tb, &tuple);
@@ -336,9 +424,6 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
336 ret = nfnl_cthelper_update(tb, helper); 424 ret = nfnl_cthelper_update(tb, helper);
337 425
338 return ret; 426 return ret;
339err:
340 rcu_read_unlock();
341 return ret;
342} 427}
343 428
344static int 429static int
@@ -377,10 +462,10 @@ nfnl_cthelper_dump_policy(struct sk_buff *skb,
377 goto nla_put_failure; 462 goto nla_put_failure;
378 463
379 if (nla_put_be32(skb, NFCTH_POLICY_SET_NUM, 464 if (nla_put_be32(skb, NFCTH_POLICY_SET_NUM,
380 htonl(helper->expect_class_max))) 465 htonl(helper->expect_class_max + 1)))
381 goto nla_put_failure; 466 goto nla_put_failure;
382 467
383 for (i=0; i<helper->expect_class_max; i++) { 468 for (i = 0; i < helper->expect_class_max + 1; i++) {
384 nest_parms2 = nla_nest_start(skb, 469 nest_parms2 = nla_nest_start(skb,
385 (NFCTH_POLICY_SET+i) | NLA_F_NESTED); 470 (NFCTH_POLICY_SET+i) | NLA_F_NESTED);
386 if (nest_parms2 == NULL) 471 if (nest_parms2 == NULL)
@@ -502,11 +587,12 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
502 struct sk_buff *skb, const struct nlmsghdr *nlh, 587 struct sk_buff *skb, const struct nlmsghdr *nlh,
503 const struct nlattr * const tb[]) 588 const struct nlattr * const tb[])
504{ 589{
505 int ret = -ENOENT, i; 590 int ret = -ENOENT;
506 struct nf_conntrack_helper *cur; 591 struct nf_conntrack_helper *cur;
507 struct sk_buff *skb2; 592 struct sk_buff *skb2;
508 char *helper_name = NULL; 593 char *helper_name = NULL;
509 struct nf_conntrack_tuple tuple; 594 struct nf_conntrack_tuple tuple;
595 struct nfnl_cthelper *nlcth;
510 bool tuple_set = false; 596 bool tuple_set = false;
511 597
512 if (nlh->nlmsg_flags & NLM_F_DUMP) { 598 if (nlh->nlmsg_flags & NLM_F_DUMP) {
@@ -527,45 +613,39 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
527 tuple_set = true; 613 tuple_set = true;
528 } 614 }
529 615
530 for (i = 0; i < nf_ct_helper_hsize; i++) { 616 list_for_each_entry(nlcth, &nfnl_cthelper_list, list) {
531 hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) { 617 cur = &nlcth->helper;
618 if (helper_name &&
619 strncmp(cur->name, helper_name, NF_CT_HELPER_NAME_LEN))
620 continue;
532 621
533 /* skip non-userspace conntrack helpers. */ 622 if (tuple_set &&
534 if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) 623 (tuple.src.l3num != cur->tuple.src.l3num ||
535 continue; 624 tuple.dst.protonum != cur->tuple.dst.protonum))
625 continue;
536 626
537 if (helper_name && strncmp(cur->name, helper_name, 627 skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
538 NF_CT_HELPER_NAME_LEN) != 0) { 628 if (skb2 == NULL) {
539 continue; 629 ret = -ENOMEM;
540 } 630 break;
541 if (tuple_set && 631 }
542 (tuple.src.l3num != cur->tuple.src.l3num ||
543 tuple.dst.protonum != cur->tuple.dst.protonum))
544 continue;
545
546 skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
547 if (skb2 == NULL) {
548 ret = -ENOMEM;
549 break;
550 }
551 632
552 ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid, 633 ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid,
553 nlh->nlmsg_seq, 634 nlh->nlmsg_seq,
554 NFNL_MSG_TYPE(nlh->nlmsg_type), 635 NFNL_MSG_TYPE(nlh->nlmsg_type),
555 NFNL_MSG_CTHELPER_NEW, cur); 636 NFNL_MSG_CTHELPER_NEW, cur);
556 if (ret <= 0) { 637 if (ret <= 0) {
557 kfree_skb(skb2); 638 kfree_skb(skb2);
558 break; 639 break;
559 } 640 }
560 641
561 ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, 642 ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
562 MSG_DONTWAIT); 643 MSG_DONTWAIT);
563 if (ret > 0) 644 if (ret > 0)
564 ret = 0; 645 ret = 0;
565 646
566 /* this avoids a loop in nfnetlink. */ 647 /* this avoids a loop in nfnetlink. */
567 return ret == -EAGAIN ? -ENOBUFS : ret; 648 return ret == -EAGAIN ? -ENOBUFS : ret;
568 }
569 } 649 }
570 return ret; 650 return ret;
571} 651}
@@ -576,10 +656,10 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl,
576{ 656{
577 char *helper_name = NULL; 657 char *helper_name = NULL;
578 struct nf_conntrack_helper *cur; 658 struct nf_conntrack_helper *cur;
579 struct hlist_node *tmp;
580 struct nf_conntrack_tuple tuple; 659 struct nf_conntrack_tuple tuple;
581 bool tuple_set = false, found = false; 660 bool tuple_set = false, found = false;
582 int i, j = 0, ret; 661 struct nfnl_cthelper *nlcth, *n;
662 int j = 0, ret;
583 663
584 if (tb[NFCTH_NAME]) 664 if (tb[NFCTH_NAME])
585 helper_name = nla_data(tb[NFCTH_NAME]); 665 helper_name = nla_data(tb[NFCTH_NAME]);
@@ -592,28 +672,27 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl,
592 tuple_set = true; 672 tuple_set = true;
593 } 673 }
594 674
595 for (i = 0; i < nf_ct_helper_hsize; i++) { 675 list_for_each_entry_safe(nlcth, n, &nfnl_cthelper_list, list) {
596 hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i], 676 cur = &nlcth->helper;
597 hnode) { 677 j++;
598 /* skip non-userspace conntrack helpers. */
599 if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
600 continue;
601 678
602 j++; 679 if (helper_name &&
680 strncmp(cur->name, helper_name, NF_CT_HELPER_NAME_LEN))
681 continue;
603 682
604 if (helper_name && strncmp(cur->name, helper_name, 683 if (tuple_set &&
605 NF_CT_HELPER_NAME_LEN) != 0) { 684 (tuple.src.l3num != cur->tuple.src.l3num ||
606 continue; 685 tuple.dst.protonum != cur->tuple.dst.protonum))
607 } 686 continue;
608 if (tuple_set &&
609 (tuple.src.l3num != cur->tuple.src.l3num ||
610 tuple.dst.protonum != cur->tuple.dst.protonum))
611 continue;
612 687
613 found = true; 688 found = true;
614 nf_conntrack_helper_unregister(cur); 689 nf_conntrack_helper_unregister(cur);
615 } 690 kfree(cur->expect_policy);
691
692 list_del(&nlcth->list);
693 kfree(nlcth);
616 } 694 }
695
617 /* Make sure we return success if we flush and there is no helpers */ 696 /* Make sure we return success if we flush and there is no helpers */
618 return (found || j == 0) ? 0 : -ENOENT; 697 return (found || j == 0) ? 0 : -ENOENT;
619} 698}
@@ -662,20 +741,16 @@ err_out:
662static void __exit nfnl_cthelper_exit(void) 741static void __exit nfnl_cthelper_exit(void)
663{ 742{
664 struct nf_conntrack_helper *cur; 743 struct nf_conntrack_helper *cur;
665 struct hlist_node *tmp; 744 struct nfnl_cthelper *nlcth, *n;
666 int i;
667 745
668 nfnetlink_subsys_unregister(&nfnl_cthelper_subsys); 746 nfnetlink_subsys_unregister(&nfnl_cthelper_subsys);
669 747
670 for (i=0; i<nf_ct_helper_hsize; i++) { 748 list_for_each_entry_safe(nlcth, n, &nfnl_cthelper_list, list) {
671 hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i], 749 cur = &nlcth->helper;
672 hnode) {
673 /* skip non-userspace conntrack helpers. */
674 if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
675 continue;
676 750
677 nf_conntrack_helper_unregister(cur); 751 nf_conntrack_helper_unregister(cur);
678 } 752 kfree(cur->expect_policy);
753 kfree(nlcth);
679 } 754 }
680} 755}
681 756
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 139e0867e56e..47d6656c9119 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -646,8 +646,8 @@ static void __exit cttimeout_exit(void)
646#ifdef CONFIG_NF_CONNTRACK_TIMEOUT 646#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
647 RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL); 647 RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL);
648 RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL); 648 RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL);
649 synchronize_rcu();
649#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ 650#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
650 rcu_barrier();
651} 651}
652 652
653module_init(cttimeout_init); 653module_init(cttimeout_init);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index eb086a192c5a..08247bf7d7b8 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -80,7 +80,7 @@ struct nfulnl_instance {
80 80
81#define INSTANCE_BUCKETS 16 81#define INSTANCE_BUCKETS 16
82 82
83static int nfnl_log_net_id __read_mostly; 83static unsigned int nfnl_log_net_id __read_mostly;
84 84
85struct nfnl_log_net { 85struct nfnl_log_net {
86 spinlock_t instances_lock; 86 spinlock_t instances_lock;
@@ -330,7 +330,7 @@ nfulnl_alloc_skb(struct net *net, u32 peer_portid, unsigned int inst_size,
330 * message. WARNING: has to be <= 128k due to slab restrictions */ 330 * message. WARNING: has to be <= 128k due to slab restrictions */
331 331
332 n = max(inst_size, pkt_size); 332 n = max(inst_size, pkt_size);
333 skb = alloc_skb(n, GFP_ATOMIC); 333 skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN);
334 if (!skb) { 334 if (!skb) {
335 if (n > pkt_size) { 335 if (n > pkt_size) {
336 /* try to allocate only as much as we need for current 336 /* try to allocate only as much as we need for current
@@ -538,7 +538,7 @@ __build_packet_message(struct nfnl_log_net *log,
538 goto nla_put_failure; 538 goto nla_put_failure;
539 } 539 }
540 540
541 if (skb->tstamp.tv64) { 541 if (skb->tstamp) {
542 struct nfulnl_msg_packet_timestamp ts; 542 struct nfulnl_msg_packet_timestamp ts;
543 struct timespec64 kts = ktime_to_timespec64(skb->tstamp); 543 struct timespec64 kts = ktime_to_timespec64(skb->tstamp);
544 ts.sec = cpu_to_be64(kts.tv_sec); 544 ts.sec = cpu_to_be64(kts.tv_sec);
@@ -1152,6 +1152,7 @@ MODULE_ALIAS_NF_LOGGER(AF_INET, 1);
1152MODULE_ALIAS_NF_LOGGER(AF_INET6, 1); 1152MODULE_ALIAS_NF_LOGGER(AF_INET6, 1);
1153MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 1); 1153MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 1);
1154MODULE_ALIAS_NF_LOGGER(3, 1); /* NFPROTO_ARP */ 1154MODULE_ALIAS_NF_LOGGER(3, 1); /* NFPROTO_ARP */
1155MODULE_ALIAS_NF_LOGGER(5, 1); /* NFPROTO_NETDEV */
1155 1156
1156module_init(nfnetlink_log_init); 1157module_init(nfnetlink_log_init);
1157module_exit(nfnetlink_log_fini); 1158module_exit(nfnetlink_log_fini);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index af832c526048..933509ebf3d3 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -69,7 +69,7 @@ struct nfqnl_instance {
69 * Following fields are dirtied for each queued packet, 69 * Following fields are dirtied for each queued packet,
70 * keep them in same cache line if possible. 70 * keep them in same cache line if possible.
71 */ 71 */
72 spinlock_t lock; 72 spinlock_t lock ____cacheline_aligned_in_smp;
73 unsigned int queue_total; 73 unsigned int queue_total;
74 unsigned int id_sequence; /* 'sequence' of pkt ids */ 74 unsigned int id_sequence; /* 'sequence' of pkt ids */
75 struct list_head queue_list; /* packets in queue */ 75 struct list_head queue_list; /* packets in queue */
@@ -77,7 +77,7 @@ struct nfqnl_instance {
77 77
78typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); 78typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long);
79 79
80static int nfnl_queue_net_id __read_mostly; 80static unsigned int nfnl_queue_net_id __read_mostly;
81 81
82#define INSTANCE_BUCKETS 16 82#define INSTANCE_BUCKETS 16
83struct nfnl_queue_net { 83struct nfnl_queue_net {
@@ -384,7 +384,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
384 + nla_total_size(sizeof(u_int32_t)) /* skbinfo */ 384 + nla_total_size(sizeof(u_int32_t)) /* skbinfo */
385 + nla_total_size(sizeof(u_int32_t)); /* cap_len */ 385 + nla_total_size(sizeof(u_int32_t)); /* cap_len */
386 386
387 if (entskb->tstamp.tv64) 387 if (entskb->tstamp)
388 size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); 388 size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));
389 389
390 size += nfqnl_get_bridge_size(entry); 390 size += nfqnl_get_bridge_size(entry);
@@ -443,7 +443,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
443 skb = alloc_skb(size, GFP_ATOMIC); 443 skb = alloc_skb(size, GFP_ATOMIC);
444 if (!skb) { 444 if (!skb) {
445 skb_tx_error(entskb); 445 skb_tx_error(entskb);
446 return NULL; 446 goto nlmsg_failure;
447 } 447 }
448 448
449 nlh = nlmsg_put(skb, 0, 0, 449 nlh = nlmsg_put(skb, 0, 0,
@@ -452,7 +452,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
452 if (!nlh) { 452 if (!nlh) {
453 skb_tx_error(entskb); 453 skb_tx_error(entskb);
454 kfree_skb(skb); 454 kfree_skb(skb);
455 return NULL; 455 goto nlmsg_failure;
456 } 456 }
457 nfmsg = nlmsg_data(nlh); 457 nfmsg = nlmsg_data(nlh);
458 nfmsg->nfgen_family = entry->state.pf; 458 nfmsg->nfgen_family = entry->state.pf;
@@ -555,7 +555,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
555 if (nfqnl_put_bridge(entry, skb) < 0) 555 if (nfqnl_put_bridge(entry, skb) < 0)
556 goto nla_put_failure; 556 goto nla_put_failure;
557 557
558 if (entskb->tstamp.tv64) { 558 if (entskb->tstamp) {
559 struct nfqnl_msg_packet_timestamp ts; 559 struct nfqnl_msg_packet_timestamp ts;
560 struct timespec64 kts = ktime_to_timespec64(entskb->tstamp); 560 struct timespec64 kts = ktime_to_timespec64(entskb->tstamp);
561 561
@@ -598,12 +598,17 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
598 } 598 }
599 599
600 nlh->nlmsg_len = skb->len; 600 nlh->nlmsg_len = skb->len;
601 if (seclen)
602 security_release_secctx(secdata, seclen);
601 return skb; 603 return skb;
602 604
603nla_put_failure: 605nla_put_failure:
604 skb_tx_error(entskb); 606 skb_tx_error(entskb);
605 kfree_skb(skb); 607 kfree_skb(skb);
606 net_err_ratelimited("nf_queue: error creating packet message\n"); 608 net_err_ratelimited("nf_queue: error creating packet message\n");
609nlmsg_failure:
610 if (seclen)
611 security_release_secctx(secdata, seclen);
607 return NULL; 612 return NULL;
608} 613}
609 614
@@ -919,7 +924,7 @@ static struct notifier_block nfqnl_dev_notifier = {
919 924
920static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long entry_ptr) 925static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long entry_ptr)
921{ 926{
922 return rcu_access_pointer(entry->state.hook_entries) == 927 return rcu_access_pointer(entry->hook) ==
923 (struct nf_hook_entry *)entry_ptr; 928 (struct nf_hook_entry *)entry_ptr;
924} 929}
925 930
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index 31c15ed2e5fc..877d9acd91ef 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -121,7 +121,6 @@ nla_put_failure:
121 return -1; 121 return -1;
122} 122}
123 123
124static struct nft_expr_type nft_bitwise_type;
125static const struct nft_expr_ops nft_bitwise_ops = { 124static const struct nft_expr_ops nft_bitwise_ops = {
126 .type = &nft_bitwise_type, 125 .type = &nft_bitwise_type,
127 .size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise)), 126 .size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise)),
@@ -130,20 +129,10 @@ static const struct nft_expr_ops nft_bitwise_ops = {
130 .dump = nft_bitwise_dump, 129 .dump = nft_bitwise_dump,
131}; 130};
132 131
133static struct nft_expr_type nft_bitwise_type __read_mostly = { 132struct nft_expr_type nft_bitwise_type __read_mostly = {
134 .name = "bitwise", 133 .name = "bitwise",
135 .ops = &nft_bitwise_ops, 134 .ops = &nft_bitwise_ops,
136 .policy = nft_bitwise_policy, 135 .policy = nft_bitwise_policy,
137 .maxattr = NFTA_BITWISE_MAX, 136 .maxattr = NFTA_BITWISE_MAX,
138 .owner = THIS_MODULE, 137 .owner = THIS_MODULE,
139}; 138};
140
141int __init nft_bitwise_module_init(void)
142{
143 return nft_register_expr(&nft_bitwise_type);
144}
145
146void nft_bitwise_module_exit(void)
147{
148 nft_unregister_expr(&nft_bitwise_type);
149}
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index ee63d981268d..13d4e421a6b3 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -169,7 +169,6 @@ nla_put_failure:
169 return -1; 169 return -1;
170} 170}
171 171
172static struct nft_expr_type nft_byteorder_type;
173static const struct nft_expr_ops nft_byteorder_ops = { 172static const struct nft_expr_ops nft_byteorder_ops = {
174 .type = &nft_byteorder_type, 173 .type = &nft_byteorder_type,
175 .size = NFT_EXPR_SIZE(sizeof(struct nft_byteorder)), 174 .size = NFT_EXPR_SIZE(sizeof(struct nft_byteorder)),
@@ -178,20 +177,10 @@ static const struct nft_expr_ops nft_byteorder_ops = {
178 .dump = nft_byteorder_dump, 177 .dump = nft_byteorder_dump,
179}; 178};
180 179
181static struct nft_expr_type nft_byteorder_type __read_mostly = { 180struct nft_expr_type nft_byteorder_type __read_mostly = {
182 .name = "byteorder", 181 .name = "byteorder",
183 .ops = &nft_byteorder_ops, 182 .ops = &nft_byteorder_ops,
184 .policy = nft_byteorder_policy, 183 .policy = nft_byteorder_policy,
185 .maxattr = NFTA_BYTEORDER_MAX, 184 .maxattr = NFTA_BYTEORDER_MAX,
186 .owner = THIS_MODULE, 185 .owner = THIS_MODULE,
187}; 186};
188
189int __init nft_byteorder_module_init(void)
190{
191 return nft_register_expr(&nft_byteorder_type);
192}
193
194void nft_byteorder_module_exit(void)
195{
196 nft_unregister_expr(&nft_byteorder_type);
197}
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 2e53739812b1..2b96effeadc1 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -84,9 +84,6 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
84 if (err < 0) 84 if (err < 0)
85 return err; 85 return err;
86 86
87 if (desc.len > U8_MAX)
88 return -ERANGE;
89
90 priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP])); 87 priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP]));
91 priv->len = desc.len; 88 priv->len = desc.len;
92 return 0; 89 return 0;
@@ -110,7 +107,6 @@ nla_put_failure:
110 return -1; 107 return -1;
111} 108}
112 109
113static struct nft_expr_type nft_cmp_type;
114static const struct nft_expr_ops nft_cmp_ops = { 110static const struct nft_expr_ops nft_cmp_ops = {
115 .type = &nft_cmp_type, 111 .type = &nft_cmp_type,
116 .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp_expr)), 112 .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp_expr)),
@@ -211,20 +207,10 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
211 return &nft_cmp_ops; 207 return &nft_cmp_ops;
212} 208}
213 209
214static struct nft_expr_type nft_cmp_type __read_mostly = { 210struct nft_expr_type nft_cmp_type __read_mostly = {
215 .name = "cmp", 211 .name = "cmp",
216 .select_ops = nft_cmp_select_ops, 212 .select_ops = nft_cmp_select_ops,
217 .policy = nft_cmp_policy, 213 .policy = nft_cmp_policy,
218 .maxattr = NFTA_CMP_MAX, 214 .maxattr = NFTA_CMP_MAX,
219 .owner = THIS_MODULE, 215 .owner = THIS_MODULE,
220}; 216};
221
222int __init nft_cmp_module_init(void)
223{
224 return nft_register_expr(&nft_cmp_type);
225}
226
227void nft_cmp_module_exit(void)
228{
229 nft_unregister_expr(&nft_cmp_type);
230}
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index 77db8358ab14..7f8422213341 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -18,105 +18,197 @@
18#include <net/netfilter/nf_tables.h> 18#include <net/netfilter/nf_tables.h>
19 19
20struct nft_counter { 20struct nft_counter {
21 u64 bytes; 21 s64 bytes;
22 u64 packets; 22 s64 packets;
23};
24
25struct nft_counter_percpu {
26 struct nft_counter counter;
27 struct u64_stats_sync syncp;
28}; 23};
29 24
30struct nft_counter_percpu_priv { 25struct nft_counter_percpu_priv {
31 struct nft_counter_percpu __percpu *counter; 26 struct nft_counter __percpu *counter;
32}; 27};
33 28
34static void nft_counter_eval(const struct nft_expr *expr, 29static DEFINE_PER_CPU(seqcount_t, nft_counter_seq);
35 struct nft_regs *regs, 30
36 const struct nft_pktinfo *pkt) 31static inline void nft_counter_do_eval(struct nft_counter_percpu_priv *priv,
32 struct nft_regs *regs,
33 const struct nft_pktinfo *pkt)
37{ 34{
38 struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); 35 struct nft_counter *this_cpu;
39 struct nft_counter_percpu *this_cpu; 36 seqcount_t *myseq;
40 37
41 local_bh_disable(); 38 local_bh_disable();
42 this_cpu = this_cpu_ptr(priv->counter); 39 this_cpu = this_cpu_ptr(priv->counter);
43 u64_stats_update_begin(&this_cpu->syncp); 40 myseq = this_cpu_ptr(&nft_counter_seq);
44 this_cpu->counter.bytes += pkt->skb->len; 41
45 this_cpu->counter.packets++; 42 write_seqcount_begin(myseq);
46 u64_stats_update_end(&this_cpu->syncp); 43
44 this_cpu->bytes += pkt->skb->len;
45 this_cpu->packets++;
46
47 write_seqcount_end(myseq);
47 local_bh_enable(); 48 local_bh_enable();
48} 49}
49 50
50static void nft_counter_fetch(const struct nft_counter_percpu __percpu *counter, 51static inline void nft_counter_obj_eval(struct nft_object *obj,
52 struct nft_regs *regs,
53 const struct nft_pktinfo *pkt)
54{
55 struct nft_counter_percpu_priv *priv = nft_obj_data(obj);
56
57 nft_counter_do_eval(priv, regs, pkt);
58}
59
60static int nft_counter_do_init(const struct nlattr * const tb[],
61 struct nft_counter_percpu_priv *priv)
62{
63 struct nft_counter __percpu *cpu_stats;
64 struct nft_counter *this_cpu;
65
66 cpu_stats = alloc_percpu(struct nft_counter);
67 if (cpu_stats == NULL)
68 return -ENOMEM;
69
70 preempt_disable();
71 this_cpu = this_cpu_ptr(cpu_stats);
72 if (tb[NFTA_COUNTER_PACKETS]) {
73 this_cpu->packets =
74 be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
75 }
76 if (tb[NFTA_COUNTER_BYTES]) {
77 this_cpu->bytes =
78 be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
79 }
80 preempt_enable();
81 priv->counter = cpu_stats;
82 return 0;
83}
84
85static int nft_counter_obj_init(const struct nlattr * const tb[],
86 struct nft_object *obj)
87{
88 struct nft_counter_percpu_priv *priv = nft_obj_data(obj);
89
90 return nft_counter_do_init(tb, priv);
91}
92
93static void nft_counter_do_destroy(struct nft_counter_percpu_priv *priv)
94{
95 free_percpu(priv->counter);
96}
97
98static void nft_counter_obj_destroy(struct nft_object *obj)
99{
100 struct nft_counter_percpu_priv *priv = nft_obj_data(obj);
101
102 nft_counter_do_destroy(priv);
103}
104
105static void nft_counter_reset(struct nft_counter_percpu_priv __percpu *priv,
51 struct nft_counter *total) 106 struct nft_counter *total)
52{ 107{
53 const struct nft_counter_percpu *cpu_stats; 108 struct nft_counter *this_cpu;
109
110 local_bh_disable();
111 this_cpu = this_cpu_ptr(priv->counter);
112 this_cpu->packets -= total->packets;
113 this_cpu->bytes -= total->bytes;
114 local_bh_enable();
115}
116
117static void nft_counter_fetch(struct nft_counter_percpu_priv *priv,
118 struct nft_counter *total)
119{
120 struct nft_counter *this_cpu;
121 const seqcount_t *myseq;
54 u64 bytes, packets; 122 u64 bytes, packets;
55 unsigned int seq; 123 unsigned int seq;
56 int cpu; 124 int cpu;
57 125
58 memset(total, 0, sizeof(*total)); 126 memset(total, 0, sizeof(*total));
59 for_each_possible_cpu(cpu) { 127 for_each_possible_cpu(cpu) {
60 cpu_stats = per_cpu_ptr(counter, cpu); 128 myseq = per_cpu_ptr(&nft_counter_seq, cpu);
129 this_cpu = per_cpu_ptr(priv->counter, cpu);
61 do { 130 do {
62 seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp); 131 seq = read_seqcount_begin(myseq);
63 bytes = cpu_stats->counter.bytes; 132 bytes = this_cpu->bytes;
64 packets = cpu_stats->counter.packets; 133 packets = this_cpu->packets;
65 } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq)); 134 } while (read_seqcount_retry(myseq, seq));
66 135
67 total->packets += packets; 136 total->bytes += bytes;
68 total->bytes += bytes; 137 total->packets += packets;
69 } 138 }
70} 139}
71 140
72static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr) 141static int nft_counter_do_dump(struct sk_buff *skb,
142 struct nft_counter_percpu_priv *priv,
143 bool reset)
73{ 144{
74 struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
75 struct nft_counter total; 145 struct nft_counter total;
76 146
77 nft_counter_fetch(priv->counter, &total); 147 nft_counter_fetch(priv, &total);
78 148
79 if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes), 149 if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes),
80 NFTA_COUNTER_PAD) || 150 NFTA_COUNTER_PAD) ||
81 nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.packets), 151 nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.packets),
82 NFTA_COUNTER_PAD)) 152 NFTA_COUNTER_PAD))
83 goto nla_put_failure; 153 goto nla_put_failure;
154
155 if (reset)
156 nft_counter_reset(priv, &total);
157
84 return 0; 158 return 0;
85 159
86nla_put_failure: 160nla_put_failure:
87 return -1; 161 return -1;
88} 162}
89 163
164static int nft_counter_obj_dump(struct sk_buff *skb,
165 struct nft_object *obj, bool reset)
166{
167 struct nft_counter_percpu_priv *priv = nft_obj_data(obj);
168
169 return nft_counter_do_dump(skb, priv, reset);
170}
171
90static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = { 172static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
91 [NFTA_COUNTER_PACKETS] = { .type = NLA_U64 }, 173 [NFTA_COUNTER_PACKETS] = { .type = NLA_U64 },
92 [NFTA_COUNTER_BYTES] = { .type = NLA_U64 }, 174 [NFTA_COUNTER_BYTES] = { .type = NLA_U64 },
93}; 175};
94 176
177static struct nft_object_type nft_counter_obj __read_mostly = {
178 .type = NFT_OBJECT_COUNTER,
179 .size = sizeof(struct nft_counter_percpu_priv),
180 .maxattr = NFTA_COUNTER_MAX,
181 .policy = nft_counter_policy,
182 .eval = nft_counter_obj_eval,
183 .init = nft_counter_obj_init,
184 .destroy = nft_counter_obj_destroy,
185 .dump = nft_counter_obj_dump,
186 .owner = THIS_MODULE,
187};
188
189static void nft_counter_eval(const struct nft_expr *expr,
190 struct nft_regs *regs,
191 const struct nft_pktinfo *pkt)
192{
193 struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
194
195 nft_counter_do_eval(priv, regs, pkt);
196}
197
198static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
199{
200 struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
201
202 return nft_counter_do_dump(skb, priv, false);
203}
204
95static int nft_counter_init(const struct nft_ctx *ctx, 205static int nft_counter_init(const struct nft_ctx *ctx,
96 const struct nft_expr *expr, 206 const struct nft_expr *expr,
97 const struct nlattr * const tb[]) 207 const struct nlattr * const tb[])
98{ 208{
99 struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); 209 struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
100 struct nft_counter_percpu __percpu *cpu_stats;
101 struct nft_counter_percpu *this_cpu;
102
103 cpu_stats = netdev_alloc_pcpu_stats(struct nft_counter_percpu);
104 if (cpu_stats == NULL)
105 return -ENOMEM;
106 210
107 preempt_disable(); 211 return nft_counter_do_init(tb, priv);
108 this_cpu = this_cpu_ptr(cpu_stats);
109 if (tb[NFTA_COUNTER_PACKETS]) {
110 this_cpu->counter.packets =
111 be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
112 }
113 if (tb[NFTA_COUNTER_BYTES]) {
114 this_cpu->counter.bytes =
115 be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
116 }
117 preempt_enable();
118 priv->counter = cpu_stats;
119 return 0;
120} 212}
121 213
122static void nft_counter_destroy(const struct nft_ctx *ctx, 214static void nft_counter_destroy(const struct nft_ctx *ctx,
@@ -124,28 +216,27 @@ static void nft_counter_destroy(const struct nft_ctx *ctx,
124{ 216{
125 struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); 217 struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
126 218
127 free_percpu(priv->counter); 219 nft_counter_do_destroy(priv);
128} 220}
129 221
130static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src) 222static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src)
131{ 223{
132 struct nft_counter_percpu_priv *priv = nft_expr_priv(src); 224 struct nft_counter_percpu_priv *priv = nft_expr_priv(src);
133 struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst); 225 struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst);
134 struct nft_counter_percpu __percpu *cpu_stats; 226 struct nft_counter __percpu *cpu_stats;
135 struct nft_counter_percpu *this_cpu; 227 struct nft_counter *this_cpu;
136 struct nft_counter total; 228 struct nft_counter total;
137 229
138 nft_counter_fetch(priv->counter, &total); 230 nft_counter_fetch(priv, &total);
139 231
140 cpu_stats = __netdev_alloc_pcpu_stats(struct nft_counter_percpu, 232 cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_ATOMIC);
141 GFP_ATOMIC);
142 if (cpu_stats == NULL) 233 if (cpu_stats == NULL)
143 return -ENOMEM; 234 return -ENOMEM;
144 235
145 preempt_disable(); 236 preempt_disable();
146 this_cpu = this_cpu_ptr(cpu_stats); 237 this_cpu = this_cpu_ptr(cpu_stats);
147 this_cpu->counter.packets = total.packets; 238 this_cpu->packets = total.packets;
148 this_cpu->counter.bytes = total.bytes; 239 this_cpu->bytes = total.bytes;
149 preempt_enable(); 240 preempt_enable();
150 241
151 priv_clone->counter = cpu_stats; 242 priv_clone->counter = cpu_stats;
@@ -174,12 +265,29 @@ static struct nft_expr_type nft_counter_type __read_mostly = {
174 265
175static int __init nft_counter_module_init(void) 266static int __init nft_counter_module_init(void)
176{ 267{
177 return nft_register_expr(&nft_counter_type); 268 int cpu, err;
269
270 for_each_possible_cpu(cpu)
271 seqcount_init(per_cpu_ptr(&nft_counter_seq, cpu));
272
273 err = nft_register_obj(&nft_counter_obj);
274 if (err < 0)
275 return err;
276
277 err = nft_register_expr(&nft_counter_type);
278 if (err < 0)
279 goto err1;
280
281 return 0;
282err1:
283 nft_unregister_obj(&nft_counter_obj);
284 return err;
178} 285}
179 286
180static void __exit nft_counter_module_exit(void) 287static void __exit nft_counter_module_exit(void)
181{ 288{
182 nft_unregister_expr(&nft_counter_type); 289 nft_unregister_expr(&nft_counter_type);
290 nft_unregister_obj(&nft_counter_obj);
183} 291}
184 292
185module_init(nft_counter_module_init); 293module_init(nft_counter_module_init);
@@ -188,3 +296,4 @@ module_exit(nft_counter_module_exit);
188MODULE_LICENSE("GPL"); 296MODULE_LICENSE("GPL");
189MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); 297MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
190MODULE_ALIAS_NFT_EXPR("counter"); 298MODULE_ALIAS_NFT_EXPR("counter");
299MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_COUNTER);
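Note on the counter rework above: the per-cpu u64_stats syncp wrapper is replaced by plain per-cpu struct nft_counter slots guarded by a shared per-cpu seqcount (nft_counter_seq). The update side (not shown in this hunk) is expected to bump the sequence around its 64-bit additions, and nft_counter_fetch() re-reads each CPU's slot until it observes a stable sequence before adding it to the running total; nft_counter_reset() then subtracts the dumped totals from the local CPU's slot instead of zeroing every CPU. The following stand-alone sketch of that read-retry protocol uses hypothetical names and omits the memory barriers that the kernel's read_seqcount_begin()/read_seqcount_retry() provide; it only illustrates the pattern, it is not part of the patch.

#include <stdint.h>
#include <stdio.h>

struct demo_counter {
	uint64_t packets;
	uint64_t bytes;
};

static unsigned int demo_seq;		/* even: stable, odd: write in progress */
static struct demo_counter demo_cnt;

static void demo_update(uint64_t len)	/* analogue of the eval/update path */
{
	demo_seq++;			/* enter write section (seq goes odd) */
	demo_cnt.packets += 1;
	demo_cnt.bytes += len;
	demo_seq++;			/* leave write section (seq even again) */
}

static void demo_fetch(struct demo_counter *total)	/* analogue of nft_counter_fetch() */
{
	unsigned int seq;

	do {
		seq = demo_seq;
		total->packets = demo_cnt.packets;
		total->bytes = demo_cnt.bytes;
		/* retry if a write was in flight or completed meanwhile */
	} while ((seq & 1) || seq != demo_seq);
}

int main(void)
{
	struct demo_counter total;

	demo_update(1500);
	demo_update(40);
	demo_fetch(&total);
	printf("%llu packets, %llu bytes\n",
	       (unsigned long long)total.packets,
	       (unsigned long long)total.bytes);
	return 0;
}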
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index d7b0d171172a..0264258c46fe 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> 2 * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
3 * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org>
3 * 4 *
4 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 6 * it under the terms of the GNU General Public License version 2 as
@@ -31,6 +32,11 @@ struct nft_ct {
31 }; 32 };
32}; 33};
33 34
35#ifdef CONFIG_NF_CONNTRACK_ZONES
36static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template);
37static unsigned int nft_ct_pcpu_template_refcnt __read_mostly;
38#endif
39
34static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c, 40static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c,
35 enum nft_ct_keys k, 41 enum nft_ct_keys k,
36 enum ip_conntrack_dir d) 42 enum ip_conntrack_dir d)
@@ -77,7 +83,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
77 83
78 switch (priv->key) { 84 switch (priv->key) {
79 case NFT_CT_DIRECTION: 85 case NFT_CT_DIRECTION:
80 *dest = CTINFO2DIR(ctinfo); 86 nft_reg_store8(dest, CTINFO2DIR(ctinfo));
81 return; 87 return;
82 case NFT_CT_STATUS: 88 case NFT_CT_STATUS:
83 *dest = ct->status; 89 *dest = ct->status;
@@ -128,12 +134,42 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
128 memcpy(dest, &count, sizeof(count)); 134 memcpy(dest, &count, sizeof(count));
129 return; 135 return;
130 } 136 }
137 case NFT_CT_AVGPKT: {
138 const struct nf_conn_acct *acct = nf_conn_acct_find(ct);
139 u64 avgcnt = 0, bcnt = 0, pcnt = 0;
140
141 if (acct) {
142 pcnt = nft_ct_get_eval_counter(acct->counter,
143 NFT_CT_PKTS, priv->dir);
144 bcnt = nft_ct_get_eval_counter(acct->counter,
145 NFT_CT_BYTES, priv->dir);
146 if (pcnt != 0)
147 avgcnt = div64_u64(bcnt, pcnt);
148 }
149
150 memcpy(dest, &avgcnt, sizeof(avgcnt));
151 return;
152 }
131 case NFT_CT_L3PROTOCOL: 153 case NFT_CT_L3PROTOCOL:
132 *dest = nf_ct_l3num(ct); 154 nft_reg_store8(dest, nf_ct_l3num(ct));
133 return; 155 return;
134 case NFT_CT_PROTOCOL: 156 case NFT_CT_PROTOCOL:
135 *dest = nf_ct_protonum(ct); 157 nft_reg_store8(dest, nf_ct_protonum(ct));
158 return;
159#ifdef CONFIG_NF_CONNTRACK_ZONES
160 case NFT_CT_ZONE: {
161 const struct nf_conntrack_zone *zone = nf_ct_zone(ct);
162 u16 zoneid;
163
164 if (priv->dir < IP_CT_DIR_MAX)
165 zoneid = nf_ct_zone_id(zone, priv->dir);
166 else
167 zoneid = zone->id;
168
169 nft_reg_store16(dest, zoneid);
136 return; 170 return;
171 }
172#endif
137 default: 173 default:
138 break; 174 break;
139 } 175 }
@@ -149,10 +185,10 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
149 nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); 185 nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
150 return; 186 return;
151 case NFT_CT_PROTO_SRC: 187 case NFT_CT_PROTO_SRC:
152 *dest = (__force __u16)tuple->src.u.all; 188 nft_reg_store16(dest, (__force u16)tuple->src.u.all);
153 return; 189 return;
154 case NFT_CT_PROTO_DST: 190 case NFT_CT_PROTO_DST:
155 *dest = (__force __u16)tuple->dst.u.all; 191 nft_reg_store16(dest, (__force u16)tuple->dst.u.all);
156 return; 192 return;
157 default: 193 default:
158 break; 194 break;
@@ -162,6 +198,53 @@ err:
162 regs->verdict.code = NFT_BREAK; 198 regs->verdict.code = NFT_BREAK;
163} 199}
164 200
201#ifdef CONFIG_NF_CONNTRACK_ZONES
202static void nft_ct_set_zone_eval(const struct nft_expr *expr,
203 struct nft_regs *regs,
204 const struct nft_pktinfo *pkt)
205{
206 struct nf_conntrack_zone zone = { .dir = NF_CT_DEFAULT_ZONE_DIR };
207 const struct nft_ct *priv = nft_expr_priv(expr);
208 struct sk_buff *skb = pkt->skb;
209 enum ip_conntrack_info ctinfo;
210 u16 value = nft_reg_load16(&regs->data[priv->sreg]);
211 struct nf_conn *ct;
212
213 ct = nf_ct_get(skb, &ctinfo);
214 if (ct) /* already tracked */
215 return;
216
217 zone.id = value;
218
219 switch (priv->dir) {
220 case IP_CT_DIR_ORIGINAL:
221 zone.dir = NF_CT_ZONE_DIR_ORIG;
222 break;
223 case IP_CT_DIR_REPLY:
224 zone.dir = NF_CT_ZONE_DIR_REPL;
225 break;
226 default:
227 break;
228 }
229
230 ct = this_cpu_read(nft_ct_pcpu_template);
231
232 if (likely(atomic_read(&ct->ct_general.use) == 1)) {
233 nf_ct_zone_add(ct, &zone);
234 } else {
235 /* previous skb got queued to userspace */
236 ct = nf_ct_tmpl_alloc(nft_net(pkt), &zone, GFP_ATOMIC);
237 if (!ct) {
238 regs->verdict.code = NF_DROP;
239 return;
240 }
241 }
242
243 atomic_inc(&ct->ct_general.use);
244 nf_ct_set(skb, ct, IP_CT_NEW);
245}
246#endif
247
165static void nft_ct_set_eval(const struct nft_expr *expr, 248static void nft_ct_set_eval(const struct nft_expr *expr,
166 struct nft_regs *regs, 249 struct nft_regs *regs,
167 const struct nft_pktinfo *pkt) 250 const struct nft_pktinfo *pkt)
@@ -207,39 +290,78 @@ static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = {
207 [NFTA_CT_SREG] = { .type = NLA_U32 }, 290 [NFTA_CT_SREG] = { .type = NLA_U32 },
208}; 291};
209 292
210static int nft_ct_l3proto_try_module_get(uint8_t family) 293static int nft_ct_netns_get(struct net *net, uint8_t family)
211{ 294{
212 int err; 295 int err;
213 296
214 if (family == NFPROTO_INET) { 297 if (family == NFPROTO_INET) {
215 err = nf_ct_l3proto_try_module_get(NFPROTO_IPV4); 298 err = nf_ct_netns_get(net, NFPROTO_IPV4);
216 if (err < 0) 299 if (err < 0)
217 goto err1; 300 goto err1;
218 err = nf_ct_l3proto_try_module_get(NFPROTO_IPV6); 301 err = nf_ct_netns_get(net, NFPROTO_IPV6);
219 if (err < 0) 302 if (err < 0)
220 goto err2; 303 goto err2;
221 } else { 304 } else {
222 err = nf_ct_l3proto_try_module_get(family); 305 err = nf_ct_netns_get(net, family);
223 if (err < 0) 306 if (err < 0)
224 goto err1; 307 goto err1;
225 } 308 }
226 return 0; 309 return 0;
227 310
228err2: 311err2:
229 nf_ct_l3proto_module_put(NFPROTO_IPV4); 312 nf_ct_netns_put(net, NFPROTO_IPV4);
230err1: 313err1:
231 return err; 314 return err;
232} 315}
233 316
234static void nft_ct_l3proto_module_put(uint8_t family) 317static void nft_ct_netns_put(struct net *net, uint8_t family)
235{ 318{
236 if (family == NFPROTO_INET) { 319 if (family == NFPROTO_INET) {
237 nf_ct_l3proto_module_put(NFPROTO_IPV4); 320 nf_ct_netns_put(net, NFPROTO_IPV4);
238 nf_ct_l3proto_module_put(NFPROTO_IPV6); 321 nf_ct_netns_put(net, NFPROTO_IPV6);
239 } else 322 } else
240 nf_ct_l3proto_module_put(family); 323 nf_ct_netns_put(net, family);
324}
325
326#ifdef CONFIG_NF_CONNTRACK_ZONES
327static void nft_ct_tmpl_put_pcpu(void)
328{
329 struct nf_conn *ct;
330 int cpu;
331
332 for_each_possible_cpu(cpu) {
333 ct = per_cpu(nft_ct_pcpu_template, cpu);
334 if (!ct)
335 break;
336 nf_ct_put(ct);
337 per_cpu(nft_ct_pcpu_template, cpu) = NULL;
338 }
241} 339}
242 340
341static bool nft_ct_tmpl_alloc_pcpu(void)
342{
343 struct nf_conntrack_zone zone = { .id = 0 };
344 struct nf_conn *tmp;
345 int cpu;
346
347 if (nft_ct_pcpu_template_refcnt)
348 return true;
349
350 for_each_possible_cpu(cpu) {
351 tmp = nf_ct_tmpl_alloc(&init_net, &zone, GFP_KERNEL);
352 if (!tmp) {
353 nft_ct_tmpl_put_pcpu();
354 return false;
355 }
356
357 atomic_set(&tmp->ct_general.use, 1);
358 per_cpu(nft_ct_pcpu_template, cpu) = tmp;
359 }
360
361 return true;
362}
363#endif
364
243static int nft_ct_get_init(const struct nft_ctx *ctx, 365static int nft_ct_get_init(const struct nft_ctx *ctx,
244 const struct nft_expr *expr, 366 const struct nft_expr *expr,
245 const struct nlattr * const tb[]) 367 const struct nlattr * const tb[])
@@ -249,6 +371,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
249 int err; 371 int err;
250 372
251 priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); 373 priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
374 priv->dir = IP_CT_DIR_MAX;
252 switch (priv->key) { 375 switch (priv->key) {
253 case NFT_CT_DIRECTION: 376 case NFT_CT_DIRECTION:
254 if (tb[NFTA_CT_DIRECTION] != NULL) 377 if (tb[NFTA_CT_DIRECTION] != NULL)
@@ -315,11 +438,14 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
315 break; 438 break;
316 case NFT_CT_BYTES: 439 case NFT_CT_BYTES:
317 case NFT_CT_PKTS: 440 case NFT_CT_PKTS:
318 /* no direction? return sum of original + reply */ 441 case NFT_CT_AVGPKT:
319 if (tb[NFTA_CT_DIRECTION] == NULL)
320 priv->dir = IP_CT_DIR_MAX;
321 len = sizeof(u64); 442 len = sizeof(u64);
322 break; 443 break;
444#ifdef CONFIG_NF_CONNTRACK_ZONES
445 case NFT_CT_ZONE:
446 len = sizeof(u16);
447 break;
448#endif
323 default: 449 default:
324 return -EOPNOTSUPP; 450 return -EOPNOTSUPP;
325 } 451 }
@@ -341,25 +467,45 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
341 if (err < 0) 467 if (err < 0)
342 return err; 468 return err;
343 469
344 err = nft_ct_l3proto_try_module_get(ctx->afi->family); 470 err = nft_ct_netns_get(ctx->net, ctx->afi->family);
345 if (err < 0) 471 if (err < 0)
346 return err; 472 return err;
347 473
348 if (priv->key == NFT_CT_BYTES || priv->key == NFT_CT_PKTS) 474 if (priv->key == NFT_CT_BYTES ||
475 priv->key == NFT_CT_PKTS ||
476 priv->key == NFT_CT_AVGPKT)
349 nf_ct_set_acct(ctx->net, true); 477 nf_ct_set_acct(ctx->net, true);
350 478
351 return 0; 479 return 0;
352} 480}
353 481
482static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv)
483{
484 switch (priv->key) {
485#ifdef CONFIG_NF_CONNTRACK_LABELS
486 case NFT_CT_LABELS:
487 nf_connlabels_put(ctx->net);
488 break;
489#endif
490#ifdef CONFIG_NF_CONNTRACK_ZONES
491 case NFT_CT_ZONE:
492 if (--nft_ct_pcpu_template_refcnt == 0)
493 nft_ct_tmpl_put_pcpu();
494#endif
495 default:
496 break;
497 }
498}
499
354static int nft_ct_set_init(const struct nft_ctx *ctx, 500static int nft_ct_set_init(const struct nft_ctx *ctx,
355 const struct nft_expr *expr, 501 const struct nft_expr *expr,
356 const struct nlattr * const tb[]) 502 const struct nlattr * const tb[])
357{ 503{
358 struct nft_ct *priv = nft_expr_priv(expr); 504 struct nft_ct *priv = nft_expr_priv(expr);
359 bool label_got = false;
360 unsigned int len; 505 unsigned int len;
361 int err; 506 int err;
362 507
508 priv->dir = IP_CT_DIR_MAX;
363 priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); 509 priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
364 switch (priv->key) { 510 switch (priv->key) {
365#ifdef CONFIG_NF_CONNTRACK_MARK 511#ifdef CONFIG_NF_CONNTRACK_MARK
@@ -377,34 +523,52 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
377 err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1); 523 err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1);
378 if (err) 524 if (err)
379 return err; 525 return err;
380 label_got = true; 526 break;
527#endif
528#ifdef CONFIG_NF_CONNTRACK_ZONES
529 case NFT_CT_ZONE:
530 if (!nft_ct_tmpl_alloc_pcpu())
531 return -ENOMEM;
532 nft_ct_pcpu_template_refcnt++;
533 len = sizeof(u16);
381 break; 534 break;
382#endif 535#endif
383 default: 536 default:
384 return -EOPNOTSUPP; 537 return -EOPNOTSUPP;
385 } 538 }
386 539
540 if (tb[NFTA_CT_DIRECTION]) {
541 priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]);
542 switch (priv->dir) {
543 case IP_CT_DIR_ORIGINAL:
544 case IP_CT_DIR_REPLY:
545 break;
546 default:
547 err = -EINVAL;
548 goto err1;
549 }
550 }
551
387 priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]); 552 priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]);
388 err = nft_validate_register_load(priv->sreg, len); 553 err = nft_validate_register_load(priv->sreg, len);
389 if (err < 0) 554 if (err < 0)
390 goto err1; 555 goto err1;
391 556
392 err = nft_ct_l3proto_try_module_get(ctx->afi->family); 557 err = nft_ct_netns_get(ctx->net, ctx->afi->family);
393 if (err < 0) 558 if (err < 0)
394 goto err1; 559 goto err1;
395 560
396 return 0; 561 return 0;
397 562
398err1: 563err1:
399 if (label_got) 564 __nft_ct_set_destroy(ctx, priv);
400 nf_connlabels_put(ctx->net);
401 return err; 565 return err;
402} 566}
403 567
404static void nft_ct_get_destroy(const struct nft_ctx *ctx, 568static void nft_ct_get_destroy(const struct nft_ctx *ctx,
405 const struct nft_expr *expr) 569 const struct nft_expr *expr)
406{ 570{
407 nft_ct_l3proto_module_put(ctx->afi->family); 571 nf_ct_netns_put(ctx->net, ctx->afi->family);
408} 572}
409 573
410static void nft_ct_set_destroy(const struct nft_ctx *ctx, 574static void nft_ct_set_destroy(const struct nft_ctx *ctx,
@@ -412,17 +576,8 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx,
412{ 576{
413 struct nft_ct *priv = nft_expr_priv(expr); 577 struct nft_ct *priv = nft_expr_priv(expr);
414 578
415 switch (priv->key) { 579 __nft_ct_set_destroy(ctx, priv);
416#ifdef CONFIG_NF_CONNTRACK_LABELS 580 nft_ct_netns_put(ctx->net, ctx->afi->family);
417 case NFT_CT_LABELS:
418 nf_connlabels_put(ctx->net);
419 break;
420#endif
421 default:
422 break;
423 }
424
425 nft_ct_l3proto_module_put(ctx->afi->family);
426} 581}
427 582
428static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) 583static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -444,6 +599,8 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
444 break; 599 break;
445 case NFT_CT_BYTES: 600 case NFT_CT_BYTES:
446 case NFT_CT_PKTS: 601 case NFT_CT_PKTS:
602 case NFT_CT_AVGPKT:
603 case NFT_CT_ZONE:
447 if (priv->dir < IP_CT_DIR_MAX && 604 if (priv->dir < IP_CT_DIR_MAX &&
448 nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) 605 nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir))
449 goto nla_put_failure; 606 goto nla_put_failure;
@@ -466,6 +623,17 @@ static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
466 goto nla_put_failure; 623 goto nla_put_failure;
467 if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) 624 if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key)))
468 goto nla_put_failure; 625 goto nla_put_failure;
626
627 switch (priv->key) {
628 case NFT_CT_ZONE:
629 if (priv->dir < IP_CT_DIR_MAX &&
630 nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir))
631 goto nla_put_failure;
632 break;
633 default:
634 break;
635 }
636
469 return 0; 637 return 0;
470 638
471nla_put_failure: 639nla_put_failure:
@@ -491,6 +659,17 @@ static const struct nft_expr_ops nft_ct_set_ops = {
491 .dump = nft_ct_set_dump, 659 .dump = nft_ct_set_dump,
492}; 660};
493 661
662#ifdef CONFIG_NF_CONNTRACK_ZONES
663static const struct nft_expr_ops nft_ct_set_zone_ops = {
664 .type = &nft_ct_type,
665 .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)),
666 .eval = nft_ct_set_zone_eval,
667 .init = nft_ct_set_init,
668 .destroy = nft_ct_set_destroy,
669 .dump = nft_ct_set_dump,
670};
671#endif
672
494static const struct nft_expr_ops * 673static const struct nft_expr_ops *
495nft_ct_select_ops(const struct nft_ctx *ctx, 674nft_ct_select_ops(const struct nft_ctx *ctx,
496 const struct nlattr * const tb[]) 675 const struct nlattr * const tb[])
@@ -504,8 +683,13 @@ nft_ct_select_ops(const struct nft_ctx *ctx,
504 if (tb[NFTA_CT_DREG]) 683 if (tb[NFTA_CT_DREG])
505 return &nft_ct_get_ops; 684 return &nft_ct_get_ops;
506 685
507 if (tb[NFTA_CT_SREG]) 686 if (tb[NFTA_CT_SREG]) {
687#ifdef CONFIG_NF_CONNTRACK_ZONES
688 if (nla_get_be32(tb[NFTA_CT_KEY]) == htonl(NFT_CT_ZONE))
689 return &nft_ct_set_zone_ops;
690#endif
508 return &nft_ct_set_ops; 691 return &nft_ct_set_ops;
692 }
509 693
510 return ERR_PTR(-EINVAL); 694 return ERR_PTR(-EINVAL);
511} 695}
@@ -518,15 +702,60 @@ static struct nft_expr_type nft_ct_type __read_mostly = {
518 .owner = THIS_MODULE, 702 .owner = THIS_MODULE,
519}; 703};
520 704
705static void nft_notrack_eval(const struct nft_expr *expr,
706 struct nft_regs *regs,
707 const struct nft_pktinfo *pkt)
708{
709 struct sk_buff *skb = pkt->skb;
710 enum ip_conntrack_info ctinfo;
711 struct nf_conn *ct;
712
713 ct = nf_ct_get(pkt->skb, &ctinfo);
714 /* Previously seen (loopback or untracked)? Ignore. */
715 if (ct)
716 return;
717
718 ct = nf_ct_untracked_get();
719 atomic_inc(&ct->ct_general.use);
720 nf_ct_set(skb, ct, IP_CT_NEW);
721}
722
723static struct nft_expr_type nft_notrack_type;
724static const struct nft_expr_ops nft_notrack_ops = {
725 .type = &nft_notrack_type,
726 .size = NFT_EXPR_SIZE(0),
727 .eval = nft_notrack_eval,
728};
729
730static struct nft_expr_type nft_notrack_type __read_mostly = {
731 .name = "notrack",
732 .ops = &nft_notrack_ops,
733 .owner = THIS_MODULE,
734};
735
521static int __init nft_ct_module_init(void) 736static int __init nft_ct_module_init(void)
522{ 737{
738 int err;
739
523 BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > NFT_REG_SIZE); 740 BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > NFT_REG_SIZE);
524 741
525 return nft_register_expr(&nft_ct_type); 742 err = nft_register_expr(&nft_ct_type);
743 if (err < 0)
744 return err;
745
746 err = nft_register_expr(&nft_notrack_type);
747 if (err < 0)
748 goto err1;
749
750 return 0;
751err1:
752 nft_unregister_expr(&nft_ct_type);
753 return err;
526} 754}
527 755
528static void __exit nft_ct_module_exit(void) 756static void __exit nft_ct_module_exit(void)
529{ 757{
758 nft_unregister_expr(&nft_notrack_type);
530 nft_unregister_expr(&nft_ct_type); 759 nft_unregister_expr(&nft_ct_type);
531} 760}
532 761
@@ -536,3 +765,4 @@ module_exit(nft_ct_module_exit);
536MODULE_LICENSE("GPL"); 765MODULE_LICENSE("GPL");
537MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); 766MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
538MODULE_ALIAS_NFT_EXPR("ct"); 767MODULE_ALIAS_NFT_EXPR("ct");
768MODULE_ALIAS_NFT_EXPR("notrack");
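Note on nft_ct_set_zone_eval() above: it keeps one conntrack template per CPU and only falls back to a fresh nf_ct_tmpl_alloc() when the cached template is still referenced by an earlier packet (for example one queued to userspace). A simplified user-space sketch of that reuse-or-allocate decision, with hypothetical names and a plain integer standing in for ct_general.use (the kernel uses atomics, and reference handling is simplified here):

#include <stdlib.h>

struct demo_tmpl {
	int use;			/* reference count */
	unsigned short zone;
};

/* stands in for this_cpu_read(nft_ct_pcpu_template); holds one reference */
static struct demo_tmpl *demo_pcpu_tmpl;

static struct demo_tmpl *demo_get_zone_tmpl(unsigned short zone)
{
	struct demo_tmpl *t = demo_pcpu_tmpl;

	if (t && t->use == 1) {
		/* sole owner: retarget the cached per-cpu template */
		t->zone = zone;
	} else {
		/* still referenced by a previous packet: allocate a new one */
		t = calloc(1, sizeof(*t));
		if (!t)
			return NULL;
		t->zone = zone;		/* starts with no references */
	}
	t->use++;			/* reference handed to the current packet */
	return t;
}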
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 31ca94793aa9..049ad2d9ee66 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -98,7 +98,8 @@ out:
98} 98}
99 99
100static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { 100static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = {
101 [NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING }, 101 [NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING,
102 .len = NFT_SET_MAXNAMELEN - 1 },
102 [NFTA_DYNSET_SET_ID] = { .type = NLA_U32 }, 103 [NFTA_DYNSET_SET_ID] = { .type = NLA_U32 },
103 [NFTA_DYNSET_OP] = { .type = NLA_U32 }, 104 [NFTA_DYNSET_OP] = { .type = NLA_U32 },
104 [NFTA_DYNSET_SREG_KEY] = { .type = NLA_U32 }, 105 [NFTA_DYNSET_SREG_KEY] = { .type = NLA_U32 },
@@ -268,7 +269,6 @@ nla_put_failure:
268 return -1; 269 return -1;
269} 270}
270 271
271static struct nft_expr_type nft_dynset_type;
272static const struct nft_expr_ops nft_dynset_ops = { 272static const struct nft_expr_ops nft_dynset_ops = {
273 .type = &nft_dynset_type, 273 .type = &nft_dynset_type,
274 .size = NFT_EXPR_SIZE(sizeof(struct nft_dynset)), 274 .size = NFT_EXPR_SIZE(sizeof(struct nft_dynset)),
@@ -278,20 +278,10 @@ static const struct nft_expr_ops nft_dynset_ops = {
278 .dump = nft_dynset_dump, 278 .dump = nft_dynset_dump,
279}; 279};
280 280
281static struct nft_expr_type nft_dynset_type __read_mostly = { 281struct nft_expr_type nft_dynset_type __read_mostly = {
282 .name = "dynset", 282 .name = "dynset",
283 .ops = &nft_dynset_ops, 283 .ops = &nft_dynset_ops,
284 .policy = nft_dynset_policy, 284 .policy = nft_dynset_policy,
285 .maxattr = NFTA_DYNSET_MAX, 285 .maxattr = NFTA_DYNSET_MAX,
286 .owner = THIS_MODULE, 286 .owner = THIS_MODULE,
287}; 287};
288
289int __init nft_dynset_module_init(void)
290{
291 return nft_register_expr(&nft_dynset_type);
292}
293
294void nft_dynset_module_exit(void)
295{
296 nft_unregister_expr(&nft_dynset_type);
297}
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 47beb3abcc9d..c308920b194c 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -15,19 +15,29 @@
15#include <linux/netfilter.h> 15#include <linux/netfilter.h>
16#include <linux/netfilter/nf_tables.h> 16#include <linux/netfilter/nf_tables.h>
17#include <net/netfilter/nf_tables.h> 17#include <net/netfilter/nf_tables.h>
18// FIXME: 18#include <net/tcp.h>
19#include <net/ipv6.h>
20 19
21struct nft_exthdr { 20struct nft_exthdr {
22 u8 type; 21 u8 type;
23 u8 offset; 22 u8 offset;
24 u8 len; 23 u8 len;
24 u8 op;
25 enum nft_registers dreg:8; 25 enum nft_registers dreg:8;
26 u8 flags;
26}; 27};
27 28
28static void nft_exthdr_eval(const struct nft_expr *expr, 29static unsigned int optlen(const u8 *opt, unsigned int offset)
29 struct nft_regs *regs, 30{
30 const struct nft_pktinfo *pkt) 31 /* Beware zero-length options: make finite progress */
32 if (opt[offset] <= TCPOPT_NOP || opt[offset + 1] == 0)
33 return 1;
34 else
35 return opt[offset + 1];
36}
37
38static void nft_exthdr_ipv6_eval(const struct nft_expr *expr,
39 struct nft_regs *regs,
40 const struct nft_pktinfo *pkt)
31{ 41{
32 struct nft_exthdr *priv = nft_expr_priv(expr); 42 struct nft_exthdr *priv = nft_expr_priv(expr);
33 u32 *dest = &regs->data[priv->dreg]; 43 u32 *dest = &regs->data[priv->dreg];
@@ -35,8 +45,12 @@ static void nft_exthdr_eval(const struct nft_expr *expr,
35 int err; 45 int err;
36 46
37 err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL); 47 err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL);
38 if (err < 0) 48 if (priv->flags & NFT_EXTHDR_F_PRESENT) {
49 *dest = (err >= 0);
50 return;
51 } else if (err < 0) {
39 goto err; 52 goto err;
53 }
40 offset += priv->offset; 54 offset += priv->offset;
41 55
42 dest[priv->len / NFT_REG32_SIZE] = 0; 56 dest[priv->len / NFT_REG32_SIZE] = 0;
@@ -47,11 +61,59 @@ err:
47 regs->verdict.code = NFT_BREAK; 61 regs->verdict.code = NFT_BREAK;
48} 62}
49 63
64static void nft_exthdr_tcp_eval(const struct nft_expr *expr,
65 struct nft_regs *regs,
66 const struct nft_pktinfo *pkt)
67{
68 u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE];
69 struct nft_exthdr *priv = nft_expr_priv(expr);
70 unsigned int i, optl, tcphdr_len, offset;
71 u32 *dest = &regs->data[priv->dreg];
72 struct tcphdr *tcph;
73 u8 *opt;
74
75 if (!pkt->tprot_set || pkt->tprot != IPPROTO_TCP)
76 goto err;
77
78 tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, sizeof(*tcph), buff);
79 if (!tcph)
80 goto err;
81
82 tcphdr_len = __tcp_hdrlen(tcph);
83 if (tcphdr_len < sizeof(*tcph))
84 goto err;
85
86 tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, tcphdr_len, buff);
87 if (!tcph)
88 goto err;
89
90 opt = (u8 *)tcph;
91 for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) {
92 optl = optlen(opt, i);
93
94 if (priv->type != opt[i])
95 continue;
96
97 if (i + optl > tcphdr_len || priv->len + priv->offset > optl)
98 goto err;
99
100 offset = i + priv->offset;
101 dest[priv->len / NFT_REG32_SIZE] = 0;
102 memcpy(dest, opt + offset, priv->len);
103
104 return;
105 }
106
107err:
108 regs->verdict.code = NFT_BREAK;
109}
110
50static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { 111static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = {
51 [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, 112 [NFTA_EXTHDR_DREG] = { .type = NLA_U32 },
52 [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, 113 [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 },
53 [NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 }, 114 [NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 },
54 [NFTA_EXTHDR_LEN] = { .type = NLA_U32 }, 115 [NFTA_EXTHDR_LEN] = { .type = NLA_U32 },
116 [NFTA_EXTHDR_FLAGS] = { .type = NLA_U32 },
55}; 117};
56 118
57static int nft_exthdr_init(const struct nft_ctx *ctx, 119static int nft_exthdr_init(const struct nft_ctx *ctx,
@@ -59,13 +121,13 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
59 const struct nlattr * const tb[]) 121 const struct nlattr * const tb[])
60{ 122{
61 struct nft_exthdr *priv = nft_expr_priv(expr); 123 struct nft_exthdr *priv = nft_expr_priv(expr);
62 u32 offset, len; 124 u32 offset, len, flags = 0, op = NFT_EXTHDR_OP_IPV6;
63 int err; 125 int err;
64 126
65 if (tb[NFTA_EXTHDR_DREG] == NULL || 127 if (!tb[NFTA_EXTHDR_DREG] ||
66 tb[NFTA_EXTHDR_TYPE] == NULL || 128 !tb[NFTA_EXTHDR_TYPE] ||
67 tb[NFTA_EXTHDR_OFFSET] == NULL || 129 !tb[NFTA_EXTHDR_OFFSET] ||
68 tb[NFTA_EXTHDR_LEN] == NULL) 130 !tb[NFTA_EXTHDR_LEN])
69 return -EINVAL; 131 return -EINVAL;
70 132
71 err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset); 133 err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset);
@@ -76,10 +138,27 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
76 if (err < 0) 138 if (err < 0)
77 return err; 139 return err;
78 140
141 if (tb[NFTA_EXTHDR_FLAGS]) {
142 err = nft_parse_u32_check(tb[NFTA_EXTHDR_FLAGS], U8_MAX, &flags);
143 if (err < 0)
144 return err;
145
146 if (flags & ~NFT_EXTHDR_F_PRESENT)
147 return -EINVAL;
148 }
149
150 if (tb[NFTA_EXTHDR_OP]) {
151 err = nft_parse_u32_check(tb[NFTA_EXTHDR_OP], U8_MAX, &op);
152 if (err < 0)
153 return err;
154 }
155
79 priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); 156 priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
80 priv->offset = offset; 157 priv->offset = offset;
81 priv->len = len; 158 priv->len = len;
82 priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]); 159 priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]);
160 priv->flags = flags;
161 priv->op = op;
83 162
84 return nft_validate_register_store(ctx, priv->dreg, NULL, 163 return nft_validate_register_store(ctx, priv->dreg, NULL,
85 NFT_DATA_VALUE, priv->len); 164 NFT_DATA_VALUE, priv->len);
@@ -97,6 +176,10 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr)
97 goto nla_put_failure; 176 goto nla_put_failure;
98 if (nla_put_be32(skb, NFTA_EXTHDR_LEN, htonl(priv->len))) 177 if (nla_put_be32(skb, NFTA_EXTHDR_LEN, htonl(priv->len)))
99 goto nla_put_failure; 178 goto nla_put_failure;
179 if (nla_put_be32(skb, NFTA_EXTHDR_FLAGS, htonl(priv->flags)))
180 goto nla_put_failure;
181 if (nla_put_be32(skb, NFTA_EXTHDR_OP, htonl(priv->op)))
182 goto nla_put_failure;
100 return 0; 183 return 0;
101 184
102nla_put_failure: 185nla_put_failure:
@@ -104,17 +187,45 @@ nla_put_failure:
104} 187}
105 188
106static struct nft_expr_type nft_exthdr_type; 189static struct nft_expr_type nft_exthdr_type;
107static const struct nft_expr_ops nft_exthdr_ops = { 190static const struct nft_expr_ops nft_exthdr_ipv6_ops = {
108 .type = &nft_exthdr_type, 191 .type = &nft_exthdr_type,
109 .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), 192 .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
110 .eval = nft_exthdr_eval, 193 .eval = nft_exthdr_ipv6_eval,
111 .init = nft_exthdr_init, 194 .init = nft_exthdr_init,
112 .dump = nft_exthdr_dump, 195 .dump = nft_exthdr_dump,
113}; 196};
114 197
198static const struct nft_expr_ops nft_exthdr_tcp_ops = {
199 .type = &nft_exthdr_type,
200 .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
201 .eval = nft_exthdr_tcp_eval,
202 .init = nft_exthdr_init,
203 .dump = nft_exthdr_dump,
204};
205
206static const struct nft_expr_ops *
207nft_exthdr_select_ops(const struct nft_ctx *ctx,
208 const struct nlattr * const tb[])
209{
210 u32 op;
211
212 if (!tb[NFTA_EXTHDR_OP])
213 return &nft_exthdr_ipv6_ops;
214
215 op = ntohl(nla_get_u32(tb[NFTA_EXTHDR_OP]));
216 switch (op) {
217 case NFT_EXTHDR_OP_TCPOPT:
218 return &nft_exthdr_tcp_ops;
219 case NFT_EXTHDR_OP_IPV6:
220 return &nft_exthdr_ipv6_ops;
221 }
222
223 return ERR_PTR(-EOPNOTSUPP);
224}
225
115static struct nft_expr_type nft_exthdr_type __read_mostly = { 226static struct nft_expr_type nft_exthdr_type __read_mostly = {
116 .name = "exthdr", 227 .name = "exthdr",
117 .ops = &nft_exthdr_ops, 228 .select_ops = &nft_exthdr_select_ops,
118 .policy = nft_exthdr_policy, 229 .policy = nft_exthdr_policy,
119 .maxattr = NFTA_EXTHDR_MAX, 230 .maxattr = NFTA_EXTHDR_MAX,
120 .owner = THIS_MODULE, 231 .owner = THIS_MODULE,
diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c
new file mode 100644
index 000000000000..29a4906adc27
--- /dev/null
+++ b/net/netfilter/nft_fib.c
@@ -0,0 +1,159 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License version 2 as
4 * published by the Free Software Foundation.
5 *
6 * Generic part shared by ipv4 and ipv6 backends.
7 */
8
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/module.h>
12#include <linux/netlink.h>
13#include <linux/netfilter.h>
14#include <linux/netfilter/nf_tables.h>
15#include <net/netfilter/nf_tables_core.h>
16#include <net/netfilter/nf_tables.h>
17#include <net/netfilter/nft_fib.h>
18
19const struct nla_policy nft_fib_policy[NFTA_FIB_MAX + 1] = {
20 [NFTA_FIB_DREG] = { .type = NLA_U32 },
21 [NFTA_FIB_RESULT] = { .type = NLA_U32 },
22 [NFTA_FIB_FLAGS] = { .type = NLA_U32 },
23};
24EXPORT_SYMBOL(nft_fib_policy);
25
26#define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \
27 NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF)
28
29int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
30 const struct nft_data **data)
31{
32 const struct nft_fib *priv = nft_expr_priv(expr);
33 unsigned int hooks;
34
35 switch (priv->result) {
36 case NFT_FIB_RESULT_OIF: /* fallthrough */
37 case NFT_FIB_RESULT_OIFNAME:
38 hooks = (1 << NF_INET_PRE_ROUTING);
39 break;
40 case NFT_FIB_RESULT_ADDRTYPE:
41 if (priv->flags & NFTA_FIB_F_IIF)
42 hooks = (1 << NF_INET_PRE_ROUTING) |
43 (1 << NF_INET_LOCAL_IN) |
44 (1 << NF_INET_FORWARD);
45 else if (priv->flags & NFTA_FIB_F_OIF)
46 hooks = (1 << NF_INET_LOCAL_OUT) |
47 (1 << NF_INET_POST_ROUTING) |
48 (1 << NF_INET_FORWARD);
49 else
50 hooks = (1 << NF_INET_LOCAL_IN) |
51 (1 << NF_INET_LOCAL_OUT) |
52 (1 << NF_INET_FORWARD) |
53 (1 << NF_INET_PRE_ROUTING) |
54 (1 << NF_INET_POST_ROUTING);
55
56 break;
57 default:
58 return -EINVAL;
59 }
60
61 return nft_chain_validate_hooks(ctx->chain, hooks);
62}
63EXPORT_SYMBOL_GPL(nft_fib_validate);
64
65int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
66 const struct nlattr * const tb[])
67{
68 struct nft_fib *priv = nft_expr_priv(expr);
69 unsigned int len;
70 int err;
71
72 if (!tb[NFTA_FIB_DREG] || !tb[NFTA_FIB_RESULT] || !tb[NFTA_FIB_FLAGS])
73 return -EINVAL;
74
75 priv->flags = ntohl(nla_get_be32(tb[NFTA_FIB_FLAGS]));
76
77 if (priv->flags == 0 || (priv->flags & ~NFTA_FIB_F_ALL))
78 return -EINVAL;
79
80 if ((priv->flags & (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR)) ==
81 (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR))
82 return -EINVAL;
83 if ((priv->flags & (NFTA_FIB_F_IIF | NFTA_FIB_F_OIF)) ==
84 (NFTA_FIB_F_IIF | NFTA_FIB_F_OIF))
85 return -EINVAL;
86 if ((priv->flags & (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR)) == 0)
87 return -EINVAL;
88
89 priv->result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT]));
90 priv->dreg = nft_parse_register(tb[NFTA_FIB_DREG]);
91
92 switch (priv->result) {
93 case NFT_FIB_RESULT_OIF:
94 if (priv->flags & NFTA_FIB_F_OIF)
95 return -EINVAL;
96 len = sizeof(int);
97 break;
98 case NFT_FIB_RESULT_OIFNAME:
99 if (priv->flags & NFTA_FIB_F_OIF)
100 return -EINVAL;
101 len = IFNAMSIZ;
102 break;
103 case NFT_FIB_RESULT_ADDRTYPE:
104 len = sizeof(u32);
105 break;
106 default:
107 return -EINVAL;
108 }
109
110 err = nft_validate_register_store(ctx, priv->dreg, NULL,
111 NFT_DATA_VALUE, len);
112 if (err < 0)
113 return err;
114
115 return nft_fib_validate(ctx, expr, NULL);
116}
117EXPORT_SYMBOL_GPL(nft_fib_init);
118
119int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr)
120{
121 const struct nft_fib *priv = nft_expr_priv(expr);
122
123 if (nft_dump_register(skb, NFTA_FIB_DREG, priv->dreg))
124 return -1;
125
126 if (nla_put_be32(skb, NFTA_FIB_RESULT, htonl(priv->result)))
127 return -1;
128
129 if (nla_put_be32(skb, NFTA_FIB_FLAGS, htonl(priv->flags)))
130 return -1;
131
132 return 0;
133}
134EXPORT_SYMBOL_GPL(nft_fib_dump);
135
136void nft_fib_store_result(void *reg, enum nft_fib_result r,
137 const struct nft_pktinfo *pkt, int index)
138{
139 struct net_device *dev;
140 u32 *dreg = reg;
141
142 switch (r) {
143 case NFT_FIB_RESULT_OIF:
144 *dreg = index;
145 break;
146 case NFT_FIB_RESULT_OIFNAME:
147 dev = dev_get_by_index_rcu(nft_net(pkt), index);
148 strncpy(reg, dev ? dev->name : "", IFNAMSIZ);
149 break;
150 default:
151 WARN_ON_ONCE(1);
152 *dreg = 0;
153 break;
154 }
155}
156EXPORT_SYMBOL_GPL(nft_fib_store_result);
157
158MODULE_LICENSE("GPL");
159MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
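Note on nft_fib_init() above: it requires exactly one of NFTA_FIB_F_SADDR/NFTA_FIB_F_DADDR and rejects NFTA_FIB_F_IIF combined with NFTA_FIB_F_OIF. A compact sketch of the underlying "exactly one bit of a pair" test, using the same masking idiom as the patch but hypothetical flag values:

#include <stdbool.h>
#include <stdint.h>

#define DEMO_F_SADDR	0x01
#define DEMO_F_DADDR	0x02

static bool demo_exactly_one(uint32_t flags, uint32_t pair)
{
	uint32_t hit = flags & pair;

	return hit != 0 && hit != pair;	/* at least one bit set, but not both */
}

/* demo_exactly_one(DEMO_F_SADDR, DEMO_F_SADDR | DEMO_F_DADDR) == true
 * demo_exactly_one(DEMO_F_SADDR | DEMO_F_DADDR, DEMO_F_SADDR | DEMO_F_DADDR) == false
 */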
diff --git a/net/netfilter/nft_fib_inet.c b/net/netfilter/nft_fib_inet.c
new file mode 100644
index 000000000000..9120fc7228f4
--- /dev/null
+++ b/net/netfilter/nft_fib_inet.c
@@ -0,0 +1,82 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License version 2 as
4 * published by the Free Software Foundation.
5 */
6
7#include <linux/kernel.h>
8#include <linux/init.h>
9#include <linux/module.h>
10#include <linux/netlink.h>
11#include <linux/netfilter.h>
12#include <linux/netfilter/nf_tables.h>
13#include <net/netfilter/nf_tables_core.h>
14#include <net/netfilter/nf_tables.h>
15
16#include <net/netfilter/nft_fib.h>
17
18static void nft_fib_inet_eval(const struct nft_expr *expr,
19 struct nft_regs *regs,
20 const struct nft_pktinfo *pkt)
21{
22 const struct nft_fib *priv = nft_expr_priv(expr);
23
24 switch (nft_pf(pkt)) {
25 case NFPROTO_IPV4:
26 switch (priv->result) {
27 case NFT_FIB_RESULT_OIF:
28 case NFT_FIB_RESULT_OIFNAME:
29 return nft_fib4_eval(expr, regs, pkt);
30 case NFT_FIB_RESULT_ADDRTYPE:
31 return nft_fib4_eval_type(expr, regs, pkt);
32 }
33 break;
34 case NFPROTO_IPV6:
35 switch (priv->result) {
36 case NFT_FIB_RESULT_OIF:
37 case NFT_FIB_RESULT_OIFNAME:
38 return nft_fib6_eval(expr, regs, pkt);
39 case NFT_FIB_RESULT_ADDRTYPE:
40 return nft_fib6_eval_type(expr, regs, pkt);
41 }
42 break;
43 }
44
45 regs->verdict.code = NF_DROP;
46}
47
48static struct nft_expr_type nft_fib_inet_type;
49static const struct nft_expr_ops nft_fib_inet_ops = {
50 .type = &nft_fib_inet_type,
51 .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)),
52 .eval = nft_fib_inet_eval,
53 .init = nft_fib_init,
54 .dump = nft_fib_dump,
55 .validate = nft_fib_validate,
56};
57
58static struct nft_expr_type nft_fib_inet_type __read_mostly = {
59 .family = NFPROTO_INET,
60 .name = "fib",
61 .ops = &nft_fib_inet_ops,
62 .policy = nft_fib_policy,
63 .maxattr = NFTA_FIB_MAX,
64 .owner = THIS_MODULE,
65};
66
67static int __init nft_fib_inet_module_init(void)
68{
69 return nft_register_expr(&nft_fib_inet_type);
70}
71
72static void __exit nft_fib_inet_module_exit(void)
73{
74 nft_unregister_expr(&nft_fib_inet_type);
75}
76
77module_init(nft_fib_inet_module_init);
78module_exit(nft_fib_inet_module_exit);
79
80MODULE_LICENSE("GPL");
81MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
82MODULE_ALIAS_NFT_AF_EXPR(1, "fib");
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index 763ebc3e0b2b..ce13a50b9189 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -26,8 +26,8 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr,
26 struct nft_fwd_netdev *priv = nft_expr_priv(expr); 26 struct nft_fwd_netdev *priv = nft_expr_priv(expr);
27 int oif = regs->data[priv->sreg_dev]; 27 int oif = regs->data[priv->sreg_dev];
28 28
29 nf_dup_netdev_egress(pkt, oif); 29 nf_fwd_netdev_egress(pkt, oif);
30 regs->verdict.code = NF_DROP; 30 regs->verdict.code = NF_STOLEN;
31} 31}
32 32
33static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = { 33static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = {
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index d5447a22275c..c4dad1254ead 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -21,6 +21,7 @@ struct nft_hash {
21 enum nft_registers sreg:8; 21 enum nft_registers sreg:8;
22 enum nft_registers dreg:8; 22 enum nft_registers dreg:8;
23 u8 len; 23 u8 len;
24 bool autogen_seed:1;
24 u32 modulus; 25 u32 modulus;
25 u32 seed; 26 u32 seed;
26 u32 offset; 27 u32 offset;
@@ -58,7 +59,6 @@ static int nft_hash_init(const struct nft_ctx *ctx,
58 if (!tb[NFTA_HASH_SREG] || 59 if (!tb[NFTA_HASH_SREG] ||
59 !tb[NFTA_HASH_DREG] || 60 !tb[NFTA_HASH_DREG] ||
60 !tb[NFTA_HASH_LEN] || 61 !tb[NFTA_HASH_LEN] ||
61 !tb[NFTA_HASH_SEED] ||
62 !tb[NFTA_HASH_MODULUS]) 62 !tb[NFTA_HASH_MODULUS])
63 return -EINVAL; 63 return -EINVAL;
64 64
@@ -83,7 +83,12 @@ static int nft_hash_init(const struct nft_ctx *ctx,
83 if (priv->offset + priv->modulus - 1 < priv->offset) 83 if (priv->offset + priv->modulus - 1 < priv->offset)
84 return -EOVERFLOW; 84 return -EOVERFLOW;
85 85
86 priv->seed = ntohl(nla_get_be32(tb[NFTA_HASH_SEED])); 86 if (tb[NFTA_HASH_SEED]) {
87 priv->seed = ntohl(nla_get_be32(tb[NFTA_HASH_SEED]));
88 } else {
89 priv->autogen_seed = true;
90 get_random_bytes(&priv->seed, sizeof(priv->seed));
91 }
87 92
88 return nft_validate_register_load(priv->sreg, len) && 93 return nft_validate_register_load(priv->sreg, len) &&
89 nft_validate_register_store(ctx, priv->dreg, NULL, 94 nft_validate_register_store(ctx, priv->dreg, NULL,
@@ -103,7 +108,8 @@ static int nft_hash_dump(struct sk_buff *skb,
103 goto nla_put_failure; 108 goto nla_put_failure;
104 if (nla_put_be32(skb, NFTA_HASH_MODULUS, htonl(priv->modulus))) 109 if (nla_put_be32(skb, NFTA_HASH_MODULUS, htonl(priv->modulus)))
105 goto nla_put_failure; 110 goto nla_put_failure;
106 if (nla_put_be32(skb, NFTA_HASH_SEED, htonl(priv->seed))) 111 if (!priv->autogen_seed &&
112 nla_put_be32(skb, NFTA_HASH_SEED, htonl(priv->seed)))
107 goto nla_put_failure; 113 goto nla_put_failure;
108 if (priv->offset != 0) 114 if (priv->offset != 0)
109 if (nla_put_be32(skb, NFTA_HASH_OFFSET, htonl(priv->offset))) 115 if (nla_put_be32(skb, NFTA_HASH_OFFSET, htonl(priv->offset)))
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index d17018ff54e6..728baf88295a 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -54,9 +54,6 @@ static int nft_immediate_init(const struct nft_ctx *ctx,
54 if (err < 0) 54 if (err < 0)
55 return err; 55 return err;
56 56
57 if (desc.len > U8_MAX)
58 return -ERANGE;
59
60 priv->dlen = desc.len; 57 priv->dlen = desc.len;
61 58
62 priv->dreg = nft_parse_register(tb[NFTA_IMMEDIATE_DREG]); 59 priv->dreg = nft_parse_register(tb[NFTA_IMMEDIATE_DREG]);
@@ -105,7 +102,6 @@ static int nft_immediate_validate(const struct nft_ctx *ctx,
105 return 0; 102 return 0;
106} 103}
107 104
108static struct nft_expr_type nft_imm_type;
109static const struct nft_expr_ops nft_imm_ops = { 105static const struct nft_expr_ops nft_imm_ops = {
110 .type = &nft_imm_type, 106 .type = &nft_imm_type,
111 .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)), 107 .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)),
@@ -116,20 +112,10 @@ static const struct nft_expr_ops nft_imm_ops = {
116 .validate = nft_immediate_validate, 112 .validate = nft_immediate_validate,
117}; 113};
118 114
119static struct nft_expr_type nft_imm_type __read_mostly = { 115struct nft_expr_type nft_imm_type __read_mostly = {
120 .name = "immediate", 116 .name = "immediate",
121 .ops = &nft_imm_ops, 117 .ops = &nft_imm_ops,
122 .policy = nft_immediate_policy, 118 .policy = nft_immediate_policy,
123 .maxattr = NFTA_IMMEDIATE_MAX, 119 .maxattr = NFTA_IMMEDIATE_MAX,
124 .owner = THIS_MODULE, 120 .owner = THIS_MODULE,
125}; 121};
126
127int __init nft_immediate_module_init(void)
128{
129 return nft_register_expr(&nft_imm_type);
130}
131
132void nft_immediate_module_exit(void)
133{
134 nft_unregister_expr(&nft_imm_type);
135}
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index 1b01404bb33f..6f6e64423643 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -32,13 +32,15 @@ static void nft_log_eval(const struct nft_expr *expr,
32{ 32{
33 const struct nft_log *priv = nft_expr_priv(expr); 33 const struct nft_log *priv = nft_expr_priv(expr);
34 34
35 nf_log_packet(pkt->net, pkt->pf, pkt->hook, pkt->skb, pkt->in, 35 nf_log_packet(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb,
36 pkt->out, &priv->loginfo, "%s", priv->prefix); 36 nft_in(pkt), nft_out(pkt), &priv->loginfo, "%s",
37 priv->prefix);
37} 38}
38 39
39static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = { 40static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = {
40 [NFTA_LOG_GROUP] = { .type = NLA_U16 }, 41 [NFTA_LOG_GROUP] = { .type = NLA_U16 },
41 [NFTA_LOG_PREFIX] = { .type = NLA_STRING }, 42 [NFTA_LOG_PREFIX] = { .type = NLA_STRING,
43 .len = NF_LOG_PREFIXLEN - 1 },
42 [NFTA_LOG_SNAPLEN] = { .type = NLA_U32 }, 44 [NFTA_LOG_SNAPLEN] = { .type = NLA_U32 },
43 [NFTA_LOG_QTHRESHOLD] = { .type = NLA_U16 }, 45 [NFTA_LOG_QTHRESHOLD] = { .type = NLA_U16 },
44 [NFTA_LOG_LEVEL] = { .type = NLA_U32 }, 46 [NFTA_LOG_LEVEL] = { .type = NLA_U32 },
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 8166b6994cc7..e21aea7e5ec8 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -35,9 +35,8 @@ static void nft_lookup_eval(const struct nft_expr *expr,
35 const struct nft_set_ext *ext; 35 const struct nft_set_ext *ext;
36 bool found; 36 bool found;
37 37
38 found = set->ops->lookup(pkt->net, set, &regs->data[priv->sreg], &ext) ^ 38 found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg],
39 priv->invert; 39 &ext) ^ priv->invert;
40
41 if (!found) { 40 if (!found) {
42 regs->verdict.code = NFT_BREAK; 41 regs->verdict.code = NFT_BREAK;
43 return; 42 return;
@@ -50,7 +49,8 @@ static void nft_lookup_eval(const struct nft_expr *expr,
50} 49}
51 50
52static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { 51static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = {
53 [NFTA_LOOKUP_SET] = { .type = NLA_STRING }, 52 [NFTA_LOOKUP_SET] = { .type = NLA_STRING,
53 .len = NFT_SET_MAXNAMELEN - 1 },
54 [NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 }, 54 [NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 },
55 [NFTA_LOOKUP_SREG] = { .type = NLA_U32 }, 55 [NFTA_LOOKUP_SREG] = { .type = NLA_U32 },
56 [NFTA_LOOKUP_DREG] = { .type = NLA_U32 }, 56 [NFTA_LOOKUP_DREG] = { .type = NLA_U32 },
@@ -155,7 +155,6 @@ nla_put_failure:
155 return -1; 155 return -1;
156} 156}
157 157
158static struct nft_expr_type nft_lookup_type;
159static const struct nft_expr_ops nft_lookup_ops = { 158static const struct nft_expr_ops nft_lookup_ops = {
160 .type = &nft_lookup_type, 159 .type = &nft_lookup_type,
161 .size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)), 160 .size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
@@ -165,20 +164,10 @@ static const struct nft_expr_ops nft_lookup_ops = {
165 .dump = nft_lookup_dump, 164 .dump = nft_lookup_dump,
166}; 165};
167 166
168static struct nft_expr_type nft_lookup_type __read_mostly = { 167struct nft_expr_type nft_lookup_type __read_mostly = {
169 .name = "lookup", 168 .name = "lookup",
170 .ops = &nft_lookup_ops, 169 .ops = &nft_lookup_ops,
171 .policy = nft_lookup_policy, 170 .policy = nft_lookup_policy,
172 .maxattr = NFTA_LOOKUP_MAX, 171 .maxattr = NFTA_LOOKUP_MAX,
173 .owner = THIS_MODULE, 172 .owner = THIS_MODULE,
174}; 173};
175
176int __init nft_lookup_module_init(void)
177{
178 return nft_register_expr(&nft_lookup_type);
179}
180
181void nft_lookup_module_exit(void)
182{
183 nft_unregister_expr(&nft_lookup_type);
184}
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 81b5ad6165ac..11ce016cd479 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> 2 * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 5 * it under the terms of the GNU General Public License version 2 as
@@ -77,7 +77,7 @@ int nft_masq_init(const struct nft_ctx *ctx,
77 } 77 }
78 } 78 }
79 79
80 return 0; 80 return nf_ct_netns_get(ctx->net, ctx->afi->family);
81} 81}
82EXPORT_SYMBOL_GPL(nft_masq_init); 82EXPORT_SYMBOL_GPL(nft_masq_init);
83 83
@@ -105,4 +105,4 @@ nla_put_failure:
105EXPORT_SYMBOL_GPL(nft_masq_dump); 105EXPORT_SYMBOL_GPL(nft_masq_dump);
106 106
107MODULE_LICENSE("GPL"); 107MODULE_LICENSE("GPL");
108MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); 108MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 6c1e0246706e..7b60e01f38ff 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -36,7 +36,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
36{ 36{
37 const struct nft_meta *priv = nft_expr_priv(expr); 37 const struct nft_meta *priv = nft_expr_priv(expr);
38 const struct sk_buff *skb = pkt->skb; 38 const struct sk_buff *skb = pkt->skb;
39 const struct net_device *in = pkt->in, *out = pkt->out; 39 const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
40 struct sock *sk; 40 struct sock *sk;
41 u32 *dest = &regs->data[priv->dreg]; 41 u32 *dest = &regs->data[priv->dreg];
42 42
@@ -45,16 +45,15 @@ void nft_meta_get_eval(const struct nft_expr *expr,
45 *dest = skb->len; 45 *dest = skb->len;
46 break; 46 break;
47 case NFT_META_PROTOCOL: 47 case NFT_META_PROTOCOL:
48 *dest = 0; 48 nft_reg_store16(dest, (__force u16)skb->protocol);
49 *(__be16 *)dest = skb->protocol;
50 break; 49 break;
51 case NFT_META_NFPROTO: 50 case NFT_META_NFPROTO:
52 *dest = pkt->pf; 51 nft_reg_store8(dest, nft_pf(pkt));
53 break; 52 break;
54 case NFT_META_L4PROTO: 53 case NFT_META_L4PROTO:
55 if (!pkt->tprot_set) 54 if (!pkt->tprot_set)
56 goto err; 55 goto err;
57 *dest = pkt->tprot; 56 nft_reg_store8(dest, pkt->tprot);
58 break; 57 break;
59 case NFT_META_PRIORITY: 58 case NFT_META_PRIORITY:
60 *dest = skb->priority; 59 *dest = skb->priority;
@@ -85,14 +84,12 @@ void nft_meta_get_eval(const struct nft_expr *expr,
85 case NFT_META_IIFTYPE: 84 case NFT_META_IIFTYPE:
86 if (in == NULL) 85 if (in == NULL)
87 goto err; 86 goto err;
88 *dest = 0; 87 nft_reg_store16(dest, in->type);
89 *(u16 *)dest = in->type;
90 break; 88 break;
91 case NFT_META_OIFTYPE: 89 case NFT_META_OIFTYPE:
92 if (out == NULL) 90 if (out == NULL)
93 goto err; 91 goto err;
94 *dest = 0; 92 nft_reg_store16(dest, out->type);
95 *(u16 *)dest = out->type;
96 break; 93 break;
97 case NFT_META_SKUID: 94 case NFT_META_SKUID:
98 sk = skb_to_full_sk(skb); 95 sk = skb_to_full_sk(skb);
@@ -142,25 +139,48 @@ void nft_meta_get_eval(const struct nft_expr *expr,
142#endif 139#endif
143 case NFT_META_PKTTYPE: 140 case NFT_META_PKTTYPE:
144 if (skb->pkt_type != PACKET_LOOPBACK) { 141 if (skb->pkt_type != PACKET_LOOPBACK) {
145 *dest = skb->pkt_type; 142 nft_reg_store8(dest, skb->pkt_type);
146 break; 143 break;
147 } 144 }
148 145
149 switch (pkt->pf) { 146 switch (nft_pf(pkt)) {
150 case NFPROTO_IPV4: 147 case NFPROTO_IPV4:
151 if (ipv4_is_multicast(ip_hdr(skb)->daddr)) 148 if (ipv4_is_multicast(ip_hdr(skb)->daddr))
152 *dest = PACKET_MULTICAST; 149 nft_reg_store8(dest, PACKET_MULTICAST);
153 else 150 else
154 *dest = PACKET_BROADCAST; 151 nft_reg_store8(dest, PACKET_BROADCAST);
155 break; 152 break;
156 case NFPROTO_IPV6: 153 case NFPROTO_IPV6:
157 if (ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF) 154 nft_reg_store8(dest, PACKET_MULTICAST);
158 *dest = PACKET_MULTICAST; 155 break;
159 else 156 case NFPROTO_NETDEV:
160 *dest = PACKET_BROADCAST; 157 switch (skb->protocol) {
158 case htons(ETH_P_IP): {
159 int noff = skb_network_offset(skb);
160 struct iphdr *iph, _iph;
161
162 iph = skb_header_pointer(skb, noff,
163 sizeof(_iph), &_iph);
164 if (!iph)
165 goto err;
166
167 if (ipv4_is_multicast(iph->daddr))
168 nft_reg_store8(dest, PACKET_MULTICAST);
169 else
170 nft_reg_store8(dest, PACKET_BROADCAST);
171
172 break;
173 }
174 case htons(ETH_P_IPV6):
175 nft_reg_store8(dest, PACKET_MULTICAST);
176 break;
177 default:
178 WARN_ON_ONCE(1);
179 goto err;
180 }
161 break; 181 break;
162 default: 182 default:
163 WARN_ON(1); 183 WARN_ON_ONCE(1);
164 goto err; 184 goto err;
165 } 185 }
166 break; 186 break;
@@ -207,7 +227,9 @@ void nft_meta_set_eval(const struct nft_expr *expr,
207{ 227{
208 const struct nft_meta *meta = nft_expr_priv(expr); 228 const struct nft_meta *meta = nft_expr_priv(expr);
209 struct sk_buff *skb = pkt->skb; 229 struct sk_buff *skb = pkt->skb;
210 u32 value = regs->data[meta->sreg]; 230 u32 *sreg = &regs->data[meta->sreg];
231 u32 value = *sreg;
232 u8 pkt_type;
211 233
212 switch (meta->key) { 234 switch (meta->key) {
213 case NFT_META_MARK: 235 case NFT_META_MARK:
@@ -217,9 +239,12 @@ void nft_meta_set_eval(const struct nft_expr *expr,
217 skb->priority = value; 239 skb->priority = value;
218 break; 240 break;
219 case NFT_META_PKTTYPE: 241 case NFT_META_PKTTYPE:
220 if (skb->pkt_type != value && 242 pkt_type = nft_reg_load8(sreg);
221 skb_pkt_type_ok(value) && skb_pkt_type_ok(skb->pkt_type)) 243
222 skb->pkt_type = value; 244 if (skb->pkt_type != pkt_type &&
245 skb_pkt_type_ok(pkt_type) &&
246 skb_pkt_type_ok(skb->pkt_type))
247 skb->pkt_type = pkt_type;
223 break; 248 break;
224 case NFT_META_NFTRACE: 249 case NFT_META_NFTRACE:
225 skb->nf_trace = !!value; 250 skb->nf_trace = !!value;
@@ -310,6 +335,11 @@ int nft_meta_set_validate(const struct nft_ctx *ctx,
310 case NFPROTO_NETDEV: 335 case NFPROTO_NETDEV:
311 hooks = 1 << NF_NETDEV_INGRESS; 336 hooks = 1 << NF_NETDEV_INGRESS;
312 break; 337 break;
338 case NFPROTO_IPV4:
339 case NFPROTO_IPV6:
340 case NFPROTO_INET:
341 hooks = 1 << NF_INET_PRE_ROUTING;
342 break;
313 default: 343 default:
314 return -EOPNOTSUPP; 344 return -EOPNOTSUPP;
315 } 345 }
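Note on the meta (and ct) hunks above: raw "*(u16 *)dest = ..." and "*dest = value" stores are replaced by nft_reg_store8()/nft_reg_store16() and the matching load helpers, so a sub-word value always lands at a fixed byte position within the 32-bit register with the remaining bytes cleared, rather than in a layout that differs between little- and big-endian hosts. The helpers themselves are defined outside this diff; the sketch below shows their assumed semantics with user-space names.

#include <stdint.h>
#include <string.h>

static inline void demo_reg_store8(uint32_t *dreg, uint8_t val)
{
	*dreg = 0;			/* clear the whole 32-bit register */
	memcpy(dreg, &val, sizeof(val));/* value occupies the first byte */
}

static inline void demo_reg_store16(uint32_t *dreg, uint16_t val)
{
	*dreg = 0;
	memcpy(dreg, &val, sizeof(val));
}

static inline uint8_t demo_reg_load8(const uint32_t *sreg)
{
	uint8_t val;

	memcpy(&val, sreg, sizeof(val));
	return val;
}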
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index ee2d71753746..439e0bd152a0 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -65,10 +65,10 @@ static void nft_nat_eval(const struct nft_expr *expr,
65 } 65 }
66 66
67 if (priv->sreg_proto_min) { 67 if (priv->sreg_proto_min) {
68 range.min_proto.all = 68 range.min_proto.all = (__force __be16)nft_reg_load16(
69 *(__be16 *)&regs->data[priv->sreg_proto_min]; 69 &regs->data[priv->sreg_proto_min]);
70 range.max_proto.all = 70 range.max_proto.all = (__force __be16)nft_reg_load16(
71 *(__be16 *)&regs->data[priv->sreg_proto_max]; 71 &regs->data[priv->sreg_proto_max]);
72 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 72 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
73 } 73 }
74 74
@@ -209,7 +209,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
209 return -EINVAL; 209 return -EINVAL;
210 } 210 }
211 211
212 return 0; 212 return nf_ct_netns_get(ctx->net, family);
213} 213}
214 214
215static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr) 215static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -257,12 +257,21 @@ nla_put_failure:
257 return -1; 257 return -1;
258} 258}
259 259
260static void
261nft_nat_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
262{
263 const struct nft_nat *priv = nft_expr_priv(expr);
264
265 nf_ct_netns_put(ctx->net, priv->family);
266}
267
260static struct nft_expr_type nft_nat_type; 268static struct nft_expr_type nft_nat_type;
261static const struct nft_expr_ops nft_nat_ops = { 269static const struct nft_expr_ops nft_nat_ops = {
262 .type = &nft_nat_type, 270 .type = &nft_nat_type,
263 .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)), 271 .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)),
264 .eval = nft_nat_eval, 272 .eval = nft_nat_eval,
265 .init = nft_nat_init, 273 .init = nft_nat_init,
274 .destroy = nft_nat_destroy,
266 .dump = nft_nat_dump, 275 .dump = nft_nat_dump,
267 .validate = nft_nat_validate, 276 .validate = nft_nat_validate,
268}; 277};
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 55bc5ab78d4a..a66b36097b8f 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -65,7 +65,7 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
65 return -EOVERFLOW; 65 return -EOVERFLOW;
66 66
67 priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]); 67 priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
68 atomic_set(&priv->counter, 0); 68 atomic_set(&priv->counter, priv->modulus - 1);
69 69
70 return nft_validate_register_store(ctx, priv->dreg, NULL, 70 return nft_validate_register_store(ctx, priv->dreg, NULL,
71 NFT_DATA_VALUE, sizeof(u32)); 71 NFT_DATA_VALUE, sizeof(u32));
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
new file mode 100644
index 000000000000..1ae8c49ca4a1
--- /dev/null
+++ b/net/netfilter/nft_objref.c
@@ -0,0 +1,228 @@
1/*
2 * Copyright (c) 2012-2016 Pablo Neira Ayuso <pablo@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 */
8
9#include <linux/init.h>
10#include <linux/module.h>
11#include <linux/skbuff.h>
12#include <linux/netlink.h>
13#include <linux/netfilter.h>
14#include <linux/netfilter/nf_tables.h>
15#include <net/netfilter/nf_tables.h>
16
17#define nft_objref_priv(expr) *((struct nft_object **)nft_expr_priv(expr))
18
19static void nft_objref_eval(const struct nft_expr *expr,
20 struct nft_regs *regs,
21 const struct nft_pktinfo *pkt)
22{
23 struct nft_object *obj = nft_objref_priv(expr);
24
25 obj->type->eval(obj, regs, pkt);
26}
27
28static int nft_objref_init(const struct nft_ctx *ctx,
29 const struct nft_expr *expr,
30 const struct nlattr * const tb[])
31{
32 struct nft_object *obj = nft_objref_priv(expr);
33 u8 genmask = nft_genmask_next(ctx->net);
34 u32 objtype;
35
36 if (!tb[NFTA_OBJREF_IMM_NAME] ||
37 !tb[NFTA_OBJREF_IMM_TYPE])
38 return -EINVAL;
39
40 objtype = ntohl(nla_get_be32(tb[NFTA_OBJREF_IMM_TYPE]));
41 obj = nf_tables_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype,
42 genmask);
43 if (IS_ERR(obj))
44 return -ENOENT;
45
46 nft_objref_priv(expr) = obj;
47 obj->use++;
48
49 return 0;
50}
51
52static int nft_objref_dump(struct sk_buff *skb, const struct nft_expr *expr)
53{
54 const struct nft_object *obj = nft_objref_priv(expr);
55
56 if (nla_put_string(skb, NFTA_OBJREF_IMM_NAME, obj->name) ||
57 nla_put_be32(skb, NFTA_OBJREF_IMM_TYPE, htonl(obj->type->type)))
58 goto nla_put_failure;
59
60 return 0;
61
62nla_put_failure:
63 return -1;
64}
65
66static void nft_objref_destroy(const struct nft_ctx *ctx,
67 const struct nft_expr *expr)
68{
69 struct nft_object *obj = nft_objref_priv(expr);
70
71 obj->use--;
72}
73
74static struct nft_expr_type nft_objref_type;
75static const struct nft_expr_ops nft_objref_ops = {
76 .type = &nft_objref_type,
77 .size = NFT_EXPR_SIZE(sizeof(struct nft_object *)),
78 .eval = nft_objref_eval,
79 .init = nft_objref_init,
80 .destroy = nft_objref_destroy,
81 .dump = nft_objref_dump,
82};
83
84struct nft_objref_map {
85 struct nft_set *set;
86 enum nft_registers sreg:8;
87 struct nft_set_binding binding;
88};
89
90static void nft_objref_map_eval(const struct nft_expr *expr,
91 struct nft_regs *regs,
92 const struct nft_pktinfo *pkt)
93{
94 struct nft_objref_map *priv = nft_expr_priv(expr);
95 const struct nft_set *set = priv->set;
96 const struct nft_set_ext *ext;
97 struct nft_object *obj;
98 bool found;
99
100 found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg],
101 &ext);
102 if (!found) {
103 regs->verdict.code = NFT_BREAK;
104 return;
105 }
106 obj = *nft_set_ext_obj(ext);
107 obj->type->eval(obj, regs, pkt);
108}
109
110static int nft_objref_map_init(const struct nft_ctx *ctx,
111 const struct nft_expr *expr,
112 const struct nlattr * const tb[])
113{
114 struct nft_objref_map *priv = nft_expr_priv(expr);
115 u8 genmask = nft_genmask_next(ctx->net);
116 struct nft_set *set;
117 int err;
118
119 set = nf_tables_set_lookup(ctx->table, tb[NFTA_OBJREF_SET_NAME], genmask);
120 if (IS_ERR(set)) {
121 if (tb[NFTA_OBJREF_SET_ID]) {
122 set = nf_tables_set_lookup_byid(ctx->net,
123 tb[NFTA_OBJREF_SET_ID],
124 genmask);
125 }
126 if (IS_ERR(set))
127 return PTR_ERR(set);
128 }
129
130 if (!(set->flags & NFT_SET_OBJECT))
131 return -EINVAL;
132
133 priv->sreg = nft_parse_register(tb[NFTA_OBJREF_SET_SREG]);
134 err = nft_validate_register_load(priv->sreg, set->klen);
135 if (err < 0)
136 return err;
137
138 priv->binding.flags = set->flags & NFT_SET_OBJECT;
139
140 err = nf_tables_bind_set(ctx, set, &priv->binding);
141 if (err < 0)
142 return err;
143
144 priv->set = set;
145 return 0;
146}
147
148static int nft_objref_map_dump(struct sk_buff *skb, const struct nft_expr *expr)
149{
150 const struct nft_objref_map *priv = nft_expr_priv(expr);
151
152 if (nft_dump_register(skb, NFTA_OBJREF_SET_SREG, priv->sreg) ||
153 nla_put_string(skb, NFTA_OBJREF_SET_NAME, priv->set->name))
154 goto nla_put_failure;
155
156 return 0;
157
158nla_put_failure:
159 return -1;
160}
161
162static void nft_objref_map_destroy(const struct nft_ctx *ctx,
163 const struct nft_expr *expr)
164{
165 struct nft_objref_map *priv = nft_expr_priv(expr);
166
167 nf_tables_unbind_set(ctx, priv->set, &priv->binding);
168}
169
170static struct nft_expr_type nft_objref_type;
171static const struct nft_expr_ops nft_objref_map_ops = {
172 .type = &nft_objref_type,
173 .size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)),
174 .eval = nft_objref_map_eval,
175 .init = nft_objref_map_init,
176 .destroy = nft_objref_map_destroy,
177 .dump = nft_objref_map_dump,
178};
179
180static const struct nft_expr_ops *
181nft_objref_select_ops(const struct nft_ctx *ctx,
182 const struct nlattr * const tb[])
183{
184 if (tb[NFTA_OBJREF_SET_SREG] &&
185 (tb[NFTA_OBJREF_SET_NAME] ||
186 tb[NFTA_OBJREF_SET_ID]))
187 return &nft_objref_map_ops;
188 else if (tb[NFTA_OBJREF_IMM_NAME] &&
189 tb[NFTA_OBJREF_IMM_TYPE])
190 return &nft_objref_ops;
191
192 return ERR_PTR(-EOPNOTSUPP);
193}
194
195static const struct nla_policy nft_objref_policy[NFTA_OBJREF_MAX + 1] = {
196 [NFTA_OBJREF_IMM_NAME] = { .type = NLA_STRING,
197 .len = NFT_OBJ_MAXNAMELEN - 1 },
198 [NFTA_OBJREF_IMM_TYPE] = { .type = NLA_U32 },
199 [NFTA_OBJREF_SET_SREG] = { .type = NLA_U32 },
200 [NFTA_OBJREF_SET_NAME] = { .type = NLA_STRING,
201 .len = NFT_SET_MAXNAMELEN - 1 },
202 [NFTA_OBJREF_SET_ID] = { .type = NLA_U32 },
203};
204
205static struct nft_expr_type nft_objref_type __read_mostly = {
206 .name = "objref",
207 .select_ops = nft_objref_select_ops,
208 .policy = nft_objref_policy,
209 .maxattr = NFTA_OBJREF_MAX,
210 .owner = THIS_MODULE,
211};
212
213static int __init nft_objref_module_init(void)
214{
215 return nft_register_expr(&nft_objref_type);
216}
217
218static void __exit nft_objref_module_exit(void)
219{
220 nft_unregister_expr(&nft_objref_type);
221}
222
223module_init(nft_objref_module_init);
224module_exit(nft_objref_module_exit);
225
226MODULE_LICENSE("GPL");
227MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
228MODULE_ALIAS_NFT_EXPR("objref");
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index b2f88617611a..7d699bbd45b0 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> 2 * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
3 * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org>
3 * 4 *
4 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 6 * it under the terms of the GNU General Public License version 2 as
@@ -17,6 +18,10 @@
17#include <linux/netfilter/nf_tables.h> 18#include <linux/netfilter/nf_tables.h>
18#include <net/netfilter/nf_tables_core.h> 19#include <net/netfilter/nf_tables_core.h>
19#include <net/netfilter/nf_tables.h> 20#include <net/netfilter/nf_tables.h>
21/* For layer 4 checksum field offset. */
22#include <linux/tcp.h>
23#include <linux/udp.h>
24#include <linux/icmpv6.h>
20 25
21/* add vlan header into the user buffer for if tag was removed by offloads */ 26/* add vlan header into the user buffer for if tag was removed by offloads */
22static bool 27static bool
@@ -148,7 +153,6 @@ nla_put_failure:
148 return -1; 153 return -1;
149} 154}
150 155
151static struct nft_expr_type nft_payload_type;
152static const struct nft_expr_ops nft_payload_ops = { 156static const struct nft_expr_ops nft_payload_ops = {
153 .type = &nft_payload_type, 157 .type = &nft_payload_type,
154 .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)), 158 .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)),
@@ -165,6 +169,103 @@ const struct nft_expr_ops nft_payload_fast_ops = {
165 .dump = nft_payload_dump, 169 .dump = nft_payload_dump,
166}; 170};
167 171
172static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum)
173{
174 *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum));
175 if (*sum == 0)
176 *sum = CSUM_MANGLED_0;
177}
178
179static bool nft_payload_udp_checksum(struct sk_buff *skb, unsigned int thoff)
180{
181 struct udphdr *uh, _uh;
182
183 uh = skb_header_pointer(skb, thoff, sizeof(_uh), &_uh);
184 if (!uh)
185 return false;
186
187 return uh->check;
188}
189
190static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt,
191 struct sk_buff *skb,
192 unsigned int *l4csum_offset)
193{
194 switch (pkt->tprot) {
195 case IPPROTO_TCP:
196 *l4csum_offset = offsetof(struct tcphdr, check);
197 break;
198 case IPPROTO_UDP:
199 if (!nft_payload_udp_checksum(skb, pkt->xt.thoff))
200 return -1;
201 /* Fall through. */
202 case IPPROTO_UDPLITE:
203 *l4csum_offset = offsetof(struct udphdr, check);
204 break;
205 case IPPROTO_ICMPV6:
206 *l4csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
207 break;
208 default:
209 return -1;
210 }
211
212 *l4csum_offset += pkt->xt.thoff;
213 return 0;
214}
215
216static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt,
217 struct sk_buff *skb,
218 __wsum fsum, __wsum tsum)
219{
220 int l4csum_offset;
221 __sum16 sum;
222
 223 /* If we cannot determine the layer 4 checksum offset or this packet doesn't
224 * require layer 4 checksum recalculation, skip this packet.
225 */
226 if (nft_payload_l4csum_offset(pkt, skb, &l4csum_offset) < 0)
227 return 0;
228
229 if (skb_copy_bits(skb, l4csum_offset, &sum, sizeof(sum)) < 0)
230 return -1;
231
 232 /* Checksum mangling for an arbitrary number of bytes, based on
233 * inet_proto_csum_replace*() functions.
234 */
235 if (skb->ip_summed != CHECKSUM_PARTIAL) {
236 nft_csum_replace(&sum, fsum, tsum);
237 if (skb->ip_summed == CHECKSUM_COMPLETE) {
238 skb->csum = ~csum_add(csum_sub(~(skb->csum), fsum),
239 tsum);
240 }
241 } else {
242 sum = ~csum_fold(csum_add(csum_sub(csum_unfold(sum), fsum),
243 tsum));
244 }
245
246 if (!skb_make_writable(skb, l4csum_offset + sizeof(sum)) ||
247 skb_store_bits(skb, l4csum_offset, &sum, sizeof(sum)) < 0)
248 return -1;
249
250 return 0;
251}
252
253static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src,
254 __wsum fsum, __wsum tsum, int csum_offset)
255{
256 __sum16 sum;
257
258 if (skb_copy_bits(skb, csum_offset, &sum, sizeof(sum)) < 0)
259 return -1;
260
261 nft_csum_replace(&sum, fsum, tsum);
262 if (!skb_make_writable(skb, csum_offset + sizeof(sum)) ||
263 skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0)
264 return -1;
265
266 return 0;
267}
268
168static void nft_payload_set_eval(const struct nft_expr *expr, 269static void nft_payload_set_eval(const struct nft_expr *expr,
169 struct nft_regs *regs, 270 struct nft_regs *regs,
170 const struct nft_pktinfo *pkt) 271 const struct nft_pktinfo *pkt)
@@ -174,7 +275,6 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
174 const u32 *src = &regs->data[priv->sreg]; 275 const u32 *src = &regs->data[priv->sreg];
175 int offset, csum_offset; 276 int offset, csum_offset;
176 __wsum fsum, tsum; 277 __wsum fsum, tsum;
177 __sum16 sum;
178 278
179 switch (priv->base) { 279 switch (priv->base) {
180 case NFT_PAYLOAD_LL_HEADER: 280 case NFT_PAYLOAD_LL_HEADER:
@@ -197,21 +297,18 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
197 csum_offset = offset + priv->csum_offset; 297 csum_offset = offset + priv->csum_offset;
198 offset += priv->offset; 298 offset += priv->offset;
199 299
200 if (priv->csum_type == NFT_PAYLOAD_CSUM_INET && 300 if ((priv->csum_type == NFT_PAYLOAD_CSUM_INET || priv->csum_flags) &&
201 (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER || 301 (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER ||
202 skb->ip_summed != CHECKSUM_PARTIAL)) { 302 skb->ip_summed != CHECKSUM_PARTIAL)) {
203 if (skb_copy_bits(skb, csum_offset, &sum, sizeof(sum)) < 0)
204 goto err;
205
206 fsum = skb_checksum(skb, offset, priv->len, 0); 303 fsum = skb_checksum(skb, offset, priv->len, 0);
207 tsum = csum_partial(src, priv->len, 0); 304 tsum = csum_partial(src, priv->len, 0);
208 sum = csum_fold(csum_add(csum_sub(~csum_unfold(sum), fsum),
209 tsum));
210 if (sum == 0)
211 sum = CSUM_MANGLED_0;
212 305
213 if (!skb_make_writable(skb, csum_offset + sizeof(sum)) || 306 if (priv->csum_type == NFT_PAYLOAD_CSUM_INET &&
214 skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) 307 nft_payload_csum_inet(skb, src, fsum, tsum, csum_offset))
308 goto err;
309
310 if (priv->csum_flags &&
311 nft_payload_l4csum_update(pkt, skb, fsum, tsum) < 0)
215 goto err; 312 goto err;
216 } 313 }
217 314
@@ -241,6 +338,15 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
241 if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) 338 if (tb[NFTA_PAYLOAD_CSUM_OFFSET])
242 priv->csum_offset = 339 priv->csum_offset =
243 ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_OFFSET])); 340 ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_OFFSET]));
341 if (tb[NFTA_PAYLOAD_CSUM_FLAGS]) {
342 u32 flags;
343
344 flags = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_FLAGS]));
345 if (flags & ~NFT_PAYLOAD_L4CSUM_PSEUDOHDR)
346 return -EINVAL;
347
348 priv->csum_flags = flags;
349 }
244 350
245 switch (priv->csum_type) { 351 switch (priv->csum_type) {
246 case NFT_PAYLOAD_CSUM_NONE: 352 case NFT_PAYLOAD_CSUM_NONE:
@@ -263,7 +369,8 @@ static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr
263 nla_put_be32(skb, NFTA_PAYLOAD_LEN, htonl(priv->len)) || 369 nla_put_be32(skb, NFTA_PAYLOAD_LEN, htonl(priv->len)) ||
264 nla_put_be32(skb, NFTA_PAYLOAD_CSUM_TYPE, htonl(priv->csum_type)) || 370 nla_put_be32(skb, NFTA_PAYLOAD_CSUM_TYPE, htonl(priv->csum_type)) ||
265 nla_put_be32(skb, NFTA_PAYLOAD_CSUM_OFFSET, 371 nla_put_be32(skb, NFTA_PAYLOAD_CSUM_OFFSET,
266 htonl(priv->csum_offset))) 372 htonl(priv->csum_offset)) ||
373 nla_put_be32(skb, NFTA_PAYLOAD_CSUM_FLAGS, htonl(priv->csum_flags)))
267 goto nla_put_failure; 374 goto nla_put_failure;
268 return 0; 375 return 0;
269 376
@@ -320,20 +427,10 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
320 return &nft_payload_ops; 427 return &nft_payload_ops;
321} 428}
322 429
323static struct nft_expr_type nft_payload_type __read_mostly = { 430struct nft_expr_type nft_payload_type __read_mostly = {
324 .name = "payload", 431 .name = "payload",
325 .select_ops = nft_payload_select_ops, 432 .select_ops = nft_payload_select_ops,
326 .policy = nft_payload_policy, 433 .policy = nft_payload_policy,
327 .maxattr = NFTA_PAYLOAD_MAX, 434 .maxattr = NFTA_PAYLOAD_MAX,
328 .owner = THIS_MODULE, 435 .owner = THIS_MODULE,
329}; 436};
330
331int __init nft_payload_module_init(void)
332{
333 return nft_register_expr(&nft_payload_type);
334}
335
336void nft_payload_module_exit(void)
337{
338 nft_unregister_expr(&nft_payload_type);
339}
diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c
index 393d359a1889..dbb6aaff67ec 100644
--- a/net/netfilter/nft_queue.c
+++ b/net/netfilter/nft_queue.c
@@ -38,12 +38,12 @@ static void nft_queue_eval(const struct nft_expr *expr,
38 38
39 if (priv->queues_total > 1) { 39 if (priv->queues_total > 1) {
40 if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) { 40 if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) {
41 int cpu = smp_processor_id(); 41 int cpu = raw_smp_processor_id();
42 42
43 queue = priv->queuenum + cpu % priv->queues_total; 43 queue = priv->queuenum + cpu % priv->queues_total;
44 } else { 44 } else {
45 queue = nfqueue_hash(pkt->skb, queue, 45 queue = nfqueue_hash(pkt->skb, queue,
46 priv->queues_total, pkt->pf, 46 priv->queues_total, nft_pf(pkt),
47 jhash_initval); 47 jhash_initval);
48 } 48 }
49 } 49 }
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index c00104c07095..2d6fe3559912 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -17,38 +17,59 @@
17 17
18struct nft_quota { 18struct nft_quota {
19 u64 quota; 19 u64 quota;
20 bool invert; 20 unsigned long flags;
21 atomic64_t remain; 21 atomic64_t consumed;
22}; 22};
23 23
24static inline bool nft_overquota(struct nft_quota *priv, 24static inline bool nft_overquota(struct nft_quota *priv,
25 const struct nft_pktinfo *pkt) 25 const struct sk_buff *skb)
26{ 26{
27 return atomic64_sub_return(pkt->skb->len, &priv->remain) < 0; 27 return atomic64_add_return(skb->len, &priv->consumed) >= priv->quota;
28} 28}
29 29
30static void nft_quota_eval(const struct nft_expr *expr, 30static inline bool nft_quota_invert(struct nft_quota *priv)
31 struct nft_regs *regs,
32 const struct nft_pktinfo *pkt)
33{ 31{
34 struct nft_quota *priv = nft_expr_priv(expr); 32 return priv->flags & NFT_QUOTA_F_INV;
33}
35 34
36 if (nft_overquota(priv, pkt) ^ priv->invert) 35static inline void nft_quota_do_eval(struct nft_quota *priv,
36 struct nft_regs *regs,
37 const struct nft_pktinfo *pkt)
38{
39 if (nft_overquota(priv, pkt->skb) ^ nft_quota_invert(priv))
37 regs->verdict.code = NFT_BREAK; 40 regs->verdict.code = NFT_BREAK;
38} 41}
39 42
40static const struct nla_policy nft_quota_policy[NFTA_QUOTA_MAX + 1] = { 43static const struct nla_policy nft_quota_policy[NFTA_QUOTA_MAX + 1] = {
41 [NFTA_QUOTA_BYTES] = { .type = NLA_U64 }, 44 [NFTA_QUOTA_BYTES] = { .type = NLA_U64 },
42 [NFTA_QUOTA_FLAGS] = { .type = NLA_U32 }, 45 [NFTA_QUOTA_FLAGS] = { .type = NLA_U32 },
46 [NFTA_QUOTA_CONSUMED] = { .type = NLA_U64 },
43}; 47};
44 48
45static int nft_quota_init(const struct nft_ctx *ctx, 49#define NFT_QUOTA_DEPLETED_BIT 1 /* From NFT_QUOTA_F_DEPLETED. */
46 const struct nft_expr *expr, 50
47 const struct nlattr * const tb[]) 51static void nft_quota_obj_eval(struct nft_object *obj,
52 struct nft_regs *regs,
53 const struct nft_pktinfo *pkt)
48{ 54{
49 struct nft_quota *priv = nft_expr_priv(expr); 55 struct nft_quota *priv = nft_obj_data(obj);
50 u32 flags = 0; 56 bool overquota;
51 u64 quota; 57
58 overquota = nft_overquota(priv, pkt->skb);
59 if (overquota ^ nft_quota_invert(priv))
60 regs->verdict.code = NFT_BREAK;
61
62 if (overquota &&
63 !test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags))
64 nft_obj_notify(nft_net(pkt), obj->table, obj, 0, 0,
65 NFT_MSG_NEWOBJ, nft_pf(pkt), 0, GFP_ATOMIC);
66}
67
68static int nft_quota_do_init(const struct nlattr * const tb[],
69 struct nft_quota *priv)
70{
71 unsigned long flags = 0;
72 u64 quota, consumed = 0;
52 73
53 if (!tb[NFTA_QUOTA_BYTES]) 74 if (!tb[NFTA_QUOTA_BYTES])
54 return -EINVAL; 75 return -EINVAL;
@@ -57,34 +78,114 @@ static int nft_quota_init(const struct nft_ctx *ctx,
57 if (quota > S64_MAX) 78 if (quota > S64_MAX)
58 return -EOVERFLOW; 79 return -EOVERFLOW;
59 80
81 if (tb[NFTA_QUOTA_CONSUMED]) {
82 consumed = be64_to_cpu(nla_get_be64(tb[NFTA_QUOTA_CONSUMED]));
83 if (consumed > quota)
84 return -EINVAL;
85 }
86
60 if (tb[NFTA_QUOTA_FLAGS]) { 87 if (tb[NFTA_QUOTA_FLAGS]) {
61 flags = ntohl(nla_get_be32(tb[NFTA_QUOTA_FLAGS])); 88 flags = ntohl(nla_get_be32(tb[NFTA_QUOTA_FLAGS]));
62 if (flags & ~NFT_QUOTA_F_INV) 89 if (flags & ~NFT_QUOTA_F_INV)
63 return -EINVAL; 90 return -EINVAL;
91 if (flags & NFT_QUOTA_F_DEPLETED)
92 return -EOPNOTSUPP;
64 } 93 }
65 94
66 priv->quota = quota; 95 priv->quota = quota;
67 priv->invert = (flags & NFT_QUOTA_F_INV) ? true : false; 96 priv->flags = flags;
68 atomic64_set(&priv->remain, quota); 97 atomic64_set(&priv->consumed, consumed);
69 98
70 return 0; 99 return 0;
71} 100}
72 101
73static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr) 102static int nft_quota_obj_init(const struct nlattr * const tb[],
103 struct nft_object *obj)
104{
105 struct nft_quota *priv = nft_obj_data(obj);
106
107 return nft_quota_do_init(tb, priv);
108}
109
110static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv,
111 bool reset)
74{ 112{
75 const struct nft_quota *priv = nft_expr_priv(expr); 113 u64 consumed, consumed_cap;
76 u32 flags = priv->invert ? NFT_QUOTA_F_INV : 0; 114 u32 flags = priv->flags;
115
 116 /* Since we unconditionally increment consumed quota for each packet
117 * that we see, don't go over the quota boundary in what we send to
118 * userspace.
119 */
120 consumed = atomic64_read(&priv->consumed);
121 if (consumed >= priv->quota) {
122 consumed_cap = priv->quota;
123 flags |= NFT_QUOTA_F_DEPLETED;
124 } else {
125 consumed_cap = consumed;
126 }
77 127
78 if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota), 128 if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota),
79 NFTA_QUOTA_PAD) || 129 NFTA_QUOTA_PAD) ||
130 nla_put_be64(skb, NFTA_QUOTA_CONSUMED, cpu_to_be64(consumed_cap),
131 NFTA_QUOTA_PAD) ||
80 nla_put_be32(skb, NFTA_QUOTA_FLAGS, htonl(flags))) 132 nla_put_be32(skb, NFTA_QUOTA_FLAGS, htonl(flags)))
81 goto nla_put_failure; 133 goto nla_put_failure;
134
135 if (reset) {
136 atomic64_sub(consumed, &priv->consumed);
137 clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags);
138 }
82 return 0; 139 return 0;
83 140
84nla_put_failure: 141nla_put_failure:
85 return -1; 142 return -1;
86} 143}
87 144
145static int nft_quota_obj_dump(struct sk_buff *skb, struct nft_object *obj,
146 bool reset)
147{
148 struct nft_quota *priv = nft_obj_data(obj);
149
150 return nft_quota_do_dump(skb, priv, reset);
151}
152
153static struct nft_object_type nft_quota_obj __read_mostly = {
154 .type = NFT_OBJECT_QUOTA,
155 .size = sizeof(struct nft_quota),
156 .maxattr = NFTA_QUOTA_MAX,
157 .policy = nft_quota_policy,
158 .init = nft_quota_obj_init,
159 .eval = nft_quota_obj_eval,
160 .dump = nft_quota_obj_dump,
161 .owner = THIS_MODULE,
162};
163
164static void nft_quota_eval(const struct nft_expr *expr,
165 struct nft_regs *regs,
166 const struct nft_pktinfo *pkt)
167{
168 struct nft_quota *priv = nft_expr_priv(expr);
169
170 nft_quota_do_eval(priv, regs, pkt);
171}
172
173static int nft_quota_init(const struct nft_ctx *ctx,
174 const struct nft_expr *expr,
175 const struct nlattr * const tb[])
176{
177 struct nft_quota *priv = nft_expr_priv(expr);
178
179 return nft_quota_do_init(tb, priv);
180}
181
182static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr)
183{
184 struct nft_quota *priv = nft_expr_priv(expr);
185
186 return nft_quota_do_dump(skb, priv, false);
187}
188
88static struct nft_expr_type nft_quota_type; 189static struct nft_expr_type nft_quota_type;
89static const struct nft_expr_ops nft_quota_ops = { 190static const struct nft_expr_ops nft_quota_ops = {
90 .type = &nft_quota_type, 191 .type = &nft_quota_type,
@@ -105,12 +206,26 @@ static struct nft_expr_type nft_quota_type __read_mostly = {
105 206
106static int __init nft_quota_module_init(void) 207static int __init nft_quota_module_init(void)
107{ 208{
108 return nft_register_expr(&nft_quota_type); 209 int err;
210
211 err = nft_register_obj(&nft_quota_obj);
212 if (err < 0)
213 return err;
214
215 err = nft_register_expr(&nft_quota_type);
216 if (err < 0)
217 goto err1;
218
219 return 0;
220err1:
221 nft_unregister_obj(&nft_quota_obj);
222 return err;
109} 223}
110 224
111static void __exit nft_quota_module_exit(void) 225static void __exit nft_quota_module_exit(void)
112{ 226{
113 nft_unregister_expr(&nft_quota_type); 227 nft_unregister_expr(&nft_quota_type);
228 nft_unregister_obj(&nft_quota_obj);
114} 229}
115 230
116module_init(nft_quota_module_init); 231module_init(nft_quota_module_init);
@@ -119,3 +234,4 @@ module_exit(nft_quota_module_exit);
119MODULE_LICENSE("GPL"); 234MODULE_LICENSE("GPL");
120MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); 235MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
121MODULE_ALIAS_NFT_EXPR("quota"); 236MODULE_ALIAS_NFT_EXPR("quota");
237MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_QUOTA);
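
For orientation, the accounting above adds each packet's length to the consumed counter unconditionally and treats consumed >= quota as over-quota, with NFT_QUOTA_F_INV inverting the match. A minimal userspace sketch of that logic under those assumptions (C11 atomics stand in for atomic64_t; names mirror the diff but this is not kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct quota {
        uint64_t quota;                 /* byte limit */
        bool invert;                    /* NFT_QUOTA_F_INV semantics */
        _Atomic uint64_t consumed;      /* bytes accounted so far */
};

/* Same shape as nft_overquota(): account first, then compare. */
static bool overquota(struct quota *q, uint64_t pktlen)
{
        return atomic_fetch_add(&q->consumed, pktlen) + pktlen >= q->quota;
}

/* NFT_BREAK means "no match"; the rule matches while this returns true. */
static bool rule_matches(struct quota *q, uint64_t pktlen)
{
        return !(overquota(q, pktlen) ^ q->invert);
}

int main(void)
{
        struct quota q = { .quota = 1000, .invert = false };
        uint64_t lens[] = { 400, 400, 400 };

        for (int i = 0; i < 3; i++)
                printf("packet %d (%llu bytes): %s\n", i,
                       (unsigned long long)lens[i],
                       rule_matches(&q, lens[i]) ? "within quota" : "over quota");
        return 0;
}

With these numbers the third packet pushes consumed to 1200 bytes and stops matching, which is also the point at which the object variant above emits its one-shot NFT_MSG_NEWOBJ depletion notification.
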
diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c
index 8f0aaaea1376..9edc74eedc10 100644
--- a/net/netfilter/nft_range.c
+++ b/net/netfilter/nft_range.c
@@ -128,7 +128,6 @@ nla_put_failure:
128 return -1; 128 return -1;
129} 129}
130 130
131static struct nft_expr_type nft_range_type;
132static const struct nft_expr_ops nft_range_ops = { 131static const struct nft_expr_ops nft_range_ops = {
133 .type = &nft_range_type, 132 .type = &nft_range_type,
134 .size = NFT_EXPR_SIZE(sizeof(struct nft_range_expr)), 133 .size = NFT_EXPR_SIZE(sizeof(struct nft_range_expr)),
@@ -137,20 +136,10 @@ static const struct nft_expr_ops nft_range_ops = {
137 .dump = nft_range_dump, 136 .dump = nft_range_dump,
138}; 137};
139 138
140static struct nft_expr_type nft_range_type __read_mostly = { 139struct nft_expr_type nft_range_type __read_mostly = {
141 .name = "range", 140 .name = "range",
142 .ops = &nft_range_ops, 141 .ops = &nft_range_ops,
143 .policy = nft_range_policy, 142 .policy = nft_range_policy,
144 .maxattr = NFTA_RANGE_MAX, 143 .maxattr = NFTA_RANGE_MAX,
145 .owner = THIS_MODULE, 144 .owner = THIS_MODULE,
146}; 145};
147
148int __init nft_range_module_init(void)
149{
150 return nft_register_expr(&nft_range_type);
151}
152
153void nft_range_module_exit(void)
154{
155 nft_unregister_expr(&nft_range_type);
156}
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index 03f7bf40ae75..40dcd05146d5 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> 2 * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 5 * it under the terms of the GNU General Public License version 2 as
@@ -79,7 +79,7 @@ int nft_redir_init(const struct nft_ctx *ctx,
79 return -EINVAL; 79 return -EINVAL;
80 } 80 }
81 81
82 return 0; 82 return nf_ct_netns_get(ctx->net, ctx->afi->family);
83} 83}
84EXPORT_SYMBOL_GPL(nft_redir_init); 84EXPORT_SYMBOL_GPL(nft_redir_init);
85 85
@@ -108,4 +108,4 @@ nla_put_failure:
108EXPORT_SYMBOL_GPL(nft_redir_dump); 108EXPORT_SYMBOL_GPL(nft_redir_dump);
109 109
110MODULE_LICENSE("GPL"); 110MODULE_LICENSE("GPL");
111MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); 111MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c
index e79d9ca2ffee..9e90a02cb104 100644
--- a/net/netfilter/nft_reject_inet.c
+++ b/net/netfilter/nft_reject_inet.c
@@ -23,36 +23,36 @@ static void nft_reject_inet_eval(const struct nft_expr *expr,
23{ 23{
24 struct nft_reject *priv = nft_expr_priv(expr); 24 struct nft_reject *priv = nft_expr_priv(expr);
25 25
26 switch (pkt->pf) { 26 switch (nft_pf(pkt)) {
27 case NFPROTO_IPV4: 27 case NFPROTO_IPV4:
28 switch (priv->type) { 28 switch (priv->type) {
29 case NFT_REJECT_ICMP_UNREACH: 29 case NFT_REJECT_ICMP_UNREACH:
30 nf_send_unreach(pkt->skb, priv->icmp_code, 30 nf_send_unreach(pkt->skb, priv->icmp_code,
31 pkt->hook); 31 nft_hook(pkt));
32 break; 32 break;
33 case NFT_REJECT_TCP_RST: 33 case NFT_REJECT_TCP_RST:
34 nf_send_reset(pkt->net, pkt->skb, pkt->hook); 34 nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt));
35 break; 35 break;
36 case NFT_REJECT_ICMPX_UNREACH: 36 case NFT_REJECT_ICMPX_UNREACH:
37 nf_send_unreach(pkt->skb, 37 nf_send_unreach(pkt->skb,
38 nft_reject_icmp_code(priv->icmp_code), 38 nft_reject_icmp_code(priv->icmp_code),
39 pkt->hook); 39 nft_hook(pkt));
40 break; 40 break;
41 } 41 }
42 break; 42 break;
43 case NFPROTO_IPV6: 43 case NFPROTO_IPV6:
44 switch (priv->type) { 44 switch (priv->type) {
45 case NFT_REJECT_ICMP_UNREACH: 45 case NFT_REJECT_ICMP_UNREACH:
46 nf_send_unreach6(pkt->net, pkt->skb, priv->icmp_code, 46 nf_send_unreach6(nft_net(pkt), pkt->skb,
47 pkt->hook); 47 priv->icmp_code, nft_hook(pkt));
48 break; 48 break;
49 case NFT_REJECT_TCP_RST: 49 case NFT_REJECT_TCP_RST:
50 nf_send_reset6(pkt->net, pkt->skb, pkt->hook); 50 nf_send_reset6(nft_net(pkt), pkt->skb, nft_hook(pkt));
51 break; 51 break;
52 case NFT_REJECT_ICMPX_UNREACH: 52 case NFT_REJECT_ICMPX_UNREACH:
53 nf_send_unreach6(pkt->net, pkt->skb, 53 nf_send_unreach6(nft_net(pkt), pkt->skb,
54 nft_reject_icmpv6_code(priv->icmp_code), 54 nft_reject_icmpv6_code(priv->icmp_code),
55 pkt->hook); 55 nft_hook(pkt));
56 break; 56 break;
57 } 57 }
58 break; 58 break;
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
new file mode 100644
index 000000000000..d3eb640bc784
--- /dev/null
+++ b/net/netfilter/nft_rt.c
@@ -0,0 +1,153 @@
1/*
2 * Copyright (c) 2016 Anders K. Pedersen <akp@cohaesio.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/module.h>
12#include <linux/netlink.h>
13#include <linux/netfilter.h>
14#include <linux/netfilter/nf_tables.h>
15#include <net/dst.h>
16#include <net/ip6_route.h>
17#include <net/route.h>
18#include <net/netfilter/nf_tables.h>
19#include <net/netfilter/nf_tables_core.h>
20
21struct nft_rt {
22 enum nft_rt_keys key:8;
23 enum nft_registers dreg:8;
24};
25
26void nft_rt_get_eval(const struct nft_expr *expr,
27 struct nft_regs *regs,
28 const struct nft_pktinfo *pkt)
29{
30 const struct nft_rt *priv = nft_expr_priv(expr);
31 const struct sk_buff *skb = pkt->skb;
32 u32 *dest = &regs->data[priv->dreg];
33 const struct dst_entry *dst;
34
35 dst = skb_dst(skb);
36 if (!dst)
37 goto err;
38
39 switch (priv->key) {
40#ifdef CONFIG_IP_ROUTE_CLASSID
41 case NFT_RT_CLASSID:
42 *dest = dst->tclassid;
43 break;
44#endif
45 case NFT_RT_NEXTHOP4:
46 if (nft_pf(pkt) != NFPROTO_IPV4)
47 goto err;
48
49 *dest = rt_nexthop((const struct rtable *)dst,
50 ip_hdr(skb)->daddr);
51 break;
52 case NFT_RT_NEXTHOP6:
53 if (nft_pf(pkt) != NFPROTO_IPV6)
54 goto err;
55
56 memcpy(dest, rt6_nexthop((struct rt6_info *)dst,
57 &ipv6_hdr(skb)->daddr),
58 sizeof(struct in6_addr));
59 break;
60 default:
61 WARN_ON(1);
62 goto err;
63 }
64 return;
65
66err:
67 regs->verdict.code = NFT_BREAK;
68}
69
70const struct nla_policy nft_rt_policy[NFTA_RT_MAX + 1] = {
71 [NFTA_RT_DREG] = { .type = NLA_U32 },
72 [NFTA_RT_KEY] = { .type = NLA_U32 },
73};
74
75int nft_rt_get_init(const struct nft_ctx *ctx,
76 const struct nft_expr *expr,
77 const struct nlattr * const tb[])
78{
79 struct nft_rt *priv = nft_expr_priv(expr);
80 unsigned int len;
81
82 if (tb[NFTA_RT_KEY] == NULL ||
83 tb[NFTA_RT_DREG] == NULL)
84 return -EINVAL;
85
86 priv->key = ntohl(nla_get_be32(tb[NFTA_RT_KEY]));
87 switch (priv->key) {
88#ifdef CONFIG_IP_ROUTE_CLASSID
89 case NFT_RT_CLASSID:
90#endif
91 case NFT_RT_NEXTHOP4:
92 len = sizeof(u32);
93 break;
94 case NFT_RT_NEXTHOP6:
95 len = sizeof(struct in6_addr);
96 break;
97 default:
98 return -EOPNOTSUPP;
99 }
100
101 priv->dreg = nft_parse_register(tb[NFTA_RT_DREG]);
102 return nft_validate_register_store(ctx, priv->dreg, NULL,
103 NFT_DATA_VALUE, len);
104}
105
106int nft_rt_get_dump(struct sk_buff *skb,
107 const struct nft_expr *expr)
108{
109 const struct nft_rt *priv = nft_expr_priv(expr);
110
111 if (nla_put_be32(skb, NFTA_RT_KEY, htonl(priv->key)))
112 goto nla_put_failure;
113 if (nft_dump_register(skb, NFTA_RT_DREG, priv->dreg))
114 goto nla_put_failure;
115 return 0;
116
117nla_put_failure:
118 return -1;
119}
120
121static struct nft_expr_type nft_rt_type;
122static const struct nft_expr_ops nft_rt_get_ops = {
123 .type = &nft_rt_type,
124 .size = NFT_EXPR_SIZE(sizeof(struct nft_rt)),
125 .eval = nft_rt_get_eval,
126 .init = nft_rt_get_init,
127 .dump = nft_rt_get_dump,
128};
129
130static struct nft_expr_type nft_rt_type __read_mostly = {
131 .name = "rt",
132 .ops = &nft_rt_get_ops,
133 .policy = nft_rt_policy,
134 .maxattr = NFTA_RT_MAX,
135 .owner = THIS_MODULE,
136};
137
138static int __init nft_rt_module_init(void)
139{
140 return nft_register_expr(&nft_rt_type);
141}
142
143static void __exit nft_rt_module_exit(void)
144{
145 nft_unregister_expr(&nft_rt_type);
146}
147
148module_init(nft_rt_module_init);
149module_exit(nft_rt_module_exit);
150
151MODULE_LICENSE("GPL");
152MODULE_AUTHOR("Anders K. Pedersen <akp@cohaesio.com>");
153MODULE_ALIAS_NFT_EXPR("rt");
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
new file mode 100644
index 000000000000..8ebbc2940f4c
--- /dev/null
+++ b/net/netfilter/nft_set_bitmap.c
@@ -0,0 +1,307 @@
1/*
2 * Copyright (c) 2017 Pablo Neira Ayuso <pablo@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/module.h>
12#include <linux/list.h>
13#include <linux/netlink.h>
14#include <linux/netfilter.h>
15#include <linux/netfilter/nf_tables.h>
16#include <net/netfilter/nf_tables.h>
17
18struct nft_bitmap_elem {
19 struct list_head head;
20 struct nft_set_ext ext;
21};
22
23/* This bitmap uses two bits to represent one element. These two bits determine
24 * the element state in the current and the future generation.
25 *
26 * An element can be in three states. The generation cursor is represented using
 27 * the ^ character; note that this cursor shifts on every successful transaction.
 28 * If no transaction is going on, all elements are in one of the following
 29 * states:
30 *
31 * 11 = this element is active in the current generation. In case of no updates,
32 * ^ it stays active in the next generation.
33 * 00 = this element is inactive in the current generation. In case of no
34 * ^ updates, it stays inactive in the next generation.
35 *
36 * On transaction handling, we observe these two temporary states:
37 *
38 * 01 = this element is inactive in the current generation and it becomes active
 39 * ^ in the next one. This happens when the element is inserted but the commit
 40 * path has not been executed yet, so activation is still pending. On
41 * transaction abortion, the element is removed.
42 * 10 = this element is active in the current generation and it becomes inactive
 43 * ^ in the next one. This happens when the element is deactivated but the commit
 44 * path has not been executed yet, so removal is still pending. On
 45 * transaction abortion, the next generation bit is reset to restore its
 46 * previous state.
47 */
48struct nft_bitmap {
49 struct list_head list;
50 u16 bitmap_size;
51 u8 bitmap[];
52};
53
54static inline void nft_bitmap_location(const struct nft_set *set,
55 const void *key,
56 u32 *idx, u32 *off)
57{
58 u32 k;
59
60 if (set->klen == 2)
61 k = *(u16 *)key;
62 else
63 k = *(u8 *)key;
64 k <<= 1;
65
66 *idx = k / BITS_PER_BYTE;
67 *off = k % BITS_PER_BYTE;
68}
69
70/* Fetch the two bits that represent the element and check if it is active based
71 * on the generation mask.
72 */
73static inline bool
74nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask)
75{
76 return (bitmap[idx] & (0x3 << off)) & (genmask << off);
77}
78
79static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
80 const u32 *key, const struct nft_set_ext **ext)
81{
82 const struct nft_bitmap *priv = nft_set_priv(set);
83 u8 genmask = nft_genmask_cur(net);
84 u32 idx, off;
85
86 nft_bitmap_location(set, key, &idx, &off);
87
88 return nft_bitmap_active(priv->bitmap, idx, off, genmask);
89}
90
91static struct nft_bitmap_elem *
92nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this,
93 u8 genmask)
94{
95 const struct nft_bitmap *priv = nft_set_priv(set);
96 struct nft_bitmap_elem *be;
97
98 list_for_each_entry_rcu(be, &priv->list, head) {
99 if (memcmp(nft_set_ext_key(&be->ext),
100 nft_set_ext_key(&this->ext), set->klen) ||
101 !nft_set_elem_active(&be->ext, genmask))
102 continue;
103
104 return be;
105 }
106 return NULL;
107}
108
109static int nft_bitmap_insert(const struct net *net, const struct nft_set *set,
110 const struct nft_set_elem *elem,
111 struct nft_set_ext **ext)
112{
113 struct nft_bitmap *priv = nft_set_priv(set);
114 struct nft_bitmap_elem *new = elem->priv, *be;
115 u8 genmask = nft_genmask_next(net);
116 u32 idx, off;
117
118 be = nft_bitmap_elem_find(set, new, genmask);
119 if (be) {
120 *ext = &be->ext;
121 return -EEXIST;
122 }
123
124 nft_bitmap_location(set, nft_set_ext_key(&new->ext), &idx, &off);
125 /* Enter 01 state. */
126 priv->bitmap[idx] |= (genmask << off);
127 list_add_tail_rcu(&new->head, &priv->list);
128
129 return 0;
130}
131
132static void nft_bitmap_remove(const struct net *net,
133 const struct nft_set *set,
134 const struct nft_set_elem *elem)
135{
136 struct nft_bitmap *priv = nft_set_priv(set);
137 struct nft_bitmap_elem *be = elem->priv;
138 u8 genmask = nft_genmask_next(net);
139 u32 idx, off;
140
141 nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
142 /* Enter 00 state. */
143 priv->bitmap[idx] &= ~(genmask << off);
144 list_del_rcu(&be->head);
145}
146
147static void nft_bitmap_activate(const struct net *net,
148 const struct nft_set *set,
149 const struct nft_set_elem *elem)
150{
151 struct nft_bitmap *priv = nft_set_priv(set);
152 struct nft_bitmap_elem *be = elem->priv;
153 u8 genmask = nft_genmask_next(net);
154 u32 idx, off;
155
156 nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
157 /* Enter 11 state. */
158 priv->bitmap[idx] |= (genmask << off);
159 nft_set_elem_change_active(net, set, &be->ext);
160}
161
162static bool nft_bitmap_flush(const struct net *net,
163 const struct nft_set *set, void *_be)
164{
165 struct nft_bitmap *priv = nft_set_priv(set);
166 u8 genmask = nft_genmask_next(net);
167 struct nft_bitmap_elem *be = _be;
168 u32 idx, off;
169
170 nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
171 /* Enter 10 state, similar to deactivation. */
172 priv->bitmap[idx] &= ~(genmask << off);
173 nft_set_elem_change_active(net, set, &be->ext);
174
175 return true;
176}
177
178static void *nft_bitmap_deactivate(const struct net *net,
179 const struct nft_set *set,
180 const struct nft_set_elem *elem)
181{
182 struct nft_bitmap *priv = nft_set_priv(set);
183 struct nft_bitmap_elem *this = elem->priv, *be;
184 u8 genmask = nft_genmask_next(net);
185 u32 idx, off;
186
187 nft_bitmap_location(set, elem->key.val.data, &idx, &off);
188
189 be = nft_bitmap_elem_find(set, this, genmask);
190 if (!be)
191 return NULL;
192
193 /* Enter 10 state. */
194 priv->bitmap[idx] &= ~(genmask << off);
195 nft_set_elem_change_active(net, set, &be->ext);
196
197 return be;
198}
199
200static void nft_bitmap_walk(const struct nft_ctx *ctx,
201 struct nft_set *set,
202 struct nft_set_iter *iter)
203{
204 const struct nft_bitmap *priv = nft_set_priv(set);
205 struct nft_bitmap_elem *be;
206 struct nft_set_elem elem;
207
208 list_for_each_entry_rcu(be, &priv->list, head) {
209 if (iter->count < iter->skip)
210 goto cont;
211 if (!nft_set_elem_active(&be->ext, iter->genmask))
212 goto cont;
213
214 elem.priv = be;
215
216 iter->err = iter->fn(ctx, set, iter, &elem);
217
218 if (iter->err < 0)
219 return;
220cont:
221 iter->count++;
222 }
223}
224
225/* The bitmap size is pow(2, key length in bits) / bits per byte. This is
226 * multiplied by two since each element takes two bits. For 8 bit keys, the
227 * bitmap consumes 66 bytes. For 16 bit keys, 16388 bytes.
228 */
229static inline u32 nft_bitmap_size(u32 klen)
230{
231 return ((2 << ((klen * BITS_PER_BYTE) - 1)) / BITS_PER_BYTE) << 1;
232}
233
234static inline u32 nft_bitmap_total_size(u32 klen)
235{
236 return sizeof(struct nft_bitmap) + nft_bitmap_size(klen);
237}
238
239static unsigned int nft_bitmap_privsize(const struct nlattr * const nla[])
240{
241 u32 klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));
242
243 return nft_bitmap_total_size(klen);
244}
245
246static int nft_bitmap_init(const struct nft_set *set,
247 const struct nft_set_desc *desc,
248 const struct nlattr * const nla[])
249{
250 struct nft_bitmap *priv = nft_set_priv(set);
251
252 INIT_LIST_HEAD(&priv->list);
253 priv->bitmap_size = nft_bitmap_size(set->klen);
254
255 return 0;
256}
257
258static void nft_bitmap_destroy(const struct nft_set *set)
259{
260}
261
262static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
263 struct nft_set_estimate *est)
264{
 265 /* Make sure we don't get bitmaps larger than 16 Kbytes. */
266 if (desc->klen > 2)
267 return false;
268
269 est->size = nft_bitmap_total_size(desc->klen);
270 est->lookup = NFT_SET_CLASS_O_1;
271 est->space = NFT_SET_CLASS_O_1;
272
273 return true;
274}
275
276static struct nft_set_ops nft_bitmap_ops __read_mostly = {
277 .privsize = nft_bitmap_privsize,
278 .elemsize = offsetof(struct nft_bitmap_elem, ext),
279 .estimate = nft_bitmap_estimate,
280 .init = nft_bitmap_init,
281 .destroy = nft_bitmap_destroy,
282 .insert = nft_bitmap_insert,
283 .remove = nft_bitmap_remove,
284 .deactivate = nft_bitmap_deactivate,
285 .flush = nft_bitmap_flush,
286 .activate = nft_bitmap_activate,
287 .lookup = nft_bitmap_lookup,
288 .walk = nft_bitmap_walk,
289 .owner = THIS_MODULE,
290};
291
292static int __init nft_bitmap_module_init(void)
293{
294 return nft_register_set(&nft_bitmap_ops);
295}
296
297static void __exit nft_bitmap_module_exit(void)
298{
299 nft_unregister_set(&nft_bitmap_ops);
300}
301
302module_init(nft_bitmap_module_init);
303module_exit(nft_bitmap_module_exit);
304
305MODULE_LICENSE("GPL");
306MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
307MODULE_ALIAS_NFT_SET();
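
The set backend above keeps two bits per element: the key is doubled to get a byte index and bit offset, and the two bits at that offset encode the element's state in the current and next generation. A simplified userspace sketch of that layout, assuming fixed masks 0x1 (current) and 0x2 (next) instead of the kernel's generation counter:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BITS_PER_BYTE 8

/* Same arithmetic as nft_bitmap_location(): two bits per key. */
static void bitmap_location(uint16_t key, uint32_t *idx, uint32_t *off)
{
        uint32_t k = (uint32_t)key << 1;

        *idx = k / BITS_PER_BYTE;
        *off = k % BITS_PER_BYTE;
}

/* Same test as nft_bitmap_active(): pick the element's pair of bits,
 * then mask with the generation bit we care about.
 */
static int bitmap_active(const uint8_t *bitmap, uint32_t idx, uint32_t off,
                         uint8_t genmask)
{
        return (bitmap[idx] & (0x3 << off)) & (genmask << off);
}

int main(void)
{
        uint8_t bitmap[(65536 * 2) / BITS_PER_BYTE];    /* 16 KiB for 16-bit keys */
        uint8_t cur = 1, next = 2;                      /* simplified generation masks */
        uint32_t idx, off;
        uint16_t key = 0x1234;

        memset(bitmap, 0, sizeof(bitmap));
        bitmap_location(key, &idx, &off);

        bitmap[idx] |= next << off;                     /* insert: 00 -> 01 */
        printf("pending:  cur=%d next=%d\n",
               !!bitmap_active(bitmap, idx, off, cur),
               !!bitmap_active(bitmap, idx, off, next));

        bitmap[idx] |= (cur | next) << off;             /* commit/activate: 01 -> 11 */
        printf("active:   cur=%d next=%d\n",
               !!bitmap_active(bitmap, idx, off, cur),
               !!bitmap_active(bitmap, idx, off, next));

        bitmap[idx] &= ~(next << off);                  /* deactivate: 11 -> 10 */
        printf("removing: cur=%d next=%d\n",
               !!bitmap_active(bitmap, idx, off, cur),
               !!bitmap_active(bitmap, idx, off, next));
        return 0;
}

The real code derives the masks from the per-netns generation counter, so the roles of the two bits swap on each committed transaction; the fixed cur/next values here are only to make the state transitions visible.
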
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index a3dface3e6e6..5f652720fc78 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -167,6 +167,19 @@ static void nft_hash_activate(const struct net *net, const struct nft_set *set,
167 nft_set_elem_clear_busy(&he->ext); 167 nft_set_elem_clear_busy(&he->ext);
168} 168}
169 169
170static bool nft_hash_flush(const struct net *net,
171 const struct nft_set *set, void *priv)
172{
173 struct nft_hash_elem *he = priv;
174
175 if (!nft_set_elem_mark_busy(&he->ext) ||
176 !nft_is_active(net, &he->ext)) {
177 nft_set_elem_change_active(net, set, &he->ext);
178 return true;
179 }
180 return false;
181}
182
170static void *nft_hash_deactivate(const struct net *net, 183static void *nft_hash_deactivate(const struct net *net,
171 const struct nft_set *set, 184 const struct nft_set *set,
172 const struct nft_set_elem *elem) 185 const struct nft_set_elem *elem)
@@ -181,19 +194,17 @@ static void *nft_hash_deactivate(const struct net *net,
181 194
182 rcu_read_lock(); 195 rcu_read_lock();
183 he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params); 196 he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
184 if (he != NULL) { 197 if (he != NULL &&
185 if (!nft_set_elem_mark_busy(&he->ext) || 198 !nft_hash_flush(net, set, he))
186 !nft_is_active(net, &he->ext)) 199 he = NULL;
187 nft_set_elem_change_active(net, set, &he->ext); 200
188 else
189 he = NULL;
190 }
191 rcu_read_unlock(); 201 rcu_read_unlock();
192 202
193 return he; 203 return he;
194} 204}
195 205
196static void nft_hash_remove(const struct nft_set *set, 206static void nft_hash_remove(const struct net *net,
207 const struct nft_set *set,
197 const struct nft_set_elem *elem) 208 const struct nft_set_elem *elem)
198{ 209{
199 struct nft_hash *priv = nft_set_priv(set); 210 struct nft_hash *priv = nft_set_priv(set);
@@ -202,7 +213,7 @@ static void nft_hash_remove(const struct nft_set *set,
202 rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params); 213 rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params);
203} 214}
204 215
205static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, 216static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set,
206 struct nft_set_iter *iter) 217 struct nft_set_iter *iter)
207{ 218{
208 struct nft_hash *priv = nft_set_priv(set); 219 struct nft_hash *priv = nft_set_priv(set);
@@ -373,7 +384,8 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
373 est->size = esize + 2 * sizeof(struct nft_hash_elem *); 384 est->size = esize + 2 * sizeof(struct nft_hash_elem *);
374 } 385 }
375 386
376 est->class = NFT_SET_CLASS_O_1; 387 est->lookup = NFT_SET_CLASS_O_1;
388 est->space = NFT_SET_CLASS_O_N;
377 389
378 return true; 390 return true;
379} 391}
@@ -387,11 +399,12 @@ static struct nft_set_ops nft_hash_ops __read_mostly = {
387 .insert = nft_hash_insert, 399 .insert = nft_hash_insert,
388 .activate = nft_hash_activate, 400 .activate = nft_hash_activate,
389 .deactivate = nft_hash_deactivate, 401 .deactivate = nft_hash_deactivate,
402 .flush = nft_hash_flush,
390 .remove = nft_hash_remove, 403 .remove = nft_hash_remove,
391 .lookup = nft_hash_lookup, 404 .lookup = nft_hash_lookup,
392 .update = nft_hash_update, 405 .update = nft_hash_update,
393 .walk = nft_hash_walk, 406 .walk = nft_hash_walk,
394 .features = NFT_SET_MAP | NFT_SET_TIMEOUT, 407 .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
395 .owner = THIS_MODULE, 408 .owner = THIS_MODULE,
396}; 409};
397 410
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 36493a7cae88..78dfbf9588b3 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -60,11 +60,10 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
60 d = memcmp(this, key, set->klen); 60 d = memcmp(this, key, set->klen);
61 if (d < 0) { 61 if (d < 0) {
62 parent = parent->rb_left; 62 parent = parent->rb_left;
63 /* In case of adjacent ranges, we always see the high 63 if (interval &&
64 * part of the range in first place, before the low one. 64 nft_rbtree_equal(set, this, interval) &&
65 * So don't update interval if the keys are equal. 65 nft_rbtree_interval_end(this) &&
66 */ 66 !nft_rbtree_interval_end(interval))
67 if (interval && nft_rbtree_equal(set, this, interval))
68 continue; 67 continue;
69 interval = rbe; 68 interval = rbe;
70 } else if (d > 0) 69 } else if (d > 0)
@@ -151,7 +150,8 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
151 return err; 150 return err;
152} 151}
153 152
154static void nft_rbtree_remove(const struct nft_set *set, 153static void nft_rbtree_remove(const struct net *net,
154 const struct nft_set *set,
155 const struct nft_set_elem *elem) 155 const struct nft_set_elem *elem)
156{ 156{
157 struct nft_rbtree *priv = nft_set_priv(set); 157 struct nft_rbtree *priv = nft_set_priv(set);
@@ -171,6 +171,15 @@ static void nft_rbtree_activate(const struct net *net,
171 nft_set_elem_change_active(net, set, &rbe->ext); 171 nft_set_elem_change_active(net, set, &rbe->ext);
172} 172}
173 173
174static bool nft_rbtree_flush(const struct net *net,
175 const struct nft_set *set, void *priv)
176{
177 struct nft_rbtree_elem *rbe = priv;
178
179 nft_set_elem_change_active(net, set, &rbe->ext);
180 return true;
181}
182
174static void *nft_rbtree_deactivate(const struct net *net, 183static void *nft_rbtree_deactivate(const struct net *net,
175 const struct nft_set *set, 184 const struct nft_set *set,
176 const struct nft_set_elem *elem) 185 const struct nft_set_elem *elem)
@@ -204,7 +213,7 @@ static void *nft_rbtree_deactivate(const struct net *net,
204 parent = parent->rb_right; 213 parent = parent->rb_right;
205 continue; 214 continue;
206 } 215 }
207 nft_set_elem_change_active(net, set, &rbe->ext); 216 nft_rbtree_flush(net, set, rbe);
208 return rbe; 217 return rbe;
209 } 218 }
210 } 219 }
@@ -212,7 +221,7 @@ static void *nft_rbtree_deactivate(const struct net *net,
212} 221}
213 222
214static void nft_rbtree_walk(const struct nft_ctx *ctx, 223static void nft_rbtree_walk(const struct nft_ctx *ctx,
215 const struct nft_set *set, 224 struct nft_set *set,
216 struct nft_set_iter *iter) 225 struct nft_set_iter *iter)
217{ 226{
218 const struct nft_rbtree *priv = nft_set_priv(set); 227 const struct nft_rbtree *priv = nft_set_priv(set);
@@ -281,7 +290,8 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
281 else 290 else
282 est->size = nsize; 291 est->size = nsize;
283 292
284 est->class = NFT_SET_CLASS_O_LOG_N; 293 est->lookup = NFT_SET_CLASS_O_LOG_N;
294 est->space = NFT_SET_CLASS_O_N;
285 295
286 return true; 296 return true;
287} 297}
@@ -295,10 +305,11 @@ static struct nft_set_ops nft_rbtree_ops __read_mostly = {
295 .insert = nft_rbtree_insert, 305 .insert = nft_rbtree_insert,
296 .remove = nft_rbtree_remove, 306 .remove = nft_rbtree_remove,
297 .deactivate = nft_rbtree_deactivate, 307 .deactivate = nft_rbtree_deactivate,
308 .flush = nft_rbtree_flush,
298 .activate = nft_rbtree_activate, 309 .activate = nft_rbtree_activate,
299 .lookup = nft_rbtree_lookup, 310 .lookup = nft_rbtree_lookup,
300 .walk = nft_rbtree_walk, 311 .walk = nft_rbtree_walk,
301 .features = NFT_SET_INTERVAL | NFT_SET_MAP, 312 .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT,
302 .owner = THIS_MODULE, 313 .owner = THIS_MODULE,
303}; 314};
304 315
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index fc4977456c30..14857afc9937 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -40,6 +40,7 @@ MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
40MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module"); 40MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module");
41 41
42#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) 42#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
43#define XT_PCPU_BLOCK_SIZE 4096
43 44
44struct compat_delta { 45struct compat_delta {
45 unsigned int offset; /* offset in kernel */ 46 unsigned int offset; /* offset in kernel */
@@ -261,6 +262,60 @@ struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision)
261} 262}
262EXPORT_SYMBOL_GPL(xt_request_find_target); 263EXPORT_SYMBOL_GPL(xt_request_find_target);
263 264
265
266static int xt_obj_to_user(u16 __user *psize, u16 size,
267 void __user *pname, const char *name,
268 u8 __user *prev, u8 rev)
269{
270 if (put_user(size, psize))
271 return -EFAULT;
272 if (copy_to_user(pname, name, strlen(name) + 1))
273 return -EFAULT;
274 if (put_user(rev, prev))
275 return -EFAULT;
276
277 return 0;
278}
279
280#define XT_OBJ_TO_USER(U, K, TYPE, C_SIZE) \
281 xt_obj_to_user(&U->u.TYPE##_size, C_SIZE ? : K->u.TYPE##_size, \
282 U->u.user.name, K->u.kernel.TYPE->name, \
283 &U->u.user.revision, K->u.kernel.TYPE->revision)
284
285int xt_data_to_user(void __user *dst, const void *src,
286 int usersize, int size)
287{
288 usersize = usersize ? : size;
289 if (copy_to_user(dst, src, usersize))
290 return -EFAULT;
291 if (usersize != size && clear_user(dst + usersize, size - usersize))
292 return -EFAULT;
293
294 return 0;
295}
296EXPORT_SYMBOL_GPL(xt_data_to_user);
297
298#define XT_DATA_TO_USER(U, K, TYPE, C_SIZE) \
299 xt_data_to_user(U->data, K->data, \
300 K->u.kernel.TYPE->usersize, \
301 C_SIZE ? : K->u.kernel.TYPE->TYPE##size)
302
303int xt_match_to_user(const struct xt_entry_match *m,
304 struct xt_entry_match __user *u)
305{
306 return XT_OBJ_TO_USER(u, m, match, 0) ||
307 XT_DATA_TO_USER(u, m, match, 0);
308}
309EXPORT_SYMBOL_GPL(xt_match_to_user);
310
311int xt_target_to_user(const struct xt_entry_target *t,
312 struct xt_entry_target __user *u)
313{
314 return XT_OBJ_TO_USER(u, t, target, 0) ||
315 XT_DATA_TO_USER(u, t, target, 0);
316}
317EXPORT_SYMBOL_GPL(xt_target_to_user);
318
264static int match_revfn(u8 af, const char *name, u8 revision, int *bestp) 319static int match_revfn(u8 af, const char *name, u8 revision, int *bestp)
265{ 320{
266 const struct xt_match *m; 321 const struct xt_match *m;
@@ -564,17 +619,14 @@ int xt_compat_match_to_user(const struct xt_entry_match *m,
564 int off = xt_compat_match_offset(match); 619 int off = xt_compat_match_offset(match);
565 u_int16_t msize = m->u.user.match_size - off; 620 u_int16_t msize = m->u.user.match_size - off;
566 621
567 if (copy_to_user(cm, m, sizeof(*cm)) || 622 if (XT_OBJ_TO_USER(cm, m, match, msize))
568 put_user(msize, &cm->u.user.match_size) ||
569 copy_to_user(cm->u.user.name, m->u.kernel.match->name,
570 strlen(m->u.kernel.match->name) + 1))
571 return -EFAULT; 623 return -EFAULT;
572 624
573 if (match->compat_to_user) { 625 if (match->compat_to_user) {
574 if (match->compat_to_user((void __user *)cm->data, m->data)) 626 if (match->compat_to_user((void __user *)cm->data, m->data))
575 return -EFAULT; 627 return -EFAULT;
576 } else { 628 } else {
577 if (copy_to_user(cm->data, m->data, msize - sizeof(*cm))) 629 if (XT_DATA_TO_USER(cm, m, match, msize - sizeof(*cm)))
578 return -EFAULT; 630 return -EFAULT;
579 } 631 }
580 632
@@ -615,7 +667,7 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems,
615 COMPAT_XT_ALIGN(target_offset + sizeof(struct compat_xt_standard_target)) != next_offset) 667 COMPAT_XT_ALIGN(target_offset + sizeof(struct compat_xt_standard_target)) != next_offset)
616 return -EINVAL; 668 return -EINVAL;
617 669
618 /* compat_xt_entry match has less strict aligment requirements, 670 /* compat_xt_entry match has less strict alignment requirements,
619 * otherwise they are identical. In case of padding differences 671 * otherwise they are identical. In case of padding differences
620 * we need to add compat version of xt_check_entry_match. 672 * we need to add compat version of xt_check_entry_match.
621 */ 673 */
@@ -922,17 +974,14 @@ int xt_compat_target_to_user(const struct xt_entry_target *t,
922 int off = xt_compat_target_offset(target); 974 int off = xt_compat_target_offset(target);
923 u_int16_t tsize = t->u.user.target_size - off; 975 u_int16_t tsize = t->u.user.target_size - off;
924 976
925 if (copy_to_user(ct, t, sizeof(*ct)) || 977 if (XT_OBJ_TO_USER(ct, t, target, tsize))
926 put_user(tsize, &ct->u.user.target_size) ||
927 copy_to_user(ct->u.user.name, t->u.kernel.target->name,
928 strlen(t->u.kernel.target->name) + 1))
929 return -EFAULT; 978 return -EFAULT;
930 979
931 if (target->compat_to_user) { 980 if (target->compat_to_user) {
932 if (target->compat_to_user((void __user *)ct->data, t->data)) 981 if (target->compat_to_user((void __user *)ct->data, t->data))
933 return -EFAULT; 982 return -EFAULT;
934 } else { 983 } else {
935 if (copy_to_user(ct->data, t->data, tsize - sizeof(*ct))) 984 if (XT_DATA_TO_USER(ct, t, target, tsize - sizeof(*ct)))
936 return -EFAULT; 985 return -EFAULT;
937 } 986 }
938 987
@@ -958,7 +1007,9 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
958 if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) 1007 if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
959 info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); 1008 info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
960 if (!info) { 1009 if (!info) {
961 info = vmalloc(sz); 1010 info = __vmalloc(sz, GFP_KERNEL | __GFP_NOWARN |
1011 __GFP_NORETRY | __GFP_HIGHMEM,
1012 PAGE_KERNEL);
962 if (!info) 1013 if (!info)
963 return NULL; 1014 return NULL;
964 } 1015 }
@@ -982,7 +1033,7 @@ void xt_free_table_info(struct xt_table_info *info)
982} 1033}
983EXPORT_SYMBOL(xt_free_table_info); 1034EXPORT_SYMBOL(xt_free_table_info);
984 1035
985/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ 1036/* Find table by name, grabs mutex & ref. Returns NULL on error. */
986struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, 1037struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
987 const char *name) 1038 const char *name)
988{ 1039{
@@ -1615,6 +1666,59 @@ void xt_proto_fini(struct net *net, u_int8_t af)
1615} 1666}
1616EXPORT_SYMBOL_GPL(xt_proto_fini); 1667EXPORT_SYMBOL_GPL(xt_proto_fini);
1617 1668
1669/**
1670 * xt_percpu_counter_alloc - allocate x_tables rule counter
1671 *
1672 * @state: pointer to xt_percpu allocation state
1673 * @counter: pointer to counter struct inside the ip(6)/arpt_entry struct
1674 *
1675 * On SMP, the packet counter [ ip(6)t_entry->counters.pcnt ] will then
1676 * contain the address of the real (percpu) counter.
1677 *
1678 * Rule evaluation needs to use xt_get_this_cpu_counter() helper
1679 * to fetch the real percpu counter.
1680 *
1681 * To speed up allocation and improve data locality, a 4kb block is
1682 * allocated.
1683 *
1684 * xt_percpu_counter_alloc_state contains the base address of the
1685 * allocated page and the current sub-offset.
1686 *
1687 * Returns false on error.
1688 */
1689bool xt_percpu_counter_alloc(struct xt_percpu_counter_alloc_state *state,
1690 struct xt_counters *counter)
1691{
1692 BUILD_BUG_ON(XT_PCPU_BLOCK_SIZE < (sizeof(*counter) * 2));
1693
1694 if (nr_cpu_ids <= 1)
1695 return true;
1696
1697 if (!state->mem) {
1698 state->mem = __alloc_percpu(XT_PCPU_BLOCK_SIZE,
1699 XT_PCPU_BLOCK_SIZE);
1700 if (!state->mem)
1701 return false;
1702 }
1703 counter->pcnt = (__force unsigned long)(state->mem + state->off);
1704 state->off += sizeof(*counter);
1705 if (state->off > (XT_PCPU_BLOCK_SIZE - sizeof(*counter))) {
1706 state->mem = NULL;
1707 state->off = 0;
1708 }
1709 return true;
1710}
1711EXPORT_SYMBOL_GPL(xt_percpu_counter_alloc);
1712
1713void xt_percpu_counter_free(struct xt_counters *counters)
1714{
1715 unsigned long pcnt = counters->pcnt;
1716
1717 if (nr_cpu_ids > 1 && (pcnt & (XT_PCPU_BLOCK_SIZE - 1)) == 0)
1718 free_percpu((void __percpu *)pcnt);
1719}
1720EXPORT_SYMBOL_GPL(xt_percpu_counter_free);
1721
1618static int __net_init xt_net_init(struct net *net) 1722static int __net_init xt_net_init(struct net *net)
1619{ 1723{
1620 int i; 1724 int i;
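
The new xt_percpu_counter_alloc()/xt_percpu_counter_free() pair above carves per-rule counters out of a shared 4 KiB percpu block instead of issuing one percpu allocation per rule. Below is a minimal sketch of how a rule-construction loop could consume that allocator; it is an illustration only (example_alloc_rule_counters and its error handling are not part of this patch), reusing the state and counter types shown in the hunk.

/* Hedged sketch: claim one counter slot per rule from the shared percpu
 * block, releasing already-claimed slots on failure. Only the counter
 * that starts a block actually frees it, matching the
 * (pcnt & (XT_PCPU_BLOCK_SIZE - 1)) == 0 check above.
 */
static int example_alloc_rule_counters(struct xt_counters *counters,
				       unsigned int nrules)
{
	struct xt_percpu_counter_alloc_state state = {};
	unsigned int i;

	for (i = 0; i < nrules; i++) {
		if (!xt_percpu_counter_alloc(&state, &counters[i]))
			goto unwind;
	}
	return 0;

unwind:
	while (i-- > 0)
		xt_percpu_counter_free(&counters[i]);
	return -ENOMEM;
}
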
diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c
index 4973cbddc446..19247a17e511 100644
--- a/net/netfilter/xt_AUDIT.c
+++ b/net/netfilter/xt_AUDIT.c
@@ -132,9 +132,9 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
132 goto errout; 132 goto errout;
133 133
134 audit_log_format(ab, "action=%hhu hook=%u len=%u inif=%s outif=%s", 134 audit_log_format(ab, "action=%hhu hook=%u len=%u inif=%s outif=%s",
135 info->type, par->hooknum, skb->len, 135 info->type, xt_hooknum(par), skb->len,
136 par->in ? par->in->name : "?", 136 xt_in(par) ? xt_inname(par) : "?",
137 par->out ? par->out->name : "?"); 137 xt_out(par) ? xt_outname(par) : "?");
138 138
139 if (skb->mark) 139 if (skb->mark)
140 audit_log_format(ab, " mark=%#x", skb->mark); 140 audit_log_format(ab, " mark=%#x", skb->mark);
@@ -144,7 +144,7 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
144 eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, 144 eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
145 ntohs(eth_hdr(skb)->h_proto)); 145 ntohs(eth_hdr(skb)->h_proto));
146 146
147 if (par->family == NFPROTO_BRIDGE) { 147 if (xt_family(par) == NFPROTO_BRIDGE) {
148 switch (eth_hdr(skb)->h_proto) { 148 switch (eth_hdr(skb)->h_proto) {
149 case htons(ETH_P_IP): 149 case htons(ETH_P_IP):
150 audit_ip4(ab, skb); 150 audit_ip4(ab, skb);
@@ -157,7 +157,7 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
157 } 157 }
158 } 158 }
159 159
160 switch (par->family) { 160 switch (xt_family(par)) {
161 case NFPROTO_IPV4: 161 case NFPROTO_IPV4:
162 audit_ip4(ab, skb); 162 audit_ip4(ab, skb);
163 break; 163 break;
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
index e04dc282e3bb..da56c06a443c 100644
--- a/net/netfilter/xt_CONNSECMARK.c
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -106,7 +106,7 @@ static int connsecmark_tg_check(const struct xt_tgchk_param *par)
106 return -EINVAL; 106 return -EINVAL;
107 } 107 }
108 108
109 ret = nf_ct_l3proto_try_module_get(par->family); 109 ret = nf_ct_netns_get(par->net, par->family);
110 if (ret < 0) 110 if (ret < 0)
111 pr_info("cannot load conntrack support for proto=%u\n", 111 pr_info("cannot load conntrack support for proto=%u\n",
112 par->family); 112 par->family);
@@ -115,7 +115,7 @@ static int connsecmark_tg_check(const struct xt_tgchk_param *par)
115 115
116static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par) 116static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par)
117{ 117{
118 nf_ct_l3proto_module_put(par->family); 118 nf_ct_netns_put(par->net, par->family);
119} 119}
120 120
121static struct xt_target connsecmark_tg_reg __read_mostly = { 121static struct xt_target connsecmark_tg_reg __read_mostly = {
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 6669e68d589e..b008db0184b8 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -23,15 +23,14 @@
23static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct) 23static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct)
24{ 24{
25 /* Previously seen (loopback)? Ignore. */ 25 /* Previously seen (loopback)? Ignore. */
26 if (skb->nfct != NULL) 26 if (skb->_nfct != 0)
27 return XT_CONTINUE; 27 return XT_CONTINUE;
28 28
29 /* special case the untracked ct : we want the percpu object */ 29 /* special case the untracked ct : we want the percpu object */
30 if (!ct) 30 if (!ct)
31 ct = nf_ct_untracked_get(); 31 ct = nf_ct_untracked_get();
32 atomic_inc(&ct->ct_general.use); 32 atomic_inc(&ct->ct_general.use);
33 skb->nfct = &ct->ct_general; 33 nf_ct_set(skb, ct, IP_CT_NEW);
34 skb->nfctinfo = IP_CT_NEW;
35 34
36 return XT_CONTINUE; 35 return XT_CONTINUE;
37} 36}
@@ -216,7 +215,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
216 goto err1; 215 goto err1;
217#endif 216#endif
218 217
219 ret = nf_ct_l3proto_try_module_get(par->family); 218 ret = nf_ct_netns_get(par->net, par->family);
220 if (ret < 0) 219 if (ret < 0)
221 goto err1; 220 goto err1;
222 221
@@ -260,7 +259,7 @@ out:
260err3: 259err3:
261 nf_ct_tmpl_free(ct); 260 nf_ct_tmpl_free(ct);
262err2: 261err2:
263 nf_ct_l3proto_module_put(par->family); 262 nf_ct_netns_put(par->net, par->family);
264err1: 263err1:
265 return ret; 264 return ret;
266} 265}
@@ -341,7 +340,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par,
341 if (help) 340 if (help)
342 module_put(help->helper->me); 341 module_put(help->helper->me);
343 342
344 nf_ct_l3proto_module_put(par->family); 343 nf_ct_netns_put(par->net, par->family);
345 344
346 xt_ct_destroy_timeout(ct); 345 xt_ct_destroy_timeout(ct);
347 nf_ct_put(info->ct); 346 nf_ct_put(info->ct);
@@ -373,6 +372,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
373 .name = "CT", 372 .name = "CT",
374 .family = NFPROTO_UNSPEC, 373 .family = NFPROTO_UNSPEC,
375 .targetsize = sizeof(struct xt_ct_target_info), 374 .targetsize = sizeof(struct xt_ct_target_info),
375 .usersize = offsetof(struct xt_ct_target_info, ct),
376 .checkentry = xt_ct_tg_check_v0, 376 .checkentry = xt_ct_tg_check_v0,
377 .destroy = xt_ct_tg_destroy_v0, 377 .destroy = xt_ct_tg_destroy_v0,
378 .target = xt_ct_target_v0, 378 .target = xt_ct_target_v0,
@@ -384,6 +384,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
384 .family = NFPROTO_UNSPEC, 384 .family = NFPROTO_UNSPEC,
385 .revision = 1, 385 .revision = 1,
386 .targetsize = sizeof(struct xt_ct_target_info_v1), 386 .targetsize = sizeof(struct xt_ct_target_info_v1),
387 .usersize = offsetof(struct xt_ct_target_info, ct),
387 .checkentry = xt_ct_tg_check_v1, 388 .checkentry = xt_ct_tg_check_v1,
388 .destroy = xt_ct_tg_destroy_v1, 389 .destroy = xt_ct_tg_destroy_v1,
389 .target = xt_ct_target_v1, 390 .target = xt_ct_target_v1,
@@ -395,6 +396,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
395 .family = NFPROTO_UNSPEC, 396 .family = NFPROTO_UNSPEC,
396 .revision = 2, 397 .revision = 2,
397 .targetsize = sizeof(struct xt_ct_target_info_v1), 398 .targetsize = sizeof(struct xt_ct_target_info_v1),
399 .usersize = offsetof(struct xt_ct_target_info, ct),
398 .checkentry = xt_ct_tg_check_v2, 400 .checkentry = xt_ct_tg_check_v2,
399 .destroy = xt_ct_tg_destroy_v1, 401 .destroy = xt_ct_tg_destroy_v1,
400 .target = xt_ct_target_v1, 402 .target = xt_ct_target_v1,
@@ -407,12 +409,11 @@ static unsigned int
407notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) 409notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
408{ 410{
409 /* Previously seen (loopback)? Ignore. */ 411 /* Previously seen (loopback)? Ignore. */
410 if (skb->nfct != NULL) 412 if (skb->_nfct != 0)
411 return XT_CONTINUE; 413 return XT_CONTINUE;
412 414
413 skb->nfct = &nf_ct_untracked_get()->ct_general; 415 nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
414 skb->nfctinfo = IP_CT_NEW; 416 nf_conntrack_get(skb_nfct(skb));
415 nf_conntrack_get(skb->nfct);
416 417
417 return XT_CONTINUE; 418 return XT_CONTINUE;
418} 419}
diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c
index 1763ab82bcd7..c3b2017ebe41 100644
--- a/net/netfilter/xt_LOG.c
+++ b/net/netfilter/xt_LOG.c
@@ -32,15 +32,15 @@ static unsigned int
32log_tg(struct sk_buff *skb, const struct xt_action_param *par) 32log_tg(struct sk_buff *skb, const struct xt_action_param *par)
33{ 33{
34 const struct xt_log_info *loginfo = par->targinfo; 34 const struct xt_log_info *loginfo = par->targinfo;
35 struct net *net = xt_net(par);
35 struct nf_loginfo li; 36 struct nf_loginfo li;
36 struct net *net = par->net;
37 37
38 li.type = NF_LOG_TYPE_LOG; 38 li.type = NF_LOG_TYPE_LOG;
39 li.u.log.level = loginfo->level; 39 li.u.log.level = loginfo->level;
40 li.u.log.logflags = loginfo->logflags; 40 li.u.log.logflags = loginfo->logflags;
41 41
42 nf_log_packet(net, par->family, par->hooknum, skb, par->in, par->out, 42 nf_log_packet(net, xt_family(par), xt_hooknum(par), skb, xt_in(par),
43 &li, "%s", loginfo->prefix); 43 xt_out(par), &li, "%s", loginfo->prefix);
44 return XT_CONTINUE; 44 return XT_CONTINUE;
45} 45}
46 46
diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c
index b253e07cb1c5..e45a01255e70 100644
--- a/net/netfilter/xt_NETMAP.c
+++ b/net/netfilter/xt_NETMAP.c
@@ -33,8 +33,8 @@ netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par)
33 netmask.ip6[i] = ~(range->min_addr.ip6[i] ^ 33 netmask.ip6[i] = ~(range->min_addr.ip6[i] ^
34 range->max_addr.ip6[i]); 34 range->max_addr.ip6[i]);
35 35
36 if (par->hooknum == NF_INET_PRE_ROUTING || 36 if (xt_hooknum(par) == NF_INET_PRE_ROUTING ||
37 par->hooknum == NF_INET_LOCAL_OUT) 37 xt_hooknum(par) == NF_INET_LOCAL_OUT)
38 new_addr.in6 = ipv6_hdr(skb)->daddr; 38 new_addr.in6 = ipv6_hdr(skb)->daddr;
39 else 39 else
40 new_addr.in6 = ipv6_hdr(skb)->saddr; 40 new_addr.in6 = ipv6_hdr(skb)->saddr;
@@ -51,7 +51,7 @@ netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par)
51 newrange.min_proto = range->min_proto; 51 newrange.min_proto = range->min_proto;
52 newrange.max_proto = range->max_proto; 52 newrange.max_proto = range->max_proto;
53 53
54 return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum)); 54 return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(xt_hooknum(par)));
55} 55}
56 56
57static int netmap_tg6_checkentry(const struct xt_tgchk_param *par) 57static int netmap_tg6_checkentry(const struct xt_tgchk_param *par)
@@ -60,7 +60,12 @@ static int netmap_tg6_checkentry(const struct xt_tgchk_param *par)
60 60
61 if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) 61 if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
62 return -EINVAL; 62 return -EINVAL;
63 return 0; 63 return nf_ct_netns_get(par->net, par->family);
64}
65
66static void netmap_tg_destroy(const struct xt_tgdtor_param *par)
67{
68 nf_ct_netns_put(par->net, par->family);
64} 69}
65 70
66static unsigned int 71static unsigned int
@@ -72,16 +77,16 @@ netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par)
72 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; 77 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
73 struct nf_nat_range newrange; 78 struct nf_nat_range newrange;
74 79
75 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || 80 NF_CT_ASSERT(xt_hooknum(par) == NF_INET_PRE_ROUTING ||
76 par->hooknum == NF_INET_POST_ROUTING || 81 xt_hooknum(par) == NF_INET_POST_ROUTING ||
77 par->hooknum == NF_INET_LOCAL_OUT || 82 xt_hooknum(par) == NF_INET_LOCAL_OUT ||
78 par->hooknum == NF_INET_LOCAL_IN); 83 xt_hooknum(par) == NF_INET_LOCAL_IN);
79 ct = nf_ct_get(skb, &ctinfo); 84 ct = nf_ct_get(skb, &ctinfo);
80 85
81 netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); 86 netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
82 87
83 if (par->hooknum == NF_INET_PRE_ROUTING || 88 if (xt_hooknum(par) == NF_INET_PRE_ROUTING ||
84 par->hooknum == NF_INET_LOCAL_OUT) 89 xt_hooknum(par) == NF_INET_LOCAL_OUT)
85 new_ip = ip_hdr(skb)->daddr & ~netmask; 90 new_ip = ip_hdr(skb)->daddr & ~netmask;
86 else 91 else
87 new_ip = ip_hdr(skb)->saddr & ~netmask; 92 new_ip = ip_hdr(skb)->saddr & ~netmask;
@@ -96,7 +101,7 @@ netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par)
96 newrange.max_proto = mr->range[0].max; 101 newrange.max_proto = mr->range[0].max;
97 102
98 /* Hand modified range to generic setup. */ 103 /* Hand modified range to generic setup. */
99 return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum)); 104 return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(xt_hooknum(par)));
100} 105}
101 106
102static int netmap_tg4_check(const struct xt_tgchk_param *par) 107static int netmap_tg4_check(const struct xt_tgchk_param *par)
@@ -111,7 +116,7 @@ static int netmap_tg4_check(const struct xt_tgchk_param *par)
111 pr_debug("bad rangesize %u.\n", mr->rangesize); 116 pr_debug("bad rangesize %u.\n", mr->rangesize);
112 return -EINVAL; 117 return -EINVAL;
113 } 118 }
114 return 0; 119 return nf_ct_netns_get(par->net, par->family);
115} 120}
116 121
117static struct xt_target netmap_tg_reg[] __read_mostly = { 122static struct xt_target netmap_tg_reg[] __read_mostly = {
@@ -127,6 +132,7 @@ static struct xt_target netmap_tg_reg[] __read_mostly = {
127 (1 << NF_INET_LOCAL_OUT) | 132 (1 << NF_INET_LOCAL_OUT) |
128 (1 << NF_INET_LOCAL_IN), 133 (1 << NF_INET_LOCAL_IN),
129 .checkentry = netmap_tg6_checkentry, 134 .checkentry = netmap_tg6_checkentry,
135 .destroy = netmap_tg_destroy,
130 .me = THIS_MODULE, 136 .me = THIS_MODULE,
131 }, 137 },
132 { 138 {
@@ -141,6 +147,7 @@ static struct xt_target netmap_tg_reg[] __read_mostly = {
141 (1 << NF_INET_LOCAL_OUT) | 147 (1 << NF_INET_LOCAL_OUT) |
142 (1 << NF_INET_LOCAL_IN), 148 (1 << NF_INET_LOCAL_IN),
143 .checkentry = netmap_tg4_check, 149 .checkentry = netmap_tg4_check,
150 .destroy = netmap_tg_destroy,
144 .me = THIS_MODULE, 151 .me = THIS_MODULE,
145 }, 152 },
146}; 153};
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index 8668a5c18dc3..c7f8958cea4a 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -25,8 +25,8 @@ static unsigned int
25nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) 25nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
26{ 26{
27 const struct xt_nflog_info *info = par->targinfo; 27 const struct xt_nflog_info *info = par->targinfo;
28 struct net *net = xt_net(par);
28 struct nf_loginfo li; 29 struct nf_loginfo li;
29 struct net *net = par->net;
30 30
31 li.type = NF_LOG_TYPE_ULOG; 31 li.type = NF_LOG_TYPE_ULOG;
32 li.u.ulog.copy_len = info->len; 32 li.u.ulog.copy_len = info->len;
@@ -37,8 +37,8 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
37 if (info->flags & XT_NFLOG_F_COPY_LEN) 37 if (info->flags & XT_NFLOG_F_COPY_LEN)
38 li.u.ulog.flags |= NF_LOG_F_COPY_LEN; 38 li.u.ulog.flags |= NF_LOG_F_COPY_LEN;
39 39
40 nfulnl_log_packet(net, par->family, par->hooknum, skb, par->in, 40 nfulnl_log_packet(net, xt_family(par), xt_hooknum(par), skb,
41 par->out, &li, info->prefix); 41 xt_in(par), xt_out(par), &li, info->prefix);
42 return XT_CONTINUE; 42 return XT_CONTINUE;
43} 43}
44 44
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index 8f1779ff7e30..a360b99a958a 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -43,7 +43,7 @@ nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
43 43
44 if (info->queues_total > 1) { 44 if (info->queues_total > 1) {
45 queue = nfqueue_hash(skb, queue, info->queues_total, 45 queue = nfqueue_hash(skb, queue, info->queues_total,
46 par->family, jhash_initval); 46 xt_family(par), jhash_initval);
47 } 47 }
48 return NF_QUEUE_NR(queue); 48 return NF_QUEUE_NR(queue);
49} 49}
@@ -98,7 +98,7 @@ nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par)
98 queue = info->queuenum + cpu % info->queues_total; 98 queue = info->queuenum + cpu % info->queues_total;
99 } else { 99 } else {
100 queue = nfqueue_hash(skb, queue, info->queues_total, 100 queue = nfqueue_hash(skb, queue, info->queues_total,
101 par->family, jhash_initval); 101 xt_family(par), jhash_initval);
102 } 102 }
103 } 103 }
104 104
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index dbd6c4a12b97..498b54fd04d7 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -63,7 +63,7 @@ void xt_rateest_put(struct xt_rateest *est)
63 mutex_lock(&xt_rateest_mutex); 63 mutex_lock(&xt_rateest_mutex);
64 if (--est->refcnt == 0) { 64 if (--est->refcnt == 0) {
65 hlist_del(&est->list); 65 hlist_del(&est->list);
66 gen_kill_estimator(&est->bstats, &est->rstats); 66 gen_kill_estimator(&est->rate_est);
67 /* 67 /*
68 * gen_estimator est_timer() might access est->lock or bstats, 68 * gen_estimator est_timer() might access est->lock or bstats,
69 * wait a RCU grace period before freeing 'est' 69 * wait a RCU grace period before freeing 'est'
@@ -132,7 +132,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
132 cfg.est.interval = info->interval; 132 cfg.est.interval = info->interval;
133 cfg.est.ewma_log = info->ewma_log; 133 cfg.est.ewma_log = info->ewma_log;
134 134
135 ret = gen_new_estimator(&est->bstats, NULL, &est->rstats, 135 ret = gen_new_estimator(&est->bstats, NULL, &est->rate_est,
136 &est->lock, NULL, &cfg.opt); 136 &est->lock, NULL, &cfg.opt);
137 if (ret < 0) 137 if (ret < 0)
138 goto err2; 138 goto err2;
@@ -162,6 +162,7 @@ static struct xt_target xt_rateest_tg_reg __read_mostly = {
162 .checkentry = xt_rateest_tg_checkentry, 162 .checkentry = xt_rateest_tg_checkentry,
163 .destroy = xt_rateest_tg_destroy, 163 .destroy = xt_rateest_tg_destroy,
164 .targetsize = sizeof(struct xt_rateest_target_info), 164 .targetsize = sizeof(struct xt_rateest_target_info),
165 .usersize = offsetof(struct xt_rateest_target_info, est),
165 .me = THIS_MODULE, 166 .me = THIS_MODULE,
166}; 167};
167 168
diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c
index 03f0b370e178..98a4c6d4f1cb 100644
--- a/net/netfilter/xt_REDIRECT.c
+++ b/net/netfilter/xt_REDIRECT.c
@@ -31,7 +31,7 @@
31static unsigned int 31static unsigned int
32redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par) 32redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par)
33{ 33{
34 return nf_nat_redirect_ipv6(skb, par->targinfo, par->hooknum); 34 return nf_nat_redirect_ipv6(skb, par->targinfo, xt_hooknum(par));
35} 35}
36 36
37static int redirect_tg6_checkentry(const struct xt_tgchk_param *par) 37static int redirect_tg6_checkentry(const struct xt_tgchk_param *par)
@@ -40,7 +40,13 @@ static int redirect_tg6_checkentry(const struct xt_tgchk_param *par)
40 40
41 if (range->flags & NF_NAT_RANGE_MAP_IPS) 41 if (range->flags & NF_NAT_RANGE_MAP_IPS)
42 return -EINVAL; 42 return -EINVAL;
43 return 0; 43
44 return nf_ct_netns_get(par->net, par->family);
45}
46
47static void redirect_tg_destroy(const struct xt_tgdtor_param *par)
48{
49 nf_ct_netns_put(par->net, par->family);
44} 50}
45 51
46/* FIXME: Take multiple ranges --RR */ 52/* FIXME: Take multiple ranges --RR */
@@ -56,13 +62,13 @@ static int redirect_tg4_check(const struct xt_tgchk_param *par)
56 pr_debug("bad rangesize %u.\n", mr->rangesize); 62 pr_debug("bad rangesize %u.\n", mr->rangesize);
57 return -EINVAL; 63 return -EINVAL;
58 } 64 }
59 return 0; 65 return nf_ct_netns_get(par->net, par->family);
60} 66}
61 67
62static unsigned int 68static unsigned int
63redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par) 69redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par)
64{ 70{
65 return nf_nat_redirect_ipv4(skb, par->targinfo, par->hooknum); 71 return nf_nat_redirect_ipv4(skb, par->targinfo, xt_hooknum(par));
66} 72}
67 73
68static struct xt_target redirect_tg_reg[] __read_mostly = { 74static struct xt_target redirect_tg_reg[] __read_mostly = {
@@ -72,6 +78,7 @@ static struct xt_target redirect_tg_reg[] __read_mostly = {
72 .revision = 0, 78 .revision = 0,
73 .table = "nat", 79 .table = "nat",
74 .checkentry = redirect_tg6_checkentry, 80 .checkentry = redirect_tg6_checkentry,
81 .destroy = redirect_tg_destroy,
75 .target = redirect_tg6, 82 .target = redirect_tg6,
76 .targetsize = sizeof(struct nf_nat_range), 83 .targetsize = sizeof(struct nf_nat_range),
77 .hooks = (1 << NF_INET_PRE_ROUTING) | 84 .hooks = (1 << NF_INET_PRE_ROUTING) |
@@ -85,6 +92,7 @@ static struct xt_target redirect_tg_reg[] __read_mostly = {
85 .table = "nat", 92 .table = "nat",
86 .target = redirect_tg4, 93 .target = redirect_tg4,
87 .checkentry = redirect_tg4_check, 94 .checkentry = redirect_tg4_check,
95 .destroy = redirect_tg_destroy,
88 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), 96 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
89 .hooks = (1 << NF_INET_PRE_ROUTING) | 97 .hooks = (1 << NF_INET_PRE_ROUTING) |
90 (1 << NF_INET_LOCAL_OUT), 98 (1 << NF_INET_LOCAL_OUT),
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 872db2d0e2a9..c64aca611ac5 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -104,11 +104,11 @@ tcpmss_mangle_packet(struct sk_buff *skb,
104 tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); 104 tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff);
105 tcp_hdrlen = tcph->doff * 4; 105 tcp_hdrlen = tcph->doff * 4;
106 106
107 if (len < tcp_hdrlen) 107 if (len < tcp_hdrlen || tcp_hdrlen < sizeof(struct tcphdr))
108 return -1; 108 return -1;
109 109
110 if (info->mss == XT_TCPMSS_CLAMP_PMTU) { 110 if (info->mss == XT_TCPMSS_CLAMP_PMTU) {
111 struct net *net = par->net; 111 struct net *net = xt_net(par);
112 unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family); 112 unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family);
113 unsigned int min_mtu = min(dst_mtu(skb_dst(skb)), in_mtu); 113 unsigned int min_mtu = min(dst_mtu(skb_dst(skb)), in_mtu);
114 114
@@ -152,6 +152,10 @@ tcpmss_mangle_packet(struct sk_buff *skb,
152 if (len > tcp_hdrlen) 152 if (len > tcp_hdrlen)
153 return 0; 153 return 0;
154 154
155 /* tcph->doff has 4 bits, do not wrap it to 0 */
156 if (tcp_hdrlen >= 15 * 4)
157 return 0;
158
155 /* 159 /*
156 * MSS Option not found ?! add it.. 160 * MSS Option not found ?! add it..
157 */ 161 */
@@ -172,7 +176,7 @@ tcpmss_mangle_packet(struct sk_buff *skb,
172 * length IPv6 header of 60, ergo the default MSS value is 1220 176 * length IPv6 header of 60, ergo the default MSS value is 1220
173 * Since no MSS was provided, we must use the default values 177 * Since no MSS was provided, we must use the default values
174 */ 178 */
175 if (par->family == NFPROTO_IPV4) 179 if (xt_family(par) == NFPROTO_IPV4)
176 newmss = min(newmss, (u16)536); 180 newmss = min(newmss, (u16)536);
177 else 181 else
178 newmss = min(newmss, (u16)1220); 182 newmss = min(newmss, (u16)1220);
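
The new tcp_hdrlen >= 15 * 4 guard above exists because tcph->doff is a 4-bit count of 32-bit words: the largest encodable TCP header is 15 * 4 = 60 bytes, and appending the 4-byte MSS option to such a header would require doff = 16, which wraps to 0. A standalone illustration of that arithmetic (not part of the patch):

#include <stdio.h>

int main(void)
{
	unsigned int tcp_hdrlen = 15 * 4;               /* header already maximal */
	unsigned int new_doff   = (tcp_hdrlen + 4) / 4; /* 16 after adding MSS    */

	/* doff has only 4 bits, so 16 would be stored as 16 & 0xf == 0 */
	printf("stored doff: %u\n", new_doff & 0xf);
	return 0;
}
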
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 0471db4032c5..86b0580b2216 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -33,7 +33,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
33 const struct xt_tee_tginfo *info = par->targinfo; 33 const struct xt_tee_tginfo *info = par->targinfo;
34 int oif = info->priv ? info->priv->oif : 0; 34 int oif = info->priv ? info->priv->oif : 0;
35 35
36 nf_dup_ipv4(par->net, skb, par->hooknum, &info->gw.in, oif); 36 nf_dup_ipv4(xt_net(par), skb, xt_hooknum(par), &info->gw.in, oif);
37 37
38 return XT_CONTINUE; 38 return XT_CONTINUE;
39} 39}
@@ -45,7 +45,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
45 const struct xt_tee_tginfo *info = par->targinfo; 45 const struct xt_tee_tginfo *info = par->targinfo;
46 int oif = info->priv ? info->priv->oif : 0; 46 int oif = info->priv ? info->priv->oif : 0;
47 47
48 nf_dup_ipv6(par->net, skb, par->hooknum, &info->gw.in6, oif); 48 nf_dup_ipv6(xt_net(par), skb, xt_hooknum(par), &info->gw.in6, oif);
49 49
50 return XT_CONTINUE; 50 return XT_CONTINUE;
51} 51}
@@ -133,6 +133,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
133 .family = NFPROTO_IPV4, 133 .family = NFPROTO_IPV4,
134 .target = tee_tg4, 134 .target = tee_tg4,
135 .targetsize = sizeof(struct xt_tee_tginfo), 135 .targetsize = sizeof(struct xt_tee_tginfo),
136 .usersize = offsetof(struct xt_tee_tginfo, priv),
136 .checkentry = tee_tg_check, 137 .checkentry = tee_tg_check,
137 .destroy = tee_tg_destroy, 138 .destroy = tee_tg_destroy,
138 .me = THIS_MODULE, 139 .me = THIS_MODULE,
@@ -144,6 +145,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
144 .family = NFPROTO_IPV6, 145 .family = NFPROTO_IPV6,
145 .target = tee_tg6, 146 .target = tee_tg6,
146 .targetsize = sizeof(struct xt_tee_tginfo), 147 .targetsize = sizeof(struct xt_tee_tginfo),
148 .usersize = offsetof(struct xt_tee_tginfo, priv),
147 .checkentry = tee_tg_check, 149 .checkentry = tee_tg_check,
148 .destroy = tee_tg_destroy, 150 .destroy = tee_tg_destroy,
149 .me = THIS_MODULE, 151 .me = THIS_MODULE,
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 663c4c3c9072..df7f1df00330 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -364,7 +364,8 @@ tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par)
364{ 364{
365 const struct xt_tproxy_target_info *tgi = par->targinfo; 365 const struct xt_tproxy_target_info *tgi = par->targinfo;
366 366
367 return tproxy_tg4(par->net, skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value); 367 return tproxy_tg4(xt_net(par), skb, tgi->laddr, tgi->lport,
368 tgi->mark_mask, tgi->mark_value);
368} 369}
369 370
370static unsigned int 371static unsigned int
@@ -372,7 +373,8 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
372{ 373{
373 const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; 374 const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
374 375
375 return tproxy_tg4(par->net, skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value); 376 return tproxy_tg4(xt_net(par), skb, tgi->laddr.ip, tgi->lport,
377 tgi->mark_mask, tgi->mark_value);
376} 378}
377 379
378#ifdef XT_TPROXY_HAVE_IPV6 380#ifdef XT_TPROXY_HAVE_IPV6
@@ -391,7 +393,8 @@ tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
391 393
392 rcu_read_lock(); 394 rcu_read_lock();
393 indev = __in6_dev_get(skb->dev); 395 indev = __in6_dev_get(skb->dev);
394 if (indev) 396 if (indev) {
397 read_lock_bh(&indev->lock);
395 list_for_each_entry(ifa, &indev->addr_list, if_list) { 398 list_for_each_entry(ifa, &indev->addr_list, if_list) {
396 if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED)) 399 if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
397 continue; 400 continue;
@@ -399,6 +402,8 @@ tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
399 laddr = &ifa->addr; 402 laddr = &ifa->addr;
400 break; 403 break;
401 } 404 }
405 read_unlock_bh(&indev->lock);
406 }
402 rcu_read_unlock(); 407 rcu_read_unlock();
403 408
404 return laddr ? laddr : daddr; 409 return laddr ? laddr : daddr;
@@ -442,7 +447,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
442 * to a listener socket if there's one */ 447 * to a listener socket if there's one */
443 struct sock *sk2; 448 struct sock *sk2;
444 449
445 sk2 = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto, 450 sk2 = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto,
446 &iph->saddr, 451 &iph->saddr,
447 tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr), 452 tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
448 hp->source, 453 hp->source,
@@ -485,10 +490,10 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
485 * addresses, this happens if the redirect already happened 490 * addresses, this happens if the redirect already happened
486 * and the current packet belongs to an already established 491 * and the current packet belongs to an already established
487 * connection */ 492 * connection */
488 sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto, 493 sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto,
489 &iph->saddr, &iph->daddr, 494 &iph->saddr, &iph->daddr,
490 hp->source, hp->dest, 495 hp->source, hp->dest,
491 par->in, NFT_LOOKUP_ESTABLISHED); 496 xt_in(par), NFT_LOOKUP_ESTABLISHED);
492 497
493 laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr); 498 laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
494 lport = tgi->lport ? tgi->lport : hp->dest; 499 lport = tgi->lport ? tgi->lport : hp->dest;
@@ -500,10 +505,10 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
500 else if (!sk) 505 else if (!sk)
501 /* no there's no established connection, check if 506 /* no there's no established connection, check if
502 * there's a listener on the redirected addr/port */ 507 * there's a listener on the redirected addr/port */
503 sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, 508 sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp,
504 tproto, &iph->saddr, laddr, 509 tproto, &iph->saddr, laddr,
505 hp->source, lport, 510 hp->source, lport,
506 par->in, NFT_LOOKUP_LISTENER); 511 xt_in(par), NFT_LOOKUP_LISTENER);
507 512
508 /* NOTE: assign_sock consumes our sk reference */ 513 /* NOTE: assign_sock consumes our sk reference */
509 if (sk && tproxy_sk_is_transparent(sk)) { 514 if (sk && tproxy_sk_is_transparent(sk)) {
@@ -529,6 +534,11 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
529static int tproxy_tg6_check(const struct xt_tgchk_param *par) 534static int tproxy_tg6_check(const struct xt_tgchk_param *par)
530{ 535{
531 const struct ip6t_ip6 *i = par->entryinfo; 536 const struct ip6t_ip6 *i = par->entryinfo;
537 int err;
538
539 err = nf_defrag_ipv6_enable(par->net);
540 if (err)
541 return err;
532 542
533 if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) && 543 if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) &&
534 !(i->invflags & IP6T_INV_PROTO)) 544 !(i->invflags & IP6T_INV_PROTO))
@@ -543,6 +553,11 @@ static int tproxy_tg6_check(const struct xt_tgchk_param *par)
543static int tproxy_tg4_check(const struct xt_tgchk_param *par) 553static int tproxy_tg4_check(const struct xt_tgchk_param *par)
544{ 554{
545 const struct ipt_ip *i = par->entryinfo; 555 const struct ipt_ip *i = par->entryinfo;
556 int err;
557
558 err = nf_defrag_ipv4_enable(par->net);
559 if (err)
560 return err;
546 561
547 if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) 562 if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP)
548 && !(i->invflags & IPT_INV_PROTO)) 563 && !(i->invflags & IPT_INV_PROTO))
@@ -594,11 +609,6 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = {
594 609
595static int __init tproxy_tg_init(void) 610static int __init tproxy_tg_init(void)
596{ 611{
597 nf_defrag_ipv4_enable();
598#ifdef XT_TPROXY_HAVE_IPV6
599 nf_defrag_ipv6_enable();
600#endif
601
602 return xt_register_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg)); 612 return xt_register_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
603} 613}
604 614
diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c
index 11d6091991a4..e329dabde35f 100644
--- a/net/netfilter/xt_addrtype.c
+++ b/net/netfilter/xt_addrtype.c
@@ -125,7 +125,7 @@ static inline bool match_type(struct net *net, const struct net_device *dev,
125static bool 125static bool
126addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) 126addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
127{ 127{
128 struct net *net = par->net; 128 struct net *net = xt_net(par);
129 const struct xt_addrtype_info *info = par->matchinfo; 129 const struct xt_addrtype_info *info = par->matchinfo;
130 const struct iphdr *iph = ip_hdr(skb); 130 const struct iphdr *iph = ip_hdr(skb);
131 bool ret = true; 131 bool ret = true;
@@ -143,19 +143,19 @@ addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
143static bool 143static bool
144addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) 144addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
145{ 145{
146 struct net *net = par->net; 146 struct net *net = xt_net(par);
147 const struct xt_addrtype_info_v1 *info = par->matchinfo; 147 const struct xt_addrtype_info_v1 *info = par->matchinfo;
148 const struct iphdr *iph; 148 const struct iphdr *iph;
149 const struct net_device *dev = NULL; 149 const struct net_device *dev = NULL;
150 bool ret = true; 150 bool ret = true;
151 151
152 if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN) 152 if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN)
153 dev = par->in; 153 dev = xt_in(par);
154 else if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) 154 else if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT)
155 dev = par->out; 155 dev = xt_out(par);
156 156
157#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) 157#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
158 if (par->family == NFPROTO_IPV6) 158 if (xt_family(par) == NFPROTO_IPV6)
159 return addrtype_mt6(net, dev, skb, info); 159 return addrtype_mt6(net, dev, skb, info);
160#endif 160#endif
161 iph = ip_hdr(skb); 161 iph = ip_hdr(skb);
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index dffee9d47ec4..38986a95216c 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -10,6 +10,7 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/skbuff.h> 11#include <linux/skbuff.h>
12#include <linux/filter.h> 12#include <linux/filter.h>
13#include <linux/bpf.h>
13 14
14#include <linux/netfilter/xt_bpf.h> 15#include <linux/netfilter/xt_bpf.h>
15#include <linux/netfilter/x_tables.h> 16#include <linux/netfilter/x_tables.h>
@@ -20,15 +21,15 @@ MODULE_LICENSE("GPL");
20MODULE_ALIAS("ipt_bpf"); 21MODULE_ALIAS("ipt_bpf");
21MODULE_ALIAS("ip6t_bpf"); 22MODULE_ALIAS("ip6t_bpf");
22 23
23static int bpf_mt_check(const struct xt_mtchk_param *par) 24static int __bpf_mt_check_bytecode(struct sock_filter *insns, __u16 len,
25 struct bpf_prog **ret)
24{ 26{
25 struct xt_bpf_info *info = par->matchinfo;
26 struct sock_fprog_kern program; 27 struct sock_fprog_kern program;
27 28
28 program.len = info->bpf_program_num_elem; 29 program.len = len;
29 program.filter = info->bpf_program; 30 program.filter = insns;
30 31
31 if (bpf_prog_create(&info->filter, &program)) { 32 if (bpf_prog_create(ret, &program)) {
32 pr_info("bpf: check failed: parse error\n"); 33 pr_info("bpf: check failed: parse error\n");
33 return -EINVAL; 34 return -EINVAL;
34 } 35 }
@@ -36,6 +37,42 @@ static int bpf_mt_check(const struct xt_mtchk_param *par)
36 return 0; 37 return 0;
37} 38}
38 39
40static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret)
41{
42 struct bpf_prog *prog;
43
44 prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
45 if (IS_ERR(prog))
46 return PTR_ERR(prog);
47
48 *ret = prog;
49 return 0;
50}
51
52static int bpf_mt_check(const struct xt_mtchk_param *par)
53{
54 struct xt_bpf_info *info = par->matchinfo;
55
56 return __bpf_mt_check_bytecode(info->bpf_program,
57 info->bpf_program_num_elem,
58 &info->filter);
59}
60
61static int bpf_mt_check_v1(const struct xt_mtchk_param *par)
62{
63 struct xt_bpf_info_v1 *info = par->matchinfo;
64
65 if (info->mode == XT_BPF_MODE_BYTECODE)
66 return __bpf_mt_check_bytecode(info->bpf_program,
67 info->bpf_program_num_elem,
68 &info->filter);
69 else if (info->mode == XT_BPF_MODE_FD_PINNED ||
70 info->mode == XT_BPF_MODE_FD_ELF)
71 return __bpf_mt_check_fd(info->fd, &info->filter);
72 else
73 return -EINVAL;
74}
75
39static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par) 76static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par)
40{ 77{
41 const struct xt_bpf_info *info = par->matchinfo; 78 const struct xt_bpf_info *info = par->matchinfo;
@@ -43,31 +80,60 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par)
43 return BPF_PROG_RUN(info->filter, skb); 80 return BPF_PROG_RUN(info->filter, skb);
44} 81}
45 82
83static bool bpf_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
84{
85 const struct xt_bpf_info_v1 *info = par->matchinfo;
86
87 return !!bpf_prog_run_save_cb(info->filter, (struct sk_buff *) skb);
88}
89
46static void bpf_mt_destroy(const struct xt_mtdtor_param *par) 90static void bpf_mt_destroy(const struct xt_mtdtor_param *par)
47{ 91{
48 const struct xt_bpf_info *info = par->matchinfo; 92 const struct xt_bpf_info *info = par->matchinfo;
93
94 bpf_prog_destroy(info->filter);
95}
96
97static void bpf_mt_destroy_v1(const struct xt_mtdtor_param *par)
98{
99 const struct xt_bpf_info_v1 *info = par->matchinfo;
100
49 bpf_prog_destroy(info->filter); 101 bpf_prog_destroy(info->filter);
50} 102}
51 103
52static struct xt_match bpf_mt_reg __read_mostly = { 104static struct xt_match bpf_mt_reg[] __read_mostly = {
53 .name = "bpf", 105 {
54 .revision = 0, 106 .name = "bpf",
55 .family = NFPROTO_UNSPEC, 107 .revision = 0,
56 .checkentry = bpf_mt_check, 108 .family = NFPROTO_UNSPEC,
57 .match = bpf_mt, 109 .checkentry = bpf_mt_check,
58 .destroy = bpf_mt_destroy, 110 .match = bpf_mt,
59 .matchsize = sizeof(struct xt_bpf_info), 111 .destroy = bpf_mt_destroy,
60 .me = THIS_MODULE, 112 .matchsize = sizeof(struct xt_bpf_info),
113 .usersize = offsetof(struct xt_bpf_info, filter),
114 .me = THIS_MODULE,
115 },
116 {
117 .name = "bpf",
118 .revision = 1,
119 .family = NFPROTO_UNSPEC,
120 .checkentry = bpf_mt_check_v1,
121 .match = bpf_mt_v1,
122 .destroy = bpf_mt_destroy_v1,
123 .matchsize = sizeof(struct xt_bpf_info_v1),
124 .usersize = offsetof(struct xt_bpf_info_v1, filter),
125 .me = THIS_MODULE,
126 },
61}; 127};
62 128
63static int __init bpf_mt_init(void) 129static int __init bpf_mt_init(void)
64{ 130{
65 return xt_register_match(&bpf_mt_reg); 131 return xt_register_matches(bpf_mt_reg, ARRAY_SIZE(bpf_mt_reg));
66} 132}
67 133
68static void __exit bpf_mt_exit(void) 134static void __exit bpf_mt_exit(void)
69{ 135{
70 xt_unregister_match(&bpf_mt_reg); 136 xt_unregister_matches(bpf_mt_reg, ARRAY_SIZE(bpf_mt_reg));
71} 137}
72 138
73module_init(bpf_mt_init); 139module_init(bpf_mt_init);
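
Revision 1 of the bpf match above lets userspace hand the kernel an already-loaded eBPF program by file descriptor (XT_BPF_MODE_FD_PINNED / XT_BPF_MODE_FD_ELF) instead of classic BPF bytecode; __bpf_mt_check_fd() resolves that fd to a BPF_PROG_TYPE_SOCKET_FILTER program. A hedged userspace-side sketch follows; it assumes libbpf's bpf_obj_get() and a program already pinned at a hypothetical path, and omits the iptables plumbing that would actually install the rule:

#include <bpf/bpf.h>
#include <linux/netfilter/xt_bpf.h>
#include <string.h>

static int fill_xt_bpf_info_v1(struct xt_bpf_info_v1 *info)
{
	/* hypothetical pin path; any pinned socket-filter program would do */
	int fd = bpf_obj_get("/sys/fs/bpf/example_filter");

	if (fd < 0)
		return fd;

	memset(info, 0, sizeof(*info));
	info->mode = XT_BPF_MODE_FD_PINNED;
	info->fd = fd;	/* kernel resolves this into info->filter at checkentry */
	return 0;
}
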
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index a086a914865f..1db1ce59079f 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -122,6 +122,7 @@ static struct xt_match cgroup_mt_reg[] __read_mostly = {
122 .checkentry = cgroup_mt_check_v1, 122 .checkentry = cgroup_mt_check_v1,
123 .match = cgroup_mt_v1, 123 .match = cgroup_mt_v1,
124 .matchsize = sizeof(struct xt_cgroup_info_v1), 124 .matchsize = sizeof(struct xt_cgroup_info_v1),
125 .usersize = offsetof(struct xt_cgroup_info_v1, priv),
125 .destroy = cgroup_mt_destroy_v1, 126 .destroy = cgroup_mt_destroy_v1,
126 .me = THIS_MODULE, 127 .me = THIS_MODULE,
127 .hooks = (1 << NF_INET_LOCAL_OUT) | 128 .hooks = (1 << NF_INET_LOCAL_OUT) |
diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c
index 96fa26b20b67..9a9884a39c0e 100644
--- a/net/netfilter/xt_cluster.c
+++ b/net/netfilter/xt_cluster.c
@@ -112,7 +112,7 @@ xt_cluster_mt(const struct sk_buff *skb, struct xt_action_param *par)
112 * know, matches should not alter packets, but we are doing this here 112 * know, matches should not alter packets, but we are doing this here
113 * because we would need to add a PKTTYPE target for this sole purpose. 113 * because we would need to add a PKTTYPE target for this sole purpose.
114 */ 114 */
115 if (!xt_cluster_is_multicast_addr(skb, par->family) && 115 if (!xt_cluster_is_multicast_addr(skb, xt_family(par)) &&
116 skb->pkt_type == PACKET_MULTICAST) { 116 skb->pkt_type == PACKET_MULTICAST) {
117 pskb->pkt_type = PACKET_HOST; 117 pskb->pkt_type = PACKET_HOST;
118 } 118 }
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index d4bec261e74e..cad0b7b5eb35 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -110,7 +110,7 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par)
110 sinfo->direction != XT_CONNBYTES_DIR_BOTH) 110 sinfo->direction != XT_CONNBYTES_DIR_BOTH)
111 return -EINVAL; 111 return -EINVAL;
112 112
113 ret = nf_ct_l3proto_try_module_get(par->family); 113 ret = nf_ct_netns_get(par->net, par->family);
114 if (ret < 0) 114 if (ret < 0)
115 pr_info("cannot load conntrack support for proto=%u\n", 115 pr_info("cannot load conntrack support for proto=%u\n",
116 par->family); 116 par->family);
@@ -129,7 +129,7 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par)
129 129
130static void connbytes_mt_destroy(const struct xt_mtdtor_param *par) 130static void connbytes_mt_destroy(const struct xt_mtdtor_param *par)
131{ 131{
132 nf_ct_l3proto_module_put(par->family); 132 nf_ct_netns_put(par->net, par->family);
133} 133}
134 134
135static struct xt_match connbytes_mt_reg __read_mostly = { 135static struct xt_match connbytes_mt_reg __read_mostly = {
diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c
index 03d66f1c5e69..7827128d5a95 100644
--- a/net/netfilter/xt_connlabel.c
+++ b/net/netfilter/xt_connlabel.c
@@ -61,7 +61,7 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par)
61 return -EINVAL; 61 return -EINVAL;
62 } 62 }
63 63
64 ret = nf_ct_l3proto_try_module_get(par->family); 64 ret = nf_ct_netns_get(par->net, par->family);
65 if (ret < 0) { 65 if (ret < 0) {
66 pr_info("cannot load conntrack support for proto=%u\n", 66 pr_info("cannot load conntrack support for proto=%u\n",
67 par->family); 67 par->family);
@@ -70,14 +70,14 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par)
70 70
71 ret = nf_connlabels_get(par->net, info->bit); 71 ret = nf_connlabels_get(par->net, info->bit);
72 if (ret < 0) 72 if (ret < 0)
73 nf_ct_l3proto_module_put(par->family); 73 nf_ct_netns_put(par->net, par->family);
74 return ret; 74 return ret;
75} 75}
76 76
77static void connlabel_mt_destroy(const struct xt_mtdtor_param *par) 77static void connlabel_mt_destroy(const struct xt_mtdtor_param *par)
78{ 78{
79 nf_connlabels_put(par->net); 79 nf_connlabels_put(par->net);
80 nf_ct_l3proto_module_put(par->family); 80 nf_ct_netns_put(par->net, par->family);
81} 81}
82 82
83static struct xt_match connlabels_mt_reg __read_mostly = { 83static struct xt_match connlabels_mt_reg __read_mostly = {
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index b6dc322593a3..b8fd4ab762ed 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -218,7 +218,7 @@ count_tree(struct net *net, struct rb_root *root,
218 int diff; 218 int diff;
219 bool addit; 219 bool addit;
220 220
221 rbconn = container_of(*rbnode, struct xt_connlimit_rb, node); 221 rbconn = rb_entry(*rbnode, struct xt_connlimit_rb, node);
222 222
223 parent = *rbnode; 223 parent = *rbnode;
224 diff = same_source_net(addr, mask, &rbconn->addr, family); 224 diff = same_source_net(addr, mask, &rbconn->addr, family);
@@ -317,7 +317,7 @@ static int count_them(struct net *net,
317static bool 317static bool
318connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) 318connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
319{ 319{
320 struct net *net = par->net; 320 struct net *net = xt_net(par);
321 const struct xt_connlimit_info *info = par->matchinfo; 321 const struct xt_connlimit_info *info = par->matchinfo;
322 union nf_inet_addr addr; 322 union nf_inet_addr addr;
323 struct nf_conntrack_tuple tuple; 323 struct nf_conntrack_tuple tuple;
@@ -332,11 +332,11 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
332 tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 332 tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
333 zone = nf_ct_zone(ct); 333 zone = nf_ct_zone(ct);
334 } else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), 334 } else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
335 par->family, net, &tuple)) { 335 xt_family(par), net, &tuple)) {
336 goto hotdrop; 336 goto hotdrop;
337 } 337 }
338 338
339 if (par->family == NFPROTO_IPV6) { 339 if (xt_family(par) == NFPROTO_IPV6) {
340 const struct ipv6hdr *iph = ipv6_hdr(skb); 340 const struct ipv6hdr *iph = ipv6_hdr(skb);
341 memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ? 341 memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ?
342 &iph->daddr : &iph->saddr, sizeof(addr.ip6)); 342 &iph->daddr : &iph->saddr, sizeof(addr.ip6));
@@ -347,7 +347,7 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
347 } 347 }
348 348
349 connections = count_them(net, info->data, tuple_ptr, &addr, 349 connections = count_them(net, info->data, tuple_ptr, &addr,
350 &info->mask, par->family, zone); 350 &info->mask, xt_family(par), zone);
351 if (connections == 0) 351 if (connections == 0)
352 /* kmalloc failed, drop it entirely */ 352 /* kmalloc failed, drop it entirely */
353 goto hotdrop; 353 goto hotdrop;
@@ -368,7 +368,7 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
368 368
369 net_get_random_once(&connlimit_rnd, sizeof(connlimit_rnd)); 369 net_get_random_once(&connlimit_rnd, sizeof(connlimit_rnd));
370 370
371 ret = nf_ct_l3proto_try_module_get(par->family); 371 ret = nf_ct_netns_get(par->net, par->family);
372 if (ret < 0) { 372 if (ret < 0) {
373 pr_info("cannot load conntrack support for " 373 pr_info("cannot load conntrack support for "
374 "address family %u\n", par->family); 374 "address family %u\n", par->family);
@@ -378,7 +378,7 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
378 /* init private data */ 378 /* init private data */
379 info->data = kmalloc(sizeof(struct xt_connlimit_data), GFP_KERNEL); 379 info->data = kmalloc(sizeof(struct xt_connlimit_data), GFP_KERNEL);
380 if (info->data == NULL) { 380 if (info->data == NULL) {
381 nf_ct_l3proto_module_put(par->family); 381 nf_ct_netns_put(par->net, par->family);
382 return -ENOMEM; 382 return -ENOMEM;
383 } 383 }
384 384
@@ -398,7 +398,7 @@ static void destroy_tree(struct rb_root *r)
398 struct rb_node *node; 398 struct rb_node *node;
399 399
400 while ((node = rb_first(r)) != NULL) { 400 while ((node = rb_first(r)) != NULL) {
401 rbconn = container_of(node, struct xt_connlimit_rb, node); 401 rbconn = rb_entry(node, struct xt_connlimit_rb, node);
402 402
403 rb_erase(node, r); 403 rb_erase(node, r);
404 404
@@ -414,7 +414,7 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
414 const struct xt_connlimit_info *info = par->matchinfo; 414 const struct xt_connlimit_info *info = par->matchinfo;
415 unsigned int i; 415 unsigned int i;
416 416
417 nf_ct_l3proto_module_put(par->family); 417 nf_ct_netns_put(par->net, par->family);
418 418
419 for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i) 419 for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i)
420 destroy_tree(&info->data->climit_root4[i]); 420 destroy_tree(&info->data->climit_root4[i]);
@@ -431,6 +431,7 @@ static struct xt_match connlimit_mt_reg __read_mostly = {
431 .checkentry = connlimit_mt_check, 431 .checkentry = connlimit_mt_check,
432 .match = connlimit_mt, 432 .match = connlimit_mt,
433 .matchsize = sizeof(struct xt_connlimit_info), 433 .matchsize = sizeof(struct xt_connlimit_info),
434 .usersize = offsetof(struct xt_connlimit_info, data),
434 .destroy = connlimit_mt_destroy, 435 .destroy = connlimit_mt_destroy,
435 .me = THIS_MODULE, 436 .me = THIS_MODULE,
436}; 437};
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index b83e158e116a..9935d5029b0e 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -77,7 +77,7 @@ static int connmark_tg_check(const struct xt_tgchk_param *par)
77{ 77{
78 int ret; 78 int ret;
79 79
80 ret = nf_ct_l3proto_try_module_get(par->family); 80 ret = nf_ct_netns_get(par->net, par->family);
81 if (ret < 0) 81 if (ret < 0)
82 pr_info("cannot load conntrack support for proto=%u\n", 82 pr_info("cannot load conntrack support for proto=%u\n",
83 par->family); 83 par->family);
@@ -86,7 +86,7 @@ static int connmark_tg_check(const struct xt_tgchk_param *par)
86 86
87static void connmark_tg_destroy(const struct xt_tgdtor_param *par) 87static void connmark_tg_destroy(const struct xt_tgdtor_param *par)
88{ 88{
89 nf_ct_l3proto_module_put(par->family); 89 nf_ct_netns_put(par->net, par->family);
90} 90}
91 91
92static bool 92static bool
@@ -107,7 +107,7 @@ static int connmark_mt_check(const struct xt_mtchk_param *par)
107{ 107{
108 int ret; 108 int ret;
109 109
110 ret = nf_ct_l3proto_try_module_get(par->family); 110 ret = nf_ct_netns_get(par->net, par->family);
111 if (ret < 0) 111 if (ret < 0)
112 pr_info("cannot load conntrack support for proto=%u\n", 112 pr_info("cannot load conntrack support for proto=%u\n",
113 par->family); 113 par->family);
@@ -116,7 +116,7 @@ static int connmark_mt_check(const struct xt_mtchk_param *par)
116 116
117static void connmark_mt_destroy(const struct xt_mtdtor_param *par) 117static void connmark_mt_destroy(const struct xt_mtdtor_param *par)
118{ 118{
119 nf_ct_l3proto_module_put(par->family); 119 nf_ct_netns_put(par->net, par->family);
120} 120}
121 121
122static struct xt_target connmark_tg_reg __read_mostly = { 122static struct xt_target connmark_tg_reg __read_mostly = {
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index a3b8f697cfc5..c0fb217bc649 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -200,22 +200,22 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
200 return false; 200 return false;
201 201
202 if (info->match_flags & XT_CONNTRACK_ORIGSRC) 202 if (info->match_flags & XT_CONNTRACK_ORIGSRC)
203 if (conntrack_mt_origsrc(ct, info, par->family) ^ 203 if (conntrack_mt_origsrc(ct, info, xt_family(par)) ^
204 !(info->invert_flags & XT_CONNTRACK_ORIGSRC)) 204 !(info->invert_flags & XT_CONNTRACK_ORIGSRC))
205 return false; 205 return false;
206 206
207 if (info->match_flags & XT_CONNTRACK_ORIGDST) 207 if (info->match_flags & XT_CONNTRACK_ORIGDST)
208 if (conntrack_mt_origdst(ct, info, par->family) ^ 208 if (conntrack_mt_origdst(ct, info, xt_family(par)) ^
209 !(info->invert_flags & XT_CONNTRACK_ORIGDST)) 209 !(info->invert_flags & XT_CONNTRACK_ORIGDST))
210 return false; 210 return false;
211 211
212 if (info->match_flags & XT_CONNTRACK_REPLSRC) 212 if (info->match_flags & XT_CONNTRACK_REPLSRC)
213 if (conntrack_mt_replsrc(ct, info, par->family) ^ 213 if (conntrack_mt_replsrc(ct, info, xt_family(par)) ^
214 !(info->invert_flags & XT_CONNTRACK_REPLSRC)) 214 !(info->invert_flags & XT_CONNTRACK_REPLSRC))
215 return false; 215 return false;
216 216
217 if (info->match_flags & XT_CONNTRACK_REPLDST) 217 if (info->match_flags & XT_CONNTRACK_REPLDST)
218 if (conntrack_mt_repldst(ct, info, par->family) ^ 218 if (conntrack_mt_repldst(ct, info, xt_family(par)) ^
219 !(info->invert_flags & XT_CONNTRACK_REPLDST)) 219 !(info->invert_flags & XT_CONNTRACK_REPLDST))
220 return false; 220 return false;
221 221
@@ -271,7 +271,7 @@ static int conntrack_mt_check(const struct xt_mtchk_param *par)
271{ 271{
272 int ret; 272 int ret;
273 273
274 ret = nf_ct_l3proto_try_module_get(par->family); 274 ret = nf_ct_netns_get(par->net, par->family);
275 if (ret < 0) 275 if (ret < 0)
276 pr_info("cannot load conntrack support for proto=%u\n", 276 pr_info("cannot load conntrack support for proto=%u\n",
277 par->family); 277 par->family);
@@ -280,7 +280,7 @@ static int conntrack_mt_check(const struct xt_mtchk_param *par)
280 280
281static void conntrack_mt_destroy(const struct xt_mtdtor_param *par) 281static void conntrack_mt_destroy(const struct xt_mtdtor_param *par)
282{ 282{
283 nf_ct_l3proto_module_put(par->family); 283 nf_ct_netns_put(par->net, par->family);
284} 284}
285 285
286static struct xt_match conntrack_mt_reg[] __read_mostly = { 286static struct xt_match conntrack_mt_reg[] __read_mostly = {
diff --git a/net/netfilter/xt_devgroup.c b/net/netfilter/xt_devgroup.c
index d9202cdd25c9..96ebe1cdefec 100644
--- a/net/netfilter/xt_devgroup.c
+++ b/net/netfilter/xt_devgroup.c
@@ -24,12 +24,12 @@ static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
24 const struct xt_devgroup_info *info = par->matchinfo; 24 const struct xt_devgroup_info *info = par->matchinfo;
25 25
26 if (info->flags & XT_DEVGROUP_MATCH_SRC && 26 if (info->flags & XT_DEVGROUP_MATCH_SRC &&
27 (((info->src_group ^ par->in->group) & info->src_mask ? 1 : 0) ^ 27 (((info->src_group ^ xt_in(par)->group) & info->src_mask ? 1 : 0) ^
28 ((info->flags & XT_DEVGROUP_INVERT_SRC) ? 1 : 0))) 28 ((info->flags & XT_DEVGROUP_INVERT_SRC) ? 1 : 0)))
29 return false; 29 return false;
30 30
31 if (info->flags & XT_DEVGROUP_MATCH_DST && 31 if (info->flags & XT_DEVGROUP_MATCH_DST &&
32 (((info->dst_group ^ par->out->group) & info->dst_mask ? 1 : 0) ^ 32 (((info->dst_group ^ xt_out(par)->group) & info->dst_mask ? 1 : 0) ^
33 ((info->flags & XT_DEVGROUP_INVERT_DST) ? 1 : 0))) 33 ((info->flags & XT_DEVGROUP_INVERT_DST) ? 1 : 0)))
34 return false; 34 return false;
35 35
diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c
index 64670fc5d0e1..236ac8008909 100644
--- a/net/netfilter/xt_dscp.c
+++ b/net/netfilter/xt_dscp.c
@@ -58,7 +58,7 @@ static bool tos_mt(const struct sk_buff *skb, struct xt_action_param *par)
58{ 58{
59 const struct xt_tos_match_info *info = par->matchinfo; 59 const struct xt_tos_match_info *info = par->matchinfo;
60 60
61 if (par->family == NFPROTO_IPV4) 61 if (xt_family(par) == NFPROTO_IPV4)
62 return ((ip_hdr(skb)->tos & info->tos_mask) == 62 return ((ip_hdr(skb)->tos & info->tos_mask) ==
63 info->tos_value) ^ !!info->invert; 63 info->tos_value) ^ !!info->invert;
64 else 64 else
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index b89b688e9d01..2a6dfe8b74d3 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -49,7 +49,7 @@ struct hashlimit_net {
49 struct proc_dir_entry *ip6t_hashlimit; 49 struct proc_dir_entry *ip6t_hashlimit;
50}; 50};
51 51
52static int hashlimit_net_id; 52static unsigned int hashlimit_net_id;
53static inline struct hashlimit_net *hashlimit_pernet(struct net *net) 53static inline struct hashlimit_net *hashlimit_pernet(struct net *net)
54{ 54{
55 return net_generic(net, hashlimit_net_id); 55 return net_generic(net, hashlimit_net_id);
@@ -463,23 +463,16 @@ static u32 xt_hashlimit_len_to_chunks(u32 len)
463/* Precision saver. */ 463/* Precision saver. */
464static u64 user2credits(u64 user, int revision) 464static u64 user2credits(u64 user, int revision)
465{ 465{
466 if (revision == 1) { 466 u64 scale = (revision == 1) ?
467 /* If multiplying would overflow... */ 467 XT_HASHLIMIT_SCALE : XT_HASHLIMIT_SCALE_v2;
468 if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1)) 468 u64 cpj = (revision == 1) ?
469 /* Divide first. */ 469 CREDITS_PER_JIFFY_v1 : CREDITS_PER_JIFFY;
470 return div64_u64(user, XT_HASHLIMIT_SCALE)
471 * HZ * CREDITS_PER_JIFFY_v1;
472
473 return div64_u64(user * HZ * CREDITS_PER_JIFFY_v1,
474 XT_HASHLIMIT_SCALE);
475 } else {
476 if (user > 0xFFFFFFFFFFFFFFFFULL / (HZ*CREDITS_PER_JIFFY))
477 return div64_u64(user, XT_HASHLIMIT_SCALE_v2)
478 * HZ * CREDITS_PER_JIFFY;
479 470
480 return div64_u64(user * HZ * CREDITS_PER_JIFFY, 471 /* Avoid overflow: divide the constant operands first */
481 XT_HASHLIMIT_SCALE_v2); 472 if (scale >= HZ * cpj)
482 } 473 return div64_u64(user, div64_u64(scale, HZ * cpj));
474
475 return user * div64_u64(HZ * cpj, scale);
483} 476}
484 477
485static u32 user2credits_byte(u32 user) 478static u32 user2credits_byte(u32 user)
@@ -838,6 +831,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
838 .family = NFPROTO_IPV4, 831 .family = NFPROTO_IPV4,
839 .match = hashlimit_mt_v1, 832 .match = hashlimit_mt_v1,
840 .matchsize = sizeof(struct xt_hashlimit_mtinfo1), 833 .matchsize = sizeof(struct xt_hashlimit_mtinfo1),
834 .usersize = offsetof(struct xt_hashlimit_mtinfo1, hinfo),
841 .checkentry = hashlimit_mt_check_v1, 835 .checkentry = hashlimit_mt_check_v1,
842 .destroy = hashlimit_mt_destroy_v1, 836 .destroy = hashlimit_mt_destroy_v1,
843 .me = THIS_MODULE, 837 .me = THIS_MODULE,
@@ -848,6 +842,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
848 .family = NFPROTO_IPV4, 842 .family = NFPROTO_IPV4,
849 .match = hashlimit_mt, 843 .match = hashlimit_mt,
850 .matchsize = sizeof(struct xt_hashlimit_mtinfo2), 844 .matchsize = sizeof(struct xt_hashlimit_mtinfo2),
845 .usersize = offsetof(struct xt_hashlimit_mtinfo2, hinfo),
851 .checkentry = hashlimit_mt_check, 846 .checkentry = hashlimit_mt_check,
852 .destroy = hashlimit_mt_destroy, 847 .destroy = hashlimit_mt_destroy,
853 .me = THIS_MODULE, 848 .me = THIS_MODULE,
@@ -859,6 +854,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
859 .family = NFPROTO_IPV6, 854 .family = NFPROTO_IPV6,
860 .match = hashlimit_mt_v1, 855 .match = hashlimit_mt_v1,
861 .matchsize = sizeof(struct xt_hashlimit_mtinfo1), 856 .matchsize = sizeof(struct xt_hashlimit_mtinfo1),
857 .usersize = offsetof(struct xt_hashlimit_mtinfo1, hinfo),
862 .checkentry = hashlimit_mt_check_v1, 858 .checkentry = hashlimit_mt_check_v1,
863 .destroy = hashlimit_mt_destroy_v1, 859 .destroy = hashlimit_mt_destroy_v1,
864 .me = THIS_MODULE, 860 .me = THIS_MODULE,
@@ -869,6 +865,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
869 .family = NFPROTO_IPV6, 865 .family = NFPROTO_IPV6,
870 .match = hashlimit_mt, 866 .match = hashlimit_mt,
871 .matchsize = sizeof(struct xt_hashlimit_mtinfo2), 867 .matchsize = sizeof(struct xt_hashlimit_mtinfo2),
868 .usersize = offsetof(struct xt_hashlimit_mtinfo2, hinfo),
872 .checkentry = hashlimit_mt_check, 869 .checkentry = hashlimit_mt_check,
873 .destroy = hashlimit_mt_destroy, 870 .destroy = hashlimit_mt_destroy,
874 .me = THIS_MODULE, 871 .me = THIS_MODULE,
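
The user2credits() rewrite above folds the two overflow-guarded branches into one: pick the scale and credits-per-jiffy constants for the revision, then divide whichever constant is larger first so the multiplication can no longer overflow 64 bits. A standalone sketch of the same arithmetic follows, with made-up constant values (HZ, CPJ and SCALE here are illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

#define HZ    250ULL        /* assumed tick rate, for illustration */
#define CPJ   64ULL         /* assumed credits per jiffy */
#define SCALE 1000000ULL    /* assumed userspace scale factor */

static uint64_t user2credits(uint64_t user)
{
	/* user * HZ * CPJ can overflow for large values, so when SCALE
	 * dominates, divide the constants first and only then scale user. */
	if (SCALE >= HZ * CPJ)
		return user / (SCALE / (HZ * CPJ));
	return user * ((HZ * CPJ) / SCALE);
}

int main(void)
{
	printf("%llu credits\n", (unsigned long long)user2credits(500000ULL));
	return 0;
}
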
diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
index f679dd4c272a..38a78151c0e9 100644
--- a/net/netfilter/xt_helper.c
+++ b/net/netfilter/xt_helper.c
@@ -59,7 +59,7 @@ static int helper_mt_check(const struct xt_mtchk_param *par)
59 struct xt_helper_info *info = par->matchinfo; 59 struct xt_helper_info *info = par->matchinfo;
60 int ret; 60 int ret;
61 61
62 ret = nf_ct_l3proto_try_module_get(par->family); 62 ret = nf_ct_netns_get(par->net, par->family);
63 if (ret < 0) { 63 if (ret < 0) {
64 pr_info("cannot load conntrack support for proto=%u\n", 64 pr_info("cannot load conntrack support for proto=%u\n",
65 par->family); 65 par->family);
@@ -71,7 +71,7 @@ static int helper_mt_check(const struct xt_mtchk_param *par)
71 71
72static void helper_mt_destroy(const struct xt_mtdtor_param *par) 72static void helper_mt_destroy(const struct xt_mtdtor_param *par)
73{ 73{
74 nf_ct_l3proto_module_put(par->family); 74 nf_ct_netns_put(par->net, par->family);
75} 75}
76 76
77static struct xt_match helper_mt_reg __read_mostly = { 77static struct xt_match helper_mt_reg __read_mostly = {
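
The pattern in this hunk (repeated in xt_nat.c, xt_state.c and others below) replaces the old global l3proto module get/put with a per-namespace conntrack dependency: checkentry takes a reference for the rule's own netns and family, and the destructor releases it. The shape of that pairing, sketched for a hypothetical match (a schematic fragment, not a complete module):

/* Hypothetical match: acquire the per-netns conntrack dependency when a
 * rule is installed, release it when the rule is destroyed. */
static int example_mt_check(const struct xt_mtchk_param *par)
{
	int ret = nf_ct_netns_get(par->net, par->family);

	if (ret < 0)
		pr_info("cannot load conntrack support for proto=%u\n",
			par->family);
	return ret;
}

static void example_mt_destroy(const struct xt_mtdtor_param *par)
{
	nf_ct_netns_put(par->net, par->family);
}
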
diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c
index 71a9d95e0a81..0fdc89064488 100644
--- a/net/netfilter/xt_ipvs.c
+++ b/net/netfilter/xt_ipvs.c
@@ -48,9 +48,9 @@ static bool
48ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par) 48ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)
49{ 49{
50 const struct xt_ipvs_mtinfo *data = par->matchinfo; 50 const struct xt_ipvs_mtinfo *data = par->matchinfo;
51 struct netns_ipvs *ipvs = net_ipvs(par->net); 51 struct netns_ipvs *ipvs = net_ipvs(xt_net(par));
52 /* ipvs_mt_check ensures that family is only NFPROTO_IPV[46]. */ 52 /* ipvs_mt_check ensures that family is only NFPROTO_IPV[46]. */
53 const u_int8_t family = par->family; 53 const u_int8_t family = xt_family(par);
54 struct ip_vs_iphdr iph; 54 struct ip_vs_iphdr iph;
55 struct ip_vs_protocol *pp; 55 struct ip_vs_protocol *pp;
56 struct ip_vs_conn *cp; 56 struct ip_vs_conn *cp;
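
Many of the conversions in this series swap direct par->net / par->family / par->in reads for xt_net(), xt_family(), xt_in() and friends. Assuming these accessors simply dereference a hook-state pointer carried inside xt_action_param, which is what the conversion suggests, they look roughly like the sketch below; the real definitions live in the x_tables header.

/* Rough shape of the accessor helpers assumed by these conversions. */
static inline struct net *xt_net(const struct xt_action_param *par)
{
	return par->state->net;
}

static inline u_int8_t xt_family(const struct xt_action_param *par)
{
	return par->state->pf;
}

static inline const struct net_device *xt_in(const struct xt_action_param *par)
{
	return par->state->in;
}
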
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index bef850596558..dab962df1787 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -192,6 +192,8 @@ static struct xt_match limit_mt_reg __read_mostly = {
192 .compatsize = sizeof(struct compat_xt_rateinfo), 192 .compatsize = sizeof(struct compat_xt_rateinfo),
193 .compat_from_user = limit_mt_compat_from_user, 193 .compat_from_user = limit_mt_compat_from_user,
194 .compat_to_user = limit_mt_compat_to_user, 194 .compat_to_user = limit_mt_compat_to_user,
195#else
196 .usersize = offsetof(struct xt_rateinfo, prev),
195#endif 197#endif
196 .me = THIS_MODULE, 198 .me = THIS_MODULE,
197}; 199};
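
The new .usersize field appearing here and in several later hunks (hashlimit, quota, rateest, string) tells x_tables how many leading bytes of the match info userspace actually owns; everything past that offset, typically a kernel-private pointer or counter appended to the user-visible struct, is excluded when rules are copied back out or compared. A schematic of the idiom with an invented struct (fields and registration trimmed to the relevant parts):

/* Invented example: the tail pointer is kernel-internal state. */
struct example_mtinfo {
	__u32 flags;                 /* set by userspace */
	__u32 limit;                 /* set by userspace */
	struct example_priv *priv __attribute__((aligned(8)));  /* kernel only */
};

static struct xt_match example_mt_reg __read_mostly = {
	.name      = "example",
	.matchsize = sizeof(struct example_mtinfo),
	/* only the bytes before 'priv' are meaningful to userspace */
	.usersize  = offsetof(struct example_mtinfo, priv),
	.me        = THIS_MODULE,
};
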
diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c
index ac1d3c3d09e7..1cde0e4985b7 100644
--- a/net/netfilter/xt_multiport.c
+++ b/net/netfilter/xt_multiport.c
@@ -42,29 +42,43 @@ ports_match_v1(const struct xt_multiport_v1 *minfo,
42 e = minfo->ports[++i]; 42 e = minfo->ports[++i];
43 pr_debug("src or dst matches with %d-%d?\n", s, e); 43 pr_debug("src or dst matches with %d-%d?\n", s, e);
44 44
45 if (minfo->flags == XT_MULTIPORT_SOURCE 45 switch (minfo->flags) {
46 && src >= s && src <= e) 46 case XT_MULTIPORT_SOURCE:
47 return true ^ minfo->invert; 47 if (src >= s && src <= e)
48 if (minfo->flags == XT_MULTIPORT_DESTINATION 48 return true ^ minfo->invert;
49 && dst >= s && dst <= e) 49 break;
50 return true ^ minfo->invert; 50 case XT_MULTIPORT_DESTINATION:
51 if (minfo->flags == XT_MULTIPORT_EITHER 51 if (dst >= s && dst <= e)
52 && ((dst >= s && dst <= e) 52 return true ^ minfo->invert;
53 || (src >= s && src <= e))) 53 break;
54 return true ^ minfo->invert; 54 case XT_MULTIPORT_EITHER:
55 if ((dst >= s && dst <= e) ||
56 (src >= s && src <= e))
57 return true ^ minfo->invert;
58 break;
59 default:
60 break;
61 }
55 } else { 62 } else {
56 /* exact port matching */ 63 /* exact port matching */
57 pr_debug("src or dst matches with %d?\n", s); 64 pr_debug("src or dst matches with %d?\n", s);
58 65
59 if (minfo->flags == XT_MULTIPORT_SOURCE 66 switch (minfo->flags) {
60 && src == s) 67 case XT_MULTIPORT_SOURCE:
61 return true ^ minfo->invert; 68 if (src == s)
62 if (minfo->flags == XT_MULTIPORT_DESTINATION 69 return true ^ minfo->invert;
63 && dst == s) 70 break;
64 return true ^ minfo->invert; 71 case XT_MULTIPORT_DESTINATION:
65 if (minfo->flags == XT_MULTIPORT_EITHER 72 if (dst == s)
66 && (src == s || dst == s)) 73 return true ^ minfo->invert;
67 return true ^ minfo->invert; 74 break;
75 case XT_MULTIPORT_EITHER:
76 if (src == s || dst == s)
77 return true ^ minfo->invert;
78 break;
79 default:
80 break;
81 }
68 } 82 }
69 } 83 }
70 84
diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c
index bea7464cc43f..8107b3eb865f 100644
--- a/net/netfilter/xt_nat.c
+++ b/net/netfilter/xt_nat.c
@@ -23,7 +23,17 @@ static int xt_nat_checkentry_v0(const struct xt_tgchk_param *par)
23 par->target->name); 23 par->target->name);
24 return -EINVAL; 24 return -EINVAL;
25 } 25 }
26 return 0; 26 return nf_ct_netns_get(par->net, par->family);
27}
28
29static int xt_nat_checkentry(const struct xt_tgchk_param *par)
30{
31 return nf_ct_netns_get(par->net, par->family);
32}
33
34static void xt_nat_destroy(const struct xt_tgdtor_param *par)
35{
36 nf_ct_netns_put(par->net, par->family);
27} 37}
28 38
29static void xt_nat_convert_range(struct nf_nat_range *dst, 39static void xt_nat_convert_range(struct nf_nat_range *dst,
@@ -106,6 +116,7 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
106 .name = "SNAT", 116 .name = "SNAT",
107 .revision = 0, 117 .revision = 0,
108 .checkentry = xt_nat_checkentry_v0, 118 .checkentry = xt_nat_checkentry_v0,
119 .destroy = xt_nat_destroy,
109 .target = xt_snat_target_v0, 120 .target = xt_snat_target_v0,
110 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), 121 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
111 .family = NFPROTO_IPV4, 122 .family = NFPROTO_IPV4,
@@ -118,6 +129,7 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
118 .name = "DNAT", 129 .name = "DNAT",
119 .revision = 0, 130 .revision = 0,
120 .checkentry = xt_nat_checkentry_v0, 131 .checkentry = xt_nat_checkentry_v0,
132 .destroy = xt_nat_destroy,
121 .target = xt_dnat_target_v0, 133 .target = xt_dnat_target_v0,
122 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), 134 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
123 .family = NFPROTO_IPV4, 135 .family = NFPROTO_IPV4,
@@ -129,6 +141,8 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
129 { 141 {
130 .name = "SNAT", 142 .name = "SNAT",
131 .revision = 1, 143 .revision = 1,
144 .checkentry = xt_nat_checkentry,
145 .destroy = xt_nat_destroy,
132 .target = xt_snat_target_v1, 146 .target = xt_snat_target_v1,
133 .targetsize = sizeof(struct nf_nat_range), 147 .targetsize = sizeof(struct nf_nat_range),
134 .table = "nat", 148 .table = "nat",
@@ -139,6 +153,8 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
139 { 153 {
140 .name = "DNAT", 154 .name = "DNAT",
141 .revision = 1, 155 .revision = 1,
156 .checkentry = xt_nat_checkentry,
157 .destroy = xt_nat_destroy,
142 .target = xt_dnat_target_v1, 158 .target = xt_dnat_target_v1,
143 .targetsize = sizeof(struct nf_nat_range), 159 .targetsize = sizeof(struct nf_nat_range),
144 .table = "nat", 160 .table = "nat",
diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c
index cf327593852a..cc0518fe598e 100644
--- a/net/netfilter/xt_nfacct.c
+++ b/net/netfilter/xt_nfacct.c
@@ -26,7 +26,7 @@ static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par)
26 26
27 nfnl_acct_update(skb, info->nfacct); 27 nfnl_acct_update(skb, info->nfacct);
28 28
29 overquota = nfnl_acct_overquota(par->net, skb, info->nfacct); 29 overquota = nfnl_acct_overquota(xt_net(par), skb, info->nfacct);
30 30
31 return overquota == NFACCT_UNDERQUOTA ? false : true; 31 return overquota == NFACCT_UNDERQUOTA ? false : true;
32} 32}
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index 2455b69b5810..c05fefcec238 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -201,7 +201,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
201 unsigned char opts[MAX_IPOPTLEN]; 201 unsigned char opts[MAX_IPOPTLEN];
202 const struct xt_osf_finger *kf; 202 const struct xt_osf_finger *kf;
203 const struct xt_osf_user_finger *f; 203 const struct xt_osf_user_finger *f;
204 struct net *net = p->net; 204 struct net *net = xt_net(p);
205 205
206 if (!info) 206 if (!info)
207 return false; 207 return false;
@@ -326,8 +326,8 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
326 fcount++; 326 fcount++;
327 327
328 if (info->flags & XT_OSF_LOG) 328 if (info->flags & XT_OSF_LOG)
329 nf_log_packet(net, p->family, p->hooknum, skb, 329 nf_log_packet(net, xt_family(p), xt_hooknum(p), skb,
330 p->in, p->out, NULL, 330 xt_in(p), xt_out(p), NULL,
331 "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n", 331 "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
332 f->genre, f->version, f->subtype, 332 f->genre, f->version, f->subtype,
333 &ip->saddr, ntohs(tcp->source), 333 &ip->saddr, ntohs(tcp->source),
@@ -341,8 +341,8 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
341 rcu_read_unlock(); 341 rcu_read_unlock();
342 342
343 if (!fcount && (info->flags & XT_OSF_LOG)) 343 if (!fcount && (info->flags & XT_OSF_LOG))
344 nf_log_packet(net, p->family, p->hooknum, skb, p->in, 344 nf_log_packet(net, xt_family(p), xt_hooknum(p), skb, xt_in(p),
345 p->out, NULL, 345 xt_out(p), NULL,
346 "Remote OS is not known: %pI4:%u -> %pI4:%u\n", 346 "Remote OS is not known: %pI4:%u -> %pI4:%u\n",
347 &ip->saddr, ntohs(tcp->source), 347 &ip->saddr, ntohs(tcp->source),
348 &ip->daddr, ntohs(tcp->dest)); 348 &ip->daddr, ntohs(tcp->dest));
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index a20e731b5b6c..3d705c688a27 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -13,6 +13,8 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/skbuff.h> 14#include <linux/skbuff.h>
15#include <linux/file.h> 15#include <linux/file.h>
16#include <linux/cred.h>
17
16#include <net/sock.h> 18#include <net/sock.h>
17#include <net/inet_sock.h> 19#include <net/inet_sock.h>
18#include <linux/netfilter/x_tables.h> 20#include <linux/netfilter/x_tables.h>
@@ -63,7 +65,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
63 const struct xt_owner_match_info *info = par->matchinfo; 65 const struct xt_owner_match_info *info = par->matchinfo;
64 const struct file *filp; 66 const struct file *filp;
65 struct sock *sk = skb_to_full_sk(skb); 67 struct sock *sk = skb_to_full_sk(skb);
66 struct net *net = par->net; 68 struct net *net = xt_net(par);
67 69
68 if (sk == NULL || sk->sk_socket == NULL) 70 if (sk == NULL || sk->sk_socket == NULL)
69 return (info->match ^ info->invert) == 0; 71 return (info->match ^ info->invert) == 0;
diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c
index 5b645cb598fc..1ef99151b3ba 100644
--- a/net/netfilter/xt_pkttype.c
+++ b/net/netfilter/xt_pkttype.c
@@ -30,11 +30,10 @@ pkttype_mt(const struct sk_buff *skb, struct xt_action_param *par)
30 30
31 if (skb->pkt_type != PACKET_LOOPBACK) 31 if (skb->pkt_type != PACKET_LOOPBACK)
32 type = skb->pkt_type; 32 type = skb->pkt_type;
33 else if (par->family == NFPROTO_IPV4 && 33 else if (xt_family(par) == NFPROTO_IPV4 &&
34 ipv4_is_multicast(ip_hdr(skb)->daddr)) 34 ipv4_is_multicast(ip_hdr(skb)->daddr))
35 type = PACKET_MULTICAST; 35 type = PACKET_MULTICAST;
36 else if (par->family == NFPROTO_IPV6 && 36 else if (xt_family(par) == NFPROTO_IPV6)
37 ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF)
38 type = PACKET_MULTICAST; 37 type = PACKET_MULTICAST;
39 else 38 else
40 type = PACKET_BROADCAST; 39 type = PACKET_BROADCAST;
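
The dropped s6_addr[0] == 0xFF test was the open-coded form of "is this an IPv6 multicast address"; the simplification presumably relies on IPv6 having no broadcast, so a looped-back IPv6 packet that reaches this branch can only have been multicast. For reference, the ff00::/8 check the old code spelled out, as a standalone helper:

#include <stdbool.h>
#include <stdint.h>

/* ff00::/8 covers the entire IPv6 multicast range; IPv6 has no broadcast. */
static bool ipv6_addr_bytes_are_multicast(const uint8_t addr[16])
{
	return addr[0] == 0xff;
}
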
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
index f23e97bb42d7..2b4ab189bba7 100644
--- a/net/netfilter/xt_policy.c
+++ b/net/netfilter/xt_policy.c
@@ -116,9 +116,9 @@ policy_mt(const struct sk_buff *skb, struct xt_action_param *par)
116 int ret; 116 int ret;
117 117
118 if (info->flags & XT_POLICY_MATCH_IN) 118 if (info->flags & XT_POLICY_MATCH_IN)
119 ret = match_policy_in(skb, info, par->family); 119 ret = match_policy_in(skb, info, xt_family(par));
120 else 120 else
121 ret = match_policy_out(skb, info, par->family); 121 ret = match_policy_out(skb, info, xt_family(par));
122 122
123 if (ret < 0) 123 if (ret < 0)
124 ret = info->flags & XT_POLICY_MATCH_NONE ? true : false; 124 ret = info->flags & XT_POLICY_MATCH_NONE ? true : false;
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
index 44c8eb4c9d66..10d61a6eed71 100644
--- a/net/netfilter/xt_quota.c
+++ b/net/netfilter/xt_quota.c
@@ -73,6 +73,7 @@ static struct xt_match quota_mt_reg __read_mostly = {
73 .checkentry = quota_mt_check, 73 .checkentry = quota_mt_check,
74 .destroy = quota_mt_destroy, 74 .destroy = quota_mt_destroy,
75 .matchsize = sizeof(struct xt_quota_info), 75 .matchsize = sizeof(struct xt_quota_info),
76 .usersize = offsetof(struct xt_quota_info, master),
76 .me = THIS_MODULE, 77 .me = THIS_MODULE,
77}; 78};
78 79
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
index 7720b036d76a..755d2f6693a2 100644
--- a/net/netfilter/xt_rateest.c
+++ b/net/netfilter/xt_rateest.c
@@ -18,35 +18,33 @@ static bool
18xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par) 18xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par)
19{ 19{
20 const struct xt_rateest_match_info *info = par->matchinfo; 20 const struct xt_rateest_match_info *info = par->matchinfo;
21 struct gnet_stats_rate_est64 *r; 21 struct gnet_stats_rate_est64 sample = {0};
22 u_int32_t bps1, bps2, pps1, pps2; 22 u_int32_t bps1, bps2, pps1, pps2;
23 bool ret = true; 23 bool ret = true;
24 24
25 spin_lock_bh(&info->est1->lock); 25 gen_estimator_read(&info->est1->rate_est, &sample);
26 r = &info->est1->rstats; 26
27 if (info->flags & XT_RATEEST_MATCH_DELTA) { 27 if (info->flags & XT_RATEEST_MATCH_DELTA) {
28 bps1 = info->bps1 >= r->bps ? info->bps1 - r->bps : 0; 28 bps1 = info->bps1 >= sample.bps ? info->bps1 - sample.bps : 0;
29 pps1 = info->pps1 >= r->pps ? info->pps1 - r->pps : 0; 29 pps1 = info->pps1 >= sample.pps ? info->pps1 - sample.pps : 0;
30 } else { 30 } else {
31 bps1 = r->bps; 31 bps1 = sample.bps;
32 pps1 = r->pps; 32 pps1 = sample.pps;
33 } 33 }
34 spin_unlock_bh(&info->est1->lock);
35 34
36 if (info->flags & XT_RATEEST_MATCH_ABS) { 35 if (info->flags & XT_RATEEST_MATCH_ABS) {
37 bps2 = info->bps2; 36 bps2 = info->bps2;
38 pps2 = info->pps2; 37 pps2 = info->pps2;
39 } else { 38 } else {
40 spin_lock_bh(&info->est2->lock); 39 gen_estimator_read(&info->est2->rate_est, &sample);
41 r = &info->est2->rstats; 40
42 if (info->flags & XT_RATEEST_MATCH_DELTA) { 41 if (info->flags & XT_RATEEST_MATCH_DELTA) {
43 bps2 = info->bps2 >= r->bps ? info->bps2 - r->bps : 0; 42 bps2 = info->bps2 >= sample.bps ? info->bps2 - sample.bps : 0;
44 pps2 = info->pps2 >= r->pps ? info->pps2 - r->pps : 0; 43 pps2 = info->pps2 >= sample.pps ? info->pps2 - sample.pps : 0;
45 } else { 44 } else {
46 bps2 = r->bps; 45 bps2 = sample.bps;
47 pps2 = r->pps; 46 pps2 = sample.pps;
48 } 47 }
49 spin_unlock_bh(&info->est2->lock);
50 } 48 }
51 49
52 switch (info->mode) { 50 switch (info->mode) {
@@ -135,6 +133,7 @@ static struct xt_match xt_rateest_mt_reg __read_mostly = {
135 .checkentry = xt_rateest_mt_checkentry, 133 .checkentry = xt_rateest_mt_checkentry,
136 .destroy = xt_rateest_mt_destroy, 134 .destroy = xt_rateest_mt_destroy,
137 .matchsize = sizeof(struct xt_rateest_match_info), 135 .matchsize = sizeof(struct xt_rateest_match_info),
136 .usersize = offsetof(struct xt_rateest_match_info, est1),
138 .me = THIS_MODULE, 137 .me = THIS_MODULE,
139}; 138};
140 139
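
Instead of taking each estimator's lock and reading its internal rstats, the match now has gen_estimator_read() fill a gnet_stats_rate_est64 snapshot on the stack and does all comparisons on that local copy, so no lock is held across the arithmetic. The delta mode is a saturating subtraction; a small standalone sketch of that part of the logic (names invented):

#include <stdbool.h>
#include <stdint.h>

/* Compare against an absolute rate, or against how far the measured rate
 * falls short of the configured one, never wrapping below zero. */
static uint64_t effective_rate(uint64_t configured, uint64_t measured, bool delta)
{
	if (!delta)
		return measured;
	return configured >= measured ? configured - measured : 0;
}
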
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index e3b7a09b103e..1d89a4eaf841 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -95,7 +95,7 @@ struct recent_net {
95#endif 95#endif
96}; 96};
97 97
98static int recent_net_id __read_mostly; 98static unsigned int recent_net_id __read_mostly;
99 99
100static inline struct recent_net *recent_pernet(struct net *net) 100static inline struct recent_net *recent_pernet(struct net *net)
101{ 101{
@@ -236,7 +236,7 @@ static void recent_table_flush(struct recent_table *t)
236static bool 236static bool
237recent_mt(const struct sk_buff *skb, struct xt_action_param *par) 237recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
238{ 238{
239 struct net *net = par->net; 239 struct net *net = xt_net(par);
240 struct recent_net *recent_net = recent_pernet(net); 240 struct recent_net *recent_net = recent_pernet(net);
241 const struct xt_recent_mtinfo_v1 *info = par->matchinfo; 241 const struct xt_recent_mtinfo_v1 *info = par->matchinfo;
242 struct recent_table *t; 242 struct recent_table *t;
@@ -245,7 +245,7 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
245 u_int8_t ttl; 245 u_int8_t ttl;
246 bool ret = info->invert; 246 bool ret = info->invert;
247 247
248 if (par->family == NFPROTO_IPV4) { 248 if (xt_family(par) == NFPROTO_IPV4) {
249 const struct iphdr *iph = ip_hdr(skb); 249 const struct iphdr *iph = ip_hdr(skb);
250 250
251 if (info->side == XT_RECENT_DEST) 251 if (info->side == XT_RECENT_DEST)
@@ -266,7 +266,7 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
266 } 266 }
267 267
268 /* use TTL as seen before forwarding */ 268 /* use TTL as seen before forwarding */
269 if (par->out != NULL && skb->sk == NULL) 269 if (xt_out(par) != NULL && skb->sk == NULL)
270 ttl++; 270 ttl++;
271 271
272 spin_lock_bh(&recent_lock); 272 spin_lock_bh(&recent_lock);
@@ -274,12 +274,12 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
274 274
275 nf_inet_addr_mask(&addr, &addr_mask, &t->mask); 275 nf_inet_addr_mask(&addr, &addr_mask, &t->mask);
276 276
277 e = recent_entry_lookup(t, &addr_mask, par->family, 277 e = recent_entry_lookup(t, &addr_mask, xt_family(par),
278 (info->check_set & XT_RECENT_TTL) ? ttl : 0); 278 (info->check_set & XT_RECENT_TTL) ? ttl : 0);
279 if (e == NULL) { 279 if (e == NULL) {
280 if (!(info->check_set & XT_RECENT_SET)) 280 if (!(info->check_set & XT_RECENT_SET))
281 goto out; 281 goto out;
282 e = recent_entry_init(t, &addr_mask, par->family, ttl); 282 e = recent_entry_init(t, &addr_mask, xt_family(par), ttl);
283 if (e == NULL) 283 if (e == NULL)
284 par->hotdrop = true; 284 par->hotdrop = true;
285 ret = !ret; 285 ret = !ret;
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index 5669e5b453f4..64285702afd5 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -55,7 +55,7 @@ set_match_v0(const struct sk_buff *skb, struct xt_action_param *par)
55{ 55{
56 const struct xt_set_info_match_v0 *info = par->matchinfo; 56 const struct xt_set_info_match_v0 *info = par->matchinfo;
57 57
58 ADT_OPT(opt, par->family, info->match_set.u.compat.dim, 58 ADT_OPT(opt, xt_family(par), info->match_set.u.compat.dim,
59 info->match_set.u.compat.flags, 0, UINT_MAX); 59 info->match_set.u.compat.flags, 0, UINT_MAX);
60 60
61 return match_set(info->match_set.index, skb, par, &opt, 61 return match_set(info->match_set.index, skb, par, &opt,
@@ -118,7 +118,7 @@ set_match_v1(const struct sk_buff *skb, struct xt_action_param *par)
118{ 118{
119 const struct xt_set_info_match_v1 *info = par->matchinfo; 119 const struct xt_set_info_match_v1 *info = par->matchinfo;
120 120
121 ADT_OPT(opt, par->family, info->match_set.dim, 121 ADT_OPT(opt, xt_family(par), info->match_set.dim,
122 info->match_set.flags, 0, UINT_MAX); 122 info->match_set.flags, 0, UINT_MAX);
123 123
124 if (opt.flags & IPSET_RETURN_NOMATCH) 124 if (opt.flags & IPSET_RETURN_NOMATCH)
@@ -184,7 +184,7 @@ set_match_v3(const struct sk_buff *skb, struct xt_action_param *par)
184 const struct xt_set_info_match_v3 *info = par->matchinfo; 184 const struct xt_set_info_match_v3 *info = par->matchinfo;
185 int ret; 185 int ret;
186 186
187 ADT_OPT(opt, par->family, info->match_set.dim, 187 ADT_OPT(opt, xt_family(par), info->match_set.dim,
188 info->match_set.flags, info->flags, UINT_MAX); 188 info->match_set.flags, info->flags, UINT_MAX);
189 189
190 if (info->packets.op != IPSET_COUNTER_NONE || 190 if (info->packets.op != IPSET_COUNTER_NONE ||
@@ -231,7 +231,7 @@ set_match_v4(const struct sk_buff *skb, struct xt_action_param *par)
231 const struct xt_set_info_match_v4 *info = par->matchinfo; 231 const struct xt_set_info_match_v4 *info = par->matchinfo;
232 int ret; 232 int ret;
233 233
234 ADT_OPT(opt, par->family, info->match_set.dim, 234 ADT_OPT(opt, xt_family(par), info->match_set.dim,
235 info->match_set.flags, info->flags, UINT_MAX); 235 info->match_set.flags, info->flags, UINT_MAX);
236 236
237 if (info->packets.op != IPSET_COUNTER_NONE || 237 if (info->packets.op != IPSET_COUNTER_NONE ||
@@ -259,9 +259,9 @@ set_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
259{ 259{
260 const struct xt_set_info_target_v0 *info = par->targinfo; 260 const struct xt_set_info_target_v0 *info = par->targinfo;
261 261
262 ADT_OPT(add_opt, par->family, info->add_set.u.compat.dim, 262 ADT_OPT(add_opt, xt_family(par), info->add_set.u.compat.dim,
263 info->add_set.u.compat.flags, 0, UINT_MAX); 263 info->add_set.u.compat.flags, 0, UINT_MAX);
264 ADT_OPT(del_opt, par->family, info->del_set.u.compat.dim, 264 ADT_OPT(del_opt, xt_family(par), info->del_set.u.compat.dim,
265 info->del_set.u.compat.flags, 0, UINT_MAX); 265 info->del_set.u.compat.flags, 0, UINT_MAX);
266 266
267 if (info->add_set.index != IPSET_INVALID_ID) 267 if (info->add_set.index != IPSET_INVALID_ID)
@@ -332,9 +332,9 @@ set_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
332{ 332{
333 const struct xt_set_info_target_v1 *info = par->targinfo; 333 const struct xt_set_info_target_v1 *info = par->targinfo;
334 334
335 ADT_OPT(add_opt, par->family, info->add_set.dim, 335 ADT_OPT(add_opt, xt_family(par), info->add_set.dim,
336 info->add_set.flags, 0, UINT_MAX); 336 info->add_set.flags, 0, UINT_MAX);
337 ADT_OPT(del_opt, par->family, info->del_set.dim, 337 ADT_OPT(del_opt, xt_family(par), info->del_set.dim,
338 info->del_set.flags, 0, UINT_MAX); 338 info->del_set.flags, 0, UINT_MAX);
339 339
340 if (info->add_set.index != IPSET_INVALID_ID) 340 if (info->add_set.index != IPSET_INVALID_ID)
@@ -401,9 +401,9 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
401{ 401{
402 const struct xt_set_info_target_v2 *info = par->targinfo; 402 const struct xt_set_info_target_v2 *info = par->targinfo;
403 403
404 ADT_OPT(add_opt, par->family, info->add_set.dim, 404 ADT_OPT(add_opt, xt_family(par), info->add_set.dim,
405 info->add_set.flags, info->flags, info->timeout); 405 info->add_set.flags, info->flags, info->timeout);
406 ADT_OPT(del_opt, par->family, info->del_set.dim, 406 ADT_OPT(del_opt, xt_family(par), info->del_set.dim,
407 info->del_set.flags, 0, UINT_MAX); 407 info->del_set.flags, 0, UINT_MAX);
408 408
409 /* Normalize to fit into jiffies */ 409 /* Normalize to fit into jiffies */
@@ -423,17 +423,19 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
423 423
424/* Revision 3 target */ 424/* Revision 3 target */
425 425
426#define MOPT(opt, member) ((opt).ext.skbinfo.member)
427
426static unsigned int 428static unsigned int
427set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) 429set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
428{ 430{
429 const struct xt_set_info_target_v3 *info = par->targinfo; 431 const struct xt_set_info_target_v3 *info = par->targinfo;
430 int ret; 432 int ret;
431 433
432 ADT_OPT(add_opt, par->family, info->add_set.dim, 434 ADT_OPT(add_opt, xt_family(par), info->add_set.dim,
433 info->add_set.flags, info->flags, info->timeout); 435 info->add_set.flags, info->flags, info->timeout);
434 ADT_OPT(del_opt, par->family, info->del_set.dim, 436 ADT_OPT(del_opt, xt_family(par), info->del_set.dim,
435 info->del_set.flags, 0, UINT_MAX); 437 info->del_set.flags, 0, UINT_MAX);
436 ADT_OPT(map_opt, par->family, info->map_set.dim, 438 ADT_OPT(map_opt, xt_family(par), info->map_set.dim,
437 info->map_set.flags, 0, UINT_MAX); 439 info->map_set.flags, 0, UINT_MAX);
438 440
439 /* Normalize to fit into jiffies */ 441 /* Normalize to fit into jiffies */
@@ -453,14 +455,14 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
453 if (!ret) 455 if (!ret)
454 return XT_CONTINUE; 456 return XT_CONTINUE;
455 if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBMARK) 457 if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBMARK)
456 skb->mark = (skb->mark & ~(map_opt.ext.skbmarkmask)) 458 skb->mark = (skb->mark & ~MOPT(map_opt,skbmarkmask))
457 ^ (map_opt.ext.skbmark); 459 ^ MOPT(map_opt, skbmark);
458 if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBPRIO) 460 if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBPRIO)
459 skb->priority = map_opt.ext.skbprio; 461 skb->priority = MOPT(map_opt, skbprio);
460 if ((map_opt.cmdflags & IPSET_FLAG_MAP_SKBQUEUE) && 462 if ((map_opt.cmdflags & IPSET_FLAG_MAP_SKBQUEUE) &&
461 skb->dev && 463 skb->dev &&
462 skb->dev->real_num_tx_queues > map_opt.ext.skbqueue) 464 skb->dev->real_num_tx_queues > MOPT(map_opt, skbqueue))
463 skb_set_queue_mapping(skb, map_opt.ext.skbqueue); 465 skb_set_queue_mapping(skb, MOPT(map_opt, skbqueue));
464 } 466 }
465 return XT_CONTINUE; 467 return XT_CONTINUE;
466} 468}
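
The MOPT() macro introduced here is shorthand for the repeated member chain, and its expansion also indicates that the skbmark/skbprio/skbqueue values now live under an ext.skbinfo sub-structure of the extension rather than directly in ext:

/* From the hunk above: */
#define MOPT(opt, member) ((opt).ext.skbinfo.member)
/* so MOPT(map_opt, skbprio) expands to map_opt.ext.skbinfo.skbprio */
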
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index b10ade272b50..770bbec878f1 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -22,76 +22,14 @@
22#include <net/netfilter/ipv4/nf_defrag_ipv4.h> 22#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
23 23
24#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) 24#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
25#define XT_SOCKET_HAVE_IPV6 1
26#include <linux/netfilter_ipv6/ip6_tables.h> 25#include <linux/netfilter_ipv6/ip6_tables.h>
27#include <net/inet6_hashtables.h> 26#include <net/inet6_hashtables.h>
28#include <net/netfilter/ipv6/nf_defrag_ipv6.h> 27#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
29#endif 28#endif
30 29
30#include <net/netfilter/nf_socket.h>
31#include <linux/netfilter/xt_socket.h> 31#include <linux/netfilter/xt_socket.h>
32 32
33#if IS_ENABLED(CONFIG_NF_CONNTRACK)
34#define XT_SOCKET_HAVE_CONNTRACK 1
35#include <net/netfilter/nf_conntrack.h>
36#endif
37
38static int
39extract_icmp4_fields(const struct sk_buff *skb,
40 u8 *protocol,
41 __be32 *raddr,
42 __be32 *laddr,
43 __be16 *rport,
44 __be16 *lport)
45{
46 unsigned int outside_hdrlen = ip_hdrlen(skb);
47 struct iphdr *inside_iph, _inside_iph;
48 struct icmphdr *icmph, _icmph;
49 __be16 *ports, _ports[2];
50
51 icmph = skb_header_pointer(skb, outside_hdrlen,
52 sizeof(_icmph), &_icmph);
53 if (icmph == NULL)
54 return 1;
55
56 switch (icmph->type) {
57 case ICMP_DEST_UNREACH:
58 case ICMP_SOURCE_QUENCH:
59 case ICMP_REDIRECT:
60 case ICMP_TIME_EXCEEDED:
61 case ICMP_PARAMETERPROB:
62 break;
63 default:
64 return 1;
65 }
66
67 inside_iph = skb_header_pointer(skb, outside_hdrlen +
68 sizeof(struct icmphdr),
69 sizeof(_inside_iph), &_inside_iph);
70 if (inside_iph == NULL)
71 return 1;
72
73 if (inside_iph->protocol != IPPROTO_TCP &&
74 inside_iph->protocol != IPPROTO_UDP)
75 return 1;
76
77 ports = skb_header_pointer(skb, outside_hdrlen +
78 sizeof(struct icmphdr) +
79 (inside_iph->ihl << 2),
80 sizeof(_ports), &_ports);
81 if (ports == NULL)
82 return 1;
83
84 /* the inside IP packet is the one quoted from our side, thus
85 * its saddr is the local address */
86 *protocol = inside_iph->protocol;
87 *laddr = inside_iph->saddr;
88 *lport = ports[0];
89 *raddr = inside_iph->daddr;
90 *rport = ports[1];
91
92 return 0;
93}
94
95/* "socket" match based redirection (no specific rule) 33/* "socket" match based redirection (no specific rule)
96 * =================================================== 34 * ===================================================
97 * 35 *
@@ -111,104 +49,6 @@ extract_icmp4_fields(const struct sk_buff *skb,
111 * then local services could intercept traffic going through the 49 * then local services could intercept traffic going through the
112 * box. 50 * box.
113 */ 51 */
114static struct sock *
115xt_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
116 const u8 protocol,
117 const __be32 saddr, const __be32 daddr,
118 const __be16 sport, const __be16 dport,
119 const struct net_device *in)
120{
121 switch (protocol) {
122 case IPPROTO_TCP:
123 return inet_lookup(net, &tcp_hashinfo, skb, doff,
124 saddr, sport, daddr, dport,
125 in->ifindex);
126 case IPPROTO_UDP:
127 return udp4_lib_lookup(net, saddr, sport, daddr, dport,
128 in->ifindex);
129 }
130 return NULL;
131}
132
133static bool xt_socket_sk_is_transparent(struct sock *sk)
134{
135 switch (sk->sk_state) {
136 case TCP_TIME_WAIT:
137 return inet_twsk(sk)->tw_transparent;
138
139 case TCP_NEW_SYN_RECV:
140 return inet_rsk(inet_reqsk(sk))->no_srccheck;
141
142 default:
143 return inet_sk(sk)->transparent;
144 }
145}
146
147static struct sock *xt_socket_lookup_slow_v4(struct net *net,
148 const struct sk_buff *skb,
149 const struct net_device *indev)
150{
151 const struct iphdr *iph = ip_hdr(skb);
152 struct sk_buff *data_skb = NULL;
153 int doff = 0;
154 __be32 uninitialized_var(daddr), uninitialized_var(saddr);
155 __be16 uninitialized_var(dport), uninitialized_var(sport);
156 u8 uninitialized_var(protocol);
157#ifdef XT_SOCKET_HAVE_CONNTRACK
158 struct nf_conn const *ct;
159 enum ip_conntrack_info ctinfo;
160#endif
161
162 if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) {
163 struct udphdr _hdr, *hp;
164
165 hp = skb_header_pointer(skb, ip_hdrlen(skb),
166 sizeof(_hdr), &_hdr);
167 if (hp == NULL)
168 return NULL;
169
170 protocol = iph->protocol;
171 saddr = iph->saddr;
172 sport = hp->source;
173 daddr = iph->daddr;
174 dport = hp->dest;
175 data_skb = (struct sk_buff *)skb;
176 doff = iph->protocol == IPPROTO_TCP ?
177 ip_hdrlen(skb) + __tcp_hdrlen((struct tcphdr *)hp) :
178 ip_hdrlen(skb) + sizeof(*hp);
179
180 } else if (iph->protocol == IPPROTO_ICMP) {
181 if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
182 &sport, &dport))
183 return NULL;
184 } else {
185 return NULL;
186 }
187
188#ifdef XT_SOCKET_HAVE_CONNTRACK
189 /* Do the lookup with the original socket address in
190 * case this is a reply packet of an established
191 * SNAT-ted connection.
192 */
193 ct = nf_ct_get(skb, &ctinfo);
194 if (ct && !nf_ct_is_untracked(ct) &&
195 ((iph->protocol != IPPROTO_ICMP &&
196 ctinfo == IP_CT_ESTABLISHED_REPLY) ||
197 (iph->protocol == IPPROTO_ICMP &&
198 ctinfo == IP_CT_RELATED_REPLY)) &&
199 (ct->status & IPS_SRC_NAT_DONE)) {
200
201 daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip;
202 dport = (iph->protocol == IPPROTO_TCP) ?
203 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port :
204 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
205 }
206#endif
207
208 return xt_socket_get_sock_v4(net, data_skb, doff, protocol, saddr,
209 daddr, sport, dport, indev);
210}
211
212static bool 52static bool
213socket_match(const struct sk_buff *skb, struct xt_action_param *par, 53socket_match(const struct sk_buff *skb, struct xt_action_param *par,
214 const struct xt_socket_mtinfo1 *info) 54 const struct xt_socket_mtinfo1 *info)
@@ -217,7 +57,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
217 struct sock *sk = skb->sk; 57 struct sock *sk = skb->sk;
218 58
219 if (!sk) 59 if (!sk)
220 sk = xt_socket_lookup_slow_v4(par->net, skb, par->in); 60 sk = nf_sk_lookup_slow_v4(xt_net(par), skb, xt_in(par));
221 if (sk) { 61 if (sk) {
222 bool wildcard; 62 bool wildcard;
223 bool transparent = true; 63 bool transparent = true;
@@ -233,7 +73,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
233 * if XT_SOCKET_TRANSPARENT is used 73 * if XT_SOCKET_TRANSPARENT is used
234 */ 74 */
235 if (info->flags & XT_SOCKET_TRANSPARENT) 75 if (info->flags & XT_SOCKET_TRANSPARENT)
236 transparent = xt_socket_sk_is_transparent(sk); 76 transparent = nf_sk_is_transparent(sk);
237 77
238 if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && 78 if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
239 transparent) 79 transparent)
@@ -265,132 +105,7 @@ socket_mt4_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
265 return socket_match(skb, par, par->matchinfo); 105 return socket_match(skb, par, par->matchinfo);
266} 106}
267 107
268#ifdef XT_SOCKET_HAVE_IPV6 108#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
269
270static int
271extract_icmp6_fields(const struct sk_buff *skb,
272 unsigned int outside_hdrlen,
273 int *protocol,
274 const struct in6_addr **raddr,
275 const struct in6_addr **laddr,
276 __be16 *rport,
277 __be16 *lport,
278 struct ipv6hdr *ipv6_var)
279{
280 const struct ipv6hdr *inside_iph;
281 struct icmp6hdr *icmph, _icmph;
282 __be16 *ports, _ports[2];
283 u8 inside_nexthdr;
284 __be16 inside_fragoff;
285 int inside_hdrlen;
286
287 icmph = skb_header_pointer(skb, outside_hdrlen,
288 sizeof(_icmph), &_icmph);
289 if (icmph == NULL)
290 return 1;
291
292 if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK)
293 return 1;
294
295 inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph),
296 sizeof(*ipv6_var), ipv6_var);
297 if (inside_iph == NULL)
298 return 1;
299 inside_nexthdr = inside_iph->nexthdr;
300
301 inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) +
302 sizeof(*ipv6_var),
303 &inside_nexthdr, &inside_fragoff);
304 if (inside_hdrlen < 0)
305 return 1; /* hjm: Packet has no/incomplete transport layer headers. */
306
307 if (inside_nexthdr != IPPROTO_TCP &&
308 inside_nexthdr != IPPROTO_UDP)
309 return 1;
310
311 ports = skb_header_pointer(skb, inside_hdrlen,
312 sizeof(_ports), &_ports);
313 if (ports == NULL)
314 return 1;
315
316 /* the inside IP packet is the one quoted from our side, thus
317 * its saddr is the local address */
318 *protocol = inside_nexthdr;
319 *laddr = &inside_iph->saddr;
320 *lport = ports[0];
321 *raddr = &inside_iph->daddr;
322 *rport = ports[1];
323
324 return 0;
325}
326
327static struct sock *
328xt_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff,
329 const u8 protocol,
330 const struct in6_addr *saddr, const struct in6_addr *daddr,
331 const __be16 sport, const __be16 dport,
332 const struct net_device *in)
333{
334 switch (protocol) {
335 case IPPROTO_TCP:
336 return inet6_lookup(net, &tcp_hashinfo, skb, doff,
337 saddr, sport, daddr, dport,
338 in->ifindex);
339 case IPPROTO_UDP:
340 return udp6_lib_lookup(net, saddr, sport, daddr, dport,
341 in->ifindex);
342 }
343
344 return NULL;
345}
346
347static struct sock *xt_socket_lookup_slow_v6(struct net *net,
348 const struct sk_buff *skb,
349 const struct net_device *indev)
350{
351 __be16 uninitialized_var(dport), uninitialized_var(sport);
352 const struct in6_addr *daddr = NULL, *saddr = NULL;
353 struct ipv6hdr *iph = ipv6_hdr(skb);
354 struct sk_buff *data_skb = NULL;
355 int doff = 0;
356 int thoff = 0, tproto;
357
358 tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
359 if (tproto < 0) {
360 pr_debug("unable to find transport header in IPv6 packet, dropping\n");
361 return NULL;
362 }
363
364 if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) {
365 struct udphdr _hdr, *hp;
366
367 hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
368 if (hp == NULL)
369 return NULL;
370
371 saddr = &iph->saddr;
372 sport = hp->source;
373 daddr = &iph->daddr;
374 dport = hp->dest;
375 data_skb = (struct sk_buff *)skb;
376 doff = tproto == IPPROTO_TCP ?
377 thoff + __tcp_hdrlen((struct tcphdr *)hp) :
378 thoff + sizeof(*hp);
379
380 } else if (tproto == IPPROTO_ICMPV6) {
381 struct ipv6hdr ipv6_var;
382
383 if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
384 &sport, &dport, &ipv6_var))
385 return NULL;
386 } else {
387 return NULL;
388 }
389
390 return xt_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr,
391 sport, dport, indev);
392}
393
394static bool 109static bool
395socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) 110socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
396{ 111{
@@ -399,7 +114,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
399 struct sock *sk = skb->sk; 114 struct sock *sk = skb->sk;
400 115
401 if (!sk) 116 if (!sk)
402 sk = xt_socket_lookup_slow_v6(par->net, skb, par->in); 117 sk = nf_sk_lookup_slow_v6(xt_net(par), skb, xt_in(par));
403 if (sk) { 118 if (sk) {
404 bool wildcard; 119 bool wildcard;
405 bool transparent = true; 120 bool transparent = true;
@@ -415,7 +130,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
415 * if XT_SOCKET_TRANSPARENT is used 130 * if XT_SOCKET_TRANSPARENT is used
416 */ 131 */
417 if (info->flags & XT_SOCKET_TRANSPARENT) 132 if (info->flags & XT_SOCKET_TRANSPARENT)
418 transparent = xt_socket_sk_is_transparent(sk); 133 transparent = nf_sk_is_transparent(sk);
419 134
420 if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && 135 if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
421 transparent) 136 transparent)
@@ -432,9 +147,28 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
432} 147}
433#endif 148#endif
434 149
150static int socket_mt_enable_defrag(struct net *net, int family)
151{
152 switch (family) {
153 case NFPROTO_IPV4:
154 return nf_defrag_ipv4_enable(net);
155#ifdef XT_SOCKET_HAVE_IPV6
156 case NFPROTO_IPV6:
157 return nf_defrag_ipv6_enable(net);
158#endif
159 }
160 WARN_ONCE(1, "Unknown family %d\n", family);
161 return 0;
162}
163
435static int socket_mt_v1_check(const struct xt_mtchk_param *par) 164static int socket_mt_v1_check(const struct xt_mtchk_param *par)
436{ 165{
437 const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; 166 const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
167 int err;
168
169 err = socket_mt_enable_defrag(par->net, par->family);
170 if (err)
171 return err;
438 172
439 if (info->flags & ~XT_SOCKET_FLAGS_V1) { 173 if (info->flags & ~XT_SOCKET_FLAGS_V1) {
440 pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V1); 174 pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V1);
@@ -446,6 +180,11 @@ static int socket_mt_v1_check(const struct xt_mtchk_param *par)
446static int socket_mt_v2_check(const struct xt_mtchk_param *par) 180static int socket_mt_v2_check(const struct xt_mtchk_param *par)
447{ 181{
448 const struct xt_socket_mtinfo2 *info = (struct xt_socket_mtinfo2 *) par->matchinfo; 182 const struct xt_socket_mtinfo2 *info = (struct xt_socket_mtinfo2 *) par->matchinfo;
183 int err;
184
185 err = socket_mt_enable_defrag(par->net, par->family);
186 if (err)
187 return err;
449 188
450 if (info->flags & ~XT_SOCKET_FLAGS_V2) { 189 if (info->flags & ~XT_SOCKET_FLAGS_V2) {
451 pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V2); 190 pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V2);
@@ -458,7 +197,11 @@ static int socket_mt_v3_check(const struct xt_mtchk_param *par)
458{ 197{
459 const struct xt_socket_mtinfo3 *info = 198 const struct xt_socket_mtinfo3 *info =
460 (struct xt_socket_mtinfo3 *)par->matchinfo; 199 (struct xt_socket_mtinfo3 *)par->matchinfo;
200 int err;
461 201
202 err = socket_mt_enable_defrag(par->net, par->family);
203 if (err)
204 return err;
462 if (info->flags & ~XT_SOCKET_FLAGS_V3) { 205 if (info->flags & ~XT_SOCKET_FLAGS_V3) {
463 pr_info("unknown flags 0x%x\n", 206 pr_info("unknown flags 0x%x\n",
464 info->flags & ~XT_SOCKET_FLAGS_V3); 207 info->flags & ~XT_SOCKET_FLAGS_V3);
@@ -488,7 +231,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
488 (1 << NF_INET_LOCAL_IN), 231 (1 << NF_INET_LOCAL_IN),
489 .me = THIS_MODULE, 232 .me = THIS_MODULE,
490 }, 233 },
491#ifdef XT_SOCKET_HAVE_IPV6 234#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
492 { 235 {
493 .name = "socket", 236 .name = "socket",
494 .revision = 1, 237 .revision = 1,
@@ -512,7 +255,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
512 (1 << NF_INET_LOCAL_IN), 255 (1 << NF_INET_LOCAL_IN),
513 .me = THIS_MODULE, 256 .me = THIS_MODULE,
514 }, 257 },
515#ifdef XT_SOCKET_HAVE_IPV6 258#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
516 { 259 {
517 .name = "socket", 260 .name = "socket",
518 .revision = 2, 261 .revision = 2,
@@ -536,7 +279,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
536 (1 << NF_INET_LOCAL_IN), 279 (1 << NF_INET_LOCAL_IN),
537 .me = THIS_MODULE, 280 .me = THIS_MODULE,
538 }, 281 },
539#ifdef XT_SOCKET_HAVE_IPV6 282#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
540 { 283 {
541 .name = "socket", 284 .name = "socket",
542 .revision = 3, 285 .revision = 3,
@@ -553,11 +296,6 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
553 296
554static int __init socket_mt_init(void) 297static int __init socket_mt_init(void)
555{ 298{
556 nf_defrag_ipv4_enable();
557#ifdef XT_SOCKET_HAVE_IPV6
558 nf_defrag_ipv6_enable();
559#endif
560
561 return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg)); 299 return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg));
562} 300}
563 301
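
Two things are going on in this file: the slow-path socket lookup (including the ICMP-error unwrapping that made up most of the deleted code) moves into the shared nf_sk_lookup_slow_v4/v6 helpers, and defragmentation is no longer switched on globally at module load but requested per network namespace from each checkentry, so namespaces that never install a socket rule do not pay for defrag. There is no matching disable in a destructor, so once requested it stays on for that namespace. Note also that, as rendered above, socket_mt_enable_defrag() still tests XT_SOCKET_HAVE_IPV6 even though the same patch deletes that macro, so the IPv6 case would compile out and hit the WARN_ONCE; that looks like it wants the IS_ENABLED() form used elsewhere in the patch. A hypothetical variant along those lines:

/* Hypothetical variant using the IS_ENABLED() test the rest of the patch
 * switched to, instead of the deleted XT_SOCKET_HAVE_IPV6 macro. */
static int example_enable_defrag(struct net *net, int family)
{
	switch (family) {
	case NFPROTO_IPV4:
		return nf_defrag_ipv4_enable(net);
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
	case NFPROTO_IPV6:
		return nf_defrag_ipv6_enable(net);
#endif
	}
	WARN_ONCE(1, "Unknown family %d\n", family);
	return 0;
}
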
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
index a507922d80cd..5746a33789a5 100644
--- a/net/netfilter/xt_state.c
+++ b/net/netfilter/xt_state.c
@@ -43,7 +43,7 @@ static int state_mt_check(const struct xt_mtchk_param *par)
43{ 43{
44 int ret; 44 int ret;
45 45
46 ret = nf_ct_l3proto_try_module_get(par->family); 46 ret = nf_ct_netns_get(par->net, par->family);
47 if (ret < 0) 47 if (ret < 0)
48 pr_info("cannot load conntrack support for proto=%u\n", 48 pr_info("cannot load conntrack support for proto=%u\n",
49 par->family); 49 par->family);
@@ -52,7 +52,7 @@ static int state_mt_check(const struct xt_mtchk_param *par)
52 52
53static void state_mt_destroy(const struct xt_mtdtor_param *par) 53static void state_mt_destroy(const struct xt_mtdtor_param *par)
54{ 54{
55 nf_ct_l3proto_module_put(par->family); 55 nf_ct_netns_put(par->net, par->family);
56} 56}
57 57
58static struct xt_match state_mt_reg __read_mostly = { 58static struct xt_match state_mt_reg __read_mostly = {
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
index 0bc3460319c8..423293ee57c2 100644
--- a/net/netfilter/xt_string.c
+++ b/net/netfilter/xt_string.c
@@ -77,6 +77,7 @@ static struct xt_match xt_string_mt_reg __read_mostly = {
77 .match = string_mt, 77 .match = string_mt,
78 .destroy = string_mt_destroy, 78 .destroy = string_mt_destroy,
79 .matchsize = sizeof(struct xt_string_info), 79 .matchsize = sizeof(struct xt_string_info),
80 .usersize = offsetof(struct xt_string_info, config),
80 .me = THIS_MODULE, 81 .me = THIS_MODULE,
81}; 82};
82 83
diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c
index 0ae55a36f492..1b01eec1fbda 100644
--- a/net/netfilter/xt_time.c
+++ b/net/netfilter/xt_time.c
@@ -168,7 +168,7 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par)
168 * may happen that the same packet matches both rules if 168 * may happen that the same packet matches both rules if
169 * it arrived at the right moment before 13:00. 169 * it arrived at the right moment before 13:00.
170 */ 170 */
171 if (skb->tstamp.tv64 == 0) 171 if (skb->tstamp == 0)
172 __net_timestamp((struct sk_buff *)skb); 172 __net_timestamp((struct sk_buff *)skb);
173 173
174 stamp = ktime_to_ns(skb->tstamp); 174 stamp = ktime_to_ns(skb->tstamp);
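
The skb->tstamp.tv64 == 0 test becomes skb->tstamp == 0 because ktime_t stopped being a union wrapping an s64 and became the scalar itself, so timestamps compare and convert directly. A trivial illustration of the before/after access, with the types stubbed out:

#include <stdint.h>

/* Before: a one-member union; after: the bare 64-bit value. */
union old_ktime { int64_t tv64; };
typedef int64_t new_ktime;

static int old_is_unset(union old_ktime t) { return t.tv64 == 0; }
static int new_is_unset(new_ktime t)       { return t == 0; }
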
diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c
index 2ec93c5e77bb..d177dd066504 100644
--- a/net/netlabel/netlabel_calipso.c
+++ b/net/netlabel/netlabel_calipso.c
@@ -60,13 +60,7 @@ struct netlbl_domhsh_walk_arg {
60}; 60};
61 61
62/* NetLabel Generic NETLINK CALIPSO family */ 62/* NetLabel Generic NETLINK CALIPSO family */
63static struct genl_family netlbl_calipso_gnl_family = { 63static struct genl_family netlbl_calipso_gnl_family;
64 .id = GENL_ID_GENERATE,
65 .hdrsize = 0,
66 .name = NETLBL_NLTYPE_CALIPSO_NAME,
67 .version = NETLBL_PROTO_VERSION,
68 .maxattr = NLBL_CALIPSO_A_MAX,
69};
70 64
71/* NetLabel Netlink attribute policy */ 65/* NetLabel Netlink attribute policy */
72static const struct nla_policy calipso_genl_policy[NLBL_CALIPSO_A_MAX + 1] = { 66static const struct nla_policy calipso_genl_policy[NLBL_CALIPSO_A_MAX + 1] = {
@@ -355,6 +349,16 @@ static const struct genl_ops netlbl_calipso_ops[] = {
355 }, 349 },
356}; 350};
357 351
352static struct genl_family netlbl_calipso_gnl_family __ro_after_init = {
353 .hdrsize = 0,
354 .name = NETLBL_NLTYPE_CALIPSO_NAME,
355 .version = NETLBL_PROTO_VERSION,
356 .maxattr = NLBL_CALIPSO_A_MAX,
357 .module = THIS_MODULE,
358 .ops = netlbl_calipso_ops,
359 .n_ops = ARRAY_SIZE(netlbl_calipso_ops),
360};
361
358/* NetLabel Generic NETLINK Protocol Functions 362/* NetLabel Generic NETLINK Protocol Functions
359 */ 363 */
360 364
@@ -368,8 +372,7 @@ static const struct genl_ops netlbl_calipso_ops[] = {
368 */ 372 */
369int __init netlbl_calipso_genl_init(void) 373int __init netlbl_calipso_genl_init(void)
370{ 374{
371 return genl_register_family_with_ops(&netlbl_calipso_gnl_family, 375 return genl_register_family(&netlbl_calipso_gnl_family);
372 netlbl_calipso_ops);
373} 376}
374 377
375static const struct netlbl_calipso_ops *calipso_ops; 378static const struct netlbl_calipso_ops *calipso_ops;
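
This is the new generic-netlink registration shape that repeats for the cipso_v4, mgmt and unlabeled families below: the family becomes a bare forward declaration where the old GENL_ID_GENERATE initializer used to sit, and the real definition moves after the ops array so it can carry .module, .ops and .n_ops, leaving genl_register_family() to assign the id at registration time. A condensed sketch of that shape with invented names (not a buildable module by itself):

/* Invented names; condensed to the registration-relevant fields. */
static int example_doit(struct sk_buff *skb, struct genl_info *info)
{
	return 0;
}

static const struct genl_ops example_genl_ops[] = {
	{
		.cmd  = 1,
		.doit = example_doit,
	},
};

static struct genl_family example_genl_family __ro_after_init = {
	.hdrsize = 0,
	.name    = "EXAMPLE",
	.version = 1,
	.maxattr = 3,
	.module  = THIS_MODULE,
	.ops     = example_genl_ops,
	.n_ops   = ARRAY_SIZE(example_genl_ops),
};

static int __init example_genl_init(void)
{
	return genl_register_family(&example_genl_family);
}
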
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index 7fd1104ba900..4149d3e63589 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -59,14 +59,7 @@ struct netlbl_domhsh_walk_arg {
59}; 59};
60 60
61/* NetLabel Generic NETLINK CIPSOv4 family */ 61/* NetLabel Generic NETLINK CIPSOv4 family */
62static struct genl_family netlbl_cipsov4_gnl_family = { 62static struct genl_family netlbl_cipsov4_gnl_family;
63 .id = GENL_ID_GENERATE,
64 .hdrsize = 0,
65 .name = NETLBL_NLTYPE_CIPSOV4_NAME,
66 .version = NETLBL_PROTO_VERSION,
67 .maxattr = NLBL_CIPSOV4_A_MAX,
68};
69
70/* NetLabel Netlink attribute policy */ 63/* NetLabel Netlink attribute policy */
71static const struct nla_policy netlbl_cipsov4_genl_policy[NLBL_CIPSOV4_A_MAX + 1] = { 64static const struct nla_policy netlbl_cipsov4_genl_policy[NLBL_CIPSOV4_A_MAX + 1] = {
72 [NLBL_CIPSOV4_A_DOI] = { .type = NLA_U32 }, 65 [NLBL_CIPSOV4_A_DOI] = { .type = NLA_U32 },
@@ -767,6 +760,16 @@ static const struct genl_ops netlbl_cipsov4_ops[] = {
767 }, 760 },
768}; 761};
769 762
763static struct genl_family netlbl_cipsov4_gnl_family __ro_after_init = {
764 .hdrsize = 0,
765 .name = NETLBL_NLTYPE_CIPSOV4_NAME,
766 .version = NETLBL_PROTO_VERSION,
767 .maxattr = NLBL_CIPSOV4_A_MAX,
768 .module = THIS_MODULE,
769 .ops = netlbl_cipsov4_ops,
770 .n_ops = ARRAY_SIZE(netlbl_cipsov4_ops),
771};
772
770/* 773/*
771 * NetLabel Generic NETLINK Protocol Functions 774 * NetLabel Generic NETLINK Protocol Functions
772 */ 775 */
@@ -781,6 +784,5 @@ static const struct genl_ops netlbl_cipsov4_ops[] = {
781 */ 784 */
782int __init netlbl_cipsov4_genl_init(void) 785int __init netlbl_cipsov4_genl_init(void)
783{ 786{
784 return genl_register_family_with_ops(&netlbl_cipsov4_gnl_family, 787 return genl_register_family(&netlbl_cipsov4_gnl_family);
785 netlbl_cipsov4_ops);
786} 788}
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 28c56b95fb7f..ea7c67050792 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -1502,10 +1502,7 @@ static int __init netlbl_init(void)
1502 printk(KERN_INFO "NetLabel: Initializing\n"); 1502 printk(KERN_INFO "NetLabel: Initializing\n");
1503 printk(KERN_INFO "NetLabel: domain hash size = %u\n", 1503 printk(KERN_INFO "NetLabel: domain hash size = %u\n",
1504 (1 << NETLBL_DOMHSH_BITSIZE)); 1504 (1 << NETLBL_DOMHSH_BITSIZE));
1505 printk(KERN_INFO "NetLabel: protocols =" 1505 printk(KERN_INFO "NetLabel: protocols = UNLABELED CIPSOv4 CALIPSO\n");
1506 " UNLABELED"
1507 " CIPSOv4"
1508 "\n");
1509 1506
1510 ret_val = netlbl_domhsh_init(NETLBL_DOMHSH_BITSIZE); 1507 ret_val = netlbl_domhsh_init(NETLBL_DOMHSH_BITSIZE);
1511 if (ret_val != 0) 1508 if (ret_val != 0)
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
index f85d0e07af2d..21e0095b1d14 100644
--- a/net/netlabel/netlabel_mgmt.c
+++ b/net/netlabel/netlabel_mgmt.c
@@ -60,13 +60,7 @@ struct netlbl_domhsh_walk_arg {
60}; 60};
61 61
62/* NetLabel Generic NETLINK CIPSOv4 family */ 62/* NetLabel Generic NETLINK CIPSOv4 family */
63static struct genl_family netlbl_mgmt_gnl_family = { 63static struct genl_family netlbl_mgmt_gnl_family;
64 .id = GENL_ID_GENERATE,
65 .hdrsize = 0,
66 .name = NETLBL_NLTYPE_MGMT_NAME,
67 .version = NETLBL_PROTO_VERSION,
68 .maxattr = NLBL_MGMT_A_MAX,
69};
70 64
71/* NetLabel Netlink attribute policy */ 65/* NetLabel Netlink attribute policy */
72static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = { 66static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = {
@@ -834,6 +828,16 @@ static const struct genl_ops netlbl_mgmt_genl_ops[] = {
834 }, 828 },
835}; 829};
836 830
831static struct genl_family netlbl_mgmt_gnl_family __ro_after_init = {
832 .hdrsize = 0,
833 .name = NETLBL_NLTYPE_MGMT_NAME,
834 .version = NETLBL_PROTO_VERSION,
835 .maxattr = NLBL_MGMT_A_MAX,
836 .module = THIS_MODULE,
837 .ops = netlbl_mgmt_genl_ops,
838 .n_ops = ARRAY_SIZE(netlbl_mgmt_genl_ops),
839};
840
837/* 841/*
838 * NetLabel Generic NETLINK Protocol Functions 842 * NetLabel Generic NETLINK Protocol Functions
839 */ 843 */
@@ -848,6 +852,5 @@ static const struct genl_ops netlbl_mgmt_genl_ops[] = {
848 */ 852 */
849int __init netlbl_mgmt_genl_init(void) 853int __init netlbl_mgmt_genl_init(void)
850{ 854{
851 return genl_register_family_with_ops(&netlbl_mgmt_gnl_family, 855 return genl_register_family(&netlbl_mgmt_gnl_family);
852 netlbl_mgmt_genl_ops);
853} 856}
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 4528cff9138b..22dc1b9d6362 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -123,13 +123,7 @@ static struct netlbl_unlhsh_iface __rcu *netlbl_unlhsh_def;
123static u8 netlabel_unlabel_acceptflg; 123static u8 netlabel_unlabel_acceptflg;
124 124
125/* NetLabel Generic NETLINK unlabeled family */ 125/* NetLabel Generic NETLINK unlabeled family */
126static struct genl_family netlbl_unlabel_gnl_family = { 126static struct genl_family netlbl_unlabel_gnl_family;
127 .id = GENL_ID_GENERATE,
128 .hdrsize = 0,
129 .name = NETLBL_NLTYPE_UNLABELED_NAME,
130 .version = NETLBL_PROTO_VERSION,
131 .maxattr = NLBL_UNLABEL_A_MAX,
132};
133 127
134/* NetLabel Netlink attribute policy */ 128/* NetLabel Netlink attribute policy */
135static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = { 129static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = {
@@ -1378,6 +1372,16 @@ static const struct genl_ops netlbl_unlabel_genl_ops[] = {
1378 }, 1372 },
1379}; 1373};
1380 1374
1375static struct genl_family netlbl_unlabel_gnl_family __ro_after_init = {
1376 .hdrsize = 0,
1377 .name = NETLBL_NLTYPE_UNLABELED_NAME,
1378 .version = NETLBL_PROTO_VERSION,
1379 .maxattr = NLBL_UNLABEL_A_MAX,
1380 .module = THIS_MODULE,
1381 .ops = netlbl_unlabel_genl_ops,
1382 .n_ops = ARRAY_SIZE(netlbl_unlabel_genl_ops),
1383};
1384
1381/* 1385/*
1382 * NetLabel Generic NETLINK Protocol Functions 1386 * NetLabel Generic NETLINK Protocol Functions
1383 */ 1387 */
@@ -1392,8 +1396,7 @@ static const struct genl_ops netlbl_unlabel_genl_ops[] = {
1392 */ 1396 */
1393int __init netlbl_unlabel_genl_init(void) 1397int __init netlbl_unlabel_genl_init(void)
1394{ 1398{
1395 return genl_register_family_with_ops(&netlbl_unlabel_gnl_family, 1399 return genl_register_family(&netlbl_unlabel_gnl_family);
1396 netlbl_unlabel_genl_ops);
1397} 1400}
1398 1401
1399/* 1402/*
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 246f29d365c0..596eaff66649 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -40,7 +40,7 @@
40#include <linux/net.h> 40#include <linux/net.h>
41#include <linux/fs.h> 41#include <linux/fs.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <asm/uaccess.h> 43#include <linux/uaccess.h>
44#include <linux/skbuff.h> 44#include <linux/skbuff.h>
45#include <linux/netdevice.h> 45#include <linux/netdevice.h>
46#include <linux/rtnetlink.h> 46#include <linux/rtnetlink.h>
@@ -96,6 +96,44 @@ EXPORT_SYMBOL_GPL(nl_table);
96 96
97static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); 97static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);
98 98
99static struct lock_class_key nlk_cb_mutex_keys[MAX_LINKS];
100
101static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = {
102 "nlk_cb_mutex-ROUTE",
103 "nlk_cb_mutex-1",
104 "nlk_cb_mutex-USERSOCK",
105 "nlk_cb_mutex-FIREWALL",
106 "nlk_cb_mutex-SOCK_DIAG",
107 "nlk_cb_mutex-NFLOG",
108 "nlk_cb_mutex-XFRM",
109 "nlk_cb_mutex-SELINUX",
110 "nlk_cb_mutex-ISCSI",
111 "nlk_cb_mutex-AUDIT",
112 "nlk_cb_mutex-FIB_LOOKUP",
113 "nlk_cb_mutex-CONNECTOR",
114 "nlk_cb_mutex-NETFILTER",
115 "nlk_cb_mutex-IP6_FW",
116 "nlk_cb_mutex-DNRTMSG",
117 "nlk_cb_mutex-KOBJECT_UEVENT",
118 "nlk_cb_mutex-GENERIC",
119 "nlk_cb_mutex-17",
120 "nlk_cb_mutex-SCSITRANSPORT",
121 "nlk_cb_mutex-ECRYPTFS",
122 "nlk_cb_mutex-RDMA",
123 "nlk_cb_mutex-CRYPTO",
124 "nlk_cb_mutex-SMC",
125 "nlk_cb_mutex-23",
126 "nlk_cb_mutex-24",
127 "nlk_cb_mutex-25",
128 "nlk_cb_mutex-26",
129 "nlk_cb_mutex-27",
130 "nlk_cb_mutex-28",
131 "nlk_cb_mutex-29",
132 "nlk_cb_mutex-30",
133 "nlk_cb_mutex-31",
134 "nlk_cb_mutex-MAX_LINKS"
135};
136
99static int netlink_dump(struct sock *sk); 137static int netlink_dump(struct sock *sk);
100static void netlink_skb_destructor(struct sk_buff *skb); 138static void netlink_skb_destructor(struct sk_buff *skb);
101 139
@@ -113,7 +151,7 @@ static atomic_t nl_table_users = ATOMIC_INIT(0);
113 151
114#define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock)); 152#define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock));
115 153
116static ATOMIC_NOTIFIER_HEAD(netlink_chain); 154static BLOCKING_NOTIFIER_HEAD(netlink_chain);
117 155
118static DEFINE_SPINLOCK(netlink_tap_lock); 156static DEFINE_SPINLOCK(netlink_tap_lock);
119static struct list_head netlink_tap_all __read_mostly; 157static struct list_head netlink_tap_all __read_mostly;
@@ -585,6 +623,9 @@ static int __netlink_create(struct net *net, struct socket *sock,
585 } else { 623 } else {
586 nlk->cb_mutex = &nlk->cb_def_mutex; 624 nlk->cb_mutex = &nlk->cb_def_mutex;
587 mutex_init(nlk->cb_mutex); 625 mutex_init(nlk->cb_mutex);
626 lockdep_set_class_and_name(nlk->cb_mutex,
627 nlk_cb_mutex_keys + protocol,
628 nlk_cb_mutex_key_strings[protocol]);
588 } 629 }
589 init_waitqueue_head(&nlk->wait); 630 init_waitqueue_head(&nlk->wait);
590 631
@@ -711,7 +752,7 @@ static int netlink_release(struct socket *sock)
711 .protocol = sk->sk_protocol, 752 .protocol = sk->sk_protocol,
712 .portid = nlk->portid, 753 .portid = nlk->portid,
713 }; 754 };
714 atomic_notifier_call_chain(&netlink_chain, 755 blocking_notifier_call_chain(&netlink_chain,
715 NETLINK_URELEASE, &n); 756 NETLINK_URELEASE, &n);
716 } 757 }
717 758
@@ -1210,9 +1251,9 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
1210 skb = nskb; 1251 skb = nskb;
1211 } 1252 }
1212 1253
1213 if (!pskb_expand_head(skb, 0, -delta, allocation)) 1254 pskb_expand_head(skb, 0, -delta,
1214 skb->truesize -= delta; 1255 (allocation & ~__GFP_DIRECT_RECLAIM) |
1215 1256 __GFP_NOWARN | __GFP_NORETRY);
1216 return skb; 1257 return skb;
1217} 1258}
1218 1259
@@ -2504,13 +2545,13 @@ static const struct file_operations netlink_seq_fops = {
2504 2545
2505int netlink_register_notifier(struct notifier_block *nb) 2546int netlink_register_notifier(struct notifier_block *nb)
2506{ 2547{
2507 return atomic_notifier_chain_register(&netlink_chain, nb); 2548 return blocking_notifier_chain_register(&netlink_chain, nb);
2508} 2549}
2509EXPORT_SYMBOL(netlink_register_notifier); 2550EXPORT_SYMBOL(netlink_register_notifier);
2510 2551
2511int netlink_unregister_notifier(struct notifier_block *nb) 2552int netlink_unregister_notifier(struct notifier_block *nb)
2512{ 2553{
2513 return atomic_notifier_chain_unregister(&netlink_chain, nb); 2554 return blocking_notifier_chain_unregister(&netlink_chain, nb);
2514} 2555}
2515EXPORT_SYMBOL(netlink_unregister_notifier); 2556EXPORT_SYMBOL(netlink_unregister_notifier);
2516 2557
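[Annotation] A hedged sketch of the consumer side of the now-blocking NETLINK_URELEASE chain; module and callback names are invented, but the notifier API and the struct netlink_notify fields match those used in the hunks above.

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/netlink.h>

static int my_urelease_event(struct notifier_block *nb,
			     unsigned long event, void *ptr)
{
	struct netlink_notify *n = ptr;

	/* The chain is blocking, so this callback may sleep. */
	if (event == NETLINK_URELEASE)
		pr_info("netlink socket released: proto %d, portid %u\n",
			n->protocol, n->portid);
	return NOTIFY_DONE;
}

static struct notifier_block my_nb = {
	.notifier_call = my_urelease_event,
};

static int __init my_init(void)
{
	return netlink_register_notifier(&my_nb);
}

static void __exit my_exit(void)
{
	netlink_unregister_notifier(&my_nb);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");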
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 49c28e8ef01b..92e0981f7404 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -17,6 +17,7 @@
17#include <linux/mutex.h> 17#include <linux/mutex.h>
18#include <linux/bitmap.h> 18#include <linux/bitmap.h>
19#include <linux/rwsem.h> 19#include <linux/rwsem.h>
20#include <linux/idr.h>
20#include <net/sock.h> 21#include <net/sock.h>
21#include <net/genetlink.h> 22#include <net/genetlink.h>
22 23
@@ -58,10 +59,8 @@ static void genl_unlock_all(void)
58 up_write(&cb_lock); 59 up_write(&cb_lock);
59} 60}
60 61
61#define GENL_FAM_TAB_SIZE 16 62static DEFINE_IDR(genl_fam_idr);
62#define GENL_FAM_TAB_MASK (GENL_FAM_TAB_SIZE - 1)
63 63
64static struct list_head family_ht[GENL_FAM_TAB_SIZE];
65/* 64/*
66 * Bitmap of multicast groups that are currently in use. 65 * Bitmap of multicast groups that are currently in use.
67 * 66 *
@@ -86,45 +85,29 @@ static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_CTRL) |
86static unsigned long *mc_groups = &mc_group_start; 85static unsigned long *mc_groups = &mc_group_start;
87static unsigned long mc_groups_longs = 1; 86static unsigned long mc_groups_longs = 1;
88 87
89static int genl_ctrl_event(int event, struct genl_family *family, 88static int genl_ctrl_event(int event, const struct genl_family *family,
90 const struct genl_multicast_group *grp, 89 const struct genl_multicast_group *grp,
91 int grp_id); 90 int grp_id);
92 91
93static inline unsigned int genl_family_hash(unsigned int id) 92static const struct genl_family *genl_family_find_byid(unsigned int id)
94{ 93{
95 return id & GENL_FAM_TAB_MASK; 94 return idr_find(&genl_fam_idr, id);
96} 95}
97 96
98static inline struct list_head *genl_family_chain(unsigned int id) 97static const struct genl_family *genl_family_find_byname(char *name)
99{ 98{
100 return &family_ht[genl_family_hash(id)]; 99 const struct genl_family *family;
101} 100 unsigned int id;
102
103static struct genl_family *genl_family_find_byid(unsigned int id)
104{
105 struct genl_family *f;
106
107 list_for_each_entry(f, genl_family_chain(id), family_list)
108 if (f->id == id)
109 return f;
110
111 return NULL;
112}
113
114static struct genl_family *genl_family_find_byname(char *name)
115{
116 struct genl_family *f;
117 int i;
118 101
119 for (i = 0; i < GENL_FAM_TAB_SIZE; i++) 102 idr_for_each_entry(&genl_fam_idr, family, id)
120 list_for_each_entry(f, genl_family_chain(i), family_list) 103 if (strcmp(family->name, name) == 0)
121 if (strcmp(f->name, name) == 0) 104 return family;
122 return f;
123 105
124 return NULL; 106 return NULL;
125} 107}
126 108
127static const struct genl_ops *genl_get_cmd(u8 cmd, struct genl_family *family) 109static const struct genl_ops *genl_get_cmd(u8 cmd,
110 const struct genl_family *family)
128{ 111{
129 int i; 112 int i;
130 113
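[Annotation] For reference, a small standalone sketch of the IDR pattern the lookups above switch to: idr_find() for the by-id lookup and idr_for_each_entry() for the linear by-name scan. The struct and identifiers are invented for illustration.

#include <linux/idr.h>
#include <linux/string.h>

struct my_family {
	unsigned int id;
	char name[16];
};

static DEFINE_IDR(my_fam_idr);

static struct my_family *my_find_byid(unsigned int id)
{
	return idr_find(&my_fam_idr, id);
}

static struct my_family *my_find_byname(const char *name)
{
	struct my_family *f;
	unsigned int id;

	/* Linear scan; acceptable for the small number of registered entries. */
	idr_for_each_entry(&my_fam_idr, f, id)
		if (!strcmp(f->name, name))
			return f;
	return NULL;
}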
@@ -135,26 +118,6 @@ static const struct genl_ops *genl_get_cmd(u8 cmd, struct genl_family *family)
135 return NULL; 118 return NULL;
136} 119}
137 120
138/* Of course we are going to have problems once we hit
139 * 2^16 alive types, but that can only happen by year 2K
140*/
141static u16 genl_generate_id(void)
142{
143 static u16 id_gen_idx = GENL_MIN_ID;
144 int i;
145
146 for (i = 0; i <= GENL_MAX_ID - GENL_MIN_ID; i++) {
147 if (id_gen_idx != GENL_ID_VFS_DQUOT &&
148 id_gen_idx != GENL_ID_PMCRAID &&
149 !genl_family_find_byid(id_gen_idx))
150 return id_gen_idx;
151 if (++id_gen_idx > GENL_MAX_ID)
152 id_gen_idx = GENL_MIN_ID;
153 }
154
155 return 0;
156}
157
158static int genl_allocate_reserve_groups(int n_groups, int *first_id) 121static int genl_allocate_reserve_groups(int n_groups, int *first_id)
159{ 122{
160 unsigned long *new_groups; 123 unsigned long *new_groups;
@@ -295,7 +258,7 @@ static int genl_validate_assign_mc_groups(struct genl_family *family)
295 return err; 258 return err;
296} 259}
297 260
298static void genl_unregister_mc_groups(struct genl_family *family) 261static void genl_unregister_mc_groups(const struct genl_family *family)
299{ 262{
300 struct net *net; 263 struct net *net;
301 int i; 264 int i;
@@ -344,28 +307,21 @@ static int genl_validate_ops(const struct genl_family *family)
344} 307}
345 308
346/** 309/**
347 * __genl_register_family - register a generic netlink family 310 * genl_register_family - register a generic netlink family
348 * @family: generic netlink family 311 * @family: generic netlink family
349 * 312 *
350 * Registers the specified family after validating it first. Only one 313 * Registers the specified family after validating it first. Only one
351 * family may be registered with the same family name or identifier. 314 * family may be registered with the same family name or identifier.
352 * The family id may equal GENL_ID_GENERATE causing an unique id to
353 * be automatically generated and assigned.
354 * 315 *
355 * The family's ops array must already be assigned, you can use the 316 * The family's ops, multicast groups and module pointer must already
356 * genl_register_family_with_ops() helper function. 317 * be assigned.
357 * 318 *
358 * Return 0 on success or a negative error code. 319 * Return 0 on success or a negative error code.
359 */ 320 */
360int __genl_register_family(struct genl_family *family) 321int genl_register_family(struct genl_family *family)
361{ 322{
362 int err = -EINVAL, i; 323 int err, i;
363 324 int start = GENL_START_ALLOC, end = GENL_MAX_ID;
364 if (family->id && family->id < GENL_MIN_ID)
365 goto errout;
366
367 if (family->id > GENL_MAX_ID)
368 goto errout;
369 325
370 err = genl_validate_ops(family); 326 err = genl_validate_ops(family);
371 if (err) 327 if (err)
@@ -378,18 +334,20 @@ int __genl_register_family(struct genl_family *family)
378 goto errout_locked; 334 goto errout_locked;
379 } 335 }
380 336
381 if (family->id == GENL_ID_GENERATE) { 337 /*
382 u16 newid = genl_generate_id(); 338 * Sadly, a few cases need to be special-cased
383 339 * due to them having previously abused the API
384 if (!newid) { 340 * and having used their family ID also as their
385 err = -ENOMEM; 341 * multicast group ID, so we use reserved IDs
386 goto errout_locked; 342 * for both to be sure we can do that mapping.
387 } 343 */
388 344 if (family == &genl_ctrl) {
389 family->id = newid; 345 /* and this needs to be special for initial family lookups */
390 } else if (genl_family_find_byid(family->id)) { 346 start = end = GENL_ID_CTRL;
391 err = -EEXIST; 347 } else if (strcmp(family->name, "pmcraid") == 0) {
392 goto errout_locked; 348 start = end = GENL_ID_PMCRAID;
349 } else if (strcmp(family->name, "VFS_DQUOT") == 0) {
350 start = end = GENL_ID_VFS_DQUOT;
393 } 351 }
394 352
395 if (family->maxattr && !family->parallel_ops) { 353 if (family->maxattr && !family->parallel_ops) {
@@ -402,11 +360,17 @@ int __genl_register_family(struct genl_family *family)
402 } else 360 } else
403 family->attrbuf = NULL; 361 family->attrbuf = NULL;
404 362
363 family->id = idr_alloc(&genl_fam_idr, family,
364 start, end + 1, GFP_KERNEL);
365 if (family->id < 0) {
366 err = family->id;
367 goto errout_locked;
368 }
369
405 err = genl_validate_assign_mc_groups(family); 370 err = genl_validate_assign_mc_groups(family);
406 if (err) 371 if (err)
407 goto errout_free; 372 goto errout_remove;
408 373
409 list_add_tail(&family->family_list, genl_family_chain(family->id));
410 genl_unlock_all(); 374 genl_unlock_all();
411 375
412 /* send all events */ 376 /* send all events */
@@ -417,14 +381,14 @@ int __genl_register_family(struct genl_family *family)
417 381
418 return 0; 382 return 0;
419 383
420errout_free: 384errout_remove:
385 idr_remove(&genl_fam_idr, family->id);
421 kfree(family->attrbuf); 386 kfree(family->attrbuf);
422errout_locked: 387errout_locked:
423 genl_unlock_all(); 388 genl_unlock_all();
424errout:
425 return err; 389 return err;
426} 390}
427EXPORT_SYMBOL(__genl_register_family); 391EXPORT_SYMBOL(genl_register_family);
428 392
429/** 393/**
430 * genl_unregister_family - unregister generic netlink family 394 * genl_unregister_family - unregister generic netlink family
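[Annotation] A minimal sketch (reusing the invented my_fam_idr/my_family from the note above) of how idr_alloc() covers both cases in the registration hunk: dynamic IDs come from an open [start, end] window, while legacy families are pinned by collapsing the window to a single reserved value.

#include <linux/idr.h>
#include <linux/gfp.h>

#define MY_START_ALLOC	20	/* first dynamically assigned id (illustrative) */
#define MY_MAX_ID	1023
#define MY_ID_LEGACY	10	/* a family that must keep its historic id */

static int my_register(struct my_family *f, bool legacy)
{
	int start = MY_START_ALLOC, end = MY_MAX_ID;
	int id;

	if (legacy)		/* pin the window to one reserved slot */
		start = end = MY_ID_LEGACY;

	/* idr_alloc() returns the allocated id or a negative errno;
	 * the end argument is exclusive, hence end + 1.
	 */
	id = idr_alloc(&my_fam_idr, f, start, end + 1, GFP_KERNEL);
	if (id < 0)
		return id;
	f->id = id;
	return 0;
}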
@@ -434,33 +398,29 @@ EXPORT_SYMBOL(__genl_register_family);
434 * 398 *
435 * Returns 0 on success or a negative error code. 399 * Returns 0 on success or a negative error code.
436 */ 400 */
437int genl_unregister_family(struct genl_family *family) 401int genl_unregister_family(const struct genl_family *family)
438{ 402{
439 struct genl_family *rc;
440
441 genl_lock_all(); 403 genl_lock_all();
442 404
443 list_for_each_entry(rc, genl_family_chain(family->id), family_list) { 405 if (!genl_family_find_byid(family->id)) {
444 if (family->id != rc->id || strcmp(rc->name, family->name)) 406 genl_unlock_all();
445 continue; 407 return -ENOENT;
408 }
446 409
447 genl_unregister_mc_groups(family); 410 genl_unregister_mc_groups(family);
448 411
449 list_del(&rc->family_list); 412 idr_remove(&genl_fam_idr, family->id);
450 family->n_ops = 0;
451 up_write(&cb_lock);
452 wait_event(genl_sk_destructing_waitq,
453 atomic_read(&genl_sk_destructing_cnt) == 0);
454 genl_unlock();
455 413
456 kfree(family->attrbuf); 414 up_write(&cb_lock);
457 genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL, 0); 415 wait_event(genl_sk_destructing_waitq,
458 return 0; 416 atomic_read(&genl_sk_destructing_cnt) == 0);
459 } 417 genl_unlock();
460 418
461 genl_unlock_all(); 419 kfree(family->attrbuf);
420
421 genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL, 0);
462 422
463 return -ENOENT; 423 return 0;
464} 424}
465EXPORT_SYMBOL(genl_unregister_family); 425EXPORT_SYMBOL(genl_unregister_family);
466 426
@@ -476,7 +436,7 @@ EXPORT_SYMBOL(genl_unregister_family);
476 * Returns pointer to user specific header 436 * Returns pointer to user specific header
477 */ 437 */
478void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, 438void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
479 struct genl_family *family, int flags, u8 cmd) 439 const struct genl_family *family, int flags, u8 cmd)
480{ 440{
481 struct nlmsghdr *nlh; 441 struct nlmsghdr *nlh;
482 struct genlmsghdr *hdr; 442 struct genlmsghdr *hdr;
@@ -535,7 +495,7 @@ static int genl_lock_done(struct netlink_callback *cb)
535 return rc; 495 return rc;
536} 496}
537 497
538static int genl_family_rcv_msg(struct genl_family *family, 498static int genl_family_rcv_msg(const struct genl_family *family,
539 struct sk_buff *skb, 499 struct sk_buff *skb,
540 struct nlmsghdr *nlh) 500 struct nlmsghdr *nlh)
541{ 501{
@@ -647,7 +607,7 @@ out:
647 607
648static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) 608static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
649{ 609{
650 struct genl_family *family; 610 const struct genl_family *family;
651 int err; 611 int err;
652 612
653 family = genl_family_find_byid(nlh->nlmsg_type); 613 family = genl_family_find_byid(nlh->nlmsg_type);
@@ -676,15 +636,9 @@ static void genl_rcv(struct sk_buff *skb)
676 * Controller 636 * Controller
677 **************************************************************************/ 637 **************************************************************************/
678 638
679static struct genl_family genl_ctrl = { 639static struct genl_family genl_ctrl;
680 .id = GENL_ID_CTRL,
681 .name = "nlctrl",
682 .version = 0x2,
683 .maxattr = CTRL_ATTR_MAX,
684 .netnsok = true,
685};
686 640
687static int ctrl_fill_info(struct genl_family *family, u32 portid, u32 seq, 641static int ctrl_fill_info(const struct genl_family *family, u32 portid, u32 seq,
688 u32 flags, struct sk_buff *skb, u8 cmd) 642 u32 flags, struct sk_buff *skb, u8 cmd)
689{ 643{
690 void *hdr; 644 void *hdr;
@@ -771,7 +725,7 @@ nla_put_failure:
771 return -EMSGSIZE; 725 return -EMSGSIZE;
772} 726}
773 727
774static int ctrl_fill_mcgrp_info(struct genl_family *family, 728static int ctrl_fill_mcgrp_info(const struct genl_family *family,
775 const struct genl_multicast_group *grp, 729 const struct genl_multicast_group *grp,
776 int grp_id, u32 portid, u32 seq, u32 flags, 730 int grp_id, u32 portid, u32 seq, u32 flags,
777 struct sk_buff *skb, u8 cmd) 731 struct sk_buff *skb, u8 cmd)
@@ -814,37 +768,32 @@ nla_put_failure:
814 768
815static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) 769static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb)
816{ 770{
817 771 int n = 0;
818 int i, n = 0;
819 struct genl_family *rt; 772 struct genl_family *rt;
820 struct net *net = sock_net(skb->sk); 773 struct net *net = sock_net(skb->sk);
821 int chains_to_skip = cb->args[0]; 774 int fams_to_skip = cb->args[0];
822 int fams_to_skip = cb->args[1]; 775 unsigned int id;
823
824 for (i = chains_to_skip; i < GENL_FAM_TAB_SIZE; i++) {
825 n = 0;
826 list_for_each_entry(rt, genl_family_chain(i), family_list) {
827 if (!rt->netnsok && !net_eq(net, &init_net))
828 continue;
829 if (++n < fams_to_skip)
830 continue;
831 if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).portid,
832 cb->nlh->nlmsg_seq, NLM_F_MULTI,
833 skb, CTRL_CMD_NEWFAMILY) < 0)
834 goto errout;
835 }
836 776
837 fams_to_skip = 0; 777 idr_for_each_entry(&genl_fam_idr, rt, id) {
838 } 778 if (!rt->netnsok && !net_eq(net, &init_net))
779 continue;
839 780
840errout: 781 if (n++ < fams_to_skip)
841 cb->args[0] = i; 782 continue;
842 cb->args[1] = n;
843 783
784 if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).portid,
785 cb->nlh->nlmsg_seq, NLM_F_MULTI,
786 skb, CTRL_CMD_NEWFAMILY) < 0) {
787 n--;
788 break;
789 }
790 }
791
792 cb->args[0] = n;
844 return skb->len; 793 return skb->len;
845} 794}
846 795
847static struct sk_buff *ctrl_build_family_msg(struct genl_family *family, 796static struct sk_buff *ctrl_build_family_msg(const struct genl_family *family,
848 u32 portid, int seq, u8 cmd) 797 u32 portid, int seq, u8 cmd)
849{ 798{
850 struct sk_buff *skb; 799 struct sk_buff *skb;
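[Annotation] The dump rewrite above replaces the two-level (hash chain, family) cursor with a single counter in cb->args[0]; a hedged sketch of that dump-resume pattern, continuing the invented my_fam_idr example (my_fill_info is a hypothetical fill helper).

#include <linux/netlink.h>

int my_fill_info(struct my_family *f, struct sk_buff *skb,
		 struct netlink_callback *cb);	/* hypothetical */

static int my_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct my_family *f;
	int skip = cb->args[0], n = 0;
	unsigned int id;

	idr_for_each_entry(&my_fam_idr, f, id) {
		if (n++ < skip)
			continue;		/* already sent in an earlier pass */
		if (my_fill_info(f, skb, cb) < 0) {
			n--;			/* did not fit; retry it next time */
			break;
		}
	}
	cb->args[0] = n;	/* resume point for the next invocation */
	return skb->len;
}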
@@ -864,7 +813,7 @@ static struct sk_buff *ctrl_build_family_msg(struct genl_family *family,
864} 813}
865 814
866static struct sk_buff * 815static struct sk_buff *
867ctrl_build_mcgrp_msg(struct genl_family *family, 816ctrl_build_mcgrp_msg(const struct genl_family *family,
868 const struct genl_multicast_group *grp, 817 const struct genl_multicast_group *grp,
869 int grp_id, u32 portid, int seq, u8 cmd) 818 int grp_id, u32 portid, int seq, u8 cmd)
870{ 819{
@@ -894,7 +843,7 @@ static const struct nla_policy ctrl_policy[CTRL_ATTR_MAX+1] = {
894static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info) 843static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info)
895{ 844{
896 struct sk_buff *msg; 845 struct sk_buff *msg;
897 struct genl_family *res = NULL; 846 const struct genl_family *res = NULL;
898 int err = -EINVAL; 847 int err = -EINVAL;
899 848
900 if (info->attrs[CTRL_ATTR_FAMILY_ID]) { 849 if (info->attrs[CTRL_ATTR_FAMILY_ID]) {
@@ -938,7 +887,7 @@ static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info)
938 return genlmsg_reply(msg, info); 887 return genlmsg_reply(msg, info);
939} 888}
940 889
941static int genl_ctrl_event(int event, struct genl_family *family, 890static int genl_ctrl_event(int event, const struct genl_family *family,
942 const struct genl_multicast_group *grp, 891 const struct genl_multicast_group *grp,
943 int grp_id) 892 int grp_id)
944{ 893{
@@ -992,27 +941,39 @@ static const struct genl_multicast_group genl_ctrl_groups[] = {
992 { .name = "notify", }, 941 { .name = "notify", },
993}; 942};
994 943
944static struct genl_family genl_ctrl __ro_after_init = {
945 .module = THIS_MODULE,
946 .ops = genl_ctrl_ops,
947 .n_ops = ARRAY_SIZE(genl_ctrl_ops),
948 .mcgrps = genl_ctrl_groups,
949 .n_mcgrps = ARRAY_SIZE(genl_ctrl_groups),
950 .id = GENL_ID_CTRL,
951 .name = "nlctrl",
952 .version = 0x2,
953 .maxattr = CTRL_ATTR_MAX,
954 .netnsok = true,
955};
956
995static int genl_bind(struct net *net, int group) 957static int genl_bind(struct net *net, int group)
996{ 958{
997 int i, err = -ENOENT; 959 struct genl_family *f;
960 int err = -ENOENT;
961 unsigned int id;
998 962
999 down_read(&cb_lock); 963 down_read(&cb_lock);
1000 for (i = 0; i < GENL_FAM_TAB_SIZE; i++) { 964
1001 struct genl_family *f; 965 idr_for_each_entry(&genl_fam_idr, f, id) {
1002 966 if (group >= f->mcgrp_offset &&
1003 list_for_each_entry(f, genl_family_chain(i), family_list) { 967 group < f->mcgrp_offset + f->n_mcgrps) {
1004 if (group >= f->mcgrp_offset && 968 int fam_grp = group - f->mcgrp_offset;
1005 group < f->mcgrp_offset + f->n_mcgrps) { 969
1006 int fam_grp = group - f->mcgrp_offset; 970 if (!f->netnsok && net != &init_net)
1007 971 err = -ENOENT;
1008 if (!f->netnsok && net != &init_net) 972 else if (f->mcast_bind)
1009 err = -ENOENT; 973 err = f->mcast_bind(net, fam_grp);
1010 else if (f->mcast_bind) 974 else
1011 err = f->mcast_bind(net, fam_grp); 975 err = 0;
1012 else 976 break;
1013 err = 0;
1014 break;
1015 }
1016 } 977 }
1017 } 978 }
1018 up_read(&cb_lock); 979 up_read(&cb_lock);
@@ -1022,21 +983,19 @@ static int genl_bind(struct net *net, int group)
1022 983
1023static void genl_unbind(struct net *net, int group) 984static void genl_unbind(struct net *net, int group)
1024{ 985{
1025 int i; 986 struct genl_family *f;
987 unsigned int id;
1026 988
1027 down_read(&cb_lock); 989 down_read(&cb_lock);
1028 for (i = 0; i < GENL_FAM_TAB_SIZE; i++) {
1029 struct genl_family *f;
1030 990
1031 list_for_each_entry(f, genl_family_chain(i), family_list) { 991 idr_for_each_entry(&genl_fam_idr, f, id) {
1032 if (group >= f->mcgrp_offset && 992 if (group >= f->mcgrp_offset &&
1033 group < f->mcgrp_offset + f->n_mcgrps) { 993 group < f->mcgrp_offset + f->n_mcgrps) {
1034 int fam_grp = group - f->mcgrp_offset; 994 int fam_grp = group - f->mcgrp_offset;
1035 995
1036 if (f->mcast_unbind) 996 if (f->mcast_unbind)
1037 f->mcast_unbind(net, fam_grp); 997 f->mcast_unbind(net, fam_grp);
1038 break; 998 break;
1039 }
1040 } 999 }
1041 } 1000 }
1042 up_read(&cb_lock); 1001 up_read(&cb_lock);
@@ -1076,13 +1035,9 @@ static struct pernet_operations genl_pernet_ops = {
1076 1035
1077static int __init genl_init(void) 1036static int __init genl_init(void)
1078{ 1037{
1079 int i, err; 1038 int err;
1080
1081 for (i = 0; i < GENL_FAM_TAB_SIZE; i++)
1082 INIT_LIST_HEAD(&family_ht[i]);
1083 1039
1084 err = genl_register_family_with_ops_groups(&genl_ctrl, genl_ctrl_ops, 1040 err = genl_register_family(&genl_ctrl);
1085 genl_ctrl_groups);
1086 if (err < 0) 1041 if (err < 0)
1087 goto problem; 1042 goto problem;
1088 1043
@@ -1098,6 +1053,25 @@ problem:
1098 1053
1099subsys_initcall(genl_init); 1054subsys_initcall(genl_init);
1100 1055
1056/**
1057 * genl_family_attrbuf - return family's attrbuf
1058 * @family: the family
1059 *
1060 * Return the family's attrbuf, while validating that it's
1061 * actually valid to access it.
1062 *
1063 * You cannot use this function with a family that has parallel_ops
1064 * and you can only use it within (pre/post) doit/dumpit callbacks.
1065 */
1066struct nlattr **genl_family_attrbuf(const struct genl_family *family)
1067{
1068 if (!WARN_ON(family->parallel_ops))
1069 lockdep_assert_held(&genl_mutex);
1070
1071 return family->attrbuf;
1072}
1073EXPORT_SYMBOL(genl_family_attrbuf);
1074
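[Annotation] A sketch of how a non-parallel_ops family's dump setup might use the new genl_family_attrbuf() helper instead of touching family->attrbuf directly; the family, policy and attribute names are placeholders (the nfc conversion further down is a real in-tree user).

#include <net/genetlink.h>

#define MY_ATTR_INDEX 1				/* placeholder attribute */

extern struct genl_family my_genl_family;	/* placeholder family */
extern const struct nla_policy my_policy[];	/* placeholder policy */

static int my_dump_start(struct netlink_callback *cb)
{
	struct nlattr **attrs = genl_family_attrbuf(&my_genl_family);
	int err;

	/* Re-parse the request that triggered the dump into the shared buffer. */
	err = nlmsg_parse(cb->nlh, GENL_HDRLEN + my_genl_family.hdrsize,
			  attrs, my_genl_family.maxattr, my_policy);
	if (err < 0)
		return err;

	if (!attrs[MY_ATTR_INDEX])
		return -EINVAL;

	cb->args[0] = nla_get_u32(attrs[MY_ATTR_INDEX]);
	return 0;
}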
1101static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group, 1075static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group,
1102 gfp_t flags) 1076 gfp_t flags)
1103{ 1077{
@@ -1127,8 +1101,9 @@ static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group,
1127 return err; 1101 return err;
1128} 1102}
1129 1103
1130int genlmsg_multicast_allns(struct genl_family *family, struct sk_buff *skb, 1104int genlmsg_multicast_allns(const struct genl_family *family,
1131 u32 portid, unsigned int group, gfp_t flags) 1105 struct sk_buff *skb, u32 portid,
1106 unsigned int group, gfp_t flags)
1132{ 1107{
1133 if (WARN_ON_ONCE(group >= family->n_mcgrps)) 1108 if (WARN_ON_ONCE(group >= family->n_mcgrps))
1134 return -EINVAL; 1109 return -EINVAL;
@@ -1137,7 +1112,7 @@ int genlmsg_multicast_allns(struct genl_family *family, struct sk_buff *skb,
1137} 1112}
1138EXPORT_SYMBOL(genlmsg_multicast_allns); 1113EXPORT_SYMBOL(genlmsg_multicast_allns);
1139 1114
1140void genl_notify(struct genl_family *family, struct sk_buff *skb, 1115void genl_notify(const struct genl_family *family, struct sk_buff *skb,
1141 struct genl_info *info, u32 group, gfp_t flags) 1116 struct genl_info *info, u32 group, gfp_t flags)
1142{ 1117{
1143 struct net *net = genl_info_net(info); 1118 struct net *net = genl_info_net(info);
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index ed212ffc1d9d..ebf16f7f9089 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -17,7 +17,7 @@
17#include <linux/in.h> 17#include <linux/in.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/sched.h> 20#include <linux/sched/signal.h>
21#include <linux/timer.h> 21#include <linux/timer.h>
22#include <linux/string.h> 22#include <linux/string.h>
23#include <linux/sockios.h> 23#include <linux/sockios.h>
@@ -765,7 +765,8 @@ out_release:
765 return err; 765 return err;
766} 766}
767 767
768static int nr_accept(struct socket *sock, struct socket *newsock, int flags) 768static int nr_accept(struct socket *sock, struct socket *newsock, int flags,
769 bool kern)
769{ 770{
770 struct sk_buff *skb; 771 struct sk_buff *skb;
771 struct sock *newsk; 772 struct sock *newsk;
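[Annotation] The extra bool argument threaded through ->accept() in this and the following hunks presumably lets a protocol tell in-kernel accepts (kernel_accept()) apart from userspace accept(2); a minimal sketch of the updated callback shape, with protocol details elided and names invented.

#include <linux/net.h>
#include <net/sock.h>

static int my_accept(struct socket *sock, struct socket *newsock, int flags,
		     bool kern)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return -EINVAL;

	/* 'kern' is true when the request comes from kernel_accept(); a
	 * protocol can use it, e.g., to pick the appropriate lockdep class
	 * or security labelling for the new socket.
	 */

	/* ... dequeue an established connection and graft it onto newsock ... */
	return 0;
}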
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index b9edf5fae6ae..2ffb18e73df6 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -21,6 +21,7 @@
21#include <linux/kernel.h> 21#include <linux/kernel.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/nfc.h> 23#include <linux/nfc.h>
24#include <linux/sched/signal.h>
24 25
25#include "nfc.h" 26#include "nfc.h"
26#include "llcp.h" 27#include "llcp.h"
@@ -440,7 +441,7 @@ struct sock *nfc_llcp_accept_dequeue(struct sock *parent,
440} 441}
441 442
442static int llcp_sock_accept(struct socket *sock, struct socket *newsock, 443static int llcp_sock_accept(struct socket *sock, struct socket *newsock,
443 int flags) 444 int flags, bool kern)
444{ 445{
445 DECLARE_WAITQUEUE(wait, current); 446 DECLARE_WAITQUEUE(wait, current);
446 struct sock *sk = sock->sk, *new_sk; 447 struct sock *sk = sock->sk, *new_sk;
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index ea023b35f1c2..03f3d5c7beb8 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -38,14 +38,7 @@ static const struct genl_multicast_group nfc_genl_mcgrps[] = {
38 { .name = NFC_GENL_MCAST_EVENT_NAME, }, 38 { .name = NFC_GENL_MCAST_EVENT_NAME, },
39}; 39};
40 40
41static struct genl_family nfc_genl_family = { 41static struct genl_family nfc_genl_family;
42 .id = GENL_ID_GENERATE,
43 .hdrsize = 0,
44 .name = NFC_GENL_NAME,
45 .version = NFC_GENL_VERSION,
46 .maxattr = NFC_ATTR_MAX,
47};
48
49static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = { 42static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = {
50 [NFC_ATTR_DEVICE_INDEX] = { .type = NLA_U32 }, 43 [NFC_ATTR_DEVICE_INDEX] = { .type = NLA_U32 },
51 [NFC_ATTR_DEVICE_NAME] = { .type = NLA_STRING, 44 [NFC_ATTR_DEVICE_NAME] = { .type = NLA_STRING,
@@ -120,21 +113,20 @@ nla_put_failure:
120 113
121static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb) 114static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb)
122{ 115{
116 struct nlattr **attrbuf = genl_family_attrbuf(&nfc_genl_family);
123 struct nfc_dev *dev; 117 struct nfc_dev *dev;
124 int rc; 118 int rc;
125 u32 idx; 119 u32 idx;
126 120
127 rc = nlmsg_parse(cb->nlh, GENL_HDRLEN + nfc_genl_family.hdrsize, 121 rc = nlmsg_parse(cb->nlh, GENL_HDRLEN + nfc_genl_family.hdrsize,
128 nfc_genl_family.attrbuf, 122 attrbuf, nfc_genl_family.maxattr, nfc_genl_policy);
129 nfc_genl_family.maxattr,
130 nfc_genl_policy);
131 if (rc < 0) 123 if (rc < 0)
132 return ERR_PTR(rc); 124 return ERR_PTR(rc);
133 125
134 if (!nfc_genl_family.attrbuf[NFC_ATTR_DEVICE_INDEX]) 126 if (!attrbuf[NFC_ATTR_DEVICE_INDEX])
135 return ERR_PTR(-EINVAL); 127 return ERR_PTR(-EINVAL);
136 128
137 idx = nla_get_u32(nfc_genl_family.attrbuf[NFC_ATTR_DEVICE_INDEX]); 129 idx = nla_get_u32(attrbuf[NFC_ATTR_DEVICE_INDEX]);
138 130
139 dev = nfc_get_device(idx); 131 dev = nfc_get_device(idx);
140 if (!dev) 132 if (!dev)
@@ -1754,6 +1746,18 @@ static const struct genl_ops nfc_genl_ops[] = {
1754 }, 1746 },
1755}; 1747};
1756 1748
1749static struct genl_family nfc_genl_family __ro_after_init = {
1750 .hdrsize = 0,
1751 .name = NFC_GENL_NAME,
1752 .version = NFC_GENL_VERSION,
1753 .maxattr = NFC_ATTR_MAX,
1754 .module = THIS_MODULE,
1755 .ops = nfc_genl_ops,
1756 .n_ops = ARRAY_SIZE(nfc_genl_ops),
1757 .mcgrps = nfc_genl_mcgrps,
1758 .n_mcgrps = ARRAY_SIZE(nfc_genl_mcgrps),
1759};
1760
1757 1761
1758struct urelease_work { 1762struct urelease_work {
1759 struct work_struct w; 1763 struct work_struct w;
@@ -1839,9 +1843,7 @@ int __init nfc_genl_init(void)
1839{ 1843{
1840 int rc; 1844 int rc;
1841 1845
1842 rc = genl_register_family_with_ops_groups(&nfc_genl_family, 1846 rc = genl_register_family(&nfc_genl_family);
1843 nfc_genl_ops,
1844 nfc_genl_mcgrps);
1845 if (rc) 1847 if (rc)
1846 return rc; 1848 return rc;
1847 1849
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 4e03f64709bc..c82301ce3fff 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -62,9 +62,11 @@ struct ovs_frag_data {
62 struct vport *vport; 62 struct vport *vport;
63 struct ovs_skb_cb cb; 63 struct ovs_skb_cb cb;
64 __be16 inner_protocol; 64 __be16 inner_protocol;
65 __u16 vlan_tci; 65 u16 network_offset; /* valid only for MPLS */
66 u16 vlan_tci;
66 __be16 vlan_proto; 67 __be16 vlan_proto;
67 unsigned int l2_len; 68 unsigned int l2_len;
69 u8 mac_proto;
68 u8 l2_data[MAX_L2_LEN]; 70 u8 l2_data[MAX_L2_LEN];
69}; 71};
70 72
@@ -136,12 +138,12 @@ static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
136 138
137static void invalidate_flow_key(struct sw_flow_key *key) 139static void invalidate_flow_key(struct sw_flow_key *key)
138{ 140{
139 key->eth.type = htons(0); 141 key->mac_proto |= SW_FLOW_KEY_INVALID;
140} 142}
141 143
142static bool is_flow_key_valid(const struct sw_flow_key *key) 144static bool is_flow_key_valid(const struct sw_flow_key *key)
143{ 145{
144 return !!key->eth.type; 146 return !(key->mac_proto & SW_FLOW_KEY_INVALID);
145} 147}
146 148
147static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr, 149static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
@@ -185,7 +187,8 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
185 187
186 skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN); 188 skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);
187 189
188 update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype); 190 if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET)
191 update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype);
189 skb->protocol = mpls->mpls_ethertype; 192 skb->protocol = mpls->mpls_ethertype;
190 193
191 invalidate_flow_key(key); 194 invalidate_flow_key(key);
@@ -195,7 +198,6 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
195static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, 198static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
196 const __be16 ethertype) 199 const __be16 ethertype)
197{ 200{
198 struct ethhdr *hdr;
199 int err; 201 int err;
200 202
201 err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); 203 err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
@@ -211,11 +213,15 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
211 skb_reset_mac_header(skb); 213 skb_reset_mac_header(skb);
212 skb_set_network_header(skb, skb->mac_len); 214 skb_set_network_header(skb, skb->mac_len);
213 215
214 /* mpls_hdr() is used to locate the ethertype field correctly in the 216 if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET) {
215 * presence of VLAN tags. 217 struct ethhdr *hdr;
216 */ 218
217 hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); 219 /* mpls_hdr() is used to locate the ethertype field correctly in the
218 update_ethertype(skb, hdr, ethertype); 220 * presence of VLAN tags.
221 */
222 hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
223 update_ethertype(skb, hdr, ethertype);
224 }
219 if (eth_p_mpls(skb->protocol)) 225 if (eth_p_mpls(skb->protocol))
220 skb->protocol = ethertype; 226 skb->protocol = ethertype;
221 227
@@ -311,6 +317,47 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key,
311 return 0; 317 return 0;
312} 318}
313 319
320/* pop_eth does not support VLAN packets as this action is never called
321 * for them.
322 */
323static int pop_eth(struct sk_buff *skb, struct sw_flow_key *key)
324{
325 skb_pull_rcsum(skb, ETH_HLEN);
326 skb_reset_mac_header(skb);
327 skb_reset_mac_len(skb);
328
329 /* safe right before invalidate_flow_key */
330 key->mac_proto = MAC_PROTO_NONE;
331 invalidate_flow_key(key);
332 return 0;
333}
334
335static int push_eth(struct sk_buff *skb, struct sw_flow_key *key,
336 const struct ovs_action_push_eth *ethh)
337{
338 struct ethhdr *hdr;
339
340 /* Add the new Ethernet header */
341 if (skb_cow_head(skb, ETH_HLEN) < 0)
342 return -ENOMEM;
343
344 skb_push(skb, ETH_HLEN);
345 skb_reset_mac_header(skb);
346 skb_reset_mac_len(skb);
347
348 hdr = eth_hdr(skb);
349 ether_addr_copy(hdr->h_source, ethh->addresses.eth_src);
350 ether_addr_copy(hdr->h_dest, ethh->addresses.eth_dst);
351 hdr->h_proto = skb->protocol;
352
353 skb_postpush_rcsum(skb, hdr, ETH_HLEN);
354
355 /* safe right before invalidate_flow_key */
356 key->mac_proto = MAC_PROTO_ETHERNET;
357 invalidate_flow_key(key);
358 return 0;
359}
360
314static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh, 361static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh,
315 __be32 addr, __be32 new_addr) 362 __be32 addr, __be32 new_addr)
316{ 363{
@@ -666,7 +713,13 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *sk
666 skb_postpush_rcsum(skb, skb->data, data->l2_len); 713 skb_postpush_rcsum(skb, skb->data, data->l2_len);
667 skb_reset_mac_header(skb); 714 skb_reset_mac_header(skb);
668 715
669 ovs_vport_send(vport, skb); 716 if (eth_p_mpls(skb->protocol)) {
717 skb->inner_network_header = skb->network_header;
718 skb_set_network_header(skb, data->network_offset);
719 skb_reset_mac_len(skb);
720 }
721
722 ovs_vport_send(vport, skb, data->mac_proto);
670 return 0; 723 return 0;
671} 724}
672 725
@@ -684,7 +737,8 @@ static struct dst_ops ovs_dst_ops = {
684/* prepare_frag() is called once per (larger-than-MTU) frame; its inverse is 737/* prepare_frag() is called once per (larger-than-MTU) frame; its inverse is
685 * ovs_vport_output(), which is called once per fragmented packet. 738 * ovs_vport_output(), which is called once per fragmented packet.
686 */ 739 */
687static void prepare_frag(struct vport *vport, struct sk_buff *skb) 740static void prepare_frag(struct vport *vport, struct sk_buff *skb,
741 u16 orig_network_offset, u8 mac_proto)
688{ 742{
689 unsigned int hlen = skb_network_offset(skb); 743 unsigned int hlen = skb_network_offset(skb);
690 struct ovs_frag_data *data; 744 struct ovs_frag_data *data;
@@ -694,8 +748,10 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb)
694 data->vport = vport; 748 data->vport = vport;
695 data->cb = *OVS_CB(skb); 749 data->cb = *OVS_CB(skb);
696 data->inner_protocol = skb->inner_protocol; 750 data->inner_protocol = skb->inner_protocol;
751 data->network_offset = orig_network_offset;
697 data->vlan_tci = skb->vlan_tci; 752 data->vlan_tci = skb->vlan_tci;
698 data->vlan_proto = skb->vlan_proto; 753 data->vlan_proto = skb->vlan_proto;
754 data->mac_proto = mac_proto;
699 data->l2_len = hlen; 755 data->l2_len = hlen;
700 memcpy(&data->l2_data, skb->data, hlen); 756 memcpy(&data->l2_data, skb->data, hlen);
701 757
@@ -704,18 +760,27 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb)
704} 760}
705 761
706static void ovs_fragment(struct net *net, struct vport *vport, 762static void ovs_fragment(struct net *net, struct vport *vport,
707 struct sk_buff *skb, u16 mru, __be16 ethertype) 763 struct sk_buff *skb, u16 mru,
764 struct sw_flow_key *key)
708{ 765{
766 u16 orig_network_offset = 0;
767
768 if (eth_p_mpls(skb->protocol)) {
769 orig_network_offset = skb_network_offset(skb);
770 skb->network_header = skb->inner_network_header;
771 }
772
709 if (skb_network_offset(skb) > MAX_L2_LEN) { 773 if (skb_network_offset(skb) > MAX_L2_LEN) {
710 OVS_NLERR(1, "L2 header too long to fragment"); 774 OVS_NLERR(1, "L2 header too long to fragment");
711 goto err; 775 goto err;
712 } 776 }
713 777
714 if (ethertype == htons(ETH_P_IP)) { 778 if (key->eth.type == htons(ETH_P_IP)) {
715 struct dst_entry ovs_dst; 779 struct dst_entry ovs_dst;
716 unsigned long orig_dst; 780 unsigned long orig_dst;
717 781
718 prepare_frag(vport, skb); 782 prepare_frag(vport, skb, orig_network_offset,
783 ovs_key_mac_proto(key));
719 dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1, 784 dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1,
720 DST_OBSOLETE_NONE, DST_NOCOUNT); 785 DST_OBSOLETE_NONE, DST_NOCOUNT);
721 ovs_dst.dev = vport->dev; 786 ovs_dst.dev = vport->dev;
@@ -726,16 +791,16 @@ static void ovs_fragment(struct net *net, struct vport *vport,
726 791
727 ip_do_fragment(net, skb->sk, skb, ovs_vport_output); 792 ip_do_fragment(net, skb->sk, skb, ovs_vport_output);
728 refdst_drop(orig_dst); 793 refdst_drop(orig_dst);
729 } else if (ethertype == htons(ETH_P_IPV6)) { 794 } else if (key->eth.type == htons(ETH_P_IPV6)) {
730 const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); 795 const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
731 unsigned long orig_dst; 796 unsigned long orig_dst;
732 struct rt6_info ovs_rt; 797 struct rt6_info ovs_rt;
733 798
734 if (!v6ops) { 799 if (!v6ops)
735 goto err; 800 goto err;
736 }
737 801
738 prepare_frag(vport, skb); 802 prepare_frag(vport, skb, orig_network_offset,
803 ovs_key_mac_proto(key));
739 memset(&ovs_rt, 0, sizeof(ovs_rt)); 804 memset(&ovs_rt, 0, sizeof(ovs_rt));
740 dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1, 805 dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1,
741 DST_OBSOLETE_NONE, DST_NOCOUNT); 806 DST_OBSOLETE_NONE, DST_NOCOUNT);
@@ -749,7 +814,7 @@ static void ovs_fragment(struct net *net, struct vport *vport,
749 refdst_drop(orig_dst); 814 refdst_drop(orig_dst);
750 } else { 815 } else {
751 WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.", 816 WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.",
752 ovs_vport_name(vport), ntohs(ethertype), mru, 817 ovs_vport_name(vport), ntohs(key->eth.type), mru,
753 vport->dev->mtu); 818 vport->dev->mtu);
754 goto err; 819 goto err;
755 } 820 }
@@ -769,26 +834,19 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
769 u32 cutlen = OVS_CB(skb)->cutlen; 834 u32 cutlen = OVS_CB(skb)->cutlen;
770 835
771 if (unlikely(cutlen > 0)) { 836 if (unlikely(cutlen > 0)) {
772 if (skb->len - cutlen > ETH_HLEN) 837 if (skb->len - cutlen > ovs_mac_header_len(key))
773 pskb_trim(skb, skb->len - cutlen); 838 pskb_trim(skb, skb->len - cutlen);
774 else 839 else
775 pskb_trim(skb, ETH_HLEN); 840 pskb_trim(skb, ovs_mac_header_len(key));
776 } 841 }
777 842
778 if (likely(!mru || (skb->len <= mru + ETH_HLEN))) { 843 if (likely(!mru ||
779 ovs_vport_send(vport, skb); 844 (skb->len <= mru + vport->dev->hard_header_len))) {
845 ovs_vport_send(vport, skb, ovs_key_mac_proto(key));
780 } else if (mru <= vport->dev->mtu) { 846 } else if (mru <= vport->dev->mtu) {
781 struct net *net = read_pnet(&dp->net); 847 struct net *net = read_pnet(&dp->net);
782 __be16 ethertype = key->eth.type;
783 848
784 if (!is_flow_key_valid(key)) { 849 ovs_fragment(net, vport, skb, mru, key);
785 if (eth_p_mpls(skb->protocol))
786 ethertype = skb->inner_protocol;
787 else
788 ethertype = vlan_get_protocol(skb);
789 }
790
791 ovs_fragment(net, vport, skb, mru, ethertype);
792 } else { 850 } else {
793 kfree_skb(skb); 851 kfree_skb(skb);
794 } 852 }
@@ -1015,6 +1073,8 @@ static int execute_masked_set_action(struct sk_buff *skb,
1015 case OVS_KEY_ATTR_CT_ZONE: 1073 case OVS_KEY_ATTR_CT_ZONE:
1016 case OVS_KEY_ATTR_CT_MARK: 1074 case OVS_KEY_ATTR_CT_MARK:
1017 case OVS_KEY_ATTR_CT_LABELS: 1075 case OVS_KEY_ATTR_CT_LABELS:
1076 case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4:
1077 case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6:
1018 err = -EINVAL; 1078 err = -EINVAL;
1019 break; 1079 break;
1020 } 1080 }
@@ -1082,12 +1142,6 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
1082 struct sw_flow_key *key, 1142 struct sw_flow_key *key,
1083 const struct nlattr *attr, int len) 1143 const struct nlattr *attr, int len)
1084{ 1144{
1085 /* Every output action needs a separate clone of 'skb', but the common
1086 * case is just a single output action, so that doing a clone and
1087 * then freeing the original skbuff is wasteful. So the following code
1088 * is slightly obscure just to avoid that.
1089 */
1090 int prev_port = -1;
1091 const struct nlattr *a; 1145 const struct nlattr *a;
1092 int rem; 1146 int rem;
1093 1147
@@ -1095,20 +1149,28 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
1095 a = nla_next(a, &rem)) { 1149 a = nla_next(a, &rem)) {
1096 int err = 0; 1150 int err = 0;
1097 1151
1098 if (unlikely(prev_port != -1)) { 1152 switch (nla_type(a)) {
1099 struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC); 1153 case OVS_ACTION_ATTR_OUTPUT: {
1100 1154 int port = nla_get_u32(a);
1101 if (out_skb) 1155 struct sk_buff *clone;
1102 do_output(dp, out_skb, prev_port, key); 1156
1157 /* Every output action needs a separate clone
1158 * of 'skb'. In case the output action is the
1159 * last action, cloning can be avoided.
1160 */
1161 if (nla_is_last(a, rem)) {
1162 do_output(dp, skb, port, key);
1163 /* 'skb' has been used for output.
1164 */
1165 return 0;
1166 }
1103 1167
1168 clone = skb_clone(skb, GFP_ATOMIC);
1169 if (clone)
1170 do_output(dp, clone, port, key);
1104 OVS_CB(skb)->cutlen = 0; 1171 OVS_CB(skb)->cutlen = 0;
1105 prev_port = -1;
1106 }
1107
1108 switch (nla_type(a)) {
1109 case OVS_ACTION_ATTR_OUTPUT:
1110 prev_port = nla_get_u32(a);
1111 break; 1172 break;
1173 }
1112 1174
1113 case OVS_ACTION_ATTR_TRUNC: { 1175 case OVS_ACTION_ATTR_TRUNC: {
1114 struct ovs_action_trunc *trunc = nla_data(a); 1176 struct ovs_action_trunc *trunc = nla_data(a);
@@ -1182,6 +1244,14 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
1182 if (err) 1244 if (err)
1183 return err == -EINPROGRESS ? 0 : err; 1245 return err == -EINPROGRESS ? 0 : err;
1184 break; 1246 break;
1247
1248 case OVS_ACTION_ATTR_PUSH_ETH:
1249 err = push_eth(skb, key, nla_data(a));
1250 break;
1251
1252 case OVS_ACTION_ATTR_POP_ETH:
1253 err = pop_eth(skb, key);
1254 break;
1185 } 1255 }
1186 1256
1187 if (unlikely(err)) { 1257 if (unlikely(err)) {
@@ -1190,11 +1260,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
1190 } 1260 }
1191 } 1261 }
1192 1262
1193 if (prev_port != -1) 1263 consume_skb(skb);
1194 do_output(dp, skb, prev_port, key);
1195 else
1196 consume_skb(skb);
1197
1198 return 0; 1264 return 0;
1199} 1265}
1200 1266
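[Annotation] A condensed sketch (helper and attribute names invented) of the nla_is_last() pattern adopted above: clone the skb for every output except the final one, which consumes the original and so avoids any clone in the common single-output case.

#include <net/netlink.h>
#include <linux/skbuff.h>

#define MY_ACTION_OUTPUT 1			/* placeholder attribute type */

void my_output(struct sk_buff *skb, u32 port);	/* hypothetical, consumes skb */

static int my_execute(struct sk_buff *skb, const struct nlattr *attr, int len)
{
	const struct nlattr *a;
	int rem;

	nla_for_each_attr(a, attr, len, rem) {
		if (nla_type(a) != MY_ACTION_OUTPUT)
			continue;

		if (nla_is_last(a, rem)) {
			/* Last action: hand over the original skb, no clone. */
			my_output(skb, nla_get_u32(a));
			return 0;
		} else {
			struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);

			if (clone)
				my_output(clone, nla_get_u32(a));
		}
	}
	consume_skb(skb);
	return 0;
}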
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index fecefa2dc94e..7b2c2fce408a 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -65,6 +65,7 @@ struct ovs_conntrack_info {
65 struct nf_conn *ct; 65 struct nf_conn *ct;
66 u8 commit : 1; 66 u8 commit : 1;
67 u8 nat : 3; /* enum ovs_ct_nat */ 67 u8 nat : 3; /* enum ovs_ct_nat */
68 u8 force : 1;
68 u16 family; 69 u16 family;
69 struct md_mark mark; 70 struct md_mark mark;
70 struct md_labels labels; 71 struct md_labels labels;
@@ -73,6 +74,8 @@ struct ovs_conntrack_info {
73#endif 74#endif
74}; 75};
75 76
77static bool labels_nonzero(const struct ovs_key_ct_labels *labels);
78
76static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info); 79static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
77 80
78static u16 key_to_nfproto(const struct sw_flow_key *key) 81static u16 key_to_nfproto(const struct sw_flow_key *key)
@@ -129,21 +132,33 @@ static u32 ovs_ct_get_mark(const struct nf_conn *ct)
129#endif 132#endif
130} 133}
131 134
135/* Guard against conntrack labels max size shrinking below 128 bits. */
136#if NF_CT_LABELS_MAX_SIZE < 16
137#error NF_CT_LABELS_MAX_SIZE must be at least 16 bytes
138#endif
139
132static void ovs_ct_get_labels(const struct nf_conn *ct, 140static void ovs_ct_get_labels(const struct nf_conn *ct,
133 struct ovs_key_ct_labels *labels) 141 struct ovs_key_ct_labels *labels)
134{ 142{
135 struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL; 143 struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL;
136 144
137 if (cl) { 145 if (cl)
138 size_t len = sizeof(cl->bits); 146 memcpy(labels, cl->bits, OVS_CT_LABELS_LEN);
147 else
148 memset(labels, 0, OVS_CT_LABELS_LEN);
149}
139 150
140 if (len > OVS_CT_LABELS_LEN) 151static void __ovs_ct_update_key_orig_tp(struct sw_flow_key *key,
141 len = OVS_CT_LABELS_LEN; 152 const struct nf_conntrack_tuple *orig,
142 else if (len < OVS_CT_LABELS_LEN) 153 u8 icmp_proto)
143 memset(labels, 0, OVS_CT_LABELS_LEN); 154{
144 memcpy(labels, cl->bits, len); 155 key->ct_orig_proto = orig->dst.protonum;
156 if (orig->dst.protonum == icmp_proto) {
157 key->ct.orig_tp.src = htons(orig->dst.u.icmp.type);
158 key->ct.orig_tp.dst = htons(orig->dst.u.icmp.code);
145 } else { 159 } else {
146 memset(labels, 0, OVS_CT_LABELS_LEN); 160 key->ct.orig_tp.src = orig->src.u.all;
161 key->ct.orig_tp.dst = orig->dst.u.all;
147 } 162 }
148} 163}
149 164
@@ -151,13 +166,42 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
151 const struct nf_conntrack_zone *zone, 166 const struct nf_conntrack_zone *zone,
152 const struct nf_conn *ct) 167 const struct nf_conn *ct)
153{ 168{
154 key->ct.state = state; 169 key->ct_state = state;
155 key->ct.zone = zone->id; 170 key->ct_zone = zone->id;
156 key->ct.mark = ovs_ct_get_mark(ct); 171 key->ct.mark = ovs_ct_get_mark(ct);
157 ovs_ct_get_labels(ct, &key->ct.labels); 172 ovs_ct_get_labels(ct, &key->ct.labels);
173
174 if (ct) {
175 const struct nf_conntrack_tuple *orig;
176
177 /* Use the master if we have one. */
178 if (ct->master)
179 ct = ct->master;
180 orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
181
182 /* IP version must match with the master connection. */
183 if (key->eth.type == htons(ETH_P_IP) &&
184 nf_ct_l3num(ct) == NFPROTO_IPV4) {
185 key->ipv4.ct_orig.src = orig->src.u3.ip;
186 key->ipv4.ct_orig.dst = orig->dst.u3.ip;
187 __ovs_ct_update_key_orig_tp(key, orig, IPPROTO_ICMP);
188 return;
189 } else if (key->eth.type == htons(ETH_P_IPV6) &&
190 !sw_flow_key_is_nd(key) &&
191 nf_ct_l3num(ct) == NFPROTO_IPV6) {
192 key->ipv6.ct_orig.src = orig->src.u3.in6;
193 key->ipv6.ct_orig.dst = orig->dst.u3.in6;
194 __ovs_ct_update_key_orig_tp(key, orig, NEXTHDR_ICMP);
195 return;
196 }
197 }
198 /* Clear 'ct_orig_proto' to mark the non-existence of conntrack
199 * original direction key fields.
200 */
201 key->ct_orig_proto = 0;
158} 202}
159 203
160/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has 204/* Update 'key' based on skb->_nfct. If 'post_ct' is true, then OVS has
161 * previously sent the packet to conntrack via the ct action. If 205 * previously sent the packet to conntrack via the ct action. If
162 * 'keep_nat_flags' is true, the existing NAT flags retained, else they are 206 * 'keep_nat_flags' is true, the existing NAT flags retained, else they are
163 * initialized from the connection status. 207 * initialized from the connection status.
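[Annotation] For orientation, a hedged sketch (function name invented) of where the new original-direction key fields come from: the conntrack entry's IP_CT_DIR_ORIGINAL tuple, taken from the master entry for related connections, much like the helper added above.

#include <net/netfilter/nf_conntrack.h>

/* Illustrative only: copy the original-direction IPv4 addresses and ports
 * out of a conntrack entry, falling back to the master for related flows.
 */
static void my_read_orig_tuple(const struct nf_conn *ct,
			       __be32 *src, __be32 *dst,
			       __be16 *sport, __be16 *dport)
{
	const struct nf_conntrack_tuple *orig;

	if (ct->master)
		ct = ct->master;
	orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;

	*src = orig->src.u3.ip;
	*dst = orig->dst.u3.ip;
	*sport = orig->src.u.all;
	*dport = orig->dst.u.all;
}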
@@ -184,7 +228,7 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
184 if (ct->master) 228 if (ct->master)
185 state |= OVS_CS_F_RELATED; 229 state |= OVS_CS_F_RELATED;
186 if (keep_nat_flags) { 230 if (keep_nat_flags) {
187 state |= key->ct.state & OVS_CS_F_NAT_MASK; 231 state |= key->ct_state & OVS_CS_F_NAT_MASK;
188 } else { 232 } else {
189 if (ct->status & IPS_SRC_NAT) 233 if (ct->status & IPS_SRC_NAT)
190 state |= OVS_CS_F_SRC_NAT; 234 state |= OVS_CS_F_SRC_NAT;
@@ -208,44 +252,69 @@ void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
208 ovs_ct_update_key(skb, NULL, key, false, false); 252 ovs_ct_update_key(skb, NULL, key, false, false);
209} 253}
210 254
211int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) 255#define IN6_ADDR_INITIALIZER(ADDR) \
256 { (ADDR).s6_addr32[0], (ADDR).s6_addr32[1], \
257 (ADDR).s6_addr32[2], (ADDR).s6_addr32[3] }
258
259int ovs_ct_put_key(const struct sw_flow_key *swkey,
260 const struct sw_flow_key *output, struct sk_buff *skb)
212{ 261{
213 if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state)) 262 if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, output->ct_state))
214 return -EMSGSIZE; 263 return -EMSGSIZE;
215 264
216 if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && 265 if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
217 nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key->ct.zone)) 266 nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, output->ct_zone))
218 return -EMSGSIZE; 267 return -EMSGSIZE;
219 268
220 if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && 269 if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
221 nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark)) 270 nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, output->ct.mark))
222 return -EMSGSIZE; 271 return -EMSGSIZE;
223 272
224 if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && 273 if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
225 nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(key->ct.labels), 274 nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(output->ct.labels),
226 &key->ct.labels)) 275 &output->ct.labels))
227 return -EMSGSIZE; 276 return -EMSGSIZE;
228 277
278 if (swkey->ct_orig_proto) {
279 if (swkey->eth.type == htons(ETH_P_IP)) {
280 struct ovs_key_ct_tuple_ipv4 orig = {
281 output->ipv4.ct_orig.src,
282 output->ipv4.ct_orig.dst,
283 output->ct.orig_tp.src,
284 output->ct.orig_tp.dst,
285 output->ct_orig_proto,
286 };
287 if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4,
288 sizeof(orig), &orig))
289 return -EMSGSIZE;
290 } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
291 struct ovs_key_ct_tuple_ipv6 orig = {
292 IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.src),
293 IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.dst),
294 output->ct.orig_tp.src,
295 output->ct.orig_tp.dst,
296 output->ct_orig_proto,
297 };
298 if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6,
299 sizeof(orig), &orig))
300 return -EMSGSIZE;
301 }
302 }
303
229 return 0; 304 return 0;
230} 305}
231 306
232static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key, 307static int ovs_ct_set_mark(struct nf_conn *ct, struct sw_flow_key *key,
233 u32 ct_mark, u32 mask) 308 u32 ct_mark, u32 mask)
234{ 309{
235#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) 310#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
236 enum ip_conntrack_info ctinfo;
237 struct nf_conn *ct;
238 u32 new_mark; 311 u32 new_mark;
239 312
240 /* The connection could be invalid, in which case set_mark is no-op. */
241 ct = nf_ct_get(skb, &ctinfo);
242 if (!ct)
243 return 0;
244
245 new_mark = ct_mark | (ct->mark & ~(mask)); 313 new_mark = ct_mark | (ct->mark & ~(mask));
246 if (ct->mark != new_mark) { 314 if (ct->mark != new_mark) {
247 ct->mark = new_mark; 315 ct->mark = new_mark;
248 nf_conntrack_event_cache(IPCT_MARK, ct); 316 if (nf_ct_is_confirmed(ct))
317 nf_conntrack_event_cache(IPCT_MARK, ct);
249 key->ct.mark = new_mark; 318 key->ct.mark = new_mark;
250 } 319 }
251 320
@@ -255,34 +324,83 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
255#endif 324#endif
256} 325}
257 326
258static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key, 327static struct nf_conn_labels *ovs_ct_get_conn_labels(struct nf_conn *ct)
259 const struct ovs_key_ct_labels *labels,
260 const struct ovs_key_ct_labels *mask)
261{ 328{
262 enum ip_conntrack_info ctinfo;
263 struct nf_conn_labels *cl; 329 struct nf_conn_labels *cl;
264 struct nf_conn *ct;
265 int err;
266
267 /* The connection could be invalid, in which case set_label is no-op.*/
268 ct = nf_ct_get(skb, &ctinfo);
269 if (!ct)
270 return 0;
271 330
272 cl = nf_ct_labels_find(ct); 331 cl = nf_ct_labels_find(ct);
273 if (!cl) { 332 if (!cl) {
274 nf_ct_labels_ext_add(ct); 333 nf_ct_labels_ext_add(ct);
275 cl = nf_ct_labels_find(ct); 334 cl = nf_ct_labels_find(ct);
276 } 335 }
277 if (!cl || sizeof(cl->bits) < OVS_CT_LABELS_LEN) 336
337 return cl;
338}
339
340/* Initialize labels for a new, yet to be committed conntrack entry. Note that
341 * since the new connection is not yet confirmed, and thus no-one else has
342 * access to its labels, we simply write them over.
343 */
344static int ovs_ct_init_labels(struct nf_conn *ct, struct sw_flow_key *key,
345 const struct ovs_key_ct_labels *labels,
346 const struct ovs_key_ct_labels *mask)
347{
348 struct nf_conn_labels *cl, *master_cl;
349 bool have_mask = labels_nonzero(mask);
350
351 /* Inherit master's labels to the related connection? */
352 master_cl = ct->master ? nf_ct_labels_find(ct->master) : NULL;
353
354 if (!master_cl && !have_mask)
355 return 0; /* Nothing to do. */
356
357 cl = ovs_ct_get_conn_labels(ct);
358 if (!cl)
278 return -ENOSPC; 359 return -ENOSPC;
279 360
280 err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask, 361 /* Inherit the master's labels, if any. */
281 OVS_CT_LABELS_LEN / sizeof(u32)); 362 if (master_cl)
363 *cl = *master_cl;
364
365 if (have_mask) {
366 u32 *dst = (u32 *)cl->bits;
367 int i;
368
369 for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
370 dst[i] = (dst[i] & ~mask->ct_labels_32[i]) |
371 (labels->ct_labels_32[i]
372 & mask->ct_labels_32[i]);
373 }
374
375 /* Labels are included in the IPCTNL_MSG_CT_NEW event only if the
376 * IPCT_LABEL bit is set in the event cache.
377 */
378 nf_conntrack_event_cache(IPCT_LABEL, ct);
379
380 memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);
381
382 return 0;
383}
384
385static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key,
386 const struct ovs_key_ct_labels *labels,
387 const struct ovs_key_ct_labels *mask)
388{
389 struct nf_conn_labels *cl;
390 int err;
391
392 cl = ovs_ct_get_conn_labels(ct);
393 if (!cl)
394 return -ENOSPC;
395
396 err = nf_connlabels_replace(ct, labels->ct_labels_32,
397 mask->ct_labels_32,
398 OVS_CT_LABELS_LEN_32);
282 if (err) 399 if (err)
283 return err; 400 return err;
284 401
285 ovs_ct_get_labels(ct, &key->ct.labels); 402 memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);
403
286 return 0; 404 return 0;
287} 405}
288 406
@@ -367,7 +485,6 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,
367 } else if (key->eth.type == htons(ETH_P_IPV6)) { 485 } else if (key->eth.type == htons(ETH_P_IPV6)) {
368 enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; 486 enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
369 487
370 skb_orphan(skb);
371 memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); 488 memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
372 err = nf_ct_frag6_gather(net, skb, user); 489 err = nf_ct_frag6_gather(net, skb, user);
373 if (err) { 490 if (err) {
@@ -421,16 +538,16 @@ ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
421 538
422/* Find an existing connection which this packet belongs to without 539/* Find an existing connection which this packet belongs to without
423 * re-attributing statistics or modifying the connection state. This allows an 540 * re-attributing statistics or modifying the connection state. This allows an
424 * skb->nfct lost due to an upcall to be recovered during actions execution. 541 * skb->_nfct lost due to an upcall to be recovered during actions execution.
425 * 542 *
426 * Must be called with rcu_read_lock. 543 * Must be called with rcu_read_lock.
427 * 544 *
428 * On success, populates skb->nfct and skb->nfctinfo, and returns the 545 * On success, populates skb->_nfct and returns the connection. Returns NULL
429 * connection. Returns NULL if there is no existing entry. 546 * if there is no existing entry.
430 */ 547 */
431static struct nf_conn * 548static struct nf_conn *
432ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, 549ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
433 u8 l3num, struct sk_buff *skb) 550 u8 l3num, struct sk_buff *skb, bool natted)
434{ 551{
435 struct nf_conntrack_l3proto *l3proto; 552 struct nf_conntrack_l3proto *l3proto;
436 struct nf_conntrack_l4proto *l4proto; 553 struct nf_conntrack_l4proto *l4proto;
@@ -453,6 +570,17 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
453 return NULL; 570 return NULL;
454 } 571 }
455 572
573 /* Must invert the tuple if skb has been transformed by NAT. */
574 if (natted) {
575 struct nf_conntrack_tuple inverse;
576
577 if (!nf_ct_invert_tuple(&inverse, &tuple, l3proto, l4proto)) {
578 pr_debug("ovs_ct_find_existing: Inversion failed!\n");
579 return NULL;
580 }
581 tuple = inverse;
582 }
583
456 /* look for tuple match */ 584 /* look for tuple match */
457 h = nf_conntrack_find_get(net, zone, &tuple); 585 h = nf_conntrack_find_get(net, zone, &tuple);
458 if (!h) 586 if (!h)
@@ -460,12 +588,18 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
460 588
461 ct = nf_ct_tuplehash_to_ctrack(h); 589 ct = nf_ct_tuplehash_to_ctrack(h);
462 590
463 skb->nfct = &ct->ct_general; 591 /* Inverted packet tuple matches the reverse direction conntrack tuple,
464 skb->nfctinfo = ovs_ct_get_info(h); 592 * select the other tuplehash to get the right 'ctinfo' bits for this
593 * packet.
594 */
595 if (natted)
596 h = &ct->tuplehash[!h->tuple.dst.dir];
597
598 nf_ct_set(skb, ct, ovs_ct_get_info(h));
465 return ct; 599 return ct;
466} 600}
467 601
468/* Determine whether skb->nfct is equal to the result of conntrack lookup. */ 602/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
469static bool skb_nfct_cached(struct net *net, 603static bool skb_nfct_cached(struct net *net,
470 const struct sw_flow_key *key, 604 const struct sw_flow_key *key,
471 const struct ovs_conntrack_info *info, 605 const struct ovs_conntrack_info *info,
@@ -476,14 +610,19 @@ static bool skb_nfct_cached(struct net *net,
476 610
477 ct = nf_ct_get(skb, &ctinfo); 611 ct = nf_ct_get(skb, &ctinfo);
478 /* If no ct, check if we have evidence that an existing conntrack entry 612 /* If no ct, check if we have evidence that an existing conntrack entry
479 * might be found for this skb. This happens when we lose a skb->nfct 613 * might be found for this skb. This happens when we lose a skb->_nfct
480 * due to an upcall. If the connection was not confirmed, it is not 614 * due to an upcall. If the connection was not confirmed, it is not
481 * cached and needs to be run through conntrack again. 615 * cached and needs to be run through conntrack again.
482 */ 616 */
483 if (!ct && key->ct.state & OVS_CS_F_TRACKED && 617 if (!ct && key->ct_state & OVS_CS_F_TRACKED &&
484 !(key->ct.state & OVS_CS_F_INVALID) && 618 !(key->ct_state & OVS_CS_F_INVALID) &&
485 key->ct.zone == info->zone.id) 619 key->ct_zone == info->zone.id) {
486 ct = ovs_ct_find_existing(net, &info->zone, info->family, skb); 620 ct = ovs_ct_find_existing(net, &info->zone, info->family, skb,
621 !!(key->ct_state
622 & OVS_CS_F_NAT_MASK));
623 if (ct)
624 nf_ct_get(skb, &ctinfo);
625 }
487 if (!ct) 626 if (!ct)
488 return false; 627 return false;
489 if (!net_eq(net, read_pnet(&ct->ct_net))) 628 if (!net_eq(net, read_pnet(&ct->ct_net)))
@@ -497,6 +636,18 @@ static bool skb_nfct_cached(struct net *net,
497 if (help && rcu_access_pointer(help->helper) != info->helper) 636 if (help && rcu_access_pointer(help->helper) != info->helper)
498 return false; 637 return false;
499 } 638 }
639 /* Force conntrack entry direction to the current packet? */
640 if (info->force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
641 /* Delete the conntrack entry if confirmed, else just release
642 * the reference.
643 */
644 if (nf_ct_is_confirmed(ct))
645 nf_ct_delete(ct, 0, 0);
646
647 nf_conntrack_put(&ct->ct_general);
648 nf_ct_set(skb, NULL, 0);
649 return false;
650 }
500 651
501 return true; 652 return true;
502} 653}
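
The force-commit branch above treats a cached entry whose original direction does not match the current packet as stale: a confirmed entry is deleted, the skb's conntrack reference is dropped, and false is returned so the packet is run through conntrack again and committed in its own direction. A compile-and-run sketch of just that decision follows; the types are stand-ins, not kernel structures.

    #include <stdbool.h>
    #include <stdio.h>

    enum ct_dir { CT_DIR_ORIGINAL, CT_DIR_REPLY };

    struct entry {
        enum ct_dir pkt_dir;    /* direction the current packet maps to */
        bool confirmed;
    };

    /* Returns true when the cached entry can be reused for a forced commit;
     * false means the caller must drop (and, if confirmed, delete) the entry
     * and let conntrack create a fresh one in the packet's direction. */
    static bool force_commit_can_reuse(const struct entry *e, bool force)
    {
        if (!force || e->pkt_dir == CT_DIR_ORIGINAL)
            return true;
        return false;
    }

    int main(void)
    {
        struct entry reply = { CT_DIR_REPLY, true };
        struct entry orig  = { CT_DIR_ORIGINAL, true };

        printf("reply dir, force: reuse=%d\n", force_commit_can_reuse(&reply, true));
        printf("orig dir,  force: reuse=%d\n", force_commit_can_reuse(&orig, true));
        return 0;
    }
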
@@ -514,7 +665,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
514 int hooknum, nh_off, err = NF_ACCEPT; 665 int hooknum, nh_off, err = NF_ACCEPT;
515 666
516 nh_off = skb_network_offset(skb); 667 nh_off = skb_network_offset(skb);
517 skb_pull(skb, nh_off); 668 skb_pull_rcsum(skb, nh_off);
518 669
519 /* See HOOK2MANIP(). */ 670 /* See HOOK2MANIP(). */
520 if (maniptype == NF_NAT_MANIP_SRC) 671 if (maniptype == NF_NAT_MANIP_SRC)
@@ -579,6 +730,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
579 err = nf_nat_packet(ct, ctinfo, hooknum, skb); 730 err = nf_nat_packet(ct, ctinfo, hooknum, skb);
580push: 731push:
581 skb_push(skb, nh_off); 732 skb_push(skb, nh_off);
733 skb_postpush_rcsum(skb, skb->data, nh_off);
582 734
583 return err; 735 return err;
584} 736}
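
The skb_pull_rcsum()/skb_postpush_rcsum() pairing above keeps skb->csum consistent when the L2 header is pulled off before conntrack/NAT and pushed back afterwards: on a CHECKSUM_COMPLETE skb the running checksum covers the pulled bytes, so it must be adjusted in both directions. The RFC 1071-style userspace demonstration below shows why folding the pulled bytes back in restores the original sum; it is not the kernel helpers themselves.

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Fold 'len' bytes into a 16-bit ones'-complement sum. */
    static uint32_t csum_partial(const uint8_t *data, size_t len, uint32_t sum)
    {
        size_t i;

        for (i = 0; i + 1 < len; i += 2)
            sum += (uint32_t)data[i] << 8 | data[i + 1];
        if (len & 1)
            sum += (uint32_t)data[len - 1] << 8;
        while (sum >> 16)
            sum = (sum & 0xffff) + (sum >> 16);
        return sum;
    }

    int main(void)
    {
        uint8_t pkt[8] = { 0xaa, 0xbb, 0xcc, 0xdd, 0x11, 0x22, 0x33, 0x44 };
        uint32_t full = csum_partial(pkt, sizeof(pkt), 0);
        uint32_t rest = csum_partial(pkt + 4, 4, 0);        /* after "pulling" 4 bytes */
        uint32_t recombined = csum_partial(pkt, 4, rest);   /* "pushing" them back */

        printf("full=%04x rest=%04x recombined=%04x\n", full, rest, recombined);
        return full == recombined ? 0 : 1;
    }
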
@@ -590,7 +742,7 @@ static void ovs_nat_update_key(struct sw_flow_key *key,
590 if (maniptype == NF_NAT_MANIP_SRC) { 742 if (maniptype == NF_NAT_MANIP_SRC) {
591 __be16 src; 743 __be16 src;
592 744
593 key->ct.state |= OVS_CS_F_SRC_NAT; 745 key->ct_state |= OVS_CS_F_SRC_NAT;
594 if (key->eth.type == htons(ETH_P_IP)) 746 if (key->eth.type == htons(ETH_P_IP))
595 key->ipv4.addr.src = ip_hdr(skb)->saddr; 747 key->ipv4.addr.src = ip_hdr(skb)->saddr;
596 else if (key->eth.type == htons(ETH_P_IPV6)) 748 else if (key->eth.type == htons(ETH_P_IPV6))
@@ -612,7 +764,7 @@ static void ovs_nat_update_key(struct sw_flow_key *key,
612 } else { 764 } else {
613 __be16 dst; 765 __be16 dst;
614 766
615 key->ct.state |= OVS_CS_F_DST_NAT; 767 key->ct_state |= OVS_CS_F_DST_NAT;
616 if (key->eth.type == htons(ETH_P_IP)) 768 if (key->eth.type == htons(ETH_P_IP))
617 key->ipv4.addr.dst = ip_hdr(skb)->daddr; 769 key->ipv4.addr.dst = ip_hdr(skb)->daddr;
618 else if (key->eth.type == htons(ETH_P_IPV6)) 770 else if (key->eth.type == htons(ETH_P_IPV6))
@@ -699,7 +851,7 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
699/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if 851/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
700 * not done already. Update key with new CT state after passing the packet 852 * not done already. Update key with new CT state after passing the packet
701 * through conntrack. 853 * through conntrack.
702 * Note that if the packet is deemed invalid by conntrack, skb->nfct will be 854 * Note that if the packet is deemed invalid by conntrack, skb->_nfct will be
703 * set to NULL and 0 will be returned. 855 * set to NULL and 0 will be returned.
704 */ 856 */
705static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, 857static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
@@ -721,19 +873,14 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
721 873
722 /* Associate skb with specified zone. */ 874 /* Associate skb with specified zone. */
723 if (tmpl) { 875 if (tmpl) {
724 if (skb->nfct) 876 if (skb_nfct(skb))
725 nf_conntrack_put(skb->nfct); 877 nf_conntrack_put(skb_nfct(skb));
726 nf_conntrack_get(&tmpl->ct_general); 878 nf_conntrack_get(&tmpl->ct_general);
727 skb->nfct = &tmpl->ct_general; 879 nf_ct_set(skb, tmpl, IP_CT_NEW);
728 skb->nfctinfo = IP_CT_NEW;
729 } 880 }
730 881
731 /* Repeat if requested, see nf_iterate(). */ 882 err = nf_conntrack_in(net, info->family,
732 do { 883 NF_INET_PRE_ROUTING, skb);
733 err = nf_conntrack_in(net, info->family,
734 NF_INET_PRE_ROUTING, skb);
735 } while (err == NF_REPEAT);
736
737 if (err != NF_ACCEPT) 884 if (err != NF_ACCEPT)
738 return -ENOENT; 885 return -ENOENT;
739 886
@@ -741,7 +888,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
741 * NAT after the nf_conntrack_in() call. We can actually clear 888 * NAT after the nf_conntrack_in() call. We can actually clear
742 * the whole state, as it will be re-initialized below. 889 * the whole state, as it will be re-initialized below.
743 */ 890 */
744 key->ct.state = 0; 891 key->ct_state = 0;
745 892
746 /* Update the key, but keep the NAT flags. */ 893 /* Update the key, but keep the NAT flags. */
747 ovs_ct_update_key(skb, info, key, true, true); 894 ovs_ct_update_key(skb, info, key, true, true);
@@ -757,9 +904,9 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
757 * 904 *
758 * NAT will be done only if the CT action has NAT, and only 905 * NAT will be done only if the CT action has NAT, and only
759 * once per packet (per zone), as guarded by the NAT bits in 906 * once per packet (per zone), as guarded by the NAT bits in
760 * the key->ct.state. 907 * the key->ct_state.
761 */ 908 */
762 if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) && 909 if (info->nat && !(key->ct_state & OVS_CS_F_NAT_MASK) &&
763 (nf_ct_is_confirmed(ct) || info->commit) && 910 (nf_ct_is_confirmed(ct) || info->commit) &&
764 ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) { 911 ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) {
765 return -EINVAL; 912 return -EINVAL;
@@ -823,7 +970,7 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
823 if (err) 970 if (err)
824 return err; 971 return err;
825 972
826 ct = (struct nf_conn *)skb->nfct; 973 ct = (struct nf_conn *)skb_nfct(skb);
827 if (ct) 974 if (ct)
828 nf_ct_deliver_cached_events(ct); 975 nf_ct_deliver_cached_events(ct);
829 } 976 }
@@ -835,8 +982,8 @@ static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
835{ 982{
836 size_t i; 983 size_t i;
837 984
838 for (i = 0; i < sizeof(*labels); i++) 985 for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
839 if (labels->ct_labels[i]) 986 if (labels->ct_labels_32[i])
840 return true; 987 return true;
841 988
842 return false; 989 return false;
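
labels_nonzero() now walks the 128-bit label as 32-bit words (ct_labels_32) instead of indexing with a byte-sized count. The stand-alone sketch below shows the same test; CT_LABELS_LEN_32 and the struct are stand-ins for OVS_CT_LABELS_LEN_32 and the uapi ovs_key_ct_labels.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    #define CT_LABELS_LEN_32 4

    struct ct_labels {
        uint32_t w[CT_LABELS_LEN_32];
    };

    /* True if any bit of the 128-bit label is set. */
    static bool labels_nonzero(const struct ct_labels *l)
    {
        size_t i;

        for (i = 0; i < CT_LABELS_LEN_32; i++)
            if (l->w[i])
                return true;
        return false;
    }

    int main(void)
    {
        struct ct_labels zero = { { 0 } };
        struct ct_labels tagged = { { 0, 0, 0x80, 0 } };

        printf("%d %d\n", labels_nonzero(&zero), labels_nonzero(&tagged));
        return 0;
    }
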
@@ -847,24 +994,36 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
847 const struct ovs_conntrack_info *info, 994 const struct ovs_conntrack_info *info,
848 struct sk_buff *skb) 995 struct sk_buff *skb)
849{ 996{
997 enum ip_conntrack_info ctinfo;
998 struct nf_conn *ct;
850 int err; 999 int err;
851 1000
852 err = __ovs_ct_lookup(net, key, info, skb); 1001 err = __ovs_ct_lookup(net, key, info, skb);
853 if (err) 1002 if (err)
854 return err; 1003 return err;
855 1004
 1005 /* The connection could be invalid, in which case this is a no-op. */
1006 ct = nf_ct_get(skb, &ctinfo);
1007 if (!ct)
1008 return 0;
1009
856 /* Apply changes before confirming the connection so that the initial 1010 /* Apply changes before confirming the connection so that the initial
857 * conntrack NEW netlink event carries the values given in the CT 1011 * conntrack NEW netlink event carries the values given in the CT
858 * action. 1012 * action.
859 */ 1013 */
860 if (info->mark.mask) { 1014 if (info->mark.mask) {
861 err = ovs_ct_set_mark(skb, key, info->mark.value, 1015 err = ovs_ct_set_mark(ct, key, info->mark.value,
862 info->mark.mask); 1016 info->mark.mask);
863 if (err) 1017 if (err)
864 return err; 1018 return err;
865 } 1019 }
866 if (labels_nonzero(&info->labels.mask)) { 1020 if (!nf_ct_is_confirmed(ct)) {
867 err = ovs_ct_set_labels(skb, key, &info->labels.value, 1021 err = ovs_ct_init_labels(ct, key, &info->labels.value,
1022 &info->labels.mask);
1023 if (err)
1024 return err;
1025 } else if (labels_nonzero(&info->labels.mask)) {
1026 err = ovs_ct_set_labels(ct, key, &info->labels.value,
868 &info->labels.mask); 1027 &info->labels.mask);
869 if (err) 1028 if (err)
870 return err; 1029 return err;
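
ovs_ct_commit() now distinguishes a first-time label initialisation on an unconfirmed connection from a masked update on a confirmed one; in both cases the mask selects which bits come from the CT action. A tiny worked example of that masked-set arithmetic on a 32-bit value follows; the kernel applies the same idea to ct_mark and to the 128-bit labels.

    #include <stdint.h>
    #include <stdio.h>

    /* Bits selected by 'mask' come from 'value', the rest are preserved. */
    static uint32_t masked_set(uint32_t old, uint32_t value, uint32_t mask)
    {
        return (old & ~mask) | (value & mask);
    }

    int main(void)
    {
        uint32_t old = 0x11223344, value = 0xdeadbeef, mask = 0x0000ffff;

        printf("%08x\n", masked_set(old, value, mask));   /* prints 1122beef */
        return 0;
    }
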
@@ -890,7 +1049,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
890 1049
891 /* The conntrack module expects to be working at L3. */ 1050 /* The conntrack module expects to be working at L3. */
892 nh_ofs = skb_network_offset(skb); 1051 nh_ofs = skb_network_offset(skb);
893 skb_pull(skb, nh_ofs); 1052 skb_pull_rcsum(skb, nh_ofs);
894 1053
895 if (key->ip.frag != OVS_FRAG_TYPE_NONE) { 1054 if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
896 err = handle_fragments(net, key, info->zone.id, skb); 1055 err = handle_fragments(net, key, info->zone.id, skb);
@@ -904,6 +1063,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
904 err = ovs_ct_lookup(net, key, info, skb); 1063 err = ovs_ct_lookup(net, key, info, skb);
905 1064
906 skb_push(skb, nh_ofs); 1065 skb_push(skb, nh_ofs);
1066 skb_postpush_rcsum(skb, skb->data, nh_ofs);
907 if (err) 1067 if (err)
908 kfree_skb(skb); 1068 kfree_skb(skb);
909 return err; 1069 return err;
@@ -1065,6 +1225,7 @@ static int parse_nat(const struct nlattr *attr,
1065 1225
1066static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { 1226static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
1067 [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 }, 1227 [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 },
1228 [OVS_CT_ATTR_FORCE_COMMIT] = { .minlen = 0, .maxlen = 0 },
1068 [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), 1229 [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16),
1069 .maxlen = sizeof(u16) }, 1230 .maxlen = sizeof(u16) },
1070 [OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark), 1231 [OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark),
@@ -1104,6 +1265,9 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
1104 } 1265 }
1105 1266
1106 switch (type) { 1267 switch (type) {
1268 case OVS_CT_ATTR_FORCE_COMMIT:
1269 info->force = true;
1270 /* fall through. */
1107 case OVS_CT_ATTR_COMMIT: 1271 case OVS_CT_ATTR_COMMIT:
1108 info->commit = true; 1272 info->commit = true;
1109 break; 1273 break;
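
OVS_CT_ATTR_FORCE_COMMIT is parsed as "commit, plus force", which the switch above expresses with a fall-through. The stand-alone sketch below shows the same semantics; the enum values and struct are illustrative, not the uapi definitions.

    #include <stdbool.h>
    #include <stdio.h>

    enum { CT_ATTR_COMMIT, CT_ATTR_FORCE_COMMIT };

    struct ct_info { bool commit, force; };

    static void parse_commit_attr(struct ct_info *info, int type)
    {
        switch (type) {
        case CT_ATTR_FORCE_COMMIT:
            info->force = true;
            /* fall through: a forced commit is still a commit */
        case CT_ATTR_COMMIT:
            info->commit = true;
            break;
        }
    }

    int main(void)
    {
        struct ct_info a = { 0 }, b = { 0 };

        parse_commit_attr(&a, CT_ATTR_COMMIT);
        parse_commit_attr(&b, CT_ATTR_FORCE_COMMIT);
        printf("commit=%d force=%d / commit=%d force=%d\n",
               a.commit, a.force, b.commit, b.force);
        return 0;
    }
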
@@ -1330,7 +1494,9 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
1330 if (!start) 1494 if (!start)
1331 return -EMSGSIZE; 1495 return -EMSGSIZE;
1332 1496
1333 if (ct_info->commit && nla_put_flag(skb, OVS_CT_ATTR_COMMIT)) 1497 if (ct_info->commit && nla_put_flag(skb, ct_info->force
1498 ? OVS_CT_ATTR_FORCE_COMMIT
1499 : OVS_CT_ATTR_COMMIT))
1334 return -EMSGSIZE; 1500 return -EMSGSIZE;
1335 if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && 1501 if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
1336 nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id)) 1502 nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id))
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index 8f6230bd6183..bc7efd1867ab 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -32,7 +32,8 @@ int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *,
32 const struct ovs_conntrack_info *); 32 const struct ovs_conntrack_info *);
33 33
34void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key); 34void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key);
35int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb); 35int ovs_ct_put_key(const struct sw_flow_key *swkey,
36 const struct sw_flow_key *output, struct sk_buff *skb);
36void ovs_ct_free_action(const struct nlattr *a); 37void ovs_ct_free_action(const struct nlattr *a);
37 38
38#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \ 39#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \
@@ -75,13 +76,18 @@ static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb,
75static inline void ovs_ct_fill_key(const struct sk_buff *skb, 76static inline void ovs_ct_fill_key(const struct sk_buff *skb,
76 struct sw_flow_key *key) 77 struct sw_flow_key *key)
77{ 78{
78 key->ct.state = 0; 79 key->ct_state = 0;
79 key->ct.zone = 0; 80 key->ct_zone = 0;
80 key->ct.mark = 0; 81 key->ct.mark = 0;
81 memset(&key->ct.labels, 0, sizeof(key->ct.labels)); 82 memset(&key->ct.labels, 0, sizeof(key->ct.labels));
83 /* Clear 'ct_orig_proto' to mark the non-existence of original
84 * direction key fields.
85 */
86 key->ct_orig_proto = 0;
82} 87}
83 88
84static inline int ovs_ct_put_key(const struct sw_flow_key *key, 89static inline int ovs_ct_put_key(const struct sw_flow_key *swkey,
90 const struct sw_flow_key *output,
85 struct sk_buff *skb) 91 struct sk_buff *skb)
86{ 92{
87 return 0; 93 return 0;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 4d67ea856067..9c62b6325f7a 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -58,8 +58,7 @@
58#include "vport-internal_dev.h" 58#include "vport-internal_dev.h"
59#include "vport-netdev.h" 59#include "vport-netdev.h"
60 60
61int ovs_net_id __read_mostly; 61unsigned int ovs_net_id __read_mostly;
62EXPORT_SYMBOL_GPL(ovs_net_id);
63 62
64static struct genl_family dp_packet_genl_family; 63static struct genl_family dp_packet_genl_family;
65static struct genl_family dp_flow_genl_family; 64static struct genl_family dp_flow_genl_family;
@@ -131,7 +130,6 @@ int lockdep_ovsl_is_held(void)
131 else 130 else
132 return 1; 131 return 1;
133} 132}
134EXPORT_SYMBOL_GPL(lockdep_ovsl_is_held);
135#endif 133#endif
136 134
137static struct vport *new_vport(const struct vport_parms *); 135static struct vport *new_vport(const struct vport_parms *);
@@ -562,7 +560,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
562 struct sw_flow *flow; 560 struct sw_flow *flow;
563 struct sw_flow_actions *sf_acts; 561 struct sw_flow_actions *sf_acts;
564 struct datapath *dp; 562 struct datapath *dp;
565 struct ethhdr *eth;
566 struct vport *input_vport; 563 struct vport *input_vport;
567 u16 mru = 0; 564 u16 mru = 0;
568 int len; 565 int len;
@@ -583,17 +580,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
583 580
584 nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len); 581 nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len);
585 582
586 skb_reset_mac_header(packet);
587 eth = eth_hdr(packet);
588
589 /* Normally, setting the skb 'protocol' field would be handled by a
590 * call to eth_type_trans(), but it assumes there's a sending
591 * device, which we may not have. */
592 if (eth_proto_is_802_3(eth->h_proto))
593 packet->protocol = eth->h_proto;
594 else
595 packet->protocol = htons(ETH_P_802_2);
596
597 /* Set packet's mru */ 583 /* Set packet's mru */
598 if (a[OVS_PACKET_ATTR_MRU]) { 584 if (a[OVS_PACKET_ATTR_MRU]) {
599 mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]); 585 mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]);
@@ -672,8 +658,7 @@ static const struct genl_ops dp_packet_genl_ops[] = {
672 } 658 }
673}; 659};
674 660
675static struct genl_family dp_packet_genl_family = { 661static struct genl_family dp_packet_genl_family __ro_after_init = {
676 .id = GENL_ID_GENERATE,
677 .hdrsize = sizeof(struct ovs_header), 662 .hdrsize = sizeof(struct ovs_header),
678 .name = OVS_PACKET_FAMILY, 663 .name = OVS_PACKET_FAMILY,
679 .version = OVS_PACKET_VERSION, 664 .version = OVS_PACKET_VERSION,
@@ -682,6 +667,7 @@ static struct genl_family dp_packet_genl_family = {
682 .parallel_ops = true, 667 .parallel_ops = true,
683 .ops = dp_packet_genl_ops, 668 .ops = dp_packet_genl_ops,
684 .n_ops = ARRAY_SIZE(dp_packet_genl_ops), 669 .n_ops = ARRAY_SIZE(dp_packet_genl_ops),
670 .module = THIS_MODULE,
685}; 671};
686 672
687static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, 673static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats,
@@ -1437,8 +1423,7 @@ static const struct genl_ops dp_flow_genl_ops[] = {
1437 }, 1423 },
1438}; 1424};
1439 1425
1440static struct genl_family dp_flow_genl_family = { 1426static struct genl_family dp_flow_genl_family __ro_after_init = {
1441 .id = GENL_ID_GENERATE,
1442 .hdrsize = sizeof(struct ovs_header), 1427 .hdrsize = sizeof(struct ovs_header),
1443 .name = OVS_FLOW_FAMILY, 1428 .name = OVS_FLOW_FAMILY,
1444 .version = OVS_FLOW_VERSION, 1429 .version = OVS_FLOW_VERSION,
@@ -1449,6 +1434,7 @@ static struct genl_family dp_flow_genl_family = {
1449 .n_ops = ARRAY_SIZE(dp_flow_genl_ops), 1434 .n_ops = ARRAY_SIZE(dp_flow_genl_ops),
1450 .mcgrps = &ovs_dp_flow_multicast_group, 1435 .mcgrps = &ovs_dp_flow_multicast_group,
1451 .n_mcgrps = 1, 1436 .n_mcgrps = 1,
1437 .module = THIS_MODULE,
1452}; 1438};
1453 1439
1454static size_t ovs_dp_cmd_msg_size(void) 1440static size_t ovs_dp_cmd_msg_size(void)
@@ -1823,8 +1809,7 @@ static const struct genl_ops dp_datapath_genl_ops[] = {
1823 }, 1809 },
1824}; 1810};
1825 1811
1826static struct genl_family dp_datapath_genl_family = { 1812static struct genl_family dp_datapath_genl_family __ro_after_init = {
1827 .id = GENL_ID_GENERATE,
1828 .hdrsize = sizeof(struct ovs_header), 1813 .hdrsize = sizeof(struct ovs_header),
1829 .name = OVS_DATAPATH_FAMILY, 1814 .name = OVS_DATAPATH_FAMILY,
1830 .version = OVS_DATAPATH_VERSION, 1815 .version = OVS_DATAPATH_VERSION,
@@ -1835,6 +1820,7 @@ static struct genl_family dp_datapath_genl_family = {
1835 .n_ops = ARRAY_SIZE(dp_datapath_genl_ops), 1820 .n_ops = ARRAY_SIZE(dp_datapath_genl_ops),
1836 .mcgrps = &ovs_dp_datapath_multicast_group, 1821 .mcgrps = &ovs_dp_datapath_multicast_group,
1837 .n_mcgrps = 1, 1822 .n_mcgrps = 1,
1823 .module = THIS_MODULE,
1838}; 1824};
1839 1825
1840/* Called with ovs_mutex or RCU read lock. */ 1826/* Called with ovs_mutex or RCU read lock. */
@@ -2245,8 +2231,7 @@ static const struct genl_ops dp_vport_genl_ops[] = {
2245 }, 2231 },
2246}; 2232};
2247 2233
2248struct genl_family dp_vport_genl_family = { 2234struct genl_family dp_vport_genl_family __ro_after_init = {
2249 .id = GENL_ID_GENERATE,
2250 .hdrsize = sizeof(struct ovs_header), 2235 .hdrsize = sizeof(struct ovs_header),
2251 .name = OVS_VPORT_FAMILY, 2236 .name = OVS_VPORT_FAMILY,
2252 .version = OVS_VPORT_VERSION, 2237 .version = OVS_VPORT_VERSION,
@@ -2257,6 +2242,7 @@ struct genl_family dp_vport_genl_family = {
2257 .n_ops = ARRAY_SIZE(dp_vport_genl_ops), 2242 .n_ops = ARRAY_SIZE(dp_vport_genl_ops),
2258 .mcgrps = &ovs_dp_vport_multicast_group, 2243 .mcgrps = &ovs_dp_vport_multicast_group,
2259 .n_mcgrps = 1, 2244 .n_mcgrps = 1,
2245 .module = THIS_MODULE,
2260}; 2246};
2261 2247
2262static struct genl_family * const dp_genl_families[] = { 2248static struct genl_family * const dp_genl_families[] = {
@@ -2274,7 +2260,7 @@ static void dp_unregister_genl(int n_families)
2274 genl_unregister_family(dp_genl_families[i]); 2260 genl_unregister_family(dp_genl_families[i]);
2275} 2261}
2276 2262
2277static int dp_register_genl(void) 2263static int __init dp_register_genl(void)
2278{ 2264{
2279 int err; 2265 int err;
2280 int i; 2266 int i;
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index ab85c1cae255..1c6e9377436d 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -144,7 +144,7 @@ struct ovs_net {
144 bool xt_label; 144 bool xt_label;
145}; 145};
146 146
147extern int ovs_net_id; 147extern unsigned int ovs_net_id;
148void ovs_lock(void); 148void ovs_lock(void);
149void ovs_unlock(void); 149void ovs_unlock(void);
150 150
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 22087062bd10..3f76cb765e5b 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -312,7 +312,8 @@ static bool icmp6hdr_ok(struct sk_buff *skb)
312 * Returns 0 if it encounters a non-vlan or incomplete packet. 312 * Returns 0 if it encounters a non-vlan or incomplete packet.
313 * Returns 1 after successfully parsing vlan tag. 313 * Returns 1 after successfully parsing vlan tag.
314 */ 314 */
315static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh) 315static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh,
316 bool untag_vlan)
316{ 317{
317 struct vlan_head *vh = (struct vlan_head *)skb->data; 318 struct vlan_head *vh = (struct vlan_head *)skb->data;
318 319
@@ -330,31 +331,47 @@ static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh)
330 key_vh->tci = vh->tci | htons(VLAN_TAG_PRESENT); 331 key_vh->tci = vh->tci | htons(VLAN_TAG_PRESENT);
331 key_vh->tpid = vh->tpid; 332 key_vh->tpid = vh->tpid;
332 333
333 __skb_pull(skb, sizeof(struct vlan_head)); 334 if (unlikely(untag_vlan)) {
335 int offset = skb->data - skb_mac_header(skb);
336 u16 tci;
337 int err;
338
339 __skb_push(skb, offset);
340 err = __skb_vlan_pop(skb, &tci);
341 __skb_pull(skb, offset);
342 if (err)
343 return err;
344 __vlan_hwaccel_put_tag(skb, key_vh->tpid, tci);
345 } else {
346 __skb_pull(skb, sizeof(struct vlan_head));
347 }
334 return 1; 348 return 1;
335} 349}
336 350
337static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) 351static void clear_vlan(struct sw_flow_key *key)
338{ 352{
339 int res;
340
341 key->eth.vlan.tci = 0; 353 key->eth.vlan.tci = 0;
342 key->eth.vlan.tpid = 0; 354 key->eth.vlan.tpid = 0;
343 key->eth.cvlan.tci = 0; 355 key->eth.cvlan.tci = 0;
344 key->eth.cvlan.tpid = 0; 356 key->eth.cvlan.tpid = 0;
357}
358
359static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
360{
361 int res;
345 362
346 if (skb_vlan_tag_present(skb)) { 363 if (skb_vlan_tag_present(skb)) {
347 key->eth.vlan.tci = htons(skb->vlan_tci); 364 key->eth.vlan.tci = htons(skb->vlan_tci);
348 key->eth.vlan.tpid = skb->vlan_proto; 365 key->eth.vlan.tpid = skb->vlan_proto;
349 } else { 366 } else {
350 /* Parse outer vlan tag in the non-accelerated case. */ 367 /* Parse outer vlan tag in the non-accelerated case. */
351 res = parse_vlan_tag(skb, &key->eth.vlan); 368 res = parse_vlan_tag(skb, &key->eth.vlan, true);
352 if (res <= 0) 369 if (res <= 0)
353 return res; 370 return res;
354 } 371 }
355 372
356 /* Parse inner vlan tag. */ 373 /* Parse inner vlan tag. */
357 res = parse_vlan_tag(skb, &key->eth.cvlan); 374 res = parse_vlan_tag(skb, &key->eth.cvlan, false);
358 if (res <= 0) 375 if (res <= 0)
359 return res; 376 return res;
360 377
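
parse_vlan_tag() gains an untag_vlan mode so the outer tag can be popped from the packet data and restored as accelerated metadata via __vlan_hwaccel_put_tag(), while the flow key still records the TCI with a software "tag present" bit. Below is a simplified userspace parse of an 802.1Q tag into (tpid, tci | present); the flag value mirrors VLAN_TAG_PRESENT but everything else is a stand-in.

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    #define VLAN_TAG_PRESENT_SW 0x1000    /* illustrative "tag parsed" flag */

    struct vlan_head_sw { uint16_t tpid, tci; };

    /* Returns 1 on a parsed tag, 0 on a non-VLAN or incomplete packet. */
    static int parse_vlan_tag_sw(const uint8_t *p, size_t len,
                                 struct vlan_head_sw *vh)
    {
        if (len < 4)
            return 0;
        vh->tpid = (uint16_t)p[0] << 8 | p[1];
        if (vh->tpid != 0x8100 && vh->tpid != 0x88a8)
            return 0;
        vh->tci = ((uint16_t)p[2] << 8 | p[3]) | VLAN_TAG_PRESENT_SW;
        return 1;
    }

    int main(void)
    {
        const uint8_t tag[] = { 0x81, 0x00, 0x00, 0x64 };    /* VID 100 */
        struct vlan_head_sw vh;

        if (parse_vlan_tag_sw(tag, sizeof(tag), &vh))
            printf("tpid=%04x tci=%04x vid=%u\n",
                   vh.tpid, vh.tci, vh.tci & 0xfff);
        return 0;
    }
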
@@ -483,17 +500,20 @@ invalid:
483 * 500 *
484 * Returns 0 if successful, otherwise a negative errno value. 501 * Returns 0 if successful, otherwise a negative errno value.
485 * 502 *
486 * Initializes @skb header pointers as follows: 503 * Initializes @skb header fields as follows:
487 * 504 *
488 * - skb->mac_header: the Ethernet header. 505 * - skb->mac_header: the L2 header.
489 * 506 *
490 * - skb->network_header: just past the Ethernet header, or just past the 507 * - skb->network_header: just past the L2 header, or just past the
491 * VLAN header, to the first byte of the Ethernet payload. 508 * VLAN header, to the first byte of the L2 payload.
492 * 509 *
493 * - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6 510 * - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6
494 * on output, then just past the IP header, if one is present and 511 * on output, then just past the IP header, if one is present and
495 * of a correct length, otherwise the same as skb->network_header. 512 * of a correct length, otherwise the same as skb->network_header.
496 * For other key->eth.type values it is left untouched. 513 * For other key->eth.type values it is left untouched.
514 *
515 * - skb->protocol: the type of the data starting at skb->network_header.
 516 * Equal to key->eth.type.
497 */ 517 */
498static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) 518static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
499{ 519{
@@ -505,28 +525,35 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
505 525
506 skb_reset_mac_header(skb); 526 skb_reset_mac_header(skb);
507 527
508 /* Link layer. We are guaranteed to have at least the 14 byte Ethernet 528 /* Link layer. */
509 * header in the linear data area. 529 clear_vlan(key);
510 */ 530 if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) {
511 eth = eth_hdr(skb); 531 if (unlikely(eth_type_vlan(skb->protocol)))
512 ether_addr_copy(key->eth.src, eth->h_source); 532 return -EINVAL;
513 ether_addr_copy(key->eth.dst, eth->h_dest);
514 533
515 __skb_pull(skb, 2 * ETH_ALEN); 534 skb_reset_network_header(skb);
516 /* We are going to push all headers that we pull, so no need to 535 } else {
517 * update skb->csum here. 536 eth = eth_hdr(skb);
518 */ 537 ether_addr_copy(key->eth.src, eth->h_source);
538 ether_addr_copy(key->eth.dst, eth->h_dest);
519 539
520 if (unlikely(parse_vlan(skb, key))) 540 __skb_pull(skb, 2 * ETH_ALEN);
521 return -ENOMEM; 541 /* We are going to push all headers that we pull, so no need to
542 * update skb->csum here.
543 */
522 544
523 key->eth.type = parse_ethertype(skb); 545 if (unlikely(parse_vlan(skb, key)))
524 if (unlikely(key->eth.type == htons(0))) 546 return -ENOMEM;
525 return -ENOMEM; 547
548 skb->protocol = parse_ethertype(skb);
549 if (unlikely(skb->protocol == htons(0)))
550 return -ENOMEM;
526 551
527 skb_reset_network_header(skb); 552 skb_reset_network_header(skb);
553 __skb_push(skb, skb->data - skb_mac_header(skb));
554 }
528 skb_reset_mac_len(skb); 555 skb_reset_mac_len(skb);
529 __skb_push(skb, skb->data - skb_mac_header(skb)); 556 key->eth.type = skb->protocol;
530 557
531 /* Network layer. */ 558 /* Network layer. */
532 if (key->eth.type == htons(ETH_P_IP)) { 559 if (key->eth.type == htons(ETH_P_IP)) {
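
key_extract() now branches on the key's MAC protocol: with MAC_PROTO_NONE there is no Ethernet header to parse, the network header starts at the packet data and the EtherType comes from metadata (skb->protocol), while the Ethernet path still copies the MAC addresses and parses VLAN tags. A stand-alone sketch of that split, with simplified stand-in types:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    enum { MACP_NONE = 0, MACP_ETHERNET = 1 };

    struct mini_key {
        uint8_t  mac_proto;
        uint8_t  eth_src[6], eth_dst[6];
        uint16_t eth_type;    /* host order in this sketch */
    };

    /* Returns the offset of the network header within 'pkt'. */
    static size_t extract_l2(const uint8_t *pkt, uint16_t meta_proto,
                             struct mini_key *key)
    {
        if (key->mac_proto == MACP_NONE) {
            key->eth_type = meta_proto;    /* from metadata, not the packet */
            return 0;                      /* network header at offset 0 */
        }
        memcpy(key->eth_dst, pkt, 6);
        memcpy(key->eth_src, pkt + 6, 6);
        key->eth_type = (uint16_t)pkt[12] << 8 | pkt[13];
        return 14;                         /* ETH_HLEN, no VLAN handling here */
    }

    int main(void)
    {
        uint8_t frame[14] = { [12] = 0x08, [13] = 0x00 };    /* IPv4 */
        struct mini_key k2 = { .mac_proto = MACP_ETHERNET };
        struct mini_key k3 = { .mac_proto = MACP_NONE };

        printf("l2: nh at %zu, type %04x\n", extract_l2(frame, 0, &k2), k2.eth_type);
        printf("l3: nh at %zu, type %04x\n", extract_l2(frame, 0x0800, &k3), k3.eth_type);
        return 0;
    }
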
@@ -718,12 +745,34 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
718 745
719int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) 746int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key)
720{ 747{
721 return key_extract(skb, key); 748 int res;
749
750 res = key_extract(skb, key);
751 if (!res)
752 key->mac_proto &= ~SW_FLOW_KEY_INVALID;
753
754 return res;
755}
756
757static int key_extract_mac_proto(struct sk_buff *skb)
758{
759 switch (skb->dev->type) {
760 case ARPHRD_ETHER:
761 return MAC_PROTO_ETHERNET;
762 case ARPHRD_NONE:
763 if (skb->protocol == htons(ETH_P_TEB))
764 return MAC_PROTO_ETHERNET;
765 return MAC_PROTO_NONE;
766 }
767 WARN_ON_ONCE(1);
768 return -EINVAL;
722} 769}
723 770
724int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, 771int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
725 struct sk_buff *skb, struct sw_flow_key *key) 772 struct sk_buff *skb, struct sw_flow_key *key)
726{ 773{
774 int res, err;
775
727 /* Extract metadata from packet. */ 776 /* Extract metadata from packet. */
728 if (tun_info) { 777 if (tun_info) {
729 key->tun_proto = ip_tunnel_info_af(tun_info); 778 key->tun_proto = ip_tunnel_info_af(tun_info);
@@ -749,23 +798,61 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
749 key->phy.priority = skb->priority; 798 key->phy.priority = skb->priority;
750 key->phy.in_port = OVS_CB(skb)->input_vport->port_no; 799 key->phy.in_port = OVS_CB(skb)->input_vport->port_no;
751 key->phy.skb_mark = skb->mark; 800 key->phy.skb_mark = skb->mark;
752 ovs_ct_fill_key(skb, key);
753 key->ovs_flow_hash = 0; 801 key->ovs_flow_hash = 0;
802 res = key_extract_mac_proto(skb);
803 if (res < 0)
804 return res;
805 key->mac_proto = res;
754 key->recirc_id = 0; 806 key->recirc_id = 0;
755 807
756 return key_extract(skb, key); 808 err = key_extract(skb, key);
809 if (!err)
810 ovs_ct_fill_key(skb, key); /* Must be after key_extract(). */
811 return err;
757} 812}
758 813
759int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, 814int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
760 struct sk_buff *skb, 815 struct sk_buff *skb,
761 struct sw_flow_key *key, bool log) 816 struct sw_flow_key *key, bool log)
762{ 817{
818 const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
819 u64 attrs = 0;
763 int err; 820 int err;
764 821
822 err = parse_flow_nlattrs(attr, a, &attrs, log);
823 if (err)
824 return -EINVAL;
825
765 /* Extract metadata from netlink attributes. */ 826 /* Extract metadata from netlink attributes. */
766 err = ovs_nla_get_flow_metadata(net, attr, key, log); 827 err = ovs_nla_get_flow_metadata(net, a, attrs, key, log);
767 if (err) 828 if (err)
768 return err; 829 return err;
769 830
770 return key_extract(skb, key); 831 /* key_extract assumes that skb->protocol is set-up for
832 * layer 3 packets which is the case for other callers,
833 * in particular packets received from the network stack.
834 * Here the correct value can be set from the metadata
835 * extracted above.
 836 * For an L2 packet the key eth type would be zero; skb protocol
 837 * is set to the correct value later during key_extract.
838 */
839
840 skb->protocol = key->eth.type;
841 err = key_extract(skb, key);
842 if (err)
843 return err;
844
845 /* Check that we have conntrack original direction tuple metadata only
846 * for packets for which it makes sense. Otherwise the key may be
847 * corrupted due to overlapping key fields.
848 */
849 if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4) &&
850 key->eth.type != htons(ETH_P_IP))
851 return -EINVAL;
852 if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6) &&
853 (key->eth.type != htons(ETH_P_IPV6) ||
854 sw_flow_key_is_nd(key)))
855 return -EINVAL;
856
857 return 0;
771} 858}
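
ovs_flow_key_extract_userspace() now parses the attributes once, seeds skb->protocol from the metadata EtherType, and rejects original-direction conntrack tuple attributes that do not match the flow's EtherType or that would collide with the ND fields sharing the same key storage. A compile-and-run sketch of just that acceptance rule, with made-up attribute bits standing in for the uapi values:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define ATTR_CT_ORIG_V4 (1ULL << 0)
    #define ATTR_CT_ORIG_V6 (1ULL << 1)

    /* 0 on acceptable combinations, -1 otherwise. */
    static int check_ct_orig(uint64_t attrs, uint16_t eth_type, bool is_nd)
    {
        if ((attrs & ATTR_CT_ORIG_V4) && eth_type != 0x0800)
            return -1;
        if ((attrs & ATTR_CT_ORIG_V6) && (eth_type != 0x86dd || is_nd))
            return -1;
        return 0;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               check_ct_orig(ATTR_CT_ORIG_V4, 0x0800, false),    /*  0 */
               check_ct_orig(ATTR_CT_ORIG_V6, 0x86dd, true),     /* -1 */
               check_ct_orig(ATTR_CT_ORIG_V6, 0x0800, false));   /* -1 */
        return 0;
    }
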
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index ae783f5c6695..a9bc1c875965 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2007-2014 Nicira, Inc. 2 * Copyright (c) 2007-2017 Nicira, Inc.
3 * 3 *
4 * This program is free software; you can redistribute it and/or 4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public 5 * modify it under the terms of version 2 of the GNU General Public
@@ -37,6 +37,12 @@
37 37
38struct sk_buff; 38struct sk_buff;
39 39
40enum sw_flow_mac_proto {
41 MAC_PROTO_NONE = 0,
42 MAC_PROTO_ETHERNET,
43};
44#define SW_FLOW_KEY_INVALID 0x80
45
40/* Store options at the end of the array if they are less than the 46/* Store options at the end of the array if they are less than the
41 * maximum size. This allows us to get the benefits of variable length 47 * maximum size. This allows us to get the benefits of variable length
42 * matching for small options. 48 * matching for small options.
@@ -68,6 +74,7 @@ struct sw_flow_key {
68 u32 skb_mark; /* SKB mark. */ 74 u32 skb_mark; /* SKB mark. */
69 u16 in_port; /* Input switch port (or DP_MAX_PORTS). */ 75 u16 in_port; /* Input switch port (or DP_MAX_PORTS). */
70 } __packed phy; /* Safe when right after 'tun_key'. */ 76 } __packed phy; /* Safe when right after 'tun_key'. */
77 u8 mac_proto; /* MAC layer protocol (e.g. Ethernet). */
71 u8 tun_proto; /* Protocol of encapsulating tunnel. */ 78 u8 tun_proto; /* Protocol of encapsulating tunnel. */
72 u32 ovs_flow_hash; /* Datapath computed hash value. */ 79 u32 ovs_flow_hash; /* Datapath computed hash value. */
73 u32 recirc_id; /* Recirculation ID. */ 80 u32 recirc_id; /* Recirculation ID. */
@@ -78,6 +85,11 @@ struct sw_flow_key {
78 struct vlan_head cvlan; 85 struct vlan_head cvlan;
79 __be16 type; /* Ethernet frame type. */ 86 __be16 type; /* Ethernet frame type. */
80 } eth; 87 } eth;
88 /* Filling a hole of two bytes. */
89 u8 ct_state;
90 u8 ct_orig_proto; /* CT original direction tuple IP
91 * protocol.
92 */
81 union { 93 union {
82 struct { 94 struct {
83 __be32 top_lse; /* top label stack entry */ 95 __be32 top_lse; /* top label stack entry */
@@ -89,6 +101,7 @@ struct sw_flow_key {
89 u8 frag; /* One of OVS_FRAG_TYPE_*. */ 101 u8 frag; /* One of OVS_FRAG_TYPE_*. */
90 } ip; 102 } ip;
91 }; 103 };
104 u16 ct_zone; /* Conntrack zone. */
92 struct { 105 struct {
93 __be16 src; /* TCP/UDP/SCTP source port. */ 106 __be16 src; /* TCP/UDP/SCTP source port. */
94 __be16 dst; /* TCP/UDP/SCTP destination port. */ 107 __be16 dst; /* TCP/UDP/SCTP destination port. */
@@ -100,10 +113,16 @@ struct sw_flow_key {
100 __be32 src; /* IP source address. */ 113 __be32 src; /* IP source address. */
101 __be32 dst; /* IP destination address. */ 114 __be32 dst; /* IP destination address. */
102 } addr; 115 } addr;
103 struct { 116 union {
104 u8 sha[ETH_ALEN]; /* ARP source hardware address. */ 117 struct {
105 u8 tha[ETH_ALEN]; /* ARP target hardware address. */ 118 __be32 src;
106 } arp; 119 __be32 dst;
120 } ct_orig; /* Conntrack original direction fields. */
121 struct {
122 u8 sha[ETH_ALEN]; /* ARP source hardware address. */
123 u8 tha[ETH_ALEN]; /* ARP target hardware address. */
124 } arp;
125 };
107 } ipv4; 126 } ipv4;
108 struct { 127 struct {
109 struct { 128 struct {
@@ -111,23 +130,40 @@ struct sw_flow_key {
111 struct in6_addr dst; /* IPv6 destination address. */ 130 struct in6_addr dst; /* IPv6 destination address. */
112 } addr; 131 } addr;
113 __be32 label; /* IPv6 flow label. */ 132 __be32 label; /* IPv6 flow label. */
114 struct { 133 union {
115 struct in6_addr target; /* ND target address. */ 134 struct {
116 u8 sll[ETH_ALEN]; /* ND source link layer address. */ 135 struct in6_addr src;
117 u8 tll[ETH_ALEN]; /* ND target link layer address. */ 136 struct in6_addr dst;
118 } nd; 137 } ct_orig; /* Conntrack original direction fields. */
138 struct {
139 struct in6_addr target; /* ND target address. */
140 u8 sll[ETH_ALEN]; /* ND source link layer address. */
141 u8 tll[ETH_ALEN]; /* ND target link layer address. */
142 } nd;
143 };
119 } ipv6; 144 } ipv6;
120 }; 145 };
121 struct { 146 struct {
122 /* Connection tracking fields. */ 147 /* Connection tracking fields not packed above. */
123 u16 zone; 148 struct {
149 __be16 src; /* CT orig tuple tp src port. */
150 __be16 dst; /* CT orig tuple tp dst port. */
151 } orig_tp;
124 u32 mark; 152 u32 mark;
125 u8 state;
126 struct ovs_key_ct_labels labels; 153 struct ovs_key_ct_labels labels;
127 } ct; 154 } ct;
128 155
129} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */ 156} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */
130 157
158static inline bool sw_flow_key_is_nd(const struct sw_flow_key *key)
159{
160 return key->eth.type == htons(ETH_P_IPV6) &&
161 key->ip.proto == NEXTHDR_ICMP &&
162 key->tp.dst == 0 &&
163 (key->tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) ||
164 key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT));
165}
166
131struct sw_flow_key_range { 167struct sw_flow_key_range {
132 unsigned short int start; 168 unsigned short int start;
133 unsigned short int end; 169 unsigned short int end;
@@ -206,6 +242,21 @@ struct arp_eth_header {
206 unsigned char ar_tip[4]; /* target IP address */ 242 unsigned char ar_tip[4]; /* target IP address */
207} __packed; 243} __packed;
208 244
245static inline u8 ovs_key_mac_proto(const struct sw_flow_key *key)
246{
247 return key->mac_proto & ~SW_FLOW_KEY_INVALID;
248}
249
250static inline u16 __ovs_mac_header_len(u8 mac_proto)
251{
252 return mac_proto == MAC_PROTO_ETHERNET ? ETH_HLEN : 0;
253}
254
255static inline u16 ovs_mac_header_len(const struct sw_flow_key *key)
256{
257 return __ovs_mac_header_len(ovs_key_mac_proto(key));
258}
259
209static inline bool ovs_identifier_is_ufid(const struct sw_flow_id *sfid) 260static inline bool ovs_identifier_is_ufid(const struct sw_flow_id *sfid)
210{ 261{
211 return sfid->ufid_len; 262 return sfid->ufid_len;
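
flow.h packs the MAC protocol and an "invalid key" flag into the single mac_proto byte; ovs_key_mac_proto() masks the flag off and ovs_mac_header_len() maps the protocol to the L2 header length (ETH_HLEN or 0). An illustrative stand-alone version of that packing; the constants mirror the patch but the struct is a stand-in.

    #include <stdint.h>
    #include <stdio.h>

    #define KEY_INVALID 0x80    /* corresponds to SW_FLOW_KEY_INVALID */
    enum { MACP_NONE = 0, MACP_ETHERNET = 1 };

    struct mini_key { uint8_t mac_proto; };

    static uint8_t key_mac_proto(const struct mini_key *k)
    {
        return k->mac_proto & ~KEY_INVALID;
    }

    static uint16_t mac_header_len(const struct mini_key *k)
    {
        return key_mac_proto(k) == MACP_ETHERNET ? 14 : 0;    /* ETH_HLEN or none */
    }

    int main(void)
    {
        struct mini_key k = { .mac_proto = MACP_ETHERNET | KEY_INVALID };

        printf("proto=%u hdr_len=%u invalid=%d\n",
               key_mac_proto(&k), mac_header_len(&k),
               !!(k.mac_proto & KEY_INVALID));
        return 0;
    }
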
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index ae25ded82b3b..1105a838bab8 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -123,13 +123,15 @@ static void update_range(struct sw_flow_match *match,
123static bool match_validate(const struct sw_flow_match *match, 123static bool match_validate(const struct sw_flow_match *match,
124 u64 key_attrs, u64 mask_attrs, bool log) 124 u64 key_attrs, u64 mask_attrs, bool log)
125{ 125{
126 u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET; 126 u64 key_expected = 0;
127 u64 mask_allowed = key_attrs; /* At most allow all key attributes */ 127 u64 mask_allowed = key_attrs; /* At most allow all key attributes */
128 128
129 /* The following mask attributes allowed only if they 129 /* The following mask attributes allowed only if they
130 * pass the validation tests. */ 130 * pass the validation tests. */
131 mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4) 131 mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4)
132 | (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4)
132 | (1 << OVS_KEY_ATTR_IPV6) 133 | (1 << OVS_KEY_ATTR_IPV6)
134 | (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)
133 | (1 << OVS_KEY_ATTR_TCP) 135 | (1 << OVS_KEY_ATTR_TCP)
134 | (1 << OVS_KEY_ATTR_TCP_FLAGS) 136 | (1 << OVS_KEY_ATTR_TCP_FLAGS)
135 | (1 << OVS_KEY_ATTR_UDP) 137 | (1 << OVS_KEY_ATTR_UDP)
@@ -161,8 +163,10 @@ static bool match_validate(const struct sw_flow_match *match,
161 163
162 if (match->key->eth.type == htons(ETH_P_IP)) { 164 if (match->key->eth.type == htons(ETH_P_IP)) {
163 key_expected |= 1 << OVS_KEY_ATTR_IPV4; 165 key_expected |= 1 << OVS_KEY_ATTR_IPV4;
164 if (match->mask && (match->mask->key.eth.type == htons(0xffff))) 166 if (match->mask && match->mask->key.eth.type == htons(0xffff)) {
165 mask_allowed |= 1 << OVS_KEY_ATTR_IPV4; 167 mask_allowed |= 1 << OVS_KEY_ATTR_IPV4;
168 mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4;
169 }
166 170
167 if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { 171 if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
168 if (match->key->ip.proto == IPPROTO_UDP) { 172 if (match->key->ip.proto == IPPROTO_UDP) {
@@ -196,8 +200,10 @@ static bool match_validate(const struct sw_flow_match *match,
196 200
197 if (match->key->eth.type == htons(ETH_P_IPV6)) { 201 if (match->key->eth.type == htons(ETH_P_IPV6)) {
198 key_expected |= 1 << OVS_KEY_ATTR_IPV6; 202 key_expected |= 1 << OVS_KEY_ATTR_IPV6;
199 if (match->mask && (match->mask->key.eth.type == htons(0xffff))) 203 if (match->mask && match->mask->key.eth.type == htons(0xffff)) {
200 mask_allowed |= 1 << OVS_KEY_ATTR_IPV6; 204 mask_allowed |= 1 << OVS_KEY_ATTR_IPV6;
205 mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6;
206 }
201 207
202 if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { 208 if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
203 if (match->key->ip.proto == IPPROTO_UDP) { 209 if (match->key->ip.proto == IPPROTO_UDP) {
@@ -230,6 +236,12 @@ static bool match_validate(const struct sw_flow_match *match,
230 htons(NDISC_NEIGHBOUR_SOLICITATION) || 236 htons(NDISC_NEIGHBOUR_SOLICITATION) ||
231 match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { 237 match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
232 key_expected |= 1 << OVS_KEY_ATTR_ND; 238 key_expected |= 1 << OVS_KEY_ATTR_ND;
239 /* Original direction conntrack tuple
240 * uses the same space as the ND fields
241 * in the key, so both are not allowed
242 * at the same time.
243 */
244 mask_allowed &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6);
233 if (match->mask && (match->mask->key.tp.src == htons(0xff))) 245 if (match->mask && (match->mask->key.tp.src == htons(0xff)))
234 mask_allowed |= 1 << OVS_KEY_ATTR_ND; 246 mask_allowed |= 1 << OVS_KEY_ATTR_ND;
235 } 247 }
@@ -282,7 +294,7 @@ size_t ovs_key_attr_size(void)
282 /* Whenever adding new OVS_KEY_ FIELDS, we should consider 294 /* Whenever adding new OVS_KEY_ FIELDS, we should consider
283 * updating this function. 295 * updating this function.
284 */ 296 */
285 BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 26); 297 BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 28);
286 298
287 return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ 299 return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */
288 + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ 300 + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */
@@ -295,6 +307,7 @@ size_t ovs_key_attr_size(void)
295 + nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */ 307 + nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */
296 + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */ 308 + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */
297 + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */ 309 + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */
310 + nla_total_size(40) /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */
298 + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ 311 + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */
299 + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ 312 + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */
300 + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ 313 + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */
@@ -355,6 +368,10 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
355 [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, 368 [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) },
356 [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) }, 369 [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) },
357 [OVS_KEY_ATTR_CT_LABELS] = { .len = sizeof(struct ovs_key_ct_labels) }, 370 [OVS_KEY_ATTR_CT_LABELS] = { .len = sizeof(struct ovs_key_ct_labels) },
371 [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4] = {
372 .len = sizeof(struct ovs_key_ct_tuple_ipv4) },
373 [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = {
374 .len = sizeof(struct ovs_key_ct_tuple_ipv6) },
358}; 375};
359 376
360static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) 377static bool check_attr_len(unsigned int attr_len, unsigned int expected_len)
@@ -430,9 +447,8 @@ static int parse_flow_mask_nlattrs(const struct nlattr *attr,
430 return __parse_flow_nlattrs(attr, a, attrsp, log, true); 447 return __parse_flow_nlattrs(attr, a, attrsp, log, true);
431} 448}
432 449
433static int parse_flow_nlattrs(const struct nlattr *attr, 450int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[],
434 const struct nlattr *a[], u64 *attrsp, 451 u64 *attrsp, bool log)
435 bool log)
436{ 452{
437 return __parse_flow_nlattrs(attr, a, attrsp, log, false); 453 return __parse_flow_nlattrs(attr, a, attrsp, log, false);
438} 454}
@@ -588,7 +604,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
588 ipv4 = true; 604 ipv4 = true;
589 break; 605 break;
590 case OVS_TUNNEL_KEY_ATTR_IPV6_SRC: 606 case OVS_TUNNEL_KEY_ATTR_IPV6_SRC:
591 SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst, 607 SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.src,
592 nla_get_in6_addr(a), is_mask); 608 nla_get_in6_addr(a), is_mask);
593 ipv6 = true; 609 ipv6 = true;
594 break; 610 break;
@@ -649,6 +665,8 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
649 tun_flags |= TUNNEL_VXLAN_OPT; 665 tun_flags |= TUNNEL_VXLAN_OPT;
650 opts_type = type; 666 opts_type = type;
651 break; 667 break;
668 case OVS_TUNNEL_KEY_ATTR_PAD:
669 break;
652 default: 670 default:
653 OVS_NLERR(log, "Unknown IP tunnel attribute %d", 671 OVS_NLERR(log, "Unknown IP tunnel attribute %d",
654 type); 672 type);
@@ -969,10 +987,33 @@ static int parse_vlan_from_nlattrs(struct sw_flow_match *match,
969 return 0; 987 return 0;
970} 988}
971 989
990static int parse_eth_type_from_nlattrs(struct sw_flow_match *match,
991 u64 *attrs, const struct nlattr **a,
992 bool is_mask, bool log)
993{
994 __be16 eth_type;
995
996 eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
997 if (is_mask) {
998 /* Always exact match EtherType. */
999 eth_type = htons(0xffff);
1000 } else if (!eth_proto_is_802_3(eth_type)) {
1001 OVS_NLERR(log, "EtherType %x is less than min %x",
1002 ntohs(eth_type), ETH_P_802_3_MIN);
1003 return -EINVAL;
1004 }
1005
1006 SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask);
1007 *attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
1008 return 0;
1009}
1010
972static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, 1011static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
973 u64 *attrs, const struct nlattr **a, 1012 u64 *attrs, const struct nlattr **a,
974 bool is_mask, bool log) 1013 bool is_mask, bool log)
975{ 1014{
1015 u8 mac_proto = MAC_PROTO_ETHERNET;
1016
976 if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) { 1017 if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) {
977 u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]); 1018 u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]);
978 1019
@@ -1033,14 +1074,14 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
1033 return -EINVAL; 1074 return -EINVAL;
1034 } 1075 }
1035 1076
1036 SW_FLOW_KEY_PUT(match, ct.state, ct_state, is_mask); 1077 SW_FLOW_KEY_PUT(match, ct_state, ct_state, is_mask);
1037 *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE); 1078 *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE);
1038 } 1079 }
1039 if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) && 1080 if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) &&
1040 ovs_ct_verify(net, OVS_KEY_ATTR_CT_ZONE)) { 1081 ovs_ct_verify(net, OVS_KEY_ATTR_CT_ZONE)) {
1041 u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]); 1082 u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]);
1042 1083
1043 SW_FLOW_KEY_PUT(match, ct.zone, ct_zone, is_mask); 1084 SW_FLOW_KEY_PUT(match, ct_zone, ct_zone, is_mask);
1044 *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE); 1085 *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE);
1045 } 1086 }
1046 if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) && 1087 if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) &&
@@ -1059,6 +1100,49 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
1059 sizeof(*cl), is_mask); 1100 sizeof(*cl), is_mask);
1060 *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS); 1101 *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS);
1061 } 1102 }
1103 if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4)) {
1104 const struct ovs_key_ct_tuple_ipv4 *ct;
1105
1106 ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4]);
1107
1108 SW_FLOW_KEY_PUT(match, ipv4.ct_orig.src, ct->ipv4_src, is_mask);
1109 SW_FLOW_KEY_PUT(match, ipv4.ct_orig.dst, ct->ipv4_dst, is_mask);
1110 SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask);
1111 SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask);
1112 SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv4_proto, is_mask);
1113 *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4);
1114 }
1115 if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)) {
1116 const struct ovs_key_ct_tuple_ipv6 *ct;
1117
1118 ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6]);
1119
1120 SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.src, &ct->ipv6_src,
1121 sizeof(match->key->ipv6.ct_orig.src),
1122 is_mask);
1123 SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.dst, &ct->ipv6_dst,
1124 sizeof(match->key->ipv6.ct_orig.dst),
1125 is_mask);
1126 SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask);
1127 SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask);
1128 SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv6_proto, is_mask);
1129 *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6);
1130 }
1131
1132 /* For layer 3 packets the Ethernet type is provided
1133 * and treated as metadata but no MAC addresses are provided.
1134 */
1135 if (!(*attrs & (1ULL << OVS_KEY_ATTR_ETHERNET)) &&
1136 (*attrs & (1ULL << OVS_KEY_ATTR_ETHERTYPE)))
1137 mac_proto = MAC_PROTO_NONE;
1138
1139 /* Always exact match mac_proto */
1140 SW_FLOW_KEY_PUT(match, mac_proto, is_mask ? 0xff : mac_proto, is_mask);
1141
1142 if (mac_proto == MAC_PROTO_NONE)
1143 return parse_eth_type_from_nlattrs(match, attrs, a, is_mask,
1144 log);
1145
1062 return 0; 1146 return 0;
1063} 1147}
1064 1148
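
metadata_from_nlattrs() infers an L3 flow when an ETHERTYPE attribute is present without an ETHERNET attribute, always exact-matches mac_proto, and then parses the EtherType as metadata. A minimal sketch of that inference; the bit positions are arbitrary stand-ins for the uapi attribute numbers.

    #include <stdint.h>
    #include <stdio.h>

    #define ATTR_ETHERNET  (1ULL << 0)
    #define ATTR_ETHERTYPE (1ULL << 1)

    enum { MACP_NONE = 0, MACP_ETHERNET = 1 };

    static int mac_proto_from_attrs(uint64_t attrs)
    {
        if (!(attrs & ATTR_ETHERNET) && (attrs & ATTR_ETHERTYPE))
            return MACP_NONE;    /* L3 flow: EtherType given as metadata */
        return MACP_ETHERNET;
    }

    int main(void)
    {
        printf("%d %d\n",
               mac_proto_from_attrs(ATTR_ETHERTYPE),
               mac_proto_from_attrs(ATTR_ETHERNET | ATTR_ETHERTYPE));
        return 0;
    }
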
@@ -1081,33 +1165,26 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match,
1081 SW_FLOW_KEY_MEMCPY(match, eth.dst, 1165 SW_FLOW_KEY_MEMCPY(match, eth.dst,
1082 eth_key->eth_dst, ETH_ALEN, is_mask); 1166 eth_key->eth_dst, ETH_ALEN, is_mask);
1083 attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET); 1167 attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET);
1084 }
1085 1168
1086 if (attrs & (1 << OVS_KEY_ATTR_VLAN)) { 1169 if (attrs & (1 << OVS_KEY_ATTR_VLAN)) {
1087 /* VLAN attribute is always parsed before getting here since it 1170 /* VLAN attribute is always parsed before getting here since it
1088 * may occur multiple times. 1171 * may occur multiple times.
1089 */ 1172 */
1090 OVS_NLERR(log, "VLAN attribute unexpected."); 1173 OVS_NLERR(log, "VLAN attribute unexpected.");
1091 return -EINVAL;
1092 }
1093
1094 if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
1095 __be16 eth_type;
1096
1097 eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
1098 if (is_mask) {
1099 /* Always exact match EtherType. */
1100 eth_type = htons(0xffff);
1101 } else if (!eth_proto_is_802_3(eth_type)) {
1102 OVS_NLERR(log, "EtherType %x is less than min %x",
1103 ntohs(eth_type), ETH_P_802_3_MIN);
1104 return -EINVAL; 1174 return -EINVAL;
1105 } 1175 }
1106 1176
1107 SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask); 1177 if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
1108 attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); 1178 err = parse_eth_type_from_nlattrs(match, &attrs, a, is_mask,
1109 } else if (!is_mask) { 1179 log);
1110 SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask); 1180 if (err)
1181 return err;
1182 } else if (!is_mask) {
1183 SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask);
1184 }
1185 } else if (!match->key->eth.type) {
1186 OVS_NLERR(log, "Either Ethernet header or EtherType is required.");
1187 return -EINVAL;
1111 } 1188 }
1112 1189
1113 if (attrs & (1 << OVS_KEY_ATTR_IPV4)) { 1190 if (attrs & (1 << OVS_KEY_ATTR_IPV4)) {
@@ -1462,9 +1539,12 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr)
1462 1539
1463/** 1540/**
1464 * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key. 1541 * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key.
1465 * @key: Receives extracted in_port, priority, tun_key and skb_mark. 1542 * @net: Network namespace.
1466 * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute 1543 * @key: Receives extracted in_port, priority, tun_key, skb_mark and conntrack
1467 * sequence. 1544 * metadata.
1545 * @a: Array of netlink attributes holding parsed %OVS_KEY_ATTR_* Netlink
1546 * attributes.
1547 * @attrs: Bit mask for the netlink attributes included in @a.
1468 * @log: Boolean to allow kernel error logging. Normally true, but when 1548 * @log: Boolean to allow kernel error logging. Normally true, but when
1469 * probing for feature compatibility this should be passed in as false to 1549 * probing for feature compatibility this should be passed in as false to
1470 * suppress unnecessary error logging. 1550 * suppress unnecessary error logging.
@@ -1473,25 +1553,26 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr)
1473 * take the same form accepted by flow_from_nlattrs(), but only enough of it to 1553 * take the same form accepted by flow_from_nlattrs(), but only enough of it to
1474 * get the metadata, that is, the parts of the flow key that cannot be 1554 * get the metadata, that is, the parts of the flow key that cannot be
1475 * extracted from the packet itself. 1555 * extracted from the packet itself.
1556 *
1557 * This must be called before the packet key fields are filled in 'key'.
1476 */ 1558 */
1477 1559
1478int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr, 1560int ovs_nla_get_flow_metadata(struct net *net,
1479 struct sw_flow_key *key, 1561 const struct nlattr *a[OVS_KEY_ATTR_MAX + 1],
1480 bool log) 1562 u64 attrs, struct sw_flow_key *key, bool log)
1481{ 1563{
1482 const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
1483 struct sw_flow_match match; 1564 struct sw_flow_match match;
1484 u64 attrs = 0;
1485 int err;
1486
1487 err = parse_flow_nlattrs(attr, a, &attrs, log);
1488 if (err)
1489 return -EINVAL;
1490 1565
1491 memset(&match, 0, sizeof(match)); 1566 memset(&match, 0, sizeof(match));
1492 match.key = key; 1567 match.key = key;
1493 1568
1569 key->ct_state = 0;
1570 key->ct_zone = 0;
1571 key->ct_orig_proto = 0;
1494 memset(&key->ct, 0, sizeof(key->ct)); 1572 memset(&key->ct, 0, sizeof(key->ct));
1573 memset(&key->ipv4.ct_orig, 0, sizeof(key->ipv4.ct_orig));
1574 memset(&key->ipv6.ct_orig, 0, sizeof(key->ipv6.ct_orig));
1575
1495 key->phy.in_port = DP_MAX_PORTS; 1576 key->phy.in_port = DP_MAX_PORTS;
1496 1577
1497 return metadata_from_nlattrs(net, &match, &attrs, a, false, log); 1578 return metadata_from_nlattrs(net, &match, &attrs, a, false, log);
@@ -1553,45 +1634,47 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
1553 if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) 1634 if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark))
1554 goto nla_put_failure; 1635 goto nla_put_failure;
1555 1636
1556 if (ovs_ct_put_key(output, skb)) 1637 if (ovs_ct_put_key(swkey, output, skb))
1557 goto nla_put_failure; 1638 goto nla_put_failure;
1558 1639
1559 nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); 1640 if (ovs_key_mac_proto(swkey) == MAC_PROTO_ETHERNET) {
1560 if (!nla) 1641 nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
1561 goto nla_put_failure; 1642 if (!nla)
1562
1563 eth_key = nla_data(nla);
1564 ether_addr_copy(eth_key->eth_src, output->eth.src);
1565 ether_addr_copy(eth_key->eth_dst, output->eth.dst);
1566
1567 if (swkey->eth.vlan.tci || eth_type_vlan(swkey->eth.type)) {
1568 if (ovs_nla_put_vlan(skb, &output->eth.vlan, is_mask))
1569 goto nla_put_failure; 1643 goto nla_put_failure;
1570 encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
1571 if (!swkey->eth.vlan.tci)
1572 goto unencap;
1573 1644
1574 if (swkey->eth.cvlan.tci || eth_type_vlan(swkey->eth.type)) { 1645 eth_key = nla_data(nla);
1575 if (ovs_nla_put_vlan(skb, &output->eth.cvlan, is_mask)) 1646 ether_addr_copy(eth_key->eth_src, output->eth.src);
1647 ether_addr_copy(eth_key->eth_dst, output->eth.dst);
1648
1649 if (swkey->eth.vlan.tci || eth_type_vlan(swkey->eth.type)) {
1650 if (ovs_nla_put_vlan(skb, &output->eth.vlan, is_mask))
1576 goto nla_put_failure; 1651 goto nla_put_failure;
1577 in_encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); 1652 encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
1578 if (!swkey->eth.cvlan.tci) 1653 if (!swkey->eth.vlan.tci)
1579 goto unencap; 1654 goto unencap;
1655
1656 if (swkey->eth.cvlan.tci || eth_type_vlan(swkey->eth.type)) {
1657 if (ovs_nla_put_vlan(skb, &output->eth.cvlan, is_mask))
1658 goto nla_put_failure;
1659 in_encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
1660 if (!swkey->eth.cvlan.tci)
1661 goto unencap;
1662 }
1580 } 1663 }
1581 }
1582 1664
1583 if (swkey->eth.type == htons(ETH_P_802_2)) { 1665 if (swkey->eth.type == htons(ETH_P_802_2)) {
1584 /* 1666 /*
1585 * Ethertype 802.2 is represented in the netlink with omitted 1667 * Ethertype 802.2 is represented in the netlink with omitted
1586 * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and 1668 * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and
1587 * 0xffff in the mask attribute. Ethertype can also 1669 * 0xffff in the mask attribute. Ethertype can also
1588 * be wildcarded. 1670 * be wildcarded.
1589 */ 1671 */
1590 if (is_mask && output->eth.type) 1672 if (is_mask && output->eth.type)
1591 if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, 1673 if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE,
1592 output->eth.type)) 1674 output->eth.type))
1593 goto nla_put_failure; 1675 goto nla_put_failure;
1594 goto unencap; 1676 goto unencap;
1677 }
1595 } 1678 }
1596 1679
1597 if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type)) 1680 if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type))
@@ -2126,8 +2209,8 @@ static bool validate_masked(u8 *data, int len)
2126 2209
2127static int validate_set(const struct nlattr *a, 2210static int validate_set(const struct nlattr *a,
2128 const struct sw_flow_key *flow_key, 2211 const struct sw_flow_key *flow_key,
2129 struct sw_flow_actions **sfa, 2212 struct sw_flow_actions **sfa, bool *skip_copy,
2130 bool *skip_copy, __be16 eth_type, bool masked, bool log) 2213 u8 mac_proto, __be16 eth_type, bool masked, bool log)
2131{ 2214{
2132 const struct nlattr *ovs_key = nla_data(a); 2215 const struct nlattr *ovs_key = nla_data(a);
2133 int key_type = nla_type(ovs_key); 2216 int key_type = nla_type(ovs_key);
@@ -2157,7 +2240,11 @@ static int validate_set(const struct nlattr *a,
2157 case OVS_KEY_ATTR_SKB_MARK: 2240 case OVS_KEY_ATTR_SKB_MARK:
2158 case OVS_KEY_ATTR_CT_MARK: 2241 case OVS_KEY_ATTR_CT_MARK:
2159 case OVS_KEY_ATTR_CT_LABELS: 2242 case OVS_KEY_ATTR_CT_LABELS:
2243 break;
2244
2160 case OVS_KEY_ATTR_ETHERNET: 2245 case OVS_KEY_ATTR_ETHERNET:
2246 if (mac_proto != MAC_PROTO_ETHERNET)
2247 return -EINVAL;
2161 break; 2248 break;
2162 2249
2163 case OVS_KEY_ATTR_TUNNEL: 2250 case OVS_KEY_ATTR_TUNNEL:
@@ -2324,6 +2411,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
2324 int depth, struct sw_flow_actions **sfa, 2411 int depth, struct sw_flow_actions **sfa,
2325 __be16 eth_type, __be16 vlan_tci, bool log) 2412 __be16 eth_type, __be16 vlan_tci, bool log)
2326{ 2413{
2414 u8 mac_proto = ovs_key_mac_proto(key);
2327 const struct nlattr *a; 2415 const struct nlattr *a;
2328 int rem, err; 2416 int rem, err;
2329 2417
@@ -2346,6 +2434,8 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
2346 [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash), 2434 [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash),
2347 [OVS_ACTION_ATTR_CT] = (u32)-1, 2435 [OVS_ACTION_ATTR_CT] = (u32)-1,
2348 [OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc), 2436 [OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc),
2437 [OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth),
2438 [OVS_ACTION_ATTR_POP_ETH] = 0,
2349 }; 2439 };
2350 const struct ovs_action_push_vlan *vlan; 2440 const struct ovs_action_push_vlan *vlan;
2351 int type = nla_type(a); 2441 int type = nla_type(a);
@@ -2394,10 +2484,14 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
2394 } 2484 }
2395 2485
2396 case OVS_ACTION_ATTR_POP_VLAN: 2486 case OVS_ACTION_ATTR_POP_VLAN:
2487 if (mac_proto != MAC_PROTO_ETHERNET)
2488 return -EINVAL;
2397 vlan_tci = htons(0); 2489 vlan_tci = htons(0);
2398 break; 2490 break;
2399 2491
2400 case OVS_ACTION_ATTR_PUSH_VLAN: 2492 case OVS_ACTION_ATTR_PUSH_VLAN:
2493 if (mac_proto != MAC_PROTO_ETHERNET)
2494 return -EINVAL;
2401 vlan = nla_data(a); 2495 vlan = nla_data(a);
2402 if (!eth_type_vlan(vlan->vlan_tpid)) 2496 if (!eth_type_vlan(vlan->vlan_tpid))
2403 return -EINVAL; 2497 return -EINVAL;
@@ -2447,14 +2541,16 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
2447 2541
2448 case OVS_ACTION_ATTR_SET: 2542 case OVS_ACTION_ATTR_SET:
2449 err = validate_set(a, key, sfa, 2543 err = validate_set(a, key, sfa,
2450 &skip_copy, eth_type, false, log); 2544 &skip_copy, mac_proto, eth_type,
2545 false, log);
2451 if (err) 2546 if (err)
2452 return err; 2547 return err;
2453 break; 2548 break;
2454 2549
2455 case OVS_ACTION_ATTR_SET_MASKED: 2550 case OVS_ACTION_ATTR_SET_MASKED:
2456 err = validate_set(a, key, sfa, 2551 err = validate_set(a, key, sfa,
2457 &skip_copy, eth_type, true, log); 2552 &skip_copy, mac_proto, eth_type,
2553 true, log);
2458 if (err) 2554 if (err)
2459 return err; 2555 return err;
2460 break; 2556 break;
@@ -2474,6 +2570,22 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
2474 skip_copy = true; 2570 skip_copy = true;
2475 break; 2571 break;
2476 2572
2573 case OVS_ACTION_ATTR_PUSH_ETH:
2574 /* Disallow pushing an Ethernet header if one
2575 * is already present */
2576 if (mac_proto != MAC_PROTO_NONE)
2577 return -EINVAL;
2578 mac_proto = MAC_PROTO_NONE;
2579 break;
2580
2581 case OVS_ACTION_ATTR_POP_ETH:
2582 if (mac_proto != MAC_PROTO_ETHERNET)
2583 return -EINVAL;
2584 if (vlan_tci & htons(VLAN_TAG_PRESENT))
2585 return -EINVAL;
2586 mac_proto = MAC_PROTO_ETHERNET;
2587 break;
2588
2477 default: 2589 default:
2478 OVS_NLERR(log, "Unknown Action type %d", type); 2590 OVS_NLERR(log, "Unknown Action type %d", type);
2479 return -EINVAL; 2591 return -EINVAL;
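Taken together, the flow_netlink.c hunks above thread a mac_proto value through action validation so that Ethernet-level operations (setting Ethernet fields, push/pop VLAN, push/pop Ethernet) are only accepted when the packet actually carries an Ethernet header at that point in the action list. As a rough orientation aid, here is a self-contained C sketch of that state machine over a flattened action list; it is illustrative only, uses stand-in action codes, and writes the post-action states according to the intended packet layout rather than mirroring the hunk line for line.

#include <errno.h>

enum { MAC_PROTO_NONE, MAC_PROTO_ETHERNET };
enum { ACT_PUSH_ETH, ACT_POP_ETH, ACT_PUSH_VLAN, ACT_POP_VLAN, ACT_SET_ETHERNET };

/* Walk a flattened action list while tracking whether the packet currently
 * has an Ethernet header; reject actions that make no sense for the current
 * layout.  Illustrative sketch, not the kernel code.
 */
static int validate_actions(const int *acts, int n, int mac_proto)
{
	for (int i = 0; i < n; i++) {
		switch (acts[i]) {
		case ACT_PUSH_ETH:
			if (mac_proto != MAC_PROTO_NONE)
				return -EINVAL;         /* header already present */
			mac_proto = MAC_PROTO_ETHERNET; /* Ethernet-framed from here on */
			break;
		case ACT_POP_ETH:
			if (mac_proto != MAC_PROTO_ETHERNET)
				return -EINVAL;         /* nothing to pop */
			mac_proto = MAC_PROTO_NONE;     /* bare L3 payload from here on */
			break;
		case ACT_PUSH_VLAN:
		case ACT_POP_VLAN:
		case ACT_SET_ETHERNET:
			if (mac_proto != MAC_PROTO_ETHERNET)
				return -EINVAL;         /* needs an Ethernet header */
			break;
		}
	}
	return 0;
}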
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 45f9769e5aac..929c665ac3aa 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -46,8 +46,11 @@ void ovs_match_init(struct sw_flow_match *match,
46 46
47int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *, 47int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *,
48 int attr, bool is_mask, struct sk_buff *); 48 int attr, bool is_mask, struct sk_buff *);
49int ovs_nla_get_flow_metadata(struct net *, const struct nlattr *, 49int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[],
50 struct sw_flow_key *, bool log); 50 u64 *attrsp, bool log);
51int ovs_nla_get_flow_metadata(struct net *net,
52 const struct nlattr *a[OVS_KEY_ATTR_MAX + 1],
53 u64 attrs, struct sw_flow_key *key, bool log);
51 54
52int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb); 55int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb);
53int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb); 56int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb);
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index e7da29021b38..89193a634da4 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -89,15 +89,6 @@ static const struct ethtool_ops internal_dev_ethtool_ops = {
89 .get_link = ethtool_op_get_link, 89 .get_link = ethtool_op_get_link,
90}; 90};
91 91
92static int internal_dev_change_mtu(struct net_device *netdev, int new_mtu)
93{
94 if (new_mtu < 68)
95 return -EINVAL;
96
97 netdev->mtu = new_mtu;
98 return 0;
99}
100
101static void internal_dev_destructor(struct net_device *dev) 92static void internal_dev_destructor(struct net_device *dev)
102{ 93{
103 struct vport *vport = ovs_internal_dev_get_vport(dev); 94 struct vport *vport = ovs_internal_dev_get_vport(dev);
@@ -106,7 +97,7 @@ static void internal_dev_destructor(struct net_device *dev)
106 free_netdev(dev); 97 free_netdev(dev);
107} 98}
108 99
109static struct rtnl_link_stats64 * 100static void
110internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) 101internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
111{ 102{
112 int i; 103 int i;
@@ -134,8 +125,6 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
134 stats->tx_bytes += local_stats.tx_bytes; 125 stats->tx_bytes += local_stats.tx_bytes;
135 stats->tx_packets += local_stats.tx_packets; 126 stats->tx_packets += local_stats.tx_packets;
136 } 127 }
137
138 return stats;
139} 128}
140 129
141static void internal_set_rx_headroom(struct net_device *dev, int new_hr) 130static void internal_set_rx_headroom(struct net_device *dev, int new_hr)
@@ -148,7 +137,6 @@ static const struct net_device_ops internal_dev_netdev_ops = {
148 .ndo_stop = internal_dev_stop, 137 .ndo_stop = internal_dev_stop,
149 .ndo_start_xmit = internal_dev_xmit, 138 .ndo_start_xmit = internal_dev_xmit,
150 .ndo_set_mac_address = eth_mac_addr, 139 .ndo_set_mac_address = eth_mac_addr,
151 .ndo_change_mtu = internal_dev_change_mtu,
152 .ndo_get_stats64 = internal_get_stats, 140 .ndo_get_stats64 = internal_get_stats,
153 .ndo_set_rx_headroom = internal_set_rx_headroom, 141 .ndo_set_rx_headroom = internal_set_rx_headroom,
154}; 142};
@@ -161,6 +149,8 @@ static void do_setup(struct net_device *netdev)
161{ 149{
162 ether_setup(netdev); 150 ether_setup(netdev);
163 151
152 netdev->max_mtu = ETH_MAX_MTU;
153
164 netdev->netdev_ops = &internal_dev_netdev_ops; 154 netdev->netdev_ops = &internal_dev_netdev_ops;
165 155
166 netdev->priv_flags &= ~IFF_TX_SKB_SHARING; 156 netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
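The vport-internal_dev.c hunks above drop the driver's private ndo_change_mtu range check and instead declare the allowed range through the netdevice's new MTU bounds, leaving enforcement to the core (the phonet pep-gprs hunk further down follows the same pattern with min_mtu/max_mtu). A minimal sketch of that convention, with example_setup() as a made-up driver setup routine:

#include <linux/etherdevice.h>
#include <linux/netdevice.h>

static void example_setup(struct net_device *netdev)
{
	ether_setup(netdev);

	/* dev_set_mtu() in the core rejects values outside [min_mtu, max_mtu],
	 * so a driver whose ndo_change_mtu only range-checked the new value
	 * can drop the callback and declare the range instead.
	 */
	netdev->min_mtu = ETH_MIN_MTU;
	netdev->max_mtu = ETH_MAX_MTU;
}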
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 4e3972344aa6..0389398fa4ab 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -57,8 +57,10 @@ static void netdev_port_receive(struct sk_buff *skb)
57 if (unlikely(!skb)) 57 if (unlikely(!skb))
58 return; 58 return;
59 59
60 skb_push(skb, ETH_HLEN); 60 if (skb->dev->type == ARPHRD_ETHER) {
61 skb_postpush_rcsum(skb, skb->data, ETH_HLEN); 61 skb_push(skb, ETH_HLEN);
62 skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
63 }
62 ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); 64 ovs_vport_receive(vport, skb, skb_tunnel_info(skb));
63 return; 65 return;
64error: 66error:
@@ -97,7 +99,8 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name)
97 } 99 }
98 100
99 if (vport->dev->flags & IFF_LOOPBACK || 101 if (vport->dev->flags & IFF_LOOPBACK ||
100 vport->dev->type != ARPHRD_ETHER || 102 (vport->dev->type != ARPHRD_ETHER &&
103 vport->dev->type != ARPHRD_NONE) ||
101 ovs_is_internal_dev(vport->dev)) { 104 ovs_is_internal_dev(vport->dev)) {
102 err = -EINVAL; 105 err = -EINVAL;
103 goto error_put; 106 goto error_put;
@@ -162,7 +165,6 @@ void ovs_netdev_detach_dev(struct vport *vport)
162 netdev_master_upper_dev_get(vport->dev)); 165 netdev_master_upper_dev_get(vport->dev));
163 dev_set_promiscuity(vport->dev, -1); 166 dev_set_promiscuity(vport->dev, -1);
164} 167}
165EXPORT_SYMBOL_GPL(ovs_netdev_detach_dev);
166 168
167static void netdev_destroy(struct vport *vport) 169static void netdev_destroy(struct vport *vport)
168{ 170{
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 7387418ac514..b6c8524032a0 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -463,27 +463,11 @@ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
463 ovs_dp_process_packet(skb, &key); 463 ovs_dp_process_packet(skb, &key);
464 return 0; 464 return 0;
465} 465}
466EXPORT_SYMBOL_GPL(ovs_vport_receive);
467 466
468static void free_vport_rcu(struct rcu_head *rcu) 467static unsigned int packet_length(const struct sk_buff *skb,
468 struct net_device *dev)
469{ 469{
470 struct vport *vport = container_of(rcu, struct vport, rcu); 470 unsigned int length = skb->len - dev->hard_header_len;
471
472 ovs_vport_free(vport);
473}
474
475void ovs_vport_deferred_free(struct vport *vport)
476{
477 if (!vport)
478 return;
479
480 call_rcu(&vport->rcu, free_vport_rcu);
481}
482EXPORT_SYMBOL_GPL(ovs_vport_deferred_free);
483
484static unsigned int packet_length(const struct sk_buff *skb)
485{
486 unsigned int length = skb->len - ETH_HLEN;
487 471
488 if (!skb_vlan_tag_present(skb) && 472 if (!skb_vlan_tag_present(skb) &&
489 eth_type_vlan(skb->protocol)) 473 eth_type_vlan(skb->protocol))
@@ -497,14 +481,34 @@ static unsigned int packet_length(const struct sk_buff *skb)
497 return length; 481 return length;
498} 482}
499 483
500void ovs_vport_send(struct vport *vport, struct sk_buff *skb) 484void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto)
501{ 485{
502 int mtu = vport->dev->mtu; 486 int mtu = vport->dev->mtu;
503 487
504 if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { 488 switch (vport->dev->type) {
489 case ARPHRD_NONE:
490 if (mac_proto == MAC_PROTO_ETHERNET) {
491 skb_reset_network_header(skb);
492 skb_reset_mac_len(skb);
493 skb->protocol = htons(ETH_P_TEB);
494 } else if (mac_proto != MAC_PROTO_NONE) {
495 WARN_ON_ONCE(1);
496 goto drop;
497 }
498 break;
499 case ARPHRD_ETHER:
500 if (mac_proto != MAC_PROTO_ETHERNET)
501 goto drop;
502 break;
503 default:
504 goto drop;
505 }
506
507 if (unlikely(packet_length(skb, vport->dev) > mtu &&
508 !skb_is_gso(skb))) {
505 net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", 509 net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
506 vport->dev->name, 510 vport->dev->name,
507 packet_length(skb), mtu); 511 packet_length(skb, vport->dev), mtu);
508 vport->dev->stats.tx_errors++; 512 vport->dev->stats.tx_errors++;
509 goto drop; 513 goto drop;
510 } 514 }
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index f01f28a567ad..cda66c26ad08 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -149,7 +149,6 @@ struct vport_ops {
149struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *, 149struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *,
150 const struct vport_parms *); 150 const struct vport_parms *);
151void ovs_vport_free(struct vport *); 151void ovs_vport_free(struct vport *);
152void ovs_vport_deferred_free(struct vport *vport);
153 152
154#define VPORT_ALIGN 8 153#define VPORT_ALIGN 8
155 154
@@ -198,6 +197,6 @@ int __ovs_vport_ops_register(struct vport_ops *ops);
198 }) 197 })
199 198
200void ovs_vport_ops_unregister(struct vport_ops *ops); 199void ovs_vport_ops_unregister(struct vport_ops *ops);
201void ovs_vport_send(struct vport *vport, struct sk_buff *skb); 200void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto);
202 201
203#endif /* vport.h */ 202#endif /* vport.h */
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index dd2332390c45..8489beff5c25 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -73,7 +73,7 @@
73#include <net/sock.h> 73#include <net/sock.h>
74#include <linux/errno.h> 74#include <linux/errno.h>
75#include <linux/timer.h> 75#include <linux/timer.h>
76#include <asm/uaccess.h> 76#include <linux/uaccess.h>
77#include <asm/ioctls.h> 77#include <asm/ioctls.h>
78#include <asm/page.h> 78#include <asm/page.h>
79#include <asm/cacheflush.h> 79#include <asm/cacheflush.h>
@@ -409,6 +409,9 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
409 flush_dcache_page(pgv_to_page(&h.h2->tp_status)); 409 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
410 break; 410 break;
411 case TPACKET_V3: 411 case TPACKET_V3:
412 h.h3->tp_status = status;
413 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
414 break;
412 default: 415 default:
413 WARN(1, "TPACKET version not supported.\n"); 416 WARN(1, "TPACKET version not supported.\n");
414 BUG(); 417 BUG();
@@ -432,6 +435,8 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
432 flush_dcache_page(pgv_to_page(&h.h2->tp_status)); 435 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
433 return h.h2->tp_status; 436 return h.h2->tp_status;
434 case TPACKET_V3: 437 case TPACKET_V3:
438 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
439 return h.h3->tp_status;
435 default: 440 default:
436 WARN(1, "TPACKET version not supported.\n"); 441 WARN(1, "TPACKET version not supported.\n");
437 BUG(); 442 BUG();
@@ -476,6 +481,9 @@ static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
476 h.h2->tp_nsec = ts.tv_nsec; 481 h.h2->tp_nsec = ts.tv_nsec;
477 break; 482 break;
478 case TPACKET_V3: 483 case TPACKET_V3:
484 h.h3->tp_sec = ts.tv_sec;
485 h.h3->tp_nsec = ts.tv_nsec;
486 break;
479 default: 487 default:
480 WARN(1, "TPACKET version not supported.\n"); 488 WARN(1, "TPACKET version not supported.\n");
481 BUG(); 489 BUG();
@@ -1497,6 +1505,8 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po)
1497 f->arr[f->num_members] = sk; 1505 f->arr[f->num_members] = sk;
1498 smp_wmb(); 1506 smp_wmb();
1499 f->num_members++; 1507 f->num_members++;
1508 if (f->num_members == 1)
1509 dev_add_pack(&f->prot_hook);
1500 spin_unlock(&f->lock); 1510 spin_unlock(&f->lock);
1501} 1511}
1502 1512
@@ -1513,6 +1523,8 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1513 BUG_ON(i >= f->num_members); 1523 BUG_ON(i >= f->num_members);
1514 f->arr[i] = f->arr[f->num_members - 1]; 1524 f->arr[i] = f->arr[f->num_members - 1];
1515 f->num_members--; 1525 f->num_members--;
1526 if (f->num_members == 0)
1527 __dev_remove_pack(&f->prot_hook);
1516 spin_unlock(&f->lock); 1528 spin_unlock(&f->lock);
1517} 1529}
1518 1530
@@ -1619,6 +1631,7 @@ static void fanout_release_data(struct packet_fanout *f)
1619 1631
1620static int fanout_add(struct sock *sk, u16 id, u16 type_flags) 1632static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1621{ 1633{
1634 struct packet_rollover *rollover = NULL;
1622 struct packet_sock *po = pkt_sk(sk); 1635 struct packet_sock *po = pkt_sk(sk);
1623 struct packet_fanout *f, *match; 1636 struct packet_fanout *f, *match;
1624 u8 type = type_flags & 0xff; 1637 u8 type = type_flags & 0xff;
@@ -1641,23 +1654,28 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1641 return -EINVAL; 1654 return -EINVAL;
1642 } 1655 }
1643 1656
1657 mutex_lock(&fanout_mutex);
1658
1659 err = -EINVAL;
1644 if (!po->running) 1660 if (!po->running)
1645 return -EINVAL; 1661 goto out;
1646 1662
1663 err = -EALREADY;
1647 if (po->fanout) 1664 if (po->fanout)
1648 return -EALREADY; 1665 goto out;
1649 1666
1650 if (type == PACKET_FANOUT_ROLLOVER || 1667 if (type == PACKET_FANOUT_ROLLOVER ||
1651 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) { 1668 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
1652 po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL); 1669 err = -ENOMEM;
1653 if (!po->rollover) 1670 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1654 return -ENOMEM; 1671 if (!rollover)
1655 atomic_long_set(&po->rollover->num, 0); 1672 goto out;
1656 atomic_long_set(&po->rollover->num_huge, 0); 1673 atomic_long_set(&rollover->num, 0);
1657 atomic_long_set(&po->rollover->num_failed, 0); 1674 atomic_long_set(&rollover->num_huge, 0);
1675 atomic_long_set(&rollover->num_failed, 0);
1676 po->rollover = rollover;
1658 } 1677 }
1659 1678
1660 mutex_lock(&fanout_mutex);
1661 match = NULL; 1679 match = NULL;
1662 list_for_each_entry(f, &fanout_list, list) { 1680 list_for_each_entry(f, &fanout_list, list) {
1663 if (f->id == id && 1681 if (f->id == id &&
@@ -1687,7 +1705,6 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1687 match->prot_hook.func = packet_rcv_fanout; 1705 match->prot_hook.func = packet_rcv_fanout;
1688 match->prot_hook.af_packet_priv = match; 1706 match->prot_hook.af_packet_priv = match;
1689 match->prot_hook.id_match = match_fanout_group; 1707 match->prot_hook.id_match = match_fanout_group;
1690 dev_add_pack(&match->prot_hook);
1691 list_add(&match->list, &fanout_list); 1708 list_add(&match->list, &fanout_list);
1692 } 1709 }
1693 err = -EINVAL; 1710 err = -EINVAL;
@@ -1704,36 +1721,40 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1704 } 1721 }
1705 } 1722 }
1706out: 1723out:
1707 mutex_unlock(&fanout_mutex); 1724 if (err && rollover) {
1708 if (err) { 1725 kfree(rollover);
1709 kfree(po->rollover);
1710 po->rollover = NULL; 1726 po->rollover = NULL;
1711 } 1727 }
1728 mutex_unlock(&fanout_mutex);
1712 return err; 1729 return err;
1713} 1730}
1714 1731
1715static void fanout_release(struct sock *sk) 1732/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1733 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1734 * It is the responsibility of the caller to call fanout_release_data() and
1735 * free the returned packet_fanout (after synchronize_net())
1736 */
1737static struct packet_fanout *fanout_release(struct sock *sk)
1716{ 1738{
1717 struct packet_sock *po = pkt_sk(sk); 1739 struct packet_sock *po = pkt_sk(sk);
1718 struct packet_fanout *f; 1740 struct packet_fanout *f;
1719 1741
1742 mutex_lock(&fanout_mutex);
1720 f = po->fanout; 1743 f = po->fanout;
1721 if (!f) 1744 if (f) {
1722 return; 1745 po->fanout = NULL;
1723 1746
1724 mutex_lock(&fanout_mutex); 1747 if (atomic_dec_and_test(&f->sk_ref))
1725 po->fanout = NULL; 1748 list_del(&f->list);
1749 else
1750 f = NULL;
1726 1751
1727 if (atomic_dec_and_test(&f->sk_ref)) { 1752 if (po->rollover)
1728 list_del(&f->list); 1753 kfree_rcu(po->rollover, rcu);
1729 dev_remove_pack(&f->prot_hook);
1730 fanout_release_data(f);
1731 kfree(f);
1732 } 1754 }
1733 mutex_unlock(&fanout_mutex); 1755 mutex_unlock(&fanout_mutex);
1734 1756
1735 if (po->rollover) 1757 return f;
1736 kfree_rcu(po->rollover, rcu);
1737} 1758}
1738 1759
1739static bool packet_extra_vlan_len_allowed(const struct net_device *dev, 1760static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
@@ -1967,17 +1988,6 @@ static unsigned int run_filter(struct sk_buff *skb,
1967 return res; 1988 return res;
1968} 1989}
1969 1990
1970static int __packet_rcv_vnet(const struct sk_buff *skb,
1971 struct virtio_net_hdr *vnet_hdr)
1972{
1973 *vnet_hdr = (const struct virtio_net_hdr) { 0 };
1974
1975 if (virtio_net_hdr_from_skb(skb, vnet_hdr, vio_le()))
1976 BUG();
1977
1978 return 0;
1979}
1980
1981static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, 1991static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
1982 size_t *len) 1992 size_t *len)
1983{ 1993{
@@ -1987,7 +1997,7 @@ static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
1987 return -EINVAL; 1997 return -EINVAL;
1988 *len -= sizeof(vnet_hdr); 1998 *len -= sizeof(vnet_hdr);
1989 1999
1990 if (__packet_rcv_vnet(skb, &vnet_hdr)) 2000 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true))
1991 return -EINVAL; 2001 return -EINVAL;
1992 2002
1993 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr)); 2003 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
@@ -2246,8 +2256,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2246 spin_unlock(&sk->sk_receive_queue.lock); 2256 spin_unlock(&sk->sk_receive_queue.lock);
2247 2257
2248 if (po->has_vnet_hdr) { 2258 if (po->has_vnet_hdr) {
2249 if (__packet_rcv_vnet(skb, h.raw + macoff - 2259 if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
2250 sizeof(struct virtio_net_hdr))) { 2260 sizeof(struct virtio_net_hdr),
2261 vio_le(), true)) {
2251 spin_lock(&sk->sk_receive_queue.lock); 2262 spin_lock(&sk->sk_receive_queue.lock);
2252 goto drop_n_account; 2263 goto drop_n_account;
2253 } 2264 }
@@ -2390,8 +2401,6 @@ static void tpacket_set_protocol(const struct net_device *dev,
2390 2401
2391static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len) 2402static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2392{ 2403{
2393 unsigned short gso_type = 0;
2394
2395 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && 2404 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2396 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + 2405 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2397 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 > 2406 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
@@ -2403,69 +2412,22 @@ static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2403 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len) 2412 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2404 return -EINVAL; 2413 return -EINVAL;
2405 2414
2406 if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2407 switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2408 case VIRTIO_NET_HDR_GSO_TCPV4:
2409 gso_type = SKB_GSO_TCPV4;
2410 break;
2411 case VIRTIO_NET_HDR_GSO_TCPV6:
2412 gso_type = SKB_GSO_TCPV6;
2413 break;
2414 case VIRTIO_NET_HDR_GSO_UDP:
2415 gso_type = SKB_GSO_UDP;
2416 break;
2417 default:
2418 return -EINVAL;
2419 }
2420
2421 if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
2422 gso_type |= SKB_GSO_TCP_ECN;
2423
2424 if (vnet_hdr->gso_size == 0)
2425 return -EINVAL;
2426 }
2427
2428 vnet_hdr->gso_type = gso_type; /* changes type, temporary storage */
2429 return 0; 2415 return 0;
2430} 2416}
2431 2417
2432static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len, 2418static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2433 struct virtio_net_hdr *vnet_hdr) 2419 struct virtio_net_hdr *vnet_hdr)
2434{ 2420{
2435 int n;
2436
2437 if (*len < sizeof(*vnet_hdr)) 2421 if (*len < sizeof(*vnet_hdr))
2438 return -EINVAL; 2422 return -EINVAL;
2439 *len -= sizeof(*vnet_hdr); 2423 *len -= sizeof(*vnet_hdr);
2440 2424
2441 n = copy_from_iter(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter); 2425 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
2442 if (n != sizeof(*vnet_hdr))
2443 return -EFAULT; 2426 return -EFAULT;
2444 2427
2445 return __packet_snd_vnet_parse(vnet_hdr, *len); 2428 return __packet_snd_vnet_parse(vnet_hdr, *len);
2446} 2429}
2447 2430
2448static int packet_snd_vnet_gso(struct sk_buff *skb,
2449 struct virtio_net_hdr *vnet_hdr)
2450{
2451 if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2452 u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start);
2453 u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset);
2454
2455 if (!skb_partial_csum_set(skb, s, o))
2456 return -EINVAL;
2457 }
2458
2459 skb_shinfo(skb)->gso_size =
2460 __virtio16_to_cpu(vio_le(), vnet_hdr->gso_size);
2461 skb_shinfo(skb)->gso_type = vnet_hdr->gso_type;
2462
2463 /* Header must be checked, and gso_segs computed. */
2464 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2465 skb_shinfo(skb)->gso_segs = 0;
2466 return 0;
2467}
2468
2469static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, 2431static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2470 void *frame, struct net_device *dev, void *data, int tp_len, 2432 void *frame, struct net_device *dev, void *data, int tp_len,
2471 __be16 proto, unsigned char *addr, int hlen, int copylen, 2433 __be16 proto, unsigned char *addr, int hlen, int copylen,
@@ -2556,6 +2518,13 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
2556 ph.raw = frame; 2518 ph.raw = frame;
2557 2519
2558 switch (po->tp_version) { 2520 switch (po->tp_version) {
2521 case TPACKET_V3:
2522 if (ph.h3->tp_next_offset != 0) {
2523 pr_warn_once("variable sized slot not supported");
2524 return -EINVAL;
2525 }
2526 tp_len = ph.h3->tp_len;
2527 break;
2559 case TPACKET_V2: 2528 case TPACKET_V2:
2560 tp_len = ph.h2->tp_len; 2529 tp_len = ph.h2->tp_len;
2561 break; 2530 break;
@@ -2575,6 +2544,9 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
2575 off_max = po->tx_ring.frame_size - tp_len; 2544 off_max = po->tx_ring.frame_size - tp_len;
2576 if (po->sk.sk_type == SOCK_DGRAM) { 2545 if (po->sk.sk_type == SOCK_DGRAM) {
2577 switch (po->tp_version) { 2546 switch (po->tp_version) {
2547 case TPACKET_V3:
2548 off = ph.h3->tp_net;
2549 break;
2578 case TPACKET_V2: 2550 case TPACKET_V2:
2579 off = ph.h2->tp_net; 2551 off = ph.h2->tp_net;
2580 break; 2552 break;
@@ -2584,6 +2556,9 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
2584 } 2556 }
2585 } else { 2557 } else {
2586 switch (po->tp_version) { 2558 switch (po->tp_version) {
2559 case TPACKET_V3:
2560 off = ph.h3->tp_mac;
2561 break;
2587 case TPACKET_V2: 2562 case TPACKET_V2:
2588 off = ph.h2->tp_mac; 2563 off = ph.h2->tp_mac;
2589 break; 2564 break;
@@ -2725,7 +2700,8 @@ tpacket_error:
2725 } 2700 }
2726 } 2701 }
2727 2702
2728 if (po->has_vnet_hdr && packet_snd_vnet_gso(skb, vnet_hdr)) { 2703 if (po->has_vnet_hdr && virtio_net_hdr_to_skb(skb, vnet_hdr,
2704 vio_le())) {
2729 tp_len = -EINVAL; 2705 tp_len = -EINVAL;
2730 goto tpacket_error; 2706 goto tpacket_error;
2731 } 2707 }
@@ -2813,7 +2789,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2813 struct virtio_net_hdr vnet_hdr = { 0 }; 2789 struct virtio_net_hdr vnet_hdr = { 0 };
2814 int offset = 0; 2790 int offset = 0;
2815 struct packet_sock *po = pkt_sk(sk); 2791 struct packet_sock *po = pkt_sk(sk);
2816 int hlen, tlen; 2792 int hlen, tlen, linear;
2817 int extra_len = 0; 2793 int extra_len = 0;
2818 2794
2819 /* 2795 /*
@@ -2874,8 +2850,9 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2874 err = -ENOBUFS; 2850 err = -ENOBUFS;
2875 hlen = LL_RESERVED_SPACE(dev); 2851 hlen = LL_RESERVED_SPACE(dev);
2876 tlen = dev->needed_tailroom; 2852 tlen = dev->needed_tailroom;
2877 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, 2853 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2878 __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len), 2854 linear = max(linear, min_t(int, len, dev->hard_header_len));
2855 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
2879 msg->msg_flags & MSG_DONTWAIT, &err); 2856 msg->msg_flags & MSG_DONTWAIT, &err);
2880 if (skb == NULL) 2857 if (skb == NULL)
2881 goto out_unlock; 2858 goto out_unlock;
@@ -2916,7 +2893,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2916 packet_pick_tx_queue(dev, skb); 2893 packet_pick_tx_queue(dev, skb);
2917 2894
2918 if (po->has_vnet_hdr) { 2895 if (po->has_vnet_hdr) {
2919 err = packet_snd_vnet_gso(skb, &vnet_hdr); 2896 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
2920 if (err) 2897 if (err)
2921 goto out_free; 2898 goto out_free;
2922 len += sizeof(vnet_hdr); 2899 len += sizeof(vnet_hdr);
@@ -2964,6 +2941,7 @@ static int packet_release(struct socket *sock)
2964{ 2941{
2965 struct sock *sk = sock->sk; 2942 struct sock *sk = sock->sk;
2966 struct packet_sock *po; 2943 struct packet_sock *po;
2944 struct packet_fanout *f;
2967 struct net *net; 2945 struct net *net;
2968 union tpacket_req_u req_u; 2946 union tpacket_req_u req_u;
2969 2947
@@ -3003,9 +2981,14 @@ static int packet_release(struct socket *sock)
3003 packet_set_ring(sk, &req_u, 1, 1); 2981 packet_set_ring(sk, &req_u, 1, 1);
3004 } 2982 }
3005 2983
3006 fanout_release(sk); 2984 f = fanout_release(sk);
3007 2985
3008 synchronize_net(); 2986 synchronize_net();
2987
2988 if (f) {
2989 fanout_release_data(f);
2990 kfree(f);
2991 }
3009 /* 2992 /*
3010 * Now the socket is dead. No more input will appear. 2993 * Now the socket is dead. No more input will appear.
3011 */ 2994 */
@@ -3120,7 +3103,7 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3120 int addr_len) 3103 int addr_len)
3121{ 3104{
3122 struct sock *sk = sock->sk; 3105 struct sock *sk = sock->sk;
3123 char name[15]; 3106 char name[sizeof(uaddr->sa_data) + 1];
3124 3107
3125 /* 3108 /*
3126 * Check legality 3109 * Check legality
@@ -3128,7 +3111,11 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3128 3111
3129 if (addr_len != sizeof(struct sockaddr)) 3112 if (addr_len != sizeof(struct sockaddr))
3130 return -EINVAL; 3113 return -EINVAL;
3131 strlcpy(name, uaddr->sa_data, sizeof(name)); 3114 /* uaddr->sa_data comes from the userspace, it's not guaranteed to be
3115 * zero-terminated.
3116 */
3117 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3118 name[sizeof(uaddr->sa_data)] = 0;
3132 3119
3133 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num); 3120 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
3134} 3121}
@@ -3678,6 +3665,8 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
3678 return -EBUSY; 3665 return -EBUSY;
3679 if (copy_from_user(&val, optval, sizeof(val))) 3666 if (copy_from_user(&val, optval, sizeof(val)))
3680 return -EFAULT; 3667 return -EFAULT;
3668 if (val > INT_MAX)
3669 return -EINVAL;
3681 po->tp_reserve = val; 3670 po->tp_reserve = val;
3682 return 0; 3671 return 0;
3683 } 3672 }
@@ -3957,7 +3946,6 @@ static int packet_notifier(struct notifier_block *this,
3957 } 3946 }
3958 if (msg == NETDEV_UNREGISTER) { 3947 if (msg == NETDEV_UNREGISTER) {
3959 packet_cached_dev_reset(po); 3948 packet_cached_dev_reset(po);
3960 fanout_release(sk);
3961 po->ifindex = -1; 3949 po->ifindex = -1;
3962 if (po->prot_hook.dev) 3950 if (po->prot_hook.dev)
3963 dev_put(po->prot_hook.dev); 3951 dev_put(po->prot_hook.dev);
@@ -4171,11 +4159,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4171 struct tpacket_req *req = &req_u->req; 4159 struct tpacket_req *req = &req_u->req;
4172 4160
4173 lock_sock(sk); 4161 lock_sock(sk);
4174 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
4175 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
4176 net_warn_ratelimited("Tx-ring is not supported.\n");
4177 goto out;
4178 }
4179 4162
4180 rb = tx_ring ? &po->tx_ring : &po->rx_ring; 4163 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4181 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; 4164 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
@@ -4212,8 +4195,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4212 if (unlikely(!PAGE_ALIGNED(req->tp_block_size))) 4195 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
4213 goto out; 4196 goto out;
4214 if (po->tp_version >= TPACKET_V3 && 4197 if (po->tp_version >= TPACKET_V3 &&
4215 (int)(req->tp_block_size - 4198 req->tp_block_size <=
4216 BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0) 4199 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv))
4217 goto out; 4200 goto out;
4218 if (unlikely(req->tp_frame_size < po->tp_hdrlen + 4201 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
4219 po->tp_reserve)) 4202 po->tp_reserve))
@@ -4224,6 +4207,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4224 rb->frames_per_block = req->tp_block_size / req->tp_frame_size; 4207 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4225 if (unlikely(rb->frames_per_block == 0)) 4208 if (unlikely(rb->frames_per_block == 0))
4226 goto out; 4209 goto out;
4210 if (unlikely(req->tp_block_size > UINT_MAX / req->tp_block_nr))
4211 goto out;
4227 if (unlikely((rb->frames_per_block * req->tp_block_nr) != 4212 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4228 req->tp_frame_nr)) 4213 req->tp_frame_nr))
4229 goto out; 4214 goto out;
@@ -4235,11 +4220,19 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4235 goto out; 4220 goto out;
4236 switch (po->tp_version) { 4221 switch (po->tp_version) {
4237 case TPACKET_V3: 4222 case TPACKET_V3:
4238 /* Transmit path is not supported. We checked 4223 /* Block transmit is not supported yet */
4239 * it above but just being paranoid 4224 if (!tx_ring) {
4240 */
4241 if (!tx_ring)
4242 init_prb_bdqc(po, rb, pg_vec, req_u); 4225 init_prb_bdqc(po, rb, pg_vec, req_u);
4226 } else {
4227 struct tpacket_req3 *req3 = &req_u->req3;
4228
4229 if (req3->tp_retire_blk_tov ||
4230 req3->tp_sizeof_priv ||
4231 req3->tp_feature_req_word) {
4232 err = -EINVAL;
4233 goto out;
4234 }
4235 }
4243 break; 4236 break;
4244 default: 4237 default:
4245 break; 4238 break;
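With TPACKET_V3 handled in __packet_set_status(), __packet_get_status(), __packet_set_timestamp() and tpacket_parse_header(), and the old "Tx-ring is not supported" bail-out removed from packet_set_ring(), a V3 socket may now request a transmit ring as well; the new validation insists that the V3-only fields stay zero on the TX side. A hedged userspace sketch (error handling trimmed, fd assumed to be an AF_PACKET socket):

#include <linux/if_packet.h>
#include <string.h>
#include <sys/socket.h>

static int setup_v3_tx_ring(int fd)
{
	int ver = TPACKET_V3;
	struct tpacket_req3 req;

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)) < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.tp_block_size = 1 << 16;    /* must be page aligned */
	req.tp_block_nr   = 16;
	req.tp_frame_size = 1 << 11;    /* must divide tp_block_size */
	req.tp_frame_nr   = (req.tp_block_size / req.tp_frame_size) *
			    req.tp_block_nr;
	/* tp_retire_blk_tov, tp_sizeof_priv and tp_feature_req_word must
	 * remain zero for a TX ring per the new check in packet_set_ring().
	 */
	return setsockopt(fd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req));
}

The ring would then be mmap()ed and filled frame by frame, much as with a TPACKET_V2 transmit ring.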
diff --git a/net/packet/diag.c b/net/packet/diag.c
index 0ed68f0238bf..7ef1c881ae74 100644
--- a/net/packet/diag.c
+++ b/net/packet/diag.c
@@ -73,8 +73,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
73{ 73{
74 struct packet_diag_ring pdr; 74 struct packet_diag_ring pdr;
75 75
76 if (!ring->pg_vec || ((ver > TPACKET_V2) && 76 if (!ring->pg_vec)
77 (nl_type == PACKET_DIAG_TX_RING)))
78 return 0; 77 return 0;
79 78
80 pdr.pdr_block_size = ring->pg_vec_pages << PAGE_SHIFT; 79 pdr.pdr_block_size = ring->pg_vec_pages << PAGE_SHIFT;
diff --git a/net/phonet/pep-gprs.c b/net/phonet/pep-gprs.c
index fa8237fdc57b..21c28b51be94 100644
--- a/net/phonet/pep-gprs.c
+++ b/net/phonet/pep-gprs.c
@@ -217,20 +217,10 @@ static netdev_tx_t gprs_xmit(struct sk_buff *skb, struct net_device *dev)
217 return NETDEV_TX_OK; 217 return NETDEV_TX_OK;
218} 218}
219 219
220static int gprs_set_mtu(struct net_device *dev, int new_mtu)
221{
222 if ((new_mtu < 576) || (new_mtu > (PHONET_MAX_MTU - 11)))
223 return -EINVAL;
224
225 dev->mtu = new_mtu;
226 return 0;
227}
228
229static const struct net_device_ops gprs_netdev_ops = { 220static const struct net_device_ops gprs_netdev_ops = {
230 .ndo_open = gprs_open, 221 .ndo_open = gprs_open,
231 .ndo_stop = gprs_close, 222 .ndo_stop = gprs_close,
232 .ndo_start_xmit = gprs_xmit, 223 .ndo_start_xmit = gprs_xmit,
233 .ndo_change_mtu = gprs_set_mtu,
234}; 224};
235 225
236static void gprs_setup(struct net_device *dev) 226static void gprs_setup(struct net_device *dev)
@@ -239,6 +229,8 @@ static void gprs_setup(struct net_device *dev)
239 dev->type = ARPHRD_PHONET_PIPE; 229 dev->type = ARPHRD_PHONET_PIPE;
240 dev->flags = IFF_POINTOPOINT | IFF_NOARP; 230 dev->flags = IFF_POINTOPOINT | IFF_NOARP;
241 dev->mtu = GPRS_DEFAULT_MTU; 231 dev->mtu = GPRS_DEFAULT_MTU;
232 dev->min_mtu = 576;
233 dev->max_mtu = (PHONET_MAX_MTU - 11);
242 dev->hard_header_len = 0; 234 dev->hard_header_len = 0;
243 dev->addr_len = 0; 235 dev->addr_len = 0;
244 dev->tx_queue_len = 10; 236 dev->tx_queue_len = 10;
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index 850a86cde0b3..e81537991ddf 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/kernel.h> 25#include <linux/kernel.h>
26#include <linux/sched/signal.h>
26#include <linux/slab.h> 27#include <linux/slab.h>
27#include <linux/socket.h> 28#include <linux/socket.h>
28#include <net/sock.h> 29#include <net/sock.h>
@@ -771,7 +772,8 @@ static void pep_sock_close(struct sock *sk, long timeout)
771 sock_put(sk); 772 sock_put(sk);
772} 773}
773 774
774static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp) 775static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp,
776 bool kern)
775{ 777{
776 struct pep_sock *pn = pep_sk(sk), *newpn; 778 struct pep_sock *pn = pep_sk(sk), *newpn;
777 struct sock *newsk = NULL; 779 struct sock *newsk = NULL;
@@ -845,7 +847,8 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp)
845 } 847 }
846 848
847 /* Create a new to-be-accepted sock */ 849 /* Create a new to-be-accepted sock */
848 newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot, 0); 850 newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot,
851 kern);
849 if (!newsk) { 852 if (!newsk) {
850 pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL); 853 pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL);
851 err = -ENOBUFS; 854 err = -ENOBUFS;
@@ -1167,7 +1170,7 @@ disabled:
1167 /* Wait until flow control allows TX */ 1170 /* Wait until flow control allows TX */
1168 done = atomic_read(&pn->tx_credits); 1171 done = atomic_read(&pn->tx_credits);
1169 while (!done) { 1172 while (!done) {
1170 DEFINE_WAIT(wait); 1173 DEFINE_WAIT_FUNC(wait, woken_wake_function);
1171 1174
1172 if (!timeo) { 1175 if (!timeo) {
1173 err = -EAGAIN; 1176 err = -EAGAIN;
@@ -1178,10 +1181,9 @@ disabled:
1178 goto out; 1181 goto out;
1179 } 1182 }
1180 1183
1181 prepare_to_wait(sk_sleep(sk), &wait, 1184 add_wait_queue(sk_sleep(sk), &wait);
1182 TASK_INTERRUPTIBLE); 1185 done = sk_wait_event(sk, &timeo, atomic_read(&pn->tx_credits), &wait);
1183 done = sk_wait_event(sk, &timeo, atomic_read(&pn->tx_credits)); 1186 remove_wait_queue(sk_sleep(sk), &wait);
1184 finish_wait(sk_sleep(sk), &wait);
1185 1187
1186 if (sk->sk_state != TCP_ESTABLISHED) 1188 if (sk->sk_state != TCP_ESTABLISHED)
1187 goto disabled; 1189 goto disabled;
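The pep.c hunk above converts the TX-credit wait from prepare_to_wait()/finish_wait() to the explicit wait-entry form of sk_wait_event(), using a woken_wake_function-based entry. A minimal sketch of that pattern, assuming the caller holds the socket lock and owns the condition being waited on:

#include <net/sock.h>

/* Sleep until *flag becomes non-zero or *timeo runs out; returns the
 * sk_wait_event() result.  Illustrative only.
 */
static int wait_for_flag(struct sock *sk, atomic_t *flag, long *timeo)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int done;

	add_wait_queue(sk_sleep(sk), &wait);
	done = sk_wait_event(sk, timeo, atomic_read(flag), &wait);
	remove_wait_queue(sk_sleep(sk), &wait);

	return done;
}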
diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c
index a58680016472..2cb4c5dfad6f 100644
--- a/net/phonet/pn_dev.c
+++ b/net/phonet/pn_dev.c
@@ -44,7 +44,7 @@ struct phonet_net {
44 struct phonet_routes routes; 44 struct phonet_routes routes;
45}; 45};
46 46
47static int phonet_net_id __read_mostly; 47static unsigned int phonet_net_id __read_mostly;
48 48
49static struct phonet_net *phonet_pernet(struct net *net) 49static struct phonet_net *phonet_pernet(struct net *net)
50{ 50{
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index ffd5f2297584..64634e3ec2fc 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -27,6 +27,8 @@
27#include <linux/kernel.h> 27#include <linux/kernel.h>
28#include <linux/net.h> 28#include <linux/net.h>
29#include <linux/poll.h> 29#include <linux/poll.h>
30#include <linux/sched/signal.h>
31
30#include <net/sock.h> 32#include <net/sock.h>
31#include <net/tcp_states.h> 33#include <net/tcp_states.h>
32 34
@@ -303,7 +305,7 @@ out:
303} 305}
304 306
305static int pn_socket_accept(struct socket *sock, struct socket *newsock, 307static int pn_socket_accept(struct socket *sock, struct socket *newsock,
306 int flags) 308 int flags, bool kern)
307{ 309{
308 struct sock *sk = sock->sk; 310 struct sock *sk = sock->sk;
309 struct sock *newsk; 311 struct sock *newsk;
@@ -312,7 +314,7 @@ static int pn_socket_accept(struct socket *sock, struct socket *newsock,
312 if (unlikely(sk->sk_state != TCP_LISTEN)) 314 if (unlikely(sk->sk_state != TCP_LISTEN))
313 return -EINVAL; 315 return -EINVAL;
314 316
315 newsk = sk->sk_prot->accept(sk, flags, &err); 317 newsk = sk->sk_prot->accept(sk, flags, &err, kern);
316 if (!newsk) 318 if (!newsk)
317 return err; 319 return err;
318 320
diff --git a/net/psample/Kconfig b/net/psample/Kconfig
new file mode 100644
index 000000000000..d850246a6059
--- /dev/null
+++ b/net/psample/Kconfig
@@ -0,0 +1,15 @@
1#
2# psample packet sampling configuration
3#
4
5menuconfig PSAMPLE
6 depends on NET
7 tristate "Packet-sampling netlink channel"
8 default n
9 help
10	  Say Y here to add support for packet-sampling netlink channel.
11 This netlink channel allows transferring packets alongside some
12 metadata to userspace.
13
14 To compile this support as a module, choose M here: the module will
15 be called psample.
diff --git a/net/psample/Makefile b/net/psample/Makefile
new file mode 100644
index 000000000000..609b0a79c9f3
--- /dev/null
+++ b/net/psample/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the psample netlink channel
3#
4
5obj-$(CONFIG_PSAMPLE) += psample.o
diff --git a/net/psample/psample.c b/net/psample/psample.c
new file mode 100644
index 000000000000..8aa58a918783
--- /dev/null
+++ b/net/psample/psample.c
@@ -0,0 +1,301 @@
1/*
2 * net/psample/psample.c - Netlink channel for packet sampling
3 * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/types.h>
11#include <linux/kernel.h>
12#include <linux/skbuff.h>
13#include <linux/module.h>
14#include <net/net_namespace.h>
15#include <net/sock.h>
16#include <net/netlink.h>
17#include <net/genetlink.h>
18#include <net/psample.h>
19#include <linux/spinlock.h>
20
21#define PSAMPLE_MAX_PACKET_SIZE 0xffff
22
23static LIST_HEAD(psample_groups_list);
24static DEFINE_SPINLOCK(psample_groups_lock);
25
26/* multicast groups */
27enum psample_nl_multicast_groups {
28 PSAMPLE_NL_MCGRP_CONFIG,
29 PSAMPLE_NL_MCGRP_SAMPLE,
30};
31
32static const struct genl_multicast_group psample_nl_mcgrps[] = {
33 [PSAMPLE_NL_MCGRP_CONFIG] = { .name = PSAMPLE_NL_MCGRP_CONFIG_NAME },
34 [PSAMPLE_NL_MCGRP_SAMPLE] = { .name = PSAMPLE_NL_MCGRP_SAMPLE_NAME },
35};
36
37static struct genl_family psample_nl_family __ro_after_init;
38
39static int psample_group_nl_fill(struct sk_buff *msg,
40 struct psample_group *group,
41 enum psample_command cmd, u32 portid, u32 seq,
42 int flags)
43{
44 void *hdr;
45 int ret;
46
47 hdr = genlmsg_put(msg, portid, seq, &psample_nl_family, flags, cmd);
48 if (!hdr)
49 return -EMSGSIZE;
50
51 ret = nla_put_u32(msg, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num);
52 if (ret < 0)
53 goto error;
54
55 ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_REFCOUNT, group->refcount);
56 if (ret < 0)
57 goto error;
58
59 ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_SEQ, group->seq);
60 if (ret < 0)
61 goto error;
62
63 genlmsg_end(msg, hdr);
64 return 0;
65
66error:
67 genlmsg_cancel(msg, hdr);
68 return -EMSGSIZE;
69}
70
71static int psample_nl_cmd_get_group_dumpit(struct sk_buff *msg,
72 struct netlink_callback *cb)
73{
74 struct psample_group *group;
75 int start = cb->args[0];
76 int idx = 0;
77 int err;
78
79 spin_lock(&psample_groups_lock);
80 list_for_each_entry(group, &psample_groups_list, list) {
81 if (!net_eq(group->net, sock_net(msg->sk)))
82 continue;
83 if (idx < start) {
84 idx++;
85 continue;
86 }
87 err = psample_group_nl_fill(msg, group, PSAMPLE_CMD_NEW_GROUP,
88 NETLINK_CB(cb->skb).portid,
89 cb->nlh->nlmsg_seq, NLM_F_MULTI);
90 if (err)
91 break;
92 idx++;
93 }
94
95 spin_unlock(&psample_groups_lock);
96 cb->args[0] = idx;
97 return msg->len;
98}
99
100static const struct genl_ops psample_nl_ops[] = {
101 {
102 .cmd = PSAMPLE_CMD_GET_GROUP,
103 .dumpit = psample_nl_cmd_get_group_dumpit,
104 /* can be retrieved by unprivileged users */
105 }
106};
107
108static struct genl_family psample_nl_family __ro_after_init = {
109 .name = PSAMPLE_GENL_NAME,
110 .version = PSAMPLE_GENL_VERSION,
111 .maxattr = PSAMPLE_ATTR_MAX,
112 .netnsok = true,
113 .module = THIS_MODULE,
114 .mcgrps = psample_nl_mcgrps,
115 .ops = psample_nl_ops,
116 .n_ops = ARRAY_SIZE(psample_nl_ops),
117 .n_mcgrps = ARRAY_SIZE(psample_nl_mcgrps),
118};
119
120static void psample_group_notify(struct psample_group *group,
121 enum psample_command cmd)
122{
123 struct sk_buff *msg;
124 int err;
125
126 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
127 if (!msg)
128 return;
129
130 err = psample_group_nl_fill(msg, group, cmd, 0, 0, NLM_F_MULTI);
131 if (!err)
132 genlmsg_multicast_netns(&psample_nl_family, group->net, msg, 0,
133 PSAMPLE_NL_MCGRP_CONFIG, GFP_ATOMIC);
134 else
135 nlmsg_free(msg);
136}
137
138static struct psample_group *psample_group_create(struct net *net,
139 u32 group_num)
140{
141 struct psample_group *group;
142
143 group = kzalloc(sizeof(*group), GFP_ATOMIC);
144 if (!group)
145 return NULL;
146
147 group->net = net;
148 group->group_num = group_num;
149 list_add_tail(&group->list, &psample_groups_list);
150
151 psample_group_notify(group, PSAMPLE_CMD_NEW_GROUP);
152 return group;
153}
154
155static void psample_group_destroy(struct psample_group *group)
156{
157 psample_group_notify(group, PSAMPLE_CMD_DEL_GROUP);
158 list_del(&group->list);
159 kfree(group);
160}
161
162static struct psample_group *
163psample_group_lookup(struct net *net, u32 group_num)
164{
165 struct psample_group *group;
166
167 list_for_each_entry(group, &psample_groups_list, list)
168 if ((group->group_num == group_num) && (group->net == net))
169 return group;
170 return NULL;
171}
172
173struct psample_group *psample_group_get(struct net *net, u32 group_num)
174{
175 struct psample_group *group;
176
177 spin_lock(&psample_groups_lock);
178
179 group = psample_group_lookup(net, group_num);
180 if (!group) {
181 group = psample_group_create(net, group_num);
182 if (!group)
183 goto out;
184 }
185 group->refcount++;
186
187out:
188 spin_unlock(&psample_groups_lock);
189 return group;
190}
191EXPORT_SYMBOL_GPL(psample_group_get);
192
193void psample_group_put(struct psample_group *group)
194{
195 spin_lock(&psample_groups_lock);
196
197 if (--group->refcount == 0)
198 psample_group_destroy(group);
199
200 spin_unlock(&psample_groups_lock);
201}
202EXPORT_SYMBOL_GPL(psample_group_put);
203
204void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
205 u32 trunc_size, int in_ifindex, int out_ifindex,
206 u32 sample_rate)
207{
208 struct sk_buff *nl_skb;
209 int data_len;
210 int meta_len;
211 void *data;
212 int ret;
213
214 meta_len = (in_ifindex ? nla_total_size(sizeof(u16)) : 0) +
215 (out_ifindex ? nla_total_size(sizeof(u16)) : 0) +
216 nla_total_size(sizeof(u32)) + /* sample_rate */
217 nla_total_size(sizeof(u32)) + /* orig_size */
218 nla_total_size(sizeof(u32)) + /* group_num */
219 nla_total_size(sizeof(u32)); /* seq */
220
221 data_len = min(skb->len, trunc_size);
222 if (meta_len + nla_total_size(data_len) > PSAMPLE_MAX_PACKET_SIZE)
223 data_len = PSAMPLE_MAX_PACKET_SIZE - meta_len - NLA_HDRLEN
224 - NLA_ALIGNTO;
225
226 nl_skb = genlmsg_new(meta_len + data_len, GFP_ATOMIC);
227 if (unlikely(!nl_skb))
228 return;
229
230 data = genlmsg_put(nl_skb, 0, 0, &psample_nl_family, 0,
231 PSAMPLE_CMD_SAMPLE);
232 if (unlikely(!data))
233 goto error;
234
235 if (in_ifindex) {
236 ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_IIFINDEX, in_ifindex);
237 if (unlikely(ret < 0))
238 goto error;
239 }
240
241 if (out_ifindex) {
242 ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_OIFINDEX, out_ifindex);
243 if (unlikely(ret < 0))
244 goto error;
245 }
246
247 ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_RATE, sample_rate);
248 if (unlikely(ret < 0))
249 goto error;
250
251 ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_ORIGSIZE, skb->len);
252 if (unlikely(ret < 0))
253 goto error;
254
255 ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num);
256 if (unlikely(ret < 0))
257 goto error;
258
259 ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_GROUP_SEQ, group->seq++);
260 if (unlikely(ret < 0))
261 goto error;
262
263 if (data_len) {
264 int nla_len = nla_total_size(data_len);
265 struct nlattr *nla;
266
267 nla = (struct nlattr *)skb_put(nl_skb, nla_len);
268 nla->nla_type = PSAMPLE_ATTR_DATA;
269 nla->nla_len = nla_attr_size(data_len);
270
271 if (skb_copy_bits(skb, 0, nla_data(nla), data_len))
272 goto error;
273 }
274
275 genlmsg_end(nl_skb, data);
276 genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0,
277 PSAMPLE_NL_MCGRP_SAMPLE, GFP_ATOMIC);
278
279 return;
280error:
281 pr_err_ratelimited("Could not create psample log message\n");
282 nlmsg_free(nl_skb);
283}
284EXPORT_SYMBOL_GPL(psample_sample_packet);
285
286static int __init psample_module_init(void)
287{
288 return genl_register_family(&psample_nl_family);
289}
290
291static void __exit psample_module_exit(void)
292{
293 genl_unregister_family(&psample_nl_family);
294}
295
296module_init(psample_module_init);
297module_exit(psample_module_exit);
298
299MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>");
300MODULE_DESCRIPTION("netlink channel for packet sampling");
301MODULE_LICENSE("GPL v2");
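For context, a hedged sketch of how a kernel-side packet sampler might drive this new API; struct my_sampler and its fields are hypothetical, and only the three exported psample_* calls come from the file above.

#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/psample.h>

struct my_sampler {
	struct psample_group *group;
	u32 rate;
	u32 trunc_size;
};

static int my_sampler_init(struct my_sampler *s, struct net *net, u32 group_num)
{
	/* Looks up or creates the group and takes a reference on it. */
	s->group = psample_group_get(net, group_num);
	return s->group ? 0 : -ENOMEM;
}

static void my_sampler_packet(struct my_sampler *s, struct sk_buff *skb,
			      int in_ifindex, int out_ifindex)
{
	/* A truncated copy of the packet plus metadata is multicast to the
	 * PSAMPLE_NL_MCGRP_SAMPLE group for userspace listeners.
	 */
	psample_sample_packet(s->group, skb, s->trunc_size,
			      in_ifindex, out_ifindex, s->rate);
}

static void my_sampler_fini(struct my_sampler *s)
{
	psample_group_put(s->group);
}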
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index c985ecbe9bd6..9da7368b0140 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -252,7 +252,7 @@ static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node,
252 const int pkt_len = 20; 252 const int pkt_len = 20;
253 struct qrtr_hdr *hdr; 253 struct qrtr_hdr *hdr;
254 struct sk_buff *skb; 254 struct sk_buff *skb;
255 u32 *buf; 255 __le32 *buf;
256 256
257 skb = alloc_skb(QRTR_HDR_SIZE + pkt_len, GFP_KERNEL); 257 skb = alloc_skb(QRTR_HDR_SIZE + pkt_len, GFP_KERNEL);
258 if (!skb) 258 if (!skb)
@@ -269,7 +269,7 @@ static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node,
269 hdr->dst_node_id = cpu_to_le32(dst_node); 269 hdr->dst_node_id = cpu_to_le32(dst_node);
270 hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL); 270 hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL);
271 271
272 buf = (u32 *)skb_put(skb, pkt_len); 272 buf = (__le32 *)skb_put(skb, pkt_len);
273 memset(buf, 0, pkt_len); 273 memset(buf, 0, pkt_len);
274 buf[0] = cpu_to_le32(QRTR_TYPE_RESUME_TX); 274 buf[0] = cpu_to_le32(QRTR_TYPE_RESUME_TX);
275 buf[1] = cpu_to_le32(src_node); 275 buf[1] = cpu_to_le32(src_node);
@@ -658,7 +658,9 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
658 } 658 }
659 659
660 if (plen != len) { 660 if (plen != len) {
661 skb_pad(skb, plen - len); 661 rc = skb_pad(skb, plen - len);
662 if (rc)
663 goto out_node;
662 skb_put(skb, plen - len); 664 skb_put(skb, plen - len);
663 } 665 }
664 666
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 6beaeb1138f3..b405f77d664c 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -298,6 +298,33 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval,
298 return 0; 298 return 0;
299} 299}
300 300
301static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval,
302 int optlen)
303{
304 struct rds_rx_trace_so trace;
305 int i;
306
307 if (optlen != sizeof(struct rds_rx_trace_so))
308 return -EFAULT;
309
310 if (copy_from_user(&trace, optval, sizeof(trace)))
311 return -EFAULT;
312
313 if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX)
314 return -EFAULT;
315
316 rs->rs_rx_traces = trace.rx_traces;
317 for (i = 0; i < rs->rs_rx_traces; i++) {
318 if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) {
319 rs->rs_rx_traces = 0;
320 return -EFAULT;
321 }
322 rs->rs_rx_trace[i] = trace.rx_trace_pos[i];
323 }
324
325 return 0;
326}
327
301static int rds_setsockopt(struct socket *sock, int level, int optname, 328static int rds_setsockopt(struct socket *sock, int level, int optname,
302 char __user *optval, unsigned int optlen) 329 char __user *optval, unsigned int optlen)
303{ 330{
@@ -338,6 +365,9 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
338 ret = rds_enable_recvtstamp(sock->sk, optval, optlen); 365 ret = rds_enable_recvtstamp(sock->sk, optval, optlen);
339 release_sock(sock->sk); 366 release_sock(sock->sk);
340 break; 367 break;
368 case SO_RDS_MSG_RXPATH_LATENCY:
369 ret = rds_recv_track_latency(rs, optval, optlen);
370 break;
341 default: 371 default:
342 ret = -ENOPROTOOPT; 372 ret = -ENOPROTOOPT;
343 } 373 }
@@ -484,6 +514,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
484 INIT_LIST_HEAD(&rs->rs_cong_list); 514 INIT_LIST_HEAD(&rs->rs_cong_list);
485 spin_lock_init(&rs->rs_rdma_lock); 515 spin_lock_init(&rs->rs_rdma_lock);
486 rs->rs_rdma_keys = RB_ROOT; 516 rs->rs_rdma_keys = RB_ROOT;
517 rs->rs_rx_traces = 0;
487 518
488 spin_lock_bh(&rds_sock_lock); 519 spin_lock_bh(&rds_sock_lock);
489 list_add_tail(&rs->rs_item, &rds_sock_list); 520 list_add_tail(&rs->rs_item, &rds_sock_list);
@@ -605,10 +636,14 @@ static void rds_exit(void)
605} 636}
606module_exit(rds_exit); 637module_exit(rds_exit);
607 638
639u32 rds_gen_num;
640
608static int rds_init(void) 641static int rds_init(void)
609{ 642{
610 int ret; 643 int ret;
611 644
645 net_get_random_once(&rds_gen_num, sizeof(rds_gen_num));
646
612 ret = rds_bind_lock_init(); 647 ret = rds_bind_lock_init();
613 if (ret) 648 if (ret)
614 goto out; 649 goto out;
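A hedged userspace sketch of exercising the new SO_RDS_MSG_RXPATH_LATENCY option validated above; the numeric trace-point indices are placeholders for the values defined in the uapi header, and SOL_RDS is defined locally in case the toolchain's headers lack it.

#include <sys/socket.h>
#include <linux/rds.h>

#ifndef SOL_RDS
#define SOL_RDS 276
#endif

static int enable_rx_latency_tracing(int rds_fd)
{
	struct rds_rx_trace_so trace = {0};

	trace.rx_traces = 2;        /* number of trace points requested */
	trace.rx_trace_pos[0] = 0;  /* indices into the rx-path trace points, */
	trace.rx_trace_pos[1] = 1;  /* each below RDS_MSG_RX_DGRAM_TRACE_MAX  */

	return setsockopt(rds_fd, SOL_RDS, SO_RDS_MSG_RXPATH_LATENCY,
			  &trace, sizeof(trace));
}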
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 095f6ce583fe..3a915bedb76c 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -176,8 +176,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
176 if (!trans) { 176 if (!trans) {
177 ret = -EADDRNOTAVAIL; 177 ret = -EADDRNOTAVAIL;
178 rds_remove_bound(rs); 178 rds_remove_bound(rs);
179 printk_ratelimited(KERN_INFO "RDS: rds_bind() could not find a transport, " 179 pr_info_ratelimited("RDS: %s could not find a transport for %pI4, load rds_tcp or rds_rdma?\n",
180 "load rds_tcp or rds_rdma?\n"); 180 __func__, &sin->sin_addr.s_addr);
181 goto out; 181 goto out;
182 } 182 }
183 183
diff --git a/net/rds/connection.c b/net/rds/connection.c
index f5058559bb08..1fa75ab7b733 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -269,6 +269,8 @@ static struct rds_connection *__rds_conn_create(struct net *net,
269 kmem_cache_free(rds_conn_slab, conn); 269 kmem_cache_free(rds_conn_slab, conn);
270 conn = found; 270 conn = found;
271 } else { 271 } else {
272 conn->c_my_gen_num = rds_gen_num;
273 conn->c_peer_gen_num = 0;
272 hlist_add_head_rcu(&conn->c_hash_node, head); 274 hlist_add_head_rcu(&conn->c_hash_node, head);
273 rds_cong_add_conn(conn); 275 rds_cong_add_conn(conn);
274 rds_conn_count++; 276 rds_conn_count++;
@@ -427,6 +429,7 @@ void rds_conn_destroy(struct rds_connection *conn)
427 */ 429 */
428 rds_cong_remove_conn(conn); 430 rds_cong_remove_conn(conn);
429 431
432 put_net(conn->c_net);
430 kmem_cache_free(rds_conn_slab, conn); 433 kmem_cache_free(rds_conn_slab, conn);
431 434
432 spin_lock_irqsave(&rds_conn_lock, flags); 435 spin_lock_irqsave(&rds_conn_lock, flags);
@@ -543,11 +546,11 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
543} 546}
544EXPORT_SYMBOL_GPL(rds_for_each_conn_info); 547EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
545 548
546void rds_walk_conn_path_info(struct socket *sock, unsigned int len, 549static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
547 struct rds_info_iterator *iter, 550 struct rds_info_iterator *iter,
548 struct rds_info_lengths *lens, 551 struct rds_info_lengths *lens,
549 int (*visitor)(struct rds_conn_path *, void *), 552 int (*visitor)(struct rds_conn_path *, void *),
550 size_t item_len) 553 size_t item_len)
551{ 554{
552 u64 buffer[(item_len + 7) / 8]; 555 u64 buffer[(item_len + 7) / 8];
553 struct hlist_head *head; 556 struct hlist_head *head;
@@ -681,6 +684,7 @@ void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
681 !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags)) 684 !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags))
682 queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); 685 queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
683} 686}
687EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down);
684 688
685void rds_conn_connect_if_down(struct rds_connection *conn) 689void rds_conn_connect_if_down(struct rds_connection *conn)
686{ 690{
@@ -689,21 +693,6 @@ void rds_conn_connect_if_down(struct rds_connection *conn)
689} 693}
690EXPORT_SYMBOL_GPL(rds_conn_connect_if_down); 694EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
691 695
692/*
693 * An error occurred on the connection
694 */
695void
696__rds_conn_error(struct rds_connection *conn, const char *fmt, ...)
697{
698 va_list ap;
699
700 va_start(ap, fmt);
701 vprintk(fmt, ap);
702 va_end(ap);
703
704 rds_conn_drop(conn);
705}
706
707void 696void
708__rds_conn_path_error(struct rds_conn_path *cp, const char *fmt, ...) 697__rds_conn_path_error(struct rds_conn_path *cp, const char *fmt, ...)
709{ 698{
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 5680d90b0b77..7a64c8db81ab 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -45,8 +45,8 @@
45#include "ib.h" 45#include "ib.h"
46#include "ib_mr.h" 46#include "ib_mr.h"
47 47
48unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE; 48static unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE;
49unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE; 49static unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE;
50unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; 50unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
51 51
52module_param(rds_ib_mr_1m_pool_size, int, 0444); 52module_param(rds_ib_mr_1m_pool_size, int, 0444);
@@ -111,6 +111,8 @@ static void rds_ib_dev_free(struct work_struct *work)
111 kfree(i_ipaddr); 111 kfree(i_ipaddr);
112 } 112 }
113 113
114 kfree(rds_ibdev->vector_load);
115
114 kfree(rds_ibdev); 116 kfree(rds_ibdev);
115} 117}
116 118
@@ -159,6 +161,14 @@ static void rds_ib_add_one(struct ib_device *device)
159 rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom; 161 rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom;
160 rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom; 162 rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom;
161 163
164 rds_ibdev->vector_load = kzalloc(sizeof(int) * device->num_comp_vectors,
165 GFP_KERNEL);
166 if (!rds_ibdev->vector_load) {
167 pr_err("RDS/IB: %s failed to allocate vector memory\n",
168 __func__);
169 goto put_dev;
170 }
171
162 rds_ibdev->dev = device; 172 rds_ibdev->dev = device;
163 rds_ibdev->pd = ib_alloc_pd(device, 0); 173 rds_ibdev->pd = ib_alloc_pd(device, 0);
164 if (IS_ERR(rds_ibdev->pd)) { 174 if (IS_ERR(rds_ibdev->pd)) {
@@ -428,16 +438,12 @@ int rds_ib_init(void)
428 if (ret) 438 if (ret)
429 goto out_sysctl; 439 goto out_sysctl;
430 440
431 ret = rds_trans_register(&rds_ib_transport); 441 rds_trans_register(&rds_ib_transport);
432 if (ret)
433 goto out_recv;
434 442
435 rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); 443 rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
436 444
437 goto out; 445 goto out;
438 446
439out_recv:
440 rds_ib_recv_exit();
441out_sysctl: 447out_sysctl:
442 rds_ib_sysctl_exit(); 448 rds_ib_sysctl_exit();
443out_ibreg: 449out_ibreg:
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 45ac8e8e58f4..ec550626e221 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -14,9 +14,10 @@
14 14
15#define RDS_IB_DEFAULT_RECV_WR 1024 15#define RDS_IB_DEFAULT_RECV_WR 1024
16#define RDS_IB_DEFAULT_SEND_WR 256 16#define RDS_IB_DEFAULT_SEND_WR 256
17#define RDS_IB_DEFAULT_FR_WR 512 17#define RDS_IB_DEFAULT_FR_WR 256
18#define RDS_IB_DEFAULT_FR_INV_WR 256
18 19
19#define RDS_IB_DEFAULT_RETRY_COUNT 2 20#define RDS_IB_DEFAULT_RETRY_COUNT 1
20 21
21#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ 22#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
22 23
@@ -125,6 +126,7 @@ struct rds_ib_connection {
125 126
126 /* To control the number of wrs from fastreg */ 127 /* To control the number of wrs from fastreg */
127 atomic_t i_fastreg_wrs; 128 atomic_t i_fastreg_wrs;
129 atomic_t i_fastunreg_wrs;
128 130
129 /* interrupt handling */ 131 /* interrupt handling */
130 struct tasklet_struct i_send_tasklet; 132 struct tasklet_struct i_send_tasklet;
@@ -134,7 +136,7 @@ struct rds_ib_connection {
134 struct rds_ib_work_ring i_send_ring; 136 struct rds_ib_work_ring i_send_ring;
135 struct rm_data_op *i_data_op; 137 struct rm_data_op *i_data_op;
136 struct rds_header *i_send_hdrs; 138 struct rds_header *i_send_hdrs;
137 u64 i_send_hdrs_dma; 139 dma_addr_t i_send_hdrs_dma;
138 struct rds_ib_send_work *i_sends; 140 struct rds_ib_send_work *i_sends;
139 atomic_t i_signaled_sends; 141 atomic_t i_signaled_sends;
140 142
@@ -144,11 +146,12 @@ struct rds_ib_connection {
144 struct rds_ib_incoming *i_ibinc; 146 struct rds_ib_incoming *i_ibinc;
145 u32 i_recv_data_rem; 147 u32 i_recv_data_rem;
146 struct rds_header *i_recv_hdrs; 148 struct rds_header *i_recv_hdrs;
147 u64 i_recv_hdrs_dma; 149 dma_addr_t i_recv_hdrs_dma;
148 struct rds_ib_recv_work *i_recvs; 150 struct rds_ib_recv_work *i_recvs;
149 u64 i_ack_recv; /* last ACK received */ 151 u64 i_ack_recv; /* last ACK received */
150 struct rds_ib_refill_cache i_cache_incs; 152 struct rds_ib_refill_cache i_cache_incs;
151 struct rds_ib_refill_cache i_cache_frags; 153 struct rds_ib_refill_cache i_cache_frags;
154 atomic_t i_cache_allocs;
152 155
153 /* sending acks */ 156 /* sending acks */
154 unsigned long i_ack_flags; 157 unsigned long i_ack_flags;
@@ -161,7 +164,7 @@ struct rds_ib_connection {
161 struct rds_header *i_ack; 164 struct rds_header *i_ack;
162 struct ib_send_wr i_ack_wr; 165 struct ib_send_wr i_ack_wr;
163 struct ib_sge i_ack_sge; 166 struct ib_sge i_ack_sge;
164 u64 i_ack_dma; 167 dma_addr_t i_ack_dma;
165 unsigned long i_ack_queued; 168 unsigned long i_ack_queued;
166 169
167 /* Flow control related information 170 /* Flow control related information
@@ -179,6 +182,14 @@ struct rds_ib_connection {
179 182
180 /* Batched completions */ 183 /* Batched completions */
181 unsigned int i_unsignaled_wrs; 184 unsigned int i_unsignaled_wrs;
185
186 /* Endpoint role in connection */
187 bool i_active_side;
188 atomic_t i_cq_quiesce;
189
190 /* Send/Recv vectors */
191 int i_scq_vector;
192 int i_rcq_vector;
182}; 193};
183 194
184/* This assumes that atomic_t is at least 32 bits */ 195/* This assumes that atomic_t is at least 32 bits */
@@ -221,9 +232,10 @@ struct rds_ib_device {
221 spinlock_t spinlock; /* protect the above */ 232 spinlock_t spinlock; /* protect the above */
222 atomic_t refcount; 233 atomic_t refcount;
223 struct work_struct free_work; 234 struct work_struct free_work;
235 int *vector_load;
224}; 236};
225 237
226#define ibdev_to_node(ibdev) dev_to_node(ibdev->dma_device) 238#define ibdev_to_node(ibdev) dev_to_node((ibdev)->dev.parent)
227#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev) 239#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
228 240
229/* bits for i_ack_flags */ 241/* bits for i_ack_flags */
@@ -249,6 +261,8 @@ struct rds_ib_statistics {
249 uint64_t s_ib_rx_refill_from_cq; 261 uint64_t s_ib_rx_refill_from_cq;
250 uint64_t s_ib_rx_refill_from_thread; 262 uint64_t s_ib_rx_refill_from_thread;
251 uint64_t s_ib_rx_alloc_limit; 263 uint64_t s_ib_rx_alloc_limit;
264 uint64_t s_ib_rx_total_frags;
265 uint64_t s_ib_rx_total_incs;
252 uint64_t s_ib_rx_credit_updates; 266 uint64_t s_ib_rx_credit_updates;
253 uint64_t s_ib_ack_sent; 267 uint64_t s_ib_ack_sent;
254 uint64_t s_ib_ack_send_failure; 268 uint64_t s_ib_ack_send_failure;
@@ -271,6 +285,8 @@ struct rds_ib_statistics {
271 uint64_t s_ib_rdma_mr_1m_reused; 285 uint64_t s_ib_rdma_mr_1m_reused;
272 uint64_t s_ib_atomic_cswp; 286 uint64_t s_ib_atomic_cswp;
273 uint64_t s_ib_atomic_fadd; 287 uint64_t s_ib_atomic_fadd;
288 uint64_t s_ib_recv_added_to_cache;
289 uint64_t s_ib_recv_removed_from_cache;
274}; 290};
275 291
276extern struct workqueue_struct *rds_ib_wq; 292extern struct workqueue_struct *rds_ib_wq;
@@ -401,6 +417,8 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
401/* ib_stats.c */ 417/* ib_stats.c */
402DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); 418DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
403#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member) 419#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member)
420#define rds_ib_stats_add(member, count) \
421 rds_stats_add_which(rds_ib_stats, member, count)
404unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, 422unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
405 unsigned int avail); 423 unsigned int avail);
406 424
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 5b2ab95afa07..1c38d2c7caa8 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -113,24 +113,26 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
113 } 113 }
114 114
115 if (conn->c_version < RDS_PROTOCOL(3, 1)) { 115 if (conn->c_version < RDS_PROTOCOL(3, 1)) {
116 printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed," 116 pr_notice("RDS/IB: Connection <%pI4,%pI4> version %u.%u no longer supported\n",
117 " no longer supported\n", 117 &conn->c_laddr, &conn->c_faddr,
118 &conn->c_faddr, 118 RDS_PROTOCOL_MAJOR(conn->c_version),
119 RDS_PROTOCOL_MAJOR(conn->c_version), 119 RDS_PROTOCOL_MINOR(conn->c_version));
120 RDS_PROTOCOL_MINOR(conn->c_version));
121 rds_conn_destroy(conn); 120 rds_conn_destroy(conn);
122 return; 121 return;
123 } else { 122 } else {
124 printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", 123 pr_notice("RDS/IB: %s conn connected <%pI4,%pI4> version %u.%u%s\n",
125 &conn->c_faddr, 124 ic->i_active_side ? "Active" : "Passive",
126 RDS_PROTOCOL_MAJOR(conn->c_version), 125 &conn->c_laddr, &conn->c_faddr,
127 RDS_PROTOCOL_MINOR(conn->c_version), 126 RDS_PROTOCOL_MAJOR(conn->c_version),
128 ic->i_flowctl ? ", flow control" : ""); 127 RDS_PROTOCOL_MINOR(conn->c_version),
128 ic->i_flowctl ? ", flow control" : "");
129 } 129 }
130 130
131 /* 131 atomic_set(&ic->i_cq_quiesce, 0);
132 * Init rings and fill recv. this needs to wait until protocol negotiation 132
133 * is complete, since ring layout is different from 3.0 to 3.1. 133 /* Init rings and fill recv. this needs to wait until protocol
134 * negotiation is complete, since ring layout is different
135 * from 3.1 to 4.1.
134 */ 136 */
135 rds_ib_send_init_ring(ic); 137 rds_ib_send_init_ring(ic);
136 rds_ib_recv_init_ring(ic); 138 rds_ib_recv_init_ring(ic);
@@ -267,6 +269,10 @@ static void rds_ib_tasklet_fn_send(unsigned long data)
267 269
268 rds_ib_stats_inc(s_ib_tasklet_call); 270 rds_ib_stats_inc(s_ib_tasklet_call);
269 271
272 /* if cq has been already reaped, ignore incoming cq event */
273 if (atomic_read(&ic->i_cq_quiesce))
274 return;
275
270 poll_scq(ic, ic->i_send_cq, ic->i_send_wc); 276 poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
271 ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); 277 ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
272 poll_scq(ic, ic->i_send_cq, ic->i_send_wc); 278 poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
@@ -308,6 +314,10 @@ static void rds_ib_tasklet_fn_recv(unsigned long data)
308 314
309 rds_ib_stats_inc(s_ib_tasklet_call); 315 rds_ib_stats_inc(s_ib_tasklet_call);
310 316
317 /* if cq has been already reaped, ignore incoming cq event */
318 if (atomic_read(&ic->i_cq_quiesce))
319 return;
320
311 memset(&state, 0, sizeof(state)); 321 memset(&state, 0, sizeof(state));
312 poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); 322 poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
313 ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); 323 ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
@@ -358,6 +368,28 @@ static void rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context)
358 tasklet_schedule(&ic->i_send_tasklet); 368 tasklet_schedule(&ic->i_send_tasklet);
359} 369}
360 370
371static inline int ibdev_get_unused_vector(struct rds_ib_device *rds_ibdev)
372{
373 int min = rds_ibdev->vector_load[rds_ibdev->dev->num_comp_vectors - 1];
374 int index = rds_ibdev->dev->num_comp_vectors - 1;
375 int i;
376
377 for (i = rds_ibdev->dev->num_comp_vectors - 1; i >= 0; i--) {
378 if (rds_ibdev->vector_load[i] < min) {
379 index = i;
380 min = rds_ibdev->vector_load[i];
381 }
382 }
383
384 rds_ibdev->vector_load[index]++;
385 return index;
386}
387
388static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index)
389{
390 rds_ibdev->vector_load[index]--;
391}
392
361/* 393/*
362 * This needs to be very careful to not leave IS_ERR pointers around for 394 * This needs to be very careful to not leave IS_ERR pointers around for
363 * cleanup to trip over. 395 * cleanup to trip over.
@@ -383,7 +415,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
383 * completion queue and send queue. This extra space is used for FRMR 415 * completion queue and send queue. This extra space is used for FRMR
384 * registration and invalidation work requests 416 * registration and invalidation work requests
385 */ 417 */
386 fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0); 418 fr_queue_space = rds_ibdev->use_fastreg ?
419 (RDS_IB_DEFAULT_FR_WR + 1) +
420 (RDS_IB_DEFAULT_FR_INV_WR + 1)
421 : 0;
387 422
388 /* add the conn now so that connection establishment has the dev */ 423 /* add the conn now so that connection establishment has the dev */
389 rds_ib_add_conn(rds_ibdev, conn); 424 rds_ib_add_conn(rds_ibdev, conn);
@@ -396,39 +431,44 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
396 /* Protection domain and memory range */ 431 /* Protection domain and memory range */
397 ic->i_pd = rds_ibdev->pd; 432 ic->i_pd = rds_ibdev->pd;
398 433
434 ic->i_scq_vector = ibdev_get_unused_vector(rds_ibdev);
399 cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1; 435 cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1;
400 436 cq_attr.comp_vector = ic->i_scq_vector;
401 ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send, 437 ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send,
402 rds_ib_cq_event_handler, conn, 438 rds_ib_cq_event_handler, conn,
403 &cq_attr); 439 &cq_attr);
404 if (IS_ERR(ic->i_send_cq)) { 440 if (IS_ERR(ic->i_send_cq)) {
405 ret = PTR_ERR(ic->i_send_cq); 441 ret = PTR_ERR(ic->i_send_cq);
406 ic->i_send_cq = NULL; 442 ic->i_send_cq = NULL;
443 ibdev_put_vector(rds_ibdev, ic->i_scq_vector);
407 rdsdebug("ib_create_cq send failed: %d\n", ret); 444 rdsdebug("ib_create_cq send failed: %d\n", ret);
408 goto out; 445 goto rds_ibdev_out;
409 } 446 }
410 447
448 ic->i_rcq_vector = ibdev_get_unused_vector(rds_ibdev);
411 cq_attr.cqe = ic->i_recv_ring.w_nr; 449 cq_attr.cqe = ic->i_recv_ring.w_nr;
450 cq_attr.comp_vector = ic->i_rcq_vector;
412 ic->i_recv_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv, 451 ic->i_recv_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv,
413 rds_ib_cq_event_handler, conn, 452 rds_ib_cq_event_handler, conn,
414 &cq_attr); 453 &cq_attr);
415 if (IS_ERR(ic->i_recv_cq)) { 454 if (IS_ERR(ic->i_recv_cq)) {
416 ret = PTR_ERR(ic->i_recv_cq); 455 ret = PTR_ERR(ic->i_recv_cq);
417 ic->i_recv_cq = NULL; 456 ic->i_recv_cq = NULL;
457 ibdev_put_vector(rds_ibdev, ic->i_rcq_vector);
418 rdsdebug("ib_create_cq recv failed: %d\n", ret); 458 rdsdebug("ib_create_cq recv failed: %d\n", ret);
419 goto out; 459 goto send_cq_out;
420 } 460 }
421 461
422 ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); 462 ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
423 if (ret) { 463 if (ret) {
424 rdsdebug("ib_req_notify_cq send failed: %d\n", ret); 464 rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
425 goto out; 465 goto recv_cq_out;
426 } 466 }
427 467
428 ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); 468 ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
429 if (ret) { 469 if (ret) {
430 rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); 470 rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
431 goto out; 471 goto recv_cq_out;
432 } 472 }
433 473
434 /* XXX negotiate max send/recv with remote? */ 474 /* XXX negotiate max send/recv with remote? */
@@ -445,6 +485,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
445 attr.send_cq = ic->i_send_cq; 485 attr.send_cq = ic->i_send_cq;
446 attr.recv_cq = ic->i_recv_cq; 486 attr.recv_cq = ic->i_recv_cq;
447 atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR); 487 atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);
488 atomic_set(&ic->i_fastunreg_wrs, RDS_IB_DEFAULT_FR_INV_WR);
448 489
449 /* 490 /*
450 * XXX this can fail if max_*_wr is too large? Are we supposed 491 * XXX this can fail if max_*_wr is too large? Are we supposed
@@ -453,7 +494,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
453 ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); 494 ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
454 if (ret) { 495 if (ret) {
455 rdsdebug("rdma_create_qp failed: %d\n", ret); 496 rdsdebug("rdma_create_qp failed: %d\n", ret);
456 goto out; 497 goto recv_cq_out;
457 } 498 }
458 499
459 ic->i_send_hdrs = ib_dma_alloc_coherent(dev, 500 ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
@@ -463,7 +504,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
463 if (!ic->i_send_hdrs) { 504 if (!ic->i_send_hdrs) {
464 ret = -ENOMEM; 505 ret = -ENOMEM;
465 rdsdebug("ib_dma_alloc_coherent send failed\n"); 506 rdsdebug("ib_dma_alloc_coherent send failed\n");
466 goto out; 507 goto qp_out;
467 } 508 }
468 509
469 ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, 510 ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
@@ -473,7 +514,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
473 if (!ic->i_recv_hdrs) { 514 if (!ic->i_recv_hdrs) {
474 ret = -ENOMEM; 515 ret = -ENOMEM;
475 rdsdebug("ib_dma_alloc_coherent recv failed\n"); 516 rdsdebug("ib_dma_alloc_coherent recv failed\n");
476 goto out; 517 goto send_hdrs_dma_out;
477 } 518 }
478 519
479 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), 520 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
@@ -481,7 +522,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
481 if (!ic->i_ack) { 522 if (!ic->i_ack) {
482 ret = -ENOMEM; 523 ret = -ENOMEM;
483 rdsdebug("ib_dma_alloc_coherent ack failed\n"); 524 rdsdebug("ib_dma_alloc_coherent ack failed\n");
484 goto out; 525 goto recv_hdrs_dma_out;
485 } 526 }
486 527
487 ic->i_sends = vzalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work), 528 ic->i_sends = vzalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work),
@@ -489,7 +530,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
489 if (!ic->i_sends) { 530 if (!ic->i_sends) {
490 ret = -ENOMEM; 531 ret = -ENOMEM;
491 rdsdebug("send allocation failed\n"); 532 rdsdebug("send allocation failed\n");
492 goto out; 533 goto ack_dma_out;
493 } 534 }
494 535
495 ic->i_recvs = vzalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work), 536 ic->i_recvs = vzalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work),
@@ -497,7 +538,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
497 if (!ic->i_recvs) { 538 if (!ic->i_recvs) {
498 ret = -ENOMEM; 539 ret = -ENOMEM;
499 rdsdebug("recv allocation failed\n"); 540 rdsdebug("recv allocation failed\n");
500 goto out; 541 goto sends_out;
501 } 542 }
502 543
503 rds_ib_recv_init_ack(ic); 544 rds_ib_recv_init_ack(ic);
@@ -505,8 +546,33 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
505 rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd, 546 rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd,
506 ic->i_send_cq, ic->i_recv_cq); 547 ic->i_send_cq, ic->i_recv_cq);
507 548
508out: 549 return ret;
550
551sends_out:
552 vfree(ic->i_sends);
553ack_dma_out:
554 ib_dma_free_coherent(dev, sizeof(struct rds_header),
555 ic->i_ack, ic->i_ack_dma);
556recv_hdrs_dma_out:
557 ib_dma_free_coherent(dev, ic->i_recv_ring.w_nr *
558 sizeof(struct rds_header),
559 ic->i_recv_hdrs, ic->i_recv_hdrs_dma);
560send_hdrs_dma_out:
561 ib_dma_free_coherent(dev, ic->i_send_ring.w_nr *
562 sizeof(struct rds_header),
563 ic->i_send_hdrs, ic->i_send_hdrs_dma);
564qp_out:
565 rdma_destroy_qp(ic->i_cm_id);
566recv_cq_out:
567 if (!ib_destroy_cq(ic->i_recv_cq))
568 ic->i_recv_cq = NULL;
569send_cq_out:
570 if (!ib_destroy_cq(ic->i_send_cq))
571 ic->i_send_cq = NULL;
572rds_ibdev_out:
573 rds_ib_remove_conn(rds_ibdev, conn);
509 rds_ib_dev_put(rds_ibdev); 574 rds_ib_dev_put(rds_ibdev);
575
510 return ret; 576 return ret;
511} 577}
512 578
@@ -682,6 +748,7 @@ out:
682 if (ic->i_cm_id == cm_id) 748 if (ic->i_cm_id == cm_id)
683 ret = 0; 749 ret = 0;
684 } 750 }
751 ic->i_active_side = true;
685 return ret; 752 return ret;
686} 753}
687 754
@@ -767,17 +834,27 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
767 wait_event(rds_ib_ring_empty_wait, 834 wait_event(rds_ib_ring_empty_wait,
768 rds_ib_ring_empty(&ic->i_recv_ring) && 835 rds_ib_ring_empty(&ic->i_recv_ring) &&
769 (atomic_read(&ic->i_signaled_sends) == 0) && 836 (atomic_read(&ic->i_signaled_sends) == 0) &&
770 (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR)); 837 (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR) &&
838 (atomic_read(&ic->i_fastunreg_wrs) == RDS_IB_DEFAULT_FR_INV_WR));
771 tasklet_kill(&ic->i_send_tasklet); 839 tasklet_kill(&ic->i_send_tasklet);
772 tasklet_kill(&ic->i_recv_tasklet); 840 tasklet_kill(&ic->i_recv_tasklet);
773 841
842 atomic_set(&ic->i_cq_quiesce, 1);
843
774 /* first destroy the ib state that generates callbacks */ 844 /* first destroy the ib state that generates callbacks */
775 if (ic->i_cm_id->qp) 845 if (ic->i_cm_id->qp)
776 rdma_destroy_qp(ic->i_cm_id); 846 rdma_destroy_qp(ic->i_cm_id);
777 if (ic->i_send_cq) 847 if (ic->i_send_cq) {
848 if (ic->rds_ibdev)
849 ibdev_put_vector(ic->rds_ibdev, ic->i_scq_vector);
778 ib_destroy_cq(ic->i_send_cq); 850 ib_destroy_cq(ic->i_send_cq);
779 if (ic->i_recv_cq) 851 }
852
853 if (ic->i_recv_cq) {
854 if (ic->rds_ibdev)
855 ibdev_put_vector(ic->rds_ibdev, ic->i_rcq_vector);
780 ib_destroy_cq(ic->i_recv_cq); 856 ib_destroy_cq(ic->i_recv_cq);
857 }
781 858
782 /* then free the resources that ib callbacks use */ 859 /* then free the resources that ib callbacks use */
783 if (ic->i_send_hdrs) 860 if (ic->i_send_hdrs)
@@ -855,6 +932,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
855 ic->i_sends = NULL; 932 ic->i_sends = NULL;
856 vfree(ic->i_recvs); 933 vfree(ic->i_recvs);
857 ic->i_recvs = NULL; 934 ic->i_recvs = NULL;
935 ic->i_active_side = false;
858} 936}
859 937
860int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) 938int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
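
The ibdev_get_unused_vector()/ibdev_put_vector() helpers added above spread send and receive completion queues across the device's completion vectors: each CQ takes the currently least-loaded slot in the per-device vector_load array (allocated in the ib.c hunk) and gives it back when the CQ is destroyed in rds_ib_conn_path_shutdown(). The selection is a plain linear minimum scan; a standalone, compilable rendering of the same idea (userspace C, no locking, illustrative only — the in-kernel helper scans from the highest vector downward, which only changes which of several equally loaded vectors wins):

    #include <stdio.h>

    /* Pick the least-loaded index, bump its load, and return it. */
    static int get_least_loaded(int *load, int n)
    {
            int best = 0;

            for (int i = 1; i < n; i++)
                    if (load[i] < load[best])
                            best = i;
            load[best]++;
            return best;
    }

    int main(void)
    {
            int load[4] = { 2, 0, 1, 0 };
            int v = get_least_loaded(load, 4);

            printf("chose vector %d, loads now %d %d %d %d\n",
                   v, load[0], load[1], load[2], load[3]);
            return 0;
    }
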
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index d921adc62765..48332a6ed738 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -104,14 +104,15 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
104 struct rds_ib_frmr *frmr = &ibmr->u.frmr; 104 struct rds_ib_frmr *frmr = &ibmr->u.frmr;
105 struct ib_send_wr *failed_wr; 105 struct ib_send_wr *failed_wr;
106 struct ib_reg_wr reg_wr; 106 struct ib_reg_wr reg_wr;
107 int ret; 107 int ret, off = 0;
108 108
109 while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) { 109 while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
110 atomic_inc(&ibmr->ic->i_fastreg_wrs); 110 atomic_inc(&ibmr->ic->i_fastreg_wrs);
111 cpu_relax(); 111 cpu_relax();
112 } 112 }
113 113
114 ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, 0, PAGE_SIZE); 114 ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len,
115 &off, PAGE_SIZE);
115 if (unlikely(ret != ibmr->sg_len)) 116 if (unlikely(ret != ibmr->sg_len))
116 return ret < 0 ? ret : -EINVAL; 117 return ret < 0 ? ret : -EINVAL;
117 118
@@ -240,8 +241,8 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
240 if (frmr->fr_state != FRMR_IS_INUSE) 241 if (frmr->fr_state != FRMR_IS_INUSE)
241 goto out; 242 goto out;
242 243
243 while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) { 244 while (atomic_dec_return(&ibmr->ic->i_fastunreg_wrs) <= 0) {
244 atomic_inc(&ibmr->ic->i_fastreg_wrs); 245 atomic_inc(&ibmr->ic->i_fastunreg_wrs);
245 cpu_relax(); 246 cpu_relax();
246 } 247 }
247 248
@@ -260,7 +261,7 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
260 if (unlikely(ret)) { 261 if (unlikely(ret)) {
261 frmr->fr_state = FRMR_IS_STALE; 262 frmr->fr_state = FRMR_IS_STALE;
262 frmr->fr_inv = false; 263 frmr->fr_inv = false;
263 atomic_inc(&ibmr->ic->i_fastreg_wrs); 264 atomic_inc(&ibmr->ic->i_fastunreg_wrs);
264 pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret); 265 pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret);
265 goto out; 266 goto out;
266 } 267 }
@@ -288,9 +289,10 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
288 if (frmr->fr_inv) { 289 if (frmr->fr_inv) {
289 frmr->fr_state = FRMR_IS_FREE; 290 frmr->fr_state = FRMR_IS_FREE;
290 frmr->fr_inv = false; 291 frmr->fr_inv = false;
292 atomic_inc(&ic->i_fastreg_wrs);
293 } else {
294 atomic_inc(&ic->i_fastunreg_wrs);
291 } 295 }
292
293 atomic_inc(&ic->i_fastreg_wrs);
294} 296}
295 297
296void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed, 298void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
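
With the hunks above, fast-registration and invalidation work requests draw from two separate credit counters (i_fastreg_wrs and the new i_fastunreg_wrs), matching the extra RDS_IB_DEFAULT_FR_INV_WR slots reserved on the send queue in ib_cm.c, so a burst of invalidations can no longer starve registrations. Both counters follow the same acquire/release discipline: decrement, and if the result is not positive, put the credit back and spin. A self-contained C11 sketch of that pattern (a userspace rendering under my own names, not the kernel code):

    #include <stdatomic.h>
    #include <sched.h>

    /* Spin until a credit can be taken. Like the kernel loop, this treats a
     * post-decrement value of zero or less as failure, so one credit always
     * stays in reserve. */
    static void credit_get(atomic_int *credits)
    {
            while (atomic_fetch_sub(credits, 1) <= 1) {
                    atomic_fetch_add(credits, 1);   /* undo and retry */
                    sched_yield();                  /* stand-in for cpu_relax() */
            }
    }

    /* Return a credit, e.g. from the completion handler. */
    static void credit_put(atomic_int *credits)
    {
            atomic_fetch_add(credits, 1);
    }

    int main(void)
    {
            atomic_int credits = 4;

            credit_get(&credits);
            credit_put(&credits);
            return 0;
    }
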
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index 1c754f4acbe5..5d6e98a79a5e 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -45,7 +45,6 @@
45 45
46struct rds_ib_fmr { 46struct rds_ib_fmr {
47 struct ib_fmr *fmr; 47 struct ib_fmr *fmr;
48 u64 *dma;
49}; 48};
50 49
51enum rds_ib_fr_state { 50enum rds_ib_fr_state {
@@ -108,8 +107,6 @@ struct rds_ib_mr_pool {
108}; 107};
109 108
110extern struct workqueue_struct *rds_ib_mr_wq; 109extern struct workqueue_struct *rds_ib_mr_wq;
111extern unsigned int rds_ib_mr_1m_pool_size;
112extern unsigned int rds_ib_mr_8k_pool_size;
113extern bool prefer_frmr; 110extern bool prefer_frmr;
114 111
115struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, 112struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 606a11f681d2..e10624aa6959 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -194,6 +194,8 @@ static void rds_ib_frag_free(struct rds_ib_connection *ic,
194 rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg)); 194 rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
195 195
196 rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags); 196 rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
197 atomic_add(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs);
198 rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE);
197} 199}
198 200
199/* Recycle inc after freeing attached frags */ 201/* Recycle inc after freeing attached frags */
@@ -261,6 +263,7 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i
261 atomic_dec(&rds_ib_allocation); 263 atomic_dec(&rds_ib_allocation);
262 return NULL; 264 return NULL;
263 } 265 }
266 rds_ib_stats_inc(s_ib_rx_total_incs);
264 } 267 }
265 INIT_LIST_HEAD(&ibinc->ii_frags); 268 INIT_LIST_HEAD(&ibinc->ii_frags);
266 rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); 269 rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
@@ -278,6 +281,8 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic
278 cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags); 281 cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
279 if (cache_item) { 282 if (cache_item) {
280 frag = container_of(cache_item, struct rds_page_frag, f_cache_entry); 283 frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
284 atomic_sub(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs);
285 rds_ib_stats_add(s_ib_recv_removed_from_cache, RDS_FRAG_SIZE);
281 } else { 286 } else {
282 frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask); 287 frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
283 if (!frag) 288 if (!frag)
@@ -290,6 +295,7 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic
290 kmem_cache_free(rds_ib_frag_slab, frag); 295 kmem_cache_free(rds_ib_frag_slab, frag);
291 return NULL; 296 return NULL;
292 } 297 }
298 rds_ib_stats_inc(s_ib_rx_total_frags);
293 } 299 }
294 300
295 INIT_LIST_HEAD(&frag->f_item); 301 INIT_LIST_HEAD(&frag->f_item);
@@ -905,8 +911,12 @@ static void rds_ib_process_recv(struct rds_connection *conn,
905 ic->i_ibinc = ibinc; 911 ic->i_ibinc = ibinc;
906 912
907 hdr = &ibinc->ii_inc.i_hdr; 913 hdr = &ibinc->ii_inc.i_hdr;
914 ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
915 local_clock();
908 memcpy(hdr, ihdr, sizeof(*hdr)); 916 memcpy(hdr, ihdr, sizeof(*hdr));
909 ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); 917 ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
918 ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
919 local_clock();
910 920
911 rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc, 921 rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
912 ic->i_recv_data_rem, hdr->h_flags); 922 ic->i_recv_data_rem, hdr->h_flags);
@@ -980,8 +990,8 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
980 } else { 990 } else {
981 /* We expect errors as the qp is drained during shutdown */ 991 /* We expect errors as the qp is drained during shutdown */
982 if (rds_conn_up(conn) || rds_conn_connecting(conn)) 992 if (rds_conn_up(conn) || rds_conn_connecting(conn))
983 rds_ib_conn_error(conn, "recv completion on %pI4 had status %u (%s), disconnecting and reconnecting\n", 993 rds_ib_conn_error(conn, "recv completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n",
984 &conn->c_faddr, 994 &conn->c_laddr, &conn->c_faddr,
985 wc->status, 995 wc->status,
986 ib_wc_status_msg(wc->status)); 996 ib_wc_status_msg(wc->status));
987 } 997 }
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 84d90c97332f..6ab39dbcca01 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -69,16 +69,6 @@ static void rds_ib_send_complete(struct rds_message *rm,
69 complete(rm, notify_status); 69 complete(rm, notify_status);
70} 70}
71 71
72static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
73 struct rm_data_op *op,
74 int wc_status)
75{
76 if (op->op_nents)
77 ib_dma_unmap_sg(ic->i_cm_id->device,
78 op->op_sg, op->op_nents,
79 DMA_TO_DEVICE);
80}
81
82static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, 72static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
83 struct rm_rdma_op *op, 73 struct rm_rdma_op *op,
84 int wc_status) 74 int wc_status)
@@ -139,6 +129,21 @@ static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
139 rds_ib_stats_inc(s_ib_atomic_fadd); 129 rds_ib_stats_inc(s_ib_atomic_fadd);
140} 130}
141 131
132static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
133 struct rm_data_op *op,
134 int wc_status)
135{
136 struct rds_message *rm = container_of(op, struct rds_message, data);
137
138 if (op->op_nents)
139 ib_dma_unmap_sg(ic->i_cm_id->device,
140 op->op_sg, op->op_nents,
141 DMA_TO_DEVICE);
142
143 if (rm->rdma.op_active && rm->data.op_notify)
144 rds_ib_send_unmap_rdma(ic, &rm->rdma, wc_status);
145}
146
142/* 147/*
143 * Unmap the resources associated with a struct send_work. 148 * Unmap the resources associated with a struct send_work.
144 * 149 *
@@ -300,8 +305,8 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
300 305
301 /* We expect errors as the qp is drained during shutdown */ 306 /* We expect errors as the qp is drained during shutdown */
302 if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { 307 if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
303 rds_ib_conn_error(conn, "send completion on %pI4 had status %u (%s), disconnecting and reconnecting\n", 308 rds_ib_conn_error(conn, "send completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n",
304 &conn->c_faddr, wc->status, 309 &conn->c_laddr, &conn->c_faddr, wc->status,
305 ib_wc_status_msg(wc->status)); 310 ib_wc_status_msg(wc->status));
306 } 311 }
307} 312}
@@ -765,7 +770,6 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
765 770
766 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos); 771 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
767 if (work_alloc != 1) { 772 if (work_alloc != 1) {
768 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
769 rds_ib_stats_inc(s_ib_tx_ring_full); 773 rds_ib_stats_inc(s_ib_tx_ring_full);
770 ret = -ENOMEM; 774 ret = -ENOMEM;
771 goto out; 775 goto out;
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
index 7e78dca1f252..9252ad126335 100644
--- a/net/rds/ib_stats.c
+++ b/net/rds/ib_stats.c
@@ -55,6 +55,8 @@ static const char *const rds_ib_stat_names[] = {
55 "ib_rx_refill_from_cq", 55 "ib_rx_refill_from_cq",
56 "ib_rx_refill_from_thread", 56 "ib_rx_refill_from_thread",
57 "ib_rx_alloc_limit", 57 "ib_rx_alloc_limit",
58 "ib_rx_total_frags",
59 "ib_rx_total_incs",
58 "ib_rx_credit_updates", 60 "ib_rx_credit_updates",
59 "ib_ack_sent", 61 "ib_ack_sent",
60 "ib_ack_send_failure", 62 "ib_ack_send_failure",
diff --git a/net/rds/message.c b/net/rds/message.c
index 6cb91061556a..49bfb512d808 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -42,6 +42,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
42[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma), 42[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
43[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest), 43[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
44[RDS_EXTHDR_NPATHS] = sizeof(u16), 44[RDS_EXTHDR_NPATHS] = sizeof(u16),
45[RDS_EXTHDR_GEN_NUM] = sizeof(u32),
45}; 46};
46 47
47 48
diff --git a/net/rds/page.c b/net/rds/page.c
index e2b5a5832d3d..7cc57e098ddb 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -45,35 +45,6 @@ struct rds_page_remainder {
45static 45static
46DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders); 46DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);
47 47
48/*
49 * returns 0 on success or -errno on failure.
50 *
51 * We don't have to worry about flush_dcache_page() as this only works
52 * with private pages. If, say, we were to do directed receive to pinned
53 * user pages we'd have to worry more about cache coherence. (Though
54 * the flush_dcache_page() in get_user_pages() would probably be enough).
55 */
56int rds_page_copy_user(struct page *page, unsigned long offset,
57 void __user *ptr, unsigned long bytes,
58 int to_user)
59{
60 unsigned long ret;
61 void *addr;
62
63 addr = kmap(page);
64 if (to_user) {
65 rds_stats_add(s_copy_to_user, bytes);
66 ret = copy_to_user(ptr, addr + offset, bytes);
67 } else {
68 rds_stats_add(s_copy_from_user, bytes);
69 ret = copy_from_user(addr + offset, ptr, bytes);
70 }
71 kunmap(page);
72
73 return ret ? -EFAULT : 0;
74}
75EXPORT_SYMBOL_GPL(rds_page_copy_user);
76
77/** 48/**
78 * rds_page_remainder_alloc - build up regions of a message. 49 * rds_page_remainder_alloc - build up regions of a message.
79 * 50 *
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 4c93badeabf2..f06fac4886b0 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -40,7 +40,6 @@
40/* 40/*
41 * XXX 41 * XXX
42 * - build with sparse 42 * - build with sparse
43 * - should we limit the size of a mr region? let transport return failure?
44 * - should we detect duplicate keys on a socket? hmm. 43 * - should we detect duplicate keys on a socket? hmm.
45 * - an rdma is an mlock, apply rlimit? 44 * - an rdma is an mlock, apply rlimit?
46 */ 45 */
@@ -135,7 +134,7 @@ void rds_rdma_drop_keys(struct rds_sock *rs)
135 /* Release any MRs associated with this socket */ 134 /* Release any MRs associated with this socket */
136 spin_lock_irqsave(&rs->rs_rdma_lock, flags); 135 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
137 while ((node = rb_first(&rs->rs_rdma_keys))) { 136 while ((node = rb_first(&rs->rs_rdma_keys))) {
138 mr = container_of(node, struct rds_mr, r_rb_node); 137 mr = rb_entry(node, struct rds_mr, r_rb_node);
139 if (mr->r_trans == rs->rs_transport) 138 if (mr->r_trans == rs->rs_transport)
140 mr->r_invalidate = 0; 139 mr->r_invalidate = 0;
141 rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); 140 rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
@@ -200,6 +199,14 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
200 goto out; 199 goto out;
201 } 200 }
202 201
202 /* Restrict the size of mr irrespective of underlying transport
203 * To account for unaligned mr regions, subtract one from nr_pages
204 */
205 if ((nr_pages - 1) > (RDS_MAX_MSG_SIZE >> PAGE_SHIFT)) {
206 ret = -EMSGSIZE;
207 goto out;
208 }
209
203 rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n", 210 rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n",
204 args->vec.addr, args->vec.bytes, nr_pages); 211 args->vec.addr, args->vec.bytes, nr_pages);
205 212
@@ -415,7 +422,8 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
415 spin_lock_irqsave(&rs->rs_rdma_lock, flags); 422 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
416 mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); 423 mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
417 if (!mr) { 424 if (!mr) {
418 printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key); 425 pr_debug("rds: trying to unuse MR with unknown r_key %u!\n",
426 r_key);
419 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); 427 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
420 return; 428 return;
421 } 429 }
@@ -626,6 +634,16 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
626 } 634 }
627 op->op_notifier->n_user_token = args->user_token; 635 op->op_notifier->n_user_token = args->user_token;
628 op->op_notifier->n_status = RDS_RDMA_SUCCESS; 636 op->op_notifier->n_status = RDS_RDMA_SUCCESS;
637
638 /* Enable rdma notification on data operation for composite
639 * rds messages and make sure notification is enabled only
640 * for the data operation which follows it so that application
641 * gets notified only after full message gets delivered.
642 */
643 if (rm->data.op_sg) {
644 rm->rdma.op_notify = 0;
645 rm->data.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
646 }
629 } 647 }
630 648
631 /* The cookie contains the R_Key of the remote memory region, and 649 /* The cookie contains the R_Key of the remote memory region, and
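
The new check in __rds_rdma_map() caps a single memory registration at RDS_MAX_MSG_SIZE (1 MB, defined in the rds.h hunk below); the nr_pages - 1 term allows for a region whose start is not page aligned and therefore touches one extra page. With 4 KB pages that means 256 pages for an aligned 1 MB region and 257 for an unaligned one — both accepted — while anything spanning more pages is rejected with -EMSGSIZE. A compilable illustration of the same arithmetic (userspace, 4 KB pages assumed; the page-count helper is my own rendering of the usual round-up computation):

    #include <stdio.h>

    #define PAGE_SHIFT        12
    #define PAGE_SIZE         (1UL << PAGE_SHIFT)
    #define RDS_MAX_MSG_SIZE  (1UL << 20)

    /* Pages touched by the byte range [addr, addr + bytes). */
    static unsigned long pages_spanned(unsigned long addr, unsigned long bytes)
    {
            return ((addr + bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) -
                   (addr >> PAGE_SHIFT);
    }

    int main(void)
    {
            unsigned long cap = RDS_MAX_MSG_SIZE >> PAGE_SHIFT;   /* 256 pages */
            unsigned long cases[][2] = {
                    { 0x200000, 1UL << 20 },  /* aligned 1 MB   -> 256 pages */
                    { 0x200800, 1UL << 20 },  /* unaligned 1 MB -> 257 pages */
                    { 0x200000, 2UL << 20 },  /* 2 MB           -> 512 pages */
            };

            for (int i = 0; i < 3; i++) {
                    unsigned long n = pages_spanned(cases[i][0], cases[i][1]);

                    printf("%lu bytes -> %lu pages: %s\n", cases[i][1], n,
                           (n - 1) > cap ? "-EMSGSIZE" : "accepted");
            }
            return 0;
    }
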
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 345f09059e9f..fc59821f0a27 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -100,11 +100,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
100 trans->cm_connect_complete(conn, event); 100 trans->cm_connect_complete(conn, event);
101 break; 101 break;
102 102
103 case RDMA_CM_EVENT_REJECTED:
104 rdsdebug("Connection rejected: %s\n",
105 rdma_reject_msg(cm_id, event->status));
106 /* FALLTHROUGH */
103 case RDMA_CM_EVENT_ADDR_ERROR: 107 case RDMA_CM_EVENT_ADDR_ERROR:
104 case RDMA_CM_EVENT_ROUTE_ERROR: 108 case RDMA_CM_EVENT_ROUTE_ERROR:
105 case RDMA_CM_EVENT_CONNECT_ERROR: 109 case RDMA_CM_EVENT_CONNECT_ERROR:
106 case RDMA_CM_EVENT_UNREACHABLE: 110 case RDMA_CM_EVENT_UNREACHABLE:
107 case RDMA_CM_EVENT_REJECTED:
108 case RDMA_CM_EVENT_DEVICE_REMOVAL: 111 case RDMA_CM_EVENT_DEVICE_REMOVAL:
109 case RDMA_CM_EVENT_ADDR_CHANGE: 112 case RDMA_CM_EVENT_ADDR_CHANGE:
110 if (conn) 113 if (conn)
@@ -203,18 +206,13 @@ static int rds_rdma_init(void)
203{ 206{
204 int ret; 207 int ret;
205 208
206 ret = rds_rdma_listen_init(); 209 ret = rds_ib_init();
207 if (ret) 210 if (ret)
208 goto out; 211 goto out;
209 212
210 ret = rds_ib_init(); 213 ret = rds_rdma_listen_init();
211 if (ret) 214 if (ret)
212 goto err_ib_init; 215 rds_ib_exit();
213
214 goto out;
215
216err_ib_init:
217 rds_rdma_listen_stop();
218out: 216out:
219 return ret; 217 return ret;
220} 218}
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 67ba67c058b1..82d38ccf5e8b 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -50,6 +50,9 @@ void rdsdebug(char *fmt, ...)
50#define RDS_FRAG_SHIFT 12 50#define RDS_FRAG_SHIFT 12
51#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) 51#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT))
52 52
53/* Used to limit both RDMA and non-RDMA RDS messages to 1MB */
54#define RDS_MAX_MSG_SIZE ((unsigned int)(1 << 20))
55
53#define RDS_CONG_MAP_BYTES (65536 / 8) 56#define RDS_CONG_MAP_BYTES (65536 / 8)
54#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) 57#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
55#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) 58#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8)
@@ -144,25 +147,28 @@ struct rds_connection {
144 147
145 /* Protocol version */ 148 /* Protocol version */
146 unsigned int c_version; 149 unsigned int c_version;
147 possible_net_t c_net; 150 struct net *c_net;
148 151
149 struct list_head c_map_item; 152 struct list_head c_map_item;
150 unsigned long c_map_queued; 153 unsigned long c_map_queued;
151 154
152 struct rds_conn_path c_path[RDS_MPATH_WORKERS]; 155 struct rds_conn_path c_path[RDS_MPATH_WORKERS];
153 wait_queue_head_t c_hs_waitq; /* handshake waitq */ 156 wait_queue_head_t c_hs_waitq; /* handshake waitq */
157
158 u32 c_my_gen_num;
159 u32 c_peer_gen_num;
154}; 160};
155 161
156static inline 162static inline
157struct net *rds_conn_net(struct rds_connection *conn) 163struct net *rds_conn_net(struct rds_connection *conn)
158{ 164{
159 return read_pnet(&conn->c_net); 165 return conn->c_net;
160} 166}
161 167
162static inline 168static inline
163void rds_conn_net_set(struct rds_connection *conn, struct net *net) 169void rds_conn_net_set(struct rds_connection *conn, struct net *net)
164{ 170{
165 write_pnet(&conn->c_net, net); 171 conn->c_net = get_net(net);
166} 172}
167 173
168#define RDS_FLAG_CONG_BITMAP 0x01 174#define RDS_FLAG_CONG_BITMAP 0x01
@@ -243,9 +249,15 @@ struct rds_ext_header_rdma_dest {
243/* Extension header announcing number of paths. 249/* Extension header announcing number of paths.
244 * Implicit length = 2 bytes. 250 * Implicit length = 2 bytes.
245 */ 251 */
246#define RDS_EXTHDR_NPATHS 4 252#define RDS_EXTHDR_NPATHS 5
253#define RDS_EXTHDR_GEN_NUM 6
247 254
248#define __RDS_EXTHDR_MAX 16 /* for now */ 255#define __RDS_EXTHDR_MAX 16 /* for now */
256#define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1)
257#define RDS_MSG_RX_HDR 0
258#define RDS_MSG_RX_START 1
259#define RDS_MSG_RX_END 2
260#define RDS_MSG_RX_CMSG 3
249 261
250struct rds_incoming { 262struct rds_incoming {
251 atomic_t i_refcount; 263 atomic_t i_refcount;
@@ -258,6 +270,7 @@ struct rds_incoming {
258 270
259 rds_rdma_cookie_t i_rdma_cookie; 271 rds_rdma_cookie_t i_rdma_cookie;
260 struct timeval i_rx_tstamp; 272 struct timeval i_rx_tstamp;
273 u64 i_rx_lat_trace[RDS_RX_MAX_TRACES];
261}; 274};
262 275
263struct rds_mr { 276struct rds_mr {
@@ -338,6 +351,7 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
338#define RDS_MSG_RETRANSMITTED 5 351#define RDS_MSG_RETRANSMITTED 5
339#define RDS_MSG_MAPPED 6 352#define RDS_MSG_MAPPED 6
340#define RDS_MSG_PAGEVEC 7 353#define RDS_MSG_PAGEVEC 7
354#define RDS_MSG_FLUSH 8
341 355
342struct rds_message { 356struct rds_message {
343 atomic_t m_refcount; 357 atomic_t m_refcount;
@@ -414,6 +428,7 @@ struct rds_message {
414 } rdma; 428 } rdma;
415 struct rm_data_op { 429 struct rm_data_op {
416 unsigned int op_active:1; 430 unsigned int op_active:1;
431 unsigned int op_notify:1;
417 unsigned int op_nents; 432 unsigned int op_nents;
418 unsigned int op_count; 433 unsigned int op_count;
419 unsigned int op_dmasg; 434 unsigned int op_dmasg;
@@ -566,6 +581,10 @@ struct rds_sock {
566 unsigned char rs_recverr, 581 unsigned char rs_recverr,
567 rs_cong_monitor; 582 rs_cong_monitor;
568 u32 rs_hash_initval; 583 u32 rs_hash_initval;
584
585 /* Socket receive path trace points */
586 u8 rs_rx_traces;
587 u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
569}; 588};
570 589
571static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) 590static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
@@ -625,6 +644,9 @@ struct rds_statistics {
625 uint64_t s_cong_update_received; 644 uint64_t s_cong_update_received;
626 uint64_t s_cong_send_error; 645 uint64_t s_cong_send_error;
627 uint64_t s_cong_send_blocked; 646 uint64_t s_cong_send_blocked;
647 uint64_t s_recv_bytes_added_to_socket;
648 uint64_t s_recv_bytes_removed_from_socket;
649
628}; 650};
629 651
630/* af_rds.c */ 652/* af_rds.c */
@@ -664,6 +686,7 @@ void rds_cong_exit(void);
664struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); 686struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
665 687
666/* conn.c */ 688/* conn.c */
689extern u32 rds_gen_num;
667int rds_conn_init(void); 690int rds_conn_init(void);
668void rds_conn_exit(void); 691void rds_conn_exit(void);
669struct rds_connection *rds_conn_create(struct net *net, 692struct rds_connection *rds_conn_create(struct net *net,
@@ -683,10 +706,6 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
683 struct rds_info_lengths *lens, 706 struct rds_info_lengths *lens,
684 int (*visitor)(struct rds_connection *, void *), 707 int (*visitor)(struct rds_connection *, void *),
685 size_t item_len); 708 size_t item_len);
686__printf(2, 3)
687void __rds_conn_error(struct rds_connection *conn, const char *, ...);
688#define rds_conn_error(conn, fmt...) \
689 __rds_conn_error(conn, KERN_WARNING "RDS: " fmt)
690 709
691__printf(2, 3) 710__printf(2, 3)
692void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...); 711void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...);
@@ -779,13 +798,6 @@ static inline int rds_message_verify_checksum(const struct rds_header *hdr)
779/* page.c */ 798/* page.c */
780int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, 799int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
781 gfp_t gfp); 800 gfp_t gfp);
782int rds_page_copy_user(struct page *page, unsigned long offset,
783 void __user *ptr, unsigned long bytes,
784 int to_user);
785#define rds_page_copy_to_user(page, offset, ptr, bytes) \
786 rds_page_copy_user(page, offset, ptr, bytes, 1)
787#define rds_page_copy_from_user(page, offset, ptr, bytes) \
788 rds_page_copy_user(page, offset, ptr, bytes, 0)
789void rds_page_exit(void); 801void rds_page_exit(void);
790 802
791/* recv.c */ 803/* recv.c */
@@ -891,7 +903,7 @@ void rds_connect_path_complete(struct rds_conn_path *conn, int curr);
891void rds_connect_complete(struct rds_connection *conn); 903void rds_connect_complete(struct rds_connection *conn);
892 904
893/* transport.c */ 905/* transport.c */
894int rds_trans_register(struct rds_transport *trans); 906void rds_trans_register(struct rds_transport *trans);
895void rds_trans_unregister(struct rds_transport *trans); 907void rds_trans_unregister(struct rds_transport *trans);
896struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr); 908struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr);
897void rds_trans_put(struct rds_transport *trans); 909void rds_trans_put(struct rds_transport *trans);
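
Besides the 1 MB message cap and the dma_addr_t/struct net cleanups, the rds.h hunk above renumbers RDS_EXTHDR_NPATHS to 5 and adds RDS_EXTHDR_GEN_NUM: each side now advertises a per-instance generation number in its handshake probes (see the send.c hunk further down), and the receive path compares it against the value recorded for the connection. A changed, nonzero generation number means the peer restarted, so every message still sitting on the retransmit queues is flagged RDS_MSG_FLUSH and dropped instead of being replayed to the new instance. A compilable illustration of just that decision (the function name and sample values are mine):

    #include <stdint.h>
    #include <stdio.h>

    /* Mirrors the condition rds_conn_peer_gen_update() tests before it
     * resets the per-path sequence numbers and flushes retransmits. */
    static int peer_restarted(uint32_t recorded_gen, uint32_t advertised_gen)
    {
            return advertised_gen != 0 &&
                   recorded_gen != 0 &&
                   advertised_gen != recorded_gen;
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   peer_restarted(0, 7),    /* first handshake: 0 */
                   peer_restarted(7, 7),    /* same instance:   0 */
                   peer_restarted(7, 9));   /* peer restarted:  1 */
            return 0;
    }
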
diff --git a/net/rds/recv.c b/net/rds/recv.c
index cbfabdf3ff48..8b7e7b7f2c2d 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -43,6 +43,8 @@
43void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, 43void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
44 __be32 saddr) 44 __be32 saddr)
45{ 45{
46 int i;
47
46 atomic_set(&inc->i_refcount, 1); 48 atomic_set(&inc->i_refcount, 1);
47 INIT_LIST_HEAD(&inc->i_item); 49 INIT_LIST_HEAD(&inc->i_item);
48 inc->i_conn = conn; 50 inc->i_conn = conn;
@@ -50,6 +52,9 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
50 inc->i_rdma_cookie = 0; 52 inc->i_rdma_cookie = 0;
51 inc->i_rx_tstamp.tv_sec = 0; 53 inc->i_rx_tstamp.tv_sec = 0;
52 inc->i_rx_tstamp.tv_usec = 0; 54 inc->i_rx_tstamp.tv_usec = 0;
55
56 for (i = 0; i < RDS_RX_MAX_TRACES; i++)
57 inc->i_rx_lat_trace[i] = 0;
53} 58}
54EXPORT_SYMBOL_GPL(rds_inc_init); 59EXPORT_SYMBOL_GPL(rds_inc_init);
55 60
@@ -94,6 +99,10 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
94 return; 99 return;
95 100
96 rs->rs_rcv_bytes += delta; 101 rs->rs_rcv_bytes += delta;
102 if (delta > 0)
103 rds_stats_add(s_recv_bytes_added_to_socket, delta);
104 else
105 rds_stats_add(s_recv_bytes_removed_from_socket, -delta);
97 now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); 106 now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
98 107
99 rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d " 108 rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
@@ -120,6 +129,36 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
120 /* do nothing if no change in cong state */ 129 /* do nothing if no change in cong state */
121} 130}
122 131
132static void rds_conn_peer_gen_update(struct rds_connection *conn,
133 u32 peer_gen_num)
134{
135 int i;
136 struct rds_message *rm, *tmp;
137 unsigned long flags;
138
139 WARN_ON(conn->c_trans->t_type != RDS_TRANS_TCP);
140 if (peer_gen_num != 0) {
141 if (conn->c_peer_gen_num != 0 &&
142 peer_gen_num != conn->c_peer_gen_num) {
143 for (i = 0; i < RDS_MPATH_WORKERS; i++) {
144 struct rds_conn_path *cp;
145
146 cp = &conn->c_path[i];
147 spin_lock_irqsave(&cp->cp_lock, flags);
148 cp->cp_next_tx_seq = 1;
149 cp->cp_next_rx_seq = 0;
150 list_for_each_entry_safe(rm, tmp,
151 &cp->cp_retrans,
152 m_conn_item) {
153 set_bit(RDS_MSG_FLUSH, &rm->m_flags);
154 }
155 spin_unlock_irqrestore(&cp->cp_lock, flags);
156 }
157 }
158 conn->c_peer_gen_num = peer_gen_num;
159 }
160}
161
123/* 162/*
124 * Process all extension headers that come with this message. 163 * Process all extension headers that come with this message.
125 */ 164 */
@@ -163,7 +202,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
163 union { 202 union {
164 struct rds_ext_header_version version; 203 struct rds_ext_header_version version;
165 u16 rds_npaths; 204 u16 rds_npaths;
205 u32 rds_gen_num;
166 } buffer; 206 } buffer;
207 u32 new_peer_gen_num = 0;
167 208
168 while (1) { 209 while (1) {
169 len = sizeof(buffer); 210 len = sizeof(buffer);
@@ -176,6 +217,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
176 conn->c_npaths = min_t(int, RDS_MPATH_WORKERS, 217 conn->c_npaths = min_t(int, RDS_MPATH_WORKERS,
177 buffer.rds_npaths); 218 buffer.rds_npaths);
178 break; 219 break;
220 case RDS_EXTHDR_GEN_NUM:
221 new_peer_gen_num = buffer.rds_gen_num;
222 break;
179 default: 223 default:
180 pr_warn_ratelimited("ignoring unknown exthdr type " 224 pr_warn_ratelimited("ignoring unknown exthdr type "
181 "0x%x\n", type); 225 "0x%x\n", type);
@@ -183,6 +227,7 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
183 } 227 }
184 /* if RDS_EXTHDR_NPATHS was not found, default to a single-path */ 228 /* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
185 conn->c_npaths = max_t(int, conn->c_npaths, 1); 229 conn->c_npaths = max_t(int, conn->c_npaths, 1);
230 rds_conn_peer_gen_update(conn, new_peer_gen_num);
186} 231}
187 232
188/* rds_start_mprds() will synchronously start multiple paths when appropriate. 233/* rds_start_mprds() will synchronously start multiple paths when appropriate.
@@ -333,6 +378,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
333 if (sock_flag(sk, SOCK_RCVTSTAMP)) 378 if (sock_flag(sk, SOCK_RCVTSTAMP))
334 do_gettimeofday(&inc->i_rx_tstamp); 379 do_gettimeofday(&inc->i_rx_tstamp);
335 rds_inc_addref(inc); 380 rds_inc_addref(inc);
381 inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock();
336 list_add_tail(&inc->i_item, &rs->rs_recv_queue); 382 list_add_tail(&inc->i_item, &rs->rs_recv_queue);
337 __rds_wake_sk_sleep(sk); 383 __rds_wake_sk_sleep(sk);
338 } else { 384 } else {
@@ -494,7 +540,7 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
494 ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST, 540 ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
495 sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie); 541 sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
496 if (ret) 542 if (ret)
497 return ret; 543 goto out;
498 } 544 }
499 545
500 if ((inc->i_rx_tstamp.tv_sec != 0) && 546 if ((inc->i_rx_tstamp.tv_sec != 0) &&
@@ -503,10 +549,30 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
503 sizeof(struct timeval), 549 sizeof(struct timeval),
504 &inc->i_rx_tstamp); 550 &inc->i_rx_tstamp);
505 if (ret) 551 if (ret)
506 return ret; 552 goto out;
507 } 553 }
508 554
509 return 0; 555 if (rs->rs_rx_traces) {
556 struct rds_cmsg_rx_trace t;
557 int i, j;
558
559 inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock();
560 t.rx_traces = rs->rs_rx_traces;
561 for (i = 0; i < rs->rs_rx_traces; i++) {
562 j = rs->rs_rx_trace[i];
563 t.rx_trace_pos[i] = j;
564 t.rx_trace[i] = inc->i_rx_lat_trace[j + 1] -
565 inc->i_rx_lat_trace[j];
566 }
567
568 ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RXPATH_LATENCY,
569 sizeof(t), &t);
570 if (ret)
571 goto out;
572 }
573
574out:
575 return ret;
510} 576}
511 577
512int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 578int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
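
The recv.c hunks above stamp local_clock() at fixed points along the receive path (header parsed, message started, queued to the socket, control message built — the RDS_MSG_RX_* indices from rds.h) and, when the application has enabled tracing on the socket, report per-stage deltas through the new RDS_CMSG_RXPATH_LATENCY control message; the earlier early returns in rds_cmsg_recv() are also folded into a single out label so every exit takes the same path. The delta computation is just consecutive differences over the trace array; a compilable sketch of that step (timestamps and the requested stages are made up for illustration):

    #include <stdio.h>
    #include <stdint.h>

    #define RDS_MSG_RX_HDR   0
    #define RDS_MSG_RX_START 1
    #define RDS_MSG_RX_END   2
    #define RDS_MSG_RX_CMSG  3

    int main(void)
    {
            /* Pretend local_clock() samples, in nanoseconds. */
            uint64_t trace[4] = { 1000, 1400, 9200, 9450 };
            /* Stages the application asked for, as in rs->rs_rx_trace[]. */
            int want[] = { RDS_MSG_RX_HDR, RDS_MSG_RX_START, RDS_MSG_RX_END };

            for (unsigned i = 0; i < sizeof(want) / sizeof(want[0]); i++) {
                    int j = want[i];

                    /* Same trace[j + 1] - trace[j] difference the cmsg
                     * fill loop computes. */
                    printf("stage %d took %llu ns\n", j,
                           (unsigned long long)(trace[j + 1] - trace[j]));
            }
            return 0;
    }
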
diff --git a/net/rds/send.c b/net/rds/send.c
index 896626b9a0ef..5cc64039caf7 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -259,8 +259,9 @@ restart:
259 * connection. 259 * connection.
260 * Therefore, we never retransmit messages with RDMA ops. 260 * Therefore, we never retransmit messages with RDMA ops.
261 */ 261 */
262 if (rm->rdma.op_active && 262 if (test_bit(RDS_MSG_FLUSH, &rm->m_flags) ||
263 test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { 263 (rm->rdma.op_active &&
264 test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))) {
264 spin_lock_irqsave(&cp->cp_lock, flags); 265 spin_lock_irqsave(&cp->cp_lock, flags);
265 if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) 266 if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
266 list_move(&rm->m_conn_item, &to_be_dropped); 267 list_move(&rm->m_conn_item, &to_be_dropped);
@@ -475,12 +476,14 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
475 struct rm_rdma_op *ro; 476 struct rm_rdma_op *ro;
476 struct rds_notifier *notifier; 477 struct rds_notifier *notifier;
477 unsigned long flags; 478 unsigned long flags;
479 unsigned int notify = 0;
478 480
479 spin_lock_irqsave(&rm->m_rs_lock, flags); 481 spin_lock_irqsave(&rm->m_rs_lock, flags);
480 482
483 notify = rm->rdma.op_notify | rm->data.op_notify;
481 ro = &rm->rdma; 484 ro = &rm->rdma;
482 if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && 485 if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
483 ro->op_active && ro->op_notify && ro->op_notifier) { 486 ro->op_active && notify && ro->op_notifier) {
484 notifier = ro->op_notifier; 487 notifier = ro->op_notifier;
485 rs = rm->m_rs; 488 rs = rm->m_rs;
486 sock_hold(rds_rs_to_sk(rs)); 489 sock_hold(rds_rs_to_sk(rs));
@@ -944,6 +947,11 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
944 ret = rds_cmsg_rdma_map(rs, rm, cmsg); 947 ret = rds_cmsg_rdma_map(rs, rm, cmsg);
945 if (!ret) 948 if (!ret)
946 *allocated_mr = 1; 949 *allocated_mr = 1;
950 else if (ret == -ENODEV)
951 /* Accommodate the get_mr() case which can fail
952 * if connection isn't established yet.
953 */
954 ret = -EAGAIN;
947 break; 955 break;
948 case RDS_CMSG_ATOMIC_CSWP: 956 case RDS_CMSG_ATOMIC_CSWP:
949 case RDS_CMSG_ATOMIC_FADD: 957 case RDS_CMSG_ATOMIC_FADD:
@@ -986,6 +994,26 @@ static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn)
986 return hash; 994 return hash;
987} 995}
988 996
997static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes)
998{
999 struct rds_rdma_args *args;
1000 struct cmsghdr *cmsg;
1001
1002 for_each_cmsghdr(cmsg, msg) {
1003 if (!CMSG_OK(msg, cmsg))
1004 return -EINVAL;
1005
1006 if (cmsg->cmsg_level != SOL_RDS)
1007 continue;
1008
1009 if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) {
1010 args = CMSG_DATA(cmsg);
1011 *rdma_bytes += args->remote_vec.bytes;
1012 }
1013 }
1014 return 0;
1015}
1016
989int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) 1017int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
990{ 1018{
991 struct sock *sk = sock->sk; 1019 struct sock *sk = sock->sk;
@@ -1000,6 +1028,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
1000 int nonblock = msg->msg_flags & MSG_DONTWAIT; 1028 int nonblock = msg->msg_flags & MSG_DONTWAIT;
1001 long timeo = sock_sndtimeo(sk, nonblock); 1029 long timeo = sock_sndtimeo(sk, nonblock);
1002 struct rds_conn_path *cpath; 1030 struct rds_conn_path *cpath;
1031 size_t total_payload_len = payload_len, rdma_payload_len = 0;
1003 1032
1004 /* Mirror Linux UDP mirror of BSD error message compatibility */ 1033 /* Mirror Linux UDP mirror of BSD error message compatibility */
1005 /* XXX: Perhaps MSG_MORE someday */ 1034 /* XXX: Perhaps MSG_MORE someday */
@@ -1032,6 +1061,16 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
1032 } 1061 }
1033 release_sock(sk); 1062 release_sock(sk);
1034 1063
1064 ret = rds_rdma_bytes(msg, &rdma_payload_len);
1065 if (ret)
1066 goto out;
1067
1068 total_payload_len += rdma_payload_len;
1069 if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) {
1070 ret = -EMSGSIZE;
1071 goto out;
1072 }
1073
1035 if (payload_len > rds_sk_sndbuf(rs)) { 1074 if (payload_len > rds_sk_sndbuf(rs)) {
1036 ret = -EMSGSIZE; 1075 ret = -EMSGSIZE;
1037 goto out; 1076 goto out;
@@ -1081,8 +1120,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
1081 1120
1082 /* Parse any control messages the user may have included. */ 1121 /* Parse any control messages the user may have included. */
1083 ret = rds_cmsg_send(rs, rm, msg, &allocated_mr); 1122 ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
1084 if (ret) 1123 if (ret) {
1124 /* Trigger connection so that it's ready for the next retry */
1125 if (ret == -EAGAIN)
1126 rds_conn_connect_if_down(conn);
1085 goto out; 1127 goto out;
1128 }
1086 1129
1087 if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) { 1130 if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
1088 printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", 1131 printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
@@ -1168,7 +1211,7 @@ out:
1168 * or 1211 * or
1169 * RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED 1212 * RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED
1170 */ 1213 */
1171int 1214static int
1172rds_send_probe(struct rds_conn_path *cp, __be16 sport, 1215rds_send_probe(struct rds_conn_path *cp, __be16 sport,
1173 __be16 dport, u8 h_flags) 1216 __be16 dport, u8 h_flags)
1174{ 1217{
@@ -1209,6 +1252,10 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
1209 rds_message_add_extension(&rm->m_inc.i_hdr, 1252 rds_message_add_extension(&rm->m_inc.i_hdr,
1210 RDS_EXTHDR_NPATHS, &npaths, 1253 RDS_EXTHDR_NPATHS, &npaths,
1211 sizeof(npaths)); 1254 sizeof(npaths));
1255 rds_message_add_extension(&rm->m_inc.i_hdr,
1256 RDS_EXTHDR_GEN_NUM,
1257 &cp->cp_conn->c_my_gen_num,
1258 sizeof(u32));
1212 } 1259 }
1213 spin_unlock_irqrestore(&cp->cp_lock, flags); 1260 spin_unlock_irqrestore(&cp->cp_lock, flags);
1214 1261
@@ -1233,7 +1280,7 @@ rds_send_pong(struct rds_conn_path *cp, __be16 dport)
1233 return rds_send_probe(cp, 0, dport, 0); 1280 return rds_send_probe(cp, 0, dport, 0);
1234} 1281}
1235 1282
1236void 1283static void
1237rds_send_ping(struct rds_connection *conn) 1284rds_send_ping(struct rds_connection *conn)
1238{ 1285{
1239 unsigned long flags; 1286 unsigned long flags;
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 20e2923dc827..225690076773 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -220,7 +220,7 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
220 write_unlock_bh(&sock->sk->sk_callback_lock); 220 write_unlock_bh(&sock->sk->sk_callback_lock);
221} 221}
222 222
223static void rds_tcp_tc_info(struct socket *sock, unsigned int len, 223static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
224 struct rds_info_iterator *iter, 224 struct rds_info_iterator *iter,
225 struct rds_info_lengths *lens) 225 struct rds_info_lengths *lens)
226{ 226{
@@ -229,6 +229,7 @@ static void rds_tcp_tc_info(struct socket *sock, unsigned int len,
229 unsigned long flags; 229 unsigned long flags;
230 struct sockaddr_in sin; 230 struct sockaddr_in sin;
231 int sinlen; 231 int sinlen;
232 struct socket *sock;
232 233
233 spin_lock_irqsave(&rds_tcp_tc_list_lock, flags); 234 spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
234 235
@@ -237,12 +238,17 @@ static void rds_tcp_tc_info(struct socket *sock, unsigned int len,
237 238
238 list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) { 239 list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
239 240
240 sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 0); 241 sock = tc->t_sock;
241 tsinfo.local_addr = sin.sin_addr.s_addr; 242 if (sock) {
242 tsinfo.local_port = sin.sin_port; 243 sock->ops->getname(sock, (struct sockaddr *)&sin,
243 sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 1); 244 &sinlen, 0);
244 tsinfo.peer_addr = sin.sin_addr.s_addr; 245 tsinfo.local_addr = sin.sin_addr.s_addr;
245 tsinfo.peer_port = sin.sin_port; 246 tsinfo.local_port = sin.sin_port;
247 sock->ops->getname(sock, (struct sockaddr *)&sin,
248 &sinlen, 1);
249 tsinfo.peer_addr = sin.sin_addr.s_addr;
250 tsinfo.peer_port = sin.sin_port;
251 }
246 252
247 tsinfo.hdr_rem = tc->t_tinc_hdr_rem; 253 tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
248 tsinfo.data_rem = tc->t_tinc_data_rem; 254 tsinfo.data_rem = tc->t_tinc_data_rem;
@@ -360,7 +366,7 @@ struct rds_transport rds_tcp_transport = {
360 .t_mp_capable = 1, 366 .t_mp_capable = 1,
361}; 367};
362 368
363static int rds_tcp_netid; 369static unsigned int rds_tcp_netid;
364 370
365/* per-network namespace private data for this module */ 371/* per-network namespace private data for this module */
366struct rds_tcp_net { 372struct rds_tcp_net {
@@ -478,9 +484,10 @@ static void __net_exit rds_tcp_exit_net(struct net *net)
478 * we do need to clean up the listen socket here. 484 * we do need to clean up the listen socket here.
479 */ 485 */
480 if (rtn->rds_tcp_listen_sock) { 486 if (rtn->rds_tcp_listen_sock) {
481 rds_tcp_listen_stop(rtn->rds_tcp_listen_sock); 487 struct socket *lsock = rtn->rds_tcp_listen_sock;
488
482 rtn->rds_tcp_listen_sock = NULL; 489 rtn->rds_tcp_listen_sock = NULL;
483 flush_work(&rtn->rds_tcp_accept_w); 490 rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w);
484 } 491 }
485} 492}
486 493
@@ -517,13 +524,13 @@ static void rds_tcp_kill_sock(struct net *net)
517 struct rds_tcp_connection *tc, *_tc; 524 struct rds_tcp_connection *tc, *_tc;
518 LIST_HEAD(tmp_list); 525 LIST_HEAD(tmp_list);
519 struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); 526 struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
527 struct socket *lsock = rtn->rds_tcp_listen_sock;
520 528
521 rds_tcp_listen_stop(rtn->rds_tcp_listen_sock);
522 rtn->rds_tcp_listen_sock = NULL; 529 rtn->rds_tcp_listen_sock = NULL;
523 flush_work(&rtn->rds_tcp_accept_w); 530 rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w);
524 spin_lock_irq(&rds_tcp_conn_lock); 531 spin_lock_irq(&rds_tcp_conn_lock);
525 list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { 532 list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
526 struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); 533 struct net *c_net = tc->t_cpath->cp_conn->c_net;
527 534
528 if (net != c_net || !tc->t_sock) 535 if (net != c_net || !tc->t_sock)
529 continue; 536 continue;
@@ -540,8 +547,12 @@ static void rds_tcp_kill_sock(struct net *net)
540void *rds_tcp_listen_sock_def_readable(struct net *net) 547void *rds_tcp_listen_sock_def_readable(struct net *net)
541{ 548{
542 struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); 549 struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
550 struct socket *lsock = rtn->rds_tcp_listen_sock;
543 551
544 return rtn->rds_tcp_listen_sock->sk->sk_user_data; 552 if (!lsock)
553 return NULL;
554
555 return lsock->sk->sk_user_data;
545} 556}
546 557
547static int rds_tcp_dev_event(struct notifier_block *this, 558static int rds_tcp_dev_event(struct notifier_block *this,
@@ -578,7 +589,7 @@ static void rds_tcp_sysctl_reset(struct net *net)
578 589
579 spin_lock_irq(&rds_tcp_conn_lock); 590 spin_lock_irq(&rds_tcp_conn_lock);
580 list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { 591 list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
581 struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); 592 struct net *c_net = tc->t_cpath->cp_conn->c_net;
582 593
583 if (net != c_net || !tc->t_sock) 594 if (net != c_net || !tc->t_sock)
584 continue; 595 continue;
@@ -632,35 +643,31 @@ static int rds_tcp_init(void)
632 goto out; 643 goto out;
633 } 644 }
634 645
635 ret = register_netdevice_notifier(&rds_tcp_dev_notifier); 646 ret = rds_tcp_recv_init();
636 if (ret) {
637 pr_warn("could not register rds_tcp_dev_notifier\n");
638 goto out;
639 }
640
641 ret = register_pernet_subsys(&rds_tcp_net_ops);
642 if (ret) 647 if (ret)
643 goto out_slab; 648 goto out_slab;
644 649
645 ret = rds_tcp_recv_init(); 650 ret = register_pernet_subsys(&rds_tcp_net_ops);
646 if (ret) 651 if (ret)
652 goto out_recv;
653
654 ret = register_netdevice_notifier(&rds_tcp_dev_notifier);
655 if (ret) {
656 pr_warn("could not register rds_tcp_dev_notifier\n");
647 goto out_pernet; 657 goto out_pernet;
658 }
648 659
649 ret = rds_trans_register(&rds_tcp_transport); 660 rds_trans_register(&rds_tcp_transport);
650 if (ret)
651 goto out_recv;
652 661
653 rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); 662 rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
654 663
655 goto out; 664 goto out;
656 665
657out_recv:
658 rds_tcp_recv_exit();
659out_pernet: 666out_pernet:
660 unregister_pernet_subsys(&rds_tcp_net_ops); 667 unregister_pernet_subsys(&rds_tcp_net_ops);
668out_recv:
669 rds_tcp_recv_exit();
661out_slab: 670out_slab:
662 if (unregister_netdevice_notifier(&rds_tcp_dev_notifier))
663 pr_warn("could not unregister rds_tcp_dev_notifier\n");
664 kmem_cache_destroy(rds_tcp_conn_slab); 671 kmem_cache_destroy(rds_tcp_conn_slab);
665out: 672out:
666 return ret; 673 return ret;
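
Illustration, not part of the patch: rds_tcp_init() now registers in the order recv init, pernet subsys, netdevice notifier, and the error labels unwind in the reverse order of whatever succeeded. The following standalone sketch shows the same goto-ladder pattern; the step_*/undo_* names are hypothetical stand-ins for the RDS calls.

#include <stdio.h>

static int step_recv(void)     { puts("recv init");     return 0; }
static int step_pernet(void)   { puts("pernet init");   return 0; }
static int step_notifier(void) { puts("notifier init"); return -1; /* simulate failure */ }

static void undo_recv(void)    { puts("recv exit"); }
static void undo_pernet(void)  { puts("pernet exit"); }

static int init_all(void)
{
        int ret;

        ret = step_recv();
        if (ret)
                goto out;
        ret = step_pernet();
        if (ret)
                goto out_recv;
        ret = step_notifier();
        if (ret)
                goto out_pernet;
        return 0;

out_pernet:
        undo_pernet();   /* undo in the reverse order of setup */
out_recv:
        undo_recv();
out:
        return ret;
}

int main(void)
{
        return init_all() ? 1 : 0;
}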
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 9a1cc8906576..56ea6620fcf9 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -66,7 +66,7 @@ void rds_tcp_state_change(struct sock *sk);
66 66
67/* tcp_listen.c */ 67/* tcp_listen.c */
68struct socket *rds_tcp_listen_init(struct net *); 68struct socket *rds_tcp_listen_init(struct net *);
69void rds_tcp_listen_stop(struct socket *); 69void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor);
70void rds_tcp_listen_data_ready(struct sock *sk); 70void rds_tcp_listen_data_ready(struct sock *sk);
71int rds_tcp_accept_one(struct socket *sock); 71int rds_tcp_accept_one(struct socket *sock);
72int rds_tcp_keepalive(struct socket *sock); 72int rds_tcp_keepalive(struct socket *sock);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 05f61c533ed3..d6839d96d539 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -60,7 +60,19 @@ void rds_tcp_state_change(struct sock *sk)
60 case TCP_SYN_RECV: 60 case TCP_SYN_RECV:
61 break; 61 break;
62 case TCP_ESTABLISHED: 62 case TCP_ESTABLISHED:
63 rds_connect_path_complete(cp, RDS_CONN_CONNECTING); 63 /* Force the peer to reconnect so that we have the
64 * TCP ports going from <smaller-ip>.<transient> to
65 * <larger-ip>.<RDS_TCP_PORT>. We avoid marking the
66 * RDS connection as RDS_CONN_UP until the reconnect,
67 * to avoid RDS datagram loss.
68 */
69 if (cp->cp_conn->c_laddr > cp->cp_conn->c_faddr &&
70 rds_conn_path_transition(cp, RDS_CONN_CONNECTING,
71 RDS_CONN_ERROR)) {
72 rds_conn_path_drop(cp);
73 } else {
74 rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
75 }
64 break; 76 break;
65 case TCP_CLOSE_WAIT: 77 case TCP_CLOSE_WAIT:
66 case TCP_CLOSE: 78 case TCP_CLOSE:
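
Illustration, not part of the patch: the TCP_ESTABLISHED case above drops a connection whose local address is the larger of the pair, so that the surviving TCP connection always runs from <smaller-ip>.<transient> to <larger-ip>.<RDS_TCP_PORT>. A userspace sketch of that decision follows; it compares addresses in host byte order for readability, whereas the kernel compares the raw __be32 c_laddr/c_faddr values.

#include <arpa/inet.h>
#include <stdio.h>

/* nonzero if the local side must drop and let the peer reconnect, i.e. the
 * local address is the larger of the pair
 */
static int must_defer_to_peer(const char *laddr, const char *faddr)
{
        struct in_addr l, f;

        if (inet_pton(AF_INET, laddr, &l) != 1 ||
            inet_pton(AF_INET, faddr, &f) != 1)
                return -1;
        return ntohl(l.s_addr) > ntohl(f.s_addr);
}

int main(void)
{
        printf("%d\n", must_defer_to_peer("192.168.1.20", "192.168.1.10")); /* 1 */
        printf("%d\n", must_defer_to_peer("192.168.1.10", "192.168.1.20")); /* 0 */
        return 0;
}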
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index e0b23fb5b8d5..507678853e6c 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -79,31 +79,27 @@ bail:
79 * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side 79 * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side
80 * by moving them to CONNECTING in this function. 80 * by moving them to CONNECTING in this function.
81 */ 81 */
82static
82struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) 83struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
83{ 84{
84 int i; 85 int i;
85 bool peer_is_smaller = (conn->c_faddr < conn->c_laddr); 86 bool peer_is_smaller = (conn->c_faddr < conn->c_laddr);
86 int npaths = conn->c_npaths; 87 int npaths = max_t(int, 1, conn->c_npaths);
87
88 if (npaths <= 1) {
89 struct rds_conn_path *cp = &conn->c_path[0];
90 int ret;
91
92 ret = rds_conn_path_transition(cp, RDS_CONN_DOWN,
93 RDS_CONN_CONNECTING);
94 if (!ret)
95 rds_conn_path_transition(cp, RDS_CONN_ERROR,
96 RDS_CONN_CONNECTING);
97 return cp->cp_transport_data;
98 }
99 88
100 /* for mprds, paths with cp_index > 0 MUST be initiated by the peer 89 /* for mprds, all paths MUST be initiated by the peer
101 * with the smaller address. 90 * with the smaller address.
102 */ 91 */
103 if (!peer_is_smaller) 92 if (!peer_is_smaller) {
93 /* Make sure we initiate at least one path if this
94 * has not already been done; rds_start_mprds() will
95 * take care of additional paths, if necessary.
96 */
97 if (npaths == 1)
98 rds_conn_path_connect_if_down(&conn->c_path[0]);
104 return NULL; 99 return NULL;
100 }
105 101
106 for (i = 1; i < npaths; i++) { 102 for (i = 0; i < npaths; i++) {
107 struct rds_conn_path *cp = &conn->c_path[i]; 103 struct rds_conn_path *cp = &conn->c_path[i];
108 104
109 if (rds_conn_path_transition(cp, RDS_CONN_DOWN, 105 if (rds_conn_path_transition(cp, RDS_CONN_DOWN,
@@ -137,7 +133,7 @@ int rds_tcp_accept_one(struct socket *sock)
137 133
138 new_sock->type = sock->type; 134 new_sock->type = sock->type;
139 new_sock->ops = sock->ops; 135 new_sock->ops = sock->ops;
140 ret = sock->ops->accept(sock, new_sock, O_NONBLOCK); 136 ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, true);
141 if (ret < 0) 137 if (ret < 0)
142 goto out; 138 goto out;
143 139
@@ -171,8 +167,8 @@ int rds_tcp_accept_one(struct socket *sock)
171 mutex_lock(&rs_tcp->t_conn_path_lock); 167 mutex_lock(&rs_tcp->t_conn_path_lock);
172 cp = rs_tcp->t_cpath; 168 cp = rs_tcp->t_cpath;
173 conn_state = rds_conn_path_state(cp); 169 conn_state = rds_conn_path_state(cp);
174 if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP && 170 WARN_ON(conn_state == RDS_CONN_UP);
175 conn_state != RDS_CONN_ERROR) 171 if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR)
176 goto rst_nsk; 172 goto rst_nsk;
177 if (rs_tcp->t_sock) { 173 if (rs_tcp->t_sock) {
178 /* Need to resolve a duelling SYN between peers. 174 /* Need to resolve a duelling SYN between peers.
@@ -227,6 +223,9 @@ void rds_tcp_listen_data_ready(struct sock *sk)
227 * before it has been accepted and the accepter has set up their 223 * before it has been accepted and the accepter has set up their
228 * data_ready.. we only want to queue listen work for our listening 224 * data_ready.. we only want to queue listen work for our listening
229 * socket 225 * socket
226 *
227 * (*ready)() may be null if we are racing with netns delete, and
228 * the listen socket is being torn down.
230 */ 229 */
231 if (sk->sk_state == TCP_LISTEN) 230 if (sk->sk_state == TCP_LISTEN)
232 rds_tcp_accept_work(sk); 231 rds_tcp_accept_work(sk);
@@ -235,7 +234,8 @@ void rds_tcp_listen_data_ready(struct sock *sk)
235 234
236out: 235out:
237 read_unlock_bh(&sk->sk_callback_lock); 236 read_unlock_bh(&sk->sk_callback_lock);
238 ready(sk); 237 if (ready)
238 ready(sk);
239} 239}
240 240
241struct socket *rds_tcp_listen_init(struct net *net) 241struct socket *rds_tcp_listen_init(struct net *net)
@@ -275,7 +275,7 @@ out:
275 return NULL; 275 return NULL;
276} 276}
277 277
278void rds_tcp_listen_stop(struct socket *sock) 278void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor)
279{ 279{
280 struct sock *sk; 280 struct sock *sk;
281 281
@@ -296,5 +296,6 @@ void rds_tcp_listen_stop(struct socket *sock)
296 296
297 /* wait for accepts to stop and close the socket */ 297 /* wait for accepts to stop and close the socket */
298 flush_workqueue(rds_wq); 298 flush_workqueue(rds_wq);
299 flush_work(acceptor);
299 sock_release(sock); 300 sock_release(sock);
300} 301}
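
Illustration, not part of the patch: rds_tcp_listen_stop() now takes the acceptor work struct and flushes it before sock_release(), so no accept worker can touch the socket after it is freed. A pthread-based userspace sketch of the same shutdown ordering, with pthread_join() standing in for flush_work():

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int stop;

static void *acceptor(void *arg)
{
        (void)arg;
        /* stand-in for rds_tcp_accept_one() running from a workqueue */
        while (!atomic_load(&stop))
                usleep(1000);
        puts("acceptor drained");
        return NULL;
}

int main(void)
{
        pthread_t worker;

        pthread_create(&worker, NULL, acceptor, NULL);
        usleep(10000);

        /* 1. stop new work (the kernel restores sk_data_ready first) */
        atomic_store(&stop, 1);
        /* 2. wait for in-flight accepts to finish (flush_work analogue) */
        pthread_join(worker, NULL);
        /* 3. only now is it safe to release the listen socket */
        puts("sock_release()");
        return 0;
}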
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index ad4892e97f91..e006ef8e6d40 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -180,6 +180,9 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
180 rdsdebug("alloced tinc %p\n", tinc); 180 rdsdebug("alloced tinc %p\n", tinc);
181 rds_inc_path_init(&tinc->ti_inc, cp, 181 rds_inc_path_init(&tinc->ti_inc, cp,
182 cp->cp_conn->c_faddr); 182 cp->cp_conn->c_faddr);
183 tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
184 local_clock();
185
183 /* 186 /*
184 * XXX * we might be able to use the __ variants when 187 * XXX * we might be able to use the __ variants when
185 * we've already serialized at a higher level. 188 * we've already serialized at a higher level.
@@ -204,6 +207,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
204 /* could be 0 for a 0 len message */ 207 /* could be 0 for a 0 len message */
205 tc->t_tinc_data_rem = 208 tc->t_tinc_data_rem =
206 be32_to_cpu(tinc->ti_inc.i_hdr.h_len); 209 be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
210 tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
211 local_clock();
207 } 212 }
208 } 213 }
209 214
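
Illustration, not part of the patch: the two new trace points stamp the incoming message when its header is seen (RDS_MSG_RX_HDR) and when its data starts arriving (RDS_MSG_RX_START), using local_clock(). A userspace sketch of the same latency-tracing idea, with CLOCK_MONOTONIC standing in for local_clock():

#include <stdint.h>
#include <stdio.h>
#include <time.h>

enum { RX_HDR, RX_START, RX_NR_STAGES };   /* names modelled on RDS_MSG_RX_* */

static uint64_t now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);   /* stand-in for local_clock() */
        return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

int main(void)
{
        uint64_t trace[RX_NR_STAGES];

        trace[RX_HDR] = now_ns();     /* header of the incoming message seen */
        /* ... parse header, locate the message body ... */
        trace[RX_START] = now_ns();   /* first data byte of the message seen */

        printf("hdr->start: %llu ns\n",
               (unsigned long long)(trace[RX_START] - trace[RX_HDR]));
        return 0;
}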
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 89d09b481f47..dcf4742083ea 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -100,6 +100,9 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
100 set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags); 100 set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags);
101 tc->t_last_expected_una = rm->m_ack_seq + 1; 101 tc->t_last_expected_una = rm->m_ack_seq + 1;
102 102
103 if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
104 rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
105
103 rdsdebug("rm %p tcp nxt %u ack_seq %llu\n", 106 rdsdebug("rm %p tcp nxt %u ack_seq %llu\n",
104 rm, rds_tcp_snd_nxt(tc), 107 rm, rds_tcp_snd_nxt(tc),
105 (unsigned long long)rm->m_ack_seq); 108 (unsigned long long)rm->m_ack_seq);
diff --git a/net/rds/threads.c b/net/rds/threads.c
index e42df11bf30a..e36e333a0aa0 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -171,8 +171,7 @@ void rds_connect_worker(struct work_struct *work)
171 RDS_CONN_DOWN)) 171 RDS_CONN_DOWN))
172 rds_queue_reconnect(cp); 172 rds_queue_reconnect(cp);
173 else 173 else
174 rds_conn_path_error(cp, 174 rds_conn_path_error(cp, "connect failed\n");
175 "RDS: connect failed\n");
176 } 175 }
177 } 176 }
178} 177}
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 2ffd3e30c643..0b188dd0a344 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -40,7 +40,7 @@
40static struct rds_transport *transports[RDS_TRANS_COUNT]; 40static struct rds_transport *transports[RDS_TRANS_COUNT];
41static DECLARE_RWSEM(rds_trans_sem); 41static DECLARE_RWSEM(rds_trans_sem);
42 42
43int rds_trans_register(struct rds_transport *trans) 43void rds_trans_register(struct rds_transport *trans)
44{ 44{
45 BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ); 45 BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ);
46 46
@@ -55,8 +55,6 @@ int rds_trans_register(struct rds_transport *trans)
55 } 55 }
56 56
57 up_write(&rds_trans_sem); 57 up_write(&rds_trans_sem);
58
59 return 0;
60} 58}
61EXPORT_SYMBOL_GPL(rds_trans_register); 59EXPORT_SYMBOL_GPL(rds_trans_register);
62 60
diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig
index 868f1ad0415a..060600b03fad 100644
--- a/net/rfkill/Kconfig
+++ b/net/rfkill/Kconfig
@@ -23,17 +23,6 @@ config RFKILL_INPUT
23 depends on INPUT = y || RFKILL = INPUT 23 depends on INPUT = y || RFKILL = INPUT
24 default y if !EXPERT 24 default y if !EXPERT
25 25
26config RFKILL_REGULATOR
27 tristate "Generic rfkill regulator driver"
28 depends on RFKILL || !RFKILL
29 depends on REGULATOR
30 help
31 This options enable controlling radio transmitters connected to
32 voltage regulator using the regulator framework.
33
34 To compile this driver as a module, choose M here: the module will
35 be called rfkill-regulator.
36
37config RFKILL_GPIO 26config RFKILL_GPIO
38 tristate "GPIO RFKILL driver" 27 tristate "GPIO RFKILL driver"
39 depends on RFKILL 28 depends on RFKILL
diff --git a/net/rfkill/Makefile b/net/rfkill/Makefile
index 311768783f4a..87a80aded0b3 100644
--- a/net/rfkill/Makefile
+++ b/net/rfkill/Makefile
@@ -5,5 +5,4 @@
5rfkill-y += core.o 5rfkill-y += core.o
6rfkill-$(CONFIG_RFKILL_INPUT) += input.o 6rfkill-$(CONFIG_RFKILL_INPUT) += input.o
7obj-$(CONFIG_RFKILL) += rfkill.o 7obj-$(CONFIG_RFKILL) += rfkill.o
8obj-$(CONFIG_RFKILL_REGULATOR) += rfkill-regulator.o
9obj-$(CONFIG_RFKILL_GPIO) += rfkill-gpio.o 8obj-$(CONFIG_RFKILL_GPIO) += rfkill-gpio.o
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 884027f62783..2064c3a35ef8 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -176,6 +176,50 @@ static void rfkill_led_trigger_unregister(struct rfkill *rfkill)
176{ 176{
177 led_trigger_unregister(&rfkill->led_trigger); 177 led_trigger_unregister(&rfkill->led_trigger);
178} 178}
179
180static struct led_trigger rfkill_any_led_trigger;
181static struct work_struct rfkill_any_work;
182
183static void rfkill_any_led_trigger_worker(struct work_struct *work)
184{
185 enum led_brightness brightness = LED_OFF;
186 struct rfkill *rfkill;
187
188 mutex_lock(&rfkill_global_mutex);
189 list_for_each_entry(rfkill, &rfkill_list, node) {
190 if (!(rfkill->state & RFKILL_BLOCK_ANY)) {
191 brightness = LED_FULL;
192 break;
193 }
194 }
195 mutex_unlock(&rfkill_global_mutex);
196
197 led_trigger_event(&rfkill_any_led_trigger, brightness);
198}
199
200static void rfkill_any_led_trigger_event(void)
201{
202 schedule_work(&rfkill_any_work);
203}
204
205static void rfkill_any_led_trigger_activate(struct led_classdev *led_cdev)
206{
207 rfkill_any_led_trigger_event();
208}
209
210static int rfkill_any_led_trigger_register(void)
211{
212 INIT_WORK(&rfkill_any_work, rfkill_any_led_trigger_worker);
213 rfkill_any_led_trigger.name = "rfkill-any";
214 rfkill_any_led_trigger.activate = rfkill_any_led_trigger_activate;
215 return led_trigger_register(&rfkill_any_led_trigger);
216}
217
218static void rfkill_any_led_trigger_unregister(void)
219{
220 led_trigger_unregister(&rfkill_any_led_trigger);
221 cancel_work_sync(&rfkill_any_work);
222}
179#else 223#else
180static void rfkill_led_trigger_event(struct rfkill *rfkill) 224static void rfkill_led_trigger_event(struct rfkill *rfkill)
181{ 225{
@@ -189,6 +233,19 @@ static inline int rfkill_led_trigger_register(struct rfkill *rfkill)
189static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill) 233static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill)
190{ 234{
191} 235}
236
237static void rfkill_any_led_trigger_event(void)
238{
239}
240
241static int rfkill_any_led_trigger_register(void)
242{
243 return 0;
244}
245
246static void rfkill_any_led_trigger_unregister(void)
247{
248}
192#endif /* CONFIG_RFKILL_LEDS */ 249#endif /* CONFIG_RFKILL_LEDS */
193 250
194static void rfkill_fill_event(struct rfkill_event *ev, struct rfkill *rfkill, 251static void rfkill_fill_event(struct rfkill_event *ev, struct rfkill *rfkill,
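
Illustration, not part of the patch: rfkill_any_led_trigger_worker() above lights the new "rfkill-any" LED trigger while at least one registered transmitter is neither hard- nor soft-blocked. A userspace sketch of that aggregate computation; the RFKILL_BLOCK_ANY value and the fake_rfkill struct are assumptions for the example only.

#include <stdbool.h>
#include <stdio.h>

#define RFKILL_BLOCK_ANY 0x3   /* assumption: HW | SW block bits */

struct fake_rfkill {
        const char *name;
        unsigned int state;
};

/* mirrors rfkill_any_led_trigger_worker(): the LED stays lit while any
 * transmitter is unblocked
 */
static bool any_radio_on(const struct fake_rfkill *rf, int n)
{
        int i;

        for (i = 0; i < n; i++)
                if (!(rf[i].state & RFKILL_BLOCK_ANY))
                        return true;
        return false;
}

int main(void)
{
        struct fake_rfkill radios[] = {
                { "wlan", 0x1 },   /* soft blocked */
                { "bt",   0x0 },   /* unblocked */
        };

        printf("rfkill-any LED: %s\n",
               any_radio_on(radios, 2) ? "LED_FULL" : "LED_OFF");
        return 0;
}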
@@ -297,6 +354,7 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked)
297 spin_unlock_irqrestore(&rfkill->lock, flags); 354 spin_unlock_irqrestore(&rfkill->lock, flags);
298 355
299 rfkill_led_trigger_event(rfkill); 356 rfkill_led_trigger_event(rfkill);
357 rfkill_any_led_trigger_event();
300 358
301 if (prev != curr) 359 if (prev != curr)
302 rfkill_event(rfkill); 360 rfkill_event(rfkill);
@@ -477,11 +535,9 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
477 spin_unlock_irqrestore(&rfkill->lock, flags); 535 spin_unlock_irqrestore(&rfkill->lock, flags);
478 536
479 rfkill_led_trigger_event(rfkill); 537 rfkill_led_trigger_event(rfkill);
538 rfkill_any_led_trigger_event();
480 539
481 if (!rfkill->registered) 540 if (rfkill->registered && prev != blocked)
482 return ret;
483
484 if (prev != blocked)
485 schedule_work(&rfkill->uevent_work); 541 schedule_work(&rfkill->uevent_work);
486 542
487 return ret; 543 return ret;
@@ -523,6 +579,7 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
523 schedule_work(&rfkill->uevent_work); 579 schedule_work(&rfkill->uevent_work);
524 580
525 rfkill_led_trigger_event(rfkill); 581 rfkill_led_trigger_event(rfkill);
582 rfkill_any_led_trigger_event();
526 583
527 return blocked; 584 return blocked;
528} 585}
@@ -572,6 +629,7 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
572 schedule_work(&rfkill->uevent_work); 629 schedule_work(&rfkill->uevent_work);
573 630
574 rfkill_led_trigger_event(rfkill); 631 rfkill_led_trigger_event(rfkill);
632 rfkill_any_led_trigger_event();
575 } 633 }
576} 634}
577EXPORT_SYMBOL(rfkill_set_states); 635EXPORT_SYMBOL(rfkill_set_states);
@@ -988,6 +1046,7 @@ int __must_check rfkill_register(struct rfkill *rfkill)
988#endif 1046#endif
989 } 1047 }
990 1048
1049 rfkill_any_led_trigger_event();
991 rfkill_send_events(rfkill, RFKILL_OP_ADD); 1050 rfkill_send_events(rfkill, RFKILL_OP_ADD);
992 1051
993 mutex_unlock(&rfkill_global_mutex); 1052 mutex_unlock(&rfkill_global_mutex);
@@ -1020,6 +1079,7 @@ void rfkill_unregister(struct rfkill *rfkill)
1020 mutex_lock(&rfkill_global_mutex); 1079 mutex_lock(&rfkill_global_mutex);
1021 rfkill_send_events(rfkill, RFKILL_OP_DEL); 1080 rfkill_send_events(rfkill, RFKILL_OP_DEL);
1022 list_del_init(&rfkill->node); 1081 list_del_init(&rfkill->node);
1082 rfkill_any_led_trigger_event();
1023 mutex_unlock(&rfkill_global_mutex); 1083 mutex_unlock(&rfkill_global_mutex);
1024 1084
1025 rfkill_led_trigger_unregister(rfkill); 1085 rfkill_led_trigger_unregister(rfkill);
@@ -1266,24 +1326,33 @@ static int __init rfkill_init(void)
1266 1326
1267 error = class_register(&rfkill_class); 1327 error = class_register(&rfkill_class);
1268 if (error) 1328 if (error)
1269 goto out; 1329 goto error_class;
1270 1330
1271 error = misc_register(&rfkill_miscdev); 1331 error = misc_register(&rfkill_miscdev);
1272 if (error) { 1332 if (error)
1273 class_unregister(&rfkill_class); 1333 goto error_misc;
1274 goto out; 1334
1275 } 1335 error = rfkill_any_led_trigger_register();
1336 if (error)
1337 goto error_led_trigger;
1276 1338
1277#ifdef CONFIG_RFKILL_INPUT 1339#ifdef CONFIG_RFKILL_INPUT
1278 error = rfkill_handler_init(); 1340 error = rfkill_handler_init();
1279 if (error) { 1341 if (error)
1280 misc_deregister(&rfkill_miscdev); 1342 goto error_input;
1281 class_unregister(&rfkill_class);
1282 goto out;
1283 }
1284#endif 1343#endif
1285 1344
1286 out: 1345 return 0;
1346
1347#ifdef CONFIG_RFKILL_INPUT
1348error_input:
1349 rfkill_any_led_trigger_unregister();
1350#endif
1351error_led_trigger:
1352 misc_deregister(&rfkill_miscdev);
1353error_misc:
1354 class_unregister(&rfkill_class);
1355error_class:
1287 return error; 1356 return error;
1288} 1357}
1289subsys_initcall(rfkill_init); 1358subsys_initcall(rfkill_init);
@@ -1293,6 +1362,7 @@ static void __exit rfkill_exit(void)
1293#ifdef CONFIG_RFKILL_INPUT 1362#ifdef CONFIG_RFKILL_INPUT
1294 rfkill_handler_exit(); 1363 rfkill_handler_exit();
1295#endif 1364#endif
1365 rfkill_any_led_trigger_unregister();
1296 misc_deregister(&rfkill_miscdev); 1366 misc_deregister(&rfkill_miscdev);
1297 class_unregister(&rfkill_class); 1367 class_unregister(&rfkill_class);
1298} 1368}
diff --git a/net/rfkill/rfkill-regulator.c b/net/rfkill/rfkill-regulator.c
deleted file mode 100644
index 50cd26a48e87..000000000000
--- a/net/rfkill/rfkill-regulator.c
+++ /dev/null
@@ -1,154 +0,0 @@
1/*
2 * rfkill-regulator.c - Regulator consumer driver for rfkill
3 *
4 * Copyright (C) 2009 Guiming Zhuo <gmzhuo@gmail.com>
5 * Copyright (C) 2011 Antonio Ospite <ospite@studenti.unina.it>
6 *
7 * Implementation inspired by leds-regulator driver.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 */
14
15#include <linux/module.h>
16#include <linux/err.h>
17#include <linux/slab.h>
18#include <linux/platform_device.h>
19#include <linux/regulator/consumer.h>
20#include <linux/rfkill.h>
21#include <linux/rfkill-regulator.h>
22
23struct rfkill_regulator_data {
24 struct rfkill *rf_kill;
25 bool reg_enabled;
26
27 struct regulator *vcc;
28};
29
30static int rfkill_regulator_set_block(void *data, bool blocked)
31{
32 struct rfkill_regulator_data *rfkill_data = data;
33 int ret = 0;
34
35 pr_debug("%s: blocked: %d\n", __func__, blocked);
36
37 if (blocked) {
38 if (rfkill_data->reg_enabled) {
39 regulator_disable(rfkill_data->vcc);
40 rfkill_data->reg_enabled = false;
41 }
42 } else {
43 if (!rfkill_data->reg_enabled) {
44 ret = regulator_enable(rfkill_data->vcc);
45 if (!ret)
46 rfkill_data->reg_enabled = true;
47 }
48 }
49
50 pr_debug("%s: regulator_is_enabled after set_block: %d\n", __func__,
51 regulator_is_enabled(rfkill_data->vcc));
52
53 return ret;
54}
55
56static struct rfkill_ops rfkill_regulator_ops = {
57 .set_block = rfkill_regulator_set_block,
58};
59
60static int rfkill_regulator_probe(struct platform_device *pdev)
61{
62 struct rfkill_regulator_platform_data *pdata = pdev->dev.platform_data;
63 struct rfkill_regulator_data *rfkill_data;
64 struct regulator *vcc;
65 struct rfkill *rf_kill;
66 int ret = 0;
67
68 if (pdata == NULL) {
69 dev_err(&pdev->dev, "no platform data\n");
70 return -ENODEV;
71 }
72
73 if (pdata->name == NULL || pdata->type == 0) {
74 dev_err(&pdev->dev, "invalid name or type in platform data\n");
75 return -EINVAL;
76 }
77
78 vcc = regulator_get_exclusive(&pdev->dev, "vrfkill");
79 if (IS_ERR(vcc)) {
80 dev_err(&pdev->dev, "Cannot get vcc for %s\n", pdata->name);
81 ret = PTR_ERR(vcc);
82 goto out;
83 }
84
85 rfkill_data = kzalloc(sizeof(*rfkill_data), GFP_KERNEL);
86 if (rfkill_data == NULL) {
87 ret = -ENOMEM;
88 goto err_data_alloc;
89 }
90
91 rf_kill = rfkill_alloc(pdata->name, &pdev->dev,
92 pdata->type,
93 &rfkill_regulator_ops, rfkill_data);
94 if (rf_kill == NULL) {
95 ret = -ENOMEM;
96 goto err_rfkill_alloc;
97 }
98
99 if (regulator_is_enabled(vcc)) {
100 dev_dbg(&pdev->dev, "Regulator already enabled\n");
101 rfkill_data->reg_enabled = true;
102 }
103 rfkill_data->vcc = vcc;
104 rfkill_data->rf_kill = rf_kill;
105
106 ret = rfkill_register(rf_kill);
107 if (ret) {
108 dev_err(&pdev->dev, "Cannot register rfkill device\n");
109 goto err_rfkill_register;
110 }
111
112 platform_set_drvdata(pdev, rfkill_data);
113 dev_info(&pdev->dev, "%s initialized\n", pdata->name);
114
115 return 0;
116
117err_rfkill_register:
118 rfkill_destroy(rf_kill);
119err_rfkill_alloc:
120 kfree(rfkill_data);
121err_data_alloc:
122 regulator_put(vcc);
123out:
124 return ret;
125}
126
127static int rfkill_regulator_remove(struct platform_device *pdev)
128{
129 struct rfkill_regulator_data *rfkill_data = platform_get_drvdata(pdev);
130 struct rfkill *rf_kill = rfkill_data->rf_kill;
131
132 rfkill_unregister(rf_kill);
133 rfkill_destroy(rf_kill);
134 regulator_put(rfkill_data->vcc);
135 kfree(rfkill_data);
136
137 return 0;
138}
139
140static struct platform_driver rfkill_regulator_driver = {
141 .probe = rfkill_regulator_probe,
142 .remove = rfkill_regulator_remove,
143 .driver = {
144 .name = "rfkill-regulator",
145 },
146};
147
148module_platform_driver(rfkill_regulator_driver);
149
150MODULE_AUTHOR("Guiming Zhuo <gmzhuo@gmail.com>");
151MODULE_AUTHOR("Antonio Ospite <ospite@studenti.unina.it>");
152MODULE_DESCRIPTION("Regulator consumer driver for rfkill");
153MODULE_LICENSE("GPL");
154MODULE_ALIAS("platform:rfkill-regulator");
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 129d357d2722..4a9729257023 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -20,7 +20,7 @@
20#include <linux/in.h> 20#include <linux/in.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include <linux/sched.h> 23#include <linux/sched/signal.h>
24#include <linux/spinlock.h> 24#include <linux/spinlock.h>
25#include <linux/timer.h> 25#include <linux/timer.h>
26#include <linux/string.h> 26#include <linux/string.h>
@@ -34,7 +34,7 @@
34#include <linux/if_arp.h> 34#include <linux/if_arp.h>
35#include <linux/skbuff.h> 35#include <linux/skbuff.h>
36#include <net/sock.h> 36#include <net/sock.h>
37#include <asm/uaccess.h> 37#include <linux/uaccess.h>
38#include <linux/fcntl.h> 38#include <linux/fcntl.h>
39#include <linux/termios.h> 39#include <linux/termios.h>
40#include <linux/mm.h> 40#include <linux/mm.h>
@@ -871,7 +871,8 @@ out_release:
871 return err; 871 return err;
872} 872}
873 873
874static int rose_accept(struct socket *sock, struct socket *newsock, int flags) 874static int rose_accept(struct socket *sock, struct socket *newsock, int flags,
875 bool kern)
875{ 876{
876 struct sk_buff *skb; 877 struct sk_buff *skb;
877 struct sock *newsk; 878 struct sock *newsk;
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
index 0fc76d845103..452bbb38d943 100644
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@@ -25,7 +25,7 @@
25#include <linux/skbuff.h> 25#include <linux/skbuff.h>
26#include <net/sock.h> 26#include <net/sock.h>
27#include <net/tcp_states.h> 27#include <net/tcp_states.h>
28#include <asm/uaccess.h> 28#include <linux/uaccess.h>
29#include <linux/fcntl.h> 29#include <linux/fcntl.h>
30#include <linux/termios.h> /* For TIOCINQ/OUTQ */ 30#include <linux/termios.h> /* For TIOCINQ/OUTQ */
31#include <linux/mm.h> 31#include <linux/mm.h>
diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile
index 8fc6ea347182..b9da4d6b914f 100644
--- a/net/rxrpc/Makefile
+++ b/net/rxrpc/Makefile
@@ -2,7 +2,9 @@
2# Makefile for Linux kernel RxRPC 2# Makefile for Linux kernel RxRPC
3# 3#
4 4
5af-rxrpc-y := \ 5obj-$(CONFIG_AF_RXRPC) += rxrpc.o
6
7rxrpc-y := \
6 af_rxrpc.o \ 8 af_rxrpc.o \
7 call_accept.o \ 9 call_accept.o \
8 call_event.o \ 10 call_event.o \
@@ -26,8 +28,6 @@ af-rxrpc-y := \
26 skbuff.o \ 28 skbuff.o \
27 utils.o 29 utils.o
28 30
29af-rxrpc-$(CONFIG_PROC_FS) += proc.o 31rxrpc-$(CONFIG_PROC_FS) += proc.o
30af-rxrpc-$(CONFIG_RXKAD) += rxkad.o 32rxrpc-$(CONFIG_RXKAD) += rxkad.o
31af-rxrpc-$(CONFIG_SYSCTL) += sysctl.o 33rxrpc-$(CONFIG_SYSCTL) += sysctl.o
32
33obj-$(CONFIG_AF_RXRPC) += af-rxrpc.o
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 2d59c9be40e1..7fb59c3f1542 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -224,6 +224,14 @@ static int rxrpc_listen(struct socket *sock, int backlog)
224 else 224 else
225 sk->sk_max_ack_backlog = old; 225 sk->sk_max_ack_backlog = old;
226 break; 226 break;
227 case RXRPC_SERVER_LISTENING:
228 if (backlog == 0) {
229 rx->sk.sk_state = RXRPC_SERVER_LISTEN_DISABLED;
230 sk->sk_max_ack_backlog = 0;
231 rxrpc_discard_prealloc(rx);
232 ret = 0;
233 break;
234 }
227 default: 235 default:
228 ret = -EBUSY; 236 ret = -EBUSY;
229 break; 237 break;
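
Illustration, not part of the patch: with the new RXRPC_SERVER_LISTENING case, a listening AF_RXRPC socket can be moved to RXRPC_SERVER_LISTEN_DISABLED by calling listen() with a zero backlog, which also discards the preallocated incoming calls; previously this returned -EBUSY. A hedged userspace sketch follows; it assumes struct sockaddr_rxrpc from the exported rxrpc header, and the service id and port are hypothetical.

#include <linux/rxrpc.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct sockaddr_rxrpc srx;
        int fd = socket(AF_RXRPC, SOCK_DGRAM, PF_INET);

        if (fd < 0) {
                perror("socket");   /* af_rxrpc module not loaded? */
                return 1;
        }

        memset(&srx, 0, sizeof(srx));
        srx.srx_family = AF_RXRPC;
        srx.srx_service = 100;                   /* hypothetical service id */
        srx.transport_type = SOCK_DGRAM;
        srx.transport_len = sizeof(srx.transport.sin);
        srx.transport.sin.sin_family = AF_INET;
        srx.transport.sin.sin_port = htons(7000); /* hypothetical port */

        if (bind(fd, (struct sockaddr *)&srx, sizeof(srx)) < 0 ||
            listen(fd, 10) < 0) {                /* start listening */
                perror("bind/listen");
                return 1;
        }

        /* with this patch, a zero backlog disables the listener instead of
         * failing with -EBUSY
         */
        if (listen(fd, 0) < 0)
                perror("listen(0)");

        close(fd);
        return 0;
}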
@@ -282,10 +290,11 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
282 cp.exclusive = false; 290 cp.exclusive = false;
283 cp.service_id = srx->srx_service; 291 cp.service_id = srx->srx_service;
284 call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, gfp); 292 call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, gfp);
293 /* The socket has been unlocked. */
285 if (!IS_ERR(call)) 294 if (!IS_ERR(call))
286 call->notify_rx = notify_rx; 295 call->notify_rx = notify_rx;
287 296
288 release_sock(&rx->sk); 297 mutex_unlock(&call->user_mutex);
289 _leave(" = %p", call); 298 _leave(" = %p", call);
290 return call; 299 return call;
291} 300}
@@ -302,7 +311,10 @@ EXPORT_SYMBOL(rxrpc_kernel_begin_call);
302void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) 311void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call)
303{ 312{
304 _enter("%d{%d}", call->debug_id, atomic_read(&call->usage)); 313 _enter("%d{%d}", call->debug_id, atomic_read(&call->usage));
314
315 mutex_lock(&call->user_mutex);
305 rxrpc_release_call(rxrpc_sk(sock->sk), call); 316 rxrpc_release_call(rxrpc_sk(sock->sk), call);
317 mutex_unlock(&call->user_mutex);
306 rxrpc_put_call(call, rxrpc_call_put_kernel); 318 rxrpc_put_call(call, rxrpc_call_put_kernel);
307} 319}
308EXPORT_SYMBOL(rxrpc_kernel_end_call); 320EXPORT_SYMBOL(rxrpc_kernel_end_call);
@@ -442,14 +454,16 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
442 case RXRPC_SERVER_BOUND: 454 case RXRPC_SERVER_BOUND:
443 case RXRPC_SERVER_LISTENING: 455 case RXRPC_SERVER_LISTENING:
444 ret = rxrpc_do_sendmsg(rx, m, len); 456 ret = rxrpc_do_sendmsg(rx, m, len);
445 break; 457 /* The socket has been unlocked */
458 goto out;
446 default: 459 default:
447 ret = -EINVAL; 460 ret = -EINVAL;
448 break; 461 goto error_unlock;
449 } 462 }
450 463
451error_unlock: 464error_unlock:
452 release_sock(&rx->sk); 465 release_sock(&rx->sk);
466out:
453 _leave(" = %d", ret); 467 _leave(" = %d", ret);
454 return ret; 468 return ret;
455} 469}
@@ -762,16 +776,17 @@ static const struct net_proto_family rxrpc_family_ops = {
762static int __init af_rxrpc_init(void) 776static int __init af_rxrpc_init(void)
763{ 777{
764 int ret = -1; 778 int ret = -1;
779 unsigned int tmp;
765 780
766 BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb)); 781 BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb));
767 782
768 get_random_bytes(&rxrpc_epoch, sizeof(rxrpc_epoch)); 783 get_random_bytes(&rxrpc_epoch, sizeof(rxrpc_epoch));
769 rxrpc_epoch |= RXRPC_RANDOM_EPOCH; 784 rxrpc_epoch |= RXRPC_RANDOM_EPOCH;
770 get_random_bytes(&rxrpc_client_conn_ids.cur, 785 get_random_bytes(&tmp, sizeof(tmp));
771 sizeof(rxrpc_client_conn_ids.cur)); 786 tmp &= 0x3fffffff;
772 rxrpc_client_conn_ids.cur &= 0x3fffffff; 787 if (tmp == 0)
773 if (rxrpc_client_conn_ids.cur == 0) 788 tmp = 1;
774 rxrpc_client_conn_ids.cur = 1; 789 idr_set_cursor(&rxrpc_client_conn_ids, tmp);
775 790
776 ret = -ENOMEM; 791 ret = -ENOMEM;
777 rxrpc_call_jar = kmem_cache_create( 792 rxrpc_call_jar = kmem_cache_create(
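
Illustration, not part of the patch: af_rxrpc_init() now masks a random value down to the 30-bit connection ID space, avoids zero, and seeds the IDR with idr_set_cursor(). A userspace sketch of the seed selection, with random() standing in for get_random_bytes():

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* pick a random, non-zero 30-bit starting point for client connection IDs */
static uint32_t pick_conn_id_cursor(void)
{
        uint32_t tmp = (uint32_t)random();   /* stand-in for get_random_bytes() */

        tmp &= 0x3fffffff;   /* connection IDs have 30 usable bits */
        if (tmp == 0)
                tmp = 1;     /* 0 is reserved, never start there */
        return tmp;
}

int main(void)
{
        srandom((unsigned)time(NULL));
        printf("idr cursor seed: 0x%08x\n", pick_conn_id_cursor());
        return 0;
}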
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index f60e35576526..26a7b1db1361 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -60,6 +60,7 @@ enum {
60 RXRPC_CLIENT_BOUND, /* client local address bound */ 60 RXRPC_CLIENT_BOUND, /* client local address bound */
61 RXRPC_SERVER_BOUND, /* server local address bound */ 61 RXRPC_SERVER_BOUND, /* server local address bound */
62 RXRPC_SERVER_LISTENING, /* server listening for connections */ 62 RXRPC_SERVER_LISTENING, /* server listening for connections */
63 RXRPC_SERVER_LISTEN_DISABLED, /* server listening disabled */
63 RXRPC_CLOSE, /* socket is being closed */ 64 RXRPC_CLOSE, /* socket is being closed */
64}; 65};
65 66
@@ -466,6 +467,7 @@ struct rxrpc_call {
466 struct rxrpc_connection *conn; /* connection carrying call */ 467 struct rxrpc_connection *conn; /* connection carrying call */
467 struct rxrpc_peer *peer; /* Peer record for remote address */ 468 struct rxrpc_peer *peer; /* Peer record for remote address */
468 struct rxrpc_sock __rcu *socket; /* socket responsible */ 469 struct rxrpc_sock __rcu *socket; /* socket responsible */
470 struct mutex user_mutex; /* User access mutex */
469 ktime_t ack_at; /* When deferred ACK needs to happen */ 471 ktime_t ack_at; /* When deferred ACK needs to happen */
470 ktime_t resend_at; /* When next resend needs to happen */ 472 ktime_t resend_at; /* When next resend needs to happen */
471 ktime_t ping_at; /* When next to send a ping */ 473 ktime_t ping_at; /* When next to send a ping */
@@ -593,200 +595,6 @@ struct rxrpc_ack_summary {
593 u8 cumulative_acks; 595 u8 cumulative_acks;
594}; 596};
595 597
596enum rxrpc_skb_trace {
597 rxrpc_skb_rx_cleaned,
598 rxrpc_skb_rx_freed,
599 rxrpc_skb_rx_got,
600 rxrpc_skb_rx_lost,
601 rxrpc_skb_rx_received,
602 rxrpc_skb_rx_rotated,
603 rxrpc_skb_rx_purged,
604 rxrpc_skb_rx_seen,
605 rxrpc_skb_tx_cleaned,
606 rxrpc_skb_tx_freed,
607 rxrpc_skb_tx_got,
608 rxrpc_skb_tx_new,
609 rxrpc_skb_tx_rotated,
610 rxrpc_skb_tx_seen,
611 rxrpc_skb__nr_trace
612};
613
614extern const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7];
615
616enum rxrpc_conn_trace {
617 rxrpc_conn_new_client,
618 rxrpc_conn_new_service,
619 rxrpc_conn_queued,
620 rxrpc_conn_seen,
621 rxrpc_conn_got,
622 rxrpc_conn_put_client,
623 rxrpc_conn_put_service,
624 rxrpc_conn__nr_trace
625};
626
627extern const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4];
628
629enum rxrpc_client_trace {
630 rxrpc_client_activate_chans,
631 rxrpc_client_alloc,
632 rxrpc_client_chan_activate,
633 rxrpc_client_chan_disconnect,
634 rxrpc_client_chan_pass,
635 rxrpc_client_chan_unstarted,
636 rxrpc_client_cleanup,
637 rxrpc_client_count,
638 rxrpc_client_discard,
639 rxrpc_client_duplicate,
640 rxrpc_client_exposed,
641 rxrpc_client_replace,
642 rxrpc_client_to_active,
643 rxrpc_client_to_culled,
644 rxrpc_client_to_idle,
645 rxrpc_client_to_inactive,
646 rxrpc_client_to_waiting,
647 rxrpc_client_uncount,
648 rxrpc_client__nr_trace
649};
650
651extern const char rxrpc_client_traces[rxrpc_client__nr_trace][7];
652extern const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5];
653
654enum rxrpc_call_trace {
655 rxrpc_call_new_client,
656 rxrpc_call_new_service,
657 rxrpc_call_queued,
658 rxrpc_call_queued_ref,
659 rxrpc_call_seen,
660 rxrpc_call_connected,
661 rxrpc_call_release,
662 rxrpc_call_got,
663 rxrpc_call_got_userid,
664 rxrpc_call_got_kernel,
665 rxrpc_call_put,
666 rxrpc_call_put_userid,
667 rxrpc_call_put_kernel,
668 rxrpc_call_put_noqueue,
669 rxrpc_call_error,
670 rxrpc_call__nr_trace
671};
672
673extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4];
674
675enum rxrpc_transmit_trace {
676 rxrpc_transmit_wait,
677 rxrpc_transmit_queue,
678 rxrpc_transmit_queue_last,
679 rxrpc_transmit_rotate,
680 rxrpc_transmit_rotate_last,
681 rxrpc_transmit_await_reply,
682 rxrpc_transmit_end,
683 rxrpc_transmit__nr_trace
684};
685
686extern const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4];
687
688enum rxrpc_receive_trace {
689 rxrpc_receive_incoming,
690 rxrpc_receive_queue,
691 rxrpc_receive_queue_last,
692 rxrpc_receive_front,
693 rxrpc_receive_rotate,
694 rxrpc_receive_end,
695 rxrpc_receive__nr_trace
696};
697
698extern const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4];
699
700enum rxrpc_recvmsg_trace {
701 rxrpc_recvmsg_enter,
702 rxrpc_recvmsg_wait,
703 rxrpc_recvmsg_dequeue,
704 rxrpc_recvmsg_hole,
705 rxrpc_recvmsg_next,
706 rxrpc_recvmsg_cont,
707 rxrpc_recvmsg_full,
708 rxrpc_recvmsg_data_return,
709 rxrpc_recvmsg_terminal,
710 rxrpc_recvmsg_to_be_accepted,
711 rxrpc_recvmsg_return,
712 rxrpc_recvmsg__nr_trace
713};
714
715extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5];
716
717enum rxrpc_rtt_tx_trace {
718 rxrpc_rtt_tx_ping,
719 rxrpc_rtt_tx_data,
720 rxrpc_rtt_tx__nr_trace
721};
722
723extern const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5];
724
725enum rxrpc_rtt_rx_trace {
726 rxrpc_rtt_rx_ping_response,
727 rxrpc_rtt_rx_requested_ack,
728 rxrpc_rtt_rx__nr_trace
729};
730
731extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5];
732
733enum rxrpc_timer_trace {
734 rxrpc_timer_begin,
735 rxrpc_timer_init_for_reply,
736 rxrpc_timer_init_for_send_reply,
737 rxrpc_timer_expired,
738 rxrpc_timer_set_for_ack,
739 rxrpc_timer_set_for_ping,
740 rxrpc_timer_set_for_resend,
741 rxrpc_timer_set_for_send,
742 rxrpc_timer__nr_trace
743};
744
745extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8];
746
747enum rxrpc_propose_ack_trace {
748 rxrpc_propose_ack_client_tx_end,
749 rxrpc_propose_ack_input_data,
750 rxrpc_propose_ack_ping_for_lost_ack,
751 rxrpc_propose_ack_ping_for_lost_reply,
752 rxrpc_propose_ack_ping_for_params,
753 rxrpc_propose_ack_processing_op,
754 rxrpc_propose_ack_respond_to_ack,
755 rxrpc_propose_ack_respond_to_ping,
756 rxrpc_propose_ack_retry_tx,
757 rxrpc_propose_ack_rotate_rx,
758 rxrpc_propose_ack_terminal_ack,
759 rxrpc_propose_ack__nr_trace
760};
761
762enum rxrpc_propose_ack_outcome {
763 rxrpc_propose_ack_use,
764 rxrpc_propose_ack_update,
765 rxrpc_propose_ack_subsume,
766 rxrpc_propose_ack__nr_outcomes
767};
768
769extern const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8];
770extern const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes];
771
772enum rxrpc_congest_change {
773 rxrpc_cong_begin_retransmission,
774 rxrpc_cong_cleared_nacks,
775 rxrpc_cong_new_low_nack,
776 rxrpc_cong_no_change,
777 rxrpc_cong_progress,
778 rxrpc_cong_retransmit_again,
779 rxrpc_cong_rtt_window_end,
780 rxrpc_cong_saw_nack,
781 rxrpc_congest__nr_change
782};
783
784extern const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10];
785extern const char rxrpc_congest_changes[rxrpc_congest__nr_change][9];
786
787extern const char *const rxrpc_pkts[];
788extern const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4];
789
790#include <trace/events/rxrpc.h> 598#include <trace/events/rxrpc.h>
791 599
792/* 600/*
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 832d854c2d5c..0ed181f53f32 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -323,6 +323,8 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
323 * 323 *
324 * If we want to report an error, we mark the skb with the packet type and 324 * If we want to report an error, we mark the skb with the packet type and
325 * abort code and return NULL. 325 * abort code and return NULL.
326 *
327 * The call is returned with the user access mutex held.
326 */ 328 */
327struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, 329struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
328 struct rxrpc_connection *conn, 330 struct rxrpc_connection *conn,
@@ -349,7 +351,8 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
349 351
350found_service: 352found_service:
351 spin_lock(&rx->incoming_lock); 353 spin_lock(&rx->incoming_lock);
352 if (rx->sk.sk_state == RXRPC_CLOSE) { 354 if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED ||
355 rx->sk.sk_state == RXRPC_CLOSE) {
353 trace_rxrpc_abort("CLS", sp->hdr.cid, sp->hdr.callNumber, 356 trace_rxrpc_abort("CLS", sp->hdr.cid, sp->hdr.callNumber,
354 sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN); 357 sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN);
355 skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; 358 skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT;
@@ -370,6 +373,18 @@ found_service:
370 trace_rxrpc_receive(call, rxrpc_receive_incoming, 373 trace_rxrpc_receive(call, rxrpc_receive_incoming,
371 sp->hdr.serial, sp->hdr.seq); 374 sp->hdr.serial, sp->hdr.seq);
372 375
376 /* Lock the call to prevent rxrpc_kernel_send/recv_data() and
377 * sendmsg()/recvmsg() inconveniently stealing the mutex once the
378 * notification is generated.
379 *
380 * The BUG should never happen because the kernel should be well
381 * behaved enough not to access the call before the first notification
382 * event and userspace is prevented from doing so until the state is
383 * appropriate.
384 */
385 if (!mutex_trylock(&call->user_mutex))
386 BUG();
387
373 /* Make the call live. */ 388 /* Make the call live. */
374 rxrpc_incoming_call(rx, call, skb); 389 rxrpc_incoming_call(rx, call, skb);
375 conn = call->conn; 390 conn = call->conn;
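
Illustration, not part of the patch: rxrpc_new_incoming_call() takes call->user_mutex with mutex_trylock() and treats failure as a BUG(), because nothing else can know about the call before its first notification. A userspace sketch of the same "trylock must succeed" pattern, with assert() standing in for BUG():

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t user_mutex = PTHREAD_MUTEX_INITIALIZER;

/* the incoming-call path expects the mutex to be free: nobody can reach the
 * call yet, so a failed trylock would indicate a logic error
 */
static void take_new_call_lock(void)
{
        int ret = pthread_mutex_trylock(&user_mutex);

        assert(ret == 0);   /* analogous to the BUG() in the patch */
}

int main(void)
{
        take_new_call_lock();
        puts("call locked before first notification");
        pthread_mutex_unlock(&user_mutex);
        return 0;
}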
@@ -428,10 +443,12 @@ out:
428/* 443/*
429 * handle acceptance of a call by userspace 444 * handle acceptance of a call by userspace
430 * - assign the user call ID to the call at the front of the queue 445 * - assign the user call ID to the call at the front of the queue
446 * - called with the socket locked.
431 */ 447 */
432struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, 448struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
433 unsigned long user_call_ID, 449 unsigned long user_call_ID,
434 rxrpc_notify_rx_t notify_rx) 450 rxrpc_notify_rx_t notify_rx)
451 __releases(&rx->sk.sk_lock.slock)
435{ 452{
436 struct rxrpc_call *call; 453 struct rxrpc_call *call;
437 struct rb_node *parent, **pp; 454 struct rb_node *parent, **pp;
@@ -445,6 +462,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
445 462
446 if (list_empty(&rx->to_be_accepted)) { 463 if (list_empty(&rx->to_be_accepted)) {
447 write_unlock(&rx->call_lock); 464 write_unlock(&rx->call_lock);
465 release_sock(&rx->sk);
448 kleave(" = -ENODATA [empty]"); 466 kleave(" = -ENODATA [empty]");
449 return ERR_PTR(-ENODATA); 467 return ERR_PTR(-ENODATA);
450 } 468 }
@@ -469,10 +487,39 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
469 */ 487 */
470 call = list_entry(rx->to_be_accepted.next, 488 call = list_entry(rx->to_be_accepted.next,
471 struct rxrpc_call, accept_link); 489 struct rxrpc_call, accept_link);
490 write_unlock(&rx->call_lock);
491
492 /* We need to gain the mutex from the interrupt handler without
493 * upsetting lockdep, so we have to release it there and take it here.
494 * We are, however, still holding the socket lock, so other accepts
495 * must wait for us and no one can add the user ID behind our backs.
496 */
497 if (mutex_lock_interruptible(&call->user_mutex) < 0) {
498 release_sock(&rx->sk);
499 kleave(" = -ERESTARTSYS");
500 return ERR_PTR(-ERESTARTSYS);
501 }
502
503 write_lock(&rx->call_lock);
472 list_del_init(&call->accept_link); 504 list_del_init(&call->accept_link);
473 sk_acceptq_removed(&rx->sk); 505 sk_acceptq_removed(&rx->sk);
474 rxrpc_see_call(call); 506 rxrpc_see_call(call);
475 507
508 /* Find the user ID insertion point. */
509 pp = &rx->calls.rb_node;
510 parent = NULL;
511 while (*pp) {
512 parent = *pp;
513 call = rb_entry(parent, struct rxrpc_call, sock_node);
514
515 if (user_call_ID < call->user_call_ID)
516 pp = &(*pp)->rb_left;
517 else if (user_call_ID > call->user_call_ID)
518 pp = &(*pp)->rb_right;
519 else
520 BUG();
521 }
522
476 write_lock_bh(&call->state_lock); 523 write_lock_bh(&call->state_lock);
477 switch (call->state) { 524 switch (call->state) {
478 case RXRPC_CALL_SERVER_ACCEPTING: 525 case RXRPC_CALL_SERVER_ACCEPTING:
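
Illustration, not part of the patch: rxrpc_accept_call() now walks the socket's rb-tree itself to find where the new user_call_ID belongs, using the usual double-pointer descent. A userspace sketch of that descent over a plain binary search tree; in the patch a duplicate ID is a BUG() because the socket lock prevents it, here the caller just rejects it.

#include <stdio.h>
#include <stdlib.h>

struct call_node {
        unsigned long user_call_ID;
        struct call_node *left, *right;
};

/* find the slot where id should be inserted, or NULL if it already exists */
static struct call_node **find_slot(struct call_node **root, unsigned long id)
{
        struct call_node **pp = root;

        while (*pp) {
                struct call_node *call = *pp;

                if (id < call->user_call_ID)
                        pp = &call->left;
                else if (id > call->user_call_ID)
                        pp = &call->right;
                else
                        return NULL;   /* duplicate ID: caller must reject it */
        }
        return pp;
}

int main(void)
{
        struct call_node *root = NULL;
        unsigned long ids[] = { 42, 7, 99, 42 };
        int i;

        for (i = 0; i < 4; i++) {
                struct call_node **slot = find_slot(&root, ids[i]);

                if (!slot) {
                        printf("id %lu already in use\n", ids[i]);
                        continue;
                }
                *slot = calloc(1, sizeof(**slot));
                if (!*slot)
                        return 1;
                (*slot)->user_call_ID = ids[i];
                printf("inserted %lu\n", ids[i]);
        }
        return 0;
}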
@@ -498,6 +545,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
498 write_unlock(&rx->call_lock); 545 write_unlock(&rx->call_lock);
499 rxrpc_notify_socket(call); 546 rxrpc_notify_socket(call);
500 rxrpc_service_prealloc(rx, GFP_KERNEL); 547 rxrpc_service_prealloc(rx, GFP_KERNEL);
548 release_sock(&rx->sk);
501 _leave(" = %p{%d}", call, call->debug_id); 549 _leave(" = %p{%d}", call, call->debug_id);
502 return call; 550 return call;
503 551
@@ -514,6 +562,7 @@ id_in_use:
514 write_unlock(&rx->call_lock); 562 write_unlock(&rx->call_lock);
515out: 563out:
516 rxrpc_service_prealloc(rx, GFP_KERNEL); 564 rxrpc_service_prealloc(rx, GFP_KERNEL);
565 release_sock(&rx->sk);
517 _leave(" = %d", ret); 566 _leave(" = %d", ret);
518 return ERR_PTR(ret); 567 return ERR_PTR(ret);
519} 568}
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 1ed18d8c9c9f..d79cd36987a9 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -43,24 +43,6 @@ const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = {
43 [RXRPC_CALL_NETWORK_ERROR] = "NetError", 43 [RXRPC_CALL_NETWORK_ERROR] = "NetError",
44}; 44};
45 45
46const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = {
47 [rxrpc_call_new_client] = "NWc",
48 [rxrpc_call_new_service] = "NWs",
49 [rxrpc_call_queued] = "QUE",
50 [rxrpc_call_queued_ref] = "QUR",
51 [rxrpc_call_connected] = "CON",
52 [rxrpc_call_release] = "RLS",
53 [rxrpc_call_seen] = "SEE",
54 [rxrpc_call_got] = "GOT",
55 [rxrpc_call_got_userid] = "Gus",
56 [rxrpc_call_got_kernel] = "Gke",
57 [rxrpc_call_put] = "PUT",
58 [rxrpc_call_put_userid] = "Pus",
59 [rxrpc_call_put_kernel] = "Pke",
60 [rxrpc_call_put_noqueue] = "PNQ",
61 [rxrpc_call_error] = "*E*",
62};
63
64struct kmem_cache *rxrpc_call_jar; 46struct kmem_cache *rxrpc_call_jar;
65LIST_HEAD(rxrpc_calls); 47LIST_HEAD(rxrpc_calls);
66DEFINE_RWLOCK(rxrpc_call_lock); 48DEFINE_RWLOCK(rxrpc_call_lock);
@@ -133,6 +115,7 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
133 if (!call->rxtx_annotations) 115 if (!call->rxtx_annotations)
134 goto nomem_2; 116 goto nomem_2;
135 117
118 mutex_init(&call->user_mutex);
136 setup_timer(&call->timer, rxrpc_call_timer_expired, 119 setup_timer(&call->timer, rxrpc_call_timer_expired,
137 (unsigned long)call); 120 (unsigned long)call);
138 INIT_WORK(&call->processor, &rxrpc_process_call); 121 INIT_WORK(&call->processor, &rxrpc_process_call);
@@ -212,14 +195,16 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call)
212} 195}
213 196
214/* 197/*
215 * set up a call for the given data 198 * Set up a call for the given parameters.
216 * - called in process context with IRQs enabled 199 * - Called with the socket lock held, which it must release.
200 * - If it returns a call, the call's lock will need releasing by the caller.
217 */ 201 */
218struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, 202struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
219 struct rxrpc_conn_parameters *cp, 203 struct rxrpc_conn_parameters *cp,
220 struct sockaddr_rxrpc *srx, 204 struct sockaddr_rxrpc *srx,
221 unsigned long user_call_ID, 205 unsigned long user_call_ID,
222 gfp_t gfp) 206 gfp_t gfp)
207 __releases(&rx->sk.sk_lock.slock)
223{ 208{
224 struct rxrpc_call *call, *xcall; 209 struct rxrpc_call *call, *xcall;
225 struct rb_node *parent, **pp; 210 struct rb_node *parent, **pp;
@@ -230,6 +215,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
230 215
231 call = rxrpc_alloc_client_call(srx, gfp); 216 call = rxrpc_alloc_client_call(srx, gfp);
232 if (IS_ERR(call)) { 217 if (IS_ERR(call)) {
218 release_sock(&rx->sk);
233 _leave(" = %ld", PTR_ERR(call)); 219 _leave(" = %ld", PTR_ERR(call));
234 return call; 220 return call;
235 } 221 }
@@ -237,6 +223,11 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
237 trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage), 223 trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage),
238 here, (const void *)user_call_ID); 224 here, (const void *)user_call_ID);
239 225
226 /* We need to protect a partially set up call against the user as we
227 * will be acting outside the socket lock.
228 */
229 mutex_lock(&call->user_mutex);
230
240 /* Publish the call, even though it is incompletely set up as yet */ 231 /* Publish the call, even though it is incompletely set up as yet */
241 write_lock(&rx->call_lock); 232 write_lock(&rx->call_lock);
242 233
@@ -268,6 +259,9 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
268 list_add_tail(&call->link, &rxrpc_calls); 259 list_add_tail(&call->link, &rxrpc_calls);
269 write_unlock(&rxrpc_call_lock); 260 write_unlock(&rxrpc_call_lock);
270 261
262 /* From this point on, the call is protected by its own lock. */
263 release_sock(&rx->sk);
264
271 /* Set up or get a connection record and set the protocol parameters, 265 /* Set up or get a connection record and set the protocol parameters,
272 * including channel number and call ID. 266 * including channel number and call ID.
273 */ 267 */
@@ -297,6 +291,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
297 */ 291 */
298error_dup_user_ID: 292error_dup_user_ID:
299 write_unlock(&rx->call_lock); 293 write_unlock(&rx->call_lock);
294 release_sock(&rx->sk);
300 ret = -EEXIST; 295 ret = -EEXIST;
301 296
302error: 297error:
@@ -305,6 +300,7 @@ error:
305 trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage), 300 trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage),
306 here, ERR_PTR(ret)); 301 here, ERR_PTR(ret));
307 rxrpc_release_call(rx, call); 302 rxrpc_release_call(rx, call);
303 mutex_unlock(&call->user_mutex);
308 rxrpc_put_call(call, rxrpc_call_put); 304 rxrpc_put_call(call, rxrpc_call_put);
309 _leave(" = %d", ret); 305 _leave(" = %d", ret);
310 return ERR_PTR(ret); 306 return ERR_PTR(ret);
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 60ef9605167e..c3be03e8d098 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -76,6 +76,8 @@
76#include <linux/slab.h> 76#include <linux/slab.h>
77#include <linux/idr.h> 77#include <linux/idr.h>
78#include <linux/timer.h> 78#include <linux/timer.h>
79#include <linux/sched/signal.h>
80
79#include "ar-internal.h" 81#include "ar-internal.h"
80 82
81__read_mostly unsigned int rxrpc_max_client_connections = 1000; 83__read_mostly unsigned int rxrpc_max_client_connections = 1000;
@@ -105,14 +107,6 @@ static void rxrpc_discard_expired_client_conns(struct work_struct *);
105static DECLARE_DELAYED_WORK(rxrpc_client_conn_reap, 107static DECLARE_DELAYED_WORK(rxrpc_client_conn_reap,
106 rxrpc_discard_expired_client_conns); 108 rxrpc_discard_expired_client_conns);
107 109
108const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5] = {
109 [RXRPC_CONN_CLIENT_INACTIVE] = "Inac",
110 [RXRPC_CONN_CLIENT_WAITING] = "Wait",
111 [RXRPC_CONN_CLIENT_ACTIVE] = "Actv",
112 [RXRPC_CONN_CLIENT_CULLED] = "Cull",
113 [RXRPC_CONN_CLIENT_IDLE] = "Idle",
114};
115
116/* 110/*
117 * Get a connection ID and epoch for a client connection from the global pool. 111 * Get a connection ID and epoch for a client connection from the global pool.
118 * The connection struct pointer is then recorded in the idr radix tree. The 112 * The connection struct pointer is then recorded in the idr radix tree. The
@@ -263,12 +257,12 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn)
263 * times the maximum number of client conns away from the current 257 * times the maximum number of client conns away from the current
264 * allocation point to try and keep the IDs concentrated. 258 * allocation point to try and keep the IDs concentrated.
265 */ 259 */
266 id_cursor = READ_ONCE(rxrpc_client_conn_ids.cur); 260 id_cursor = idr_get_cursor(&rxrpc_client_conn_ids);
267 id = conn->proto.cid >> RXRPC_CIDSHIFT; 261 id = conn->proto.cid >> RXRPC_CIDSHIFT;
268 distance = id - id_cursor; 262 distance = id - id_cursor;
269 if (distance < 0) 263 if (distance < 0)
270 distance = -distance; 264 distance = -distance;
271 limit = round_up(rxrpc_max_client_connections, IDR_SIZE) * 4; 265 limit = max(rxrpc_max_client_connections * 4, 1024U);
272 if (distance > limit) 266 if (distance > limit)
273 goto mark_dont_reuse; 267 goto mark_dont_reuse;
274 268
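
Illustration, not part of the patch: rxrpc_may_reuse_conn() now reads the allocator position with idr_get_cursor() and refuses to reuse a connection whose ID has drifted more than max(4 * rxrpc_max_client_connections, 1024) away from it, keeping active IDs clustered. A userspace sketch of that distance check:

#include <stdint.h>
#include <stdio.h>

static unsigned int max_client_conns = 1000;   /* rxrpc_max_client_connections */

/* nonzero if a connection with this ID is close enough to the cursor to reuse */
static int id_close_enough_to_reuse(uint32_t id, uint32_t id_cursor)
{
        int64_t distance = (int64_t)id - (int64_t)id_cursor;
        uint32_t limit;

        if (distance < 0)
                distance = -distance;

        limit = max_client_conns * 4 > 1024 ? max_client_conns * 4 : 1024;
        return distance <= limit;
}

int main(void)
{
        printf("%d\n", id_close_enough_to_reuse(5000, 4200));    /* 1: within limit */
        printf("%d\n", id_close_enough_to_reuse(90000, 4200));   /* 0: too far */
        return 0;
}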
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 3f9d8d7ec632..b099b64366f3 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -275,6 +275,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
275 rxrpc_conn_retransmit_call(conn, skb); 275 rxrpc_conn_retransmit_call(conn, skb);
276 return 0; 276 return 0;
277 277
278 case RXRPC_PACKET_TYPE_BUSY:
279 /* Just ignore BUSY packets for now. */
280 return 0;
281
278 case RXRPC_PACKET_TYPE_ABORT: 282 case RXRPC_PACKET_TYPE_ABORT:
279 if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), 283 if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
280 &wtmp, sizeof(wtmp)) < 0) 284 &wtmp, sizeof(wtmp)) < 0)
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index e1e83af47866..b0ecb770fdce 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -173,6 +173,7 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn,
173 /* Save the result of the call so that we can repeat it if necessary 173 /* Save the result of the call so that we can repeat it if necessary
174 * through the channel, whilst disposing of the actual call record. 174 * through the channel, whilst disposing of the actual call record.
175 */ 175 */
176 trace_rxrpc_disconnect_call(call);
176 chan->last_service_id = call->service_id; 177 chan->last_service_id = call->service_id;
177 if (call->abort_code) { 178 if (call->abort_code) {
178 chan->last_abort = call->abort_code; 179 chan->last_abort = call->abort_code;
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 44fb8d893c7d..18b2ad8be8e2 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -420,6 +420,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb,
420 u16 skew) 420 u16 skew)
421{ 421{
422 struct rxrpc_skb_priv *sp = rxrpc_skb(skb); 422 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
423 enum rxrpc_call_state state;
423 unsigned int offset = sizeof(struct rxrpc_wire_header); 424 unsigned int offset = sizeof(struct rxrpc_wire_header);
424 unsigned int ix; 425 unsigned int ix;
425 rxrpc_serial_t serial = sp->hdr.serial, ack_serial = 0; 426 rxrpc_serial_t serial = sp->hdr.serial, ack_serial = 0;
@@ -434,14 +435,15 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb,
434 _proto("Rx DATA %%%u { #%u f=%02x }", 435 _proto("Rx DATA %%%u { #%u f=%02x }",
435 sp->hdr.serial, seq, sp->hdr.flags); 436 sp->hdr.serial, seq, sp->hdr.flags);
436 437
437 if (call->state >= RXRPC_CALL_COMPLETE) 438 state = READ_ONCE(call->state);
439 if (state >= RXRPC_CALL_COMPLETE)
438 return; 440 return;
439 441
440 /* Received data implicitly ACKs all of the request packets we sent 442 /* Received data implicitly ACKs all of the request packets we sent
441 * when we're acting as a client. 443 * when we're acting as a client.
442 */ 444 */
443 if ((call->state == RXRPC_CALL_CLIENT_SEND_REQUEST || 445 if ((state == RXRPC_CALL_CLIENT_SEND_REQUEST ||
444 call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) && 446 state == RXRPC_CALL_CLIENT_AWAIT_REPLY) &&
445 !rxrpc_receiving_reply(call)) 447 !rxrpc_receiving_reply(call))
446 return; 448 return;
447 449
@@ -481,6 +483,7 @@ next_subpacket:
481 return rxrpc_proto_abort("LSA", call, seq); 483 return rxrpc_proto_abort("LSA", call, seq);
482 } 484 }
483 485
486 trace_rxrpc_rx_data(call, seq, serial, flags, annotation);
484 if (before_eq(seq, hard_ack)) { 487 if (before_eq(seq, hard_ack)) {
485 ack = RXRPC_ACK_DUPLICATE; 488 ack = RXRPC_ACK_DUPLICATE;
486 ack_serial = serial; 489 ack_serial = serial;
@@ -649,6 +652,7 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
649 struct rxrpc_skb_priv *sp = rxrpc_skb(skb); 652 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
650 struct rxrpc_peer *peer; 653 struct rxrpc_peer *peer;
651 unsigned int mtu; 654 unsigned int mtu;
655 bool wake = false;
652 u32 rwind = ntohl(ackinfo->rwind); 656 u32 rwind = ntohl(ackinfo->rwind);
653 657
654 _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }", 658 _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }",
@@ -656,9 +660,14 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
656 ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU), 660 ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU),
657 rwind, ntohl(ackinfo->jumbo_max)); 661 rwind, ntohl(ackinfo->jumbo_max));
658 662
659 if (rwind > RXRPC_RXTX_BUFF_SIZE - 1) 663 if (call->tx_winsize != rwind) {
660 rwind = RXRPC_RXTX_BUFF_SIZE - 1; 664 if (rwind > RXRPC_RXTX_BUFF_SIZE - 1)
661 call->tx_winsize = rwind; 665 rwind = RXRPC_RXTX_BUFF_SIZE - 1;
666 if (rwind > call->tx_winsize)
667 wake = true;
668 call->tx_winsize = rwind;
669 }
670
662 if (call->cong_ssthresh > rwind) 671 if (call->cong_ssthresh > rwind)
663 call->cong_ssthresh = rwind; 672 call->cong_ssthresh = rwind;
664 673
@@ -672,6 +681,9 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
672 spin_unlock_bh(&peer->lock); 681 spin_unlock_bh(&peer->lock);
673 _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata); 682 _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata);
674 } 683 }
684
685 if (wake)
686 wake_up(&call->waitq);
675} 687}
676 688
677/* 689/*
@@ -765,16 +777,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
765 summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ? 777 summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ?
766 buf.ack.reason : RXRPC_ACK__INVALID); 778 buf.ack.reason : RXRPC_ACK__INVALID);
767 779
768 trace_rxrpc_rx_ack(call, first_soft_ack, summary.ack_reason, nr_acks); 780 trace_rxrpc_rx_ack(call, sp->hdr.serial, acked_serial,
769 781 first_soft_ack, ntohl(buf.ack.previousPacket),
770 _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", 782 summary.ack_reason, nr_acks);
771 sp->hdr.serial,
772 ntohs(buf.ack.maxSkew),
773 first_soft_ack,
774 ntohl(buf.ack.previousPacket),
775 acked_serial,
776 rxrpc_ack_names[summary.ack_reason],
777 buf.ack.nAcks);
778 783
779 if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE) 784 if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE)
780 rxrpc_input_ping_response(call, skb->tstamp, acked_serial, 785 rxrpc_input_ping_response(call, skb->tstamp, acked_serial,
@@ -805,7 +810,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
805 return rxrpc_proto_abort("AK0", call, 0); 810 return rxrpc_proto_abort("AK0", call, 0);
806 811
807 /* Ignore ACKs unless we are or have just been transmitting. */ 812 /* Ignore ACKs unless we are or have just been transmitting. */
808 switch (call->state) { 813 switch (READ_ONCE(call->state)) {
809 case RXRPC_CALL_CLIENT_SEND_REQUEST: 814 case RXRPC_CALL_CLIENT_SEND_REQUEST:
810 case RXRPC_CALL_CLIENT_AWAIT_REPLY: 815 case RXRPC_CALL_CLIENT_AWAIT_REPLY:
811 case RXRPC_CALL_SERVER_SEND_REPLY: 816 case RXRPC_CALL_SERVER_SEND_REPLY:
@@ -931,7 +936,6 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call,
931 break; 936 break;
932 937
933 default: 938 default:
934 _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial);
935 break; 939 break;
936 } 940 }
937 941
@@ -947,7 +951,7 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call,
947static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn, 951static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn,
948 struct rxrpc_call *call) 952 struct rxrpc_call *call)
949{ 953{
950 switch (call->state) { 954 switch (READ_ONCE(call->state)) {
951 case RXRPC_CALL_SERVER_AWAIT_ACK: 955 case RXRPC_CALL_SERVER_AWAIT_ACK:
952 rxrpc_call_completed(call); 956 rxrpc_call_completed(call);
953 break; 957 break;
@@ -961,6 +965,7 @@ static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn,
961 break; 965 break;
962 } 966 }
963 967
968 trace_rxrpc_improper_term(call);
964 __rxrpc_disconnect_call(conn, call); 969 __rxrpc_disconnect_call(conn, call);
965 rxrpc_notify_socket(call); 970 rxrpc_notify_socket(call);
966} 971}
@@ -1053,7 +1058,7 @@ void rxrpc_data_ready(struct sock *udp_sk)
1053 1058
1054 ASSERT(!irqs_disabled()); 1059 ASSERT(!irqs_disabled());
1055 1060
1056 skb = skb_recv_datagram(udp_sk, 0, 1, &ret); 1061 skb = skb_recv_udp(udp_sk, 0, 1, &ret);
1057 if (!skb) { 1062 if (!skb) {
1058 if (ret == -EAGAIN) 1063 if (ret == -EAGAIN)
1059 return; 1064 return;
@@ -1075,10 +1080,9 @@ void rxrpc_data_ready(struct sock *udp_sk)
1075 1080
1076 __UDP_INC_STATS(&init_net, UDP_MIB_INDATAGRAMS, 0); 1081 __UDP_INC_STATS(&init_net, UDP_MIB_INDATAGRAMS, 0);
1077 1082
1078 /* The socket buffer we have is owned by UDP, with UDP's data all over 1083 /* The UDP protocol already released all skb resources;
1079 * it, but we really want our own data there. 1084 * we are free to add our own data there.
1080 */ 1085 */
1081 skb_orphan(skb);
1082 sp = rxrpc_skb(skb); 1086 sp = rxrpc_skb(skb);
1083 1087
1084 /* dig out the RxRPC connection details */ 1088 /* dig out the RxRPC connection details */
@@ -1201,6 +1205,7 @@ void rxrpc_data_ready(struct sock *udp_sk)
1201 goto reject_packet; 1205 goto reject_packet;
1202 } 1206 }
1203 rxrpc_send_ping(call, skb, skew); 1207 rxrpc_send_ping(call, skb, skew);
1208 mutex_unlock(&call->user_mutex);
1204 } 1209 }
1205 1210
1206 rxrpc_input_call_packet(call, skb, skew); 1211 rxrpc_input_call_packet(call, skb, skew);
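
The rxrpc_input_ackinfo() hunk only touches tx_winsize when the peer's advertised rwind actually changes, clamps it to the Rx/Tx ring size, and wakes the call's waitqueue only when the window grew, so a sender blocked on the old, smaller window gets a chance to run. A hedged sketch of that update rule in plain C (struct and constant names are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdint.h>

#define RXTX_BUFF_SIZE 64	/* illustrative power-of-two ring size */

struct fake_call {
	uint32_t tx_winsize;
	uint32_t cong_ssthresh;
};

/* Apply a newly advertised receive window.  Returns true if a sender blocked
 * on the previous, smaller window should be woken up.
 */
static bool apply_rwind(struct fake_call *call, uint32_t rwind)
{
	bool wake = false;

	if (call->tx_winsize != rwind) {
		if (rwind > RXTX_BUFF_SIZE - 1)
			rwind = RXTX_BUFF_SIZE - 1;
		if (rwind > call->tx_winsize)
			wake = true;	/* window grew: more data may be sent */
		call->tx_winsize = rwind;
	}

	if (call->cong_ssthresh > rwind)
		call->cong_ssthresh = rwind;

	return wake;
}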
diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c
index 18c737a61d80..0a4e28477ad9 100644
--- a/net/rxrpc/key.c
+++ b/net/rxrpc/key.c
@@ -1065,7 +1065,7 @@ static long rxrpc_read(const struct key *key,
1065 1065
1066 switch (token->security_index) { 1066 switch (token->security_index) {
1067 case RXRPC_SECURITY_RXKAD: 1067 case RXRPC_SECURITY_RXKAD:
1068 toksize += 8 * 4; /* viceid, kvno, key*2, begin, 1068 toksize += 9 * 4; /* viceid, kvno, key*2 + len, begin,
1069 * end, primary, tktlen */ 1069 * end, primary, tktlen */
1070 toksize += RND(token->kad->ticket_len); 1070 toksize += RND(token->kad->ticket_len);
1071 break; 1071 break;
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
index 6dee55fad2d3..1a2d4b112064 100644
--- a/net/rxrpc/misc.c
+++ b/net/rxrpc/misc.c
@@ -77,12 +77,6 @@ unsigned int rxrpc_rx_jumbo_max = 4;
77 */ 77 */
78unsigned int rxrpc_resend_timeout = 4 * 1000; 78unsigned int rxrpc_resend_timeout = 4 * 1000;
79 79
80const char *const rxrpc_pkts[] = {
81 "?00",
82 "DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG",
83 "?09", "?10", "?11", "?12", "VERSION", "?14", "?15"
84};
85
86const s8 rxrpc_ack_priority[] = { 80const s8 rxrpc_ack_priority[] = {
87 [0] = 0, 81 [0] = 0,
88 [RXRPC_ACK_DELAY] = 1, 82 [RXRPC_ACK_DELAY] = 1,
@@ -94,148 +88,3 @@ const s8 rxrpc_ack_priority[] = {
94 [RXRPC_ACK_NOSPACE] = 7, 88 [RXRPC_ACK_NOSPACE] = 7,
95 [RXRPC_ACK_PING_RESPONSE] = 8, 89 [RXRPC_ACK_PING_RESPONSE] = 8,
96}; 90};
97
98const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4] = {
99 "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY",
100 "IDL", "-?-"
101};
102
103const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = {
104 [rxrpc_skb_rx_cleaned] = "Rx CLN",
105 [rxrpc_skb_rx_freed] = "Rx FRE",
106 [rxrpc_skb_rx_got] = "Rx GOT",
107 [rxrpc_skb_rx_lost] = "Rx *L*",
108 [rxrpc_skb_rx_received] = "Rx RCV",
109 [rxrpc_skb_rx_purged] = "Rx PUR",
110 [rxrpc_skb_rx_rotated] = "Rx ROT",
111 [rxrpc_skb_rx_seen] = "Rx SEE",
112 [rxrpc_skb_tx_cleaned] = "Tx CLN",
113 [rxrpc_skb_tx_freed] = "Tx FRE",
114 [rxrpc_skb_tx_got] = "Tx GOT",
115 [rxrpc_skb_tx_new] = "Tx NEW",
116 [rxrpc_skb_tx_rotated] = "Tx ROT",
117 [rxrpc_skb_tx_seen] = "Tx SEE",
118};
119
120const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4] = {
121 [rxrpc_conn_new_client] = "NWc",
122 [rxrpc_conn_new_service] = "NWs",
123 [rxrpc_conn_queued] = "QUE",
124 [rxrpc_conn_seen] = "SEE",
125 [rxrpc_conn_got] = "GOT",
126 [rxrpc_conn_put_client] = "PTc",
127 [rxrpc_conn_put_service] = "PTs",
128};
129
130const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = {
131 [rxrpc_client_activate_chans] = "Activa",
132 [rxrpc_client_alloc] = "Alloc ",
133 [rxrpc_client_chan_activate] = "ChActv",
134 [rxrpc_client_chan_disconnect] = "ChDisc",
135 [rxrpc_client_chan_pass] = "ChPass",
136 [rxrpc_client_chan_unstarted] = "ChUnst",
137 [rxrpc_client_cleanup] = "Clean ",
138 [rxrpc_client_count] = "Count ",
139 [rxrpc_client_discard] = "Discar",
140 [rxrpc_client_duplicate] = "Duplic",
141 [rxrpc_client_exposed] = "Expose",
142 [rxrpc_client_replace] = "Replac",
143 [rxrpc_client_to_active] = "->Actv",
144 [rxrpc_client_to_culled] = "->Cull",
145 [rxrpc_client_to_idle] = "->Idle",
146 [rxrpc_client_to_inactive] = "->Inac",
147 [rxrpc_client_to_waiting] = "->Wait",
148 [rxrpc_client_uncount] = "Uncoun",
149};
150
151const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = {
152 [rxrpc_transmit_wait] = "WAI",
153 [rxrpc_transmit_queue] = "QUE",
154 [rxrpc_transmit_queue_last] = "QLS",
155 [rxrpc_transmit_rotate] = "ROT",
156 [rxrpc_transmit_rotate_last] = "RLS",
157 [rxrpc_transmit_await_reply] = "AWR",
158 [rxrpc_transmit_end] = "END",
159};
160
161const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4] = {
162 [rxrpc_receive_incoming] = "INC",
163 [rxrpc_receive_queue] = "QUE",
164 [rxrpc_receive_queue_last] = "QLS",
165 [rxrpc_receive_front] = "FRN",
166 [rxrpc_receive_rotate] = "ROT",
167 [rxrpc_receive_end] = "END",
168};
169
170const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = {
171 [rxrpc_recvmsg_enter] = "ENTR",
172 [rxrpc_recvmsg_wait] = "WAIT",
173 [rxrpc_recvmsg_dequeue] = "DEQU",
174 [rxrpc_recvmsg_hole] = "HOLE",
175 [rxrpc_recvmsg_next] = "NEXT",
176 [rxrpc_recvmsg_cont] = "CONT",
177 [rxrpc_recvmsg_full] = "FULL",
178 [rxrpc_recvmsg_data_return] = "DATA",
179 [rxrpc_recvmsg_terminal] = "TERM",
180 [rxrpc_recvmsg_to_be_accepted] = "TBAC",
181 [rxrpc_recvmsg_return] = "RETN",
182};
183
184const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5] = {
185 [rxrpc_rtt_tx_ping] = "PING",
186 [rxrpc_rtt_tx_data] = "DATA",
187};
188
189const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = {
190 [rxrpc_rtt_rx_ping_response] = "PONG",
191 [rxrpc_rtt_rx_requested_ack] = "RACK",
192};
193
194const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = {
195 [rxrpc_timer_begin] = "Begin ",
196 [rxrpc_timer_expired] = "*EXPR*",
197 [rxrpc_timer_init_for_reply] = "IniRpl",
198 [rxrpc_timer_init_for_send_reply] = "SndRpl",
199 [rxrpc_timer_set_for_ack] = "SetAck",
200 [rxrpc_timer_set_for_ping] = "SetPng",
201 [rxrpc_timer_set_for_send] = "SetTx ",
202 [rxrpc_timer_set_for_resend] = "SetRTx",
203};
204
205const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = {
206 [rxrpc_propose_ack_client_tx_end] = "ClTxEnd",
207 [rxrpc_propose_ack_input_data] = "DataIn ",
208 [rxrpc_propose_ack_ping_for_lost_ack] = "LostAck",
209 [rxrpc_propose_ack_ping_for_lost_reply] = "LostRpl",
210 [rxrpc_propose_ack_ping_for_params] = "Params ",
211 [rxrpc_propose_ack_processing_op] = "ProcOp ",
212 [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack",
213 [rxrpc_propose_ack_respond_to_ping] = "Rsp2Png",
214 [rxrpc_propose_ack_retry_tx] = "RetryTx",
215 [rxrpc_propose_ack_rotate_rx] = "RxAck ",
216 [rxrpc_propose_ack_terminal_ack] = "ClTerm ",
217};
218
219const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes] = {
220 [rxrpc_propose_ack_use] = "",
221 [rxrpc_propose_ack_update] = " Update",
222 [rxrpc_propose_ack_subsume] = " Subsume",
223};
224
225const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10] = {
226 [RXRPC_CALL_SLOW_START] = "SlowStart",
227 [RXRPC_CALL_CONGEST_AVOIDANCE] = "CongAvoid",
228 [RXRPC_CALL_PACKET_LOSS] = "PktLoss ",
229 [RXRPC_CALL_FAST_RETRANSMIT] = "FastReTx ",
230};
231
232const char rxrpc_congest_changes[rxrpc_congest__nr_change][9] = {
233 [rxrpc_cong_begin_retransmission] = " Retrans",
234 [rxrpc_cong_cleared_nacks] = " Cleared",
235 [rxrpc_cong_new_low_nack] = " NewLowN",
236 [rxrpc_cong_no_change] = "",
237 [rxrpc_cong_progress] = " Progres",
238 [rxrpc_cong_retransmit_again] = " ReTxAgn",
239 [rxrpc_cong_rtt_window_end] = " RttWinE",
240 [rxrpc_cong_saw_nack] = " SawNack",
241};
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index 65cd980767fa..b9bcfbfb095c 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -52,6 +52,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
52 struct rxrpc_sock *rx; 52 struct rxrpc_sock *rx;
53 struct rxrpc_peer *peer; 53 struct rxrpc_peer *peer;
54 struct rxrpc_call *call; 54 struct rxrpc_call *call;
55 rxrpc_seq_t tx_hard_ack, rx_hard_ack;
55 char lbuff[50], rbuff[50]; 56 char lbuff[50], rbuff[50];
56 57
57 if (v == &rxrpc_calls) { 58 if (v == &rxrpc_calls) {
@@ -82,9 +83,11 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
82 else 83 else
83 strcpy(rbuff, "no_connection"); 84 strcpy(rbuff, "no_connection");
84 85
86 tx_hard_ack = READ_ONCE(call->tx_hard_ack);
87 rx_hard_ack = READ_ONCE(call->rx_hard_ack);
85 seq_printf(seq, 88 seq_printf(seq,
86 "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u" 89 "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u"
87 " %-8.8s %08x %lx\n", 90 " %-8.8s %08x %lx %08x %02x %08x %02x\n",
88 lbuff, 91 lbuff,
89 rbuff, 92 rbuff,
90 call->service_id, 93 call->service_id,
@@ -94,7 +97,9 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
94 atomic_read(&call->usage), 97 atomic_read(&call->usage),
95 rxrpc_call_states[call->state], 98 rxrpc_call_states[call->state],
96 call->abort_code, 99 call->abort_code,
97 call->user_call_ID); 100 call->user_call_ID,
101 tx_hard_ack, READ_ONCE(call->tx_top) - tx_hard_ack,
102 rx_hard_ack, READ_ONCE(call->rx_top) - rx_hard_ack);
98 103
99 return 0; 104 return 0;
100} 105}
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index c29362d50a92..3e2f1a8e9c5b 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -14,6 +14,8 @@
14#include <linux/net.h> 14#include <linux/net.h>
15#include <linux/skbuff.h> 15#include <linux/skbuff.h>
16#include <linux/export.h> 16#include <linux/export.h>
17#include <linux/sched/signal.h>
18
17#include <net/sock.h> 19#include <net/sock.h>
18#include <net/af_rxrpc.h> 20#include <net/af_rxrpc.h>
19#include "ar-internal.h" 21#include "ar-internal.h"
@@ -320,8 +322,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
320 322
321 /* Barriers against rxrpc_input_data(). */ 323 /* Barriers against rxrpc_input_data(). */
322 hard_ack = call->rx_hard_ack; 324 hard_ack = call->rx_hard_ack;
323 top = smp_load_acquire(&call->rx_top); 325 seq = hard_ack + 1;
324 for (seq = hard_ack + 1; before_eq(seq, top); seq++) { 326 while (top = smp_load_acquire(&call->rx_top),
327 before_eq(seq, top)
328 ) {
325 ix = seq & RXRPC_RXTX_BUFF_MASK; 329 ix = seq & RXRPC_RXTX_BUFF_MASK;
326 skb = call->rxtx_buffer[ix]; 330 skb = call->rxtx_buffer[ix];
327 if (!skb) { 331 if (!skb) {
@@ -394,6 +398,8 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
394 ret = 1; 398 ret = 1;
395 goto out; 399 goto out;
396 } 400 }
401
402 seq++;
397 } 403 }
398 404
399out: 405out:
@@ -483,6 +489,20 @@ try_again:
483 489
484 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0, 0, 0, 0); 490 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0, 0, 0, 0);
485 491
492 /* We're going to drop the socket lock, so we need to lock the call
493 * against interference by sendmsg.
494 */
495 if (!mutex_trylock(&call->user_mutex)) {
496 ret = -EWOULDBLOCK;
497 if (flags & MSG_DONTWAIT)
498 goto error_requeue_call;
499 ret = -ERESTARTSYS;
500 if (mutex_lock_interruptible(&call->user_mutex) < 0)
501 goto error_requeue_call;
502 }
503
504 release_sock(&rx->sk);
505
486 if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) 506 if (test_bit(RXRPC_CALL_RELEASED, &call->flags))
487 BUG(); 507 BUG();
488 508
@@ -498,7 +518,7 @@ try_again:
498 &call->user_call_ID); 518 &call->user_call_ID);
499 } 519 }
500 if (ret < 0) 520 if (ret < 0)
501 goto error; 521 goto error_unlock_call;
502 } 522 }
503 523
504 if (msg->msg_name) { 524 if (msg->msg_name) {
@@ -507,7 +527,7 @@ try_again:
507 msg->msg_namelen = len; 527 msg->msg_namelen = len;
508 } 528 }
509 529
510 switch (call->state) { 530 switch (READ_ONCE(call->state)) {
511 case RXRPC_CALL_SERVER_ACCEPTING: 531 case RXRPC_CALL_SERVER_ACCEPTING:
512 ret = rxrpc_recvmsg_new_call(rx, call, msg, flags); 532 ret = rxrpc_recvmsg_new_call(rx, call, msg, flags);
513 break; 533 break;
@@ -529,12 +549,12 @@ try_again:
529 } 549 }
530 550
531 if (ret < 0) 551 if (ret < 0)
532 goto error; 552 goto error_unlock_call;
533 553
534 if (call->state == RXRPC_CALL_COMPLETE) { 554 if (call->state == RXRPC_CALL_COMPLETE) {
535 ret = rxrpc_recvmsg_term(call, msg); 555 ret = rxrpc_recvmsg_term(call, msg);
536 if (ret < 0) 556 if (ret < 0)
537 goto error; 557 goto error_unlock_call;
538 if (!(flags & MSG_PEEK)) 558 if (!(flags & MSG_PEEK))
539 rxrpc_release_call(rx, call); 559 rxrpc_release_call(rx, call);
540 msg->msg_flags |= MSG_EOR; 560 msg->msg_flags |= MSG_EOR;
@@ -547,8 +567,21 @@ try_again:
547 msg->msg_flags &= ~MSG_MORE; 567 msg->msg_flags &= ~MSG_MORE;
548 ret = copied; 568 ret = copied;
549 569
550error: 570error_unlock_call:
571 mutex_unlock(&call->user_mutex);
551 rxrpc_put_call(call, rxrpc_call_put); 572 rxrpc_put_call(call, rxrpc_call_put);
573 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret);
574 return ret;
575
576error_requeue_call:
577 if (!(flags & MSG_PEEK)) {
578 write_lock_bh(&rx->recvmsg_lock);
579 list_add(&call->recvmsg_link, &rx->recvmsg_q);
580 write_unlock_bh(&rx->recvmsg_lock);
581 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_requeue, 0, 0, 0, 0);
582 } else {
583 rxrpc_put_call(call, rxrpc_call_put);
584 }
552error_no_call: 585error_no_call:
553 release_sock(&rx->sk); 586 release_sock(&rx->sk);
554 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret); 587 trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret);
@@ -605,9 +638,9 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
605 iov.iov_len = size - *_offset; 638 iov.iov_len = size - *_offset;
606 iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, size - *_offset); 639 iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, size - *_offset);
607 640
608 lock_sock(sock->sk); 641 mutex_lock(&call->user_mutex);
609 642
610 switch (call->state) { 643 switch (READ_ONCE(call->state)) {
611 case RXRPC_CALL_CLIENT_RECV_REPLY: 644 case RXRPC_CALL_CLIENT_RECV_REPLY:
612 case RXRPC_CALL_SERVER_RECV_REQUEST: 645 case RXRPC_CALL_SERVER_RECV_REQUEST:
613 case RXRPC_CALL_SERVER_ACK_REQUEST: 646 case RXRPC_CALL_SERVER_ACK_REQUEST:
@@ -644,7 +677,7 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
644read_phase_complete: 677read_phase_complete:
645 ret = 1; 678 ret = 1;
646out: 679out:
647 release_sock(sock->sk); 680 mutex_unlock(&call->user_mutex);
648 _leave(" = %d [%zu,%d]", ret, *_offset, *_abort); 681 _leave(" = %d [%zu,%d]", ret, *_offset, *_abort);
649 return ret; 682 return ret;
650 683
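
The recvmsg.c changes take call->user_mutex opportunistically with mutex_trylock(), fall back to mutex_lock_interruptible() only when MSG_DONTWAIT is clear, and requeue the call on rx->recvmsg_q if the lock cannot be taken. A rough userspace analogue of that try-then-block pattern using POSIX mutexes (only an analogue: pthread locks are not signal-interruptible the way the kernel's mutex_lock_interruptible() is):

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

/* Try to take @lock without sleeping; if that fails and @nonblock is false,
 * fall back to a blocking acquisition.  Returns 0 on success, -EWOULDBLOCK if
 * the caller asked not to wait, or a negative errno from the blocking path.
 */
static int lock_user_mutex(pthread_mutex_t *lock, bool nonblock)
{
	if (pthread_mutex_trylock(lock) == 0)
		return 0;

	if (nonblock)
		return -EWOULDBLOCK;	/* caller will requeue the work */

	/* A plain pthread lock cannot be interrupted by a signal; this is
	 * only an analogue of mutex_lock_interruptible().
	 */
	return -pthread_mutex_lock(lock);
}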
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index b214a4d4a641..97ab214ca411 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -15,6 +15,8 @@
15#include <linux/gfp.h> 15#include <linux/gfp.h>
16#include <linux/skbuff.h> 16#include <linux/skbuff.h>
17#include <linux/export.h> 17#include <linux/export.h>
18#include <linux/sched/signal.h>
19
18#include <net/sock.h> 20#include <net/sock.h>
19#include <net/af_rxrpc.h> 21#include <net/af_rxrpc.h>
20#include "ar-internal.h" 22#include "ar-internal.h"
@@ -59,9 +61,12 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
59 } 61 }
60 62
61 trace_rxrpc_transmit(call, rxrpc_transmit_wait); 63 trace_rxrpc_transmit(call, rxrpc_transmit_wait);
62 release_sock(&rx->sk); 64 mutex_unlock(&call->user_mutex);
63 *timeo = schedule_timeout(*timeo); 65 *timeo = schedule_timeout(*timeo);
64 lock_sock(&rx->sk); 66 if (mutex_lock_interruptible(&call->user_mutex) < 0) {
67 ret = sock_intr_errno(*timeo);
68 break;
69 }
65 } 70 }
66 71
67 remove_wait_queue(&call->waitq, &myself); 72 remove_wait_queue(&call->waitq, &myself);
@@ -171,7 +176,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
171/* 176/*
172 * send data through a socket 177 * send data through a socket
173 * - must be called in process context 178 * - must be called in process context
174 * - caller holds the socket locked 179 * - The caller holds the call user access mutex, but not the socket lock.
175 */ 180 */
176static int rxrpc_send_data(struct rxrpc_sock *rx, 181static int rxrpc_send_data(struct rxrpc_sock *rx,
177 struct rxrpc_call *call, 182 struct rxrpc_call *call,
@@ -376,7 +381,7 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg,
376 if (!CMSG_OK(msg, cmsg)) 381 if (!CMSG_OK(msg, cmsg))
377 return -EINVAL; 382 return -EINVAL;
378 383
379 len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); 384 len = cmsg->cmsg_len - sizeof(struct cmsghdr);
380 _debug("CMSG %d, %d, %d", 385 _debug("CMSG %d, %d, %d",
381 cmsg->cmsg_level, cmsg->cmsg_type, len); 386 cmsg->cmsg_level, cmsg->cmsg_type, len);
382 387
@@ -437,10 +442,13 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg,
437 442
438/* 443/*
439 * Create a new client call for sendmsg(). 444 * Create a new client call for sendmsg().
445 * - Called with the socket lock held, which it must release.
446 * - If it returns a call, the call's lock will need releasing by the caller.
440 */ 447 */
441static struct rxrpc_call * 448static struct rxrpc_call *
442rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, 449rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
443 unsigned long user_call_ID, bool exclusive) 450 unsigned long user_call_ID, bool exclusive)
451 __releases(&rx->sk.sk_lock.slock)
444{ 452{
445 struct rxrpc_conn_parameters cp; 453 struct rxrpc_conn_parameters cp;
446 struct rxrpc_call *call; 454 struct rxrpc_call *call;
@@ -450,8 +458,10 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
450 458
451 _enter(""); 459 _enter("");
452 460
453 if (!msg->msg_name) 461 if (!msg->msg_name) {
462 release_sock(&rx->sk);
454 return ERR_PTR(-EDESTADDRREQ); 463 return ERR_PTR(-EDESTADDRREQ);
464 }
455 465
456 key = rx->key; 466 key = rx->key;
457 if (key && !rx->key->payload.data[0]) 467 if (key && !rx->key->payload.data[0])
@@ -464,6 +474,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
464 cp.exclusive = rx->exclusive | exclusive; 474 cp.exclusive = rx->exclusive | exclusive;
465 cp.service_id = srx->srx_service; 475 cp.service_id = srx->srx_service;
466 call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, GFP_KERNEL); 476 call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, GFP_KERNEL);
477 /* The socket is now unlocked */
467 478
468 _leave(" = %p\n", call); 479 _leave(" = %p\n", call);
469 return call; 480 return call;
@@ -475,7 +486,9 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
475 * - the socket may be either a client socket or a server socket 486 * - the socket may be either a client socket or a server socket
476 */ 487 */
477int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) 488int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
489 __releases(&rx->sk.sk_lock.slock)
478{ 490{
491 enum rxrpc_call_state state;
479 enum rxrpc_command cmd; 492 enum rxrpc_command cmd;
480 struct rxrpc_call *call; 493 struct rxrpc_call *call;
481 unsigned long user_call_ID = 0; 494 unsigned long user_call_ID = 0;
@@ -488,12 +501,14 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
488 ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code, 501 ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code,
489 &exclusive); 502 &exclusive);
490 if (ret < 0) 503 if (ret < 0)
491 return ret; 504 goto error_release_sock;
492 505
493 if (cmd == RXRPC_CMD_ACCEPT) { 506 if (cmd == RXRPC_CMD_ACCEPT) {
507 ret = -EINVAL;
494 if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) 508 if (rx->sk.sk_state != RXRPC_SERVER_LISTENING)
495 return -EINVAL; 509 goto error_release_sock;
496 call = rxrpc_accept_call(rx, user_call_ID, NULL); 510 call = rxrpc_accept_call(rx, user_call_ID, NULL);
511 /* The socket is now unlocked. */
497 if (IS_ERR(call)) 512 if (IS_ERR(call))
498 return PTR_ERR(call); 513 return PTR_ERR(call);
499 rxrpc_put_call(call, rxrpc_call_put); 514 rxrpc_put_call(call, rxrpc_call_put);
@@ -502,18 +517,41 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
502 517
503 call = rxrpc_find_call_by_user_ID(rx, user_call_ID); 518 call = rxrpc_find_call_by_user_ID(rx, user_call_ID);
504 if (!call) { 519 if (!call) {
520 ret = -EBADSLT;
505 if (cmd != RXRPC_CMD_SEND_DATA) 521 if (cmd != RXRPC_CMD_SEND_DATA)
506 return -EBADSLT; 522 goto error_release_sock;
507 call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID, 523 call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID,
508 exclusive); 524 exclusive);
525 /* The socket is now unlocked... */
509 if (IS_ERR(call)) 526 if (IS_ERR(call))
510 return PTR_ERR(call); 527 return PTR_ERR(call);
528 /* ... and we have the call lock. */
529 } else {
530 switch (READ_ONCE(call->state)) {
531 case RXRPC_CALL_UNINITIALISED:
532 case RXRPC_CALL_CLIENT_AWAIT_CONN:
533 case RXRPC_CALL_SERVER_PREALLOC:
534 case RXRPC_CALL_SERVER_SECURING:
535 case RXRPC_CALL_SERVER_ACCEPTING:
536 ret = -EBUSY;
537 goto error_release_sock;
538 default:
539 break;
540 }
541
542 ret = mutex_lock_interruptible(&call->user_mutex);
543 release_sock(&rx->sk);
544 if (ret < 0) {
545 ret = -ERESTARTSYS;
546 goto error_put;
547 }
511 } 548 }
512 549
550 state = READ_ONCE(call->state);
513 _debug("CALL %d USR %lx ST %d on CONN %p", 551 _debug("CALL %d USR %lx ST %d on CONN %p",
514 call->debug_id, call->user_call_ID, call->state, call->conn); 552 call->debug_id, call->user_call_ID, state, call->conn);
515 553
516 if (call->state >= RXRPC_CALL_COMPLETE) { 554 if (state >= RXRPC_CALL_COMPLETE) {
517 /* it's too late for this call */ 555 /* it's too late for this call */
518 ret = -ESHUTDOWN; 556 ret = -ESHUTDOWN;
519 } else if (cmd == RXRPC_CMD_SEND_ABORT) { 557 } else if (cmd == RXRPC_CMD_SEND_ABORT) {
@@ -523,21 +561,27 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
523 } else if (cmd != RXRPC_CMD_SEND_DATA) { 561 } else if (cmd != RXRPC_CMD_SEND_DATA) {
524 ret = -EINVAL; 562 ret = -EINVAL;
525 } else if (rxrpc_is_client_call(call) && 563 } else if (rxrpc_is_client_call(call) &&
526 call->state != RXRPC_CALL_CLIENT_SEND_REQUEST) { 564 state != RXRPC_CALL_CLIENT_SEND_REQUEST) {
527 /* request phase complete for this client call */ 565 /* request phase complete for this client call */
528 ret = -EPROTO; 566 ret = -EPROTO;
529 } else if (rxrpc_is_service_call(call) && 567 } else if (rxrpc_is_service_call(call) &&
530 call->state != RXRPC_CALL_SERVER_ACK_REQUEST && 568 state != RXRPC_CALL_SERVER_ACK_REQUEST &&
531 call->state != RXRPC_CALL_SERVER_SEND_REPLY) { 569 state != RXRPC_CALL_SERVER_SEND_REPLY) {
532 /* Reply phase not begun or not complete for service call. */ 570 /* Reply phase not begun or not complete for service call. */
533 ret = -EPROTO; 571 ret = -EPROTO;
534 } else { 572 } else {
535 ret = rxrpc_send_data(rx, call, msg, len); 573 ret = rxrpc_send_data(rx, call, msg, len);
536 } 574 }
537 575
576 mutex_unlock(&call->user_mutex);
577error_put:
538 rxrpc_put_call(call, rxrpc_call_put); 578 rxrpc_put_call(call, rxrpc_call_put);
539 _leave(" = %d", ret); 579 _leave(" = %d", ret);
540 return ret; 580 return ret;
581
582error_release_sock:
583 release_sock(&rx->sk);
584 return ret;
541} 585}
542 586
543/** 587/**
@@ -562,22 +606,29 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call,
562 ASSERTCMP(msg->msg_name, ==, NULL); 606 ASSERTCMP(msg->msg_name, ==, NULL);
563 ASSERTCMP(msg->msg_control, ==, NULL); 607 ASSERTCMP(msg->msg_control, ==, NULL);
564 608
565 lock_sock(sock->sk); 609 mutex_lock(&call->user_mutex);
566 610
567 _debug("CALL %d USR %lx ST %d on CONN %p", 611 _debug("CALL %d USR %lx ST %d on CONN %p",
568 call->debug_id, call->user_call_ID, call->state, call->conn); 612 call->debug_id, call->user_call_ID, call->state, call->conn);
569 613
570 if (call->state >= RXRPC_CALL_COMPLETE) { 614 switch (READ_ONCE(call->state)) {
571 ret = -ESHUTDOWN; /* it's too late for this call */ 615 case RXRPC_CALL_CLIENT_SEND_REQUEST:
572 } else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && 616 case RXRPC_CALL_SERVER_ACK_REQUEST:
573 call->state != RXRPC_CALL_SERVER_ACK_REQUEST && 617 case RXRPC_CALL_SERVER_SEND_REPLY:
574 call->state != RXRPC_CALL_SERVER_SEND_REPLY) {
575 ret = -EPROTO; /* request phase complete for this client call */
576 } else {
577 ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len); 618 ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len);
619 break;
620 case RXRPC_CALL_COMPLETE:
621 read_lock_bh(&call->state_lock);
622 ret = -call->error;
623 read_unlock_bh(&call->state_lock);
624 break;
625 default:
626 /* Request phase complete for this client call */
627 ret = -EPROTO;
628 break;
578 } 629 }
579 630
580 release_sock(sock->sk); 631 mutex_unlock(&call->user_mutex);
581 _leave(" = %d", ret); 632 _leave(" = %d", ret);
582 return ret; 633 return ret;
583} 634}
@@ -598,12 +649,12 @@ void rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call,
598{ 649{
599 _enter("{%d},%d,%d,%s", call->debug_id, abort_code, error, why); 650 _enter("{%d},%d,%d,%s", call->debug_id, abort_code, error, why);
600 651
601 lock_sock(sock->sk); 652 mutex_lock(&call->user_mutex);
602 653
603 if (rxrpc_abort_call(why, call, 0, abort_code, error)) 654 if (rxrpc_abort_call(why, call, 0, abort_code, error))
604 rxrpc_send_abort_packet(call); 655 rxrpc_send_abort_packet(call);
605 656
606 release_sock(sock->sk); 657 mutex_unlock(&call->user_mutex);
607 _leave(""); 658 _leave("");
608} 659}
609 660
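
rxrpc_do_sendmsg() now routes every failure that happens before the call's user_mutex is taken through error_release_sock and later failures through error_put, so the socket lock and the call reference are each dropped exactly once on every path. A minimal sketch of that hand-over-hand locking and goto-unwind idiom, with hypothetical names and plain pthread locks standing in for the kernel primitives:

#include <errno.h>
#include <pthread.h>

/* Sketch of the ordered-unlock idiom: the socket lock is always dropped,
 * the per-call mutex only if it was actually taken.
 */
static int do_send(pthread_mutex_t *sock_lock, pthread_mutex_t *call_lock,
		   int cmd_valid)
{
	int ret;

	pthread_mutex_lock(sock_lock);

	if (!cmd_valid) {
		ret = -EINVAL;
		goto error_release_sock;	/* call mutex never taken */
	}

	pthread_mutex_lock(call_lock);
	pthread_mutex_unlock(sock_lock);	/* hand-over-hand: sock -> call */

	ret = 0;				/* ... transmit here ... */

	pthread_mutex_unlock(call_lock);
	return ret;

error_release_sock:
	pthread_mutex_unlock(sock_lock);
	return ret;
}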
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 87956a768d1b..403790cce7d2 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -650,6 +650,18 @@ config NET_ACT_MIRRED
650 To compile this code as a module, choose M here: the 650 To compile this code as a module, choose M here: the
651 module will be called act_mirred. 651 module will be called act_mirred.
652 652
653config NET_ACT_SAMPLE
654 tristate "Traffic Sampling"
655 depends on NET_CLS_ACT
656 select PSAMPLE
657 ---help---
658 Say Y here to allow packet sampling tc action. The packet sample
659 action consists of statistically choosing packets and sampling
660 them using the psample module.
661
662 To compile this code as a module, choose M here: the
663 module will be called act_sample.
664
653config NET_ACT_IPT 665config NET_ACT_IPT
654 tristate "IPtables targets" 666 tristate "IPtables targets"
655 depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES 667 depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
@@ -707,6 +719,7 @@ config NET_ACT_SKBEDIT
707config NET_ACT_CSUM 719config NET_ACT_CSUM
708 tristate "Checksum Updating" 720 tristate "Checksum Updating"
709 depends on NET_CLS_ACT && INET 721 depends on NET_CLS_ACT && INET
722 select LIBCRC32C
710 ---help--- 723 ---help---
711 Say Y here to update some common checksum after some direct 724 Say Y here to update some common checksum after some direct
712 packet alterations. 725 packet alterations.
@@ -763,6 +776,7 @@ config NET_ACT_SKBMOD
763config NET_ACT_IFE 776config NET_ACT_IFE
764 tristate "Inter-FE action based on IETF ForCES InterFE LFB" 777 tristate "Inter-FE action based on IETF ForCES InterFE LFB"
765 depends on NET_CLS_ACT 778 depends on NET_CLS_ACT
779 select NET_IFE
766 ---help--- 780 ---help---
767 Say Y here to allow for sourcing and terminating metadata 781 Say Y here to allow for sourcing and terminating metadata
768 For details refer to netdev01 paper: 782 For details refer to netdev01 paper:
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 4bdda3634e0b..7b915d226de7 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_NET_CLS_ACT) += act_api.o
10obj-$(CONFIG_NET_ACT_POLICE) += act_police.o 10obj-$(CONFIG_NET_ACT_POLICE) += act_police.o
11obj-$(CONFIG_NET_ACT_GACT) += act_gact.o 11obj-$(CONFIG_NET_ACT_GACT) += act_gact.o
12obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o 12obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o
13obj-$(CONFIG_NET_ACT_SAMPLE) += act_sample.o
13obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o 14obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o
14obj-$(CONFIG_NET_ACT_NAT) += act_nat.o 15obj-$(CONFIG_NET_ACT_NAT) += act_nat.o
15obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o 16obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index f893d180da1c..e05b924618a0 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -24,6 +24,7 @@
24#include <net/net_namespace.h> 24#include <net/net_namespace.h>
25#include <net/sock.h> 25#include <net/sock.h>
26#include <net/sch_generic.h> 26#include <net/sch_generic.h>
27#include <net/pkt_cls.h>
27#include <net/act_api.h> 28#include <net/act_api.h>
28#include <net/netlink.h> 29#include <net/netlink.h>
29 30
@@ -33,6 +34,12 @@ static void free_tcf(struct rcu_head *head)
33 34
34 free_percpu(p->cpu_bstats); 35 free_percpu(p->cpu_bstats);
35 free_percpu(p->cpu_qstats); 36 free_percpu(p->cpu_qstats);
37
38 if (p->act_cookie) {
39 kfree(p->act_cookie->data);
40 kfree(p->act_cookie);
41 }
42
36 kfree(p); 43 kfree(p);
37} 44}
38 45
@@ -41,8 +48,7 @@ static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *p)
41 spin_lock_bh(&hinfo->lock); 48 spin_lock_bh(&hinfo->lock);
42 hlist_del(&p->tcfa_head); 49 hlist_del(&p->tcfa_head);
43 spin_unlock_bh(&hinfo->lock); 50 spin_unlock_bh(&hinfo->lock);
44 gen_kill_estimator(&p->tcfa_bstats, 51 gen_kill_estimator(&p->tcfa_rate_est);
45 &p->tcfa_rate_est);
46 /* 52 /*
47 * gen_estimator est_timer() might access p->tcfa_lock 53 * gen_estimator est_timer() might access p->tcfa_lock
48 * or bstats, wait a RCU grace period before freeing p 54 * or bstats, wait a RCU grace period before freeing p
@@ -237,8 +243,7 @@ EXPORT_SYMBOL(tcf_hash_check);
237void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) 243void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
238{ 244{
239 if (est) 245 if (est)
240 gen_kill_estimator(&a->tcfa_bstats, 246 gen_kill_estimator(&a->tcfa_rate_est);
241 &a->tcfa_rate_est);
242 call_rcu(&a->tcfa_rcu, free_tcf); 247 call_rcu(&a->tcfa_rcu, free_tcf);
243} 248}
244EXPORT_SYMBOL(tcf_hash_cleanup); 249EXPORT_SYMBOL(tcf_hash_cleanup);
@@ -428,11 +433,9 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
428{ 433{
429 int ret = -1, i; 434 int ret = -1, i;
430 435
431 if (skb->tc_verd & TC_NCLS) { 436 if (skb_skip_tc_classify(skb))
432 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); 437 return TC_ACT_OK;
433 ret = TC_ACT_OK; 438
434 goto exec_done;
435 }
436 for (i = 0; i < nr_actions; i++) { 439 for (i = 0; i < nr_actions; i++) {
437 const struct tc_action *a = actions[i]; 440 const struct tc_action *a = actions[i];
438 441
@@ -441,9 +444,8 @@ repeat:
441 if (ret == TC_ACT_REPEAT) 444 if (ret == TC_ACT_REPEAT)
442 goto repeat; /* we need a ttl - JHS */ 445 goto repeat; /* we need a ttl - JHS */
443 if (ret != TC_ACT_PIPE) 446 if (ret != TC_ACT_PIPE)
444 goto exec_done; 447 break;
445 } 448 }
446exec_done:
447 return ret; 449 return ret;
448} 450}
449EXPORT_SYMBOL(tcf_action_exec); 451EXPORT_SYMBOL(tcf_action_exec);
@@ -480,6 +482,12 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
480 goto nla_put_failure; 482 goto nla_put_failure;
481 if (tcf_action_copy_stats(skb, a, 0)) 483 if (tcf_action_copy_stats(skb, a, 0))
482 goto nla_put_failure; 484 goto nla_put_failure;
485 if (a->act_cookie) {
486 if (nla_put(skb, TCA_ACT_COOKIE, a->act_cookie->len,
487 a->act_cookie->data))
488 goto nla_put_failure;
489 }
490
483 nest = nla_nest_start(skb, TCA_OPTIONS); 491 nest = nla_nest_start(skb, TCA_OPTIONS);
484 if (nest == NULL) 492 if (nest == NULL)
485 goto nla_put_failure; 493 goto nla_put_failure;
@@ -521,12 +529,29 @@ errout:
521 return err; 529 return err;
522} 530}
523 531
532static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb)
533{
534 struct tc_cookie *c = kzalloc(sizeof(*c), GFP_KERNEL);
535 if (!c)
536 return NULL;
537
538 c->data = nla_memdup(tb[TCA_ACT_COOKIE], GFP_KERNEL);
539 if (!c->data) {
540 kfree(c);
541 return NULL;
542 }
543 c->len = nla_len(tb[TCA_ACT_COOKIE]);
544
545 return c;
546}
547
524struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, 548struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
525 struct nlattr *est, char *name, int ovr, 549 struct nlattr *est, char *name, int ovr,
526 int bind) 550 int bind)
527{ 551{
528 struct tc_action *a; 552 struct tc_action *a;
529 struct tc_action_ops *a_o; 553 struct tc_action_ops *a_o;
554 struct tc_cookie *cookie = NULL;
530 char act_name[IFNAMSIZ]; 555 char act_name[IFNAMSIZ];
531 struct nlattr *tb[TCA_ACT_MAX + 1]; 556 struct nlattr *tb[TCA_ACT_MAX + 1];
532 struct nlattr *kind; 557 struct nlattr *kind;
@@ -542,6 +567,18 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
542 goto err_out; 567 goto err_out;
543 if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) 568 if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ)
544 goto err_out; 569 goto err_out;
570 if (tb[TCA_ACT_COOKIE]) {
571 int cklen = nla_len(tb[TCA_ACT_COOKIE]);
572
573 if (cklen > TC_COOKIE_MAX_SIZE)
574 goto err_out;
575
576 cookie = nla_memdup_cookie(tb);
577 if (!cookie) {
578 err = -ENOMEM;
579 goto err_out;
580 }
581 }
545 } else { 582 } else {
546 err = -EINVAL; 583 err = -EINVAL;
547 if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) 584 if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ)
@@ -580,6 +617,14 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
580 if (err < 0) 617 if (err < 0)
581 goto err_mod; 618 goto err_mod;
582 619
620 if (name == NULL && tb[TCA_ACT_COOKIE]) {
621 if (a->act_cookie) {
622 kfree(a->act_cookie->data);
623 kfree(a->act_cookie);
624 }
625 a->act_cookie = cookie;
626 }
627
583 /* module count goes up only when brand new policy is created 628 /* module count goes up only when brand new policy is created
584 * if it exists and is only bound to in a_o->init() then 629 * if it exists and is only bound to in a_o->init() then
585 * ACT_P_CREATED is not returned (a zero is). 630 * ACT_P_CREATED is not returned (a zero is).
@@ -592,6 +637,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
592err_mod: 637err_mod:
593 module_put(a_o->owner); 638 module_put(a_o->owner);
594err_out: 639err_out:
640 if (cookie) {
641 kfree(cookie->data);
642 kfree(cookie);
643 }
595 return ERR_PTR(err); 644 return ERR_PTR(err);
596} 645}
597 646
@@ -670,8 +719,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
670 goto errout; 719 goto errout;
671 720
672 if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 || 721 if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 ||
673 gnet_stats_copy_rate_est(&d, &p->tcfa_bstats, 722 gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 ||
674 &p->tcfa_rate_est) < 0 ||
675 gnet_stats_copy_queue(&d, p->cpu_qstats, 723 gnet_stats_copy_queue(&d, p->cpu_qstats,
676 &p->tcfa_qstats, 724 &p->tcfa_qstats,
677 p->tcfa_qstats.qlen) < 0) 725 p->tcfa_qstats.qlen) < 0)
@@ -820,10 +868,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
820 goto out_module_put; 868 goto out_module_put;
821 869
822 err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops); 870 err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops);
823 if (err < 0) 871 if (err <= 0)
824 goto out_module_put; 872 goto out_module_put;
825 if (err == 0)
826 goto noflush_out;
827 873
828 nla_nest_end(skb, nest); 874 nla_nest_end(skb, nest);
829 875
@@ -840,7 +886,6 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
840out_module_put: 886out_module_put:
841 module_put(ops->owner); 887 module_put(ops->owner);
842err_out: 888err_out:
843noflush_out:
844 kfree_skb(skb); 889 kfree_skb(skb);
845 return err; 890 return err;
846} 891}
@@ -903,8 +948,6 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
903 goto err; 948 goto err;
904 } 949 }
905 act->order = i; 950 act->order = i;
906 if (event == RTM_GETACTION)
907 act->tcfa_refcnt++;
908 list_add_tail(&act->list, &actions); 951 list_add_tail(&act->list, &actions);
909 } 952 }
910 953
@@ -917,7 +960,8 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
917 return ret; 960 return ret;
918 } 961 }
919err: 962err:
920 tcf_action_destroy(&actions, 0); 963 if (event != RTM_GETACTION)
964 tcf_action_destroy(&actions, 0);
921 return ret; 965 return ret;
922} 966}
923 967
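
nla_memdup_cookie() in the act_api.c hunk copies the TCA_ACT_COOKIE payload into a small container and records its length; on any later failure both the data and the container are freed again (see the err_out path). A userspace sketch of the same duplicate-or-fail ownership pattern; the type and helpers here are invented for illustration, the kernel version uses kzalloc() and nla_memdup():

#include <stdlib.h>
#include <string.h>

struct cookie {
	unsigned char *data;
	size_t len;
};

/* Duplicate an opaque blob into a freshly allocated cookie, or return NULL
 * leaving nothing allocated.
 */
static struct cookie *cookie_dup(const void *blob, size_t len)
{
	struct cookie *c = calloc(1, sizeof(*c));

	if (!c)
		return NULL;

	c->data = malloc(len);
	if (!c->data) {
		free(c);
		return NULL;
	}

	memcpy(c->data, blob, len);
	c->len = len;
	return c;
}

static void cookie_free(struct cookie *c)
{
	if (c) {
		free(c->data);
		free(c);
	}
}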
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 1d3960033f61..520baa41cba3 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -28,12 +28,11 @@ struct tcf_bpf_cfg {
28 struct bpf_prog *filter; 28 struct bpf_prog *filter;
29 struct sock_filter *bpf_ops; 29 struct sock_filter *bpf_ops;
30 const char *bpf_name; 30 const char *bpf_name;
31 u32 bpf_fd;
32 u16 bpf_num_ops; 31 u16 bpf_num_ops;
33 bool is_ebpf; 32 bool is_ebpf;
34}; 33};
35 34
36static int bpf_net_id; 35static unsigned int bpf_net_id;
37static struct tc_action_ops act_bpf_ops; 36static struct tc_action_ops act_bpf_ops;
38 37
39static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, 38static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
@@ -118,13 +117,18 @@ static int tcf_bpf_dump_bpf_info(const struct tcf_bpf *prog,
118static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog, 117static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog,
119 struct sk_buff *skb) 118 struct sk_buff *skb)
120{ 119{
121 if (nla_put_u32(skb, TCA_ACT_BPF_FD, prog->bpf_fd)) 120 struct nlattr *nla;
122 return -EMSGSIZE;
123 121
124 if (prog->bpf_name && 122 if (prog->bpf_name &&
125 nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name)) 123 nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name))
126 return -EMSGSIZE; 124 return -EMSGSIZE;
127 125
126 nla = nla_reserve(skb, TCA_ACT_BPF_TAG, sizeof(prog->filter->tag));
127 if (nla == NULL)
128 return -EMSGSIZE;
129
130 memcpy(nla_data(nla), prog->filter->tag, nla_len(nla));
131
128 return 0; 132 return 0;
129} 133}
130 134
@@ -226,16 +230,13 @@ static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg)
226 return PTR_ERR(fp); 230 return PTR_ERR(fp);
227 231
228 if (tb[TCA_ACT_BPF_NAME]) { 232 if (tb[TCA_ACT_BPF_NAME]) {
229 name = kmemdup(nla_data(tb[TCA_ACT_BPF_NAME]), 233 name = nla_memdup(tb[TCA_ACT_BPF_NAME], GFP_KERNEL);
230 nla_len(tb[TCA_ACT_BPF_NAME]),
231 GFP_KERNEL);
232 if (!name) { 234 if (!name) {
233 bpf_prog_put(fp); 235 bpf_prog_put(fp);
234 return -ENOMEM; 236 return -ENOMEM;
235 } 237 }
236 } 238 }
237 239
238 cfg->bpf_fd = bpf_fd;
239 cfg->bpf_name = name; 240 cfg->bpf_name = name;
240 cfg->filter = fp; 241 cfg->filter = fp;
241 cfg->is_ebpf = true; 242 cfg->is_ebpf = true;
@@ -334,8 +335,6 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
334 335
335 if (cfg.bpf_num_ops) 336 if (cfg.bpf_num_ops)
336 prog->bpf_num_ops = cfg.bpf_num_ops; 337 prog->bpf_num_ops = cfg.bpf_num_ops;
337 if (cfg.bpf_fd)
338 prog->bpf_fd = cfg.bpf_fd;
339 338
340 prog->tcf_action = parm->action; 339 prog->tcf_action = parm->action;
341 rcu_assign_pointer(prog->filter, cfg.filter); 340 rcu_assign_pointer(prog->filter, cfg.filter);
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index eae07a2e774d..f9bb43c25697 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -30,7 +30,7 @@
30 30
31#define CONNMARK_TAB_MASK 3 31#define CONNMARK_TAB_MASK 3
32 32
33static int connmark_net_id; 33static unsigned int connmark_net_id;
34static struct tc_action_ops act_connmark_ops; 34static struct tc_action_ops act_connmark_ops;
35 35
36static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, 36static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
@@ -113,6 +113,9 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
113 if (ret < 0) 113 if (ret < 0)
114 return ret; 114 return ret;
115 115
116 if (!tb[TCA_CONNMARK_PARMS])
117 return -EINVAL;
118
116 parm = nla_data(tb[TCA_CONNMARK_PARMS]); 119 parm = nla_data(tb[TCA_CONNMARK_PARMS]);
117 120
118 if (!tcf_hash_check(tn, parm->index, a, bind)) { 121 if (!tcf_hash_check(tn, parm->index, a, bind)) {
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index e0defcef376d..e978ccd4402c 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -30,6 +30,7 @@
30#include <net/tcp.h> 30#include <net/tcp.h>
31#include <net/udp.h> 31#include <net/udp.h>
32#include <net/ip6_checksum.h> 32#include <net/ip6_checksum.h>
33#include <net/sctp/checksum.h>
33 34
34#include <net/act_api.h> 35#include <net/act_api.h>
35 36
@@ -42,7 +43,7 @@ static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {
42 [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, 43 [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), },
43}; 44};
44 45
45static int csum_net_id; 46static unsigned int csum_net_id;
46static struct tc_action_ops act_csum_ops; 47static struct tc_action_ops act_csum_ops;
47 48
48static int tcf_csum_init(struct net *net, struct nlattr *nla, 49static int tcf_csum_init(struct net *net, struct nlattr *nla,
@@ -322,6 +323,25 @@ ignore_obscure_skb:
322 return 1; 323 return 1;
323} 324}
324 325
326static int tcf_csum_sctp(struct sk_buff *skb, unsigned int ihl,
327 unsigned int ipl)
328{
329 struct sctphdr *sctph;
330
331 if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_SCTP)
332 return 1;
333
334 sctph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*sctph));
335 if (!sctph)
336 return 0;
337
338 sctph->checksum = sctp_compute_cksum(skb,
339 skb_network_offset(skb) + ihl);
340 skb->ip_summed = CHECKSUM_NONE;
341
342 return 1;
343}
344
325static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) 345static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
326{ 346{
327 const struct iphdr *iph; 347 const struct iphdr *iph;
@@ -365,6 +385,11 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
365 ntohs(iph->tot_len), 1)) 385 ntohs(iph->tot_len), 1))
366 goto fail; 386 goto fail;
367 break; 387 break;
388 case IPPROTO_SCTP:
389 if ((update_flags & TCA_CSUM_UPDATE_FLAG_SCTP) &&
390 !tcf_csum_sctp(skb, iph->ihl * 4, ntohs(iph->tot_len)))
391 goto fail;
392 break;
368 } 393 }
369 394
370 if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) { 395 if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) {
@@ -481,6 +506,11 @@ static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags)
481 pl + sizeof(*ip6h), 1)) 506 pl + sizeof(*ip6h), 1))
482 goto fail; 507 goto fail;
483 goto done; 508 goto done;
509 case IPPROTO_SCTP:
510 if ((update_flags & TCA_CSUM_UPDATE_FLAG_SCTP) &&
511 !tcf_csum_sctp(skb, hl, pl + sizeof(*ip6h)))
512 goto fail;
513 goto done;
484 default: 514 default:
485 goto ignore_skb; 515 goto ignore_skb;
486 } 516 }
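
tcf_csum_sctp() recomputes the SCTP checksum, which is a CRC32c taken over the whole SCTP packet with the checksum field zeroed first, not the ones'-complement sum used by TCP and UDP; that is why the Kconfig hunk adds "select LIBCRC32C". A self-contained bitwise CRC32c (reflected polynomial 0x82F63B78) for illustration only; the kernel path goes through sctp_compute_cksum() and the crc32c library rather than anything like this:

#include <stddef.h>
#include <stdint.h>

/* Bitwise CRC32c (Castagnoli), reflected polynomial 0x82F63B78.  Slow but
 * self-contained; only meant to show what LIBCRC32C provides.
 */
static uint32_t crc32c(const uint8_t *buf, size_t len)
{
	uint32_t crc = 0xffffffffu;

	while (len--) {
		crc ^= *buf++;
		for (int bit = 0; bit < 8; bit++)
			crc = (crc >> 1) ^ (0x82f63b78u & -(crc & 1));
	}
	return ~crc;
}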
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index e0aa30f83c6c..e6c874a2b283 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -25,7 +25,7 @@
25 25
26#define GACT_TAB_MASK 15 26#define GACT_TAB_MASK 15
27 27
28static int gact_net_id; 28static unsigned int gact_net_id;
29static struct tc_action_ops act_gact_ops; 29static struct tc_action_ops act_gact_ops;
30 30
31#ifdef CONFIG_GACT_PROB 31#ifdef CONFIG_GACT_PROB
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 95c463cbb9a6..71e7ff22f7c9 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -32,10 +32,11 @@
32#include <uapi/linux/tc_act/tc_ife.h> 32#include <uapi/linux/tc_act/tc_ife.h>
33#include <net/tc_act/tc_ife.h> 33#include <net/tc_act/tc_ife.h>
34#include <linux/etherdevice.h> 34#include <linux/etherdevice.h>
35#include <net/ife.h>
35 36
36#define IFE_TAB_MASK 15 37#define IFE_TAB_MASK 15
37 38
38static int ife_net_id; 39static unsigned int ife_net_id;
39static int max_metacnt = IFE_META_MAX + 1; 40static int max_metacnt = IFE_META_MAX + 1;
40static struct tc_action_ops act_ife_ops; 41static struct tc_action_ops act_ife_ops;
41 42
@@ -46,23 +47,6 @@ static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = {
46 [TCA_IFE_TYPE] = { .type = NLA_U16}, 47 [TCA_IFE_TYPE] = { .type = NLA_U16},
47}; 48};
48 49
49/* Caller takes care of presenting data in network order
50*/
51int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
52{
53 u32 *tlv = (u32 *)(skbdata);
54 u16 totlen = nla_total_size(dlen); /*alignment + hdr */
55 char *dptr = (char *)tlv + NLA_HDRLEN;
56 u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN);
57
58 *tlv = htonl(htlv);
59 memset(dptr, 0, totlen - NLA_HDRLEN);
60 memcpy(dptr, dval, dlen);
61
62 return totlen;
63}
64EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
65
66int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi) 50int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi)
67{ 51{
68 u16 edata = 0; 52 u16 edata = 0;
@@ -637,69 +621,59 @@ int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife,
637 return 0; 621 return 0;
638} 622}
639 623
640struct ifeheadr {
641 __be16 metalen;
642 u8 tlv_data[];
643};
644
645struct meta_tlvhdr {
646 __be16 type;
647 __be16 len;
648};
649
650static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, 624static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
651 struct tcf_result *res) 625 struct tcf_result *res)
652{ 626{
653 struct tcf_ife_info *ife = to_ife(a); 627 struct tcf_ife_info *ife = to_ife(a);
654 int action = ife->tcf_action; 628 int action = ife->tcf_action;
655 struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data; 629 u8 *ifehdr_end;
656 int ifehdrln = (int)ifehdr->metalen; 630 u8 *tlv_data;
657 struct meta_tlvhdr *tlv = (struct meta_tlvhdr *)(ifehdr->tlv_data); 631 u16 metalen;
658 632
659 spin_lock(&ife->tcf_lock); 633 spin_lock(&ife->tcf_lock);
660 bstats_update(&ife->tcf_bstats, skb); 634 bstats_update(&ife->tcf_bstats, skb);
661 tcf_lastuse_update(&ife->tcf_tm); 635 tcf_lastuse_update(&ife->tcf_tm);
662 spin_unlock(&ife->tcf_lock); 636 spin_unlock(&ife->tcf_lock);
663 637
664 ifehdrln = ntohs(ifehdrln); 638 if (skb_at_tc_ingress(skb))
665 if (unlikely(!pskb_may_pull(skb, ifehdrln))) { 639 skb_push(skb, skb->dev->hard_header_len);
640
641 tlv_data = ife_decode(skb, &metalen);
642 if (unlikely(!tlv_data)) {
666 spin_lock(&ife->tcf_lock); 643 spin_lock(&ife->tcf_lock);
667 ife->tcf_qstats.drops++; 644 ife->tcf_qstats.drops++;
668 spin_unlock(&ife->tcf_lock); 645 spin_unlock(&ife->tcf_lock);
669 return TC_ACT_SHOT; 646 return TC_ACT_SHOT;
670 } 647 }
671 648
672 skb_set_mac_header(skb, ifehdrln); 649 ifehdr_end = tlv_data + metalen;
673 __skb_pull(skb, ifehdrln); 650 for (; tlv_data < ifehdr_end; tlv_data = ife_tlv_meta_next(tlv_data)) {
674 skb->protocol = eth_type_trans(skb, skb->dev); 651 u8 *curr_data;
675 ifehdrln -= IFE_METAHDRLEN; 652 u16 mtype;
676 653 u16 dlen;
677 while (ifehdrln > 0) {
678 u8 *tlvdata = (u8 *)tlv;
679 u16 mtype = tlv->type;
680 u16 mlen = tlv->len;
681 u16 alen;
682 654
683 mtype = ntohs(mtype); 655 curr_data = ife_tlv_meta_decode(tlv_data, &mtype, &dlen, NULL);
684 mlen = ntohs(mlen);
685 alen = NLA_ALIGN(mlen);
686 656
687 if (find_decode_metaid(skb, ife, mtype, (mlen - NLA_HDRLEN), 657 if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) {
688 (void *)(tlvdata + NLA_HDRLEN))) {
689 /* abuse overlimits to count when we receive metadata 658 /* abuse overlimits to count when we receive metadata
690 * but dont have an ops for it 659 * but dont have an ops for it
691 */ 660 */
692 pr_info_ratelimited("Unknown metaid %d alnlen %d\n", 661 pr_info_ratelimited("Unknown metaid %d dlen %d\n",
693 mtype, mlen); 662 mtype, dlen);
694 ife->tcf_qstats.overlimits++; 663 ife->tcf_qstats.overlimits++;
695 } 664 }
665 }
696 666
697 tlvdata += alen; 667 if (WARN_ON(tlv_data != ifehdr_end)) {
698 ifehdrln -= alen; 668 spin_lock(&ife->tcf_lock);
699 tlv = (struct meta_tlvhdr *)tlvdata; 669 ife->tcf_qstats.drops++;
670 spin_unlock(&ife->tcf_lock);
671 return TC_ACT_SHOT;
700 } 672 }
701 673
674 skb->protocol = eth_type_trans(skb, skb->dev);
702 skb_reset_network_header(skb); 675 skb_reset_network_header(skb);
676
703 return action; 677 return action;
704} 678}
705 679
@@ -727,7 +701,6 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
727 struct tcf_ife_info *ife = to_ife(a); 701 struct tcf_ife_info *ife = to_ife(a);
728 int action = ife->tcf_action; 702 int action = ife->tcf_action;
729 struct ethhdr *oethh; /* outer ether header */ 703 struct ethhdr *oethh; /* outer ether header */
730 struct ethhdr *iethh; /* inner eth header */
731 struct tcf_meta_info *e; 704 struct tcf_meta_info *e;
732 /* 705 /*
733 OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA 706 OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
@@ -735,13 +708,13 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
735 */ 708 */
736 u16 metalen = ife_get_sz(skb, ife); 709 u16 metalen = ife_get_sz(skb, ife);
737 int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN; 710 int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN;
738 unsigned int skboff = skb->dev->hard_header_len; 711 unsigned int skboff = 0;
739 u32 at = G_TC_AT(skb->tc_verd);
740 int new_len = skb->len + hdrm; 712 int new_len = skb->len + hdrm;
741 bool exceed_mtu = false; 713 bool exceed_mtu = false;
742 int err; 714 void *ife_meta;
715 int err = 0;
743 716
744 if (at & AT_EGRESS) { 717 if (!skb_at_tc_ingress(skb)) {
745 if (new_len > skb->dev->mtu) 718 if (new_len > skb->dev->mtu)
746 exceed_mtu = true; 719 exceed_mtu = true;
747 } 720 }
@@ -766,27 +739,10 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
766 return TC_ACT_SHOT; 739 return TC_ACT_SHOT;
767 } 740 }
768 741
769 err = skb_cow_head(skb, hdrm); 742 if (skb_at_tc_ingress(skb))
770 if (unlikely(err)) {
771 ife->tcf_qstats.drops++;
772 spin_unlock(&ife->tcf_lock);
773 return TC_ACT_SHOT;
774 }
775
776 if (!(at & AT_EGRESS))
777 skb_push(skb, skb->dev->hard_header_len); 743 skb_push(skb, skb->dev->hard_header_len);
778 744
779 iethh = (struct ethhdr *)skb->data; 745 ife_meta = ife_encode(skb, metalen);
780 __skb_push(skb, hdrm);
781 memcpy(skb->data, iethh, skb->mac_len);
782 skb_reset_mac_header(skb);
783 oethh = eth_hdr(skb);
784
785 /*total metadata length */
786 metalen += IFE_METAHDRLEN;
787 metalen = htons(metalen);
788 memcpy((skb->data + skboff), &metalen, IFE_METAHDRLEN);
789 skboff += IFE_METAHDRLEN;
790 746
791 /* XXX: we don't have a clever way of telling encode to 747 /* XXX: we don't have a clever way of telling encode to
792 * not repeat some of the computations that are done by 748 * not repeat some of the computations that are done by
@@ -794,7 +750,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
794 */ 750 */
795 list_for_each_entry(e, &ife->metalist, metalist) { 751 list_for_each_entry(e, &ife->metalist, metalist) {
796 if (e->ops->encode) { 752 if (e->ops->encode) {
797 err = e->ops->encode(skb, (void *)(skb->data + skboff), 753 err = e->ops->encode(skb, (void *)(ife_meta + skboff),
798 e); 754 e);
799 } 755 }
800 if (err < 0) { 756 if (err < 0) {
@@ -805,18 +761,15 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
805 } 761 }
806 skboff += err; 762 skboff += err;
807 } 763 }
764 oethh = (struct ethhdr *)skb->data;
808 765
809 if (!is_zero_ether_addr(ife->eth_src)) 766 if (!is_zero_ether_addr(ife->eth_src))
810 ether_addr_copy(oethh->h_source, ife->eth_src); 767 ether_addr_copy(oethh->h_source, ife->eth_src);
811 else
812 ether_addr_copy(oethh->h_source, iethh->h_source);
813 if (!is_zero_ether_addr(ife->eth_dst)) 768 if (!is_zero_ether_addr(ife->eth_dst))
814 ether_addr_copy(oethh->h_dest, ife->eth_dst); 769 ether_addr_copy(oethh->h_dest, ife->eth_dst);
815 else
816 ether_addr_copy(oethh->h_dest, iethh->h_dest);
817 oethh->h_proto = htons(ife->eth_type); 770 oethh->h_proto = htons(ife->eth_type);
818 771
819 if (!(at & AT_EGRESS)) 772 if (skb_at_tc_ingress(skb))
820 skb_pull(skb, skb->dev->hard_header_len); 773 skb_pull(skb, skb->dev->hard_header_len);
821 774
822 spin_unlock(&ife->tcf_lock); 775 spin_unlock(&ife->tcf_lock);
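
The rewritten decode path above drops the hand-rolled TLV walk in favour of the shared IFE helpers. A minimal sketch of the same loop, using only the helpers visible in this hunk (ife_decode(), ife_tlv_meta_decode(), ife_tlv_meta_next() from <net/ife.h>); handle_metadatum() stands in for find_decode_metaid() and is hypothetical, not a kernel API:

	/* Sketch: walk IFE metadata TLVs the way tcf_ife_decode() now does. */
	static int walk_ife_metadata(struct sk_buff *skb)
	{
		u8 *tlv_data, *ifehdr_end;
		u16 metalen;

		tlv_data = ife_decode(skb, &metalen);	/* strips the IFE header */
		if (unlikely(!tlv_data))
			return -EINVAL;

		ifehdr_end = tlv_data + metalen;
		for (; tlv_data < ifehdr_end; tlv_data = ife_tlv_meta_next(tlv_data)) {
			u16 mtype, dlen;
			u8 *curr_data;

			curr_data = ife_tlv_meta_decode(tlv_data, &mtype, &dlen, NULL);
			handle_metadatum(skb, mtype, dlen, curr_data);	/* hypothetical handler */
		}

		/* a well-formed header is consumed exactly up to ifehdr_end */
		return tlv_data == ifehdr_end ? 0 : -EINVAL;
	}

Compared with the removed code, the helpers own the ntohs() conversions and NLA alignment, so the action only sees host-order type/length pairs.
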
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 378c1c976058..992ef8d624f1 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -30,10 +30,10 @@
30 30
31#define IPT_TAB_MASK 15 31#define IPT_TAB_MASK 15
32 32
33static int ipt_net_id; 33static unsigned int ipt_net_id;
34static struct tc_action_ops act_ipt_ops; 34static struct tc_action_ops act_ipt_ops;
35 35
36static int xt_net_id; 36static unsigned int xt_net_id;
37static struct tc_action_ops act_xt_ops; 37static struct tc_action_ops act_xt_ops;
38 38
39static int ipt_init_target(struct xt_entry_target *t, char *table, 39static int ipt_init_target(struct xt_entry_target *t, char *table,
@@ -213,6 +213,12 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
213 int ret = 0, result = 0; 213 int ret = 0, result = 0;
214 struct tcf_ipt *ipt = to_ipt(a); 214 struct tcf_ipt *ipt = to_ipt(a);
215 struct xt_action_param par; 215 struct xt_action_param par;
216 struct nf_hook_state state = {
217 .net = dev_net(skb->dev),
218 .in = skb->dev,
219 .hook = ipt->tcfi_hook,
220 .pf = NFPROTO_IPV4,
221 };
216 222
217 if (skb_unclone(skb, GFP_ATOMIC)) 223 if (skb_unclone(skb, GFP_ATOMIC))
218 return TC_ACT_UNSPEC; 224 return TC_ACT_UNSPEC;
@@ -226,13 +232,9 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
226 * worry later - danger - this API seems to have changed 232 * worry later - danger - this API seems to have changed
227 * from earlier kernels 233 * from earlier kernels
228 */ 234 */
229 par.net = dev_net(skb->dev); 235 par.state = &state;
230 par.in = skb->dev;
231 par.out = NULL;
232 par.hooknum = ipt->tcfi_hook;
233 par.target = ipt->tcfi_t->u.kernel.target; 236 par.target = ipt->tcfi_t->u.kernel.target;
234 par.targinfo = ipt->tcfi_t->data; 237 par.targinfo = ipt->tcfi_t->data;
235 par.family = NFPROTO_IPV4;
236 ret = par.target->target(skb, &par); 238 ret = par.target->target(skb, &par);
237 239
238 switch (ret) { 240 switch (ret) {
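
The act_ipt change above packs the per-field hook context into one struct nf_hook_state and hands it to the target through xt_action_param::state. A minimal sketch of the new calling convention, assuming the same "ipt" action instance and skb as in the hunk:

	struct nf_hook_state state = {
		.net  = dev_net(skb->dev),
		.in   = skb->dev,
		.hook = ipt->tcfi_hook,
		.pf   = NFPROTO_IPV4,
	};
	struct xt_action_param par;
	int ret;

	par.state    = &state;		/* replaces par.net/.in/.out/.hooknum/.family */
	par.target   = ipt->tcfi_t->u.kernel.target;
	par.targinfo = ipt->tcfi_t->data;
	ret = par.target->target(skb, &par);
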
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 6b07fba5770b..af49c7dca860 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -21,18 +21,36 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/gfp.h> 23#include <linux/gfp.h>
24#include <linux/if_arp.h>
24#include <net/net_namespace.h> 25#include <net/net_namespace.h>
25#include <net/netlink.h> 26#include <net/netlink.h>
26#include <net/pkt_sched.h> 27#include <net/pkt_sched.h>
27#include <linux/tc_act/tc_mirred.h> 28#include <linux/tc_act/tc_mirred.h>
28#include <net/tc_act/tc_mirred.h> 29#include <net/tc_act/tc_mirred.h>
29 30
30#include <linux/if_arp.h>
31
32#define MIRRED_TAB_MASK 7 31#define MIRRED_TAB_MASK 7
33static LIST_HEAD(mirred_list); 32static LIST_HEAD(mirred_list);
34static DEFINE_SPINLOCK(mirred_list_lock); 33static DEFINE_SPINLOCK(mirred_list_lock);
35 34
35static bool tcf_mirred_is_act_redirect(int action)
36{
37 return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR;
38}
39
40static bool tcf_mirred_act_wants_ingress(int action)
41{
42 switch (action) {
43 case TCA_EGRESS_REDIR:
44 case TCA_EGRESS_MIRROR:
45 return false;
46 case TCA_INGRESS_REDIR:
47 case TCA_INGRESS_MIRROR:
48 return true;
49 default:
50 BUG();
51 }
52}
53
36static void tcf_mirred_release(struct tc_action *a, int bind) 54static void tcf_mirred_release(struct tc_action *a, int bind)
37{ 55{
38 struct tcf_mirred *m = to_mirred(a); 56 struct tcf_mirred *m = to_mirred(a);
@@ -51,7 +69,7 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
51 [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) }, 69 [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) },
52}; 70};
53 71
54static int mirred_net_id; 72static unsigned int mirred_net_id;
55static struct tc_action_ops act_mirred_ops; 73static struct tc_action_ops act_mirred_ops;
56 74
57static int tcf_mirred_init(struct net *net, struct nlattr *nla, 75static int tcf_mirred_init(struct net *net, struct nlattr *nla,
@@ -60,11 +78,12 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
60{ 78{
61 struct tc_action_net *tn = net_generic(net, mirred_net_id); 79 struct tc_action_net *tn = net_generic(net, mirred_net_id);
62 struct nlattr *tb[TCA_MIRRED_MAX + 1]; 80 struct nlattr *tb[TCA_MIRRED_MAX + 1];
81 bool mac_header_xmit = false;
63 struct tc_mirred *parm; 82 struct tc_mirred *parm;
64 struct tcf_mirred *m; 83 struct tcf_mirred *m;
65 struct net_device *dev; 84 struct net_device *dev;
66 int ret, ok_push = 0;
67 bool exists = false; 85 bool exists = false;
86 int ret;
68 87
69 if (nla == NULL) 88 if (nla == NULL)
70 return -EINVAL; 89 return -EINVAL;
@@ -82,6 +101,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
82 switch (parm->eaction) { 101 switch (parm->eaction) {
83 case TCA_EGRESS_MIRROR: 102 case TCA_EGRESS_MIRROR:
84 case TCA_EGRESS_REDIR: 103 case TCA_EGRESS_REDIR:
104 case TCA_INGRESS_REDIR:
105 case TCA_INGRESS_MIRROR:
85 break; 106 break;
86 default: 107 default:
87 if (exists) 108 if (exists)
@@ -95,19 +116,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
95 tcf_hash_release(*a, bind); 116 tcf_hash_release(*a, bind);
96 return -ENODEV; 117 return -ENODEV;
97 } 118 }
98 switch (dev->type) { 119 mac_header_xmit = dev_is_mac_header_xmit(dev);
99 case ARPHRD_TUNNEL:
100 case ARPHRD_TUNNEL6:
101 case ARPHRD_SIT:
102 case ARPHRD_IPGRE:
103 case ARPHRD_VOID:
104 case ARPHRD_NONE:
105 ok_push = 0;
106 break;
107 default:
108 ok_push = 1;
109 break;
110 }
111 } else { 120 } else {
112 dev = NULL; 121 dev = NULL;
113 } 122 }
@@ -136,7 +145,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
136 dev_put(rcu_dereference_protected(m->tcfm_dev, 1)); 145 dev_put(rcu_dereference_protected(m->tcfm_dev, 1));
137 dev_hold(dev); 146 dev_hold(dev);
138 rcu_assign_pointer(m->tcfm_dev, dev); 147 rcu_assign_pointer(m->tcfm_dev, dev);
139 m->tcfm_ok_push = ok_push; 148 m->tcfm_mac_header_xmit = mac_header_xmit;
140 } 149 }
141 150
142 if (ret == ACT_P_CREATED) { 151 if (ret == ACT_P_CREATED) {
@@ -153,15 +162,19 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
153 struct tcf_result *res) 162 struct tcf_result *res)
154{ 163{
155 struct tcf_mirred *m = to_mirred(a); 164 struct tcf_mirred *m = to_mirred(a);
165 bool m_mac_header_xmit;
156 struct net_device *dev; 166 struct net_device *dev;
157 struct sk_buff *skb2; 167 struct sk_buff *skb2;
158 int retval, err; 168 int retval, err = 0;
159 u32 at; 169 int m_eaction;
170 int mac_len;
160 171
161 tcf_lastuse_update(&m->tcf_tm); 172 tcf_lastuse_update(&m->tcf_tm);
162 bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); 173 bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
163 174
164 rcu_read_lock(); 175 rcu_read_lock();
176 m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit);
177 m_eaction = READ_ONCE(m->tcfm_eaction);
165 retval = READ_ONCE(m->tcf_action); 178 retval = READ_ONCE(m->tcf_action);
166 dev = rcu_dereference(m->tcfm_dev); 179 dev = rcu_dereference(m->tcfm_dev);
167 if (unlikely(!dev)) { 180 if (unlikely(!dev)) {
@@ -175,28 +188,43 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
175 goto out; 188 goto out;
176 } 189 }
177 190
178 at = G_TC_AT(skb->tc_verd);
179 skb2 = skb_clone(skb, GFP_ATOMIC); 191 skb2 = skb_clone(skb, GFP_ATOMIC);
180 if (!skb2) 192 if (!skb2)
181 goto out; 193 goto out;
182 194
183 if (!(at & AT_EGRESS)) { 195 /* If action's target direction differs from filter's direction,
184 if (m->tcfm_ok_push) 196 * and devices expect a mac header on xmit, then mac push/pull is
197 * needed.
198 */
199 if (skb_at_tc_ingress(skb) != tcf_mirred_act_wants_ingress(m_eaction) &&
200 m_mac_header_xmit) {
201 if (!skb_at_tc_ingress(skb)) {
202 /* caught at egress, act ingress: pull mac */
203 mac_len = skb_network_header(skb) - skb_mac_header(skb);
204 skb_pull_rcsum(skb2, mac_len);
205 } else {
206 /* caught at ingress, act egress: push mac */
185 skb_push_rcsum(skb2, skb->mac_len); 207 skb_push_rcsum(skb2, skb->mac_len);
208 }
186 } 209 }
187 210
188 /* mirror is always swallowed */ 211 /* mirror is always swallowed */
189 if (m->tcfm_eaction != TCA_EGRESS_MIRROR) 212 if (tcf_mirred_is_act_redirect(m_eaction)) {
190 skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at); 213 skb2->tc_redirected = 1;
214 skb2->tc_from_ingress = skb2->tc_at_ingress;
215 }
191 216
192 skb2->skb_iif = skb->dev->ifindex; 217 skb2->skb_iif = skb->dev->ifindex;
193 skb2->dev = dev; 218 skb2->dev = dev;
194 err = dev_queue_xmit(skb2); 219 if (!tcf_mirred_act_wants_ingress(m_eaction))
220 err = dev_queue_xmit(skb2);
221 else
222 err = netif_receive_skb(skb2);
195 223
196 if (err) { 224 if (err) {
197out: 225out:
198 qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats)); 226 qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats));
199 if (m->tcfm_eaction != TCA_EGRESS_MIRROR) 227 if (tcf_mirred_is_act_redirect(m_eaction))
200 retval = TC_ACT_SHOT; 228 retval = TC_ACT_SHOT;
201 } 229 }
202 rcu_read_unlock(); 230 rcu_read_unlock();
@@ -286,6 +314,17 @@ static struct notifier_block mirred_device_notifier = {
286 .notifier_call = mirred_device_event, 314 .notifier_call = mirred_device_event,
287}; 315};
288 316
317static int tcf_mirred_device(const struct tc_action *a, struct net *net,
318 struct net_device **mirred_dev)
319{
320 int ifindex = tcf_mirred_ifindex(a);
321
322 *mirred_dev = __dev_get_by_index(net, ifindex);
323 if (!*mirred_dev)
324 return -EINVAL;
325 return 0;
326}
327
289static struct tc_action_ops act_mirred_ops = { 328static struct tc_action_ops act_mirred_ops = {
290 .kind = "mirred", 329 .kind = "mirred",
291 .type = TCA_ACT_MIRRED, 330 .type = TCA_ACT_MIRRED,
@@ -298,6 +337,7 @@ static struct tc_action_ops act_mirred_ops = {
298 .walk = tcf_mirred_walker, 337 .walk = tcf_mirred_walker,
299 .lookup = tcf_mirred_search, 338 .lookup = tcf_mirred_search,
300 .size = sizeof(struct tcf_mirred), 339 .size = sizeof(struct tcf_mirred),
340 .get_dev = tcf_mirred_device,
301}; 341};
302 342
303static __net_init int mirred_init_net(struct net *net) 343static __net_init int mirred_init_net(struct net *net)
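
The mirred rework above keys all mac-header handling off two booleans: where the packet was caught (skb_at_tc_ingress()) and where the action wants to send it (tcf_mirred_act_wants_ingress()). A minimal sketch of that fix-up on the clone, using only calls shown in the hunks; the wrapper function itself is illustrative:

	static void mirred_fixup_mac(const struct sk_buff *skb, struct sk_buff *skb2,
				     bool at_ingress, bool want_ingress,
				     bool mac_header_xmit)
	{
		/* same direction, or the target takes no mac header: nothing to do */
		if (at_ingress == want_ingress || !mac_header_xmit)
			return;

		if (!at_ingress) {
			/* caught at egress, acting toward ingress: strip the mac header */
			int mac_len = skb_network_header(skb) - skb_mac_header(skb);

			skb_pull_rcsum(skb2, mac_len);
		} else {
			/* caught at ingress, acting toward egress: rebuild the mac header */
			skb_push_rcsum(skb2, skb->mac_len);
		}
	}

Delivery then follows the wanted direction: dev_queue_xmit() for egress targets, netif_receive_skb() for the new ingress targets.
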
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 8e8b0cc30704..9b6aec665495 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -31,7 +31,7 @@
31 31
32#define NAT_TAB_MASK 15 32#define NAT_TAB_MASK 15
33 33
34static int nat_net_id; 34static unsigned int nat_net_id;
35static struct tc_action_ops act_nat_ops; 35static struct tc_action_ops act_nat_ops;
36 36
37static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { 37static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index cf9b2fe8eac6..c1310472f620 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -22,26 +22,126 @@
22#include <net/pkt_sched.h> 22#include <net/pkt_sched.h>
23#include <linux/tc_act/tc_pedit.h> 23#include <linux/tc_act/tc_pedit.h>
24#include <net/tc_act/tc_pedit.h> 24#include <net/tc_act/tc_pedit.h>
25#include <uapi/linux/tc_act/tc_pedit.h>
25 26
26#define PEDIT_TAB_MASK 15 27#define PEDIT_TAB_MASK 15
27 28
28static int pedit_net_id; 29static unsigned int pedit_net_id;
29static struct tc_action_ops act_pedit_ops; 30static struct tc_action_ops act_pedit_ops;
30 31
31static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { 32static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
32 [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, 33 [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) },
34 [TCA_PEDIT_KEYS_EX] = { .type = NLA_NESTED },
33}; 35};
34 36
37static const struct nla_policy pedit_key_ex_policy[TCA_PEDIT_KEY_EX_MAX + 1] = {
38 [TCA_PEDIT_KEY_EX_HTYPE] = { .type = NLA_U16 },
39 [TCA_PEDIT_KEY_EX_CMD] = { .type = NLA_U16 },
40};
41
42static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla,
43 u8 n)
44{
45 struct tcf_pedit_key_ex *keys_ex;
46 struct tcf_pedit_key_ex *k;
47 const struct nlattr *ka;
48 int err = -EINVAL;
49 int rem;
50
51 if (!nla || !n)
52 return NULL;
53
54 keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL);
55 if (!keys_ex)
56 return ERR_PTR(-ENOMEM);
57
58 k = keys_ex;
59
60 nla_for_each_nested(ka, nla, rem) {
61 struct nlattr *tb[TCA_PEDIT_KEY_EX_MAX + 1];
62
63 if (!n) {
64 err = -EINVAL;
65 goto err_out;
66 }
67 n--;
68
69 if (nla_type(ka) != TCA_PEDIT_KEY_EX) {
70 err = -EINVAL;
71 goto err_out;
72 }
73
74 err = nla_parse_nested(tb, TCA_PEDIT_KEY_EX_MAX, ka,
75 pedit_key_ex_policy);
76 if (err)
77 goto err_out;
78
79 if (!tb[TCA_PEDIT_KEY_EX_HTYPE] ||
80 !tb[TCA_PEDIT_KEY_EX_CMD]) {
81 err = -EINVAL;
82 goto err_out;
83 }
84
85 k->htype = nla_get_u16(tb[TCA_PEDIT_KEY_EX_HTYPE]);
86 k->cmd = nla_get_u16(tb[TCA_PEDIT_KEY_EX_CMD]);
87
88 if (k->htype > TCA_PEDIT_HDR_TYPE_MAX ||
89 k->cmd > TCA_PEDIT_CMD_MAX) {
90 err = -EINVAL;
91 goto err_out;
92 }
93
94 k++;
95 }
96
97 if (n)
98 goto err_out;
99
100 return keys_ex;
101
102err_out:
103 kfree(keys_ex);
104 return ERR_PTR(err);
105}
106
107static int tcf_pedit_key_ex_dump(struct sk_buff *skb,
108 struct tcf_pedit_key_ex *keys_ex, int n)
109{
110 struct nlattr *keys_start = nla_nest_start(skb, TCA_PEDIT_KEYS_EX);
111
112 for (; n > 0; n--) {
113 struct nlattr *key_start;
114
115 key_start = nla_nest_start(skb, TCA_PEDIT_KEY_EX);
116
117 if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype) ||
118 nla_put_u16(skb, TCA_PEDIT_KEY_EX_CMD, keys_ex->cmd)) {
119 nlmsg_trim(skb, keys_start);
120 return -EINVAL;
121 }
122
123 nla_nest_end(skb, key_start);
124
125 keys_ex++;
126 }
127
128 nla_nest_end(skb, keys_start);
129
130 return 0;
131}
132
35static int tcf_pedit_init(struct net *net, struct nlattr *nla, 133static int tcf_pedit_init(struct net *net, struct nlattr *nla,
36 struct nlattr *est, struct tc_action **a, 134 struct nlattr *est, struct tc_action **a,
37 int ovr, int bind) 135 int ovr, int bind)
38{ 136{
39 struct tc_action_net *tn = net_generic(net, pedit_net_id); 137 struct tc_action_net *tn = net_generic(net, pedit_net_id);
40 struct nlattr *tb[TCA_PEDIT_MAX + 1]; 138 struct nlattr *tb[TCA_PEDIT_MAX + 1];
139 struct nlattr *pattr;
41 struct tc_pedit *parm; 140 struct tc_pedit *parm;
42 int ret = 0, err; 141 int ret = 0, err;
43 struct tcf_pedit *p; 142 struct tcf_pedit *p;
44 struct tc_pedit_key *keys = NULL; 143 struct tc_pedit_key *keys = NULL;
144 struct tcf_pedit_key_ex *keys_ex;
45 int ksize; 145 int ksize;
46 146
47 if (nla == NULL) 147 if (nla == NULL)
@@ -51,13 +151,21 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
51 if (err < 0) 151 if (err < 0)
52 return err; 152 return err;
53 153
54 if (tb[TCA_PEDIT_PARMS] == NULL) 154 pattr = tb[TCA_PEDIT_PARMS];
155 if (!pattr)
156 pattr = tb[TCA_PEDIT_PARMS_EX];
157 if (!pattr)
55 return -EINVAL; 158 return -EINVAL;
56 parm = nla_data(tb[TCA_PEDIT_PARMS]); 159
160 parm = nla_data(pattr);
57 ksize = parm->nkeys * sizeof(struct tc_pedit_key); 161 ksize = parm->nkeys * sizeof(struct tc_pedit_key);
58 if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize) 162 if (nla_len(pattr) < sizeof(*parm) + ksize)
59 return -EINVAL; 163 return -EINVAL;
60 164
165 keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys);
166 if (IS_ERR(keys_ex))
167 return PTR_ERR(keys_ex);
168
61 if (!tcf_hash_check(tn, parm->index, a, bind)) { 169 if (!tcf_hash_check(tn, parm->index, a, bind)) {
62 if (!parm->nkeys) 170 if (!parm->nkeys)
63 return -EINVAL; 171 return -EINVAL;
@@ -69,6 +177,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
69 keys = kmalloc(ksize, GFP_KERNEL); 177 keys = kmalloc(ksize, GFP_KERNEL);
70 if (keys == NULL) { 178 if (keys == NULL) {
71 tcf_hash_cleanup(*a, est); 179 tcf_hash_cleanup(*a, est);
180 kfree(keys_ex);
72 return -ENOMEM; 181 return -ENOMEM;
73 } 182 }
74 ret = ACT_P_CREATED; 183 ret = ACT_P_CREATED;
@@ -81,8 +190,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
81 p = to_pedit(*a); 190 p = to_pedit(*a);
82 if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { 191 if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {
83 keys = kmalloc(ksize, GFP_KERNEL); 192 keys = kmalloc(ksize, GFP_KERNEL);
84 if (keys == NULL) 193 if (!keys) {
194 kfree(keys_ex);
85 return -ENOMEM; 195 return -ENOMEM;
196 }
86 } 197 }
87 } 198 }
88 199
@@ -95,6 +206,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
95 p->tcfp_nkeys = parm->nkeys; 206 p->tcfp_nkeys = parm->nkeys;
96 } 207 }
97 memcpy(p->tcfp_keys, parm->keys, ksize); 208 memcpy(p->tcfp_keys, parm->keys, ksize);
209
210 kfree(p->tcfp_keys_ex);
211 p->tcfp_keys_ex = keys_ex;
212
98 spin_unlock_bh(&p->tcf_lock); 213 spin_unlock_bh(&p->tcf_lock);
99 if (ret == ACT_P_CREATED) 214 if (ret == ACT_P_CREATED)
100 tcf_hash_insert(tn, *a); 215 tcf_hash_insert(tn, *a);
@@ -106,6 +221,7 @@ static void tcf_pedit_cleanup(struct tc_action *a, int bind)
106 struct tcf_pedit *p = to_pedit(a); 221 struct tcf_pedit *p = to_pedit(a);
107 struct tc_pedit_key *keys = p->tcfp_keys; 222 struct tc_pedit_key *keys = p->tcfp_keys;
108 kfree(keys); 223 kfree(keys);
224 kfree(p->tcfp_keys_ex);
109} 225}
110 226
111static bool offset_valid(struct sk_buff *skb, int offset) 227static bool offset_valid(struct sk_buff *skb, int offset)
@@ -119,38 +235,88 @@ static bool offset_valid(struct sk_buff *skb, int offset)
119 return true; 235 return true;
120} 236}
121 237
238static int pedit_skb_hdr_offset(struct sk_buff *skb,
239 enum pedit_header_type htype, int *hoffset)
240{
241 int ret = -EINVAL;
242
243 switch (htype) {
244 case TCA_PEDIT_KEY_EX_HDR_TYPE_ETH:
245 if (skb_mac_header_was_set(skb)) {
246 *hoffset = skb_mac_offset(skb);
247 ret = 0;
248 }
249 break;
250 case TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK:
251 case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4:
252 case TCA_PEDIT_KEY_EX_HDR_TYPE_IP6:
253 *hoffset = skb_network_offset(skb);
254 ret = 0;
255 break;
256 case TCA_PEDIT_KEY_EX_HDR_TYPE_TCP:
257 case TCA_PEDIT_KEY_EX_HDR_TYPE_UDP:
258 if (skb_transport_header_was_set(skb)) {
259 *hoffset = skb_transport_offset(skb);
260 ret = 0;
261 }
262 break;
263 default:
264 ret = -EINVAL;
265 break;
266 };
267
268 return ret;
269}
270
122static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, 271static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
123 struct tcf_result *res) 272 struct tcf_result *res)
124{ 273{
125 struct tcf_pedit *p = to_pedit(a); 274 struct tcf_pedit *p = to_pedit(a);
126 int i; 275 int i;
127 unsigned int off;
128 276
129 if (skb_unclone(skb, GFP_ATOMIC)) 277 if (skb_unclone(skb, GFP_ATOMIC))
130 return p->tcf_action; 278 return p->tcf_action;
131 279
132 off = skb_network_offset(skb);
133
134 spin_lock(&p->tcf_lock); 280 spin_lock(&p->tcf_lock);
135 281
136 tcf_lastuse_update(&p->tcf_tm); 282 tcf_lastuse_update(&p->tcf_tm);
137 283
138 if (p->tcfp_nkeys > 0) { 284 if (p->tcfp_nkeys > 0) {
139 struct tc_pedit_key *tkey = p->tcfp_keys; 285 struct tc_pedit_key *tkey = p->tcfp_keys;
286 struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex;
287 enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
288 enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET;
140 289
141 for (i = p->tcfp_nkeys; i > 0; i--, tkey++) { 290 for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
142 u32 *ptr, _data; 291 u32 *ptr, _data;
143 int offset = tkey->off; 292 int offset = tkey->off;
293 int hoffset;
294 u32 val;
295 int rc;
296
297 if (tkey_ex) {
298 htype = tkey_ex->htype;
299 cmd = tkey_ex->cmd;
300
301 tkey_ex++;
302 }
303
304 rc = pedit_skb_hdr_offset(skb, htype, &hoffset);
305 if (rc) {
306 pr_info("tc filter pedit bad header type specified (0x%x)\n",
307 htype);
308 goto bad;
309 }
144 310
145 if (tkey->offmask) { 311 if (tkey->offmask) {
146 char *d, _d; 312 char *d, _d;
147 313
148 if (!offset_valid(skb, off + tkey->at)) { 314 if (!offset_valid(skb, hoffset + tkey->at)) {
149 pr_info("tc filter pedit 'at' offset %d out of bounds\n", 315 pr_info("tc filter pedit 'at' offset %d out of bounds\n",
150 off + tkey->at); 316 hoffset + tkey->at);
151 goto bad; 317 goto bad;
152 } 318 }
153 d = skb_header_pointer(skb, off + tkey->at, 1, 319 d = skb_header_pointer(skb, hoffset + tkey->at, 1,
154 &_d); 320 &_d);
155 if (!d) 321 if (!d)
156 goto bad; 322 goto bad;
@@ -163,19 +329,32 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
163 goto bad; 329 goto bad;
164 } 330 }
165 331
166 if (!offset_valid(skb, off + offset)) { 332 if (!offset_valid(skb, hoffset + offset)) {
167 pr_info("tc filter pedit offset %d out of bounds\n", 333 pr_info("tc filter pedit offset %d out of bounds\n",
168 offset); 334 hoffset + offset);
169 goto bad; 335 goto bad;
170 } 336 }
171 337
172 ptr = skb_header_pointer(skb, off + offset, 4, &_data); 338 ptr = skb_header_pointer(skb, hoffset + offset, 4, &_data);
173 if (!ptr) 339 if (!ptr)
174 goto bad; 340 goto bad;
175 /* just do it, baby */ 341 /* just do it, baby */
176 *ptr = ((*ptr & tkey->mask) ^ tkey->val); 342 switch (cmd) {
343 case TCA_PEDIT_KEY_EX_CMD_SET:
344 val = tkey->val;
345 break;
346 case TCA_PEDIT_KEY_EX_CMD_ADD:
347 val = (*ptr + tkey->val) & ~tkey->mask;
348 break;
349 default:
350 pr_info("tc filter pedit bad command (%d)\n",
351 cmd);
352 goto bad;
353 }
354
355 *ptr = ((*ptr & tkey->mask) ^ val);
177 if (ptr == &_data) 356 if (ptr == &_data)
178 skb_store_bits(skb, off + offset, ptr, 4); 357 skb_store_bits(skb, hoffset + offset, ptr, 4);
179 } 358 }
180 359
181 goto done; 360 goto done;
@@ -215,8 +394,15 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
215 opt->refcnt = p->tcf_refcnt - ref; 394 opt->refcnt = p->tcf_refcnt - ref;
216 opt->bindcnt = p->tcf_bindcnt - bind; 395 opt->bindcnt = p->tcf_bindcnt - bind;
217 396
218 if (nla_put(skb, TCA_PEDIT_PARMS, s, opt)) 397 if (p->tcfp_keys_ex) {
219 goto nla_put_failure; 398 tcf_pedit_key_ex_dump(skb, p->tcfp_keys_ex, p->tcfp_nkeys);
399
400 if (nla_put(skb, TCA_PEDIT_PARMS_EX, s, opt))
401 goto nla_put_failure;
402 } else {
403 if (nla_put(skb, TCA_PEDIT_PARMS, s, opt))
404 goto nla_put_failure;
405 }
220 406
221 tcf_tm_dump(&t, &p->tcf_tm); 407 tcf_tm_dump(&t, &p->tcf_tm);
222 if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) 408 if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD))
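
With the extended keys, tcf_pedit() above derives the value it writes from the per-key command before applying the existing mask/xor update. A minimal sketch of that per-word arithmetic, mirroring the hunk (the wrapper function is illustrative only):

	static u32 pedit_apply_key(u32 word, u32 key_val, u32 key_mask,
				   enum pedit_cmd cmd)
	{
		u32 val;

		switch (cmd) {
		case TCA_PEDIT_KEY_EX_CMD_SET:
			val = key_val;				/* legacy behaviour */
			break;
		case TCA_PEDIT_KEY_EX_CMD_ADD:
			val = (word + key_val) & ~key_mask;	/* keep only bits not retained by the mask */
			break;
		default:
			return word;				/* unknown command: leave the word untouched */
		}

		return (word & key_mask) ^ val;			/* same final expression as before */
	}

The header-type part of the extended key feeds pedit_skb_hdr_offset(), so offsets are now taken relative to the mac, network or transport header rather than always the network header.
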
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index d1bd248fe146..0ba91d1ce994 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -55,7 +55,7 @@ struct tc_police_compat {
55 55
56/* Each policer is serialized by its individual spinlock */ 56/* Each policer is serialized by its individual spinlock */
57 57
58static int police_net_id; 58static unsigned int police_net_id;
59static struct tc_action_ops act_police_ops; 59static struct tc_action_ops act_police_ops;
60 60
61static int tcf_act_police_walker(struct net *net, struct sk_buff *skb, 61static int tcf_act_police_walker(struct net *net, struct sk_buff *skb,
@@ -142,8 +142,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
142 goto failure_unlock; 142 goto failure_unlock;
143 } else if (tb[TCA_POLICE_AVRATE] && 143 } else if (tb[TCA_POLICE_AVRATE] &&
144 (ret == ACT_P_CREATED || 144 (ret == ACT_P_CREATED ||
145 !gen_estimator_active(&police->tcf_bstats, 145 !gen_estimator_active(&police->tcf_rate_est))) {
146 &police->tcf_rate_est))) {
147 err = -EINVAL; 146 err = -EINVAL;
148 goto failure_unlock; 147 goto failure_unlock;
149 } 148 }
@@ -216,13 +215,17 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
216 bstats_update(&police->tcf_bstats, skb); 215 bstats_update(&police->tcf_bstats, skb);
217 tcf_lastuse_update(&police->tcf_tm); 216 tcf_lastuse_update(&police->tcf_tm);
218 217
219 if (police->tcfp_ewma_rate && 218 if (police->tcfp_ewma_rate) {
220 police->tcf_rate_est.bps >= police->tcfp_ewma_rate) { 219 struct gnet_stats_rate_est64 sample;
221 police->tcf_qstats.overlimits++; 220
222 if (police->tcf_action == TC_ACT_SHOT) 221 if (!gen_estimator_read(&police->tcf_rate_est, &sample) ||
223 police->tcf_qstats.drops++; 222 sample.bps >= police->tcfp_ewma_rate) {
224 spin_unlock(&police->tcf_lock); 223 police->tcf_qstats.overlimits++;
225 return police->tcf_action; 224 if (police->tcf_action == TC_ACT_SHOT)
225 police->tcf_qstats.drops++;
226 spin_unlock(&police->tcf_lock);
227 return police->tcf_action;
228 }
226 } 229 }
227 230
228 if (qdisc_pkt_len(skb) <= police->tcfp_mtu) { 231 if (qdisc_pkt_len(skb) <= police->tcfp_mtu) {
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
new file mode 100644
index 000000000000..0b8217b4763f
--- /dev/null
+++ b/net/sched/act_sample.c
@@ -0,0 +1,276 @@
1/*
2 * net/sched/act_sample.c - Packet sampling tc action
3 * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/types.h>
11#include <linux/kernel.h>
12#include <linux/string.h>
13#include <linux/errno.h>
14#include <linux/skbuff.h>
15#include <linux/rtnetlink.h>
16#include <linux/module.h>
17#include <linux/init.h>
18#include <linux/gfp.h>
19#include <net/net_namespace.h>
20#include <net/netlink.h>
21#include <net/pkt_sched.h>
22#include <linux/tc_act/tc_sample.h>
23#include <net/tc_act/tc_sample.h>
24#include <net/psample.h>
25
26#include <linux/if_arp.h>
27
28#define SAMPLE_TAB_MASK 7
29static unsigned int sample_net_id;
30static struct tc_action_ops act_sample_ops;
31
32static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
33 [TCA_SAMPLE_PARMS] = { .len = sizeof(struct tc_sample) },
34 [TCA_SAMPLE_RATE] = { .type = NLA_U32 },
35 [TCA_SAMPLE_TRUNC_SIZE] = { .type = NLA_U32 },
36 [TCA_SAMPLE_PSAMPLE_GROUP] = { .type = NLA_U32 },
37};
38
39static int tcf_sample_init(struct net *net, struct nlattr *nla,
40 struct nlattr *est, struct tc_action **a, int ovr,
41 int bind)
42{
43 struct tc_action_net *tn = net_generic(net, sample_net_id);
44 struct nlattr *tb[TCA_SAMPLE_MAX + 1];
45 struct psample_group *psample_group;
46 struct tc_sample *parm;
47 struct tcf_sample *s;
48 bool exists = false;
49 int ret;
50
51 if (!nla)
52 return -EINVAL;
53 ret = nla_parse_nested(tb, TCA_SAMPLE_MAX, nla, sample_policy);
54 if (ret < 0)
55 return ret;
56 if (!tb[TCA_SAMPLE_PARMS] || !tb[TCA_SAMPLE_RATE] ||
57 !tb[TCA_SAMPLE_PSAMPLE_GROUP])
58 return -EINVAL;
59
60 parm = nla_data(tb[TCA_SAMPLE_PARMS]);
61
62 exists = tcf_hash_check(tn, parm->index, a, bind);
63 if (exists && bind)
64 return 0;
65
66 if (!exists) {
67 ret = tcf_hash_create(tn, parm->index, est, a,
68 &act_sample_ops, bind, false);
69 if (ret)
70 return ret;
71 ret = ACT_P_CREATED;
72 } else {
73 tcf_hash_release(*a, bind);
74 if (!ovr)
75 return -EEXIST;
76 }
77 s = to_sample(*a);
78
79 s->tcf_action = parm->action;
80 s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
81 s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]);
82 psample_group = psample_group_get(net, s->psample_group_num);
83 if (!psample_group) {
84 if (ret == ACT_P_CREATED)
85 tcf_hash_release(*a, bind);
86 return -ENOMEM;
87 }
88 RCU_INIT_POINTER(s->psample_group, psample_group);
89
90 if (tb[TCA_SAMPLE_TRUNC_SIZE]) {
91 s->truncate = true;
92 s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]);
93 }
94
95 if (ret == ACT_P_CREATED)
96 tcf_hash_insert(tn, *a);
97 return ret;
98}
99
100static void tcf_sample_cleanup_rcu(struct rcu_head *rcu)
101{
102 struct tcf_sample *s = container_of(rcu, struct tcf_sample, rcu);
103 struct psample_group *psample_group;
104
105 psample_group = rcu_dereference_protected(s->psample_group, 1);
106 RCU_INIT_POINTER(s->psample_group, NULL);
107 psample_group_put(psample_group);
108}
109
110static void tcf_sample_cleanup(struct tc_action *a, int bind)
111{
112 struct tcf_sample *s = to_sample(a);
113
114 call_rcu(&s->rcu, tcf_sample_cleanup_rcu);
115}
116
117static bool tcf_sample_dev_ok_push(struct net_device *dev)
118{
119 switch (dev->type) {
120 case ARPHRD_TUNNEL:
121 case ARPHRD_TUNNEL6:
122 case ARPHRD_SIT:
123 case ARPHRD_IPGRE:
124 case ARPHRD_VOID:
125 case ARPHRD_NONE:
126 return false;
127 default:
128 return true;
129 }
130}
131
132static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a,
133 struct tcf_result *res)
134{
135 struct tcf_sample *s = to_sample(a);
136 struct psample_group *psample_group;
137 int retval;
138 int size;
139 int iif;
140 int oif;
141
142 tcf_lastuse_update(&s->tcf_tm);
143 bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb);
144 retval = READ_ONCE(s->tcf_action);
145
146 rcu_read_lock();
147 psample_group = rcu_dereference(s->psample_group);
148
149 /* randomly sample packets according to rate */
150 if (psample_group && (prandom_u32() % s->rate == 0)) {
151 if (!skb_at_tc_ingress(skb)) {
152 iif = skb->skb_iif;
153 oif = skb->dev->ifindex;
154 } else {
155 iif = skb->dev->ifindex;
156 oif = 0;
157 }
158
159 /* on ingress, the mac header gets popped, so push it back */
160 if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
161 skb_push(skb, skb->mac_len);
162
163 size = s->truncate ? s->trunc_size : skb->len;
164 psample_sample_packet(psample_group, skb, size, iif, oif,
165 s->rate);
166
167 if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
168 skb_pull(skb, skb->mac_len);
169 }
170
171 rcu_read_unlock();
172 return retval;
173}
174
175static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a,
176 int bind, int ref)
177{
178 unsigned char *b = skb_tail_pointer(skb);
179 struct tcf_sample *s = to_sample(a);
180 struct tc_sample opt = {
181 .index = s->tcf_index,
182 .action = s->tcf_action,
183 .refcnt = s->tcf_refcnt - ref,
184 .bindcnt = s->tcf_bindcnt - bind,
185 };
186 struct tcf_t t;
187
188 if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt))
189 goto nla_put_failure;
190
191 tcf_tm_dump(&t, &s->tcf_tm);
192 if (nla_put_64bit(skb, TCA_SAMPLE_TM, sizeof(t), &t, TCA_SAMPLE_PAD))
193 goto nla_put_failure;
194
195 if (nla_put_u32(skb, TCA_SAMPLE_RATE, s->rate))
196 goto nla_put_failure;
197
198 if (s->truncate)
199 if (nla_put_u32(skb, TCA_SAMPLE_TRUNC_SIZE, s->trunc_size))
200 goto nla_put_failure;
201
202 if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num))
203 goto nla_put_failure;
204 return skb->len;
205
206nla_put_failure:
207 nlmsg_trim(skb, b);
208 return -1;
209}
210
211static int tcf_sample_walker(struct net *net, struct sk_buff *skb,
212 struct netlink_callback *cb, int type,
213 const struct tc_action_ops *ops)
214{
215 struct tc_action_net *tn = net_generic(net, sample_net_id);
216
217 return tcf_generic_walker(tn, skb, cb, type, ops);
218}
219
220static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index)
221{
222 struct tc_action_net *tn = net_generic(net, sample_net_id);
223
224 return tcf_hash_search(tn, a, index);
225}
226
227static struct tc_action_ops act_sample_ops = {
228 .kind = "sample",
229 .type = TCA_ACT_SAMPLE,
230 .owner = THIS_MODULE,
231 .act = tcf_sample_act,
232 .dump = tcf_sample_dump,
233 .init = tcf_sample_init,
234 .cleanup = tcf_sample_cleanup,
235 .walk = tcf_sample_walker,
236 .lookup = tcf_sample_search,
237 .size = sizeof(struct tcf_sample),
238};
239
240static __net_init int sample_init_net(struct net *net)
241{
242 struct tc_action_net *tn = net_generic(net, sample_net_id);
243
244 return tc_action_net_init(tn, &act_sample_ops, SAMPLE_TAB_MASK);
245}
246
247static void __net_exit sample_exit_net(struct net *net)
248{
249 struct tc_action_net *tn = net_generic(net, sample_net_id);
250
251 tc_action_net_exit(tn);
252}
253
254static struct pernet_operations sample_net_ops = {
255 .init = sample_init_net,
256 .exit = sample_exit_net,
257 .id = &sample_net_id,
258 .size = sizeof(struct tc_action_net),
259};
260
261static int __init sample_init_module(void)
262{
263 return tcf_register_action(&act_sample_ops, &sample_net_ops);
264}
265
266static void __exit sample_cleanup_module(void)
267{
268 tcf_unregister_action(&act_sample_ops, &sample_net_ops);
269}
270
271module_init(sample_init_module);
272module_exit(sample_cleanup_module);
273
274MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>");
275MODULE_DESCRIPTION("Packet sampling action");
276MODULE_LICENSE("GPL v2");
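
The new sampler above makes a per-packet decision and defers the actual export to the psample module. A minimal sketch of that decision, using only the fields and the psample_sample_packet() call introduced in this file; the wrapper is illustrative:

	static void maybe_sample(struct tcf_sample *s, struct psample_group *group,
				 struct sk_buff *skb, int iif, int oif)
	{
		/* on average one packet in every s->rate is reported */
		if (prandom_u32() % s->rate != 0)
			return;

		/* report the whole packet, or only trunc_size bytes when truncation is on */
		psample_sample_packet(group, skb,
				      s->truncate ? s->trunc_size : skb->len,
				      iif, oif, s->rate);
	}

On ingress the mac header is pushed back before sampling (and pulled again afterwards) so the exported frame looks the same regardless of which hook it was taken from.
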
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 289af6f9bb3b..823a73ad0c60 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -26,7 +26,7 @@
26 26
27#define SIMP_TAB_MASK 7 27#define SIMP_TAB_MASK 7
28 28
29static int simp_net_id; 29static unsigned int simp_net_id;
30static struct tc_action_ops act_simp_ops; 30static struct tc_action_ops act_simp_ops;
31 31
32#define SIMP_MAX_DATA 32 32#define SIMP_MAX_DATA 32
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index a133dcb82132..06ccae3c12ee 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -29,7 +29,7 @@
29 29
30#define SKBEDIT_TAB_MASK 15 30#define SKBEDIT_TAB_MASK 15
31 31
32static int skbedit_net_id; 32static unsigned int skbedit_net_id;
33static struct tc_action_ops act_skbedit_ops; 33static struct tc_action_ops act_skbedit_ops;
34 34
35static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, 35static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
@@ -46,8 +46,10 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
46 if (d->flags & SKBEDIT_F_QUEUE_MAPPING && 46 if (d->flags & SKBEDIT_F_QUEUE_MAPPING &&
47 skb->dev->real_num_tx_queues > d->queue_mapping) 47 skb->dev->real_num_tx_queues > d->queue_mapping)
48 skb_set_queue_mapping(skb, d->queue_mapping); 48 skb_set_queue_mapping(skb, d->queue_mapping);
49 if (d->flags & SKBEDIT_F_MARK) 49 if (d->flags & SKBEDIT_F_MARK) {
50 skb->mark = d->mark; 50 skb->mark &= ~d->mask;
51 skb->mark |= d->mark & d->mask;
52 }
51 if (d->flags & SKBEDIT_F_PTYPE) 53 if (d->flags & SKBEDIT_F_PTYPE)
52 skb->pkt_type = d->ptype; 54 skb->pkt_type = d->ptype;
53 55
@@ -61,6 +63,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
61 [TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) }, 63 [TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) },
62 [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) }, 64 [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) },
63 [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) }, 65 [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) },
66 [TCA_SKBEDIT_MASK] = { .len = sizeof(u32) },
64}; 67};
65 68
66static int tcf_skbedit_init(struct net *net, struct nlattr *nla, 69static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
@@ -71,7 +74,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
71 struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; 74 struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
72 struct tc_skbedit *parm; 75 struct tc_skbedit *parm;
73 struct tcf_skbedit *d; 76 struct tcf_skbedit *d;
74 u32 flags = 0, *priority = NULL, *mark = NULL; 77 u32 flags = 0, *priority = NULL, *mark = NULL, *mask = NULL;
75 u16 *queue_mapping = NULL, *ptype = NULL; 78 u16 *queue_mapping = NULL, *ptype = NULL;
76 bool exists = false; 79 bool exists = false;
77 int ret = 0, err; 80 int ret = 0, err;
@@ -108,6 +111,11 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
108 mark = nla_data(tb[TCA_SKBEDIT_MARK]); 111 mark = nla_data(tb[TCA_SKBEDIT_MARK]);
109 } 112 }
110 113
114 if (tb[TCA_SKBEDIT_MASK] != NULL) {
115 flags |= SKBEDIT_F_MASK;
116 mask = nla_data(tb[TCA_SKBEDIT_MASK]);
117 }
118
111 parm = nla_data(tb[TCA_SKBEDIT_PARMS]); 119 parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
112 120
113 exists = tcf_hash_check(tn, parm->index, a, bind); 121 exists = tcf_hash_check(tn, parm->index, a, bind);
@@ -145,6 +153,10 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
145 d->mark = *mark; 153 d->mark = *mark;
146 if (flags & SKBEDIT_F_PTYPE) 154 if (flags & SKBEDIT_F_PTYPE)
147 d->ptype = *ptype; 155 d->ptype = *ptype;
156 /* default behaviour is to use all the bits */
157 d->mask = 0xffffffff;
158 if (flags & SKBEDIT_F_MASK)
159 d->mask = *mask;
148 160
149 d->tcf_action = parm->action; 161 d->tcf_action = parm->action;
150 162
@@ -182,6 +194,9 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
182 if ((d->flags & SKBEDIT_F_PTYPE) && 194 if ((d->flags & SKBEDIT_F_PTYPE) &&
183 nla_put_u16(skb, TCA_SKBEDIT_PTYPE, d->ptype)) 195 nla_put_u16(skb, TCA_SKBEDIT_PTYPE, d->ptype))
184 goto nla_put_failure; 196 goto nla_put_failure;
197 if ((d->flags & SKBEDIT_F_MASK) &&
198 nla_put_u32(skb, TCA_SKBEDIT_MASK, d->mask))
199 goto nla_put_failure;
185 200
186 tcf_tm_dump(&t, &d->tcf_tm); 201 tcf_tm_dump(&t, &d->tcf_tm);
187 if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) 202 if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD))
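
The skbedit change above turns the mark edit into a masked update, with the mask defaulting to all ones so existing users keep the old overwrite semantics. A minimal sketch of the update, equivalent to the two statements in the hunk:

	static u32 skbedit_apply_mark(u32 old_mark, u32 new_mark, u32 mask)
	{
		old_mark &= ~mask;		/* clear only the bits covered by the mask */
		old_mark |= new_mark & mask;	/* and take those bits from the new mark */
		return old_mark;
	}
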
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index e7d96381c908..c736627f8f4a 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -22,7 +22,7 @@
22 22
23#define SKBMOD_TAB_MASK 15 23#define SKBMOD_TAB_MASK 15
24 24
25static int skbmod_net_id; 25static unsigned int skbmod_net_id;
26static struct tc_action_ops act_skbmod_ops; 26static struct tc_action_ops act_skbmod_ops;
27 27
28#define MAX_EDIT_LEN ETH_HLEN 28#define MAX_EDIT_LEN ETH_HLEN
@@ -228,7 +228,6 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
228 228
229 return skb->len; 229 return skb->len;
230nla_put_failure: 230nla_put_failure:
231 rcu_read_unlock();
232 nlmsg_trim(skb, b); 231 nlmsg_trim(skb, b);
233 return -1; 232 return -1;
234} 233}
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index af47bdf2f483..e3a58e021198 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -16,14 +16,13 @@
16#include <net/netlink.h> 16#include <net/netlink.h>
17#include <net/pkt_sched.h> 17#include <net/pkt_sched.h>
18#include <net/dst.h> 18#include <net/dst.h>
19#include <net/dst_metadata.h>
20 19
21#include <linux/tc_act/tc_tunnel_key.h> 20#include <linux/tc_act/tc_tunnel_key.h>
22#include <net/tc_act/tc_tunnel_key.h> 21#include <net/tc_act/tc_tunnel_key.h>
23 22
24#define TUNNEL_KEY_TAB_MASK 15 23#define TUNNEL_KEY_TAB_MASK 15
25 24
26static int tunnel_key_net_id; 25static unsigned int tunnel_key_net_id;
27static struct tc_action_ops act_tunnel_key_ops; 26static struct tc_action_ops act_tunnel_key_ops;
28 27
29static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, 28static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
@@ -67,6 +66,7 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = {
67 [TCA_TUNNEL_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, 66 [TCA_TUNNEL_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) },
68 [TCA_TUNNEL_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) }, 67 [TCA_TUNNEL_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) },
69 [TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 }, 68 [TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 },
69 [TCA_TUNNEL_KEY_ENC_DST_PORT] = {.type = NLA_U16},
70}; 70};
71 71
72static int tunnel_key_init(struct net *net, struct nlattr *nla, 72static int tunnel_key_init(struct net *net, struct nlattr *nla,
@@ -81,6 +81,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
81 struct tc_tunnel_key *parm; 81 struct tc_tunnel_key *parm;
82 struct tcf_tunnel_key *t; 82 struct tcf_tunnel_key *t;
83 bool exists = false; 83 bool exists = false;
84 __be16 dst_port = 0;
84 __be64 key_id; 85 __be64 key_id;
85 int ret = 0; 86 int ret = 0;
86 int err; 87 int err;
@@ -111,6 +112,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
111 112
112 key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID])); 113 key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]));
113 114
115 if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT])
116 dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]);
117
114 if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] && 118 if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] &&
115 tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) { 119 tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) {
116 __be32 saddr; 120 __be32 saddr;
@@ -120,7 +124,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
120 daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]); 124 daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]);
121 125
122 metadata = __ip_tun_set_dst(saddr, daddr, 0, 0, 126 metadata = __ip_tun_set_dst(saddr, daddr, 0, 0,
123 TUNNEL_KEY, key_id, 0); 127 dst_port, TUNNEL_KEY,
128 key_id, 0);
124 } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] && 129 } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] &&
125 tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) { 130 tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) {
126 struct in6_addr saddr; 131 struct in6_addr saddr;
@@ -129,8 +134,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
129 saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]); 134 saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]);
130 daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]); 135 daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]);
131 136
132 metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, 0, 137 metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, dst_port,
133 TUNNEL_KEY, key_id, 0); 138 0, TUNNEL_KEY,
139 key_id, 0);
134 } 140 }
135 141
136 if (!metadata) { 142 if (!metadata) {
@@ -258,7 +264,8 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
258 264
259 if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) || 265 if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) ||
260 tunnel_key_dump_addresses(skb, 266 tunnel_key_dump_addresses(skb,
261 &params->tcft_enc_metadata->u.tun_info)) 267 &params->tcft_enc_metadata->u.tun_info) ||
268 nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst))
262 goto nla_put_failure; 269 goto nla_put_failure;
263 } 270 }
264 271
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index b57fcbcefea1..19e0dba305ce 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -21,7 +21,7 @@
21 21
22#define VLAN_TAB_MASK 15 22#define VLAN_TAB_MASK 15
23 23
24static int vlan_net_id; 24static unsigned int vlan_net_id;
25static struct tc_action_ops act_vlan_ops; 25static struct tc_action_ops act_vlan_ops;
26 26
27static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, 27static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index b05d4a2155b0..732f7cae459d 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -19,6 +19,7 @@
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/string.h> 20#include <linux/string.h>
21#include <linux/errno.h> 21#include <linux/errno.h>
22#include <linux/err.h>
22#include <linux/skbuff.h> 23#include <linux/skbuff.h>
23#include <linux/init.h> 24#include <linux/init.h>
24#include <linux/kmod.h> 25#include <linux/kmod.h>
@@ -38,14 +39,14 @@ static DEFINE_RWLOCK(cls_mod_lock);
38 39
39/* Find classifier type by string name */ 40/* Find classifier type by string name */
40 41
41static const struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind) 42static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind)
42{ 43{
43 const struct tcf_proto_ops *t, *res = NULL; 44 const struct tcf_proto_ops *t, *res = NULL;
44 45
45 if (kind) { 46 if (kind) {
46 read_lock(&cls_mod_lock); 47 read_lock(&cls_mod_lock);
47 list_for_each_entry(t, &tcf_proto_base, head) { 48 list_for_each_entry(t, &tcf_proto_base, head) {
48 if (nla_strcmp(kind, t->kind) == 0) { 49 if (strcmp(kind, t->kind) == 0) {
49 if (try_module_get(t->owner)) 50 if (try_module_get(t->owner))
50 res = t; 51 res = t;
51 break; 52 break;
@@ -127,6 +128,77 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
127 return first; 128 return first;
128} 129}
129 130
131static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
132 u32 prio, u32 parent, struct Qdisc *q)
133{
134 struct tcf_proto *tp;
135 int err;
136
137 tp = kzalloc(sizeof(*tp), GFP_KERNEL);
138 if (!tp)
139 return ERR_PTR(-ENOBUFS);
140
141 err = -ENOENT;
142 tp->ops = tcf_proto_lookup_ops(kind);
143 if (!tp->ops) {
144#ifdef CONFIG_MODULES
145 rtnl_unlock();
146 request_module("cls_%s", kind);
147 rtnl_lock();
148 tp->ops = tcf_proto_lookup_ops(kind);
149 /* We dropped the RTNL semaphore in order to perform
150 * the module load. So, even if we succeeded in loading
151 * the module we have to replay the request. We indicate
152 * this using -EAGAIN.
153 */
154 if (tp->ops) {
155 module_put(tp->ops->owner);
156 err = -EAGAIN;
157 } else {
158 err = -ENOENT;
159 }
160 goto errout;
161#endif
162 }
163 tp->classify = tp->ops->classify;
164 tp->protocol = protocol;
165 tp->prio = prio;
166 tp->classid = parent;
167 tp->q = q;
168
169 err = tp->ops->init(tp);
170 if (err) {
171 module_put(tp->ops->owner);
172 goto errout;
173 }
174 return tp;
175
176errout:
177 kfree(tp);
178 return ERR_PTR(err);
179}
180
181static bool tcf_proto_destroy(struct tcf_proto *tp, bool force)
182{
183 if (tp->ops->destroy(tp, force)) {
184 module_put(tp->ops->owner);
185 kfree_rcu(tp, rcu);
186 return true;
187 }
188 return false;
189}
190
191void tcf_destroy_chain(struct tcf_proto __rcu **fl)
192{
193 struct tcf_proto *tp;
194
195 while ((tp = rtnl_dereference(*fl)) != NULL) {
196 RCU_INIT_POINTER(*fl, tp->next);
197 tcf_proto_destroy(tp, true);
198 }
199}
200EXPORT_SYMBOL(tcf_destroy_chain);
201
130/* Add/change/delete/get a filter node */ 202/* Add/change/delete/get a filter node */
131 203
132static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n) 204static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
@@ -142,19 +214,21 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
142 struct Qdisc *q; 214 struct Qdisc *q;
143 struct tcf_proto __rcu **back; 215 struct tcf_proto __rcu **back;
144 struct tcf_proto __rcu **chain; 216 struct tcf_proto __rcu **chain;
217 struct tcf_proto *next;
145 struct tcf_proto *tp; 218 struct tcf_proto *tp;
146 const struct tcf_proto_ops *tp_ops;
147 const struct Qdisc_class_ops *cops; 219 const struct Qdisc_class_ops *cops;
148 unsigned long cl; 220 unsigned long cl;
149 unsigned long fh; 221 unsigned long fh;
150 int err; 222 int err;
151 int tp_created = 0; 223 int tp_created;
152 224
153 if ((n->nlmsg_type != RTM_GETTFILTER) && 225 if ((n->nlmsg_type != RTM_GETTFILTER) &&
154 !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) 226 !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
155 return -EPERM; 227 return -EPERM;
156 228
157replay: 229replay:
230 tp_created = 0;
231
158 err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL); 232 err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
159 if (err < 0) 233 if (err < 0)
160 return err; 234 return err;
@@ -220,9 +294,10 @@ replay:
220 294
221 /* And the last stroke */ 295 /* And the last stroke */
222 chain = cops->tcf_chain(q, cl); 296 chain = cops->tcf_chain(q, cl);
223 err = -EINVAL; 297 if (chain == NULL) {
224 if (chain == NULL) 298 err = -EINVAL;
225 goto errout; 299 goto errout;
300 }
226 if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) { 301 if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) {
227 tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER); 302 tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER);
228 tcf_destroy_chain(chain); 303 tcf_destroy_chain(chain);
@@ -237,10 +312,13 @@ replay:
237 if (tp->prio >= prio) { 312 if (tp->prio >= prio) {
238 if (tp->prio == prio) { 313 if (tp->prio == prio) {
239 if (!nprio || 314 if (!nprio ||
240 (tp->protocol != protocol && protocol)) 315 (tp->protocol != protocol && protocol)) {
316 err = -EINVAL;
241 goto errout; 317 goto errout;
242 } else 318 }
319 } else {
243 tp = NULL; 320 tp = NULL;
321 }
244 break; 322 break;
245 } 323 }
246 } 324 }
@@ -248,109 +326,69 @@ replay:
248 if (tp == NULL) { 326 if (tp == NULL) {
249 /* Proto-tcf does not exist, create new one */ 327 /* Proto-tcf does not exist, create new one */
250 328
251 if (tca[TCA_KIND] == NULL || !protocol) 329 if (tca[TCA_KIND] == NULL || !protocol) {
330 err = -EINVAL;
252 goto errout; 331 goto errout;
332 }
253 333
254 err = -ENOENT;
255 if (n->nlmsg_type != RTM_NEWTFILTER || 334 if (n->nlmsg_type != RTM_NEWTFILTER ||
256 !(n->nlmsg_flags & NLM_F_CREATE)) 335 !(n->nlmsg_flags & NLM_F_CREATE)) {
336 err = -ENOENT;
257 goto errout; 337 goto errout;
338 }
258 339
340 if (!nprio)
341 nprio = TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back)));
259 342
260 /* Create new proto tcf */ 343 tp = tcf_proto_create(nla_data(tca[TCA_KIND]),
261 344 protocol, nprio, parent, q);
262 err = -ENOBUFS; 345 if (IS_ERR(tp)) {
263 tp = kzalloc(sizeof(*tp), GFP_KERNEL); 346 err = PTR_ERR(tp);
264 if (tp == NULL)
265 goto errout;
266 err = -ENOENT;
267 tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]);
268 if (tp_ops == NULL) {
269#ifdef CONFIG_MODULES
270 struct nlattr *kind = tca[TCA_KIND];
271 char name[IFNAMSIZ];
272
273 if (kind != NULL &&
274 nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
275 rtnl_unlock();
276 request_module("cls_%s", name);
277 rtnl_lock();
278 tp_ops = tcf_proto_lookup_ops(kind);
279 /* We dropped the RTNL semaphore in order to
280 * perform the module load. So, even if we
281 * succeeded in loading the module we have to
282 * replay the request. We indicate this using
283 * -EAGAIN.
284 */
285 if (tp_ops != NULL) {
286 module_put(tp_ops->owner);
287 err = -EAGAIN;
288 }
289 }
290#endif
291 kfree(tp);
292 goto errout; 347 goto errout;
293 } 348 }
294 tp->ops = tp_ops;
295 tp->protocol = protocol;
296 tp->prio = nprio ? :
297 TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back)));
298 tp->q = q;
299 tp->classify = tp_ops->classify;
300 tp->classid = parent;
301
302 err = tp_ops->init(tp);
303 if (err != 0) {
304 module_put(tp_ops->owner);
305 kfree(tp);
306 goto errout;
307 }
308
309 tp_created = 1; 349 tp_created = 1;
310 350 } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
311 } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) 351 err = -EINVAL;
312 goto errout; 352 goto errout;
353 }
313 354
314 fh = tp->ops->get(tp, t->tcm_handle); 355 fh = tp->ops->get(tp, t->tcm_handle);
315 356
316 if (fh == 0) { 357 if (fh == 0) {
317 if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { 358 if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
318 struct tcf_proto *next = rtnl_dereference(tp->next); 359 next = rtnl_dereference(tp->next);
319
320 RCU_INIT_POINTER(*back, next); 360 RCU_INIT_POINTER(*back, next);
321
322 tfilter_notify(net, skb, n, tp, fh, 361 tfilter_notify(net, skb, n, tp, fh,
323 RTM_DELTFILTER, false); 362 RTM_DELTFILTER, false);
324 tcf_destroy(tp, true); 363 tcf_proto_destroy(tp, true);
325 err = 0; 364 err = 0;
326 goto errout; 365 goto errout;
327 } 366 }
328 367
329 err = -ENOENT;
330 if (n->nlmsg_type != RTM_NEWTFILTER || 368 if (n->nlmsg_type != RTM_NEWTFILTER ||
331 !(n->nlmsg_flags & NLM_F_CREATE)) 369 !(n->nlmsg_flags & NLM_F_CREATE)) {
370 err = -ENOENT;
332 goto errout; 371 goto errout;
372 }
333 } else { 373 } else {
334 switch (n->nlmsg_type) { 374 switch (n->nlmsg_type) {
335 case RTM_NEWTFILTER: 375 case RTM_NEWTFILTER:
336 err = -EEXIST;
337 if (n->nlmsg_flags & NLM_F_EXCL) { 376 if (n->nlmsg_flags & NLM_F_EXCL) {
338 if (tp_created) 377 if (tp_created)
339 tcf_destroy(tp, true); 378 tcf_proto_destroy(tp, true);
379 err = -EEXIST;
340 goto errout; 380 goto errout;
341 } 381 }
342 break; 382 break;
343 case RTM_DELTFILTER: 383 case RTM_DELTFILTER:
344 err = tp->ops->delete(tp, fh); 384 err = tp->ops->delete(tp, fh);
345 if (err == 0) { 385 if (err)
346 struct tcf_proto *next = rtnl_dereference(tp->next); 386 goto errout;
347 387 next = rtnl_dereference(tp->next);
348 tfilter_notify(net, skb, n, tp, 388 tfilter_notify(net, skb, n, tp, t->tcm_handle,
349 t->tcm_handle, 389 RTM_DELTFILTER, false);
350 RTM_DELTFILTER, false); 390 if (tcf_proto_destroy(tp, false))
351 if (tcf_destroy(tp, false)) 391 RCU_INIT_POINTER(*back, next);
352 RCU_INIT_POINTER(*back, next);
353 }
354 goto errout; 392 goto errout;
355 case RTM_GETTFILTER: 393 case RTM_GETTFILTER:
356 err = tfilter_notify(net, skb, n, tp, fh, 394 err = tfilter_notify(net, skb, n, tp, fh,
@@ -372,7 +410,7 @@ replay:
372 tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false); 410 tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false);
373 } else { 411 } else {
374 if (tp_created) 412 if (tp_created)
375 tcf_destroy(tp, true); 413 tcf_proto_destroy(tp, true);
376 } 414 }
377 415
378errout: 416errout:
@@ -682,6 +720,30 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
682} 720}
683EXPORT_SYMBOL(tcf_exts_dump_stats); 721EXPORT_SYMBOL(tcf_exts_dump_stats);
684 722
723int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts,
724 struct net_device **hw_dev)
725{
726#ifdef CONFIG_NET_CLS_ACT
727 const struct tc_action *a;
728 LIST_HEAD(actions);
729
730 if (tc_no_actions(exts))
731 return -EINVAL;
732
733 tcf_exts_to_list(exts, &actions);
734 list_for_each_entry(a, &actions, list) {
735 if (a->ops->get_dev) {
736 a->ops->get_dev(a, dev_net(dev), hw_dev);
737 break;
738 }
739 }
740 if (*hw_dev)
741 return 0;
742#endif
743 return -EOPNOTSUPP;
744}
745EXPORT_SYMBOL(tcf_exts_get_dev);
746
685static int __init tc_filter_init(void) 747static int __init tc_filter_init(void)
686{ 748{
687 rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL); 749 rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL);
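
The cls_api refactor above also wires the new ->get_dev() action callback (implemented by mirred as tcf_mirred_device()) into tcf_exts_get_dev(), so classifiers doing hardware offload can ask a filter's actions for the target netdevice. A minimal usage sketch, assuming a filter's tcf_exts in "exts" and its uplink device in "dev"; the wrapper function is illustrative:

	static int report_offload_dev(struct net_device *dev, struct tcf_exts *exts)
	{
		struct net_device *hw_dev = NULL;
		int err;

		err = tcf_exts_get_dev(dev, exts, &hw_dev);
		if (err)
			return err;	/* no action exposes a target device */

		/* e.g. the port a mirred redirect points at */
		pr_debug("offload target is %s\n", hw_dev->name);
		return 0;
	}
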
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 0a47ba5e6109..80f688436dd7 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -45,10 +45,7 @@ struct cls_bpf_prog {
45 u32 gen_flags; 45 u32 gen_flags;
46 struct tcf_exts exts; 46 struct tcf_exts exts;
47 u32 handle; 47 u32 handle;
48 union { 48 u16 bpf_num_ops;
49 u32 bpf_fd;
50 u16 bpf_num_ops;
51 };
52 struct sock_filter *bpf_ops; 49 struct sock_filter *bpf_ops;
53 const char *bpf_name; 50 const char *bpf_name;
54 struct tcf_proto *tp; 51 struct tcf_proto *tp;
@@ -151,6 +148,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
151 struct net_device *dev = tp->q->dev_queue->dev; 148 struct net_device *dev = tp->q->dev_queue->dev;
152 struct tc_cls_bpf_offload bpf_offload = {}; 149 struct tc_cls_bpf_offload bpf_offload = {};
153 struct tc_to_netdev offload; 150 struct tc_to_netdev offload;
151 int err;
154 152
155 offload.type = TC_SETUP_CLSBPF; 153 offload.type = TC_SETUP_CLSBPF;
156 offload.cls_bpf = &bpf_offload; 154 offload.cls_bpf = &bpf_offload;
@@ -162,8 +160,13 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
162 bpf_offload.exts_integrated = prog->exts_integrated; 160 bpf_offload.exts_integrated = prog->exts_integrated;
163 bpf_offload.gen_flags = prog->gen_flags; 161 bpf_offload.gen_flags = prog->gen_flags;
164 162
165 return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, 163 err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
166 tp->protocol, &offload); 164 tp->protocol, &offload);
165
166 if (!err && (cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE))
167 prog->gen_flags |= TCA_CLS_FLAGS_IN_HW;
168
169 return err;
167} 170}
168 171
169static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, 172static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog,
@@ -244,7 +247,7 @@ static int cls_bpf_init(struct tcf_proto *tp)
244 return 0; 247 return 0;
245} 248}
246 249
247static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog) 250static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog)
248{ 251{
249 tcf_exts_destroy(&prog->exts); 252 tcf_exts_destroy(&prog->exts);
250 253
@@ -258,22 +261,22 @@ static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog)
258 kfree(prog); 261 kfree(prog);
259} 262}
260 263
261static void __cls_bpf_delete_prog(struct rcu_head *rcu) 264static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu)
262{ 265{
263 struct cls_bpf_prog *prog = container_of(rcu, struct cls_bpf_prog, rcu); 266 __cls_bpf_delete_prog(container_of(rcu, struct cls_bpf_prog, rcu));
264
265 cls_bpf_delete_prog(prog->tp, prog);
266} 267}
267 268
268static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) 269static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog)
269{ 270{
270 struct cls_bpf_prog *prog = (struct cls_bpf_prog *) arg;
271
272 cls_bpf_stop_offload(tp, prog); 271 cls_bpf_stop_offload(tp, prog);
273 list_del_rcu(&prog->link); 272 list_del_rcu(&prog->link);
274 tcf_unbind_filter(tp, &prog->res); 273 tcf_unbind_filter(tp, &prog->res);
275 call_rcu(&prog->rcu, __cls_bpf_delete_prog); 274 call_rcu(&prog->rcu, cls_bpf_delete_prog_rcu);
275}
276 276
277static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
278{
279 __cls_bpf_delete(tp, (struct cls_bpf_prog *) arg);
277 return 0; 280 return 0;
278} 281}
279 282
@@ -285,12 +288,8 @@ static bool cls_bpf_destroy(struct tcf_proto *tp, bool force)
285 if (!force && !list_empty(&head->plist)) 288 if (!force && !list_empty(&head->plist))
286 return false; 289 return false;
287 290
288 list_for_each_entry_safe(prog, tmp, &head->plist, link) { 291 list_for_each_entry_safe(prog, tmp, &head->plist, link)
289 cls_bpf_stop_offload(tp, prog); 292 __cls_bpf_delete(tp, prog);
290 list_del_rcu(&prog->link);
291 tcf_unbind_filter(tp, &prog->res);
292 call_rcu(&prog->rcu, __cls_bpf_delete_prog);
293 }
294 293
295 kfree_rcu(head, rcu); 294 kfree_rcu(head, rcu);
296 return true; 295 return true;
@@ -365,9 +364,7 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
365 return PTR_ERR(fp); 364 return PTR_ERR(fp);
366 365
367 if (tb[TCA_BPF_NAME]) { 366 if (tb[TCA_BPF_NAME]) {
368 name = kmemdup(nla_data(tb[TCA_BPF_NAME]), 367 name = nla_memdup(tb[TCA_BPF_NAME], GFP_KERNEL);
369 nla_len(tb[TCA_BPF_NAME]),
370 GFP_KERNEL);
371 if (!name) { 368 if (!name) {
372 bpf_prog_put(fp); 369 bpf_prog_put(fp);
373 return -ENOMEM; 370 return -ENOMEM;
@@ -375,7 +372,6 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
375 } 372 }
376 373
377 prog->bpf_ops = NULL; 374 prog->bpf_ops = NULL;
378 prog->bpf_fd = bpf_fd;
379 prog->bpf_name = name; 375 prog->bpf_name = name;
380 prog->filter = fp; 376 prog->filter = fp;
381 377
@@ -517,14 +513,17 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
517 513
518 ret = cls_bpf_offload(tp, prog, oldprog); 514 ret = cls_bpf_offload(tp, prog, oldprog);
519 if (ret) { 515 if (ret) {
520 cls_bpf_delete_prog(tp, prog); 516 __cls_bpf_delete_prog(prog);
521 return ret; 517 return ret;
522 } 518 }
523 519
520 if (!tc_in_hw(prog->gen_flags))
521 prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW;
522
524 if (oldprog) { 523 if (oldprog) {
525 list_replace_rcu(&oldprog->link, &prog->link); 524 list_replace_rcu(&oldprog->link, &prog->link);
526 tcf_unbind_filter(tp, &oldprog->res); 525 tcf_unbind_filter(tp, &oldprog->res);
527 call_rcu(&oldprog->rcu, __cls_bpf_delete_prog); 526 call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu);
528 } else { 527 } else {
529 list_add_rcu(&prog->link, &head->plist); 528 list_add_rcu(&prog->link, &head->plist);
530 } 529 }
@@ -559,13 +558,18 @@ static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog,
559static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog, 558static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog,
560 struct sk_buff *skb) 559 struct sk_buff *skb)
561{ 560{
562 if (nla_put_u32(skb, TCA_BPF_FD, prog->bpf_fd)) 561 struct nlattr *nla;
563 return -EMSGSIZE;
564 562
565 if (prog->bpf_name && 563 if (prog->bpf_name &&
566 nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name)) 564 nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name))
567 return -EMSGSIZE; 565 return -EMSGSIZE;
568 566
567 nla = nla_reserve(skb, TCA_BPF_TAG, sizeof(prog->filter->tag));
568 if (nla == NULL)
569 return -EMSGSIZE;
570
571 memcpy(nla_data(nla), prog->filter->tag, nla_len(nla));
572
569 return 0; 573 return 0;
570} 574}
571 575
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 6575aba87630..3d6b9286c203 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -129,7 +129,7 @@ static u32 flow_get_mark(const struct sk_buff *skb)
129static u32 flow_get_nfct(const struct sk_buff *skb) 129static u32 flow_get_nfct(const struct sk_buff *skb)
130{ 130{
131#if IS_ENABLED(CONFIG_NF_CONNTRACK) 131#if IS_ENABLED(CONFIG_NF_CONNTRACK)
132 return addr_fold(skb->nfct); 132 return addr_fold(skb_nfct(skb));
133#else 133#else
134 return 0; 134 return 0;
135#endif 135#endif
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 904442421db3..9d0c99d2e9fb 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -39,11 +39,14 @@ struct fl_flow_key {
39 struct flow_dissector_key_ipv6_addrs ipv6; 39 struct flow_dissector_key_ipv6_addrs ipv6;
40 }; 40 };
41 struct flow_dissector_key_ports tp; 41 struct flow_dissector_key_ports tp;
42 struct flow_dissector_key_icmp icmp;
43 struct flow_dissector_key_arp arp;
42 struct flow_dissector_key_keyid enc_key_id; 44 struct flow_dissector_key_keyid enc_key_id;
43 union { 45 union {
44 struct flow_dissector_key_ipv4_addrs enc_ipv4; 46 struct flow_dissector_key_ipv4_addrs enc_ipv4;
45 struct flow_dissector_key_ipv6_addrs enc_ipv6; 47 struct flow_dissector_key_ipv6_addrs enc_ipv6;
46 }; 48 };
49 struct flow_dissector_key_ports enc_tp;
47} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ 50} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
48 51
49struct fl_flow_mask_range { 52struct fl_flow_mask_range {
@@ -81,6 +84,8 @@ struct cls_fl_filter {
81 u32 handle; 84 u32 handle;
82 u32 flags; 85 u32 flags;
83 struct rcu_head rcu; 86 struct rcu_head rcu;
87 struct tc_to_netdev tc;
88 struct net_device *hw_dev;
84}; 89};
85 90
86static unsigned short int fl_mask_range(const struct fl_flow_mask *mask) 91static unsigned short int fl_mask_range(const struct fl_flow_mask *mask)
@@ -129,6 +134,14 @@ static void fl_clear_masked_range(struct fl_flow_key *key,
129 memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask)); 134 memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask));
130} 135}
131 136
137static struct cls_fl_filter *fl_lookup(struct cls_fl_head *head,
138 struct fl_flow_key *mkey)
139{
140 return rhashtable_lookup_fast(&head->ht,
141 fl_key_get_start(mkey, &head->mask),
142 head->ht_params);
143}
144
132static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, 145static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
133 struct tcf_result *res) 146 struct tcf_result *res)
134{ 147{
@@ -149,16 +162,22 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
149 162
150 switch (ip_tunnel_info_af(info)) { 163 switch (ip_tunnel_info_af(info)) {
151 case AF_INET: 164 case AF_INET:
165 skb_key.enc_control.addr_type =
166 FLOW_DISSECTOR_KEY_IPV4_ADDRS;
152 skb_key.enc_ipv4.src = key->u.ipv4.src; 167 skb_key.enc_ipv4.src = key->u.ipv4.src;
153 skb_key.enc_ipv4.dst = key->u.ipv4.dst; 168 skb_key.enc_ipv4.dst = key->u.ipv4.dst;
154 break; 169 break;
155 case AF_INET6: 170 case AF_INET6:
171 skb_key.enc_control.addr_type =
172 FLOW_DISSECTOR_KEY_IPV6_ADDRS;
156 skb_key.enc_ipv6.src = key->u.ipv6.src; 173 skb_key.enc_ipv6.src = key->u.ipv6.src;
157 skb_key.enc_ipv6.dst = key->u.ipv6.dst; 174 skb_key.enc_ipv6.dst = key->u.ipv6.dst;
158 break; 175 break;
159 } 176 }
160 177
161 skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id); 178 skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id);
179 skb_key.enc_tp.src = key->tp_src;
180 skb_key.enc_tp.dst = key->tp_dst;
162 } 181 }
163 182
164 skb_key.indev_ifindex = skb->skb_iif; 183 skb_key.indev_ifindex = skb->skb_iif;
@@ -170,9 +189,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
170 189
171 fl_set_masked_key(&skb_mkey, &skb_key, &head->mask); 190 fl_set_masked_key(&skb_mkey, &skb_key, &head->mask);
172 191
173 f = rhashtable_lookup_fast(&head->ht, 192 f = fl_lookup(head, &skb_mkey);
174 fl_key_get_start(&skb_mkey, &head->mask),
175 head->ht_params);
176 if (f && !tc_skip_sw(f->flags)) { 193 if (f && !tc_skip_sw(f->flags)) {
177 *res = f->res; 194 *res = f->res;
178 return tcf_exts_exec(skb, &f->exts, res); 195 return tcf_exts_exec(skb, &f->exts, res);
@@ -202,75 +219,95 @@ static void fl_destroy_filter(struct rcu_head *head)
202 kfree(f); 219 kfree(f);
203} 220}
204 221
205static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie) 222static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
206{ 223{
207 struct net_device *dev = tp->q->dev_queue->dev;
208 struct tc_cls_flower_offload offload = {0}; 224 struct tc_cls_flower_offload offload = {0};
209 struct tc_to_netdev tc; 225 struct net_device *dev = f->hw_dev;
226 struct tc_to_netdev *tc = &f->tc;
210 227
211 if (!tc_should_offload(dev, tp, 0)) 228 if (!tc_can_offload(dev, tp))
212 return; 229 return;
213 230
214 offload.command = TC_CLSFLOWER_DESTROY; 231 offload.command = TC_CLSFLOWER_DESTROY;
215 offload.cookie = cookie; 232 offload.prio = tp->prio;
233 offload.cookie = (unsigned long)f;
216 234
217 tc.type = TC_SETUP_CLSFLOWER; 235 tc->type = TC_SETUP_CLSFLOWER;
218 tc.cls_flower = &offload; 236 tc->cls_flower = &offload;
219 237
220 dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); 238 dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, tc);
221} 239}
222 240
223static int fl_hw_replace_filter(struct tcf_proto *tp, 241static int fl_hw_replace_filter(struct tcf_proto *tp,
224 struct flow_dissector *dissector, 242 struct flow_dissector *dissector,
225 struct fl_flow_key *mask, 243 struct fl_flow_key *mask,
226 struct fl_flow_key *key, 244 struct cls_fl_filter *f)
227 struct tcf_exts *actions,
228 unsigned long cookie, u32 flags)
229{ 245{
230 struct net_device *dev = tp->q->dev_queue->dev; 246 struct net_device *dev = tp->q->dev_queue->dev;
231 struct tc_cls_flower_offload offload = {0}; 247 struct tc_cls_flower_offload offload = {0};
232 struct tc_to_netdev tc; 248 struct tc_to_netdev *tc = &f->tc;
233 int err; 249 int err;
234 250
235 if (!tc_should_offload(dev, tp, flags)) 251 if (!tc_can_offload(dev, tp)) {
236 return tc_skip_sw(flags) ? -EINVAL : 0; 252 if (tcf_exts_get_dev(dev, &f->exts, &f->hw_dev) ||
253 (f->hw_dev && !tc_can_offload(f->hw_dev, tp))) {
254 f->hw_dev = dev;
255 return tc_skip_sw(f->flags) ? -EINVAL : 0;
256 }
257 dev = f->hw_dev;
258 tc->egress_dev = true;
259 } else {
260 f->hw_dev = dev;
261 }
237 262
238 offload.command = TC_CLSFLOWER_REPLACE; 263 offload.command = TC_CLSFLOWER_REPLACE;
239 offload.cookie = cookie; 264 offload.prio = tp->prio;
265 offload.cookie = (unsigned long)f;
240 offload.dissector = dissector; 266 offload.dissector = dissector;
241 offload.mask = mask; 267 offload.mask = mask;
242 offload.key = key; 268 offload.key = &f->mkey;
243 offload.exts = actions; 269 offload.exts = &f->exts;
244 270
245 tc.type = TC_SETUP_CLSFLOWER; 271 tc->type = TC_SETUP_CLSFLOWER;
246 tc.cls_flower = &offload; 272 tc->cls_flower = &offload;
247 273
248 err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, 274 err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
249 &tc); 275 tc);
276 if (!err)
277 f->flags |= TCA_CLS_FLAGS_IN_HW;
250 278
251 if (tc_skip_sw(flags)) 279 if (tc_skip_sw(f->flags))
252 return err; 280 return err;
253
254 return 0; 281 return 0;
255} 282}
256 283
257static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) 284static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
258{ 285{
259 struct net_device *dev = tp->q->dev_queue->dev;
260 struct tc_cls_flower_offload offload = {0}; 286 struct tc_cls_flower_offload offload = {0};
261 struct tc_to_netdev tc; 287 struct net_device *dev = f->hw_dev;
288 struct tc_to_netdev *tc = &f->tc;
262 289
263 if (!tc_should_offload(dev, tp, 0)) 290 if (!tc_can_offload(dev, tp))
264 return; 291 return;
265 292
266 offload.command = TC_CLSFLOWER_STATS; 293 offload.command = TC_CLSFLOWER_STATS;
294 offload.prio = tp->prio;
267 offload.cookie = (unsigned long)f; 295 offload.cookie = (unsigned long)f;
268 offload.exts = &f->exts; 296 offload.exts = &f->exts;
269 297
270 tc.type = TC_SETUP_CLSFLOWER; 298 tc->type = TC_SETUP_CLSFLOWER;
271 tc.cls_flower = &offload; 299 tc->cls_flower = &offload;
300
301 dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, tc);
302}
272 303
273 dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); 304static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f)
305{
306 list_del_rcu(&f->list);
307 if (!tc_skip_hw(f->flags))
308 fl_hw_destroy_filter(tp, f);
309 tcf_unbind_filter(tp, &f->res);
310 call_rcu(&f->rcu, fl_destroy_filter);
274} 311}
275 312
276static void fl_destroy_sleepable(struct work_struct *work) 313static void fl_destroy_sleepable(struct work_struct *work)
@@ -299,14 +336,12 @@ static bool fl_destroy(struct tcf_proto *tp, bool force)
299 if (!force && !list_empty(&head->filters)) 336 if (!force && !list_empty(&head->filters))
300 return false; 337 return false;
301 338
302 list_for_each_entry_safe(f, next, &head->filters, list) { 339 list_for_each_entry_safe(f, next, &head->filters, list)
303 fl_hw_destroy_filter(tp, (unsigned long)f); 340 __fl_delete(tp, f);
304 list_del_rcu(&f->list);
305 call_rcu(&f->rcu, fl_destroy_filter);
306 }
307 341
308 __module_get(THIS_MODULE); 342 __module_get(THIS_MODULE);
309 call_rcu(&head->rcu, fl_destroy_rcu); 343 call_rcu(&head->rcu, fl_destroy_rcu);
344
310 return true; 345 return true;
311} 346}
312 347
@@ -360,6 +395,34 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
360 [TCA_FLOWER_KEY_TCP_DST_MASK] = { .type = NLA_U16 }, 395 [TCA_FLOWER_KEY_TCP_DST_MASK] = { .type = NLA_U16 },
361 [TCA_FLOWER_KEY_UDP_SRC_MASK] = { .type = NLA_U16 }, 396 [TCA_FLOWER_KEY_UDP_SRC_MASK] = { .type = NLA_U16 },
362 [TCA_FLOWER_KEY_UDP_DST_MASK] = { .type = NLA_U16 }, 397 [TCA_FLOWER_KEY_UDP_DST_MASK] = { .type = NLA_U16 },
398 [TCA_FLOWER_KEY_SCTP_SRC_MASK] = { .type = NLA_U16 },
399 [TCA_FLOWER_KEY_SCTP_DST_MASK] = { .type = NLA_U16 },
400 [TCA_FLOWER_KEY_SCTP_SRC] = { .type = NLA_U16 },
401 [TCA_FLOWER_KEY_SCTP_DST] = { .type = NLA_U16 },
402 [TCA_FLOWER_KEY_ENC_UDP_SRC_PORT] = { .type = NLA_U16 },
403 [TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK] = { .type = NLA_U16 },
404 [TCA_FLOWER_KEY_ENC_UDP_DST_PORT] = { .type = NLA_U16 },
405 [TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK] = { .type = NLA_U16 },
406 [TCA_FLOWER_KEY_FLAGS] = { .type = NLA_U32 },
407 [TCA_FLOWER_KEY_FLAGS_MASK] = { .type = NLA_U32 },
408 [TCA_FLOWER_KEY_ICMPV4_TYPE] = { .type = NLA_U8 },
409 [TCA_FLOWER_KEY_ICMPV4_TYPE_MASK] = { .type = NLA_U8 },
410 [TCA_FLOWER_KEY_ICMPV4_CODE] = { .type = NLA_U8 },
411 [TCA_FLOWER_KEY_ICMPV4_CODE_MASK] = { .type = NLA_U8 },
412 [TCA_FLOWER_KEY_ICMPV6_TYPE] = { .type = NLA_U8 },
413 [TCA_FLOWER_KEY_ICMPV6_TYPE_MASK] = { .type = NLA_U8 },
414 [TCA_FLOWER_KEY_ICMPV6_CODE] = { .type = NLA_U8 },
415 [TCA_FLOWER_KEY_ICMPV6_CODE_MASK] = { .type = NLA_U8 },
416 [TCA_FLOWER_KEY_ARP_SIP] = { .type = NLA_U32 },
417 [TCA_FLOWER_KEY_ARP_SIP_MASK] = { .type = NLA_U32 },
418 [TCA_FLOWER_KEY_ARP_TIP] = { .type = NLA_U32 },
419 [TCA_FLOWER_KEY_ARP_TIP_MASK] = { .type = NLA_U32 },
420 [TCA_FLOWER_KEY_ARP_OP] = { .type = NLA_U8 },
421 [TCA_FLOWER_KEY_ARP_OP_MASK] = { .type = NLA_U8 },
422 [TCA_FLOWER_KEY_ARP_SHA] = { .len = ETH_ALEN },
423 [TCA_FLOWER_KEY_ARP_SHA_MASK] = { .len = ETH_ALEN },
424 [TCA_FLOWER_KEY_ARP_THA] = { .len = ETH_ALEN },
425 [TCA_FLOWER_KEY_ARP_THA_MASK] = { .len = ETH_ALEN },
363}; 426};
364 427
365static void fl_set_key_val(struct nlattr **tb, 428static void fl_set_key_val(struct nlattr **tb,
@@ -394,10 +457,43 @@ static void fl_set_key_vlan(struct nlattr **tb,
394 } 457 }
395} 458}
396 459
460static void fl_set_key_flag(u32 flower_key, u32 flower_mask,
461 u32 *dissector_key, u32 *dissector_mask,
462 u32 flower_flag_bit, u32 dissector_flag_bit)
463{
464 if (flower_mask & flower_flag_bit) {
465 *dissector_mask |= dissector_flag_bit;
466 if (flower_key & flower_flag_bit)
467 *dissector_key |= dissector_flag_bit;
468 }
469}
470
471static int fl_set_key_flags(struct nlattr **tb,
472 u32 *flags_key, u32 *flags_mask)
473{
474 u32 key, mask;
475
476 /* mask is mandatory for flags */
477 if (!tb[TCA_FLOWER_KEY_FLAGS_MASK])
478 return -EINVAL;
479
480 key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS]));
481 mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK]));
482
483 *flags_key = 0;
484 *flags_mask = 0;
485
486 fl_set_key_flag(key, mask, flags_key, flags_mask,
487 TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT);
488
489 return 0;
490}
491
397static int fl_set_key(struct net *net, struct nlattr **tb, 492static int fl_set_key(struct net *net, struct nlattr **tb,
398 struct fl_flow_key *key, struct fl_flow_key *mask) 493 struct fl_flow_key *key, struct fl_flow_key *mask)
399{ 494{
400 __be16 ethertype; 495 __be16 ethertype;
496 int ret = 0;
401#ifdef CONFIG_NET_CLS_IND 497#ifdef CONFIG_NET_CLS_IND
402 if (tb[TCA_FLOWER_INDEV]) { 498 if (tb[TCA_FLOWER_INDEV]) {
403 int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]); 499 int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]);
@@ -439,6 +535,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
439 535
440 if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) { 536 if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) {
441 key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; 537 key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
538 mask->control.addr_type = ~0;
442 fl_set_key_val(tb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC, 539 fl_set_key_val(tb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC,
443 &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK, 540 &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK,
444 sizeof(key->ipv4.src)); 541 sizeof(key->ipv4.src));
@@ -447,6 +544,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
447 sizeof(key->ipv4.dst)); 544 sizeof(key->ipv4.dst));
448 } else if (tb[TCA_FLOWER_KEY_IPV6_SRC] || tb[TCA_FLOWER_KEY_IPV6_DST]) { 545 } else if (tb[TCA_FLOWER_KEY_IPV6_SRC] || tb[TCA_FLOWER_KEY_IPV6_DST]) {
449 key->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 546 key->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
547 mask->control.addr_type = ~0;
450 fl_set_key_val(tb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC, 548 fl_set_key_val(tb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC,
451 &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK, 549 &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK,
452 sizeof(key->ipv6.src)); 550 sizeof(key->ipv6.src));
@@ -469,11 +567,56 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
469 fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, 567 fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST,
470 &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK, 568 &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK,
471 sizeof(key->tp.dst)); 569 sizeof(key->tp.dst));
570 } else if (key->basic.ip_proto == IPPROTO_SCTP) {
571 fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_SCTP_SRC,
572 &mask->tp.src, TCA_FLOWER_KEY_SCTP_SRC_MASK,
573 sizeof(key->tp.src));
574 fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_SCTP_DST,
575 &mask->tp.dst, TCA_FLOWER_KEY_SCTP_DST_MASK,
576 sizeof(key->tp.dst));
577 } else if (key->basic.n_proto == htons(ETH_P_IP) &&
578 key->basic.ip_proto == IPPROTO_ICMP) {
579 fl_set_key_val(tb, &key->icmp.type, TCA_FLOWER_KEY_ICMPV4_TYPE,
580 &mask->icmp.type,
581 TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,
582 sizeof(key->icmp.type));
583 fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV4_CODE,
584 &mask->icmp.code,
585 TCA_FLOWER_KEY_ICMPV4_CODE_MASK,
586 sizeof(key->icmp.code));
587 } else if (key->basic.n_proto == htons(ETH_P_IPV6) &&
588 key->basic.ip_proto == IPPROTO_ICMPV6) {
589 fl_set_key_val(tb, &key->icmp.type, TCA_FLOWER_KEY_ICMPV6_TYPE,
590 &mask->icmp.type,
591 TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,
592 sizeof(key->icmp.type));
593 fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV6_CODE,
594 &mask->icmp.code,
595 TCA_FLOWER_KEY_ICMPV6_CODE_MASK,
596 sizeof(key->icmp.code));
597 } else if (key->basic.n_proto == htons(ETH_P_ARP) ||
598 key->basic.n_proto == htons(ETH_P_RARP)) {
599 fl_set_key_val(tb, &key->arp.sip, TCA_FLOWER_KEY_ARP_SIP,
600 &mask->arp.sip, TCA_FLOWER_KEY_ARP_SIP_MASK,
601 sizeof(key->arp.sip));
602 fl_set_key_val(tb, &key->arp.tip, TCA_FLOWER_KEY_ARP_TIP,
603 &mask->arp.tip, TCA_FLOWER_KEY_ARP_TIP_MASK,
604 sizeof(key->arp.tip));
605 fl_set_key_val(tb, &key->arp.op, TCA_FLOWER_KEY_ARP_OP,
606 &mask->arp.op, TCA_FLOWER_KEY_ARP_OP_MASK,
607 sizeof(key->arp.op));
608 fl_set_key_val(tb, key->arp.sha, TCA_FLOWER_KEY_ARP_SHA,
609 mask->arp.sha, TCA_FLOWER_KEY_ARP_SHA_MASK,
610 sizeof(key->arp.sha));
611 fl_set_key_val(tb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA,
612 mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK,
613 sizeof(key->arp.tha));
472 } 614 }
473 615
474 if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] || 616 if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] ||
475 tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) { 617 tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) {
476 key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; 618 key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
619 mask->enc_control.addr_type = ~0;
477 fl_set_key_val(tb, &key->enc_ipv4.src, 620 fl_set_key_val(tb, &key->enc_ipv4.src,
478 TCA_FLOWER_KEY_ENC_IPV4_SRC, 621 TCA_FLOWER_KEY_ENC_IPV4_SRC,
479 &mask->enc_ipv4.src, 622 &mask->enc_ipv4.src,
@@ -489,6 +632,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
489 if (tb[TCA_FLOWER_KEY_ENC_IPV6_SRC] || 632 if (tb[TCA_FLOWER_KEY_ENC_IPV6_SRC] ||
490 tb[TCA_FLOWER_KEY_ENC_IPV6_DST]) { 633 tb[TCA_FLOWER_KEY_ENC_IPV6_DST]) {
491 key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 634 key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
635 mask->enc_control.addr_type = ~0;
492 fl_set_key_val(tb, &key->enc_ipv6.src, 636 fl_set_key_val(tb, &key->enc_ipv6.src,
493 TCA_FLOWER_KEY_ENC_IPV6_SRC, 637 TCA_FLOWER_KEY_ENC_IPV6_SRC,
494 &mask->enc_ipv6.src, 638 &mask->enc_ipv6.src,
@@ -505,7 +649,18 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
505 &mask->enc_key_id.keyid, TCA_FLOWER_UNSPEC, 649 &mask->enc_key_id.keyid, TCA_FLOWER_UNSPEC,
506 sizeof(key->enc_key_id.keyid)); 650 sizeof(key->enc_key_id.keyid));
507 651
508 return 0; 652 fl_set_key_val(tb, &key->enc_tp.src, TCA_FLOWER_KEY_ENC_UDP_SRC_PORT,
653 &mask->enc_tp.src, TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK,
654 sizeof(key->enc_tp.src));
655
656 fl_set_key_val(tb, &key->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT,
657 &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
658 sizeof(key->enc_tp.dst));
659
660 if (tb[TCA_FLOWER_KEY_FLAGS])
661 ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
662
663 return ret;
509} 664}
510 665
511static bool fl_mask_eq(struct fl_flow_mask *mask1, 666static bool fl_mask_eq(struct fl_flow_mask *mask1,
@@ -571,7 +726,23 @@ static void fl_init_dissector(struct cls_fl_head *head,
571 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, 726 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
572 FLOW_DISSECTOR_KEY_PORTS, tp); 727 FLOW_DISSECTOR_KEY_PORTS, tp);
573 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, 728 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
729 FLOW_DISSECTOR_KEY_ICMP, icmp);
730 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
731 FLOW_DISSECTOR_KEY_ARP, arp);
732 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
574 FLOW_DISSECTOR_KEY_VLAN, vlan); 733 FLOW_DISSECTOR_KEY_VLAN, vlan);
734 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
735 FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id);
736 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
737 FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, enc_ipv4);
738 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
739 FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, enc_ipv6);
740 if (FL_KEY_IS_MASKED(&mask->key, enc_ipv4) ||
741 FL_KEY_IS_MASKED(&mask->key, enc_ipv6))
742 FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_ENC_CONTROL,
743 enc_control);
744 FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
745 FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp);
575 746
576 skb_flow_dissector_init(&head->dissector, keys, cnt); 747 skb_flow_dissector_init(&head->dissector, keys, cnt);
577} 748}
@@ -666,23 +837,31 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
666 struct cls_fl_head *head = rtnl_dereference(tp->root); 837 struct cls_fl_head *head = rtnl_dereference(tp->root);
667 struct cls_fl_filter *fold = (struct cls_fl_filter *) *arg; 838 struct cls_fl_filter *fold = (struct cls_fl_filter *) *arg;
668 struct cls_fl_filter *fnew; 839 struct cls_fl_filter *fnew;
669 struct nlattr *tb[TCA_FLOWER_MAX + 1]; 840 struct nlattr **tb;
670 struct fl_flow_mask mask = {}; 841 struct fl_flow_mask mask = {};
671 int err; 842 int err;
672 843
673 if (!tca[TCA_OPTIONS]) 844 if (!tca[TCA_OPTIONS])
674 return -EINVAL; 845 return -EINVAL;
675 846
847 tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL);
848 if (!tb)
849 return -ENOBUFS;
850
676 err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], fl_policy); 851 err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], fl_policy);
677 if (err < 0) 852 if (err < 0)
678 return err; 853 goto errout_tb;
679 854
680 if (fold && handle && fold->handle != handle) 855 if (fold && handle && fold->handle != handle) {
681 return -EINVAL; 856 err = -EINVAL;
857 goto errout_tb;
858 }
682 859
683 fnew = kzalloc(sizeof(*fnew), GFP_KERNEL); 860 fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
684 if (!fnew) 861 if (!fnew) {
685 return -ENOBUFS; 862 err = -ENOBUFS;
863 goto errout_tb;
864 }
686 865
687 err = tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0); 866 err = tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0);
688 if (err < 0) 867 if (err < 0)
@@ -715,27 +894,35 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
715 goto errout; 894 goto errout;
716 895
717 if (!tc_skip_sw(fnew->flags)) { 896 if (!tc_skip_sw(fnew->flags)) {
897 if (!fold && fl_lookup(head, &fnew->mkey)) {
898 err = -EEXIST;
899 goto errout;
900 }
901
718 err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, 902 err = rhashtable_insert_fast(&head->ht, &fnew->ht_node,
719 head->ht_params); 903 head->ht_params);
720 if (err) 904 if (err)
721 goto errout; 905 goto errout;
722 } 906 }
723 907
724 err = fl_hw_replace_filter(tp, 908 if (!tc_skip_hw(fnew->flags)) {
725 &head->dissector, 909 err = fl_hw_replace_filter(tp,
726 &mask.key, 910 &head->dissector,
727 &fnew->key, 911 &mask.key,
728 &fnew->exts, 912 fnew);
729 (unsigned long)fnew, 913 if (err)
730 fnew->flags); 914 goto errout;
731 if (err) 915 }
732 goto errout; 916
917 if (!tc_in_hw(fnew->flags))
918 fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
733 919
734 if (fold) { 920 if (fold) {
735 if (!tc_skip_sw(fold->flags)) 921 if (!tc_skip_sw(fold->flags))
736 rhashtable_remove_fast(&head->ht, &fold->ht_node, 922 rhashtable_remove_fast(&head->ht, &fold->ht_node,
737 head->ht_params); 923 head->ht_params);
738 fl_hw_destroy_filter(tp, (unsigned long)fold); 924 if (!tc_skip_hw(fold->flags))
925 fl_hw_destroy_filter(tp, fold);
739 } 926 }
740 927
741 *arg = (unsigned long) fnew; 928 *arg = (unsigned long) fnew;
@@ -748,11 +935,14 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
748 list_add_tail_rcu(&fnew->list, &head->filters); 935 list_add_tail_rcu(&fnew->list, &head->filters);
749 } 936 }
750 937
938 kfree(tb);
751 return 0; 939 return 0;
752 940
753errout: 941errout:
754 tcf_exts_destroy(&fnew->exts); 942 tcf_exts_destroy(&fnew->exts);
755 kfree(fnew); 943 kfree(fnew);
944errout_tb:
945 kfree(tb);
756 return err; 946 return err;
757} 947}
758 948
@@ -764,10 +954,7 @@ static int fl_delete(struct tcf_proto *tp, unsigned long arg)
764 if (!tc_skip_sw(f->flags)) 954 if (!tc_skip_sw(f->flags))
765 rhashtable_remove_fast(&head->ht, &f->ht_node, 955 rhashtable_remove_fast(&head->ht, &f->ht_node,
766 head->ht_params); 956 head->ht_params);
767 list_del_rcu(&f->list); 957 __fl_delete(tp, f);
768 fl_hw_destroy_filter(tp, (unsigned long)f);
769 tcf_unbind_filter(tp, &f->res);
770 call_rcu(&f->rcu, fl_destroy_filter);
771 return 0; 958 return 0;
772} 959}
773 960
@@ -830,6 +1017,42 @@ static int fl_dump_key_vlan(struct sk_buff *skb,
830 return 0; 1017 return 0;
831} 1018}
832 1019
1020static void fl_get_key_flag(u32 dissector_key, u32 dissector_mask,
1021 u32 *flower_key, u32 *flower_mask,
1022 u32 flower_flag_bit, u32 dissector_flag_bit)
1023{
1024 if (dissector_mask & dissector_flag_bit) {
1025 *flower_mask |= flower_flag_bit;
1026 if (dissector_key & dissector_flag_bit)
1027 *flower_key |= flower_flag_bit;
1028 }
1029}
1030
1031static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask)
1032{
1033 u32 key, mask;
1034 __be32 _key, _mask;
1035 int err;
1036
1037 if (!memchr_inv(&flags_mask, 0, sizeof(flags_mask)))
1038 return 0;
1039
1040 key = 0;
1041 mask = 0;
1042
1043 fl_get_key_flag(flags_key, flags_mask, &key, &mask,
1044 TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT);
1045
1046 _key = cpu_to_be32(key);
1047 _mask = cpu_to_be32(mask);
1048
1049 err = nla_put(skb, TCA_FLOWER_KEY_FLAGS, 4, &_key);
1050 if (err)
1051 return err;
1052
1053 return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask);
1054}
1055
833static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, 1056static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
834 struct sk_buff *skb, struct tcmsg *t) 1057 struct sk_buff *skb, struct tcmsg *t)
835{ 1058{
@@ -862,7 +1085,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
862 goto nla_put_failure; 1085 goto nla_put_failure;
863 } 1086 }
864 1087
865 fl_hw_update_stats(tp, f); 1088 if (!tc_skip_hw(f->flags))
1089 fl_hw_update_stats(tp, f);
866 1090
867 if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, 1091 if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
868 mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, 1092 mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
@@ -918,6 +1142,57 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
918 &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK, 1142 &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK,
919 sizeof(key->tp.dst)))) 1143 sizeof(key->tp.dst))))
920 goto nla_put_failure; 1144 goto nla_put_failure;
1145 else if (key->basic.ip_proto == IPPROTO_SCTP &&
1146 (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_SCTP_SRC,
1147 &mask->tp.src, TCA_FLOWER_KEY_SCTP_SRC_MASK,
1148 sizeof(key->tp.src)) ||
1149 fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_SCTP_DST,
1150 &mask->tp.dst, TCA_FLOWER_KEY_SCTP_DST_MASK,
1151 sizeof(key->tp.dst))))
1152 goto nla_put_failure;
1153 else if (key->basic.n_proto == htons(ETH_P_IP) &&
1154 key->basic.ip_proto == IPPROTO_ICMP &&
1155 (fl_dump_key_val(skb, &key->icmp.type,
1156 TCA_FLOWER_KEY_ICMPV4_TYPE, &mask->icmp.type,
1157 TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,
1158 sizeof(key->icmp.type)) ||
1159 fl_dump_key_val(skb, &key->icmp.code,
1160 TCA_FLOWER_KEY_ICMPV4_CODE, &mask->icmp.code,
1161 TCA_FLOWER_KEY_ICMPV4_CODE_MASK,
1162 sizeof(key->icmp.code))))
1163 goto nla_put_failure;
1164 else if (key->basic.n_proto == htons(ETH_P_IPV6) &&
1165 key->basic.ip_proto == IPPROTO_ICMPV6 &&
1166 (fl_dump_key_val(skb, &key->icmp.type,
1167 TCA_FLOWER_KEY_ICMPV6_TYPE, &mask->icmp.type,
1168 TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,
1169 sizeof(key->icmp.type)) ||
1170 fl_dump_key_val(skb, &key->icmp.code,
1171 TCA_FLOWER_KEY_ICMPV6_CODE, &mask->icmp.code,
1172 TCA_FLOWER_KEY_ICMPV6_CODE_MASK,
1173 sizeof(key->icmp.code))))
1174 goto nla_put_failure;
1175 else if ((key->basic.n_proto == htons(ETH_P_ARP) ||
1176 key->basic.n_proto == htons(ETH_P_RARP)) &&
1177 (fl_dump_key_val(skb, &key->arp.sip,
1178 TCA_FLOWER_KEY_ARP_SIP, &mask->arp.sip,
1179 TCA_FLOWER_KEY_ARP_SIP_MASK,
1180 sizeof(key->arp.sip)) ||
1181 fl_dump_key_val(skb, &key->arp.tip,
1182 TCA_FLOWER_KEY_ARP_TIP, &mask->arp.tip,
1183 TCA_FLOWER_KEY_ARP_TIP_MASK,
1184 sizeof(key->arp.tip)) ||
1185 fl_dump_key_val(skb, &key->arp.op,
1186 TCA_FLOWER_KEY_ARP_OP, &mask->arp.op,
1187 TCA_FLOWER_KEY_ARP_OP_MASK,
1188 sizeof(key->arp.op)) ||
1189 fl_dump_key_val(skb, key->arp.sha, TCA_FLOWER_KEY_ARP_SHA,
1190 mask->arp.sha, TCA_FLOWER_KEY_ARP_SHA_MASK,
1191 sizeof(key->arp.sha)) ||
1192 fl_dump_key_val(skb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA,
1193 mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK,
1194 sizeof(key->arp.tha))))
1195 goto nla_put_failure;
921 1196
922 if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && 1197 if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
923 (fl_dump_key_val(skb, &key->enc_ipv4.src, 1198 (fl_dump_key_val(skb, &key->enc_ipv4.src,
@@ -943,10 +1218,24 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
943 1218
944 if (fl_dump_key_val(skb, &key->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID, 1219 if (fl_dump_key_val(skb, &key->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID,
945 &mask->enc_key_id, TCA_FLOWER_UNSPEC, 1220 &mask->enc_key_id, TCA_FLOWER_UNSPEC,
946 sizeof(key->enc_key_id))) 1221 sizeof(key->enc_key_id)) ||
1222 fl_dump_key_val(skb, &key->enc_tp.src,
1223 TCA_FLOWER_KEY_ENC_UDP_SRC_PORT,
1224 &mask->enc_tp.src,
1225 TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK,
1226 sizeof(key->enc_tp.src)) ||
1227 fl_dump_key_val(skb, &key->enc_tp.dst,
1228 TCA_FLOWER_KEY_ENC_UDP_DST_PORT,
1229 &mask->enc_tp.dst,
1230 TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
1231 sizeof(key->enc_tp.dst)))
947 goto nla_put_failure; 1232 goto nla_put_failure;
948 1233
949 nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags); 1234 if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
1235 goto nla_put_failure;
1236
1237 if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags))
1238 goto nla_put_failure;
950 1239
951 if (tcf_exts_dump(skb, &f->exts)) 1240 if (tcf_exts_dump(skb, &f->exts))
952 goto nla_put_failure; 1241 goto nla_put_failure;
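
Editorial note: the fl_set_key_flag()/fl_get_key_flag() helpers added above translate individual flag bits between the TCA_FLOWER_KEY_FLAGS_* netlink namespace and the FLOW_DIS_* dissector namespace, copying a key bit only when its mask bit is set. A self-contained sketch of that translation rule follows; the bit values are made-up stand-ins for TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT and FLOW_DIS_IS_FRAGMENT, not the real constants.

#include <stdint.h>
#include <stdio.h>

#define UAPI_FRAG_BIT    (1u << 0)  /* stand-in for the netlink-visible flag bit */
#define DISSECT_FRAG_BIT (1u << 3)  /* stand-in for the internal dissector flag bit */

static void map_flag(uint32_t in_key, uint32_t in_mask,
		     uint32_t *out_key, uint32_t *out_mask,
		     uint32_t in_bit, uint32_t out_bit)
{
	/* Only propagate the flag when userspace actually masked it. */
	if (in_mask & in_bit) {
		*out_mask |= out_bit;
		if (in_key & in_bit)
			*out_key |= out_bit;
	}
}

int main(void)
{
	uint32_t key = 0, mask = 0;

	/* "is fragment = 1", requested together with an explicit mask */
	map_flag(UAPI_FRAG_BIT, UAPI_FRAG_BIT, &key, &mask,
		 UAPI_FRAG_BIT, DISSECT_FRAG_BIT);
	printf("key=0x%x mask=0x%x\n", key, mask);  /* prints key=0x8 mask=0x8 */
	return 0;
}
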
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index f935429bd5ef..224eb2c14346 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -16,16 +16,11 @@
16#include <net/sch_generic.h> 16#include <net/sch_generic.h>
17#include <net/pkt_cls.h> 17#include <net/pkt_cls.h>
18 18
19struct cls_mall_filter { 19struct cls_mall_head {
20 struct tcf_exts exts; 20 struct tcf_exts exts;
21 struct tcf_result res; 21 struct tcf_result res;
22 u32 handle; 22 u32 handle;
23 struct rcu_head rcu;
24 u32 flags; 23 u32 flags;
25};
26
27struct cls_mall_head {
28 struct cls_mall_filter *filter;
29 struct rcu_head rcu; 24 struct rcu_head rcu;
30}; 25};
31 26
@@ -33,56 +28,52 @@ static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp,
33 struct tcf_result *res) 28 struct tcf_result *res)
34{ 29{
35 struct cls_mall_head *head = rcu_dereference_bh(tp->root); 30 struct cls_mall_head *head = rcu_dereference_bh(tp->root);
36 struct cls_mall_filter *f = head->filter;
37 31
38 if (tc_skip_sw(f->flags)) 32 if (tc_skip_sw(head->flags))
39 return -1; 33 return -1;
40 34
41 return tcf_exts_exec(skb, &f->exts, res); 35 return tcf_exts_exec(skb, &head->exts, res);
42} 36}
43 37
44static int mall_init(struct tcf_proto *tp) 38static int mall_init(struct tcf_proto *tp)
45{ 39{
46 struct cls_mall_head *head;
47
48 head = kzalloc(sizeof(*head), GFP_KERNEL);
49 if (!head)
50 return -ENOBUFS;
51
52 rcu_assign_pointer(tp->root, head);
53
54 return 0; 40 return 0;
55} 41}
56 42
57static void mall_destroy_filter(struct rcu_head *head) 43static void mall_destroy_rcu(struct rcu_head *rcu)
58{ 44{
59 struct cls_mall_filter *f = container_of(head, struct cls_mall_filter, rcu); 45 struct cls_mall_head *head = container_of(rcu, struct cls_mall_head,
46 rcu);
60 47
61 tcf_exts_destroy(&f->exts); 48 tcf_exts_destroy(&head->exts);
62 49 kfree(head);
63 kfree(f);
64} 50}
65 51
66static int mall_replace_hw_filter(struct tcf_proto *tp, 52static int mall_replace_hw_filter(struct tcf_proto *tp,
67 struct cls_mall_filter *f, 53 struct cls_mall_head *head,
68 unsigned long cookie) 54 unsigned long cookie)
69{ 55{
70 struct net_device *dev = tp->q->dev_queue->dev; 56 struct net_device *dev = tp->q->dev_queue->dev;
71 struct tc_to_netdev offload; 57 struct tc_to_netdev offload;
72 struct tc_cls_matchall_offload mall_offload = {0}; 58 struct tc_cls_matchall_offload mall_offload = {0};
59 int err;
73 60
74 offload.type = TC_SETUP_MATCHALL; 61 offload.type = TC_SETUP_MATCHALL;
75 offload.cls_mall = &mall_offload; 62 offload.cls_mall = &mall_offload;
76 offload.cls_mall->command = TC_CLSMATCHALL_REPLACE; 63 offload.cls_mall->command = TC_CLSMATCHALL_REPLACE;
77 offload.cls_mall->exts = &f->exts; 64 offload.cls_mall->exts = &head->exts;
78 offload.cls_mall->cookie = cookie; 65 offload.cls_mall->cookie = cookie;
79 66
80 return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, 67 err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
81 &offload); 68 &offload);
69 if (!err)
70 head->flags |= TCA_CLS_FLAGS_IN_HW;
71
72 return err;
82} 73}
83 74
84static void mall_destroy_hw_filter(struct tcf_proto *tp, 75static void mall_destroy_hw_filter(struct tcf_proto *tp,
85 struct cls_mall_filter *f, 76 struct cls_mall_head *head,
86 unsigned long cookie) 77 unsigned long cookie)
87{ 78{
88 struct net_device *dev = tp->q->dev_queue->dev; 79 struct net_device *dev = tp->q->dev_queue->dev;
@@ -103,29 +94,20 @@ static bool mall_destroy(struct tcf_proto *tp, bool force)
103{ 94{
104 struct cls_mall_head *head = rtnl_dereference(tp->root); 95 struct cls_mall_head *head = rtnl_dereference(tp->root);
105 struct net_device *dev = tp->q->dev_queue->dev; 96 struct net_device *dev = tp->q->dev_queue->dev;
106 struct cls_mall_filter *f = head->filter;
107 97
108 if (!force && f) 98 if (!head)
109 return false; 99 return true;
110 100
111 if (f) { 101 if (tc_should_offload(dev, tp, head->flags))
112 if (tc_should_offload(dev, tp, f->flags)) 102 mall_destroy_hw_filter(tp, head, (unsigned long) head);
113 mall_destroy_hw_filter(tp, f, (unsigned long) f);
114 103
115 call_rcu(&f->rcu, mall_destroy_filter); 104 call_rcu(&head->rcu, mall_destroy_rcu);
116 }
117 kfree_rcu(head, rcu);
118 return true; 105 return true;
119} 106}
120 107
121static unsigned long mall_get(struct tcf_proto *tp, u32 handle) 108static unsigned long mall_get(struct tcf_proto *tp, u32 handle)
122{ 109{
123 struct cls_mall_head *head = rtnl_dereference(tp->root); 110 return 0UL;
124 struct cls_mall_filter *f = head->filter;
125
126 if (f && f->handle == handle)
127 return (unsigned long) f;
128 return 0;
129} 111}
130 112
131static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = { 113static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = {
@@ -134,26 +116,31 @@ static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = {
134}; 116};
135 117
136static int mall_set_parms(struct net *net, struct tcf_proto *tp, 118static int mall_set_parms(struct net *net, struct tcf_proto *tp,
137 struct cls_mall_filter *f, 119 struct cls_mall_head *head,
138 unsigned long base, struct nlattr **tb, 120 unsigned long base, struct nlattr **tb,
139 struct nlattr *est, bool ovr) 121 struct nlattr *est, bool ovr)
140{ 122{
141 struct tcf_exts e; 123 struct tcf_exts e;
142 int err; 124 int err;
143 125
144 tcf_exts_init(&e, TCA_MATCHALL_ACT, 0); 126 err = tcf_exts_init(&e, TCA_MATCHALL_ACT, 0);
127 if (err)
128 return err;
145 err = tcf_exts_validate(net, tp, tb, est, &e, ovr); 129 err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
146 if (err < 0) 130 if (err < 0)
147 return err; 131 goto errout;
148 132
149 if (tb[TCA_MATCHALL_CLASSID]) { 133 if (tb[TCA_MATCHALL_CLASSID]) {
150 f->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]); 134 head->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]);
151 tcf_bind_filter(tp, &f->res, base); 135 tcf_bind_filter(tp, &head->res, base);
152 } 136 }
153 137
154 tcf_exts_change(tp, &f->exts, &e); 138 tcf_exts_change(tp, &head->exts, &e);
155 139
156 return 0; 140 return 0;
141errout:
142 tcf_exts_destroy(&e);
143 return err;
157} 144}
158 145
159static int mall_change(struct net *net, struct sk_buff *in_skb, 146static int mall_change(struct net *net, struct sk_buff *in_skb,
@@ -162,21 +149,17 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
162 unsigned long *arg, bool ovr) 149 unsigned long *arg, bool ovr)
163{ 150{
164 struct cls_mall_head *head = rtnl_dereference(tp->root); 151 struct cls_mall_head *head = rtnl_dereference(tp->root);
165 struct cls_mall_filter *fold = (struct cls_mall_filter *) *arg;
166 struct net_device *dev = tp->q->dev_queue->dev; 152 struct net_device *dev = tp->q->dev_queue->dev;
167 struct cls_mall_filter *f;
168 struct nlattr *tb[TCA_MATCHALL_MAX + 1]; 153 struct nlattr *tb[TCA_MATCHALL_MAX + 1];
154 struct cls_mall_head *new;
169 u32 flags = 0; 155 u32 flags = 0;
170 int err; 156 int err;
171 157
172 if (!tca[TCA_OPTIONS]) 158 if (!tca[TCA_OPTIONS])
173 return -EINVAL; 159 return -EINVAL;
174 160
175 if (head->filter) 161 if (head)
176 return -EBUSY; 162 return -EEXIST;
177
178 if (fold)
179 return -EINVAL;
180 163
181 err = nla_parse_nested(tb, TCA_MATCHALL_MAX, 164 err = nla_parse_nested(tb, TCA_MATCHALL_MAX,
182 tca[TCA_OPTIONS], mall_policy); 165 tca[TCA_OPTIONS], mall_policy);
@@ -189,64 +172,62 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
189 return -EINVAL; 172 return -EINVAL;
190 } 173 }
191 174
192 f = kzalloc(sizeof(*f), GFP_KERNEL); 175 new = kzalloc(sizeof(*new), GFP_KERNEL);
193 if (!f) 176 if (!new)
194 return -ENOBUFS; 177 return -ENOBUFS;
195 178
196 tcf_exts_init(&f->exts, TCA_MATCHALL_ACT, 0); 179 err = tcf_exts_init(&new->exts, TCA_MATCHALL_ACT, 0);
180 if (err)
181 goto err_exts_init;
197 182
198 if (!handle) 183 if (!handle)
199 handle = 1; 184 handle = 1;
200 f->handle = handle; 185 new->handle = handle;
201 f->flags = flags; 186 new->flags = flags;
202 187
203 err = mall_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr); 188 err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr);
204 if (err) 189 if (err)
205 goto errout; 190 goto err_set_parms;
206 191
207 if (tc_should_offload(dev, tp, flags)) { 192 if (tc_should_offload(dev, tp, flags)) {
208 err = mall_replace_hw_filter(tp, f, (unsigned long) f); 193 err = mall_replace_hw_filter(tp, new, (unsigned long) new);
209 if (err) { 194 if (err) {
210 if (tc_skip_sw(flags)) 195 if (tc_skip_sw(flags))
211 goto errout; 196 goto err_replace_hw_filter;
212 else 197 else
213 err = 0; 198 err = 0;
214 } 199 }
215 } 200 }
216 201
217 *arg = (unsigned long) f; 202 if (!tc_in_hw(new->flags))
218 rcu_assign_pointer(head->filter, f); 203 new->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
219 204
205 *arg = (unsigned long) head;
206 rcu_assign_pointer(tp->root, new);
207 if (head)
208 call_rcu(&head->rcu, mall_destroy_rcu);
220 return 0; 209 return 0;
221 210
222errout: 211err_replace_hw_filter:
223 kfree(f); 212err_set_parms:
213 tcf_exts_destroy(&new->exts);
214err_exts_init:
215 kfree(new);
224 return err; 216 return err;
225} 217}
226 218
227static int mall_delete(struct tcf_proto *tp, unsigned long arg) 219static int mall_delete(struct tcf_proto *tp, unsigned long arg)
228{ 220{
229 struct cls_mall_head *head = rtnl_dereference(tp->root); 221 return -EOPNOTSUPP;
230 struct cls_mall_filter *f = (struct cls_mall_filter *) arg;
231 struct net_device *dev = tp->q->dev_queue->dev;
232
233 if (tc_should_offload(dev, tp, f->flags))
234 mall_destroy_hw_filter(tp, f, (unsigned long) f);
235
236 RCU_INIT_POINTER(head->filter, NULL);
237 tcf_unbind_filter(tp, &f->res);
238 call_rcu(&f->rcu, mall_destroy_filter);
239 return 0;
240} 222}
241 223
242static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg) 224static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg)
243{ 225{
244 struct cls_mall_head *head = rtnl_dereference(tp->root); 226 struct cls_mall_head *head = rtnl_dereference(tp->root);
245 struct cls_mall_filter *f = head->filter;
246 227
247 if (arg->count < arg->skip) 228 if (arg->count < arg->skip)
248 goto skip; 229 goto skip;
249 if (arg->fn(tp, (unsigned long) f, arg) < 0) 230 if (arg->fn(tp, (unsigned long) head, arg) < 0)
250 arg->stop = 1; 231 arg->stop = 1;
251skip: 232skip:
252 arg->count++; 233 arg->count++;
@@ -255,28 +236,31 @@ skip:
255static int mall_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, 236static int mall_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
256 struct sk_buff *skb, struct tcmsg *t) 237 struct sk_buff *skb, struct tcmsg *t)
257{ 238{
258 struct cls_mall_filter *f = (struct cls_mall_filter *) fh; 239 struct cls_mall_head *head = (struct cls_mall_head *) fh;
259 struct nlattr *nest; 240 struct nlattr *nest;
260 241
261 if (!f) 242 if (!head)
262 return skb->len; 243 return skb->len;
263 244
264 t->tcm_handle = f->handle; 245 t->tcm_handle = head->handle;
265 246
266 nest = nla_nest_start(skb, TCA_OPTIONS); 247 nest = nla_nest_start(skb, TCA_OPTIONS);
267 if (!nest) 248 if (!nest)
268 goto nla_put_failure; 249 goto nla_put_failure;
269 250
270 if (f->res.classid && 251 if (head->res.classid &&
271 nla_put_u32(skb, TCA_MATCHALL_CLASSID, f->res.classid)) 252 nla_put_u32(skb, TCA_MATCHALL_CLASSID, head->res.classid))
253 goto nla_put_failure;
254
255 if (head->flags && nla_put_u32(skb, TCA_MATCHALL_FLAGS, head->flags))
272 goto nla_put_failure; 256 goto nla_put_failure;
273 257
274 if (tcf_exts_dump(skb, &f->exts)) 258 if (tcf_exts_dump(skb, &head->exts))
275 goto nla_put_failure; 259 goto nla_put_failure;
276 260
277 nla_nest_end(skb, nest); 261 nla_nest_end(skb, nest);
278 262
279 if (tcf_exts_dump_stats(skb, &f->exts) < 0) 263 if (tcf_exts_dump_stats(skb, &head->exts) < 0)
280 goto nla_put_failure; 264 goto nla_put_failure;
281 265
282 return skb->len; 266 return skb->len;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index ae83c3aec308..4dbe0c680fe6 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -334,7 +334,6 @@ static int u32_init(struct tcf_proto *tp)
334 if (root_ht == NULL) 334 if (root_ht == NULL)
335 return -ENOBUFS; 335 return -ENOBUFS;
336 336
337 root_ht->divisor = 0;
338 root_ht->refcnt++; 337 root_ht->refcnt++;
339 root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000; 338 root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000;
340 root_ht->prio = tp->prio; 339 root_ht->prio = tp->prio;
@@ -524,6 +523,10 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
524 523
525 err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, 524 err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
526 tp->protocol, &offload); 525 tp->protocol, &offload);
526
527 if (!err)
528 n->flags |= TCA_CLS_FLAGS_IN_HW;
529
527 if (tc_skip_sw(flags)) 530 if (tc_skip_sw(flags))
528 return err; 531 return err;
529 532
@@ -896,6 +899,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
896 return err; 899 return err;
897 } 900 }
898 901
902 if (!tc_in_hw(new->flags))
903 new->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
904
899 u32_replace_knode(tp, tp_c, new); 905 u32_replace_knode(tp, tp_c, new);
900 tcf_unbind_filter(tp, &n->res); 906 tcf_unbind_filter(tp, &n->res);
901 call_rcu(&n->rcu, u32_delete_key_rcu); 907 call_rcu(&n->rcu, u32_delete_key_rcu);
@@ -1015,6 +1021,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
1015 if (err) 1021 if (err)
1016 goto errhw; 1022 goto errhw;
1017 1023
1024 if (!tc_in_hw(n->flags))
1025 n->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
1026
1018 ins = &ht->ht[TC_U32_HASH(handle)]; 1027 ins = &ht->ht[TC_U32_HASH(handle)];
1019 for (pins = rtnl_dereference(*ins); pins; 1028 for (pins = rtnl_dereference(*ins); pins;
1020 ins = &pins->next, pins = rtnl_dereference(*ins)) 1029 ins = &pins->next, pins = rtnl_dereference(*ins))
diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c
index c66ca9400ab4..c1b23e3060b8 100644
--- a/net/sched/em_ipset.c
+++ b/net/sched/em_ipset.c
@@ -57,17 +57,20 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em,
57 struct xt_action_param acpar; 57 struct xt_action_param acpar;
58 const struct xt_set_info *set = (const void *) em->data; 58 const struct xt_set_info *set = (const void *) em->data;
59 struct net_device *dev, *indev = NULL; 59 struct net_device *dev, *indev = NULL;
60 struct nf_hook_state state = {
61 .net = em->net,
62 };
60 int ret, network_offset; 63 int ret, network_offset;
61 64
62 switch (tc_skb_protocol(skb)) { 65 switch (tc_skb_protocol(skb)) {
63 case htons(ETH_P_IP): 66 case htons(ETH_P_IP):
64 acpar.family = NFPROTO_IPV4; 67 state.pf = NFPROTO_IPV4;
65 if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) 68 if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
66 return 0; 69 return 0;
67 acpar.thoff = ip_hdrlen(skb); 70 acpar.thoff = ip_hdrlen(skb);
68 break; 71 break;
69 case htons(ETH_P_IPV6): 72 case htons(ETH_P_IPV6):
70 acpar.family = NFPROTO_IPV6; 73 state.pf = NFPROTO_IPV6;
71 if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) 74 if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
72 return 0; 75 return 0;
73 /* doesn't call ipv6_find_hdr() because ipset doesn't use thoff, yet */ 76 /* doesn't call ipv6_find_hdr() because ipset doesn't use thoff, yet */
@@ -77,9 +80,7 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em,
77 return 0; 80 return 0;
78 } 81 }
79 82
80 acpar.hooknum = 0; 83 opt.family = state.pf;
81
82 opt.family = acpar.family;
83 opt.dim = set->dim; 84 opt.dim = set->dim;
84 opt.flags = set->flags; 85 opt.flags = set->flags;
85 opt.cmdflags = 0; 86 opt.cmdflags = 0;
@@ -95,9 +96,9 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em,
95 if (skb->skb_iif) 96 if (skb->skb_iif)
96 indev = dev_get_by_index_rcu(em->net, skb->skb_iif); 97 indev = dev_get_by_index_rcu(em->net, skb->skb_iif);
97 98
98 acpar.net = em->net; 99 state.in = indev ? indev : dev;
99 acpar.in = indev ? indev : dev; 100 state.out = dev;
100 acpar.out = dev; 101 acpar.state = &state;
101 102
102 ret = ip_set_test(set->index, skb, &acpar, &opt); 103 ret = ip_set_test(set->index, skb, &acpar, &opt);
103 104
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index a309a07ccb35..ae7e4f5b348b 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -63,6 +63,7 @@
63#include <linux/types.h> 63#include <linux/types.h>
64#include <linux/kernel.h> 64#include <linux/kernel.h>
65#include <linux/sched.h> 65#include <linux/sched.h>
66#include <linux/sched/loadavg.h>
66#include <linux/string.h> 67#include <linux/string.h>
67#include <linux/skbuff.h> 68#include <linux/skbuff.h>
68#include <linux/random.h> 69#include <linux/random.h>
@@ -176,11 +177,12 @@ META_COLLECTOR(int_vlan_tag)
176{ 177{
177 unsigned short tag; 178 unsigned short tag;
178 179
179 tag = skb_vlan_tag_get(skb); 180 if (skb_vlan_tag_present(skb))
180 if (!tag && __vlan_get_tag(skb, &tag)) 181 dst->value = skb_vlan_tag_get(skb);
181 *err = -1; 182 else if (!__vlan_get_tag(skb, &tag))
182 else
183 dst->value = tag; 183 dst->value = tag;
184 else
185 *err = -1;
184} 186}
185 187
186 188
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 206dc24add3a..bcf49cd22786 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -440,7 +440,6 @@ void qdisc_put_rtab(struct qdisc_rate_table *tab)
440EXPORT_SYMBOL(qdisc_put_rtab); 440EXPORT_SYMBOL(qdisc_put_rtab);
441 441
442static LIST_HEAD(qdisc_stab_list); 442static LIST_HEAD(qdisc_stab_list);
443static DEFINE_SPINLOCK(qdisc_stab_lock);
444 443
445static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = { 444static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
446 [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) }, 445 [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
@@ -474,20 +473,15 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
474 if (tsize != s->tsize || (!tab && tsize > 0)) 473 if (tsize != s->tsize || (!tab && tsize > 0))
475 return ERR_PTR(-EINVAL); 474 return ERR_PTR(-EINVAL);
476 475
477 spin_lock(&qdisc_stab_lock);
478
479 list_for_each_entry(stab, &qdisc_stab_list, list) { 476 list_for_each_entry(stab, &qdisc_stab_list, list) {
480 if (memcmp(&stab->szopts, s, sizeof(*s))) 477 if (memcmp(&stab->szopts, s, sizeof(*s)))
481 continue; 478 continue;
482 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16))) 479 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
483 continue; 480 continue;
484 stab->refcnt++; 481 stab->refcnt++;
485 spin_unlock(&qdisc_stab_lock);
486 return stab; 482 return stab;
487 } 483 }
488 484
489 spin_unlock(&qdisc_stab_lock);
490
491 stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL); 485 stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
492 if (!stab) 486 if (!stab)
493 return ERR_PTR(-ENOMEM); 487 return ERR_PTR(-ENOMEM);
@@ -497,9 +491,7 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
497 if (tsize > 0) 491 if (tsize > 0)
498 memcpy(stab->data, tab, tsize * sizeof(u16)); 492 memcpy(stab->data, tab, tsize * sizeof(u16));
499 493
500 spin_lock(&qdisc_stab_lock);
501 list_add_tail(&stab->list, &qdisc_stab_list); 494 list_add_tail(&stab->list, &qdisc_stab_list);
502 spin_unlock(&qdisc_stab_lock);
503 495
504 return stab; 496 return stab;
505} 497}
@@ -514,14 +506,10 @@ void qdisc_put_stab(struct qdisc_size_table *tab)
514 if (!tab) 506 if (!tab)
515 return; 507 return;
516 508
517 spin_lock(&qdisc_stab_lock);
518
519 if (--tab->refcnt == 0) { 509 if (--tab->refcnt == 0) {
520 list_del(&tab->list); 510 list_del(&tab->list);
521 call_rcu_bh(&tab->rcu, stab_kfree_rcu); 511 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
522 } 512 }
523
524 spin_unlock(&qdisc_stab_lock);
525} 513}
526EXPORT_SYMBOL(qdisc_put_stab); 514EXPORT_SYMBOL(qdisc_put_stab);
527 515
@@ -960,6 +948,17 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
960 948
961 sch->handle = handle; 949 sch->handle = handle;
962 950
 951 /* This exists to stay backward compatible with a userspace
 952 * loophole that allowed userspace to get the IFF_NO_QUEUE
 953 * facility on older kernels by setting tx_queue_len=0 (prior
 954 * to qdisc init) and then forgetting to reinit tx_queue_len
 955 * before attaching a qdisc again.
 956 */
957 if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
958 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
959 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
960 }
961
963 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) { 962 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
964 if (qdisc_is_percpu_stats(sch)) { 963 if (qdisc_is_percpu_stats(sch)) {
965 sch->cpu_bstats = 964 sch->cpu_bstats =
@@ -1008,6 +1007,8 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
1008 1007
1009 return sch; 1008 return sch;
1010 } 1009 }
1010 /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1011 ops->destroy(sch);
1011err_out3: 1012err_out3:
1012 dev_put(dev); 1013 dev_put(dev);
1013 kfree((char *) sch - sch->padded); 1014 kfree((char *) sch - sch->padded);
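Together with the later sch_hhf, sch_sfq, sch_mq and sch_mqprio hunks, this establishes the contract that a failing ops->init() only returns an error and the core then runs ops->destroy(). A hedged sketch of a scheduler init under that contract (the priv struct and its field are made up for illustration):

    static int example_init(struct Qdisc *sch, struct nlattr *opt)
    {
            struct example_sched_data *q = qdisc_priv(sch); /* hypothetical priv struct */

            q->table = kcalloc(1024, sizeof(*q->table), GFP_KERNEL);
            if (!q->table)
                    return -ENOMEM; /* no self-cleanup: qdisc_create() runs ->destroy() */
            return 0;
    }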
@@ -1384,7 +1385,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1384 1385
1385 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q), 1386 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
1386 &d, cpu_bstats, &q->bstats) < 0 || 1387 &d, cpu_bstats, &q->bstats) < 0 ||
1387 gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 || 1388 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
1388 gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0) 1389 gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
1389 goto nla_put_failure; 1390 goto nla_put_failure;
1390 1391
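The same signature change repeats below for CBQ, DRR, HFSC, HTB and QFQ: the rate estimator becomes a single RCU-managed pointer, so neither dumping nor teardown needs the bstats argument any more. A minimal sketch of the new call shape, based only on what these hunks show (the example_* wrappers are illustrative):

    static int example_dump_class_rate(struct gnet_dump *d,
                                       struct net_rate_estimator __rcu **est)
    {
            if (gnet_stats_copy_rate_est(d, est) < 0)       /* no bstats argument */
                    return -1;
            return 0;
    }

    static void example_destroy_class_rate(struct net_rate_estimator __rcu **est)
    {
            gen_kill_estimator(est);                        /* likewise */
    }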
@@ -1850,6 +1851,7 @@ int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
1850{ 1851{
1851 __be16 protocol = tc_skb_protocol(skb); 1852 __be16 protocol = tc_skb_protocol(skb);
1852#ifdef CONFIG_NET_CLS_ACT 1853#ifdef CONFIG_NET_CLS_ACT
1854 const int max_reclassify_loop = 4;
1853 const struct tcf_proto *old_tp = tp; 1855 const struct tcf_proto *old_tp = tp;
1854 int limit = 0; 1856 int limit = 0;
1855 1857
@@ -1874,7 +1876,7 @@ reclassify:
1874 return TC_ACT_UNSPEC; /* signal: continue lookup */ 1876 return TC_ACT_UNSPEC; /* signal: continue lookup */
1875#ifdef CONFIG_NET_CLS_ACT 1877#ifdef CONFIG_NET_CLS_ACT
1876reset: 1878reset:
1877 if (unlikely(limit++ >= MAX_REC_LOOP)) { 1879 if (unlikely(limit++ >= max_reclassify_loop)) {
1878 net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n", 1880 net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n",
1879 tp->q->ops->id, tp->prio & 0xffff, 1881 tp->q->ops->id, tp->prio & 0xffff,
1880 ntohs(tp->protocol)); 1882 ntohs(tp->protocol));
@@ -1888,28 +1890,6 @@ reset:
1888} 1890}
1889EXPORT_SYMBOL(tc_classify); 1891EXPORT_SYMBOL(tc_classify);
1890 1892
1891bool tcf_destroy(struct tcf_proto *tp, bool force)
1892{
1893 if (tp->ops->destroy(tp, force)) {
1894 module_put(tp->ops->owner);
1895 kfree_rcu(tp, rcu);
1896 return true;
1897 }
1898
1899 return false;
1900}
1901
1902void tcf_destroy_chain(struct tcf_proto __rcu **fl)
1903{
1904 struct tcf_proto *tp;
1905
1906 while ((tp = rtnl_dereference(*fl)) != NULL) {
1907 RCU_INIT_POINTER(*fl, tp->next);
1908 tcf_destroy(tp, true);
1909 }
1910}
1911EXPORT_SYMBOL(tcf_destroy_chain);
1912
1913#ifdef CONFIG_PROC_FS 1893#ifdef CONFIG_PROC_FS
1914static int psched_show(struct seq_file *seq, void *v) 1894static int psched_show(struct seq_file *seq, void *v)
1915{ 1895{
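tcf_destroy() and tcf_destroy_chain() disappear from sch_api.c here, while the following per-scheduler hunks (ATM, CBQ, CHOKe, dsmark, fq_codel, HTB, ingress, multiq, prio, SFB, SFQ) each gain #include <net/pkt_cls.h>. The likely reading, hedged because the matching header/cls_api change is not part of this excerpt, is that the helpers now reach the schedulers through that header, so callers keep the same call but need the new include:

    #include <net/pkt_cls.h>    /* assumed new home of the declaration */

    /* Illustrative class-teardown fragment; filter_list is the usual
     * per-class filter chain pointer.
     */
    static void example_destroy_filters(struct tcf_proto __rcu **filter_list)
    {
            tcf_destroy_chain(filter_list);
    }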
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 481e4f12aeb4..2209c2ddacbf 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -15,6 +15,7 @@
15#include <linux/file.h> /* for fput */ 15#include <linux/file.h> /* for fput */
16#include <net/netlink.h> 16#include <net/netlink.h>
17#include <net/pkt_sched.h> 17#include <net/pkt_sched.h>
18#include <net/pkt_cls.h>
18 19
19/* 20/*
20 * The ATM queuing discipline provides a framework for invoking classifiers 21 * The ATM queuing discipline provides a framework for invoking classifiers
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index beb554aa8cfb..d6ca18dc04c3 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -19,6 +19,7 @@
19#include <linux/skbuff.h> 19#include <linux/skbuff.h>
20#include <net/netlink.h> 20#include <net/netlink.h>
21#include <net/pkt_sched.h> 21#include <net/pkt_sched.h>
22#include <net/pkt_cls.h>
22 23
23 24
24/* Class-Based Queueing (CBQ) algorithm. 25/* Class-Based Queueing (CBQ) algorithm.
@@ -122,7 +123,7 @@ struct cbq_class {
122 psched_time_t penalized; 123 psched_time_t penalized;
123 struct gnet_stats_basic_packed bstats; 124 struct gnet_stats_basic_packed bstats;
124 struct gnet_stats_queue qstats; 125 struct gnet_stats_queue qstats;
125 struct gnet_stats_rate_est64 rate_est; 126 struct net_rate_estimator __rcu *rate_est;
126 struct tc_cbq_xstats xstats; 127 struct tc_cbq_xstats xstats;
127 128
128 struct tcf_proto __rcu *filter_list; 129 struct tcf_proto __rcu *filter_list;
@@ -509,7 +510,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
509 if (delay) { 510 if (delay) {
510 ktime_t time; 511 ktime_t time;
511 512
512 time = ktime_set(0, 0); 513 time = 0;
513 time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay)); 514 time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
514 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED); 515 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
515 } 516 }
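The ktime_set(0, 0) -> 0 substitution (and the skb->tstamp.tv64 -> skb->tstamp change in the sch_netem hunk further down) reflects ktime_t having become a plain 64-bit nanosecond count, so a zero literal is a valid initializer. A two-line sketch of the resulting idiom, assuming nothing beyond that:

    ktime_t expires = 0;                            /* no ktime_set(0, 0) needed */
    expires = ktime_add_ns(expires, 1000000);       /* ordinary nanosecond arithmetic */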
@@ -1346,7 +1347,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1346 1347
1347 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), 1348 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1348 d, NULL, &cl->bstats) < 0 || 1349 d, NULL, &cl->bstats) < 0 ||
1349 gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || 1350 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1350 gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0) 1351 gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
1351 return -1; 1352 return -1;
1352 1353
@@ -1405,7 +1406,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1405 tcf_destroy_chain(&cl->filter_list); 1406 tcf_destroy_chain(&cl->filter_list);
1406 qdisc_destroy(cl->q); 1407 qdisc_destroy(cl->q);
1407 qdisc_put_rtab(cl->R_tab); 1408 qdisc_put_rtab(cl->R_tab);
1408 gen_kill_estimator(&cl->bstats, &cl->rate_est); 1409 gen_kill_estimator(&cl->rate_est);
1409 if (cl != &q->link) 1410 if (cl != &q->link)
1410 kfree(cl); 1411 kfree(cl);
1411} 1412}
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 3b6d5bd69101..3b86a97bc67c 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -16,6 +16,7 @@
16#include <linux/skbuff.h> 16#include <linux/skbuff.h>
17#include <linux/vmalloc.h> 17#include <linux/vmalloc.h>
18#include <net/pkt_sched.h> 18#include <net/pkt_sched.h>
19#include <net/pkt_cls.h>
19#include <net/inet_ecn.h> 20#include <net/inet_ecn.h>
20#include <net/red.h> 21#include <net/red.h>
21#include <net/flow_dissector.h> 22#include <net/flow_dissector.h>
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 8af5c59eef84..bb4cbdf75004 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -25,7 +25,7 @@ struct drr_class {
25 25
26 struct gnet_stats_basic_packed bstats; 26 struct gnet_stats_basic_packed bstats;
27 struct gnet_stats_queue qstats; 27 struct gnet_stats_queue qstats;
28 struct gnet_stats_rate_est64 rate_est; 28 struct net_rate_estimator __rcu *rate_est;
29 struct list_head alist; 29 struct list_head alist;
30 struct Qdisc *qdisc; 30 struct Qdisc *qdisc;
31 31
@@ -142,7 +142,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
142 142
143static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl) 143static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
144{ 144{
145 gen_kill_estimator(&cl->bstats, &cl->rate_est); 145 gen_kill_estimator(&cl->rate_est);
146 qdisc_destroy(cl->qdisc); 146 qdisc_destroy(cl->qdisc);
147 kfree(cl); 147 kfree(cl);
148} 148}
@@ -283,7 +283,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
283 283
284 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), 284 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
285 d, NULL, &cl->bstats) < 0 || 285 d, NULL, &cl->bstats) < 0 ||
286 gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || 286 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
287 gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0) 287 gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0)
288 return -1; 288 return -1;
289 289
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 1308bbf460f7..5334e309f17f 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -13,6 +13,7 @@
13#include <linux/rtnetlink.h> 13#include <linux/rtnetlink.h>
14#include <linux/bitops.h> 14#include <linux/bitops.h>
15#include <net/pkt_sched.h> 15#include <net/pkt_sched.h>
16#include <net/pkt_cls.h>
16#include <net/dsfield.h> 17#include <net/dsfield.h>
17#include <net/inet_ecn.h> 18#include <net/inet_ecn.h>
18#include <asm/byteorder.h> 19#include <asm/byteorder.h>
@@ -200,9 +201,13 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch,
200 pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p); 201 pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p);
201 202
202 if (p->set_tc_index) { 203 if (p->set_tc_index) {
204 int wlen = skb_network_offset(skb);
205
203 switch (tc_skb_protocol(skb)) { 206 switch (tc_skb_protocol(skb)) {
204 case htons(ETH_P_IP): 207 case htons(ETH_P_IP):
205 if (skb_cow_head(skb, sizeof(struct iphdr))) 208 wlen += sizeof(struct iphdr);
209 if (!pskb_may_pull(skb, wlen) ||
210 skb_try_make_writable(skb, wlen))
206 goto drop; 211 goto drop;
207 212
208 skb->tc_index = ipv4_get_dsfield(ip_hdr(skb)) 213 skb->tc_index = ipv4_get_dsfield(ip_hdr(skb))
@@ -210,7 +215,9 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch,
210 break; 215 break;
211 216
212 case htons(ETH_P_IPV6): 217 case htons(ETH_P_IPV6):
213 if (skb_cow_head(skb, sizeof(struct ipv6hdr))) 218 wlen += sizeof(struct ipv6hdr);
219 if (!pskb_may_pull(skb, wlen) ||
220 skb_try_make_writable(skb, wlen))
214 goto drop; 221 goto drop;
215 222
216 skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb)) 223 skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb))
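The dsmark change swaps skb_cow_head() for an explicit pskb_may_pull() plus skb_try_make_writable() over the bytes up to the end of the IP header: the header must actually be present in the linear data and privately owned before its DS field is read and later rewritten. A minimal sketch of that pattern as used here (the helper itself is illustrative):

    /* Returns 0 when the IPv4 header is pulled into the linear area and
     * safe to modify; the caller drops the packet otherwise.
     */
    static int example_make_iph_writable(struct sk_buff *skb)
    {
            int wlen = skb_network_offset(skb) + sizeof(struct iphdr);

            if (!pskb_may_pull(skb, wlen) || skb_try_make_writable(skb, wlen))
                    return -1;
            return 0;
    }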
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 18e752439f6f..a4f738ac7728 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -136,7 +136,7 @@ static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
136 struct fq_flow *aux; 136 struct fq_flow *aux;
137 137
138 parent = *p; 138 parent = *p;
139 aux = container_of(parent, struct fq_flow, rate_node); 139 aux = rb_entry(parent, struct fq_flow, rate_node);
140 if (f->time_next_packet >= aux->time_next_packet) 140 if (f->time_next_packet >= aux->time_next_packet)
141 p = &parent->rb_right; 141 p = &parent->rb_right;
142 else 142 else
@@ -188,7 +188,7 @@ static void fq_gc(struct fq_sched_data *q,
188 while (*p) { 188 while (*p) {
189 parent = *p; 189 parent = *p;
190 190
191 f = container_of(parent, struct fq_flow, fq_node); 191 f = rb_entry(parent, struct fq_flow, fq_node);
192 if (f->sk == sk) 192 if (f->sk == sk)
193 break; 193 break;
194 194
@@ -245,7 +245,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
245 skb_orphan(skb); 245 skb_orphan(skb);
246 } 246 }
247 247
248 root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)]; 248 root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
249 249
250 if (q->flows >= (2U << q->fq_trees_log) && 250 if (q->flows >= (2U << q->fq_trees_log) &&
251 q->inactive_flows > q->flows/2) 251 q->inactive_flows > q->flows/2)
@@ -256,7 +256,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
256 while (*p) { 256 while (*p) {
257 parent = *p; 257 parent = *p;
258 258
259 f = container_of(parent, struct fq_flow, fq_node); 259 f = rb_entry(parent, struct fq_flow, fq_node);
260 if (f->sk == sk) { 260 if (f->sk == sk) {
261 /* socket might have been reallocated, so check 261 /* socket might have been reallocated, so check
262 * if its sk_hash is the same. 262 * if its sk_hash is the same.
@@ -424,7 +424,7 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now)
424 424
425 q->time_next_delayed_flow = ~0ULL; 425 q->time_next_delayed_flow = ~0ULL;
426 while ((p = rb_first(&q->delayed)) != NULL) { 426 while ((p = rb_first(&q->delayed)) != NULL) {
427 struct fq_flow *f = container_of(p, struct fq_flow, rate_node); 427 struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node);
428 428
429 if (f->time_next_packet > now) { 429 if (f->time_next_packet > now) {
430 q->time_next_delayed_flow = f->time_next_packet; 430 q->time_next_delayed_flow = f->time_next_packet;
@@ -563,7 +563,7 @@ static void fq_reset(struct Qdisc *sch)
563 for (idx = 0; idx < (1U << q->fq_trees_log); idx++) { 563 for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
564 root = &q->fq_root[idx]; 564 root = &q->fq_root[idx];
565 while ((p = rb_first(root)) != NULL) { 565 while ((p = rb_first(root)) != NULL) {
566 f = container_of(p, struct fq_flow, fq_node); 566 f = rb_entry(p, struct fq_flow, fq_node);
567 rb_erase(p, root); 567 rb_erase(p, root);
568 568
569 fq_flow_purge(f); 569 fq_flow_purge(f);
@@ -593,20 +593,20 @@ static void fq_rehash(struct fq_sched_data *q,
593 oroot = &old_array[idx]; 593 oroot = &old_array[idx];
594 while ((op = rb_first(oroot)) != NULL) { 594 while ((op = rb_first(oroot)) != NULL) {
595 rb_erase(op, oroot); 595 rb_erase(op, oroot);
596 of = container_of(op, struct fq_flow, fq_node); 596 of = rb_entry(op, struct fq_flow, fq_node);
597 if (fq_gc_candidate(of)) { 597 if (fq_gc_candidate(of)) {
598 fcnt++; 598 fcnt++;
599 kmem_cache_free(fq_flow_cachep, of); 599 kmem_cache_free(fq_flow_cachep, of);
600 continue; 600 continue;
601 } 601 }
602 nroot = &new_array[hash_32((u32)(long)of->sk, new_log)]; 602 nroot = &new_array[hash_ptr(of->sk, new_log)];
603 603
604 np = &nroot->rb_node; 604 np = &nroot->rb_node;
605 parent = NULL; 605 parent = NULL;
606 while (*np) { 606 while (*np) {
607 parent = *np; 607 parent = *np;
608 608
609 nf = container_of(parent, struct fq_flow, fq_node); 609 nf = rb_entry(parent, struct fq_flow, fq_node);
610 BUG_ON(nf->sk == of->sk); 610 BUG_ON(nf->sk == of->sk);
611 611
612 if (nf->sk > of->sk) 612 if (nf->sk > of->sk)
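Two mechanical substitutions run through the sch_fq hunks: rb_entry(), which is simply container_of() specialized for rb_node members and reads better in rbtree walks, and hash_ptr(), which hashes the full pointer value instead of the old (u32)(long) truncation. A short fragment showing both, given a struct rb_node *node, a struct sock *sk and the fq private data q from the surrounding code:

    struct fq_flow *f = rb_entry(node, struct fq_flow, rate_node);
    /* identical to: container_of(node, struct fq_flow, rate_node) */

    unsigned int bucket = hash_ptr(sk, q->fq_trees_log);    /* uses all pointer bits */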
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index a5ea0e9b6be4..9f3a884d1590 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -23,6 +23,7 @@
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <net/netlink.h> 24#include <net/netlink.h>
25#include <net/pkt_sched.h> 25#include <net/pkt_sched.h>
26#include <net/pkt_cls.h>
26#include <net/codel.h> 27#include <net/codel.h>
27#include <net/codel_impl.h> 28#include <net/codel_impl.h>
28#include <net/codel_qdisc.h> 29#include <net/codel_qdisc.h>
@@ -57,7 +58,6 @@ struct fq_codel_sched_data {
57 struct fq_codel_flow *flows; /* Flows table [flows_cnt] */ 58 struct fq_codel_flow *flows; /* Flows table [flows_cnt] */
58 u32 *backlogs; /* backlog table [flows_cnt] */ 59 u32 *backlogs; /* backlog table [flows_cnt] */
59 u32 flows_cnt; /* number of flows */ 60 u32 flows_cnt; /* number of flows */
60 u32 perturbation; /* hash perturbation */
61 u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ 61 u32 quantum; /* psched_mtu(qdisc_dev(sch)); */
62 u32 drop_batch_size; 62 u32 drop_batch_size;
63 u32 memory_limit; 63 u32 memory_limit;
@@ -75,9 +75,7 @@ struct fq_codel_sched_data {
75static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q, 75static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q,
76 struct sk_buff *skb) 76 struct sk_buff *skb)
77{ 77{
78 u32 hash = skb_get_hash_perturb(skb, q->perturbation); 78 return reciprocal_scale(skb_get_hash(skb), q->flows_cnt);
79
80 return reciprocal_scale(hash, q->flows_cnt);
81} 79}
82 80
83static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, 81static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
@@ -482,7 +480,6 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
482 q->memory_limit = 32 << 20; /* 32 MBytes */ 480 q->memory_limit = 32 << 20; /* 32 MBytes */
483 q->drop_batch_size = 64; 481 q->drop_batch_size = 64;
484 q->quantum = psched_mtu(qdisc_dev(sch)); 482 q->quantum = psched_mtu(qdisc_dev(sch));
485 q->perturbation = prandom_u32();
486 INIT_LIST_HEAD(&q->new_flows); 483 INIT_LIST_HEAD(&q->new_flows);
487 INIT_LIST_HEAD(&q->old_flows); 484 INIT_LIST_HEAD(&q->old_flows);
488 codel_params_init(&q->cparams); 485 codel_params_init(&q->cparams);
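With the perturbation field gone, fq_codel now buckets packets straight off the generic skb flow hash, scaled onto the flow table with reciprocal_scale(). A sketch of the resulting classifier, assuming only the two calls visible in the hunk:

    static unsigned int example_fq_codel_hash(struct sk_buff *skb, u32 flows_cnt)
    {
            /* skb_get_hash() returns (or computes) the 32-bit flow hash;
             * reciprocal_scale() maps it uniformly onto [0, flows_cnt).
             */
            return reciprocal_scale(skb_get_hash(skb), flows_cnt);
    }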
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6cfb6e9038c2..1a2f9e964330 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -247,7 +247,7 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets)
247 247
248void __qdisc_run(struct Qdisc *q) 248void __qdisc_run(struct Qdisc *q)
249{ 249{
250 int quota = weight_p; 250 int quota = dev_tx_weight;
251 int packets; 251 int packets;
252 252
253 while (qdisc_restart(q, &packets)) { 253 while (qdisc_restart(q, &packets)) {
@@ -709,7 +709,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
709 709
710 qdisc_put_stab(rtnl_dereference(qdisc->stab)); 710 qdisc_put_stab(rtnl_dereference(qdisc->stab));
711#endif 711#endif
712 gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); 712 gen_kill_estimator(&qdisc->rate_est);
713 if (ops->reset) 713 if (ops->reset)
714 ops->reset(qdisc); 714 ops->reset(qdisc);
715 if (ops->destroy) 715 if (ops->destroy)
@@ -794,7 +794,7 @@ static void attach_default_qdiscs(struct net_device *dev)
794 } 794 }
795 } 795 }
796#ifdef CONFIG_NET_SCHED 796#ifdef CONFIG_NET_SCHED
797 if (dev->qdisc) 797 if (dev->qdisc != &noop_qdisc)
798 qdisc_hash_add(dev->qdisc); 798 qdisc_hash_add(dev->qdisc);
799#endif 799#endif
800} 800}
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 000f1d36128e..3ffaa6fb0990 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -114,7 +114,7 @@ struct hfsc_class {
114 114
115 struct gnet_stats_basic_packed bstats; 115 struct gnet_stats_basic_packed bstats;
116 struct gnet_stats_queue qstats; 116 struct gnet_stats_queue qstats;
117 struct gnet_stats_rate_est64 rate_est; 117 struct net_rate_estimator __rcu *rate_est;
118 struct tcf_proto __rcu *filter_list; /* filter list */ 118 struct tcf_proto __rcu *filter_list; /* filter list */
119 unsigned int filter_cnt; /* filter count */ 119 unsigned int filter_cnt; /* filter count */
120 unsigned int level; /* class level in hierarchy */ 120 unsigned int level; /* class level in hierarchy */
@@ -1091,7 +1091,7 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
1091 1091
1092 tcf_destroy_chain(&cl->filter_list); 1092 tcf_destroy_chain(&cl->filter_list);
1093 qdisc_destroy(cl->qdisc); 1093 qdisc_destroy(cl->qdisc);
1094 gen_kill_estimator(&cl->bstats, &cl->rate_est); 1094 gen_kill_estimator(&cl->rate_est);
1095 if (cl != &q->root) 1095 if (cl != &q->root)
1096 kfree(cl); 1096 kfree(cl);
1097} 1097}
@@ -1348,7 +1348,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1348 xstats.rtwork = cl->cl_cumul; 1348 xstats.rtwork = cl->cl_cumul;
1349 1349
1350 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || 1350 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 ||
1351 gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || 1351 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1352 gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0) 1352 gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0)
1353 return -1; 1353 return -1;
1354 1354
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index e3d0458af17b..2fae8b5f1b80 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -627,7 +627,9 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
627 q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN * 627 q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN *
628 sizeof(u32)); 628 sizeof(u32));
629 if (!q->hhf_arrays[i]) { 629 if (!q->hhf_arrays[i]) {
630 hhf_destroy(sch); 630 /* Note: hhf_destroy() will be called
631 * by our caller.
632 */
631 return -ENOMEM; 633 return -ENOMEM;
632 } 634 }
633 } 635 }
@@ -638,7 +640,9 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
638 q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN / 640 q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN /
639 BITS_PER_BYTE); 641 BITS_PER_BYTE);
640 if (!q->hhf_valid_bits[i]) { 642 if (!q->hhf_valid_bits[i]) {
641 hhf_destroy(sch); 643 /* Note: hhf_destroy() will be called
644 * by our caller.
645 */
642 return -ENOMEM; 646 return -ENOMEM;
643 } 647 }
644 } 648 }
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index c798d0de8a9d..4cd5fb134bc9 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -40,6 +40,7 @@
40#include <net/netlink.h> 40#include <net/netlink.h>
41#include <net/sch_generic.h> 41#include <net/sch_generic.h>
42#include <net/pkt_sched.h> 42#include <net/pkt_sched.h>
43#include <net/pkt_cls.h>
43 44
44/* HTB algorithm. 45/* HTB algorithm.
45 Author: devik@cdi.cz 46 Author: devik@cdi.cz
@@ -111,7 +112,7 @@ struct htb_class {
111 unsigned int children; 112 unsigned int children;
112 struct htb_class *parent; /* parent class */ 113 struct htb_class *parent; /* parent class */
113 114
114 struct gnet_stats_rate_est64 rate_est; 115 struct net_rate_estimator __rcu *rate_est;
115 116
116 /* 117 /*
117 * Written often fields 118 * Written often fields
@@ -1145,7 +1146,7 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1145 1146
1146 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), 1147 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1147 d, NULL, &cl->bstats) < 0 || 1148 d, NULL, &cl->bstats) < 0 ||
1148 gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 || 1149 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1149 gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0) 1150 gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
1150 return -1; 1151 return -1;
1151 1152
@@ -1228,7 +1229,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1228 WARN_ON(!cl->un.leaf.q); 1229 WARN_ON(!cl->un.leaf.q);
1229 qdisc_destroy(cl->un.leaf.q); 1230 qdisc_destroy(cl->un.leaf.q);
1230 } 1231 }
1231 gen_kill_estimator(&cl->bstats, &cl->rate_est); 1232 gen_kill_estimator(&cl->rate_est);
1232 tcf_destroy_chain(&cl->filter_list); 1233 tcf_destroy_chain(&cl->filter_list);
1233 kfree(cl); 1234 kfree(cl);
1234} 1235}
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 8fe6999b642a..3bab5f66c392 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -16,6 +16,7 @@
16 16
17#include <net/netlink.h> 17#include <net/netlink.h>
18#include <net/pkt_sched.h> 18#include <net/pkt_sched.h>
19#include <net/pkt_cls.h>
19 20
20static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) 21static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg)
21{ 22{
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 2bc8d7f8df16..20b7f1646f69 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -52,7 +52,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
52 /* pre-allocate qdiscs, attachment can't fail */ 52 /* pre-allocate qdiscs, attachment can't fail */
53 priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), 53 priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
54 GFP_KERNEL); 54 GFP_KERNEL);
55 if (priv->qdiscs == NULL) 55 if (!priv->qdiscs)
56 return -ENOMEM; 56 return -ENOMEM;
57 57
58 for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { 58 for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
@@ -60,18 +60,14 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
60 qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx), 60 qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx),
61 TC_H_MAKE(TC_H_MAJ(sch->handle), 61 TC_H_MAKE(TC_H_MAJ(sch->handle),
62 TC_H_MIN(ntx + 1))); 62 TC_H_MIN(ntx + 1)));
63 if (qdisc == NULL) 63 if (!qdisc)
64 goto err; 64 return -ENOMEM;
65 priv->qdiscs[ntx] = qdisc; 65 priv->qdiscs[ntx] = qdisc;
66 qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; 66 qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
67 } 67 }
68 68
69 sch->flags |= TCQ_F_MQROOT; 69 sch->flags |= TCQ_F_MQROOT;
70 return 0; 70 return 0;
71
72err:
73 mq_destroy(sch);
74 return -ENOMEM;
75} 71}
76 72
77static void mq_attach(struct Qdisc *sch) 73static void mq_attach(struct Qdisc *sch)
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index b5c502c78143..922683418e53 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -118,10 +118,8 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
118 /* pre-allocate qdisc, attachment can't fail */ 118 /* pre-allocate qdisc, attachment can't fail */
119 priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), 119 priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
120 GFP_KERNEL); 120 GFP_KERNEL);
121 if (priv->qdiscs == NULL) { 121 if (!priv->qdiscs)
122 err = -ENOMEM; 122 return -ENOMEM;
123 goto err;
124 }
125 123
126 for (i = 0; i < dev->num_tx_queues; i++) { 124 for (i = 0; i < dev->num_tx_queues; i++) {
127 dev_queue = netdev_get_tx_queue(dev, i); 125 dev_queue = netdev_get_tx_queue(dev, i);
@@ -129,10 +127,9 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
129 get_default_qdisc_ops(dev, i), 127 get_default_qdisc_ops(dev, i),
130 TC_H_MAKE(TC_H_MAJ(sch->handle), 128 TC_H_MAKE(TC_H_MAJ(sch->handle),
131 TC_H_MIN(i + 1))); 129 TC_H_MIN(i + 1)));
132 if (qdisc == NULL) { 130 if (!qdisc)
133 err = -ENOMEM; 131 return -ENOMEM;
134 goto err; 132
135 }
136 priv->qdiscs[i] = qdisc; 133 priv->qdiscs[i] = qdisc;
137 qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; 134 qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
138 } 135 }
@@ -148,7 +145,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
148 priv->hw_owned = 1; 145 priv->hw_owned = 1;
149 err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc); 146 err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc);
150 if (err) 147 if (err)
151 goto err; 148 return err;
152 } else { 149 } else {
153 netdev_set_num_tc(dev, qopt->num_tc); 150 netdev_set_num_tc(dev, qopt->num_tc);
154 for (i = 0; i < qopt->num_tc; i++) 151 for (i = 0; i < qopt->num_tc; i++)
@@ -162,10 +159,6 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
162 159
163 sch->flags |= TCQ_F_MQROOT; 160 sch->flags |= TCQ_F_MQROOT;
164 return 0; 161 return 0;
165
166err:
167 mqprio_destroy(sch);
168 return err;
169} 162}
170 163
171static void mqprio_attach(struct Qdisc *sch) 164static void mqprio_attach(struct Qdisc *sch)
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 9ffbb025b37e..e7839a0d0eaa 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -25,7 +25,7 @@
25#include <linux/skbuff.h> 25#include <linux/skbuff.h>
26#include <net/netlink.h> 26#include <net/netlink.h>
27#include <net/pkt_sched.h> 27#include <net/pkt_sched.h>
28 28#include <net/pkt_cls.h>
29 29
30struct multiq_sched_data { 30struct multiq_sched_data {
31 u16 bands; 31 u16 bands;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 9f7b380cf0a3..c8bb62a1e744 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -152,7 +152,7 @@ struct netem_skb_cb {
152 152
153static struct sk_buff *netem_rb_to_skb(struct rb_node *rb) 153static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
154{ 154{
155 return container_of(rb, struct sk_buff, rbnode); 155 return rb_entry(rb, struct sk_buff, rbnode);
156} 156}
157 157
158static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) 158static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
@@ -626,8 +626,8 @@ deliver:
626 * If it's at ingress let's pretend the delay is 626 * If it's at ingress let's pretend the delay is
627 * from the network (tstamp will be updated). 627 * from the network (tstamp will be updated).
628 */ 628 */
629 if (G_TC_FROM(skb->tc_verd) & AT_INGRESS) 629 if (skb->tc_redirected && skb->tc_from_ingress)
630 skb->tstamp.tv64 = 0; 630 skb->tstamp = 0;
631#endif 631#endif
632 632
633 if (q->qdisc) { 633 if (q->qdisc) {
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 8f575899adfa..d4d7db267b6e 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -20,7 +20,7 @@
20#include <linux/skbuff.h> 20#include <linux/skbuff.h>
21#include <net/netlink.h> 21#include <net/netlink.h>
22#include <net/pkt_sched.h> 22#include <net/pkt_sched.h>
23 23#include <net/pkt_cls.h>
24 24
25struct prio_sched_data { 25struct prio_sched_data {
26 int bands; 26 int bands;
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index ca0516e6f743..f9e712ce2d15 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -137,7 +137,7 @@ struct qfq_class {
137 137
138 struct gnet_stats_basic_packed bstats; 138 struct gnet_stats_basic_packed bstats;
139 struct gnet_stats_queue qstats; 139 struct gnet_stats_queue qstats;
140 struct gnet_stats_rate_est64 rate_est; 140 struct net_rate_estimator __rcu *rate_est;
141 struct Qdisc *qdisc; 141 struct Qdisc *qdisc;
142 struct list_head alist; /* Link for active-classes list. */ 142 struct list_head alist; /* Link for active-classes list. */
143 struct qfq_aggregate *agg; /* Parent aggregate. */ 143 struct qfq_aggregate *agg; /* Parent aggregate. */
@@ -508,7 +508,7 @@ set_change_agg:
508 new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL); 508 new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
509 if (new_agg == NULL) { 509 if (new_agg == NULL) {
510 err = -ENOBUFS; 510 err = -ENOBUFS;
511 gen_kill_estimator(&cl->bstats, &cl->rate_est); 511 gen_kill_estimator(&cl->rate_est);
512 goto destroy_class; 512 goto destroy_class;
513 } 513 }
514 sch_tree_lock(sch); 514 sch_tree_lock(sch);
@@ -533,7 +533,7 @@ static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
533 struct qfq_sched *q = qdisc_priv(sch); 533 struct qfq_sched *q = qdisc_priv(sch);
534 534
535 qfq_rm_from_agg(q, cl); 535 qfq_rm_from_agg(q, cl);
536 gen_kill_estimator(&cl->bstats, &cl->rate_est); 536 gen_kill_estimator(&cl->rate_est);
537 qdisc_destroy(cl->qdisc); 537 qdisc_destroy(cl->qdisc);
538 kfree(cl); 538 kfree(cl);
539} 539}
@@ -667,7 +667,7 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
667 667
668 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), 668 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
669 d, NULL, &cl->bstats) < 0 || 669 d, NULL, &cl->bstats) < 0 ||
670 gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || 670 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
671 gnet_stats_copy_queue(d, NULL, 671 gnet_stats_copy_queue(d, NULL,
672 &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0) 672 &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0)
673 return -1; 673 return -1;
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 20a350bd1b1d..fe6963d21519 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -25,6 +25,7 @@
25#include <linux/jhash.h> 25#include <linux/jhash.h>
26#include <net/ip.h> 26#include <net/ip.h>
27#include <net/pkt_sched.h> 27#include <net/pkt_sched.h>
28#include <net/pkt_cls.h>
28#include <net/inet_ecn.h> 29#include <net/inet_ecn.h>
29 30
30/* 31/*
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 7f195ed4d568..42e8c8615e65 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -23,6 +23,7 @@
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <net/netlink.h> 24#include <net/netlink.h>
25#include <net/pkt_sched.h> 25#include <net/pkt_sched.h>
26#include <net/pkt_cls.h>
26#include <net/red.h> 27#include <net/red.h>
27 28
28 29
@@ -742,9 +743,10 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
742 q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor); 743 q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
743 q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows); 744 q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
744 if (!q->ht || !q->slots) { 745 if (!q->ht || !q->slots) {
745 sfq_destroy(sch); 746 /* Note: sfq_destroy() will be called by our caller */
746 return -ENOMEM; 747 return -ENOMEM;
747 } 748 }
749
748 for (i = 0; i < q->divisor; i++) 750 for (i = 0; i < q->divisor; i++)
749 q->ht[i] = SFQ_EMPTY_SLOT; 751 q->ht[i] = SFQ_EMPTY_SLOT;
750 752
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 2cd9b4478b92..9fe6b427afed 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -401,8 +401,8 @@ static int teql_master_close(struct net_device *dev)
401 return 0; 401 return 0;
402} 402}
403 403
404static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev, 404static void teql_master_stats64(struct net_device *dev,
405 struct rtnl_link_stats64 *stats) 405 struct rtnl_link_stats64 *stats)
406{ 406{
407 struct teql_master *m = netdev_priv(dev); 407 struct teql_master *m = netdev_priv(dev);
408 408
@@ -410,7 +410,6 @@ static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev,
410 stats->tx_bytes = m->tx_bytes; 410 stats->tx_bytes = m->tx_bytes;
411 stats->tx_errors = m->tx_errors; 411 stats->tx_errors = m->tx_errors;
412 stats->tx_dropped = m->tx_dropped; 412 stats->tx_dropped = m->tx_dropped;
413 return stats;
414} 413}
415 414
416static int teql_master_mtu(struct net_device *dev, int new_mtu) 415static int teql_master_mtu(struct net_device *dev, int new_mtu)
@@ -418,9 +417,6 @@ static int teql_master_mtu(struct net_device *dev, int new_mtu)
418 struct teql_master *m = netdev_priv(dev); 417 struct teql_master *m = netdev_priv(dev);
419 struct Qdisc *q; 418 struct Qdisc *q;
420 419
421 if (new_mtu < 68)
422 return -EINVAL;
423
424 q = m->slaves; 420 q = m->slaves;
425 if (q) { 421 if (q) {
426 do { 422 do {
@@ -460,6 +456,8 @@ static __init void teql_master_setup(struct net_device *dev)
460 dev->netdev_ops = &teql_netdev_ops; 456 dev->netdev_ops = &teql_netdev_ops;
461 dev->type = ARPHRD_VOID; 457 dev->type = ARPHRD_VOID;
462 dev->mtu = 1500; 458 dev->mtu = 1500;
459 dev->min_mtu = 68;
460 dev->max_mtu = 65535;
463 dev->tx_queue_len = 100; 461 dev->tx_queue_len = 100;
464 dev->flags = IFF_NOARP; 462 dev->flags = IFF_NOARP;
465 dev->hard_header_len = LL_MAX_HEADER; 463 dev->hard_header_len = LL_MAX_HEADER;
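The teql changes track two netdevice API updates: .ndo_get_stats64 now fills the caller-supplied structure and returns void, and MTU range checking moves out of the driver into the core via dev->min_mtu / dev->max_mtu. A compact sketch of both, with placeholder statistics only:

    static void example_get_stats64(struct net_device *dev,
                                    struct rtnl_link_stats64 *stats)
    {
            stats->tx_packets = 0;          /* fill in; nothing is returned */
    }

    static void example_setup(struct net_device *dev)
    {
            dev->min_mtu = 68;              /* dev_set_mtu() enforces the range, */
            dev->max_mtu = 65535;           /* so ndo_change_mtu needs no check  */
    }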
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 6c4f7496cec6..70f1b570bab9 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -11,7 +11,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
11 transport.o chunk.o sm_make_chunk.o ulpevent.o \ 11 transport.o chunk.o sm_make_chunk.o ulpevent.o \
12 inqueue.o outqueue.o ulpqueue.o \ 12 inqueue.o outqueue.o ulpqueue.o \
13 tsnmap.o bind_addr.o socket.o primitive.o \ 13 tsnmap.o bind_addr.o socket.o primitive.o \
14 output.o input.o debug.o ssnmap.o auth.o \ 14 output.o input.o debug.o stream.o auth.o \
15 offload.o 15 offload.o
16 16
17sctp_probe-y := probe.o 17sctp_probe-y := probe.o
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index f10d3397f917..a9708da28eb5 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -71,9 +71,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
71{ 71{
72 struct net *net = sock_net(sk); 72 struct net *net = sock_net(sk);
73 struct sctp_sock *sp; 73 struct sctp_sock *sp;
74 int i;
75 sctp_paramhdr_t *p; 74 sctp_paramhdr_t *p;
76 int err; 75 int i;
77 76
78 /* Retrieve the SCTP per socket area. */ 77 /* Retrieve the SCTP per socket area. */
79 sp = sctp_sk((struct sock *)sk); 78 sp = sctp_sk((struct sock *)sk);
@@ -207,6 +206,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
207 * association to the same value as the initial TSN. 206 * association to the same value as the initial TSN.
208 */ 207 */
209 asoc->addip_serial = asoc->c.initial_tsn; 208 asoc->addip_serial = asoc->c.initial_tsn;
209 asoc->strreset_outseq = asoc->c.initial_tsn;
210 210
211 INIT_LIST_HEAD(&asoc->addip_chunk_list); 211 INIT_LIST_HEAD(&asoc->addip_chunk_list);
212 INIT_LIST_HEAD(&asoc->asconf_ack_list); 212 INIT_LIST_HEAD(&asoc->asconf_ack_list);
@@ -246,6 +246,9 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
246 if (!sctp_ulpq_init(&asoc->ulpq, asoc)) 246 if (!sctp_ulpq_init(&asoc->ulpq, asoc))
247 goto fail_init; 247 goto fail_init;
248 248
249 if (sctp_stream_new(asoc, gfp))
250 goto fail_init;
251
249 /* Assume that peer would support both address types unless we are 252 /* Assume that peer would support both address types unless we are
250 * told otherwise. 253 * told otherwise.
251 */ 254 */
@@ -263,12 +266,13 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
263 266
264 /* AUTH related initializations */ 267 /* AUTH related initializations */
265 INIT_LIST_HEAD(&asoc->endpoint_shared_keys); 268 INIT_LIST_HEAD(&asoc->endpoint_shared_keys);
266 err = sctp_auth_asoc_copy_shkeys(ep, asoc, gfp); 269 if (sctp_auth_asoc_copy_shkeys(ep, asoc, gfp))
267 if (err) 270 goto stream_free;
268 goto fail_init;
269 271
270 asoc->active_key_id = ep->active_key_id; 272 asoc->active_key_id = ep->active_key_id;
271 asoc->prsctp_enable = ep->prsctp_enable; 273 asoc->prsctp_enable = ep->prsctp_enable;
274 asoc->reconf_enable = ep->reconf_enable;
275 asoc->strreset_enable = ep->strreset_enable;
272 276
273 /* Save the hmacs and chunks list into this association */ 277 /* Save the hmacs and chunks list into this association */
274 if (ep->auth_hmacs_list) 278 if (ep->auth_hmacs_list)
@@ -286,6 +290,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
286 290
287 return asoc; 291 return asoc;
288 292
293stream_free:
294 sctp_stream_free(asoc->stream);
289fail_init: 295fail_init:
290 sock_put(asoc->base.sk); 296 sock_put(asoc->base.sk);
291 sctp_endpoint_put(asoc->ep); 297 sctp_endpoint_put(asoc->ep);
@@ -358,8 +364,11 @@ void sctp_association_free(struct sctp_association *asoc)
358 364
359 sctp_tsnmap_free(&asoc->peer.tsn_map); 365 sctp_tsnmap_free(&asoc->peer.tsn_map);
360 366
361 /* Free ssnmap storage. */ 367 /* Free stream information. */
362 sctp_ssnmap_free(asoc->ssnmap); 368 sctp_stream_free(asoc->stream);
369
370 if (asoc->strreset_chunk)
371 sctp_chunk_free(asoc->strreset_chunk);
363 372
364 /* Clean up the bound address list. */ 373 /* Clean up the bound address list. */
365 sctp_bind_addr_free(&asoc->base.bind_addr); 374 sctp_bind_addr_free(&asoc->base.bind_addr);
@@ -519,6 +528,12 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc,
519 if (asoc->peer.last_data_from == peer) 528 if (asoc->peer.last_data_from == peer)
520 asoc->peer.last_data_from = transport; 529 asoc->peer.last_data_from = transport;
521 530
531 if (asoc->strreset_chunk &&
532 asoc->strreset_chunk->transport == peer) {
533 asoc->strreset_chunk->transport = transport;
534 sctp_transport_reset_reconf_timer(transport);
535 }
536
522 /* If we remove the transport an INIT was last sent to, set it to 537 /* If we remove the transport an INIT was last sent to, set it to
523 * NULL. Combined with the update of the retran path above, this 538 * NULL. Combined with the update of the retran path above, this
524 * will cause the next INIT to be sent to the next available 539 * will cause the next INIT to be sent to the next available
@@ -700,11 +715,15 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
700 /* Set the peer's active state. */ 715 /* Set the peer's active state. */
701 peer->state = peer_state; 716 peer->state = peer_state;
702 717
718 /* Add this peer into the transport hashtable */
719 if (sctp_hash_transport(peer)) {
720 sctp_transport_free(peer);
721 return NULL;
722 }
723
703 /* Attach the remote transport to our asoc. */ 724 /* Attach the remote transport to our asoc. */
704 list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list); 725 list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list);
705 asoc->peer.transport_count++; 726 asoc->peer.transport_count++;
706 /* Add this peer into the transport hashtable */
707 sctp_hash_transport(peer);
708 727
709 /* If we do not yet have a primary path, set one. */ 728 /* If we do not yet have a primary path, set one. */
710 if (!asoc->peer.primary_path) { 729 if (!asoc->peer.primary_path) {
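sctp_hash_transport() can now fail, presumably because the underlying hash-table insert can return an error, so the peer is hashed before it is linked onto the association's transport list; on failure the half-constructed transport is freed and sctp_assoc_add_peer() returns NULL instead of publishing an unhashed peer. Condensed from the hunk above, the new ordering is:

    if (sctp_hash_transport(peer)) {        /* make it findable first */
            sctp_transport_free(peer);
            return NULL;
    }
    list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list);
    asoc->peer.transport_count++;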
@@ -816,8 +835,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
816 if (transport->state != SCTP_UNCONFIRMED) 835 if (transport->state != SCTP_UNCONFIRMED)
817 transport->state = SCTP_INACTIVE; 836 transport->state = SCTP_INACTIVE;
818 else { 837 else {
819 dst_release(transport->dst); 838 sctp_transport_dst_release(transport);
820 transport->dst = NULL;
821 ulp_notify = false; 839 ulp_notify = false;
822 } 840 }
823 841
@@ -1133,7 +1151,7 @@ void sctp_assoc_update(struct sctp_association *asoc,
1133 /* Reinitialize SSN for both local streams 1151 /* Reinitialize SSN for both local streams
1134 * and peer's streams. 1152 * and peer's streams.
1135 */ 1153 */
1136 sctp_ssnmap_clear(asoc->ssnmap); 1154 sctp_stream_clear(asoc->stream);
1137 1155
1138 /* Flush the ULP reassembly and ordered queue. 1156 /* Flush the ULP reassembly and ordered queue.
1139 * Any data there will now be stale and will 1157 * Any data there will now be stale and will
@@ -1158,10 +1176,9 @@ void sctp_assoc_update(struct sctp_association *asoc,
1158 1176
1159 asoc->ctsn_ack_point = asoc->next_tsn - 1; 1177 asoc->ctsn_ack_point = asoc->next_tsn - 1;
1160 asoc->adv_peer_ack_point = asoc->ctsn_ack_point; 1178 asoc->adv_peer_ack_point = asoc->ctsn_ack_point;
1161 if (!asoc->ssnmap) { 1179 if (!asoc->stream) {
1162 /* Move the ssnmap. */ 1180 asoc->stream = new->stream;
1163 asoc->ssnmap = new->ssnmap; 1181 new->stream = NULL;
1164 new->ssnmap = NULL;
1165 } 1182 }
1166 1183
1167 if (!asoc->assoc_id) { 1184 if (!asoc->assoc_id) {
@@ -1395,7 +1412,7 @@ sctp_assoc_choose_alter_transport(struct sctp_association *asoc,
1395/* Update the association's pmtu and frag_point by going through all the 1412/* Update the association's pmtu and frag_point by going through all the
1396 * transports. This routine is called when a transport's PMTU has changed. 1413 * transports. This routine is called when a transport's PMTU has changed.
1397 */ 1414 */
1398void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc) 1415void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
1399{ 1416{
1400 struct sctp_transport *t; 1417 struct sctp_transport *t;
1401 __u32 pmtu = 0; 1418 __u32 pmtu = 0;
@@ -1407,8 +1424,8 @@ void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc)
1407 list_for_each_entry(t, &asoc->peer.transport_addr_list, 1424 list_for_each_entry(t, &asoc->peer.transport_addr_list,
1408 transports) { 1425 transports) {
1409 if (t->pmtu_pending && t->dst) { 1426 if (t->pmtu_pending && t->dst) {
1410 sctp_transport_update_pmtu(sk, t, 1427 sctp_transport_update_pmtu(
1411 SCTP_TRUNC4(dst_mtu(t->dst))); 1428 t, SCTP_TRUNC4(dst_mtu(t->dst)));
1412 t->pmtu_pending = 0; 1429 t->pmtu_pending = 0;
1413 } 1430 }
1414 if (!pmtu || (t->pathmtu < pmtu)) 1431 if (!pmtu || (t->pathmtu < pmtu))
@@ -1467,7 +1484,7 @@ void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len)
1467 * threshold. The idea is to recover slowly, but up 1484 * threshold. The idea is to recover slowly, but up
1468 * to the initial advertised window. 1485 * to the initial advertised window.
1469 */ 1486 */
1470 if (asoc->rwnd_press && asoc->rwnd >= asoc->rwnd_press) { 1487 if (asoc->rwnd_press) {
1471 int change = min(asoc->pathmtu, asoc->rwnd_press); 1488 int change = min(asoc->pathmtu, asoc->rwnd_press);
1472 asoc->rwnd += change; 1489 asoc->rwnd += change;
1473 asoc->rwnd_press -= change; 1490 asoc->rwnd_press -= change;
@@ -1535,7 +1552,7 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len)
1535 asoc->rwnd = 0; 1552 asoc->rwnd = 0;
1536 } 1553 }
1537 } else { 1554 } else {
1538 asoc->rwnd_over = len - asoc->rwnd; 1555 asoc->rwnd_over += len - asoc->rwnd;
1539 asoc->rwnd = 0; 1556 asoc->rwnd = 0;
1540 } 1557 }
1541 1558
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index 401c60750b20..1ebc184a0e23 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -292,6 +292,8 @@ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list,
292 } 292 }
293 293
294 af->from_addr_param(&addr, rawaddr, htons(port), 0); 294 af->from_addr_param(&addr, rawaddr, htons(port), 0);
295 if (sctp_bind_addr_state(bp, &addr) != -1)
296 goto next;
295 retval = sctp_add_bind_addr(bp, &addr, sizeof(addr), 297 retval = sctp_add_bind_addr(bp, &addr, sizeof(addr),
296 SCTP_ADDR_SRC, gfp); 298 SCTP_ADDR_SRC, gfp);
297 if (retval) { 299 if (retval) {
@@ -300,6 +302,7 @@ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list,
300 break; 302 break;
301 } 303 }
302 304
305next:
303 len = ntohs(param->length); 306 len = ntohs(param->length);
304 addrs_len -= len; 307 addrs_len -= len;
305 raw_addr_list += len; 308 raw_addr_list += len;
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 7a1cdf43e49d..e3621cb4827f 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -52,7 +52,6 @@ static void sctp_datamsg_init(struct sctp_datamsg *msg)
52 atomic_set(&msg->refcnt, 1); 52 atomic_set(&msg->refcnt, 1);
53 msg->send_failed = 0; 53 msg->send_failed = 0;
54 msg->send_error = 0; 54 msg->send_error = 0;
55 msg->can_abandon = 0;
56 msg->can_delay = 1; 55 msg->can_delay = 1;
57 msg->expires_at = 0; 56 msg->expires_at = 0;
58 INIT_LIST_HEAD(&msg->chunks); 57 INIT_LIST_HEAD(&msg->chunks);
@@ -166,14 +165,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
166 struct sctp_sndrcvinfo *sinfo, 165 struct sctp_sndrcvinfo *sinfo,
167 struct iov_iter *from) 166 struct iov_iter *from)
168{ 167{
169 int max, whole, i, offset, over, err; 168 size_t len, first_len, max_data, remaining;
170 int len, first_len; 169 size_t msg_len = iov_iter_count(from);
171 int max_data; 170 struct list_head *pos, *temp;
172 struct sctp_chunk *chunk; 171 struct sctp_chunk *chunk;
173 struct sctp_datamsg *msg; 172 struct sctp_datamsg *msg;
174 struct list_head *pos, *temp; 173 int err;
175 size_t msg_len = iov_iter_count(from);
176 __u8 frag;
177 174
178 msg = sctp_datamsg_new(GFP_KERNEL); 175 msg = sctp_datamsg_new(GFP_KERNEL);
179 if (!msg) 176 if (!msg)
@@ -182,20 +179,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
182 /* Note: Calculate this outside of the loop, so that all fragments 179 /* Note: Calculate this outside of the loop, so that all fragments
183 * have the same expiration. 180 * have the same expiration.
184 */ 181 */
185 if (sinfo->sinfo_timetolive) { 182 if (asoc->peer.prsctp_capable && sinfo->sinfo_timetolive &&
186 /* sinfo_timetolive is in milliseconds */ 183 (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags) ||
184 !SCTP_PR_POLICY(sinfo->sinfo_flags)))
187 msg->expires_at = jiffies + 185 msg->expires_at = jiffies +
188 msecs_to_jiffies(sinfo->sinfo_timetolive); 186 msecs_to_jiffies(sinfo->sinfo_timetolive);
189 msg->can_abandon = 1;
190
191 pr_debug("%s: msg:%p expires_at:%ld jiffies:%ld\n", __func__,
192 msg, msg->expires_at, jiffies);
193 }
194
195 if (asoc->peer.prsctp_capable &&
196 SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags))
197 msg->expires_at =
198 jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive);
199 187
200 /* This is the biggest possible DATA chunk that can fit into 188 /* This is the biggest possible DATA chunk that can fit into
201 * the packet 189 * the packet
@@ -205,7 +193,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
205 sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk); 193 sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk);
206 max_data = SCTP_TRUNC4(max_data); 194 max_data = SCTP_TRUNC4(max_data);
207 195
208 max = asoc->frag_point;
209 /* If the the peer requested that we authenticate DATA chunks 196 /* If the the peer requested that we authenticate DATA chunks
210 * we need to account for bundling of the AUTH chunks along with 197 * we need to account for bundling of the AUTH chunks along with
211 * DATA. 198 * DATA.
@@ -218,12 +205,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
218 hmac_desc->hmac_len); 205 hmac_desc->hmac_len);
219 } 206 }
220 207
221 /* Now, check if we need to reduce our max */ 208 /* Check what's our max considering the above */
222 if (max > max_data) 209 max_data = min_t(size_t, max_data, asoc->frag_point);
223 max = max_data;
224 210
225 whole = 0; 211 /* Set first_len and then account for possible bundles on first frag */
226 first_len = max; 212 first_len = max_data;
227 213
228 /* Check to see if we have a pending SACK and try to let it be bundled 214 /* Check to see if we have a pending SACK and try to let it be bundled
229 * with this message. Do this if we don't have any data queued already. 215 * with this message. Do this if we don't have any data queued already.
@@ -234,40 +220,38 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
234 if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) && 220 if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) &&
235 asoc->outqueue.out_qlen == 0 && 221 asoc->outqueue.out_qlen == 0 &&
236 list_empty(&asoc->outqueue.retransmit) && 222 list_empty(&asoc->outqueue.retransmit) &&
237 msg_len > max) 223 msg_len > max_data)
238 max_data -= SCTP_PAD4(sizeof(sctp_sack_chunk_t)); 224 first_len -= SCTP_PAD4(sizeof(sctp_sack_chunk_t));
239 225
240 /* Encourage Cookie-ECHO bundling. */ 226 /* Encourage Cookie-ECHO bundling. */
241 if (asoc->state < SCTP_STATE_COOKIE_ECHOED) 227 if (asoc->state < SCTP_STATE_COOKIE_ECHOED)
242 max_data -= SCTP_ARBITRARY_COOKIE_ECHO_LEN; 228 first_len -= SCTP_ARBITRARY_COOKIE_ECHO_LEN;
243
244 /* Now that we adjusted completely, reset first_len */
245 if (first_len > max_data)
246 first_len = max_data;
247 229
248 /* Account for a different sized first fragment */ 230 /* Account for a different sized first fragment */
249 if (msg_len >= first_len) { 231 if (msg_len >= first_len) {
250 msg_len -= first_len;
251 whole = 1;
252 msg->can_delay = 0; 232 msg->can_delay = 0;
253 }
254
255 /* How many full sized? How many bytes leftover? */
256 whole += msg_len / max;
257 over = msg_len % max;
258 offset = 0;
259
260 if ((whole > 1) || (whole && over))
261 SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS); 233 SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS);
234 } else {
235 /* Which may be the only one... */
236 first_len = msg_len;
237 }
262 238
263 /* Create chunks for all the full sized DATA chunks. */ 239 /* Create chunks for all DATA chunks. */
264 for (i = 0, len = first_len; i < whole; i++) { 240 for (remaining = msg_len; remaining; remaining -= len) {
265 frag = SCTP_DATA_MIDDLE_FRAG; 241 u8 frag = SCTP_DATA_MIDDLE_FRAG;
266 242
267 if (0 == i) 243 if (remaining == msg_len) {
244 /* First frag, which may also be the last */
268 frag |= SCTP_DATA_FIRST_FRAG; 245 frag |= SCTP_DATA_FIRST_FRAG;
246 len = first_len;
247 } else {
248 /* Middle frags */
249 len = max_data;
250 }
269 251
270 if ((i == (whole - 1)) && !over) { 252 if (len >= remaining) {
253 /* Last frag, which may also be the first */
254 len = remaining;
271 frag |= SCTP_DATA_LAST_FRAG; 255 frag |= SCTP_DATA_LAST_FRAG;
272 256
273 /* The application requests to set the I-bit of the 257 /* The application requests to set the I-bit of the
@@ -281,7 +265,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
281 265
282 chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag, 266 chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag,
283 0, GFP_KERNEL); 267 0, GFP_KERNEL);
284
285 if (!chunk) { 268 if (!chunk) {
286 err = -ENOMEM; 269 err = -ENOMEM;
287 goto errout; 270 goto errout;
@@ -292,45 +275,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
292 goto errout_chunk_free; 275 goto errout_chunk_free;
293 276
294 /* Put the chunk->skb back into the form expected by send. */ 277 /* Put the chunk->skb back into the form expected by send. */
295 __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr 278 __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr -
296 - (__u8 *)chunk->skb->data); 279 chunk->skb->data);
297
298 sctp_datamsg_assign(msg, chunk);
299 list_add_tail(&chunk->frag_list, &msg->chunks);
300
301 /* The first chunk, the first chunk was likely short
302 * to allow bundling, so reset to full size.
303 */
304 if (0 == i)
305 len = max;
306 }
307
308 /* .. now the leftover bytes. */
309 if (over) {
310 if (!whole)
311 frag = SCTP_DATA_NOT_FRAG;
312 else
313 frag = SCTP_DATA_LAST_FRAG;
314
315 if ((sinfo->sinfo_flags & SCTP_EOF) ||
316 (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY))
317 frag |= SCTP_DATA_SACK_IMM;
318
319 chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag,
320 0, GFP_KERNEL);
321
322 if (!chunk) {
323 err = -ENOMEM;
324 goto errout;
325 }
326
327 err = sctp_user_addto_chunk(chunk, over, from);
328
329 /* Put the chunk->skb back into the form expected by send. */
330 __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr
331 - (__u8 *)chunk->skb->data);
332 if (err < 0)
333 goto errout_chunk_free;
334 280
335 sctp_datamsg_assign(msg, chunk); 281 sctp_datamsg_assign(msg, chunk);
336 list_add_tail(&chunk->frag_list, &msg->chunks); 282 list_add_tail(&chunk->frag_list, &msg->chunks);
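The rewritten sctp_datamsg_from_user() collapses the separate "whole chunks" and "leftover" passes into one loop over the remaining byte count: the first fragment may be shorter (to leave room for a bundled SACK or COOKIE-ECHO), every later fragment is max_data bytes, and whichever fragment exhausts the message also carries the LAST flag. A small stand-alone model of just that arithmetic (flag values and sizes are made up for illustration):

    #include <stdio.h>

    #define FRAG_FIRST 0x1
    #define FRAG_LAST  0x2

    int main(void)
    {
            size_t msg_len = 4000, max_data = 1400, first_len = 1200;
            size_t remaining, len;

            for (remaining = msg_len; remaining; remaining -= len) {
                    int frag = 0;                   /* middle fragment by default */

                    if (remaining == msg_len) {     /* first, possibly also last */
                            frag |= FRAG_FIRST;
                            len = first_len < msg_len ? first_len : msg_len;
                    } else {
                            len = max_data;
                    }
                    if (len >= remaining) {         /* last, possibly also first */
                            len = remaining;
                            frag |= FRAG_LAST;
                    }
                    printf("DATA chunk: %zu bytes, flags 0x%x\n", len, frag);
            }
            return 0;
    }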
@@ -348,24 +294,15 @@ errout:
348 sctp_chunk_free(chunk); 294 sctp_chunk_free(chunk);
349 } 295 }
350 sctp_datamsg_put(msg); 296 sctp_datamsg_put(msg);
297
351 return ERR_PTR(err); 298 return ERR_PTR(err);
352} 299}
353 300
354/* Check whether this message has expired. */ 301/* Check whether this message has expired. */
355int sctp_chunk_abandoned(struct sctp_chunk *chunk) 302int sctp_chunk_abandoned(struct sctp_chunk *chunk)
356{ 303{
357 if (!chunk->asoc->peer.prsctp_capable || 304 if (!chunk->asoc->peer.prsctp_capable)
358 !SCTP_PR_POLICY(chunk->sinfo.sinfo_flags)) {
359 struct sctp_datamsg *msg = chunk->msg;
360
361 if (!msg->can_abandon)
362 return 0;
363
364 if (time_after(jiffies, msg->expires_at))
365 return 1;
366
367 return 0; 305 return 0;
368 }
369 306
370 if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && 307 if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) &&
371 time_after(jiffies, chunk->msg->expires_at)) { 308 time_after(jiffies, chunk->msg->expires_at)) {
@@ -378,6 +315,10 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk)
378 chunk->sent_count > chunk->sinfo.sinfo_timetolive) { 315 chunk->sent_count > chunk->sinfo.sinfo_timetolive) {
379 chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; 316 chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++;
380 return 1; 317 return 1;
318 } else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) &&
319 chunk->msg->expires_at &&
320 time_after(jiffies, chunk->msg->expires_at)) {
321 return 1;
381 } 322 }
382 /* PRIO policy is processed by sendmsg, not here */ 323 /* PRIO policy is processed by sendmsg, not here */
383 324
diff --git a/net/sctp/debug.c b/net/sctp/debug.c
index 95d7b15dad21..2e47eb2f05cb 100644
--- a/net/sctp/debug.c
+++ b/net/sctp/debug.c
@@ -159,6 +159,7 @@ static const char *const sctp_timer_tbl[] = {
159 "TIMEOUT_T4_RTO", 159 "TIMEOUT_T4_RTO",
160 "TIMEOUT_T5_SHUTDOWN_GUARD", 160 "TIMEOUT_T5_SHUTDOWN_GUARD",
161 "TIMEOUT_HEARTBEAT", 161 "TIMEOUT_HEARTBEAT",
162 "TIMEOUT_RECONF",
162 "TIMEOUT_SACK", 163 "TIMEOUT_SACK",
163 "TIMEOUT_AUTOCLOSE", 164 "TIMEOUT_AUTOCLOSE",
164}; 165};
@@ -166,7 +167,9 @@ static const char *const sctp_timer_tbl[] = {
166/* Lookup timer debug name. */ 167/* Lookup timer debug name. */
167const char *sctp_tname(const sctp_subtype_t id) 168const char *sctp_tname(const sctp_subtype_t id)
168{ 169{
169 if (id.timeout <= SCTP_EVENT_TIMEOUT_MAX) 170 BUILD_BUG_ON(SCTP_EVENT_TIMEOUT_MAX + 1 != ARRAY_SIZE(sctp_timer_tbl));
171
172 if (id.timeout < ARRAY_SIZE(sctp_timer_tbl))
170 return sctp_timer_tbl[id.timeout]; 173 return sctp_timer_tbl[id.timeout];
171 return "unknown_timer"; 174 return "unknown_timer";
172} 175}
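The debug-table fix pairs the new "TIMEOUT_RECONF" entry with a compile-time guard: BUILD_BUG_ON() ties the table length to the timeout enum, and the runtime check indexes with a strict '<' against ARRAY_SIZE() instead of '<=' against the enum maximum. The same guard pattern in isolation, using a hypothetical table and the kernel's BUILD_BUG_ON/ARRAY_SIZE macros:

    enum { EX_A, EX_B, EX_C, EX_MAX = EX_C };
    static const char *const example_tbl[] = { "A", "B", "C" };

    static const char *example_name(unsigned int id)
    {
            BUILD_BUG_ON(EX_MAX + 1 != ARRAY_SIZE(example_tbl));
            if (id < ARRAY_SIZE(example_tbl))
                    return example_tbl[id];
            return "unknown";
    }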
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 1f03065686fe..8c589230794f 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -164,6 +164,7 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
164 ep->auth_hmacs_list = auth_hmacs; 164 ep->auth_hmacs_list = auth_hmacs;
165 ep->auth_chunk_list = auth_chunks; 165 ep->auth_chunk_list = auth_chunks;
166 ep->prsctp_enable = net->sctp.prsctp_enable; 166 ep->prsctp_enable = net->sctp.prsctp_enable;
167 ep->reconf_enable = net->sctp.reconf_enable;
167 168
168 return ep; 169 return ep;
169 170
@@ -331,7 +332,9 @@ struct sctp_association *sctp_endpoint_lookup_assoc(
331 * on this endpoint. 332 * on this endpoint.
332 */ 333 */
333 if (!ep->base.bind_addr.port) 334 if (!ep->base.bind_addr.port)
334 goto out; 335 return NULL;
336
337 rcu_read_lock();
335 t = sctp_epaddr_lookup_transport(ep, paddr); 338 t = sctp_epaddr_lookup_transport(ep, paddr);
336 if (!t) 339 if (!t)
337 goto out; 340 goto out;
@@ -339,6 +342,7 @@ struct sctp_association *sctp_endpoint_lookup_assoc(
339 *transport = t; 342 *transport = t;
340 asoc = t->asoc; 343 asoc = t->asoc;
341out: 344out:
345 rcu_read_unlock();
342 return asoc; 346 return asoc;
343} 347}
344 348
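[Note on the endpointola.c hunk above] The association lookup is now bracketed by rcu_read_lock()/rcu_read_unlock(): sctp_epaddr_lookup_transport() walks an RCU-protected bucket list in the reworked transport hash (see the input.c changes below), so the returned transport may only be dereferenced inside the read-side critical section. The lookup core now reads roughly (surrounding unchanged lines paraphrased):

    if (!ep->base.bind_addr.port)
        return NULL;                        /* unbound endpoint: no assocs */

    rcu_read_lock();                        /* lookup walks an RCU hash list */
    t = sctp_epaddr_lookup_transport(ep, paddr);
    if (t) {
        *transport = t;
        asoc = t->asoc;
    }
    rcu_read_unlock();                      /* t is not dereferenced past here */
    return asoc;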
diff --git a/net/sctp/input.c b/net/sctp/input.c
index a01a56ec8b8c..0e06a278d2a9 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -401,10 +401,10 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
401 401
402 if (t->param_flags & SPP_PMTUD_ENABLE) { 402 if (t->param_flags & SPP_PMTUD_ENABLE) {
403 /* Update transports view of the MTU */ 403 /* Update transports view of the MTU */
404 sctp_transport_update_pmtu(sk, t, pmtu); 404 sctp_transport_update_pmtu(t, pmtu);
405 405
406 /* Update association pmtu. */ 406 /* Update association pmtu. */
407 sctp_assoc_sync_pmtu(sk, asoc); 407 sctp_assoc_sync_pmtu(asoc);
408 } 408 }
409 409
410 /* Retransmit with the new pmtu setting. 410 /* Retransmit with the new pmtu setting.
@@ -790,10 +790,9 @@ hit:
790 790
791/* rhashtable for transport */ 791/* rhashtable for transport */
792struct sctp_hash_cmp_arg { 792struct sctp_hash_cmp_arg {
793 const struct sctp_endpoint *ep; 793 const union sctp_addr *paddr;
794 const union sctp_addr *laddr; 794 const struct net *net;
795 const union sctp_addr *paddr; 795 u16 lport;
796 const struct net *net;
797}; 796};
798 797
799static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg, 798static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
@@ -801,7 +800,6 @@ static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
801{ 800{
802 struct sctp_transport *t = (struct sctp_transport *)ptr; 801 struct sctp_transport *t = (struct sctp_transport *)ptr;
803 const struct sctp_hash_cmp_arg *x = arg->key; 802 const struct sctp_hash_cmp_arg *x = arg->key;
804 struct sctp_association *asoc;
805 int err = 1; 803 int err = 1;
806 804
807 if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr)) 805 if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr))
@@ -809,19 +807,10 @@ static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
809 if (!sctp_transport_hold(t)) 807 if (!sctp_transport_hold(t))
810 return err; 808 return err;
811 809
812 asoc = t->asoc; 810 if (!net_eq(sock_net(t->asoc->base.sk), x->net))
813 if (!net_eq(sock_net(asoc->base.sk), x->net)) 811 goto out;
812 if (x->lport != htons(t->asoc->base.bind_addr.port))
814 goto out; 813 goto out;
815 if (x->ep) {
816 if (x->ep != asoc->ep)
817 goto out;
818 } else {
819 if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port))
820 goto out;
821 if (!sctp_bind_addr_match(&asoc->base.bind_addr,
822 x->laddr, sctp_sk(asoc->base.sk)))
823 goto out;
824 }
825 814
826 err = 0; 815 err = 0;
827out: 816out:
@@ -851,11 +840,9 @@ static inline u32 sctp_hash_key(const void *data, u32 len, u32 seed)
851 const struct sctp_hash_cmp_arg *x = data; 840 const struct sctp_hash_cmp_arg *x = data;
852 const union sctp_addr *paddr = x->paddr; 841 const union sctp_addr *paddr = x->paddr;
853 const struct net *net = x->net; 842 const struct net *net = x->net;
854 u16 lport; 843 u16 lport = x->lport;
855 u32 addr; 844 u32 addr;
856 845
857 lport = x->ep ? htons(x->ep->base.bind_addr.port) :
858 x->laddr->v4.sin_port;
859 if (paddr->sa.sa_family == AF_INET6) 846 if (paddr->sa.sa_family == AF_INET6)
860 addr = jhash(&paddr->v6.sin6_addr, 16, seed); 847 addr = jhash(&paddr->v6.sin6_addr, 16, seed);
861 else 848 else
@@ -875,29 +862,48 @@ static const struct rhashtable_params sctp_hash_params = {
875 862
876int sctp_transport_hashtable_init(void) 863int sctp_transport_hashtable_init(void)
877{ 864{
878 return rhashtable_init(&sctp_transport_hashtable, &sctp_hash_params); 865 return rhltable_init(&sctp_transport_hashtable, &sctp_hash_params);
879} 866}
880 867
881void sctp_transport_hashtable_destroy(void) 868void sctp_transport_hashtable_destroy(void)
882{ 869{
883 rhashtable_destroy(&sctp_transport_hashtable); 870 rhltable_destroy(&sctp_transport_hashtable);
884} 871}
885 872
886void sctp_hash_transport(struct sctp_transport *t) 873int sctp_hash_transport(struct sctp_transport *t)
887{ 874{
875 struct sctp_transport *transport;
876 struct rhlist_head *tmp, *list;
888 struct sctp_hash_cmp_arg arg; 877 struct sctp_hash_cmp_arg arg;
878 int err;
889 879
890 if (t->asoc->temp) 880 if (t->asoc->temp)
891 return; 881 return 0;
892 882
893 arg.ep = t->asoc->ep;
894 arg.paddr = &t->ipaddr;
895 arg.net = sock_net(t->asoc->base.sk); 883 arg.net = sock_net(t->asoc->base.sk);
884 arg.paddr = &t->ipaddr;
885 arg.lport = htons(t->asoc->base.bind_addr.port);
886
887 rcu_read_lock();
888 list = rhltable_lookup(&sctp_transport_hashtable, &arg,
889 sctp_hash_params);
890
891 rhl_for_each_entry_rcu(transport, tmp, list, node)
892 if (transport->asoc->ep == t->asoc->ep) {
893 rcu_read_unlock();
894 err = -EEXIST;
895 goto out;
896 }
897 rcu_read_unlock();
898
899 err = rhltable_insert_key(&sctp_transport_hashtable, &arg,
900 &t->node, sctp_hash_params);
901
902out:
903 if (err)
904 pr_err_once("insert transport fail, errno %d\n", err);
896 905
897reinsert: 906 return err;
898 if (rhashtable_lookup_insert_key(&sctp_transport_hashtable, &arg,
899 &t->node, sctp_hash_params) == -EBUSY)
900 goto reinsert;
901} 907}
902 908
903void sctp_unhash_transport(struct sctp_transport *t) 909void sctp_unhash_transport(struct sctp_transport *t)
@@ -905,39 +911,62 @@ void sctp_unhash_transport(struct sctp_transport *t)
905 if (t->asoc->temp) 911 if (t->asoc->temp)
906 return; 912 return;
907 913
908 rhashtable_remove_fast(&sctp_transport_hashtable, &t->node, 914 rhltable_remove(&sctp_transport_hashtable, &t->node,
909 sctp_hash_params); 915 sctp_hash_params);
910} 916}
911 917
918/* return a transport with holding it */
912struct sctp_transport *sctp_addrs_lookup_transport( 919struct sctp_transport *sctp_addrs_lookup_transport(
913 struct net *net, 920 struct net *net,
914 const union sctp_addr *laddr, 921 const union sctp_addr *laddr,
915 const union sctp_addr *paddr) 922 const union sctp_addr *paddr)
916{ 923{
924 struct rhlist_head *tmp, *list;
925 struct sctp_transport *t;
917 struct sctp_hash_cmp_arg arg = { 926 struct sctp_hash_cmp_arg arg = {
918 .ep = NULL,
919 .laddr = laddr,
920 .paddr = paddr, 927 .paddr = paddr,
921 .net = net, 928 .net = net,
929 .lport = laddr->v4.sin_port,
922 }; 930 };
923 931
924 return rhashtable_lookup_fast(&sctp_transport_hashtable, &arg, 932 list = rhltable_lookup(&sctp_transport_hashtable, &arg,
925 sctp_hash_params); 933 sctp_hash_params);
934
935 rhl_for_each_entry_rcu(t, tmp, list, node) {
936 if (!sctp_transport_hold(t))
937 continue;
938
939 if (sctp_bind_addr_match(&t->asoc->base.bind_addr,
940 laddr, sctp_sk(t->asoc->base.sk)))
941 return t;
942 sctp_transport_put(t);
943 }
944
945 return NULL;
926} 946}
927 947
948/* return a transport without holding it, as it's only used under sock lock */
928struct sctp_transport *sctp_epaddr_lookup_transport( 949struct sctp_transport *sctp_epaddr_lookup_transport(
929 const struct sctp_endpoint *ep, 950 const struct sctp_endpoint *ep,
930 const union sctp_addr *paddr) 951 const union sctp_addr *paddr)
931{ 952{
932 struct net *net = sock_net(ep->base.sk); 953 struct net *net = sock_net(ep->base.sk);
954 struct rhlist_head *tmp, *list;
955 struct sctp_transport *t;
933 struct sctp_hash_cmp_arg arg = { 956 struct sctp_hash_cmp_arg arg = {
934 .ep = ep,
935 .paddr = paddr, 957 .paddr = paddr,
936 .net = net, 958 .net = net,
959 .lport = htons(ep->base.bind_addr.port),
937 }; 960 };
938 961
939 return rhashtable_lookup_fast(&sctp_transport_hashtable, &arg, 962 list = rhltable_lookup(&sctp_transport_hashtable, &arg,
940 sctp_hash_params); 963 sctp_hash_params);
964
965 rhl_for_each_entry_rcu(t, tmp, list, node)
966 if (ep == t->asoc->ep)
967 return t;
968
969 return NULL;
941} 970}
942 971
943/* Look up an association. */ 972/* Look up an association. */
@@ -951,7 +980,7 @@ static struct sctp_association *__sctp_lookup_association(
951 struct sctp_association *asoc = NULL; 980 struct sctp_association *asoc = NULL;
952 981
953 t = sctp_addrs_lookup_transport(net, local, peer); 982 t = sctp_addrs_lookup_transport(net, local, peer);
954 if (!t || !sctp_transport_hold(t)) 983 if (!t)
955 goto out; 984 goto out;
956 985
957 asoc = t->asoc; 986 asoc = t->asoc;
@@ -1216,13 +1245,26 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net,
1216 struct sctp_association *asoc; 1245 struct sctp_association *asoc;
1217 1246
1218 asoc = __sctp_lookup_association(net, laddr, paddr, transportp); 1247 asoc = __sctp_lookup_association(net, laddr, paddr, transportp);
1248 if (asoc)
1249 goto out;
1219 1250
1220 /* Further lookup for INIT/INIT-ACK packets. 1251 /* Further lookup for INIT/INIT-ACK packets.
1221 * SCTP Implementors Guide, 2.18 Handling of address 1252 * SCTP Implementors Guide, 2.18 Handling of address
1222 * parameters within the INIT or INIT-ACK. 1253 * parameters within the INIT or INIT-ACK.
1223 */ 1254 */
1224 if (!asoc) 1255 asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp);
1225 asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp); 1256 if (asoc)
1257 goto out;
1226 1258
1259 if (paddr->sa.sa_family == AF_INET)
1260 pr_debug("sctp: asoc not found for src:%pI4:%d dst:%pI4:%d\n",
1261 &laddr->v4.sin_addr, ntohs(laddr->v4.sin_port),
1262 &paddr->v4.sin_addr, ntohs(paddr->v4.sin_port));
1263 else
1264 pr_debug("sctp: asoc not found for src:%pI6:%d dst:%pI6:%d\n",
1265 &laddr->v6.sin6_addr, ntohs(laddr->v6.sin6_port),
1266 &paddr->v6.sin6_addr, ntohs(paddr->v6.sin6_port));
1267
1268out:
1227 return asoc; 1269 return asoc;
1228} 1270}
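[Note on the input.c hunks above] The transport table is converted from an rhashtable to an rhltable, so several transports may legitimately share one (net, local port, peer address) key; duplicates within a single endpoint are rejected by an explicit RCU lookup before the insert, and sctp_hash_transport() now returns an error instead of looping on -EBUSY. Reassembled from the interleaved hunk for readability (error reporting trimmed):

    int sctp_hash_transport(struct sctp_transport *t)
    {
        struct sctp_hash_cmp_arg arg = {
            .net   = sock_net(t->asoc->base.sk),
            .paddr = &t->ipaddr,
            .lport = htons(t->asoc->base.bind_addr.port),
        };
        struct sctp_transport *cur;
        struct rhlist_head *tmp, *list;

        if (t->asoc->temp)
            return 0;

        rcu_read_lock();
        list = rhltable_lookup(&sctp_transport_hashtable, &arg,
                               sctp_hash_params);
        rhl_for_each_entry_rcu(cur, tmp, list, node)
            if (cur->asoc->ep == t->asoc->ep) {     /* same endpoint: duplicate */
                rcu_read_unlock();
                return -EEXIST;
            }
        rcu_read_unlock();

        /* key carries (net, lport, paddr); t->node links the bucket list */
        return rhltable_insert_key(&sctp_transport_hashtable, &arg,
                                   &t->node, sctp_hash_params);
    }

Lookups mirror this split: sctp_addrs_lookup_transport() walks the bucket list and takes a reference on the first transport whose bound addresses match, while sctp_epaddr_lookup_transport() matches on the owning endpoint and deliberately returns an unreferenced pointer, relying on the caller holding the socket lock (as the added comments in the hunk state).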
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 176af3080a2b..961ee59f696a 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -71,7 +71,7 @@
71#include <net/inet_ecn.h> 71#include <net/inet_ecn.h>
72#include <net/sctp/sctp.h> 72#include <net/sctp/sctp.h>
73 73
74#include <asm/uaccess.h> 74#include <linux/uaccess.h>
75 75
76static inline int sctp_v6_addr_match_len(union sctp_addr *s1, 76static inline int sctp_v6_addr_match_len(union sctp_addr *s1,
77 union sctp_addr *s2); 77 union sctp_addr *s2);
@@ -222,7 +222,8 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
222 SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS); 222 SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS);
223 223
224 rcu_read_lock(); 224 rcu_read_lock();
225 res = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt), np->tclass); 225 res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt),
226 np->tclass);
226 rcu_read_unlock(); 227 rcu_read_unlock();
227 return res; 228 return res;
228} 229}
@@ -412,22 +413,20 @@ static void sctp_v6_copy_addrlist(struct list_head *addrlist,
412static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb, 413static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb,
413 int is_saddr) 414 int is_saddr)
414{ 415{
415 __be16 *port; 416 /* Always called on head skb, so this is safe */
416 struct sctphdr *sh; 417 struct sctphdr *sh = sctp_hdr(skb);
418 struct sockaddr_in6 *sa = &addr->v6;
417 419
418 port = &addr->v6.sin6_port;
419 addr->v6.sin6_family = AF_INET6; 420 addr->v6.sin6_family = AF_INET6;
420 addr->v6.sin6_flowinfo = 0; /* FIXME */ 421 addr->v6.sin6_flowinfo = 0; /* FIXME */
421 addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif; 422 addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif;
422 423
423 /* Always called on head skb, so this is safe */
424 sh = sctp_hdr(skb);
425 if (is_saddr) { 424 if (is_saddr) {
426 *port = sh->source; 425 sa->sin6_port = sh->source;
427 addr->v6.sin6_addr = ipv6_hdr(skb)->saddr; 426 sa->sin6_addr = ipv6_hdr(skb)->saddr;
428 } else { 427 } else {
429 *port = sh->dest; 428 sa->sin6_port = sh->dest;
430 addr->v6.sin6_addr = ipv6_hdr(skb)->daddr; 429 sa->sin6_addr = ipv6_hdr(skb)->daddr;
431 } 430 }
432} 431}
433 432
@@ -641,14 +640,15 @@ static sctp_scope_t sctp_v6_scope(union sctp_addr *addr)
641 640
642/* Create and initialize a new sk for the socket to be returned by accept(). */ 641/* Create and initialize a new sk for the socket to be returned by accept(). */
643static struct sock *sctp_v6_create_accept_sk(struct sock *sk, 642static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
644 struct sctp_association *asoc) 643 struct sctp_association *asoc,
644 bool kern)
645{ 645{
646 struct sock *newsk; 646 struct sock *newsk;
647 struct ipv6_pinfo *newnp, *np = inet6_sk(sk); 647 struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
648 struct sctp6_sock *newsctp6sk; 648 struct sctp6_sock *newsctp6sk;
649 struct ipv6_txoptions *opt; 649 struct ipv6_txoptions *opt;
650 650
651 newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, 0); 651 newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, kern);
652 if (!newsk) 652 if (!newsk)
653 goto out; 653 goto out;
654 654
diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
index 40e7fac96c41..105ac3327b28 100644
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -51,7 +51,6 @@ SCTP_DBG_OBJCNT(bind_addr);
51SCTP_DBG_OBJCNT(bind_bucket); 51SCTP_DBG_OBJCNT(bind_bucket);
52SCTP_DBG_OBJCNT(chunk); 52SCTP_DBG_OBJCNT(chunk);
53SCTP_DBG_OBJCNT(addr); 53SCTP_DBG_OBJCNT(addr);
54SCTP_DBG_OBJCNT(ssnmap);
55SCTP_DBG_OBJCNT(datamsg); 54SCTP_DBG_OBJCNT(datamsg);
56SCTP_DBG_OBJCNT(keys); 55SCTP_DBG_OBJCNT(keys);
57 56
@@ -67,7 +66,6 @@ static sctp_dbg_objcnt_entry_t sctp_dbg_objcnt[] = {
67 SCTP_DBG_OBJCNT_ENTRY(bind_addr), 66 SCTP_DBG_OBJCNT_ENTRY(bind_addr),
68 SCTP_DBG_OBJCNT_ENTRY(bind_bucket), 67 SCTP_DBG_OBJCNT_ENTRY(bind_bucket),
69 SCTP_DBG_OBJCNT_ENTRY(addr), 68 SCTP_DBG_OBJCNT_ENTRY(addr),
70 SCTP_DBG_OBJCNT_ENTRY(ssnmap),
71 SCTP_DBG_OBJCNT_ENTRY(datamsg), 69 SCTP_DBG_OBJCNT_ENTRY(datamsg),
72 SCTP_DBG_OBJCNT_ENTRY(keys), 70 SCTP_DBG_OBJCNT_ENTRY(keys),
73}; 71};
diff --git a/net/sctp/offload.c b/net/sctp/offload.c
index 7e869d0cca69..4f5a2b580aa5 100644
--- a/net/sctp/offload.c
+++ b/net/sctp/offload.c
@@ -68,7 +68,7 @@ static struct sk_buff *sctp_gso_segment(struct sk_buff *skb,
68 goto out; 68 goto out;
69 } 69 }
70 70
71 segs = skb_segment(skb, features | NETIF_F_HW_CSUM); 71 segs = skb_segment(skb, features | NETIF_F_HW_CSUM | NETIF_F_SG);
72 if (IS_ERR(segs)) 72 if (IS_ERR(segs))
73 goto out; 73 goto out;
74 74
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 6cb0df859195..1409a875ad8e 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -81,56 +81,64 @@ static void sctp_packet_reset(struct sctp_packet *packet)
81/* Config a packet. 81/* Config a packet.
82 * This appears to be a followup set of initializations. 82 * This appears to be a followup set of initializations.
83 */ 83 */
84struct sctp_packet *sctp_packet_config(struct sctp_packet *packet, 84void sctp_packet_config(struct sctp_packet *packet, __u32 vtag,
85 __u32 vtag, int ecn_capable) 85 int ecn_capable)
86{ 86{
87 struct sctp_transport *tp = packet->transport; 87 struct sctp_transport *tp = packet->transport;
88 struct sctp_association *asoc = tp->asoc; 88 struct sctp_association *asoc = tp->asoc;
89 struct sock *sk;
89 90
90 pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag); 91 pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag);
91
92 packet->vtag = vtag; 92 packet->vtag = vtag;
93 93
94 if (asoc && tp->dst) { 94 /* do the following jobs only once for a flush schedule */
95 struct sock *sk = asoc->base.sk; 95 if (!sctp_packet_empty(packet))
96 96 return;
97 rcu_read_lock();
98 if (__sk_dst_get(sk) != tp->dst) {
99 dst_hold(tp->dst);
100 sk_setup_caps(sk, tp->dst);
101 }
102
103 if (sk_can_gso(sk)) {
104 struct net_device *dev = tp->dst->dev;
105 97
106 packet->max_size = dev->gso_max_size; 98 /* set packet max_size with pathmtu */
107 } else { 99 packet->max_size = tp->pathmtu;
108 packet->max_size = asoc->pathmtu; 100 if (!asoc)
109 } 101 return;
110 rcu_read_unlock();
111 102
112 } else { 103 /* update dst or transport pathmtu if in need */
113 packet->max_size = tp->pathmtu; 104 sk = asoc->base.sk;
105 if (!sctp_transport_dst_check(tp)) {
106 sctp_transport_route(tp, NULL, sctp_sk(sk));
107 if (asoc->param_flags & SPP_PMTUD_ENABLE)
108 sctp_assoc_sync_pmtu(asoc);
109 } else if (!sctp_transport_pmtu_check(tp)) {
110 if (asoc->param_flags & SPP_PMTUD_ENABLE)
111 sctp_assoc_sync_pmtu(asoc);
114 } 112 }
115 113
116 if (ecn_capable && sctp_packet_empty(packet)) { 114 /* If there a is a prepend chunk stick it on the list before
117 struct sctp_chunk *chunk; 115 * any other chunks get appended.
116 */
117 if (ecn_capable) {
118 struct sctp_chunk *chunk = sctp_get_ecne_prepend(asoc);
118 119
119 /* If there a is a prepend chunk stick it on the list before
120 * any other chunks get appended.
121 */
122 chunk = sctp_get_ecne_prepend(asoc);
123 if (chunk) 120 if (chunk)
124 sctp_packet_append_chunk(packet, chunk); 121 sctp_packet_append_chunk(packet, chunk);
125 } 122 }
126 123
127 return packet; 124 if (!tp->dst)
125 return;
126
127 /* set packet max_size with gso_max_size if gso is enabled*/
128 rcu_read_lock();
129 if (__sk_dst_get(sk) != tp->dst) {
130 dst_hold(tp->dst);
131 sk_setup_caps(sk, tp->dst);
132 }
133 packet->max_size = sk_can_gso(sk) ? tp->dst->dev->gso_max_size
134 : asoc->pathmtu;
135 rcu_read_unlock();
128} 136}
129 137
130/* Initialize the packet structure. */ 138/* Initialize the packet structure. */
131struct sctp_packet *sctp_packet_init(struct sctp_packet *packet, 139void sctp_packet_init(struct sctp_packet *packet,
132 struct sctp_transport *transport, 140 struct sctp_transport *transport,
133 __u16 sport, __u16 dport) 141 __u16 sport, __u16 dport)
134{ 142{
135 struct sctp_association *asoc = transport->asoc; 143 struct sctp_association *asoc = transport->asoc;
136 size_t overhead; 144 size_t overhead;
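[Note on the output.c hunk above] sctp_packet_config() becomes a void helper that is idempotent per flush: it returns early unless the packet is still empty, seeds max_size from the transport's pathmtu, revalidates the cached route and resyncs the association PMTU when SPP_PMTUD_ENABLE is set, and only then bumps max_size to the device gso_max_size when the socket can GSO. The post-patch function, reassembled from the interleaved columns and lightly trimmed:

    void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, int ecn_capable)
    {
        struct sctp_transport *tp = packet->transport;
        struct sctp_association *asoc = tp->asoc;
        struct sock *sk;

        packet->vtag = vtag;

        /* do the route/MTU work only once per flush schedule */
        if (!sctp_packet_empty(packet))
            return;

        packet->max_size = tp->pathmtu;      /* default: path MTU */
        if (!asoc)
            return;

        /* update dst or transport pathmtu if needed */
        sk = asoc->base.sk;
        if (!sctp_transport_dst_check(tp)) {
            sctp_transport_route(tp, NULL, sctp_sk(sk));
            if (asoc->param_flags & SPP_PMTUD_ENABLE)
                sctp_assoc_sync_pmtu(asoc);
        } else if (!sctp_transport_pmtu_check(tp)) {
            if (asoc->param_flags & SPP_PMTUD_ENABLE)
                sctp_assoc_sync_pmtu(asoc);
        }

        if (ecn_capable) {                   /* prepend a pending ECNE chunk */
            struct sctp_chunk *chunk = sctp_get_ecne_prepend(asoc);

            if (chunk)
                sctp_packet_append_chunk(packet, chunk);
        }

        if (!tp->dst)
            return;

        /* with GSO, the head skb may grow up to the device gso_max_size */
        rcu_read_lock();
        if (__sk_dst_get(sk) != tp->dst) {
            dst_hold(tp->dst);
            sk_setup_caps(sk, tp->dst);
        }
        packet->max_size = sk_can_gso(sk) ? tp->dst->dev->gso_max_size
                                          : asoc->pathmtu;
        rcu_read_unlock();
    }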
@@ -151,8 +159,6 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
151 packet->overhead = overhead; 159 packet->overhead = overhead;
152 sctp_packet_reset(packet); 160 sctp_packet_reset(packet);
153 packet->vtag = 0; 161 packet->vtag = 0;
154
155 return packet;
156} 162}
157 163
158/* Free a packet. */ 164/* Free a packet. */
@@ -181,7 +187,7 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet,
181{ 187{
182 sctp_xmit_t retval; 188 sctp_xmit_t retval;
183 189
184 pr_debug("%s: packet:%p size:%Zu chunk:%p size:%d\n", __func__, 190 pr_debug("%s: packet:%p size:%zu chunk:%p size:%d\n", __func__,
185 packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1); 191 packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1);
186 192
187 switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) { 193 switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) {
@@ -399,186 +405,72 @@ static void sctp_packet_set_owner_w(struct sk_buff *skb, struct sock *sk)
399 atomic_inc(&sk->sk_wmem_alloc); 405 atomic_inc(&sk->sk_wmem_alloc);
400} 406}
401 407
402/* All packets are sent to the network through this function from 408static int sctp_packet_pack(struct sctp_packet *packet,
403 * sctp_outq_tail(). 409 struct sk_buff *head, int gso, gfp_t gfp)
404 *
405 * The return value is a normal kernel error return value.
406 */
407int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
408{ 410{
409 struct sctp_transport *tp = packet->transport; 411 struct sctp_transport *tp = packet->transport;
410 struct sctp_association *asoc = tp->asoc; 412 struct sctp_auth_chunk *auth = NULL;
411 struct sctphdr *sh;
412 struct sk_buff *nskb = NULL, *head = NULL;
413 struct sctp_chunk *chunk, *tmp; 413 struct sctp_chunk *chunk, *tmp;
414 struct sock *sk; 414 int pkt_count = 0, pkt_size;
415 int err = 0; 415 struct sock *sk = head->sk;
416 int padding; /* How much padding do we need? */ 416 struct sk_buff *nskb;
417 int pkt_size;
418 __u8 has_data = 0;
419 int gso = 0;
420 int pktcount = 0;
421 int auth_len = 0; 417 int auth_len = 0;
422 struct dst_entry *dst;
423 unsigned char *auth = NULL; /* pointer to auth in skb data */
424
425 pr_debug("%s: packet:%p\n", __func__, packet);
426
427 /* Do NOT generate a chunkless packet. */
428 if (list_empty(&packet->chunk_list))
429 return err;
430 418
431 /* Set up convenience variables... */
432 chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
433 sk = chunk->skb->sk;
434
435 /* Allocate the head skb, or main one if not in GSO */
436 if (packet->size > tp->pathmtu && !packet->ipfragok) {
437 if (sk_can_gso(sk)) {
438 gso = 1;
439 pkt_size = packet->overhead;
440 } else {
441 /* If this happens, we trash this packet and try
442 * to build a new one, hopefully correct this
443 * time. Application may notice this error.
444 */
445 pr_err_once("Trying to GSO but underlying device doesn't support it.");
446 goto err;
447 }
448 } else {
449 pkt_size = packet->size;
450 }
451 head = alloc_skb(pkt_size + MAX_HEADER, gfp);
452 if (!head)
453 goto err;
454 if (gso) { 419 if (gso) {
455 NAPI_GRO_CB(head)->last = head;
456 skb_shinfo(head)->gso_type = sk->sk_gso_type; 420 skb_shinfo(head)->gso_type = sk->sk_gso_type;
421 NAPI_GRO_CB(head)->last = head;
422 } else {
423 nskb = head;
424 pkt_size = packet->size;
425 goto merge;
457 } 426 }
458 427
459 /* Make sure the outbound skb has enough header room reserved. */
460 skb_reserve(head, packet->overhead + MAX_HEADER);
461
462 /* Set the owning socket so that we know where to get the
463 * destination IP address.
464 */
465 sctp_packet_set_owner_w(head, sk);
466
467 if (!sctp_transport_dst_check(tp)) {
468 sctp_transport_route(tp, NULL, sctp_sk(sk));
469 if (asoc && (asoc->param_flags & SPP_PMTUD_ENABLE)) {
470 sctp_assoc_sync_pmtu(sk, asoc);
471 }
472 }
473 dst = dst_clone(tp->dst);
474 if (!dst) {
475 if (asoc)
476 IP_INC_STATS(sock_net(asoc->base.sk),
477 IPSTATS_MIB_OUTNOROUTES);
478 goto nodst;
479 }
480 skb_dst_set(head, dst);
481
482 /* Build the SCTP header. */
483 sh = (struct sctphdr *)skb_push(head, sizeof(struct sctphdr));
484 skb_reset_transport_header(head);
485 sh->source = htons(packet->source_port);
486 sh->dest = htons(packet->destination_port);
487
488 /* From 6.8 Adler-32 Checksum Calculation:
489 * After the packet is constructed (containing the SCTP common
490 * header and one or more control or DATA chunks), the
491 * transmitter shall:
492 *
493 * 1) Fill in the proper Verification Tag in the SCTP common
494 * header and initialize the checksum field to 0's.
495 */
496 sh->vtag = htonl(packet->vtag);
497 sh->checksum = 0;
498
499 pr_debug("***sctp_transmit_packet***\n");
500
501 do { 428 do {
502 /* Set up convenience variables... */ 429 /* calculate the pkt_size and alloc nskb */
503 chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list); 430 pkt_size = packet->overhead;
504 pktcount++; 431 list_for_each_entry_safe(chunk, tmp, &packet->chunk_list,
432 list) {
433 int padded = SCTP_PAD4(chunk->skb->len);
505 434
506 /* Calculate packet size, so it fits in PMTU. Leave 435 if (chunk == packet->auth)
507 * other chunks for the next packets. 436 auth_len = padded;
508 */ 437 else if (auth_len + padded + packet->overhead >
509 if (gso) { 438 tp->pathmtu)
510 pkt_size = packet->overhead; 439 return 0;
511 list_for_each_entry(chunk, &packet->chunk_list, list) { 440 else if (pkt_size + padded > tp->pathmtu)
512 int padded = SCTP_PAD4(chunk->skb->len); 441 break;
513 442 pkt_size += padded;
514 if (chunk == packet->auth)
515 auth_len = padded;
516 else if (auth_len + padded + packet->overhead >
517 tp->pathmtu)
518 goto nomem;
519 else if (pkt_size + padded > tp->pathmtu)
520 break;
521 pkt_size += padded;
522 }
523
524 /* Allocate a new skb. */
525 nskb = alloc_skb(pkt_size + MAX_HEADER, gfp);
526 if (!nskb)
527 goto nomem;
528
529 /* Make sure the outbound skb has enough header
530 * room reserved.
531 */
532 skb_reserve(nskb, packet->overhead + MAX_HEADER);
533 } else {
534 nskb = head;
535 } 443 }
444 nskb = alloc_skb(pkt_size + MAX_HEADER, gfp);
445 if (!nskb)
446 return 0;
447 skb_reserve(nskb, packet->overhead + MAX_HEADER);
536 448
537 /** 449merge:
538 * 3.2 Chunk Field Descriptions 450 /* merge chunks into nskb and append nskb into head list */
539 *
540 * The total length of a chunk (including Type, Length and
541 * Value fields) MUST be a multiple of 4 bytes. If the length
542 * of the chunk is not a multiple of 4 bytes, the sender MUST
543 * pad the chunk with all zero bytes and this padding is not
544 * included in the chunk length field. The sender should
545 * never pad with more than 3 bytes.
546 *
547 * [This whole comment explains SCTP_PAD4() below.]
548 */
549
550 pkt_size -= packet->overhead; 451 pkt_size -= packet->overhead;
551 list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { 452 list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
453 int padding;
454
552 list_del_init(&chunk->list); 455 list_del_init(&chunk->list);
553 if (sctp_chunk_is_data(chunk)) { 456 if (sctp_chunk_is_data(chunk)) {
554 /* 6.3.1 C4) When data is in flight and when allowed 457 if (!sctp_chunk_retransmitted(chunk) &&
555 * by rule C5, a new RTT measurement MUST be made each 458 !tp->rto_pending) {
556 * round trip. Furthermore, new RTT measurements
557 * SHOULD be made no more than once per round-trip
558 * for a given destination transport address.
559 */
560
561 if (!chunk->resent && !tp->rto_pending) {
562 chunk->rtt_in_progress = 1; 459 chunk->rtt_in_progress = 1;
563 tp->rto_pending = 1; 460 tp->rto_pending = 1;
564 } 461 }
565
566 has_data = 1;
567 } 462 }
568 463
569 padding = SCTP_PAD4(chunk->skb->len) - chunk->skb->len; 464 padding = SCTP_PAD4(chunk->skb->len) - chunk->skb->len;
570 if (padding) 465 if (padding)
571 memset(skb_put(chunk->skb, padding), 0, padding); 466 memset(skb_put(chunk->skb, padding), 0, padding);
572 467
573 /* if this is the auth chunk that we are adding,
574 * store pointer where it will be added and put
575 * the auth into the packet.
576 */
577 if (chunk == packet->auth) 468 if (chunk == packet->auth)
578 auth = skb_tail_pointer(nskb); 469 auth = (struct sctp_auth_chunk *)
470 skb_tail_pointer(nskb);
579 471
580 memcpy(skb_put(nskb, chunk->skb->len), 472 memcpy(skb_put(nskb, chunk->skb->len), chunk->skb->data,
581 chunk->skb->data, chunk->skb->len); 473 chunk->skb->len);
582 474
583 pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, rtt_in_progress:%d\n", 475 pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, rtt_in_progress:%d\n",
584 chunk, 476 chunk,
@@ -588,11 +480,6 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
588 ntohs(chunk->chunk_hdr->length), chunk->skb->len, 480 ntohs(chunk->chunk_hdr->length), chunk->skb->len,
589 chunk->rtt_in_progress); 481 chunk->rtt_in_progress);
590 482
591 /* If this is a control chunk, this is our last
592 * reference. Free data chunks after they've been
593 * acknowledged or have failed.
594 * Re-queue auth chunks if needed.
595 */
596 pkt_size -= SCTP_PAD4(chunk->skb->len); 483 pkt_size -= SCTP_PAD4(chunk->skb->len);
597 484
598 if (!sctp_chunk_is_data(chunk) && chunk != packet->auth) 485 if (!sctp_chunk_is_data(chunk) && chunk != packet->auth)
@@ -602,160 +489,163 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
602 break; 489 break;
603 } 490 }
604 491
605 /* SCTP-AUTH, Section 6.2 492 if (auth) {
606 * The sender MUST calculate the MAC as described in RFC2104 [2] 493 sctp_auth_calculate_hmac(tp->asoc, nskb, auth, gfp);
607 * using the hash function H as described by the MAC Identifier and 494 /* free auth if no more chunks, or add it back */
608 * the shared association key K based on the endpoint pair shared key 495 if (list_empty(&packet->chunk_list))
609 * described by the shared key identifier. The 'data' used for the 496 sctp_chunk_free(packet->auth);
610 * computation of the AUTH-chunk is given by the AUTH chunk with its 497 else
611 * HMAC field set to zero (as shown in Figure 6) followed by all
612 * chunks that are placed after the AUTH chunk in the SCTP packet.
613 */
614 if (auth)
615 sctp_auth_calculate_hmac(asoc, nskb,
616 (struct sctp_auth_chunk *)auth,
617 gfp);
618
619 if (packet->auth) {
620 if (!list_empty(&packet->chunk_list)) {
621 /* We will generate more packets, so re-queue
622 * auth chunk.
623 */
624 list_add(&packet->auth->list, 498 list_add(&packet->auth->list,
625 &packet->chunk_list); 499 &packet->chunk_list);
626 } else {
627 sctp_chunk_free(packet->auth);
628 packet->auth = NULL;
629 }
630 } 500 }
631 501
632 if (!gso) 502 if (gso) {
633 break; 503 if (skb_gro_receive(&head, nskb)) {
634 504 kfree_skb(nskb);
635 if (skb_gro_receive(&head, nskb)) { 505 return 0;
636 kfree_skb(nskb); 506 }
637 goto nomem; 507 if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >=
508 sk->sk_gso_max_segs))
509 return 0;
638 } 510 }
639 nskb = NULL; 511
640 if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >= 512 pkt_count++;
641 sk->sk_gso_max_segs))
642 goto nomem;
643 } while (!list_empty(&packet->chunk_list)); 513 } while (!list_empty(&packet->chunk_list));
644 514
645 /* 2) Calculate the Adler-32 checksum of the whole packet, 515 if (gso) {
646 * including the SCTP common header and all the 516 memset(head->cb, 0, max(sizeof(struct inet_skb_parm),
647 * chunks. 517 sizeof(struct inet6_skb_parm)));
648 * 518 skb_shinfo(head)->gso_segs = pkt_count;
649 * Note: Adler-32 is no longer applicable, as has been replaced 519 skb_shinfo(head)->gso_size = GSO_BY_FRAGS;
650 * by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>. 520 rcu_read_lock();
651 * 521 if (skb_dst(head) != tp->dst) {
652 * If it's a GSO packet, it's postponed to sctp_skb_segment. 522 dst_hold(tp->dst);
653 */ 523 sk_setup_caps(sk, tp->dst);
654 if (!sctp_checksum_disable || gso) {
655 if (!gso && (!(dst->dev->features & NETIF_F_SCTP_CRC) ||
656 dst_xfrm(dst) || packet->ipfragok)) {
657 sh->checksum = sctp_compute_cksum(head, 0);
658 } else {
659 /* no need to seed pseudo checksum for SCTP */
660 head->ip_summed = CHECKSUM_PARTIAL;
661 head->csum_start = skb_transport_header(head) - head->head;
662 head->csum_offset = offsetof(struct sctphdr, checksum);
663 } 524 }
525 rcu_read_unlock();
526 goto chksum;
664 } 527 }
665 528
666 /* IP layer ECN support 529 if (sctp_checksum_disable)
667 * From RFC 2481 530 return 1;
668 * "The ECN-Capable Transport (ECT) bit would be set by the
669 * data sender to indicate that the end-points of the
670 * transport protocol are ECN-capable."
671 *
672 * Now setting the ECT bit all the time, as it should not cause
673 * any problems protocol-wise even if our peer ignores it.
674 *
675 * Note: The works for IPv6 layer checks this bit too later
676 * in transmission. See IP6_ECN_flow_xmit().
677 */
678 tp->af_specific->ecn_capable(sk);
679 531
680 /* Set up the IP options. */ 532 if (!(skb_dst(head)->dev->features & NETIF_F_SCTP_CRC) ||
681 /* BUG: not implemented 533 dst_xfrm(skb_dst(head)) || packet->ipfragok) {
682 * For v4 this all lives somewhere in sk->sk_opt... 534 struct sctphdr *sh =
683 */ 535 (struct sctphdr *)skb_transport_header(head);
684 536
685 /* Dump that on IP! */ 537 sh->checksum = sctp_compute_cksum(head, 0);
686 if (asoc) { 538 } else {
687 asoc->stats.opackets += pktcount; 539chksum:
688 if (asoc->peer.last_sent_to != tp) 540 head->ip_summed = CHECKSUM_PARTIAL;
689 /* Considering the multiple CPU scenario, this is a 541 head->csum_start = skb_transport_header(head) - head->head;
690 * "correcter" place for last_sent_to. --xguo 542 head->csum_offset = offsetof(struct sctphdr, checksum);
691 */
692 asoc->peer.last_sent_to = tp;
693 } 543 }
694 544
695 if (has_data) { 545 return pkt_count;
696 struct timer_list *timer; 546}
697 unsigned long timeout;
698 547
699 /* Restart the AUTOCLOSE timer when sending data. */ 548/* All packets are sent to the network through this function from
700 if (sctp_state(asoc, ESTABLISHED) && 549 * sctp_outq_tail().
701 asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) { 550 *
702 timer = &asoc->timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE]; 551 * The return value is always 0 for now.
703 timeout = asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]; 552 */
553int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
554{
555 struct sctp_transport *tp = packet->transport;
556 struct sctp_association *asoc = tp->asoc;
557 struct sctp_chunk *chunk, *tmp;
558 int pkt_count, gso = 0;
559 struct dst_entry *dst;
560 struct sk_buff *head;
561 struct sctphdr *sh;
562 struct sock *sk;
563
564 pr_debug("%s: packet:%p\n", __func__, packet);
565 if (list_empty(&packet->chunk_list))
566 return 0;
567 chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
568 sk = chunk->skb->sk;
704 569
705 if (!mod_timer(timer, jiffies + timeout)) 570 /* check gso */
706 sctp_association_hold(asoc); 571 if (packet->size > tp->pathmtu && !packet->ipfragok) {
572 if (!sk_can_gso(sk)) {
573 pr_err_once("Trying to GSO but underlying device doesn't support it.");
574 goto out;
707 } 575 }
576 gso = 1;
577 }
578
579 /* alloc head skb */
580 head = alloc_skb((gso ? packet->overhead : packet->size) +
581 MAX_HEADER, gfp);
582 if (!head)
583 goto out;
584 skb_reserve(head, packet->overhead + MAX_HEADER);
585 sctp_packet_set_owner_w(head, sk);
586
587 /* set sctp header */
588 sh = (struct sctphdr *)skb_push(head, sizeof(struct sctphdr));
589 skb_reset_transport_header(head);
590 sh->source = htons(packet->source_port);
591 sh->dest = htons(packet->destination_port);
592 sh->vtag = htonl(packet->vtag);
593 sh->checksum = 0;
594
595 /* drop packet if no dst */
596 dst = dst_clone(tp->dst);
597 if (!dst) {
598 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
599 kfree_skb(head);
600 goto out;
708 } 601 }
602 skb_dst_set(head, dst);
709 603
604 /* pack up chunks */
605 pkt_count = sctp_packet_pack(packet, head, gso, gfp);
606 if (!pkt_count) {
607 kfree_skb(head);
608 goto out;
609 }
710 pr_debug("***sctp_transmit_packet*** skb->len:%d\n", head->len); 610 pr_debug("***sctp_transmit_packet*** skb->len:%d\n", head->len);
711 611
712 if (gso) { 612 /* start autoclose timer */
713 /* Cleanup our debris for IP stacks */ 613 if (packet->has_data && sctp_state(asoc, ESTABLISHED) &&
714 memset(head->cb, 0, max(sizeof(struct inet_skb_parm), 614 asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) {
715 sizeof(struct inet6_skb_parm))); 615 struct timer_list *timer =
616 &asoc->timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE];
617 unsigned long timeout =
618 asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE];
716 619
717 skb_shinfo(head)->gso_segs = pktcount; 620 if (!mod_timer(timer, jiffies + timeout))
718 skb_shinfo(head)->gso_size = GSO_BY_FRAGS; 621 sctp_association_hold(asoc);
622 }
719 623
720 /* We have to refresh this in case we are xmiting to 624 /* sctp xmit */
721 * more than one transport at a time 625 tp->af_specific->ecn_capable(sk);
722 */ 626 if (asoc) {
723 rcu_read_lock(); 627 asoc->stats.opackets += pkt_count;
724 if (__sk_dst_get(sk) != tp->dst) { 628 if (asoc->peer.last_sent_to != tp)
725 dst_hold(tp->dst); 629 asoc->peer.last_sent_to = tp;
726 sk_setup_caps(sk, tp->dst);
727 }
728 rcu_read_unlock();
729 } 630 }
730 head->ignore_df = packet->ipfragok; 631 head->ignore_df = packet->ipfragok;
731 tp->af_specific->sctp_xmit(head, tp); 632 if (tp->dst_pending_confirm)
732 goto out; 633 skb_set_dst_pending_confirm(head, 1);
733 634 /* neighbour should be confirmed on successful transmission or
734nomem: 635 * positive error
735 if (packet->auth && list_empty(&packet->auth->list))
736 sctp_chunk_free(packet->auth);
737
738nodst:
739 /* FIXME: Returning the 'err' will effect all the associations
740 * associated with a socket, although only one of the paths of the
741 * association is unreachable.
742 * The real failure of a transport or association can be passed on
743 * to the user via notifications. So setting this error may not be
744 * required.
745 */ 636 */
746 /* err = -EHOSTUNREACH; */ 637 if (tp->af_specific->sctp_xmit(head, tp) >= 0 &&
747 kfree_skb(head); 638 tp->dst_pending_confirm)
639 tp->dst_pending_confirm = 0;
748 640
749err: 641out:
750 list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { 642 list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
751 list_del_init(&chunk->list); 643 list_del_init(&chunk->list);
752 if (!sctp_chunk_is_data(chunk)) 644 if (!sctp_chunk_is_data(chunk))
753 sctp_chunk_free(chunk); 645 sctp_chunk_free(chunk);
754 } 646 }
755
756out:
757 sctp_packet_reset(packet); 647 sctp_packet_reset(packet);
758 return err; 648 return 0;
759} 649}
760 650
761/******************************************************************** 651/********************************************************************
@@ -818,18 +708,15 @@ static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet,
818 * unacknowledged. 708 * unacknowledged.
819 */ 709 */
820 710
821 if (sctp_sk(asoc->base.sk)->nodelay) 711 if ((sctp_sk(asoc->base.sk)->nodelay || inflight == 0) &&
822 /* Nagle disabled */ 712 !asoc->force_delay)
713 /* Nothing unacked */
823 return SCTP_XMIT_OK; 714 return SCTP_XMIT_OK;
824 715
825 if (!sctp_packet_empty(packet)) 716 if (!sctp_packet_empty(packet))
826 /* Append to packet */ 717 /* Append to packet */
827 return SCTP_XMIT_OK; 718 return SCTP_XMIT_OK;
828 719
829 if (inflight == 0)
830 /* Nothing unacked */
831 return SCTP_XMIT_OK;
832
833 if (!sctp_state(asoc, ESTABLISHED)) 720 if (!sctp_state(asoc, ESTABLISHED))
834 return SCTP_XMIT_OK; 721 return SCTP_XMIT_OK;
835 722
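[Note on the hunk above] sctp_packet_can_append_data() folds the separate "Nagle disabled" and "nothing in flight" checks into one condition and adds asoc->force_delay as an override, so an application-requested delay (corking) holds back small writes even when Nagle would otherwise let them through. A self-contained sketch of the resulting first gate (simplified; the real code goes on to check packet fullness and association state):

    #include <stdbool.h>
    #include <stddef.h>

    /* Illustrative only: first gate of sctp_packet_can_append_data(). */
    static bool may_send_small_write_now(bool nodelay, size_t inflight_bytes,
                                         bool force_delay)
    {
        /* Send immediately if Nagle is off or nothing is unacked,
         * unless the application explicitly asked us to delay. */
        return (nodelay || inflight_bytes == 0) && !force_delay;
    }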
@@ -871,9 +758,6 @@ static void sctp_packet_append_data(struct sctp_packet *packet,
871 rwnd = 0; 758 rwnd = 0;
872 759
873 asoc->peer.rwnd = rwnd; 760 asoc->peer.rwnd = rwnd;
874 /* Has been accepted for transmission. */
875 if (!asoc->peer.prsctp_capable)
876 chunk->msg->can_abandon = 0;
877 sctp_chunk_assign_tsn(chunk); 761 sctp_chunk_assign_tsn(chunk);
878 sctp_chunk_assign_ssn(chunk); 762 sctp_chunk_assign_ssn(chunk);
879} 763}
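[Note on the large output.c hunks above] Transmission is split into two stages: sctp_packet_pack() sizes and fills per-MTU skbs, merges them into the GSO head with skb_gro_receive(), marks the head GSO_BY_FRAGS, and decides between CRC offload and sctp_compute_cksum(); sctp_packet_transmit() is left to build the SCTP common header, attach the route, update stats and the autoclose timer, and hand the head skb to the address-family sctp_xmit() hook, clearing dst_pending_confirm on a successful send. A compressed view of the new control flow (condensed from the hunks above; header/route setup and error handling abbreviated in comments):

    int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
    {
        struct sctp_transport *tp = packet->transport;
        struct sk_buff *head;
        struct sock *sk;
        int gso = 0, pkt_count;

        if (list_empty(&packet->chunk_list))
            return 0;                       /* never send a chunkless packet */
        sk = list_first_entry(&packet->chunk_list,
                              struct sctp_chunk, list)->skb->sk;

        /* oversized packets need GSO unless IP fragmentation was requested */
        if (packet->size > tp->pathmtu && !packet->ipfragok)
            gso = 1;

        head = alloc_skb((gso ? packet->overhead : packet->size) +
                         MAX_HEADER, gfp);
        /* ... reserve headroom, push the sctphdr, attach tp->dst ... */

        pkt_count = sctp_packet_pack(packet, head, gso, gfp);
        if (!pkt_count) {
            kfree_skb(head);                /* nothing could be packed */
            return 0;
        }

        /* ... autoclose timer, ECN marking, stats, dst confirmation ... */
        tp->af_specific->sctp_xmit(head, tp);
        sctp_packet_reset(packet);
        return 0;                           /* per the new comment: always 0 */
    }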
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 582585393d35..8081476ed313 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -382,17 +382,18 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc,
382} 382}
383 383
384static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, 384static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
385 struct sctp_sndrcvinfo *sinfo, 385 struct sctp_sndrcvinfo *sinfo, int msg_len)
386 struct list_head *queue, int msg_len)
387{ 386{
387 struct sctp_outq *q = &asoc->outqueue;
388 struct sctp_chunk *chk, *temp; 388 struct sctp_chunk *chk, *temp;
389 389
390 list_for_each_entry_safe(chk, temp, queue, list) { 390 list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) {
391 if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || 391 if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
392 chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) 392 chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)
393 continue; 393 continue;
394 394
395 list_del_init(&chk->list); 395 list_del_init(&chk->list);
396 q->out_qlen -= chk->skb->len;
396 asoc->sent_cnt_removable--; 397 asoc->sent_cnt_removable--;
397 asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; 398 asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
398 399
@@ -431,9 +432,7 @@ void sctp_prsctp_prune(struct sctp_association *asoc,
431 return; 432 return;
432 } 433 }
433 434
434 sctp_prsctp_prune_unsent(asoc, sinfo, 435 sctp_prsctp_prune_unsent(asoc, sinfo, msg_len);
435 &asoc->outqueue.out_chunk_list,
436 msg_len);
437} 436}
438 437
439/* Mark all the eligible packets on a transport for retransmission. */ 438/* Mark all the eligible packets on a transport for retransmission. */
@@ -507,8 +506,6 @@ void sctp_retransmit_mark(struct sctp_outq *q,
507 transport->rto_pending = 0; 506 transport->rto_pending = 0;
508 } 507 }
509 508
510 chunk->resent = 1;
511
512 /* Move the chunk to the retransmit queue. The chunks 509 /* Move the chunk to the retransmit queue. The chunks
513 * on the retransmit queue are always kept in order. 510 * on the retransmit queue are always kept in order.
514 */ 511 */
@@ -917,22 +914,28 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
917 case SCTP_CID_ECN_ECNE: 914 case SCTP_CID_ECN_ECNE:
918 case SCTP_CID_ASCONF: 915 case SCTP_CID_ASCONF:
919 case SCTP_CID_FWD_TSN: 916 case SCTP_CID_FWD_TSN:
917 case SCTP_CID_RECONF:
920 status = sctp_packet_transmit_chunk(packet, chunk, 918 status = sctp_packet_transmit_chunk(packet, chunk,
921 one_packet, gfp); 919 one_packet, gfp);
922 if (status != SCTP_XMIT_OK) { 920 if (status != SCTP_XMIT_OK) {
923 /* put the chunk back */ 921 /* put the chunk back */
924 list_add(&chunk->list, &q->control_chunk_list); 922 list_add(&chunk->list, &q->control_chunk_list);
925 } else { 923 break;
926 asoc->stats.octrlchunks++;
927 /* PR-SCTP C5) If a FORWARD TSN is sent, the
928 * sender MUST assure that at least one T3-rtx
929 * timer is running.
930 */
931 if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) {
932 sctp_transport_reset_t3_rtx(transport);
933 transport->last_time_sent = jiffies;
934 }
935 } 924 }
925
926 asoc->stats.octrlchunks++;
927 /* PR-SCTP C5) If a FORWARD TSN is sent, the
928 * sender MUST assure that at least one T3-rtx
929 * timer is running.
930 */
931 if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) {
932 sctp_transport_reset_t3_rtx(transport);
933 transport->last_time_sent = jiffies;
934 }
935
936 if (chunk == asoc->strreset_chunk)
937 sctp_transport_reset_reconf_timer(transport);
938
936 break; 939 break;
937 940
938 default: 941 default:
@@ -1018,11 +1021,12 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
1018 1021
1019 /* Finally, transmit new packets. */ 1022 /* Finally, transmit new packets. */
1020 while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { 1023 while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
1024 __u32 sid = ntohs(chunk->subh.data_hdr->stream);
1025
1021 /* RFC 2960 6.5 Every DATA chunk MUST carry a valid 1026 /* RFC 2960 6.5 Every DATA chunk MUST carry a valid
1022 * stream identifier. 1027 * stream identifier.
1023 */ 1028 */
1024 if (chunk->sinfo.sinfo_stream >= 1029 if (chunk->sinfo.sinfo_stream >= asoc->stream->outcnt) {
1025 asoc->c.sinit_num_ostreams) {
1026 1030
1027 /* Mark as failed send. */ 1031 /* Mark as failed send. */
1028 sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM); 1032 sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM);
@@ -1040,6 +1044,11 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
1040 continue; 1044 continue;
1041 } 1045 }
1042 1046
1047 if (asoc->stream->out[sid].state == SCTP_STREAM_CLOSED) {
1048 sctp_outq_head_data(q, chunk);
1049 goto sctp_flush_out;
1050 }
1051
1043 /* If there is a specified transport, use it. 1052 /* If there is a specified transport, use it.
1044 * Otherwise, we want to use the active path. 1053 * Otherwise, we want to use the active path.
1045 */ 1054 */
@@ -1050,7 +1059,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
1050 (new_transport->state == SCTP_PF))) 1059 (new_transport->state == SCTP_PF)))
1051 new_transport = asoc->peer.active_path; 1060 new_transport = asoc->peer.active_path;
1052 if (new_transport->state == SCTP_UNCONFIRMED) { 1061 if (new_transport->state == SCTP_UNCONFIRMED) {
1053 WARN_ONCE(1, "Atempt to send packet on unconfirmed path."); 1062 WARN_ONCE(1, "Attempt to send packet on unconfirmed path.");
1054 sctp_chunk_fail(chunk, 0); 1063 sctp_chunk_fail(chunk, 0);
1055 sctp_chunk_free(chunk); 1064 sctp_chunk_free(chunk);
1056 continue; 1065 continue;
@@ -1439,7 +1448,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
1439 * instance). 1448 * instance).
1440 */ 1449 */
1441 if (!tchunk->tsn_gap_acked && 1450 if (!tchunk->tsn_gap_acked &&
1442 !tchunk->resent && 1451 !sctp_chunk_retransmitted(tchunk) &&
1443 tchunk->rtt_in_progress) { 1452 tchunk->rtt_in_progress) {
1444 tchunk->rtt_in_progress = 0; 1453 tchunk->rtt_in_progress = 0;
1445 rtt = jiffies - tchunk->sent_at; 1454 rtt = jiffies - tchunk->sent_at;
@@ -1643,7 +1652,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
1643 1652
1644 if (forward_progress) { 1653 if (forward_progress) {
1645 if (transport->dst) 1654 if (transport->dst)
1646 dst_confirm(transport->dst); 1655 sctp_transport_dst_confirm(transport);
1647 } 1656 }
1648 } 1657 }
1649 1658
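[Note on the outqueue.c hunks above] sctp_prsctp_prune_unsent() now takes the queue straight from asoc->outqueue and also subtracts the pruned chunk's length from q->out_qlen, keeping the byte accounting correct when PRIO-policy chunks are dropped before ever being sent; the flush path additionally handles RECONF control chunks, resets the reconf timer when the in-flight strreset chunk goes out, bounds the stream id against asoc->stream->outcnt, and holds data back while a stream is SCTP_STREAM_CLOSED. Condensed view of the reworked prune loop (from the hunk above; the tail of the loop lies outside the hunk and is only summarised here):

    static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
                                        struct sctp_sndrcvinfo *sinfo,
                                        int msg_len)
    {
        struct sctp_outq *q = &asoc->outqueue;
        struct sctp_chunk *chk, *temp;

        list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) {
            if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
                chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)
                continue;                     /* keep higher-priority data */

            list_del_init(&chk->list);
            q->out_qlen -= chk->skb->len;     /* keep queue accounting right */
            asoc->sent_cnt_removable--;
            asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
            /* ... free the chunk and stop once enough bytes are reclaimed
             * (unchanged context, not shown in the hunk) ... */
        }
        return msg_len;                       /* remaining bytes to reclaim */
    }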
diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c
index ab8d9f96a177..f0553a022859 100644
--- a/net/sctp/primitive.c
+++ b/net/sctp/primitive.c
@@ -211,3 +211,6 @@ DECLARE_PRIMITIVE(REQUESTHEARTBEAT);
211*/ 211*/
212 212
213DECLARE_PRIMITIVE(ASCONF); 213DECLARE_PRIMITIVE(ASCONF);
214
215/* RE-CONFIG 5.1 */
216DECLARE_PRIMITIVE(RECONF);
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 206377fe91ec..a0b29d43627f 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -361,8 +361,8 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
361 sctp_seq_dump_remote_addrs(seq, assoc); 361 sctp_seq_dump_remote_addrs(seq, assoc);
362 seq_printf(seq, "\t%8lu %5d %5d %4d %4d %4d %8d " 362 seq_printf(seq, "\t%8lu %5d %5d %4d %4d %4d %8d "
363 "%8d %8d %8d %8d", 363 "%8d %8d %8d %8d",
364 assoc->hbinterval, assoc->c.sinit_max_instreams, 364 assoc->hbinterval, assoc->stream->incnt,
365 assoc->c.sinit_num_ostreams, assoc->max_retrans, 365 assoc->stream->outcnt, assoc->max_retrans,
366 assoc->init_retries, assoc->shutdown_retries, 366 assoc->init_retries, assoc->shutdown_retries,
367 assoc->rtx_data_chunks, 367 assoc->rtx_data_chunks,
368 atomic_read(&sk->sk_wmem_alloc), 368 atomic_read(&sk->sk_wmem_alloc),
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 7b523e3f551f..989a900383b5 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -199,32 +199,40 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp,
199 sctp_scope_t scope, gfp_t gfp, int copy_flags) 199 sctp_scope_t scope, gfp_t gfp, int copy_flags)
200{ 200{
201 struct sctp_sockaddr_entry *addr; 201 struct sctp_sockaddr_entry *addr;
202 union sctp_addr laddr;
202 int error = 0; 203 int error = 0;
203 204
204 rcu_read_lock(); 205 rcu_read_lock();
205 list_for_each_entry_rcu(addr, &net->sctp.local_addr_list, list) { 206 list_for_each_entry_rcu(addr, &net->sctp.local_addr_list, list) {
206 if (!addr->valid) 207 if (!addr->valid)
207 continue; 208 continue;
208 if (sctp_in_scope(net, &addr->a, scope)) { 209 if (!sctp_in_scope(net, &addr->a, scope))
209 /* Now that the address is in scope, check to see if 210 continue;
210 * the address type is really supported by the local 211
211 * sock as well as the remote peer. 212 /* Now that the address is in scope, check to see if
212 */ 213 * the address type is really supported by the local
213 if ((((AF_INET == addr->a.sa.sa_family) && 214 * sock as well as the remote peer.
214 (copy_flags & SCTP_ADDR4_PEERSUPP))) || 215 */
215 (((AF_INET6 == addr->a.sa.sa_family) && 216 if (addr->a.sa.sa_family == AF_INET &&
216 (copy_flags & SCTP_ADDR6_ALLOWED) && 217 !(copy_flags & SCTP_ADDR4_PEERSUPP))
217 (copy_flags & SCTP_ADDR6_PEERSUPP)))) { 218 continue;
218 error = sctp_add_bind_addr(bp, &addr->a, 219 if (addr->a.sa.sa_family == AF_INET6 &&
219 sizeof(addr->a), 220 (!(copy_flags & SCTP_ADDR6_ALLOWED) ||
220 SCTP_ADDR_SRC, GFP_ATOMIC); 221 !(copy_flags & SCTP_ADDR6_PEERSUPP)))
221 if (error) 222 continue;
222 goto end_copy; 223
223 } 224 laddr = addr->a;
224 } 225 /* also works for setting ipv6 address port */
226 laddr.v4.sin_port = htons(bp->port);
227 if (sctp_bind_addr_state(bp, &laddr) != -1)
228 continue;
229
230 error = sctp_add_bind_addr(bp, &addr->a, sizeof(addr->a),
231 SCTP_ADDR_SRC, GFP_ATOMIC);
232 if (error)
233 break;
225 } 234 }
226 235
227end_copy:
228 rcu_read_unlock(); 236 rcu_read_unlock();
229 return error; 237 return error;
230} 238}
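[Note on the protocol.c hunk above] sctp_copy_local_addr_list() flattens the nested scope/family checks into early continue statements and, more importantly, skips any address already present in the bind address list: the candidate is copied into a local union sctp_addr, given the bind port, and checked with sctp_bind_addr_state() before sctp_add_bind_addr() is called. The filtering order inside the RCU loop, reassembled from the hunk:

    list_for_each_entry_rcu(addr, &net->sctp.local_addr_list, list) {
        if (!addr->valid)
            continue;
        if (!sctp_in_scope(net, &addr->a, scope))
            continue;
        if (addr->a.sa.sa_family == AF_INET &&
            !(copy_flags & SCTP_ADDR4_PEERSUPP))
            continue;                     /* peer cannot use IPv4 addresses */
        if (addr->a.sa.sa_family == AF_INET6 &&
            (!(copy_flags & SCTP_ADDR6_ALLOWED) ||
             !(copy_flags & SCTP_ADDR6_PEERSUPP)))
            continue;                     /* IPv6 not allowed or unsupported */

        laddr = addr->a;
        laddr.v4.sin_port = htons(bp->port);   /* port field is shared v4/v6 */
        if (sctp_bind_addr_state(bp, &laddr) != -1)
            continue;                     /* already bound: avoid duplicates */

        error = sctp_add_bind_addr(bp, &addr->a, sizeof(addr->a),
                                   SCTP_ADDR_SRC, GFP_ATOMIC);
        if (error)
            break;
    }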
@@ -233,23 +241,19 @@ end_copy:
233static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb, 241static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb,
234 int is_saddr) 242 int is_saddr)
235{ 243{
236 void *from; 244 /* Always called on head skb, so this is safe */
237 __be16 *port; 245 struct sctphdr *sh = sctp_hdr(skb);
238 struct sctphdr *sh; 246 struct sockaddr_in *sa = &addr->v4;
239 247
240 port = &addr->v4.sin_port;
241 addr->v4.sin_family = AF_INET; 248 addr->v4.sin_family = AF_INET;
242 249
243 /* Always called on head skb, so this is safe */
244 sh = sctp_hdr(skb);
245 if (is_saddr) { 250 if (is_saddr) {
246 *port = sh->source; 251 sa->sin_port = sh->source;
247 from = &ip_hdr(skb)->saddr; 252 sa->sin_addr.s_addr = ip_hdr(skb)->saddr;
248 } else { 253 } else {
249 *port = sh->dest; 254 sa->sin_port = sh->dest;
250 from = &ip_hdr(skb)->daddr; 255 sa->sin_addr.s_addr = ip_hdr(skb)->daddr;
251 } 256 }
252 memcpy(&addr->v4.sin_addr.s_addr, from, sizeof(struct in_addr));
253} 257}
254 258
255/* Initialize an sctp_addr from a socket. */ 259/* Initialize an sctp_addr from a socket. */
@@ -571,10 +575,11 @@ static int sctp_v4_is_ce(const struct sk_buff *skb)
571 575
572/* Create and initialize a new sk for the socket returned by accept(). */ 576/* Create and initialize a new sk for the socket returned by accept(). */
573static struct sock *sctp_v4_create_accept_sk(struct sock *sk, 577static struct sock *sctp_v4_create_accept_sk(struct sock *sk,
574 struct sctp_association *asoc) 578 struct sctp_association *asoc,
579 bool kern)
575{ 580{
576 struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL, 581 struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL,
577 sk->sk_prot, 0); 582 sk->sk_prot, kern);
578 struct inet_sock *newinet; 583 struct inet_sock *newinet;
579 584
580 if (!newsk) 585 if (!newsk)
@@ -1258,6 +1263,9 @@ static int __net_init sctp_defaults_init(struct net *net)
1258 /* Enable PR-SCTP by default. */ 1263 /* Enable PR-SCTP by default. */
1259 net->sctp.prsctp_enable = 1; 1264 net->sctp.prsctp_enable = 1;
1260 1265
1266 /* Disable RECONF by default. */
1267 net->sctp.reconf_enable = 0;
1268
1261 /* Disable AUTH by default. */ 1269 /* Disable AUTH by default. */
1262 net->sctp.auth_enable = 0; 1270 net->sctp.auth_enable = 0;
1263 1271
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 9e9690b7afe1..118faff6a332 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -270,6 +270,11 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
270 num_ext += 2; 270 num_ext += 2;
271 } 271 }
272 272
273 if (asoc->reconf_enable) {
274 extensions[num_ext] = SCTP_CID_RECONF;
275 num_ext += 1;
276 }
277
273 if (sp->adaptation_ind) 278 if (sp->adaptation_ind)
274 chunksize += sizeof(aiparam); 279 chunksize += sizeof(aiparam);
275 280
@@ -434,6 +439,11 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
434 num_ext += 2; 439 num_ext += 2;
435 } 440 }
436 441
442 if (asoc->peer.reconf_capable) {
443 extensions[num_ext] = SCTP_CID_RECONF;
444 num_ext += 1;
445 }
446
437 if (sp->adaptation_ind) 447 if (sp->adaptation_ind)
438 chunksize += sizeof(aiparam); 448 chunksize += sizeof(aiparam);
439 449
@@ -1536,7 +1546,7 @@ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk)
1536 1546
1537 /* All fragments will be on the same stream */ 1547 /* All fragments will be on the same stream */
1538 sid = ntohs(chunk->subh.data_hdr->stream); 1548 sid = ntohs(chunk->subh.data_hdr->stream);
1539 stream = &chunk->asoc->ssnmap->out; 1549 stream = chunk->asoc->stream;
1540 1550
1541 /* Now assign the sequence number to the entire message. 1551 /* Now assign the sequence number to the entire message.
1542 * All fragments must have the same stream sequence number. 1552 * All fragments must have the same stream sequence number.
@@ -1547,9 +1557,9 @@ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk)
1547 ssn = 0; 1557 ssn = 0;
1548 } else { 1558 } else {
1549 if (lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG) 1559 if (lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG)
1550 ssn = sctp_ssn_next(stream, sid); 1560 ssn = sctp_ssn_next(stream, out, sid);
1551 else 1561 else
1552 ssn = sctp_ssn_peek(stream, sid); 1562 ssn = sctp_ssn_peek(stream, out, sid);
1553 } 1563 }
1554 1564
1555 lchunk->subh.data_hdr->ssn = htons(ssn); 1565 lchunk->subh.data_hdr->ssn = htons(ssn);
@@ -1844,6 +1854,7 @@ no_hmac:
1844 retval->next_tsn = retval->c.initial_tsn; 1854 retval->next_tsn = retval->c.initial_tsn;
1845 retval->ctsn_ack_point = retval->next_tsn - 1; 1855 retval->ctsn_ack_point = retval->next_tsn - 1;
1846 retval->addip_serial = retval->c.initial_tsn; 1856 retval->addip_serial = retval->c.initial_tsn;
1857 retval->strreset_outseq = retval->c.initial_tsn;
1847 retval->adv_peer_ack_point = retval->ctsn_ack_point; 1858 retval->adv_peer_ack_point = retval->ctsn_ack_point;
1848 retval->peer.prsctp_capable = retval->c.prsctp_capable; 1859 retval->peer.prsctp_capable = retval->c.prsctp_capable;
1849 retval->peer.adaptation_ind = retval->c.adaptation_ind; 1860 retval->peer.adaptation_ind = retval->c.adaptation_ind;
@@ -2011,6 +2022,11 @@ static void sctp_process_ext_param(struct sctp_association *asoc,
2011 2022
2012 for (i = 0; i < num_ext; i++) { 2023 for (i = 0; i < num_ext; i++) {
2013 switch (param.ext->chunks[i]) { 2024 switch (param.ext->chunks[i]) {
2025 case SCTP_CID_RECONF:
2026 if (asoc->reconf_enable &&
2027 !asoc->peer.reconf_capable)
2028 asoc->peer.reconf_capable = 1;
2029 break;
2014 case SCTP_CID_FWD_TSN: 2030 case SCTP_CID_FWD_TSN:
2015 if (asoc->prsctp_enable && !asoc->peer.prsctp_capable) 2031 if (asoc->prsctp_enable && !asoc->peer.prsctp_capable)
2016 asoc->peer.prsctp_capable = 1; 2032 asoc->peer.prsctp_capable = 1;
@@ -2387,6 +2403,8 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
2387 asoc->peer.i.initial_tsn = 2403 asoc->peer.i.initial_tsn =
2388 ntohl(peer_init->init_hdr.initial_tsn); 2404 ntohl(peer_init->init_hdr.initial_tsn);
2389 2405
2406 asoc->strreset_inseq = asoc->peer.i.initial_tsn;
2407
2390 /* Apply the upper bounds for output streams based on peer's 2408 /* Apply the upper bounds for output streams based on peer's
2391 * number of inbound streams. 2409 * number of inbound streams.
2392 */ 2410 */
@@ -2442,15 +2460,10 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
2442 * association. 2460 * association.
2443 */ 2461 */
2444 if (!asoc->temp) { 2462 if (!asoc->temp) {
2445 int error; 2463 if (sctp_stream_init(asoc, gfp))
2446
2447 asoc->ssnmap = sctp_ssnmap_new(asoc->c.sinit_max_instreams,
2448 asoc->c.sinit_num_ostreams, gfp);
2449 if (!asoc->ssnmap)
2450 goto clean_up; 2464 goto clean_up;
2451 2465
2452 error = sctp_assoc_set_id(asoc, gfp); 2466 if (sctp_assoc_set_id(asoc, gfp))
2453 if (error)
2454 goto clean_up; 2467 goto clean_up;
2455 } 2468 }
2456 2469
@@ -3210,7 +3223,6 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc,
3210 union sctp_params param; 3223 union sctp_params param;
3211 sctp_addiphdr_t *hdr; 3224 sctp_addiphdr_t *hdr;
3212 union sctp_addr_param *addr_param; 3225 union sctp_addr_param *addr_param;
3213 sctp_addip_param_t *asconf_param;
3214 struct sctp_chunk *asconf_ack; 3226 struct sctp_chunk *asconf_ack;
3215 __be16 err_code; 3227 __be16 err_code;
3216 int length = 0; 3228 int length = 0;
@@ -3230,7 +3242,6 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc,
3230 * asconf parameter. 3242 * asconf parameter.
3231 */ 3243 */
3232 length = ntohs(addr_param->p.length); 3244 length = ntohs(addr_param->p.length);
3233 asconf_param = (void *)addr_param + length;
3234 chunk_len -= length; 3245 chunk_len -= length;
3235 3246
3236 /* create an ASCONF_ACK chunk. 3247 /* create an ASCONF_ACK chunk.
@@ -3317,8 +3328,7 @@ static void sctp_asconf_param_success(struct sctp_association *asoc,
3317 local_bh_enable(); 3328 local_bh_enable();
3318 list_for_each_entry(transport, &asoc->peer.transport_addr_list, 3329 list_for_each_entry(transport, &asoc->peer.transport_addr_list,
3319 transports) { 3330 transports) {
3320 dst_release(transport->dst); 3331 sctp_transport_dst_release(transport);
3321 transport->dst = NULL;
3322 } 3332 }
3323 break; 3333 break;
3324 case SCTP_PARAM_DEL_IP: 3334 case SCTP_PARAM_DEL_IP:
@@ -3332,8 +3342,7 @@ static void sctp_asconf_param_success(struct sctp_association *asoc,
3332 local_bh_enable(); 3342 local_bh_enable();
3333 list_for_each_entry(transport, &asoc->peer.transport_addr_list, 3343 list_for_each_entry(transport, &asoc->peer.transport_addr_list,
3334 transports) { 3344 transports) {
3335 dst_release(transport->dst); 3345 sctp_transport_dst_release(transport);
3336 transport->dst = NULL;
3337 } 3346 }
3338 break; 3347 break;
3339 default: 3348 default:
@@ -3526,3 +3535,323 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
3526 3535
3527 return retval; 3536 return retval;
3528} 3537}
3538
3539/* RE-CONFIG 3.1 (RE-CONFIG chunk)
3540 * 0 1 2 3
3541 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
3542 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3543 * | Type = 130 | Chunk Flags | Chunk Length |
3544 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3545 * \ \
3546 * / Re-configuration Parameter /
3547 * \ \
3548 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3549 * \ \
3550 * / Re-configuration Parameter (optional) /
3551 * \ \
3552 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3553 */
3554static struct sctp_chunk *sctp_make_reconf(
3555 const struct sctp_association *asoc,
3556 int length)
3557{
3558 struct sctp_reconf_chunk *reconf;
3559 struct sctp_chunk *retval;
3560
3561 retval = sctp_make_control(asoc, SCTP_CID_RECONF, 0, length,
3562 GFP_ATOMIC);
3563 if (!retval)
3564 return NULL;
3565
3566 reconf = (struct sctp_reconf_chunk *)retval->chunk_hdr;
3567 retval->param_hdr.v = reconf->params;
3568
3569 return retval;
3570}
3571
3572/* RE-CONFIG 4.1 (STREAM OUT RESET)
3573 * 0 1 2 3
3574 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
3575 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3576 * | Parameter Type = 13 | Parameter Length = 16 + 2 * N |
3577 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3578 * | Re-configuration Request Sequence Number |
3579 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3580 * | Re-configuration Response Sequence Number |
3581 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3582 * | Sender's Last Assigned TSN |
3583 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3584 * | Stream Number 1 (optional) | Stream Number 2 (optional) |
3585 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3586 * / ...... /
3587 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3588 * | Stream Number N-1 (optional) | Stream Number N (optional) |
3589 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3590 *
3591 * RE-CONFIG 4.2 (STREAM IN RESET)
3592 * 0 1 2 3
3593 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
3594 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3595 * | Parameter Type = 14 | Parameter Length = 8 + 2 * N |
3596 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3597 * | Re-configuration Request Sequence Number |
3598 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3599 * | Stream Number 1 (optional) | Stream Number 2 (optional) |
3600 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3601 * / ...... /
3602 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3603 * | Stream Number N-1 (optional) | Stream Number N (optional) |
3604 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3605 */
3606struct sctp_chunk *sctp_make_strreset_req(
3607 const struct sctp_association *asoc,
3608 __u16 stream_num, __u16 *stream_list,
3609 bool out, bool in)
3610{
3611 struct sctp_strreset_outreq outreq;
3612 __u16 stream_len = stream_num * 2;
3613 struct sctp_strreset_inreq inreq;
3614 struct sctp_chunk *retval;
3615 __u16 outlen, inlen;
3616
3617 outlen = (sizeof(outreq) + stream_len) * out;
3618 inlen = (sizeof(inreq) + stream_len) * in;
3619
3620 retval = sctp_make_reconf(asoc, outlen + inlen);
3621 if (!retval)
3622 return NULL;
3623
3624 if (outlen) {
3625 outreq.param_hdr.type = SCTP_PARAM_RESET_OUT_REQUEST;
3626 outreq.param_hdr.length = htons(outlen);
3627 outreq.request_seq = htonl(asoc->strreset_outseq);
3628 outreq.response_seq = htonl(asoc->strreset_inseq - 1);
3629 outreq.send_reset_at_tsn = htonl(asoc->next_tsn - 1);
3630
3631 sctp_addto_chunk(retval, sizeof(outreq), &outreq);
3632
3633 if (stream_len)
3634 sctp_addto_chunk(retval, stream_len, stream_list);
3635 }
3636
3637 if (inlen) {
3638 inreq.param_hdr.type = SCTP_PARAM_RESET_IN_REQUEST;
3639 inreq.param_hdr.length = htons(inlen);
3640 inreq.request_seq = htonl(asoc->strreset_outseq + out);
3641
3642 sctp_addto_chunk(retval, sizeof(inreq), &inreq);
3643
3644 if (stream_len)
3645 sctp_addto_chunk(retval, stream_len, stream_list);
3646 }
3647
3648 return retval;
3649}
3650
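[Editor's aside] The wire layout that sctp_make_strreset_req() produces for the 4.1 parameter above can be sketched in plain user-space C. This is a minimal sketch only: the struct is a local stand-in (not the kernel's sctp_strreset_outreq), and the sequence numbers and TSN are made-up placeholders for asoc->strreset_outseq, asoc->strreset_inseq - 1 and asoc->next_tsn - 1.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

/* Illustrative stand-in for the Outgoing SSN Reset Request layout (type 13). */
struct out_reset_req {
	uint16_t type;
	uint16_t length;
	uint32_t request_seq;
	uint32_t response_seq;
	uint32_t last_assigned_tsn;
	/* followed by 2 * N stream numbers */
};

int main(void)
{
	uint16_t streams[2] = { htons(1), htons(3) };   /* reset streams 1 and 3 */
	unsigned char buf[sizeof(struct out_reset_req) + sizeof(streams)];
	struct out_reset_req req;

	req.type = htons(13);                        /* SCTP_PARAM_RESET_OUT_REQUEST */
	req.length = htons((uint16_t)sizeof(buf));   /* 16 + 2 * N = 20 */
	req.request_seq = htonl(0x1000);             /* placeholder strreset_outseq */
	req.response_seq = htonl(0x2000 - 1);        /* placeholder strreset_inseq - 1 */
	req.last_assigned_tsn = htonl(0x3000 - 1);   /* placeholder next_tsn - 1 */

	memcpy(buf, &req, sizeof(req));
	memcpy(buf + sizeof(req), streams, sizeof(streams));

	printf("parameter length: %zu bytes\n", sizeof(buf));
	return 0;
}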
3651/* RE-CONFIG 4.3 (SSN/TSN RESET ALL)
3652 * 0 1 2 3
3653 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
3654 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3655 * | Parameter Type = 15 | Parameter Length = 8 |
3656 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3657 * | Re-configuration Request Sequence Number |
3658 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3659 */
3660struct sctp_chunk *sctp_make_strreset_tsnreq(
3661 const struct sctp_association *asoc)
3662{
3663 struct sctp_strreset_tsnreq tsnreq;
3664 __u16 length = sizeof(tsnreq);
3665 struct sctp_chunk *retval;
3666
3667 retval = sctp_make_reconf(asoc, length);
3668 if (!retval)
3669 return NULL;
3670
3671 tsnreq.param_hdr.type = SCTP_PARAM_RESET_TSN_REQUEST;
3672 tsnreq.param_hdr.length = htons(length);
3673 tsnreq.request_seq = htonl(asoc->strreset_outseq);
3674
3675 sctp_addto_chunk(retval, sizeof(tsnreq), &tsnreq);
3676
3677 return retval;
3678}
3679
3680/* RE-CONFIG 4.5/4.6 (ADD STREAM)
3681 * 0 1 2 3
3682 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
3683 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3684 * | Parameter Type = 17 | Parameter Length = 12 |
3685 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3686 * | Re-configuration Request Sequence Number |
3687 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3688 * | Number of new streams | Reserved |
3689 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3690 */
3691struct sctp_chunk *sctp_make_strreset_addstrm(
3692 const struct sctp_association *asoc,
3693 __u16 out, __u16 in)
3694{
3695 struct sctp_strreset_addstrm addstrm;
3696 __u16 size = sizeof(addstrm);
3697 struct sctp_chunk *retval;
3698
3699 retval = sctp_make_reconf(asoc, (!!out + !!in) * size);
3700 if (!retval)
3701 return NULL;
3702
3703 if (out) {
3704 addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_OUT_STREAMS;
3705 addstrm.param_hdr.length = htons(size);
3706 addstrm.number_of_streams = htons(out);
3707 addstrm.request_seq = htonl(asoc->strreset_outseq);
3708 addstrm.reserved = 0;
3709
3710 sctp_addto_chunk(retval, size, &addstrm);
3711 }
3712
3713 if (in) {
3714 addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_IN_STREAMS;
3715 addstrm.param_hdr.length = htons(size);
3716 addstrm.number_of_streams = htons(in);
3717 addstrm.request_seq = htonl(asoc->strreset_outseq + !!out);
3718 addstrm.reserved = 0;
3719
3720 sctp_addto_chunk(retval, size, &addstrm);
3721 }
3722
3723 return retval;
3724}
3725
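[Editor's aside] The (!!out + !!in) * size body length and the strreset_outseq + !!out sequence numbering above are easy to misread, so here is the same arithmetic worked through in a tiny, kernel-free sketch for a request that adds streams in both directions; the parameter size of 12 comes from the 4.5/4.6 diagram, the sequence number is a placeholder.

#include <stdio.h>

int main(void)
{
	unsigned int size = 12;        /* parameter length from the 4.5/4.6 diagram */
	unsigned int out = 2, in = 3;  /* streams to add per direction */
	unsigned int outseq = 100;     /* placeholder for asoc->strreset_outseq */

	unsigned int body = (!!out + !!in) * size;  /* 24: one parameter per direction */
	unsigned int out_seq = outseq;              /* 100 goes into the OUT parameter */
	unsigned int in_seq = outseq + !!out;       /* 101 goes into the IN parameter */

	printf("body=%u out_seq=%u in_seq=%u\n", body, out_seq, in_seq);
	return 0;
}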
3726/* RE-CONFIG 4.4 (RESP)
3727 * 0 1 2 3
3728 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
3729 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3730 * | Parameter Type = 16 | Parameter Length |
3731 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3732 * | Re-configuration Response Sequence Number |
3733 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3734 * | Result |
3735 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3736 */
3737struct sctp_chunk *sctp_make_strreset_resp(
3738 const struct sctp_association *asoc,
3739 __u32 result, __u32 sn)
3740{
3741 struct sctp_strreset_resp resp;
3742 __u16 length = sizeof(resp);
3743 struct sctp_chunk *retval;
3744
3745 retval = sctp_make_reconf(asoc, length);
3746 if (!retval)
3747 return NULL;
3748
3749 resp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE;
3750 resp.param_hdr.length = htons(length);
3751 resp.response_seq = htonl(sn);
3752 resp.result = htonl(result);
3753
3754 sctp_addto_chunk(retval, sizeof(resp), &resp);
3755
3756 return retval;
3757}
3758
3759/* RE-CONFIG 4.4 OPTIONAL (TSNRESP)
3760 * 0 1 2 3
3761 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
3762 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3763 * | Parameter Type = 16 | Parameter Length |
3764 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3765 * | Re-configuration Response Sequence Number |
3766 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3767 * | Result |
3768 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3769 * | Sender's Next TSN (optional) |
3770 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3771 * | Receiver's Next TSN (optional) |
3772 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
3773 */
3774struct sctp_chunk *sctp_make_strreset_tsnresp(
3775 struct sctp_association *asoc,
3776 __u32 result, __u32 sn,
3777 __u32 sender_tsn, __u32 receiver_tsn)
3778{
3779 struct sctp_strreset_resptsn tsnresp;
3780 __u16 length = sizeof(tsnresp);
3781 struct sctp_chunk *retval;
3782
3783 retval = sctp_make_reconf(asoc, length);
3784 if (!retval)
3785 return NULL;
3786
3787 tsnresp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE;
3788 tsnresp.param_hdr.length = htons(length);
3789
3790 tsnresp.response_seq = htonl(sn);
3791 tsnresp.result = htonl(result);
3792 tsnresp.senders_next_tsn = htonl(sender_tsn);
3793 tsnresp.receivers_next_tsn = htonl(receiver_tsn);
3794
3795 sctp_addto_chunk(retval, sizeof(tsnresp), &tsnresp);
3796
3797 return retval;
3798}
3799
3800bool sctp_verify_reconf(const struct sctp_association *asoc,
3801 struct sctp_chunk *chunk,
3802 struct sctp_paramhdr **errp)
3803{
3804 struct sctp_reconf_chunk *hdr;
3805 union sctp_params param;
3806 __u16 last = 0, cnt = 0;
3807
3808 hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
3809 sctp_walk_params(param, hdr, params) {
3810 __u16 length = ntohs(param.p->length);
3811
3812 *errp = param.p;
3813 if (cnt++ > 2)
3814 return false;
3815 switch (param.p->type) {
3816 case SCTP_PARAM_RESET_OUT_REQUEST:
3817 if (length < sizeof(struct sctp_strreset_outreq) ||
3818 (last && last != SCTP_PARAM_RESET_RESPONSE &&
3819 last != SCTP_PARAM_RESET_IN_REQUEST))
3820 return false;
3821 break;
3822 case SCTP_PARAM_RESET_IN_REQUEST:
3823 if (length < sizeof(struct sctp_strreset_inreq) ||
3824 (last && last != SCTP_PARAM_RESET_OUT_REQUEST))
3825 return false;
3826 break;
3827 case SCTP_PARAM_RESET_RESPONSE:
3828 if ((length != sizeof(struct sctp_strreset_resp) &&
3829 length != sizeof(struct sctp_strreset_resptsn)) ||
3830 (last && last != SCTP_PARAM_RESET_RESPONSE &&
3831 last != SCTP_PARAM_RESET_OUT_REQUEST))
3832 return false;
3833 break;
3834 case SCTP_PARAM_RESET_TSN_REQUEST:
3835 if (length !=
3836 sizeof(struct sctp_strreset_tsnreq) || last)
3837 return false;
3838 break;
3839 case SCTP_PARAM_RESET_ADD_IN_STREAMS:
3840 if (length != sizeof(struct sctp_strreset_addstrm) ||
3841 (last && last != SCTP_PARAM_RESET_ADD_OUT_STREAMS))
3842 return false;
3843 break;
3844 case SCTP_PARAM_RESET_ADD_OUT_STREAMS:
3845 if (length != sizeof(struct sctp_strreset_addstrm) ||
3846 (last && last != SCTP_PARAM_RESET_ADD_IN_STREAMS))
3847 return false;
3848 break;
3849 default:
3850 return false;
3851 }
3852
3853 last = param.p->type;
3854 }
3855
3856 return true;
3857}
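[Editor's aside] For readers following the checks above: sctp_verify_reconf() accepts at most three parameters per RECONF chunk (a fourth trips cnt++ > 2) and constrains which parameter may follow which. Below is a user-space distillation of just the ordering rules; the enum values are local stand-ins rather than the on-the-wire parameter types, and the per-parameter length checks are omitted.

#include <stdbool.h>
#include <stdio.h>

enum rparam { NONE, OUT_REQ, IN_REQ, TSN_REQ, ADD_OUT, ADD_IN, RESP };

/* May parameter 'cur' directly follow 'last' (NONE = first in the chunk)? */
static bool allowed_after(enum rparam cur, enum rparam last)
{
	switch (cur) {
	case OUT_REQ: return last == NONE || last == RESP || last == IN_REQ;
	case IN_REQ:  return last == NONE || last == OUT_REQ;
	case RESP:    return last == NONE || last == RESP || last == OUT_REQ;
	case TSN_REQ: return last == NONE;              /* must stand alone */
	case ADD_OUT: return last == NONE || last == ADD_IN;
	case ADD_IN:  return last == NONE || last == ADD_OUT;
	default:      return false;
	}
}

static bool verify(const enum rparam *p, int n)
{
	enum rparam last = NONE;
	int i;

	if (n > 3)                      /* a fourth parameter is rejected */
		return false;
	for (i = 0; i < n; i++) {
		if (!allowed_after(p[i], last))
			return false;
		last = p[i];
	}
	return true;
}

int main(void)
{
	enum rparam ok[]  = { OUT_REQ, IN_REQ };   /* accepted */
	enum rparam bad[] = { TSN_REQ, OUT_REQ };  /* rejected: TSN request must be alone */

	printf("ok=%d bad=%d\n", verify(ok, 2), verify(bad, 2));
	return 0;
}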
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index c345bf153bed..25384fa82ba9 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -436,6 +436,37 @@ out_unlock:
436 sctp_association_put(asoc); 436 sctp_association_put(asoc);
437} 437}
438 438
439 /* Handle the timeout of the RE-CONFIG timer. */
440void sctp_generate_reconf_event(unsigned long data)
441{
442 struct sctp_transport *transport = (struct sctp_transport *)data;
443 struct sctp_association *asoc = transport->asoc;
444 struct sock *sk = asoc->base.sk;
445 struct net *net = sock_net(sk);
446 int error = 0;
447
448 bh_lock_sock(sk);
449 if (sock_owned_by_user(sk)) {
450 pr_debug("%s: sock is busy\n", __func__);
451
452 /* Try again later. */
453 if (!mod_timer(&transport->reconf_timer, jiffies + (HZ / 20)))
454 sctp_transport_hold(transport);
455 goto out_unlock;
456 }
457
458 error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT,
459 SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_RECONF),
460 asoc->state, asoc->ep, asoc,
461 transport, GFP_ATOMIC);
462
463 if (error)
464 sk->sk_err = -error;
465
466out_unlock:
467 bh_unlock_sock(sk);
468 sctp_transport_put(transport);
469}
439 470
440/* Inject a SACK Timeout event into the state machine. */ 471/* Inject a SACK Timeout event into the state machine. */
441static void sctp_generate_sack_event(unsigned long data) 472static void sctp_generate_sack_event(unsigned long data)
@@ -453,6 +484,7 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
453 sctp_generate_t4_rto_event, 484 sctp_generate_t4_rto_event,
454 sctp_generate_t5_shutdown_guard_event, 485 sctp_generate_t5_shutdown_guard_event,
455 NULL, 486 NULL,
487 NULL,
456 sctp_generate_sack_event, 488 sctp_generate_sack_event,
457 sctp_generate_autoclose_event, 489 sctp_generate_autoclose_event,
458}; 490};
@@ -723,7 +755,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
723 * forward progress. 755 * forward progress.
724 */ 756 */
725 if (t->dst) 757 if (t->dst)
726 dst_confirm(t->dst); 758 sctp_transport_dst_confirm(t);
727 759
728 /* The receiver of the HEARTBEAT ACK should also perform an 760 /* The receiver of the HEARTBEAT ACK should also perform an
729 * RTT measurement for that destination transport address 761 * RTT measurement for that destination transport address
@@ -840,6 +872,10 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
840 if (!sctp_style(sk, UDP)) 872 if (!sctp_style(sk, UDP))
841 sk->sk_state_change(sk); 873 sk->sk_state_change(sk);
842 } 874 }
875
876 if (sctp_state(asoc, SHUTDOWN_PENDING) &&
877 !sctp_outq_is_empty(&asoc->outqueue))
878 sctp_outq_uncork(&asoc->outqueue, GFP_ATOMIC);
843} 879}
844 880
845/* Helper function to delete an association. */ 881/* Helper function to delete an association. */
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 8ec20a64a3f8..24c6ccce7539 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -160,23 +160,22 @@ static sctp_disposition_t __sctp_sf_do_9_1_abort(struct net *net,
160/* Small helper function that checks if the chunk length 160/* Small helper function that checks if the chunk length
161 * is of the appropriate length. The 'required_length' argument 161 * is of the appropriate length. The 'required_length' argument
162 * is set to be the size of a specific chunk we are testing. 162 * is set to be the size of a specific chunk we are testing.
163 * Return Values: 1 = Valid length 163 * Return Values: true = Valid length
164 * 0 = Invalid length 164 * false = Invalid length
165 * 165 *
166 */ 166 */
167static inline int 167static inline bool
168sctp_chunk_length_valid(struct sctp_chunk *chunk, 168sctp_chunk_length_valid(struct sctp_chunk *chunk, __u16 required_length)
169 __u16 required_length)
170{ 169{
171 __u16 chunk_length = ntohs(chunk->chunk_hdr->length); 170 __u16 chunk_length = ntohs(chunk->chunk_hdr->length);
172 171
173 /* Previously already marked? */ 172 /* Previously already marked? */
174 if (unlikely(chunk->pdiscard)) 173 if (unlikely(chunk->pdiscard))
175 return 0; 174 return false;
176 if (unlikely(chunk_length < required_length)) 175 if (unlikely(chunk_length < required_length))
177 return 0; 176 return false;
178 177
179 return 1; 178 return true;
180} 179}
181 180
182/********************************************************** 181/**********************************************************
@@ -1022,6 +1021,34 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(struct net *net,
1022 return SCTP_DISPOSITION_CONSUME; 1021 return SCTP_DISPOSITION_CONSUME;
1023} 1022}
1024 1023
1024/* resend asoc strreset_chunk. */
1025sctp_disposition_t sctp_sf_send_reconf(struct net *net,
1026 const struct sctp_endpoint *ep,
1027 const struct sctp_association *asoc,
1028 const sctp_subtype_t type, void *arg,
1029 sctp_cmd_seq_t *commands)
1030{
1031 struct sctp_transport *transport = arg;
1032
1033 if (asoc->overall_error_count >= asoc->max_retrans) {
1034 sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
1035 SCTP_ERROR(ETIMEDOUT));
1036 /* CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */
1037 sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
1038 SCTP_PERR(SCTP_ERROR_NO_ERROR));
1039 SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
1040 SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
1041 return SCTP_DISPOSITION_DELETE_TCB;
1042 }
1043
1044 sctp_chunk_hold(asoc->strreset_chunk);
1045 sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
1046 SCTP_CHUNK(asoc->strreset_chunk));
1047 sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE, SCTP_TRANSPORT(transport));
1048
1049 return SCTP_DISPOSITION_CONSUME;
1050}
1051
1025/* 1052/*
1026 * Process an heartbeat request. 1053 * Process an heartbeat request.
1027 * 1054 *
@@ -3237,36 +3264,34 @@ static sctp_disposition_t sctp_sf_tabort_8_4_8(struct net *net,
3237 struct sctp_chunk *abort; 3264 struct sctp_chunk *abort;
3238 3265
3239 packet = sctp_ootb_pkt_new(net, asoc, chunk); 3266 packet = sctp_ootb_pkt_new(net, asoc, chunk);
3267 if (!packet)
3268 return SCTP_DISPOSITION_NOMEM;
3240 3269
3241 if (packet) { 3270 /* Make an ABORT. The T bit will be set if the asoc
3242 /* Make an ABORT. The T bit will be set if the asoc 3271 * is NULL.
3243 * is NULL. 3272 */
3244 */ 3273 abort = sctp_make_abort(asoc, chunk, 0);
3245 abort = sctp_make_abort(asoc, chunk, 0); 3274 if (!abort) {
3246 if (!abort) { 3275 sctp_ootb_pkt_free(packet);
3247 sctp_ootb_pkt_free(packet); 3276 return SCTP_DISPOSITION_NOMEM;
3248 return SCTP_DISPOSITION_NOMEM; 3277 }
3249 }
3250
3251 /* Reflect vtag if T-Bit is set */
3252 if (sctp_test_T_bit(abort))
3253 packet->vtag = ntohl(chunk->sctp_hdr->vtag);
3254 3278
3255 /* Set the skb to the belonging sock for accounting. */ 3279 /* Reflect vtag if T-Bit is set */
3256 abort->skb->sk = ep->base.sk; 3280 if (sctp_test_T_bit(abort))
3281 packet->vtag = ntohl(chunk->sctp_hdr->vtag);
3257 3282
3258 sctp_packet_append_chunk(packet, abort); 3283 /* Set the skb to the belonging sock for accounting. */
3284 abort->skb->sk = ep->base.sk;
3259 3285
3260 sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, 3286 sctp_packet_append_chunk(packet, abort);
3261 SCTP_PACKET(packet));
3262 3287
3263 SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); 3288 sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
3289 SCTP_PACKET(packet));
3264 3290
3265 sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); 3291 SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
3266 return SCTP_DISPOSITION_CONSUME;
3267 }
3268 3292
3269 return SCTP_DISPOSITION_NOMEM; 3293 sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
3294 return SCTP_DISPOSITION_CONSUME;
3270} 3295}
3271 3296
3272/* 3297/*
@@ -3503,45 +3528,43 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(struct net *net,
3503 struct sctp_chunk *shut; 3528 struct sctp_chunk *shut;
3504 3529
3505 packet = sctp_ootb_pkt_new(net, asoc, chunk); 3530 packet = sctp_ootb_pkt_new(net, asoc, chunk);
3531 if (!packet)
3532 return SCTP_DISPOSITION_NOMEM;
3506 3533
3507 if (packet) { 3534 /* Make an SHUTDOWN_COMPLETE.
3508 /* Make an SHUTDOWN_COMPLETE. 3535 * The T bit will be set if the asoc is NULL.
3509 * The T bit will be set if the asoc is NULL. 3536 */
3510 */ 3537 shut = sctp_make_shutdown_complete(asoc, chunk);
3511 shut = sctp_make_shutdown_complete(asoc, chunk); 3538 if (!shut) {
3512 if (!shut) { 3539 sctp_ootb_pkt_free(packet);
3513 sctp_ootb_pkt_free(packet); 3540 return SCTP_DISPOSITION_NOMEM;
3514 return SCTP_DISPOSITION_NOMEM; 3541 }
3515 }
3516
3517 /* Reflect vtag if T-Bit is set */
3518 if (sctp_test_T_bit(shut))
3519 packet->vtag = ntohl(chunk->sctp_hdr->vtag);
3520 3542
3521 /* Set the skb to the belonging sock for accounting. */ 3543 /* Reflect vtag if T-Bit is set */
3522 shut->skb->sk = ep->base.sk; 3544 if (sctp_test_T_bit(shut))
3545 packet->vtag = ntohl(chunk->sctp_hdr->vtag);
3523 3546
3524 sctp_packet_append_chunk(packet, shut); 3547 /* Set the skb to the belonging sock for accounting. */
3548 shut->skb->sk = ep->base.sk;
3525 3549
3526 sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, 3550 sctp_packet_append_chunk(packet, shut);
3527 SCTP_PACKET(packet));
3528 3551
3529 SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); 3552 sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
3553 SCTP_PACKET(packet));
3530 3554
3531 /* If the chunk length is invalid, we don't want to process 3555 SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
3532 * the rest of the packet.
3533 */
3534 if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
3535 return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
3536 3556
3537 /* We need to discard the rest of the packet to prevent 3557 /* If the chunk length is invalid, we don't want to process
3538 * potential bombing attacks from additional bundled chunks. 3558 * the rest of the packet.
3539 * This is documented in SCTP Threats ID. 3559 */
3540 */ 3560 if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
3541 return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); 3561 return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
3542 }
3543 3562
3544 return SCTP_DISPOSITION_NOMEM; 3563 /* We need to discard the rest of the packet to prevent
 3564 * potential bombing attacks from additional bundled chunks.
3565 * This is documented in SCTP Threats ID.
3566 */
3567 return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
3545} 3568}
3546 3569
3547/* 3570/*
@@ -3811,6 +3834,60 @@ sctp_disposition_t sctp_sf_do_asconf_ack(struct net *net,
3811 return SCTP_DISPOSITION_DISCARD; 3834 return SCTP_DISPOSITION_DISCARD;
3812} 3835}
3813 3836
3837/* RE-CONFIG Section 5.2 Upon reception of an RECONF Chunk. */
3838sctp_disposition_t sctp_sf_do_reconf(struct net *net,
3839 const struct sctp_endpoint *ep,
3840 const struct sctp_association *asoc,
3841 const sctp_subtype_t type, void *arg,
3842 sctp_cmd_seq_t *commands)
3843{
3844 struct sctp_paramhdr *err_param = NULL;
3845 struct sctp_chunk *chunk = arg;
3846 struct sctp_reconf_chunk *hdr;
3847 union sctp_params param;
3848
3849 if (!sctp_vtag_verify(chunk, asoc)) {
3850 sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
3851 SCTP_NULL());
3852 return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
3853 }
3854
3855 /* Make sure that the RECONF chunk has a valid length. */
3856 if (!sctp_chunk_length_valid(chunk, sizeof(*hdr)))
3857 return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
3858 commands);
3859
3860 if (!sctp_verify_reconf(asoc, chunk, &err_param))
3861 return sctp_sf_violation_paramlen(net, ep, asoc, type, arg,
3862 (void *)err_param, commands);
3863
3864 hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
3865 sctp_walk_params(param, hdr, params) {
3866 struct sctp_chunk *reply = NULL;
3867 struct sctp_ulpevent *ev = NULL;
3868
3869 if (param.p->type == SCTP_PARAM_RESET_OUT_REQUEST)
3870 reply = sctp_process_strreset_outreq(
3871 (struct sctp_association *)asoc, param, &ev);
3872 else if (param.p->type == SCTP_PARAM_RESET_IN_REQUEST)
3873 reply = sctp_process_strreset_inreq(
3874 (struct sctp_association *)asoc, param, &ev);
 3875 /* More handlers for other types will be added here; for now it
3876 * just ignores other types.
3877 */
3878
3879 if (ev)
3880 sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
3881 SCTP_ULPEVENT(ev));
3882
3883 if (reply)
3884 sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
3885 SCTP_CHUNK(reply));
3886 }
3887
3888 return SCTP_DISPOSITION_CONSUME;
3889}
3890
3814/* 3891/*
3815 * PR-SCTP Section 3.6 Receiver Side Implementation of PR-SCTP 3892 * PR-SCTP Section 3.6 Receiver Side Implementation of PR-SCTP
3816 * 3893 *
@@ -3844,6 +3921,9 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(struct net *net,
3844 return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); 3921 return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
3845 } 3922 }
3846 3923
3924 if (!asoc->peer.prsctp_capable)
3925 return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands);
3926
3847 /* Make sure that the FORWARD_TSN chunk has valid length. */ 3927 /* Make sure that the FORWARD_TSN chunk has valid length. */
3848 if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk))) 3928 if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk)))
3849 return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, 3929 return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
@@ -3866,7 +3946,7 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(struct net *net,
3866 3946
3867 /* Silently discard the chunk if stream-id is not valid */ 3947 /* Silently discard the chunk if stream-id is not valid */
3868 sctp_walk_fwdtsn(skip, chunk) { 3948 sctp_walk_fwdtsn(skip, chunk) {
3869 if (ntohs(skip->stream) >= asoc->c.sinit_max_instreams) 3949 if (ntohs(skip->stream) >= asoc->stream->incnt)
3870 goto discard_noforce; 3950 goto discard_noforce;
3871 } 3951 }
3872 3952
@@ -3912,6 +3992,9 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn_fast(
3912 return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); 3992 return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
3913 } 3993 }
3914 3994
3995 if (!asoc->peer.prsctp_capable)
3996 return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands);
3997
3915 /* Make sure that the FORWARD_TSN chunk has a valid length. */ 3998 /* Make sure that the FORWARD_TSN chunk has a valid length. */
3916 if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk))) 3999 if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk)))
3917 return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, 4000 return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
@@ -3934,7 +4017,7 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn_fast(
3934 4017
3935 /* Silently discard the chunk if stream-id is not valid */ 4018 /* Silently discard the chunk if stream-id is not valid */
3936 sctp_walk_fwdtsn(skip, chunk) { 4019 sctp_walk_fwdtsn(skip, chunk) {
3937 if (ntohs(skip->stream) >= asoc->c.sinit_max_instreams) 4020 if (ntohs(skip->stream) >= asoc->stream->incnt)
3938 goto gen_shutdown; 4021 goto gen_shutdown;
3939 } 4022 }
3940 4023
@@ -5162,6 +5245,19 @@ sctp_disposition_t sctp_sf_do_prm_asconf(struct net *net,
5162 return SCTP_DISPOSITION_CONSUME; 5245 return SCTP_DISPOSITION_CONSUME;
5163} 5246}
5164 5247
5248/* RE-CONFIG Section 5.1 RECONF Chunk Procedures */
5249sctp_disposition_t sctp_sf_do_prm_reconf(struct net *net,
5250 const struct sctp_endpoint *ep,
5251 const struct sctp_association *asoc,
5252 const sctp_subtype_t type,
5253 void *arg, sctp_cmd_seq_t *commands)
5254{
5255 struct sctp_chunk *chunk = arg;
5256
5257 sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(chunk));
5258 return SCTP_DISPOSITION_CONSUME;
5259}
5260
5165/* 5261/*
5166 * Ignore the primitive event 5262 * Ignore the primitive event
5167 * 5263 *
@@ -6036,8 +6132,9 @@ static struct sctp_packet *sctp_ootb_pkt_new(struct net *net,
6036 sctp_transport_route(transport, (union sctp_addr *)&chunk->dest, 6132 sctp_transport_route(transport, (union sctp_addr *)&chunk->dest,
6037 sctp_sk(net->sctp.ctl_sock)); 6133 sctp_sk(net->sctp.ctl_sock));
6038 6134
6039 packet = sctp_packet_init(&transport->packet, transport, sport, dport); 6135 packet = &transport->packet;
6040 packet = sctp_packet_config(packet, vtag, 0); 6136 sctp_packet_init(packet, transport, sport, dport);
6137 sctp_packet_config(packet, vtag, 0);
6041 6138
6042 return packet; 6139 return packet;
6043 6140
@@ -6256,7 +6353,7 @@ static int sctp_eat_data(const struct sctp_association *asoc,
6256 * and discard the DATA chunk. 6353 * and discard the DATA chunk.
6257 */ 6354 */
6258 sid = ntohs(data_hdr->stream); 6355 sid = ntohs(data_hdr->stream);
6259 if (sid >= asoc->c.sinit_max_instreams) { 6356 if (sid >= asoc->stream->incnt) {
6260 /* Mark tsn as received even though we drop it */ 6357 /* Mark tsn as received even though we drop it */
6261 sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_TSN, SCTP_U32(tsn)); 6358 sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_TSN, SCTP_U32(tsn));
6262 6359
@@ -6278,9 +6375,8 @@ static int sctp_eat_data(const struct sctp_association *asoc,
6278 * and is invalid. 6375 * and is invalid.
6279 */ 6376 */
6280 ssn = ntohs(data_hdr->ssn); 6377 ssn = ntohs(data_hdr->ssn);
6281 if (ordered && SSN_lt(ssn, sctp_ssn_peek(&asoc->ssnmap->in, sid))) { 6378 if (ordered && SSN_lt(ssn, sctp_ssn_peek(asoc->stream, in, sid)))
6282 return SCTP_IERROR_PROTO_VIOLATION; 6379 return SCTP_IERROR_PROTO_VIOLATION;
6283 }
6284 6380
6285 /* Send the data up to the user. Note: Schedule the 6381 /* Send the data up to the user. Note: Schedule the
6286 * SCTP_CMD_CHUNK_ULP cmd before the SCTP_CMD_GEN_SACK, as the SACK 6382 * SCTP_CMD_CHUNK_ULP cmd before the SCTP_CMD_GEN_SACK, as the SACK
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index a987d54b379c..419b18ebb056 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -482,6 +482,32 @@ static const sctp_sm_table_entry_t prsctp_chunk_event_table[SCTP_NUM_PRSCTP_CHUN
482 TYPE_SCTP_FWD_TSN, 482 TYPE_SCTP_FWD_TSN,
483}; /*state_fn_t prsctp_chunk_event_table[][] */ 483}; /*state_fn_t prsctp_chunk_event_table[][] */
484 484
485#define TYPE_SCTP_RECONF { \
486 /* SCTP_STATE_CLOSED */ \
487 TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
488 /* SCTP_STATE_COOKIE_WAIT */ \
489 TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
490 /* SCTP_STATE_COOKIE_ECHOED */ \
491 TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
492 /* SCTP_STATE_ESTABLISHED */ \
493 TYPE_SCTP_FUNC(sctp_sf_do_reconf), \
494 /* SCTP_STATE_SHUTDOWN_PENDING */ \
495 TYPE_SCTP_FUNC(sctp_sf_do_reconf), \
496 /* SCTP_STATE_SHUTDOWN_SENT */ \
497 TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
498 /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
499 TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
500 /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
501 TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
502} /* TYPE_SCTP_RECONF */
503
504/* The primary index for this table is the chunk type.
505 * The secondary index for this table is the state.
506 */
507static const sctp_sm_table_entry_t reconf_chunk_event_table[SCTP_NUM_RECONF_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = {
508 TYPE_SCTP_RECONF,
509}; /*state_fn_t reconf_chunk_event_table[][] */
510
485#define TYPE_SCTP_AUTH { \ 511#define TYPE_SCTP_AUTH { \
486 /* SCTP_STATE_CLOSED */ \ 512 /* SCTP_STATE_CLOSED */ \
487 TYPE_SCTP_FUNC(sctp_sf_ootb), \ 513 TYPE_SCTP_FUNC(sctp_sf_ootb), \
@@ -643,6 +669,25 @@ chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = {
643 TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ 669 TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
644} /* TYPE_SCTP_PRIMITIVE_ASCONF */ 670} /* TYPE_SCTP_PRIMITIVE_ASCONF */
645 671
672#define TYPE_SCTP_PRIMITIVE_RECONF { \
673 /* SCTP_STATE_CLOSED */ \
674 TYPE_SCTP_FUNC(sctp_sf_error_closed), \
675 /* SCTP_STATE_COOKIE_WAIT */ \
676 TYPE_SCTP_FUNC(sctp_sf_error_closed), \
677 /* SCTP_STATE_COOKIE_ECHOED */ \
678 TYPE_SCTP_FUNC(sctp_sf_error_closed), \
679 /* SCTP_STATE_ESTABLISHED */ \
680 TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
681 /* SCTP_STATE_SHUTDOWN_PENDING */ \
682 TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
683 /* SCTP_STATE_SHUTDOWN_SENT */ \
684 TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
685 /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
686 TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
687 /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
688 TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
689} /* TYPE_SCTP_PRIMITIVE_RECONF */
690
646/* The primary index for this table is the primitive type. 691/* The primary index for this table is the primitive type.
647 * The secondary index for this table is the state. 692 * The secondary index for this table is the state.
648 */ 693 */
@@ -653,6 +698,7 @@ static const sctp_sm_table_entry_t primitive_event_table[SCTP_NUM_PRIMITIVE_TYPE
653 TYPE_SCTP_PRIMITIVE_SEND, 698 TYPE_SCTP_PRIMITIVE_SEND,
654 TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT, 699 TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT,
655 TYPE_SCTP_PRIMITIVE_ASCONF, 700 TYPE_SCTP_PRIMITIVE_ASCONF,
701 TYPE_SCTP_PRIMITIVE_RECONF,
656}; 702};
657 703
658#define TYPE_SCTP_OTHER_NO_PENDING_TSN { \ 704#define TYPE_SCTP_OTHER_NO_PENDING_TSN { \
@@ -888,6 +934,25 @@ static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_
888 TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ 934 TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
889} 935}
890 936
937#define TYPE_SCTP_EVENT_TIMEOUT_RECONF { \
938 /* SCTP_STATE_CLOSED */ \
939 TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
940 /* SCTP_STATE_COOKIE_WAIT */ \
941 TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
942 /* SCTP_STATE_COOKIE_ECHOED */ \
943 TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
944 /* SCTP_STATE_ESTABLISHED */ \
945 TYPE_SCTP_FUNC(sctp_sf_send_reconf), \
946 /* SCTP_STATE_SHUTDOWN_PENDING */ \
947 TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
948 /* SCTP_STATE_SHUTDOWN_SENT */ \
949 TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
950 /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
951 TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
952 /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
953 TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
954}
955
891static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = { 956static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = {
892 TYPE_SCTP_EVENT_TIMEOUT_NONE, 957 TYPE_SCTP_EVENT_TIMEOUT_NONE,
893 TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE, 958 TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE,
@@ -897,6 +962,7 @@ static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][S
897 TYPE_SCTP_EVENT_TIMEOUT_T4_RTO, 962 TYPE_SCTP_EVENT_TIMEOUT_T4_RTO,
898 TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD, 963 TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD,
899 TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT, 964 TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT,
965 TYPE_SCTP_EVENT_TIMEOUT_RECONF,
900 TYPE_SCTP_EVENT_TIMEOUT_SACK, 966 TYPE_SCTP_EVENT_TIMEOUT_SACK,
901 TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE, 967 TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE,
902}; 968};
@@ -924,6 +990,10 @@ static const sctp_sm_table_entry_t *sctp_chunk_event_lookup(struct net *net,
924 return &addip_chunk_event_table[1][state]; 990 return &addip_chunk_event_table[1][state];
925 } 991 }
926 992
993 if (net->sctp.reconf_enable)
994 if (cid == SCTP_CID_RECONF)
995 return &reconf_chunk_event_table[0][state];
996
927 if (net->sctp.auth_enable) { 997 if (net->sctp.auth_enable) {
928 if (cid == SCTP_CID_AUTH) 998 if (cid == SCTP_CID_AUTH)
929 return &auth_chunk_event_table[0][state]; 999 return &auth_chunk_event_table[0][state];
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f23ad913dc7a..d9d4c92e06b3 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -57,6 +57,7 @@
57#include <linux/kernel.h> 57#include <linux/kernel.h>
58#include <linux/wait.h> 58#include <linux/wait.h>
59#include <linux/time.h> 59#include <linux/time.h>
60#include <linux/sched/signal.h>
60#include <linux/ip.h> 61#include <linux/ip.h>
61#include <linux/capability.h> 62#include <linux/capability.h>
62#include <linux/fcntl.h> 63#include <linux/fcntl.h>
@@ -235,8 +236,12 @@ static struct sctp_transport *sctp_addr_id2transport(struct sock *sk,
235 sctp_assoc_t id) 236 sctp_assoc_t id)
236{ 237{
237 struct sctp_association *addr_asoc = NULL, *id_asoc = NULL; 238 struct sctp_association *addr_asoc = NULL, *id_asoc = NULL;
238 struct sctp_transport *transport; 239 struct sctp_af *af = sctp_get_af_specific(addr->ss_family);
239 union sctp_addr *laddr = (union sctp_addr *)addr; 240 union sctp_addr *laddr = (union sctp_addr *)addr;
241 struct sctp_transport *transport;
242
243 if (!af || sctp_verify_addr(sk, laddr, af->sockaddr_len))
244 return NULL;
240 245
241 addr_asoc = sctp_endpoint_lookup_assoc(sctp_sk(sk)->ep, 246 addr_asoc = sctp_endpoint_lookup_assoc(sctp_sk(sk)->ep,
242 laddr, 247 laddr,
@@ -360,7 +365,7 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
360 } 365 }
361 } 366 }
362 367
363 if (snum && snum < PROT_SOCK && 368 if (snum && snum < inet_prot_sock(net) &&
364 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) 369 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
365 return -EACCES; 370 return -EACCES;
366 371
@@ -588,7 +593,7 @@ static int sctp_send_asconf_add_ip(struct sock *sk,
588 list_for_each_entry(trans, 593 list_for_each_entry(trans,
589 &asoc->peer.transport_addr_list, transports) { 594 &asoc->peer.transport_addr_list, transports) {
590 /* Clear the source and route cache */ 595 /* Clear the source and route cache */
591 dst_release(trans->dst); 596 sctp_transport_dst_release(trans);
592 trans->cwnd = min(4*asoc->pathmtu, max_t(__u32, 597 trans->cwnd = min(4*asoc->pathmtu, max_t(__u32,
593 2*asoc->pathmtu, 4380)); 598 2*asoc->pathmtu, 4380));
594 trans->ssthresh = asoc->peer.i.a_rwnd; 599 trans->ssthresh = asoc->peer.i.a_rwnd;
@@ -839,7 +844,7 @@ skip_mkasconf:
839 */ 844 */
840 list_for_each_entry(transport, &asoc->peer.transport_addr_list, 845 list_for_each_entry(transport, &asoc->peer.transport_addr_list,
841 transports) { 846 transports) {
842 dst_release(transport->dst); 847 sctp_transport_dst_release(transport);
843 sctp_transport_route(transport, NULL, 848 sctp_transport_route(transport, NULL,
844 sctp_sk(asoc->base.sk)); 849 sctp_sk(asoc->base.sk));
845 } 850 }
@@ -1152,8 +1157,10 @@ static int __sctp_connect(struct sock *sk,
1152 * accept new associations, but it SHOULD NOT 1157 * accept new associations, but it SHOULD NOT
1153 * be permitted to open new associations. 1158 * be permitted to open new associations.
1154 */ 1159 */
1155 if (ep->base.bind_addr.port < PROT_SOCK && 1160 if (ep->base.bind_addr.port <
1156 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) { 1161 inet_prot_sock(net) &&
1162 !ns_capable(net->user_ns,
1163 CAP_NET_BIND_SERVICE)) {
1157 err = -EACCES; 1164 err = -EACCES;
1158 goto out_free; 1165 goto out_free;
1159 } 1166 }
@@ -1818,7 +1825,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
1818 * but it SHOULD NOT be permitted to open new 1825 * but it SHOULD NOT be permitted to open new
1819 * associations. 1826 * associations.
1820 */ 1827 */
1821 if (ep->base.bind_addr.port < PROT_SOCK && 1828 if (ep->base.bind_addr.port < inet_prot_sock(net) &&
1822 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) { 1829 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) {
1823 err = -EACCES; 1830 err = -EACCES;
1824 goto out_unlock; 1831 goto out_unlock;
@@ -1900,7 +1907,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
1900 } 1907 }
1901 1908
1902 if (asoc->pmtu_pending) 1909 if (asoc->pmtu_pending)
1903 sctp_assoc_pending_pmtu(sk, asoc); 1910 sctp_assoc_pending_pmtu(asoc);
1904 1911
1905 /* If fragmentation is disabled and the message length exceeds the 1912 /* If fragmentation is disabled and the message length exceeds the
1906 * association fragmentation point, return EMSGSIZE. The I-D 1913 * association fragmentation point, return EMSGSIZE. The I-D
@@ -1913,7 +1920,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
1913 } 1920 }
1914 1921
1915 /* Check for invalid stream. */ 1922 /* Check for invalid stream. */
1916 if (sinfo->sinfo_stream >= asoc->c.sinit_num_ostreams) { 1923 if (sinfo->sinfo_stream >= asoc->stream->outcnt) {
1917 err = -EINVAL; 1924 err = -EINVAL;
1918 goto out_free; 1925 goto out_free;
1919 } 1926 }
@@ -1958,6 +1965,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
1958 err = PTR_ERR(datamsg); 1965 err = PTR_ERR(datamsg);
1959 goto out_free; 1966 goto out_free;
1960 } 1967 }
1968 asoc->force_delay = !!(msg->msg_flags & MSG_MORE);
1961 1969
1962 /* Now send the (possibly) fragmented message. */ 1970 /* Now send the (possibly) fragmented message. */
1963 list_for_each_entry(chunk, &datamsg->chunks, frag_list) { 1971 list_for_each_entry(chunk, &datamsg->chunks, frag_list) {
@@ -2427,10 +2435,9 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
2427 if ((params->spp_flags & SPP_PMTUD_DISABLE) && params->spp_pathmtu) { 2435 if ((params->spp_flags & SPP_PMTUD_DISABLE) && params->spp_pathmtu) {
2428 if (trans) { 2436 if (trans) {
2429 trans->pathmtu = params->spp_pathmtu; 2437 trans->pathmtu = params->spp_pathmtu;
2430 sctp_assoc_sync_pmtu(sctp_opt2sk(sp), asoc); 2438 sctp_assoc_sync_pmtu(asoc);
2431 } else if (asoc) { 2439 } else if (asoc) {
2432 asoc->pathmtu = params->spp_pathmtu; 2440 asoc->pathmtu = params->spp_pathmtu;
2433 sctp_frag_point(asoc, params->spp_pathmtu);
2434 } else { 2441 } else {
2435 sp->pathmtu = params->spp_pathmtu; 2442 sp->pathmtu = params->spp_pathmtu;
2436 } 2443 }
@@ -2444,7 +2451,7 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
2444 (trans->param_flags & ~SPP_PMTUD) | pmtud_change; 2451 (trans->param_flags & ~SPP_PMTUD) | pmtud_change;
2445 if (update) { 2452 if (update) {
2446 sctp_transport_pmtu(trans, sctp_opt2sk(sp)); 2453 sctp_transport_pmtu(trans, sctp_opt2sk(sp));
2447 sctp_assoc_sync_pmtu(sctp_opt2sk(sp), asoc); 2454 sctp_assoc_sync_pmtu(asoc);
2448 } 2455 }
2449 } else if (asoc) { 2456 } else if (asoc) {
2450 asoc->param_flags = 2457 asoc->param_flags =
@@ -3751,6 +3758,120 @@ out:
3751 return retval; 3758 return retval;
3752} 3759}
3753 3760
3761static int sctp_setsockopt_enable_strreset(struct sock *sk,
3762 char __user *optval,
3763 unsigned int optlen)
3764{
3765 struct sctp_assoc_value params;
3766 struct sctp_association *asoc;
3767 int retval = -EINVAL;
3768
3769 if (optlen != sizeof(params))
3770 goto out;
3771
3772 if (copy_from_user(&params, optval, optlen)) {
3773 retval = -EFAULT;
3774 goto out;
3775 }
3776
3777 if (params.assoc_value & (~SCTP_ENABLE_STRRESET_MASK))
3778 goto out;
3779
3780 asoc = sctp_id2assoc(sk, params.assoc_id);
3781 if (asoc) {
3782 asoc->strreset_enable = params.assoc_value;
3783 } else if (!params.assoc_id) {
3784 struct sctp_sock *sp = sctp_sk(sk);
3785
3786 sp->ep->strreset_enable = params.assoc_value;
3787 } else {
3788 goto out;
3789 }
3790
3791 retval = 0;
3792
3793out:
3794 return retval;
3795}
3796
3797static int sctp_setsockopt_reset_streams(struct sock *sk,
3798 char __user *optval,
3799 unsigned int optlen)
3800{
3801 struct sctp_reset_streams *params;
3802 struct sctp_association *asoc;
3803 int retval = -EINVAL;
3804
3805 if (optlen < sizeof(struct sctp_reset_streams))
3806 return -EINVAL;
3807
3808 params = memdup_user(optval, optlen);
3809 if (IS_ERR(params))
3810 return PTR_ERR(params);
3811
3812 asoc = sctp_id2assoc(sk, params->srs_assoc_id);
3813 if (!asoc)
3814 goto out;
3815
3816 retval = sctp_send_reset_streams(asoc, params);
3817
3818out:
3819 kfree(params);
3820 return retval;
3821}
3822
3823static int sctp_setsockopt_reset_assoc(struct sock *sk,
3824 char __user *optval,
3825 unsigned int optlen)
3826{
3827 struct sctp_association *asoc;
3828 sctp_assoc_t associd;
3829 int retval = -EINVAL;
3830
3831 if (optlen != sizeof(associd))
3832 goto out;
3833
3834 if (copy_from_user(&associd, optval, optlen)) {
3835 retval = -EFAULT;
3836 goto out;
3837 }
3838
3839 asoc = sctp_id2assoc(sk, associd);
3840 if (!asoc)
3841 goto out;
3842
3843 retval = sctp_send_reset_assoc(asoc);
3844
3845out:
3846 return retval;
3847}
3848
3849static int sctp_setsockopt_add_streams(struct sock *sk,
3850 char __user *optval,
3851 unsigned int optlen)
3852{
3853 struct sctp_association *asoc;
3854 struct sctp_add_streams params;
3855 int retval = -EINVAL;
3856
3857 if (optlen != sizeof(params))
3858 goto out;
3859
3860 if (copy_from_user(&params, optval, optlen)) {
3861 retval = -EFAULT;
3862 goto out;
3863 }
3864
3865 asoc = sctp_id2assoc(sk, params.sas_assoc_id);
3866 if (!asoc)
3867 goto out;
3868
3869 retval = sctp_send_add_streams(asoc, &params);
3870
3871out:
3872 return retval;
3873}
3874
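[Editor's aside] Taken together, the four helpers above back a small user-space API. The sketch below shows how an application might drive it; SCTP_ENABLE_STREAM_RESET, SCTP_RESET_STREAMS, SCTP_ENABLE_RESET_STREAM_REQ, SCTP_STREAM_RESET_OUTGOING and struct sctp_assoc_value all appear in this diff, while the srs_number_streams / srs_stream_list fields of struct sctp_reset_streams are assumed from the matching UAPI header change, so treat this as illustrative rather than authoritative.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp.h>   /* assumes headers carrying the new definitions */

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_SCTP);
	struct sctp_assoc_value enable = {
		.assoc_id = 0,
		.assoc_value = SCTP_ENABLE_RESET_STREAM_REQ,
	};
	struct sctp_reset_streams *srs;
	socklen_t len = sizeof(*srs) + 2 * sizeof(uint16_t);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* Opt in to sending/accepting stream reset requests on this socket. */
	if (setsockopt(fd, IPPROTO_SCTP, SCTP_ENABLE_STREAM_RESET,
		       &enable, sizeof(enable)) < 0)
		perror("SCTP_ENABLE_STREAM_RESET");

	/* ... connect and establish the association, then ask the kernel to
	 * reset two outgoing streams; this sends a RECONF to the peer.
	 */
	srs = calloc(1, len);
	srs->srs_assoc_id = 0;                        /* one-to-one style socket */
	srs->srs_flags = SCTP_STREAM_RESET_OUTGOING;
	srs->srs_number_streams = 2;                  /* assumed field name */
	srs->srs_stream_list[0] = 1;                  /* assumed field name */
	srs->srs_stream_list[1] = 3;
	if (setsockopt(fd, IPPROTO_SCTP, SCTP_RESET_STREAMS, srs, len) < 0)
		perror("SCTP_RESET_STREAMS");

	free(srs);
	close(fd);
	return 0;
}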
3754/* API 6.2 setsockopt(), getsockopt() 3875/* API 6.2 setsockopt(), getsockopt()
3755 * 3876 *
3756 * Applications use setsockopt() and getsockopt() to set or retrieve 3877 * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3917,6 +4038,18 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
3917 case SCTP_DEFAULT_PRINFO: 4038 case SCTP_DEFAULT_PRINFO:
3918 retval = sctp_setsockopt_default_prinfo(sk, optval, optlen); 4039 retval = sctp_setsockopt_default_prinfo(sk, optval, optlen);
3919 break; 4040 break;
4041 case SCTP_ENABLE_STREAM_RESET:
4042 retval = sctp_setsockopt_enable_strreset(sk, optval, optlen);
4043 break;
4044 case SCTP_RESET_STREAMS:
4045 retval = sctp_setsockopt_reset_streams(sk, optval, optlen);
4046 break;
4047 case SCTP_RESET_ASSOC:
4048 retval = sctp_setsockopt_reset_assoc(sk, optval, optlen);
4049 break;
4050 case SCTP_ADD_STREAMS:
4051 retval = sctp_setsockopt_add_streams(sk, optval, optlen);
4052 break;
3920 default: 4053 default:
3921 retval = -ENOPROTOOPT; 4054 retval = -ENOPROTOOPT;
3922 break; 4055 break;
@@ -3983,7 +4116,7 @@ static int sctp_disconnect(struct sock *sk, int flags)
3983 * descriptor will be returned from accept() to represent the newly 4116 * descriptor will be returned from accept() to represent the newly
3984 * formed association. 4117 * formed association.
3985 */ 4118 */
3986static struct sock *sctp_accept(struct sock *sk, int flags, int *err) 4119static struct sock *sctp_accept(struct sock *sk, int flags, int *err, bool kern)
3987{ 4120{
3988 struct sctp_sock *sp; 4121 struct sctp_sock *sp;
3989 struct sctp_endpoint *ep; 4122 struct sctp_endpoint *ep;
@@ -4018,7 +4151,7 @@ static struct sock *sctp_accept(struct sock *sk, int flags, int *err)
4018 */ 4151 */
4019 asoc = list_entry(ep->asocs.next, struct sctp_association, asocs); 4152 asoc = list_entry(ep->asocs.next, struct sctp_association, asocs);
4020 4153
4021 newsk = sp->pf->create_accept_sk(sk, asoc); 4154 newsk = sp->pf->create_accept_sk(sk, asoc, kern);
4022 if (!newsk) { 4155 if (!newsk) {
4023 error = -ENOMEM; 4156 error = -ENOMEM;
4024 goto out; 4157 goto out;
@@ -4328,8 +4461,8 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
4328 info->sctpi_rwnd = asoc->a_rwnd; 4461 info->sctpi_rwnd = asoc->a_rwnd;
4329 info->sctpi_unackdata = asoc->unack_data; 4462 info->sctpi_unackdata = asoc->unack_data;
4330 info->sctpi_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map); 4463 info->sctpi_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map);
4331 info->sctpi_instrms = asoc->c.sinit_max_instreams; 4464 info->sctpi_instrms = asoc->stream->incnt;
4332 info->sctpi_outstrms = asoc->c.sinit_num_ostreams; 4465 info->sctpi_outstrms = asoc->stream->outcnt;
4333 list_for_each(pos, &asoc->base.inqueue.in_chunk_list) 4466 list_for_each(pos, &asoc->base.inqueue.in_chunk_list)
4334 info->sctpi_inqueue++; 4467 info->sctpi_inqueue++;
4335 list_for_each(pos, &asoc->outqueue.out_chunk_list) 4468 list_for_each(pos, &asoc->outqueue.out_chunk_list)
@@ -4392,10 +4525,7 @@ int sctp_transport_walk_start(struct rhashtable_iter *iter)
4392{ 4525{
4393 int err; 4526 int err;
4394 4527
4395 err = rhashtable_walk_init(&sctp_transport_hashtable, iter, 4528 rhltable_walk_enter(&sctp_transport_hashtable, iter);
4396 GFP_KERNEL);
4397 if (err)
4398 return err;
4399 4529
4400 err = rhashtable_walk_start(iter); 4530 err = rhashtable_walk_start(iter);
4401 if (err && err != -EAGAIN) { 4531 if (err && err != -EAGAIN) {
@@ -4475,18 +4605,17 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *),
4475 const union sctp_addr *paddr, void *p) 4605 const union sctp_addr *paddr, void *p)
4476{ 4606{
4477 struct sctp_transport *transport; 4607 struct sctp_transport *transport;
4478 int err = -ENOENT; 4608 int err;
4479 4609
4480 rcu_read_lock(); 4610 rcu_read_lock();
4481 transport = sctp_addrs_lookup_transport(net, laddr, paddr); 4611 transport = sctp_addrs_lookup_transport(net, laddr, paddr);
4482 if (!transport || !sctp_transport_hold(transport))
4483 goto out;
4484
4485 rcu_read_unlock(); 4612 rcu_read_unlock();
4613 if (!transport)
4614 return -ENOENT;
4615
4486 err = cb(transport, p); 4616 err = cb(transport, p);
4487 sctp_transport_put(transport); 4617 sctp_transport_put(transport);
4488 4618
4489out:
4490 return err; 4619 return err;
4491} 4620}
4492EXPORT_SYMBOL_GPL(sctp_transport_lookup_process); 4621EXPORT_SYMBOL_GPL(sctp_transport_lookup_process);
@@ -4562,8 +4691,8 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len,
4562 status.sstat_unackdata = asoc->unack_data; 4691 status.sstat_unackdata = asoc->unack_data;
4563 4692
4564 status.sstat_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map); 4693 status.sstat_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map);
4565 status.sstat_instrms = asoc->c.sinit_max_instreams; 4694 status.sstat_instrms = asoc->stream->incnt;
4566 status.sstat_outstrms = asoc->c.sinit_num_ostreams; 4695 status.sstat_outstrms = asoc->stream->outcnt;
4567 status.sstat_fragmentation_point = asoc->frag_point; 4696 status.sstat_fragmentation_point = asoc->frag_point;
4568 status.sstat_primary.spinfo_assoc_id = sctp_assoc2id(transport->asoc); 4697 status.sstat_primary.spinfo_assoc_id = sctp_assoc2id(transport->asoc);
4569 memcpy(&status.sstat_primary.spinfo_address, &transport->ipaddr, 4698 memcpy(&status.sstat_primary.spinfo_address, &transport->ipaddr,
@@ -4734,6 +4863,12 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp)
4734 if (!asoc) 4863 if (!asoc)
4735 return -EINVAL; 4864 return -EINVAL;
4736 4865
4866 /* If there is a thread waiting on more sndbuf space for
 4867 * sending on this asoc, it cannot be peeled off.
4868 */
4869 if (waitqueue_active(&asoc->wait))
4870 return -EBUSY;
4871
4737 /* An association cannot be branched off from an already peeled-off 4872 /* An association cannot be branched off from an already peeled-off
4738 * socket, nor is this supported for tcp style sockets. 4873 * socket, nor is this supported for tcp style sockets.
4739 */ 4874 */
@@ -6405,6 +6540,47 @@ out:
6405 return retval; 6540 return retval;
6406} 6541}
6407 6542
6543static int sctp_getsockopt_enable_strreset(struct sock *sk, int len,
6544 char __user *optval,
6545 int __user *optlen)
6546{
6547 struct sctp_assoc_value params;
6548 struct sctp_association *asoc;
6549 int retval = -EFAULT;
6550
6551 if (len < sizeof(params)) {
6552 retval = -EINVAL;
6553 goto out;
6554 }
6555
6556 len = sizeof(params);
6557 if (copy_from_user(&params, optval, len))
6558 goto out;
6559
6560 asoc = sctp_id2assoc(sk, params.assoc_id);
6561 if (asoc) {
6562 params.assoc_value = asoc->strreset_enable;
6563 } else if (!params.assoc_id) {
6564 struct sctp_sock *sp = sctp_sk(sk);
6565
6566 params.assoc_value = sp->ep->strreset_enable;
6567 } else {
6568 retval = -EINVAL;
6569 goto out;
6570 }
6571
6572 if (put_user(len, optlen))
6573 goto out;
6574
6575 if (copy_to_user(optval, &params, len))
6576 goto out;
6577
6578 retval = 0;
6579
6580out:
6581 return retval;
6582}
6583
6408static int sctp_getsockopt(struct sock *sk, int level, int optname, 6584static int sctp_getsockopt(struct sock *sk, int level, int optname,
6409 char __user *optval, int __user *optlen) 6585 char __user *optval, int __user *optlen)
6410{ 6586{
@@ -6572,6 +6748,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
6572 retval = sctp_getsockopt_pr_assocstatus(sk, len, optval, 6748 retval = sctp_getsockopt_pr_assocstatus(sk, len, optval,
6573 optlen); 6749 optlen);
6574 break; 6750 break;
6751 case SCTP_ENABLE_STREAM_RESET:
6752 retval = sctp_getsockopt_enable_strreset(sk, len, optval,
6753 optlen);
6754 break;
6575 default: 6755 default:
6576 retval = -ENOPROTOOPT; 6756 retval = -ENOPROTOOPT;
6577 break; 6757 break;
@@ -6854,6 +7034,9 @@ int sctp_inet_listen(struct socket *sock, int backlog)
6854 if (sock->state != SS_UNCONNECTED) 7034 if (sock->state != SS_UNCONNECTED)
6855 goto out; 7035 goto out;
6856 7036
7037 if (!sctp_sstate(sk, LISTENING) && !sctp_sstate(sk, CLOSED))
7038 goto out;
7039
6857 /* If backlog is zero, disable listening. */ 7040 /* If backlog is zero, disable listening. */
6858 if (!backlog) { 7041 if (!backlog) {
6859 if (sctp_sstate(sk, CLOSED)) 7042 if (sctp_sstate(sk, CLOSED))
@@ -7426,7 +7609,6 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
7426 */ 7609 */
7427 release_sock(sk); 7610 release_sock(sk);
7428 current_timeo = schedule_timeout(current_timeo); 7611 current_timeo = schedule_timeout(current_timeo);
7429 BUG_ON(sk != asoc->base.sk);
7430 lock_sock(sk); 7612 lock_sock(sk);
7431 7613
7432 *timeo_p = current_timeo; 7614 *timeo_p = current_timeo;
diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c
deleted file mode 100644
index b9c8521c1a98..000000000000
--- a/net/sctp/ssnmap.c
+++ /dev/null
@@ -1,125 +0,0 @@
1/* SCTP kernel implementation
2 * Copyright (c) 2003 International Business Machines, Corp.
3 *
4 * This file is part of the SCTP kernel implementation
5 *
6 * These functions manipulate sctp SSN tracker.
7 *
8 * This SCTP implementation is free software;
9 * you can redistribute it and/or modify it under the terms of
10 * the GNU General Public License as published by
11 * the Free Software Foundation; either version 2, or (at your option)
12 * any later version.
13 *
14 * This SCTP implementation is distributed in the hope that it
15 * will be useful, but WITHOUT ANY WARRANTY; without even the implied
16 * ************************
17 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18 * See the GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with GNU CC; see the file COPYING. If not, see
22 * <http://www.gnu.org/licenses/>.
23 *
24 * Please send any bug reports or fixes you make to the
25 * email address(es):
26 * lksctp developers <linux-sctp@vger.kernel.org>
27 *
28 * Written or modified by:
29 * Jon Grimm <jgrimm@us.ibm.com>
30 */
31
32#include <linux/types.h>
33#include <linux/slab.h>
34#include <net/sctp/sctp.h>
35#include <net/sctp/sm.h>
36
37static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in,
38 __u16 out);
39
40/* Storage size needed for map includes 2 headers and then the
41 * specific needs of in or out streams.
42 */
43static inline size_t sctp_ssnmap_size(__u16 in, __u16 out)
44{
45 return sizeof(struct sctp_ssnmap) + (in + out) * sizeof(__u16);
46}
47
48
49/* Create a new sctp_ssnmap.
50 * Allocate room to store at least 'len' contiguous TSNs.
51 */
52struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out,
53 gfp_t gfp)
54{
55 struct sctp_ssnmap *retval;
56 int size;
57
58 size = sctp_ssnmap_size(in, out);
59 if (size <= KMALLOC_MAX_SIZE)
60 retval = kmalloc(size, gfp);
61 else
62 retval = (struct sctp_ssnmap *)
63 __get_free_pages(gfp, get_order(size));
64 if (!retval)
65 goto fail;
66
67 if (!sctp_ssnmap_init(retval, in, out))
68 goto fail_map;
69
70 SCTP_DBG_OBJCNT_INC(ssnmap);
71
72 return retval;
73
74fail_map:
75 if (size <= KMALLOC_MAX_SIZE)
76 kfree(retval);
77 else
78 free_pages((unsigned long)retval, get_order(size));
79fail:
80 return NULL;
81}
82
83
84/* Initialize a block of memory as a ssnmap. */
85static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in,
86 __u16 out)
87{
88 memset(map, 0x00, sctp_ssnmap_size(in, out));
89
90 /* Start 'in' stream just after the map header. */
91 map->in.ssn = (__u16 *)&map[1];
92 map->in.len = in;
93
94 /* Start 'out' stream just after 'in'. */
95 map->out.ssn = &map->in.ssn[in];
96 map->out.len = out;
97
98 return map;
99}
100
101/* Clear out the ssnmap streams. */
102void sctp_ssnmap_clear(struct sctp_ssnmap *map)
103{
104 size_t size;
105
106 size = (map->in.len + map->out.len) * sizeof(__u16);
107 memset(map->in.ssn, 0x00, size);
108}
109
110/* Dispose of a ssnmap. */
111void sctp_ssnmap_free(struct sctp_ssnmap *map)
112{
113 int size;
114
115 if (unlikely(!map))
116 return;
117
118 size = sctp_ssnmap_size(map->in.len, map->out.len);
119 if (size <= KMALLOC_MAX_SIZE)
120 kfree(map);
121 else
122 free_pages((unsigned long)map, get_order(size));
123
124 SCTP_DBG_OBJCNT_DEC(ssnmap);
125}
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
new file mode 100644
index 000000000000..bbed997e1c5f
--- /dev/null
+++ b/net/sctp/stream.c
@@ -0,0 +1,506 @@
1/* SCTP kernel implementation
2 * (C) Copyright IBM Corp. 2001, 2004
3 * Copyright (c) 1999-2000 Cisco, Inc.
4 * Copyright (c) 1999-2001 Motorola, Inc.
5 * Copyright (c) 2001 Intel Corp.
6 *
7 * This file is part of the SCTP kernel implementation
8 *
 9 * These functions manipulate sctp stream structures.
10 *
11 * This SCTP implementation is free software;
12 * you can redistribute it and/or modify it under the terms of
13 * the GNU General Public License as published by
14 * the Free Software Foundation; either version 2, or (at your option)
15 * any later version.
16 *
17 * This SCTP implementation is distributed in the hope that it
18 * will be useful, but WITHOUT ANY WARRANTY; without even the implied
19 * ************************
20 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
21 * See the GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with GNU CC; see the file COPYING. If not, see
25 * <http://www.gnu.org/licenses/>.
26 *
27 * Please send any bug reports or fixes you make to the
28 * email address(es):
29 * lksctp developers <linux-sctp@vger.kernel.org>
30 *
31 * Written or modified by:
32 * Xin Long <lucien.xin@gmail.com>
33 */
34
35#include <net/sctp/sctp.h>
36#include <net/sctp/sm.h>
37
38int sctp_stream_new(struct sctp_association *asoc, gfp_t gfp)
39{
40 struct sctp_stream *stream;
41 int i;
42
43 stream = kzalloc(sizeof(*stream), gfp);
44 if (!stream)
45 return -ENOMEM;
46
47 stream->outcnt = asoc->c.sinit_num_ostreams;
48 stream->out = kcalloc(stream->outcnt, sizeof(*stream->out), gfp);
49 if (!stream->out) {
50 kfree(stream);
51 return -ENOMEM;
52 }
53 for (i = 0; i < stream->outcnt; i++)
54 stream->out[i].state = SCTP_STREAM_OPEN;
55
56 asoc->stream = stream;
57
58 return 0;
59}
60
61int sctp_stream_init(struct sctp_association *asoc, gfp_t gfp)
62{
63 struct sctp_stream *stream = asoc->stream;
64 int i;
65
66 /* Initial stream->out size may be very big, so free it and alloc
67 * a new one with new outcnt to save memory.
68 */
69 kfree(stream->out);
70 stream->outcnt = asoc->c.sinit_num_ostreams;
71 stream->out = kcalloc(stream->outcnt, sizeof(*stream->out), gfp);
72 if (!stream->out)
73 goto nomem;
74
75 for (i = 0; i < stream->outcnt; i++)
76 stream->out[i].state = SCTP_STREAM_OPEN;
77
78 stream->incnt = asoc->c.sinit_max_instreams;
79 stream->in = kcalloc(stream->incnt, sizeof(*stream->in), gfp);
80 if (!stream->in) {
81 kfree(stream->out);
82 goto nomem;
83 }
84
85 return 0;
86
87nomem:
88 asoc->stream = NULL;
89 kfree(stream);
90
91 return -ENOMEM;
92}
93
94void sctp_stream_free(struct sctp_stream *stream)
95{
96 if (unlikely(!stream))
97 return;
98
99 kfree(stream->out);
100 kfree(stream->in);
101 kfree(stream);
102}
103
104void sctp_stream_clear(struct sctp_stream *stream)
105{
106 int i;
107
108 for (i = 0; i < stream->outcnt; i++)
109 stream->out[i].ssn = 0;
110
111 for (i = 0; i < stream->incnt; i++)
112 stream->in[i].ssn = 0;
113}
114
115static int sctp_send_reconf(struct sctp_association *asoc,
116 struct sctp_chunk *chunk)
117{
118 struct net *net = sock_net(asoc->base.sk);
119 int retval = 0;
120
121 retval = sctp_primitive_RECONF(net, asoc, chunk);
122 if (retval)
123 sctp_chunk_free(chunk);
124
125 return retval;
126}
127
128int sctp_send_reset_streams(struct sctp_association *asoc,
129 struct sctp_reset_streams *params)
130{
131 struct sctp_stream *stream = asoc->stream;
132 __u16 i, str_nums, *str_list;
133 struct sctp_chunk *chunk;
134 int retval = -EINVAL;
135 bool out, in;
136
137 if (!asoc->peer.reconf_capable ||
138 !(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) {
139 retval = -ENOPROTOOPT;
140 goto out;
141 }
142
143 if (asoc->strreset_outstanding) {
144 retval = -EINPROGRESS;
145 goto out;
146 }
147
148 out = params->srs_flags & SCTP_STREAM_RESET_OUTGOING;
149 in = params->srs_flags & SCTP_STREAM_RESET_INCOMING;
150 if (!out && !in)
151 goto out;
152
153 str_nums = params->srs_number_streams;
154 str_list = params->srs_stream_list;
155 if (out && str_nums)
156 for (i = 0; i < str_nums; i++)
157 if (str_list[i] >= stream->outcnt)
158 goto out;
159
160 if (in && str_nums)
161 for (i = 0; i < str_nums; i++)
162 if (str_list[i] >= stream->incnt)
163 goto out;
164
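	/* The request chunk carries stream numbers in network byte order, so
	 * convert the caller's list for chunk construction and restore it to
	 * host order right afterwards.
	 */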
165 for (i = 0; i < str_nums; i++)
166 str_list[i] = htons(str_list[i]);
167
168 chunk = sctp_make_strreset_req(asoc, str_nums, str_list, out, in);
169
170 for (i = 0; i < str_nums; i++)
171 str_list[i] = ntohs(str_list[i]);
172
173 if (!chunk) {
174 retval = -ENOMEM;
175 goto out;
176 }
177
178 if (out) {
179 if (str_nums)
180 for (i = 0; i < str_nums; i++)
181 stream->out[str_list[i]].state =
182 SCTP_STREAM_CLOSED;
183 else
184 for (i = 0; i < stream->outcnt; i++)
185 stream->out[i].state = SCTP_STREAM_CLOSED;
186 }
187
188 asoc->strreset_chunk = chunk;
189 sctp_chunk_hold(asoc->strreset_chunk);
190
191 retval = sctp_send_reconf(asoc, chunk);
192 if (retval) {
193 sctp_chunk_put(asoc->strreset_chunk);
194 asoc->strreset_chunk = NULL;
195 if (!out)
196 goto out;
197
198 if (str_nums)
199 for (i = 0; i < str_nums; i++)
200 stream->out[str_list[i]].state =
201 SCTP_STREAM_OPEN;
202 else
203 for (i = 0; i < stream->outcnt; i++)
204 stream->out[i].state = SCTP_STREAM_OPEN;
205
206 goto out;
207 }
208
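	/* A single RECONF chunk may carry both an outgoing and an incoming
	 * SSN reset request; track one expected response per embedded request.
	 */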
209 asoc->strreset_outstanding = out + in;
210
211out:
212 return retval;
213}
214
215int sctp_send_reset_assoc(struct sctp_association *asoc)
216{
217 struct sctp_chunk *chunk = NULL;
218 int retval;
219 __u16 i;
220
221 if (!asoc->peer.reconf_capable ||
222 !(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ))
223 return -ENOPROTOOPT;
224
225 if (asoc->strreset_outstanding)
226 return -EINPROGRESS;
227
228 chunk = sctp_make_strreset_tsnreq(asoc);
229 if (!chunk)
230 return -ENOMEM;
231
232 /* Block further xmit of data until this request is completed */
233 for (i = 0; i < asoc->stream->outcnt; i++)
234 asoc->stream->out[i].state = SCTP_STREAM_CLOSED;
235
236 asoc->strreset_chunk = chunk;
237 sctp_chunk_hold(asoc->strreset_chunk);
238
239 retval = sctp_send_reconf(asoc, chunk);
240 if (retval) {
241 sctp_chunk_put(asoc->strreset_chunk);
242 asoc->strreset_chunk = NULL;
243
244 for (i = 0; i < asoc->stream->outcnt; i++)
245 asoc->stream->out[i].state = SCTP_STREAM_OPEN;
246
247 return retval;
248 }
249
250 asoc->strreset_outstanding = 1;
251
252 return 0;
253}
254
255int sctp_send_add_streams(struct sctp_association *asoc,
256 struct sctp_add_streams *params)
257{
258 struct sctp_stream *stream = asoc->stream;
259 struct sctp_chunk *chunk = NULL;
260 int retval = -ENOMEM;
261 __u32 outcnt, incnt;
262 __u16 out, in;
263
264 if (!asoc->peer.reconf_capable ||
265 !(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) {
266 retval = -ENOPROTOOPT;
267 goto out;
268 }
269
270 if (asoc->strreset_outstanding) {
271 retval = -EINPROGRESS;
272 goto out;
273 }
274
275 out = params->sas_outstrms;
276 in = params->sas_instrms;
277 outcnt = stream->outcnt + out;
278 incnt = stream->incnt + in;
279 if (outcnt > SCTP_MAX_STREAM || incnt > SCTP_MAX_STREAM ||
280 (!out && !in)) {
281 retval = -EINVAL;
282 goto out;
283 }
284
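	/* Grow the stream arrays in place: krealloc() preserves the existing
	 * entries, so only the newly added slots need to be zeroed below.
	 */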
285 if (out) {
286 struct sctp_stream_out *streamout;
287
288 streamout = krealloc(stream->out, outcnt * sizeof(*streamout),
289 GFP_KERNEL);
290 if (!streamout)
291 goto out;
292
293 memset(streamout + stream->outcnt, 0, out * sizeof(*streamout));
294 stream->out = streamout;
295 }
296
297 if (in) {
298 struct sctp_stream_in *streamin;
299
300 streamin = krealloc(stream->in, incnt * sizeof(*streamin),
301 GFP_KERNEL);
302 if (!streamin)
303 goto out;
304
305 memset(streamin + stream->incnt, 0, in * sizeof(*streamin));
306 stream->in = streamin;
307 }
308
309 chunk = sctp_make_strreset_addstrm(asoc, out, in);
310 if (!chunk)
311 goto out;
312
313 asoc->strreset_chunk = chunk;
314 sctp_chunk_hold(asoc->strreset_chunk);
315
316 retval = sctp_send_reconf(asoc, chunk);
317 if (retval) {
318 sctp_chunk_put(asoc->strreset_chunk);
319 asoc->strreset_chunk = NULL;
320 goto out;
321 }
322
323 stream->incnt = incnt;
324 stream->outcnt = outcnt;
325
326 asoc->strreset_outstanding = !!out + !!in;
327
328out:
329 return retval;
330}
331
332static sctp_paramhdr_t *sctp_chunk_lookup_strreset_param(
333 struct sctp_association *asoc, __u32 resp_seq)
334{
335 struct sctp_chunk *chunk = asoc->strreset_chunk;
336 struct sctp_reconf_chunk *hdr;
337 union sctp_params param;
338
339 if (ntohl(resp_seq) != asoc->strreset_outseq || !chunk)
340 return NULL;
341
342 hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
343 sctp_walk_params(param, hdr, params) {
344 /* sctp_strreset_tsnreq is actually the basic structure
345 * of all stream reconf params, so it's safe to use it
346 * to access request_seq.
347 */
348 struct sctp_strreset_tsnreq *req = param.v;
349
350 if (req->request_seq == resp_seq)
351 return param.v;
352 }
353
354 return NULL;
355}
356
357struct sctp_chunk *sctp_process_strreset_outreq(
358 struct sctp_association *asoc,
359 union sctp_params param,
360 struct sctp_ulpevent **evp)
361{
362 struct sctp_strreset_outreq *outreq = param.v;
363 struct sctp_stream *stream = asoc->stream;
364 __u16 i, nums, flags = 0, *str_p = NULL;
365 __u32 result = SCTP_STRRESET_DENIED;
366 __u32 request_seq;
367
368 request_seq = ntohl(outreq->request_seq);
369
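	/* send_reset_at_tsn is the sender's last assigned TSN; while our
	 * cumulative TSN ack point is still behind it, outstanding DATA has
	 * not arrived yet, so answer with "in progress".
	 */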
370 if (ntohl(outreq->send_reset_at_tsn) >
371 sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map)) {
372 result = SCTP_STRRESET_IN_PROGRESS;
373 goto out;
374 }
375
376 if (request_seq > asoc->strreset_inseq) {
377 result = SCTP_STRRESET_ERR_BAD_SEQNO;
378 goto out;
379 } else if (request_seq == asoc->strreset_inseq) {
380 asoc->strreset_inseq++;
381 }
382
383 /* Check strreset_enable only after incrementing inseq: the sender
384 * cannot tell that the peer has strreset disabled when it merely
385 * gets a response with result denied; this also matches bsd behaviour.
386 */
387 if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ))
388 goto out;
389
390 if (asoc->strreset_chunk) {
391 sctp_paramhdr_t *param_hdr;
392 struct sctp_transport *t;
393
394 param_hdr = sctp_chunk_lookup_strreset_param(
395 asoc, outreq->response_seq);
396 if (!param_hdr || param_hdr->type !=
397 SCTP_PARAM_RESET_IN_REQUEST) {
398 /* handle this the same way as when strreset_outstanding isn't 0 */
399 result = SCTP_STRRESET_ERR_IN_PROGRESS;
400 goto out;
401 }
402
403 asoc->strreset_outstanding--;
404 asoc->strreset_outseq++;
405
406 if (!asoc->strreset_outstanding) {
407 t = asoc->strreset_chunk->transport;
408 if (del_timer(&t->reconf_timer))
409 sctp_transport_put(t);
410
411 sctp_chunk_put(asoc->strreset_chunk);
412 asoc->strreset_chunk = NULL;
413 }
414
415 flags = SCTP_STREAM_RESET_INCOMING_SSN;
416 }
417
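	/* The remainder of the parameter is a list of 16-bit stream numbers;
	 * an empty list means every incoming stream is to be reset.
	 */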
418 nums = (ntohs(param.p->length) - sizeof(*outreq)) / 2;
419 if (nums) {
420 str_p = outreq->list_of_streams;
421 for (i = 0; i < nums; i++) {
422 if (ntohs(str_p[i]) >= stream->incnt) {
423 result = SCTP_STRRESET_ERR_WRONG_SSN;
424 goto out;
425 }
426 }
427
428 for (i = 0; i < nums; i++)
429 stream->in[ntohs(str_p[i])].ssn = 0;
430 } else {
431 for (i = 0; i < stream->incnt; i++)
432 stream->in[i].ssn = 0;
433 }
434
435 result = SCTP_STRRESET_PERFORMED;
436
437 *evp = sctp_ulpevent_make_stream_reset_event(asoc,
438 flags | SCTP_STREAM_RESET_OUTGOING_SSN, nums, str_p,
439 GFP_ATOMIC);
440
441out:
442 return sctp_make_strreset_resp(asoc, result, request_seq);
443}
444
445struct sctp_chunk *sctp_process_strreset_inreq(
446 struct sctp_association *asoc,
447 union sctp_params param,
448 struct sctp_ulpevent **evp)
449{
450 struct sctp_strreset_inreq *inreq = param.v;
451 struct sctp_stream *stream = asoc->stream;
452 __u32 result = SCTP_STRRESET_DENIED;
453 struct sctp_chunk *chunk = NULL;
454 __u16 i, nums, *str_p;
455 __u32 request_seq;
456
457 request_seq = ntohl(inreq->request_seq);
458 if (request_seq > asoc->strreset_inseq) {
459 result = SCTP_STRRESET_ERR_BAD_SEQNO;
460 goto out;
461 } else if (request_seq == asoc->strreset_inseq) {
462 asoc->strreset_inseq++;
463 }
464
465 if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ))
466 goto out;
467
468 if (asoc->strreset_outstanding) {
469 result = SCTP_STRRESET_ERR_IN_PROGRESS;
470 goto out;
471 }
472
473 nums = (ntohs(param.p->length) - sizeof(*inreq)) / 2;
474 str_p = inreq->list_of_streams;
475 for (i = 0; i < nums; i++) {
476 if (ntohs(str_p[i]) >= stream->outcnt) {
477 result = SCTP_STRRESET_ERR_WRONG_SSN;
478 goto out;
479 }
480 }
481
482 chunk = sctp_make_strreset_req(asoc, nums, str_p, 1, 0);
483 if (!chunk)
484 goto out;
485
486 if (nums)
487 for (i = 0; i < nums; i++)
488 stream->out[ntohs(str_p[i])].state =
489 SCTP_STREAM_CLOSED;
490 else
491 for (i = 0; i < stream->outcnt; i++)
492 stream->out[i].state = SCTP_STREAM_CLOSED;
493
494 asoc->strreset_chunk = chunk;
495 asoc->strreset_outstanding = 1;
496 sctp_chunk_hold(asoc->strreset_chunk);
497
498 *evp = sctp_ulpevent_make_stream_reset_event(asoc,
499 SCTP_STREAM_RESET_INCOMING_SSN, nums, str_p, GFP_ATOMIC);
500
501out:
502 if (!chunk)
503 chunk = sctp_make_strreset_resp(asoc, result, request_seq);
504
505 return chunk;
506}
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index ce54dce13ddb..721eeebfcd8a 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -72,7 +72,7 @@ static struct sctp_transport *sctp_transport_init(struct net *net,
72 */ 72 */
73 peer->rto = msecs_to_jiffies(net->sctp.rto_initial); 73 peer->rto = msecs_to_jiffies(net->sctp.rto_initial);
74 74
75 peer->last_time_heard = ktime_set(0, 0); 75 peer->last_time_heard = 0;
76 peer->last_time_ecne_reduced = jiffies; 76 peer->last_time_ecne_reduced = jiffies;
77 77
78 peer->param_flags = SPP_HB_DISABLE | 78 peer->param_flags = SPP_HB_DISABLE |
@@ -88,9 +88,11 @@ static struct sctp_transport *sctp_transport_init(struct net *net,
88 INIT_LIST_HEAD(&peer->transports); 88 INIT_LIST_HEAD(&peer->transports);
89 89
90 setup_timer(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event, 90 setup_timer(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event,
91 (unsigned long)peer); 91 (unsigned long)peer);
92 setup_timer(&peer->hb_timer, sctp_generate_heartbeat_event, 92 setup_timer(&peer->hb_timer, sctp_generate_heartbeat_event,
93 (unsigned long)peer); 93 (unsigned long)peer);
94 setup_timer(&peer->reconf_timer, sctp_generate_reconf_event,
95 (unsigned long)peer);
94 setup_timer(&peer->proto_unreach_timer, 96 setup_timer(&peer->proto_unreach_timer,
95 sctp_generate_proto_unreach_event, (unsigned long)peer); 97 sctp_generate_proto_unreach_event, (unsigned long)peer);
96 98
@@ -144,6 +146,9 @@ void sctp_transport_free(struct sctp_transport *transport)
144 if (del_timer(&transport->T3_rtx_timer)) 146 if (del_timer(&transport->T3_rtx_timer))
145 sctp_transport_put(transport); 147 sctp_transport_put(transport);
146 148
149 if (del_timer(&transport->reconf_timer))
150 sctp_transport_put(transport);
151
147 /* Delete the ICMP proto unreachable timer if it's active. */ 152 /* Delete the ICMP proto unreachable timer if it's active. */
148 if (del_timer(&transport->proto_unreach_timer)) 153 if (del_timer(&transport->proto_unreach_timer))
149 sctp_association_put(transport->asoc); 154 sctp_association_put(transport->asoc);
@@ -211,6 +216,14 @@ void sctp_transport_reset_hb_timer(struct sctp_transport *transport)
211 sctp_transport_hold(transport); 216 sctp_transport_hold(transport);
212} 217}
213 218
219void sctp_transport_reset_reconf_timer(struct sctp_transport *transport)
220{
221 if (!timer_pending(&transport->reconf_timer))
222 if (!mod_timer(&transport->reconf_timer,
223 jiffies + transport->rto))
224 sctp_transport_hold(transport);
225}
226
214/* This transport has been assigned to an association. 227/* This transport has been assigned to an association.
215 * Initialize fields from the association or from the sock itself. 228 * Initialize fields from the association or from the sock itself.
216 * Register the reference count in the association. 229 * Register the reference count in the association.
@@ -227,7 +240,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
227{ 240{
228 /* If we don't have a fresh route, look one up */ 241 /* If we don't have a fresh route, look one up */
229 if (!transport->dst || transport->dst->obsolete) { 242 if (!transport->dst || transport->dst->obsolete) {
230 dst_release(transport->dst); 243 sctp_transport_dst_release(transport);
231 transport->af_specific->get_dst(transport, &transport->saddr, 244 transport->af_specific->get_dst(transport, &transport->saddr,
232 &transport->fl, sk); 245 &transport->fl, sk);
233 } 246 }
@@ -238,14 +251,13 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
238 transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; 251 transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
239} 252}
240 253
241void sctp_transport_update_pmtu(struct sock *sk, struct sctp_transport *t, u32 pmtu) 254void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
242{ 255{
243 struct dst_entry *dst; 256 struct dst_entry *dst = sctp_transport_dst_check(t);
244 257
245 if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { 258 if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
246 pr_warn("%s: Reported pmtu %d too low, using default minimum of %d\n", 259 pr_warn("%s: Reported pmtu %d too low, using default minimum of %d\n",
247 __func__, pmtu, 260 __func__, pmtu, SCTP_DEFAULT_MINSEGMENT);
248 SCTP_DEFAULT_MINSEGMENT);
249 /* Use default minimum segment size and disable 261 /* Use default minimum segment size and disable
250 * pmtu discovery on this transport. 262 * pmtu discovery on this transport.
251 */ 263 */
@@ -254,17 +266,13 @@ void sctp_transport_update_pmtu(struct sock *sk, struct sctp_transport *t, u32 p
254 t->pathmtu = pmtu; 266 t->pathmtu = pmtu;
255 } 267 }
256 268
257 dst = sctp_transport_dst_check(t);
258 if (!dst)
259 t->af_specific->get_dst(t, &t->saddr, &t->fl, sk);
260
261 if (dst) { 269 if (dst) {
262 dst->ops->update_pmtu(dst, sk, NULL, pmtu); 270 dst->ops->update_pmtu(dst, t->asoc->base.sk, NULL, pmtu);
263
264 dst = sctp_transport_dst_check(t); 271 dst = sctp_transport_dst_check(t);
265 if (!dst)
266 t->af_specific->get_dst(t, &t->saddr, &t->fl, sk);
267 } 272 }
273
274 if (!dst)
275 t->af_specific->get_dst(t, &t->saddr, &t->fl, t->asoc->base.sk);
268} 276}
269 277
270/* Caches the dst entry and source address for a transport's destination 278/* Caches the dst entry and source address for a transport's destination
@@ -630,9 +638,7 @@ void sctp_transport_reset(struct sctp_transport *t)
630 t->srtt = 0; 638 t->srtt = 0;
631 t->rttvar = 0; 639 t->rttvar = 0;
632 640
633 /* Reset these additional varibles so that we have a clean 641 /* Reset these additional variables so that we have a clean slate. */
634 * slate.
635 */
636 t->partial_bytes_acked = 0; 642 t->partial_bytes_acked = 0;
637 t->flight_size = 0; 643 t->flight_size = 0;
638 t->error_count = 0; 644 t->error_count = 0;
@@ -659,3 +665,17 @@ void sctp_transport_immediate_rtx(struct sctp_transport *t)
659 sctp_transport_hold(t); 665 sctp_transport_hold(t);
660 } 666 }
661} 667}
668
669/* Drop dst */
670void sctp_transport_dst_release(struct sctp_transport *t)
671{
672 dst_release(t->dst);
673 t->dst = NULL;
674 t->dst_pending_confirm = 0;
675}
676
677/* Schedule neighbour confirm */
678void sctp_transport_dst_confirm(struct sctp_transport *t)
679{
680 t->dst_pending_confirm = 1;
681}
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index bea00058ce35..c8881bc542a0 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -854,6 +854,35 @@ struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event(
854 return event; 854 return event;
855} 855}
856 856
857struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event(
858 const struct sctp_association *asoc, __u16 flags, __u16 stream_num,
859 __u16 *stream_list, gfp_t gfp)
860{
861 struct sctp_stream_reset_event *sreset;
862 struct sctp_ulpevent *event;
863 struct sk_buff *skb;
864 int length, i;
865
866 length = sizeof(struct sctp_stream_reset_event) + 2 * stream_num;
867 event = sctp_ulpevent_new(length, MSG_NOTIFICATION, gfp);
868 if (!event)
869 return NULL;
870
871 skb = sctp_event2skb(event);
872 sreset = (struct sctp_stream_reset_event *)skb_put(skb, length);
873
874 sreset->strreset_type = SCTP_STREAM_RESET_EVENT;
875 sreset->strreset_flags = flags;
876 sreset->strreset_length = length;
877 sctp_ulpevent_set_owner(event, asoc);
878 sreset->strreset_assoc_id = sctp_assoc2id(asoc);
879
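	/* The stream list is passed in network byte order, as carried in the
	 * RECONF chunk; convert it to host order for the notification.
	 */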
880 for (i = 0; i < stream_num; i++)
881 sreset->strreset_stream_list[i] = ntohs(stream_list[i]);
882
883 return event;
884}
885
857/* Return the notification type, assuming this is a notification 886/* Return the notification type, assuming this is a notification
858 * event. 887 * event.
859 */ 888 */
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 84d0fdaf7de9..aa3624d50278 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -760,11 +760,11 @@ static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
760 struct sk_buff_head *event_list; 760 struct sk_buff_head *event_list;
761 struct sk_buff *pos, *tmp; 761 struct sk_buff *pos, *tmp;
762 struct sctp_ulpevent *cevent; 762 struct sctp_ulpevent *cevent;
763 struct sctp_stream *in; 763 struct sctp_stream *stream;
764 __u16 sid, csid, cssn; 764 __u16 sid, csid, cssn;
765 765
766 sid = event->stream; 766 sid = event->stream;
767 in = &ulpq->asoc->ssnmap->in; 767 stream = ulpq->asoc->stream;
768 768
769 event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev; 769 event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev;
770 770
@@ -782,11 +782,11 @@ static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
782 if (csid < sid) 782 if (csid < sid)
783 continue; 783 continue;
784 784
785 if (cssn != sctp_ssn_peek(in, sid)) 785 if (cssn != sctp_ssn_peek(stream, in, sid))
786 break; 786 break;
787 787
788 /* Found it, so mark in the ssnmap. */ 788 /* Found it, so mark in the stream. */
789 sctp_ssn_next(in, sid); 789 sctp_ssn_next(stream, in, sid);
790 790
791 __skb_unlink(pos, &ulpq->lobby); 791 __skb_unlink(pos, &ulpq->lobby);
792 792
@@ -849,7 +849,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
849 struct sctp_ulpevent *event) 849 struct sctp_ulpevent *event)
850{ 850{
851 __u16 sid, ssn; 851 __u16 sid, ssn;
852 struct sctp_stream *in; 852 struct sctp_stream *stream;
853 853
854 /* Check if this message needs ordering. */ 854 /* Check if this message needs ordering. */
855 if (SCTP_DATA_UNORDERED & event->msg_flags) 855 if (SCTP_DATA_UNORDERED & event->msg_flags)
@@ -858,10 +858,10 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
858 /* Note: The stream ID must be verified before this routine. */ 858 /* Note: The stream ID must be verified before this routine. */
859 sid = event->stream; 859 sid = event->stream;
860 ssn = event->ssn; 860 ssn = event->ssn;
861 in = &ulpq->asoc->ssnmap->in; 861 stream = ulpq->asoc->stream;
862 862
863 /* Is this the expected SSN for this stream ID? */ 863 /* Is this the expected SSN for this stream ID? */
864 if (ssn != sctp_ssn_peek(in, sid)) { 864 if (ssn != sctp_ssn_peek(stream, in, sid)) {
865 /* We've received something out of order, so find where it 865 /* We've received something out of order, so find where it
866 * needs to be placed. We order by stream and then by SSN. 866 * needs to be placed. We order by stream and then by SSN.
867 */ 867 */
@@ -870,7 +870,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
870 } 870 }
871 871
872 /* Mark that the next chunk has been found. */ 872 /* Mark that the next chunk has been found. */
873 sctp_ssn_next(in, sid); 873 sctp_ssn_next(stream, in, sid);
874 874
875 /* Go find any other chunks that were waiting for 875 /* Go find any other chunks that were waiting for
876 * ordering. 876 * ordering.
@@ -888,12 +888,12 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
888 struct sk_buff *pos, *tmp; 888 struct sk_buff *pos, *tmp;
889 struct sctp_ulpevent *cevent; 889 struct sctp_ulpevent *cevent;
890 struct sctp_ulpevent *event; 890 struct sctp_ulpevent *event;
891 struct sctp_stream *in; 891 struct sctp_stream *stream;
892 struct sk_buff_head temp; 892 struct sk_buff_head temp;
893 struct sk_buff_head *lobby = &ulpq->lobby; 893 struct sk_buff_head *lobby = &ulpq->lobby;
894 __u16 csid, cssn; 894 __u16 csid, cssn;
895 895
896 in = &ulpq->asoc->ssnmap->in; 896 stream = ulpq->asoc->stream;
897 897
898 /* We are holding the chunks by stream, by SSN. */ 898 /* We are holding the chunks by stream, by SSN. */
899 skb_queue_head_init(&temp); 899 skb_queue_head_init(&temp);
@@ -912,7 +912,7 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
912 continue; 912 continue;
913 913
914 /* see if this ssn has been marked by skipping */ 914 /* see if this ssn has been marked by skipping */
915 if (!SSN_lt(cssn, sctp_ssn_peek(in, csid))) 915 if (!SSN_lt(cssn, sctp_ssn_peek(stream, in, csid)))
916 break; 916 break;
917 917
918 __skb_unlink(pos, lobby); 918 __skb_unlink(pos, lobby);
@@ -932,8 +932,8 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
932 csid = cevent->stream; 932 csid = cevent->stream;
933 cssn = cevent->ssn; 933 cssn = cevent->ssn;
934 934
935 if (csid == sid && cssn == sctp_ssn_peek(in, csid)) { 935 if (csid == sid && cssn == sctp_ssn_peek(stream, in, csid)) {
936 sctp_ssn_next(in, csid); 936 sctp_ssn_next(stream, in, csid);
937 __skb_unlink(pos, lobby); 937 __skb_unlink(pos, lobby);
938 __skb_queue_tail(&temp, pos); 938 __skb_queue_tail(&temp, pos);
939 event = sctp_skb2event(pos); 939 event = sctp_skb2event(pos);
@@ -955,17 +955,17 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
955 */ 955 */
956void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn) 956void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn)
957{ 957{
958 struct sctp_stream *in; 958 struct sctp_stream *stream;
959 959
960 /* Note: The stream ID must be verified before this routine. */ 960 /* Note: The stream ID must be verified before this routine. */
961 in = &ulpq->asoc->ssnmap->in; 961 stream = ulpq->asoc->stream;
962 962
963 /* Is this an old SSN? If so ignore. */ 963 /* Is this an old SSN? If so ignore. */
964 if (SSN_lt(ssn, sctp_ssn_peek(in, sid))) 964 if (SSN_lt(ssn, sctp_ssn_peek(stream, in, sid)))
965 return; 965 return;
966 966
967 /* Mark that we are no longer expecting this SSN or lower. */ 967 /* Mark that we are no longer expecting this SSN or lower. */
968 sctp_ssn_skip(in, sid, ssn); 968 sctp_ssn_skip(stream, in, sid, ssn);
969 969
970 /* Go find any other chunks that were waiting for 970 /* Go find any other chunks that were waiting for
971 * ordering and deliver them if needed. 971 * ordering and deliver them if needed.
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
new file mode 100644
index 000000000000..c717ef0896aa
--- /dev/null
+++ b/net/smc/Kconfig
@@ -0,0 +1,20 @@
1config SMC
2 tristate "SMC socket protocol family"
3 depends on INET && INFINIBAND
4 ---help---
5 SMC-R provides a "sockets over RDMA" solution making use of
6 RDMA over Converged Ethernet (RoCE) technology to upgrade
7 AF_INET TCP connections transparently.
8 The Linux implementation of the SMC-R solution is designed as
9 a separate socket family SMC.
10
11 Select this option if you want to run SMC socket applications.
12
13config SMC_DIAG
14 tristate "SMC: socket monitoring interface"
15 depends on SMC
16 ---help---
17 Support for SMC socket monitoring interface used by tools such as
18 smcss.
19
20 If unsure, say Y.
diff --git a/net/smc/Makefile b/net/smc/Makefile
new file mode 100644
index 000000000000..188104654b54
--- /dev/null
+++ b/net/smc/Makefile
@@ -0,0 +1,4 @@
1obj-$(CONFIG_SMC) += smc.o
2obj-$(CONFIG_SMC_DIAG) += smc_diag.o
3smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
4smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
new file mode 100644
index 000000000000..093803786eac
--- /dev/null
+++ b/net/smc/af_smc.c
@@ -0,0 +1,1409 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * AF_SMC protocol family socket handler keeping the AF_INET sock address type
5 * applies to SOCK_STREAM sockets only
6 * offers an alternative communication option for TCP-protocol sockets
7 * applicable with RoCE-cards only
8 *
9 * Initial restrictions:
10 * - non-blocking connect postponed
11 * - IPv6 support postponed
12 * - support for alternate links postponed
13 * - partial support for non-blocking sockets only
14 * - support for urgent data postponed
15 *
16 * Copyright IBM Corp. 2016
17 *
18 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
19 * based on prototype from Frank Blaschka
20 */
21
22#define KMSG_COMPONENT "smc"
23#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
24
25#include <linux/module.h>
26#include <linux/socket.h>
27#include <linux/inetdevice.h>
28#include <linux/workqueue.h>
29#include <linux/in.h>
30#include <linux/sched/signal.h>
31
32#include <net/sock.h>
33#include <net/tcp.h>
34#include <net/smc.h>
35
36#include "smc.h"
37#include "smc_clc.h"
38#include "smc_llc.h"
39#include "smc_cdc.h"
40#include "smc_core.h"
41#include "smc_ib.h"
42#include "smc_pnet.h"
43#include "smc_tx.h"
44#include "smc_rx.h"
45#include "smc_close.h"
46
47static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group
48 * creation
49 */
50
51struct smc_lgr_list smc_lgr_list = { /* established link groups */
52 .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
53 .list = LIST_HEAD_INIT(smc_lgr_list.list),
54};
55
56static void smc_tcp_listen_work(struct work_struct *);
57
58static void smc_set_keepalive(struct sock *sk, int val)
59{
60 struct smc_sock *smc = smc_sk(sk);
61
62 smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
63}
64
65static struct smc_hashinfo smc_v4_hashinfo = {
66 .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
67};
68
69int smc_hash_sk(struct sock *sk)
70{
71 struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
72 struct hlist_head *head;
73
74 head = &h->ht;
75
76 write_lock_bh(&h->lock);
77 sk_add_node(sk, head);
78 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
79 write_unlock_bh(&h->lock);
80
81 return 0;
82}
83EXPORT_SYMBOL_GPL(smc_hash_sk);
84
85void smc_unhash_sk(struct sock *sk)
86{
87 struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
88
89 write_lock_bh(&h->lock);
90 if (sk_del_node_init(sk))
91 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
92 write_unlock_bh(&h->lock);
93}
94EXPORT_SYMBOL_GPL(smc_unhash_sk);
95
96struct proto smc_proto = {
97 .name = "SMC",
98 .owner = THIS_MODULE,
99 .keepalive = smc_set_keepalive,
100 .hash = smc_hash_sk,
101 .unhash = smc_unhash_sk,
102 .obj_size = sizeof(struct smc_sock),
103 .h.smc_hash = &smc_v4_hashinfo,
104 .slab_flags = SLAB_DESTROY_BY_RCU,
105};
106EXPORT_SYMBOL_GPL(smc_proto);
107
108static int smc_release(struct socket *sock)
109{
110 struct sock *sk = sock->sk;
111 struct smc_sock *smc;
112 int rc = 0;
113
114 if (!sk)
115 goto out;
116
117 smc = smc_sk(sk);
118 sock_hold(sk);
119 if (sk->sk_state == SMC_LISTEN)
120 /* smc_close_non_accepted() is called and acquires
121 * sock lock for child sockets again
122 */
123 lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
124 else
125 lock_sock(sk);
126
127 if (smc->use_fallback) {
128 sk->sk_state = SMC_CLOSED;
129 sk->sk_state_change(sk);
130 } else {
131 rc = smc_close_active(smc);
132 sock_set_flag(sk, SOCK_DEAD);
133 sk->sk_shutdown |= SHUTDOWN_MASK;
134 }
135 if (smc->clcsock) {
136 sock_release(smc->clcsock);
137 smc->clcsock = NULL;
138 }
139
140 /* detach socket */
141 sock_orphan(sk);
142 sock->sk = NULL;
143 if (smc->use_fallback) {
144 schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
145 } else if (sk->sk_state == SMC_CLOSED) {
146 smc_conn_free(&smc->conn);
147 schedule_delayed_work(&smc->sock_put_work,
148 SMC_CLOSE_SOCK_PUT_DELAY);
149 }
150 sk->sk_prot->unhash(sk);
151 release_sock(sk);
152
153 sock_put(sk);
154out:
155 return rc;
156}
157
158static void smc_destruct(struct sock *sk)
159{
160 if (sk->sk_state != SMC_CLOSED)
161 return;
162 if (!sock_flag(sk, SOCK_DEAD))
163 return;
164
165 sk_refcnt_debug_dec(sk);
166}
167
168static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
169{
170 struct smc_sock *smc;
171 struct sock *sk;
172
173 sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
174 if (!sk)
175 return NULL;
176
177 sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
178 sk->sk_state = SMC_INIT;
179 sk->sk_destruct = smc_destruct;
180 sk->sk_protocol = SMCPROTO_SMC;
181 smc = smc_sk(sk);
182 INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
183 INIT_LIST_HEAD(&smc->accept_q);
184 spin_lock_init(&smc->accept_q_lock);
185 INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work);
186 sk->sk_prot->hash(sk);
187 sk_refcnt_debug_inc(sk);
188
189 return sk;
190}
191
192static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
193 int addr_len)
194{
195 struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
196 struct sock *sk = sock->sk;
197 struct smc_sock *smc;
198 int rc;
199
200 smc = smc_sk(sk);
201
202 /* replicate tests from inet_bind(), to be safe wrt. future changes */
203 rc = -EINVAL;
204 if (addr_len < sizeof(struct sockaddr_in))
205 goto out;
206
207 rc = -EAFNOSUPPORT;
208 /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
209 if ((addr->sin_family != AF_INET) &&
210 ((addr->sin_family != AF_UNSPEC) ||
211 (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
212 goto out;
213
214 lock_sock(sk);
215
216 /* Check if socket is already active */
217 rc = -EINVAL;
218 if (sk->sk_state != SMC_INIT)
219 goto out_rel;
220
221 smc->clcsock->sk->sk_reuse = sk->sk_reuse;
222 rc = kernel_bind(smc->clcsock, uaddr, addr_len);
223
224out_rel:
225 release_sock(sk);
226out:
227 return rc;
228}
229
230static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
231 unsigned long mask)
232{
233 /* options for which we don't get control via setsockopt */
234 nsk->sk_type = osk->sk_type;
235 nsk->sk_sndbuf = osk->sk_sndbuf;
236 nsk->sk_rcvbuf = osk->sk_rcvbuf;
237 nsk->sk_sndtimeo = osk->sk_sndtimeo;
238 nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
239 nsk->sk_mark = osk->sk_mark;
240 nsk->sk_priority = osk->sk_priority;
241 nsk->sk_rcvlowat = osk->sk_rcvlowat;
242 nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
243 nsk->sk_err = osk->sk_err;
244
245 nsk->sk_flags &= ~mask;
246 nsk->sk_flags |= osk->sk_flags & mask;
247}
248
249#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
250 (1UL << SOCK_KEEPOPEN) | \
251 (1UL << SOCK_LINGER) | \
252 (1UL << SOCK_BROADCAST) | \
253 (1UL << SOCK_TIMESTAMP) | \
254 (1UL << SOCK_DBG) | \
255 (1UL << SOCK_RCVTSTAMP) | \
256 (1UL << SOCK_RCVTSTAMPNS) | \
257 (1UL << SOCK_LOCALROUTE) | \
258 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
259 (1UL << SOCK_RXQ_OVFL) | \
260 (1UL << SOCK_WIFI_STATUS) | \
261 (1UL << SOCK_NOFCS) | \
262 (1UL << SOCK_FILTER_LOCKED))
263/* copy only relevant settings and flags of SOL_SOCKET level from smc to
264 * clc socket (since smc is not called for these options from net/core)
265 */
266static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
267{
268 smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
269}
270
271#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
272 (1UL << SOCK_KEEPOPEN) | \
273 (1UL << SOCK_LINGER) | \
274 (1UL << SOCK_DBG))
275/* copy only settings and flags relevant for smc from clc to smc socket */
276static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
277{
278 smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
279}
280
281/* determine subnet and mask of internal TCP socket */
282int smc_netinfo_by_tcpsk(struct socket *clcsock,
283 __be32 *subnet, u8 *prefix_len)
284{
285 struct dst_entry *dst = sk_dst_get(clcsock->sk);
286 struct sockaddr_in addr;
287 int rc = -ENOENT;
288 int len;
289
290 if (!dst) {
291 rc = -ENOTCONN;
292 goto out;
293 }
294 if (!dst->dev) {
295 rc = -ENODEV;
296 goto out_rel;
297 }
298
299 /* get address to which the internal TCP socket is bound */
300 kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
301 /* analyze IPv4 specific data of net_device belonging to TCP socket */
302 for_ifa(dst->dev->ip_ptr) {
303 if (ifa->ifa_address != addr.sin_addr.s_addr)
304 continue;
305 *prefix_len = inet_mask_len(ifa->ifa_mask);
306 *subnet = ifa->ifa_address & ifa->ifa_mask;
307 rc = 0;
308 break;
309 } endfor_ifa(dst->dev->ip_ptr);
310
311out_rel:
312 dst_release(dst);
313out:
314 return rc;
315}
316
317static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
318{
319 struct smc_link_group *lgr = smc->conn.lgr;
320 struct smc_link *link;
321 int rest;
322 int rc;
323
324 link = &lgr->lnk[SMC_SINGLE_LINK];
325 /* receive CONFIRM LINK request from server over RoCE fabric */
326 rest = wait_for_completion_interruptible_timeout(
327 &link->llc_confirm,
328 SMC_LLC_WAIT_FIRST_TIME);
329 if (rest <= 0) {
330 struct smc_clc_msg_decline dclc;
331
332 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
333 SMC_CLC_DECLINE);
334 return rc;
335 }
336
337 rc = smc_ib_modify_qp_rts(link);
338 if (rc)
339 return SMC_CLC_DECL_INTERR;
340
341 smc_wr_remember_qp_attr(link);
342 /* send CONFIRM LINK response over RoCE fabric */
343 rc = smc_llc_send_confirm_link(link,
344 link->smcibdev->mac[link->ibport - 1],
345 gid, SMC_LLC_RESP);
346 if (rc < 0)
347 return SMC_CLC_DECL_TCL;
348
349 return rc;
350}
351
352static void smc_conn_save_peer_info(struct smc_sock *smc,
353 struct smc_clc_msg_accept_confirm *clc)
354{
355 smc->conn.peer_conn_idx = clc->conn_idx;
356 smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
357 smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
358 atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
359}
360
361static void smc_link_save_peer_info(struct smc_link *link,
362 struct smc_clc_msg_accept_confirm *clc)
363{
364 link->peer_qpn = ntoh24(clc->qpn);
365 memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
366 memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
367 link->peer_psn = ntoh24(clc->psn);
368 link->peer_mtu = clc->qp_mtu;
369}
370
371/* setup for RDMA connection of client */
372static int smc_connect_rdma(struct smc_sock *smc)
373{
374 struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
375 struct smc_clc_msg_accept_confirm aclc;
376 int local_contact = SMC_FIRST_CONTACT;
377 struct smc_ib_device *smcibdev;
378 struct smc_link *link;
379 u8 srv_first_contact;
380 int reason_code = 0;
381 int rc = 0;
382 u8 ibport;
383
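	/* Convention in this function: a negative reason_code is a local
	 * error that aborts the connect, while a positive reason_code is a
	 * CLC decline reason sent to the peer before falling back to TCP.
	 */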
384 /* IPSec connections opt out of SMC-R optimizations */
385 if (using_ipsec(smc)) {
386 reason_code = SMC_CLC_DECL_IPSEC;
387 goto decline_rdma;
388 }
389
390 /* PNET table look up: search active ib_device and port
391 * within same PNETID that also contains the ethernet device
392 * used for the internal TCP socket
393 */
394 smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
395 if (!smcibdev) {
396 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
397 goto decline_rdma;
398 }
399
400 /* do inband token exchange */
401 reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
402 if (reason_code < 0) {
403 rc = reason_code;
404 goto out_err;
405 }
406 if (reason_code > 0) /* configuration error */
407 goto decline_rdma;
408 /* receive SMC Accept CLC message */
409 reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
410 SMC_CLC_ACCEPT);
411 if (reason_code < 0) {
412 rc = reason_code;
413 goto out_err;
414 }
415 if (reason_code > 0)
416 goto decline_rdma;
417
418 srv_first_contact = aclc.hdr.flag;
419 mutex_lock(&smc_create_lgr_pending);
420 local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
421 ibport, &aclc.lcl, srv_first_contact);
422 if (local_contact < 0) {
423 rc = local_contact;
424 if (rc == -ENOMEM)
425 reason_code = SMC_CLC_DECL_MEM; /* insufficient memory */
426 else if (rc == -ENOLINK)
427 reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
428 goto decline_rdma_unlock;
429 }
430 link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
431
432 smc_conn_save_peer_info(smc, &aclc);
433
434 rc = smc_sndbuf_create(smc);
435 if (rc) {
436 reason_code = SMC_CLC_DECL_MEM;
437 goto decline_rdma_unlock;
438 }
439 rc = smc_rmb_create(smc);
440 if (rc) {
441 reason_code = SMC_CLC_DECL_MEM;
442 goto decline_rdma_unlock;
443 }
444
445 if (local_contact == SMC_FIRST_CONTACT)
446 smc_link_save_peer_info(link, &aclc);
447
448 rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
449 if (rc) {
450 reason_code = SMC_CLC_DECL_INTERR;
451 goto decline_rdma_unlock;
452 }
453
454 if (local_contact == SMC_FIRST_CONTACT) {
455 rc = smc_ib_ready_link(link);
456 if (rc) {
457 reason_code = SMC_CLC_DECL_INTERR;
458 goto decline_rdma_unlock;
459 }
460 }
461
462 rc = smc_clc_send_confirm(smc);
463 if (rc)
464 goto out_err_unlock;
465
466 if (local_contact == SMC_FIRST_CONTACT) {
467 /* QP confirmation over RoCE fabric */
468 reason_code = smc_clnt_conf_first_link(
469 smc, &smcibdev->gid[ibport - 1]);
470 if (reason_code < 0) {
471 rc = reason_code;
472 goto out_err_unlock;
473 }
474 if (reason_code > 0)
475 goto decline_rdma_unlock;
476 }
477
478 mutex_unlock(&smc_create_lgr_pending);
479 smc_tx_init(smc);
480 smc_rx_init(smc);
481
482out_connected:
483 smc_copy_sock_settings_to_clc(smc);
484 if (smc->sk.sk_state == SMC_INIT)
485 smc->sk.sk_state = SMC_ACTIVE;
486
487 return rc ? rc : local_contact;
488
489decline_rdma_unlock:
490 mutex_unlock(&smc_create_lgr_pending);
491 smc_conn_free(&smc->conn);
492decline_rdma:
493 /* RDMA setup failed, switch back to TCP */
494 smc->use_fallback = true;
495 if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
496 rc = smc_clc_send_decline(smc, reason_code, 0);
497 if (rc < sizeof(struct smc_clc_msg_decline))
498 goto out_err;
499 }
500 goto out_connected;
501
502out_err_unlock:
503 mutex_unlock(&smc_create_lgr_pending);
504 smc_conn_free(&smc->conn);
505out_err:
506 return rc;
507}
508
509static int smc_connect(struct socket *sock, struct sockaddr *addr,
510 int alen, int flags)
511{
512 struct sock *sk = sock->sk;
513 struct smc_sock *smc;
514 int rc = -EINVAL;
515
516 smc = smc_sk(sk);
517
518 /* separate smc parameter checking to be safe */
519 if (alen < sizeof(addr->sa_family))
520 goto out_err;
521 if (addr->sa_family != AF_INET)
522 goto out_err;
523 smc->addr = addr; /* needed for nonblocking connect */
524
525 lock_sock(sk);
526 switch (sk->sk_state) {
527 default:
528 goto out;
529 case SMC_ACTIVE:
530 rc = -EISCONN;
531 goto out;
532 case SMC_INIT:
533 rc = 0;
534 break;
535 }
536
537 smc_copy_sock_settings_to_clc(smc);
538 rc = kernel_connect(smc->clcsock, addr, alen, flags);
539 if (rc)
540 goto out;
541
542 /* setup RDMA connection */
543 rc = smc_connect_rdma(smc);
544 if (rc < 0)
545 goto out;
546 else
547 rc = 0; /* success cases including fallback */
548
549out:
550 release_sock(sk);
551out_err:
552 return rc;
553}
554
555static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
556{
557 struct sock *sk = &lsmc->sk;
558 struct socket *new_clcsock;
559 struct sock *new_sk;
560 int rc;
561
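	/* Drop the listen socket lock while allocating the new sock and while
	 * blocking in kernel_accept(); it is re-taken before lsmc is touched.
	 */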
562 release_sock(&lsmc->sk);
563 new_sk = smc_sock_alloc(sock_net(sk), NULL);
564 if (!new_sk) {
565 rc = -ENOMEM;
566 lsmc->sk.sk_err = ENOMEM;
567 *new_smc = NULL;
568 lock_sock(&lsmc->sk);
569 goto out;
570 }
571 *new_smc = smc_sk(new_sk);
572
573 rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
574 lock_sock(&lsmc->sk);
575 if (rc < 0) {
576 lsmc->sk.sk_err = -rc;
577 new_sk->sk_state = SMC_CLOSED;
578 sock_set_flag(new_sk, SOCK_DEAD);
579 sk->sk_prot->unhash(new_sk);
580 sock_put(new_sk);
581 *new_smc = NULL;
582 goto out;
583 }
584 if (lsmc->sk.sk_state == SMC_CLOSED) {
585 if (new_clcsock)
586 sock_release(new_clcsock);
587 new_sk->sk_state = SMC_CLOSED;
588 sock_set_flag(new_sk, SOCK_DEAD);
589 sk->sk_prot->unhash(new_sk);
590 sock_put(new_sk);
591 *new_smc = NULL;
592 goto out;
593 }
594
595 (*new_smc)->clcsock = new_clcsock;
596out:
597 return rc;
598}
599
600/* add a just created sock to the accept queue of the listen sock as
601 * candidate for a following socket accept call from user space
602 */
603static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
604{
605 struct smc_sock *par = smc_sk(parent);
606
607 sock_hold(sk);
608 spin_lock(&par->accept_q_lock);
609 list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
610 spin_unlock(&par->accept_q_lock);
611 sk_acceptq_added(parent);
612}
613
614/* remove a socket from the accept queue of its parental listening socket */
615static void smc_accept_unlink(struct sock *sk)
616{
617 struct smc_sock *par = smc_sk(sk)->listen_smc;
618
619 spin_lock(&par->accept_q_lock);
620 list_del_init(&smc_sk(sk)->accept_q);
621 spin_unlock(&par->accept_q_lock);
622 sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
623 sock_put(sk);
624}
625
626/* remove a sock from the accept queue to bind it to a new socket created
627 * for a socket accept call from user space
628 */
629struct sock *smc_accept_dequeue(struct sock *parent,
630 struct socket *new_sock)
631{
632 struct smc_sock *isk, *n;
633 struct sock *new_sk;
634
635 list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
636 new_sk = (struct sock *)isk;
637
638 smc_accept_unlink(new_sk);
639 if (new_sk->sk_state == SMC_CLOSED) {
640 /* tbd in follow-on patch: close this sock */
641 continue;
642 }
643 if (new_sock)
644 sock_graft(new_sk, new_sock);
645 return new_sk;
646 }
647 return NULL;
648}
649
650/* clean up for a created but never accepted sock */
651void smc_close_non_accepted(struct sock *sk)
652{
653 struct smc_sock *smc = smc_sk(sk);
654
655 sock_hold(sk);
656 lock_sock(sk);
657 if (!sk->sk_lingertime)
658 /* wait for peer closing */
659 sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
660 if (!smc->use_fallback)
661 smc_close_active(smc);
662 if (smc->clcsock) {
663 struct socket *tcp;
664
665 tcp = smc->clcsock;
666 smc->clcsock = NULL;
667 sock_release(tcp);
668 }
669 sock_set_flag(sk, SOCK_DEAD);
670 sk->sk_shutdown |= SHUTDOWN_MASK;
671 if (smc->use_fallback) {
672 schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
673 } else {
674 smc_conn_free(&smc->conn);
675 schedule_delayed_work(&smc->sock_put_work,
676 SMC_CLOSE_SOCK_PUT_DELAY);
677 }
678 release_sock(sk);
679 sock_put(sk);
680}
681
682static int smc_serv_conf_first_link(struct smc_sock *smc)
683{
684 struct smc_link_group *lgr = smc->conn.lgr;
685 struct smc_link *link;
686 int rest;
687 int rc;
688
689 link = &lgr->lnk[SMC_SINGLE_LINK];
690 /* send CONFIRM LINK request to client over the RoCE fabric */
691 rc = smc_llc_send_confirm_link(link,
692 link->smcibdev->mac[link->ibport - 1],
693 &link->smcibdev->gid[link->ibport - 1],
694 SMC_LLC_REQ);
695 if (rc < 0)
696 return SMC_CLC_DECL_TCL;
697
698 /* receive CONFIRM LINK response from client over the RoCE fabric */
699 rest = wait_for_completion_interruptible_timeout(
700 &link->llc_confirm_resp,
701 SMC_LLC_WAIT_FIRST_TIME);
702 if (rest <= 0) {
703 struct smc_clc_msg_decline dclc;
704
705 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
706 SMC_CLC_DECLINE);
707 }
708
709 return rc;
710}
711
712/* setup for RDMA connection of server */
713static void smc_listen_work(struct work_struct *work)
714{
715 struct smc_sock *new_smc = container_of(work, struct smc_sock,
716 smc_listen_work);
717 struct socket *newclcsock = new_smc->clcsock;
718 struct smc_sock *lsmc = new_smc->listen_smc;
719 struct smc_clc_msg_accept_confirm cclc;
720 int local_contact = SMC_REUSE_CONTACT;
721 struct sock *newsmcsk = &new_smc->sk;
722 struct smc_clc_msg_proposal pclc;
723 struct smc_ib_device *smcibdev;
724 struct sockaddr_in peeraddr;
725 struct smc_link *link;
726 int reason_code = 0;
727 int rc = 0, len;
728 __be32 subnet;
729 u8 prefix_len;
730 u8 ibport;
731
732 /* do inband token exchange -
733 * wait for and receive SMC Proposal CLC message
734 */
735 reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc),
736 SMC_CLC_PROPOSAL);
737 if (reason_code < 0)
738 goto out_err;
739 if (reason_code > 0)
740 goto decline_rdma;
741
742 /* IPSec connections opt out of SMC-R optimizations */
743 if (using_ipsec(new_smc)) {
744 reason_code = SMC_CLC_DECL_IPSEC;
745 goto decline_rdma;
746 }
747
748 /* PNET table look up: search active ib_device and port
749 * within same PNETID that also contains the ethernet device
750 * used for the internal TCP socket
751 */
752 smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
753 if (!smcibdev) {
754 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
755 goto decline_rdma;
756 }
757
758 /* determine subnet and mask from internal TCP socket */
759 rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
760 if (rc) {
761 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
762 goto decline_rdma;
763 }
764 if ((pclc.outgoing_subnet != subnet) ||
765 (pclc.prefix_len != prefix_len)) {
766 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
767 goto decline_rdma;
768 }
769
770 /* get address of the peer connected to the internal TCP socket */
771 kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);
772
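	/* smc_conn_create() either sets up a new link group for this peer
	 * (SMC_FIRST_CONTACT, which later requires the CONFIRM LINK exchange
	 * over the RoCE fabric) or attaches the connection to an existing one
	 * (SMC_REUSE_CONTACT).
	 */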
773 /* allocate connection / link group */
774 mutex_lock(&smc_create_lgr_pending);
775 local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
776 smcibdev, ibport, &pclc.lcl, 0);
777 if (local_contact == SMC_REUSE_CONTACT)
778 /* lock no longer needed, release it before the following
779 * smc_clc_wait_msg() call, which may block
780 */
781 mutex_unlock(&smc_create_lgr_pending);
782 if (local_contact < 0) {
783 rc = local_contact;
784 if (rc == -ENOMEM)
785 reason_code = SMC_CLC_DECL_MEM; /* insufficient memory */
786 else if (rc == -ENOLINK)
787 reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
788 goto decline_rdma;
789 }
790 link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
791
792 rc = smc_sndbuf_create(new_smc);
793 if (rc) {
794 reason_code = SMC_CLC_DECL_MEM;
795 goto decline_rdma;
796 }
797 rc = smc_rmb_create(new_smc);
798 if (rc) {
799 reason_code = SMC_CLC_DECL_MEM;
800 goto decline_rdma;
801 }
802
803 rc = smc_clc_send_accept(new_smc, local_contact);
804 if (rc)
805 goto out_err;
806
807 /* receive SMC Confirm CLC message */
808 reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
809 SMC_CLC_CONFIRM);
810 if (reason_code < 0)
811 goto out_err;
812 if (reason_code > 0)
813 goto decline_rdma;
814 smc_conn_save_peer_info(new_smc, &cclc);
815 if (local_contact == SMC_FIRST_CONTACT)
816 smc_link_save_peer_info(link, &cclc);
817
818 rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
819 if (rc) {
820 reason_code = SMC_CLC_DECL_INTERR;
821 goto decline_rdma;
822 }
823
824 if (local_contact == SMC_FIRST_CONTACT) {
825 rc = smc_ib_ready_link(link);
826 if (rc) {
827 reason_code = SMC_CLC_DECL_INTERR;
828 goto decline_rdma;
829 }
830 /* QP confirmation over RoCE fabric */
831 reason_code = smc_serv_conf_first_link(new_smc);
832 if (reason_code < 0) {
833 /* peer is not aware of a problem */
834 rc = reason_code;
835 goto out_err;
836 }
837 if (reason_code > 0)
838 goto decline_rdma;
839 }
840
841 smc_tx_init(new_smc);
842 smc_rx_init(new_smc);
843
844out_connected:
845 sk_refcnt_debug_inc(newsmcsk);
846 if (newsmcsk->sk_state == SMC_INIT)
847 newsmcsk->sk_state = SMC_ACTIVE;
848enqueue:
849 if (local_contact == SMC_FIRST_CONTACT)
850 mutex_unlock(&smc_create_lgr_pending);
851 lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
852 if (lsmc->sk.sk_state == SMC_LISTEN) {
853 smc_accept_enqueue(&lsmc->sk, newsmcsk);
854 } else { /* no longer listening */
855 smc_close_non_accepted(newsmcsk);
856 }
857 release_sock(&lsmc->sk);
858
859 /* Wake up accept */
860 lsmc->sk.sk_data_ready(&lsmc->sk);
861 sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
862 return;
863
864decline_rdma:
865 /* RDMA setup failed, switch back to TCP */
866 smc_conn_free(&new_smc->conn);
867 new_smc->use_fallback = true;
868 if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
869 rc = smc_clc_send_decline(new_smc, reason_code, 0);
870 if (rc < sizeof(struct smc_clc_msg_decline))
871 goto out_err;
872 }
873 goto out_connected;
874
875out_err:
876 newsmcsk->sk_state = SMC_CLOSED;
877 smc_conn_free(&new_smc->conn);
878 goto enqueue; /* queue new sock with sk_err set */
879}
880
881static void smc_tcp_listen_work(struct work_struct *work)
882{
883 struct smc_sock *lsmc = container_of(work, struct smc_sock,
884 tcp_listen_work);
885 struct smc_sock *new_smc;
886 int rc = 0;
887
888 lock_sock(&lsmc->sk);
889 while (lsmc->sk.sk_state == SMC_LISTEN) {
890 rc = smc_clcsock_accept(lsmc, &new_smc);
891 if (rc)
892 goto out;
893 if (!new_smc)
894 continue;
895
896 new_smc->listen_smc = lsmc;
897 new_smc->use_fallback = false; /* assume rdma capability first */
898 sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */
899 INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
900 smc_copy_sock_settings_to_smc(new_smc);
901 schedule_work(&new_smc->smc_listen_work);
902 }
903
904out:
905 release_sock(&lsmc->sk);
906 lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */
907}
908
909static int smc_listen(struct socket *sock, int backlog)
910{
911 struct sock *sk = sock->sk;
912 struct smc_sock *smc;
913 int rc;
914
915 smc = smc_sk(sk);
916 lock_sock(sk);
917
918 rc = -EINVAL;
919 if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
920 goto out;
921
922 rc = 0;
923 if (sk->sk_state == SMC_LISTEN) {
924 sk->sk_max_ack_backlog = backlog;
925 goto out;
926 }
927 /* some socket options are handled in core, so we cannot apply
928 * them to the clc socket -- copy smc socket options to clc socket
929 */
930 smc_copy_sock_settings_to_clc(smc);
931
932 rc = kernel_listen(smc->clcsock, backlog);
933 if (rc)
934 goto out;
935 sk->sk_max_ack_backlog = backlog;
936 sk->sk_ack_backlog = 0;
937 sk->sk_state = SMC_LISTEN;
938 INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
939 schedule_work(&smc->tcp_listen_work);
940
941out:
942 release_sock(sk);
943 return rc;
944}
945
946static int smc_accept(struct socket *sock, struct socket *new_sock,
947 int flags, bool kern)
948{
949 struct sock *sk = sock->sk, *nsk;
950 DECLARE_WAITQUEUE(wait, current);
951 struct smc_sock *lsmc;
952 long timeo;
953 int rc = 0;
954
955 lsmc = smc_sk(sk);
956 lock_sock(sk);
957
958 if (lsmc->sk.sk_state != SMC_LISTEN) {
959 rc = -EINVAL;
960 goto out;
961 }
962
963 /* Wait for an incoming connection */
964 timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
965 add_wait_queue_exclusive(sk_sleep(sk), &wait);
966 while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
967 set_current_state(TASK_INTERRUPTIBLE);
968 if (!timeo) {
969 rc = -EAGAIN;
970 break;
971 }
972 release_sock(sk);
973 timeo = schedule_timeout(timeo);
974 /* wakeup by sk_data_ready in smc_listen_work() */
975 sched_annotate_sleep();
976 lock_sock(sk);
977 if (signal_pending(current)) {
978 rc = sock_intr_errno(timeo);
979 break;
980 }
981 }
982 set_current_state(TASK_RUNNING);
983 remove_wait_queue(sk_sleep(sk), &wait);
984
985 if (!rc)
986 rc = sock_error(nsk);
987
988out:
989 release_sock(sk);
990 return rc;
991}
992
993static int smc_getname(struct socket *sock, struct sockaddr *addr,
994 int *len, int peer)
995{
996 struct smc_sock *smc;
997
998 if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
999 (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1000 return -ENOTCONN;
1001
1002 smc = smc_sk(sock->sk);
1003
1004 return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
1005}
1006
1007static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1008{
1009 struct sock *sk = sock->sk;
1010 struct smc_sock *smc;
1011 int rc = -EPIPE;
1012
1013 smc = smc_sk(sk);
1014 lock_sock(sk);
1015 if ((sk->sk_state != SMC_ACTIVE) &&
1016 (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1017 (sk->sk_state != SMC_INIT))
1018 goto out;
1019 if (smc->use_fallback)
1020 rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1021 else
1022 rc = smc_tx_sendmsg(smc, msg, len);
1023out:
1024 release_sock(sk);
1025 return rc;
1026}
1027
1028static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1029 int flags)
1030{
1031 struct sock *sk = sock->sk;
1032 struct smc_sock *smc;
1033 int rc = -ENOTCONN;
1034
1035 smc = smc_sk(sk);
1036 lock_sock(sk);
1037 if ((sk->sk_state == SMC_INIT) ||
1038 (sk->sk_state == SMC_LISTEN) ||
1039 (sk->sk_state == SMC_CLOSED))
1040 goto out;
1041
1042 if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1043 rc = 0;
1044 goto out;
1045 }
1046
1047 if (smc->use_fallback)
1048 rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1049 else
1050 rc = smc_rx_recvmsg(smc, msg, len, flags);
1051
1052out:
1053 release_sock(sk);
1054 return rc;
1055}
1056
1057static unsigned int smc_accept_poll(struct sock *parent)
1058{
1059 struct smc_sock *isk;
1060 struct sock *sk;
1061
1062 lock_sock(parent);
1063 list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) {
1064 sk = (struct sock *)isk;
1065
1066 if (sk->sk_state == SMC_ACTIVE) {
1067 release_sock(parent);
1068 return POLLIN | POLLRDNORM;
1069 }
1070 }
1071 release_sock(parent);
1072
1073 return 0;
1074}
1075
1076static unsigned int smc_poll(struct file *file, struct socket *sock,
1077 poll_table *wait)
1078{
1079 struct sock *sk = sock->sk;
1080 unsigned int mask = 0;
1081 struct smc_sock *smc;
1082 int rc;
1083
1084 smc = smc_sk(sock->sk);
1085 if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
1086 /* delegate to CLC child sock */
1087 mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
1088 /* if non-blocking connect finished ... */
1089 lock_sock(sk);
1090 if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
1091 sk->sk_err = smc->clcsock->sk->sk_err;
1092 if (sk->sk_err) {
1093 mask |= POLLERR;
1094 } else {
1095 rc = smc_connect_rdma(smc);
1096 if (rc < 0)
1097 mask |= POLLERR;
1098 else
1099 /* success cases including fallback */
1100 mask |= POLLOUT | POLLWRNORM;
1101 }
1102 }
1103 release_sock(sk);
1104 } else {
1105 sock_poll_wait(file, sk_sleep(sk), wait);
1106 if (sk->sk_state == SMC_LISTEN)
1107 /* woken up by sk_data_ready in smc_listen_work() */
1108 mask |= smc_accept_poll(sk);
1109 if (sk->sk_err)
1110 mask |= POLLERR;
1111 if (atomic_read(&smc->conn.sndbuf_space) ||
1112 (sk->sk_shutdown & SEND_SHUTDOWN)) {
1113 mask |= POLLOUT | POLLWRNORM;
1114 } else {
1115 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1116 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1117 }
1118 if (atomic_read(&smc->conn.bytes_to_rcv))
1119 mask |= POLLIN | POLLRDNORM;
1120 if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
1121 (sk->sk_state == SMC_CLOSED))
1122 mask |= POLLHUP;
1123 if (sk->sk_shutdown & RCV_SHUTDOWN)
1124 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
1125 if (sk->sk_state == SMC_APPCLOSEWAIT1)
1126 mask |= POLLIN;
1127
1128 }
1129
1130 return mask;
1131}
1132
1133static int smc_shutdown(struct socket *sock, int how)
1134{
1135 struct sock *sk = sock->sk;
1136 struct smc_sock *smc;
1137 int rc = -EINVAL;
1138 int rc1 = 0;
1139
1140 smc = smc_sk(sk);
1141
1142 if ((how < SHUT_RD) || (how > SHUT_RDWR))
1143 return rc;
1144
1145 lock_sock(sk);
1146
1147 rc = -ENOTCONN;
1148 if ((sk->sk_state != SMC_LISTEN) &&
1149 (sk->sk_state != SMC_ACTIVE) &&
1150 (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1151 (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1152 (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1153 (sk->sk_state != SMC_APPCLOSEWAIT2) &&
1154 (sk->sk_state != SMC_APPFINCLOSEWAIT))
1155 goto out;
1156 if (smc->use_fallback) {
1157 rc = kernel_sock_shutdown(smc->clcsock, how);
1158 sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
1159 if (sk->sk_shutdown == SHUTDOWN_MASK)
1160 sk->sk_state = SMC_CLOSED;
1161 goto out;
1162 }
1163 switch (how) {
1164 case SHUT_RDWR: /* shutdown in both directions */
1165 rc = smc_close_active(smc);
1166 break;
1167 case SHUT_WR:
1168 rc = smc_close_shutdown_write(smc);
1169 break;
1170 case SHUT_RD:
1171 if (sk->sk_state == SMC_LISTEN)
1172 rc = smc_close_active(smc);
1173 else
1174 rc = 0;
1175 /* nothing more to do because peer is not involved */
1176 break;
1177 }
1178 rc1 = kernel_sock_shutdown(smc->clcsock, how);
1179 /* map sock_shutdown_cmd constants to sk_shutdown value range */
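	/* (SHUT_RD + 1 == RCV_SHUTDOWN, SHUT_WR + 1 == SEND_SHUTDOWN,
	 *  SHUT_RDWR + 1 == SHUTDOWN_MASK)
	 */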
1180 sk->sk_shutdown |= how + 1;
1181
1182out:
1183 release_sock(sk);
1184 return rc ? rc : rc1;
1185}
1186
1187static int smc_setsockopt(struct socket *sock, int level, int optname,
1188 char __user *optval, unsigned int optlen)
1189{
1190 struct sock *sk = sock->sk;
1191 struct smc_sock *smc;
1192
1193 smc = smc_sk(sk);
1194
1195 /* generic setsockopts reaching us here always apply to the
1196 * CLC socket
1197 */
1198 return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1199 optval, optlen);
1200}
1201
1202static int smc_getsockopt(struct socket *sock, int level, int optname,
1203 char __user *optval, int __user *optlen)
1204{
1205 struct smc_sock *smc;
1206
1207 smc = smc_sk(sock->sk);
1208 /* socket options apply to the CLC socket */
1209 return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1210 optval, optlen);
1211}
1212
1213static int smc_ioctl(struct socket *sock, unsigned int cmd,
1214 unsigned long arg)
1215{
1216 struct smc_sock *smc;
1217
1218 smc = smc_sk(sock->sk);
1219 if (smc->use_fallback)
1220 return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1221 else
1222 return sock_no_ioctl(sock, cmd, arg);
1223}
1224
1225static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1226 int offset, size_t size, int flags)
1227{
1228 struct sock *sk = sock->sk;
1229 struct smc_sock *smc;
1230 int rc = -EPIPE;
1231
1232 smc = smc_sk(sk);
1233 lock_sock(sk);
1234 if (sk->sk_state != SMC_ACTIVE)
1235 goto out;
1236 if (smc->use_fallback)
1237 rc = kernel_sendpage(smc->clcsock, page, offset,
1238 size, flags);
1239 else
1240 rc = sock_no_sendpage(sock, page, offset, size, flags);
1241
1242out:
1243 release_sock(sk);
1244 return rc;
1245}
1246
1247static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1248 struct pipe_inode_info *pipe, size_t len,
1249 unsigned int flags)
1250{
1251 struct sock *sk = sock->sk;
1252 struct smc_sock *smc;
1253 int rc = -ENOTCONN;
1254
1255 smc = smc_sk(sk);
1256 lock_sock(sk);
1257 if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
1258 goto out;
1259 if (smc->use_fallback) {
1260 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1261 pipe, len, flags);
1262 } else {
1263 rc = -EOPNOTSUPP;
1264 }
1265out:
1266 release_sock(sk);
1267 return rc;
1268}
1269
1270/* must look like tcp */
1271static const struct proto_ops smc_sock_ops = {
1272 .family = PF_SMC,
1273 .owner = THIS_MODULE,
1274 .release = smc_release,
1275 .bind = smc_bind,
1276 .connect = smc_connect,
1277 .socketpair = sock_no_socketpair,
1278 .accept = smc_accept,
1279 .getname = smc_getname,
1280 .poll = smc_poll,
1281 .ioctl = smc_ioctl,
1282 .listen = smc_listen,
1283 .shutdown = smc_shutdown,
1284 .setsockopt = smc_setsockopt,
1285 .getsockopt = smc_getsockopt,
1286 .sendmsg = smc_sendmsg,
1287 .recvmsg = smc_recvmsg,
1288 .mmap = sock_no_mmap,
1289 .sendpage = smc_sendpage,
1290 .splice_read = smc_splice_read,
1291};
1292
1293static int smc_create(struct net *net, struct socket *sock, int protocol,
1294 int kern)
1295{
1296 struct smc_sock *smc;
1297 struct sock *sk;
1298 int rc;
1299
1300 rc = -ESOCKTNOSUPPORT;
1301 if (sock->type != SOCK_STREAM)
1302 goto out;
1303
1304 rc = -EPROTONOSUPPORT;
1305 if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
1306 goto out;
1307
1308 rc = -ENOBUFS;
1309 sock->ops = &smc_sock_ops;
1310 sk = smc_sock_alloc(net, sock);
1311 if (!sk)
1312 goto out;
1313
1314 /* create internal TCP socket for CLC handshake and fallback */
1315 smc = smc_sk(sk);
1316 smc->use_fallback = false; /* assume rdma capability first */
1317 rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
1318 IPPROTO_TCP, &smc->clcsock);
1319	if (rc) {
1320		sk_common_release(sk);
		goto out;	/* clcsock is NULL here, do not touch it below */
	}
1321 smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1322 smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1323
1324out:
1325 return rc;
1326}
1327
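/* applications opt in to SMC by creating their stream socket in this family,
 * e.g. socket(AF_SMC, SOCK_STREAM, IPPROTO_TCP); protocol 0 (IPPROTO_IP) is
 * accepted as well, and the usual bind/connect/listen/accept calls apply
 */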
1328static const struct net_proto_family smc_sock_family_ops = {
1329 .family = PF_SMC,
1330 .owner = THIS_MODULE,
1331 .create = smc_create,
1332};
1333
1334static int __init smc_init(void)
1335{
1336 int rc;
1337
1338 rc = smc_pnet_init();
1339 if (rc)
1340 return rc;
1341
1342 rc = smc_llc_init();
1343 if (rc) {
1344 pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
1345 goto out_pnet;
1346 }
1347
1348 rc = smc_cdc_init();
1349 if (rc) {
1350 pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
1351 goto out_pnet;
1352 }
1353
1354 rc = proto_register(&smc_proto, 1);
1355 if (rc) {
1356 pr_err("%s: proto_register fails with %d\n", __func__, rc);
1357 goto out_pnet;
1358 }
1359
1360 rc = sock_register(&smc_sock_family_ops);
1361 if (rc) {
1362 pr_err("%s: sock_register fails with %d\n", __func__, rc);
1363 goto out_proto;
1364 }
1365 INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
1366
1367 rc = smc_ib_register_client();
1368 if (rc) {
1369 pr_err("%s: ib_register fails with %d\n", __func__, rc);
1370 goto out_sock;
1371 }
1372
1373 return 0;
1374
1375out_sock:
1376 sock_unregister(PF_SMC);
1377out_proto:
1378 proto_unregister(&smc_proto);
1379out_pnet:
1380 smc_pnet_exit();
1381 return rc;
1382}
1383
1384static void __exit smc_exit(void)
1385{
1386 struct smc_link_group *lgr, *lg;
1387 LIST_HEAD(lgr_freeing_list);
1388
1389 spin_lock_bh(&smc_lgr_list.lock);
1390 if (!list_empty(&smc_lgr_list.list))
1391 list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
1392 spin_unlock_bh(&smc_lgr_list.lock);
1393 list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
1394 list_del_init(&lgr->list);
1395 smc_lgr_free(lgr); /* free link group */
1396 }
1397 smc_ib_unregister_client();
1398 sock_unregister(PF_SMC);
1399 proto_unregister(&smc_proto);
1400 smc_pnet_exit();
1401}
1402
1403module_init(smc_init);
1404module_exit(smc_exit);
1405
1406MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
1407MODULE_DESCRIPTION("smc socket address family");
1408MODULE_LICENSE("GPL");
1409MODULE_ALIAS_NETPROTO(PF_SMC);
diff --git a/net/smc/smc.h b/net/smc/smc.h
new file mode 100644
index 000000000000..ee5fbea24549
--- /dev/null
+++ b/net/smc/smc.h
@@ -0,0 +1,274 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Definitions for the SMC module (socket related)
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
9 */
10#ifndef __SMC_H
11#define __SMC_H
12
13#include <linux/socket.h>
14#include <linux/types.h>
15#include <linux/compiler.h> /* __aligned */
16#include <net/sock.h>
17
18#include "smc_ib.h"
19
20#define SMCPROTO_SMC 0 /* SMC protocol */
21
22#define SMC_MAX_PORTS 2 /* Max # of ports */
23
24extern struct proto smc_proto;
25
26#ifdef ATOMIC64_INIT
27#define KERNEL_HAS_ATOMIC64
28#endif
29
30enum smc_state { /* possible states of an SMC socket */
31 SMC_ACTIVE = 1,
32 SMC_INIT = 2,
33 SMC_CLOSED = 7,
34 SMC_LISTEN = 10,
35 /* normal close */
36 SMC_PEERCLOSEWAIT1 = 20,
37 SMC_PEERCLOSEWAIT2 = 21,
38 SMC_APPFINCLOSEWAIT = 24,
39 SMC_APPCLOSEWAIT1 = 22,
40 SMC_APPCLOSEWAIT2 = 23,
41 SMC_PEERFINCLOSEWAIT = 25,
42 /* abnormal close */
43 SMC_PEERABORTWAIT = 26,
44 SMC_PROCESSABORT = 27,
45};
46
47struct smc_link_group;
48
49struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */
50 u8 type;
51} __aligned(1);
52
53struct smc_cdc_conn_state_flags {
54#if defined(__BIG_ENDIAN_BITFIELD)
55 u8 peer_done_writing : 1; /* Sending done indicator */
56 u8 peer_conn_closed : 1; /* Peer connection closed indicator */
57 u8 peer_conn_abort : 1; /* Abnormal close indicator */
58 u8 reserved : 5;
59#elif defined(__LITTLE_ENDIAN_BITFIELD)
60 u8 reserved : 5;
61 u8 peer_conn_abort : 1;
62 u8 peer_conn_closed : 1;
63 u8 peer_done_writing : 1;
64#endif
65};
66
67struct smc_cdc_producer_flags {
68#if defined(__BIG_ENDIAN_BITFIELD)
69 u8 write_blocked : 1; /* Writing Blocked, no rx buf space */
70 u8 urg_data_pending : 1; /* Urgent Data Pending */
71 u8 urg_data_present : 1; /* Urgent Data Present */
72 u8 cons_curs_upd_req : 1; /* cursor update requested */
73 u8 failover_validation : 1;/* message replay due to failover */
74 u8 reserved : 3;
75#elif defined(__LITTLE_ENDIAN_BITFIELD)
76 u8 reserved : 3;
77 u8 failover_validation : 1;
78 u8 cons_curs_upd_req : 1;
79 u8 urg_data_present : 1;
80 u8 urg_data_pending : 1;
81 u8 write_blocked : 1;
82#endif
83};
84
85/* in host byte order */
86union smc_host_cursor { /* SMC cursor - an offset in an RMBE */
87 struct {
88 u16 reserved;
89 u16 wrap; /* window wrap sequence number */
90 u32 count; /* cursor (= offset) part */
91 };
92#ifdef KERNEL_HAS_ATOMIC64
93 atomic64_t acurs; /* for atomic processing */
94#else
95 u64 acurs; /* for atomic processing */
96#endif
97} __aligned(8);
98
99/* in host byte order, except for flag bitfields in network byte order */
100struct smc_host_cdc_msg { /* Connection Data Control message */
101 struct smc_wr_rx_hdr common; /* .type = 0xFE */
102 u8 len; /* length = 44 */
103 u16 seqno; /* connection seq # */
104 u32 token; /* alert_token */
105 union smc_host_cursor prod; /* producer cursor */
106 union smc_host_cursor cons; /* consumer cursor,
107 * piggy backed "ack"
108 */
109 struct smc_cdc_producer_flags prod_flags; /* conn. tx/rx status */
110 struct smc_cdc_conn_state_flags conn_state_flags; /* peer conn. status*/
111 u8 reserved[18];
112} __aligned(8);
113
114struct smc_connection {
115 struct rb_node alert_node;
116 struct smc_link_group *lgr; /* link group of connection */
117 u32 alert_token_local; /* unique conn. id */
118 u8 peer_conn_idx; /* from tcp handshake */
119 int peer_rmbe_size; /* size of peer rx buffer */
120 atomic_t peer_rmbe_space;/* remaining free bytes in peer
121 * rmbe
122 */
123 int rtoken_idx; /* idx to peer RMB rkey/addr */
124
125 struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */
126 int sndbuf_size; /* sndbuf size <== sock wmem */
127 struct smc_buf_desc *rmb_desc; /* RMBE descriptor */
128 int rmbe_size; /* RMBE size <== sock rmem */
129 int rmbe_size_short;/* compressed notation */
130 int rmbe_update_limit;
131 /* lower limit for consumer
132 * cursor update
133 */
134
135 struct smc_host_cdc_msg local_tx_ctrl; /* host byte order staging
136 * buffer for CDC msg send
137 * .prod cf. TCP snd_nxt
138 * .cons cf. TCP sends ack
139 */
140 union smc_host_cursor tx_curs_prep; /* tx - prepared data
141 * snd_max..wmem_alloc
142 */
143 union smc_host_cursor tx_curs_sent; /* tx - sent data
144 * snd_nxt ?
145 */
146 union smc_host_cursor tx_curs_fin; /* tx - confirmed by peer
147 * snd-wnd-begin ?
148 */
149 atomic_t sndbuf_space; /* remaining space in sndbuf */
150 u16 tx_cdc_seq; /* sequence # for CDC send */
151 spinlock_t send_lock; /* protect wr_sends */
152 struct work_struct tx_work; /* retry of smc_cdc_msg_send */
153
154 struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl.
155 * .prod cf. TCP rcv_nxt
156 * .cons cf. TCP snd_una
157 */
158 union smc_host_cursor rx_curs_confirmed; /* confirmed to peer
159 * source of snd_una ?
160 */
161 atomic_t bytes_to_rcv; /* arrived data,
162 * not yet received
163 */
164#ifndef KERNEL_HAS_ATOMIC64
165 spinlock_t acurs_lock; /* protect cursors */
166#endif
167};
168
169struct smc_sock { /* smc sock container */
170 struct sock sk;
171 struct socket *clcsock; /* internal tcp socket */
172 struct smc_connection conn; /* smc connection */
173 struct sockaddr *addr; /* inet connect address */
174 struct smc_sock *listen_smc; /* listen parent */
175 struct work_struct tcp_listen_work;/* handle tcp socket accepts */
176 struct work_struct smc_listen_work;/* prepare new accept socket */
177 struct list_head accept_q; /* sockets to be accepted */
178 spinlock_t accept_q_lock; /* protects accept_q */
179 struct delayed_work sock_put_work; /* final socket freeing */
180 bool use_fallback; /* fallback to tcp */
181 u8 wait_close_tx_prepared : 1;
182 /* shutdown wr or close
183 * started, waiting for unsent
184 * data to be sent
185 */
186};
187
188static inline struct smc_sock *smc_sk(const struct sock *sk)
189{
190 return (struct smc_sock *)sk;
191}
192
193#define SMC_SYSTEMID_LEN 8
194
195extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
196
197/* convert a u32 value into network byte order, store it into a 3 byte field */
198static inline void hton24(u8 *net, u32 host)
199{
200 __be32 t;
201
202 t = cpu_to_be32(host);
203 memcpy(net, ((u8 *)&t) + 1, 3);
204}
205
206/* convert a received 3 byte field into host byte order */
207static inline u32 ntoh24(u8 *net)
208{
209 __be32 t = 0;
210
211 memcpy(((u8 *)&t) + 1, net, 3);
212 return be32_to_cpu(t);
213}
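/* e.g. hton24() stores host value 0x00abcdef as the three wire bytes
 * 0xab 0xcd 0xef; ntoh24() is the inverse mapping
 */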
214
215#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */
216
217#define SMC_RMBE_SIZES 16 /* number of distinct sizes for an RMBE */
218/* theoretically, the RFC limits the largest size to 512K,
219 * i.e. compressed value 5 and thus only 6 sizes (0..5), even though
220 * struct smc_clc_msg_accept_confirm.rmbe_size is a 4 bit value (0..15)
221 */
222
223/* convert the RMB size into the compressed notation - minimum 16K.
224 * In contrast to plain ilog2, this rounds towards the next power of 2,
225 * so the socket application gets at least its desired sndbuf / rcvbuf size.
226 */
227static inline u8 smc_compress_bufsize(int size)
228{
229 u8 compressed;
230
231 if (size <= SMC_BUF_MIN_SIZE)
232 return 0;
233
234 size = (size - 1) >> 14;
235 compressed = ilog2(size) + 1;
236 if (compressed >= SMC_RMBE_SIZES)
237 compressed = SMC_RMBE_SIZES - 1;
238 return compressed;
239}
240
241/* convert the RMB size from compressed notation into integer */
242static inline int smc_uncompress_bufsize(u8 compressed)
243{
244 u32 size;
245
246 size = 0x00000001 << (((int)compressed) + 14);
247 return (int)size;
248}
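/* e.g. a requested buffer of 40000 bytes: (40000 - 1) >> 14 = 2 and
 * ilog2(2) + 1 = 2, so it compresses to 2; uncompressing 2 yields
 * 1 << (2 + 14) = 65536 bytes, the next power of two above the request
 */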
249
250#ifdef CONFIG_XFRM
251static inline bool using_ipsec(struct smc_sock *smc)
252{
253	return smc->clcsock->sk->sk_policy[0] ||
254	       smc->clcsock->sk->sk_policy[1];
255}
256#else
257static inline bool using_ipsec(struct smc_sock *smc)
258{
259	return false;
260}
261#endif
262
263struct smc_clc_msg_local;
264
265int smc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet,
266 u8 *prefix_len);
267void smc_conn_free(struct smc_connection *conn);
268int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
269 struct smc_ib_device *smcibdev, u8 ibport,
270 struct smc_clc_msg_local *lcl, int srv_first_contact);
271struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock);
272void smc_close_non_accepted(struct sock *sk);
273
274#endif /* __SMC_H */
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
new file mode 100644
index 000000000000..5a339493872e
--- /dev/null
+++ b/net/smc/smc_cdc.c
@@ -0,0 +1,304 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Connection Data Control (CDC)
5 * handles flow control
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
10 */
11
12#include <linux/spinlock.h>
13
14#include "smc.h"
15#include "smc_wr.h"
16#include "smc_cdc.h"
17#include "smc_tx.h"
18#include "smc_rx.h"
19#include "smc_close.h"
20
21/********************************** send *************************************/
22
23struct smc_cdc_tx_pend {
24 struct smc_connection *conn; /* socket connection */
25 union smc_host_cursor cursor; /* tx sndbuf cursor sent */
26 union smc_host_cursor p_cursor; /* rx RMBE cursor produced */
27 u16 ctrl_seq; /* conn. tx sequence # */
28};
29
30/* handler for send/transmission completion of a CDC msg */
31static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
32 struct smc_link *link,
33 enum ib_wc_status wc_status)
34{
35 struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd;
36 struct smc_sock *smc;
37 int diff;
38
39 if (!cdcpend->conn)
40 /* already dismissed */
41 return;
42
43 smc = container_of(cdcpend->conn, struct smc_sock, conn);
44 bh_lock_sock(&smc->sk);
45 if (!wc_status) {
46 diff = smc_curs_diff(cdcpend->conn->sndbuf_size,
47 &cdcpend->conn->tx_curs_fin,
48 &cdcpend->cursor);
49 /* sndbuf_space is decreased in smc_sendmsg */
50 smp_mb__before_atomic();
51 atomic_add(diff, &cdcpend->conn->sndbuf_space);
52 /* guarantee 0 <= sndbuf_space <= sndbuf_size */
53 smp_mb__after_atomic();
54 smc_curs_write(&cdcpend->conn->tx_curs_fin,
55 smc_curs_read(&cdcpend->cursor, cdcpend->conn),
56 cdcpend->conn);
57 }
58 smc_tx_sndbuf_nonfull(smc);
59 if (smc->sk.sk_state != SMC_ACTIVE)
60 /* wake up smc_close_wait_tx_pends() */
61 smc->sk.sk_state_change(&smc->sk);
62 bh_unlock_sock(&smc->sk);
63}
64
65int smc_cdc_get_free_slot(struct smc_link *link,
66 struct smc_wr_buf **wr_buf,
67 struct smc_cdc_tx_pend **pend)
68{
69 return smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf,
70 (struct smc_wr_tx_pend_priv **)pend);
71}
72
73static inline void smc_cdc_add_pending_send(struct smc_connection *conn,
74 struct smc_cdc_tx_pend *pend)
75{
76 BUILD_BUG_ON_MSG(
77 sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE,
78 "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)");
79 BUILD_BUG_ON_MSG(
80 offsetof(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE,
81 "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
82 BUILD_BUG_ON_MSG(
83 sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
84 "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_cdc_tx_pend)");
85 pend->conn = conn;
86 pend->cursor = conn->tx_curs_sent;
87 pend->p_cursor = conn->local_tx_ctrl.prod;
88 pend->ctrl_seq = conn->tx_cdc_seq;
89}
90
91int smc_cdc_msg_send(struct smc_connection *conn,
92 struct smc_wr_buf *wr_buf,
93 struct smc_cdc_tx_pend *pend)
94{
95 struct smc_link *link;
96 int rc;
97
98 link = &conn->lgr->lnk[SMC_SINGLE_LINK];
99
100 smc_cdc_add_pending_send(conn, pend);
101
102 conn->tx_cdc_seq++;
103 conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
104 smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf,
105 &conn->local_tx_ctrl, conn);
106 rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
107 if (!rc)
108 smc_curs_write(&conn->rx_curs_confirmed,
109 smc_curs_read(&conn->local_tx_ctrl.cons, conn),
110 conn);
111
112 return rc;
113}
114
115int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
116{
117 struct smc_cdc_tx_pend *pend;
118 struct smc_wr_buf *wr_buf;
119 int rc;
120
121 rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
122 &pend);
123 if (rc)
124 return rc;
125
126 return smc_cdc_msg_send(conn, wr_buf, pend);
127}
128
129static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend,
130 unsigned long data)
131{
132 struct smc_connection *conn = (struct smc_connection *)data;
133 struct smc_cdc_tx_pend *cdc_pend =
134 (struct smc_cdc_tx_pend *)tx_pend;
135
136 return cdc_pend->conn == conn;
137}
138
139static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend)
140{
141 struct smc_cdc_tx_pend *cdc_pend =
142 (struct smc_cdc_tx_pend *)tx_pend;
143
144 cdc_pend->conn = NULL;
145}
146
147void smc_cdc_tx_dismiss_slots(struct smc_connection *conn)
148{
149 struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
150
151 smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE,
152 smc_cdc_tx_filter, smc_cdc_tx_dismisser,
153 (unsigned long)conn);
154}
155
156bool smc_cdc_tx_has_pending(struct smc_connection *conn)
157{
158 struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
159
160 return smc_wr_tx_has_pending(link, SMC_CDC_MSG_TYPE,
161 smc_cdc_tx_filter, (unsigned long)conn);
162}
163
164/********************************* receive ***********************************/
165
166static inline bool smc_cdc_before(u16 seq1, u16 seq2)
167{
168 return (s16)(seq1 - seq2) < 0;
169}
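/* e.g. seqno 0xfffe counts as "before" seqno 0x0001: the u16 difference
 * 0xfffd is negative when interpreted as s16, so wrap-around is handled
 */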
170
171static void smc_cdc_msg_recv_action(struct smc_sock *smc,
172 struct smc_link *link,
173 struct smc_cdc_msg *cdc)
174{
175 union smc_host_cursor cons_old, prod_old;
176 struct smc_connection *conn = &smc->conn;
177 int diff_cons, diff_prod;
178
179 if (!cdc->prod_flags.failover_validation) {
180 if (smc_cdc_before(ntohs(cdc->seqno),
181 conn->local_rx_ctrl.seqno))
182 /* received seqno is old */
183 return;
184 }
185 smc_curs_write(&prod_old,
186 smc_curs_read(&conn->local_rx_ctrl.prod, conn),
187 conn);
188 smc_curs_write(&cons_old,
189 smc_curs_read(&conn->local_rx_ctrl.cons, conn),
190 conn);
191 smc_cdc_msg_to_host(&conn->local_rx_ctrl, cdc, conn);
192
193 diff_cons = smc_curs_diff(conn->peer_rmbe_size, &cons_old,
194 &conn->local_rx_ctrl.cons);
195 if (diff_cons) {
196 /* peer_rmbe_space is decreased during data transfer with RDMA
197 * write
198 */
199 smp_mb__before_atomic();
200 atomic_add(diff_cons, &conn->peer_rmbe_space);
201 /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
202 smp_mb__after_atomic();
203 }
204
205 diff_prod = smc_curs_diff(conn->rmbe_size, &prod_old,
206 &conn->local_rx_ctrl.prod);
207 if (diff_prod) {
208 /* bytes_to_rcv is decreased in smc_recvmsg */
209 smp_mb__before_atomic();
210 atomic_add(diff_prod, &conn->bytes_to_rcv);
211 /* guarantee 0 <= bytes_to_rcv <= rmbe_size */
212 smp_mb__after_atomic();
213 smc->sk.sk_data_ready(&smc->sk);
214 }
215
216 if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
217 smc->sk.sk_err = ECONNRESET;
218 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
219 }
220 if (smc_cdc_rxed_any_close_or_senddone(conn))
221 smc_close_passive_received(smc);
222
223 /* piggy backed tx info */
224 /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
225 if (diff_cons && smc_tx_prepared_sends(conn)) {
226 smc_tx_sndbuf_nonempty(conn);
227 /* trigger socket release if connection closed */
228 smc_close_wake_tx_prepared(smc);
229 }
230
231 /* subsequent patch: trigger socket release if connection closed */
232
233 /* socket connected but not accepted */
234 if (!smc->sk.sk_socket)
235 return;
236
237 /* data available */
238 if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
239 (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req))
240 smc_tx_consumer_update(conn);
241}
242
243/* called under tasklet context */
244static inline void smc_cdc_msg_recv(struct smc_cdc_msg *cdc,
245 struct smc_link *link, u64 wr_id)
246{
247 struct smc_link_group *lgr = container_of(link, struct smc_link_group,
248 lnk[SMC_SINGLE_LINK]);
249 struct smc_connection *connection;
250 struct smc_sock *smc;
251
252 /* lookup connection */
253 read_lock_bh(&lgr->conns_lock);
254 connection = smc_lgr_find_conn(ntohl(cdc->token), lgr);
255 if (!connection) {
256 read_unlock_bh(&lgr->conns_lock);
257 return;
258 }
259 smc = container_of(connection, struct smc_sock, conn);
260 sock_hold(&smc->sk);
261 read_unlock_bh(&lgr->conns_lock);
262 bh_lock_sock(&smc->sk);
263 smc_cdc_msg_recv_action(smc, link, cdc);
264 bh_unlock_sock(&smc->sk);
265 sock_put(&smc->sk); /* no free sk in softirq-context */
266}
267
268/***************************** init, exit, misc ******************************/
269
270static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
271{
272 struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
273 struct smc_cdc_msg *cdc = buf;
274
275 if (wc->byte_len < offsetof(struct smc_cdc_msg, reserved))
276 return; /* short message */
277 if (cdc->len != sizeof(*cdc))
278 return; /* invalid message */
279 smc_cdc_msg_recv(cdc, link, wc->wr_id);
280}
281
282static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = {
283 {
284 .handler = smc_cdc_rx_handler,
285 .type = SMC_CDC_MSG_TYPE
286 },
287 {
288 .handler = NULL,
289 }
290};
291
292int __init smc_cdc_init(void)
293{
294 struct smc_wr_rx_handler *handler;
295 int rc = 0;
296
297 for (handler = smc_cdc_rx_handlers; handler->handler; handler++) {
298 INIT_HLIST_NODE(&handler->list);
299 rc = smc_wr_rx_register_handler(handler);
300 if (rc)
301 break;
302 }
303 return rc;
304}
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
new file mode 100644
index 000000000000..8e1d76f26007
--- /dev/null
+++ b/net/smc/smc_cdc.h
@@ -0,0 +1,218 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Connection Data Control (CDC)
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
9 */
10
11#ifndef SMC_CDC_H
12#define SMC_CDC_H
13
14#include <linux/kernel.h> /* max_t */
15#include <linux/atomic.h>
16#include <linux/in.h>
17#include <linux/compiler.h>
18
19#include "smc.h"
20#include "smc_core.h"
21#include "smc_wr.h"
22
23#define SMC_CDC_MSG_TYPE 0xFE
24
25/* in network byte order */
26union smc_cdc_cursor { /* SMC cursor */
27 struct {
28 __be16 reserved;
29 __be16 wrap;
30 __be32 count;
31 };
32#ifdef KERNEL_HAS_ATOMIC64
33 atomic64_t acurs; /* for atomic processing */
34#else
35 u64 acurs; /* for atomic processing */
36#endif
37} __aligned(8);
38
39/* in network byte order */
40struct smc_cdc_msg {
41 struct smc_wr_rx_hdr common; /* .type = 0xFE */
42 u8 len; /* 44 */
43 __be16 seqno;
44 __be32 token;
45 union smc_cdc_cursor prod;
46 union smc_cdc_cursor cons; /* piggy backed "ack" */
47 struct smc_cdc_producer_flags prod_flags;
48 struct smc_cdc_conn_state_flags conn_state_flags;
49 u8 reserved[18];
50} __aligned(8);
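/* the members above add up to 1 + 1 + 2 + 4 + 8 + 8 + 1 + 1 + 18 = 44 bytes,
 * the fixed CDC message length announced in the len field
 */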
51
52static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn)
53{
54 return conn->local_rx_ctrl.conn_state_flags.peer_conn_abort ||
55 conn->local_rx_ctrl.conn_state_flags.peer_conn_closed;
56}
57
58static inline bool smc_cdc_rxed_any_close_or_senddone(
59 struct smc_connection *conn)
60{
61 return smc_cdc_rxed_any_close(conn) ||
62 conn->local_rx_ctrl.conn_state_flags.peer_done_writing;
63}
64
65static inline void smc_curs_add(int size, union smc_host_cursor *curs,
66 int value)
67{
68 curs->count += value;
69 if (curs->count >= size) {
70 curs->wrap++;
71 curs->count -= size;
72 }
73}
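/* e.g. with size 16384, advancing a cursor at count 16000 by 1000 bytes
 * leaves count 616 and increments wrap
 */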
74
75/* SMC cursors are 8 bytes long and require atomic reading and writing */
76static inline u64 smc_curs_read(union smc_host_cursor *curs,
77 struct smc_connection *conn)
78{
79#ifndef KERNEL_HAS_ATOMIC64
80 unsigned long flags;
81 u64 ret;
82
83 spin_lock_irqsave(&conn->acurs_lock, flags);
84 ret = curs->acurs;
85 spin_unlock_irqrestore(&conn->acurs_lock, flags);
86 return ret;
87#else
88 return atomic64_read(&curs->acurs);
89#endif
90}
91
92static inline u64 smc_curs_read_net(union smc_cdc_cursor *curs,
93 struct smc_connection *conn)
94{
95#ifndef KERNEL_HAS_ATOMIC64
96 unsigned long flags;
97 u64 ret;
98
99 spin_lock_irqsave(&conn->acurs_lock, flags);
100 ret = curs->acurs;
101 spin_unlock_irqrestore(&conn->acurs_lock, flags);
102 return ret;
103#else
104 return atomic64_read(&curs->acurs);
105#endif
106}
107
108static inline void smc_curs_write(union smc_host_cursor *curs, u64 val,
109 struct smc_connection *conn)
110{
111#ifndef KERNEL_HAS_ATOMIC64
112 unsigned long flags;
113
114 spin_lock_irqsave(&conn->acurs_lock, flags);
115 curs->acurs = val;
116 spin_unlock_irqrestore(&conn->acurs_lock, flags);
117#else
118 atomic64_set(&curs->acurs, val);
119#endif
120}
121
122static inline void smc_curs_write_net(union smc_cdc_cursor *curs, u64 val,
123 struct smc_connection *conn)
124{
125#ifndef KERNEL_HAS_ATOMIC64
126 unsigned long flags;
127
128 spin_lock_irqsave(&conn->acurs_lock, flags);
129 curs->acurs = val;
130 spin_unlock_irqrestore(&conn->acurs_lock, flags);
131#else
132 atomic64_set(&curs->acurs, val);
133#endif
134}
135
136/* calculate cursor difference between old and new, where old <= new */
137static inline int smc_curs_diff(unsigned int size,
138 union smc_host_cursor *old,
139 union smc_host_cursor *new)
140{
141 if (old->wrap != new->wrap)
142 return max_t(int, 0,
143 ((size - old->count) + new->count));
144
145 return max_t(int, 0, (new->count - old->count));
146}
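/* e.g. size 65536, old = {wrap 3, count 60000}, new = {wrap 4, count 1000}:
 * the difference is (65536 - 60000) + 1000 = 6536 bytes
 */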
147
148static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer,
149 union smc_host_cursor *local,
150 struct smc_connection *conn)
151{
152 union smc_host_cursor temp;
153
154 smc_curs_write(&temp, smc_curs_read(local, conn), conn);
155 peer->count = htonl(temp.count);
156 peer->wrap = htons(temp.wrap);
157 /* peer->reserved = htons(0); must be ensured by caller */
158}
159
160static inline void smc_host_msg_to_cdc(struct smc_cdc_msg *peer,
161 struct smc_host_cdc_msg *local,
162 struct smc_connection *conn)
163{
164 peer->common.type = local->common.type;
165 peer->len = local->len;
166 peer->seqno = htons(local->seqno);
167 peer->token = htonl(local->token);
168 smc_host_cursor_to_cdc(&peer->prod, &local->prod, conn);
169 smc_host_cursor_to_cdc(&peer->cons, &local->cons, conn);
170 peer->prod_flags = local->prod_flags;
171 peer->conn_state_flags = local->conn_state_flags;
172}
173
174static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local,
175 union smc_cdc_cursor *peer,
176 struct smc_connection *conn)
177{
178 union smc_host_cursor temp, old;
179 union smc_cdc_cursor net;
180
181 smc_curs_write(&old, smc_curs_read(local, conn), conn);
182 smc_curs_write_net(&net, smc_curs_read_net(peer, conn), conn);
183 temp.count = ntohl(net.count);
184 temp.wrap = ntohs(net.wrap);
185 if ((old.wrap > temp.wrap) && temp.wrap)
186 return;
187 if ((old.wrap == temp.wrap) &&
188 (old.count > temp.count))
189 return;
190 smc_curs_write(local, smc_curs_read(&temp, conn), conn);
191}
192
193static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
194 struct smc_cdc_msg *peer,
195 struct smc_connection *conn)
196{
197 local->common.type = peer->common.type;
198 local->len = peer->len;
199 local->seqno = ntohs(peer->seqno);
200 local->token = ntohl(peer->token);
201 smc_cdc_cursor_to_host(&local->prod, &peer->prod, conn);
202 smc_cdc_cursor_to_host(&local->cons, &peer->cons, conn);
203 local->prod_flags = peer->prod_flags;
204 local->conn_state_flags = peer->conn_state_flags;
205}
206
207struct smc_cdc_tx_pend;
208
209int smc_cdc_get_free_slot(struct smc_link *link, struct smc_wr_buf **wr_buf,
210 struct smc_cdc_tx_pend **pend);
211void smc_cdc_tx_dismiss_slots(struct smc_connection *conn);
212int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
213 struct smc_cdc_tx_pend *pend);
214int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn);
215bool smc_cdc_tx_has_pending(struct smc_connection *conn);
216int smc_cdc_init(void) __init;
217
218#endif /* SMC_CDC_H */
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
new file mode 100644
index 000000000000..e41f594a1e1d
--- /dev/null
+++ b/net/smc/smc_clc.c
@@ -0,0 +1,282 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * CLC (connection layer control) handshake over initial TCP socket to
5 * prepare for RDMA traffic
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
10 */
11
12#include <linux/in.h>
13#include <linux/if_ether.h>
14#include <linux/sched/signal.h>
15
16#include <net/sock.h>
17#include <net/tcp.h>
18
19#include "smc.h"
20#include "smc_core.h"
21#include "smc_clc.h"
22#include "smc_ib.h"
23
24/* Wait for data on the tcp-socket, analyze received data
25 * Returns:
26 * 0 on success, if the message received was not a decline.
27 * SMC_CLC_DECL_REPLY if decline received for fallback w/o another decl send.
28 * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise.
29 */
30int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
31 u8 expected_type)
32{
33 struct sock *clc_sk = smc->clcsock->sk;
34 struct smc_clc_msg_hdr *clcm = buf;
35 struct msghdr msg = {NULL, 0};
36 int reason_code = 0;
37 struct kvec vec;
38 int len, datlen;
39 int krflags;
40
41 /* peek the first few bytes to determine length of data to receive
42 * so we don't consume any subsequent CLC message or payload data
43 * in the TCP byte stream
44 */
45 vec.iov_base = buf;
46 vec.iov_len = buflen;
47 krflags = MSG_PEEK | MSG_WAITALL;
48 smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
49 len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1,
50 sizeof(struct smc_clc_msg_hdr), krflags);
51 if (signal_pending(current)) {
52 reason_code = -EINTR;
53 clc_sk->sk_err = EINTR;
54 smc->sk.sk_err = EINTR;
55 goto out;
56 }
57 if (clc_sk->sk_err) {
58 reason_code = -clc_sk->sk_err;
59 smc->sk.sk_err = clc_sk->sk_err;
60 goto out;
61 }
62 if (!len) { /* peer has performed orderly shutdown */
63 smc->sk.sk_err = ECONNRESET;
64 reason_code = -ECONNRESET;
65 goto out;
66 }
67 if (len < 0) {
68 smc->sk.sk_err = -len;
69 reason_code = len;
70 goto out;
71 }
72 datlen = ntohs(clcm->length);
73 if ((len < sizeof(struct smc_clc_msg_hdr)) ||
74 (datlen < sizeof(struct smc_clc_msg_decline)) ||
75 (datlen > sizeof(struct smc_clc_msg_accept_confirm)) ||
76 memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) ||
77 ((clcm->type != SMC_CLC_DECLINE) &&
78 (clcm->type != expected_type))) {
79 smc->sk.sk_err = EPROTO;
80 reason_code = -EPROTO;
81 goto out;
82 }
83
84 /* receive the complete CLC message */
85 vec.iov_base = buf;
86 vec.iov_len = buflen;
87 memset(&msg, 0, sizeof(struct msghdr));
88 krflags = MSG_WAITALL;
89 smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
90 len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, datlen, krflags);
91 if (len < datlen) {
92 smc->sk.sk_err = EPROTO;
93 reason_code = -EPROTO;
94 goto out;
95 }
96 if (clcm->type == SMC_CLC_DECLINE) {
97 reason_code = SMC_CLC_DECL_REPLY;
98 if (ntohl(((struct smc_clc_msg_decline *)buf)->peer_diagnosis)
99 == SMC_CLC_DECL_SYNCERR)
100 smc->conn.lgr->sync_err = true;
101 }
102
103out:
104 return reason_code;
105}
106
107/* send CLC DECLINE message across internal TCP socket */
108int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info,
109 u8 out_of_sync)
110{
111 struct smc_clc_msg_decline dclc;
112 struct msghdr msg;
113 struct kvec vec;
114 int len;
115
116 memset(&dclc, 0, sizeof(dclc));
117 memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
118 dclc.hdr.type = SMC_CLC_DECLINE;
119 dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline));
120 dclc.hdr.version = SMC_CLC_V1;
121 dclc.hdr.flag = out_of_sync ? 1 : 0;
122 memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid));
123 dclc.peer_diagnosis = htonl(peer_diag_info);
124 memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
125
126 memset(&msg, 0, sizeof(msg));
127 vec.iov_base = &dclc;
128 vec.iov_len = sizeof(struct smc_clc_msg_decline);
129 len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
130 sizeof(struct smc_clc_msg_decline));
131 if (len < sizeof(struct smc_clc_msg_decline))
132 smc->sk.sk_err = EPROTO;
133 if (len < 0)
134 smc->sk.sk_err = -len;
135 return len;
136}
137
138/* send CLC PROPOSAL message across internal TCP socket */
139int smc_clc_send_proposal(struct smc_sock *smc,
140 struct smc_ib_device *smcibdev,
141 u8 ibport)
142{
143 struct smc_clc_msg_proposal pclc;
144 int reason_code = 0;
145 struct msghdr msg;
146 struct kvec vec;
147 int len, rc;
148
149 /* send SMC Proposal CLC message */
150 memset(&pclc, 0, sizeof(pclc));
151 memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
152 pclc.hdr.type = SMC_CLC_PROPOSAL;
153 pclc.hdr.length = htons(sizeof(pclc));
154 pclc.hdr.version = SMC_CLC_V1; /* SMC version */
155 memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
156 memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE);
157 memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN);
158
159 /* determine subnet and mask from internal TCP socket */
160 rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc.outgoing_subnet,
161 &pclc.prefix_len);
162 if (rc)
163 return SMC_CLC_DECL_CNFERR; /* configuration error */
164 memcpy(pclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
165 memset(&msg, 0, sizeof(msg));
166 vec.iov_base = &pclc;
167 vec.iov_len = sizeof(pclc);
168 /* due to the few bytes needed for clc-handshake this cannot block */
169 len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(pclc));
170 if (len < sizeof(pclc)) {
171 if (len >= 0) {
172 reason_code = -ENETUNREACH;
173 smc->sk.sk_err = -reason_code;
174 } else {
175 smc->sk.sk_err = smc->clcsock->sk->sk_err;
176 reason_code = -smc->sk.sk_err;
177 }
178 }
179
180 return reason_code;
181}
182
183/* send CLC CONFIRM message across internal TCP socket */
184int smc_clc_send_confirm(struct smc_sock *smc)
185{
186 struct smc_connection *conn = &smc->conn;
187 struct smc_clc_msg_accept_confirm cclc;
188 struct smc_link *link;
189 int reason_code = 0;
190 struct msghdr msg;
191 struct kvec vec;
192 int len;
193
194 link = &conn->lgr->lnk[SMC_SINGLE_LINK];
195 /* send SMC Confirm CLC msg */
196 memset(&cclc, 0, sizeof(cclc));
197 memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
198 cclc.hdr.type = SMC_CLC_CONFIRM;
199 cclc.hdr.length = htons(sizeof(cclc));
200 cclc.hdr.version = SMC_CLC_V1; /* SMC version */
201 memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
202 memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
203 SMC_GID_SIZE);
204 memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
205 hton24(cclc.qpn, link->roce_qp->qp_num);
206 cclc.rmb_rkey =
207 htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
208 cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */
209 cclc.rmbe_alert_token = htonl(conn->alert_token_local);
210 cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
211 cclc.rmbe_size = conn->rmbe_size_short;
212 cclc.rmb_dma_addr =
213 cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]);
214 hton24(cclc.psn, link->psn_initial);
215
216 memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
217
218 memset(&msg, 0, sizeof(msg));
219 vec.iov_base = &cclc;
220 vec.iov_len = sizeof(cclc);
221 len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(cclc));
222 if (len < sizeof(cclc)) {
223 if (len >= 0) {
224 reason_code = -ENETUNREACH;
225 smc->sk.sk_err = -reason_code;
226 } else {
227 smc->sk.sk_err = smc->clcsock->sk->sk_err;
228 reason_code = -smc->sk.sk_err;
229 }
230 }
231 return reason_code;
232}
233
234/* send CLC ACCEPT message across internal TCP socket */
235int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
236{
237 struct smc_connection *conn = &new_smc->conn;
238 struct smc_clc_msg_accept_confirm aclc;
239 struct smc_link *link;
240 struct msghdr msg;
241 struct kvec vec;
242 int rc = 0;
243 int len;
244
245 link = &conn->lgr->lnk[SMC_SINGLE_LINK];
246 memset(&aclc, 0, sizeof(aclc));
247 memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
248 aclc.hdr.type = SMC_CLC_ACCEPT;
249 aclc.hdr.length = htons(sizeof(aclc));
250 aclc.hdr.version = SMC_CLC_V1; /* SMC version */
251 if (srv_first_contact)
252 aclc.hdr.flag = 1;
253 memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
254 memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
255 SMC_GID_SIZE);
256 memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
257 hton24(aclc.qpn, link->roce_qp->qp_num);
258 aclc.rmb_rkey =
259 htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
260 aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */
261 aclc.rmbe_alert_token = htonl(conn->alert_token_local);
262 aclc.qp_mtu = link->path_mtu;
263	aclc.rmbe_size = conn->rmbe_size_short;
264 aclc.rmb_dma_addr =
265 cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]);
266 hton24(aclc.psn, link->psn_initial);
267 memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
268
269 memset(&msg, 0, sizeof(msg));
270 vec.iov_base = &aclc;
271 vec.iov_len = sizeof(aclc);
272 len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, sizeof(aclc));
273 if (len < sizeof(aclc)) {
274 if (len >= 0)
275 new_smc->sk.sk_err = EPROTO;
276 else
277 new_smc->sk.sk_err = new_smc->clcsock->sk->sk_err;
278 rc = sock_error(&new_smc->sk);
279 }
280
281 return rc;
282}
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
new file mode 100644
index 000000000000..13db8ce177c9
--- /dev/null
+++ b/net/smc/smc_clc.h
@@ -0,0 +1,116 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * CLC (connection layer control) handshake over initial TCP socket to
5 * prepare for RDMA traffic
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
10 */
11
12#ifndef _SMC_CLC_H
13#define _SMC_CLC_H
14
15#include <rdma/ib_verbs.h>
16
17#include "smc.h"
18
19#define SMC_CLC_PROPOSAL 0x01
20#define SMC_CLC_ACCEPT 0x02
21#define SMC_CLC_CONFIRM 0x03
22#define SMC_CLC_DECLINE 0x04
23
24/* eye catcher "SMCR" EBCDIC for CLC messages */
25static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
26
27#define SMC_CLC_V1 0x1 /* SMC version */
28#define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */
29#define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */
30#define SMC_CLC_DECL_TIMEOUT 0x02000000 /* timeout */
31#define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */
32#define SMC_CLC_DECL_IPSEC 0x03030000 /* IPsec usage */
33#define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */
34#define SMC_CLC_DECL_REPLY 0x06000000 /* reply to a received decline */
35#define SMC_CLC_DECL_INTERR 0x99990000 /* internal error */
36#define SMC_CLC_DECL_TCL 0x02040000 /* timeout w4 QP confirm */
37#define SMC_CLC_DECL_SEND 0x07000000 /* sending problem */
38
39struct smc_clc_msg_hdr { /* header1 of clc messages */
40 u8 eyecatcher[4]; /* eye catcher */
41 u8 type; /* proposal / accept / confirm / decline */
42 __be16 length;
43#if defined(__BIG_ENDIAN_BITFIELD)
44 u8 version : 4,
45 flag : 1,
46 rsvd : 3;
47#elif defined(__LITTLE_ENDIAN_BITFIELD)
48 u8 rsvd : 3,
49 flag : 1,
50 version : 4;
51#endif
52} __packed; /* format defined in RFC7609 */
53
54struct smc_clc_msg_trail { /* trailer of clc messages */
55 u8 eyecatcher[4];
56};
57
58struct smc_clc_msg_local { /* header2 of clc messages */
59 u8 id_for_peer[SMC_SYSTEMID_LEN]; /* unique system id */
60 u8 gid[16]; /* gid of ib_device port */
61 u8 mac[6]; /* mac of ib_device port */
62};
63
64struct smc_clc_msg_proposal { /* clc proposal message */
65 struct smc_clc_msg_hdr hdr;
66 struct smc_clc_msg_local lcl;
67 __be16 iparea_offset; /* offset to IP address information area */
68 __be32 outgoing_subnet; /* subnet mask */
69 u8 prefix_len; /* number of significant bits in mask */
70 u8 reserved[2];
71 u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */
72 struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
73} __aligned(4);
74
75struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
76 struct smc_clc_msg_hdr hdr;
77 struct smc_clc_msg_local lcl;
78 u8 qpn[3]; /* QP number */
79 __be32 rmb_rkey; /* RMB rkey */
80 u8 conn_idx; /* Connection index, which RMBE in RMB */
81 __be32 rmbe_alert_token;/* unique connection id */
82#if defined(__BIG_ENDIAN_BITFIELD)
83 u8 rmbe_size : 4, /* RMBE buf size (compressed notation) */
84 qp_mtu : 4; /* QP mtu */
85#elif defined(__LITTLE_ENDIAN_BITFIELD)
86 u8 qp_mtu : 4,
87 rmbe_size : 4;
88#endif
89 u8 reserved;
90 __be64 rmb_dma_addr; /* RMB virtual address */
91 u8 reserved2;
92 u8 psn[3]; /* initial packet sequence number */
93 struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
94} __packed; /* format defined in RFC7609 */
95
96struct smc_clc_msg_decline { /* clc decline message */
97 struct smc_clc_msg_hdr hdr;
98 u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */
99 __be32 peer_diagnosis; /* diagnosis information */
100 u8 reserved2[4];
101 struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
102} __aligned(4);
103
104struct smc_sock;
105struct smc_ib_device;
106
107int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
108 u8 expected_type);
109int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info,
110 u8 out_of_sync);
111int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev,
112 u8 ibport);
113int smc_clc_send_confirm(struct smc_sock *smc);
114int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact);
115
116#endif
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
new file mode 100644
index 000000000000..67a71d170bed
--- /dev/null
+++ b/net/smc/smc_close.c
@@ -0,0 +1,444 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Socket Closing - normal and abnormal
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
9 */
10
11#include <linux/workqueue.h>
12#include <linux/sched/signal.h>
13
14#include <net/sock.h>
15
16#include "smc.h"
17#include "smc_tx.h"
18#include "smc_cdc.h"
19#include "smc_close.h"
20
21#define SMC_CLOSE_WAIT_TX_PENDS_TIME (5 * HZ)
22
23static void smc_close_cleanup_listen(struct sock *parent)
24{
25 struct sock *sk;
26
27 /* Close non-accepted connections */
28 while ((sk = smc_accept_dequeue(parent, NULL)))
29 smc_close_non_accepted(sk);
30}
31
32static void smc_close_wait_tx_pends(struct smc_sock *smc)
33{
34 DEFINE_WAIT_FUNC(wait, woken_wake_function);
35 struct sock *sk = &smc->sk;
36 signed long timeout;
37
38 timeout = SMC_CLOSE_WAIT_TX_PENDS_TIME;
39 add_wait_queue(sk_sleep(sk), &wait);
40 while (!signal_pending(current) && timeout) {
41 int rc;
42
43 rc = sk_wait_event(sk, &timeout,
44 !smc_cdc_tx_has_pending(&smc->conn),
45 &wait);
46 if (rc)
47 break;
48 }
49 remove_wait_queue(sk_sleep(sk), &wait);
50}
51
52/* wait for sndbuf data being transmitted */
53static void smc_close_stream_wait(struct smc_sock *smc, long timeout)
54{
55 DEFINE_WAIT_FUNC(wait, woken_wake_function);
56 struct sock *sk = &smc->sk;
57
58 if (!timeout)
59 return;
60
61 if (!smc_tx_prepared_sends(&smc->conn))
62 return;
63
64 smc->wait_close_tx_prepared = 1;
65 add_wait_queue(sk_sleep(sk), &wait);
66 while (!signal_pending(current) && timeout) {
67 int rc;
68
69 rc = sk_wait_event(sk, &timeout,
70 !smc_tx_prepared_sends(&smc->conn) ||
71 (sk->sk_err == ECONNABORTED) ||
72 (sk->sk_err == ECONNRESET),
73 &wait);
74 if (rc)
75 break;
76 }
77 remove_wait_queue(sk_sleep(sk), &wait);
78 smc->wait_close_tx_prepared = 0;
79}
80
81void smc_close_wake_tx_prepared(struct smc_sock *smc)
82{
83 if (smc->wait_close_tx_prepared)
84 /* wake up socket closing */
85 smc->sk.sk_state_change(&smc->sk);
86}
87
88static int smc_close_wr(struct smc_connection *conn)
89{
90 conn->local_tx_ctrl.conn_state_flags.peer_done_writing = 1;
91
92 return smc_cdc_get_slot_and_msg_send(conn);
93}
94
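/* tell the peer the connection is going down: if data the peer sent is still
 * unread in our receive buffer, request an abort, otherwise an orderly close
 */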
95static int smc_close_final(struct smc_connection *conn)
96{
97 if (atomic_read(&conn->bytes_to_rcv))
98 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
99 else
100 conn->local_tx_ctrl.conn_state_flags.peer_conn_closed = 1;
101
102 return smc_cdc_get_slot_and_msg_send(conn);
103}
104
105static int smc_close_abort(struct smc_connection *conn)
106{
107 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
108
109 return smc_cdc_get_slot_and_msg_send(conn);
110}
111
112/* terminate smc socket abnormally - active abort
113 * RDMA communication no longer possible
114 */
115void smc_close_active_abort(struct smc_sock *smc)
116{
117 struct smc_cdc_conn_state_flags *txflags =
118 &smc->conn.local_tx_ctrl.conn_state_flags;
119
120 bh_lock_sock(&smc->sk);
121 smc->sk.sk_err = ECONNABORTED;
122 if (smc->clcsock && smc->clcsock->sk) {
123 smc->clcsock->sk->sk_err = ECONNABORTED;
124 smc->clcsock->sk->sk_state_change(smc->clcsock->sk);
125 }
126 switch (smc->sk.sk_state) {
127 case SMC_INIT:
128 smc->sk.sk_state = SMC_PEERABORTWAIT;
129 break;
130 case SMC_APPCLOSEWAIT1:
131 case SMC_APPCLOSEWAIT2:
132 txflags->peer_conn_abort = 1;
133 sock_release(smc->clcsock);
134 if (!smc_cdc_rxed_any_close(&smc->conn))
135 smc->sk.sk_state = SMC_PEERABORTWAIT;
136 else
137 smc->sk.sk_state = SMC_CLOSED;
138 break;
139 case SMC_PEERCLOSEWAIT1:
140 case SMC_PEERCLOSEWAIT2:
141 if (!txflags->peer_conn_closed) {
142 smc->sk.sk_state = SMC_PEERABORTWAIT;
143 txflags->peer_conn_abort = 1;
144 sock_release(smc->clcsock);
145 } else {
146 smc->sk.sk_state = SMC_CLOSED;
147 }
148 break;
149 case SMC_PROCESSABORT:
150 case SMC_APPFINCLOSEWAIT:
151 if (!txflags->peer_conn_closed) {
152 txflags->peer_conn_abort = 1;
153 sock_release(smc->clcsock);
154 }
155 smc->sk.sk_state = SMC_CLOSED;
156 break;
157 case SMC_PEERFINCLOSEWAIT:
158 case SMC_PEERABORTWAIT:
159 case SMC_CLOSED:
160 break;
161 }
162
163 sock_set_flag(&smc->sk, SOCK_DEAD);
164 bh_unlock_sock(&smc->sk);
165 smc->sk.sk_state_change(&smc->sk);
166}
167
168int smc_close_active(struct smc_sock *smc)
169{
170 struct smc_cdc_conn_state_flags *txflags =
171 &smc->conn.local_tx_ctrl.conn_state_flags;
172 long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT;
173 struct smc_connection *conn = &smc->conn;
174 struct sock *sk = &smc->sk;
175 int old_state;
176 int rc = 0;
177
178 if (sock_flag(sk, SOCK_LINGER) &&
179 !(current->flags & PF_EXITING))
180 timeout = sk->sk_lingertime;
181
182again:
183 old_state = sk->sk_state;
184 switch (old_state) {
185 case SMC_INIT:
186 sk->sk_state = SMC_CLOSED;
187 if (smc->smc_listen_work.func)
188 flush_work(&smc->smc_listen_work);
189 sock_put(sk);
190 break;
191 case SMC_LISTEN:
192 sk->sk_state = SMC_CLOSED;
193 sk->sk_state_change(sk); /* wake up accept */
194 if (smc->clcsock && smc->clcsock->sk) {
195 rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
196 /* wake up kernel_accept of smc_tcp_listen_worker */
197 smc->clcsock->sk->sk_data_ready(smc->clcsock->sk);
198 }
199 release_sock(sk);
200 smc_close_cleanup_listen(sk);
201 flush_work(&smc->tcp_listen_work);
202 lock_sock(sk);
203 break;
204 case SMC_ACTIVE:
205 smc_close_stream_wait(smc, timeout);
206 release_sock(sk);
207 cancel_work_sync(&conn->tx_work);
208 lock_sock(sk);
209 if (sk->sk_state == SMC_ACTIVE) {
210 /* send close request */
211 rc = smc_close_final(conn);
212 sk->sk_state = SMC_PEERCLOSEWAIT1;
213 } else {
214 /* peer event has changed the state */
215 goto again;
216 }
217 break;
218 case SMC_APPFINCLOSEWAIT:
219 /* socket already shutdown wr or both (active close) */
220 if (txflags->peer_done_writing &&
221 !txflags->peer_conn_closed) {
222 /* just shutdown wr done, send close request */
223 rc = smc_close_final(conn);
224 }
225 sk->sk_state = SMC_CLOSED;
226 smc_close_wait_tx_pends(smc);
227 break;
228 case SMC_APPCLOSEWAIT1:
229 case SMC_APPCLOSEWAIT2:
230 if (!smc_cdc_rxed_any_close(conn))
231 smc_close_stream_wait(smc, timeout);
232 release_sock(sk);
233 cancel_work_sync(&conn->tx_work);
234 lock_sock(sk);
235 if (sk->sk_err != ECONNABORTED) {
236 /* confirm close from peer */
237 rc = smc_close_final(conn);
238 if (rc)
239 break;
240 }
241 if (smc_cdc_rxed_any_close(conn))
242 /* peer has closed the socket already */
243 sk->sk_state = SMC_CLOSED;
244 else
245 /* peer has just issued a shutdown write */
246 sk->sk_state = SMC_PEERFINCLOSEWAIT;
247 smc_close_wait_tx_pends(smc);
248 break;
249 case SMC_PEERCLOSEWAIT1:
250 case SMC_PEERCLOSEWAIT2:
251 case SMC_PEERFINCLOSEWAIT:
252 /* peer sending PeerConnectionClosed will cause transition */
253 break;
254 case SMC_PROCESSABORT:
255 cancel_work_sync(&conn->tx_work);
256 smc_close_abort(conn);
257 sk->sk_state = SMC_CLOSED;
258 smc_close_wait_tx_pends(smc);
259 break;
260 case SMC_PEERABORTWAIT:
261 case SMC_CLOSED:
262 /* nothing to do, add tracing in future patch */
263 break;
264 }
265
266 if (old_state != sk->sk_state)
267 sk->sk_state_change(&smc->sk);
268 return rc;
269}
270
271static void smc_close_passive_abort_received(struct smc_sock *smc)
272{
273 struct smc_cdc_conn_state_flags *txflags =
274 &smc->conn.local_tx_ctrl.conn_state_flags;
275 struct sock *sk = &smc->sk;
276
277 switch (sk->sk_state) {
278 case SMC_ACTIVE:
279 case SMC_APPFINCLOSEWAIT:
280 case SMC_APPCLOSEWAIT1:
281 case SMC_APPCLOSEWAIT2:
282 smc_close_abort(&smc->conn);
283 sk->sk_state = SMC_PROCESSABORT;
284 break;
285 case SMC_PEERCLOSEWAIT1:
286 case SMC_PEERCLOSEWAIT2:
287 if (txflags->peer_done_writing &&
288 !txflags->peer_conn_closed) {
289 /* just shutdown, but not yet closed locally */
290 smc_close_abort(&smc->conn);
291 sk->sk_state = SMC_PROCESSABORT;
292 } else {
293 sk->sk_state = SMC_CLOSED;
294 }
295 break;
296 case SMC_PEERFINCLOSEWAIT:
297 case SMC_PEERABORTWAIT:
298 sk->sk_state = SMC_CLOSED;
299 break;
300 case SMC_INIT:
301 case SMC_PROCESSABORT:
302 /* nothing to do, add tracing in future patch */
303 break;
304 }
305}
306
307/* Some kind of closing has been received: peer_conn_closed, peer_conn_abort,
308 * or peer_done_writing.
309 * Called under tasklet context.
310 */
311void smc_close_passive_received(struct smc_sock *smc)
312{
313 struct smc_cdc_conn_state_flags *rxflags =
314 &smc->conn.local_rx_ctrl.conn_state_flags;
315 struct sock *sk = &smc->sk;
316 int old_state;
317
318 sk->sk_shutdown |= RCV_SHUTDOWN;
319 if (smc->clcsock && smc->clcsock->sk)
320 smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN;
321 sock_set_flag(&smc->sk, SOCK_DONE);
322
323 old_state = sk->sk_state;
324
325 if (rxflags->peer_conn_abort) {
326 smc_close_passive_abort_received(smc);
327 goto wakeup;
328 }
329
330 switch (sk->sk_state) {
331 case SMC_INIT:
332 if (atomic_read(&smc->conn.bytes_to_rcv) ||
333 (rxflags->peer_done_writing &&
334 !rxflags->peer_conn_closed))
335 sk->sk_state = SMC_APPCLOSEWAIT1;
336 else
337 sk->sk_state = SMC_CLOSED;
338 break;
339 case SMC_ACTIVE:
340 sk->sk_state = SMC_APPCLOSEWAIT1;
341 break;
342 case SMC_PEERCLOSEWAIT1:
343 if (rxflags->peer_done_writing)
344 sk->sk_state = SMC_PEERCLOSEWAIT2;
345 /* fall through to check for closing */
346 case SMC_PEERCLOSEWAIT2:
347 case SMC_PEERFINCLOSEWAIT:
348 if (!smc_cdc_rxed_any_close(&smc->conn))
349 break;
350 if (sock_flag(sk, SOCK_DEAD) &&
351 (sk->sk_shutdown == SHUTDOWN_MASK)) {
352 /* smc_release has already been called locally */
353 sk->sk_state = SMC_CLOSED;
354 } else {
355 /* just shutdown, but not yet closed locally */
356 sk->sk_state = SMC_APPFINCLOSEWAIT;
357 }
358 break;
359 case SMC_APPCLOSEWAIT1:
360 case SMC_APPCLOSEWAIT2:
361 case SMC_APPFINCLOSEWAIT:
362 case SMC_PEERABORTWAIT:
363 case SMC_PROCESSABORT:
364 case SMC_CLOSED:
365 /* nothing to do, add tracing in future patch */
366 break;
367 }
368
369wakeup:
370 if (old_state != sk->sk_state)
371 sk->sk_state_change(sk);
372 sk->sk_data_ready(sk); /* wakeup blocked rcvbuf consumers */
373 sk->sk_write_space(sk); /* wakeup blocked sndbuf producers */
374
375 if ((sk->sk_state == SMC_CLOSED) &&
376 (sock_flag(sk, SOCK_DEAD) || (old_state == SMC_INIT))) {
377 smc_conn_free(&smc->conn);
378 schedule_delayed_work(&smc->sock_put_work,
379 SMC_CLOSE_SOCK_PUT_DELAY);
380 }
381}
382
383void smc_close_sock_put_work(struct work_struct *work)
384{
385 struct smc_sock *smc = container_of(to_delayed_work(work),
386 struct smc_sock,
387 sock_put_work);
388
389 smc->sk.sk_prot->unhash(&smc->sk);
390 sock_put(&smc->sk);
391}
392
393int smc_close_shutdown_write(struct smc_sock *smc)
394{
395 struct smc_connection *conn = &smc->conn;
396 long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT;
397 struct sock *sk = &smc->sk;
398 int old_state;
399 int rc = 0;
400
401 if (sock_flag(sk, SOCK_LINGER))
402 timeout = sk->sk_lingertime;
403
404again:
405 old_state = sk->sk_state;
406 switch (old_state) {
407 case SMC_ACTIVE:
408 smc_close_stream_wait(smc, timeout);
409 release_sock(sk);
410 cancel_work_sync(&conn->tx_work);
411 lock_sock(sk);
412 /* send close wr request */
413 rc = smc_close_wr(conn);
414 if (sk->sk_state == SMC_ACTIVE)
415 sk->sk_state = SMC_PEERCLOSEWAIT1;
416 else
417 goto again;
418 break;
419 case SMC_APPCLOSEWAIT1:
420 /* passive close */
421 if (!smc_cdc_rxed_any_close(conn))
422 smc_close_stream_wait(smc, timeout);
423 release_sock(sk);
424 cancel_work_sync(&conn->tx_work);
425 lock_sock(sk);
426 /* confirm close from peer */
427 rc = smc_close_wr(conn);
428 sk->sk_state = SMC_APPCLOSEWAIT2;
429 break;
430 case SMC_APPCLOSEWAIT2:
431 case SMC_PEERFINCLOSEWAIT:
432 case SMC_PEERCLOSEWAIT1:
433 case SMC_PEERCLOSEWAIT2:
434 case SMC_APPFINCLOSEWAIT:
435 case SMC_PROCESSABORT:
436 case SMC_PEERABORTWAIT:
437 /* nothing to do, add tracing in future patch */
438 break;
439 }
440
441 if (old_state != sk->sk_state)
442 sk->sk_state_change(&smc->sk);
443 return rc;
444}
diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h
new file mode 100644
index 000000000000..bc9a2df3633c
--- /dev/null
+++ b/net/smc/smc_close.h
@@ -0,0 +1,28 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Socket Closing
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
9 */
10
11#ifndef SMC_CLOSE_H
12#define SMC_CLOSE_H
13
14#include <linux/workqueue.h>
15
16#include "smc.h"
17
18#define SMC_MAX_STREAM_WAIT_TIMEOUT (2 * HZ)
19#define SMC_CLOSE_SOCK_PUT_DELAY HZ
20
21void smc_close_wake_tx_prepared(struct smc_sock *smc);
22void smc_close_active_abort(struct smc_sock *smc);
23int smc_close_active(struct smc_sock *smc);
24void smc_close_passive_received(struct smc_sock *smc);
25void smc_close_sock_put_work(struct work_struct *work);
26int smc_close_shutdown_write(struct smc_sock *smc);
27
28#endif /* SMC_CLOSE_H */
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
new file mode 100644
index 000000000000..0eac633fb354
--- /dev/null
+++ b/net/smc/smc_core.c
@@ -0,0 +1,682 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Basic Transport Functions exploiting Infiniband API
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
9 */
10
11#include <linux/socket.h>
12#include <linux/if_vlan.h>
13#include <linux/random.h>
14#include <linux/workqueue.h>
15#include <net/tcp.h>
16#include <net/sock.h>
17#include <rdma/ib_verbs.h>
18
19#include "smc.h"
20#include "smc_clc.h"
21#include "smc_core.h"
22#include "smc_ib.h"
23#include "smc_wr.h"
24#include "smc_llc.h"
25#include "smc_cdc.h"
26#include "smc_close.h"
27
28#define SMC_LGR_NUM_INCR 256
29#define SMC_LGR_FREE_DELAY (600 * HZ)
30
31static u32 smc_lgr_num; /* unique link group number */
32
33/* Register connection's alert token in our lookup structure.
34 * To use rbtrees we have to implement our own insert core.
35 * Requires @conns_lock
36 * @conn connection to register
37 * @conn->alert_token_local must already be set.
38 */
39static void smc_lgr_add_alert_token(struct smc_connection *conn)
40{
41 struct rb_node **link, *parent = NULL;
42 u32 token = conn->alert_token_local;
43
44 link = &conn->lgr->conns_all.rb_node;
45 while (*link) {
46 struct smc_connection *cur = rb_entry(*link,
47 struct smc_connection, alert_node);
48
49 parent = *link;
50 if (cur->alert_token_local > token)
51 link = &parent->rb_left;
52 else
53 link = &parent->rb_right;
54 }
55 /* Put the new node there */
56 rb_link_node(&conn->alert_node, parent, link);
57 rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
58}
59
60/* Register connection in link group by assigning an alert token
61 * registered in a search tree.
62 * Requires @conns_lock
63 * Note that '0' is a reserved value and not assigned.
64 */
65static void smc_lgr_register_conn(struct smc_connection *conn)
66{
67 struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
68 static atomic_t nexttoken = ATOMIC_INIT(0);
69
70 /* find a new alert_token_local value not yet used by some connection
71 * in this link group
72 */
73 sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
74 while (!conn->alert_token_local) {
75 conn->alert_token_local = atomic_inc_return(&nexttoken);
76 if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
77 conn->alert_token_local = 0;
78 }
79 smc_lgr_add_alert_token(conn);
80 conn->lgr->conns_num++;
81}
82
83/* Unregister connection and reset the alert token of the given connection
84 */
85static void __smc_lgr_unregister_conn(struct smc_connection *conn)
86{
87 struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
88 struct smc_link_group *lgr = conn->lgr;
89
90 rb_erase(&conn->alert_node, &lgr->conns_all);
91 lgr->conns_num--;
92 conn->alert_token_local = 0;
93 conn->lgr = NULL;
94 sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
95}
96
97/* Unregister connection and trigger lgr freeing if applicable
98 */
99static void smc_lgr_unregister_conn(struct smc_connection *conn)
100{
101 struct smc_link_group *lgr = conn->lgr;
102 int reduced = 0;
103
104 write_lock_bh(&lgr->conns_lock);
105 if (conn->alert_token_local) {
106 reduced = 1;
107 __smc_lgr_unregister_conn(conn);
108 }
109 write_unlock_bh(&lgr->conns_lock);
110 if (reduced && !lgr->conns_num)
111 schedule_delayed_work(&lgr->free_work, SMC_LGR_FREE_DELAY);
112}
113
114static void smc_lgr_free_work(struct work_struct *work)
115{
116 struct smc_link_group *lgr = container_of(to_delayed_work(work),
117 struct smc_link_group,
118 free_work);
119 bool conns;
120
121 spin_lock_bh(&smc_lgr_list.lock);
122 read_lock_bh(&lgr->conns_lock);
123 conns = RB_EMPTY_ROOT(&lgr->conns_all);
124 read_unlock_bh(&lgr->conns_lock);
125 if (!conns) { /* number of lgr connections is no longer zero */
126 spin_unlock_bh(&smc_lgr_list.lock);
127 return;
128 }
129 list_del_init(&lgr->list); /* remove from smc_lgr_list */
130 spin_unlock_bh(&smc_lgr_list.lock);
131 smc_lgr_free(lgr);
132}
133
134/* create a new SMC link group */
135static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
136 struct smc_ib_device *smcibdev, u8 ibport,
137 char *peer_systemid, unsigned short vlan_id)
138{
139 struct smc_link_group *lgr;
140 struct smc_link *lnk;
141 u8 rndvec[3];
142 int rc = 0;
143 int i;
144
145 lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
146 if (!lgr) {
147 rc = -ENOMEM;
148 goto out;
149 }
150 lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
151 lgr->sync_err = false;
152 lgr->daddr = peer_in_addr;
153 memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
154 lgr->vlan_id = vlan_id;
155 rwlock_init(&lgr->sndbufs_lock);
156 rwlock_init(&lgr->rmbs_lock);
157 for (i = 0; i < SMC_RMBE_SIZES; i++) {
158 INIT_LIST_HEAD(&lgr->sndbufs[i]);
159 INIT_LIST_HEAD(&lgr->rmbs[i]);
160 }
161 smc_lgr_num += SMC_LGR_NUM_INCR;
162 memcpy(&lgr->id, (u8 *)&smc_lgr_num, SMC_LGR_ID_SIZE);
163 INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
164 lgr->conns_all = RB_ROOT;
165
166 lnk = &lgr->lnk[SMC_SINGLE_LINK];
167 /* initialize link */
168 lnk->smcibdev = smcibdev;
169 lnk->ibport = ibport;
170 lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
171 if (!smcibdev->initialized)
172 smc_ib_setup_per_ibdev(smcibdev);
173 get_random_bytes(rndvec, sizeof(rndvec));
174 lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
175 rc = smc_wr_alloc_link_mem(lnk);
176 if (rc)
177 goto free_lgr;
178 init_waitqueue_head(&lnk->wr_tx_wait);
179 rc = smc_ib_create_protection_domain(lnk);
180 if (rc)
181 goto free_link_mem;
182 rc = smc_ib_create_queue_pair(lnk);
183 if (rc)
184 goto dealloc_pd;
185 rc = smc_wr_create_link(lnk);
186 if (rc)
187 goto destroy_qp;
188 init_completion(&lnk->llc_confirm);
189 init_completion(&lnk->llc_confirm_resp);
190
191 smc->conn.lgr = lgr;
192 rwlock_init(&lgr->conns_lock);
193 spin_lock_bh(&smc_lgr_list.lock);
194 list_add(&lgr->list, &smc_lgr_list.list);
195 spin_unlock_bh(&smc_lgr_list.lock);
196 return 0;
197
198destroy_qp:
199 smc_ib_destroy_queue_pair(lnk);
200dealloc_pd:
201 smc_ib_dealloc_protection_domain(lnk);
202free_link_mem:
203 smc_wr_free_link_mem(lnk);
204free_lgr:
205 kfree(lgr);
206out:
207 return rc;
208}
209
210static void smc_sndbuf_unuse(struct smc_connection *conn)
211{
212 if (conn->sndbuf_desc) {
213 conn->sndbuf_desc->used = 0;
214 conn->sndbuf_size = 0;
215 }
216}
217
218static void smc_rmb_unuse(struct smc_connection *conn)
219{
220 if (conn->rmb_desc) {
221 conn->rmb_desc->used = 0;
222 conn->rmbe_size = 0;
223 }
224}
225
226/* remove a finished connection from its link group */
227void smc_conn_free(struct smc_connection *conn)
228{
229 struct smc_link_group *lgr = conn->lgr;
230
231 if (!lgr)
232 return;
233 smc_cdc_tx_dismiss_slots(conn);
234 smc_lgr_unregister_conn(conn);
235 smc_rmb_unuse(conn);
236 smc_sndbuf_unuse(conn);
237}
238
239static void smc_link_clear(struct smc_link *lnk)
240{
241 lnk->peer_qpn = 0;
242 smc_ib_modify_qp_reset(lnk);
243 smc_wr_free_link(lnk);
244 smc_ib_destroy_queue_pair(lnk);
245 smc_ib_dealloc_protection_domain(lnk);
246 smc_wr_free_link_mem(lnk);
247}
248
249static void smc_lgr_free_sndbufs(struct smc_link_group *lgr)
250{
251 struct smc_buf_desc *sndbuf_desc, *bf_desc;
252 int i;
253
254 for (i = 0; i < SMC_RMBE_SIZES; i++) {
255 list_for_each_entry_safe(sndbuf_desc, bf_desc, &lgr->sndbufs[i],
256 list) {
257 list_del(&sndbuf_desc->list);
258 smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
259 smc_uncompress_bufsize(i),
260 sndbuf_desc, DMA_TO_DEVICE);
261 kfree(sndbuf_desc->cpu_addr);
262 kfree(sndbuf_desc);
263 }
264 }
265}
266
267static void smc_lgr_free_rmbs(struct smc_link_group *lgr)
268{
269 struct smc_buf_desc *rmb_desc, *bf_desc;
270 struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
271 int i;
272
273 for (i = 0; i < SMC_RMBE_SIZES; i++) {
274 list_for_each_entry_safe(rmb_desc, bf_desc, &lgr->rmbs[i],
275 list) {
276 list_del(&rmb_desc->list);
277 smc_ib_buf_unmap(lnk->smcibdev,
278 smc_uncompress_bufsize(i),
279 rmb_desc, DMA_FROM_DEVICE);
280 kfree(rmb_desc->cpu_addr);
281 kfree(rmb_desc);
282 }
283 }
284}
285
286/* remove a link group */
287void smc_lgr_free(struct smc_link_group *lgr)
288{
289 smc_lgr_free_rmbs(lgr);
290 smc_lgr_free_sndbufs(lgr);
291 smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
292 kfree(lgr);
293}
294
295/* terminate linkgroup abnormally */
296void smc_lgr_terminate(struct smc_link_group *lgr)
297{
298 struct smc_connection *conn;
299 struct smc_sock *smc;
300 struct rb_node *node;
301
302 spin_lock_bh(&smc_lgr_list.lock);
303 if (list_empty(&lgr->list)) {
304 /* termination already triggered */
305 spin_unlock_bh(&smc_lgr_list.lock);
306 return;
307 }
308 /* do not use this link group for new connections */
309 list_del_init(&lgr->list);
310 spin_unlock_bh(&smc_lgr_list.lock);
311
312 write_lock_bh(&lgr->conns_lock);
313 node = rb_first(&lgr->conns_all);
314 while (node) {
315 conn = rb_entry(node, struct smc_connection, alert_node);
316 smc = container_of(conn, struct smc_sock, conn);
317 sock_hold(&smc->sk);
318 __smc_lgr_unregister_conn(conn);
319 smc_close_active_abort(smc);
320 sock_put(&smc->sk);
321 node = rb_first(&lgr->conns_all);
322 }
323 write_unlock_bh(&lgr->conns_lock);
324}
325
326/* Determine vlan of internal TCP socket.
327 * @vlan_id: address to store the determined vlan id into
328 */
329static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
330{
331 struct dst_entry *dst = sk_dst_get(clcsock->sk);
332 int rc = 0;
333
334 *vlan_id = 0;
335 if (!dst) {
336 rc = -ENOTCONN;
337 goto out;
338 }
339 if (!dst->dev) {
340 rc = -ENODEV;
341 goto out_rel;
342 }
343
344 if (is_vlan_dev(dst->dev))
345 *vlan_id = vlan_dev_vlan_id(dst->dev);
346
347out_rel:
348 dst_release(dst);
349out:
350 return rc;
351}
352
353/* determine the link gid matching the vlan id of the link group */
354static int smc_link_determine_gid(struct smc_link_group *lgr)
355{
356 struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
357 struct ib_gid_attr gattr;
358 union ib_gid gid;
359 int i;
360
361 if (!lgr->vlan_id) {
362 lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1];
363 return 0;
364 }
365
366 for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len;
367 i++) {
368 if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
369 &gattr))
370 continue;
371 if (gattr.ndev &&
372 (vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id)) {
373 lnk->gid = gid;
374 return 0;
375 }
376 }
377 return -ENODEV;
378}
379
380/* create a new SMC connection (and a new link group if necessary) */
381int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
382 struct smc_ib_device *smcibdev, u8 ibport,
383 struct smc_clc_msg_local *lcl, int srv_first_contact)
384{
385 struct smc_connection *conn = &smc->conn;
386 struct smc_link_group *lgr;
387 unsigned short vlan_id;
388 enum smc_lgr_role role;
389 int local_contact = SMC_FIRST_CONTACT;
390 int rc = 0;
391
392 role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
393 rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id);
394 if (rc)
395 return rc;
396
397 if ((role == SMC_CLNT) && srv_first_contact)
398 /* create new link group as well */
399 goto create;
400
401 /* determine if an existing link group can be reused */
402 spin_lock_bh(&smc_lgr_list.lock);
403 list_for_each_entry(lgr, &smc_lgr_list.list, list) {
404 write_lock_bh(&lgr->conns_lock);
405 if (!memcmp(lgr->peer_systemid, lcl->id_for_peer,
406 SMC_SYSTEMID_LEN) &&
407 !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
408 SMC_GID_SIZE) &&
409 !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
410 sizeof(lcl->mac)) &&
411 !lgr->sync_err &&
412 (lgr->role == role) &&
413 (lgr->vlan_id == vlan_id) &&
414 ((role == SMC_CLNT) ||
415 (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) {
416 /* link group found */
417 local_contact = SMC_REUSE_CONTACT;
418 conn->lgr = lgr;
419 smc_lgr_register_conn(conn); /* add smc conn to lgr */
420 write_unlock_bh(&lgr->conns_lock);
421 break;
422 }
423 write_unlock_bh(&lgr->conns_lock);
424 }
425 spin_unlock_bh(&smc_lgr_list.lock);
426
427 if (role == SMC_CLNT && !srv_first_contact &&
428 (local_contact == SMC_FIRST_CONTACT)) {
429 /* Server reuses a link group, but Client wants to start
430 * a new one
431 * send out_of_sync decline, reason synchr. error
432 */
433 return -ENOLINK;
434 }
435
436create:
437 if (local_contact == SMC_FIRST_CONTACT) {
438 rc = smc_lgr_create(smc, peer_in_addr, smcibdev, ibport,
439 lcl->id_for_peer, vlan_id);
440 if (rc)
441 goto out;
442 smc_lgr_register_conn(conn); /* add smc conn to lgr */
443 rc = smc_link_determine_gid(conn->lgr);
444 }
445 conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
446 conn->local_tx_ctrl.len = sizeof(struct smc_cdc_msg);
447#ifndef KERNEL_HAS_ATOMIC64
448 spin_lock_init(&conn->acurs_lock);
449#endif
450
451out:
452 return rc ? rc : local_contact;
453}
454
455/* try to reuse a sndbuf description slot of the sndbufs list for a certain
456 * buf_size; if not available, return NULL
457 */
458static inline
459struct smc_buf_desc *smc_sndbuf_get_slot(struct smc_link_group *lgr,
460 int compressed_bufsize)
461{
462 struct smc_buf_desc *sndbuf_slot;
463
464 read_lock_bh(&lgr->sndbufs_lock);
465 list_for_each_entry(sndbuf_slot, &lgr->sndbufs[compressed_bufsize],
466 list) {
467 if (cmpxchg(&sndbuf_slot->used, 0, 1) == 0) {
468 read_unlock_bh(&lgr->sndbufs_lock);
469 return sndbuf_slot;
470 }
471 }
472 read_unlock_bh(&lgr->sndbufs_lock);
473 return NULL;
474}
475
476/* try to reuse an rmb description slot of the rmbs list for a certain
477 * rmbe_size; if not available, return NULL
478 */
479static inline
480struct smc_buf_desc *smc_rmb_get_slot(struct smc_link_group *lgr,
481 int compressed_bufsize)
482{
483 struct smc_buf_desc *rmb_slot;
484
485 read_lock_bh(&lgr->rmbs_lock);
486 list_for_each_entry(rmb_slot, &lgr->rmbs[compressed_bufsize],
487 list) {
488 if (cmpxchg(&rmb_slot->used, 0, 1) == 0) {
489 read_unlock_bh(&lgr->rmbs_lock);
490 return rmb_slot;
491 }
492 }
493 read_unlock_bh(&lgr->rmbs_lock);
494 return NULL;
495}
496
497/* one of the conditions for announcing a receiver's current window size is
498 * that it "results in a minimum increase in the window size of 10% of the
499 * receive buffer space" [RFC7609]
500 */
501static inline int smc_rmb_wnd_update_limit(int rmbe_size)
502{
503 return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
504}
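
For illustration, here is a standalone user-space sketch (not part of the patch) of the 10% window-update rule that smc_rmb_wnd_update_limit() implements. SOCK_MIN_SNDBUF is a kernel constant not visible in this patch, so an assumed placeholder value is used below.

    #include <stdio.h>

    #define ASSUMED_SOCK_MIN_SNDBUF 4096	/* placeholder for the kernel constant */

    static int rmb_wnd_update_limit(int rmbe_size)
    {
    	int tenth = rmbe_size / 10;
    	int cap = ASSUMED_SOCK_MIN_SNDBUF / 2;

    	return tenth < cap ? tenth : cap;
    }

    int main(void)
    {
    	/* a 16 KiB RMB yields a 1638-byte threshold; a 64 KiB RMB is capped at 2048 */
    	printf("%d %d\n", rmb_wnd_update_limit(16384), rmb_wnd_update_limit(65536));
    	return 0;
    }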
505
506/* create the tx buffer for an SMC socket */
507int smc_sndbuf_create(struct smc_sock *smc)
508{
509 struct smc_connection *conn = &smc->conn;
510 struct smc_link_group *lgr = conn->lgr;
511 int tmp_bufsize, tmp_bufsize_short;
512 struct smc_buf_desc *sndbuf_desc;
513 int rc;
514
515 /* use socket send buffer size (w/o overhead) as start value */
516 for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_sndbuf / 2);
517 tmp_bufsize_short >= 0; tmp_bufsize_short--) {
518 tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
519 /* check for reusable sndbuf_slot in the link group */
520 sndbuf_desc = smc_sndbuf_get_slot(lgr, tmp_bufsize_short);
521 if (sndbuf_desc) {
522 memset(sndbuf_desc->cpu_addr, 0, tmp_bufsize);
523 break; /* found reusable slot */
524 }
525 /* try to alloc a new send buffer */
526 sndbuf_desc = kzalloc(sizeof(*sndbuf_desc), GFP_KERNEL);
527 if (!sndbuf_desc)
528 break; /* give up with -ENOMEM */
529 sndbuf_desc->cpu_addr = kzalloc(tmp_bufsize,
530 GFP_KERNEL | __GFP_NOWARN |
531 __GFP_NOMEMALLOC |
532 __GFP_NORETRY);
533 if (!sndbuf_desc->cpu_addr) {
534 kfree(sndbuf_desc);
535 sndbuf_desc = NULL;
536 /* if send buffer allocation has failed,
537 * try a smaller one
538 */
539 continue;
540 }
541 rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
542 tmp_bufsize, sndbuf_desc,
543 DMA_TO_DEVICE);
544 if (rc) {
545 kfree(sndbuf_desc->cpu_addr);
546 kfree(sndbuf_desc);
547 sndbuf_desc = NULL;
548 continue; /* if mapping failed, try smaller one */
549 }
550 sndbuf_desc->used = 1;
551 write_lock_bh(&lgr->sndbufs_lock);
552 list_add(&sndbuf_desc->list,
553 &lgr->sndbufs[tmp_bufsize_short]);
554 write_unlock_bh(&lgr->sndbufs_lock);
555 break;
556 }
557 if (sndbuf_desc && sndbuf_desc->cpu_addr) {
558 conn->sndbuf_desc = sndbuf_desc;
559 conn->sndbuf_size = tmp_bufsize;
560 smc->sk.sk_sndbuf = tmp_bufsize * 2;
561 atomic_set(&conn->sndbuf_space, tmp_bufsize);
562 return 0;
563 } else {
564 return -ENOMEM;
565 }
566}
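
The loop above starts at half the socket send-buffer size and retries with smaller buffers until allocation and DMA mapping succeed. A minimal user-space sketch of that fallback strategy, using raw byte sizes instead of the kernel's compressed size indexes:

    #include <stdio.h>
    #include <stdlib.h>

    /* try the largest size first, halve on failure */
    static void *alloc_largest(size_t start, size_t min, size_t *got)
    {
    	for (size_t size = start; size >= min; size /= 2) {
    		void *buf = malloc(size);

    		if (buf) {
    			*got = size;
    			return buf;
    		}
    	}
    	*got = 0;
    	return NULL;
    }

    int main(void)
    {
    	size_t got;
    	void *buf = alloc_largest(1 << 20, 1 << 14, &got);

    	printf("got %zu bytes\n", got);
    	free(buf);
    	return 0;
    }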
567
568/* create the RMB for an SMC socket (even though the SMC protocol
569 * allows more than one RMB-element per RMB, the Linux implementation
570 * uses just one RMB-element per RMB, i.e. uses an extra RMB for every
571 * connection in a link group)
572 */
573int smc_rmb_create(struct smc_sock *smc)
574{
575 struct smc_connection *conn = &smc->conn;
576 struct smc_link_group *lgr = conn->lgr;
577 int tmp_bufsize, tmp_bufsize_short;
578 struct smc_buf_desc *rmb_desc;
579 int rc;
580
581 /* use socket recv buffer size (w/o overhead) as start value */
582 for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_rcvbuf / 2);
583 tmp_bufsize_short >= 0; tmp_bufsize_short--) {
584 tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
585 /* check for reusable rmb_slot in the link group */
586 rmb_desc = smc_rmb_get_slot(lgr, tmp_bufsize_short);
587 if (rmb_desc) {
588 memset(rmb_desc->cpu_addr, 0, tmp_bufsize);
589 break; /* found reusable slot */
590 }
591 /* try to alloc a new RMB */
592 rmb_desc = kzalloc(sizeof(*rmb_desc), GFP_KERNEL);
593 if (!rmb_desc)
594 break; /* give up with -ENOMEM */
595 rmb_desc->cpu_addr = kzalloc(tmp_bufsize,
596 GFP_KERNEL | __GFP_NOWARN |
597 __GFP_NOMEMALLOC |
598 __GFP_NORETRY);
599 if (!rmb_desc->cpu_addr) {
600 kfree(rmb_desc);
601 rmb_desc = NULL;
602 /* if RMB allocation has failed,
603 * try a smaller one
604 */
605 continue;
606 }
607 rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
608 tmp_bufsize, rmb_desc,
609 DMA_FROM_DEVICE);
610 if (rc) {
611 kfree(rmb_desc->cpu_addr);
612 kfree(rmb_desc);
613 rmb_desc = NULL;
614 continue; /* if mapping failed, try smaller one */
615 }
616 rc = smc_ib_get_memory_region(lgr->lnk[SMC_SINGLE_LINK].roce_pd,
617 IB_ACCESS_REMOTE_WRITE |
618 IB_ACCESS_LOCAL_WRITE,
619 &rmb_desc->mr_rx[SMC_SINGLE_LINK]);
620 if (rc) {
621 smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
622 tmp_bufsize, rmb_desc,
623 DMA_FROM_DEVICE);
624 kfree(rmb_desc->cpu_addr);
625 kfree(rmb_desc);
626 rmb_desc = NULL;
627 continue;
628 }
629 rmb_desc->used = 1;
630 write_lock_bh(&lgr->rmbs_lock);
631 list_add(&rmb_desc->list,
632 &lgr->rmbs[tmp_bufsize_short]);
633 write_unlock_bh(&lgr->rmbs_lock);
634 break;
635 }
636 if (rmb_desc && rmb_desc->cpu_addr) {
637 conn->rmb_desc = rmb_desc;
638 conn->rmbe_size = tmp_bufsize;
639 conn->rmbe_size_short = tmp_bufsize_short;
640 smc->sk.sk_rcvbuf = tmp_bufsize * 2;
641 atomic_set(&conn->bytes_to_rcv, 0);
642 conn->rmbe_update_limit = smc_rmb_wnd_update_limit(tmp_bufsize);
643 return 0;
644 } else {
645 return -ENOMEM;
646 }
647}
648
649static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
650{
651 int i;
652
653 for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
654 if (!test_and_set_bit(i, lgr->rtokens_used_mask))
655 return i;
656 }
657 return -ENOSPC;
658}
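
A standalone illustration (single-threaded, not kernel code) of the bitmap-based slot reservation that smc_rmb_reserve_rtoken_idx() above performs atomically with for_each_clear_bit()/test_and_set_bit():

    #include <stdio.h>

    #define RTOKEN_SLOTS 255	/* mirrors SMC_RMBS_PER_LGR_MAX */

    static unsigned char rtoken_used[RTOKEN_SLOTS];

    /* return the first free index and mark it used, or -1 if none is left */
    static int reserve_rtoken_idx(void)
    {
    	for (int i = 0; i < RTOKEN_SLOTS; i++) {
    		if (!rtoken_used[i]) {
    			rtoken_used[i] = 1;
    			return i;
    		}
    	}
    	return -1;	/* -ENOSPC in the kernel */
    }

    int main(void)
    {
    	printf("%d %d\n", reserve_rtoken_idx(), reserve_rtoken_idx());	/* 0 1 */
    	return 0;
    }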
659
660/* save rkey and dma_addr received from peer during clc handshake */
661int smc_rmb_rtoken_handling(struct smc_connection *conn,
662 struct smc_clc_msg_accept_confirm *clc)
663{
664 u64 dma_addr = be64_to_cpu(clc->rmb_dma_addr);
665 struct smc_link_group *lgr = conn->lgr;
666 u32 rkey = ntohl(clc->rmb_rkey);
667 int i;
668
669 for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
670 if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
671 test_bit(i, lgr->rtokens_used_mask)) {
672 conn->rtoken_idx = i;
673 return 0;
674 }
675 }
676 conn->rtoken_idx = smc_rmb_reserve_rtoken_idx(lgr);
677 if (conn->rtoken_idx < 0)
678 return conn->rtoken_idx;
679 lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey = rkey;
680 lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr = dma_addr;
681 return 0;
682}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
new file mode 100644
index 000000000000..27eb38056a27
--- /dev/null
+++ b/net/smc/smc_core.h
@@ -0,0 +1,181 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Definitions for SMC Connections, Link Groups and Links
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
9 */
10
11#ifndef _SMC_CORE_H
12#define _SMC_CORE_H
13
14#include <linux/atomic.h>
15#include <rdma/ib_verbs.h>
16
17#include "smc.h"
18#include "smc_ib.h"
19
20#define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */
21
22struct smc_lgr_list { /* list of link group definition */
23 struct list_head list;
24 spinlock_t lock; /* protects list of link groups */
25};
26
27extern struct smc_lgr_list smc_lgr_list; /* list of link groups */
28
29enum smc_lgr_role { /* possible roles of a link group */
30 SMC_CLNT, /* client */
31 SMC_SERV /* server */
32};
33
34#define SMC_WR_BUF_SIZE 48 /* size of work request buffer */
35
36struct smc_wr_buf {
37 u8 raw[SMC_WR_BUF_SIZE];
38};
39
40struct smc_link {
41 struct smc_ib_device *smcibdev; /* ib-device */
42 u8 ibport; /* port - values 1 | 2 */
43 struct ib_pd *roce_pd; /* IB protection domain,
44 * unique for every RoCE QP
45 */
46 struct ib_qp *roce_qp; /* IB queue pair */
47 struct ib_qp_attr qp_attr; /* IB queue pair attributes */
48
49 struct smc_wr_buf *wr_tx_bufs; /* WR send payload buffers */
50 struct ib_send_wr *wr_tx_ibs; /* WR send meta data */
51 struct ib_sge *wr_tx_sges; /* WR send gather meta data */
52 struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */
53 /* above four vectors have wr_tx_cnt elements and use the same index */
54 dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */
55 atomic_long_t wr_tx_id; /* seq # of last sent WR */
56 unsigned long *wr_tx_mask; /* bit mask of used indexes */
57 u32 wr_tx_cnt; /* number of WR send buffers */
58 wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */
59
60 struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */
61 struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */
62 struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */
63 /* above three vectors have wr_rx_cnt elements and use the same index */
64 dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */
65 u64 wr_rx_id; /* seq # of last recv WR */
66 u32 wr_rx_cnt; /* number of WR recv buffers */
67
68 union ib_gid gid; /* gid matching used vlan id */
69 u32 peer_qpn; /* QP number of peer */
70 enum ib_mtu path_mtu; /* used mtu */
71 enum ib_mtu peer_mtu; /* mtu size of peer */
72 u32 psn_initial; /* QP tx initial packet seqno */
73 u32 peer_psn; /* QP rx initial packet seqno */
74 u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */
75 u8 peer_gid[sizeof(union ib_gid)]; /* gid of peer*/
76 u8 link_id; /* unique # within link group */
77 struct completion llc_confirm; /* wait for rx of conf link */
78 struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */
79};
80
81/* For now we just allow one parallel link per link group. The SMC protocol
82 * allows more (up to 8).
83 */
84#define SMC_LINKS_PER_LGR_MAX 1
85#define SMC_SINGLE_LINK 0
86
87#define SMC_FIRST_CONTACT 1 /* first contact to a peer */
88#define SMC_REUSE_CONTACT 0 /* follow-on contact to a peer*/
89
90/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
91struct smc_buf_desc {
92 struct list_head list;
93 u64 dma_addr[SMC_LINKS_PER_LGR_MAX];
94 /* mapped address of buffer */
95 void *cpu_addr; /* virtual address of buffer */
96 struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX];
97 /* for rmb only:
98 * rkey provided to peer
99 */
100 u32 used; /* currently used / unused */
101};
102
103struct smc_rtoken { /* address/key of remote RMB */
104 u64 dma_addr;
105 u32 rkey;
106};
107
108#define SMC_LGR_ID_SIZE 4
109
110struct smc_link_group {
111 struct list_head list;
112 enum smc_lgr_role role; /* client or server */
113 __be32 daddr; /* destination ip address */
114 struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */
115 char peer_systemid[SMC_SYSTEMID_LEN];
116 /* unique system_id of peer */
117 struct rb_root conns_all; /* connection tree */
118 rwlock_t conns_lock; /* protects conns_all */
119 unsigned int conns_num; /* current # of connections */
120 unsigned short vlan_id; /* vlan id of link group */
121
122 struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */
123 rwlock_t sndbufs_lock; /* protects tx buffers */
124 struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */
125 rwlock_t rmbs_lock; /* protects rx buffers */
126 struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX]
127 [SMC_LINKS_PER_LGR_MAX];
128 /* remote addr/key pairs */
129 unsigned long rtokens_used_mask[BITS_TO_LONGS(
130 SMC_RMBS_PER_LGR_MAX)];
131 /* used rtoken elements */
132
133 u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */
134 struct delayed_work free_work; /* delayed freeing of an lgr */
135 bool sync_err; /* lgr no longer fits to peer */
136};
137
138/* Find the connection associated with the given alert token in the link group.
139 * To use rbtrees we have to implement our own search core.
140 * Requires @conns_lock
141 * @token alert token to search for
142 * @lgr link group to search in
143 * Returns connection associated with token if found, NULL otherwise.
144 */
145static inline struct smc_connection *smc_lgr_find_conn(
146 u32 token, struct smc_link_group *lgr)
147{
148 struct smc_connection *res = NULL;
149 struct rb_node *node;
150
151 node = lgr->conns_all.rb_node;
152 while (node) {
153 struct smc_connection *cur = rb_entry(node,
154 struct smc_connection, alert_node);
155
156 if (cur->alert_token_local > token) {
157 node = node->rb_left;
158 } else {
159 if (cur->alert_token_local < token) {
160 node = node->rb_right;
161 } else {
162 res = cur;
163 break;
164 }
165 }
166 }
167
168 return res;
169}
170
171struct smc_sock;
172struct smc_clc_msg_accept_confirm;
173
174void smc_lgr_free(struct smc_link_group *lgr);
175void smc_lgr_terminate(struct smc_link_group *lgr);
176int smc_sndbuf_create(struct smc_sock *smc);
177int smc_rmb_create(struct smc_sock *smc);
178int smc_rmb_rtoken_handling(struct smc_connection *conn,
179 struct smc_clc_msg_accept_confirm *clc);
180
181#endif
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
new file mode 100644
index 000000000000..d2d01cf70224
--- /dev/null
+++ b/net/smc/smc_diag.c
@@ -0,0 +1,215 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Monitoring SMC transport protocol sockets
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/types.h>
14#include <linux/init.h>
15#include <linux/sock_diag.h>
16#include <linux/inet_diag.h>
17#include <linux/smc_diag.h>
18#include <net/netlink.h>
19#include <net/smc.h>
20
21#include "smc.h"
22#include "smc_core.h"
23
24static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw)
25{
26 sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
27 be16_to_cpu(((__be16 *)gid_raw)[0]),
28 be16_to_cpu(((__be16 *)gid_raw)[1]),
29 be16_to_cpu(((__be16 *)gid_raw)[2]),
30 be16_to_cpu(((__be16 *)gid_raw)[3]),
31 be16_to_cpu(((__be16 *)gid_raw)[4]),
32 be16_to_cpu(((__be16 *)gid_raw)[5]),
33 be16_to_cpu(((__be16 *)gid_raw)[6]),
34 be16_to_cpu(((__be16 *)gid_raw)[7]));
35}
36
37static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk)
38{
39 struct smc_sock *smc = smc_sk(sk);
40
41 r->diag_family = sk->sk_family;
42 if (!smc->clcsock)
43 return;
44 r->id.idiag_sport = htons(smc->clcsock->sk->sk_num);
45 r->id.idiag_dport = smc->clcsock->sk->sk_dport;
46 r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if;
47 sock_diag_save_cookie(sk, r->id.idiag_cookie);
48 memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
49 memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
50 r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr;
51 r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr;
52}
53
54static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
55 struct smc_diag_msg *r,
56 struct user_namespace *user_ns)
57{
58 if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown))
59 return 1;
60
61 r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
62 r->diag_inode = sock_i_ino(sk);
63 return 0;
64}
65
66static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
67 struct netlink_callback *cb,
68 const struct smc_diag_req *req,
69 struct nlattr *bc)
70{
71 struct smc_sock *smc = smc_sk(sk);
72 struct user_namespace *user_ns;
73 struct smc_diag_msg *r;
74 struct nlmsghdr *nlh;
75
76 nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
77 cb->nlh->nlmsg_type, sizeof(*r), NLM_F_MULTI);
78 if (!nlh)
79 return -EMSGSIZE;
80
81 r = nlmsg_data(nlh);
82 smc_diag_msg_common_fill(r, sk);
83 r->diag_state = sk->sk_state;
84 r->diag_fallback = smc->use_fallback;
85 user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk);
86 if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns))
87 goto errout;
88
89 if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && smc->conn.lgr) {
90 struct smc_connection *conn = &smc->conn;
91 struct smc_diag_conninfo cinfo = {
92 .token = conn->alert_token_local,
93 .sndbuf_size = conn->sndbuf_size,
94 .rmbe_size = conn->rmbe_size,
95 .peer_rmbe_size = conn->peer_rmbe_size,
96
97 .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap,
98 .rx_prod.count = conn->local_rx_ctrl.prod.count,
99 .rx_cons.wrap = conn->local_rx_ctrl.cons.wrap,
100 .rx_cons.count = conn->local_rx_ctrl.cons.count,
101
102 .tx_prod.wrap = conn->local_tx_ctrl.prod.wrap,
103 .tx_prod.count = conn->local_tx_ctrl.prod.count,
104 .tx_cons.wrap = conn->local_tx_ctrl.cons.wrap,
105 .tx_cons.count = conn->local_tx_ctrl.cons.count,
106
107 .tx_prod_flags =
108 *(u8 *)&conn->local_tx_ctrl.prod_flags,
109 .tx_conn_state_flags =
110 *(u8 *)&conn->local_tx_ctrl.conn_state_flags,
111 .rx_prod_flags = *(u8 *)&conn->local_rx_ctrl.prod_flags,
112 .rx_conn_state_flags =
113 *(u8 *)&conn->local_rx_ctrl.conn_state_flags,
114
115 .tx_prep.wrap = conn->tx_curs_prep.wrap,
116 .tx_prep.count = conn->tx_curs_prep.count,
117 .tx_sent.wrap = conn->tx_curs_sent.wrap,
118 .tx_sent.count = conn->tx_curs_sent.count,
119 .tx_fin.wrap = conn->tx_curs_fin.wrap,
120 .tx_fin.count = conn->tx_curs_fin.count,
121 };
122
123 if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0)
124 goto errout;
125 }
126
127 if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr) {
128 struct smc_diag_lgrinfo linfo = {
129 .role = smc->conn.lgr->role,
130 .lnk[0].ibport = smc->conn.lgr->lnk[0].ibport,
131 .lnk[0].link_id = smc->conn.lgr->lnk[0].link_id,
132 };
133
134 memcpy(linfo.lnk[0].ibname,
135 smc->conn.lgr->lnk[0].smcibdev->ibdev->name,
136 sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name));
137 smc_gid_be16_convert(linfo.lnk[0].gid,
138 smc->conn.lgr->lnk[0].gid.raw);
139 smc_gid_be16_convert(linfo.lnk[0].peer_gid,
140 smc->conn.lgr->lnk[0].peer_gid);
141
142 if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0)
143 goto errout;
144 }
145
146 nlmsg_end(skb, nlh);
147 return 0;
148
149errout:
150 nlmsg_cancel(skb, nlh);
151 return -EMSGSIZE;
152}
153
154static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
155{
156 struct net *net = sock_net(skb->sk);
157 struct nlattr *bc = NULL;
158 struct hlist_head *head;
159 struct sock *sk;
160 int rc = 0;
161
162 read_lock(&smc_proto.h.smc_hash->lock);
163 head = &smc_proto.h.smc_hash->ht;
164 if (hlist_empty(head))
165 goto out;
166
167 sk_for_each(sk, head) {
168 if (!net_eq(sock_net(sk), net))
169 continue;
170 rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc);
171 if (rc)
172 break;
173 }
174
175out:
176 read_unlock(&smc_proto.h.smc_hash->lock);
177 return rc;
178}
179
180static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
181{
182 struct net *net = sock_net(skb->sk);
183
184 if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY &&
185 h->nlmsg_flags & NLM_F_DUMP) {
186 {
187 struct netlink_dump_control c = {
188 .dump = smc_diag_dump,
189 .min_dump_alloc = SKB_WITH_OVERHEAD(32768),
190 };
191 return netlink_dump_start(net->diag_nlsk, skb, h, &c);
192 }
193 }
194 return 0;
195}
196
197static const struct sock_diag_handler smc_diag_handler = {
198 .family = AF_SMC,
199 .dump = smc_diag_handler_dump,
200};
201
202static int __init smc_diag_init(void)
203{
204 return sock_diag_register(&smc_diag_handler);
205}
206
207static void __exit smc_diag_exit(void)
208{
209 sock_diag_unregister(&smc_diag_handler);
210}
211
212module_init(smc_diag_init);
213module_exit(smc_diag_exit);
214MODULE_LICENSE("GPL");
215MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */);
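
A hedged, untested user-space sketch of how the dump interface added above could be queried over NETLINK_SOCK_DIAG. It assumes AF_SMC is 43 (as in the module alias) and that the uapi header <linux/smc_diag.h> from this patch set is installed:

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <linux/netlink.h>
    #include <linux/sock_diag.h>
    #include <linux/smc_diag.h>	/* uapi header introduced by this patch set */

    int main(void)
    {
    	struct {
    		struct nlmsghdr nlh;
    		struct smc_diag_req req;
    	} msg = {
    		.nlh = {
    			.nlmsg_len = sizeof(msg),
    			.nlmsg_type = SOCK_DIAG_BY_FAMILY,
    			.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
    		},
    		.req = { .diag_family = 43 /* assumed AF_SMC */ },
    	};
    	char buf[8192];
    	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
    	int len;

    	if (fd < 0 || send(fd, &msg, sizeof(msg), 0) < 0)
    		return 1;
    	while ((len = recv(fd, buf, sizeof(buf), 0)) > 0) {
    		struct nlmsghdr *h = (struct nlmsghdr *)buf;

    		for (; NLMSG_OK(h, len); h = NLMSG_NEXT(h, len)) {
    			struct smc_diag_msg *r;

    			if (h->nlmsg_type == NLMSG_DONE)
    				goto out;
    			if (h->nlmsg_type != SOCK_DIAG_BY_FAMILY)
    				continue;
    			r = NLMSG_DATA(h);
    			printf("state %u fallback %u\n",
    			       r->diag_state, r->diag_fallback);
    		}
    	}
    out:
    	close(fd);
    	return 0;
    }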
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
new file mode 100644
index 000000000000..e6743c008ac5
--- /dev/null
+++ b/net/smc/smc_ib.c
@@ -0,0 +1,466 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * IB infrastructure:
5 * Establish SMC-R as an Infiniband Client to be notified about added and
6 * removed IB devices of type RDMA.
7 * Determine device and port characteristics for these IB devices.
8 *
9 * Copyright IBM Corp. 2016
10 *
11 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
12 */
13
14#include <linux/random.h>
15#include <linux/workqueue.h>
16#include <rdma/ib_verbs.h>
17
18#include "smc_pnet.h"
19#include "smc_ib.h"
20#include "smc_core.h"
21#include "smc_wr.h"
22#include "smc.h"
23
24#define SMC_QP_MIN_RNR_TIMER 5
25#define SMC_QP_TIMEOUT 15 /* 4096 * 2 ** timeout usec */
26#define SMC_QP_RETRY_CNT 7 /* 7: infinite */
27#define SMC_QP_RNR_RETRY 7 /* 7: infinite */
28
29struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */
30 .lock = __SPIN_LOCK_UNLOCKED(smc_ib_devices.lock),
31 .list = LIST_HEAD_INIT(smc_ib_devices.list),
32};
33
34#define SMC_LOCAL_SYSTEMID_RESET "%%%%%%%"
35
36u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET; /* unique system
37 * identifier
38 */
39
40int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
41 struct ib_mr **mr)
42{
43 int rc;
44
45 if (*mr)
46 return 0; /* already done */
47
48 /* obtain unique key -
49 * next invocation of get_dma_mr returns a different key!
50 */
51 *mr = pd->device->get_dma_mr(pd, access_flags);
52 rc = PTR_ERR_OR_ZERO(*mr);
53 if (IS_ERR(*mr))
54 *mr = NULL;
55 return rc;
56}
57
58static int smc_ib_modify_qp_init(struct smc_link *lnk)
59{
60 struct ib_qp_attr qp_attr;
61
62 memset(&qp_attr, 0, sizeof(qp_attr));
63 qp_attr.qp_state = IB_QPS_INIT;
64 qp_attr.pkey_index = 0;
65 qp_attr.port_num = lnk->ibport;
66 qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE
67 | IB_ACCESS_REMOTE_WRITE;
68 return ib_modify_qp(lnk->roce_qp, &qp_attr,
69 IB_QP_STATE | IB_QP_PKEY_INDEX |
70 IB_QP_ACCESS_FLAGS | IB_QP_PORT);
71}
72
73static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
74{
75 enum ib_qp_attr_mask qp_attr_mask =
76 IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
77 IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
78 struct ib_qp_attr qp_attr;
79
80 memset(&qp_attr, 0, sizeof(qp_attr));
81 qp_attr.qp_state = IB_QPS_RTR;
82 qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
83 qp_attr.ah_attr.port_num = lnk->ibport;
84 qp_attr.ah_attr.ah_flags = IB_AH_GRH;
85 qp_attr.ah_attr.grh.hop_limit = 1;
86 memcpy(&qp_attr.ah_attr.grh.dgid, lnk->peer_gid,
87 sizeof(lnk->peer_gid));
88 memcpy(&qp_attr.ah_attr.dmac, lnk->peer_mac,
89 sizeof(lnk->peer_mac));
90 qp_attr.dest_qp_num = lnk->peer_qpn;
91 qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
92 qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
93 * requests
94 */
95 qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;
96
97 return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask);
98}
99
100int smc_ib_modify_qp_rts(struct smc_link *lnk)
101{
102 struct ib_qp_attr qp_attr;
103
104 memset(&qp_attr, 0, sizeof(qp_attr));
105 qp_attr.qp_state = IB_QPS_RTS;
106 qp_attr.timeout = SMC_QP_TIMEOUT; /* local ack timeout */
107 qp_attr.retry_cnt = SMC_QP_RETRY_CNT; /* retry count */
108 qp_attr.rnr_retry = SMC_QP_RNR_RETRY; /* RNR retries, 7=infinite */
109 qp_attr.sq_psn = lnk->psn_initial; /* starting send packet seq # */
110 qp_attr.max_rd_atomic = 1; /* # of outstanding RDMA reads and
111 * atomic ops allowed
112 */
113 return ib_modify_qp(lnk->roce_qp, &qp_attr,
114 IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
115 IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
116 IB_QP_MAX_QP_RD_ATOMIC);
117}
118
119int smc_ib_modify_qp_reset(struct smc_link *lnk)
120{
121 struct ib_qp_attr qp_attr;
122
123 memset(&qp_attr, 0, sizeof(qp_attr));
124 qp_attr.qp_state = IB_QPS_RESET;
125 return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
126}
127
128int smc_ib_ready_link(struct smc_link *lnk)
129{
130 struct smc_link_group *lgr =
131 container_of(lnk, struct smc_link_group, lnk[0]);
132 int rc = 0;
133
134 rc = smc_ib_modify_qp_init(lnk);
135 if (rc)
136 goto out;
137
138 rc = smc_ib_modify_qp_rtr(lnk);
139 if (rc)
140 goto out;
141 smc_wr_remember_qp_attr(lnk);
142 rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
143 IB_CQ_SOLICITED_MASK);
144 if (rc)
145 goto out;
146 rc = smc_wr_rx_post_init(lnk);
147 if (rc)
148 goto out;
149 smc_wr_remember_qp_attr(lnk);
150
151 if (lgr->role == SMC_SERV) {
152 rc = smc_ib_modify_qp_rts(lnk);
153 if (rc)
154 goto out;
155 smc_wr_remember_qp_attr(lnk);
156 }
157out:
158 return rc;
159}
160
161/* process context wrapper for might_sleep smc_ib_remember_port_attr */
162static void smc_ib_port_event_work(struct work_struct *work)
163{
164 struct smc_ib_device *smcibdev = container_of(
165 work, struct smc_ib_device, port_event_work);
166 u8 port_idx;
167
168 for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
169 smc_ib_remember_port_attr(smcibdev, port_idx + 1);
170 clear_bit(port_idx, &smcibdev->port_event_mask);
171 }
172}
173
174/* can be called in IRQ context */
175static void smc_ib_global_event_handler(struct ib_event_handler *handler,
176 struct ib_event *ibevent)
177{
178 struct smc_ib_device *smcibdev;
179 u8 port_idx;
180
181 smcibdev = container_of(handler, struct smc_ib_device, event_handler);
182 if (!smc_pnet_find_ib(smcibdev->ibdev->name))
183 return;
184
185 switch (ibevent->event) {
186 case IB_EVENT_PORT_ERR:
187 port_idx = ibevent->element.port_num - 1;
188 set_bit(port_idx, &smcibdev->port_event_mask);
189 schedule_work(&smcibdev->port_event_work);
190 /* fall through */
191 case IB_EVENT_DEVICE_FATAL:
192 /* tbd in follow-on patch:
193 * abnormal close of corresponding connections
194 */
195 break;
196 case IB_EVENT_PORT_ACTIVE:
197 port_idx = ibevent->element.port_num - 1;
198 set_bit(port_idx, &smcibdev->port_event_mask);
199 schedule_work(&smcibdev->port_event_work);
200 break;
201 default:
202 break;
203 }
204}
205
206void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
207{
208 ib_dealloc_pd(lnk->roce_pd);
209 lnk->roce_pd = NULL;
210}
211
212int smc_ib_create_protection_domain(struct smc_link *lnk)
213{
214 int rc;
215
216 lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
217 rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
218 if (IS_ERR(lnk->roce_pd))
219 lnk->roce_pd = NULL;
220 return rc;
221}
222
223static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
224{
225 switch (ibevent->event) {
226 case IB_EVENT_DEVICE_FATAL:
227 case IB_EVENT_GID_CHANGE:
228 case IB_EVENT_PORT_ERR:
229 case IB_EVENT_QP_ACCESS_ERR:
230 /* tbd in follow-on patch:
231 * abnormal close of corresponding connections
232 */
233 break;
234 default:
235 break;
236 }
237}
238
239void smc_ib_destroy_queue_pair(struct smc_link *lnk)
240{
241 ib_destroy_qp(lnk->roce_qp);
242 lnk->roce_qp = NULL;
243}
244
245/* create a queue pair within the protection domain for a link */
246int smc_ib_create_queue_pair(struct smc_link *lnk)
247{
248 struct ib_qp_init_attr qp_attr = {
249 .event_handler = smc_ib_qp_event_handler,
250 .qp_context = lnk,
251 .send_cq = lnk->smcibdev->roce_cq_send,
252 .recv_cq = lnk->smcibdev->roce_cq_recv,
253 .srq = NULL,
254 .cap = {
255 .max_send_wr = SMC_WR_BUF_CNT,
256 /* include unsolicited rdma_writes as well,
257 * there are max. 2 RDMA_WRITE per 1 WR_SEND
258 */
259 .max_recv_wr = SMC_WR_BUF_CNT * 3,
260 .max_send_sge = SMC_IB_MAX_SEND_SGE,
261 .max_recv_sge = 1,
262 .max_inline_data = SMC_WR_TX_SIZE,
263 },
264 .sq_sig_type = IB_SIGNAL_REQ_WR,
265 .qp_type = IB_QPT_RC,
266 };
267 int rc;
268
269 lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
270 rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
271 if (IS_ERR(lnk->roce_qp))
272 lnk->roce_qp = NULL;
273 else
274 smc_wr_remember_qp_attr(lnk);
275 return rc;
276}
277
278/* map a new TX or RX buffer to DMA */
279int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
280 struct smc_buf_desc *buf_slot,
281 enum dma_data_direction data_direction)
282{
283 int rc = 0;
284
285 if (buf_slot->dma_addr[SMC_SINGLE_LINK])
286 return rc; /* already mapped */
287 buf_slot->dma_addr[SMC_SINGLE_LINK] =
288 ib_dma_map_single(smcibdev->ibdev, buf_slot->cpu_addr,
289 buf_size, data_direction);
290 if (ib_dma_mapping_error(smcibdev->ibdev,
291 buf_slot->dma_addr[SMC_SINGLE_LINK]))
292 rc = -EIO;
293 return rc;
294}
295
296void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int buf_size,
297 struct smc_buf_desc *buf_slot,
298 enum dma_data_direction data_direction)
299{
300 if (!buf_slot->dma_addr[SMC_SINGLE_LINK])
301 return; /* already unmapped */
302 ib_dma_unmap_single(smcibdev->ibdev, *buf_slot->dma_addr, buf_size,
303 data_direction);
304 buf_slot->dma_addr[SMC_SINGLE_LINK] = 0;
305}
306
307static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
308{
309 struct net_device *ndev;
310 int rc;
311
312 rc = ib_query_gid(smcibdev->ibdev, ibport, 0,
313 &smcibdev->gid[ibport - 1], NULL);
314 /* the SMC protocol requires specification of the roce MAC address;
315 * if net_device cannot be determined, it can be derived from gid 0
316 */
317 ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport);
318 if (ndev) {
319 memcpy(&smcibdev->mac[ibport - 1][0], ndev->dev_addr, ETH_ALEN);
320 } else if (!rc) {
321 memcpy(&smcibdev->mac[ibport - 1][0],
322 &smcibdev->gid[ibport - 1].raw[8], 3);
323 memcpy(&smcibdev->mac[ibport - 1][3],
324 &smcibdev->gid[ibport - 1].raw[13], 3);
325 smcibdev->mac[ibport - 1][0] &= ~0x02;
326 }
327 return rc;
328}
329
330/* Create an identifier unique for this instance of SMC-R.
331 * The MAC-address of the first active registered IB device
332 * plus a random 2-byte number is used to create this identifier.
333 * This name is delivered to the peer during connection initialization.
334 */
335static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
336 u8 ibport)
337{
338 memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
339 sizeof(smcibdev->mac[ibport - 1]));
340 get_random_bytes(&local_systemid[0], 2);
341}
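
For illustration, a small user-space sketch of the system-identifier layout described above: two random bytes followed by the six-byte port MAC. SMC_SYSTEMID_LEN is assumed to be 8, and the MAC value below is a placeholder:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <time.h>

    #define SYSTEMID_LEN 8	/* assumed SMC_SYSTEMID_LEN: 2 random bytes + 6 MAC bytes */

    int main(void)
    {
    	unsigned char mac[6] = { 0x02, 0x00, 0x00, 0xaa, 0xbb, 0xcc };	/* placeholder */
    	unsigned char systemid[SYSTEMID_LEN];

    	srand(time(NULL));
    	systemid[0] = rand() & 0xff;	/* kernel uses get_random_bytes() */
    	systemid[1] = rand() & 0xff;
    	memcpy(&systemid[2], mac, sizeof(mac));

    	for (int i = 0; i < SYSTEMID_LEN; i++)
    		printf("%02x", systemid[i]);
    	printf("\n");
    	return 0;
    }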
342
343bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
344{
345 return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
346}
347
348int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
349{
350 int rc;
351
352 memset(&smcibdev->pattr[ibport - 1], 0,
353 sizeof(smcibdev->pattr[ibport - 1]));
354 rc = ib_query_port(smcibdev->ibdev, ibport,
355 &smcibdev->pattr[ibport - 1]);
356 if (rc)
357 goto out;
358 rc = smc_ib_fill_gid_and_mac(smcibdev, ibport);
359 if (rc)
360 goto out;
361 if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET,
362 sizeof(local_systemid)) &&
363 smc_ib_port_active(smcibdev, ibport))
364 /* create unique system identifier */
365 smc_ib_define_local_systemid(smcibdev, ibport);
366out:
367 return rc;
368}
369
370long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
371{
372 struct ib_cq_init_attr cqattr = {
373 .cqe = SMC_WR_MAX_CQE, .comp_vector = 0 };
374 long rc;
375
376 smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
377 smc_wr_tx_cq_handler, NULL,
378 smcibdev, &cqattr);
379 rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
380 if (IS_ERR(smcibdev->roce_cq_send)) {
381 smcibdev->roce_cq_send = NULL;
382 return rc;
383 }
384 smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
385 smc_wr_rx_cq_handler, NULL,
386 smcibdev, &cqattr);
387 rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
388 if (IS_ERR(smcibdev->roce_cq_recv)) {
389 smcibdev->roce_cq_recv = NULL;
390 goto err;
391 }
392 INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
393 smc_ib_global_event_handler);
394 ib_register_event_handler(&smcibdev->event_handler);
395 smc_wr_add_dev(smcibdev);
396 smcibdev->initialized = 1;
397 return rc;
398
399err:
400 ib_destroy_cq(smcibdev->roce_cq_send);
401 return rc;
402}
403
404static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
405{
406 if (!smcibdev->initialized)
407 return;
408 smc_wr_remove_dev(smcibdev);
409 ib_unregister_event_handler(&smcibdev->event_handler);
410 ib_destroy_cq(smcibdev->roce_cq_recv);
411 ib_destroy_cq(smcibdev->roce_cq_send);
412}
413
414static struct ib_client smc_ib_client;
415
416/* callback function for ib_register_client() */
417static void smc_ib_add_dev(struct ib_device *ibdev)
418{
419 struct smc_ib_device *smcibdev;
420
421 if (ibdev->node_type != RDMA_NODE_IB_CA)
422 return;
423
424 smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
425 if (!smcibdev)
426 return;
427
428 smcibdev->ibdev = ibdev;
429 INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
430
431 spin_lock(&smc_ib_devices.lock);
432 list_add_tail(&smcibdev->list, &smc_ib_devices.list);
433 spin_unlock(&smc_ib_devices.lock);
434 ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
435}
436
437/* callback function for ib_register_client() */
438static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
439{
440 struct smc_ib_device *smcibdev;
441
442 smcibdev = ib_get_client_data(ibdev, &smc_ib_client);
443 ib_set_client_data(ibdev, &smc_ib_client, NULL);
444 spin_lock(&smc_ib_devices.lock);
445 list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
446 spin_unlock(&smc_ib_devices.lock);
447 smc_pnet_remove_by_ibdev(smcibdev);
448 smc_ib_cleanup_per_ibdev(smcibdev);
449 kfree(smcibdev);
450}
451
452static struct ib_client smc_ib_client = {
453 .name = "smc_ib",
454 .add = smc_ib_add_dev,
455 .remove = smc_ib_remove_dev,
456};
457
458int __init smc_ib_register_client(void)
459{
460 return ib_register_client(&smc_ib_client);
461}
462
463void smc_ib_unregister_client(void)
464{
465 ib_unregister_client(&smc_ib_client);
466}
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
new file mode 100644
index 000000000000..a95f74bb5569
--- /dev/null
+++ b/net/smc/smc_ib.h
@@ -0,0 +1,71 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Definitions for IB environment
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
9 */
10
11#ifndef _SMC_IB_H
12#define _SMC_IB_H
13
14#include <linux/if_ether.h>
15#include <rdma/ib_verbs.h>
16
17#define SMC_MAX_PORTS 2 /* Max # of ports */
18#define SMC_GID_SIZE sizeof(union ib_gid)
19
20#define SMC_IB_MAX_SEND_SGE 2
21
22struct smc_ib_devices { /* list of smc ib devices definition */
23 struct list_head list;
24 spinlock_t lock; /* protects list of smc ib devices */
25};
26
27extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */
28
29struct smc_ib_device { /* ib-device infos for smc */
30 struct list_head list;
31 struct ib_device *ibdev;
32 struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */
33 struct ib_event_handler event_handler; /* global ib_event handler */
34 struct ib_cq *roce_cq_send; /* send completion queue */
35 struct ib_cq *roce_cq_recv; /* recv completion queue */
36 struct tasklet_struct send_tasklet; /* called by send cq handler */
37 struct tasklet_struct recv_tasklet; /* called by recv cq handler */
38 char mac[SMC_MAX_PORTS][ETH_ALEN];
39 /* mac address per port*/
40 union ib_gid gid[SMC_MAX_PORTS]; /* gid per port */
41 u8 initialized : 1; /* ib dev CQ, evthdl done */
42 struct work_struct port_event_work;
43 unsigned long port_event_mask;
44};
45
46struct smc_buf_desc;
47struct smc_link;
48
49int smc_ib_register_client(void) __init;
50void smc_ib_unregister_client(void);
51bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport);
52int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport);
53int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
54 struct smc_buf_desc *buf_slot,
55 enum dma_data_direction data_direction);
56void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int bufsize,
57 struct smc_buf_desc *buf_slot,
58 enum dma_data_direction data_direction);
59void smc_ib_dealloc_protection_domain(struct smc_link *lnk);
60int smc_ib_create_protection_domain(struct smc_link *lnk);
61void smc_ib_destroy_queue_pair(struct smc_link *lnk);
62int smc_ib_create_queue_pair(struct smc_link *lnk);
63int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
64 struct ib_mr **mr);
65int smc_ib_ready_link(struct smc_link *lnk);
66int smc_ib_modify_qp_rts(struct smc_link *lnk);
67int smc_ib_modify_qp_reset(struct smc_link *lnk);
68long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev);
69
70
71#endif
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
new file mode 100644
index 000000000000..c2f9165d13ef
--- /dev/null
+++ b/net/smc/smc_llc.c
@@ -0,0 +1,158 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Link Layer Control (LLC)
5 *
6 * For now, we only support the necessary "confirm link" functionality
7 * which happens for the first RoCE link after successful CLC handshake.
8 *
9 * Copyright IBM Corp. 2016
10 *
11 * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com>
12 * Ursula Braun <ubraun@linux.vnet.ibm.com>
13 */
14
15#include <net/tcp.h>
16#include <rdma/ib_verbs.h>
17
18#include "smc.h"
19#include "smc_core.h"
20#include "smc_clc.h"
21#include "smc_llc.h"
22
23/********************************** send *************************************/
24
25struct smc_llc_tx_pend {
26};
27
28/* handler for send/transmission completion of an LLC msg */
29static void smc_llc_tx_handler(struct smc_wr_tx_pend_priv *pend,
30 struct smc_link *link,
31 enum ib_wc_status wc_status)
32{
33 /* future work: handle wc_status error for recovery and failover */
34}
35
36/**
37 * smc_llc_add_pending_send() - add LLC control message to pending WQE transmits
38 * @link: Pointer to SMC link used for sending LLC control message.
39 * @wr_buf: Out variable returning pointer to work request payload buffer.
40 * @pend: Out variable returning pointer to private pending WR tracking.
41 * It's the context the transmit complete handler will get.
42 *
43 * Reserves and pre-fills an entry for a pending work request send/tx.
44 * Used by mid-level smc_llc_send_msg() to prepare for later actual send/tx.
45 * Can sleep due to smc_wr_tx_get_free_slot (if not in softirq context).
46 *
47 * Return: 0 on success, otherwise an error value.
48 */
49static int smc_llc_add_pending_send(struct smc_link *link,
50 struct smc_wr_buf **wr_buf,
51 struct smc_wr_tx_pend_priv **pend)
52{
53 int rc;
54
55 rc = smc_wr_tx_get_free_slot(link, smc_llc_tx_handler, wr_buf, pend);
56 if (rc < 0)
57 return rc;
58 BUILD_BUG_ON_MSG(
59 sizeof(union smc_llc_msg) > SMC_WR_BUF_SIZE,
60 "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_llc_msg)");
61 BUILD_BUG_ON_MSG(
62 sizeof(union smc_llc_msg) != SMC_WR_TX_SIZE,
63 "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_llc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
64 BUILD_BUG_ON_MSG(
65 sizeof(struct smc_llc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
66 "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_llc_tx_pend)");
67 return 0;
68}
69
70/* high-level API to send LLC confirm link */
71int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[],
72 union ib_gid *gid,
73 enum smc_llc_reqresp reqresp)
74{
75 struct smc_link_group *lgr = container_of(link, struct smc_link_group,
76 lnk[SMC_SINGLE_LINK]);
77 struct smc_llc_msg_confirm_link *confllc;
78 struct smc_wr_tx_pend_priv *pend;
79 struct smc_wr_buf *wr_buf;
80 int rc;
81
82 rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
83 if (rc)
84 return rc;
85 confllc = (struct smc_llc_msg_confirm_link *)wr_buf;
86 memset(confllc, 0, sizeof(*confllc));
87 confllc->hd.common.type = SMC_LLC_CONFIRM_LINK;
88 confllc->hd.length = sizeof(struct smc_llc_msg_confirm_link);
89 if (reqresp == SMC_LLC_RESP)
90 confllc->hd.flags |= SMC_LLC_FLAG_RESP;
91 memcpy(confllc->sender_mac, mac, ETH_ALEN);
92 memcpy(confllc->sender_gid, gid, SMC_GID_SIZE);
93 hton24(confllc->sender_qp_num, link->roce_qp->qp_num);
94 /* confllc->link_num = SMC_SINGLE_LINK; already done by memset above */
95 memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE);
96 confllc->max_links = SMC_LINKS_PER_LGR_MAX;
97 /* send llc message */
98 rc = smc_wr_tx_send(link, pend);
99 return rc;
100}
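
sender_qp_num above is filled by hton24(), which is defined elsewhere in this patch set and is assumed here to store the 24-bit QP number in network byte order. A minimal stand-alone sketch of that packing:

    #include <stdio.h>

    /* pack a 24-bit value big-endian, as assumed for hton24() */
    static void put_be24(unsigned char dst[3], unsigned int val)
    {
    	dst[0] = (val >> 16) & 0xff;
    	dst[1] = (val >> 8) & 0xff;
    	dst[2] = val & 0xff;
    }

    int main(void)
    {
    	unsigned char qpn[3];

    	put_be24(qpn, 0x012345);
    	printf("%02x %02x %02x\n", qpn[0], qpn[1], qpn[2]);	/* 01 23 45 */
    	return 0;
    }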
101
102/********************************* receive ***********************************/
103
104static void smc_llc_rx_confirm_link(struct smc_link *link,
105 struct smc_llc_msg_confirm_link *llc)
106{
107 struct smc_link_group *lgr;
108
109 lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
110 if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
111 if (lgr->role == SMC_SERV)
112 complete(&link->llc_confirm_resp);
113 } else {
114 if (lgr->role == SMC_CLNT) {
115 link->link_id = llc->link_num;
116 complete(&link->llc_confirm);
117 }
118 }
119}
120
121static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
122{
123 struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
124 union smc_llc_msg *llc = buf;
125
126 if (wc->byte_len < sizeof(*llc))
127 return; /* short message */
128 if (llc->raw.hdr.length != sizeof(*llc))
129 return; /* invalid message */
130 if (llc->raw.hdr.common.type == SMC_LLC_CONFIRM_LINK)
131 smc_llc_rx_confirm_link(link, &llc->confirm_link);
132}
133
134/***************************** init, exit, misc ******************************/
135
136static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
137 {
138 .handler = smc_llc_rx_handler,
139 .type = SMC_LLC_CONFIRM_LINK
140 },
141 {
142 .handler = NULL,
143 }
144};
145
146int __init smc_llc_init(void)
147{
148 struct smc_wr_rx_handler *handler;
149 int rc = 0;
150
151 for (handler = smc_llc_rx_handlers; handler->handler; handler++) {
152 INIT_HLIST_NODE(&handler->list);
153 rc = smc_wr_rx_register_handler(handler);
154 if (rc)
155 break;
156 }
157 return rc;
158}
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
new file mode 100644
index 000000000000..b472f853953a
--- /dev/null
+++ b/net/smc/smc_llc.h
@@ -0,0 +1,63 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Definitions for LLC (link layer control) message handling
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com>
9 * Ursula Braun <ubraun@linux.vnet.ibm.com>
10 */
11
12#ifndef SMC_LLC_H
13#define SMC_LLC_H
14
15#include "smc_wr.h"
16
17#define SMC_LLC_FLAG_RESP 0x80
18
19#define SMC_LLC_WAIT_FIRST_TIME (5 * HZ)
20
21enum smc_llc_reqresp {
22 SMC_LLC_REQ,
23 SMC_LLC_RESP
24};
25
26enum smc_llc_msg_type {
27 SMC_LLC_CONFIRM_LINK = 0x01,
28};
29
30#define SMC_LLC_DATA_LEN 40
31
32struct smc_llc_hdr {
33 struct smc_wr_rx_hdr common;
34 u8 length; /* 44 */
35 u8 reserved;
36 u8 flags;
37};
38
39struct smc_llc_msg_confirm_link { /* type 0x01 */
40 struct smc_llc_hdr hd;
41 u8 sender_mac[ETH_ALEN];
42 u8 sender_gid[SMC_GID_SIZE];
43 u8 sender_qp_num[3];
44 u8 link_num;
45 u8 link_uid[SMC_LGR_ID_SIZE];
46 u8 max_links;
47 u8 reserved[9];
48};
49
50union smc_llc_msg {
51 struct smc_llc_msg_confirm_link confirm_link;
52 struct {
53 struct smc_llc_hdr hdr;
54 u8 data[SMC_LLC_DATA_LEN];
55 } raw;
56};
57
58/* transmit */
59int smc_llc_send_confirm_link(struct smc_link *lnk, u8 mac[], union ib_gid *gid,
60 enum smc_llc_reqresp reqresp);
61int smc_llc_init(void) __init;
62
63#endif /* SMC_LLC_H */
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
new file mode 100644
index 000000000000..9d3e7fb8348d
--- /dev/null
+++ b/net/smc/smc_pnet.c
@@ -0,0 +1,534 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Generic netlink support functions to configure an SMC-R PNET table
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com>
9 */
10
11#include <linux/module.h>
12#include <linux/list.h>
13#include <linux/ctype.h>
14#include <net/netlink.h>
15#include <net/genetlink.h>
16
17#include <uapi/linux/if.h>
18#include <uapi/linux/smc.h>
19
20#include <rdma/ib_verbs.h>
21
22#include "smc_pnet.h"
23#include "smc_ib.h"
24
25#define SMC_MAX_PNET_ID_LEN 16 /* Max. length of PNET id */
26
27static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = {
28 [SMC_PNETID_NAME] = {
29 .type = NLA_NUL_STRING,
30 .len = SMC_MAX_PNET_ID_LEN - 1
31 },
32 [SMC_PNETID_ETHNAME] = {
33 .type = NLA_NUL_STRING,
34 .len = IFNAMSIZ - 1
35 },
36 [SMC_PNETID_IBNAME] = {
37 .type = NLA_NUL_STRING,
38 .len = IB_DEVICE_NAME_MAX - 1
39 },
40 [SMC_PNETID_IBPORT] = { .type = NLA_U8 }
41};
42
43static struct genl_family smc_pnet_nl_family;
44
45/**
46 * struct smc_pnettable - SMC PNET table anchor
47 * @lock: Lock for list action
48 * @pnetlist: List of PNETIDs
49 */
50static struct smc_pnettable {
51 rwlock_t lock;
52 struct list_head pnetlist;
53} smc_pnettable = {
54 .pnetlist = LIST_HEAD_INIT(smc_pnettable.pnetlist),
55 .lock = __RW_LOCK_UNLOCKED(smc_pnettable.lock)
56};
57
58/**
59 * struct smc_pnetentry - pnet identifier name entry
60 * @list: List node.
61 * @pnet_name: Pnet identifier name
62 * @ndev: pointer to network device.
63 * @smcibdev: Pointer to IB device.
64 */
65struct smc_pnetentry {
66 struct list_head list;
67 char pnet_name[SMC_MAX_PNET_ID_LEN + 1];
68 struct net_device *ndev;
69 struct smc_ib_device *smcibdev;
70 u8 ib_port;
71};
72
73/* Check if two RDMA device entries are identical. Use device name and port
74 * number for comparison.
75 */
76static bool smc_pnet_same_ibname(struct smc_pnetentry *pnetelem, char *ibname,
77 u8 ibport)
78{
79 return pnetelem->ib_port == ibport &&
80 !strncmp(pnetelem->smcibdev->ibdev->name, ibname,
81 sizeof(pnetelem->smcibdev->ibdev->name));
82}
83
84/* Find a pnetid in the pnet table.
85 */
86static struct smc_pnetentry *smc_pnet_find_pnetid(char *pnet_name)
87{
88 struct smc_pnetentry *pnetelem, *found_pnetelem = NULL;
89
90 read_lock(&smc_pnettable.lock);
91 list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
92 if (!strncmp(pnetelem->pnet_name, pnet_name,
93 sizeof(pnetelem->pnet_name))) {
94 found_pnetelem = pnetelem;
95 break;
96 }
97 }
98 read_unlock(&smc_pnettable.lock);
99 return found_pnetelem;
100}
101
102/* Remove a pnetid from the pnet table.
103 */
104static int smc_pnet_remove_by_pnetid(char *pnet_name)
105{
106 struct smc_pnetentry *pnetelem, *tmp_pe;
107 int rc = -ENOENT;
108
109 write_lock(&smc_pnettable.lock);
110 list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
111 list) {
112 if (!strncmp(pnetelem->pnet_name, pnet_name,
113 sizeof(pnetelem->pnet_name))) {
114 list_del(&pnetelem->list);
115 dev_put(pnetelem->ndev);
116 kfree(pnetelem);
117 rc = 0;
118 break;
119 }
120 }
121 write_unlock(&smc_pnettable.lock);
122 return rc;
123}
124
125/* Remove a pnet entry mentioning a given network device from the pnet table.
126 */
127static int smc_pnet_remove_by_ndev(struct net_device *ndev)
128{
129 struct smc_pnetentry *pnetelem, *tmp_pe;
130 int rc = -ENOENT;
131
132 write_lock(&smc_pnettable.lock);
133 list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
134 list) {
135 if (pnetelem->ndev == ndev) {
136 list_del(&pnetelem->list);
137 dev_put(pnetelem->ndev);
138 kfree(pnetelem);
139 rc = 0;
140 break;
141 }
142 }
143 write_unlock(&smc_pnettable.lock);
144 return rc;
145}
146
147/* Remove a pnet entry mentioning a given ib device from the pnet table.
148 */
149int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev)
150{
151 struct smc_pnetentry *pnetelem, *tmp_pe;
152 int rc = -ENOENT;
153
154 write_lock(&smc_pnettable.lock);
155 list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
156 list) {
157 if (pnetelem->smcibdev == ibdev) {
158 list_del(&pnetelem->list);
159 dev_put(pnetelem->ndev);
160 kfree(pnetelem);
161 rc = 0;
162 break;
163 }
164 }
165 write_unlock(&smc_pnettable.lock);
166 return rc;
167}
168
169/* Append a pnetid to the end of the pnet table if not already on this list.
170 */
171static int smc_pnet_enter(struct smc_pnetentry *new_pnetelem)
172{
173 struct smc_pnetentry *pnetelem;
174 int rc = -EEXIST;
175
176 write_lock(&smc_pnettable.lock);
177 list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
178 if (!strncmp(pnetelem->pnet_name, new_pnetelem->pnet_name,
179 sizeof(new_pnetelem->pnet_name)) ||
180 !strncmp(pnetelem->ndev->name, new_pnetelem->ndev->name,
181 sizeof(new_pnetelem->ndev->name)) ||
182 smc_pnet_same_ibname(pnetelem,
183 new_pnetelem->smcibdev->ibdev->name,
184 new_pnetelem->ib_port))
185 goto found;
186 }
187 list_add_tail(&new_pnetelem->list, &smc_pnettable.pnetlist);
188 rc = 0;
189found:
190 write_unlock(&smc_pnettable.lock);
191 return rc;
192}
193
194/* The limit for a pnetid is 16 characters.
195 * Valid characters are (single-byte character set) a-z, A-Z and 0-9.
196 * Lower case letters are converted to upper case.
197 * Interior blanks are not allowed.
198 */
199static bool smc_pnetid_valid(const char *pnet_name, char *pnetid)
200{
201 char *bf = skip_spaces(pnet_name);
202 size_t len = strlen(bf);
203 char *end = bf + len;
204
205 if (!len)
206 return false;
207 while (--end >= bf && isspace(*end))
208 ;
209 if (end - bf >= SMC_MAX_PNET_ID_LEN)
210 return false;
211 while (bf <= end) {
212 if (!isalnum(*bf))
213 return false;
214 *pnetid++ = islower(*bf) ? toupper(*bf) : *bf;
215 bf++;
216 }
217 *pnetid = '\0';
218 return true;
219}
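/* A standalone sketch of the pnetid rule above, assuming plain ISO C: strip
 * leading/trailing blanks, accept only [A-Za-z0-9], upper-case the result and
 * limit it to 16 characters. Names are illustrative only.
 */
#include <ctype.h>
#include <stdbool.h>
#include <stddef.h>

#define PNETID_MAX 16

static bool pnetid_valid(const char *in, char out[PNETID_MAX + 1])
{
	size_t n = 0;

	while (isspace((unsigned char)*in))		/* leading blanks */
		in++;
	for (; *in && !isspace((unsigned char)*in); in++) {
		if (!isalnum((unsigned char)*in) || n >= PNETID_MAX)
			return false;
		out[n++] = toupper((unsigned char)*in);
	}
	while (isspace((unsigned char)*in))		/* trailing blanks */
		in++;
	out[n] = '\0';
	return n > 0 && *in == '\0';	/* non-empty, no interior blanks */
}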
220
221/* Find an InfiniBand device by a given name. The device might not exist. */
222struct smc_ib_device *smc_pnet_find_ib(char *ib_name)
223{
224 struct smc_ib_device *ibdev;
225
226 spin_lock(&smc_ib_devices.lock);
227 list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
228 if (!strncmp(ibdev->ibdev->name, ib_name,
229 sizeof(ibdev->ibdev->name))) {
230 goto out;
231 }
232 }
233 ibdev = NULL;
234out:
235 spin_unlock(&smc_ib_devices.lock);
236 return ibdev;
237}
238
239/* Parse the supplied netlink attributes and fill a pnetentry structure.
240 * For ethernet and infiniband device names verify that the devices exist.
241 */
242static int smc_pnet_fill_entry(struct net *net, struct smc_pnetentry *pnetelem,
243 struct nlattr *tb[])
244{
245 char *string, *ibname = NULL;
246 int rc = 0;
247
248 memset(pnetelem, 0, sizeof(*pnetelem));
249 INIT_LIST_HEAD(&pnetelem->list);
250 if (tb[SMC_PNETID_NAME]) {
251 string = (char *)nla_data(tb[SMC_PNETID_NAME]);
252 if (!smc_pnetid_valid(string, pnetelem->pnet_name)) {
253 rc = -EINVAL;
254 goto error;
255 }
256 }
257 if (tb[SMC_PNETID_ETHNAME]) {
258 string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]);
259 pnetelem->ndev = dev_get_by_name(net, string);
260 if (!pnetelem->ndev)
261 return -ENOENT;
262 }
263 if (tb[SMC_PNETID_IBNAME]) {
264 ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]);
265 ibname = strim(ibname);
266 pnetelem->smcibdev = smc_pnet_find_ib(ibname);
267 if (!pnetelem->smcibdev) {
268 rc = -ENOENT;
269 goto error;
270 }
271 }
272 if (tb[SMC_PNETID_IBPORT]) {
273 pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]);
274 if (pnetelem->ib_port > SMC_MAX_PORTS) {
275 rc = -EINVAL;
276 goto error;
277 }
278 }
279 return 0;
280
281error:
282 if (pnetelem->ndev)
283 dev_put(pnetelem->ndev);
284 return rc;
285}
286
287/* Convert an smc_pnetentry to a netlink attribute sequence */
288static int smc_pnet_set_nla(struct sk_buff *msg, struct smc_pnetentry *pnetelem)
289{
290 if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name) ||
291 nla_put_string(msg, SMC_PNETID_ETHNAME, pnetelem->ndev->name) ||
292 nla_put_string(msg, SMC_PNETID_IBNAME,
293 pnetelem->smcibdev->ibdev->name) ||
294 nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port))
295 return -1;
296 return 0;
297}
298
299/* Retrieve one PNETID entry */
300static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info)
301{
302 struct smc_pnetentry *pnetelem;
303 struct sk_buff *msg;
304 void *hdr;
305 int rc;
306
307 pnetelem = smc_pnet_find_pnetid(
308 (char *)nla_data(info->attrs[SMC_PNETID_NAME]));
309 if (!pnetelem)
310 return -ENOENT;
311 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
312 if (!msg)
313 return -ENOMEM;
314
315 hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
316 &smc_pnet_nl_family, 0, SMC_PNETID_GET);
317 if (!hdr) {
318 rc = -EMSGSIZE;
319 goto err_out;
320 }
321
322 if (smc_pnet_set_nla(msg, pnetelem)) {
323 rc = -ENOBUFS;
324 goto err_out;
325 }
326
327 genlmsg_end(msg, hdr);
328 return genlmsg_reply(msg, info);
329
330err_out:
331 nlmsg_free(msg);
332 return rc;
333}
334
335static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info)
336{
337 struct net *net = genl_info_net(info);
338 struct smc_pnetentry *pnetelem;
339 int rc;
340
341 pnetelem = kzalloc(sizeof(*pnetelem), GFP_KERNEL);
342 if (!pnetelem)
343 return -ENOMEM;
344 rc = smc_pnet_fill_entry(net, pnetelem, info->attrs);
345 if (!rc)
346 rc = smc_pnet_enter(pnetelem);
347 if (rc) {
348 kfree(pnetelem);
349 return rc;
350 }
351 rc = smc_ib_remember_port_attr(pnetelem->smcibdev, pnetelem->ib_port);
352 if (rc)
353 smc_pnet_remove_by_pnetid(pnetelem->pnet_name);
354 return rc;
355}
356
357static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info)
358{
359 return smc_pnet_remove_by_pnetid(
360 (char *)nla_data(info->attrs[SMC_PNETID_NAME]));
361}
362
363static int smc_pnet_dump_start(struct netlink_callback *cb)
364{
365 cb->args[0] = 0;
366 return 0;
367}
368
369static int smc_pnet_dumpinfo(struct sk_buff *skb,
370 u32 portid, u32 seq, u32 flags,
371 struct smc_pnetentry *pnetelem)
372{
373 void *hdr;
374
375 hdr = genlmsg_put(skb, portid, seq, &smc_pnet_nl_family,
376 flags, SMC_PNETID_GET);
377 if (!hdr)
378 return -ENOMEM;
379 if (smc_pnet_set_nla(skb, pnetelem) < 0) {
380 genlmsg_cancel(skb, hdr);
381 return -EMSGSIZE;
382 }
383 genlmsg_end(skb, hdr);
384 return 0;
385}
386
387static int smc_pnet_dump(struct sk_buff *skb, struct netlink_callback *cb)
388{
389 struct smc_pnetentry *pnetelem;
390 int idx = 0;
391
392 read_lock(&smc_pnettable.lock);
393 list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
394 if (idx++ < cb->args[0])
395 continue;
396 if (smc_pnet_dumpinfo(skb, NETLINK_CB(cb->skb).portid,
397 cb->nlh->nlmsg_seq, NLM_F_MULTI,
398 pnetelem)) {
399 --idx;
400 break;
401 }
402 }
403 cb->args[0] = idx;
404 read_unlock(&smc_pnettable.lock);
405 return skb->len;
406}
407
408/* Remove and delete all pnetids from pnet table.
409 */
410static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info)
411{
412 struct smc_pnetentry *pnetelem, *tmp_pe;
413
414 write_lock(&smc_pnettable.lock);
415 list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
416 list) {
417 list_del(&pnetelem->list);
418 dev_put(pnetelem->ndev);
419 kfree(pnetelem);
420 }
421 write_unlock(&smc_pnettable.lock);
422 return 0;
423}
424
425/* SMC_PNETID generic netlink operation definition */
426static const struct genl_ops smc_pnet_ops[] = {
427 {
428 .cmd = SMC_PNETID_GET,
429 .flags = GENL_ADMIN_PERM,
430 .policy = smc_pnet_policy,
431 .doit = smc_pnet_get,
432 .dumpit = smc_pnet_dump,
433 .start = smc_pnet_dump_start
434 },
435 {
436 .cmd = SMC_PNETID_ADD,
437 .flags = GENL_ADMIN_PERM,
438 .policy = smc_pnet_policy,
439 .doit = smc_pnet_add
440 },
441 {
442 .cmd = SMC_PNETID_DEL,
443 .flags = GENL_ADMIN_PERM,
444 .policy = smc_pnet_policy,
445 .doit = smc_pnet_del
446 },
447 {
448 .cmd = SMC_PNETID_FLUSH,
449 .flags = GENL_ADMIN_PERM,
450 .policy = smc_pnet_policy,
451 .doit = smc_pnet_flush
452 }
453};
454
455/* SMC_PNETID family definition */
456static struct genl_family smc_pnet_nl_family = {
457 .hdrsize = 0,
458 .name = SMCR_GENL_FAMILY_NAME,
459 .version = SMCR_GENL_FAMILY_VERSION,
460 .maxattr = SMC_PNETID_MAX,
461 .netnsok = true,
462 .module = THIS_MODULE,
463 .ops = smc_pnet_ops,
464 .n_ops = ARRAY_SIZE(smc_pnet_ops)
465};
466
467static int smc_pnet_netdev_event(struct notifier_block *this,
468 unsigned long event, void *ptr)
469{
470 struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
471
472 switch (event) {
473 case NETDEV_REBOOT:
474 case NETDEV_UNREGISTER:
475 smc_pnet_remove_by_ndev(event_dev);
476 default:
477 break;
478 }
479 return NOTIFY_DONE;
480}
481
482static struct notifier_block smc_netdev_notifier = {
483 .notifier_call = smc_pnet_netdev_event
484};
485
486int __init smc_pnet_init(void)
487{
488 int rc;
489
490 rc = genl_register_family(&smc_pnet_nl_family);
491 if (rc)
492 return rc;
493 rc = register_netdevice_notifier(&smc_netdev_notifier);
494 if (rc)
495 genl_unregister_family(&smc_pnet_nl_family);
496 return rc;
497}
498
499void smc_pnet_exit(void)
500{
501 smc_pnet_flush(NULL, NULL);
502 unregister_netdevice_notifier(&smc_netdev_notifier);
503 genl_unregister_family(&smc_pnet_nl_family);
504}
505
506/* PNET table analysis for a given sock:
507 * determine the ib_device and port belonging to the ethernet interface
508 * used by the internal TCP socket.
509 */
510void smc_pnet_find_roce_resource(struct sock *sk,
511 struct smc_ib_device **smcibdev, u8 *ibport)
512{
513 struct dst_entry *dst = sk_dst_get(sk);
514 struct smc_pnetentry *pnetelem;
515
516 *smcibdev = NULL;
517 *ibport = 0;
518
519 if (!dst)
520 return;
521 if (!dst->dev)
522 goto out_rel;
523 read_lock(&smc_pnettable.lock);
524 list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
525 if (dst->dev == pnetelem->ndev) {
526 *smcibdev = pnetelem->smcibdev;
527 *ibport = pnetelem->ib_port;
528 break;
529 }
530 }
531 read_unlock(&smc_pnettable.lock);
532out_rel:
533 dst_release(dst);
534}
diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h
new file mode 100644
index 000000000000..32ab3df928ca
--- /dev/null
+++ b/net/smc/smc_pnet.h
@@ -0,0 +1,23 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * PNET table queries
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com>
9 */
10
11#ifndef _SMC_PNET_H
12#define _SMC_PNET_H
13
14struct smc_ib_device;
15
16int smc_pnet_init(void) __init;
17void smc_pnet_exit(void);
18int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev);
19struct smc_ib_device *smc_pnet_find_ib(char *ib_name);
20void smc_pnet_find_roce_resource(struct sock *sk,
21 struct smc_ib_device **smcibdev, u8 *ibport);
22
23#endif
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
new file mode 100644
index 000000000000..c4ef9a4ec569
--- /dev/null
+++ b/net/smc/smc_rx.c
@@ -0,0 +1,219 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Manage RMBE
5 * copy new RMBE data into user space
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
10 */
11
12#include <linux/net.h>
13#include <linux/rcupdate.h>
14#include <linux/sched/signal.h>
15
16#include <net/sock.h>
17
18#include "smc.h"
19#include "smc_core.h"
20#include "smc_cdc.h"
21#include "smc_tx.h" /* smc_tx_consumer_update() */
22#include "smc_rx.h"
23
24/* callback implementation for sk.sk_data_ready()
25 * to wake up rcvbuf consumers that blocked with smc_rx_wait_data().
26 * indirectly called by smc_cdc_msg_recv_action().
27 */
28static void smc_rx_data_ready(struct sock *sk)
29{
30 struct socket_wq *wq;
31
32 /* derived from sock_def_readable() */
33 /* called already in smc_listen_work() */
34 rcu_read_lock();
35 wq = rcu_dereference(sk->sk_wq);
36 if (skwq_has_sleeper(wq))
37 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
38 POLLRDNORM | POLLRDBAND);
39 if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
40 (sk->sk_state == SMC_CLOSED))
41 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
42 else
43 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
44 rcu_read_unlock();
45}
46
47/* blocks rcvbuf consumer until at least 1 byte is available, or timeout, or interrupted
48 * @smc smc socket
49 * @timeo pointer to max jiffies to wait, pointer to value 0 for no timeout
50 * Returns:
51 * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown.
52 * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted).
53 */
54static int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
55{
56 DEFINE_WAIT_FUNC(wait, woken_wake_function);
57 struct smc_connection *conn = &smc->conn;
58 struct sock *sk = &smc->sk;
59 int rc;
60
61 if (atomic_read(&conn->bytes_to_rcv))
62 return 1;
63 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
64 add_wait_queue(sk_sleep(sk), &wait);
65 rc = sk_wait_event(sk, timeo,
66 sk->sk_err ||
67 sk->sk_shutdown & RCV_SHUTDOWN ||
68 sock_flag(sk, SOCK_DONE) ||
69 atomic_read(&conn->bytes_to_rcv) ||
70 smc_cdc_rxed_any_close_or_senddone(conn),
71 &wait);
72 remove_wait_queue(sk_sleep(sk), &wait);
73 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
74 return rc;
75}
76
77/* rcvbuf consumer: main API called by socket layer.
78 * called under sk lock.
79 */
80int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
81 int flags)
82{
83 size_t copylen, read_done = 0, read_remaining = len;
84 size_t chunk_len, chunk_off, chunk_len_sum;
85 struct smc_connection *conn = &smc->conn;
86 union smc_host_cursor cons;
87 int readable, chunk;
88 char *rcvbuf_base;
89 struct sock *sk;
90 long timeo;
91	int target;		/* Read at least this many bytes */
92 int rc;
93
94 if (unlikely(flags & MSG_ERRQUEUE))
95 return -EINVAL; /* future work for sk.sk_family == AF_SMC */
96 if (flags & MSG_OOB)
97 return -EINVAL; /* future work */
98
99 sk = &smc->sk;
100 if (sk->sk_state == SMC_LISTEN)
101 return -ENOTCONN;
102 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
103 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
104
105 msg->msg_namelen = 0;
106 /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */
107 rcvbuf_base = conn->rmb_desc->cpu_addr;
108
109 do { /* while (read_remaining) */
110 if (read_done >= target)
111 break;
112
113 if (atomic_read(&conn->bytes_to_rcv))
114 goto copy;
115
116 if (read_done) {
117 if (sk->sk_err ||
118 sk->sk_state == SMC_CLOSED ||
119 (sk->sk_shutdown & RCV_SHUTDOWN) ||
120 !timeo ||
121 signal_pending(current) ||
122 smc_cdc_rxed_any_close_or_senddone(conn) ||
123 conn->local_tx_ctrl.conn_state_flags.
124 peer_conn_abort)
125 break;
126 } else {
127 if (sock_flag(sk, SOCK_DONE))
128 break;
129 if (sk->sk_err) {
130 read_done = sock_error(sk);
131 break;
132 }
133 if (sk->sk_shutdown & RCV_SHUTDOWN ||
134 smc_cdc_rxed_any_close_or_senddone(conn) ||
135 conn->local_tx_ctrl.conn_state_flags.
136 peer_conn_abort)
137 break;
138 if (sk->sk_state == SMC_CLOSED) {
139 if (!sock_flag(sk, SOCK_DONE)) {
140 /* This occurs when user tries to read
141 * from never connected socket.
142 */
143 read_done = -ENOTCONN;
144 break;
145 }
146 break;
147 }
148 if (signal_pending(current)) {
149 read_done = sock_intr_errno(timeo);
150 break;
151 }
152 }
153
154 if (!atomic_read(&conn->bytes_to_rcv)) {
155 smc_rx_wait_data(smc, &timeo);
156 continue;
157 }
158
159copy:
160 /* initialize variables for 1st iteration of subsequent loop */
161 /* could be just 1 byte, even after smc_rx_wait_data above */
162 readable = atomic_read(&conn->bytes_to_rcv);
163 /* not more than what user space asked for */
164 copylen = min_t(size_t, read_remaining, readable);
165 smc_curs_write(&cons,
166 smc_curs_read(&conn->local_tx_ctrl.cons, conn),
167 conn);
168 /* determine chunks where to read from rcvbuf */
169 /* either unwrapped case, or 1st chunk of wrapped case */
170 chunk_len = min_t(size_t,
171 copylen, conn->rmbe_size - cons.count);
172 chunk_len_sum = chunk_len;
173 chunk_off = cons.count;
174 for (chunk = 0; chunk < 2; chunk++) {
175 if (!(flags & MSG_TRUNC)) {
176 rc = memcpy_to_msg(msg, rcvbuf_base + chunk_off,
177 chunk_len);
178 if (rc) {
179 if (!read_done)
180 read_done = -EFAULT;
181 goto out;
182 }
183 }
184 read_remaining -= chunk_len;
185 read_done += chunk_len;
186
187 if (chunk_len_sum == copylen)
188 break; /* either on 1st or 2nd iteration */
189 /* prepare next (== 2nd) iteration */
190 chunk_len = copylen - chunk_len; /* remainder */
191 chunk_len_sum += chunk_len;
192 chunk_off = 0; /* modulo offset in recv ring buffer */
193 }
194
195 /* update cursors */
196 if (!(flags & MSG_PEEK)) {
197 smc_curs_add(conn->rmbe_size, &cons, copylen);
198 /* increased in recv tasklet smc_cdc_msg_rcv() */
199 smp_mb__before_atomic();
200 atomic_sub(copylen, &conn->bytes_to_rcv);
201 /* guarantee 0 <= bytes_to_rcv <= rmbe_size */
202 smp_mb__after_atomic();
203 smc_curs_write(&conn->local_tx_ctrl.cons,
204 smc_curs_read(&cons, conn),
205 conn);
206 /* send consumer cursor update if required */
207 /* similar to advertising new TCP rcv_wnd if required */
208 smc_tx_consumer_update(conn);
209 }
210 } while (read_remaining);
211out:
212 return read_done;
213}
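/* A user-space sketch of the wrap-around copy above: reading `copylen` bytes
 * that start at consumer offset `cons` in a ring of `size` bytes needs at
 * most two contiguous memcpy() chunks. Assumes cons < size and copylen does
 * not exceed the bytes currently available; names are illustrative only.
 */
#include <stddef.h>
#include <string.h>

static size_t ring_read(const char *ring, size_t size, size_t cons,
			size_t copylen, char *dst)
{
	size_t chunk = copylen < size - cons ? copylen : size - cons;

	memcpy(dst, ring + cons, chunk);		/* 1st (unwrapped) chunk */
	if (chunk < copylen)
		memcpy(dst + chunk, ring, copylen - chunk); /* wrapped rest */
	return (cons + copylen) % size;			/* new consumer cursor */
}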
214
215/* Initialize receive properties on connection establishment. NB: not __init! */
216void smc_rx_init(struct smc_sock *smc)
217{
218 smc->sk.sk_data_ready = smc_rx_data_ready;
219}
diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h
new file mode 100644
index 000000000000..b5b80e1f8b0f
--- /dev/null
+++ b/net/smc/smc_rx.h
@@ -0,0 +1,23 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Manage RMBE
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
9 */
10
11#ifndef SMC_RX_H
12#define SMC_RX_H
13
14#include <linux/socket.h>
15#include <linux/types.h>
16
17#include "smc.h"
18
19void smc_rx_init(struct smc_sock *smc);
20int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
21 int flags);
22
23#endif /* SMC_RX_H */
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
new file mode 100644
index 000000000000..69a0013dd25c
--- /dev/null
+++ b/net/smc/smc_tx.c
@@ -0,0 +1,485 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Manage send buffer.
5 * Producer:
6 * Copy user space data into send buffer, if send buffer space available.
7 * Consumer:
8 * Trigger RDMA write into RMBE of peer and send CDC, if RMBE space available.
9 *
10 * Copyright IBM Corp. 2016
11 *
12 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
13 */
14
15#include <linux/net.h>
16#include <linux/rcupdate.h>
17#include <linux/workqueue.h>
18#include <linux/sched/signal.h>
19
20#include <net/sock.h>
21
22#include "smc.h"
23#include "smc_wr.h"
24#include "smc_cdc.h"
25#include "smc_tx.h"
26
27/***************************** sndbuf producer *******************************/
28
29/* callback implementation for sk.sk_write_space()
30 * to wake up sndbuf producers that blocked with smc_tx_wait_memory().
31 * called under sk_socket lock.
32 */
33static void smc_tx_write_space(struct sock *sk)
34{
35 struct socket *sock = sk->sk_socket;
36 struct smc_sock *smc = smc_sk(sk);
37 struct socket_wq *wq;
38
39 /* similar to sk_stream_write_space */
40 if (atomic_read(&smc->conn.sndbuf_space) && sock) {
41 clear_bit(SOCK_NOSPACE, &sock->flags);
42 rcu_read_lock();
43 wq = rcu_dereference(sk->sk_wq);
44 if (skwq_has_sleeper(wq))
45 wake_up_interruptible_poll(&wq->wait,
46 POLLOUT | POLLWRNORM |
47 POLLWRBAND);
48 if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
49 sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
50 rcu_read_unlock();
51 }
52}
53
54/* Wake up sndbuf producers that blocked with smc_tx_wait_memory().
55 * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space().
56 */
57void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
58{
59 if (smc->sk.sk_socket &&
60 test_bit(SOCK_NOSPACE, &smc->sk.sk_socket->flags))
61 smc->sk.sk_write_space(&smc->sk);
62}
63
64/* blocks sndbuf producer until at least one byte of free space available */
65static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
66{
67 DEFINE_WAIT_FUNC(wait, woken_wake_function);
68 struct smc_connection *conn = &smc->conn;
69 struct sock *sk = &smc->sk;
70 bool noblock;
71 long timeo;
72 int rc = 0;
73
74 /* similar to sk_stream_wait_memory */
75 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
76 noblock = timeo ? false : true;
77 add_wait_queue(sk_sleep(sk), &wait);
78 while (1) {
79 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
80 if (sk->sk_err ||
81 (sk->sk_shutdown & SEND_SHUTDOWN) ||
82 conn->local_tx_ctrl.conn_state_flags.peer_done_writing) {
83 rc = -EPIPE;
84 break;
85 }
86 if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
87 rc = -ECONNRESET;
88 break;
89 }
90 if (!timeo) {
91 if (noblock)
92 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
93 rc = -EAGAIN;
94 break;
95 }
96 if (signal_pending(current)) {
97 rc = sock_intr_errno(timeo);
98 break;
99 }
100 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
101 if (atomic_read(&conn->sndbuf_space))
102 break; /* at least 1 byte of free space available */
103 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
104 sk->sk_write_pending++;
105 sk_wait_event(sk, &timeo,
106 sk->sk_err ||
107 (sk->sk_shutdown & SEND_SHUTDOWN) ||
108 smc_cdc_rxed_any_close_or_senddone(conn) ||
109 atomic_read(&conn->sndbuf_space),
110 &wait);
111 sk->sk_write_pending--;
112 }
113 remove_wait_queue(sk_sleep(sk), &wait);
114 return rc;
115}
116
117/* sndbuf producer: main API called by socket layer.
118 * called under sock lock.
119 */
120int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
121{
122 size_t copylen, send_done = 0, send_remaining = len;
123 size_t chunk_len, chunk_off, chunk_len_sum;
124 struct smc_connection *conn = &smc->conn;
125 union smc_host_cursor prep;
126 struct sock *sk = &smc->sk;
127 char *sndbuf_base;
128 int tx_cnt_prep;
129 int writespace;
130 int rc, chunk;
131
132 /* This should be in poll */
133 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
134
135 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
136 rc = -EPIPE;
137 goto out_err;
138 }
139
140 while (msg_data_left(msg)) {
141 if (sk->sk_state == SMC_INIT)
142 return -ENOTCONN;
143 if (smc->sk.sk_shutdown & SEND_SHUTDOWN ||
144 (smc->sk.sk_err == ECONNABORTED) ||
145 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort)
146 return -EPIPE;
147 if (smc_cdc_rxed_any_close(conn))
148 return send_done ?: -ECONNRESET;
149
150 if (!atomic_read(&conn->sndbuf_space)) {
151 rc = smc_tx_wait_memory(smc, msg->msg_flags);
152 if (rc) {
153 if (send_done)
154 return send_done;
155 goto out_err;
156 }
157 continue;
158 }
159
160 /* initialize variables for 1st iteration of subsequent loop */
161 /* could be just 1 byte, even after smc_tx_wait_memory above */
162 writespace = atomic_read(&conn->sndbuf_space);
163 /* not more than what user space asked for */
164 copylen = min_t(size_t, send_remaining, writespace);
165 /* determine start of sndbuf */
166 sndbuf_base = conn->sndbuf_desc->cpu_addr;
167 smc_curs_write(&prep,
168 smc_curs_read(&conn->tx_curs_prep, conn),
169 conn);
170 tx_cnt_prep = prep.count;
171 /* determine chunks where to write into sndbuf */
172 /* either unwrapped case, or 1st chunk of wrapped case */
173 chunk_len = min_t(size_t,
174 copylen, conn->sndbuf_size - tx_cnt_prep);
175 chunk_len_sum = chunk_len;
176 chunk_off = tx_cnt_prep;
177 for (chunk = 0; chunk < 2; chunk++) {
178 rc = memcpy_from_msg(sndbuf_base + chunk_off,
179 msg, chunk_len);
180 if (rc) {
181 if (send_done)
182 return send_done;
183 goto out_err;
184 }
185 send_done += chunk_len;
186 send_remaining -= chunk_len;
187
188 if (chunk_len_sum == copylen)
189 break; /* either on 1st or 2nd iteration */
190 /* prepare next (== 2nd) iteration */
191 chunk_len = copylen - chunk_len; /* remainder */
192 chunk_len_sum += chunk_len;
193 chunk_off = 0; /* modulo offset in send ring buffer */
194 }
195 /* update cursors */
196 smc_curs_add(conn->sndbuf_size, &prep, copylen);
197 smc_curs_write(&conn->tx_curs_prep,
198 smc_curs_read(&prep, conn),
199 conn);
200 /* increased in send tasklet smc_cdc_tx_handler() */
201 smp_mb__before_atomic();
202 atomic_sub(copylen, &conn->sndbuf_space);
203 /* guarantee 0 <= sndbuf_space <= sndbuf_size */
204 smp_mb__after_atomic();
205 /* since we just produced more new data into sndbuf,
206 * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
207 */
208 smc_tx_sndbuf_nonempty(conn);
209 } /* while (msg_data_left(msg)) */
210
211 return send_done;
212
213out_err:
214 rc = sk_stream_error(sk, msg->msg_flags, rc);
215 /* make sure we wake any epoll edge trigger waiter */
216 if (unlikely(rc == -EAGAIN))
217 sk->sk_write_space(sk);
218 return rc;
219}
220
221/***************************** sndbuf consumer *******************************/
222
223/* sndbuf consumer: actual data transfer of one target chunk with RDMA write */
224static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
225 int num_sges, struct ib_sge sges[])
226{
227 struct smc_link_group *lgr = conn->lgr;
228 struct ib_send_wr *failed_wr = NULL;
229 struct ib_rdma_wr rdma_wr;
230 struct smc_link *link;
231 int rc;
232
233 memset(&rdma_wr, 0, sizeof(rdma_wr));
234 link = &lgr->lnk[SMC_SINGLE_LINK];
235 rdma_wr.wr.wr_id = smc_wr_tx_get_next_wr_id(link);
236 rdma_wr.wr.sg_list = sges;
237 rdma_wr.wr.num_sge = num_sges;
238 rdma_wr.wr.opcode = IB_WR_RDMA_WRITE;
239 rdma_wr.remote_addr =
240 lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr +
241 /* RMBE within RMB */
242 ((conn->peer_conn_idx - 1) * conn->peer_rmbe_size) +
243 /* offset within RMBE */
244 peer_rmbe_offset;
245 rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
246 rc = ib_post_send(link->roce_qp, &rdma_wr.wr, &failed_wr);
247 if (rc)
248 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
249 return rc;
250}
251
252/* sndbuf consumer */
253static inline void smc_tx_advance_cursors(struct smc_connection *conn,
254 union smc_host_cursor *prod,
255 union smc_host_cursor *sent,
256 size_t len)
257{
258 smc_curs_add(conn->peer_rmbe_size, prod, len);
259 /* increased in recv tasklet smc_cdc_msg_rcv() */
260 smp_mb__before_atomic();
261 /* data in flight reduces usable snd_wnd */
262 atomic_sub(len, &conn->peer_rmbe_space);
263 /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
264 smp_mb__after_atomic();
265 smc_curs_add(conn->sndbuf_size, sent, len);
266}
267
268/* sndbuf consumer: prepare all necessary (src & dst) chunks of the data
269 * transfer; the usable snd_wnd limits the amount transmitted
270 */
271static int smc_tx_rdma_writes(struct smc_connection *conn)
272{
273 size_t src_off, src_len, dst_off, dst_len; /* current chunk values */
274 size_t len, dst_len_sum, src_len_sum, dstchunk, srcchunk;
275 union smc_host_cursor sent, prep, prod, cons;
276 struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
277 struct smc_link_group *lgr = conn->lgr;
278 int to_send, rmbespace;
279 struct smc_link *link;
280 int num_sges;
281 int rc;
282
283 /* source: sndbuf */
284 smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
285 smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
286 /* cf. wmem_alloc - (snd_max - snd_una) */
287 to_send = smc_curs_diff(conn->sndbuf_size, &sent, &prep);
288 if (to_send <= 0)
289 return 0;
290
291 /* destination: RMBE */
292 /* cf. snd_wnd */
293 rmbespace = atomic_read(&conn->peer_rmbe_space);
294 if (rmbespace <= 0)
295 return 0;
296 smc_curs_write(&prod,
297 smc_curs_read(&conn->local_tx_ctrl.prod, conn),
298 conn);
299 smc_curs_write(&cons,
300 smc_curs_read(&conn->local_rx_ctrl.cons, conn),
301 conn);
302
303 /* if usable snd_wnd closes ask peer to advertise once it opens again */
304 conn->local_tx_ctrl.prod_flags.write_blocked = (to_send >= rmbespace);
305 /* cf. usable snd_wnd */
306 len = min(to_send, rmbespace);
307
308 /* initialize variables for first iteration of subsequent nested loop */
309 link = &lgr->lnk[SMC_SINGLE_LINK];
310 dst_off = prod.count;
311 if (prod.wrap == cons.wrap) {
312 /* the filled destination area is unwrapped,
313 * hence the available free destination space is wrapped
314 * and we need 2 destination chunks of sum len; start with 1st
315 * which is limited by what's available in sndbuf
316 */
317 dst_len = min_t(size_t,
318 conn->peer_rmbe_size - prod.count, len);
319 } else {
320 /* the filled destination area is wrapped,
321 * hence the available free destination space is unwrapped
322 * and we need a single destination chunk of entire len
323 */
324 dst_len = len;
325 }
326 dst_len_sum = dst_len;
327 src_off = sent.count;
328 /* dst_len determines the maximum src_len */
329 if (sent.count + dst_len <= conn->sndbuf_size) {
330 /* unwrapped src case: single chunk of entire dst_len */
331 src_len = dst_len;
332 } else {
333 /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */
334 src_len = conn->sndbuf_size - sent.count;
335 }
336 src_len_sum = src_len;
337 for (dstchunk = 0; dstchunk < 2; dstchunk++) {
338 num_sges = 0;
339 for (srcchunk = 0; srcchunk < 2; srcchunk++) {
340 sges[srcchunk].addr =
341 conn->sndbuf_desc->dma_addr[SMC_SINGLE_LINK] +
342 src_off;
343 sges[srcchunk].length = src_len;
344 sges[srcchunk].lkey = link->roce_pd->local_dma_lkey;
345 num_sges++;
346 src_off += src_len;
347 if (src_off >= conn->sndbuf_size)
348 src_off -= conn->sndbuf_size;
349 /* modulo in send ring */
350 if (src_len_sum == dst_len)
351 break; /* either on 1st or 2nd iteration */
352 /* prepare next (== 2nd) iteration */
353 src_len = dst_len - src_len; /* remainder */
354 src_len_sum += src_len;
355 }
356 rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges);
357 if (rc)
358 return rc;
359 if (dst_len_sum == len)
360 break; /* either on 1st or 2nd iteration */
361 /* prepare next (== 2nd) iteration */
362 dst_off = 0; /* modulo offset in RMBE ring buffer */
363 dst_len = len - dst_len; /* remainder */
364 dst_len_sum += dst_len;
365 src_len = min_t(int,
366 dst_len, conn->sndbuf_size - sent.count);
367 src_len_sum = src_len;
368 }
369
370 smc_tx_advance_cursors(conn, &prod, &sent, len);
371 /* update connection's cursors with advanced local cursors */
372 smc_curs_write(&conn->local_tx_ctrl.prod,
373 smc_curs_read(&prod, conn),
374 conn);
375 /* dst: peer RMBE */
376 smc_curs_write(&conn->tx_curs_sent,
377 smc_curs_read(&sent, conn),
378 conn);
379 /* src: local sndbuf */
380
381 return 0;
382}
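/* A simplified sketch of the nested chunking above: `len` bytes move from a
 * source ring (offset src_off, size src_size) into a destination ring
 * (offset dst_off, size dst_size). Since both rings may wrap, the transfer
 * splits into at most two destination chunks fed from at most two source
 * pieces each; emit() stands in for building one RDMA write (the kernel code
 * batches two source pieces per write via two SGEs). Illustrative only.
 */
#include <stddef.h>

typedef void (*emit_fn)(size_t src_off, size_t dst_off, size_t n);

static void ring_to_ring(size_t len, size_t src_off, size_t src_size,
			 size_t dst_off, size_t dst_size, emit_fn emit)
{
	while (len) {
		size_t n = len;	/* largest piece contiguous in both rings */

		if (n > src_size - src_off)
			n = src_size - src_off;
		if (n > dst_size - dst_off)
			n = dst_size - dst_off;
		emit(src_off, dst_off, n);
		src_off = (src_off + n) % src_size;
		dst_off = (dst_off + n) % dst_size;
		len -= n;
	}
}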
383
384/* Wake up sndbuf consumers from any context (IRQ or process)
385 * since there is more data to transmit; usable snd_wnd as max transmit
386 */
387int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
388{
389 struct smc_cdc_tx_pend *pend;
390 struct smc_wr_buf *wr_buf;
391 int rc;
392
393 spin_lock_bh(&conn->send_lock);
394 rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
395 &pend);
396 if (rc < 0) {
397 if (rc == -EBUSY) {
398 struct smc_sock *smc =
399 container_of(conn, struct smc_sock, conn);
400
401 if (smc->sk.sk_err == ECONNABORTED) {
402 rc = sock_error(&smc->sk);
403 goto out_unlock;
404 }
405 rc = 0;
406 schedule_work(&conn->tx_work);
407 }
408 goto out_unlock;
409 }
410
411 rc = smc_tx_rdma_writes(conn);
412 if (rc) {
413 smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
414 (struct smc_wr_tx_pend_priv *)pend);
415 goto out_unlock;
416 }
417
418 rc = smc_cdc_msg_send(conn, wr_buf, pend);
419
420out_unlock:
421 spin_unlock_bh(&conn->send_lock);
422 return rc;
423}
424
425/* Wake up sndbuf consumers from process context
426 * since there is more data to transmit
427 */
428static void smc_tx_work(struct work_struct *work)
429{
430 struct smc_connection *conn = container_of(work,
431 struct smc_connection,
432 tx_work);
433 struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
434
435 lock_sock(&smc->sk);
436 smc_tx_sndbuf_nonempty(conn);
437 release_sock(&smc->sk);
438}
439
440void smc_tx_consumer_update(struct smc_connection *conn)
441{
442 union smc_host_cursor cfed, cons;
443 struct smc_cdc_tx_pend *pend;
444 struct smc_wr_buf *wr_buf;
445 int to_confirm, rc;
446
447 smc_curs_write(&cons,
448 smc_curs_read(&conn->local_tx_ctrl.cons, conn),
449 conn);
450 smc_curs_write(&cfed,
451 smc_curs_read(&conn->rx_curs_confirmed, conn),
452 conn);
453 to_confirm = smc_curs_diff(conn->rmbe_size, &cfed, &cons);
454
455 if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
456 ((to_confirm > conn->rmbe_update_limit) &&
457 ((to_confirm > (conn->rmbe_size / 2)) ||
458 conn->local_rx_ctrl.prod_flags.write_blocked))) {
459 rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
460 &wr_buf, &pend);
461 if (!rc)
462 rc = smc_cdc_msg_send(conn, wr_buf, pend);
463 if (rc < 0) {
464 schedule_work(&conn->tx_work);
465 return;
466 }
467 smc_curs_write(&conn->rx_curs_confirmed,
468 smc_curs_read(&conn->local_tx_ctrl.cons, conn),
469 conn);
470 conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
471 }
472 if (conn->local_rx_ctrl.prod_flags.write_blocked &&
473 !atomic_read(&conn->bytes_to_rcv))
474 conn->local_rx_ctrl.prod_flags.write_blocked = 0;
475}
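/* A boolean sketch of the decision above: a consumer cursor update (similar
 * to advertising a new TCP receive window) is sent when the peer explicitly
 * requested one, or when the not-yet-confirmed consumed bytes exceed the
 * update limit and either exceed half the RMBE or the peer reported itself
 * write-blocked. Parameter names are illustrative only.
 */
#include <stdbool.h>

static bool must_send_cursor_update(bool peer_requested, bool peer_blocked,
				    int to_confirm, int update_limit,
				    int rmbe_size)
{
	return peer_requested ||
	       (to_confirm > update_limit &&
		(to_confirm > rmbe_size / 2 || peer_blocked));
}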
476
477/***************************** send initialize *******************************/
478
479/* Initialize send properties on connection establishment. NB: not __init! */
480void smc_tx_init(struct smc_sock *smc)
481{
482 smc->sk.sk_write_space = smc_tx_write_space;
483 INIT_WORK(&smc->conn.tx_work, smc_tx_work);
484 spin_lock_init(&smc->conn.send_lock);
485}
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
new file mode 100644
index 000000000000..1d6a0dcdcfe6
--- /dev/null
+++ b/net/smc/smc_tx.h
@@ -0,0 +1,35 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Manage send buffer
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
9 */
10
11#ifndef SMC_TX_H
12#define SMC_TX_H
13
14#include <linux/socket.h>
15#include <linux/types.h>
16
17#include "smc.h"
18#include "smc_cdc.h"
19
20static inline int smc_tx_prepared_sends(struct smc_connection *conn)
21{
22 union smc_host_cursor sent, prep;
23
24 smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
25 smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
26 return smc_curs_diff(conn->sndbuf_size, &sent, &prep);
27}
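/* A minimal sketch of the cursor arithmetic assumed by smc_curs_diff() above:
 * a cursor is a (wrap, count) pair, and the distance from an older to a newer
 * cursor is taken modulo the buffer size, honouring a wrap of the newer one.
 * Types and names here are illustrative, not the kernel definitions.
 */
struct curs { unsigned int wrap, count; };

static int curs_diff(unsigned int size, const struct curs *old,
		     const struct curs *new)
{
	if (old->wrap != new->wrap)		/* newer cursor wrapped past old */
		return size - old->count + new->count;
	return new->count - old->count;		/* same lap: plain difference */
}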
28
29void smc_tx_init(struct smc_sock *smc);
30int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
31int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
32void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
33void smc_tx_consumer_update(struct smc_connection *conn);
34
35#endif /* SMC_TX_H */
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
new file mode 100644
index 000000000000..eadf157418dc
--- /dev/null
+++ b/net/smc/smc_wr.c
@@ -0,0 +1,614 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Work Requests exploiting the InfiniBand API
5 *
6 * Work requests (WR) of type ib_post_send or ib_post_recv
7 * are submitted to the RC SQ or the RC RQ respectively
8 * (reliably connected send/receive queue)
9 * and become work queue entries (WQEs).
10 * While an SQ WR/WQE is pending, we track it until transmission completion.
11 * Through a send or receive completion queue (CQ) respectively,
12 * we get completion queue entries (CQEs) [aka work completions (WCs)].
13 * Since the CQ callback is called from IRQ context, we split work by using
14 * bottom halves implemented by tasklets.
15 *
16 * SMC uses this to exchange LLC (link layer control)
17 * and CDC (connection data control) messages.
18 *
19 * Copyright IBM Corp. 2016
20 *
21 * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
22 */
23
24#include <linux/atomic.h>
25#include <linux/hashtable.h>
26#include <linux/wait.h>
27#include <rdma/ib_verbs.h>
28#include <asm/div64.h>
29
30#include "smc.h"
31#include "smc_wr.h"
32
33#define SMC_WR_MAX_POLL_CQE 10 /* max. # of compl. queue elements in 1 poll */
34
35#define SMC_WR_RX_HASH_BITS 4
36static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
37static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);
38
39struct smc_wr_tx_pend { /* control data for a pending send request */
40 u64 wr_id; /* work request id sent */
41 smc_wr_tx_handler handler;
42 enum ib_wc_status wc_status; /* CQE status */
43 struct smc_link *link;
44 u32 idx;
45 struct smc_wr_tx_pend_priv priv;
46};
47
48/******************************** send queue *********************************/
49
50/*------------------------------- completion --------------------------------*/
51
52static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
53{
54 u32 i;
55
56 for (i = 0; i < link->wr_tx_cnt; i++) {
57 if (link->wr_tx_pends[i].wr_id == wr_id)
58 return i;
59 }
60 return link->wr_tx_cnt;
61}
62
63static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
64{
65 struct smc_wr_tx_pend pnd_snd;
66 struct smc_link *link;
67 u32 pnd_snd_idx;
68 int i;
69
70 link = wc->qp->qp_context;
71 pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
72 if (pnd_snd_idx == link->wr_tx_cnt)
73 return;
74 link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
75 memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd));
76 /* clear the full struct smc_wr_tx_pend including .priv */
77 memset(&link->wr_tx_pends[pnd_snd_idx], 0,
78 sizeof(link->wr_tx_pends[pnd_snd_idx]));
79 memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
80 sizeof(link->wr_tx_bufs[pnd_snd_idx]));
81 if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
82 return;
83 if (wc->status) {
84 struct smc_link_group *lgr;
85
86 for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
87 /* clear full struct smc_wr_tx_pend including .priv */
88 memset(&link->wr_tx_pends[i], 0,
89 sizeof(link->wr_tx_pends[i]));
90 memset(&link->wr_tx_bufs[i], 0,
91 sizeof(link->wr_tx_bufs[i]));
92 clear_bit(i, link->wr_tx_mask);
93 }
94 /* terminate connections of this link group abnormally */
95 lgr = container_of(link, struct smc_link_group,
96 lnk[SMC_SINGLE_LINK]);
97 smc_lgr_terminate(lgr);
98 }
99 if (pnd_snd.handler)
100 pnd_snd.handler(&pnd_snd.priv, link, wc->status);
101 wake_up(&link->wr_tx_wait);
102}
103
104static void smc_wr_tx_tasklet_fn(unsigned long data)
105{
106 struct smc_ib_device *dev = (struct smc_ib_device *)data;
107 struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
108 int i = 0, rc;
109 int polled = 0;
110
111again:
112 polled++;
113 do {
114 rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
115 if (polled == 1) {
116 ib_req_notify_cq(dev->roce_cq_send,
117 IB_CQ_NEXT_COMP |
118 IB_CQ_REPORT_MISSED_EVENTS);
119 }
120 if (!rc)
121 break;
122 for (i = 0; i < rc; i++)
123 smc_wr_tx_process_cqe(&wc[i]);
124 } while (rc > 0);
125 if (polled == 1)
126 goto again;
127}
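/* A sketch of the poll/re-arm/poll pattern above: drain the completion queue,
 * re-arm the interrupt during the first pass, then drain once more so that a
 * completion racing with the re-arm is not stranded until the next interrupt.
 * poll_cq(), arm_cq() and handle() are stand-ins, not verbs API calls.
 */
static void drain_cq(int (*poll_cq)(void *wc, int max), void (*arm_cq)(void),
		     void (*handle)(void *wc, int n))
{
	char wc[16 * 64];	/* opaque work-completion buffer for the sketch */
	int n, pass = 0;

again:
	pass++;
	do {
		n = poll_cq(wc, 16);
		if (pass == 1)
			arm_cq();	/* request the next completion interrupt */
		if (n > 0)
			handle(wc, n);
	} while (n > 0);
	if (pass == 1)
		goto again;		/* close the race with arm_cq() */
}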
128
129void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
130{
131 struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
132
133 tasklet_schedule(&dev->send_tasklet);
134}
135
136/*---------------------------- request submission ---------------------------*/
137
138static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
139{
140 *idx = link->wr_tx_cnt;
141 for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
142 if (!test_and_set_bit(*idx, link->wr_tx_mask))
143 return 0;
144 }
145 *idx = link->wr_tx_cnt;
146 return -EBUSY;
147}
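/* A user-space sketch of the slot reservation above using C11 atomics instead
 * of the kernel bitmap helpers: scan for a clear bit and claim it atomically,
 * or report -EBUSY when all (here at most 32) send buffers are in flight.
 */
#include <errno.h>
#include <stdatomic.h>

static int tx_get_free_slot_index(atomic_uint *mask, unsigned int cnt,
				  unsigned int *idx)
{
	unsigned int i;

	for (i = 0; i < cnt; i++) {
		unsigned int bit = 1u << i;

		if (!(atomic_load(mask) & bit) &&
		    !(atomic_fetch_or(mask, bit) & bit)) {
			*idx = i;	/* slot i is now owned by this caller */
			return 0;
		}
	}
	*idx = cnt;
	return -EBUSY;
}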
148
149/**
150 * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
151 * and sets info for pending transmit tracking
152 * @link: Pointer to smc_link used to later send the message.
153 * @handler: Send completion handler function pointer.
154 * @wr_buf: Out value returns pointer to message buffer.
155 * @wr_pend_priv: Out value returns pointer serving as handler context.
156 *
157 * Return: 0 on success, or -errno on error.
158 */
159int smc_wr_tx_get_free_slot(struct smc_link *link,
160 smc_wr_tx_handler handler,
161 struct smc_wr_buf **wr_buf,
162 struct smc_wr_tx_pend_priv **wr_pend_priv)
163{
164 struct smc_wr_tx_pend *wr_pend;
165 struct ib_send_wr *wr_ib;
166 u64 wr_id;
167 u32 idx;
168 int rc;
169
170 *wr_buf = NULL;
171 *wr_pend_priv = NULL;
172 if (in_softirq()) {
173 rc = smc_wr_tx_get_free_slot_index(link, &idx);
174 if (rc)
175 return rc;
176 } else {
177 rc = wait_event_interruptible_timeout(
178 link->wr_tx_wait,
179 (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
180 SMC_WR_TX_WAIT_FREE_SLOT_TIME);
181 if (!rc) {
182 /* timeout - terminate connections */
183 struct smc_link_group *lgr;
184
185 lgr = container_of(link, struct smc_link_group,
186 lnk[SMC_SINGLE_LINK]);
187 smc_lgr_terminate(lgr);
188 return -EPIPE;
189 }
190 if (rc == -ERESTARTSYS)
191 return -EINTR;
192 if (idx == link->wr_tx_cnt)
193 return -EPIPE;
194 }
195 wr_id = smc_wr_tx_get_next_wr_id(link);
196 wr_pend = &link->wr_tx_pends[idx];
197 wr_pend->wr_id = wr_id;
198 wr_pend->handler = handler;
199 wr_pend->link = link;
200 wr_pend->idx = idx;
201 wr_ib = &link->wr_tx_ibs[idx];
202 wr_ib->wr_id = wr_id;
203 *wr_buf = &link->wr_tx_bufs[idx];
204 *wr_pend_priv = &wr_pend->priv;
205 return 0;
206}
207
208int smc_wr_tx_put_slot(struct smc_link *link,
209 struct smc_wr_tx_pend_priv *wr_pend_priv)
210{
211 struct smc_wr_tx_pend *pend;
212
213 pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
214 if (pend->idx < link->wr_tx_cnt) {
215 /* clear the full struct smc_wr_tx_pend including .priv */
216 memset(&link->wr_tx_pends[pend->idx], 0,
217 sizeof(link->wr_tx_pends[pend->idx]));
218 memset(&link->wr_tx_bufs[pend->idx], 0,
219 sizeof(link->wr_tx_bufs[pend->idx]));
220 test_and_clear_bit(pend->idx, link->wr_tx_mask);
221 return 1;
222 }
223
224 return 0;
225}
226
227/* Send prepared WR slot via ib_post_send.
228 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
229 */
230int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
231{
232 struct ib_send_wr *failed_wr = NULL;
233 struct smc_wr_tx_pend *pend;
234 int rc;
235
236 ib_req_notify_cq(link->smcibdev->roce_cq_send,
237 IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS);
238 pend = container_of(priv, struct smc_wr_tx_pend, priv);
239 rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx],
240 &failed_wr);
241 if (rc)
242 smc_wr_tx_put_slot(link, priv);
243 return rc;
244}
245
246void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_rx_hdr_type,
247 smc_wr_tx_filter filter,
248 smc_wr_tx_dismisser dismisser,
249 unsigned long data)
250{
251 struct smc_wr_tx_pend_priv *tx_pend;
252 struct smc_wr_rx_hdr *wr_rx;
253 int i;
254
255 for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
256 wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[i];
257 if (wr_rx->type != wr_rx_hdr_type)
258 continue;
259 tx_pend = &link->wr_tx_pends[i].priv;
260 if (filter(tx_pend, data))
261 dismisser(tx_pend);
262 }
263}
264
265bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type,
266 smc_wr_tx_filter filter, unsigned long data)
267{
268 struct smc_wr_tx_pend_priv *tx_pend;
269 struct smc_wr_rx_hdr *wr_rx;
270 int i;
271
272 for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
273 wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[i];
274 if (wr_rx->type != wr_rx_hdr_type)
275 continue;
276 tx_pend = &link->wr_tx_pends[i].priv;
277 if (filter(tx_pend, data))
278 return true;
279 }
280 return false;
281}
282
283/****************************** receive queue ********************************/
284
285int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
286{
287 struct smc_wr_rx_handler *h_iter;
288 int rc = 0;
289
290 spin_lock(&smc_wr_rx_hash_lock);
291 hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) {
292 if (h_iter->type == handler->type) {
293 rc = -EEXIST;
294 goto out_unlock;
295 }
296 }
297 hash_add(smc_wr_rx_hash, &handler->list, handler->type);
298out_unlock:
299 spin_unlock(&smc_wr_rx_hash_lock);
300 return rc;
301}
302
303/* Demultiplex a received message to its handler, based on the message type.
304 * Relies on smc_wr_rx_hash having been completely filled before any IB WRs,
305 * and not being modified any more afterwards so we don't need to lock it.
306 */
307static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
308{
309 struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
310 struct smc_wr_rx_handler *handler;
311 struct smc_wr_rx_hdr *wr_rx;
312 u64 temp_wr_id;
313 u32 index;
314
315 if (wc->byte_len < sizeof(*wr_rx))
316 return; /* short message */
317 temp_wr_id = wc->wr_id;
318 index = do_div(temp_wr_id, link->wr_rx_cnt);
319 wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
320 hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
321 if (handler->type == wr_rx->type)
322 handler->handler(wc, wr_rx);
323 }
324}
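/* A sketch of the wr_id mapping used above: receive wr_ids are handed out
 * monotonically, so the buffer that completed is simply wr_id modulo the
 * number of posted receive buffers, the same result do_div() computes.
 */
#include <stdint.h>

static uint32_t wr_id_to_rx_index(uint64_t wr_id, uint32_t wr_rx_cnt)
{
	return (uint32_t)(wr_id % wr_rx_cnt);
}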
325
326static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
327{
328 struct smc_link *link;
329 int i;
330
331 for (i = 0; i < num; i++) {
332 link = wc[i].qp->qp_context;
333 if (wc[i].status == IB_WC_SUCCESS) {
334 smc_wr_rx_demultiplex(&wc[i]);
335 smc_wr_rx_post(link); /* refill WR RX */
336 } else {
337 struct smc_link_group *lgr;
338
339 /* handle status errors */
340 switch (wc[i].status) {
341 case IB_WC_RETRY_EXC_ERR:
342 case IB_WC_RNR_RETRY_EXC_ERR:
343 case IB_WC_WR_FLUSH_ERR:
344 /* terminate connections of this link group
345 * abnormally
346 */
347 lgr = container_of(link, struct smc_link_group,
348 lnk[SMC_SINGLE_LINK]);
349 smc_lgr_terminate(lgr);
350 break;
351 default:
352 smc_wr_rx_post(link); /* refill WR RX */
353 break;
354 }
355 }
356 }
357}
358
359static void smc_wr_rx_tasklet_fn(unsigned long data)
360{
361 struct smc_ib_device *dev = (struct smc_ib_device *)data;
362 struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
363 int polled = 0;
364 int rc;
365
366again:
367 polled++;
368 do {
369 memset(&wc, 0, sizeof(wc));
370 rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
371 if (polled == 1) {
372 ib_req_notify_cq(dev->roce_cq_recv,
373 IB_CQ_SOLICITED_MASK
374 | IB_CQ_REPORT_MISSED_EVENTS);
375 }
376 if (!rc)
377 break;
378 smc_wr_rx_process_cqes(&wc[0], rc);
379 } while (rc > 0);
380 if (polled == 1)
381 goto again;
382}
383
384void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
385{
386 struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
387
388 tasklet_schedule(&dev->recv_tasklet);
389}
390
391int smc_wr_rx_post_init(struct smc_link *link)
392{
393 u32 i;
394 int rc = 0;
395
396 for (i = 0; i < link->wr_rx_cnt; i++)
397 rc = smc_wr_rx_post(link);
398 return rc;
399}
400
401/***************************** init, exit, misc ******************************/
402
403void smc_wr_remember_qp_attr(struct smc_link *lnk)
404{
405 struct ib_qp_attr *attr = &lnk->qp_attr;
406 struct ib_qp_init_attr init_attr;
407
408 memset(attr, 0, sizeof(*attr));
409 memset(&init_attr, 0, sizeof(init_attr));
410 ib_query_qp(lnk->roce_qp, attr,
411 IB_QP_STATE |
412 IB_QP_CUR_STATE |
413 IB_QP_PKEY_INDEX |
414 IB_QP_PORT |
415 IB_QP_QKEY |
416 IB_QP_AV |
417 IB_QP_PATH_MTU |
418 IB_QP_TIMEOUT |
419 IB_QP_RETRY_CNT |
420 IB_QP_RNR_RETRY |
421 IB_QP_RQ_PSN |
422 IB_QP_ALT_PATH |
423 IB_QP_MIN_RNR_TIMER |
424 IB_QP_SQ_PSN |
425 IB_QP_PATH_MIG_STATE |
426 IB_QP_CAP |
427 IB_QP_DEST_QPN,
428 &init_attr);
429
430 lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
431 lnk->qp_attr.cap.max_send_wr);
432 lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
433 lnk->qp_attr.cap.max_recv_wr);
434}
435
436static void smc_wr_init_sge(struct smc_link *lnk)
437{
438 u32 i;
439
440 for (i = 0; i < lnk->wr_tx_cnt; i++) {
441 lnk->wr_tx_sges[i].addr =
442 lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
443 lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
444 lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
445 lnk->wr_tx_ibs[i].next = NULL;
446 lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
447 lnk->wr_tx_ibs[i].num_sge = 1;
448 lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
449 lnk->wr_tx_ibs[i].send_flags =
450 IB_SEND_SIGNALED | IB_SEND_SOLICITED | IB_SEND_INLINE;
451 }
452 for (i = 0; i < lnk->wr_rx_cnt; i++) {
453 lnk->wr_rx_sges[i].addr =
454 lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
455 lnk->wr_rx_sges[i].length = SMC_WR_BUF_SIZE;
456 lnk->wr_rx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
457 lnk->wr_rx_ibs[i].next = NULL;
458 lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i];
459 lnk->wr_rx_ibs[i].num_sge = 1;
460 }
461}
462
463void smc_wr_free_link(struct smc_link *lnk)
464{
465 struct ib_device *ibdev;
466
467 memset(lnk->wr_tx_mask, 0,
468 BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
469
470 if (!lnk->smcibdev)
471 return;
472 ibdev = lnk->smcibdev->ibdev;
473
474 if (lnk->wr_rx_dma_addr) {
475 ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
476 SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
477 DMA_FROM_DEVICE);
478 lnk->wr_rx_dma_addr = 0;
479 }
480 if (lnk->wr_tx_dma_addr) {
481 ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
482 SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
483 DMA_TO_DEVICE);
484 lnk->wr_tx_dma_addr = 0;
485 }
486}
487
488void smc_wr_free_link_mem(struct smc_link *lnk)
489{
490 kfree(lnk->wr_tx_pends);
491 lnk->wr_tx_pends = NULL;
492 kfree(lnk->wr_tx_mask);
493 lnk->wr_tx_mask = NULL;
494 kfree(lnk->wr_tx_sges);
495 lnk->wr_tx_sges = NULL;
496 kfree(lnk->wr_rx_sges);
497 lnk->wr_rx_sges = NULL;
498 kfree(lnk->wr_rx_ibs);
499 lnk->wr_rx_ibs = NULL;
500 kfree(lnk->wr_tx_ibs);
501 lnk->wr_tx_ibs = NULL;
502 kfree(lnk->wr_tx_bufs);
503 lnk->wr_tx_bufs = NULL;
504 kfree(lnk->wr_rx_bufs);
505 lnk->wr_rx_bufs = NULL;
506}
507
508int smc_wr_alloc_link_mem(struct smc_link *link)
509{
510 /* allocate link related memory */
511 link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
512 if (!link->wr_tx_bufs)
513 goto no_mem;
514 link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
515 GFP_KERNEL);
516 if (!link->wr_rx_bufs)
517 goto no_mem_wr_tx_bufs;
518 link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
519 GFP_KERNEL);
520 if (!link->wr_tx_ibs)
521 goto no_mem_wr_rx_bufs;
522 link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
523 sizeof(link->wr_rx_ibs[0]),
524 GFP_KERNEL);
525 if (!link->wr_rx_ibs)
526 goto no_mem_wr_tx_ibs;
527 link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
528 GFP_KERNEL);
529 if (!link->wr_tx_sges)
530 goto no_mem_wr_rx_ibs;
531 link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
532 sizeof(link->wr_rx_sges[0]),
533 GFP_KERNEL);
534 if (!link->wr_rx_sges)
535 goto no_mem_wr_tx_sges;
536 link->wr_tx_mask = kzalloc(
537 BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*link->wr_tx_mask),
538 GFP_KERNEL);
539 if (!link->wr_tx_mask)
540 goto no_mem_wr_rx_sges;
541 link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
542 sizeof(link->wr_tx_pends[0]),
543 GFP_KERNEL);
544 if (!link->wr_tx_pends)
545 goto no_mem_wr_tx_mask;
546 return 0;
547
548no_mem_wr_tx_mask:
549 kfree(link->wr_tx_mask);
550no_mem_wr_rx_sges:
551 kfree(link->wr_rx_sges);
552no_mem_wr_tx_sges:
553 kfree(link->wr_tx_sges);
554no_mem_wr_rx_ibs:
555 kfree(link->wr_rx_ibs);
556no_mem_wr_tx_ibs:
557 kfree(link->wr_tx_ibs);
558no_mem_wr_rx_bufs:
559 kfree(link->wr_rx_bufs);
560no_mem_wr_tx_bufs:
561 kfree(link->wr_tx_bufs);
562no_mem:
563 return -ENOMEM;
564}
565
566void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
567{
568 tasklet_kill(&smcibdev->recv_tasklet);
569 tasklet_kill(&smcibdev->send_tasklet);
570}
571
572void smc_wr_add_dev(struct smc_ib_device *smcibdev)
573{
574 tasklet_init(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn,
575 (unsigned long)smcibdev);
576 tasklet_init(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn,
577 (unsigned long)smcibdev);
578}
579
580int smc_wr_create_link(struct smc_link *lnk)
581{
582 struct ib_device *ibdev = lnk->smcibdev->ibdev;
583 int rc = 0;
584
585 smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
586 lnk->wr_rx_id = 0;
587 lnk->wr_rx_dma_addr = ib_dma_map_single(
588 ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
589 DMA_FROM_DEVICE);
590 if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
591 lnk->wr_rx_dma_addr = 0;
592 rc = -EIO;
593 goto out;
594 }
595 lnk->wr_tx_dma_addr = ib_dma_map_single(
596 ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
597 DMA_TO_DEVICE);
598 if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) {
599 rc = -EIO;
600 goto dma_unmap;
601 }
602 smc_wr_init_sge(lnk);
603 memset(lnk->wr_tx_mask, 0,
604 BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
605 return rc;
606
607dma_unmap:
608 ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
609 SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
610 DMA_FROM_DEVICE);
611 lnk->wr_rx_dma_addr = 0;
612out:
613 return rc;
614}
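
smc_wr_alloc_link_mem() above uses the kernel's usual goto-unwind error handling: each failure label frees only what the earlier steps already allocated, in reverse order, and the success path never reaches the labels. A minimal sketch of that idiom, with hypothetical names and counts rather than the real SMC link fields:

/* Hedged sketch of the goto-unwind idiom in smc_wr_alloc_link_mem();
 * demo_link and the sizes below are illustrative, not the SMC layout.
 */
#include <linux/bitops.h>
#include <linux/slab.h>

struct demo_link {
	void *tx_bufs;
	void *rx_bufs;
	unsigned long *tx_mask;
};

static int demo_alloc_link_mem(struct demo_link *lnk)
{
	lnk->tx_bufs = kcalloc(16, 64, GFP_KERNEL);
	if (!lnk->tx_bufs)
		goto no_mem;
	lnk->rx_bufs = kcalloc(16 * 3, 64, GFP_KERNEL);
	if (!lnk->rx_bufs)
		goto no_mem_tx_bufs;
	lnk->tx_mask = kzalloc(BITS_TO_LONGS(16) * sizeof(*lnk->tx_mask),
			       GFP_KERNEL);
	if (!lnk->tx_mask)
		goto no_mem_rx_bufs;
	return 0;

no_mem_rx_bufs:			/* each label frees what earlier steps got */
	kfree(lnk->rx_bufs);
no_mem_tx_bufs:
	kfree(lnk->tx_bufs);
no_mem:
	return -ENOMEM;
}

On failure the caller sees -ENOMEM with nothing leaked; on success the later teardown (here smc_wr_free_link_mem()) undoes all allocations in one place.
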
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
new file mode 100644
index 000000000000..0b9beeda6053
--- /dev/null
+++ b/net/smc/smc_wr.h
@@ -0,0 +1,106 @@
1/*
2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
3 *
4 * Work Requests exploiting Infiniband API
5 *
6 * Copyright IBM Corp. 2016
7 *
8 * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
9 */
10
11#ifndef SMC_WR_H
12#define SMC_WR_H
13
14#include <linux/atomic.h>
15#include <rdma/ib_verbs.h>
16#include <asm/div64.h>
17
18#include "smc.h"
19#include "smc_core.h"
20
21#define SMC_WR_MAX_CQE 32768 /* max. # of completion queue elements */
22#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */
23
24#define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ)
25#define SMC_WR_TX_WAIT_PENDING_TIME (5 * HZ)
26
27#define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */
28
29#define SMC_WR_TX_PEND_PRIV_SIZE 32
30
31struct smc_wr_tx_pend_priv {
32 u8 priv[SMC_WR_TX_PEND_PRIV_SIZE];
33};
34
35typedef void (*smc_wr_tx_handler)(struct smc_wr_tx_pend_priv *,
36 struct smc_link *,
37 enum ib_wc_status);
38
39typedef bool (*smc_wr_tx_filter)(struct smc_wr_tx_pend_priv *,
40 unsigned long);
41
42typedef void (*smc_wr_tx_dismisser)(struct smc_wr_tx_pend_priv *);
43
44struct smc_wr_rx_handler {
45 struct hlist_node list; /* hash table collision resolution */
46 void (*handler)(struct ib_wc *, void *);
47 u8 type;
48};
49
50/* Only used by RDMA write WRs.
51 * All other WRs (CDC/LLC) use smc_wr_tx_send(), which sets the WR ID implicitly
52 */
53static inline long smc_wr_tx_get_next_wr_id(struct smc_link *link)
54{
55 return atomic_long_inc_return(&link->wr_tx_id);
56}
57
58static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val)
59{
60 atomic_long_set(wr_tx_id, val);
61}
62
63/* post a new receive work request to fill a completed old work request entry */
64static inline int smc_wr_rx_post(struct smc_link *link)
65{
66 struct ib_recv_wr *bad_recv_wr = NULL;
67 int rc;
68 u64 wr_id, temp_wr_id;
69 u32 index;
70
71 wr_id = ++link->wr_rx_id; /* tasklet context, thus not atomic */
72 temp_wr_id = wr_id;
73 index = do_div(temp_wr_id, link->wr_rx_cnt);
74 link->wr_rx_ibs[index].wr_id = wr_id;
75 rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], &bad_recv_wr);
76 return rc;
77}
78
79int smc_wr_create_link(struct smc_link *lnk);
80int smc_wr_alloc_link_mem(struct smc_link *lnk);
81void smc_wr_free_link(struct smc_link *lnk);
82void smc_wr_free_link_mem(struct smc_link *lnk);
83void smc_wr_remember_qp_attr(struct smc_link *lnk);
84void smc_wr_remove_dev(struct smc_ib_device *smcibdev);
85void smc_wr_add_dev(struct smc_ib_device *smcibdev);
86
87int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler,
88 struct smc_wr_buf **wr_buf,
89 struct smc_wr_tx_pend_priv **wr_pend_priv);
90int smc_wr_tx_put_slot(struct smc_link *link,
91 struct smc_wr_tx_pend_priv *wr_pend_priv);
92int smc_wr_tx_send(struct smc_link *link,
93 struct smc_wr_tx_pend_priv *wr_pend_priv);
94void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
95bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type,
96 smc_wr_tx_filter filter, unsigned long data);
97void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type,
98 smc_wr_tx_filter filter,
99 smc_wr_tx_dismisser dismisser,
100 unsigned long data);
101
102int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler);
103int smc_wr_rx_post_init(struct smc_link *link);
104void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
105
106#endif /* SMC_WR_H */
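
smc_wr_rx_post() above maps the monotonically increasing wr_rx_id onto one of the receive ring slots with do_div(). A small sketch of that mapping, assuming the same ring sizing as in smc_wr.c (SMC_WR_BUF_CNT * 3 receive buffers):

/* Hedged sketch: do_div() divides in place and returns the remainder,
 * which is used as the ring index; the 64-bit id itself stays unique.
 */
#include <linux/types.h>
#include <asm/div64.h>

static inline u32 demo_rx_slot(u64 wr_id, u32 rx_cnt)
{
	u64 tmp = wr_id;

	return do_div(tmp, rx_cnt);	/* tmp now holds the quotient */
}

With SMC_WR_BUF_CNT set to 16 the ring has 48 slots, so wr_id 49 lands in slot 1 again while the full 64-bit id still uniquely tags the completion.
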
diff --git a/net/socket.c b/net/socket.c
index 73dc69f9681e..985ef06792d6 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -90,7 +90,7 @@
90#include <linux/slab.h> 90#include <linux/slab.h>
91#include <linux/xattr.h> 91#include <linux/xattr.h>
92 92
93#include <asm/uaccess.h> 93#include <linux/uaccess.h>
94#include <asm/unistd.h> 94#include <asm/unistd.h>
95 95
96#include <net/compat.h> 96#include <net/compat.h>
@@ -287,7 +287,7 @@ static void init_once(void *foo)
287 inode_init_once(&ei->vfs_inode); 287 inode_init_once(&ei->vfs_inode);
288} 288}
289 289
290static int init_inodecache(void) 290static void init_inodecache(void)
291{ 291{
292 sock_inode_cachep = kmem_cache_create("sock_inode_cache", 292 sock_inode_cachep = kmem_cache_create("sock_inode_cache",
293 sizeof(struct socket_alloc), 293 sizeof(struct socket_alloc),
@@ -296,9 +296,7 @@ static int init_inodecache(void)
296 SLAB_RECLAIM_ACCOUNT | 296 SLAB_RECLAIM_ACCOUNT |
297 SLAB_MEM_SPREAD | SLAB_ACCOUNT), 297 SLAB_MEM_SPREAD | SLAB_ACCOUNT),
298 init_once); 298 init_once);
299 if (sock_inode_cachep == NULL) 299 BUG_ON(sock_inode_cachep == NULL);
300 return -ENOMEM;
301 return 0;
302} 300}
303 301
304static const struct super_operations sockfs_ops = { 302static const struct super_operations sockfs_ops = {
@@ -533,8 +531,22 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
533 return used; 531 return used;
534} 532}
535 533
534static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr)
535{
536 int err = simple_setattr(dentry, iattr);
537
538 if (!err && (iattr->ia_valid & ATTR_UID)) {
539 struct socket *sock = SOCKET_I(d_inode(dentry));
540
541 sock->sk->sk_uid = iattr->ia_uid;
542 }
543
544 return err;
545}
546
536static const struct inode_operations sockfs_inode_ops = { 547static const struct inode_operations sockfs_inode_ops = {
537 .listxattr = sockfs_listxattr, 548 .listxattr = sockfs_listxattr,
549 .setattr = sockfs_setattr,
538}; 550};
539 551
540/** 552/**
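
The new sockfs_setattr() hook above lets a chown on a socket's inode propagate the new owner into sk->sk_uid. A hedged user-space sketch of how that path is exercised; the uid is an arbitrary example:

/* Hedged user-space sketch: fchown() on a socket descriptor now reaches
 * sockfs_setattr() and updates sk->sk_uid; uid 1000 is arbitrary.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	int sk = socket(AF_INET, SOCK_DGRAM, 0);

	if (sk < 0 || fchown(sk, 1000, (gid_t)-1) < 0)
		perror("socket/fchown");
	return 0;
}
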
@@ -640,6 +652,16 @@ int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
640} 652}
641EXPORT_SYMBOL(kernel_sendmsg); 653EXPORT_SYMBOL(kernel_sendmsg);
642 654
655static bool skb_is_err_queue(const struct sk_buff *skb)
656{
 657 /* The pkt_type of skbs enqueued on the error queue is set to
658 * PACKET_OUTGOING in skb_set_err_queue(). This is only safe to do
659 * in recvmsg, since skbs received on a local socket will never
660 * have a pkt_type of PACKET_OUTGOING.
661 */
662 return skb->pkt_type == PACKET_OUTGOING;
663}
664
643/* 665/*
644 * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) 666 * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
645 */ 667 */
@@ -654,7 +676,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
654 676
655 /* Race occurred between timestamp enabling and packet 677 /* Race occurred between timestamp enabling and packet
656 receiving. Fill in the current time for now. */ 678 receiving. Fill in the current time for now. */
657 if (need_software_tstamp && skb->tstamp.tv64 == 0) 679 if (need_software_tstamp && skb->tstamp == 0)
658 __net_timestamp(skb); 680 __net_timestamp(skb);
659 681
660 if (need_software_tstamp) { 682 if (need_software_tstamp) {
@@ -679,9 +701,15 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
679 (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && 701 (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
680 ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) 702 ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2))
681 empty = 0; 703 empty = 0;
682 if (!empty) 704 if (!empty) {
683 put_cmsg(msg, SOL_SOCKET, 705 put_cmsg(msg, SOL_SOCKET,
684 SCM_TIMESTAMPING, sizeof(tss), &tss); 706 SCM_TIMESTAMPING, sizeof(tss), &tss);
707
708 if (skb_is_err_queue(skb) && skb->len &&
709 SKB_EXT_ERR(skb)->opt_stats)
710 put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS,
711 skb->len, skb->data);
712 }
685} 713}
686EXPORT_SYMBOL_GPL(__sock_recv_timestamp); 714EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
687 715
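
The hunk above attaches a second control message, SCM_TIMESTAMPING_OPT_STATS, to timestamps delivered over the error queue. A hedged user-space sketch of draining such a message, assuming the socket already has SO_TIMESTAMPING enabled with the OPT_STATS flag; the fallback #defines mirror the asm-generic/socket.h values current at this change:

/* Hedged user-space sketch, not part of this patch: drain one error
 * queue message and walk its control messages.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/errqueue.h>

#ifndef SCM_TIMESTAMPING
#define SCM_TIMESTAMPING 37		/* == SO_TIMESTAMPING */
#endif
#ifndef SCM_TIMESTAMPING_OPT_STATS
#define SCM_TIMESTAMPING_OPT_STATS 54
#endif

static void demo_drain_errqueue(int fd)
{
	char data[256], ctrl[512];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = ctrl, .msg_controllen = sizeof(ctrl),
	};
	struct cmsghdr *cm;

	if (recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT) < 0)
		return;
	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level != SOL_SOCKET)
			continue;
		if (cm->cmsg_type == SCM_TIMESTAMPING)
			printf("tx timestamps received\n");
		else if (cm->cmsg_type == SCM_TIMESTAMPING_OPT_STATS)
			printf("%zu bytes of TCP stats\n",
			       (size_t)(cm->cmsg_len - CMSG_LEN(0)));
	}
}
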
@@ -892,6 +920,11 @@ static long sock_do_ioctl(struct net *net, struct socket *sock,
892 * what to do with it - that's up to the protocol still. 920 * what to do with it - that's up to the protocol still.
893 */ 921 */
894 922
923static struct ns_common *get_net_ns(struct ns_common *ns)
924{
925 return &get_net(container_of(ns, struct net, ns))->ns;
926}
927
895static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) 928static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
896{ 929{
897 struct socket *sock; 930 struct socket *sock;
@@ -960,6 +993,13 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
960 err = dlci_ioctl_hook(cmd, argp); 993 err = dlci_ioctl_hook(cmd, argp);
961 mutex_unlock(&dlci_ioctl_mutex); 994 mutex_unlock(&dlci_ioctl_mutex);
962 break; 995 break;
996 case SIOCGSKNS:
997 err = -EPERM;
998 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
999 break;
1000
1001 err = open_related_ns(&net->ns, get_net_ns);
1002 break;
963 default: 1003 default:
964 err = sock_do_ioctl(net, sock, cmd, arg); 1004 err = sock_do_ioctl(net, sock, cmd, arg);
965 break; 1005 break;
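
The SIOCGSKNS ioctl added above hands back a file descriptor for the network namespace a socket belongs to, guarded by CAP_NET_ADMIN. A minimal user-space sketch; error handling is reduced to perror():

/* Hedged user-space sketch: SIOCGSKNS returns an fd referring to the
 * socket's network namespace; it needs CAP_NET_ADMIN over that netns.
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/sockios.h>

int main(void)
{
	int sk = socket(AF_INET, SOCK_DGRAM, 0);
	int nsfd = ioctl(sk, SIOCGSKNS);

	if (nsfd < 0)
		perror("SIOCGSKNS");
	else
		printf("netns fd: %d\n", nsfd);
	return 0;
}

The returned descriptor is suitable for setns(2), which is the point of exposing it.
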
@@ -1477,7 +1517,7 @@ SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
1477 if (err) 1517 if (err)
1478 goto out_fd; 1518 goto out_fd;
1479 1519
1480 err = sock->ops->accept(sock, newsock, sock->file->f_flags); 1520 err = sock->ops->accept(sock, newsock, sock->file->f_flags, false);
1481 if (err < 0) 1521 if (err < 0)
1482 goto out_fd; 1522 goto out_fd;
1483 1523
@@ -1702,6 +1742,7 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
1702 /* We assume all kernel code knows the size of sockaddr_storage */ 1742 /* We assume all kernel code knows the size of sockaddr_storage */
1703 msg.msg_namelen = 0; 1743 msg.msg_namelen = 0;
1704 msg.msg_iocb = NULL; 1744 msg.msg_iocb = NULL;
1745 msg.msg_flags = 0;
1705 if (sock->file->f_flags & O_NONBLOCK) 1746 if (sock->file->f_flags & O_NONBLOCK)
1706 flags |= MSG_DONTWAIT; 1747 flags |= MSG_DONTWAIT;
1707 err = sock_recvmsg(sock, &msg, flags); 1748 err = sock_recvmsg(sock, &msg, flags);
@@ -1887,7 +1928,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
1887 struct sockaddr_storage address; 1928 struct sockaddr_storage address;
1888 struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; 1929 struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
1889 unsigned char ctl[sizeof(struct cmsghdr) + 20] 1930 unsigned char ctl[sizeof(struct cmsghdr) + 20]
1890 __attribute__ ((aligned(sizeof(__kernel_size_t)))); 1931 __aligned(sizeof(__kernel_size_t));
1891 /* 20 is size of ipv6_pktinfo */ 1932 /* 20 is size of ipv6_pktinfo */
1892 unsigned char *ctl_buf = ctl; 1933 unsigned char *ctl_buf = ctl;
1893 int ctl_len; 1934 int ctl_len;
@@ -1917,6 +1958,8 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
1917 ctl_buf = msg_sys->msg_control; 1958 ctl_buf = msg_sys->msg_control;
1918 ctl_len = msg_sys->msg_controllen; 1959 ctl_len = msg_sys->msg_controllen;
1919 } else if (ctl_len) { 1960 } else if (ctl_len) {
1961 BUILD_BUG_ON(sizeof(struct cmsghdr) !=
1962 CMSG_ALIGN(sizeof(struct cmsghdr)));
1920 if (ctl_len > sizeof(ctl)) { 1963 if (ctl_len > sizeof(ctl)) {
1921 ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL); 1964 ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
1922 if (ctl_buf == NULL) 1965 if (ctl_buf == NULL)
@@ -2197,8 +2240,10 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
2197 return err; 2240 return err;
2198 2241
2199 err = sock_error(sock->sk); 2242 err = sock_error(sock->sk);
2200 if (err) 2243 if (err) {
2244 datagrams = err;
2201 goto out_put; 2245 goto out_put;
2246 }
2202 2247
2203 entry = mmsg; 2248 entry = mmsg;
2204 compat_entry = (struct compat_mmsghdr __user *)mmsg; 2249 compat_entry = (struct compat_mmsghdr __user *)mmsg;
@@ -3110,6 +3155,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
3110 case SIOCSIFVLAN: 3155 case SIOCSIFVLAN:
3111 case SIOCADDDLCI: 3156 case SIOCADDDLCI:
3112 case SIOCDELDLCI: 3157 case SIOCDELDLCI:
3158 case SIOCGSKNS:
3113 return sock_ioctl(file, cmd, arg); 3159 return sock_ioctl(file, cmd, arg);
3114 3160
3115 case SIOCGIFFLAGS: 3161 case SIOCGIFFLAGS:
@@ -3204,7 +3250,7 @@ int kernel_accept(struct socket *sock, struct socket **newsock, int flags)
3204 if (err < 0) 3250 if (err < 0)
3205 goto done; 3251 goto done;
3206 3252
3207 err = sock->ops->accept(sock, *newsock, flags); 3253 err = sock->ops->accept(sock, *newsock, flags, true);
3208 if (err < 0) { 3254 if (err < 0) {
3209 sock_release(*newsock); 3255 sock_release(*newsock);
3210 *newsock = NULL; 3256 *newsock = NULL;
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 41adf362936d..b5c279b22680 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -504,6 +504,7 @@ static int __init strp_mod_init(void)
504 504
505static void __exit strp_mod_exit(void) 505static void __exit strp_mod_exit(void)
506{ 506{
507 destroy_workqueue(strp_wq);
507} 508}
508module_init(strp_mod_init); 509module_init(strp_mod_init);
509module_exit(strp_mod_exit); 510module_exit(strp_mod_exit);
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 2bff63a73cf8..d2623b9f23d6 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/cred.h>
11#include <linux/module.h> 12#include <linux/module.h>
12#include <linux/slab.h> 13#include <linux/slab.h>
13#include <linux/errno.h> 14#include <linux/errno.h>
@@ -464,8 +465,10 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
464 * Note that the cred_unused list must be time-ordered. 465 * Note that the cred_unused list must be time-ordered.
465 */ 466 */
466 if (time_in_range(cred->cr_expire, expired, jiffies) && 467 if (time_in_range(cred->cr_expire, expired, jiffies) &&
467 test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) 468 test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) {
469 freed = SHRINK_STOP;
468 break; 470 break;
471 }
469 472
470 list_del_init(&cred->cr_lru); 473 list_del_init(&cred->cr_lru);
471 number_cred_unused--; 474 number_cred_unused--;
@@ -520,7 +523,7 @@ static unsigned long
520rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) 523rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
521 524
522{ 525{
523 return (number_cred_unused / 100) * sysctl_vfs_cache_pressure; 526 return number_cred_unused * sysctl_vfs_cache_pressure / 100;
524} 527}
525 528
526static void 529static void
@@ -646,9 +649,6 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
646 cred->cr_auth = auth; 649 cred->cr_auth = auth;
647 cred->cr_ops = ops; 650 cred->cr_ops = ops;
648 cred->cr_expire = jiffies; 651 cred->cr_expire = jiffies;
649#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
650 cred->cr_magic = RPCAUTH_CRED_MAGIC;
651#endif
652 cred->cr_uid = acred->uid; 652 cred->cr_uid = acred->uid;
653} 653}
654EXPORT_SYMBOL_GPL(rpcauth_init_cred); 654EXPORT_SYMBOL_GPL(rpcauth_init_cred);
@@ -876,8 +876,12 @@ int __init rpcauth_init_module(void)
876 err = rpc_init_generic_auth(); 876 err = rpc_init_generic_auth();
877 if (err < 0) 877 if (err < 0)
878 goto out2; 878 goto out2;
879 register_shrinker(&rpc_cred_shrinker); 879 err = register_shrinker(&rpc_cred_shrinker);
880 if (err < 0)
881 goto out3;
880 return 0; 882 return 0;
883out3:
884 rpc_destroy_generic_auth();
881out2: 885out2:
882 rpc_destroy_authunix(); 886 rpc_destroy_authunix();
883out1: 887out1:
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 3dfd769dc5b5..4f16953e4954 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -50,7 +50,7 @@
50#include <linux/workqueue.h> 50#include <linux/workqueue.h>
51#include <linux/sunrpc/rpc_pipe_fs.h> 51#include <linux/sunrpc/rpc_pipe_fs.h>
52#include <linux/sunrpc/gss_api.h> 52#include <linux/sunrpc/gss_api.h>
53#include <asm/uaccess.h> 53#include <linux/uaccess.h>
54#include <linux/hashtable.h> 54#include <linux/hashtable.h>
55 55
56#include "../netns.h" 56#include "../netns.h"
@@ -541,9 +541,13 @@ gss_setup_upcall(struct gss_auth *gss_auth, struct rpc_cred *cred)
541 return gss_new; 541 return gss_new;
542 gss_msg = gss_add_msg(gss_new); 542 gss_msg = gss_add_msg(gss_new);
543 if (gss_msg == gss_new) { 543 if (gss_msg == gss_new) {
544 int res = rpc_queue_upcall(gss_new->pipe, &gss_new->msg); 544 int res;
545 atomic_inc(&gss_msg->count);
546 res = rpc_queue_upcall(gss_new->pipe, &gss_new->msg);
545 if (res) { 547 if (res) {
546 gss_unhash_msg(gss_new); 548 gss_unhash_msg(gss_new);
549 atomic_dec(&gss_msg->count);
550 gss_release_msg(gss_new);
547 gss_msg = ERR_PTR(res); 551 gss_msg = ERR_PTR(res);
548 } 552 }
549 } else 553 } else
@@ -759,7 +763,7 @@ err_put_ctx:
759err: 763err:
760 kfree(buf); 764 kfree(buf);
761out: 765out:
762 dprintk("RPC: %s returning %Zd\n", __func__, err); 766 dprintk("RPC: %s returning %zd\n", __func__, err);
763 return err; 767 return err;
764} 768}
765 769
@@ -836,6 +840,7 @@ gss_pipe_destroy_msg(struct rpc_pipe_msg *msg)
836 warn_gssd(); 840 warn_gssd();
837 gss_release_msg(gss_msg); 841 gss_release_msg(gss_msg);
838 } 842 }
843 gss_release_msg(gss_msg);
839} 844}
840 845
841static void gss_pipe_dentry_destroy(struct dentry *dir, 846static void gss_pipe_dentry_destroy(struct dentry *dir,
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 90115ceefd49..fb39284ec174 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -200,7 +200,7 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
200 if (IS_ERR(hmac_md5)) 200 if (IS_ERR(hmac_md5))
201 goto out_free_md5; 201 goto out_free_md5;
202 202
203 req = ahash_request_alloc(md5, GFP_KERNEL); 203 req = ahash_request_alloc(md5, GFP_NOFS);
204 if (!req) 204 if (!req)
205 goto out_free_hmac_md5; 205 goto out_free_hmac_md5;
206 206
@@ -230,7 +230,7 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
230 goto out; 230 goto out;
231 231
232 ahash_request_free(req); 232 ahash_request_free(req);
233 req = ahash_request_alloc(hmac_md5, GFP_KERNEL); 233 req = ahash_request_alloc(hmac_md5, GFP_NOFS);
234 if (!req) 234 if (!req)
235 goto out_free_hmac_md5; 235 goto out_free_hmac_md5;
236 236
@@ -299,7 +299,7 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
299 if (IS_ERR(tfm)) 299 if (IS_ERR(tfm))
300 goto out_free_cksum; 300 goto out_free_cksum;
301 301
302 req = ahash_request_alloc(tfm, GFP_KERNEL); 302 req = ahash_request_alloc(tfm, GFP_NOFS);
303 if (!req) 303 if (!req)
304 goto out_free_ahash; 304 goto out_free_ahash;
305 305
@@ -397,7 +397,7 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
397 goto out_free_cksum; 397 goto out_free_cksum;
398 checksumlen = crypto_ahash_digestsize(tfm); 398 checksumlen = crypto_ahash_digestsize(tfm);
399 399
400 req = ahash_request_alloc(tfm, GFP_KERNEL); 400 req = ahash_request_alloc(tfm, GFP_NOFS);
401 if (!req) 401 if (!req)
402 goto out_free_ahash; 402 goto out_free_ahash;
403 403
@@ -963,7 +963,7 @@ krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher,
963 } 963 }
964 964
965 desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), 965 desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
966 GFP_KERNEL); 966 GFP_NOFS);
967 if (!desc) { 967 if (!desc) {
968 dprintk("%s: failed to allocate shash descriptor for '%s'\n", 968 dprintk("%s: failed to allocate shash descriptor for '%s'\n",
969 __func__, kctx->gk5e->cksum_name); 969 __func__, kctx->gk5e->cksum_name);
@@ -1030,7 +1030,7 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher,
1030 } 1030 }
1031 1031
1032 desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), 1032 desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
1033 GFP_KERNEL); 1033 GFP_NOFS);
1034 if (!desc) { 1034 if (!desc) {
1035 dprintk("%s: failed to allocate shash descriptor for '%s'\n", 1035 dprintk("%s: failed to allocate shash descriptor for '%s'\n",
1036 __func__, kctx->gk5e->cksum_name); 1036 __func__, kctx->gk5e->cksum_name);
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 60595835317a..7bb2514aadd9 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -451,8 +451,7 @@ context_derive_keys_rc4(struct krb5_ctx *ctx)
451 goto out_err_free_hmac; 451 goto out_err_free_hmac;
452 452
453 453
454 desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), 454 desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), GFP_NOFS);
455 GFP_KERNEL);
456 if (!desc) { 455 if (!desc) {
457 dprintk("%s: failed to allocate hash descriptor for '%s'\n", 456 dprintk("%s: failed to allocate hash descriptor for '%s'\n",
458 __func__, ctx->gk5e->cksum_name); 457 __func__, ctx->gk5e->cksum_name);
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index dc6fb79a361f..25d9a9cf7b66 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -260,7 +260,7 @@ static int gssx_dec_option_array(struct xdr_stream *xdr,
260 if (!oa->data) 260 if (!oa->data)
261 return -ENOMEM; 261 return -ENOMEM;
262 262
263 creds = kmalloc(sizeof(struct svc_cred), GFP_KERNEL); 263 creds = kzalloc(sizeof(struct svc_cred), GFP_KERNEL);
264 if (!creds) { 264 if (!creds) {
265 kfree(oa->data); 265 kfree(oa->data);
266 return -ENOMEM; 266 return -ENOMEM;
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 45662d7f0943..a54a7a3d28f5 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1489,8 +1489,8 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
1489 case RPC_GSS_PROC_DESTROY: 1489 case RPC_GSS_PROC_DESTROY:
1490 if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) 1490 if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
1491 goto auth_err; 1491 goto auth_err;
1492 rsci->h.expiry_time = get_seconds(); 1492 /* Delete the entry from the cache_list and call cache_put */
1493 set_bit(CACHE_NEGATIVE, &rsci->h.flags); 1493 sunrpc_cache_unhash(sn->rsc_cache, &rsci->h);
1494 if (resv->iov_len + 4 > PAGE_SIZE) 1494 if (resv->iov_len + 4 > PAGE_SIZE)
1495 goto drop; 1495 goto drop;
1496 svc_putnl(resv, RPC_SUCCESS); 1496 svc_putnl(resv, RPC_SUCCESS);
@@ -1548,7 +1548,7 @@ complete:
1548 ret = SVC_COMPLETE; 1548 ret = SVC_COMPLETE;
1549 goto out; 1549 goto out;
1550drop: 1550drop:
1551 ret = SVC_DROP; 1551 ret = SVC_CLOSE;
1552out: 1552out:
1553 if (rsci) 1553 if (rsci)
1554 cache_put(&rsci->h, sn->rsc_cache); 1554 cache_put(&rsci->h, sn->rsc_cache);
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 4d17376b2acb..5f3d527dff65 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -139,7 +139,4 @@ struct rpc_cred null_cred = {
139 .cr_ops = &null_credops, 139 .cr_ops = &null_credops,
140 .cr_count = ATOMIC_INIT(1), 140 .cr_count = ATOMIC_INIT(1),
141 .cr_flags = 1UL << RPCAUTH_CRED_UPTODATE, 141 .cr_flags = 1UL << RPCAUTH_CRED_UPTODATE,
142#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
143 .cr_magic = RPCAUTH_CRED_MAGIC,
144#endif
145}; 142};
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 306fc0f54596..82337e1ec9cd 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -14,12 +14,10 @@
14#include <linux/sunrpc/auth.h> 14#include <linux/sunrpc/auth.h>
15#include <linux/user_namespace.h> 15#include <linux/user_namespace.h>
16 16
17#define NFS_NGROUPS 16
18
19struct unx_cred { 17struct unx_cred {
20 struct rpc_cred uc_base; 18 struct rpc_cred uc_base;
21 kgid_t uc_gid; 19 kgid_t uc_gid;
22 kgid_t uc_gids[NFS_NGROUPS]; 20 kgid_t uc_gids[UNX_NGROUPS];
23}; 21};
24#define uc_uid uc_base.cr_uid 22#define uc_uid uc_base.cr_uid
25 23
@@ -82,13 +80,13 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t
82 80
83 if (acred->group_info != NULL) 81 if (acred->group_info != NULL)
84 groups = acred->group_info->ngroups; 82 groups = acred->group_info->ngroups;
85 if (groups > NFS_NGROUPS) 83 if (groups > UNX_NGROUPS)
86 groups = NFS_NGROUPS; 84 groups = UNX_NGROUPS;
87 85
88 cred->uc_gid = acred->gid; 86 cred->uc_gid = acred->gid;
89 for (i = 0; i < groups; i++) 87 for (i = 0; i < groups; i++)
90 cred->uc_gids[i] = acred->group_info->gid[i]; 88 cred->uc_gids[i] = acred->group_info->gid[i];
91 if (i < NFS_NGROUPS) 89 if (i < UNX_NGROUPS)
92 cred->uc_gids[i] = INVALID_GID; 90 cred->uc_gids[i] = INVALID_GID;
93 91
94 return &cred->uc_base; 92 return &cred->uc_base;
@@ -132,12 +130,12 @@ unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags)
132 130
133 if (acred->group_info != NULL) 131 if (acred->group_info != NULL)
134 groups = acred->group_info->ngroups; 132 groups = acred->group_info->ngroups;
135 if (groups > NFS_NGROUPS) 133 if (groups > UNX_NGROUPS)
136 groups = NFS_NGROUPS; 134 groups = UNX_NGROUPS;
137 for (i = 0; i < groups ; i++) 135 for (i = 0; i < groups ; i++)
138 if (!gid_eq(cred->uc_gids[i], acred->group_info->gid[i])) 136 if (!gid_eq(cred->uc_gids[i], acred->group_info->gid[i]))
139 return 0; 137 return 0;
140 if (groups < NFS_NGROUPS && gid_valid(cred->uc_gids[groups])) 138 if (groups < UNX_NGROUPS && gid_valid(cred->uc_gids[groups]))
141 return 0; 139 return 0;
142 return 1; 140 return 1;
143} 141}
@@ -166,7 +164,7 @@ unx_marshal(struct rpc_task *task, __be32 *p)
166 *p++ = htonl((u32) from_kuid(&init_user_ns, cred->uc_uid)); 164 *p++ = htonl((u32) from_kuid(&init_user_ns, cred->uc_uid));
167 *p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gid)); 165 *p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gid));
168 hold = p++; 166 hold = p++;
169 for (i = 0; i < 16 && gid_valid(cred->uc_gids[i]); i++) 167 for (i = 0; i < UNX_NGROUPS && gid_valid(cred->uc_gids[i]); i++)
170 *p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gids[i])); 168 *p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gids[i]));
171 *hold = htonl(p - hold - 1); /* gid array length */ 169 *hold = htonl(p - hold - 1); /* gid array length */
172 *base = htonl((p - base - 1) << 2); /* cred length */ 170 *base = htonl((p - base - 1) << 2); /* cred length */
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 8aabe12201f8..79d55d949d9a 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -21,7 +21,7 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/ctype.h> 22#include <linux/ctype.h>
23#include <linux/string_helpers.h> 23#include <linux/string_helpers.h>
24#include <asm/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/poll.h> 25#include <linux/poll.h>
26#include <linux/seq_file.h> 26#include <linux/seq_file.h>
27#include <linux/proc_fs.h> 27#include <linux/proc_fs.h>
@@ -362,11 +362,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd)
362 cache_purge(cd); 362 cache_purge(cd);
363 spin_lock(&cache_list_lock); 363 spin_lock(&cache_list_lock);
364 write_lock(&cd->hash_lock); 364 write_lock(&cd->hash_lock);
365 if (cd->entries) {
366 write_unlock(&cd->hash_lock);
367 spin_unlock(&cache_list_lock);
368 goto out;
369 }
370 if (current_detail == cd) 365 if (current_detail == cd)
371 current_detail = NULL; 366 current_detail = NULL;
372 list_del_init(&cd->others); 367 list_del_init(&cd->others);
@@ -376,9 +371,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd)
376 /* module must be being unloaded so it's safe to kill the worker */ 371 /* module must be being unloaded so it's safe to kill the worker */
377 cancel_delayed_work_sync(&cache_cleaner); 372 cancel_delayed_work_sync(&cache_cleaner);
378 } 373 }
379 return;
380out:
381 printk(KERN_ERR "RPC: failed to unregister %s cache\n", cd->name);
382} 374}
383EXPORT_SYMBOL_GPL(sunrpc_destroy_cache_detail); 375EXPORT_SYMBOL_GPL(sunrpc_destroy_cache_detail);
384 376
@@ -497,13 +489,32 @@ EXPORT_SYMBOL_GPL(cache_flush);
497 489
498void cache_purge(struct cache_detail *detail) 490void cache_purge(struct cache_detail *detail)
499{ 491{
500 time_t now = seconds_since_boot(); 492 struct cache_head *ch = NULL;
501 if (detail->flush_time >= now) 493 struct hlist_head *head = NULL;
502 now = detail->flush_time + 1; 494 struct hlist_node *tmp = NULL;
503 /* 'now' is the maximum value any 'last_refresh' can have */ 495 int i = 0;
504 detail->flush_time = now; 496
505 detail->nextcheck = seconds_since_boot(); 497 write_lock(&detail->hash_lock);
506 cache_flush(); 498 if (!detail->entries) {
499 write_unlock(&detail->hash_lock);
500 return;
501 }
502
503 dprintk("RPC: %d entries in %s cache\n", detail->entries, detail->name);
504 for (i = 0; i < detail->hash_size; i++) {
505 head = &detail->hash_table[i];
506 hlist_for_each_entry_safe(ch, tmp, head, cache_list) {
507 hlist_del_init(&ch->cache_list);
508 detail->entries--;
509
510 set_bit(CACHE_CLEANED, &ch->flags);
511 write_unlock(&detail->hash_lock);
512 cache_fresh_unlocked(ch, detail);
513 cache_put(ch, detail);
514 write_lock(&detail->hash_lock);
515 }
516 }
517 write_unlock(&detail->hash_lock);
507} 518}
508EXPORT_SYMBOL_GPL(cache_purge); 519EXPORT_SYMBOL_GPL(cache_purge);
509 520
@@ -717,7 +728,7 @@ void cache_clean_deferred(void *owner)
717/* 728/*
718 * communicate with user-space 729 * communicate with user-space
719 * 730 *
720 * We have a magic /proc file - /proc/sunrpc/<cachename>/channel. 731 * We have a magic /proc file - /proc/net/rpc/<cachename>/channel.
721 * On read, you get a full request, or block. 732 * On read, you get a full request, or block.
722 * On write, an update request is processed. 733 * On write, an update request is processed.
723 * Poll works if anything to read, and always allows write. 734 * Poll works if anything to read, and always allows write.
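
The channel file described above is the request/reply pipe between a cache and its user-space helper. A hedged sketch of the daemon side; the reply built here is only a placeholder, since each cache defines its own line layout via cache_request()/cache_parse():

/* Hedged user-space sketch of a cache channel helper loop. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void demo_channel_loop(const char *path)
{
	char req[4096];
	ssize_t n;
	int fd = open(path, O_RDWR);

	if (fd < 0)
		return;
	/* each read() blocks until one complete request line is available */
	while ((n = read(fd, req, sizeof(req) - 1)) > 0) {
		char reply[4352];
		int len;

		req[n] = '\0';
		req[strcspn(req, "\n")] = '\0';
		/* placeholder: a real helper parses req and looks up the answer */
		len = snprintf(reply, sizeof(reply), "%s 3600 resolved\n", req);
		if (write(fd, reply, len) != len)
			break;
	}
	close(fd);
}
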
@@ -1272,7 +1283,7 @@ EXPORT_SYMBOL_GPL(qword_get);
1272 1283
1273 1284
1274/* 1285/*
1275 * support /proc/sunrpc/cache/$CACHENAME/content 1286 * support /proc/net/rpc/$CACHENAME/content
1276 * as a seqfile. 1287 * as a seqfile.
1277 * We call ->cache_show passing NULL for the item to 1288 * We call ->cache_show passing NULL for the item to
1278 * get a header, then pass each real item in the cache 1289 * get a header, then pass each real item in the cache
@@ -1358,7 +1369,7 @@ static int c_show(struct seq_file *m, void *p)
1358 ifdebug(CACHE) 1369 ifdebug(CACHE)
1359 seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n", 1370 seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n",
1360 convert_to_wallclock(cp->expiry_time), 1371 convert_to_wallclock(cp->expiry_time),
1361 atomic_read(&cp->ref.refcount), cp->flags); 1372 kref_read(&cp->ref), cp->flags);
1362 cache_get(cp); 1373 cache_get(cp);
1363 if (cache_check(cd, cp, NULL)) 1374 if (cache_check(cd, cp, NULL))
1364 /* cache_check does a cache_put on failure */ 1375 /* cache_check does a cache_put on failure */
@@ -1427,20 +1438,11 @@ static ssize_t read_flush(struct file *file, char __user *buf,
1427 struct cache_detail *cd) 1438 struct cache_detail *cd)
1428{ 1439{
1429 char tbuf[22]; 1440 char tbuf[22];
1430 unsigned long p = *ppos;
1431 size_t len; 1441 size_t len;
1432 1442
1433 snprintf(tbuf, sizeof(tbuf), "%lu\n", convert_to_wallclock(cd->flush_time)); 1443 len = snprintf(tbuf, sizeof(tbuf), "%lu\n",
1434 len = strlen(tbuf); 1444 convert_to_wallclock(cd->flush_time));
1435 if (p >= len) 1445 return simple_read_from_buffer(buf, count, ppos, tbuf, len);
1436 return 0;
1437 len -= p;
1438 if (len > count)
1439 len = count;
1440 if (copy_to_user(buf, (void*)(tbuf+p), len))
1441 return -EFAULT;
1442 *ppos += len;
1443 return len;
1444} 1446}
1445 1447
1446static ssize_t write_flush(struct file *file, const char __user *buf, 1448static ssize_t write_flush(struct file *file, const char __user *buf,
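
The rewritten read_flush() above drops the hand-rolled offset handling in favour of simple_read_from_buffer(), which takes care of *ppos, short reads and copy_to_user() itself. A minimal sketch of that idiom with a made-up value:

/* Hedged sketch of the simple_read_from_buffer() idiom: format into a
 * small stack buffer, let the helper handle *ppos and copy_to_user().
 */
#include <linux/fs.h>
#include <linux/kernel.h>

static ssize_t demo_read(struct file *file, char __user *buf,
			 size_t count, loff_t *ppos)
{
	char tbuf[32];
	size_t len = scnprintf(tbuf, sizeof(tbuf), "%lu\n", 42UL);

	return simple_read_from_buffer(buf, count, ppos, tbuf, len);
}
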
@@ -1600,21 +1602,12 @@ static const struct file_operations cache_flush_operations_procfs = {
1600 .llseek = no_llseek, 1602 .llseek = no_llseek,
1601}; 1603};
1602 1604
1603static void remove_cache_proc_entries(struct cache_detail *cd, struct net *net) 1605static void remove_cache_proc_entries(struct cache_detail *cd)
1604{ 1606{
1605 struct sunrpc_net *sn; 1607 if (cd->procfs) {
1606 1608 proc_remove(cd->procfs);
1607 if (cd->u.procfs.proc_ent == NULL) 1609 cd->procfs = NULL;
1608 return; 1610 }
1609 if (cd->u.procfs.flush_ent)
1610 remove_proc_entry("flush", cd->u.procfs.proc_ent);
1611 if (cd->u.procfs.channel_ent)
1612 remove_proc_entry("channel", cd->u.procfs.proc_ent);
1613 if (cd->u.procfs.content_ent)
1614 remove_proc_entry("content", cd->u.procfs.proc_ent);
1615 cd->u.procfs.proc_ent = NULL;
1616 sn = net_generic(net, sunrpc_net_id);
1617 remove_proc_entry(cd->name, sn->proc_net_rpc);
1618} 1611}
1619 1612
1620#ifdef CONFIG_PROC_FS 1613#ifdef CONFIG_PROC_FS
@@ -1624,38 +1617,30 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
1624 struct sunrpc_net *sn; 1617 struct sunrpc_net *sn;
1625 1618
1626 sn = net_generic(net, sunrpc_net_id); 1619 sn = net_generic(net, sunrpc_net_id);
1627 cd->u.procfs.proc_ent = proc_mkdir(cd->name, sn->proc_net_rpc); 1620 cd->procfs = proc_mkdir(cd->name, sn->proc_net_rpc);
1628 if (cd->u.procfs.proc_ent == NULL) 1621 if (cd->procfs == NULL)
1629 goto out_nomem; 1622 goto out_nomem;
1630 cd->u.procfs.channel_ent = NULL;
1631 cd->u.procfs.content_ent = NULL;
1632 1623
1633 p = proc_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR, 1624 p = proc_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR,
1634 cd->u.procfs.proc_ent, 1625 cd->procfs, &cache_flush_operations_procfs, cd);
1635 &cache_flush_operations_procfs, cd);
1636 cd->u.procfs.flush_ent = p;
1637 if (p == NULL) 1626 if (p == NULL)
1638 goto out_nomem; 1627 goto out_nomem;
1639 1628
1640 if (cd->cache_request || cd->cache_parse) { 1629 if (cd->cache_request || cd->cache_parse) {
1641 p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR, 1630 p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR,
1642 cd->u.procfs.proc_ent, 1631 cd->procfs, &cache_file_operations_procfs, cd);
1643 &cache_file_operations_procfs, cd);
1644 cd->u.procfs.channel_ent = p;
1645 if (p == NULL) 1632 if (p == NULL)
1646 goto out_nomem; 1633 goto out_nomem;
1647 } 1634 }
1648 if (cd->cache_show) { 1635 if (cd->cache_show) {
1649 p = proc_create_data("content", S_IFREG|S_IRUSR, 1636 p = proc_create_data("content", S_IFREG|S_IRUSR,
1650 cd->u.procfs.proc_ent, 1637 cd->procfs, &content_file_operations_procfs, cd);
1651 &content_file_operations_procfs, cd);
1652 cd->u.procfs.content_ent = p;
1653 if (p == NULL) 1638 if (p == NULL)
1654 goto out_nomem; 1639 goto out_nomem;
1655 } 1640 }
1656 return 0; 1641 return 0;
1657out_nomem: 1642out_nomem:
1658 remove_cache_proc_entries(cd, net); 1643 remove_cache_proc_entries(cd);
1659 return -ENOMEM; 1644 return -ENOMEM;
1660} 1645}
1661#else /* CONFIG_PROC_FS */ 1646#else /* CONFIG_PROC_FS */
@@ -1684,7 +1669,7 @@ EXPORT_SYMBOL_GPL(cache_register_net);
1684 1669
1685void cache_unregister_net(struct cache_detail *cd, struct net *net) 1670void cache_unregister_net(struct cache_detail *cd, struct net *net)
1686{ 1671{
1687 remove_cache_proc_entries(cd, net); 1672 remove_cache_proc_entries(cd);
1688 sunrpc_destroy_cache_detail(cd); 1673 sunrpc_destroy_cache_detail(cd);
1689} 1674}
1690EXPORT_SYMBOL_GPL(cache_unregister_net); 1675EXPORT_SYMBOL_GPL(cache_unregister_net);
@@ -1843,15 +1828,29 @@ int sunrpc_cache_register_pipefs(struct dentry *parent,
1843 struct dentry *dir = rpc_create_cache_dir(parent, name, umode, cd); 1828 struct dentry *dir = rpc_create_cache_dir(parent, name, umode, cd);
1844 if (IS_ERR(dir)) 1829 if (IS_ERR(dir))
1845 return PTR_ERR(dir); 1830 return PTR_ERR(dir);
1846 cd->u.pipefs.dir = dir; 1831 cd->pipefs = dir;
1847 return 0; 1832 return 0;
1848} 1833}
1849EXPORT_SYMBOL_GPL(sunrpc_cache_register_pipefs); 1834EXPORT_SYMBOL_GPL(sunrpc_cache_register_pipefs);
1850 1835
1851void sunrpc_cache_unregister_pipefs(struct cache_detail *cd) 1836void sunrpc_cache_unregister_pipefs(struct cache_detail *cd)
1852{ 1837{
1853 rpc_remove_cache_dir(cd->u.pipefs.dir); 1838 if (cd->pipefs) {
1854 cd->u.pipefs.dir = NULL; 1839 rpc_remove_cache_dir(cd->pipefs);
1840 cd->pipefs = NULL;
1841 }
1855} 1842}
1856EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs); 1843EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs);
1857 1844
1845void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h)
1846{
1847 write_lock(&cd->hash_lock);
1848 if (!hlist_unhashed(&h->cache_list)){
1849 hlist_del_init(&h->cache_list);
1850 cd->entries--;
1851 write_unlock(&cd->hash_lock);
1852 cache_put(h, cd);
1853 } else
1854 write_unlock(&cd->hash_lock);
1855}
1856EXPORT_SYMBOL_GPL(sunrpc_cache_unhash);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 62a482790937..52da3ce54bb5 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -336,6 +336,11 @@ out:
336 336
337static DEFINE_IDA(rpc_clids); 337static DEFINE_IDA(rpc_clids);
338 338
339void rpc_cleanup_clids(void)
340{
341 ida_destroy(&rpc_clids);
342}
343
339static int rpc_alloc_clid(struct rpc_clnt *clnt) 344static int rpc_alloc_clid(struct rpc_clnt *clnt)
340{ 345{
341 int clid; 346 int clid;
@@ -1448,21 +1453,6 @@ size_t rpc_max_bc_payload(struct rpc_clnt *clnt)
1448EXPORT_SYMBOL_GPL(rpc_max_bc_payload); 1453EXPORT_SYMBOL_GPL(rpc_max_bc_payload);
1449 1454
1450/** 1455/**
1451 * rpc_get_timeout - Get timeout for transport in units of HZ
1452 * @clnt: RPC client to query
1453 */
1454unsigned long rpc_get_timeout(struct rpc_clnt *clnt)
1455{
1456 unsigned long ret;
1457
1458 rcu_read_lock();
1459 ret = rcu_dereference(clnt->cl_xprt)->timeout->to_initval;
1460 rcu_read_unlock();
1461 return ret;
1462}
1463EXPORT_SYMBOL_GPL(rpc_get_timeout);
1464
1465/**
1466 * rpc_force_rebind - force transport to check that remote port is unchanged 1456 * rpc_force_rebind - force transport to check that remote port is unchanged
1467 * @clnt: client to rebind 1457 * @clnt: client to rebind
1468 * 1458 *
@@ -1926,6 +1916,8 @@ call_connect_status(struct rpc_task *task)
1926 case -EADDRINUSE: 1916 case -EADDRINUSE:
1927 case -ENOBUFS: 1917 case -ENOBUFS:
1928 case -EPIPE: 1918 case -EPIPE:
1919 xprt_conditional_disconnect(task->tk_rqstp->rq_xprt,
1920 task->tk_rqstp->rq_connect_cookie);
1929 if (RPC_IS_SOFTCONN(task)) 1921 if (RPC_IS_SOFTCONN(task))
1930 break; 1922 break;
1931 /* retry with existing socket, after a delay */ 1923 /* retry with existing socket, after a delay */
@@ -2692,6 +2684,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
2692{ 2684{
2693 struct rpc_xprt_switch *xps; 2685 struct rpc_xprt_switch *xps;
2694 struct rpc_xprt *xprt; 2686 struct rpc_xprt *xprt;
2687 unsigned long connect_timeout;
2695 unsigned long reconnect_timeout; 2688 unsigned long reconnect_timeout;
2696 unsigned char resvport; 2689 unsigned char resvport;
2697 int ret = 0; 2690 int ret = 0;
@@ -2704,6 +2697,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
2704 return -EAGAIN; 2697 return -EAGAIN;
2705 } 2698 }
2706 resvport = xprt->resvport; 2699 resvport = xprt->resvport;
2700 connect_timeout = xprt->connect_timeout;
2707 reconnect_timeout = xprt->max_reconnect_timeout; 2701 reconnect_timeout = xprt->max_reconnect_timeout;
2708 rcu_read_unlock(); 2702 rcu_read_unlock();
2709 2703
@@ -2713,7 +2707,10 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
2713 goto out_put_switch; 2707 goto out_put_switch;
2714 } 2708 }
2715 xprt->resvport = resvport; 2709 xprt->resvport = resvport;
2716 xprt->max_reconnect_timeout = reconnect_timeout; 2710 if (xprt->ops->set_connect_timeout != NULL)
2711 xprt->ops->set_connect_timeout(xprt,
2712 connect_timeout,
2713 reconnect_timeout);
2717 2714
2718 rpc_xprt_switch_set_roundrobin(xps); 2715 rpc_xprt_switch_set_roundrobin(xps);
2719 if (setup) { 2716 if (setup) {
@@ -2730,26 +2727,39 @@ out_put_switch:
2730} 2727}
2731EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt); 2728EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt);
2732 2729
2730struct connect_timeout_data {
2731 unsigned long connect_timeout;
2732 unsigned long reconnect_timeout;
2733};
2734
2733static int 2735static int
2734rpc_xprt_cap_max_reconnect_timeout(struct rpc_clnt *clnt, 2736rpc_xprt_set_connect_timeout(struct rpc_clnt *clnt,
2735 struct rpc_xprt *xprt, 2737 struct rpc_xprt *xprt,
2736 void *data) 2738 void *data)
2737{ 2739{
2738 unsigned long timeout = *((unsigned long *)data); 2740 struct connect_timeout_data *timeo = data;
2739 2741
2740 if (timeout < xprt->max_reconnect_timeout) 2742 if (xprt->ops->set_connect_timeout)
2741 xprt->max_reconnect_timeout = timeout; 2743 xprt->ops->set_connect_timeout(xprt,
2744 timeo->connect_timeout,
2745 timeo->reconnect_timeout);
2742 return 0; 2746 return 0;
2743} 2747}
2744 2748
2745void 2749void
2746rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo) 2750rpc_set_connect_timeout(struct rpc_clnt *clnt,
2751 unsigned long connect_timeout,
2752 unsigned long reconnect_timeout)
2747{ 2753{
2754 struct connect_timeout_data timeout = {
2755 .connect_timeout = connect_timeout,
2756 .reconnect_timeout = reconnect_timeout,
2757 };
2748 rpc_clnt_iterate_for_each_xprt(clnt, 2758 rpc_clnt_iterate_for_each_xprt(clnt,
2749 rpc_xprt_cap_max_reconnect_timeout, 2759 rpc_xprt_set_connect_timeout,
2750 &timeo); 2760 &timeout);
2751} 2761}
2752EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout); 2762EXPORT_SYMBOL_GPL(rpc_set_connect_timeout);
2753 2763
2754void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt) 2764void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt)
2755{ 2765{
diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
index e7b4d93566df..c8fd0b6c1618 100644
--- a/net/sunrpc/debugfs.c
+++ b/net/sunrpc/debugfs.c
@@ -16,11 +16,6 @@ static struct dentry *rpc_xprt_dir;
16 16
17unsigned int rpc_inject_disconnect; 17unsigned int rpc_inject_disconnect;
18 18
19struct rpc_clnt_iter {
20 struct rpc_clnt *clnt;
21 loff_t pos;
22};
23
24static int 19static int
25tasks_show(struct seq_file *f, void *v) 20tasks_show(struct seq_file *f, void *v)
26{ 21{
@@ -47,12 +42,10 @@ static void *
47tasks_start(struct seq_file *f, loff_t *ppos) 42tasks_start(struct seq_file *f, loff_t *ppos)
48 __acquires(&clnt->cl_lock) 43 __acquires(&clnt->cl_lock)
49{ 44{
50 struct rpc_clnt_iter *iter = f->private; 45 struct rpc_clnt *clnt = f->private;
51 loff_t pos = *ppos; 46 loff_t pos = *ppos;
52 struct rpc_clnt *clnt = iter->clnt;
53 struct rpc_task *task; 47 struct rpc_task *task;
54 48
55 iter->pos = pos + 1;
56 spin_lock(&clnt->cl_lock); 49 spin_lock(&clnt->cl_lock);
57 list_for_each_entry(task, &clnt->cl_tasks, tk_task) 50 list_for_each_entry(task, &clnt->cl_tasks, tk_task)
58 if (pos-- == 0) 51 if (pos-- == 0)
@@ -63,12 +56,10 @@ tasks_start(struct seq_file *f, loff_t *ppos)
63static void * 56static void *
64tasks_next(struct seq_file *f, void *v, loff_t *pos) 57tasks_next(struct seq_file *f, void *v, loff_t *pos)
65{ 58{
66 struct rpc_clnt_iter *iter = f->private; 59 struct rpc_clnt *clnt = f->private;
67 struct rpc_clnt *clnt = iter->clnt;
68 struct rpc_task *task = v; 60 struct rpc_task *task = v;
69 struct list_head *next = task->tk_task.next; 61 struct list_head *next = task->tk_task.next;
70 62
71 ++iter->pos;
72 ++*pos; 63 ++*pos;
73 64
74 /* If there's another task on list, return it */ 65 /* If there's another task on list, return it */
@@ -81,9 +72,7 @@ static void
81tasks_stop(struct seq_file *f, void *v) 72tasks_stop(struct seq_file *f, void *v)
82 __releases(&clnt->cl_lock) 73 __releases(&clnt->cl_lock)
83{ 74{
84 struct rpc_clnt_iter *iter = f->private; 75 struct rpc_clnt *clnt = f->private;
85 struct rpc_clnt *clnt = iter->clnt;
86
87 spin_unlock(&clnt->cl_lock); 76 spin_unlock(&clnt->cl_lock);
88} 77}
89 78
@@ -96,17 +85,13 @@ static const struct seq_operations tasks_seq_operations = {
96 85
97static int tasks_open(struct inode *inode, struct file *filp) 86static int tasks_open(struct inode *inode, struct file *filp)
98{ 87{
99 int ret = seq_open_private(filp, &tasks_seq_operations, 88 int ret = seq_open(filp, &tasks_seq_operations);
100 sizeof(struct rpc_clnt_iter));
101
102 if (!ret) { 89 if (!ret) {
103 struct seq_file *seq = filp->private_data; 90 struct seq_file *seq = filp->private_data;
104 struct rpc_clnt_iter *iter = seq->private; 91 struct rpc_clnt *clnt = seq->private = inode->i_private;
105
106 iter->clnt = inode->i_private;
107 92
108 if (!atomic_inc_not_zero(&iter->clnt->cl_count)) { 93 if (!atomic_inc_not_zero(&clnt->cl_count)) {
109 seq_release_private(inode, filp); 94 seq_release(inode, filp);
110 ret = -EINVAL; 95 ret = -EINVAL;
111 } 96 }
112 } 97 }
@@ -118,10 +103,10 @@ static int
118tasks_release(struct inode *inode, struct file *filp) 103tasks_release(struct inode *inode, struct file *filp)
119{ 104{
120 struct seq_file *seq = filp->private_data; 105 struct seq_file *seq = filp->private_data;
121 struct rpc_clnt_iter *iter = seq->private; 106 struct rpc_clnt *clnt = seq->private;
122 107
123 rpc_release_client(iter->clnt); 108 rpc_release_client(clnt);
124 return seq_release_private(inode, filp); 109 return seq_release(inode, filp);
125} 110}
126 111
127static const struct file_operations tasks_fops = { 112static const struct file_operations tasks_fops = {
diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h
index df5826876535..394ce523174c 100644
--- a/net/sunrpc/netns.h
+++ b/net/sunrpc/netns.h
@@ -34,7 +34,7 @@ struct sunrpc_net {
34 struct proc_dir_entry *use_gssp_proc; 34 struct proc_dir_entry *use_gssp_proc;
35}; 35};
36 36
37extern int sunrpc_net_id; 37extern unsigned int sunrpc_net_id;
38 38
39int ip_map_cache_create(struct net *); 39int ip_map_cache_create(struct net *);
40void ip_map_cache_destroy(struct net *); 40void ip_map_cache_destroy(struct net *);
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 2ecb994314c1..caeb01ad2b5a 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -157,15 +157,17 @@ void rpc_count_iostats_metrics(const struct rpc_task *task,
157 spin_lock(&op_metrics->om_lock); 157 spin_lock(&op_metrics->om_lock);
158 158
159 op_metrics->om_ops++; 159 op_metrics->om_ops++;
160 op_metrics->om_ntrans += req->rq_ntrans; 160 /* kernel API: om_ops must never become larger than om_ntrans */
161 op_metrics->om_ntrans += max(req->rq_ntrans, 1);
161 op_metrics->om_timeouts += task->tk_timeouts; 162 op_metrics->om_timeouts += task->tk_timeouts;
162 163
163 op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent; 164 op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent;
164 op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd; 165 op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd;
165 166
166 delta = ktime_sub(req->rq_xtime, task->tk_start); 167 if (ktime_to_ns(req->rq_xtime)) {
167 op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta); 168 delta = ktime_sub(req->rq_xtime, task->tk_start);
168 169 op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta);
170 }
169 op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt); 171 op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt);
170 172
171 delta = ktime_sub(now, task->tk_start); 173 delta = ktime_sub(now, task->tk_start);
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index ee5d3d253102..c73de181467a 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -24,7 +24,7 @@
24 24
25#include "netns.h" 25#include "netns.h"
26 26
27int sunrpc_net_id; 27unsigned int sunrpc_net_id;
28EXPORT_SYMBOL_GPL(sunrpc_net_id); 28EXPORT_SYMBOL_GPL(sunrpc_net_id);
29 29
30static __net_init int sunrpc_init_net(struct net *net) 30static __net_init int sunrpc_init_net(struct net *net)
@@ -119,6 +119,7 @@ out:
119static void __exit 119static void __exit
120cleanup_sunrpc(void) 120cleanup_sunrpc(void)
121{ 121{
122 rpc_cleanup_clids();
122 rpcauth_remove_module(); 123 rpcauth_remove_module();
123 cleanup_socket_xprt(); 124 cleanup_socket_xprt();
124 svc_cleanup_xprt_sock(); 125 svc_cleanup_xprt_sock();
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 7c8070ec93c8..a08aeb56b8e4 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -11,7 +11,7 @@
11 */ 11 */
12 12
13#include <linux/linkage.h> 13#include <linux/linkage.h>
14#include <linux/sched.h> 14#include <linux/sched/signal.h>
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/net.h> 16#include <linux/net.h>
17#include <linux/in.h> 17#include <linux/in.h>
@@ -385,7 +385,7 @@ static int svc_uses_rpcbind(struct svc_serv *serv)
385 for (i = 0; i < progp->pg_nvers; i++) { 385 for (i = 0; i < progp->pg_nvers; i++) {
386 if (progp->pg_vers[i] == NULL) 386 if (progp->pg_vers[i] == NULL)
387 continue; 387 continue;
388 if (progp->pg_vers[i]->vs_hidden == 0) 388 if (!progp->pg_vers[i]->vs_hidden)
389 return 1; 389 return 1;
390 } 390 }
391 } 391 }
@@ -976,6 +976,13 @@ int svc_register(const struct svc_serv *serv, struct net *net,
976 if (vers->vs_hidden) 976 if (vers->vs_hidden)
977 continue; 977 continue;
978 978
979 /*
980 * Don't register a UDP port if we need congestion
981 * control.
982 */
983 if (vers->vs_need_cong_ctrl && proto == IPPROTO_UDP)
984 continue;
985
979 error = __svc_register(net, progp->pg_name, progp->pg_prog, 986 error = __svc_register(net, progp->pg_name, progp->pg_prog,
980 i, family, proto, port); 987 i, family, proto, port);
981 988
@@ -1155,8 +1162,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
1155 case SVC_DENIED: 1162 case SVC_DENIED:
1156 goto err_bad_auth; 1163 goto err_bad_auth;
1157 case SVC_CLOSE: 1164 case SVC_CLOSE:
1158 if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags)) 1165 goto close;
1159 svc_close_xprt(rqstp->rq_xprt);
1160 case SVC_DROP: 1166 case SVC_DROP:
1161 goto dropit; 1167 goto dropit;
1162 case SVC_COMPLETE: 1168 case SVC_COMPLETE:
@@ -1170,6 +1176,21 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
1170 !(versp = progp->pg_vers[vers])) 1176 !(versp = progp->pg_vers[vers]))
1171 goto err_bad_vers; 1177 goto err_bad_vers;
1172 1178
1179 /*
1180 * Some protocol versions (namely NFSv4) require some form of
1181 * congestion control. (See RFC 7530 section 3.1 paragraph 2)
1182 * In other words, UDP is not allowed. We mark those when setting
1183 * up the svc_xprt, and verify that here.
1184 *
1185 * The spec is not very clear about what error should be returned
1186 * when someone tries to access a server that is listening on UDP
1187 * for lower versions. RPC_PROG_MISMATCH seems to be the closest
1188 * fit.
1189 */
1190 if (versp->vs_need_cong_ctrl &&
1191 !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags))
1192 goto err_bad_vers;
1193
1173 procp = versp->vs_proc + proc; 1194 procp = versp->vs_proc + proc;
1174 if (proc >= versp->vs_nproc || !procp->pc_func) 1195 if (proc >= versp->vs_nproc || !procp->pc_func)
1175 goto err_bad_proc; 1196 goto err_bad_proc;
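
The checks above let an RPC program version demand a congestion-controlled transport. A hedged sketch of how a version descriptor might opt in; apart from vs_need_cong_ctrl (added in this series) the svc_version fields are quoted from memory and the procedure table is an empty placeholder:

/* Hedged sketch, not from this patch: a version descriptor that refuses
 * requests arriving over UDP via vs_need_cong_ctrl.
 */
#include <linux/kernel.h>
#include <linux/sunrpc/svc.h>

static struct svc_procedure demo_procs[1];	/* placeholder procedures */

static const struct svc_version demo_vers4 = {
	.vs_vers		= 4,
	.vs_nproc		= ARRAY_SIZE(demo_procs),
	.vs_proc		= demo_procs,
	.vs_xdrsize		= 1024,
	.vs_need_cong_ctrl	= true,	/* see the svc_process_common check above */
};
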
@@ -1246,7 +1267,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
1246 1267
1247 sendit: 1268 sendit:
1248 if (svc_authorise(rqstp)) 1269 if (svc_authorise(rqstp))
1249 goto dropit; 1270 goto close;
1250 return 1; /* Caller can now send it */ 1271 return 1; /* Caller can now send it */
1251 1272
1252 dropit: 1273 dropit:
@@ -1254,11 +1275,16 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
1254 dprintk("svc: svc_process dropit\n"); 1275 dprintk("svc: svc_process dropit\n");
1255 return 0; 1276 return 0;
1256 1277
1278 close:
1279 if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
1280 svc_close_xprt(rqstp->rq_xprt);
1281 dprintk("svc: svc_process close\n");
1282 return 0;
1283
1257err_short_len: 1284err_short_len:
1258 svc_printk(rqstp, "short len %Zd, dropping request\n", 1285 svc_printk(rqstp, "short len %zd, dropping request\n",
1259 argv->iov_len); 1286 argv->iov_len);
1260 1287 goto close;
1261 goto dropit; /* drop request */
1262 1288
1263err_bad_rpc: 1289err_bad_rpc:
1264 serv->sv_stats->rpcbadfmt++; 1290 serv->sv_stats->rpcbadfmt++;
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 3bc1d61694cb..7bfe1fb42add 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -490,7 +490,7 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
490 svc_xprt_get(xprt); 490 svc_xprt_get(xprt);
491 491
492 dprintk("svc: transport %p dequeued, inuse=%d\n", 492 dprintk("svc: transport %p dequeued, inuse=%d\n",
493 xprt, atomic_read(&xprt->xpt_ref.refcount)); 493 xprt, kref_read(&xprt->xpt_ref));
494 } 494 }
495 spin_unlock_bh(&pool->sp_lock); 495 spin_unlock_bh(&pool->sp_lock);
496out: 496out:
@@ -799,6 +799,8 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
799 799
800 if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) { 800 if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) {
801 dprintk("svc_recv: found XPT_CLOSE\n"); 801 dprintk("svc_recv: found XPT_CLOSE\n");
802 if (test_and_clear_bit(XPT_KILL_TEMP, &xprt->xpt_flags))
803 xprt->xpt_ops->xpo_kill_temp_xprt(xprt);
802 svc_delete_xprt(xprt); 804 svc_delete_xprt(xprt);
803 /* Leave XPT_BUSY set on the dead xprt: */ 805 /* Leave XPT_BUSY set on the dead xprt: */
804 goto out; 806 goto out;
@@ -820,7 +822,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
820 /* XPT_DATA|XPT_DEFERRED case: */ 822 /* XPT_DATA|XPT_DEFERRED case: */
821 dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", 823 dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
822 rqstp, rqstp->rq_pool->sp_id, xprt, 824 rqstp, rqstp->rq_pool->sp_id, xprt,
823 atomic_read(&xprt->xpt_ref.refcount)); 825 kref_read(&xprt->xpt_ref));
824 rqstp->rq_deferred = svc_deferred_dequeue(xprt); 826 rqstp->rq_deferred = svc_deferred_dequeue(xprt);
825 if (rqstp->rq_deferred) 827 if (rqstp->rq_deferred)
826 len = svc_deferred_recv(rqstp); 828 len = svc_deferred_recv(rqstp);
@@ -978,7 +980,7 @@ static void svc_age_temp_xprts(unsigned long closure)
978 * through, close it. */ 980 * through, close it. */
979 if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags)) 981 if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags))
980 continue; 982 continue;
981 if (atomic_read(&xprt->xpt_ref.refcount) > 1 || 983 if (kref_read(&xprt->xpt_ref) > 1 ||
982 test_bit(XPT_BUSY, &xprt->xpt_flags)) 984 test_bit(XPT_BUSY, &xprt->xpt_flags))
983 continue; 985 continue;
984 list_del_init(le); 986 list_del_init(le);
@@ -1020,9 +1022,11 @@ void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr)
1020 le = to_be_closed.next; 1022 le = to_be_closed.next;
1021 list_del_init(le); 1023 list_del_init(le);
1022 xprt = list_entry(le, struct svc_xprt, xpt_list); 1024 xprt = list_entry(le, struct svc_xprt, xpt_list);
1023 dprintk("svc_age_temp_xprts_now: closing %p\n", xprt); 1025 set_bit(XPT_CLOSE, &xprt->xpt_flags);
1024 xprt->xpt_ops->xpo_kill_temp_xprt(xprt); 1026 set_bit(XPT_KILL_TEMP, &xprt->xpt_flags);
1025 svc_close_xprt(xprt); 1027 dprintk("svc_age_temp_xprts_now: queuing xprt %p for closing\n",
1028 xprt);
1029 svc_xprt_enqueue(xprt);
1026 } 1030 }
1027} 1031}
1028EXPORT_SYMBOL_GPL(svc_age_temp_xprts_now); 1032EXPORT_SYMBOL_GPL(svc_age_temp_xprts_now);
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index 69841db1f533..bb8db3cb8032 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -124,16 +124,20 @@ EXPORT_SYMBOL_GPL(svc_auth_unregister);
124#define DN_HASHMAX (1<<DN_HASHBITS) 124#define DN_HASHMAX (1<<DN_HASHBITS)
125 125
126static struct hlist_head auth_domain_table[DN_HASHMAX]; 126static struct hlist_head auth_domain_table[DN_HASHMAX];
127static spinlock_t auth_domain_lock = 127static DEFINE_SPINLOCK(auth_domain_lock);
128 __SPIN_LOCK_UNLOCKED(auth_domain_lock); 128
129static void auth_domain_release(struct kref *kref)
130{
131 struct auth_domain *dom = container_of(kref, struct auth_domain, ref);
132
133 hlist_del(&dom->hash);
134 dom->flavour->domain_release(dom);
135 spin_unlock(&auth_domain_lock);
136}
129 137
130void auth_domain_put(struct auth_domain *dom) 138void auth_domain_put(struct auth_domain *dom)
131{ 139{
132 if (atomic_dec_and_lock(&dom->ref.refcount, &auth_domain_lock)) { 140 kref_put_lock(&dom->ref, auth_domain_release, &auth_domain_lock);
133 hlist_del(&dom->hash);
134 dom->flavour->domain_release(dom);
135 spin_unlock(&auth_domain_lock);
136 }
137} 141}
138EXPORT_SYMBOL_GPL(auth_domain_put); 142EXPORT_SYMBOL_GPL(auth_domain_put);
139 143
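The auth_domain_put() rewrite above moves the "drop the last reference while holding the lock" logic into a kref_put_lock() release callback, which is invoked with auth_domain_lock held and is expected to unlock it. A hedged sketch of the same idiom with hypothetical names:

#include <linux/kref.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

static DEFINE_SPINLOCK(obj_lock);

struct my_obj {
	struct kref ref;
};

/* Runs only when the count hits zero; obj_lock is already held. */
static void my_obj_release(struct kref *kref)
{
	struct my_obj *obj = container_of(kref, struct my_obj, ref);

	/* ... unlink obj from any shared structure here ... */
	spin_unlock(&obj_lock);	/* the release callback must drop the lock */
	kfree(obj);		/* safe after unlock: the count is already zero */
}

static void my_obj_put(struct my_obj *obj)
{
	/* Takes obj_lock only if this put drops the final reference. */
	kref_put_lock(&obj->ref, my_obj_release, &obj_lock);
}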
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 64af4f034de6..f81eaa8e0888 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -403,7 +403,7 @@ svcauth_unix_info_release(struct svc_xprt *xpt)
403/**************************************************************************** 403/****************************************************************************
404 * auth.unix.gid cache 404 * auth.unix.gid cache
405 * simple cache to map a UID to a list of GIDs 405 * simple cache to map a UID to a list of GIDs
406 * because AUTH_UNIX aka AUTH_SYS has a max of 16 406 * because AUTH_UNIX aka AUTH_SYS has a max of UNX_NGROUPS
407 */ 407 */
408#define GID_HASHBITS 8 408#define GID_HASHBITS 8
409#define GID_HASHMAX (1<<GID_HASHBITS) 409#define GID_HASHMAX (1<<GID_HASHBITS)
@@ -810,7 +810,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
810 cred->cr_uid = make_kuid(&init_user_ns, svc_getnl(argv)); /* uid */ 810 cred->cr_uid = make_kuid(&init_user_ns, svc_getnl(argv)); /* uid */
811 cred->cr_gid = make_kgid(&init_user_ns, svc_getnl(argv)); /* gid */ 811 cred->cr_gid = make_kgid(&init_user_ns, svc_getnl(argv)); /* gid */
812 slen = svc_getnl(argv); /* gids length */ 812 slen = svc_getnl(argv); /* gids length */
813 if (slen > 16 || (len -= (slen + 2)*4) < 0) 813 if (slen > UNX_NGROUPS || (len -= (slen + 2)*4) < 0)
814 goto badcred; 814 goto badcred;
815 cred->cr_group_info = groups_alloc(slen); 815 cred->cr_group_info = groups_alloc(slen);
816 if (cred->cr_group_info == NULL) 816 if (cred->cr_group_info == NULL)
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index a4bc98265d88..2b720fa35c4f 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -39,9 +39,10 @@
39#include <net/checksum.h> 39#include <net/checksum.h>
40#include <net/ip.h> 40#include <net/ip.h>
41#include <net/ipv6.h> 41#include <net/ipv6.h>
42#include <net/udp.h>
42#include <net/tcp.h> 43#include <net/tcp.h>
43#include <net/tcp_states.h> 44#include <net/tcp_states.h>
44#include <asm/uaccess.h> 45#include <linux/uaccess.h>
45#include <asm/ioctls.h> 46#include <asm/ioctls.h>
46#include <trace/events/skb.h> 47#include <trace/events/skb.h>
47 48
@@ -129,6 +130,18 @@ static void svc_release_skb(struct svc_rqst *rqstp)
129 } 130 }
130} 131}
131 132
133static void svc_release_udp_skb(struct svc_rqst *rqstp)
134{
135 struct sk_buff *skb = rqstp->rq_xprt_ctxt;
136
137 if (skb) {
138 rqstp->rq_xprt_ctxt = NULL;
139
140 dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
141 consume_skb(skb);
142 }
143}
144
132union svc_pktinfo_u { 145union svc_pktinfo_u {
133 struct in_pktinfo pkti; 146 struct in_pktinfo pkti;
134 struct in6_pktinfo pkti6; 147 struct in6_pktinfo pkti6;
@@ -265,7 +278,7 @@ static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
265 rqstp->rq_respages[0], tailoff); 278 rqstp->rq_respages[0], tailoff);
266 279
267out: 280out:
268 dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n", 281 dprintk("svc: socket %p sendto([%p %zu... ], %d) = %d (addr %s)\n",
269 svsk, xdr->head[0].iov_base, xdr->head[0].iov_len, 282 svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
270 xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); 283 xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
271 284
@@ -333,7 +346,7 @@ static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
333 if (len == buflen) 346 if (len == buflen)
334 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); 347 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
335 348
336 dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", 349 dprintk("svc: socket %p recvfrom(%p, %zu) = %d\n",
337 svsk, iov[0].iov_base, iov[0].iov_len, len); 350 svsk, iov[0].iov_base, iov[0].iov_len, len);
338 return len; 351 return len;
339} 352}
@@ -549,7 +562,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
549 err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, 562 err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
550 0, 0, MSG_PEEK | MSG_DONTWAIT); 563 0, 0, MSG_PEEK | MSG_DONTWAIT);
551 if (err >= 0) 564 if (err >= 0)
552 skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err); 565 skb = skb_recv_udp(svsk->sk_sk, 0, 1, &err);
553 566
554 if (skb == NULL) { 567 if (skb == NULL) {
555 if (err != -EAGAIN) { 568 if (err != -EAGAIN) {
@@ -561,7 +574,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
561 } 574 }
562 len = svc_addr_len(svc_addr(rqstp)); 575 len = svc_addr_len(svc_addr(rqstp));
563 rqstp->rq_addrlen = len; 576 rqstp->rq_addrlen = len;
564 if (skb->tstamp.tv64 == 0) { 577 if (skb->tstamp == 0) {
565 skb->tstamp = ktime_get_real(); 578 skb->tstamp = ktime_get_real();
566 /* Don't enable netstamp, sunrpc doesn't 579 /* Don't enable netstamp, sunrpc doesn't
567 need that much accuracy */ 580 need that much accuracy */
@@ -590,7 +603,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
590 goto out_free; 603 goto out_free;
591 } 604 }
592 local_bh_enable(); 605 local_bh_enable();
593 skb_free_datagram_locked(svsk->sk_sk, skb); 606 consume_skb(skb);
594 } else { 607 } else {
595 /* we can use it in-place */ 608 /* we can use it in-place */
596 rqstp->rq_arg.head[0].iov_base = skb->data; 609 rqstp->rq_arg.head[0].iov_base = skb->data;
@@ -617,8 +630,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
617 630
618 return len; 631 return len;
619out_free: 632out_free:
620 trace_kfree_skb(skb, svc_udp_recvfrom); 633 kfree_skb(skb);
621 skb_free_datagram_locked(svsk->sk_sk, skb);
622 return 0; 634 return 0;
623} 635}
624 636
@@ -679,7 +691,7 @@ static struct svc_xprt_ops svc_udp_ops = {
679 .xpo_create = svc_udp_create, 691 .xpo_create = svc_udp_create,
680 .xpo_recvfrom = svc_udp_recvfrom, 692 .xpo_recvfrom = svc_udp_recvfrom,
681 .xpo_sendto = svc_udp_sendto, 693 .xpo_sendto = svc_udp_sendto,
682 .xpo_release_rqst = svc_release_skb, 694 .xpo_release_rqst = svc_release_udp_skb,
683 .xpo_detach = svc_sock_detach, 695 .xpo_detach = svc_sock_detach,
684 .xpo_free = svc_sock_free, 696 .xpo_free = svc_sock_free,
685 .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr, 697 .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
@@ -1294,6 +1306,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
1294 svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class, 1306 svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class,
1295 &svsk->sk_xprt, serv); 1307 &svsk->sk_xprt, serv);
1296 set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); 1308 set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
1309 set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
1297 if (sk->sk_state == TCP_LISTEN) { 1310 if (sk->sk_state == TCP_LISTEN) {
1298 dprintk("setting up TCP socket for listening\n"); 1311 dprintk("setting up TCP socket for listening\n");
1299 set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); 1312 set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
@@ -1622,6 +1635,7 @@ static struct svc_xprt *svc_bc_create_socket(struct svc_serv *serv,
1622 1635
1623 xprt = &svsk->sk_xprt; 1636 xprt = &svsk->sk_xprt;
1624 svc_xprt_init(net, &svc_tcp_bc_class, xprt, serv); 1637 svc_xprt_init(net, &svc_tcp_bc_class, xprt, serv);
1638 set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
1625 1639
1626 serv->sv_bc_xprt = xprt; 1640 serv->sv_bc_xprt = xprt;
1627 1641
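The svcsock.c changes switch UDP receive to skb_recv_udp(), compare skb->tstamp directly (dropping the old .tv64 accessor), and distinguish consume_skb() on normal completion from kfree_skb() on the error path; the new svc_release_udp_skb() callback frees a datagram whose data a request had borrowed in place. A small sketch of that release shape, assuming only <linux/skbuff.h>; the function name is hypothetical:

#include <linux/skbuff.h>

/* Illustrative sketch: hand a cached skb back once the request that
 * borrowed its data is finished with it. consume_skb() signals a
 * normal free; kfree_skb() would be counted as a drop.
 */
static void release_cached_skb(struct sk_buff **cached)
{
	struct sk_buff *skb = *cached;

	if (skb) {
		*cached = NULL;
		consume_skb(skb);
	}
}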
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index c88d9bc06f5c..8c3936403fea 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -14,7 +14,7 @@
14#include <linux/sysctl.h> 14#include <linux/sysctl.h>
15#include <linux/module.h> 15#include <linux/module.h>
16 16
17#include <asm/uaccess.h> 17#include <linux/uaccess.h>
18#include <linux/sunrpc/types.h> 18#include <linux/sunrpc/types.h>
19#include <linux/sunrpc/sched.h> 19#include <linux/sunrpc/sched.h>
20#include <linux/sunrpc/stats.h> 20#include <linux/sunrpc/stats.h>
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 7f1071e103ca..1f7082144e01 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1518,3 +1518,37 @@ out:
1518} 1518}
1519EXPORT_SYMBOL_GPL(xdr_process_buf); 1519EXPORT_SYMBOL_GPL(xdr_process_buf);
1520 1520
1521/**
1522 * xdr_stream_decode_string_dup - Decode and duplicate variable length string
1523 * @xdr: pointer to xdr_stream
1524 * @str: location to store pointer to string
1525 * @maxlen: maximum acceptable string length
1526 * @gfp_flags: GFP mask to use
1527 *
1528 * Return values:
 1529 *   On success, returns length of NUL-terminated string stored in *@str
1530 * %-EBADMSG on XDR buffer overflow
1531 * %-EMSGSIZE if the size of the string would exceed @maxlen
1532 * %-ENOMEM on memory allocation failure
1533 */
1534ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str,
1535 size_t maxlen, gfp_t gfp_flags)
1536{
1537 void *p;
1538 ssize_t ret;
1539
1540 ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen);
1541 if (ret > 0) {
1542 char *s = kmalloc(ret + 1, gfp_flags);
1543 if (s != NULL) {
1544 memcpy(s, p, ret);
1545 s[ret] = '\0';
1546 *str = s;
1547 return strlen(s);
1548 }
1549 ret = -ENOMEM;
1550 }
1551 *str = NULL;
1552 return ret;
1553}
1554EXPORT_SYMBOL_GPL(xdr_stream_decode_string_dup);
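A hedged usage sketch for the helper added above; the caller name and the 255-byte limit are made up here, and the prototype is assumed to be visible through the sunrpc XDR header. On success the duplicated string must eventually be kfree()d by the caller:

#include <linux/sunrpc/xdr.h>	/* struct xdr_stream; prototype assumed visible here */
#include <linux/gfp.h>

/* Sketch: pull a bounded, NUL-terminated copy of an XDR string. */
static int decode_hostname(struct xdr_stream *xdr, char **hostname)
{
	ssize_t len;

	len = xdr_stream_decode_string_dup(xdr, hostname, 255, GFP_KERNEL);
	if (len < 0)
		return len;	/* -EBADMSG, -EMSGSIZE or -ENOMEM */

	/* *hostname is now a kmalloc'd, NUL-terminated copy. */
	return 0;
}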
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 685e6d225414..b530a2852ba8 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -669,7 +669,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
669 spin_lock_bh(&xprt->transport_lock); 669 spin_lock_bh(&xprt->transport_lock);
670 if (cookie != xprt->connect_cookie) 670 if (cookie != xprt->connect_cookie)
671 goto out; 671 goto out;
672 if (test_bit(XPRT_CLOSING, &xprt->state) || !xprt_connected(xprt)) 672 if (test_bit(XPRT_CLOSING, &xprt->state))
673 goto out; 673 goto out;
674 set_bit(XPRT_CLOSE_WAIT, &xprt->state); 674 set_bit(XPRT_CLOSE_WAIT, &xprt->state);
675 /* Try to schedule an autoclose RPC call */ 675 /* Try to schedule an autoclose RPC call */
@@ -772,6 +772,7 @@ void xprt_connect(struct rpc_task *task)
772 if (!xprt_connected(xprt)) { 772 if (!xprt_connected(xprt)) {
773 task->tk_rqstp->rq_bytes_sent = 0; 773 task->tk_rqstp->rq_bytes_sent = 0;
774 task->tk_timeout = task->tk_rqstp->rq_timeout; 774 task->tk_timeout = task->tk_rqstp->rq_timeout;
775 task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie;
775 rpc_sleep_on(&xprt->pending, task, xprt_connect_status); 776 rpc_sleep_on(&xprt->pending, task, xprt_connect_status);
776 777
777 if (test_bit(XPRT_CLOSING, &xprt->state)) 778 if (test_bit(XPRT_CLOSING, &xprt->state))
@@ -896,13 +897,11 @@ static void xprt_timer(struct rpc_task *task)
896 return; 897 return;
897 dprintk("RPC: %5u xprt_timer\n", task->tk_pid); 898 dprintk("RPC: %5u xprt_timer\n", task->tk_pid);
898 899
899 spin_lock_bh(&xprt->transport_lock);
900 if (!req->rq_reply_bytes_recvd) { 900 if (!req->rq_reply_bytes_recvd) {
901 if (xprt->ops->timer) 901 if (xprt->ops->timer)
902 xprt->ops->timer(xprt, task); 902 xprt->ops->timer(xprt, task);
903 } else 903 } else
904 task->tk_status = 0; 904 task->tk_status = 0;
905 spin_unlock_bh(&xprt->transport_lock);
906} 905}
907 906
908/** 907/**
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 2c472e1b4827..24fedd4b117e 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -55,7 +55,8 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
55 if (IS_ERR(rb)) 55 if (IS_ERR(rb))
56 goto out_fail; 56 goto out_fail;
57 req->rl_sendbuf = rb; 57 req->rl_sendbuf = rb;
58 xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base, size); 58 xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base,
59 min_t(size_t, size, PAGE_SIZE));
59 rpcrdma_set_xprtdata(rqst, req); 60 rpcrdma_set_xprtdata(rqst, req);
60 return 0; 61 return 0;
61 62
@@ -191,6 +192,7 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
191 size_t maxmsg; 192 size_t maxmsg;
192 193
193 maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize); 194 maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize);
195 maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE);
194 return maxmsg - RPCRDMA_HDRLEN_MIN; 196 return maxmsg - RPCRDMA_HDRLEN_MIN;
195} 197}
196 198
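Both backchannel hunks cap sizes at PAGE_SIZE, consistent with the backchannel send buffer being a single page. A sketch of the resulting maximum-payload calculation under that assumption; the helper name is hypothetical:

#include <linux/kernel.h>
#include <linux/mm.h>

/* Sketch only: the largest backchannel payload is the smaller of the
 * two inline thresholds, clamped to one page, minus the fixed
 * RPC-over-RDMA header. Assumes the clamped value exceeds hdrlen_min.
 */
static size_t bc_maxpayload(unsigned int inline_rsize,
			    unsigned int inline_wsize,
			    unsigned int hdrlen_min)
{
	size_t maxmsg = min_t(size_t, inline_rsize, inline_wsize);

	maxmsg = min_t(size_t, maxmsg, PAGE_SIZE);
	return maxmsg - hdrlen_min;
}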
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 1ebb09e1ac4f..59e64025ed96 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -310,10 +310,7 @@ fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
310 struct rpcrdma_mw *mw; 310 struct rpcrdma_mw *mw;
311 311
312 while (!list_empty(&req->rl_registered)) { 312 while (!list_empty(&req->rl_registered)) {
313 mw = list_first_entry(&req->rl_registered, 313 mw = rpcrdma_pop_mw(&req->rl_registered);
314 struct rpcrdma_mw, mw_list);
315 list_del_init(&mw->mw_list);
316
317 if (sync) 314 if (sync)
318 fmr_op_recover_mr(mw); 315 fmr_op_recover_mr(mw);
319 else 316 else
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 26b26beef2d4..f81dd93176c0 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -101,7 +101,7 @@ frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
101 struct rpcrdma_frmr *f = &r->frmr; 101 struct rpcrdma_frmr *f = &r->frmr;
102 int rc; 102 int rc;
103 103
104 f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth); 104 f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
105 if (IS_ERR(f->fr_mr)) 105 if (IS_ERR(f->fr_mr))
106 goto out_mr_err; 106 goto out_mr_err;
107 107
@@ -157,7 +157,7 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
157 return rc; 157 return rc;
158 } 158 }
159 159
160 f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, 160 f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype,
161 ia->ri_max_frmr_depth); 161 ia->ri_max_frmr_depth);
162 if (IS_ERR(f->fr_mr)) { 162 if (IS_ERR(f->fr_mr)) {
163 pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n", 163 pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n",
@@ -171,10 +171,6 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
171} 171}
172 172
173/* Reset of a single FRMR. Generate a fresh rkey by replacing the MR. 173/* Reset of a single FRMR. Generate a fresh rkey by replacing the MR.
174 *
175 * There's no recovery if this fails. The FRMR is abandoned, but
176 * remains in rb_all. It will be cleaned up when the transport is
177 * destroyed.
178 */ 174 */
179static void 175static void
180frwr_op_recover_mr(struct rpcrdma_mw *mw) 176frwr_op_recover_mr(struct rpcrdma_mw *mw)
@@ -210,11 +206,16 @@ static int
210frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, 206frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
211 struct rpcrdma_create_data_internal *cdata) 207 struct rpcrdma_create_data_internal *cdata)
212{ 208{
209 struct ib_device_attr *attrs = &ia->ri_device->attrs;
213 int depth, delta; 210 int depth, delta;
214 211
212 ia->ri_mrtype = IB_MR_TYPE_MEM_REG;
213 if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG)
214 ia->ri_mrtype = IB_MR_TYPE_SG_GAPS;
215
215 ia->ri_max_frmr_depth = 216 ia->ri_max_frmr_depth =
216 min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, 217 min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
217 ia->ri_device->attrs.max_fast_reg_page_list_len); 218 attrs->max_fast_reg_page_list_len);
218 dprintk("RPC: %s: device's max FR page list len = %u\n", 219 dprintk("RPC: %s: device's max FR page list len = %u\n",
219 __func__, ia->ri_max_frmr_depth); 220 __func__, ia->ri_max_frmr_depth);
220 221
@@ -241,8 +242,8 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
241 } 242 }
242 243
243 ep->rep_attr.cap.max_send_wr *= depth; 244 ep->rep_attr.cap.max_send_wr *= depth;
244 if (ep->rep_attr.cap.max_send_wr > ia->ri_device->attrs.max_qp_wr) { 245 if (ep->rep_attr.cap.max_send_wr > attrs->max_qp_wr) {
245 cdata->max_requests = ia->ri_device->attrs.max_qp_wr / depth; 246 cdata->max_requests = attrs->max_qp_wr / depth;
246 if (!cdata->max_requests) 247 if (!cdata->max_requests)
247 return -EINVAL; 248 return -EINVAL;
248 ep->rep_attr.cap.max_send_wr = cdata->max_requests * 249 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
@@ -348,6 +349,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
348 int nsegs, bool writing, struct rpcrdma_mw **out) 349 int nsegs, bool writing, struct rpcrdma_mw **out)
349{ 350{
350 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 351 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
352 bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS;
351 struct rpcrdma_mw *mw; 353 struct rpcrdma_mw *mw;
352 struct rpcrdma_frmr *frmr; 354 struct rpcrdma_frmr *frmr;
353 struct ib_mr *mr; 355 struct ib_mr *mr;
@@ -383,8 +385,8 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
383 385
384 ++seg; 386 ++seg;
385 ++i; 387 ++i;
386 388 if (holes_ok)
387 /* Check for holes */ 389 continue;
388 if ((i < nsegs && offset_in_page(seg->mr_offset)) || 390 if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
389 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) 391 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
390 break; 392 break;
@@ -421,7 +423,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
421 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : 423 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
422 IB_ACCESS_REMOTE_READ; 424 IB_ACCESS_REMOTE_READ;
423 425
424 DECR_CQCOUNT(&r_xprt->rx_ep); 426 rpcrdma_set_signaled(&r_xprt->rx_ep, &reg_wr->wr);
425 rc = ib_post_send(ia->ri_id->qp, &reg_wr->wr, &bad_wr); 427 rc = ib_post_send(ia->ri_id->qp, &reg_wr->wr, &bad_wr);
426 if (rc) 428 if (rc)
427 goto out_senderr; 429 goto out_senderr;
@@ -451,26 +453,6 @@ out_senderr:
451 return -ENOTCONN; 453 return -ENOTCONN;
452} 454}
453 455
454static struct ib_send_wr *
455__frwr_prepare_linv_wr(struct rpcrdma_mw *mw)
456{
457 struct rpcrdma_frmr *f = &mw->frmr;
458 struct ib_send_wr *invalidate_wr;
459
460 dprintk("RPC: %s: invalidating frmr %p\n", __func__, f);
461
462 f->fr_state = FRMR_IS_INVALID;
463 invalidate_wr = &f->fr_invwr;
464
465 memset(invalidate_wr, 0, sizeof(*invalidate_wr));
466 f->fr_cqe.done = frwr_wc_localinv;
467 invalidate_wr->wr_cqe = &f->fr_cqe;
468 invalidate_wr->opcode = IB_WR_LOCAL_INV;
469 invalidate_wr->ex.invalidate_rkey = f->fr_mr->rkey;
470
471 return invalidate_wr;
472}
473
474/* Invalidate all memory regions that were registered for "req". 456/* Invalidate all memory regions that were registered for "req".
475 * 457 *
476 * Sleeps until it is safe for the host CPU to access the 458 * Sleeps until it is safe for the host CPU to access the
@@ -481,12 +463,12 @@ __frwr_prepare_linv_wr(struct rpcrdma_mw *mw)
481static void 463static void
482frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) 464frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
483{ 465{
484 struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr; 466 struct ib_send_wr *first, **prev, *last, *bad_wr;
485 struct rpcrdma_rep *rep = req->rl_reply; 467 struct rpcrdma_rep *rep = req->rl_reply;
486 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 468 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
487 struct rpcrdma_mw *mw, *tmp;
488 struct rpcrdma_frmr *f; 469 struct rpcrdma_frmr *f;
489 int rc; 470 struct rpcrdma_mw *mw;
471 int count, rc;
490 472
491 dprintk("RPC: %s: req %p\n", __func__, req); 473 dprintk("RPC: %s: req %p\n", __func__, req);
492 474
@@ -496,22 +478,29 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
496 * a single ib_post_send() call. 478 * a single ib_post_send() call.
497 */ 479 */
498 f = NULL; 480 f = NULL;
499 invalidate_wrs = pos = prev = NULL; 481 count = 0;
482 prev = &first;
500 list_for_each_entry(mw, &req->rl_registered, mw_list) { 483 list_for_each_entry(mw, &req->rl_registered, mw_list) {
484 mw->frmr.fr_state = FRMR_IS_INVALID;
485
501 if ((rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) && 486 if ((rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) &&
502 (mw->mw_handle == rep->rr_inv_rkey)) { 487 (mw->mw_handle == rep->rr_inv_rkey))
503 mw->frmr.fr_state = FRMR_IS_INVALID;
504 continue; 488 continue;
505 }
506
507 pos = __frwr_prepare_linv_wr(mw);
508 489
509 if (!invalidate_wrs)
510 invalidate_wrs = pos;
511 else
512 prev->next = pos;
513 prev = pos;
514 f = &mw->frmr; 490 f = &mw->frmr;
491 dprintk("RPC: %s: invalidating frmr %p\n",
492 __func__, f);
493
494 f->fr_cqe.done = frwr_wc_localinv;
495 last = &f->fr_invwr;
496 memset(last, 0, sizeof(*last));
497 last->wr_cqe = &f->fr_cqe;
498 last->opcode = IB_WR_LOCAL_INV;
499 last->ex.invalidate_rkey = mw->mw_handle;
500 count++;
501
502 *prev = last;
503 prev = &last->next;
515 } 504 }
516 if (!f) 505 if (!f)
517 goto unmap; 506 goto unmap;
@@ -520,17 +509,22 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
520 * last WR in the chain completes, all WRs in the chain 509 * last WR in the chain completes, all WRs in the chain
521 * are complete. 510 * are complete.
522 */ 511 */
523 f->fr_invwr.send_flags = IB_SEND_SIGNALED; 512 last->send_flags = IB_SEND_SIGNALED;
524 f->fr_cqe.done = frwr_wc_localinv_wake; 513 f->fr_cqe.done = frwr_wc_localinv_wake;
525 reinit_completion(&f->fr_linv_done); 514 reinit_completion(&f->fr_linv_done);
526 INIT_CQCOUNT(&r_xprt->rx_ep); 515
516 /* Initialize CQ count, since there is always a signaled
517 * WR being posted here. The new cqcount depends on how
518 * many SQEs are about to be consumed.
519 */
520 rpcrdma_init_cqcount(&r_xprt->rx_ep, count);
527 521
528 /* Transport disconnect drains the receive CQ before it 522 /* Transport disconnect drains the receive CQ before it
529 * replaces the QP. The RPC reply handler won't call us 523 * replaces the QP. The RPC reply handler won't call us
530 * unless ri_id->qp is a valid pointer. 524 * unless ri_id->qp is a valid pointer.
531 */ 525 */
532 r_xprt->rx_stats.local_inv_needed++; 526 r_xprt->rx_stats.local_inv_needed++;
533 rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr); 527 rc = ib_post_send(ia->ri_id->qp, first, &bad_wr);
534 if (rc) 528 if (rc)
535 goto reset_mrs; 529 goto reset_mrs;
536 530
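frwr_op_unmap_sync() now links the LOCAL_INV work requests with a pointer-to-pointer tail (*prev = last; prev = &last->next), marks only the final WR as signaled, and posts the whole chain with a single ib_post_send(). The linking idiom itself is generic; a self-contained sketch with a made-up node type:

#include <stddef.h>

struct wr {
	struct wr *next;
	int signaled;
};

/* Chain n work requests so they can be posted with one call, and
 * flag only the last one for a completion signal.
 */
static struct wr *chain_wrs(struct wr *pool, size_t n)
{
	struct wr *first = NULL, *last = NULL;
	struct wr **prev = &first;
	size_t i;

	for (i = 0; i < n; i++) {
		last = &pool[i];
		last->next = NULL;
		last->signaled = 0;
		*prev = last;		/* append without a head-vs-tail special case */
		prev = &last->next;
	}
	if (last)
		last->signaled = 1;	/* only the final WR generates a completion */
	return first;
}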
@@ -540,10 +534,10 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
540 * them to the free MW list. 534 * them to the free MW list.
541 */ 535 */
542unmap: 536unmap:
543 list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { 537 while (!list_empty(&req->rl_registered)) {
544 dprintk("RPC: %s: unmapping frmr %p\n", 538 mw = rpcrdma_pop_mw(&req->rl_registered);
539 dprintk("RPC: %s: DMA unmapping frmr %p\n",
545 __func__, &mw->frmr); 540 __func__, &mw->frmr);
546 list_del_init(&mw->mw_list);
547 ib_dma_unmap_sg(ia->ri_device, 541 ib_dma_unmap_sg(ia->ri_device,
548 mw->mw_sg, mw->mw_nents, mw->mw_dir); 542 mw->mw_sg, mw->mw_nents, mw->mw_dir);
549 rpcrdma_put_mw(r_xprt, mw); 543 rpcrdma_put_mw(r_xprt, mw);
@@ -559,7 +553,7 @@ reset_mrs:
559 */ 553 */
560 list_for_each_entry(mw, &req->rl_registered, mw_list) { 554 list_for_each_entry(mw, &req->rl_registered, mw_list) {
561 f = &mw->frmr; 555 f = &mw->frmr;
562 if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) { 556 if (mw->mw_handle == bad_wr->ex.invalidate_rkey) {
563 __frwr_reset_mr(ia, mw); 557 __frwr_reset_mr(ia, mw);
564 bad_wr = bad_wr->next; 558 bad_wr = bad_wr->next;
565 } 559 }
@@ -577,10 +571,7 @@ frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
577 struct rpcrdma_mw *mw; 571 struct rpcrdma_mw *mw;
578 572
579 while (!list_empty(&req->rl_registered)) { 573 while (!list_empty(&req->rl_registered)) {
580 mw = list_first_entry(&req->rl_registered, 574 mw = rpcrdma_pop_mw(&req->rl_registered);
581 struct rpcrdma_mw, mw_list);
582 list_del_init(&mw->mw_list);
583
584 if (sync) 575 if (sync)
585 frwr_op_recover_mr(mw); 576 frwr_op_recover_mr(mw);
586 else 577 else
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index d987c2d3dd6e..a044be2d6ad7 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -125,14 +125,34 @@ void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
125/* The client can send a request inline as long as the RPCRDMA header 125/* The client can send a request inline as long as the RPCRDMA header
126 * plus the RPC call fit under the transport's inline limit. If the 126 * plus the RPC call fit under the transport's inline limit. If the
127 * combined call message size exceeds that limit, the client must use 127 * combined call message size exceeds that limit, the client must use
128 * the read chunk list for this operation. 128 * a Read chunk for this operation.
129 *
130 * A Read chunk is also required if sending the RPC call inline would
131 * exceed this device's max_sge limit.
129 */ 132 */
130static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, 133static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
131 struct rpc_rqst *rqst) 134 struct rpc_rqst *rqst)
132{ 135{
133 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 136 struct xdr_buf *xdr = &rqst->rq_snd_buf;
137 unsigned int count, remaining, offset;
138
139 if (xdr->len > r_xprt->rx_ia.ri_max_inline_write)
140 return false;
141
142 if (xdr->page_len) {
143 remaining = xdr->page_len;
144 offset = xdr->page_base & ~PAGE_MASK;
145 count = 0;
146 while (remaining) {
147 remaining -= min_t(unsigned int,
148 PAGE_SIZE - offset, remaining);
149 offset = 0;
150 if (++count > r_xprt->rx_ia.ri_max_send_sges)
151 return false;
152 }
153 }
134 154
135 return rqst->rq_snd_buf.len <= ia->ri_max_inline_write; 155 return true;
136} 156}
137 157
138/* The client can't know how large the actual reply will be. Thus it 158/* The client can't know how large the actual reply will be. Thus it
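The rewritten rpcrdma_args_inline() above additionally rejects the inline path when the page portion of rq_snd_buf would need more send SGEs than ri_max_send_sges allows, counting one SGE per page crossing. The counting loop in isolation, as a hypothetical helper using the same arithmetic as the hunk:

#include <linux/kernel.h>
#include <linux/mm.h>

/* Count how many page-sized SGEs a byte range needs, given its offset
 * into the first page. Mirrors the loop in the hunk above.
 */
static unsigned int count_page_sges(unsigned int page_base,
				    unsigned int page_len)
{
	unsigned int remaining = page_len;
	unsigned int offset = page_base & ~PAGE_MASK;
	unsigned int count = 0;

	while (remaining) {
		remaining -= min_t(unsigned int, PAGE_SIZE - offset, remaining);
		offset = 0;
		count++;
	}
	return count;
}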
@@ -186,9 +206,9 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
186 */ 206 */
187 207
188static int 208static int
189rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, 209rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
190 enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, 210 unsigned int pos, enum rpcrdma_chunktype type,
191 bool reminv_expected) 211 struct rpcrdma_mr_seg *seg)
192{ 212{
193 int len, n, p, page_base; 213 int len, n, p, page_base;
194 struct page **ppages; 214 struct page **ppages;
@@ -226,22 +246,21 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
226 if (len && n == RPCRDMA_MAX_SEGS) 246 if (len && n == RPCRDMA_MAX_SEGS)
227 goto out_overflow; 247 goto out_overflow;
228 248
229 /* When encoding the read list, the tail is always sent inline */ 249 /* When encoding a Read chunk, the tail iovec contains an
230 if (type == rpcrdma_readch) 250 * XDR pad and may be omitted.
251 */
252 if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup)
231 return n; 253 return n;
232 254
233 /* When encoding the Write list, some servers need to see an extra 255 /* When encoding a Write chunk, some servers need to see an
234 * segment for odd-length Write chunks. The upper layer provides 256 * extra segment for non-XDR-aligned Write chunks. The upper
235 * space in the tail iovec for this purpose. 257 * layer provides space in the tail iovec that may be used
258 * for this purpose.
236 */ 259 */
237 if (type == rpcrdma_writech && reminv_expected) 260 if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup)
238 return n; 261 return n;
239 262
240 if (xdrbuf->tail[0].iov_len) { 263 if (xdrbuf->tail[0].iov_len) {
241 /* the rpcrdma protocol allows us to omit any trailing
242 * xdr pad bytes, saving the server an RDMA operation. */
243 if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
244 return n;
245 n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n); 264 n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n);
246 if (n == RPCRDMA_MAX_SEGS) 265 if (n == RPCRDMA_MAX_SEGS)
247 goto out_overflow; 266 goto out_overflow;
@@ -293,7 +312,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
293 if (rtype == rpcrdma_areadch) 312 if (rtype == rpcrdma_areadch)
294 pos = 0; 313 pos = 0;
295 seg = req->rl_segments; 314 seg = req->rl_segments;
296 nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, false); 315 nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos,
316 rtype, seg);
297 if (nsegs < 0) 317 if (nsegs < 0)
298 return ERR_PTR(nsegs); 318 return ERR_PTR(nsegs);
299 319
@@ -302,7 +322,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
302 false, &mw); 322 false, &mw);
303 if (n < 0) 323 if (n < 0)
304 return ERR_PTR(n); 324 return ERR_PTR(n);
305 list_add(&mw->mw_list, &req->rl_registered); 325 rpcrdma_push_mw(mw, &req->rl_registered);
306 326
307 *iptr++ = xdr_one; /* item present */ 327 *iptr++ = xdr_one; /* item present */
308 328
@@ -355,10 +375,9 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
355 } 375 }
356 376
357 seg = req->rl_segments; 377 seg = req->rl_segments;
358 nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 378 nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf,
359 rqst->rq_rcv_buf.head[0].iov_len, 379 rqst->rq_rcv_buf.head[0].iov_len,
360 wtype, seg, 380 wtype, seg);
361 r_xprt->rx_ia.ri_reminv_expected);
362 if (nsegs < 0) 381 if (nsegs < 0)
363 return ERR_PTR(nsegs); 382 return ERR_PTR(nsegs);
364 383
@@ -371,7 +390,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
371 true, &mw); 390 true, &mw);
372 if (n < 0) 391 if (n < 0)
373 return ERR_PTR(n); 392 return ERR_PTR(n);
374 list_add(&mw->mw_list, &req->rl_registered); 393 rpcrdma_push_mw(mw, &req->rl_registered);
375 394
376 iptr = xdr_encode_rdma_segment(iptr, mw); 395 iptr = xdr_encode_rdma_segment(iptr, mw);
377 396
@@ -423,8 +442,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
423 } 442 }
424 443
425 seg = req->rl_segments; 444 seg = req->rl_segments;
426 nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg, 445 nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg);
427 r_xprt->rx_ia.ri_reminv_expected);
428 if (nsegs < 0) 446 if (nsegs < 0)
429 return ERR_PTR(nsegs); 447 return ERR_PTR(nsegs);
430 448
@@ -437,7 +455,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
437 true, &mw); 455 true, &mw);
438 if (n < 0) 456 if (n < 0)
439 return ERR_PTR(n); 457 return ERR_PTR(n);
440 list_add(&mw->mw_list, &req->rl_registered); 458 rpcrdma_push_mw(mw, &req->rl_registered);
441 459
442 iptr = xdr_encode_rdma_segment(iptr, mw); 460 iptr = xdr_encode_rdma_segment(iptr, mw);
443 461
@@ -741,13 +759,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
741 iptr = headerp->rm_body.rm_chunks; 759 iptr = headerp->rm_body.rm_chunks;
742 iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); 760 iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
743 if (IS_ERR(iptr)) 761 if (IS_ERR(iptr))
744 goto out_unmap; 762 goto out_err;
745 iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype); 763 iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype);
746 if (IS_ERR(iptr)) 764 if (IS_ERR(iptr))
747 goto out_unmap; 765 goto out_err;
748 iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype); 766 iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype);
749 if (IS_ERR(iptr)) 767 if (IS_ERR(iptr))
750 goto out_unmap; 768 goto out_err;
751 hdrlen = (unsigned char *)iptr - (unsigned char *)headerp; 769 hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;
752 770
753 dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n", 771 dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
@@ -758,12 +776,14 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
758 if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen, 776 if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen,
759 &rqst->rq_snd_buf, rtype)) { 777 &rqst->rq_snd_buf, rtype)) {
760 iptr = ERR_PTR(-EIO); 778 iptr = ERR_PTR(-EIO);
761 goto out_unmap; 779 goto out_err;
762 } 780 }
763 return 0; 781 return 0;
764 782
765out_unmap: 783out_err:
766 r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); 784 pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n",
785 PTR_ERR(iptr));
786 r_xprt->rx_stats.failed_marshal_count++;
767 return PTR_ERR(iptr); 787 return PTR_ERR(iptr);
768} 788}
769 789
@@ -786,7 +806,7 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp)
786 ifdebug(FACILITY) { 806 ifdebug(FACILITY) {
787 u64 off; 807 u64 off;
788 xdr_decode_hyper((__be32 *)&seg->rs_offset, &off); 808 xdr_decode_hyper((__be32 *)&seg->rs_offset, &off);
789 dprintk("RPC: %s: chunk %d@0x%llx:0x%x\n", 809 dprintk("RPC: %s: chunk %d@0x%016llx:0x%08x\n",
790 __func__, 810 __func__,
791 be32_to_cpu(seg->rs_length), 811 be32_to_cpu(seg->rs_length),
792 (unsigned long long)off, 812 (unsigned long long)off,
@@ -906,28 +926,6 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
906 return fixup_copy_count; 926 return fixup_copy_count;
907} 927}
908 928
909void
910rpcrdma_connect_worker(struct work_struct *work)
911{
912 struct rpcrdma_ep *ep =
913 container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
914 struct rpcrdma_xprt *r_xprt =
915 container_of(ep, struct rpcrdma_xprt, rx_ep);
916 struct rpc_xprt *xprt = &r_xprt->rx_xprt;
917
918 spin_lock_bh(&xprt->transport_lock);
919 if (++xprt->connect_cookie == 0) /* maintain a reserved value */
920 ++xprt->connect_cookie;
921 if (ep->rep_connected > 0) {
922 if (!xprt_test_and_set_connected(xprt))
923 xprt_wake_pending_tasks(xprt, 0);
924 } else {
925 if (xprt_test_and_clear_connected(xprt))
926 xprt_wake_pending_tasks(xprt, -ENOTCONN);
927 }
928 spin_unlock_bh(&xprt->transport_lock);
929}
930
931#if defined(CONFIG_SUNRPC_BACKCHANNEL) 929#if defined(CONFIG_SUNRPC_BACKCHANNEL)
932/* By convention, backchannel calls arrive via rdma_msg type 930/* By convention, backchannel calls arrive via rdma_msg type
933 * messages, and never populate the chunk lists. This makes 931 * messages, and never populate the chunk lists. This makes
@@ -959,18 +957,6 @@ rpcrdma_is_bcall(struct rpcrdma_msg *headerp)
959} 957}
960#endif /* CONFIG_SUNRPC_BACKCHANNEL */ 958#endif /* CONFIG_SUNRPC_BACKCHANNEL */
961 959
962/*
963 * This function is called when an async event is posted to
964 * the connection which changes the connection state. All it
965 * does at this point is mark the connection up/down, the rpc
966 * timers do the rest.
967 */
968void
969rpcrdma_conn_func(struct rpcrdma_ep *ep)
970{
971 schedule_delayed_work(&ep->rep_connect_worker, 0);
972}
973
974/* Process received RPC/RDMA messages. 960/* Process received RPC/RDMA messages.
975 * 961 *
976 * Errors must result in the RPC task either being awakened, or 962 * Errors must result in the RPC task either being awakened, or
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index 20027f8de129..ff1df40f0d26 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -4,6 +4,7 @@
4 * Support for backward direction RPCs on RPC/RDMA (server-side). 4 * Support for backward direction RPCs on RPC/RDMA (server-side).
5 */ 5 */
6 6
7#include <linux/module.h>
7#include <linux/sunrpc/svc_rdma.h> 8#include <linux/sunrpc/svc_rdma.h>
8#include "xprt_rdma.h" 9#include "xprt_rdma.h"
9 10
@@ -164,13 +165,9 @@ static int
164xprt_rdma_bc_allocate(struct rpc_task *task) 165xprt_rdma_bc_allocate(struct rpc_task *task)
165{ 166{
166 struct rpc_rqst *rqst = task->tk_rqstp; 167 struct rpc_rqst *rqst = task->tk_rqstp;
167 struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
168 size_t size = rqst->rq_callsize; 168 size_t size = rqst->rq_callsize;
169 struct svcxprt_rdma *rdma;
170 struct page *page; 169 struct page *page;
171 170
172 rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
173
174 if (size > PAGE_SIZE) { 171 if (size > PAGE_SIZE) {
175 WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n", 172 WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n",
176 size); 173 size);
@@ -204,19 +201,20 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst)
204{ 201{
205 struct rpc_xprt *xprt = rqst->rq_xprt; 202 struct rpc_xprt *xprt = rqst->rq_xprt;
206 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 203 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
207 struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer; 204 __be32 *p;
208 int rc; 205 int rc;
209 206
210 /* Space in the send buffer for an RPC/RDMA header is reserved 207 /* Space in the send buffer for an RPC/RDMA header is reserved
211 * via xprt->tsh_size. 208 * via xprt->tsh_size.
212 */ 209 */
213 headerp->rm_xid = rqst->rq_xid; 210 p = rqst->rq_buffer;
214 headerp->rm_vers = rpcrdma_version; 211 *p++ = rqst->rq_xid;
215 headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); 212 *p++ = rpcrdma_version;
216 headerp->rm_type = rdma_msg; 213 *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
217 headerp->rm_body.rm_chunks[0] = xdr_zero; 214 *p++ = rdma_msg;
218 headerp->rm_body.rm_chunks[1] = xdr_zero; 215 *p++ = xdr_zero;
219 headerp->rm_body.rm_chunks[2] = xdr_zero; 216 *p++ = xdr_zero;
217 *p = xdr_zero;
220 218
221#ifdef SVCRDMA_BACKCHANNEL_DEBUG 219#ifdef SVCRDMA_BACKCHANNEL_DEBUG
222 pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); 220 pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer);
@@ -359,6 +357,7 @@ xprt_setup_rdma_bc(struct xprt_create *args)
359out_fail: 357out_fail:
360 xprt_rdma_free_addresses(xprt); 358 xprt_rdma_free_addresses(xprt);
361 args->bc_xprt->xpt_bc_xprt = NULL; 359 args->bc_xprt->xpt_bc_xprt = NULL;
360 args->bc_xprt->xpt_bc_xps = NULL;
362 xprt_put(xprt); 361 xprt_put(xprt);
363 xprt_free(xprt); 362 xprt_free(xprt);
364 return ERR_PTR(-EINVAL); 363 return ERR_PTR(-EINVAL);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 0ba9887f3e22..1c4aabf0f657 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -1,4 +1,5 @@
1/* 1/*
2 * Copyright (c) 2016 Oracle. All rights reserved.
2 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. 3 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
3 * 4 *
4 * This software is available to you under a choice of one of two 5 * This software is available to you under a choice of one of two
@@ -47,102 +48,43 @@
47 48
48#define RPCDBG_FACILITY RPCDBG_SVCXPRT 49#define RPCDBG_FACILITY RPCDBG_SVCXPRT
49 50
50/* 51static __be32 *xdr_check_read_list(__be32 *p, __be32 *end)
51 * Decodes a read chunk list. The expected format is as follows:
52 * descrim : xdr_one
53 * position : __be32 offset into XDR stream
54 * handle : __be32 RKEY
55 * . . .
56 * end-of-list: xdr_zero
57 */
58static __be32 *decode_read_list(__be32 *va, __be32 *vaend)
59{ 52{
60 struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va; 53 __be32 *next;
61 54
62 while (ch->rc_discrim != xdr_zero) { 55 while (*p++ != xdr_zero) {
63 if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) > 56 next = p + rpcrdma_readchunk_maxsz - 1;
64 (unsigned long)vaend) { 57 if (next > end)
65 dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch);
66 return NULL; 58 return NULL;
67 } 59 p = next;
68 ch++;
69 } 60 }
70 return &ch->rc_position; 61 return p;
71} 62}
72 63
73/* 64static __be32 *xdr_check_write_list(__be32 *p, __be32 *end)
74 * Decodes a write chunk list. The expected format is as follows:
75 * descrim : xdr_one
76 * nchunks : <count>
77 * handle : __be32 RKEY ---+
78 * length : __be32 <len of segment> |
79 * offset : remove va + <count>
80 * . . . |
81 * ---+
82 */
83static __be32 *decode_write_list(__be32 *va, __be32 *vaend)
84{ 65{
85 unsigned long start, end; 66 __be32 *next;
86 int nchunks;
87
88 struct rpcrdma_write_array *ary =
89 (struct rpcrdma_write_array *)va;
90 67
91 /* Check for not write-array */ 68 while (*p++ != xdr_zero) {
92 if (ary->wc_discrim == xdr_zero) 69 next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
93 return &ary->wc_nchunks; 70 if (next > end)
94 71 return NULL;
95 if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > 72 p = next;
96 (unsigned long)vaend) {
97 dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
98 return NULL;
99 }
100 nchunks = be32_to_cpu(ary->wc_nchunks);
101
102 start = (unsigned long)&ary->wc_array[0];
103 end = (unsigned long)vaend;
104 if (nchunks < 0 ||
105 nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
106 (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
107 dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
108 ary, nchunks, vaend);
109 return NULL;
110 } 73 }
111 /* 74 return p;
112 * rs_length is the 2nd 4B field in wc_target and taking its
113 * address skips the list terminator
114 */
115 return &ary->wc_array[nchunks].wc_target.rs_length;
116} 75}
117 76
118static __be32 *decode_reply_array(__be32 *va, __be32 *vaend) 77static __be32 *xdr_check_reply_chunk(__be32 *p, __be32 *end)
119{ 78{
120 unsigned long start, end; 79 __be32 *next;
121 int nchunks; 80
122 struct rpcrdma_write_array *ary = 81 if (*p++ != xdr_zero) {
123 (struct rpcrdma_write_array *)va; 82 next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
124 83 if (next > end)
125 /* Check for no reply-array */ 84 return NULL;
126 if (ary->wc_discrim == xdr_zero) 85 p = next;
127 return &ary->wc_nchunks;
128
129 if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
130 (unsigned long)vaend) {
131 dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
132 return NULL;
133 }
134 nchunks = be32_to_cpu(ary->wc_nchunks);
135
136 start = (unsigned long)&ary->wc_array[0];
137 end = (unsigned long)vaend;
138 if (nchunks < 0 ||
139 nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
140 (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
141 dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
142 ary, nchunks, vaend);
143 return NULL;
144 } 86 }
145 return (__be32 *)&ary->wc_array[nchunks]; 87 return p;
146} 88}
147 89
148/** 90/**
@@ -158,87 +100,71 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
158 */ 100 */
159int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) 101int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg)
160{ 102{
161 struct rpcrdma_msg *rmsgp; 103 __be32 *p, *end, *rdma_argp;
162 __be32 *va, *vaend; 104 unsigned int hdr_len;
163 unsigned int len;
164 u32 hdr_len;
165 105
166 /* Verify that there's enough bytes for header + something */ 106 /* Verify that there's enough bytes for header + something */
167 if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) { 107 if (rq_arg->len <= RPCRDMA_HDRLEN_ERR)
168 dprintk("svcrdma: header too short = %d\n", 108 goto out_short;
169 rq_arg->len);
170 return -EINVAL;
171 }
172 109
173 rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base; 110 rdma_argp = rq_arg->head[0].iov_base;
174 if (rmsgp->rm_vers != rpcrdma_version) { 111 if (*(rdma_argp + 1) != rpcrdma_version)
175 dprintk("%s: bad version %u\n", __func__, 112 goto out_version;
176 be32_to_cpu(rmsgp->rm_vers));
177 return -EPROTONOSUPPORT;
178 }
179 113
180 switch (be32_to_cpu(rmsgp->rm_type)) { 114 switch (*(rdma_argp + 3)) {
181 case RDMA_MSG: 115 case rdma_msg:
182 case RDMA_NOMSG: 116 case rdma_nomsg:
183 break; 117 break;
184 118
185 case RDMA_DONE: 119 case rdma_done:
186 /* Just drop it */ 120 goto out_drop;
187 dprintk("svcrdma: dropping RDMA_DONE message\n");
188 return 0;
189
190 case RDMA_ERROR:
191 /* Possible if this is a backchannel reply.
192 * XXX: We should cancel this XID, though.
193 */
194 dprintk("svcrdma: dropping RDMA_ERROR message\n");
195 return 0;
196
197 case RDMA_MSGP:
198 /* Pull in the extra for the padded case, bump our pointer */
199 rmsgp->rm_body.rm_padded.rm_align =
200 be32_to_cpu(rmsgp->rm_body.rm_padded.rm_align);
201 rmsgp->rm_body.rm_padded.rm_thresh =
202 be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh);
203
204 va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
205 rq_arg->head[0].iov_base = va;
206 len = (u32)((unsigned long)va - (unsigned long)rmsgp);
207 rq_arg->head[0].iov_len -= len;
208 if (len > rq_arg->len)
209 return -EINVAL;
210 return len;
211 default:
212 dprintk("svcrdma: bad rdma procedure (%u)\n",
213 be32_to_cpu(rmsgp->rm_type));
214 return -EINVAL;
215 }
216 121
217 /* The chunk list may contain either a read chunk list or a write 122 case rdma_error:
218 * chunk list and a reply chunk list. 123 goto out_drop;
219 */ 124
220 va = &rmsgp->rm_body.rm_chunks[0]; 125 default:
221 vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len); 126 goto out_proc;
222 va = decode_read_list(va, vaend);
223 if (!va) {
224 dprintk("svcrdma: failed to decode read list\n");
225 return -EINVAL;
226 }
227 va = decode_write_list(va, vaend);
228 if (!va) {
229 dprintk("svcrdma: failed to decode write list\n");
230 return -EINVAL;
231 }
232 va = decode_reply_array(va, vaend);
233 if (!va) {
234 dprintk("svcrdma: failed to decode reply chunk\n");
235 return -EINVAL;
236 } 127 }
237 128
238 rq_arg->head[0].iov_base = va; 129 end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len);
239 hdr_len = (unsigned long)va - (unsigned long)rmsgp; 130 p = xdr_check_read_list(rdma_argp + 4, end);
131 if (!p)
132 goto out_inval;
133 p = xdr_check_write_list(p, end);
134 if (!p)
135 goto out_inval;
136 p = xdr_check_reply_chunk(p, end);
137 if (!p)
138 goto out_inval;
139 if (p > end)
140 goto out_inval;
141
142 rq_arg->head[0].iov_base = p;
143 hdr_len = (unsigned long)p - (unsigned long)rdma_argp;
240 rq_arg->head[0].iov_len -= hdr_len; 144 rq_arg->head[0].iov_len -= hdr_len;
241 return hdr_len; 145 return hdr_len;
146
147out_short:
148 dprintk("svcrdma: header too short = %d\n", rq_arg->len);
149 return -EINVAL;
150
151out_version:
152 dprintk("svcrdma: bad xprt version: %u\n",
153 be32_to_cpup(rdma_argp + 1));
154 return -EPROTONOSUPPORT;
155
156out_drop:
157 dprintk("svcrdma: dropping RDMA_DONE/ERROR message\n");
158 return 0;
159
160out_proc:
161 dprintk("svcrdma: bad rdma procedure (%u)\n",
162 be32_to_cpup(rdma_argp + 3));
163 return -EINVAL;
164
165out_inval:
166 dprintk("svcrdma: failed to parse transport header\n");
167 return -EINVAL;
242} 168}
243 169
244int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, 170int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
@@ -249,7 +175,7 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
249 175
250 *va++ = rmsgp->rm_xid; 176 *va++ = rmsgp->rm_xid;
251 *va++ = rmsgp->rm_vers; 177 *va++ = rmsgp->rm_vers;
252 *va++ = cpu_to_be32(xprt->sc_max_requests); 178 *va++ = xprt->sc_fc_credits;
253 *va++ = rdma_error; 179 *va++ = rdma_error;
254 *va++ = cpu_to_be32(err); 180 *va++ = cpu_to_be32(err);
255 if (err == ERR_VERS) { 181 if (err == ERR_VERS) {
@@ -260,32 +186,35 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
260 return (int)((unsigned long)va - (unsigned long)startp); 186 return (int)((unsigned long)va - (unsigned long)startp);
261} 187}
262 188
263int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp) 189/**
 190 * svc_rdma_xdr_get_reply_hdr_len - Get length of Reply transport header
191 * @rdma_resp: buffer containing Reply transport header
192 *
193 * Returns length of transport header, in bytes.
194 */
195unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp)
264{ 196{
265 struct rpcrdma_write_array *wr_ary; 197 unsigned int nsegs;
198 __be32 *p;
266 199
267 /* There is no read-list in a reply */ 200 p = rdma_resp;
268 201
269 /* skip write list */ 202 /* RPC-over-RDMA V1 replies never have a Read list. */
270 wr_ary = (struct rpcrdma_write_array *) 203 p += rpcrdma_fixed_maxsz + 1;
271 &rmsgp->rm_body.rm_chunks[1]; 204
272 if (wr_ary->wc_discrim) 205 /* Skip Write list. */
273 wr_ary = (struct rpcrdma_write_array *) 206 while (*p++ != xdr_zero) {
274 &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)]. 207 nsegs = be32_to_cpup(p++);
275 wc_target.rs_length; 208 p += nsegs * rpcrdma_segment_maxsz;
276 else 209 }
277 wr_ary = (struct rpcrdma_write_array *) 210
278 &wr_ary->wc_nchunks; 211 /* Skip Reply chunk. */
279 212 if (*p++ != xdr_zero) {
280 /* skip reply array */ 213 nsegs = be32_to_cpup(p++);
281 if (wr_ary->wc_discrim) 214 p += nsegs * rpcrdma_segment_maxsz;
282 wr_ary = (struct rpcrdma_write_array *) 215 }
283 &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)]; 216
284 else 217 return (unsigned long)p - (unsigned long)rdma_resp;
285 wr_ary = (struct rpcrdma_write_array *)
286 &wr_ary->wc_nchunks;
287
288 return (unsigned long) wr_ary - (unsigned long) rmsgp;
289} 218}
290 219
291void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks) 220void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
@@ -326,19 +255,3 @@ void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
326 seg->rs_offset = rs_offset; 255 seg->rs_offset = rs_offset;
327 seg->rs_length = cpu_to_be32(write_len); 256 seg->rs_length = cpu_to_be32(write_len);
328} 257}
329
330void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
331 struct rpcrdma_msg *rdma_argp,
332 struct rpcrdma_msg *rdma_resp,
333 enum rpcrdma_proc rdma_type)
334{
335 rdma_resp->rm_xid = rdma_argp->rm_xid;
336 rdma_resp->rm_vers = rdma_argp->rm_vers;
337 rdma_resp->rm_credit = cpu_to_be32(xprt->sc_max_requests);
338 rdma_resp->rm_type = cpu_to_be32(rdma_type);
339
340 /* Encode <nul> chunks lists */
341 rdma_resp->rm_body.rm_chunks[0] = xdr_zero;
342 rdma_resp->rm_body.rm_chunks[1] = xdr_zero;
343 rdma_resp->rm_body.rm_chunks[2] = xdr_zero;
344}
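The xdr_check_read_list(), xdr_check_write_list() and xdr_check_reply_chunk() helpers above all walk a zero-terminated chunk list with a __be32 cursor, advancing by a fixed per-entry size and refusing to step past the end of the received header. A generic sketch of that pattern; the entry size below is a placeholder, not the RPC-over-RDMA segment size, and this is not one of the kernel helpers themselves:

#include <linux/types.h>
#include <linux/sunrpc/xdr.h>	/* xdr_zero */

#define SEGMENT_WORDS	4	/* hypothetical: 32-bit words per list entry */

/* Walk a zero-terminated list of fixed-size entries; return the word
 * just past the terminator, or NULL if the list overruns the buffer.
 */
static __be32 *check_list(__be32 *p, __be32 *end)
{
	while (p < end) {
		if (*p++ == xdr_zero)
			return p;		/* hit the terminator: success */
		p += SEGMENT_WORDS;		/* skip this entry's payload */
	}
	return NULL;				/* ran off the end of the header */
}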
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index ad1df979b3f0..f7b2daf72a86 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -279,7 +279,6 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
279 frmr->sg); 279 frmr->sg);
280 return -ENOMEM; 280 return -ENOMEM;
281 } 281 }
282 atomic_inc(&xprt->sc_dma_used);
283 282
284 n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE); 283 n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE);
285 if (unlikely(n != frmr->sg_nents)) { 284 if (unlikely(n != frmr->sg_nents)) {
@@ -348,8 +347,6 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
348 atomic_inc(&rdma_stat_read); 347 atomic_inc(&rdma_stat_read);
349 return ret; 348 return ret;
350 err: 349 err:
351 ib_dma_unmap_sg(xprt->sc_cm_id->device,
352 frmr->sg, frmr->sg_nents, frmr->direction);
353 svc_rdma_put_context(ctxt, 0); 350 svc_rdma_put_context(ctxt, 0);
354 svc_rdma_put_frmr(xprt, frmr); 351 svc_rdma_put_frmr(xprt, frmr);
355 return ret; 352 return ret;
@@ -374,9 +371,7 @@ rdma_copy_tail(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head,
374 u32 position, u32 byte_count, u32 page_offset, int page_no) 371 u32 position, u32 byte_count, u32 page_offset, int page_no)
375{ 372{
376 char *srcp, *destp; 373 char *srcp, *destp;
377 int ret;
378 374
379 ret = 0;
380 srcp = head->arg.head[0].iov_base + position; 375 srcp = head->arg.head[0].iov_base + position;
381 byte_count = head->arg.head[0].iov_len - position; 376 byte_count = head->arg.head[0].iov_len - position;
382 if (byte_count > PAGE_SIZE) { 377 if (byte_count > PAGE_SIZE) {
@@ -415,6 +410,20 @@ done:
415 return 1; 410 return 1;
416} 411}
417 412
413/* Returns the address of the first read chunk or <nul> if no read chunk
414 * is present
415 */
416static struct rpcrdma_read_chunk *
417svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp)
418{
419 struct rpcrdma_read_chunk *ch =
420 (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
421
422 if (ch->rc_discrim == xdr_zero)
423 return NULL;
424 return ch;
425}
426
418static int rdma_read_chunks(struct svcxprt_rdma *xprt, 427static int rdma_read_chunks(struct svcxprt_rdma *xprt,
419 struct rpcrdma_msg *rmsgp, 428 struct rpcrdma_msg *rmsgp,
420 struct svc_rqst *rqstp, 429 struct svc_rqst *rqstp,
@@ -597,26 +606,24 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
597 606
598 dprintk("svcrdma: rqstp=%p\n", rqstp); 607 dprintk("svcrdma: rqstp=%p\n", rqstp);
599 608
600 spin_lock_bh(&rdma_xprt->sc_rq_dto_lock); 609 spin_lock(&rdma_xprt->sc_rq_dto_lock);
601 if (!list_empty(&rdma_xprt->sc_read_complete_q)) { 610 if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
602 ctxt = list_entry(rdma_xprt->sc_read_complete_q.next, 611 ctxt = list_first_entry(&rdma_xprt->sc_read_complete_q,
603 struct svc_rdma_op_ctxt, 612 struct svc_rdma_op_ctxt, list);
604 dto_q); 613 list_del(&ctxt->list);
605 list_del_init(&ctxt->dto_q); 614 spin_unlock(&rdma_xprt->sc_rq_dto_lock);
606 spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
607 rdma_read_complete(rqstp, ctxt); 615 rdma_read_complete(rqstp, ctxt);
608 goto complete; 616 goto complete;
609 } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { 617 } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
610 ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next, 618 ctxt = list_first_entry(&rdma_xprt->sc_rq_dto_q,
611 struct svc_rdma_op_ctxt, 619 struct svc_rdma_op_ctxt, list);
612 dto_q); 620 list_del(&ctxt->list);
613 list_del_init(&ctxt->dto_q);
614 } else { 621 } else {
615 atomic_inc(&rdma_stat_rq_starve); 622 atomic_inc(&rdma_stat_rq_starve);
616 clear_bit(XPT_DATA, &xprt->xpt_flags); 623 clear_bit(XPT_DATA, &xprt->xpt_flags);
617 ctxt = NULL; 624 ctxt = NULL;
618 } 625 }
619 spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); 626 spin_unlock(&rdma_xprt->sc_rq_dto_lock);
620 if (!ctxt) { 627 if (!ctxt) {
621 /* This is the EAGAIN path. The svc_recv routine will 628 /* This is the EAGAIN path. The svc_recv routine will
622 * return -EAGAIN, the nfsd thread will go to call into 629 * return -EAGAIN, the nfsd thread will go to call into
@@ -627,8 +634,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
627 goto defer; 634 goto defer;
628 goto out; 635 goto out;
629 } 636 }
630 dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n", 637 dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p\n",
631 ctxt, rdma_xprt, rqstp, ctxt->wc_status); 638 ctxt, rdma_xprt, rqstp);
632 atomic_inc(&rdma_stat_recv); 639 atomic_inc(&rdma_stat_recv);
633 640
634 /* Build up the XDR from the receive buffers. */ 641 /* Build up the XDR from the receive buffers. */
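svc_rdma_recvfrom() now pops completed contexts with list_first_entry()/list_del() under a plain spin_lock(); the locking-context change itself is not explained in the hunk. The dequeue idiom, sketched with a hypothetical item type rather than the svcrdma structures:

#include <linux/list.h>
#include <linux/spinlock.h>

struct work_item {
	struct list_head list;
};

static LIST_HEAD(work_q);
static DEFINE_SPINLOCK(work_q_lock);

/* Pop the oldest item, or return NULL if the queue is empty. */
static struct work_item *work_q_pop(void)
{
	struct work_item *item = NULL;

	spin_lock(&work_q_lock);
	if (!list_empty(&work_q)) {
		item = list_first_entry(&work_q, struct work_item, list);
		list_del(&item->list);
	}
	spin_unlock(&work_q_lock);
	return item;
}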
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index f5a91edcd233..515221b16d09 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -153,76 +153,35 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
153 return dma_addr; 153 return dma_addr;
154} 154}
155 155
156/* Returns the address of the first read chunk or <nul> if no read chunk 156/* Parse the RPC Call's transport header.
157 * is present
158 */ 157 */
159struct rpcrdma_read_chunk * 158static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
160svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp) 159 struct rpcrdma_write_array **write,
160 struct rpcrdma_write_array **reply)
161{ 161{
162 struct rpcrdma_read_chunk *ch = 162 __be32 *p;
163 (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
164 163
165 if (ch->rc_discrim == xdr_zero) 164 p = (__be32 *)&rmsgp->rm_body.rm_chunks[0];
166 return NULL;
167 return ch;
168}
169
170/* Returns the address of the first read write array element or <nul>
171 * if no write array list is present
172 */
173static struct rpcrdma_write_array *
174svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp)
175{
176 if (rmsgp->rm_body.rm_chunks[0] != xdr_zero ||
177 rmsgp->rm_body.rm_chunks[1] == xdr_zero)
178 return NULL;
179 return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1];
180}
181
182/* Returns the address of the first reply array element or <nul> if no
183 * reply array is present
184 */
185static struct rpcrdma_write_array *
186svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp,
187 struct rpcrdma_write_array *wr_ary)
188{
189 struct rpcrdma_read_chunk *rch;
190 struct rpcrdma_write_array *rp_ary;
191
192 /* XXX: Need to fix when reply chunk may occur with read list
193 * and/or write list.
194 */
195 if (rmsgp->rm_body.rm_chunks[0] != xdr_zero ||
196 rmsgp->rm_body.rm_chunks[1] != xdr_zero)
197 return NULL;
198
199 rch = svc_rdma_get_read_chunk(rmsgp);
200 if (rch) {
201 while (rch->rc_discrim != xdr_zero)
202 rch++;
203
204 /* The reply chunk follows an empty write array located
205 * at 'rc_position' here. The reply array is at rc_target.
206 */
207 rp_ary = (struct rpcrdma_write_array *)&rch->rc_target;
208 goto found_it;
209 }
210 165
211 if (wr_ary) { 166 /* Read list */
212 int chunk = be32_to_cpu(wr_ary->wc_nchunks); 167 while (*p++ != xdr_zero)
168 p += 5;
213 169
214 rp_ary = (struct rpcrdma_write_array *) 170 /* Write list */
215 &wr_ary->wc_array[chunk].wc_target.rs_length; 171 if (*p != xdr_zero) {
216 goto found_it; 172 *write = (struct rpcrdma_write_array *)p;
173 while (*p++ != xdr_zero)
174 p += 1 + be32_to_cpu(*p) * 4;
175 } else {
176 *write = NULL;
177 p++;
217 } 178 }
218 179
219 /* No read list, no write list */ 180 /* Reply chunk */
220 rp_ary = (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[2]; 181 if (*p != xdr_zero)
221 182 *reply = (struct rpcrdma_write_array *)p;
222 found_it: 183 else
223 if (rp_ary->wc_discrim == xdr_zero) 184 *reply = NULL;
224 return NULL;
225 return rp_ary;
226} 185}
227 186
228/* RPC-over-RDMA Version One private extension: Remote Invalidation. 187/* RPC-over-RDMA Version One private extension: Remote Invalidation.
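The new svc_rdma_get_write_arrays() above locates the Write list and Reply chunk by stepping through the transport header as raw XDR words: each Read-list entry is a discriminator plus five more words, and each Write array is a discriminator, a segment count, and four words per segment. A user-space sketch of the same walk over a uint32_t buffer is below; the fabricated header in main() and the XDR_ONE/XDR_ZERO names are assumptions used only to exercise the loop.

/* chunkwalk.c: step over RPC-over-RDMA v1 chunk lists the way
 * svc_rdma_get_write_arrays() does.  Illustrative only. */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define XDR_ZERO 0		/* the XDR-encoded value zero */
#define XDR_ONE  htonl(1)

static void find_write_arrays(uint32_t *p, uint32_t **write, uint32_t **reply)
{
	/* Read list: discriminator + position + handle + length + 8-byte offset */
	while (*p++ != XDR_ZERO)
		p += 5;

	/* Write list: each array is discriminator, nchunks, then 4 words/chunk */
	if (*p != XDR_ZERO) {
		*write = p;
		while (*p++ != XDR_ZERO)
			p += 1 + ntohl(*p) * 4;
	} else {
		*write = NULL;
		p++;
	}

	/* Reply chunk */
	*reply = (*p != XDR_ZERO) ? p : NULL;
}

int main(void)
{
	/* A fabricated header body: empty Read list, one Write array with
	 * one segment, then a Reply chunk with one segment. */
	uint32_t body[] = {
		XDR_ZERO,				/* no Read list */
		XDR_ONE, htonl(1),			/* Write array, 1 segment */
		htonl(0xabc), htonl(4096), 0, 0,	/* handle, length, offset */
		XDR_ZERO,				/* end of Write list */
		XDR_ONE, htonl(1),			/* Reply chunk, 1 segment */
		htonl(0xdef), htonl(8192), 0, 0,
	};
	uint32_t *wr, *rp;

	find_write_arrays(body, &wr, &rp);
	printf("write list at word %ld, reply chunk at word %ld\n",
	       wr ? (long)(wr - body) : -1L, rp ? (long)(rp - body) : -1L);
	return 0;
}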
@@ -240,31 +199,22 @@ static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp,
240{ 199{
241 struct rpcrdma_read_chunk *rd_ary; 200 struct rpcrdma_read_chunk *rd_ary;
242 struct rpcrdma_segment *arg_ch; 201 struct rpcrdma_segment *arg_ch;
243 u32 inv_rkey;
244
245 inv_rkey = 0;
246 202
247 rd_ary = svc_rdma_get_read_chunk(rdma_argp); 203 rd_ary = (struct rpcrdma_read_chunk *)&rdma_argp->rm_body.rm_chunks[0];
248 if (rd_ary) { 204 if (rd_ary->rc_discrim != xdr_zero)
249 inv_rkey = be32_to_cpu(rd_ary->rc_target.rs_handle); 205 return be32_to_cpu(rd_ary->rc_target.rs_handle);
250 goto out;
251 }
252 206
253 if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) { 207 if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) {
254 arg_ch = &wr_ary->wc_array[0].wc_target; 208 arg_ch = &wr_ary->wc_array[0].wc_target;
255 inv_rkey = be32_to_cpu(arg_ch->rs_handle); 209 return be32_to_cpu(arg_ch->rs_handle);
256 goto out;
257 } 210 }
258 211
259 if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) { 212 if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) {
260 arg_ch = &rp_ary->wc_array[0].wc_target; 213 arg_ch = &rp_ary->wc_array[0].wc_target;
261 inv_rkey = be32_to_cpu(arg_ch->rs_handle); 214 return be32_to_cpu(arg_ch->rs_handle);
262 goto out;
263 } 215 }
264 216
265out: 217 return 0;
266 dprintk("svcrdma: Send With Invalidate rkey=%08x\n", inv_rkey);
267 return inv_rkey;
268} 218}
269 219
270/* Assumptions: 220/* Assumptions:
@@ -526,7 +476,8 @@ static int send_reply(struct svcxprt_rdma *rdma,
526 476
527 /* Prepare the SGE for the RPCRDMA Header */ 477 /* Prepare the SGE for the RPCRDMA Header */
528 ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; 478 ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
529 ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); 479 ctxt->sge[0].length =
480 svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp);
530 ctxt->sge[0].addr = 481 ctxt->sge[0].addr =
531 ib_dma_map_page(rdma->sc_cm_id->device, page, 0, 482 ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
532 ctxt->sge[0].length, DMA_TO_DEVICE); 483 ctxt->sge[0].length, DMA_TO_DEVICE);
@@ -609,12 +560,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
609 struct rpcrdma_msg *rdma_argp; 560 struct rpcrdma_msg *rdma_argp;
610 struct rpcrdma_msg *rdma_resp; 561 struct rpcrdma_msg *rdma_resp;
611 struct rpcrdma_write_array *wr_ary, *rp_ary; 562 struct rpcrdma_write_array *wr_ary, *rp_ary;
612 enum rpcrdma_proc reply_type;
613 int ret; 563 int ret;
614 int inline_bytes; 564 int inline_bytes;
615 struct page *res_page; 565 struct page *res_page;
616 struct svc_rdma_req_map *vec; 566 struct svc_rdma_req_map *vec;
617 u32 inv_rkey; 567 u32 inv_rkey;
568 __be32 *p;
618 569
619 dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); 570 dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
620 571
@@ -622,8 +573,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
622 * places this at the start of page 0. 573 * places this at the start of page 0.
623 */ 574 */
624 rdma_argp = page_address(rqstp->rq_pages[0]); 575 rdma_argp = page_address(rqstp->rq_pages[0]);
625 wr_ary = svc_rdma_get_write_array(rdma_argp); 576 svc_rdma_get_write_arrays(rdma_argp, &wr_ary, &rp_ary);
626 rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary);
627 577
628 inv_rkey = 0; 578 inv_rkey = 0;
629 if (rdma->sc_snd_w_inv) 579 if (rdma->sc_snd_w_inv)
@@ -636,18 +586,28 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
636 goto err0; 586 goto err0;
637 inline_bytes = rqstp->rq_res.len; 587 inline_bytes = rqstp->rq_res.len;
638 588
639 /* Create the RDMA response header */ 589 /* Create the RDMA response header. xprt->xpt_mutex,
590 * acquired in svc_send(), serializes RPC replies. The
591 * code path below that inserts the credit grant value
592 * into each transport header runs only inside this
593 * critical section.
594 */
640 ret = -ENOMEM; 595 ret = -ENOMEM;
641 res_page = alloc_page(GFP_KERNEL); 596 res_page = alloc_page(GFP_KERNEL);
642 if (!res_page) 597 if (!res_page)
643 goto err0; 598 goto err0;
644 rdma_resp = page_address(res_page); 599 rdma_resp = page_address(res_page);
645 if (rp_ary) 600
646 reply_type = RDMA_NOMSG; 601 p = &rdma_resp->rm_xid;
647 else 602 *p++ = rdma_argp->rm_xid;
648 reply_type = RDMA_MSG; 603 *p++ = rdma_argp->rm_vers;
649 svc_rdma_xdr_encode_reply_header(rdma, rdma_argp, 604 *p++ = rdma->sc_fc_credits;
650 rdma_resp, reply_type); 605 *p++ = rp_ary ? rdma_nomsg : rdma_msg;
606
607 /* Start with empty chunks */
608 *p++ = xdr_zero;
609 *p++ = xdr_zero;
610 *p = xdr_zero;
651 611
652 /* Send any write-chunk data and build resp write-list */ 612 /* Send any write-chunk data and build resp write-list */
653 if (wr_ary) { 613 if (wr_ary) {
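With svc_rdma_xdr_encode_reply_header() gone, the send path above writes the fixed portion of the reply's transport header directly: xid and version copied from the Call, the connection's pre-encoded credit grant, a proc value of rdma_nomsg when a Reply chunk is present (rdma_msg otherwise), and three zero words for empty chunk lists. A user-space sketch of that layout follows; the proc values 0 and 1 are the RDMA_MSG/RDMA_NOMSG codes of RPC-over-RDMA version 1, and everything else here is illustrative.

/* replyhdr.c: lay out the fixed part of an RPC-over-RDMA reply header
 * as big-endian 32-bit words, as svc_rdma_sendto() now does. */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

enum { RDMA_MSG = 0, RDMA_NOMSG = 1 };	/* proc values, RPC-over-RDMA v1 */

/* Returns the number of words written (always 7 for the fixed header). */
static int encode_reply_header(uint32_t *out, uint32_t xid_be, uint32_t vers_be,
			       uint32_t credits_be, int have_reply_chunk)
{
	uint32_t *p = out;

	*p++ = xid_be;				/* copied from the Call */
	*p++ = vers_be;				/* copied from the Call */
	*p++ = credits_be;			/* pre-encoded credit grant */
	*p++ = htonl(have_reply_chunk ? RDMA_NOMSG : RDMA_MSG);

	*p++ = 0;				/* empty Read list */
	*p++ = 0;				/* empty Write list */
	*p++ = 0;				/* no Reply chunk (yet) */
	return (int)(p - out);
}

int main(void)
{
	uint32_t hdr[7];
	int n = encode_reply_header(hdr, htonl(0x11223344), htonl(1),
				    htonl(32), 0);

	printf("encoded %d header words, proc=%u\n", n, ntohl(hdr[3]));
	return 0;
}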
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 1334de2715c2..fc8f14c7bfec 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -41,6 +41,7 @@
41 */ 41 */
42 42
43#include <linux/sunrpc/svc_xprt.h> 43#include <linux/sunrpc/svc_xprt.h>
44#include <linux/sunrpc/addr.h>
44#include <linux/sunrpc/debug.h> 45#include <linux/sunrpc/debug.h>
45#include <linux/sunrpc/rpc_rdma.h> 46#include <linux/sunrpc/rpc_rdma.h>
46#include <linux/interrupt.h> 47#include <linux/interrupt.h>
@@ -126,6 +127,7 @@ static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *serv,
126 xprt = &cma_xprt->sc_xprt; 127 xprt = &cma_xprt->sc_xprt;
127 128
128 svc_xprt_init(net, &svc_rdma_bc_class, xprt, serv); 129 svc_xprt_init(net, &svc_rdma_bc_class, xprt, serv);
130 set_bit(XPT_CONG_CTRL, &xprt->xpt_flags);
129 serv->sv_bc_xprt = xprt; 131 serv->sv_bc_xprt = xprt;
130 132
131 dprintk("svcrdma: %s(%p)\n", __func__, xprt); 133 dprintk("svcrdma: %s(%p)\n", __func__, xprt);
@@ -156,8 +158,7 @@ static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt,
156 ctxt = kmalloc(sizeof(*ctxt), flags); 158 ctxt = kmalloc(sizeof(*ctxt), flags);
157 if (ctxt) { 159 if (ctxt) {
158 ctxt->xprt = xprt; 160 ctxt->xprt = xprt;
159 INIT_LIST_HEAD(&ctxt->free); 161 INIT_LIST_HEAD(&ctxt->list);
160 INIT_LIST_HEAD(&ctxt->dto_q);
161 } 162 }
162 return ctxt; 163 return ctxt;
163} 164}
@@ -179,7 +180,7 @@ static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt)
179 dprintk("svcrdma: No memory for RDMA ctxt\n"); 180 dprintk("svcrdma: No memory for RDMA ctxt\n");
180 return false; 181 return false;
181 } 182 }
182 list_add(&ctxt->free, &xprt->sc_ctxts); 183 list_add(&ctxt->list, &xprt->sc_ctxts);
183 } 184 }
184 return true; 185 return true;
185} 186}
@@ -188,15 +189,15 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
188{ 189{
189 struct svc_rdma_op_ctxt *ctxt = NULL; 190 struct svc_rdma_op_ctxt *ctxt = NULL;
190 191
191 spin_lock_bh(&xprt->sc_ctxt_lock); 192 spin_lock(&xprt->sc_ctxt_lock);
192 xprt->sc_ctxt_used++; 193 xprt->sc_ctxt_used++;
193 if (list_empty(&xprt->sc_ctxts)) 194 if (list_empty(&xprt->sc_ctxts))
194 goto out_empty; 195 goto out_empty;
195 196
196 ctxt = list_first_entry(&xprt->sc_ctxts, 197 ctxt = list_first_entry(&xprt->sc_ctxts,
197 struct svc_rdma_op_ctxt, free); 198 struct svc_rdma_op_ctxt, list);
198 list_del_init(&ctxt->free); 199 list_del(&ctxt->list);
199 spin_unlock_bh(&xprt->sc_ctxt_lock); 200 spin_unlock(&xprt->sc_ctxt_lock);
200 201
201out: 202out:
202 ctxt->count = 0; 203 ctxt->count = 0;
@@ -208,15 +209,15 @@ out_empty:
208 /* Either pre-allocation missed the mark, or send 209 /* Either pre-allocation missed the mark, or send
209 * queue accounting is broken. 210 * queue accounting is broken.
210 */ 211 */
211 spin_unlock_bh(&xprt->sc_ctxt_lock); 212 spin_unlock(&xprt->sc_ctxt_lock);
212 213
213 ctxt = alloc_ctxt(xprt, GFP_NOIO); 214 ctxt = alloc_ctxt(xprt, GFP_NOIO);
214 if (ctxt) 215 if (ctxt)
215 goto out; 216 goto out;
216 217
217 spin_lock_bh(&xprt->sc_ctxt_lock); 218 spin_lock(&xprt->sc_ctxt_lock);
218 xprt->sc_ctxt_used--; 219 xprt->sc_ctxt_used--;
219 spin_unlock_bh(&xprt->sc_ctxt_lock); 220 spin_unlock(&xprt->sc_ctxt_lock);
220 WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n"); 221 WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n");
221 return NULL; 222 return NULL;
222} 223}
@@ -226,25 +227,22 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
226 struct svcxprt_rdma *xprt = ctxt->xprt; 227 struct svcxprt_rdma *xprt = ctxt->xprt;
227 struct ib_device *device = xprt->sc_cm_id->device; 228 struct ib_device *device = xprt->sc_cm_id->device;
228 u32 lkey = xprt->sc_pd->local_dma_lkey; 229 u32 lkey = xprt->sc_pd->local_dma_lkey;
229 unsigned int i, count; 230 unsigned int i;
230 231
231 for (count = 0, i = 0; i < ctxt->mapped_sges; i++) { 232 for (i = 0; i < ctxt->mapped_sges; i++) {
232 /* 233 /*
233 * Unmap the DMA addr in the SGE if the lkey matches 234 * Unmap the DMA addr in the SGE if the lkey matches
234 * the local_dma_lkey, otherwise, ignore it since it is 235 * the local_dma_lkey, otherwise, ignore it since it is
235 * an FRMR lkey and will be unmapped later when the 236 * an FRMR lkey and will be unmapped later when the
236 * last WR that uses it completes. 237 * last WR that uses it completes.
237 */ 238 */
238 if (ctxt->sge[i].lkey == lkey) { 239 if (ctxt->sge[i].lkey == lkey)
239 count++;
240 ib_dma_unmap_page(device, 240 ib_dma_unmap_page(device,
241 ctxt->sge[i].addr, 241 ctxt->sge[i].addr,
242 ctxt->sge[i].length, 242 ctxt->sge[i].length,
243 ctxt->direction); 243 ctxt->direction);
244 }
245 } 244 }
246 ctxt->mapped_sges = 0; 245 ctxt->mapped_sges = 0;
247 atomic_sub(count, &xprt->sc_dma_used);
248} 246}
249 247
250void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) 248void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
@@ -256,10 +254,10 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
256 for (i = 0; i < ctxt->count; i++) 254 for (i = 0; i < ctxt->count; i++)
257 put_page(ctxt->pages[i]); 255 put_page(ctxt->pages[i]);
258 256
259 spin_lock_bh(&xprt->sc_ctxt_lock); 257 spin_lock(&xprt->sc_ctxt_lock);
260 xprt->sc_ctxt_used--; 258 xprt->sc_ctxt_used--;
261 list_add(&ctxt->free, &xprt->sc_ctxts); 259 list_add(&ctxt->list, &xprt->sc_ctxts);
262 spin_unlock_bh(&xprt->sc_ctxt_lock); 260 spin_unlock(&xprt->sc_ctxt_lock);
263} 261}
264 262
265static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) 263static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
@@ -268,8 +266,8 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
268 struct svc_rdma_op_ctxt *ctxt; 266 struct svc_rdma_op_ctxt *ctxt;
269 267
270 ctxt = list_first_entry(&xprt->sc_ctxts, 268 ctxt = list_first_entry(&xprt->sc_ctxts,
271 struct svc_rdma_op_ctxt, free); 269 struct svc_rdma_op_ctxt, list);
272 list_del(&ctxt->free); 270 list_del(&ctxt->list);
273 kfree(ctxt); 271 kfree(ctxt);
274 } 272 }
275} 273}
@@ -398,7 +396,6 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
398 396
399 /* WARNING: Only wc->wr_cqe and wc->status are reliable */ 397 /* WARNING: Only wc->wr_cqe and wc->status are reliable */
400 ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); 398 ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe);
401 ctxt->wc_status = wc->status;
402 svc_rdma_unmap_dma(ctxt); 399 svc_rdma_unmap_dma(ctxt);
403 400
404 if (wc->status != IB_WC_SUCCESS) 401 if (wc->status != IB_WC_SUCCESS)
@@ -407,7 +404,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
407 /* All wc fields are now known to be valid */ 404 /* All wc fields are now known to be valid */
408 ctxt->byte_len = wc->byte_len; 405 ctxt->byte_len = wc->byte_len;
409 spin_lock(&xprt->sc_rq_dto_lock); 406 spin_lock(&xprt->sc_rq_dto_lock);
410 list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q); 407 list_add_tail(&ctxt->list, &xprt->sc_rq_dto_q);
411 spin_unlock(&xprt->sc_rq_dto_lock); 408 spin_unlock(&xprt->sc_rq_dto_lock);
412 409
413 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); 410 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
@@ -436,7 +433,7 @@ static void svc_rdma_send_wc_common(struct svcxprt_rdma *xprt,
436 goto err; 433 goto err;
437 434
438out: 435out:
439 atomic_dec(&xprt->sc_sq_count); 436 atomic_inc(&xprt->sc_sq_avail);
440 wake_up(&xprt->sc_send_wait); 437 wake_up(&xprt->sc_send_wait);
441 return; 438 return;
442 439
@@ -528,7 +525,7 @@ void svc_rdma_wc_read(struct ib_cq *cq, struct ib_wc *wc)
528 525
529 read_hdr = ctxt->read_hdr; 526 read_hdr = ctxt->read_hdr;
530 spin_lock(&xprt->sc_rq_dto_lock); 527 spin_lock(&xprt->sc_rq_dto_lock);
531 list_add_tail(&read_hdr->dto_q, 528 list_add_tail(&read_hdr->list,
532 &xprt->sc_read_complete_q); 529 &xprt->sc_read_complete_q);
533 spin_unlock(&xprt->sc_rq_dto_lock); 530 spin_unlock(&xprt->sc_rq_dto_lock);
534 531
@@ -560,7 +557,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
560 return NULL; 557 return NULL;
561 svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); 558 svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
562 INIT_LIST_HEAD(&cma_xprt->sc_accept_q); 559 INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
563 INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
564 INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); 560 INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
565 INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); 561 INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
566 INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); 562 INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
@@ -574,6 +570,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
574 spin_lock_init(&cma_xprt->sc_ctxt_lock); 570 spin_lock_init(&cma_xprt->sc_ctxt_lock);
575 spin_lock_init(&cma_xprt->sc_map_lock); 571 spin_lock_init(&cma_xprt->sc_map_lock);
576 572
573 /*
574 * Note that this implies that the underlying transport support
575 * has some form of congestion control (see RFC 7530 section 3.1
576 * paragraph 2). For now, we assume that all supported RDMA
577 * transports are suitable here.
578 */
579 set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags);
580
577 if (listener) 581 if (listener)
578 set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); 582 set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
579 583
@@ -926,14 +930,14 @@ struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
926{ 930{
927 struct svc_rdma_fastreg_mr *frmr = NULL; 931 struct svc_rdma_fastreg_mr *frmr = NULL;
928 932
929 spin_lock_bh(&rdma->sc_frmr_q_lock); 933 spin_lock(&rdma->sc_frmr_q_lock);
930 if (!list_empty(&rdma->sc_frmr_q)) { 934 if (!list_empty(&rdma->sc_frmr_q)) {
931 frmr = list_entry(rdma->sc_frmr_q.next, 935 frmr = list_entry(rdma->sc_frmr_q.next,
932 struct svc_rdma_fastreg_mr, frmr_list); 936 struct svc_rdma_fastreg_mr, frmr_list);
933 list_del_init(&frmr->frmr_list); 937 list_del_init(&frmr->frmr_list);
934 frmr->sg_nents = 0; 938 frmr->sg_nents = 0;
935 } 939 }
936 spin_unlock_bh(&rdma->sc_frmr_q_lock); 940 spin_unlock(&rdma->sc_frmr_q_lock);
937 if (frmr) 941 if (frmr)
938 return frmr; 942 return frmr;
939 943
@@ -946,11 +950,10 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
946 if (frmr) { 950 if (frmr) {
947 ib_dma_unmap_sg(rdma->sc_cm_id->device, 951 ib_dma_unmap_sg(rdma->sc_cm_id->device,
948 frmr->sg, frmr->sg_nents, frmr->direction); 952 frmr->sg, frmr->sg_nents, frmr->direction);
949 atomic_dec(&rdma->sc_dma_used); 953 spin_lock(&rdma->sc_frmr_q_lock);
950 spin_lock_bh(&rdma->sc_frmr_q_lock);
951 WARN_ON_ONCE(!list_empty(&frmr->frmr_list)); 954 WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
952 list_add(&frmr->frmr_list, &rdma->sc_frmr_q); 955 list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
953 spin_unlock_bh(&rdma->sc_frmr_q_lock); 956 spin_unlock(&rdma->sc_frmr_q_lock);
954 } 957 }
955} 958}
956 959
@@ -973,6 +976,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
973 struct rpcrdma_connect_private pmsg; 976 struct rpcrdma_connect_private pmsg;
974 struct ib_qp_init_attr qp_attr; 977 struct ib_qp_init_attr qp_attr;
975 struct ib_device *dev; 978 struct ib_device *dev;
979 struct sockaddr *sap;
976 unsigned int i; 980 unsigned int i;
977 int ret = 0; 981 int ret = 0;
978 982
@@ -1005,11 +1009,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
1005 newxprt->sc_max_req_size = svcrdma_max_req_size; 1009 newxprt->sc_max_req_size = svcrdma_max_req_size;
1006 newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr, 1010 newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr,
1007 svcrdma_max_requests); 1011 svcrdma_max_requests);
1012 newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests);
1008 newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr, 1013 newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr,
1009 svcrdma_max_bc_requests); 1014 svcrdma_max_bc_requests);
1010 newxprt->sc_rq_depth = newxprt->sc_max_requests + 1015 newxprt->sc_rq_depth = newxprt->sc_max_requests +
1011 newxprt->sc_max_bc_requests; 1016 newxprt->sc_max_bc_requests;
1012 newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth; 1017 newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth;
1018 atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
1013 1019
1014 if (!svc_rdma_prealloc_ctxts(newxprt)) 1020 if (!svc_rdma_prealloc_ctxts(newxprt))
1015 goto errout; 1021 goto errout;
@@ -1029,13 +1035,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
1029 goto errout; 1035 goto errout;
1030 } 1036 }
1031 newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth, 1037 newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth,
1032 0, IB_POLL_SOFTIRQ); 1038 0, IB_POLL_WORKQUEUE);
1033 if (IS_ERR(newxprt->sc_sq_cq)) { 1039 if (IS_ERR(newxprt->sc_sq_cq)) {
1034 dprintk("svcrdma: error creating SQ CQ for connect request\n"); 1040 dprintk("svcrdma: error creating SQ CQ for connect request\n");
1035 goto errout; 1041 goto errout;
1036 } 1042 }
1037 newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth, 1043 newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth,
1038 0, IB_POLL_SOFTIRQ); 1044 0, IB_POLL_WORKQUEUE);
1039 if (IS_ERR(newxprt->sc_rq_cq)) { 1045 if (IS_ERR(newxprt->sc_rq_cq)) {
1040 dprintk("svcrdma: error creating RQ CQ for connect request\n"); 1046 dprintk("svcrdma: error creating RQ CQ for connect request\n");
1041 goto errout; 1047 goto errout;
@@ -1052,18 +1058,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
1052 qp_attr.qp_type = IB_QPT_RC; 1058 qp_attr.qp_type = IB_QPT_RC;
1053 qp_attr.send_cq = newxprt->sc_sq_cq; 1059 qp_attr.send_cq = newxprt->sc_sq_cq;
1054 qp_attr.recv_cq = newxprt->sc_rq_cq; 1060 qp_attr.recv_cq = newxprt->sc_rq_cq;
1055 dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n" 1061 dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n",
1056 " cm_id->device=%p, sc_pd->device=%p\n" 1062 newxprt->sc_cm_id, newxprt->sc_pd);
1057 " cap.max_send_wr = %d\n" 1063 dprintk(" cap.max_send_wr = %d, cap.max_recv_wr = %d\n",
1058 " cap.max_recv_wr = %d\n" 1064 qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr);
1059 " cap.max_send_sge = %d\n" 1065 dprintk(" cap.max_send_sge = %d, cap.max_recv_sge = %d\n",
1060 " cap.max_recv_sge = %d\n", 1066 qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge);
1061 newxprt->sc_cm_id, newxprt->sc_pd,
1062 dev, newxprt->sc_pd->device,
1063 qp_attr.cap.max_send_wr,
1064 qp_attr.cap.max_recv_wr,
1065 qp_attr.cap.max_send_sge,
1066 qp_attr.cap.max_recv_sge);
1067 1067
1068 ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr); 1068 ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
1069 if (ret) { 1069 if (ret) {
@@ -1146,31 +1146,16 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
1146 goto errout; 1146 goto errout;
1147 } 1147 }
1148 1148
1149 dprintk("svcrdma: new connection %p accepted with the following " 1149 dprintk("svcrdma: new connection %p accepted:\n", newxprt);
1150 "attributes:\n" 1150 sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
1151 " local_ip : %pI4\n" 1151 dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap));
1152 " local_port : %d\n" 1152 sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
1153 " remote_ip : %pI4\n" 1153 dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap));
1154 " remote_port : %d\n" 1154 dprintk(" max_sge : %d\n", newxprt->sc_max_sge);
1155 " max_sge : %d\n" 1155 dprintk(" max_sge_rd : %d\n", newxprt->sc_max_sge_rd);
1156 " max_sge_rd : %d\n" 1156 dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth);
1157 " sq_depth : %d\n" 1157 dprintk(" max_requests : %d\n", newxprt->sc_max_requests);
1158 " max_requests : %d\n" 1158 dprintk(" ord : %d\n", newxprt->sc_ord);
1159 " ord : %d\n",
1160 newxprt,
1161 &((struct sockaddr_in *)&newxprt->sc_cm_id->
1162 route.addr.src_addr)->sin_addr.s_addr,
1163 ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
1164 route.addr.src_addr)->sin_port),
1165 &((struct sockaddr_in *)&newxprt->sc_cm_id->
1166 route.addr.dst_addr)->sin_addr.s_addr,
1167 ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
1168 route.addr.dst_addr)->sin_port),
1169 newxprt->sc_max_sge,
1170 newxprt->sc_max_sge_rd,
1171 newxprt->sc_sq_depth,
1172 newxprt->sc_max_requests,
1173 newxprt->sc_ord);
1174 1159
1175 return &newxprt->sc_xprt; 1160 return &newxprt->sc_xprt;
1176 1161
@@ -1224,9 +1209,9 @@ static void __svc_rdma_free(struct work_struct *work)
1224 ib_drain_qp(rdma->sc_qp); 1209 ib_drain_qp(rdma->sc_qp);
1225 1210
1226 /* We should only be called from kref_put */ 1211 /* We should only be called from kref_put */
1227 if (atomic_read(&xprt->xpt_ref.refcount) != 0) 1212 if (kref_read(&xprt->xpt_ref) != 0)
1228 pr_err("svcrdma: sc_xprt still in use? (%d)\n", 1213 pr_err("svcrdma: sc_xprt still in use? (%d)\n",
1229 atomic_read(&xprt->xpt_ref.refcount)); 1214 kref_read(&xprt->xpt_ref));
1230 1215
1231 /* 1216 /*
1232 * Destroy queued, but not processed read completions. Note 1217 * Destroy queued, but not processed read completions. Note
@@ -1236,20 +1221,18 @@ static void __svc_rdma_free(struct work_struct *work)
1236 */ 1221 */
1237 while (!list_empty(&rdma->sc_read_complete_q)) { 1222 while (!list_empty(&rdma->sc_read_complete_q)) {
1238 struct svc_rdma_op_ctxt *ctxt; 1223 struct svc_rdma_op_ctxt *ctxt;
1239 ctxt = list_entry(rdma->sc_read_complete_q.next, 1224 ctxt = list_first_entry(&rdma->sc_read_complete_q,
1240 struct svc_rdma_op_ctxt, 1225 struct svc_rdma_op_ctxt, list);
1241 dto_q); 1226 list_del(&ctxt->list);
1242 list_del_init(&ctxt->dto_q);
1243 svc_rdma_put_context(ctxt, 1); 1227 svc_rdma_put_context(ctxt, 1);
1244 } 1228 }
1245 1229
1246 /* Destroy queued, but not processed recv completions */ 1230 /* Destroy queued, but not processed recv completions */
1247 while (!list_empty(&rdma->sc_rq_dto_q)) { 1231 while (!list_empty(&rdma->sc_rq_dto_q)) {
1248 struct svc_rdma_op_ctxt *ctxt; 1232 struct svc_rdma_op_ctxt *ctxt;
1249 ctxt = list_entry(rdma->sc_rq_dto_q.next, 1233 ctxt = list_first_entry(&rdma->sc_rq_dto_q,
1250 struct svc_rdma_op_ctxt, 1234 struct svc_rdma_op_ctxt, list);
1251 dto_q); 1235 list_del(&ctxt->list);
1252 list_del_init(&ctxt->dto_q);
1253 svc_rdma_put_context(ctxt, 1); 1236 svc_rdma_put_context(ctxt, 1);
1254 } 1237 }
1255 1238
@@ -1257,9 +1240,6 @@ static void __svc_rdma_free(struct work_struct *work)
1257 if (rdma->sc_ctxt_used != 0) 1240 if (rdma->sc_ctxt_used != 0)
1258 pr_err("svcrdma: ctxt still in use? (%d)\n", 1241 pr_err("svcrdma: ctxt still in use? (%d)\n",
1259 rdma->sc_ctxt_used); 1242 rdma->sc_ctxt_used);
1260 if (atomic_read(&rdma->sc_dma_used) != 0)
1261 pr_err("svcrdma: dma still in use? (%d)\n",
1262 atomic_read(&rdma->sc_dma_used));
1263 1243
1264 /* Final put of backchannel client transport */ 1244 /* Final put of backchannel client transport */
1265 if (xprt->xpt_bc_xprt) { 1245 if (xprt->xpt_bc_xprt) {
@@ -1339,15 +1319,13 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
1339 1319
1340 /* If the SQ is full, wait until an SQ entry is available */ 1320 /* If the SQ is full, wait until an SQ entry is available */
1341 while (1) { 1321 while (1) {
1342 spin_lock_bh(&xprt->sc_lock); 1322 if ((atomic_sub_return(wr_count, &xprt->sc_sq_avail) < 0)) {
1343 if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) {
1344 spin_unlock_bh(&xprt->sc_lock);
1345 atomic_inc(&rdma_stat_sq_starve); 1323 atomic_inc(&rdma_stat_sq_starve);
1346 1324
1347 /* Wait until SQ WR available if SQ still full */ 1325 /* Wait until SQ WR available if SQ still full */
1326 atomic_add(wr_count, &xprt->sc_sq_avail);
1348 wait_event(xprt->sc_send_wait, 1327 wait_event(xprt->sc_send_wait,
1349 atomic_read(&xprt->sc_sq_count) < 1328 atomic_read(&xprt->sc_sq_avail) > wr_count);
1350 xprt->sc_sq_depth);
1351 if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) 1329 if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
1352 return -ENOTCONN; 1330 return -ENOTCONN;
1353 continue; 1331 continue;
@@ -1357,21 +1335,17 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
1357 svc_xprt_get(&xprt->sc_xprt); 1335 svc_xprt_get(&xprt->sc_xprt);
1358 1336
1359 /* Bump used SQ WR count and post */ 1337 /* Bump used SQ WR count and post */
1360 atomic_add(wr_count, &xprt->sc_sq_count);
1361 ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); 1338 ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
1362 if (ret) { 1339 if (ret) {
1363 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); 1340 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
1364 atomic_sub(wr_count, &xprt->sc_sq_count);
1365 for (i = 0; i < wr_count; i ++) 1341 for (i = 0; i < wr_count; i ++)
1366 svc_xprt_put(&xprt->sc_xprt); 1342 svc_xprt_put(&xprt->sc_xprt);
1367 dprintk("svcrdma: failed to post SQ WR rc=%d, " 1343 dprintk("svcrdma: failed to post SQ WR rc=%d\n", ret);
1368 "sc_sq_count=%d, sc_sq_depth=%d\n", 1344 dprintk(" sc_sq_avail=%d, sc_sq_depth=%d\n",
1369 ret, atomic_read(&xprt->sc_sq_count), 1345 atomic_read(&xprt->sc_sq_avail),
1370 xprt->sc_sq_depth); 1346 xprt->sc_sq_depth);
1371 }
1372 spin_unlock_bh(&xprt->sc_lock);
1373 if (ret)
1374 wake_up(&xprt->sc_send_wait); 1347 wake_up(&xprt->sc_send_wait);
1348 }
1375 break; 1349 break;
1376 } 1350 }
1377 return ret; 1351 return ret;
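svc_rdma_send() above drops the sc_lock/sc_sq_count pair in favour of a single sc_sq_avail counter initialised to the SQ depth: posting subtracts the WR count up front, and a negative result means the reservation is handed back and the caller sleeps until send completions return slots. Below is a small C11 model of that reserve, back-off, and release scheme using stdatomic and a condition variable in place of the kernel wait queue; the names and the depth value are assumptions for illustration.

/* sq_avail.c: model of the sc_sq_avail accounting in svc_rdma_send(). */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define SQ_DEPTH 128			/* assumed send queue depth */

static atomic_int sq_avail = SQ_DEPTH;
static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t sq_wait = PTHREAD_COND_INITIALIZER;

/* Called by the sender before posting wr_count Work Requests. */
static void reserve_sq_slots(int wr_count)
{
	while (atomic_fetch_sub(&sq_avail, wr_count) - wr_count < 0) {
		/* Over-committed: give the reservation back and wait. */
		atomic_fetch_add(&sq_avail, wr_count);
		pthread_mutex_lock(&wait_lock);
		while (atomic_load(&sq_avail) <= wr_count)
			pthread_cond_wait(&sq_wait, &wait_lock);
		pthread_mutex_unlock(&wait_lock);
	}
}

/* Called from the send completion handler, once per completed WR. */
static void release_sq_slot(void)
{
	atomic_fetch_add(&sq_avail, 1);
	pthread_mutex_lock(&wait_lock);
	pthread_cond_broadcast(&sq_wait);
	pthread_mutex_unlock(&wait_lock);
}

int main(void)
{
	reserve_sq_slots(3);		/* e.g. header plus two chunk WRs */
	printf("available after post: %d\n", atomic_load(&sq_avail));
	release_sq_slot();
	release_sq_slot();
	release_sq_slot();
	printf("available after completions: %d\n", atomic_load(&sq_avail));
	return 0;
}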
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index ed5e285fd2ea..c717f5410776 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -67,7 +67,7 @@ unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
67static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; 67static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
68static unsigned int xprt_rdma_inline_write_padding; 68static unsigned int xprt_rdma_inline_write_padding;
69static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; 69static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
70 int xprt_rdma_pad_optimize = 1; 70 int xprt_rdma_pad_optimize = 0;
71 71
72#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 72#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
73 73
@@ -219,6 +219,34 @@ xprt_rdma_free_addresses(struct rpc_xprt *xprt)
219 } 219 }
220} 220}
221 221
222void
223rpcrdma_conn_func(struct rpcrdma_ep *ep)
224{
225 schedule_delayed_work(&ep->rep_connect_worker, 0);
226}
227
228void
229rpcrdma_connect_worker(struct work_struct *work)
230{
231 struct rpcrdma_ep *ep =
232 container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
233 struct rpcrdma_xprt *r_xprt =
234 container_of(ep, struct rpcrdma_xprt, rx_ep);
235 struct rpc_xprt *xprt = &r_xprt->rx_xprt;
236
237 spin_lock_bh(&xprt->transport_lock);
238 if (++xprt->connect_cookie == 0) /* maintain a reserved value */
239 ++xprt->connect_cookie;
240 if (ep->rep_connected > 0) {
241 if (!xprt_test_and_set_connected(xprt))
242 xprt_wake_pending_tasks(xprt, 0);
243 } else {
244 if (xprt_test_and_clear_connected(xprt))
245 xprt_wake_pending_tasks(xprt, -ENOTCONN);
246 }
247 spin_unlock_bh(&xprt->transport_lock);
248}
249
222static void 250static void
223xprt_rdma_connect_worker(struct work_struct *work) 251xprt_rdma_connect_worker(struct work_struct *work)
224{ 252{
@@ -621,7 +649,8 @@ xprt_rdma_free(struct rpc_task *task)
621 649
622 dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); 650 dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply);
623 651
624 ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task)); 652 if (unlikely(!list_empty(&req->rl_registered)))
653 ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task));
625 rpcrdma_unmap_sges(ia, req); 654 rpcrdma_unmap_sges(ia, req);
626 rpcrdma_buffer_put(req); 655 rpcrdma_buffer_put(req);
627} 656}
@@ -657,7 +686,8 @@ xprt_rdma_send_request(struct rpc_task *task)
657 int rc = 0; 686 int rc = 0;
658 687
659 /* On retransmit, remove any previously registered chunks */ 688 /* On retransmit, remove any previously registered chunks */
660 r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); 689 if (unlikely(!list_empty(&req->rl_registered)))
690 r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
661 691
662 rc = rpcrdma_marshal_req(rqst); 692 rc = rpcrdma_marshal_req(rqst);
663 if (rc < 0) 693 if (rc < 0)
@@ -679,10 +709,6 @@ xprt_rdma_send_request(struct rpc_task *task)
679 return 0; 709 return 0;
680 710
681failed_marshal: 711failed_marshal:
682 dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n",
683 __func__, rc);
684 if (rc == -EIO)
685 r_xprt->rx_stats.failed_marshal_count++;
686 if (rc != -ENOTCONN) 712 if (rc != -ENOTCONN)
687 return rc; 713 return rc;
688drop_connection: 714drop_connection:
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index ec74289af7ec..3b332b395045 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -54,6 +54,7 @@
54#include <linux/sunrpc/svc_rdma.h> 54#include <linux/sunrpc/svc_rdma.h>
55#include <asm/bitops.h> 55#include <asm/bitops.h>
56#include <linux/module.h> /* try_module_get()/module_put() */ 56#include <linux/module.h> /* try_module_get()/module_put() */
57#include <rdma/ib_cm.h>
57 58
58#include "xprt_rdma.h" 59#include "xprt_rdma.h"
59 60
@@ -103,9 +104,9 @@ rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
103{ 104{
104 struct rpcrdma_ep *ep = context; 105 struct rpcrdma_ep *ep = context;
105 106
106 pr_err("RPC: %s: %s on device %s ep %p\n", 107 pr_err("rpcrdma: %s on device %s ep %p\n",
107 __func__, ib_event_msg(event->event), 108 ib_event_msg(event->event), event->device->name, context);
108 event->device->name, context); 109
109 if (ep->rep_connected == 1) { 110 if (ep->rep_connected == 1) {
110 ep->rep_connected = -EIO; 111 ep->rep_connected = -EIO;
111 rpcrdma_conn_func(ep); 112 rpcrdma_conn_func(ep);
@@ -208,6 +209,7 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
208 209
209 /* Default settings for RPC-over-RDMA Version One */ 210 /* Default settings for RPC-over-RDMA Version One */
210 r_xprt->rx_ia.ri_reminv_expected = false; 211 r_xprt->rx_ia.ri_reminv_expected = false;
212 r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize;
211 rsize = RPCRDMA_V1_DEF_INLINE_SIZE; 213 rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
212 wsize = RPCRDMA_V1_DEF_INLINE_SIZE; 214 wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
213 215
@@ -215,6 +217,7 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
215 pmsg->cp_magic == rpcrdma_cmp_magic && 217 pmsg->cp_magic == rpcrdma_cmp_magic &&
216 pmsg->cp_version == RPCRDMA_CMP_VERSION) { 218 pmsg->cp_version == RPCRDMA_CMP_VERSION) {
217 r_xprt->rx_ia.ri_reminv_expected = true; 219 r_xprt->rx_ia.ri_reminv_expected = true;
220 r_xprt->rx_ia.ri_implicit_roundup = true;
218 rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size); 221 rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
219 wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); 222 wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
220 } 223 }
@@ -223,8 +226,8 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
223 cdata->inline_rsize = rsize; 226 cdata->inline_rsize = rsize;
224 if (wsize < cdata->inline_wsize) 227 if (wsize < cdata->inline_wsize)
225 cdata->inline_wsize = wsize; 228 cdata->inline_wsize = wsize;
226 pr_info("rpcrdma: max send %u, max recv %u\n", 229 dprintk("RPC: %s: max send %u, max recv %u\n",
227 cdata->inline_wsize, cdata->inline_rsize); 230 __func__, cdata->inline_wsize, cdata->inline_rsize);
228 rpcrdma_set_max_header_sizes(r_xprt); 231 rpcrdma_set_max_header_sizes(r_xprt);
229} 232}
230 233
@@ -277,7 +280,14 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
277 connstate = -ENETDOWN; 280 connstate = -ENETDOWN;
278 goto connected; 281 goto connected;
279 case RDMA_CM_EVENT_REJECTED: 282 case RDMA_CM_EVENT_REJECTED:
283#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
284 pr_info("rpcrdma: connection to %pIS:%u on %s rejected: %s\n",
285 sap, rpc_get_port(sap), ia->ri_device->name,
286 rdma_reject_msg(id, event->status));
287#endif
280 connstate = -ECONNREFUSED; 288 connstate = -ECONNREFUSED;
289 if (event->status == IB_CM_REJ_STALE_CONN)
290 connstate = -EAGAIN;
281 goto connected; 291 goto connected;
282 case RDMA_CM_EVENT_DISCONNECTED: 292 case RDMA_CM_EVENT_DISCONNECTED:
283 connstate = -ECONNABORTED; 293 connstate = -ECONNABORTED;
@@ -331,6 +341,7 @@ static struct rdma_cm_id *
331rpcrdma_create_id(struct rpcrdma_xprt *xprt, 341rpcrdma_create_id(struct rpcrdma_xprt *xprt,
332 struct rpcrdma_ia *ia, struct sockaddr *addr) 342 struct rpcrdma_ia *ia, struct sockaddr *addr)
333{ 343{
344 unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1;
334 struct rdma_cm_id *id; 345 struct rdma_cm_id *id;
335 int rc; 346 int rc;
336 347
@@ -352,8 +363,12 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
352 __func__, rc); 363 __func__, rc);
353 goto out; 364 goto out;
354 } 365 }
355 wait_for_completion_interruptible_timeout(&ia->ri_done, 366 rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
356 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); 367 if (rc < 0) {
368 dprintk("RPC: %s: wait() exited: %i\n",
369 __func__, rc);
370 goto out;
371 }
357 372
358 /* FIXME: 373 /* FIXME:
359 * Until xprtrdma supports DEVICE_REMOVAL, the provider must 374 * Until xprtrdma supports DEVICE_REMOVAL, the provider must
@@ -376,8 +391,12 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
376 __func__, rc); 391 __func__, rc);
377 goto put; 392 goto put;
378 } 393 }
379 wait_for_completion_interruptible_timeout(&ia->ri_done, 394 rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
380 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); 395 if (rc < 0) {
396 dprintk("RPC: %s: wait() exited: %i\n",
397 __func__, rc);
398 goto put;
399 }
381 rc = ia->ri_async_rc; 400 rc = ia->ri_async_rc;
382 if (rc) 401 if (rc)
383 goto put; 402 goto put;
@@ -477,18 +496,20 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
477 */ 496 */
478int 497int
479rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, 498rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
480 struct rpcrdma_create_data_internal *cdata) 499 struct rpcrdma_create_data_internal *cdata)
481{ 500{
482 struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private; 501 struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
502 unsigned int max_qp_wr, max_sge;
483 struct ib_cq *sendcq, *recvcq; 503 struct ib_cq *sendcq, *recvcq;
484 unsigned int max_qp_wr;
485 int rc; 504 int rc;
486 505
487 if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_SEND_SGES) { 506 max_sge = min_t(unsigned int, ia->ri_device->attrs.max_sge,
488 dprintk("RPC: %s: insufficient sge's available\n", 507 RPCRDMA_MAX_SEND_SGES);
489 __func__); 508 if (max_sge < RPCRDMA_MIN_SEND_SGES) {
509 pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge);
490 return -ENOMEM; 510 return -ENOMEM;
491 } 511 }
512 ia->ri_max_send_sges = max_sge - RPCRDMA_MIN_SEND_SGES;
492 513
493 if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) { 514 if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
494 dprintk("RPC: %s: insufficient wqe's available\n", 515 dprintk("RPC: %s: insufficient wqe's available\n",
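rpcrdma_ep_create() above stops requiring the full RPCRDMA_MAX_SEND_SGES from the device: it clamps to the adapter's max_sge, fails only below RPCRDMA_MIN_SEND_SGES (transport header, head iovec, tail iovec), and records the remainder in ri_max_send_sges. A small sketch of that provisioning arithmetic follows; the numeric limits and the reading of the remainder as SGEs left over for page data are assumptions for illustration, not the kernel's constants.

/* send_sges.c: clamp provisioned send SGEs to the device limit, as
 * rpcrdma_ep_create() now does.  Values below are assumed. */
#include <stdio.h>

#define MIN_SEND_SGES	3	/* transport header + head iovec + tail iovec */
#define MAX_SEND_SGES	6	/* assumed: 1 + 1 + page SGEs + 1 */

static int provision_send_sges(unsigned int device_max_sge,
			       unsigned int *max_send_sge,
			       unsigned int *spare_sges)
{
	unsigned int max_sge = device_max_sge < MAX_SEND_SGES ?
			       device_max_sge : MAX_SEND_SGES;

	if (max_sge < MIN_SEND_SGES)
		return -1;		/* device cannot carry an inline Send */

	*max_send_sge = max_sge;	/* becomes cap.max_send_sge */
	*spare_sges = max_sge - MIN_SEND_SGES;	/* ri_max_send_sges in the patch */
	return 0;
}

int main(void)
{
	unsigned int cap, spare;

	if (provision_send_sges(4, &cap, &spare) == 0)
		printf("cap.max_send_sge=%u, remaining SGEs=%u\n", cap, spare);
	return 0;
}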
@@ -513,7 +534,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
513 ep->rep_attr.cap.max_recv_wr = cdata->max_requests; 534 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
514 ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; 535 ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
515 ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */ 536 ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */
516 ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_SEND_SGES; 537 ep->rep_attr.cap.max_send_sge = max_sge;
517 ep->rep_attr.cap.max_recv_sge = 1; 538 ep->rep_attr.cap.max_recv_sge = 1;
518 ep->rep_attr.cap.max_inline_data = 0; 539 ep->rep_attr.cap.max_inline_data = 0;
519 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; 540 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -532,7 +553,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
532 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1; 553 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
533 if (ep->rep_cqinit <= 2) 554 if (ep->rep_cqinit <= 2)
534 ep->rep_cqinit = 0; /* always signal? */ 555 ep->rep_cqinit = 0; /* always signal? */
535 INIT_CQCOUNT(ep); 556 rpcrdma_init_cqcount(ep, 0);
536 init_waitqueue_head(&ep->rep_connect_wait); 557 init_waitqueue_head(&ep->rep_connect_wait);
537 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); 558 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
538 559
@@ -631,20 +652,21 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
631int 652int
632rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) 653rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
633{ 654{
655 struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
656 rx_ia);
634 struct rdma_cm_id *id, *old; 657 struct rdma_cm_id *id, *old;
658 struct sockaddr *sap;
659 unsigned int extras;
635 int rc = 0; 660 int rc = 0;
636 int retry_count = 0;
637 661
638 if (ep->rep_connected != 0) { 662 if (ep->rep_connected != 0) {
639 struct rpcrdma_xprt *xprt;
640retry: 663retry:
641 dprintk("RPC: %s: reconnecting...\n", __func__); 664 dprintk("RPC: %s: reconnecting...\n", __func__);
642 665
643 rpcrdma_ep_disconnect(ep, ia); 666 rpcrdma_ep_disconnect(ep, ia);
644 667
645 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); 668 sap = (struct sockaddr *)&r_xprt->rx_data.addr;
646 id = rpcrdma_create_id(xprt, ia, 669 id = rpcrdma_create_id(r_xprt, ia, sap);
647 (struct sockaddr *)&xprt->rx_data.addr);
648 if (IS_ERR(id)) { 670 if (IS_ERR(id)) {
649 rc = -EHOSTUNREACH; 671 rc = -EHOSTUNREACH;
650 goto out; 672 goto out;
@@ -699,51 +721,18 @@ retry:
699 } 721 }
700 722
701 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); 723 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
702
703 /*
704 * Check state. A non-peer reject indicates no listener
705 * (ECONNREFUSED), which may be a transient state. All
706 * others indicate a transport condition which has already
707 * undergone a best-effort.
708 */
709 if (ep->rep_connected == -ECONNREFUSED &&
710 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
711 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
712 goto retry;
713 }
714 if (ep->rep_connected <= 0) { 724 if (ep->rep_connected <= 0) {
715 /* Sometimes, the only way to reliably connect to remote 725 if (ep->rep_connected == -EAGAIN)
716 * CMs is to use same nonzero values for ORD and IRD. */
717 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
718 (ep->rep_remote_cma.responder_resources == 0 ||
719 ep->rep_remote_cma.initiator_depth !=
720 ep->rep_remote_cma.responder_resources)) {
721 if (ep->rep_remote_cma.responder_resources == 0)
722 ep->rep_remote_cma.responder_resources = 1;
723 ep->rep_remote_cma.initiator_depth =
724 ep->rep_remote_cma.responder_resources;
725 goto retry; 726 goto retry;
726 }
727 rc = ep->rep_connected; 727 rc = ep->rep_connected;
728 } else { 728 goto out;
729 struct rpcrdma_xprt *r_xprt;
730 unsigned int extras;
731
732 dprintk("RPC: %s: connected\n", __func__);
733
734 r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
735 extras = r_xprt->rx_buf.rb_bc_srv_max_requests;
736
737 if (extras) {
738 rc = rpcrdma_ep_post_extra_recv(r_xprt, extras);
739 if (rc) {
740 pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n",
741 __func__, rc);
742 rc = 0;
743 }
744 }
745 } 729 }
746 730
731 dprintk("RPC: %s: connected\n", __func__);
732 extras = r_xprt->rx_buf.rb_bc_srv_max_requests;
733 if (extras)
734 rpcrdma_ep_post_extra_recv(r_xprt, extras);
735
747out: 736out:
748 if (rc) 737 if (rc)
749 ep->rep_connected = rc; 738 ep->rep_connected = rc;
@@ -788,9 +777,7 @@ rpcrdma_mr_recovery_worker(struct work_struct *work)
788 777
789 spin_lock(&buf->rb_recovery_lock); 778 spin_lock(&buf->rb_recovery_lock);
790 while (!list_empty(&buf->rb_stale_mrs)) { 779 while (!list_empty(&buf->rb_stale_mrs)) {
791 mw = list_first_entry(&buf->rb_stale_mrs, 780 mw = rpcrdma_pop_mw(&buf->rb_stale_mrs);
792 struct rpcrdma_mw, mw_list);
793 list_del_init(&mw->mw_list);
794 spin_unlock(&buf->rb_recovery_lock); 781 spin_unlock(&buf->rb_recovery_lock);
795 782
796 dprintk("RPC: %s: recovering MR %p\n", __func__, mw); 783 dprintk("RPC: %s: recovering MR %p\n", __func__, mw);
@@ -808,7 +795,7 @@ rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw)
808 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 795 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
809 796
810 spin_lock(&buf->rb_recovery_lock); 797 spin_lock(&buf->rb_recovery_lock);
811 list_add(&mw->mw_list, &buf->rb_stale_mrs); 798 rpcrdma_push_mw(mw, &buf->rb_stale_mrs);
812 spin_unlock(&buf->rb_recovery_lock); 799 spin_unlock(&buf->rb_recovery_lock);
813 800
814 schedule_delayed_work(&buf->rb_recovery_worker, 0); 801 schedule_delayed_work(&buf->rb_recovery_worker, 0);
@@ -1084,11 +1071,8 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
1084 struct rpcrdma_mw *mw = NULL; 1071 struct rpcrdma_mw *mw = NULL;
1085 1072
1086 spin_lock(&buf->rb_mwlock); 1073 spin_lock(&buf->rb_mwlock);
1087 if (!list_empty(&buf->rb_mws)) { 1074 if (!list_empty(&buf->rb_mws))
1088 mw = list_first_entry(&buf->rb_mws, 1075 mw = rpcrdma_pop_mw(&buf->rb_mws);
1089 struct rpcrdma_mw, mw_list);
1090 list_del_init(&mw->mw_list);
1091 }
1092 spin_unlock(&buf->rb_mwlock); 1076 spin_unlock(&buf->rb_mwlock);
1093 1077
1094 if (!mw) 1078 if (!mw)
@@ -1111,7 +1095,7 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
1111 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1095 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1112 1096
1113 spin_lock(&buf->rb_mwlock); 1097 spin_lock(&buf->rb_mwlock);
1114 list_add_tail(&mw->mw_list, &buf->rb_mws); 1098 rpcrdma_push_mw(mw, &buf->rb_mws);
1115 spin_unlock(&buf->rb_mwlock); 1099 spin_unlock(&buf->rb_mwlock);
1116} 1100}
1117 1101
@@ -1311,13 +1295,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
1311 dprintk("RPC: %s: posting %d s/g entries\n", 1295 dprintk("RPC: %s: posting %d s/g entries\n",
1312 __func__, send_wr->num_sge); 1296 __func__, send_wr->num_sge);
1313 1297
1314 if (DECR_CQCOUNT(ep) > 0) 1298 rpcrdma_set_signaled(ep, send_wr);
1315 send_wr->send_flags = 0;
1316 else { /* Provider must take a send completion every now and then */
1317 INIT_CQCOUNT(ep);
1318 send_wr->send_flags = IB_SEND_SIGNALED;
1319 }
1320
1321 rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail); 1299 rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail);
1322 if (rc) 1300 if (rc)
1323 goto out_postsend_err; 1301 goto out_postsend_err;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 6e1bba358203..171a35116de9 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -74,7 +74,10 @@ struct rpcrdma_ia {
74 unsigned int ri_max_frmr_depth; 74 unsigned int ri_max_frmr_depth;
75 unsigned int ri_max_inline_write; 75 unsigned int ri_max_inline_write;
76 unsigned int ri_max_inline_read; 76 unsigned int ri_max_inline_read;
77 unsigned int ri_max_send_sges;
77 bool ri_reminv_expected; 78 bool ri_reminv_expected;
79 bool ri_implicit_roundup;
80 enum ib_mr_type ri_mrtype;
78 struct ib_qp_attr ri_qp_attr; 81 struct ib_qp_attr ri_qp_attr;
79 struct ib_qp_init_attr ri_qp_init_attr; 82 struct ib_qp_init_attr ri_qp_init_attr;
80}; 83};
@@ -95,8 +98,24 @@ struct rpcrdma_ep {
95 struct delayed_work rep_connect_worker; 98 struct delayed_work rep_connect_worker;
96}; 99};
97 100
98#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) 101static inline void
99#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) 102rpcrdma_init_cqcount(struct rpcrdma_ep *ep, int count)
103{
104 atomic_set(&ep->rep_cqcount, ep->rep_cqinit - count);
105}
106
107/* To update send queue accounting, provider must take a
108 * send completion every now and then.
109 */
110static inline void
111rpcrdma_set_signaled(struct rpcrdma_ep *ep, struct ib_send_wr *send_wr)
112{
113 send_wr->send_flags = 0;
114 if (unlikely(atomic_sub_return(1, &ep->rep_cqcount) <= 0)) {
115 rpcrdma_init_cqcount(ep, 0);
116 send_wr->send_flags = IB_SEND_SIGNALED;
117 }
118}
100 119
101/* Pre-allocate extra Work Requests for handling backward receives 120/* Pre-allocate extra Work Requests for handling backward receives
102 * and sends. This is a fixed value because the Work Queues are 121 * and sends. This is a fixed value because the Work Queues are
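The INIT_CQCOUNT/DECR_CQCOUNT macros are replaced above by rpcrdma_init_cqcount() and rpcrdma_set_signaled(): a countdown starts at rep_cqinit, each posted Send decrements it, and only when it reaches zero is the WR marked IB_SEND_SIGNALED and the countdown re-armed, so the provider still generates a completion often enough to keep send-queue accounting moving. A user-space model of that countdown is sketched below; the interval of 16 is an assumed stand-in for rep_cqinit.

/* cqcount.c: signal only every Nth send, as rpcrdma_set_signaled() does. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define CQ_INIT 16			/* assumed stand-in for rep_cqinit */

static atomic_int cqcount = CQ_INIT;

/* Returns true when this post must request a signaled completion. */
static bool post_needs_signal(void)
{
	if (atomic_fetch_sub(&cqcount, 1) - 1 <= 0) {
		atomic_store(&cqcount, CQ_INIT);	/* re-arm the countdown */
		return true;
	}
	return false;
}

int main(void)
{
	for (int i = 1; i <= 40; i++)
		if (post_needs_signal())
			printf("post %d: IB_SEND_SIGNALED\n", i);
	return 0;
}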
@@ -286,15 +305,19 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
286 char *mr_offset; /* kva if no page, else offset */ 305 char *mr_offset; /* kva if no page, else offset */
287}; 306};
288 307
289/* Reserve enough Send SGEs to send a maximum size inline request: 308/* The Send SGE array is provisioned to send a maximum size
309 * inline request:
290 * - RPC-over-RDMA header 310 * - RPC-over-RDMA header
291 * - xdr_buf head iovec 311 * - xdr_buf head iovec
292 * - RPCRDMA_MAX_INLINE bytes, possibly unaligned, in pages 312 * - RPCRDMA_MAX_INLINE bytes, in pages
293 * - xdr_buf tail iovec 313 * - xdr_buf tail iovec
314 *
315 * The actual number of array elements consumed by each RPC
316 * depends on the device's max_sge limit.
294 */ 317 */
295enum { 318enum {
296 RPCRDMA_MAX_SEND_PAGES = PAGE_SIZE + RPCRDMA_MAX_INLINE - 1, 319 RPCRDMA_MIN_SEND_SGES = 3,
297 RPCRDMA_MAX_PAGE_SGES = (RPCRDMA_MAX_SEND_PAGES >> PAGE_SHIFT) + 1, 320 RPCRDMA_MAX_PAGE_SGES = RPCRDMA_MAX_INLINE >> PAGE_SHIFT,
298 RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1, 321 RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1,
299}; 322};
300 323
@@ -331,6 +354,22 @@ rpcr_to_rdmar(struct rpc_rqst *rqst)
331 return rqst->rq_xprtdata; 354 return rqst->rq_xprtdata;
332} 355}
333 356
357static inline void
358rpcrdma_push_mw(struct rpcrdma_mw *mw, struct list_head *list)
359{
360 list_add_tail(&mw->mw_list, list);
361}
362
363static inline struct rpcrdma_mw *
364rpcrdma_pop_mw(struct list_head *list)
365{
366 struct rpcrdma_mw *mw;
367
368 mw = list_first_entry(list, struct rpcrdma_mw, mw_list);
369 list_del(&mw->mw_list);
370 return mw;
371}
372
334/* 373/*
335 * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for 374 * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for
336 * inline requests/replies, and client/server credits. 375 * inline requests/replies, and client/server credits.
@@ -473,6 +512,7 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
473 struct rpcrdma_create_data_internal *); 512 struct rpcrdma_create_data_internal *);
474void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); 513void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
475int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); 514int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
515void rpcrdma_conn_func(struct rpcrdma_ep *ep);
476void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); 516void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
477 517
478int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, 518int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
@@ -532,13 +572,6 @@ rpcrdma_data_dir(bool writing)
532} 572}
533 573
534/* 574/*
535 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
536 */
537void rpcrdma_connect_worker(struct work_struct *);
538void rpcrdma_conn_func(struct rpcrdma_ep *);
539void rpcrdma_reply_handler(struct work_struct *);
540
541/*
542 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c 575 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
543 */ 576 */
544 577
@@ -555,12 +588,14 @@ bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *,
555void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *); 588void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *);
556int rpcrdma_marshal_req(struct rpc_rqst *); 589int rpcrdma_marshal_req(struct rpc_rqst *);
557void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); 590void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
591void rpcrdma_reply_handler(struct work_struct *work);
558 592
559/* RPC/RDMA module init - xprtrdma/transport.c 593/* RPC/RDMA module init - xprtrdma/transport.c
560 */ 594 */
561extern unsigned int xprt_rdma_max_inline_read; 595extern unsigned int xprt_rdma_max_inline_read;
562void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); 596void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
563void xprt_rdma_free_addresses(struct rpc_xprt *xprt); 597void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
598void rpcrdma_connect_worker(struct work_struct *work);
564void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq); 599void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq);
565int xprt_rdma_init(void); 600int xprt_rdma_init(void);
566void xprt_rdma_cleanup(void); 601void xprt_rdma_cleanup(void);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index e01c825bc683..16aff8ddc16f 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -52,6 +52,8 @@
52#include "sunrpc.h" 52#include "sunrpc.h"
53 53
54static void xs_close(struct rpc_xprt *xprt); 54static void xs_close(struct rpc_xprt *xprt);
55static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
56 struct socket *sock);
55 57
56/* 58/*
57 * xprtsock tunables 59 * xprtsock tunables
@@ -666,6 +668,9 @@ static int xs_tcp_send_request(struct rpc_task *task)
666 if (task->tk_flags & RPC_TASK_SENT) 668 if (task->tk_flags & RPC_TASK_SENT)
667 zerocopy = false; 669 zerocopy = false;
668 670
671 if (test_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state))
672 xs_tcp_set_socket_timeouts(xprt, transport->sock);
673
669 /* Continue transmitting the packet/record. We must be careful 674 /* Continue transmitting the packet/record. We must be careful
670 * to cope with writespace callbacks arriving _after_ we have 675 * to cope with writespace callbacks arriving _after_ we have
671 * called sendmsg(). */ 676 * called sendmsg(). */
@@ -1080,10 +1085,10 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
1080 if (sk == NULL) 1085 if (sk == NULL)
1081 goto out; 1086 goto out;
1082 for (;;) { 1087 for (;;) {
1083 skb = skb_recv_datagram(sk, 0, 1, &err); 1088 skb = skb_recv_udp(sk, 0, 1, &err);
1084 if (skb != NULL) { 1089 if (skb != NULL) {
1085 xs_udp_data_read_skb(&transport->xprt, sk, skb); 1090 xs_udp_data_read_skb(&transport->xprt, sk, skb);
1086 skb_free_datagram_locked(sk, skb); 1091 consume_skb(skb);
1087 continue; 1092 continue;
1088 } 1093 }
1089 if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) 1094 if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
@@ -1188,7 +1193,7 @@ static inline void xs_tcp_read_xid(struct sock_xprt *transport, struct xdr_skb_r
1188 char *p; 1193 char *p;
1189 1194
1190 len = sizeof(transport->tcp_xid) - transport->tcp_offset; 1195 len = sizeof(transport->tcp_xid) - transport->tcp_offset;
1191 dprintk("RPC: reading XID (%Zu bytes)\n", len); 1196 dprintk("RPC: reading XID (%zu bytes)\n", len);
1192 p = ((char *) &transport->tcp_xid) + transport->tcp_offset; 1197 p = ((char *) &transport->tcp_xid) + transport->tcp_offset;
1193 used = xdr_skb_read_bits(desc, p, len); 1198 used = xdr_skb_read_bits(desc, p, len);
1194 transport->tcp_offset += used; 1199 transport->tcp_offset += used;
@@ -1219,7 +1224,7 @@ static inline void xs_tcp_read_calldir(struct sock_xprt *transport,
1219 */ 1224 */
1220 offset = transport->tcp_offset - sizeof(transport->tcp_xid); 1225 offset = transport->tcp_offset - sizeof(transport->tcp_xid);
1221 len = sizeof(transport->tcp_calldir) - offset; 1226 len = sizeof(transport->tcp_calldir) - offset;
1222 dprintk("RPC: reading CALL/REPLY flag (%Zu bytes)\n", len); 1227 dprintk("RPC: reading CALL/REPLY flag (%zu bytes)\n", len);
1223 p = ((char *) &transport->tcp_calldir) + offset; 1228 p = ((char *) &transport->tcp_calldir) + offset;
1224 used = xdr_skb_read_bits(desc, p, len); 1229 used = xdr_skb_read_bits(desc, p, len);
1225 transport->tcp_offset += used; 1230 transport->tcp_offset += used;
@@ -1310,7 +1315,7 @@ static inline void xs_tcp_read_common(struct rpc_xprt *xprt,
1310 return; 1315 return;
1311 } 1316 }
1312 1317
1313 dprintk("RPC: XID %08x read %Zd bytes\n", 1318 dprintk("RPC: XID %08x read %zd bytes\n",
1314 ntohl(transport->tcp_xid), r); 1319 ntohl(transport->tcp_xid), r);
1315 dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, " 1320 dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, "
1316 "tcp_reclen = %u\n", xprt, transport->tcp_copied, 1321 "tcp_reclen = %u\n", xprt, transport->tcp_copied,
@@ -1456,7 +1461,7 @@ static inline void xs_tcp_read_discard(struct sock_xprt *transport, struct xdr_s
1456 desc->count -= len; 1461 desc->count -= len;
1457 desc->offset += len; 1462 desc->offset += len;
1458 transport->tcp_offset += len; 1463 transport->tcp_offset += len;
1459 dprintk("RPC: discarded %Zu bytes\n", len); 1464 dprintk("RPC: discarded %zu bytes\n", len);
1460 xs_tcp_check_fraghdr(transport); 1465 xs_tcp_check_fraghdr(transport);
1461} 1466}
1462 1467
@@ -1734,7 +1739,9 @@ static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t
1734 */ 1739 */
1735static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task) 1740static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task)
1736{ 1741{
1742 spin_lock_bh(&xprt->transport_lock);
1737 xprt_adjust_cwnd(xprt, task, -ETIMEDOUT); 1743 xprt_adjust_cwnd(xprt, task, -ETIMEDOUT);
1744 spin_unlock_bh(&xprt->transport_lock);
1738} 1745}
1739 1746
1740static unsigned short xs_get_random_port(void) 1747static unsigned short xs_get_random_port(void)
@@ -2235,6 +2242,66 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt)
2235 xs_reset_transport(transport); 2242 xs_reset_transport(transport);
2236} 2243}
2237 2244
2245static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
2246 struct socket *sock)
2247{
2248 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
2249 unsigned int keepidle;
2250 unsigned int keepcnt;
2251 unsigned int opt_on = 1;
2252 unsigned int timeo;
2253
2254 spin_lock_bh(&xprt->transport_lock);
2255 keepidle = DIV_ROUND_UP(xprt->timeout->to_initval, HZ);
2256 keepcnt = xprt->timeout->to_retries + 1;
2257 timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
2258 (xprt->timeout->to_retries + 1);
2259 clear_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state);
2260 spin_unlock_bh(&xprt->transport_lock);
2261
2262 /* TCP Keepalive options */
2263 kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
2264 (char *)&opt_on, sizeof(opt_on));
2265 kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
2266 (char *)&keepidle, sizeof(keepidle));
2267 kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
2268 (char *)&keepidle, sizeof(keepidle));
2269 kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
2270 (char *)&keepcnt, sizeof(keepcnt));
2271
2272 /* TCP user timeout (see RFC5482) */
2273 kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
2274 (char *)&timeo, sizeof(timeo));
2275}
2276
2277static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt,
2278 unsigned long connect_timeout,
2279 unsigned long reconnect_timeout)
2280{
2281 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
2282 struct rpc_timeout to;
2283 unsigned long initval;
2284
2285 spin_lock_bh(&xprt->transport_lock);
2286 if (reconnect_timeout < xprt->max_reconnect_timeout)
2287 xprt->max_reconnect_timeout = reconnect_timeout;
2288 if (connect_timeout < xprt->connect_timeout) {
2289 memcpy(&to, xprt->timeout, sizeof(to));
2290 initval = DIV_ROUND_UP(connect_timeout, to.to_retries + 1);
2291 /* Arbitrary lower limit */
2292 if (initval < XS_TCP_INIT_REEST_TO << 1)
2293 initval = XS_TCP_INIT_REEST_TO << 1;
2294 to.to_initval = initval;
2295 to.to_maxval = initval;
2296 memcpy(&transport->tcp_timeout, &to,
2297 sizeof(transport->tcp_timeout));
2298 xprt->timeout = &transport->tcp_timeout;
2299 xprt->connect_timeout = connect_timeout;
2300 }
2301 set_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state);
2302 spin_unlock_bh(&xprt->transport_lock);
2303}
2304
2238static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) 2305static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
2239{ 2306{
2240 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); 2307 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
@@ -2242,22 +2309,8 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
2242 2309
2243 if (!transport->inet) { 2310 if (!transport->inet) {
2244 struct sock *sk = sock->sk; 2311 struct sock *sk = sock->sk;
2245 unsigned int keepidle = xprt->timeout->to_initval / HZ;
2246 unsigned int keepcnt = xprt->timeout->to_retries + 1;
2247 unsigned int opt_on = 1;
2248 unsigned int timeo;
2249 unsigned int addr_pref = IPV6_PREFER_SRC_PUBLIC; 2312 unsigned int addr_pref = IPV6_PREFER_SRC_PUBLIC;
2250 2313
2251 /* TCP Keepalive options */
2252 kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
2253 (char *)&opt_on, sizeof(opt_on));
2254 kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
2255 (char *)&keepidle, sizeof(keepidle));
2256 kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
2257 (char *)&keepidle, sizeof(keepidle));
2258 kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
2259 (char *)&keepcnt, sizeof(keepcnt));
2260
2261 /* Avoid temporary address, they are bad for long-lived 2314 /* Avoid temporary address, they are bad for long-lived
2262 * connections such as NFS mounts. 2315 * connections such as NFS mounts.
2263 * RFC4941, section 3.6 suggests that: 2316 * RFC4941, section 3.6 suggests that:
@@ -2268,11 +2321,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
2268 kernel_setsockopt(sock, SOL_IPV6, IPV6_ADDR_PREFERENCES, 2321 kernel_setsockopt(sock, SOL_IPV6, IPV6_ADDR_PREFERENCES,
2269 (char *)&addr_pref, sizeof(addr_pref)); 2322 (char *)&addr_pref, sizeof(addr_pref));
2270 2323
2271 /* TCP user timeout (see RFC5482) */ 2324 xs_tcp_set_socket_timeouts(xprt, sock);
2272 timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
2273 (xprt->timeout->to_retries + 1);
2274 kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
2275 (char *)&timeo, sizeof(timeo));
2276 2325
2277 write_lock_bh(&sk->sk_callback_lock); 2326 write_lock_bh(&sk->sk_callback_lock);
2278 2327
@@ -2721,6 +2770,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
2721 .set_retrans_timeout = xprt_set_retrans_timeout_def, 2770 .set_retrans_timeout = xprt_set_retrans_timeout_def,
2722 .close = xs_tcp_shutdown, 2771 .close = xs_tcp_shutdown,
2723 .destroy = xs_destroy, 2772 .destroy = xs_destroy,
2773 .set_connect_timeout = xs_tcp_set_connect_timeout,
2724 .print_stats = xs_tcp_print_stats, 2774 .print_stats = xs_tcp_print_stats,
2725 .enable_swap = xs_enable_swap, 2775 .enable_swap = xs_enable_swap,
2726 .disable_swap = xs_disable_swap, 2776 .disable_swap = xs_disable_swap,
@@ -3007,6 +3057,8 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
3007 xprt->timeout = &xs_tcp_default_timeout; 3057 xprt->timeout = &xs_tcp_default_timeout;
3008 3058
3009 xprt->max_reconnect_timeout = xprt->timeout->to_maxval; 3059 xprt->max_reconnect_timeout = xprt->timeout->to_maxval;
3060 xprt->connect_timeout = xprt->timeout->to_initval *
3061 (xprt->timeout->to_retries + 1);
3010 3062
3011 INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn); 3063 INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn);
3012 INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket); 3064 INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket);
@@ -3209,7 +3261,9 @@ static int param_set_uint_minmax(const char *val,
3209 if (!val) 3261 if (!val)
3210 return -EINVAL; 3262 return -EINVAL;
3211 ret = kstrtouint(val, 0, &num); 3263 ret = kstrtouint(val, 0, &num);
3212 if (ret == -EINVAL || num < min || num > max) 3264 if (ret)
3265 return ret;
3266 if (num < min || num > max)
3213 return -EINVAL; 3267 return -EINVAL;
3214 *((unsigned int *)kp->arg) = num; 3268 *((unsigned int *)kp->arg) = num;
3215 return 0; 3269 return 0;
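
The new xs_tcp_set_socket_timeouts() above turns the RPC layer's to_initval/to_retries pair into TCP keepalive and RFC 5482 user-timeout settings on the socket. For illustration only, the same arithmetic maps onto ordinary socket options; a minimal user-space sketch, assuming a connected TCP socket fd on a Linux system that defines TCP_USER_TIMEOUT, with initval_sec and retries as hypothetical stand-ins for to_initval/HZ and to_retries:

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

/* initval_sec and retries stand in for to_initval/HZ and to_retries. */
static int set_rpc_like_timeouts(int fd, unsigned int initval_sec,
                                 unsigned int retries)
{
        unsigned int on = 1;
        unsigned int keepidle = initval_sec;      /* idle time before probing */
        unsigned int keepcnt  = retries + 1;      /* probes before giving up */
        unsigned int timeo_ms = initval_sec * 1000 * (retries + 1);

        if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) ||
            setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &keepidle, sizeof(keepidle)) ||
            setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &keepidle, sizeof(keepidle)) ||
            setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &keepcnt, sizeof(keepcnt)) ||
            setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &timeo_ms, sizeof(timeo_ms)))
                return -1;
        return 0;
}

The effect is that the connection is declared dead after roughly the interval at which the RPC layer would have given up on the request, whether the failure is detected by keepalive probes on an idle connection or by unacknowledged in-flight data.
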
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 3b95fe980fa2..017801f9dbaa 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -624,13 +624,10 @@ EXPORT_SYMBOL_GPL(unregister_switchdev_notifier);
624int call_switchdev_notifiers(unsigned long val, struct net_device *dev, 624int call_switchdev_notifiers(unsigned long val, struct net_device *dev,
625 struct switchdev_notifier_info *info) 625 struct switchdev_notifier_info *info)
626{ 626{
627 int err;
628
629 ASSERT_RTNL(); 627 ASSERT_RTNL();
630 628
631 info->dev = dev; 629 info->dev = dev;
632 err = raw_notifier_call_chain(&switchdev_notif_chain, val, info); 630 return raw_notifier_call_chain(&switchdev_notif_chain, val, info);
633 return err;
634} 631}
635EXPORT_SYMBOL_GPL(call_switchdev_notifiers); 632EXPORT_SYMBOL_GPL(call_switchdev_notifiers);
636 633
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index aa1babbea385..7d99029df342 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * net/tipc/bcast.c: TIPC broadcast code 2 * net/tipc/bcast.c: TIPC broadcast code
3 * 3 *
4 * Copyright (c) 2004-2006, 2014-2015, Ericsson AB 4 * Copyright (c) 2004-2006, 2014-2016, Ericsson AB
5 * Copyright (c) 2004, Intel Corporation. 5 * Copyright (c) 2004, Intel Corporation.
6 * Copyright (c) 2005, 2010-2011, Wind River Systems 6 * Copyright (c) 2005, 2010-2011, Wind River Systems
7 * All rights reserved. 7 * All rights reserved.
@@ -39,9 +39,8 @@
39#include "socket.h" 39#include "socket.h"
40#include "msg.h" 40#include "msg.h"
41#include "bcast.h" 41#include "bcast.h"
42#include "name_distr.h"
43#include "link.h" 42#include "link.h"
44#include "node.h" 43#include "name_table.h"
45 44
46#define BCLINK_WIN_DEFAULT 50 /* bcast link window size (default) */ 45#define BCLINK_WIN_DEFAULT 50 /* bcast link window size (default) */
47#define BCLINK_WIN_MIN 32 /* bcast minimum link window size */ 46#define BCLINK_WIN_MIN 32 /* bcast minimum link window size */
@@ -54,12 +53,20 @@ const char tipc_bclink_name[] = "broadcast-link";
54 * @inputq: data input queue; will only carry SOCK_WAKEUP messages 53 * @inputq: data input queue; will only carry SOCK_WAKEUP messages
55 * @dest: array keeping number of reachable destinations per bearer 54 * @dest: array keeping number of reachable destinations per bearer
56 * @primary_bearer: a bearer having links to all broadcast destinations, if any 55 * @primary_bearer: a bearer having links to all broadcast destinations, if any
56 * @bcast_support: indicates if primary bearer, if any, supports broadcast
57 * @rcast_support: indicates if all peer nodes support replicast
58 * @rc_ratio: dest count as percentage of cluster size where send method changes
 59 * @bc_threshold: calculated from rc_ratio; if dests > threshold use broadcast
57 */ 60 */
58struct tipc_bc_base { 61struct tipc_bc_base {
59 struct tipc_link *link; 62 struct tipc_link *link;
60 struct sk_buff_head inputq; 63 struct sk_buff_head inputq;
61 int dests[MAX_BEARERS]; 64 int dests[MAX_BEARERS];
62 int primary_bearer; 65 int primary_bearer;
66 bool bcast_support;
67 bool rcast_support;
68 int rc_ratio;
69 int bc_threshold;
63}; 70};
64 71
65static struct tipc_bc_base *tipc_bc_base(struct net *net) 72static struct tipc_bc_base *tipc_bc_base(struct net *net)
@@ -69,7 +76,20 @@ static struct tipc_bc_base *tipc_bc_base(struct net *net)
69 76
70int tipc_bcast_get_mtu(struct net *net) 77int tipc_bcast_get_mtu(struct net *net)
71{ 78{
72 return tipc_link_mtu(tipc_bc_sndlink(net)); 79 return tipc_link_mtu(tipc_bc_sndlink(net)) - INT_H_SIZE;
80}
81
82void tipc_bcast_disable_rcast(struct net *net)
83{
84 tipc_bc_base(net)->rcast_support = false;
85}
86
87static void tipc_bcbase_calc_bc_threshold(struct net *net)
88{
89 struct tipc_bc_base *bb = tipc_bc_base(net);
90 int cluster_size = tipc_link_bc_peers(tipc_bc_sndlink(net));
91
92 bb->bc_threshold = 1 + (cluster_size * bb->rc_ratio / 100);
73} 93}
74 94
75/* tipc_bcbase_select_primary(): find a bearer with links to all destinations, 95/* tipc_bcbase_select_primary(): find a bearer with links to all destinations,
@@ -79,9 +99,10 @@ static void tipc_bcbase_select_primary(struct net *net)
79{ 99{
80 struct tipc_bc_base *bb = tipc_bc_base(net); 100 struct tipc_bc_base *bb = tipc_bc_base(net);
81 int all_dests = tipc_link_bc_peers(bb->link); 101 int all_dests = tipc_link_bc_peers(bb->link);
82 int i, mtu; 102 int i, mtu, prim;
83 103
84 bb->primary_bearer = INVALID_BEARER_ID; 104 bb->primary_bearer = INVALID_BEARER_ID;
105 bb->bcast_support = true;
85 106
86 if (!all_dests) 107 if (!all_dests)
87 return; 108 return;
@@ -93,7 +114,7 @@ static void tipc_bcbase_select_primary(struct net *net)
93 mtu = tipc_bearer_mtu(net, i); 114 mtu = tipc_bearer_mtu(net, i);
94 if (mtu < tipc_link_mtu(bb->link)) 115 if (mtu < tipc_link_mtu(bb->link))
95 tipc_link_set_mtu(bb->link, mtu); 116 tipc_link_set_mtu(bb->link, mtu);
96 117 bb->bcast_support &= tipc_bearer_bcast_support(net, i);
97 if (bb->dests[i] < all_dests) 118 if (bb->dests[i] < all_dests)
98 continue; 119 continue;
99 120
@@ -103,6 +124,9 @@ static void tipc_bcbase_select_primary(struct net *net)
103 if ((i ^ tipc_own_addr(net)) & 1) 124 if ((i ^ tipc_own_addr(net)) & 1)
104 break; 125 break;
105 } 126 }
127 prim = bb->primary_bearer;
128 if (prim != INVALID_BEARER_ID)
129 bb->bcast_support = tipc_bearer_bcast_support(net, prim);
106} 130}
107 131
108void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id) 132void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id)
@@ -170,45 +194,131 @@ static void tipc_bcbase_xmit(struct net *net, struct sk_buff_head *xmitq)
170 __skb_queue_purge(&_xmitq); 194 __skb_queue_purge(&_xmitq);
171} 195}
172 196
173/* tipc_bcast_xmit - deliver buffer chain to all nodes in cluster 197static void tipc_bcast_select_xmit_method(struct net *net, int dests,
174 * and to identified node local sockets 198 struct tipc_mc_method *method)
199{
200 struct tipc_bc_base *bb = tipc_bc_base(net);
201 unsigned long exp = method->expires;
202
203 /* Broadcast supported by used bearer/bearers? */
204 if (!bb->bcast_support) {
205 method->rcast = true;
206 return;
207 }
208 /* Any destinations which don't support replicast ? */
209 if (!bb->rcast_support) {
210 method->rcast = false;
211 return;
212 }
213 /* Can current method be changed ? */
214 method->expires = jiffies + TIPC_METHOD_EXPIRE;
215 if (method->mandatory || time_before(jiffies, exp))
216 return;
217
218 /* Determine method to use now */
219 method->rcast = dests <= bb->bc_threshold;
220}
221
222/* tipc_bcast_xmit - broadcast the buffer chain to all external nodes
175 * @net: the applicable net namespace 223 * @net: the applicable net namespace
176 * @list: chain of buffers containing message 224 * @pkts: chain of buffers containing message
177 * Consumes the buffer chain, except when returning -ELINKCONG 225 * @cong_link_cnt: set to 1 if broadcast link is congested, otherwise 0
178 * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE 226 * Consumes the buffer chain.
227 * Returns 0 if success, otherwise errno: -EHOSTUNREACH,-EMSGSIZE
179 */ 228 */
180int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list) 229static int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts,
230 u16 *cong_link_cnt)
181{ 231{
182 struct tipc_link *l = tipc_bc_sndlink(net); 232 struct tipc_link *l = tipc_bc_sndlink(net);
183 struct sk_buff_head xmitq, inputq, rcvq; 233 struct sk_buff_head xmitq;
184 int rc = 0; 234 int rc = 0;
185 235
186 __skb_queue_head_init(&rcvq);
187 __skb_queue_head_init(&xmitq); 236 __skb_queue_head_init(&xmitq);
188 skb_queue_head_init(&inputq);
189
190 /* Prepare message clone for local node */
191 if (unlikely(!tipc_msg_reassemble(list, &rcvq)))
192 return -EHOSTUNREACH;
193
194 tipc_bcast_lock(net); 237 tipc_bcast_lock(net);
195 if (tipc_link_bc_peers(l)) 238 if (tipc_link_bc_peers(l))
196 rc = tipc_link_xmit(l, list, &xmitq); 239 rc = tipc_link_xmit(l, pkts, &xmitq);
197 tipc_bcast_unlock(net); 240 tipc_bcast_unlock(net);
198 241 tipc_bcbase_xmit(net, &xmitq);
199 /* Don't send to local node if adding to link failed */ 242 __skb_queue_purge(pkts);
200 if (unlikely(rc)) { 243 if (rc == -ELINKCONG) {
201 __skb_queue_purge(&rcvq); 244 *cong_link_cnt = 1;
202 return rc; 245 rc = 0;
203 } 246 }
247 return rc;
248}
204 249
205 /* Broadcast to all nodes, inluding local node */ 250/* tipc_rcast_xmit - replicate and send a message to given destination nodes
206 tipc_bcbase_xmit(net, &xmitq); 251 * @net: the applicable net namespace
207 tipc_sk_mcast_rcv(net, &rcvq, &inputq); 252 * @pkts: chain of buffers containing message
208 __skb_queue_purge(list); 253 * @dests: list of destination nodes
254 * @cong_link_cnt: returns number of congested links
255 * @cong_links: returns identities of congested links
256 * Returns 0 if success, otherwise errno
257 */
258static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts,
259 struct tipc_nlist *dests, u16 *cong_link_cnt)
260{
261 struct sk_buff_head _pkts;
262 struct u32_item *n, *tmp;
263 u32 dst, selector;
264
265 selector = msg_link_selector(buf_msg(skb_peek(pkts)));
266 __skb_queue_head_init(&_pkts);
267
268 list_for_each_entry_safe(n, tmp, &dests->list, list) {
269 dst = n->value;
270 if (!tipc_msg_pskb_copy(dst, pkts, &_pkts))
271 return -ENOMEM;
272
273 /* Any other return value than -ELINKCONG is ignored */
274 if (tipc_node_xmit(net, &_pkts, dst, selector) == -ELINKCONG)
275 (*cong_link_cnt)++;
276 }
209 return 0; 277 return 0;
210} 278}
211 279
280/* tipc_mcast_xmit - deliver message to indicated destination nodes
281 * and to identified node local sockets
282 * @net: the applicable net namespace
283 * @pkts: chain of buffers containing message
284 * @method: send method to be used
285 * @dests: destination nodes for message.
286 * @cong_link_cnt: returns number of encountered congested destination links
287 * Consumes buffer chain.
288 * Returns 0 if success, otherwise errno
289 */
290int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
291 struct tipc_mc_method *method, struct tipc_nlist *dests,
292 u16 *cong_link_cnt)
293{
294 struct sk_buff_head inputq, localq;
295 int rc = 0;
296
297 skb_queue_head_init(&inputq);
298 skb_queue_head_init(&localq);
299
300 /* Clone packets before they are consumed by next call */
301 if (dests->local && !tipc_msg_reassemble(pkts, &localq)) {
302 rc = -ENOMEM;
303 goto exit;
304 }
305 /* Send according to determined transmit method */
306 if (dests->remote) {
307 tipc_bcast_select_xmit_method(net, dests->remote, method);
308 if (method->rcast)
309 rc = tipc_rcast_xmit(net, pkts, dests, cong_link_cnt);
310 else
311 rc = tipc_bcast_xmit(net, pkts, cong_link_cnt);
312 }
313
314 if (dests->local)
315 tipc_sk_mcast_rcv(net, &localq, &inputq);
316exit:
317 /* This queue should normally be empty by now */
318 __skb_queue_purge(pkts);
319 return rc;
320}
321
212/* tipc_bcast_rcv - receive a broadcast packet, and deliver to rcv link 322/* tipc_bcast_rcv - receive a broadcast packet, and deliver to rcv link
213 * 323 *
214 * RCU is locked, no other locks set 324 * RCU is locked, no other locks set
@@ -313,6 +423,7 @@ void tipc_bcast_add_peer(struct net *net, struct tipc_link *uc_l,
313 tipc_bcast_lock(net); 423 tipc_bcast_lock(net);
314 tipc_link_add_bc_peer(snd_l, uc_l, xmitq); 424 tipc_link_add_bc_peer(snd_l, uc_l, xmitq);
315 tipc_bcbase_select_primary(net); 425 tipc_bcbase_select_primary(net);
426 tipc_bcbase_calc_bc_threshold(net);
316 tipc_bcast_unlock(net); 427 tipc_bcast_unlock(net);
317} 428}
318 429
@@ -331,6 +442,7 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_l)
331 tipc_bcast_lock(net); 442 tipc_bcast_lock(net);
332 tipc_link_remove_bc_peer(snd_l, rcv_l, &xmitq); 443 tipc_link_remove_bc_peer(snd_l, rcv_l, &xmitq);
333 tipc_bcbase_select_primary(net); 444 tipc_bcbase_select_primary(net);
445 tipc_bcbase_calc_bc_threshold(net);
334 tipc_bcast_unlock(net); 446 tipc_bcast_unlock(net);
335 447
336 tipc_bcbase_xmit(net, &xmitq); 448 tipc_bcbase_xmit(net, &xmitq);
@@ -413,6 +525,8 @@ int tipc_bcast_init(struct net *net)
413 goto enomem; 525 goto enomem;
414 bb->link = l; 526 bb->link = l;
415 tn->bcl = l; 527 tn->bcl = l;
528 bb->rc_ratio = 25;
529 bb->rcast_support = true;
416 return 0; 530 return 0;
417enomem: 531enomem:
418 kfree(bb); 532 kfree(bb);
@@ -428,3 +542,33 @@ void tipc_bcast_stop(struct net *net)
428 kfree(tn->bcbase); 542 kfree(tn->bcbase);
429 kfree(tn->bcl); 543 kfree(tn->bcl);
430} 544}
545
546void tipc_nlist_init(struct tipc_nlist *nl, u32 self)
547{
548 memset(nl, 0, sizeof(*nl));
549 INIT_LIST_HEAD(&nl->list);
550 nl->self = self;
551}
552
553void tipc_nlist_add(struct tipc_nlist *nl, u32 node)
554{
555 if (node == nl->self)
556 nl->local = true;
557 else if (u32_push(&nl->list, node))
558 nl->remote++;
559}
560
561void tipc_nlist_del(struct tipc_nlist *nl, u32 node)
562{
563 if (node == nl->self)
564 nl->local = false;
565 else if (u32_del(&nl->list, node))
566 nl->remote--;
567}
568
569void tipc_nlist_purge(struct tipc_nlist *nl)
570{
571 u32_list_purge(&nl->list);
572 nl->remote = 0;
573 nl->local = 0;
574}
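
The tipc_bcbase_calc_bc_threshold()/tipc_bcast_select_xmit_method() pair above decides per message whether to use true broadcast or replicast (per-destination unicast copies): broadcast is forced if some peer lacks replicast support, replicast is forced if a used bearer cannot broadcast, and otherwise the destination count is compared against a threshold derived from the cluster size and rc_ratio. A stripped-down sketch of that heuristic (hypothetical names, not the kernel API):

#include <stdbool.h>

struct mc_state {
        bool bcast_support;     /* every used bearer can broadcast */
        bool rcast_support;     /* every peer accepts replicast */
        int  rc_ratio;          /* percentage, 25 by default */
};

/* With rc_ratio = 25 and a 40-node cluster, the threshold is
 * 1 + 40 * 25 / 100 = 11: up to 11 destinations are served by
 * replicated unicast, more than that by broadcast.
 */
static bool use_replicast(const struct mc_state *s, int cluster_size, int dests)
{
        int bc_threshold = 1 + cluster_size * s->rc_ratio / 100;

        if (!s->bcast_support)
                return true;            /* bearer cannot broadcast */
        if (!s->rcast_support)
                return false;           /* some peer cannot replicast */
        return dests <= bc_threshold;
}

The TIPC_METHOD_EXPIRE / method->mandatory checks in the real function additionally keep a non-mandatory method stable for a while, so the transmit path does not flip between broadcast and replicast on every message.
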
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index 855d53c64ab3..751530ab0c49 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -42,9 +42,35 @@
42struct tipc_node; 42struct tipc_node;
43struct tipc_msg; 43struct tipc_msg;
44struct tipc_nl_msg; 44struct tipc_nl_msg;
45struct tipc_node_map; 45struct tipc_nlist;
46struct tipc_nitem;
46extern const char tipc_bclink_name[]; 47extern const char tipc_bclink_name[];
47 48
49#define TIPC_METHOD_EXPIRE msecs_to_jiffies(5000)
50
51struct tipc_nlist {
52 struct list_head list;
53 u32 self;
54 u16 remote;
55 bool local;
56};
57
58void tipc_nlist_init(struct tipc_nlist *nl, u32 self);
59void tipc_nlist_purge(struct tipc_nlist *nl);
60void tipc_nlist_add(struct tipc_nlist *nl, u32 node);
61void tipc_nlist_del(struct tipc_nlist *nl, u32 node);
62
63/* Cookie to be used between socket and broadcast layer
64 * @rcast: replicast (instead of broadcast) was used at previous xmit
65 * @mandatory: broadcast/replicast indication was set by user
66 * @expires: re-evaluate non-mandatory transmit method if we are past this
67 */
68struct tipc_mc_method {
69 bool rcast;
70 bool mandatory;
71 unsigned long expires;
72};
73
48int tipc_bcast_init(struct net *net); 74int tipc_bcast_init(struct net *net);
49void tipc_bcast_stop(struct net *net); 75void tipc_bcast_stop(struct net *net);
50void tipc_bcast_add_peer(struct net *net, struct tipc_link *l, 76void tipc_bcast_add_peer(struct net *net, struct tipc_link *l,
@@ -53,7 +79,10 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_bcl);
53void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id); 79void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id);
54void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id); 80void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id);
55int tipc_bcast_get_mtu(struct net *net); 81int tipc_bcast_get_mtu(struct net *net);
56int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list); 82void tipc_bcast_disable_rcast(struct net *net);
83int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
84 struct tipc_mc_method *method, struct tipc_nlist *dests,
85 u16 *cong_link_cnt);
57int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb); 86int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb);
58void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, 87void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l,
59 struct tipc_msg *hdr); 88 struct tipc_msg *hdr);
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 52d74760fb68..33a5bdfbef76 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -431,7 +431,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
431 memset(&b->bcast_addr, 0, sizeof(b->bcast_addr)); 431 memset(&b->bcast_addr, 0, sizeof(b->bcast_addr));
432 memcpy(b->bcast_addr.value, dev->broadcast, b->media->hwaddr_len); 432 memcpy(b->bcast_addr.value, dev->broadcast, b->media->hwaddr_len);
433 b->bcast_addr.media_id = b->media->type_id; 433 b->bcast_addr.media_id = b->media->type_id;
434 b->bcast_addr.broadcast = 1; 434 b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT;
435 b->mtu = dev->mtu; 435 b->mtu = dev->mtu;
436 b->media->raw2addr(b, &b->addr, (char *)dev->dev_addr); 436 b->media->raw2addr(b, &b->addr, (char *)dev->dev_addr);
437 rcu_assign_pointer(dev->tipc_ptr, b); 437 rcu_assign_pointer(dev->tipc_ptr, b);
@@ -482,6 +482,19 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb,
482 return 0; 482 return 0;
483} 483}
484 484
485bool tipc_bearer_bcast_support(struct net *net, u32 bearer_id)
486{
487 bool supp = false;
488 struct tipc_bearer *b;
489
490 rcu_read_lock();
491 b = bearer_get(net, bearer_id);
492 if (b)
493 supp = (b->bcast_addr.broadcast == TIPC_BROADCAST_SUPPORT);
494 rcu_read_unlock();
495 return supp;
496}
497
485int tipc_bearer_mtu(struct net *net, u32 bearer_id) 498int tipc_bearer_mtu(struct net *net, u32 bearer_id)
486{ 499{
487 int mtu = 0; 500 int mtu = 0;
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 278ff7f616f9..635c9086e19a 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -60,9 +60,14 @@
60#define TIPC_MEDIA_TYPE_IB 2 60#define TIPC_MEDIA_TYPE_IB 2
61#define TIPC_MEDIA_TYPE_UDP 3 61#define TIPC_MEDIA_TYPE_UDP 3
62 62
63/* minimum bearer MTU */ 63/* Minimum bearer MTU */
64#define TIPC_MIN_BEARER_MTU (MAX_H_SIZE + INT_H_SIZE) 64#define TIPC_MIN_BEARER_MTU (MAX_H_SIZE + INT_H_SIZE)
65 65
66/* Identifiers for distinguishing between broadcast/multicast and replicast
67 */
68#define TIPC_BROADCAST_SUPPORT 1
69#define TIPC_REPLICAST_SUPPORT 2
70
66/** 71/**
67 * struct tipc_media_addr - destination address used by TIPC bearers 72 * struct tipc_media_addr - destination address used by TIPC bearers
68 * @value: address info (format defined by media) 73 * @value: address info (format defined by media)
@@ -210,6 +215,7 @@ int tipc_bearer_setup(void);
210void tipc_bearer_cleanup(void); 215void tipc_bearer_cleanup(void);
211void tipc_bearer_stop(struct net *net); 216void tipc_bearer_stop(struct net *net);
212int tipc_bearer_mtu(struct net *net, u32 bearer_id); 217int tipc_bearer_mtu(struct net *net, u32 bearer_id);
218bool tipc_bearer_bcast_support(struct net *net, u32 bearer_id);
213void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, 219void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id,
214 struct sk_buff *skb, 220 struct sk_buff *skb,
215 struct tipc_media_addr *dest); 221 struct tipc_media_addr *dest);
diff --git a/net/tipc/core.c b/net/tipc/core.c
index 236b043a4156..0b982d048fb9 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -47,7 +47,7 @@
47#include <linux/module.h> 47#include <linux/module.h>
48 48
49/* configurable TIPC parameters */ 49/* configurable TIPC parameters */
50int tipc_net_id __read_mostly; 50unsigned int tipc_net_id __read_mostly;
51int sysctl_tipc_rmem[3] __read_mostly; /* min/default/max */ 51int sysctl_tipc_rmem[3] __read_mostly; /* min/default/max */
52 52
53static int __net_init tipc_init_net(struct net *net) 53static int __net_init tipc_init_net(struct net *net)
diff --git a/net/tipc/core.h b/net/tipc/core.h
index a1845fb27d80..5cc5398be722 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -74,7 +74,7 @@ struct tipc_monitor;
74#define MAX_BEARERS 3 74#define MAX_BEARERS 3
75#define TIPC_DEF_MON_THRESHOLD 32 75#define TIPC_DEF_MON_THRESHOLD 32
76 76
77extern int tipc_net_id __read_mostly; 77extern unsigned int tipc_net_id __read_mostly;
78extern int sysctl_tipc_rmem[3] __read_mostly; 78extern int sysctl_tipc_rmem[3] __read_mostly;
79extern int sysctl_tipc_named_timeout __read_mostly; 79extern int sysctl_tipc_named_timeout __read_mostly;
80 80
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index 6b109a808d4c..02462d67d191 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -169,7 +169,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
169 169
170 /* Send response, if necessary */ 170 /* Send response, if necessary */
171 if (respond && (mtyp == DSC_REQ_MSG)) { 171 if (respond && (mtyp == DSC_REQ_MSG)) {
172 rskb = tipc_buf_acquire(MAX_H_SIZE); 172 rskb = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC);
173 if (!rskb) 173 if (!rskb)
174 return; 174 return;
175 tipc_disc_init_msg(net, rskb, DSC_RESP_MSG, bearer); 175 tipc_disc_init_msg(net, rskb, DSC_RESP_MSG, bearer);
@@ -278,7 +278,7 @@ int tipc_disc_create(struct net *net, struct tipc_bearer *b,
278 req = kmalloc(sizeof(*req), GFP_ATOMIC); 278 req = kmalloc(sizeof(*req), GFP_ATOMIC);
279 if (!req) 279 if (!req)
280 return -ENOMEM; 280 return -ENOMEM;
281 req->buf = tipc_buf_acquire(MAX_H_SIZE); 281 req->buf = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC);
282 if (!req->buf) { 282 if (!req->buf) {
283 kfree(req); 283 kfree(req);
284 return -ENOMEM; 284 return -ENOMEM;
diff --git a/net/tipc/link.c b/net/tipc/link.c
index bda89bf9f4ff..ddd2dd6f77aa 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -515,6 +515,10 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer,
515 if (link_is_bc_sndlink(l)) 515 if (link_is_bc_sndlink(l))
516 l->state = LINK_ESTABLISHED; 516 l->state = LINK_ESTABLISHED;
517 517
518 /* Disable replicast if even a single peer doesn't support it */
519 if (link_is_bc_rcvlink(l) && !(peer_caps & TIPC_BCAST_RCAST))
520 tipc_bcast_disable_rcast(net);
521
518 return true; 522 return true;
519} 523}
520 524
@@ -776,60 +780,47 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
776 780
777/** 781/**
778 * link_schedule_user - schedule a message sender for wakeup after congestion 782 * link_schedule_user - schedule a message sender for wakeup after congestion
779 * @link: congested link 783 * @l: congested link
780 * @list: message that was attempted sent 784 * @hdr: header of message that is being sent
781 * Create pseudo msg to send back to user when congestion abates 785 * Create pseudo msg to send back to user when congestion abates
782 * Does not consume buffer list
783 */ 786 */
784static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list) 787static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr)
785{ 788{
786 struct tipc_msg *msg = buf_msg(skb_peek(list)); 789 u32 dnode = tipc_own_addr(l->net);
787 int imp = msg_importance(msg); 790 u32 dport = msg_origport(hdr);
788 u32 oport = msg_origport(msg);
789 u32 addr = tipc_own_addr(link->net);
790 struct sk_buff *skb; 791 struct sk_buff *skb;
791 792
792 /* This really cannot happen... */
793 if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) {
794 pr_warn("%s<%s>, send queue full", link_rst_msg, link->name);
795 return -ENOBUFS;
796 }
797 /* Non-blocking sender: */
798 if (TIPC_SKB_CB(skb_peek(list))->wakeup_pending)
799 return -ELINKCONG;
800
801 /* Create and schedule wakeup pseudo message */ 793 /* Create and schedule wakeup pseudo message */
802 skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0, 794 skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0,
803 addr, addr, oport, 0, 0); 795 dnode, l->addr, dport, 0, 0);
804 if (!skb) 796 if (!skb)
805 return -ENOBUFS; 797 return -ENOBUFS;
806 TIPC_SKB_CB(skb)->chain_sz = skb_queue_len(list); 798 msg_set_dest_droppable(buf_msg(skb), true);
807 TIPC_SKB_CB(skb)->chain_imp = imp; 799 TIPC_SKB_CB(skb)->chain_imp = msg_importance(hdr);
808 skb_queue_tail(&link->wakeupq, skb); 800 skb_queue_tail(&l->wakeupq, skb);
809 link->stats.link_congs++; 801 l->stats.link_congs++;
810 return -ELINKCONG; 802 return -ELINKCONG;
811} 803}
812 804
813/** 805/**
814 * link_prepare_wakeup - prepare users for wakeup after congestion 806 * link_prepare_wakeup - prepare users for wakeup after congestion
815 * @link: congested link 807 * @l: congested link
816 * Move a number of waiting users, as permitted by available space in 808 * Wake up a number of waiting users, as permitted by available space
817 * the send queue, from link wait queue to node wait queue for wakeup 809 * in the send queue
818 */ 810 */
819void link_prepare_wakeup(struct tipc_link *l) 811void link_prepare_wakeup(struct tipc_link *l)
820{ 812{
821 int pnd[TIPC_SYSTEM_IMPORTANCE + 1] = {0,};
822 int imp, lim;
823 struct sk_buff *skb, *tmp; 813 struct sk_buff *skb, *tmp;
814 int imp, i = 0;
824 815
825 skb_queue_walk_safe(&l->wakeupq, skb, tmp) { 816 skb_queue_walk_safe(&l->wakeupq, skb, tmp) {
826 imp = TIPC_SKB_CB(skb)->chain_imp; 817 imp = TIPC_SKB_CB(skb)->chain_imp;
827 lim = l->backlog[imp].limit; 818 if (l->backlog[imp].len < l->backlog[imp].limit) {
828 pnd[imp] += TIPC_SKB_CB(skb)->chain_sz; 819 skb_unlink(skb, &l->wakeupq);
829 if ((pnd[imp] + l->backlog[imp].len) >= lim) 820 skb_queue_tail(l->inputq, skb);
821 } else if (i++ > 10) {
830 break; 822 break;
831 skb_unlink(skb, &l->wakeupq); 823 }
832 skb_queue_tail(l->inputq, skb);
833 } 824 }
834} 825}
835 826
@@ -869,8 +860,7 @@ void tipc_link_reset(struct tipc_link *l)
869 * @list: chain of buffers containing message 860 * @list: chain of buffers containing message
870 * @xmitq: returned list of packets to be sent by caller 861 * @xmitq: returned list of packets to be sent by caller
871 * 862 *
872 * Consumes the buffer chain, except when returning -ELINKCONG, 863 * Consumes the buffer chain.
873 * since the caller then may want to make more send attempts.
874 * Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS 864 * Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS
875 * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted 865 * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted
876 */ 866 */
@@ -879,7 +869,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
879{ 869{
880 struct tipc_msg *hdr = buf_msg(skb_peek(list)); 870 struct tipc_msg *hdr = buf_msg(skb_peek(list));
881 unsigned int maxwin = l->window; 871 unsigned int maxwin = l->window;
882 unsigned int i, imp = msg_importance(hdr); 872 int imp = msg_importance(hdr);
883 unsigned int mtu = l->mtu; 873 unsigned int mtu = l->mtu;
884 u16 ack = l->rcv_nxt - 1; 874 u16 ack = l->rcv_nxt - 1;
885 u16 seqno = l->snd_nxt; 875 u16 seqno = l->snd_nxt;
@@ -888,19 +878,22 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
888 struct sk_buff_head *backlogq = &l->backlogq; 878 struct sk_buff_head *backlogq = &l->backlogq;
889 struct sk_buff *skb, *_skb, *bskb; 879 struct sk_buff *skb, *_skb, *bskb;
890 int pkt_cnt = skb_queue_len(list); 880 int pkt_cnt = skb_queue_len(list);
881 int rc = 0;
891 882
892 /* Match msg importance against this and all higher backlog limits: */
893 if (!skb_queue_empty(backlogq)) {
894 for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) {
895 if (unlikely(l->backlog[i].len >= l->backlog[i].limit))
896 return link_schedule_user(l, list);
897 }
898 }
899 if (unlikely(msg_size(hdr) > mtu)) { 883 if (unlikely(msg_size(hdr) > mtu)) {
900 skb_queue_purge(list); 884 skb_queue_purge(list);
901 return -EMSGSIZE; 885 return -EMSGSIZE;
902 } 886 }
903 887
888 /* Allow oversubscription of one data msg per source at congestion */
889 if (unlikely(l->backlog[imp].len >= l->backlog[imp].limit)) {
890 if (imp == TIPC_SYSTEM_IMPORTANCE) {
891 pr_warn("%s<%s>, link overflow", link_rst_msg, l->name);
892 return -ENOBUFS;
893 }
894 rc = link_schedule_user(l, hdr);
895 }
896
904 if (pkt_cnt > 1) { 897 if (pkt_cnt > 1) {
905 l->stats.sent_fragmented++; 898 l->stats.sent_fragmented++;
906 l->stats.sent_fragments += pkt_cnt; 899 l->stats.sent_fragments += pkt_cnt;
@@ -946,7 +939,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
946 skb_queue_splice_tail_init(list, backlogq); 939 skb_queue_splice_tail_init(list, backlogq);
947 } 940 }
948 l->snd_nxt = seqno; 941 l->snd_nxt = seqno;
949 return 0; 942 return rc;
950} 943}
951 944
952void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq) 945void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq)
@@ -1043,11 +1036,17 @@ int tipc_link_retrans(struct tipc_link *l, u16 from, u16 to,
1043static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, 1036static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb,
1044 struct sk_buff_head *inputq) 1037 struct sk_buff_head *inputq)
1045{ 1038{
1046 switch (msg_user(buf_msg(skb))) { 1039 struct tipc_msg *hdr = buf_msg(skb);
1040
1041 switch (msg_user(hdr)) {
1047 case TIPC_LOW_IMPORTANCE: 1042 case TIPC_LOW_IMPORTANCE:
1048 case TIPC_MEDIUM_IMPORTANCE: 1043 case TIPC_MEDIUM_IMPORTANCE:
1049 case TIPC_HIGH_IMPORTANCE: 1044 case TIPC_HIGH_IMPORTANCE:
1050 case TIPC_CRITICAL_IMPORTANCE: 1045 case TIPC_CRITICAL_IMPORTANCE:
1046 if (unlikely(msg_type(hdr) == TIPC_MCAST_MSG)) {
1047 skb_queue_tail(l->bc_rcvlink->inputq, skb);
1048 return true;
1049 }
1051 case CONN_MANAGER: 1050 case CONN_MANAGER:
1052 skb_queue_tail(inputq, skb); 1051 skb_queue_tail(inputq, skb);
1053 return true; 1052 return true;
@@ -1395,7 +1394,7 @@ tnl:
1395 msg_set_seqno(hdr, seqno++); 1394 msg_set_seqno(hdr, seqno++);
1396 pktlen = msg_size(hdr); 1395 pktlen = msg_size(hdr);
1397 msg_set_size(&tnlhdr, pktlen + INT_H_SIZE); 1396 msg_set_size(&tnlhdr, pktlen + INT_H_SIZE);
1398 tnlskb = tipc_buf_acquire(pktlen + INT_H_SIZE); 1397 tnlskb = tipc_buf_acquire(pktlen + INT_H_SIZE, GFP_ATOMIC);
1399 if (!tnlskb) { 1398 if (!tnlskb) {
1400 pr_warn("%sunable to send packet\n", link_co_err); 1399 pr_warn("%sunable to send packet\n", link_co_err);
1401 return; 1400 return;
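
The link-layer congestion handling above changes from rejecting a message whose importance level is over its backlog limit to accepting exactly one extra message per sender, queueing a SOCK_WAKEUP pseudo message tagged with the sender's importance, and returning -ELINKCONG so the socket layer can park the sender; link_prepare_wakeup() later releases parked senders as soon as their level has backlog room again. A toy sketch of the oversubscription rule (illustrative types and return codes, not the kernel's):

#include <stdbool.h>

struct backlog_level {
        int len;
        int limit;
};

/* Return 0 on plain accept, 1 if the sender must wait for a wakeup
 * (stands in for -ELINKCONG), -1 on hard overflow (stands in for -ENOBUFS).
 */
static int backlog_add(struct backlog_level *b, bool system_importance)
{
        if (b->len < b->limit) {
                b->len++;               /* room available: accept */
                return 0;
        }
        if (system_importance)
                return -1;              /* overflow: message rejected */
        b->len++;                       /* oversubscribe by one message */
        return 1;                       /* accepted, but park the sender */
}
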
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 17201aa8423d..312ef7de57d7 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -58,12 +58,12 @@ static unsigned int align(unsigned int i)
58 * NOTE: Headroom is reserved to allow prepending of a data link header. 58 * NOTE: Headroom is reserved to allow prepending of a data link header.
59 * There may also be unrequested tailroom present at the buffer's end. 59 * There may also be unrequested tailroom present at the buffer's end.
60 */ 60 */
61struct sk_buff *tipc_buf_acquire(u32 size) 61struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp)
62{ 62{
63 struct sk_buff *skb; 63 struct sk_buff *skb;
64 unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u; 64 unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u;
65 65
66 skb = alloc_skb_fclone(buf_size, GFP_ATOMIC); 66 skb = alloc_skb_fclone(buf_size, gfp);
67 if (skb) { 67 if (skb) {
68 skb_reserve(skb, BUF_HEADROOM); 68 skb_reserve(skb, BUF_HEADROOM);
69 skb_put(skb, size); 69 skb_put(skb, size);
@@ -95,7 +95,7 @@ struct sk_buff *tipc_msg_create(uint user, uint type,
95 struct tipc_msg *msg; 95 struct tipc_msg *msg;
96 struct sk_buff *buf; 96 struct sk_buff *buf;
97 97
98 buf = tipc_buf_acquire(hdr_sz + data_sz); 98 buf = tipc_buf_acquire(hdr_sz + data_sz, GFP_ATOMIC);
99 if (unlikely(!buf)) 99 if (unlikely(!buf))
100 return NULL; 100 return NULL;
101 101
@@ -261,14 +261,14 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
261 261
262 /* No fragmentation needed? */ 262 /* No fragmentation needed? */
263 if (likely(msz <= pktmax)) { 263 if (likely(msz <= pktmax)) {
264 skb = tipc_buf_acquire(msz); 264 skb = tipc_buf_acquire(msz, GFP_KERNEL);
265 if (unlikely(!skb)) 265 if (unlikely(!skb))
266 return -ENOMEM; 266 return -ENOMEM;
267 skb_orphan(skb); 267 skb_orphan(skb);
268 __skb_queue_tail(list, skb); 268 __skb_queue_tail(list, skb);
269 skb_copy_to_linear_data(skb, mhdr, mhsz); 269 skb_copy_to_linear_data(skb, mhdr, mhsz);
270 pktpos = skb->data + mhsz; 270 pktpos = skb->data + mhsz;
271 if (copy_from_iter(pktpos, dsz, &m->msg_iter) == dsz) 271 if (copy_from_iter_full(pktpos, dsz, &m->msg_iter))
272 return dsz; 272 return dsz;
273 rc = -EFAULT; 273 rc = -EFAULT;
274 goto error; 274 goto error;
@@ -282,7 +282,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
282 msg_set_importance(&pkthdr, msg_importance(mhdr)); 282 msg_set_importance(&pkthdr, msg_importance(mhdr));
283 283
284 /* Prepare first fragment */ 284 /* Prepare first fragment */
285 skb = tipc_buf_acquire(pktmax); 285 skb = tipc_buf_acquire(pktmax, GFP_KERNEL);
286 if (!skb) 286 if (!skb)
287 return -ENOMEM; 287 return -ENOMEM;
288 skb_orphan(skb); 288 skb_orphan(skb);
@@ -299,7 +299,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
299 if (drem < pktrem) 299 if (drem < pktrem)
300 pktrem = drem; 300 pktrem = drem;
301 301
302 if (copy_from_iter(pktpos, pktrem, &m->msg_iter) != pktrem) { 302 if (!copy_from_iter_full(pktpos, pktrem, &m->msg_iter)) {
303 rc = -EFAULT; 303 rc = -EFAULT;
304 goto error; 304 goto error;
305 } 305 }
@@ -313,7 +313,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
313 pktsz = drem + INT_H_SIZE; 313 pktsz = drem + INT_H_SIZE;
314 else 314 else
315 pktsz = pktmax; 315 pktsz = pktmax;
316 skb = tipc_buf_acquire(pktsz); 316 skb = tipc_buf_acquire(pktsz, GFP_KERNEL);
317 if (!skb) { 317 if (!skb) {
318 rc = -ENOMEM; 318 rc = -ENOMEM;
319 goto error; 319 goto error;
@@ -448,7 +448,7 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg,
448 if (msz > (max / 2)) 448 if (msz > (max / 2))
449 return false; 449 return false;
450 450
451 _skb = tipc_buf_acquire(max); 451 _skb = tipc_buf_acquire(max, GFP_ATOMIC);
452 if (!_skb) 452 if (!_skb)
453 return false; 453 return false;
454 454
@@ -496,7 +496,7 @@ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err)
496 496
497 /* Never return SHORT header; expand by replacing buffer if necessary */ 497 /* Never return SHORT header; expand by replacing buffer if necessary */
498 if (msg_short(hdr)) { 498 if (msg_short(hdr)) {
499 *skb = tipc_buf_acquire(BASIC_H_SIZE + dlen); 499 *skb = tipc_buf_acquire(BASIC_H_SIZE + dlen, GFP_ATOMIC);
500 if (!*skb) 500 if (!*skb)
501 goto exit; 501 goto exit;
502 memcpy((*skb)->data + BASIC_H_SIZE, msg_data(hdr), dlen); 502 memcpy((*skb)->data + BASIC_H_SIZE, msg_data(hdr), dlen);
@@ -607,6 +607,23 @@ error:
607 return false; 607 return false;
608} 608}
609 609
610bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg,
611 struct sk_buff_head *cpy)
612{
613 struct sk_buff *skb, *_skb;
614
615 skb_queue_walk(msg, skb) {
616 _skb = pskb_copy(skb, GFP_ATOMIC);
617 if (!_skb) {
618 __skb_queue_purge(cpy);
619 return false;
620 }
621 msg_set_destnode(buf_msg(_skb), dst);
622 __skb_queue_tail(cpy, _skb);
623 }
624 return true;
625}
626
610/* tipc_skb_queue_sorted(); sort pkt into list according to sequence number 627/* tipc_skb_queue_sorted(); sort pkt into list according to sequence number
611 * @list: list to be appended to 628 * @list: list to be appended to
612 * @seqno: sequence number of buffer to add 629 * @seqno: sequence number of buffer to add
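
tipc_buf_acquire() now takes a gfp_t, so callers on the ordinary send path such as tipc_msg_build() can allocate with GFP_KERNEL, while softirq/atomic callers keep GFP_ATOMIC. A minimal sketch of that choice, using a hypothetical may_sleep flag; the real callers simply pass the constant matching their context, as in the hunks above:

#include <linux/skbuff.h>
#include <linux/gfp.h>
#include <linux/types.h>

static struct sk_buff *acquire_buf(unsigned int size, bool may_sleep)
{
        gfp_t gfp = may_sleep ? GFP_KERNEL : GFP_ATOMIC;

        /* Same allocation as tipc_buf_acquire(), minus headroom handling */
        return alloc_skb_fclone(size, gfp);
}
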
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 50a739860d37..c843fd2bc48d 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -95,11 +95,9 @@ struct plist;
95#define TIPC_MEDIA_INFO_OFFSET 5 95#define TIPC_MEDIA_INFO_OFFSET 5
96 96
97struct tipc_skb_cb { 97struct tipc_skb_cb {
98 void *handle; 98 u32 bytes_read;
99 struct sk_buff *tail; 99 struct sk_buff *tail;
100 bool validated; 100 bool validated;
101 bool wakeup_pending;
102 u16 chain_sz;
103 u16 chain_imp; 101 u16 chain_imp;
104 u16 ackers; 102 u16 ackers;
105}; 103};
@@ -633,14 +631,11 @@ static inline void msg_set_bc_netid(struct tipc_msg *m, u32 id)
633 631
634static inline u32 msg_link_selector(struct tipc_msg *m) 632static inline u32 msg_link_selector(struct tipc_msg *m)
635{ 633{
634 if (msg_user(m) == MSG_FRAGMENTER)
635 m = (void *)msg_data(m);
636 return msg_bits(m, 4, 0, 1); 636 return msg_bits(m, 4, 0, 1);
637} 637}
638 638
639static inline void msg_set_link_selector(struct tipc_msg *m, u32 n)
640{
641 msg_set_bits(m, 4, 0, 1, n);
642}
643
644/* 639/*
645 * Word 5 640 * Word 5
646 */ 641 */
@@ -820,7 +815,7 @@ static inline bool msg_is_reset(struct tipc_msg *hdr)
820 return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG); 815 return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG);
821} 816}
822 817
823struct sk_buff *tipc_buf_acquire(u32 size); 818struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp);
824bool tipc_msg_validate(struct sk_buff *skb); 819bool tipc_msg_validate(struct sk_buff *skb);
825bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err); 820bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err);
826void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type, 821void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type,
@@ -837,6 +832,8 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
837 int offset, int dsz, int mtu, struct sk_buff_head *list); 832 int offset, int dsz, int mtu, struct sk_buff_head *list);
838bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err); 833bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err);
839bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq); 834bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq);
835bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg,
836 struct sk_buff_head *cpy);
840void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, 837void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno,
841 struct sk_buff *skb); 838 struct sk_buff *skb);
842 839
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index c1cfd92de17a..23f8899e0f8c 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -69,7 +69,7 @@ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size,
69 u32 dest) 69 u32 dest)
70{ 70{
71 struct tipc_net *tn = net_generic(net, tipc_net_id); 71 struct tipc_net *tn = net_generic(net, tipc_net_id);
72 struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE + size); 72 struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE + size, GFP_ATOMIC);
73 struct tipc_msg *msg; 73 struct tipc_msg *msg;
74 74
75 if (buf != NULL) { 75 if (buf != NULL) {
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index e190460fe0d3..9be6592e4a6f 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -608,7 +608,7 @@ not_found:
608 * Returns non-zero if any off-node ports overlap 608 * Returns non-zero if any off-node ports overlap
609 */ 609 */
610int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, 610int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
611 u32 limit, struct tipc_plist *dports) 611 u32 limit, struct list_head *dports)
612{ 612{
613 struct name_seq *seq; 613 struct name_seq *seq;
614 struct sub_seq *sseq; 614 struct sub_seq *sseq;
@@ -633,7 +633,7 @@ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
633 info = sseq->info; 633 info = sseq->info;
634 list_for_each_entry(publ, &info->node_list, node_list) { 634 list_for_each_entry(publ, &info->node_list, node_list) {
635 if (publ->scope <= limit) 635 if (publ->scope <= limit)
636 tipc_plist_push(dports, publ->ref); 636 u32_push(dports, publ->ref);
637 } 637 }
638 638
639 if (info->cluster_list_size != info->node_list_size) 639 if (info->cluster_list_size != info->node_list_size)
@@ -645,6 +645,39 @@ exit:
645 return res; 645 return res;
646} 646}
647 647
648/* tipc_nametbl_lookup_dst_nodes - find broadcast destination nodes
649 * - Creates list of nodes that overlap the given multicast address
650 * - Determines if any node local ports overlap
651 */
652void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
653 u32 upper, u32 domain,
654 struct tipc_nlist *nodes)
655{
656 struct sub_seq *sseq, *stop;
657 struct publication *publ;
658 struct name_info *info;
659 struct name_seq *seq;
660
661 rcu_read_lock();
662 seq = nametbl_find_seq(net, type);
663 if (!seq)
664 goto exit;
665
666 spin_lock_bh(&seq->lock);
667 sseq = seq->sseqs + nameseq_locate_subseq(seq, lower);
668 stop = seq->sseqs + seq->first_free;
669 for (; sseq->lower <= upper && sseq != stop; sseq++) {
670 info = sseq->info;
671 list_for_each_entry(publ, &info->zone_list, zone_list) {
672 if (tipc_in_scope(domain, publ->node))
673 tipc_nlist_add(nodes, publ->node);
674 }
675 }
676 spin_unlock_bh(&seq->lock);
677exit:
678 rcu_read_unlock();
679}
680
648/* 681/*
649 * tipc_nametbl_publish - add name publication to network name tables 682 * tipc_nametbl_publish - add name publication to network name tables
650 */ 683 */
@@ -1022,40 +1055,79 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb)
1022 return skb->len; 1055 return skb->len;
1023} 1056}
1024 1057
1025void tipc_plist_push(struct tipc_plist *pl, u32 port) 1058bool u32_find(struct list_head *l, u32 value)
1026{ 1059{
1027 struct tipc_plist *nl; 1060 struct u32_item *item;
1028 1061
1029 if (likely(!pl->port)) { 1062 list_for_each_entry(item, l, list) {
1030 pl->port = port; 1063 if (item->value == value)
1031 return; 1064 return true;
1032 } 1065 }
1033 if (pl->port == port) 1066 return false;
1034 return; 1067}
1035 list_for_each_entry(nl, &pl->list, list) { 1068
1036 if (nl->port == port) 1069bool u32_push(struct list_head *l, u32 value)
1037 return; 1070{
1071 struct u32_item *item;
1072
1073 list_for_each_entry(item, l, list) {
1074 if (item->value == value)
1075 return false;
1076 }
1077 item = kmalloc(sizeof(*item), GFP_ATOMIC);
1078 if (unlikely(!item))
1079 return false;
1080
1081 item->value = value;
1082 list_add(&item->list, l);
1083 return true;
1084}
1085
1086u32 u32_pop(struct list_head *l)
1087{
1088 struct u32_item *item;
1089 u32 value = 0;
1090
1091 if (list_empty(l))
1092 return 0;
1093 item = list_first_entry(l, typeof(*item), list);
1094 value = item->value;
1095 list_del(&item->list);
1096 kfree(item);
1097 return value;
1098}
1099
1100bool u32_del(struct list_head *l, u32 value)
1101{
1102 struct u32_item *item, *tmp;
1103
1104 list_for_each_entry_safe(item, tmp, l, list) {
1105 if (item->value != value)
1106 continue;
1107 list_del(&item->list);
1108 kfree(item);
1109 return true;
1038 } 1110 }
1039 nl = kmalloc(sizeof(*nl), GFP_ATOMIC); 1111 return false;
1040 if (nl) { 1112}
1041 nl->port = port; 1113
1042 list_add(&nl->list, &pl->list); 1114void u32_list_purge(struct list_head *l)
1115{
1116 struct u32_item *item, *tmp;
1117
1118 list_for_each_entry_safe(item, tmp, l, list) {
1119 list_del(&item->list);
1120 kfree(item);
1043 } 1121 }
1044} 1122}
1045 1123
1046u32 tipc_plist_pop(struct tipc_plist *pl) 1124int u32_list_len(struct list_head *l)
1047{ 1125{
1048 struct tipc_plist *nl; 1126 struct u32_item *item;
1049 u32 port = 0; 1127 int i = 0;
1050 1128
1051 if (likely(list_empty(&pl->list))) { 1129 list_for_each_entry(item, l, list) {
1052 port = pl->port; 1130 i++;
1053 pl->port = 0;
1054 return port;
1055 } 1131 }
1056 nl = list_first_entry(&pl->list, typeof(*nl), list); 1132 return i;
1057 port = nl->port;
1058 list_del(&nl->list);
1059 kfree(nl);
1060 return port;
1061} 1133}
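
The single-purpose tipc_plist is replaced above by generic u32_item list helpers (u32_push/u32_pop/u32_find/u32_del/u32_list_purge), reused both for the destination-port list here and for tipc_nlist in bcast.c. A short illustrative use, assuming the behaviour shown in the hunks above (u32_push() silently ignores duplicates, u32_pop() returns 0 on an empty list); the function name is hypothetical:

#include <linux/list.h>
#include <linux/printk.h>
#include <linux/types.h>
#include "name_table.h"         /* u32_push(), u32_pop() */

static void example_collect_ports(void)
{
        LIST_HEAD(dports);

        u32_push(&dports, 4711);
        u32_push(&dports, 4711);        /* duplicate: list is unchanged */
        u32_push(&dports, 4712);

        for (;;) {
                u32 port = u32_pop(&dports);

                if (!port)              /* 0 means the list is empty */
                        break;
                pr_info("deliver to port %u\n", port);
        }
}
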
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 1524a73830f7..6ebdeb1d84a5 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -39,6 +39,7 @@
39 39
40struct tipc_subscription; 40struct tipc_subscription;
41struct tipc_plist; 41struct tipc_plist;
42struct tipc_nlist;
42 43
43/* 44/*
44 * TIPC name types reserved for internal TIPC use (both current and planned) 45 * TIPC name types reserved for internal TIPC use (both current and planned)
@@ -99,7 +100,10 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb);
99 100
100u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node); 101u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node);
101int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, 102int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
102 u32 limit, struct tipc_plist *dports); 103 u32 limit, struct list_head *dports);
104void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
105 u32 upper, u32 domain,
106 struct tipc_nlist *nodes);
103struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, 107struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower,
104 u32 upper, u32 scope, u32 port_ref, 108 u32 upper, u32 scope, u32 port_ref,
105 u32 key); 109 u32 key);
@@ -116,18 +120,16 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s);
116int tipc_nametbl_init(struct net *net); 120int tipc_nametbl_init(struct net *net);
117void tipc_nametbl_stop(struct net *net); 121void tipc_nametbl_stop(struct net *net);
118 122
119struct tipc_plist { 123struct u32_item {
120 struct list_head list; 124 struct list_head list;
121 u32 port; 125 u32 value;
122}; 126};
123 127
124static inline void tipc_plist_init(struct tipc_plist *pl) 128bool u32_push(struct list_head *l, u32 value);
125{ 129u32 u32_pop(struct list_head *l);
126 INIT_LIST_HEAD(&pl->list); 130bool u32_find(struct list_head *l, u32 value);
127 pl->port = 0; 131bool u32_del(struct list_head *l, u32 value);
128} 132void u32_list_purge(struct list_head *l);
129 133int u32_list_len(struct list_head *l);
130void tipc_plist_push(struct tipc_plist *pl, u32 port);
131u32 tipc_plist_pop(struct tipc_plist *pl);
132 134
133#endif 135#endif
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 28bf4feeb81c..ab8a2d5d1e32 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -110,6 +110,10 @@ int tipc_net_start(struct net *net, u32 addr)
110 char addr_string[16]; 110 char addr_string[16];
111 111
112 tn->own_addr = addr; 112 tn->own_addr = addr;
113
114 /* Ensure that the new address is visible before we reinit. */
115 smp_mb();
116
113 tipc_named_reinit(net); 117 tipc_named_reinit(net);
114 tipc_sk_reinit(net); 118 tipc_sk_reinit(net);
115 119
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index 3200059d14b2..26ca8dd64ded 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -135,15 +135,6 @@ const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = {
135/* Users of the legacy API (tipc-config) can't handle that we add operations, 135/* Users of the legacy API (tipc-config) can't handle that we add operations,
136 * so we have a separate genl handling for the new API. 136 * so we have a separate genl handling for the new API.
137 */ 137 */
138struct genl_family tipc_genl_family = {
139 .id = GENL_ID_GENERATE,
140 .name = TIPC_GENL_V2_NAME,
141 .version = TIPC_GENL_V2_VERSION,
142 .hdrsize = 0,
143 .maxattr = TIPC_NLA_MAX,
144 .netnsok = true,
145};
146
147static const struct genl_ops tipc_genl_v2_ops[] = { 138static const struct genl_ops tipc_genl_v2_ops[] = {
148 { 139 {
149 .cmd = TIPC_NL_BEARER_DISABLE, 140 .cmd = TIPC_NL_BEARER_DISABLE,
@@ -258,23 +249,33 @@ static const struct genl_ops tipc_genl_v2_ops[] = {
258#endif 249#endif
259}; 250};
260 251
252struct genl_family tipc_genl_family __ro_after_init = {
253 .name = TIPC_GENL_V2_NAME,
254 .version = TIPC_GENL_V2_VERSION,
255 .hdrsize = 0,
256 .maxattr = TIPC_NLA_MAX,
257 .netnsok = true,
258 .module = THIS_MODULE,
259 .ops = tipc_genl_v2_ops,
260 .n_ops = ARRAY_SIZE(tipc_genl_v2_ops),
261};
262
261int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr) 263int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr)
262{ 264{
263 u32 maxattr = tipc_genl_family.maxattr; 265 u32 maxattr = tipc_genl_family.maxattr;
264 266
265 *attr = tipc_genl_family.attrbuf; 267 *attr = genl_family_attrbuf(&tipc_genl_family);
266 if (!*attr) 268 if (!*attr)
267 return -EOPNOTSUPP; 269 return -EOPNOTSUPP;
268 270
269 return nlmsg_parse(nlh, GENL_HDRLEN, *attr, maxattr, tipc_nl_policy); 271 return nlmsg_parse(nlh, GENL_HDRLEN, *attr, maxattr, tipc_nl_policy);
270} 272}
271 273
272int tipc_netlink_start(void) 274int __init tipc_netlink_start(void)
273{ 275{
274 int res; 276 int res;
275 277
276 res = genl_register_family_with_ops(&tipc_genl_family, 278 res = genl_register_family(&tipc_genl_family);
277 tipc_genl_v2_ops);
278 if (res) { 279 if (res) {
279 pr_err("Failed to register netlink interface\n"); 280 pr_err("Failed to register netlink interface\n");
280 return res; 281 return res;
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index 1fd464764765..e1ae8a8a2b8e 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -1215,15 +1215,6 @@ send:
1215 return err; 1215 return err;
1216} 1216}
1217 1217
1218static struct genl_family tipc_genl_compat_family = {
1219 .id = GENL_ID_GENERATE,
1220 .name = TIPC_GENL_NAME,
1221 .version = TIPC_GENL_VERSION,
1222 .hdrsize = TIPC_GENL_HDRLEN,
1223 .maxattr = 0,
1224 .netnsok = true,
1225};
1226
1227static struct genl_ops tipc_genl_compat_ops[] = { 1218static struct genl_ops tipc_genl_compat_ops[] = {
1228 { 1219 {
1229 .cmd = TIPC_GENL_CMD, 1220 .cmd = TIPC_GENL_CMD,
@@ -1231,12 +1222,22 @@ static struct genl_ops tipc_genl_compat_ops[] = {
1231 }, 1222 },
1232}; 1223};
1233 1224
1234int tipc_netlink_compat_start(void) 1225static struct genl_family tipc_genl_compat_family __ro_after_init = {
1226 .name = TIPC_GENL_NAME,
1227 .version = TIPC_GENL_VERSION,
1228 .hdrsize = TIPC_GENL_HDRLEN,
1229 .maxattr = 0,
1230 .netnsok = true,
1231 .module = THIS_MODULE,
1232 .ops = tipc_genl_compat_ops,
1233 .n_ops = ARRAY_SIZE(tipc_genl_compat_ops),
1234};
1235
1236int __init tipc_netlink_compat_start(void)
1235{ 1237{
1236 int res; 1238 int res;
1237 1239
1238 res = genl_register_family_with_ops(&tipc_genl_compat_family, 1240 res = genl_register_family(&tipc_genl_compat_family);
1239 tipc_genl_compat_ops);
1240 if (res) { 1241 if (res) {
1241 pr_err("Failed to register legacy compat interface\n"); 1242 pr_err("Failed to register legacy compat interface\n");
1242 return res; 1243 return res;
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 9d2f4c2b08ab..4512e83652b1 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -263,6 +263,11 @@ static void tipc_node_write_lock(struct tipc_node *n)
263 write_lock_bh(&n->lock); 263 write_lock_bh(&n->lock);
264} 264}
265 265
266static void tipc_node_write_unlock_fast(struct tipc_node *n)
267{
268 write_unlock_bh(&n->lock);
269}
270
266static void tipc_node_write_unlock(struct tipc_node *n) 271static void tipc_node_write_unlock(struct tipc_node *n)
267{ 272{
268 struct net *net = n->net; 273 struct net *net = n->net;
@@ -417,7 +422,7 @@ void tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr)
417 } 422 }
418 tipc_node_write_lock(n); 423 tipc_node_write_lock(n);
419 list_add_tail(subscr, &n->publ_list); 424 list_add_tail(subscr, &n->publ_list);
420 tipc_node_write_unlock(n); 425 tipc_node_write_unlock_fast(n);
421 tipc_node_put(n); 426 tipc_node_put(n);
422} 427}
423 428
@@ -435,7 +440,7 @@ void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr)
435 } 440 }
436 tipc_node_write_lock(n); 441 tipc_node_write_lock(n);
437 list_del_init(subscr); 442 list_del_init(subscr);
438 tipc_node_write_unlock(n); 443 tipc_node_write_unlock_fast(n);
439 tipc_node_put(n); 444 tipc_node_put(n);
440} 445}
441 446
@@ -1167,7 +1172,7 @@ msg_full:
1167 * @list: chain of buffers containing message 1172 * @list: chain of buffers containing message
1168 * @dnode: address of destination node 1173 * @dnode: address of destination node
1169 * @selector: a number used for deterministic link selection 1174 * @selector: a number used for deterministic link selection
1170 * Consumes the buffer chain, except when returning -ELINKCONG 1175 * Consumes the buffer chain.
1171 * Returns 0 on success, otherwise: -ELINKCONG, -EHOSTUNREACH, -EMSGSIZE, -ENOBUFS 1176 * Returns 0 on success, otherwise: -ELINKCONG, -EHOSTUNREACH, -EMSGSIZE, -ENOBUFS
1172 */ 1177 */
1173int tipc_node_xmit(struct net *net, struct sk_buff_head *list, 1178int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
@@ -1206,10 +1211,10 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
1206 spin_unlock_bh(&le->lock); 1211 spin_unlock_bh(&le->lock);
1207 tipc_node_read_unlock(n); 1212 tipc_node_read_unlock(n);
1208 1213
1209 if (likely(rc == 0)) 1214 if (unlikely(rc == -ENOBUFS))
1210 tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
1211 else if (rc == -ENOBUFS)
1212 tipc_node_link_down(n, bearer_id, false); 1215 tipc_node_link_down(n, bearer_id, false);
1216 else
1217 tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
1213 1218
1214 tipc_node_put(n); 1219 tipc_node_put(n);
1215 1220
@@ -1221,20 +1226,15 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
1221 * messages, which will not be rejected 1226 * messages, which will not be rejected
1222 * The only exception is datagram messages rerouted after secondary 1227 * The only exception is datagram messages rerouted after secondary
1223 * lookup, which are rare and safe to dispose of anyway. 1228 * lookup, which are rare and safe to dispose of anyway.
1224 * TODO: Return real return value, and let callers use
1225 * tipc_wait_for_sendpkt() where applicable
1226 */ 1229 */
1227int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, 1230int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode,
1228 u32 selector) 1231 u32 selector)
1229{ 1232{
1230 struct sk_buff_head head; 1233 struct sk_buff_head head;
1231 int rc;
1232 1234
1233 skb_queue_head_init(&head); 1235 skb_queue_head_init(&head);
1234 __skb_queue_tail(&head, skb); 1236 __skb_queue_tail(&head, skb);
1235 rc = tipc_node_xmit(net, &head, dnode, selector); 1237 tipc_node_xmit(net, &head, dnode, selector);
1236 if (rc == -ELINKCONG)
1237 kfree_skb(skb);
1238 return 0; 1238 return 0;
1239} 1239}
1240 1240
@@ -1262,6 +1262,19 @@ void tipc_node_broadcast(struct net *net, struct sk_buff *skb)
1262 kfree_skb(skb); 1262 kfree_skb(skb);
1263} 1263}
1264 1264
1265static void tipc_node_mcast_rcv(struct tipc_node *n)
1266{
1267 struct tipc_bclink_entry *be = &n->bc_entry;
1268
1269 /* 'arrvq' is under inputq2's lock protection */
1270 spin_lock_bh(&be->inputq2.lock);
1271 spin_lock_bh(&be->inputq1.lock);
1272 skb_queue_splice_tail_init(&be->inputq1, &be->arrvq);
1273 spin_unlock_bh(&be->inputq1.lock);
1274 spin_unlock_bh(&be->inputq2.lock);
1275 tipc_sk_mcast_rcv(n->net, &be->arrvq, &be->inputq2);
1276}
1277
1265static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr, 1278static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr,
1266 int bearer_id, struct sk_buff_head *xmitq) 1279 int bearer_id, struct sk_buff_head *xmitq)
1267{ 1280{
@@ -1335,15 +1348,8 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id
1335 if (!skb_queue_empty(&xmitq)) 1348 if (!skb_queue_empty(&xmitq))
1336 tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); 1349 tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
1337 1350
1338 /* Deliver. 'arrvq' is under inputq2's lock protection */ 1351 if (!skb_queue_empty(&be->inputq1))
1339 if (!skb_queue_empty(&be->inputq1)) { 1352 tipc_node_mcast_rcv(n);
1340 spin_lock_bh(&be->inputq2.lock);
1341 spin_lock_bh(&be->inputq1.lock);
1342 skb_queue_splice_tail_init(&be->inputq1, &be->arrvq);
1343 spin_unlock_bh(&be->inputq1.lock);
1344 spin_unlock_bh(&be->inputq2.lock);
1345 tipc_sk_mcast_rcv(net, &be->arrvq, &be->inputq2);
1346 }
1347 1353
1348 if (rc & TIPC_LINK_DOWN_EVT) { 1354 if (rc & TIPC_LINK_DOWN_EVT) {
1349 /* Reception reassembly failure => reset all links to peer */ 1355 /* Reception reassembly failure => reset all links to peer */
@@ -1499,19 +1505,21 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
1499{ 1505{
1500 struct sk_buff_head xmitq; 1506 struct sk_buff_head xmitq;
1501 struct tipc_node *n; 1507 struct tipc_node *n;
1502 struct tipc_msg *hdr = buf_msg(skb); 1508 struct tipc_msg *hdr;
1503 int usr = msg_user(hdr);
1504 int bearer_id = b->identity; 1509 int bearer_id = b->identity;
1505 struct tipc_link_entry *le; 1510 struct tipc_link_entry *le;
1506 u16 bc_ack = msg_bcast_ack(hdr);
1507 u32 self = tipc_own_addr(net); 1511 u32 self = tipc_own_addr(net);
1508 int rc = 0; 1512 int usr, rc = 0;
1513 u16 bc_ack;
1509 1514
1510 __skb_queue_head_init(&xmitq); 1515 __skb_queue_head_init(&xmitq);
1511 1516
1512 /* Ensure message is well-formed */ 1517 /* Ensure message is well-formed before touching the header */
1513 if (unlikely(!tipc_msg_validate(skb))) 1518 if (unlikely(!tipc_msg_validate(skb)))
1514 goto discard; 1519 goto discard;
1520 hdr = buf_msg(skb);
1521 usr = msg_user(hdr);
1522 bc_ack = msg_bcast_ack(hdr);
1515 1523
1516 /* Handle arrival of discovery or broadcast packet */ 1524 /* Handle arrival of discovery or broadcast packet */
1517 if (unlikely(msg_non_seq(hdr))) { 1525 if (unlikely(msg_non_seq(hdr))) {
@@ -1570,6 +1578,9 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
1570 if (unlikely(!skb_queue_empty(&n->bc_entry.namedq))) 1578 if (unlikely(!skb_queue_empty(&n->bc_entry.namedq)))
1571 tipc_named_rcv(net, &n->bc_entry.namedq); 1579 tipc_named_rcv(net, &n->bc_entry.namedq);
1572 1580
1581 if (unlikely(!skb_queue_empty(&n->bc_entry.inputq1)))
1582 tipc_node_mcast_rcv(n);
1583
1573 if (!skb_queue_empty(&le->inputq)) 1584 if (!skb_queue_empty(&le->inputq))
1574 tipc_sk_rcv(net, &le->inputq); 1585 tipc_sk_rcv(net, &le->inputq);
1575 1586
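Two things happen in the node.c hunks above: the header-derived fields in tipc_rcv() are only read once tipc_msg_validate() has accepted the buffer, and the broadcast input-queue splice is factored into tipc_node_mcast_rcv(), keeping the inputq2-before-inputq1 lock order of the old inline code. A condensed sketch of the validate-before-parse ordering (names mirror the patch, the body is abridged):

static void demo_rcv(struct net *net, struct sk_buff *skb)
{
        struct tipc_msg *hdr;
        int usr;
        u16 bc_ack;

        /* Ensure message is well-formed before touching the header */
        if (unlikely(!tipc_msg_validate(skb)))
                goto discard;

        hdr = buf_msg(skb);             /* only now is the header trusted */
        usr = msg_user(hdr);
        bc_ack = msg_bcast_ack(hdr);

        /* ... dispatch on usr/bc_ack exactly as tipc_rcv() does ... */
        (void)net;
        (void)usr;
        (void)bc_ack;
        return;
discard:
        kfree_skb(skb);
}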
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 39ef54c1f2ad..898c22916984 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -47,11 +47,13 @@
47enum { 47enum {
48 TIPC_BCAST_SYNCH = (1 << 1), 48 TIPC_BCAST_SYNCH = (1 << 1),
49 TIPC_BCAST_STATE_NACK = (1 << 2), 49 TIPC_BCAST_STATE_NACK = (1 << 2),
50 TIPC_BLOCK_FLOWCTL = (1 << 3) 50 TIPC_BLOCK_FLOWCTL = (1 << 3),
51 TIPC_BCAST_RCAST = (1 << 4)
51}; 52};
52 53
53#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ 54#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \
54 TIPC_BCAST_STATE_NACK | \ 55 TIPC_BCAST_STATE_NACK | \
56 TIPC_BCAST_RCAST | \
55 TIPC_BLOCK_FLOWCTL) 57 TIPC_BLOCK_FLOWCTL)
56#define INVALID_BEARER_ID -1 58#define INVALID_BEARER_ID -1
57 59
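TIPC_BCAST_RCAST joins the node capability mask, which is the usual way TIPC gates new wire behaviour on what a peer advertises; here it lets the replicast-capable multicast path be used only towards nodes that announce it. A small, purely illustrative check (the helper name is not from the patch):

/* Illustrative only: peer_caps is the mask learned from the peer node */
static inline bool demo_peer_supports_rcast(u16 peer_caps)
{
        return peer_caps & TIPC_BCAST_RCAST;
}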
diff --git a/net/tipc/server.c b/net/tipc/server.c
index 215849ce453d..3cd6402e812c 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -86,12 +86,12 @@ struct outqueue_entry {
86static void tipc_recv_work(struct work_struct *work); 86static void tipc_recv_work(struct work_struct *work);
87static void tipc_send_work(struct work_struct *work); 87static void tipc_send_work(struct work_struct *work);
88static void tipc_clean_outqueues(struct tipc_conn *con); 88static void tipc_clean_outqueues(struct tipc_conn *con);
89static void tipc_sock_release(struct tipc_conn *con);
90 89
91static void tipc_conn_kref_release(struct kref *kref) 90static void tipc_conn_kref_release(struct kref *kref)
92{ 91{
93 struct tipc_conn *con = container_of(kref, struct tipc_conn, kref); 92 struct tipc_conn *con = container_of(kref, struct tipc_conn, kref);
94 struct sockaddr_tipc *saddr = con->server->saddr; 93 struct tipc_server *s = con->server;
94 struct sockaddr_tipc *saddr = s->saddr;
95 struct socket *sock = con->sock; 95 struct socket *sock = con->sock;
96 struct sock *sk; 96 struct sock *sk;
97 97
@@ -103,9 +103,13 @@ static void tipc_conn_kref_release(struct kref *kref)
103 } 103 }
104 saddr->scope = -TIPC_NODE_SCOPE; 104 saddr->scope = -TIPC_NODE_SCOPE;
105 kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr)); 105 kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr));
106 tipc_sock_release(con);
107 sock_release(sock); 106 sock_release(sock);
108 con->sock = NULL; 107 con->sock = NULL;
108
109 spin_lock_bh(&s->idr_lock);
110 idr_remove(&s->conn_idr, con->conid);
111 s->idr_in_use--;
112 spin_unlock_bh(&s->idr_lock);
109 } 113 }
110 114
111 tipc_clean_outqueues(con); 115 tipc_clean_outqueues(con);
@@ -128,8 +132,10 @@ static struct tipc_conn *tipc_conn_lookup(struct tipc_server *s, int conid)
128 132
129 spin_lock_bh(&s->idr_lock); 133 spin_lock_bh(&s->idr_lock);
130 con = idr_find(&s->conn_idr, conid); 134 con = idr_find(&s->conn_idr, conid);
131 if (con) 135 if (con && test_bit(CF_CONNECTED, &con->flags))
132 conn_get(con); 136 conn_get(con);
137 else
138 con = NULL;
133 spin_unlock_bh(&s->idr_lock); 139 spin_unlock_bh(&s->idr_lock);
134 return con; 140 return con;
135} 141}
@@ -186,26 +192,15 @@ static void tipc_unregister_callbacks(struct tipc_conn *con)
186 write_unlock_bh(&sk->sk_callback_lock); 192 write_unlock_bh(&sk->sk_callback_lock);
187} 193}
188 194
189static void tipc_sock_release(struct tipc_conn *con)
190{
191 struct tipc_server *s = con->server;
192
193 if (con->conid)
194 s->tipc_conn_release(con->conid, con->usr_data);
195
196 tipc_unregister_callbacks(con);
197}
198
199static void tipc_close_conn(struct tipc_conn *con) 195static void tipc_close_conn(struct tipc_conn *con)
200{ 196{
201 struct tipc_server *s = con->server; 197 struct tipc_server *s = con->server;
202 198
203 if (test_and_clear_bit(CF_CONNECTED, &con->flags)) { 199 if (test_and_clear_bit(CF_CONNECTED, &con->flags)) {
200 tipc_unregister_callbacks(con);
204 201
205 spin_lock_bh(&s->idr_lock); 202 if (con->conid)
206 idr_remove(&s->conn_idr, con->conid); 203 s->tipc_conn_release(con->conid, con->usr_data);
207 s->idr_in_use--;
208 spin_unlock_bh(&s->idr_lock);
209 204
210 /* We shouldn't flush pending works as we may be in the 205 /* We shouldn't flush pending works as we may be in the
211 * thread. In fact the races with pending rx/tx work structs 206 * thread. In fact the races with pending rx/tx work structs
@@ -458,6 +453,11 @@ int tipc_conn_sendmsg(struct tipc_server *s, int conid,
458 if (!con) 453 if (!con)
459 return -EINVAL; 454 return -EINVAL;
460 455
456 if (!test_bit(CF_CONNECTED, &con->flags)) {
457 conn_put(con);
458 return 0;
459 }
460
461 e = tipc_alloc_entry(data, len); 461 e = tipc_alloc_entry(data, len);
462 if (!e) { 462 if (!e) {
463 conn_put(con); 463 conn_put(con);
@@ -471,12 +471,8 @@ int tipc_conn_sendmsg(struct tipc_server *s, int conid,
471 list_add_tail(&e->list, &con->outqueue); 471 list_add_tail(&e->list, &con->outqueue);
472 spin_unlock_bh(&con->outqueue_lock); 472 spin_unlock_bh(&con->outqueue_lock);
473 473
474 if (test_bit(CF_CONNECTED, &con->flags)) { 474 if (!queue_work(s->send_wq, &con->swork))
475 if (!queue_work(s->send_wq, &con->swork))
476 conn_put(con);
477 } else {
478 conn_put(con); 475 conn_put(con);
479 }
480 return 0; 476 return 0;
481} 477}
482 478
@@ -500,7 +496,7 @@ static void tipc_send_to_sock(struct tipc_conn *con)
500 int ret; 496 int ret;
501 497
502 spin_lock_bh(&con->outqueue_lock); 498 spin_lock_bh(&con->outqueue_lock);
503 while (1) { 499 while (test_bit(CF_CONNECTED, &con->flags)) {
504 e = list_entry(con->outqueue.next, struct outqueue_entry, 500 e = list_entry(con->outqueue.next, struct outqueue_entry,
505 list); 501 list);
506 if ((struct list_head *) e == &con->outqueue) 502 if ((struct list_head *) e == &con->outqueue)
@@ -623,14 +619,12 @@ int tipc_server_start(struct tipc_server *s)
623void tipc_server_stop(struct tipc_server *s) 619void tipc_server_stop(struct tipc_server *s)
624{ 620{
625 struct tipc_conn *con; 621 struct tipc_conn *con;
626 int total = 0;
627 int id; 622 int id;
628 623
629 spin_lock_bh(&s->idr_lock); 624 spin_lock_bh(&s->idr_lock);
630 for (id = 0; total < s->idr_in_use; id++) { 625 for (id = 0; s->idr_in_use; id++) {
631 con = idr_find(&s->conn_idr, id); 626 con = idr_find(&s->conn_idr, id);
632 if (con) { 627 if (con) {
633 total++;
634 spin_unlock_bh(&s->idr_lock); 628 spin_unlock_bh(&s->idr_lock);
635 tipc_close_conn(con); 629 tipc_close_conn(con);
636 spin_lock_bh(&s->idr_lock); 630 spin_lock_bh(&s->idr_lock);
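The server.c changes re-draw the connection lifetime: tipc_conn_lookup() only hands out a reference while CF_CONNECTED is still set, tipc_close_conn() does the unregister/release work under that same flag, and the idr slot is not reclaimed until the final kref drops, which also means a conid cannot be recycled while references to the old connection are still outstanding. A simplified sketch of that split (names mirror the patch, bodies are abridged):

static void demo_conn_kref_release(struct kref *kref)
{
        struct tipc_conn *con = container_of(kref, struct tipc_conn, kref);
        struct tipc_server *s = con->server;

        /* last reference gone: only now is the conid free for reuse */
        spin_lock_bh(&s->idr_lock);
        idr_remove(&s->conn_idr, con->conid);
        s->idr_in_use--;
        spin_unlock_bh(&s->idr_lock);
        kfree(con);
}

static void demo_close_conn(struct tipc_conn *con)
{
        /* flips the flag once; lookups fail from here on */
        if (test_and_clear_bit(CF_CONNECTED, &con->flags))
                conn_put(con);
}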
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 41f013888f07..7130e73bd42c 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -35,6 +35,8 @@
35 */ 35 */
36 36
37#include <linux/rhashtable.h> 37#include <linux/rhashtable.h>
38#include <linux/sched/signal.h>
39
38#include "core.h" 40#include "core.h"
39#include "name_table.h" 41#include "name_table.h"
40#include "node.h" 42#include "node.h"
@@ -44,65 +46,67 @@
44#include "bcast.h" 46#include "bcast.h"
45#include "netlink.h" 47#include "netlink.h"
46 48
47#define SS_LISTENING -1 /* socket is listening */
48#define SS_READY -2 /* socket is connectionless */
49
50#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ 49#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */
51#define CONN_PROBING_INTERVAL msecs_to_jiffies(3600000) /* [ms] => 1 h */ 50#define CONN_PROBING_INTERVAL msecs_to_jiffies(3600000) /* [ms] => 1 h */
52#define TIPC_FWD_MSG 1 51#define TIPC_FWD_MSG 1
53#define TIPC_CONN_OK 0
54#define TIPC_CONN_PROBING 1
55#define TIPC_MAX_PORT 0xffffffff 52#define TIPC_MAX_PORT 0xffffffff
56#define TIPC_MIN_PORT 1 53#define TIPC_MIN_PORT 1
57 54
55enum {
56 TIPC_LISTEN = TCP_LISTEN,
57 TIPC_ESTABLISHED = TCP_ESTABLISHED,
58 TIPC_OPEN = TCP_CLOSE,
59 TIPC_DISCONNECTING = TCP_CLOSE_WAIT,
60 TIPC_CONNECTING = TCP_SYN_SENT,
61};
62
58/** 63/**
59 * struct tipc_sock - TIPC socket structure 64 * struct tipc_sock - TIPC socket structure
60 * @sk: socket - interacts with 'port' and with user via the socket API 65 * @sk: socket - interacts with 'port' and with user via the socket API
61 * @connected: non-zero if port is currently connected to a peer port
62 * @conn_type: TIPC type used when connection was established 66 * @conn_type: TIPC type used when connection was established
63 * @conn_instance: TIPC instance used when connection was established 67 * @conn_instance: TIPC instance used when connection was established
64 * @published: non-zero if port has one or more associated names 68 * @published: non-zero if port has one or more associated names
65 * @max_pkt: maximum packet size "hint" used when building messages sent by port 69 * @max_pkt: maximum packet size "hint" used when building messages sent by port
66 * @portid: unique port identity in TIPC socket hash table 70 * @portid: unique port identity in TIPC socket hash table
67 * @phdr: preformatted message header used when sending messages 71 * @phdr: preformatted message header used when sending messages
 68 * @port_list: adjacent ports in TIPC's global list of ports 72 * @cong_links: list of congested links
69 * @publications: list of publications for port 73 * @publications: list of publications for port
74 * @blocking_link: address of the congested link we are currently sleeping on
70 * @pub_count: total # of publications port has made during its lifetime 75 * @pub_count: total # of publications port has made during its lifetime
71 * @probing_state: 76 * @probing_state:
72 * @probing_intv:
73 * @conn_timeout: the time we can wait for an unresponded setup request 77 * @conn_timeout: the time we can wait for an unresponded setup request
74 * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue 78 * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue
75 * @link_cong: non-zero if owner must sleep because of link congestion 79 * @cong_link_cnt: number of congested links
76 * @sent_unacked: # messages sent by socket, and not yet acked by peer 80 * @sent_unacked: # messages sent by socket, and not yet acked by peer
77 * @rcv_unacked: # messages read by user, but not yet acked back to peer 81 * @rcv_unacked: # messages read by user, but not yet acked back to peer
78 * @remote: 'connected' peer for dgram/rdm 82 * @peer: 'connected' peer for dgram/rdm
79 * @node: hash table node 83 * @node: hash table node
84 * @mc_method: cookie for use between socket and broadcast layer
80 * @rcu: rcu struct for tipc_sock 85 * @rcu: rcu struct for tipc_sock
81 */ 86 */
82struct tipc_sock { 87struct tipc_sock {
83 struct sock sk; 88 struct sock sk;
84 int connected;
85 u32 conn_type; 89 u32 conn_type;
86 u32 conn_instance; 90 u32 conn_instance;
87 int published; 91 int published;
88 u32 max_pkt; 92 u32 max_pkt;
89 u32 portid; 93 u32 portid;
90 struct tipc_msg phdr; 94 struct tipc_msg phdr;
91 struct list_head sock_list; 95 struct list_head cong_links;
92 struct list_head publications; 96 struct list_head publications;
93 u32 pub_count; 97 u32 pub_count;
94 u32 probing_state;
95 unsigned long probing_intv;
96 uint conn_timeout; 98 uint conn_timeout;
97 atomic_t dupl_rcvcnt; 99 atomic_t dupl_rcvcnt;
98 bool link_cong; 100 bool probe_unacked;
101 u16 cong_link_cnt;
99 u16 snt_unacked; 102 u16 snt_unacked;
100 u16 snd_win; 103 u16 snd_win;
101 u16 peer_caps; 104 u16 peer_caps;
102 u16 rcv_unacked; 105 u16 rcv_unacked;
103 u16 rcv_win; 106 u16 rcv_win;
104 struct sockaddr_tipc remote; 107 struct sockaddr_tipc peer;
105 struct rhash_head node; 108 struct rhash_head node;
109 struct tipc_mc_method mc_method;
106 struct rcu_head rcu; 110 struct rcu_head rcu;
107}; 111};
108 112
@@ -111,8 +115,8 @@ static void tipc_data_ready(struct sock *sk);
111static void tipc_write_space(struct sock *sk); 115static void tipc_write_space(struct sock *sk);
112static void tipc_sock_destruct(struct sock *sk); 116static void tipc_sock_destruct(struct sock *sk);
113static int tipc_release(struct socket *sock); 117static int tipc_release(struct socket *sock);
114static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags); 118static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags,
115static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p); 119 bool kern);
116static void tipc_sk_timeout(unsigned long data); 120static void tipc_sk_timeout(unsigned long data);
117static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, 121static int tipc_sk_publish(struct tipc_sock *tsk, uint scope,
118 struct tipc_name_seq const *seq); 122 struct tipc_name_seq const *seq);
@@ -121,8 +125,7 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
121static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid); 125static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid);
122static int tipc_sk_insert(struct tipc_sock *tsk); 126static int tipc_sk_insert(struct tipc_sock *tsk);
123static void tipc_sk_remove(struct tipc_sock *tsk); 127static void tipc_sk_remove(struct tipc_sock *tsk);
124static int __tipc_send_stream(struct socket *sock, struct msghdr *m, 128static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz);
125 size_t dsz);
126static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz); 129static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz);
127 130
128static const struct proto_ops packet_ops; 131static const struct proto_ops packet_ops;
@@ -248,6 +251,21 @@ static void tsk_rej_rx_queue(struct sock *sk)
248 tipc_sk_respond(sk, skb, TIPC_ERR_NO_PORT); 251 tipc_sk_respond(sk, skb, TIPC_ERR_NO_PORT);
249} 252}
250 253
254static bool tipc_sk_connected(struct sock *sk)
255{
256 return sk->sk_state == TIPC_ESTABLISHED;
257}
258
259/* tipc_sk_type_connectionless - check if the socket is datagram socket
260 * @sk: socket
261 *
262 * Returns true if connection less, false otherwise
263 */
264static bool tipc_sk_type_connectionless(struct sock *sk)
265{
266 return sk->sk_type == SOCK_RDM || sk->sk_type == SOCK_DGRAM;
267}
268
251/* tsk_peer_msg - verify if message was sent by connected port's peer 269/* tsk_peer_msg - verify if message was sent by connected port's peer
252 * 270 *
253 * Handles cases where the node's network address has changed from 271 * Handles cases where the node's network address has changed from
@@ -255,12 +273,13 @@ static void tsk_rej_rx_queue(struct sock *sk)
255 */ 273 */
256static bool tsk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg) 274static bool tsk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg)
257{ 275{
258 struct tipc_net *tn = net_generic(sock_net(&tsk->sk), tipc_net_id); 276 struct sock *sk = &tsk->sk;
277 struct tipc_net *tn = net_generic(sock_net(sk), tipc_net_id);
259 u32 peer_port = tsk_peer_port(tsk); 278 u32 peer_port = tsk_peer_port(tsk);
260 u32 orig_node; 279 u32 orig_node;
261 u32 peer_node; 280 u32 peer_node;
262 281
263 if (unlikely(!tsk->connected)) 282 if (unlikely(!tipc_sk_connected(sk)))
264 return false; 283 return false;
265 284
266 if (unlikely(msg_origport(msg) != peer_port)) 285 if (unlikely(msg_origport(msg) != peer_port))
@@ -281,6 +300,88 @@ static bool tsk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg)
281 return false; 300 return false;
282} 301}
283 302
303/* tipc_set_sk_state - set the sk_state of the socket
304 * @sk: socket
305 *
306 * Caller must hold socket lock
307 *
308 * Returns 0 on success, errno otherwise
309 */
310static int tipc_set_sk_state(struct sock *sk, int state)
311{
312 int oldsk_state = sk->sk_state;
313 int res = -EINVAL;
314
315 switch (state) {
316 case TIPC_OPEN:
317 res = 0;
318 break;
319 case TIPC_LISTEN:
320 case TIPC_CONNECTING:
321 if (oldsk_state == TIPC_OPEN)
322 res = 0;
323 break;
324 case TIPC_ESTABLISHED:
325 if (oldsk_state == TIPC_CONNECTING ||
326 oldsk_state == TIPC_OPEN)
327 res = 0;
328 break;
329 case TIPC_DISCONNECTING:
330 if (oldsk_state == TIPC_CONNECTING ||
331 oldsk_state == TIPC_ESTABLISHED)
332 res = 0;
333 break;
334 }
335
336 if (!res)
337 sk->sk_state = state;
338
339 return res;
340}
341
342static int tipc_sk_sock_err(struct socket *sock, long *timeout)
343{
344 struct sock *sk = sock->sk;
345 int err = sock_error(sk);
346 int typ = sock->type;
347
348 if (err)
349 return err;
350 if (typ == SOCK_STREAM || typ == SOCK_SEQPACKET) {
351 if (sk->sk_state == TIPC_DISCONNECTING)
352 return -EPIPE;
353 else if (!tipc_sk_connected(sk))
354 return -ENOTCONN;
355 }
356 if (!*timeout)
357 return -EAGAIN;
358 if (signal_pending(current))
359 return sock_intr_errno(*timeout);
360
361 return 0;
362}
363
364#define tipc_wait_for_cond(sock_, timeout_, condition_) \
365({ \
366 int rc_ = 0; \
367 int done_ = 0; \
368 \
369 while (!(condition_) && !done_) { \
370 struct sock *sk_ = sock->sk; \
371 DEFINE_WAIT_FUNC(wait_, woken_wake_function); \
372 \
373 rc_ = tipc_sk_sock_err(sock_, timeout_); \
374 if (rc_) \
375 break; \
376 prepare_to_wait(sk_sleep(sk_), &wait_, \
377 TASK_INTERRUPTIBLE); \
378 done_ = sk_wait_event(sk_, timeout_, \
379 (condition_), &wait_); \
380 remove_wait_queue(sk_sleep(sk_), &wait_); \
381 } \
382 rc_; \
383})
384
284/** 385/**
285 * tipc_sk_create - create a TIPC socket 386 * tipc_sk_create - create a TIPC socket
286 * @net: network namespace (must be default network) 387 * @net: network namespace (must be default network)
@@ -298,7 +399,6 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
298{ 399{
299 struct tipc_net *tn; 400 struct tipc_net *tn;
300 const struct proto_ops *ops; 401 const struct proto_ops *ops;
301 socket_state state;
302 struct sock *sk; 402 struct sock *sk;
303 struct tipc_sock *tsk; 403 struct tipc_sock *tsk;
304 struct tipc_msg *msg; 404 struct tipc_msg *msg;
@@ -310,16 +410,13 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
310 switch (sock->type) { 410 switch (sock->type) {
311 case SOCK_STREAM: 411 case SOCK_STREAM:
312 ops = &stream_ops; 412 ops = &stream_ops;
313 state = SS_UNCONNECTED;
314 break; 413 break;
315 case SOCK_SEQPACKET: 414 case SOCK_SEQPACKET:
316 ops = &packet_ops; 415 ops = &packet_ops;
317 state = SS_UNCONNECTED;
318 break; 416 break;
319 case SOCK_DGRAM: 417 case SOCK_DGRAM:
320 case SOCK_RDM: 418 case SOCK_RDM:
321 ops = &msg_ops; 419 ops = &msg_ops;
322 state = SS_READY;
323 break; 420 break;
324 default: 421 default:
325 return -EPROTOTYPE; 422 return -EPROTOTYPE;
@@ -333,21 +430,28 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
333 tsk = tipc_sk(sk); 430 tsk = tipc_sk(sk);
334 tsk->max_pkt = MAX_PKT_DEFAULT; 431 tsk->max_pkt = MAX_PKT_DEFAULT;
335 INIT_LIST_HEAD(&tsk->publications); 432 INIT_LIST_HEAD(&tsk->publications);
433 INIT_LIST_HEAD(&tsk->cong_links);
336 msg = &tsk->phdr; 434 msg = &tsk->phdr;
337 tn = net_generic(sock_net(sk), tipc_net_id); 435 tn = net_generic(sock_net(sk), tipc_net_id);
338 tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG,
339 NAMED_H_SIZE, 0);
340 436
341 /* Finish initializing socket data structures */ 437 /* Finish initializing socket data structures */
342 sock->ops = ops; 438 sock->ops = ops;
343 sock->state = state;
344 sock_init_data(sock, sk); 439 sock_init_data(sock, sk);
440 tipc_set_sk_state(sk, TIPC_OPEN);
345 if (tipc_sk_insert(tsk)) { 441 if (tipc_sk_insert(tsk)) {
346 pr_warn("Socket create failed; port number exhausted\n"); 442 pr_warn("Socket create failed; port number exhausted\n");
347 return -EINVAL; 443 return -EINVAL;
348 } 444 }
445
446 /* Ensure tsk is visible before we read own_addr. */
447 smp_mb();
448
449 tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG,
450 NAMED_H_SIZE, 0);
451
349 msg_set_origport(msg, tsk->portid); 452 msg_set_origport(msg, tsk->portid);
350 setup_timer(&sk->sk_timer, tipc_sk_timeout, (unsigned long)tsk); 453 setup_timer(&sk->sk_timer, tipc_sk_timeout, (unsigned long)tsk);
454 sk->sk_shutdown = 0;
351 sk->sk_backlog_rcv = tipc_backlog_rcv; 455 sk->sk_backlog_rcv = tipc_backlog_rcv;
352 sk->sk_rcvbuf = sysctl_tipc_rmem[1]; 456 sk->sk_rcvbuf = sysctl_tipc_rmem[1];
353 sk->sk_data_ready = tipc_data_ready; 457 sk->sk_data_ready = tipc_data_ready;
@@ -360,11 +464,12 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
360 tsk->snd_win = tsk_adv_blocks(RCVBUF_MIN); 464 tsk->snd_win = tsk_adv_blocks(RCVBUF_MIN);
361 tsk->rcv_win = tsk->snd_win; 465 tsk->rcv_win = tsk->snd_win;
362 466
363 if (sock->state == SS_READY) { 467 if (tipc_sk_type_connectionless(sk)) {
364 tsk_set_unreturnable(tsk, true); 468 tsk_set_unreturnable(tsk, true);
365 if (sock->type == SOCK_DGRAM) 469 if (sock->type == SOCK_DGRAM)
366 tsk_set_unreliable(tsk, true); 470 tsk_set_unreliable(tsk, true);
367 } 471 }
472
368 return 0; 473 return 0;
369} 474}
370 475
@@ -375,6 +480,51 @@ static void tipc_sk_callback(struct rcu_head *head)
375 sock_put(&tsk->sk); 480 sock_put(&tsk->sk);
376} 481}
377 482
 483/* Caller should hold the socket lock. */
484static void __tipc_shutdown(struct socket *sock, int error)
485{
486 struct sock *sk = sock->sk;
487 struct tipc_sock *tsk = tipc_sk(sk);
488 struct net *net = sock_net(sk);
489 long timeout = CONN_TIMEOUT_DEFAULT;
490 u32 dnode = tsk_peer_node(tsk);
491 struct sk_buff *skb;
492
 493 /* Prevent hi-prio shutdown msgs from bypassing msgs in link wakeup queue */
494 tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt &&
495 !tsk_conn_cong(tsk)));
496
497 /* Reject all unreceived messages, except on an active connection
498 * (which disconnects locally & sends a 'FIN+' to peer).
499 */
500 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
501 if (TIPC_SKB_CB(skb)->bytes_read) {
502 kfree_skb(skb);
503 continue;
504 }
505 if (!tipc_sk_type_connectionless(sk) &&
506 sk->sk_state != TIPC_DISCONNECTING) {
507 tipc_set_sk_state(sk, TIPC_DISCONNECTING);
508 tipc_node_remove_conn(net, dnode, tsk->portid);
509 }
510 tipc_sk_respond(sk, skb, error);
511 }
512
513 if (tipc_sk_type_connectionless(sk))
514 return;
515
516 if (sk->sk_state != TIPC_DISCONNECTING) {
517 skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE,
518 TIPC_CONN_MSG, SHORT_H_SIZE, 0, dnode,
519 tsk_own_node(tsk), tsk_peer_port(tsk),
520 tsk->portid, error);
521 if (skb)
522 tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
523 tipc_node_remove_conn(net, dnode, tsk->portid);
524 tipc_set_sk_state(sk, TIPC_DISCONNECTING);
525 }
526}
527
378/** 528/**
379 * tipc_release - destroy a TIPC socket 529 * tipc_release - destroy a TIPC socket
380 * @sock: socket to destroy 530 * @sock: socket to destroy
@@ -394,10 +544,7 @@ static void tipc_sk_callback(struct rcu_head *head)
394static int tipc_release(struct socket *sock) 544static int tipc_release(struct socket *sock)
395{ 545{
396 struct sock *sk = sock->sk; 546 struct sock *sk = sock->sk;
397 struct net *net;
398 struct tipc_sock *tsk; 547 struct tipc_sock *tsk;
399 struct sk_buff *skb;
400 u32 dnode;
401 548
402 /* 549 /*
403 * Exit if socket isn't fully initialized (occurs when a failed accept() 550 * Exit if socket isn't fully initialized (occurs when a failed accept()
@@ -406,49 +553,19 @@ static int tipc_release(struct socket *sock)
406 if (sk == NULL) 553 if (sk == NULL)
407 return 0; 554 return 0;
408 555
409 net = sock_net(sk);
410 tsk = tipc_sk(sk); 556 tsk = tipc_sk(sk);
411 lock_sock(sk); 557 lock_sock(sk);
412 558
413 /* 559 __tipc_shutdown(sock, TIPC_ERR_NO_PORT);
414 * Reject all unreceived messages, except on an active connection 560 sk->sk_shutdown = SHUTDOWN_MASK;
415 * (which disconnects locally & sends a 'FIN+' to peer)
416 */
417 dnode = tsk_peer_node(tsk);
418 while (sock->state != SS_DISCONNECTING) {
419 skb = __skb_dequeue(&sk->sk_receive_queue);
420 if (skb == NULL)
421 break;
422 if (TIPC_SKB_CB(skb)->handle != NULL)
423 kfree_skb(skb);
424 else {
425 if ((sock->state == SS_CONNECTING) ||
426 (sock->state == SS_CONNECTED)) {
427 sock->state = SS_DISCONNECTING;
428 tsk->connected = 0;
429 tipc_node_remove_conn(net, dnode, tsk->portid);
430 }
431 tipc_sk_respond(sk, skb, TIPC_ERR_NO_PORT);
432 }
433 }
434
435 tipc_sk_withdraw(tsk, 0, NULL); 561 tipc_sk_withdraw(tsk, 0, NULL);
436 sk_stop_timer(sk, &sk->sk_timer); 562 sk_stop_timer(sk, &sk->sk_timer);
437 tipc_sk_remove(tsk); 563 tipc_sk_remove(tsk);
438 if (tsk->connected) {
439 skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE,
440 TIPC_CONN_MSG, SHORT_H_SIZE, 0, dnode,
441 tsk_own_node(tsk), tsk_peer_port(tsk),
442 tsk->portid, TIPC_ERR_NO_PORT);
443 if (skb)
444 tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
445 tipc_node_remove_conn(net, dnode, tsk->portid);
446 }
447 564
448 /* Reject any messages that accumulated in backlog queue */ 565 /* Reject any messages that accumulated in backlog queue */
449 sock->state = SS_DISCONNECTING;
450 release_sock(sk); 566 release_sock(sk);
451 567 u32_list_purge(&tsk->cong_links);
568 tsk->cong_link_cnt = 0;
452 call_rcu(&tsk->rcu, tipc_sk_callback); 569 call_rcu(&tsk->rcu, tipc_sk_callback);
453 sock->sk = NULL; 570 sock->sk = NULL;
454 571
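Both __tipc_shutdown() above and the send paths later in this file block through the new tipc_wait_for_cond() macro, which bundles the prepare_to_wait()/sk_wait_event() sequence with the socket-error, timeout and signal checks from tipc_sk_sock_err(). A hedged usage sketch (the wrapper below is illustrative, not from the patch); note that sk_wait_event() releases and re-takes the socket lock around the actual sleep, so the condition is always re-evaluated with the lock held:

/* Sleep, with the socket locked, until no link towards the peer is
 * congested, or until sndtimeo / a socket error / a signal ends the wait.
 */
static int demo_wait_uncongested(struct socket *sock, struct tipc_sock *tsk)
{
        long timeout = sock_sndtimeo(sock->sk, false);

        return tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt);
}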
@@ -532,13 +649,14 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
532 int *uaddr_len, int peer) 649 int *uaddr_len, int peer)
533{ 650{
534 struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr; 651 struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr;
535 struct tipc_sock *tsk = tipc_sk(sock->sk); 652 struct sock *sk = sock->sk;
653 struct tipc_sock *tsk = tipc_sk(sk);
536 struct tipc_net *tn = net_generic(sock_net(sock->sk), tipc_net_id); 654 struct tipc_net *tn = net_generic(sock_net(sock->sk), tipc_net_id);
537 655
538 memset(addr, 0, sizeof(*addr)); 656 memset(addr, 0, sizeof(*addr));
539 if (peer) { 657 if (peer) {
540 if ((sock->state != SS_CONNECTED) && 658 if ((!tipc_sk_connected(sk)) &&
541 ((peer != 2) || (sock->state != SS_DISCONNECTING))) 659 ((peer != 2) || (sk->sk_state != TIPC_DISCONNECTING)))
542 return -ENOTCONN; 660 return -ENOTCONN;
543 addr->addr.id.ref = tsk_peer_port(tsk); 661 addr->addr.id.ref = tsk_peer_port(tsk);
544 addr->addr.id.node = tsk_peer_node(tsk); 662 addr->addr.id.node = tsk_peer_node(tsk);
@@ -570,28 +688,6 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
570 * exits. TCP and other protocols seem to rely on higher level poll routines 688 * exits. TCP and other protocols seem to rely on higher level poll routines
571 * to handle any preventable race conditions, so TIPC will do the same ... 689 * to handle any preventable race conditions, so TIPC will do the same ...
572 * 690 *
573 * TIPC sets the returned events as follows:
574 *
575 * socket state flags set
576 * ------------ ---------
577 * unconnected no read flags
578 * POLLOUT if port is not congested
579 *
580 * connecting POLLIN/POLLRDNORM if ACK/NACK in rx queue
581 * no write flags
582 *
583 * connected POLLIN/POLLRDNORM if data in rx queue
584 * POLLOUT if port is not congested
585 *
586 * disconnecting POLLIN/POLLRDNORM/POLLHUP
587 * no write flags
588 *
589 * listening POLLIN if SYN in rx queue
590 * no write flags
591 *
592 * ready POLLIN/POLLRDNORM if data in rx queue
593 * [connectionless] POLLOUT (since port cannot be congested)
594 *
595 * IMPORTANT: The fact that a read or write operation is indicated does NOT 691 * IMPORTANT: The fact that a read or write operation is indicated does NOT
596 * imply that the operation will succeed, merely that it should be performed 692 * imply that the operation will succeed, merely that it should be performed
597 * and will not block. 693 * and will not block.
@@ -605,22 +701,29 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
605 701
606 sock_poll_wait(file, sk_sleep(sk), wait); 702 sock_poll_wait(file, sk_sleep(sk), wait);
607 703
608 switch ((int)sock->state) { 704 if (sk->sk_shutdown & RCV_SHUTDOWN)
609 case SS_UNCONNECTED: 705 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
610 if (!tsk->link_cong) 706 if (sk->sk_shutdown == SHUTDOWN_MASK)
611 mask |= POLLOUT; 707 mask |= POLLHUP;
612 break; 708
613 case SS_READY: 709 switch (sk->sk_state) {
614 case SS_CONNECTED: 710 case TIPC_ESTABLISHED:
615 if (!tsk->link_cong && !tsk_conn_cong(tsk)) 711 if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))
616 mask |= POLLOUT; 712 mask |= POLLOUT;
617 /* fall thru' */ 713 /* fall thru' */
618 case SS_CONNECTING: 714 case TIPC_LISTEN:
619 case SS_LISTENING: 715 case TIPC_CONNECTING:
620 if (!skb_queue_empty(&sk->sk_receive_queue)) 716 if (!skb_queue_empty(&sk->sk_receive_queue))
621 mask |= (POLLIN | POLLRDNORM); 717 mask |= (POLLIN | POLLRDNORM);
622 break; 718 break;
623 case SS_DISCONNECTING: 719 case TIPC_OPEN:
720 if (!tsk->cong_link_cnt)
721 mask |= POLLOUT;
722 if (tipc_sk_type_connectionless(sk) &&
723 (!skb_queue_empty(&sk->sk_receive_queue)))
724 mask |= (POLLIN | POLLRDNORM);
725 break;
726 case TIPC_DISCONNECTING:
624 mask = (POLLIN | POLLRDNORM | POLLHUP); 727 mask = (POLLIN | POLLRDNORM | POLLHUP);
625 break; 728 break;
626 } 729 }
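The poll rewrite above is keyed on sk->sk_state rather than the old sock->state, and the TIPC states are aliased onto the generic TCP_* values (see the enum earlier in this file), presumably so that generic code inspecting sk_state stays coherent; sock_init_data(), for instance, starts every socket in TCP_CLOSE, which is exactly TIPC_OPEN. A tiny illustrative helper, not taken from the patch:

static inline bool demo_sk_disconnecting(const struct sock *sk)
{
        return sk->sk_state == TIPC_DISCONNECTING;      /* == TCP_CLOSE_WAIT */
}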
@@ -633,60 +736,60 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
633 * @sock: socket structure 736 * @sock: socket structure
634 * @seq: destination address 737 * @seq: destination address
635 * @msg: message to send 738 * @msg: message to send
636 * @dsz: total length of message data 739 * @dlen: length of data to send
637 * @timeo: timeout to wait for wakeup 740 * @timeout: timeout to wait for wakeup
638 * 741 *
639 * Called from function tipc_sendmsg(), which has done all sanity checks 742 * Called from function tipc_sendmsg(), which has done all sanity checks
640 * Returns the number of bytes sent on success, or errno 743 * Returns the number of bytes sent on success, or errno
641 */ 744 */
642static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, 745static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq,
643 struct msghdr *msg, size_t dsz, long timeo) 746 struct msghdr *msg, size_t dlen, long timeout)
644{ 747{
645 struct sock *sk = sock->sk; 748 struct sock *sk = sock->sk;
646 struct tipc_sock *tsk = tipc_sk(sk); 749 struct tipc_sock *tsk = tipc_sk(sk);
750 struct tipc_msg *hdr = &tsk->phdr;
647 struct net *net = sock_net(sk); 751 struct net *net = sock_net(sk);
648 struct tipc_msg *mhdr = &tsk->phdr; 752 int mtu = tipc_bcast_get_mtu(net);
649 struct sk_buff_head pktchain; 753 struct tipc_mc_method *method = &tsk->mc_method;
650 struct iov_iter save = msg->msg_iter; 754 u32 domain = addr_domain(net, TIPC_CLUSTER_SCOPE);
651 uint mtu; 755 struct sk_buff_head pkts;
756 struct tipc_nlist dsts;
652 int rc; 757 int rc;
653 758
654 msg_set_type(mhdr, TIPC_MCAST_MSG); 759 /* Block or return if any destination link is congested */
655 msg_set_lookup_scope(mhdr, TIPC_CLUSTER_SCOPE); 760 rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt);
656 msg_set_destport(mhdr, 0); 761 if (unlikely(rc))
657 msg_set_destnode(mhdr, 0);
658 msg_set_nametype(mhdr, seq->type);
659 msg_set_namelower(mhdr, seq->lower);
660 msg_set_nameupper(mhdr, seq->upper);
661 msg_set_hdr_sz(mhdr, MCAST_H_SIZE);
662
663 skb_queue_head_init(&pktchain);
664
665new_mtu:
666 mtu = tipc_bcast_get_mtu(net);
667 rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, &pktchain);
668 if (unlikely(rc < 0))
669 return rc; 762 return rc;
670 763
671 do { 764 /* Lookup destination nodes */
672 rc = tipc_bcast_xmit(net, &pktchain); 765 tipc_nlist_init(&dsts, tipc_own_addr(net));
673 if (likely(!rc)) 766 tipc_nametbl_lookup_dst_nodes(net, seq->type, seq->lower,
674 return dsz; 767 seq->upper, domain, &dsts);
675 768 if (!dsts.local && !dsts.remote)
676 if (rc == -ELINKCONG) { 769 return -EHOSTUNREACH;
677 tsk->link_cong = 1; 770
678 rc = tipc_wait_for_sndmsg(sock, &timeo); 771 /* Build message header */
679 if (!rc) 772 msg_set_type(hdr, TIPC_MCAST_MSG);
680 continue; 773 msg_set_hdr_sz(hdr, MCAST_H_SIZE);
681 } 774 msg_set_lookup_scope(hdr, TIPC_CLUSTER_SCOPE);
682 __skb_queue_purge(&pktchain); 775 msg_set_destport(hdr, 0);
683 if (rc == -EMSGSIZE) { 776 msg_set_destnode(hdr, 0);
684 msg->msg_iter = save; 777 msg_set_nametype(hdr, seq->type);
685 goto new_mtu; 778 msg_set_namelower(hdr, seq->lower);
686 } 779 msg_set_nameupper(hdr, seq->upper);
687 break; 780
688 } while (1); 781 /* Build message as chain of buffers */
689 return rc; 782 skb_queue_head_init(&pkts);
783 rc = tipc_msg_build(hdr, msg, 0, dlen, mtu, &pkts);
784
785 /* Send message if build was successful */
786 if (unlikely(rc == dlen))
787 rc = tipc_mcast_xmit(net, &pkts, method, &dsts,
788 &tsk->cong_link_cnt);
789
790 tipc_nlist_purge(&dsts);
791
792 return rc ? rc : dlen;
690} 793}
691 794
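The rewritten tipc_sendmcast() follows a fixed order: wait until no link is congested, resolve the name sequence to a destination node list, build the buffer chain once against the broadcast MTU, then let tipc_mcast_xmit() choose between true broadcast and replicast based on tsk->mc_method and the destination list. A condensed sketch of that order (names mirror the patch; header-field setup and some checks are elided):

static int demo_sendmcast(struct socket *sock, struct tipc_name_seq *seq,
                          struct msghdr *m, size_t dlen, long timeout)
{
        struct sock *sk = sock->sk;
        struct tipc_sock *tsk = tipc_sk(sk);
        struct net *net = sock_net(sk);
        struct sk_buff_head pkts;
        struct tipc_nlist dsts;
        int rc;

        /* 1. never start a multicast while any link is congested */
        rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt);
        if (unlikely(rc))
                return rc;

        /* 2. resolve <type,lower,upper> to the set of destination nodes */
        tipc_nlist_init(&dsts, tipc_own_addr(net));
        tipc_nametbl_lookup_dst_nodes(net, seq->type, seq->lower, seq->upper,
                                      addr_domain(net, TIPC_CLUSTER_SCOPE),
                                      &dsts);
        if (!dsts.local && !dsts.remote)
                return -EHOSTUNREACH;

        /* 3. build once against the broadcast MTU ... */
        skb_queue_head_init(&pkts);
        rc = tipc_msg_build(&tsk->phdr, m, 0, dlen,
                            tipc_bcast_get_mtu(net), &pkts);
        if (unlikely(rc != dlen)) {
                tipc_nlist_purge(&dsts);
                return rc;
        }

        /* 4. ... and let the mcast layer pick broadcast vs. replicast */
        rc = tipc_mcast_xmit(net, &pkts, &tsk->mc_method, &dsts,
                             &tsk->cong_link_cnt);
        tipc_nlist_purge(&dsts);
        return rc ? rc : dlen;
}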
692/** 795/**
@@ -700,7 +803,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
700 struct sk_buff_head *inputq) 803 struct sk_buff_head *inputq)
701{ 804{
702 struct tipc_msg *msg; 805 struct tipc_msg *msg;
703 struct tipc_plist dports; 806 struct list_head dports;
704 u32 portid; 807 u32 portid;
705 u32 scope = TIPC_CLUSTER_SCOPE; 808 u32 scope = TIPC_CLUSTER_SCOPE;
706 struct sk_buff_head tmpq; 809 struct sk_buff_head tmpq;
@@ -708,7 +811,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
708 struct sk_buff *skb, *_skb; 811 struct sk_buff *skb, *_skb;
709 812
710 __skb_queue_head_init(&tmpq); 813 __skb_queue_head_init(&tmpq);
711 tipc_plist_init(&dports); 814 INIT_LIST_HEAD(&dports);
712 815
713 skb = tipc_skb_peek(arrvq, &inputq->lock); 816 skb = tipc_skb_peek(arrvq, &inputq->lock);
714 for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) { 817 for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) {
@@ -722,8 +825,8 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
722 tipc_nametbl_mc_translate(net, 825 tipc_nametbl_mc_translate(net,
723 msg_nametype(msg), msg_namelower(msg), 826 msg_nametype(msg), msg_namelower(msg),
724 msg_nameupper(msg), scope, &dports); 827 msg_nameupper(msg), scope, &dports);
725 portid = tipc_plist_pop(&dports); 828 portid = u32_pop(&dports);
726 for (; portid; portid = tipc_plist_pop(&dports)) { 829 for (; portid; portid = u32_pop(&dports)) {
727 _skb = __pskb_copy(skb, hsz, GFP_ATOMIC); 830 _skb = __pskb_copy(skb, hsz, GFP_ATOMIC);
728 if (_skb) { 831 if (_skb) {
729 msg_set_destport(buf_msg(_skb), portid); 832 msg_set_destport(buf_msg(_skb), portid);
@@ -763,7 +866,7 @@ static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb,
763 if (!tsk_peer_msg(tsk, hdr)) 866 if (!tsk_peer_msg(tsk, hdr))
764 goto exit; 867 goto exit;
765 868
766 tsk->probing_state = TIPC_CONN_OK; 869 tsk->probe_unacked = false;
767 870
768 if (mtyp == CONN_PROBE) { 871 if (mtyp == CONN_PROBE) {
769 msg_set_type(hdr, CONN_PROBE_REPLY); 872 msg_set_type(hdr, CONN_PROBE_REPLY);
@@ -784,31 +887,6 @@ exit:
784 kfree_skb(skb); 887 kfree_skb(skb);
785} 888}
786 889
787static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p)
788{
789 struct sock *sk = sock->sk;
790 struct tipc_sock *tsk = tipc_sk(sk);
791 DEFINE_WAIT(wait);
792 int done;
793
794 do {
795 int err = sock_error(sk);
796 if (err)
797 return err;
798 if (sock->state == SS_DISCONNECTING)
799 return -EPIPE;
800 if (!*timeo_p)
801 return -EAGAIN;
802 if (signal_pending(current))
803 return sock_intr_errno(*timeo_p);
804
805 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
806 done = sk_wait_event(sk, timeo_p, !tsk->link_cong);
807 finish_wait(sk_sleep(sk), &wait);
808 } while (!done);
809 return 0;
810}
811
812/** 890/**
813 * tipc_sendmsg - send message in connectionless manner 891 * tipc_sendmsg - send message in connectionless manner
814 * @sock: socket structure 892 * @sock: socket structure
@@ -835,37 +913,41 @@ static int tipc_sendmsg(struct socket *sock,
835 return ret; 913 return ret;
836} 914}
837 915
838static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz) 916static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
839{ 917{
840 DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
841 struct sock *sk = sock->sk; 918 struct sock *sk = sock->sk;
842 struct tipc_sock *tsk = tipc_sk(sk);
843 struct net *net = sock_net(sk); 919 struct net *net = sock_net(sk);
844 struct tipc_msg *mhdr = &tsk->phdr; 920 struct tipc_sock *tsk = tipc_sk(sk);
845 u32 dnode, dport; 921 DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
846 struct sk_buff_head pktchain; 922 long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
847 struct sk_buff *skb; 923 struct list_head *clinks = &tsk->cong_links;
924 bool syn = !tipc_sk_type_connectionless(sk);
925 struct tipc_msg *hdr = &tsk->phdr;
848 struct tipc_name_seq *seq; 926 struct tipc_name_seq *seq;
849 struct iov_iter save; 927 struct sk_buff_head pkts;
850 u32 mtu; 928 u32 type, inst, domain;
851 long timeo; 929 u32 dnode, dport;
852 int rc; 930 int mtu, rc;
853 931
854 if (dsz > TIPC_MAX_USER_MSG_SIZE) 932 if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE))
855 return -EMSGSIZE; 933 return -EMSGSIZE;
934
856 if (unlikely(!dest)) { 935 if (unlikely(!dest)) {
857 if (tsk->connected && sock->state == SS_READY) 936 dest = &tsk->peer;
858 dest = &tsk->remote; 937 if (!syn || dest->family != AF_TIPC)
859 else
860 return -EDESTADDRREQ; 938 return -EDESTADDRREQ;
861 } else if (unlikely(m->msg_namelen < sizeof(*dest)) ||
862 dest->family != AF_TIPC) {
863 return -EINVAL;
864 } 939 }
865 if (unlikely(sock->state != SS_READY)) { 940
866 if (sock->state == SS_LISTENING) 941 if (unlikely(m->msg_namelen < sizeof(*dest)))
942 return -EINVAL;
943
944 if (unlikely(dest->family != AF_TIPC))
945 return -EINVAL;
946
947 if (unlikely(syn)) {
948 if (sk->sk_state == TIPC_LISTEN)
867 return -EPIPE; 949 return -EPIPE;
868 if (sock->state != SS_UNCONNECTED) 950 if (sk->sk_state != TIPC_OPEN)
869 return -EISCONN; 951 return -EISCONN;
870 if (tsk->published) 952 if (tsk->published)
871 return -EOPNOTSUPP; 953 return -EOPNOTSUPP;
@@ -874,102 +956,62 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz)
874 tsk->conn_instance = dest->addr.name.name.instance; 956 tsk->conn_instance = dest->addr.name.name.instance;
875 } 957 }
876 } 958 }
877 seq = &dest->addr.nameseq;
878 timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
879 959
880 if (dest->addrtype == TIPC_ADDR_MCAST) { 960 seq = &dest->addr.nameseq;
881 return tipc_sendmcast(sock, seq, m, dsz, timeo); 961 if (dest->addrtype == TIPC_ADDR_MCAST)
882 } else if (dest->addrtype == TIPC_ADDR_NAME) { 962 return tipc_sendmcast(sock, seq, m, dlen, timeout);
883 u32 type = dest->addr.name.name.type;
884 u32 inst = dest->addr.name.name.instance;
885 u32 domain = dest->addr.name.domain;
886 963
964 if (dest->addrtype == TIPC_ADDR_NAME) {
965 type = dest->addr.name.name.type;
966 inst = dest->addr.name.name.instance;
967 domain = dest->addr.name.domain;
887 dnode = domain; 968 dnode = domain;
888 msg_set_type(mhdr, TIPC_NAMED_MSG); 969 msg_set_type(hdr, TIPC_NAMED_MSG);
889 msg_set_hdr_sz(mhdr, NAMED_H_SIZE); 970 msg_set_hdr_sz(hdr, NAMED_H_SIZE);
890 msg_set_nametype(mhdr, type); 971 msg_set_nametype(hdr, type);
891 msg_set_nameinst(mhdr, inst); 972 msg_set_nameinst(hdr, inst);
892 msg_set_lookup_scope(mhdr, tipc_addr_scope(domain)); 973 msg_set_lookup_scope(hdr, tipc_addr_scope(domain));
893 dport = tipc_nametbl_translate(net, type, inst, &dnode); 974 dport = tipc_nametbl_translate(net, type, inst, &dnode);
894 msg_set_destnode(mhdr, dnode); 975 msg_set_destnode(hdr, dnode);
895 msg_set_destport(mhdr, dport); 976 msg_set_destport(hdr, dport);
896 if (unlikely(!dport && !dnode)) 977 if (unlikely(!dport && !dnode))
897 return -EHOSTUNREACH; 978 return -EHOSTUNREACH;
979
898 } else if (dest->addrtype == TIPC_ADDR_ID) { 980 } else if (dest->addrtype == TIPC_ADDR_ID) {
899 dnode = dest->addr.id.node; 981 dnode = dest->addr.id.node;
900 msg_set_type(mhdr, TIPC_DIRECT_MSG); 982 msg_set_type(hdr, TIPC_DIRECT_MSG);
901 msg_set_lookup_scope(mhdr, 0); 983 msg_set_lookup_scope(hdr, 0);
902 msg_set_destnode(mhdr, dnode); 984 msg_set_destnode(hdr, dnode);
903 msg_set_destport(mhdr, dest->addr.id.ref); 985 msg_set_destport(hdr, dest->addr.id.ref);
904 msg_set_hdr_sz(mhdr, BASIC_H_SIZE); 986 msg_set_hdr_sz(hdr, BASIC_H_SIZE);
905 } 987 }
906 988
907 skb_queue_head_init(&pktchain); 989 /* Block or return if destination link is congested */
908 save = m->msg_iter; 990 rc = tipc_wait_for_cond(sock, &timeout, !u32_find(clinks, dnode));
909new_mtu: 991 if (unlikely(rc))
910 mtu = tipc_node_get_mtu(net, dnode, tsk->portid);
911 rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, &pktchain);
912 if (rc < 0)
913 return rc; 992 return rc;
914 993
915 do { 994 skb_queue_head_init(&pkts);
916 skb = skb_peek(&pktchain); 995 mtu = tipc_node_get_mtu(net, dnode, tsk->portid);
917 TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong; 996 rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);
918 rc = tipc_node_xmit(net, &pktchain, dnode, tsk->portid); 997 if (unlikely(rc != dlen))
919 if (likely(!rc)) { 998 return rc;
920 if (sock->state != SS_READY)
921 sock->state = SS_CONNECTING;
922 return dsz;
923 }
924 if (rc == -ELINKCONG) {
925 tsk->link_cong = 1;
926 rc = tipc_wait_for_sndmsg(sock, &timeo);
927 if (!rc)
928 continue;
929 }
930 __skb_queue_purge(&pktchain);
931 if (rc == -EMSGSIZE) {
932 m->msg_iter = save;
933 goto new_mtu;
934 }
935 break;
936 } while (1);
937
938 return rc;
939}
940 999
941static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p) 1000 rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid);
942{ 1001 if (unlikely(rc == -ELINKCONG)) {
943 struct sock *sk = sock->sk; 1002 u32_push(clinks, dnode);
944 struct tipc_sock *tsk = tipc_sk(sk); 1003 tsk->cong_link_cnt++;
945 DEFINE_WAIT(wait); 1004 rc = 0;
946 int done; 1005 }
947 1006
948 do { 1007 if (unlikely(syn && !rc))
949 int err = sock_error(sk); 1008 tipc_set_sk_state(sk, TIPC_CONNECTING);
950 if (err)
951 return err;
952 if (sock->state == SS_DISCONNECTING)
953 return -EPIPE;
954 else if (sock->state != SS_CONNECTED)
955 return -ENOTCONN;
956 if (!*timeo_p)
957 return -EAGAIN;
958 if (signal_pending(current))
959 return sock_intr_errno(*timeo_p);
960 1009
961 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 1010 return rc ? rc : dlen;
962 done = sk_wait_event(sk, timeo_p,
963 (!tsk->link_cong &&
964 !tsk_conn_cong(tsk)) ||
965 !tsk->connected);
966 finish_wait(sk_sleep(sk), &wait);
967 } while (!done);
968 return 0;
969} 1011}
970 1012
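The datagram path above replaces the old link_cong flag and retry loop with per-destination bookkeeping: -ELINKCONG is no longer bounced back to the caller; instead the congested node is pushed onto tsk->cong_links, cong_link_cnt is bumped, the message counts as accepted, and subsequent sends to that node block in tipc_wait_for_cond() until the entry is removed again. A sketch of that bookkeeping (names mirror the patch, the wrapper function itself is illustrative):

static int demo_dgram_xmit(struct socket *sock, struct tipc_sock *tsk,
                           struct sk_buff_head *pkts, u32 dnode, long timeout)
{
        struct list_head *clinks = &tsk->cong_links;
        int rc;

        /* block only if *this* destination is already marked congested */
        rc = tipc_wait_for_cond(sock, &timeout, !u32_find(clinks, dnode));
        if (unlikely(rc))
                return rc;

        rc = tipc_node_xmit(sock_net(sock->sk), pkts, dnode, tsk->portid);
        if (unlikely(rc == -ELINKCONG)) {
                u32_push(clinks, dnode);        /* remember the congested peer */
                tsk->cong_link_cnt++;
                rc = 0;                         /* message was still accepted */
        }
        return rc;
}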
971/** 1013/**
972 * tipc_send_stream - send stream-oriented data 1014 * tipc_sendstream - send stream-oriented data
973 * @sock: socket structure 1015 * @sock: socket structure
974 * @m: data to send 1016 * @m: data to send
975 * @dsz: total length of data to be transmitted 1017 * @dsz: total length of data to be transmitted
@@ -979,91 +1021,69 @@ static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p)
979 * Returns the number of bytes sent on success (or partial success), 1021 * Returns the number of bytes sent on success (or partial success),
980 * or errno if no data sent 1022 * or errno if no data sent
981 */ 1023 */
982static int tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz) 1024static int tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz)
983{ 1025{
984 struct sock *sk = sock->sk; 1026 struct sock *sk = sock->sk;
985 int ret; 1027 int ret;
986 1028
987 lock_sock(sk); 1029 lock_sock(sk);
988 ret = __tipc_send_stream(sock, m, dsz); 1030 ret = __tipc_sendstream(sock, m, dsz);
989 release_sock(sk); 1031 release_sock(sk);
990 1032
991 return ret; 1033 return ret;
992} 1034}
993 1035
994static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz) 1036static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen)
995{ 1037{
996 struct sock *sk = sock->sk; 1038 struct sock *sk = sock->sk;
997 struct net *net = sock_net(sk);
998 struct tipc_sock *tsk = tipc_sk(sk);
999 struct tipc_msg *mhdr = &tsk->phdr;
1000 struct sk_buff_head pktchain;
1001 DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); 1039 DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
1002 u32 portid = tsk->portid; 1040 long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
1003 int rc = -EINVAL; 1041 struct tipc_sock *tsk = tipc_sk(sk);
1004 long timeo; 1042 struct tipc_msg *hdr = &tsk->phdr;
1005 u32 dnode; 1043 struct net *net = sock_net(sk);
1006 uint mtu, send, sent = 0; 1044 struct sk_buff_head pkts;
1007 struct iov_iter save; 1045 u32 dnode = tsk_peer_node(tsk);
1008 int hlen = MIN_H_SIZE; 1046 int send, sent = 0;
1009 1047 int rc = 0;
1010 /* Handle implied connection establishment */
1011 if (unlikely(dest)) {
1012 rc = __tipc_sendmsg(sock, m, dsz);
1013 hlen = msg_hdr_sz(mhdr);
1014 if (dsz && (dsz == rc))
1015 tsk->snt_unacked = tsk_inc(tsk, dsz + hlen);
1016 return rc;
1017 }
1018 if (dsz > (uint)INT_MAX)
1019 return -EMSGSIZE;
1020 1048
1021 if (unlikely(sock->state != SS_CONNECTED)) { 1049 skb_queue_head_init(&pkts);
1022 if (sock->state == SS_DISCONNECTING)
1023 return -EPIPE;
1024 else
1025 return -ENOTCONN;
1026 }
1027 1050
1028 timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); 1051 if (unlikely(dlen > INT_MAX))
1029 dnode = tsk_peer_node(tsk); 1052 return -EMSGSIZE;
1030 skb_queue_head_init(&pktchain);
1031 1053
1032next: 1054 /* Handle implicit connection setup */
1033 save = m->msg_iter; 1055 if (unlikely(dest)) {
1034 mtu = tsk->max_pkt; 1056 rc = __tipc_sendmsg(sock, m, dlen);
1035 send = min_t(uint, dsz - sent, TIPC_MAX_USER_MSG_SIZE); 1057 if (dlen && (dlen == rc))
1036 rc = tipc_msg_build(mhdr, m, sent, send, mtu, &pktchain); 1058 tsk->snt_unacked = tsk_inc(tsk, dlen + msg_hdr_sz(hdr));
1037 if (unlikely(rc < 0))
1038 return rc; 1059 return rc;
1060 }
1039 1061
1040 do { 1062 do {
1041 if (likely(!tsk_conn_cong(tsk))) { 1063 rc = tipc_wait_for_cond(sock, &timeout,
1042 rc = tipc_node_xmit(net, &pktchain, dnode, portid); 1064 (!tsk->cong_link_cnt &&
1043 if (likely(!rc)) { 1065 !tsk_conn_cong(tsk) &&
1044 tsk->snt_unacked += tsk_inc(tsk, send + hlen); 1066 tipc_sk_connected(sk)));
1045 sent += send; 1067 if (unlikely(rc))
1046 if (sent == dsz) 1068 break;
1047 return dsz;
1048 goto next;
1049 }
1050 if (rc == -EMSGSIZE) {
1051 __skb_queue_purge(&pktchain);
1052 tsk->max_pkt = tipc_node_get_mtu(net, dnode,
1053 portid);
1054 m->msg_iter = save;
1055 goto next;
1056 }
1057 if (rc != -ELINKCONG)
1058 break;
1059 1069
1060 tsk->link_cong = 1; 1070 send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE);
1071 rc = tipc_msg_build(hdr, m, sent, send, tsk->max_pkt, &pkts);
1072 if (unlikely(rc != send))
1073 break;
1074
1075 rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid);
1076 if (unlikely(rc == -ELINKCONG)) {
1077 tsk->cong_link_cnt = 1;
1078 rc = 0;
1079 }
1080 if (likely(!rc)) {
1081 tsk->snt_unacked += tsk_inc(tsk, send + MIN_H_SIZE);
1082 sent += send;
1061 } 1083 }
1062 rc = tipc_wait_for_sndpkt(sock, &timeo); 1084 } while (sent < dlen && !rc);
1063 } while (!rc);
1064 1085
1065 __skb_queue_purge(&pktchain); 1086 return rc ? rc : sent;
1066 return sent ? sent : rc;
1067} 1087}
1068 1088
1069/** 1089/**
@@ -1081,7 +1101,7 @@ static int tipc_send_packet(struct socket *sock, struct msghdr *m, size_t dsz)
1081 if (dsz > TIPC_MAX_USER_MSG_SIZE) 1101 if (dsz > TIPC_MAX_USER_MSG_SIZE)
1082 return -EMSGSIZE; 1102 return -EMSGSIZE;
1083 1103
1084 return tipc_send_stream(sock, m, dsz); 1104 return tipc_sendstream(sock, m, dsz);
1085} 1105}
1086 1106
1087/* tipc_sk_finish_conn - complete the setup of a connection 1107/* tipc_sk_finish_conn - complete the setup of a connection
@@ -1099,10 +1119,8 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port,
1099 msg_set_lookup_scope(msg, 0); 1119 msg_set_lookup_scope(msg, 0);
1100 msg_set_hdr_sz(msg, SHORT_H_SIZE); 1120 msg_set_hdr_sz(msg, SHORT_H_SIZE);
1101 1121
1102 tsk->probing_intv = CONN_PROBING_INTERVAL; 1122 sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTERVAL);
1103 tsk->probing_state = TIPC_CONN_OK; 1123 tipc_set_sk_state(sk, TIPC_ESTABLISHED);
1104 tsk->connected = 1;
1105 sk_reset_timer(sk, &sk->sk_timer, jiffies + tsk->probing_intv);
1106 tipc_node_add_conn(net, peer_node, tsk->portid, peer_port); 1124 tipc_node_add_conn(net, peer_node, tsk->portid, peer_port);
1107 tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid); 1125 tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid);
1108 tsk->peer_caps = tipc_node_get_capabilities(net, peer_node); 1126 tsk->peer_caps = tipc_node_get_capabilities(net, peer_node);
@@ -1210,13 +1228,14 @@ static int tipc_sk_anc_data_recv(struct msghdr *m, struct tipc_msg *msg,
1210 1228
1211static void tipc_sk_send_ack(struct tipc_sock *tsk) 1229static void tipc_sk_send_ack(struct tipc_sock *tsk)
1212{ 1230{
1213 struct net *net = sock_net(&tsk->sk); 1231 struct sock *sk = &tsk->sk;
1232 struct net *net = sock_net(sk);
1214 struct sk_buff *skb = NULL; 1233 struct sk_buff *skb = NULL;
1215 struct tipc_msg *msg; 1234 struct tipc_msg *msg;
1216 u32 peer_port = tsk_peer_port(tsk); 1235 u32 peer_port = tsk_peer_port(tsk);
1217 u32 dnode = tsk_peer_node(tsk); 1236 u32 dnode = tsk_peer_node(tsk);
1218 1237
1219 if (!tsk->connected) 1238 if (!tipc_sk_connected(sk))
1220 return; 1239 return;
1221 skb = tipc_msg_create(CONN_MANAGER, CONN_ACK, INT_H_SIZE, 0, 1240 skb = tipc_msg_create(CONN_MANAGER, CONN_ACK, INT_H_SIZE, 0,
1222 dnode, tsk_own_node(tsk), peer_port, 1241 dnode, tsk_own_node(tsk), peer_port,
@@ -1245,7 +1264,7 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop)
1245 for (;;) { 1264 for (;;) {
1246 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 1265 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1247 if (timeo && skb_queue_empty(&sk->sk_receive_queue)) { 1266 if (timeo && skb_queue_empty(&sk->sk_receive_queue)) {
1248 if (sock->state == SS_DISCONNECTING) { 1267 if (sk->sk_shutdown & RCV_SHUTDOWN) {
1249 err = -ENOTCONN; 1268 err = -ENOTCONN;
1250 break; 1269 break;
1251 } 1270 }
@@ -1286,6 +1305,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buf_len,
1286 struct tipc_sock *tsk = tipc_sk(sk); 1305 struct tipc_sock *tsk = tipc_sk(sk);
1287 struct sk_buff *buf; 1306 struct sk_buff *buf;
1288 struct tipc_msg *msg; 1307 struct tipc_msg *msg;
1308 bool is_connectionless = tipc_sk_type_connectionless(sk);
1289 long timeo; 1309 long timeo;
1290 unsigned int sz; 1310 unsigned int sz;
1291 u32 err; 1311 u32 err;
@@ -1297,7 +1317,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buf_len,
1297 1317
1298 lock_sock(sk); 1318 lock_sock(sk);
1299 1319
1300 if (unlikely(sock->state == SS_UNCONNECTED)) { 1320 if (!is_connectionless && unlikely(sk->sk_state == TIPC_OPEN)) {
1301 res = -ENOTCONN; 1321 res = -ENOTCONN;
1302 goto exit; 1322 goto exit;
1303 } 1323 }
@@ -1342,8 +1362,8 @@ restart:
1342 goto exit; 1362 goto exit;
1343 res = sz; 1363 res = sz;
1344 } else { 1364 } else {
1345 if ((sock->state == SS_READY) || 1365 if (is_connectionless || err == TIPC_CONN_SHUTDOWN ||
1346 ((err == TIPC_CONN_SHUTDOWN) || m->msg_control)) 1366 m->msg_control)
1347 res = 0; 1367 res = 0;
1348 else 1368 else
1349 res = -ECONNRESET; 1369 res = -ECONNRESET;
@@ -1352,7 +1372,7 @@ restart:
1352 if (unlikely(flags & MSG_PEEK)) 1372 if (unlikely(flags & MSG_PEEK))
1353 goto exit; 1373 goto exit;
1354 1374
1355 if (likely(sock->state != SS_READY)) { 1375 if (likely(!is_connectionless)) {
1356 tsk->rcv_unacked += tsk_inc(tsk, hlen + sz); 1376 tsk->rcv_unacked += tsk_inc(tsk, hlen + sz);
1357 if (unlikely(tsk->rcv_unacked >= (tsk->rcv_win / 4))) 1377 if (unlikely(tsk->rcv_unacked >= (tsk->rcv_win / 4)))
1358 tipc_sk_send_ack(tsk); 1378 tipc_sk_send_ack(tsk);
@@ -1383,7 +1403,7 @@ static int tipc_recv_stream(struct socket *sock, struct msghdr *m,
1383 struct tipc_msg *msg; 1403 struct tipc_msg *msg;
1384 long timeo; 1404 long timeo;
1385 unsigned int sz; 1405 unsigned int sz;
1386 int sz_to_copy, target, needed; 1406 int target;
1387 int sz_copied = 0; 1407 int sz_copied = 0;
1388 u32 err; 1408 u32 err;
1389 int res = 0, hlen; 1409 int res = 0, hlen;
@@ -1394,7 +1414,7 @@ static int tipc_recv_stream(struct socket *sock, struct msghdr *m,
1394 1414
1395 lock_sock(sk); 1415 lock_sock(sk);
1396 1416
1397 if (unlikely(sock->state == SS_UNCONNECTED)) { 1417 if (unlikely(sk->sk_state == TIPC_OPEN)) {
1398 res = -ENOTCONN; 1418 res = -ENOTCONN;
1399 goto exit; 1419 goto exit;
1400 } 1420 }
@@ -1431,11 +1451,13 @@ restart:
1431 1451
1432 /* Capture message data (if valid) & compute return value (always) */ 1452 /* Capture message data (if valid) & compute return value (always) */
1433 if (!err) { 1453 if (!err) {
1434 u32 offset = (u32)(unsigned long)(TIPC_SKB_CB(buf)->handle); 1454 u32 offset = TIPC_SKB_CB(buf)->bytes_read;
1455 u32 needed;
1456 int sz_to_copy;
1435 1457
1436 sz -= offset; 1458 sz -= offset;
1437 needed = (buf_len - sz_copied); 1459 needed = (buf_len - sz_copied);
1438 sz_to_copy = (sz <= needed) ? sz : needed; 1460 sz_to_copy = min(sz, needed);
1439 1461
1440 res = skb_copy_datagram_msg(buf, hlen + offset, m, sz_to_copy); 1462 res = skb_copy_datagram_msg(buf, hlen + offset, m, sz_to_copy);
1441 if (res) 1463 if (res)
@@ -1445,8 +1467,8 @@ restart:
1445 1467
1446 if (sz_to_copy < sz) { 1468 if (sz_to_copy < sz) {
1447 if (!(flags & MSG_PEEK)) 1469 if (!(flags & MSG_PEEK))
1448 TIPC_SKB_CB(buf)->handle = 1470 TIPC_SKB_CB(buf)->bytes_read =
1449 (void *)(unsigned long)(offset + sz_to_copy); 1471 offset + sz_to_copy;
1450 goto exit; 1472 goto exit;
1451 } 1473 }
1452 } else { 1474 } else {
@@ -1528,49 +1550,31 @@ static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)
1528{ 1550{
1529 struct sock *sk = &tsk->sk; 1551 struct sock *sk = &tsk->sk;
1530 struct net *net = sock_net(sk); 1552 struct net *net = sock_net(sk);
1531 struct socket *sock = sk->sk_socket;
1532 struct tipc_msg *hdr = buf_msg(skb); 1553 struct tipc_msg *hdr = buf_msg(skb);
1533 1554
1534 if (unlikely(msg_mcast(hdr))) 1555 if (unlikely(msg_mcast(hdr)))
1535 return false; 1556 return false;
1536 1557
1537 switch ((int)sock->state) { 1558 switch (sk->sk_state) {
1538 case SS_CONNECTED: 1559 case TIPC_CONNECTING:
1539
1540 /* Accept only connection-based messages sent by peer */
1541 if (unlikely(!tsk_peer_msg(tsk, hdr)))
1542 return false;
1543
1544 if (unlikely(msg_errcode(hdr))) {
1545 sock->state = SS_DISCONNECTING;
1546 tsk->connected = 0;
1547 /* Let timer expire on its own */
1548 tipc_node_remove_conn(net, tsk_peer_node(tsk),
1549 tsk->portid);
1550 }
1551 return true;
1552
1553 case SS_CONNECTING:
1554
1555 /* Accept only ACK or NACK message */ 1560 /* Accept only ACK or NACK message */
1556 if (unlikely(!msg_connected(hdr))) 1561 if (unlikely(!msg_connected(hdr)))
1557 return false; 1562 return false;
1558 1563
1559 if (unlikely(msg_errcode(hdr))) { 1564 if (unlikely(msg_errcode(hdr))) {
1560 sock->state = SS_DISCONNECTING; 1565 tipc_set_sk_state(sk, TIPC_DISCONNECTING);
1561 sk->sk_err = ECONNREFUSED; 1566 sk->sk_err = ECONNREFUSED;
1562 return true; 1567 return true;
1563 } 1568 }
1564 1569
1565 if (unlikely(!msg_isdata(hdr))) { 1570 if (unlikely(!msg_isdata(hdr))) {
1566 sock->state = SS_DISCONNECTING; 1571 tipc_set_sk_state(sk, TIPC_DISCONNECTING);
1567 sk->sk_err = EINVAL; 1572 sk->sk_err = EINVAL;
1568 return true; 1573 return true;
1569 } 1574 }
1570 1575
1571 tipc_sk_finish_conn(tsk, msg_origport(hdr), msg_orignode(hdr)); 1576 tipc_sk_finish_conn(tsk, msg_origport(hdr), msg_orignode(hdr));
1572 msg_set_importance(&tsk->phdr, msg_importance(hdr)); 1577 msg_set_importance(&tsk->phdr, msg_importance(hdr));
1573 sock->state = SS_CONNECTED;
1574 1578
1575 /* If 'ACK+' message, add to socket receive queue */ 1579 /* If 'ACK+' message, add to socket receive queue */
1576 if (msg_data_sz(hdr)) 1580 if (msg_data_sz(hdr))
@@ -1584,18 +1588,31 @@ static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)
1584 msg_set_dest_droppable(hdr, 1); 1588 msg_set_dest_droppable(hdr, 1);
1585 return false; 1589 return false;
1586 1590
1587 case SS_LISTENING: 1591 case TIPC_OPEN:
1588 case SS_UNCONNECTED: 1592 case TIPC_DISCONNECTING:
1589 1593 break;
1594 case TIPC_LISTEN:
1590 /* Accept only SYN message */ 1595 /* Accept only SYN message */
1591 if (!msg_connected(hdr) && !(msg_errcode(hdr))) 1596 if (!msg_connected(hdr) && !(msg_errcode(hdr)))
1592 return true; 1597 return true;
1593 break; 1598 break;
1594 case SS_DISCONNECTING: 1599 case TIPC_ESTABLISHED:
1595 break; 1600 /* Accept only connection-based messages sent by peer */
1601 if (unlikely(!tsk_peer_msg(tsk, hdr)))
1602 return false;
1603
1604 if (unlikely(msg_errcode(hdr))) {
1605 tipc_set_sk_state(sk, TIPC_DISCONNECTING);
1606 /* Let timer expire on its own */
1607 tipc_node_remove_conn(net, tsk_peer_node(tsk),
1608 tsk->portid);
1609 sk->sk_state_change(sk);
1610 }
1611 return true;
1596 default: 1612 default:
1597 pr_err("Unknown socket state %u\n", sock->state); 1613 pr_err("Unknown sk_state %u\n", sk->sk_state);
1598 } 1614 }
1615
1599 return false; 1616 return false;
1600} 1617}
1601 1618
@@ -1646,12 +1663,12 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb)
1646static bool filter_rcv(struct sock *sk, struct sk_buff *skb, 1663static bool filter_rcv(struct sock *sk, struct sk_buff *skb,
1647 struct sk_buff_head *xmitq) 1664 struct sk_buff_head *xmitq)
1648{ 1665{
1649 struct socket *sock = sk->sk_socket;
1650 struct tipc_sock *tsk = tipc_sk(sk); 1666 struct tipc_sock *tsk = tipc_sk(sk);
1651 struct tipc_msg *hdr = buf_msg(skb); 1667 struct tipc_msg *hdr = buf_msg(skb);
1652 unsigned int limit = rcvbuf_limit(sk, skb); 1668 unsigned int limit = rcvbuf_limit(sk, skb);
1653 int err = TIPC_OK; 1669 int err = TIPC_OK;
1654 int usr = msg_user(hdr); 1670 int usr = msg_user(hdr);
1671 u32 onode;
1655 1672
1656 if (unlikely(msg_user(hdr) == CONN_MANAGER)) { 1673 if (unlikely(msg_user(hdr) == CONN_MANAGER)) {
1657 tipc_sk_proto_rcv(tsk, skb, xmitq); 1674 tipc_sk_proto_rcv(tsk, skb, xmitq);
@@ -1659,8 +1676,10 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb,
1659 } 1676 }
1660 1677
1661 if (unlikely(usr == SOCK_WAKEUP)) { 1678 if (unlikely(usr == SOCK_WAKEUP)) {
1679 onode = msg_orignode(hdr);
1662 kfree_skb(skb); 1680 kfree_skb(skb);
1663 tsk->link_cong = 0; 1681 u32_del(&tsk->cong_links, onode);
1682 tsk->cong_link_cnt--;
1664 sk->sk_write_space(sk); 1683 sk->sk_write_space(sk);
1665 return false; 1684 return false;
1666 } 1685 }
@@ -1672,7 +1691,7 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb,
1672 } 1691 }
1673 1692
1674 /* Reject if wrong message type for current socket state */ 1693 /* Reject if wrong message type for current socket state */
1675 if (unlikely(sock->state == SS_READY)) { 1694 if (tipc_sk_type_connectionless(sk)) {
1676 if (msg_connected(hdr)) { 1695 if (msg_connected(hdr)) {
1677 err = TIPC_ERR_NO_PORT; 1696 err = TIPC_ERR_NO_PORT;
1678 goto reject; 1697 goto reject;
@@ -1689,7 +1708,7 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb,
1689 } 1708 }
1690 1709
1691 /* Enqueue message */ 1710 /* Enqueue message */
1692 TIPC_SKB_CB(skb)->handle = NULL; 1711 TIPC_SKB_CB(skb)->bytes_read = 0;
1693 __skb_queue_tail(&sk->sk_receive_queue, skb); 1712 __skb_queue_tail(&sk->sk_receive_queue, skb);
1694 skb_set_owner_r(skb, sk); 1713 skb_set_owner_r(skb, sk);
1695 1714
@@ -1839,8 +1858,8 @@ xmit:
1839 1858
1840static int tipc_wait_for_connect(struct socket *sock, long *timeo_p) 1859static int tipc_wait_for_connect(struct socket *sock, long *timeo_p)
1841{ 1860{
1861 DEFINE_WAIT_FUNC(wait, woken_wake_function);
1842 struct sock *sk = sock->sk; 1862 struct sock *sk = sock->sk;
1843 DEFINE_WAIT(wait);
1844 int done; 1863 int done;
1845 1864
1846 do { 1865 do {
@@ -1852,9 +1871,10 @@ static int tipc_wait_for_connect(struct socket *sock, long *timeo_p)
1852 if (signal_pending(current)) 1871 if (signal_pending(current))
1853 return sock_intr_errno(*timeo_p); 1872 return sock_intr_errno(*timeo_p);
1854 1873
1855 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 1874 add_wait_queue(sk_sleep(sk), &wait);
1856 done = sk_wait_event(sk, timeo_p, sock->state != SS_CONNECTING); 1875 done = sk_wait_event(sk, timeo_p,
1857 finish_wait(sk_sleep(sk), &wait); 1876 sk->sk_state != TIPC_CONNECTING, &wait);
1877 remove_wait_queue(sk_sleep(sk), &wait);
1858 } while (!done); 1878 } while (!done);
1859 return 0; 1879 return 0;
1860} 1880}
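The rewritten wait loop above switches from prepare_to_wait()/finish_wait() to a woken_wake_function wait entry, which the newer four-argument sk_wait_event() expects. The same pattern, reduced to a minimal sketch (wait_condition() is a hypothetical predicate; the caller holds the socket lock, as tipc_wait_for_connect() does):

static int wait_for_condition(struct sock *sk, long *timeo)
{
    DEFINE_WAIT_FUNC(wait, woken_wake_function);
    int done;

    do {
        if (!*timeo)
            return -ETIMEDOUT;
        if (signal_pending(current))
            return sock_intr_errno(*timeo);

        add_wait_queue(sk_sleep(sk), &wait);
        /* drops the socket lock while sleeping and re-acquires it */
        done = sk_wait_event(sk, timeo, wait_condition(sk), &wait);
        remove_wait_queue(sk_sleep(sk), &wait);
    } while (!done);

    return 0;
}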
@@ -1876,21 +1896,19 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest,
1876 struct sockaddr_tipc *dst = (struct sockaddr_tipc *)dest; 1896 struct sockaddr_tipc *dst = (struct sockaddr_tipc *)dest;
1877 struct msghdr m = {NULL,}; 1897 struct msghdr m = {NULL,};
1878 long timeout = (flags & O_NONBLOCK) ? 0 : tsk->conn_timeout; 1898 long timeout = (flags & O_NONBLOCK) ? 0 : tsk->conn_timeout;
1879 socket_state previous; 1899 int previous;
1880 int res = 0; 1900 int res = 0;
1881 1901
1882 lock_sock(sk); 1902 lock_sock(sk);
1883 1903
1884 /* DGRAM/RDM connect(), just save the destaddr */ 1904 /* DGRAM/RDM connect(), just save the destaddr */
1885 if (sock->state == SS_READY) { 1905 if (tipc_sk_type_connectionless(sk)) {
1886 if (dst->family == AF_UNSPEC) { 1906 if (dst->family == AF_UNSPEC) {
1887 memset(&tsk->remote, 0, sizeof(struct sockaddr_tipc)); 1907 memset(&tsk->peer, 0, sizeof(struct sockaddr_tipc));
1888 tsk->connected = 0;
1889 } else if (destlen != sizeof(struct sockaddr_tipc)) { 1908 } else if (destlen != sizeof(struct sockaddr_tipc)) {
1890 res = -EINVAL; 1909 res = -EINVAL;
1891 } else { 1910 } else {
1892 memcpy(&tsk->remote, dest, destlen); 1911 memcpy(&tsk->peer, dest, destlen);
1893 tsk->connected = 1;
1894 } 1912 }
1895 goto exit; 1913 goto exit;
1896 } 1914 }
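For SOCK_DGRAM/SOCK_RDM sockets the branch above makes connect() nothing more than a way to store (or, with AF_UNSPEC, clear) a default destination in tsk->peer. From user space that would be exercised roughly as follows (illustrative sketch only):

#include <string.h>
#include <sys/socket.h>
#include <linux/tipc.h>

static void set_then_clear_peer(int sd, const struct sockaddr_tipc *peer)
{
    struct sockaddr_tipc unspec;

    /* record a default destination for subsequent send() calls */
    connect(sd, (const struct sockaddr *)peer, sizeof(*peer));

    /* dissolve the association again */
    memset(&unspec, 0, sizeof(unspec));
    unspec.family = AF_UNSPEC;
    connect(sd, (const struct sockaddr *)&unspec, sizeof(unspec));
}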
@@ -1906,9 +1924,10 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest,
1906 goto exit; 1924 goto exit;
1907 } 1925 }
1908 1926
1909 previous = sock->state; 1927 previous = sk->sk_state;
1910 switch (sock->state) { 1928
1911 case SS_UNCONNECTED: 1929 switch (sk->sk_state) {
1930 case TIPC_OPEN:
1912 /* Send a 'SYN-' to destination */ 1931 /* Send a 'SYN-' to destination */
1913 m.msg_name = dest; 1932 m.msg_name = dest;
1914 m.msg_namelen = destlen; 1933 m.msg_namelen = destlen;
@@ -1923,27 +1942,29 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest,
1923 if ((res < 0) && (res != -EWOULDBLOCK)) 1942 if ((res < 0) && (res != -EWOULDBLOCK))
1924 goto exit; 1943 goto exit;
1925 1944
1926 /* Just entered SS_CONNECTING state; the only 1945 /* Just entered TIPC_CONNECTING state; the only
1927 * difference is that return value in non-blocking 1946 * difference is that return value in non-blocking
1928 * case is EINPROGRESS, rather than EALREADY. 1947 * case is EINPROGRESS, rather than EALREADY.
1929 */ 1948 */
1930 res = -EINPROGRESS; 1949 res = -EINPROGRESS;
1931 case SS_CONNECTING: 1950 /* fall thru' */
1932 if (previous == SS_CONNECTING) 1951 case TIPC_CONNECTING:
1933 res = -EALREADY; 1952 if (!timeout) {
1934 if (!timeout) 1953 if (previous == TIPC_CONNECTING)
1954 res = -EALREADY;
1935 goto exit; 1955 goto exit;
1956 }
1936 timeout = msecs_to_jiffies(timeout); 1957 timeout = msecs_to_jiffies(timeout);
1937 /* Wait until an 'ACK' or 'RST' arrives, or a timeout occurs */ 1958 /* Wait until an 'ACK' or 'RST' arrives, or a timeout occurs */
1938 res = tipc_wait_for_connect(sock, &timeout); 1959 res = tipc_wait_for_connect(sock, &timeout);
1939 break; 1960 break;
1940 case SS_CONNECTED: 1961 case TIPC_ESTABLISHED:
1941 res = -EISCONN; 1962 res = -EISCONN;
1942 break; 1963 break;
1943 default: 1964 default:
1944 res = -EINVAL; 1965 res = -EINVAL;
1945 break;
1946 } 1966 }
1967
1947exit: 1968exit:
1948 release_sock(sk); 1969 release_sock(sk);
1949 return res; 1970 return res;
@@ -1962,15 +1983,9 @@ static int tipc_listen(struct socket *sock, int len)
1962 int res; 1983 int res;
1963 1984
1964 lock_sock(sk); 1985 lock_sock(sk);
1965 1986 res = tipc_set_sk_state(sk, TIPC_LISTEN);
1966 if (sock->state != SS_UNCONNECTED)
1967 res = -EINVAL;
1968 else {
1969 sock->state = SS_LISTENING;
1970 res = 0;
1971 }
1972
1973 release_sock(sk); 1987 release_sock(sk);
1988
1974 return res; 1989 return res;
1975} 1990}
1976 1991
@@ -1996,9 +2011,6 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo)
1996 err = 0; 2011 err = 0;
1997 if (!skb_queue_empty(&sk->sk_receive_queue)) 2012 if (!skb_queue_empty(&sk->sk_receive_queue))
1998 break; 2013 break;
1999 err = -EINVAL;
2000 if (sock->state != SS_LISTENING)
2001 break;
2002 err = -EAGAIN; 2014 err = -EAGAIN;
2003 if (!timeo) 2015 if (!timeo)
2004 break; 2016 break;
@@ -2018,7 +2030,8 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo)
2018 * 2030 *
2019 * Returns 0 on success, errno otherwise 2031 * Returns 0 on success, errno otherwise
2020 */ 2032 */
2021static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags) 2033static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags,
2034 bool kern)
2022{ 2035{
2023 struct sock *new_sk, *sk = sock->sk; 2036 struct sock *new_sk, *sk = sock->sk;
2024 struct sk_buff *buf; 2037 struct sk_buff *buf;
@@ -2029,7 +2042,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags)
2029 2042
2030 lock_sock(sk); 2043 lock_sock(sk);
2031 2044
2032 if (sock->state != SS_LISTENING) { 2045 if (sk->sk_state != TIPC_LISTEN) {
2033 res = -EINVAL; 2046 res = -EINVAL;
2034 goto exit; 2047 goto exit;
2035 } 2048 }
@@ -2040,7 +2053,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags)
2040 2053
2041 buf = skb_peek(&sk->sk_receive_queue); 2054 buf = skb_peek(&sk->sk_receive_queue);
2042 2055
2043 res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, 1); 2056 res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, kern);
2044 if (res) 2057 if (res)
2045 goto exit; 2058 goto exit;
2046 security_sk_clone(sock->sk, new_sock->sk); 2059 security_sk_clone(sock->sk, new_sock->sk);
@@ -2060,7 +2073,6 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags)
2060 2073
2061 /* Connect new socket to it's peer */ 2074 /* Connect new socket to it's peer */
2062 tipc_sk_finish_conn(new_tsock, msg_origport(msg), msg_orignode(msg)); 2075 tipc_sk_finish_conn(new_tsock, msg_origport(msg), msg_orignode(msg));
2063 new_sock->state = SS_CONNECTED;
2064 2076
2065 tsk_set_importance(new_tsock, msg_importance(msg)); 2077 tsk_set_importance(new_tsock, msg_importance(msg));
2066 if (msg_named(msg)) { 2078 if (msg_named(msg)) {
@@ -2076,7 +2088,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags)
2076 struct msghdr m = {NULL,}; 2088 struct msghdr m = {NULL,};
2077 2089
2078 tsk_advance_rx_queue(sk); 2090 tsk_advance_rx_queue(sk);
2079 __tipc_send_stream(new_sock, &m, 0); 2091 __tipc_sendstream(new_sock, &m, 0);
2080 } else { 2092 } else {
2081 __skb_dequeue(&sk->sk_receive_queue); 2093 __skb_dequeue(&sk->sk_receive_queue);
2082 __skb_queue_head(&new_sk->sk_receive_queue, buf); 2094 __skb_queue_head(&new_sk->sk_receive_queue, buf);
@@ -2100,13 +2112,6 @@ exit:
2100static int tipc_shutdown(struct socket *sock, int how) 2112static int tipc_shutdown(struct socket *sock, int how)
2101{ 2113{
2102 struct sock *sk = sock->sk; 2114 struct sock *sk = sock->sk;
2103 struct net *net = sock_net(sk);
2104 struct tipc_sock *tsk = tipc_sk(sk);
2105 struct sk_buff *skb;
2106 u32 dnode = tsk_peer_node(tsk);
2107 u32 dport = tsk_peer_port(tsk);
2108 u32 onode = tipc_own_addr(net);
2109 u32 oport = tsk->portid;
2110 int res; 2115 int res;
2111 2116
2112 if (how != SHUT_RDWR) 2117 if (how != SHUT_RDWR)
@@ -2114,45 +2119,17 @@ static int tipc_shutdown(struct socket *sock, int how)
2114 2119
2115 lock_sock(sk); 2120 lock_sock(sk);
2116 2121
2117 switch (sock->state) { 2122 __tipc_shutdown(sock, TIPC_CONN_SHUTDOWN);
2118 case SS_CONNECTING: 2123 sk->sk_shutdown = SEND_SHUTDOWN;
2119 case SS_CONNECTED:
2120
2121restart:
2122 dnode = tsk_peer_node(tsk);
2123
2124 /* Disconnect and send a 'FIN+' or 'FIN-' message to peer */
2125 skb = __skb_dequeue(&sk->sk_receive_queue);
2126 if (skb) {
2127 if (TIPC_SKB_CB(skb)->handle != NULL) {
2128 kfree_skb(skb);
2129 goto restart;
2130 }
2131 tipc_sk_respond(sk, skb, TIPC_CONN_SHUTDOWN);
2132 } else {
2133 skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE,
2134 TIPC_CONN_MSG, SHORT_H_SIZE,
2135 0, dnode, onode, dport, oport,
2136 TIPC_CONN_SHUTDOWN);
2137 if (skb)
2138 tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
2139 }
2140 tsk->connected = 0;
2141 sock->state = SS_DISCONNECTING;
2142 tipc_node_remove_conn(net, dnode, tsk->portid);
2143 /* fall through */
2144
2145 case SS_DISCONNECTING:
2146 2124
2125 if (sk->sk_state == TIPC_DISCONNECTING) {
2147 /* Discard any unreceived messages */ 2126 /* Discard any unreceived messages */
2148 __skb_queue_purge(&sk->sk_receive_queue); 2127 __skb_queue_purge(&sk->sk_receive_queue);
2149 2128
2150 /* Wake up anyone sleeping in poll */ 2129 /* Wake up anyone sleeping in poll */
2151 sk->sk_state_change(sk); 2130 sk->sk_state_change(sk);
2152 res = 0; 2131 res = 0;
2153 break; 2132 } else {
2154
2155 default:
2156 res = -ENOTCONN; 2133 res = -ENOTCONN;
2157 } 2134 }
2158 2135
@@ -2169,17 +2146,16 @@ static void tipc_sk_timeout(unsigned long data)
2169 u32 own_node = tsk_own_node(tsk); 2146 u32 own_node = tsk_own_node(tsk);
2170 2147
2171 bh_lock_sock(sk); 2148 bh_lock_sock(sk);
2172 if (!tsk->connected) { 2149 if (!tipc_sk_connected(sk)) {
2173 bh_unlock_sock(sk); 2150 bh_unlock_sock(sk);
2174 goto exit; 2151 goto exit;
2175 } 2152 }
2176 peer_port = tsk_peer_port(tsk); 2153 peer_port = tsk_peer_port(tsk);
2177 peer_node = tsk_peer_node(tsk); 2154 peer_node = tsk_peer_node(tsk);
2178 2155
2179 if (tsk->probing_state == TIPC_CONN_PROBING) { 2156 if (tsk->probe_unacked) {
2180 if (!sock_owned_by_user(sk)) { 2157 if (!sock_owned_by_user(sk)) {
2181 sk->sk_socket->state = SS_DISCONNECTING; 2158 tipc_set_sk_state(sk, TIPC_DISCONNECTING);
2182 tsk->connected = 0;
2183 tipc_node_remove_conn(sock_net(sk), tsk_peer_node(tsk), 2159 tipc_node_remove_conn(sock_net(sk), tsk_peer_node(tsk),
2184 tsk_peer_port(tsk)); 2160 tsk_peer_port(tsk));
2185 sk->sk_state_change(sk); 2161 sk->sk_state_change(sk);
@@ -2188,13 +2164,15 @@ static void tipc_sk_timeout(unsigned long data)
2188 sk_reset_timer(sk, &sk->sk_timer, (HZ / 20)); 2164 sk_reset_timer(sk, &sk->sk_timer, (HZ / 20));
2189 } 2165 }
2190 2166
2191 } else { 2167 bh_unlock_sock(sk);
2192 skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE, 2168 goto exit;
2193 INT_H_SIZE, 0, peer_node, own_node,
2194 peer_port, tsk->portid, TIPC_OK);
2195 tsk->probing_state = TIPC_CONN_PROBING;
2196 sk_reset_timer(sk, &sk->sk_timer, jiffies + tsk->probing_intv);
2197 } 2169 }
2170
2171 skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE,
2172 INT_H_SIZE, 0, peer_node, own_node,
2173 peer_port, tsk->portid, TIPC_OK);
2174 tsk->probe_unacked = true;
2175 sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTERVAL);
2198 bh_unlock_sock(sk); 2176 bh_unlock_sock(sk);
2199 if (skb) 2177 if (skb)
2200 tipc_node_xmit_skb(sock_net(sk), skb, peer_node, tsk->portid); 2178 tipc_node_xmit_skb(sock_net(sk), skb, peer_node, tsk->portid);
@@ -2205,11 +2183,12 @@ exit:
2205static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, 2183static int tipc_sk_publish(struct tipc_sock *tsk, uint scope,
2206 struct tipc_name_seq const *seq) 2184 struct tipc_name_seq const *seq)
2207{ 2185{
2208 struct net *net = sock_net(&tsk->sk); 2186 struct sock *sk = &tsk->sk;
2187 struct net *net = sock_net(sk);
2209 struct publication *publ; 2188 struct publication *publ;
2210 u32 key; 2189 u32 key;
2211 2190
2212 if (tsk->connected) 2191 if (tipc_sk_connected(sk))
2213 return -EINVAL; 2192 return -EINVAL;
2214 key = tsk->portid + tsk->pub_count + 1; 2193 key = tsk->portid + tsk->pub_count + 1;
2215 if (key == tsk->portid) 2194 if (key == tsk->portid)
@@ -2264,24 +2243,27 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
2264void tipc_sk_reinit(struct net *net) 2243void tipc_sk_reinit(struct net *net)
2265{ 2244{
2266 struct tipc_net *tn = net_generic(net, tipc_net_id); 2245 struct tipc_net *tn = net_generic(net, tipc_net_id);
2267 const struct bucket_table *tbl; 2246 struct rhashtable_iter iter;
2268 struct rhash_head *pos;
2269 struct tipc_sock *tsk; 2247 struct tipc_sock *tsk;
2270 struct tipc_msg *msg; 2248 struct tipc_msg *msg;
2271 int i;
2272 2249
2273 rcu_read_lock(); 2250 rhashtable_walk_enter(&tn->sk_rht, &iter);
2274 tbl = rht_dereference_rcu((&tn->sk_rht)->tbl, &tn->sk_rht); 2251
2275 for (i = 0; i < tbl->size; i++) { 2252 do {
2276 rht_for_each_entry_rcu(tsk, pos, tbl, i, node) { 2253 tsk = ERR_PTR(rhashtable_walk_start(&iter));
2254 if (tsk)
2255 continue;
2256
2257 while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) {
2277 spin_lock_bh(&tsk->sk.sk_lock.slock); 2258 spin_lock_bh(&tsk->sk.sk_lock.slock);
2278 msg = &tsk->phdr; 2259 msg = &tsk->phdr;
2279 msg_set_prevnode(msg, tn->own_addr); 2260 msg_set_prevnode(msg, tn->own_addr);
2280 msg_set_orignode(msg, tn->own_addr); 2261 msg_set_orignode(msg, tn->own_addr);
2281 spin_unlock_bh(&tsk->sk.sk_lock.slock); 2262 spin_unlock_bh(&tsk->sk.sk_lock.slock);
2282 } 2263 }
2283 } 2264
2284 rcu_read_unlock(); 2265 rhashtable_walk_stop(&iter);
2266 } while (tsk == ERR_PTR(-EAGAIN));
2285} 2267}
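tipc_sk_reinit() now uses the rhashtable iterator API instead of dereferencing the bucket table directly, so it no longer depends on the table layout and restarts cleanly when the table is resized underneath it. The shape of that walk, condensed from the hunk above (visit() stands in for the per-socket work):

static void walk_all_sockets(struct tipc_net *tn)
{
    struct rhashtable_iter iter;
    struct tipc_sock *tsk;

    rhashtable_walk_enter(&tn->sk_rht, &iter);
    do {
        tsk = ERR_PTR(rhashtable_walk_start(&iter));
        if (tsk)
            continue;       /* typically -EAGAIN: restart the walk */

        while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk))
            visit(tsk);     /* must not sleep between start and stop */

        rhashtable_walk_stop(&iter);
    } while (tsk == ERR_PTR(-EAGAIN));
}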
2286 2268
2287static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid) 2269static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid)
@@ -2377,18 +2359,29 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
2377{ 2359{
2378 struct sock *sk = sock->sk; 2360 struct sock *sk = sock->sk;
2379 struct tipc_sock *tsk = tipc_sk(sk); 2361 struct tipc_sock *tsk = tipc_sk(sk);
2380 u32 value; 2362 u32 value = 0;
2381 int res; 2363 int res = 0;
2382 2364
2383 if ((lvl == IPPROTO_TCP) && (sock->type == SOCK_STREAM)) 2365 if ((lvl == IPPROTO_TCP) && (sock->type == SOCK_STREAM))
2384 return 0; 2366 return 0;
2385 if (lvl != SOL_TIPC) 2367 if (lvl != SOL_TIPC)
2386 return -ENOPROTOOPT; 2368 return -ENOPROTOOPT;
2387 if (ol < sizeof(value)) 2369
2388 return -EINVAL; 2370 switch (opt) {
2389 res = get_user(value, (u32 __user *)ov); 2371 case TIPC_IMPORTANCE:
2390 if (res) 2372 case TIPC_SRC_DROPPABLE:
2391 return res; 2373 case TIPC_DEST_DROPPABLE:
2374 case TIPC_CONN_TIMEOUT:
2375 if (ol < sizeof(value))
2376 return -EINVAL;
2377 res = get_user(value, (u32 __user *)ov);
2378 if (res)
2379 return res;
2380 break;
2381 default:
2382 if (ov || ol)
2383 return -EINVAL;
2384 }
2392 2385
2393 lock_sock(sk); 2386 lock_sock(sk);
2394 2387
@@ -2407,7 +2400,14 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
2407 break; 2400 break;
2408 case TIPC_CONN_TIMEOUT: 2401 case TIPC_CONN_TIMEOUT:
2409 tipc_sk(sk)->conn_timeout = value; 2402 tipc_sk(sk)->conn_timeout = value;
2410 /* no need to set "res", since already 0 at this point */ 2403 break;
2404 case TIPC_MCAST_BROADCAST:
2405 tsk->mc_method.rcast = false;
2406 tsk->mc_method.mandatory = true;
2407 break;
2408 case TIPC_MCAST_REPLICAST:
2409 tsk->mc_method.rcast = true;
2410 tsk->mc_method.mandatory = true;
2411 break; 2411 break;
2412 default: 2412 default:
2413 res = -EINVAL; 2413 res = -EINVAL;
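The reworked option parsing above distinguishes options that carry a u32 value from the two new multicast options, which take no value at all (optval/optlen must be NULL/0). Assuming the matching SOL_TIPC and TIPC_MCAST_* definitions are visible from the uapi headers, user space would select replicast like this (sketch):

#include <stdio.h>
#include <sys/socket.h>
#include <linux/tipc.h>

static void force_replicast(int sd)
{
    /* TIPC_MCAST_REPLICAST takes no argument, hence NULL/0 */
    if (setsockopt(sd, SOL_TIPC, TIPC_MCAST_REPLICAST, NULL, 0))
        perror("setsockopt(TIPC_MCAST_REPLICAST)");
}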
@@ -2570,7 +2570,7 @@ static const struct proto_ops stream_ops = {
2570 .shutdown = tipc_shutdown, 2570 .shutdown = tipc_shutdown,
2571 .setsockopt = tipc_setsockopt, 2571 .setsockopt = tipc_setsockopt,
2572 .getsockopt = tipc_getsockopt, 2572 .getsockopt = tipc_getsockopt,
2573 .sendmsg = tipc_send_stream, 2573 .sendmsg = tipc_sendstream,
2574 .recvmsg = tipc_recv_stream, 2574 .recvmsg = tipc_recv_stream,
2575 .mmap = sock_no_mmap, 2575 .mmap = sock_no_mmap,
2576 .sendpage = sock_no_sendpage 2576 .sendpage = sock_no_sendpage
@@ -2667,6 +2667,7 @@ static int __tipc_nl_add_sk(struct sk_buff *skb, struct netlink_callback *cb,
2667 struct nlattr *attrs; 2667 struct nlattr *attrs;
2668 struct net *net = sock_net(skb->sk); 2668 struct net *net = sock_net(skb->sk);
2669 struct tipc_net *tn = net_generic(net, tipc_net_id); 2669 struct tipc_net *tn = net_generic(net, tipc_net_id);
2670 struct sock *sk = &tsk->sk;
2670 2671
2671 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 2672 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
2672 &tipc_genl_family, NLM_F_MULTI, TIPC_NL_SOCK_GET); 2673 &tipc_genl_family, NLM_F_MULTI, TIPC_NL_SOCK_GET);
@@ -2681,7 +2682,7 @@ static int __tipc_nl_add_sk(struct sk_buff *skb, struct netlink_callback *cb,
2681 if (nla_put_u32(skb, TIPC_NLA_SOCK_ADDR, tn->own_addr)) 2682 if (nla_put_u32(skb, TIPC_NLA_SOCK_ADDR, tn->own_addr))
2682 goto attr_msg_cancel; 2683 goto attr_msg_cancel;
2683 2684
2684 if (tsk->connected) { 2685 if (tipc_sk_connected(sk)) {
2685 err = __tipc_nl_add_sk_con(skb, tsk); 2686 err = __tipc_nl_add_sk_con(skb, tsk);
2686 if (err) 2687 if (err)
2687 goto attr_msg_cancel; 2688 goto attr_msg_cancel;
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 0dd02244e21d..271cd66e4b3b 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -54,6 +54,8 @@ struct tipc_subscriber {
54 54
55static void tipc_subscrp_delete(struct tipc_subscription *sub); 55static void tipc_subscrp_delete(struct tipc_subscription *sub);
56static void tipc_subscrb_put(struct tipc_subscriber *subscriber); 56static void tipc_subscrb_put(struct tipc_subscriber *subscriber);
57static void tipc_subscrp_put(struct tipc_subscription *subscription);
58static void tipc_subscrp_get(struct tipc_subscription *subscription);
57 59
58/** 60/**
59 * htohl - convert value to endianness used by destination 61 * htohl - convert value to endianness used by destination
@@ -123,6 +125,7 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower,
123{ 125{
124 struct tipc_name_seq seq; 126 struct tipc_name_seq seq;
125 127
128 tipc_subscrp_get(sub);
126 tipc_subscrp_convert_seq(&sub->evt.s.seq, sub->swap, &seq); 129 tipc_subscrp_convert_seq(&sub->evt.s.seq, sub->swap, &seq);
127 if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper)) 130 if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper))
128 return; 131 return;
@@ -132,6 +135,7 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower,
132 135
133 tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref, 136 tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref,
134 node); 137 node);
138 tipc_subscrp_put(sub);
135} 139}
136 140
137static void tipc_subscrp_timeout(unsigned long data) 141static void tipc_subscrp_timeout(unsigned long data)
@@ -139,23 +143,20 @@ static void tipc_subscrp_timeout(unsigned long data)
139 struct tipc_subscription *sub = (struct tipc_subscription *)data; 143 struct tipc_subscription *sub = (struct tipc_subscription *)data;
140 struct tipc_subscriber *subscriber = sub->subscriber; 144 struct tipc_subscriber *subscriber = sub->subscriber;
141 145
146 spin_lock_bh(&subscriber->lock);
147 tipc_nametbl_unsubscribe(sub);
148 spin_unlock_bh(&subscriber->lock);
149
142 /* Notify subscriber of timeout */ 150 /* Notify subscriber of timeout */
143 tipc_subscrp_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper, 151 tipc_subscrp_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper,
144 TIPC_SUBSCR_TIMEOUT, 0, 0); 152 TIPC_SUBSCR_TIMEOUT, 0, 0);
145 153
146 spin_lock_bh(&subscriber->lock); 154 tipc_subscrp_put(sub);
147 tipc_subscrp_delete(sub);
148 spin_unlock_bh(&subscriber->lock);
149
150 tipc_subscrb_put(subscriber);
151} 155}
152 156
153static void tipc_subscrb_kref_release(struct kref *kref) 157static void tipc_subscrb_kref_release(struct kref *kref)
154{ 158{
155 struct tipc_subscriber *subcriber = container_of(kref, 159 kfree(container_of(kref,struct tipc_subscriber, kref));
156 struct tipc_subscriber, kref);
157
158 kfree(subcriber);
159} 160}
160 161
161static void tipc_subscrb_put(struct tipc_subscriber *subscriber) 162static void tipc_subscrb_put(struct tipc_subscriber *subscriber)
@@ -168,6 +169,59 @@ static void tipc_subscrb_get(struct tipc_subscriber *subscriber)
168 kref_get(&subscriber->kref); 169 kref_get(&subscriber->kref);
169} 170}
170 171
172static void tipc_subscrp_kref_release(struct kref *kref)
173{
174 struct tipc_subscription *sub = container_of(kref,
175 struct tipc_subscription,
176 kref);
177 struct tipc_net *tn = net_generic(sub->net, tipc_net_id);
178 struct tipc_subscriber *subscriber = sub->subscriber;
179
180 spin_lock_bh(&subscriber->lock);
181 list_del(&sub->subscrp_list);
182 atomic_dec(&tn->subscription_count);
183 spin_unlock_bh(&subscriber->lock);
184 kfree(sub);
185 tipc_subscrb_put(subscriber);
186}
187
188static void tipc_subscrp_put(struct tipc_subscription *subscription)
189{
190 kref_put(&subscription->kref, tipc_subscrp_kref_release);
191}
192
193static void tipc_subscrp_get(struct tipc_subscription *subscription)
194{
195 kref_get(&subscription->kref);
196}
197
198/* tipc_subscrb_subscrp_delete - delete a specific subscription or all
199 * subscriptions for a given subscriber.
200 */
201static void tipc_subscrb_subscrp_delete(struct tipc_subscriber *subscriber,
202 struct tipc_subscr *s)
203{
204 struct list_head *subscription_list = &subscriber->subscrp_list;
205 struct tipc_subscription *sub, *temp;
206
207 spin_lock_bh(&subscriber->lock);
208 list_for_each_entry_safe(sub, temp, subscription_list, subscrp_list) {
209 if (s && memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr)))
210 continue;
211
212 tipc_nametbl_unsubscribe(sub);
213 tipc_subscrp_get(sub);
214 spin_unlock_bh(&subscriber->lock);
215 tipc_subscrp_delete(sub);
216 tipc_subscrp_put(sub);
217 spin_lock_bh(&subscriber->lock);
218
219 if (s)
220 break;
221 }
222 spin_unlock_bh(&subscriber->lock);
223}
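tipc_subscrb_subscrp_delete() drops subscriber->lock around tipc_subscrp_delete(), so the surrounding tipc_subscrp_get()/tipc_subscrp_put() pair pins the subscription while it is unlocked; the final put runs tipc_subscrp_kref_release(), which unlinks and frees it. A generic sketch of this kref idiom (struct foo and its helpers are hypothetical, not TIPC code):

#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo {
    struct kref kref;
    /* ... payload ... */
};

static void foo_release(struct kref *kref)
{
    kfree(container_of(kref, struct foo, kref));
}

static void foo_use_unlocked(struct foo *f, spinlock_t *lock)
{
    kref_get(&f->kref);             /* pin f across the unlocked region */
    spin_unlock_bh(lock);
    /* ... work that may drop the list's own reference to f ... */
    kref_put(&f->kref, foo_release);/* freed here only if this was the last ref */
    spin_lock_bh(lock);
}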
224
171static struct tipc_subscriber *tipc_subscrb_create(int conid) 225static struct tipc_subscriber *tipc_subscrb_create(int conid)
172{ 226{
173 struct tipc_subscriber *subscriber; 227 struct tipc_subscriber *subscriber;
@@ -177,8 +231,8 @@ static struct tipc_subscriber *tipc_subscrb_create(int conid)
177 pr_warn("Subscriber rejected, no memory\n"); 231 pr_warn("Subscriber rejected, no memory\n");
178 return NULL; 232 return NULL;
179 } 233 }
180 kref_init(&subscriber->kref);
181 INIT_LIST_HEAD(&subscriber->subscrp_list); 234 INIT_LIST_HEAD(&subscriber->subscrp_list);
235 kref_init(&subscriber->kref);
182 subscriber->conid = conid; 236 subscriber->conid = conid;
183 spin_lock_init(&subscriber->lock); 237 spin_lock_init(&subscriber->lock);
184 238
@@ -187,55 +241,22 @@ static struct tipc_subscriber *tipc_subscrb_create(int conid)
187 241
188static void tipc_subscrb_delete(struct tipc_subscriber *subscriber) 242static void tipc_subscrb_delete(struct tipc_subscriber *subscriber)
189{ 243{
190 struct tipc_subscription *sub, *temp; 244 tipc_subscrb_subscrp_delete(subscriber, NULL);
191 u32 timeout;
192
193 spin_lock_bh(&subscriber->lock);
194 /* Destroy any existing subscriptions for subscriber */
195 list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list,
196 subscrp_list) {
197 timeout = htohl(sub->evt.s.timeout, sub->swap);
198 if ((timeout == TIPC_WAIT_FOREVER) || del_timer(&sub->timer)) {
199 tipc_subscrp_delete(sub);
200 tipc_subscrb_put(subscriber);
201 }
202 }
203 spin_unlock_bh(&subscriber->lock);
204
205 tipc_subscrb_put(subscriber); 245 tipc_subscrb_put(subscriber);
206} 246}
207 247
208static void tipc_subscrp_delete(struct tipc_subscription *sub) 248static void tipc_subscrp_delete(struct tipc_subscription *sub)
209{ 249{
210 struct tipc_net *tn = net_generic(sub->net, tipc_net_id); 250 u32 timeout = htohl(sub->evt.s.timeout, sub->swap);
211 251
212 tipc_nametbl_unsubscribe(sub); 252 if (timeout == TIPC_WAIT_FOREVER || del_timer(&sub->timer))
213 list_del(&sub->subscrp_list); 253 tipc_subscrp_put(sub);
214 kfree(sub);
215 atomic_dec(&tn->subscription_count);
216} 254}
217 255
218static void tipc_subscrp_cancel(struct tipc_subscr *s, 256static void tipc_subscrp_cancel(struct tipc_subscr *s,
219 struct tipc_subscriber *subscriber) 257 struct tipc_subscriber *subscriber)
220{ 258{
221 struct tipc_subscription *sub, *temp; 259 tipc_subscrb_subscrp_delete(subscriber, s);
222 u32 timeout;
223
224 spin_lock_bh(&subscriber->lock);
225 /* Find first matching subscription, exit if not found */
226 list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list,
227 subscrp_list) {
228 if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) {
229 timeout = htohl(sub->evt.s.timeout, sub->swap);
230 if ((timeout == TIPC_WAIT_FOREVER) ||
231 del_timer(&sub->timer)) {
232 tipc_subscrp_delete(sub);
233 tipc_subscrb_put(subscriber);
234 }
235 break;
236 }
237 }
238 spin_unlock_bh(&subscriber->lock);
239} 260}
240 261
241static struct tipc_subscription *tipc_subscrp_create(struct net *net, 262static struct tipc_subscription *tipc_subscrp_create(struct net *net,
@@ -272,6 +293,7 @@ static struct tipc_subscription *tipc_subscrp_create(struct net *net,
272 sub->swap = swap; 293 sub->swap = swap;
273 memcpy(&sub->evt.s, s, sizeof(*s)); 294 memcpy(&sub->evt.s, s, sizeof(*s));
274 atomic_inc(&tn->subscription_count); 295 atomic_inc(&tn->subscription_count);
296 kref_init(&sub->kref);
275 return sub; 297 return sub;
276} 298}
277 299
@@ -288,17 +310,16 @@ static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s,
288 310
289 spin_lock_bh(&subscriber->lock); 311 spin_lock_bh(&subscriber->lock);
290 list_add(&sub->subscrp_list, &subscriber->subscrp_list); 312 list_add(&sub->subscrp_list, &subscriber->subscrp_list);
291 tipc_subscrb_get(subscriber);
292 sub->subscriber = subscriber; 313 sub->subscriber = subscriber;
293 tipc_nametbl_subscribe(sub); 314 tipc_nametbl_subscribe(sub);
315 tipc_subscrb_get(subscriber);
294 spin_unlock_bh(&subscriber->lock); 316 spin_unlock_bh(&subscriber->lock);
295 317
318 setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub);
296 timeout = htohl(sub->evt.s.timeout, swap); 319 timeout = htohl(sub->evt.s.timeout, swap);
297 if (timeout == TIPC_WAIT_FOREVER)
298 return;
299 320
300 setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub); 321 if (timeout != TIPC_WAIT_FOREVER)
301 mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)); 322 mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout));
302} 323}
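Note that setup_timer() is now called unconditionally, before the TIPC_WAIT_FOREVER check, so every subscription ends up with an initialized timer and the del_timer() in the new tipc_subscrp_delete() is safe even for never-expiring subscriptions; only the arming via mod_timer() stays conditional. The general pattern, with hypothetical placeholders (struct foo, foo_timeout()):

#include <linux/timer.h>
#include <linux/jiffies.h>

struct foo {
    struct timer_list timer;
};

static void foo_timeout(unsigned long data)
{
    /* struct foo *f = (struct foo *)data; ... */
}

static void foo_start(struct foo *f, unsigned int timeout_ms, int forever)
{
    setup_timer(&f->timer, foo_timeout, (unsigned long)f);
    if (!forever)
        mod_timer(&f->timer, jiffies + msecs_to_jiffies(timeout_ms));
    /* later: del_timer(&f->timer) is valid whether or not it was armed */
}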
303 324
304/* Handle one termination request for the subscriber */ 325/* Handle one termination request for the subscriber */
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index be60103082c9..ffdc214c117a 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -57,6 +57,7 @@ struct tipc_subscriber;
57 * @evt: template for events generated by subscription 57 * @evt: template for events generated by subscription
58 */ 58 */
59struct tipc_subscription { 59struct tipc_subscription {
60 struct kref kref;
60 struct tipc_subscriber *subscriber; 61 struct tipc_subscriber *subscriber;
61 struct net *net; 62 struct net *net;
62 struct timer_list timer; 63 struct timer_list timer;
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index b58dc95f3d35..46061cf48cd1 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -113,7 +113,7 @@ static void tipc_udp_media_addr_set(struct tipc_media_addr *addr,
113 memcpy(addr->value, ua, sizeof(struct udp_media_addr)); 113 memcpy(addr->value, ua, sizeof(struct udp_media_addr));
114 114
115 if (tipc_udp_is_mcast_addr(ua)) 115 if (tipc_udp_is_mcast_addr(ua))
116 addr->broadcast = 1; 116 addr->broadcast = TIPC_BROADCAST_SUPPORT;
117} 117}
118 118
119/* tipc_udp_addr2str - convert ip/udp address to string */ 119/* tipc_udp_addr2str - convert ip/udp address to string */
@@ -229,7 +229,7 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
229 goto out; 229 goto out;
230 } 230 }
231 231
232 if (!addr->broadcast || list_empty(&ub->rcast.list)) 232 if (addr->broadcast != TIPC_REPLICAST_SUPPORT)
233 return tipc_udp_xmit(net, skb, ub, src, dst); 233 return tipc_udp_xmit(net, skb, ub, src, dst);
234 234
235 /* Replicast, send an skb to each configured IP address */ 235 /* Replicast, send an skb to each configured IP address */
@@ -296,7 +296,7 @@ static int tipc_udp_rcast_add(struct tipc_bearer *b,
296 else if (ntohs(addr->proto) == ETH_P_IPV6) 296 else if (ntohs(addr->proto) == ETH_P_IPV6)
297 pr_info("New replicast peer: %pI6\n", &rcast->addr.ipv6); 297 pr_info("New replicast peer: %pI6\n", &rcast->addr.ipv6);
298#endif 298#endif
299 299 b->bcast_addr.broadcast = TIPC_REPLICAST_SUPPORT;
300 list_add_rcu(&rcast->list, &ub->rcast.list); 300 list_add_rcu(&rcast->list, &ub->rcast.list);
301 return 0; 301 return 0;
302} 302}
@@ -681,7 +681,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
681 goto err; 681 goto err;
682 682
683 b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP; 683 b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP;
684 b->bcast_addr.broadcast = 1; 684 b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT;
685 rcu_assign_pointer(b->media_ptr, ub); 685 rcu_assign_pointer(b->media_ptr, ub);
686 rcu_assign_pointer(ub->bearer, b); 686 rcu_assign_pointer(ub->bearer, b);
687 tipc_udp_media_addr_set(&b->addr, &local); 687 tipc_udp_media_addr_set(&b->addr, &local);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 2358f2690ec5..928691c43408 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -85,7 +85,7 @@
85#include <linux/module.h> 85#include <linux/module.h>
86#include <linux/kernel.h> 86#include <linux/kernel.h>
87#include <linux/signal.h> 87#include <linux/signal.h>
88#include <linux/sched.h> 88#include <linux/sched/signal.h>
89#include <linux/errno.h> 89#include <linux/errno.h>
90#include <linux/string.h> 90#include <linux/string.h>
91#include <linux/stat.h> 91#include <linux/stat.h>
@@ -100,7 +100,7 @@
100#include <linux/in.h> 100#include <linux/in.h>
101#include <linux/fs.h> 101#include <linux/fs.h>
102#include <linux/slab.h> 102#include <linux/slab.h>
103#include <asm/uaccess.h> 103#include <linux/uaccess.h>
104#include <linux/skbuff.h> 104#include <linux/skbuff.h>
105#include <linux/netdevice.h> 105#include <linux/netdevice.h>
106#include <net/net_namespace.h> 106#include <net/net_namespace.h>
@@ -117,6 +117,7 @@
117#include <net/checksum.h> 117#include <net/checksum.h>
118#include <linux/security.h> 118#include <linux/security.h>
119#include <linux/freezer.h> 119#include <linux/freezer.h>
120#include <linux/file.h>
120 121
121struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; 122struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
122EXPORT_SYMBOL_GPL(unix_socket_table); 123EXPORT_SYMBOL_GPL(unix_socket_table);
@@ -315,7 +316,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
315 &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { 316 &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
316 struct dentry *dentry = unix_sk(s)->path.dentry; 317 struct dentry *dentry = unix_sk(s)->path.dentry;
317 318
318 if (dentry && d_real_inode(dentry) == i) { 319 if (dentry && d_backing_inode(dentry) == i) {
319 sock_hold(s); 320 sock_hold(s);
320 goto found; 321 goto found;
321 } 322 }
@@ -635,7 +636,7 @@ static int unix_bind(struct socket *, struct sockaddr *, int);
635static int unix_stream_connect(struct socket *, struct sockaddr *, 636static int unix_stream_connect(struct socket *, struct sockaddr *,
636 int addr_len, int flags); 637 int addr_len, int flags);
637static int unix_socketpair(struct socket *, struct socket *); 638static int unix_socketpair(struct socket *, struct socket *);
638static int unix_accept(struct socket *, struct socket *, int); 639static int unix_accept(struct socket *, struct socket *, int, bool);
639static int unix_getname(struct socket *, struct sockaddr *, int *, int); 640static int unix_getname(struct socket *, struct sockaddr *, int *, int);
640static unsigned int unix_poll(struct file *, struct socket *, poll_table *); 641static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
641static unsigned int unix_dgram_poll(struct file *, struct socket *, 642static unsigned int unix_dgram_poll(struct file *, struct socket *,
@@ -913,7 +914,7 @@ static struct sock *unix_find_other(struct net *net,
913 err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path); 914 err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
914 if (err) 915 if (err)
915 goto fail; 916 goto fail;
916 inode = d_real_inode(path.dentry); 917 inode = d_backing_inode(path.dentry);
917 err = inode_permission(inode, MAY_WRITE); 918 err = inode_permission(inode, MAY_WRITE);
918 if (err) 919 if (err)
919 goto put_fail; 920 goto put_fail;
@@ -995,6 +996,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
995 unsigned int hash; 996 unsigned int hash;
996 struct unix_address *addr; 997 struct unix_address *addr;
997 struct hlist_head *list; 998 struct hlist_head *list;
999 struct path path = { NULL, NULL };
998 1000
999 err = -EINVAL; 1001 err = -EINVAL;
1000 if (sunaddr->sun_family != AF_UNIX) 1002 if (sunaddr->sun_family != AF_UNIX)
@@ -1010,9 +1012,20 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1010 goto out; 1012 goto out;
1011 addr_len = err; 1013 addr_len = err;
1012 1014
1015 if (sun_path[0]) {
1016 umode_t mode = S_IFSOCK |
1017 (SOCK_INODE(sock)->i_mode & ~current_umask());
1018 err = unix_mknod(sun_path, mode, &path);
1019 if (err) {
1020 if (err == -EEXIST)
1021 err = -EADDRINUSE;
1022 goto out;
1023 }
1024 }
1025
1013 err = mutex_lock_interruptible(&u->bindlock); 1026 err = mutex_lock_interruptible(&u->bindlock);
1014 if (err) 1027 if (err)
1015 goto out; 1028 goto out_put;
1016 1029
1017 err = -EINVAL; 1030 err = -EINVAL;
1018 if (u->addr) 1031 if (u->addr)
@@ -1029,18 +1042,8 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1029 atomic_set(&addr->refcnt, 1); 1042 atomic_set(&addr->refcnt, 1);
1030 1043
1031 if (sun_path[0]) { 1044 if (sun_path[0]) {
1032 struct path path;
1033 umode_t mode = S_IFSOCK |
1034 (SOCK_INODE(sock)->i_mode & ~current_umask());
1035 err = unix_mknod(sun_path, mode, &path);
1036 if (err) {
1037 if (err == -EEXIST)
1038 err = -EADDRINUSE;
1039 unix_release_addr(addr);
1040 goto out_up;
1041 }
1042 addr->hash = UNIX_HASH_SIZE; 1045 addr->hash = UNIX_HASH_SIZE;
1043 hash = d_real_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); 1046 hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
1044 spin_lock(&unix_table_lock); 1047 spin_lock(&unix_table_lock);
1045 u->path = path; 1048 u->path = path;
1046 list = &unix_socket_table[hash]; 1049 list = &unix_socket_table[hash];
@@ -1065,6 +1068,9 @@ out_unlock:
1065 spin_unlock(&unix_table_lock); 1068 spin_unlock(&unix_table_lock);
1066out_up: 1069out_up:
1067 mutex_unlock(&u->bindlock); 1070 mutex_unlock(&u->bindlock);
1071out_put:
1072 if (err)
1073 path_put(&path);
1068out: 1074out:
1069 return err; 1075 return err;
1070} 1076}
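The bind() rework above pulls unix_mknod() out from under u->bindlock: the filesystem node is created first, the lock is taken afterwards, and the new out_put label drops the path on any later failure, presumably so that filesystem locks are never taken while bindlock is held. The resulting shape, reduced to a sketch with hypothetical helpers (struct unix_ctx, make_socket_node(), publish_address()):

#include <linux/mutex.h>
#include <linux/path.h>
#include <linux/types.h>

struct unix_ctx {
    struct mutex bindlock;
    /* ... */
};

int make_socket_node(const char *sun_path, umode_t mode, struct path *path);
int publish_address(struct unix_ctx *ctx, struct path *path);

static int bind_create_first(struct unix_ctx *ctx, const char *sun_path,
                             umode_t mode)
{
    struct path path = { NULL, NULL };
    int err;

    err = make_socket_node(sun_path, mode, &path);  /* may take fs locks */
    if (err)
        return err == -EEXIST ? -EADDRINUSE : err;

    err = mutex_lock_interruptible(&ctx->bindlock);
    if (err)
        goto out_put;

    err = publish_address(ctx, &path);              /* keeps path on success */
    mutex_unlock(&ctx->bindlock);
out_put:
    if (err)
        path_put(&path);
    return err;
}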
@@ -1396,7 +1402,8 @@ static void unix_sock_inherit_flags(const struct socket *old,
1396 set_bit(SOCK_PASSSEC, &new->flags); 1402 set_bit(SOCK_PASSSEC, &new->flags);
1397} 1403}
1398 1404
1399static int unix_accept(struct socket *sock, struct socket *newsock, int flags) 1405static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1406 bool kern)
1400{ 1407{
1401 struct sock *sk = sock->sk; 1408 struct sock *sk = sock->sk;
1402 struct sock *tsk; 1409 struct sock *tsk;
@@ -2113,8 +2120,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
2113 mutex_lock(&u->iolock); 2120 mutex_lock(&u->iolock);
2114 2121
2115 skip = sk_peek_offset(sk, flags); 2122 skip = sk_peek_offset(sk, flags);
2116 skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err, 2123 skb = __skb_try_recv_datagram(sk, flags, NULL, &peeked, &skip,
2117 &last); 2124 &err, &last);
2118 if (skb) 2125 if (skb)
2119 break; 2126 break;
2120 2127
@@ -2587,6 +2594,43 @@ long unix_outq_len(struct sock *sk)
2587} 2594}
2588EXPORT_SYMBOL_GPL(unix_outq_len); 2595EXPORT_SYMBOL_GPL(unix_outq_len);
2589 2596
2597static int unix_open_file(struct sock *sk)
2598{
2599 struct path path;
2600 struct file *f;
2601 int fd;
2602
2603 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2604 return -EPERM;
2605
2606 unix_state_lock(sk);
2607 path = unix_sk(sk)->path;
2608 if (!path.dentry) {
2609 unix_state_unlock(sk);
2610 return -ENOENT;
2611 }
2612
2613 path_get(&path);
2614 unix_state_unlock(sk);
2615
2616 fd = get_unused_fd_flags(O_CLOEXEC);
2617 if (fd < 0)
2618 goto out;
2619
2620 f = dentry_open(&path, O_PATH, current_cred());
2621 if (IS_ERR(f)) {
2622 put_unused_fd(fd);
2623 fd = PTR_ERR(f);
2624 goto out;
2625 }
2626
2627 fd_install(fd, f);
2628out:
2629 path_put(&path);
2630
2631 return fd;
2632}
2633
2590static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 2634static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2591{ 2635{
2592 struct sock *sk = sock->sk; 2636 struct sock *sk = sock->sk;
@@ -2605,6 +2649,9 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2605 else 2649 else
2606 err = put_user(amount, (int __user *)arg); 2650 err = put_user(amount, (int __user *)arg);
2607 break; 2651 break;
2652 case SIOCUNIXFILE:
2653 err = unix_open_file(sk);
2654 break;
2608 default: 2655 default:
2609 err = -ENOIOCTLCMD; 2656 err = -ENOIOCTLCMD;
2610 break; 2657 break;
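The new SIOCUNIXFILE case exposes unix_open_file(), which hands the caller an O_PATH descriptor for the inode a bound AF_UNIX socket refers to (CAP_NET_ADMIN is required). Assuming the matching SIOCUNIXFILE definition in the uapi headers, user space could use it roughly like this (sketch):

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>

/* returns an O_PATH fd for the socket's filesystem object, or a negative value */
static int unix_socket_path_fd(int unix_sd)
{
    int fd = ioctl(unix_sd, SIOCUNIXFILE, 0);

    if (fd < 0)
        perror("ioctl(SIOCUNIXFILE)");
    return fd;      /* usable with fstat() or /proc/self/fd/ lookups */
}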
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 6a0d48525fcf..c36757e72844 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -146,6 +146,7 @@ void unix_notinflight(struct user_struct *user, struct file *fp)
146 if (s) { 146 if (s) {
147 struct unix_sock *u = unix_sk(s); 147 struct unix_sock *u = unix_sk(s);
148 148
149 BUG_ON(!atomic_long_read(&u->inflight));
149 BUG_ON(list_empty(&u->link)); 150 BUG_ON(list_empty(&u->link));
150 151
151 if (atomic_long_dec_and_test(&u->inflight)) 152 if (atomic_long_dec_and_test(&u->inflight))
@@ -341,6 +342,14 @@ void unix_gc(void)
341 } 342 }
342 list_del(&cursor); 343 list_del(&cursor);
343 344
345 /* Now gc_candidates contains only garbage. Restore original
346 * inflight counters for these as well, and remove the skbuffs
347 * which are creating the cycle(s).
348 */
349 skb_queue_head_init(&hitlist);
350 list_for_each_entry(u, &gc_candidates, link)
351 scan_children(&u->sk, inc_inflight, &hitlist);
352
344 /* not_cycle_list contains those sockets which do not make up a 353 /* not_cycle_list contains those sockets which do not make up a
345 * cycle. Restore these to the inflight list. 354 * cycle. Restore these to the inflight list.
346 */ 355 */
@@ -350,14 +359,6 @@ void unix_gc(void)
350 list_move_tail(&u->link, &gc_inflight_list); 359 list_move_tail(&u->link, &gc_inflight_list);
351 } 360 }
352 361
353 /* Now gc_candidates contains only garbage. Restore original
354 * inflight counters for these as well, and remove the skbuffs
355 * which are creating the cycle(s).
356 */
357 skb_queue_head_init(&hitlist);
358 list_for_each_entry(u, &gc_candidates, link)
359 scan_children(&u->sk, inc_inflight, &hitlist);
360
361 spin_unlock(&unix_gc_lock); 362 spin_unlock(&unix_gc_lock);
362 363
363 /* Here we are. Hitlist is filled. Die. */ 364 /* Here we are. Hitlist is filled. Die. */
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 8a398b3fb532..6f7f6757ceef 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -90,6 +90,7 @@
90#include <linux/init.h> 90#include <linux/init.h>
91#include <linux/io.h> 91#include <linux/io.h>
92#include <linux/kernel.h> 92#include <linux/kernel.h>
93#include <linux/sched/signal.h>
93#include <linux/kmod.h> 94#include <linux/kmod.h>
94#include <linux/list.h> 95#include <linux/list.h>
95#include <linux/miscdevice.h> 96#include <linux/miscdevice.h>
@@ -1101,10 +1102,19 @@ static const struct proto_ops vsock_dgram_ops = {
1101 .sendpage = sock_no_sendpage, 1102 .sendpage = sock_no_sendpage,
1102}; 1103};
1103 1104
1105static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
1106{
1107 if (!transport->cancel_pkt)
1108 return -EOPNOTSUPP;
1109
1110 return transport->cancel_pkt(vsk);
1111}
1112
1104static void vsock_connect_timeout(struct work_struct *work) 1113static void vsock_connect_timeout(struct work_struct *work)
1105{ 1114{
1106 struct sock *sk; 1115 struct sock *sk;
1107 struct vsock_sock *vsk; 1116 struct vsock_sock *vsk;
1117 int cancel = 0;
1108 1118
1109 vsk = container_of(work, struct vsock_sock, dwork.work); 1119 vsk = container_of(work, struct vsock_sock, dwork.work);
1110 sk = sk_vsock(vsk); 1120 sk = sk_vsock(vsk);
@@ -1115,8 +1125,11 @@ static void vsock_connect_timeout(struct work_struct *work)
1115 sk->sk_state = SS_UNCONNECTED; 1125 sk->sk_state = SS_UNCONNECTED;
1116 sk->sk_err = ETIMEDOUT; 1126 sk->sk_err = ETIMEDOUT;
1117 sk->sk_error_report(sk); 1127 sk->sk_error_report(sk);
1128 cancel = 1;
1118 } 1129 }
1119 release_sock(sk); 1130 release_sock(sk);
1131 if (cancel)
1132 vsock_transport_cancel_pkt(vsk);
1120 1133
1121 sock_put(sk); 1134 sock_put(sk);
1122} 1135}
@@ -1223,11 +1236,13 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
1223 err = sock_intr_errno(timeout); 1236 err = sock_intr_errno(timeout);
1224 sk->sk_state = SS_UNCONNECTED; 1237 sk->sk_state = SS_UNCONNECTED;
1225 sock->state = SS_UNCONNECTED; 1238 sock->state = SS_UNCONNECTED;
1239 vsock_transport_cancel_pkt(vsk);
1226 goto out_wait; 1240 goto out_wait;
1227 } else if (timeout == 0) { 1241 } else if (timeout == 0) {
1228 err = -ETIMEDOUT; 1242 err = -ETIMEDOUT;
1229 sk->sk_state = SS_UNCONNECTED; 1243 sk->sk_state = SS_UNCONNECTED;
1230 sock->state = SS_UNCONNECTED; 1244 sock->state = SS_UNCONNECTED;
1245 vsock_transport_cancel_pkt(vsk);
1231 goto out_wait; 1246 goto out_wait;
1232 } 1247 }
1233 1248
@@ -1249,7 +1264,8 @@ out:
1249 return err; 1264 return err;
1250} 1265}
1251 1266
1252static int vsock_accept(struct socket *sock, struct socket *newsock, int flags) 1267static int vsock_accept(struct socket *sock, struct socket *newsock, int flags,
1268 bool kern)
1253{ 1269{
1254 struct sock *listener; 1270 struct sock *listener;
1255 int err; 1271 int err;
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 936d7eee62d0..68675a151f22 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -44,6 +44,10 @@ struct virtio_vsock {
44 spinlock_t send_pkt_list_lock; 44 spinlock_t send_pkt_list_lock;
45 struct list_head send_pkt_list; 45 struct list_head send_pkt_list;
46 46
47 struct work_struct loopback_work;
48 spinlock_t loopback_list_lock; /* protects loopback_list */
49 struct list_head loopback_list;
50
47 atomic_t queued_replies; 51 atomic_t queued_replies;
48 52
49 /* The following fields are protected by rx_lock. vqs[VSOCK_VQ_RX] 53 /* The following fields are protected by rx_lock. vqs[VSOCK_VQ_RX]
@@ -74,6 +78,42 @@ static u32 virtio_transport_get_local_cid(void)
74 return vsock->guest_cid; 78 return vsock->guest_cid;
75} 79}
76 80
81static void virtio_transport_loopback_work(struct work_struct *work)
82{
83 struct virtio_vsock *vsock =
84 container_of(work, struct virtio_vsock, loopback_work);
85 LIST_HEAD(pkts);
86
87 spin_lock_bh(&vsock->loopback_list_lock);
88 list_splice_init(&vsock->loopback_list, &pkts);
89 spin_unlock_bh(&vsock->loopback_list_lock);
90
91 mutex_lock(&vsock->rx_lock);
92 while (!list_empty(&pkts)) {
93 struct virtio_vsock_pkt *pkt;
94
95 pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list);
96 list_del_init(&pkt->list);
97
98 virtio_transport_recv_pkt(pkt);
99 }
100 mutex_unlock(&vsock->rx_lock);
101}
102
103static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock,
104 struct virtio_vsock_pkt *pkt)
105{
106 int len = pkt->len;
107
108 spin_lock_bh(&vsock->loopback_list_lock);
109 list_add_tail(&pkt->list, &vsock->loopback_list);
110 spin_unlock_bh(&vsock->loopback_list_lock);
111
112 queue_work(virtio_vsock_workqueue, &vsock->loopback_work);
113
114 return len;
115}
116
77static void 117static void
78virtio_transport_send_pkt_work(struct work_struct *work) 118virtio_transport_send_pkt_work(struct work_struct *work)
79{ 119{
@@ -159,6 +199,9 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
159 return -ENODEV; 199 return -ENODEV;
160 } 200 }
161 201
202 if (le32_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid)
203 return virtio_transport_send_pkt_loopback(vsock, pkt);
204
162 if (pkt->reply) 205 if (pkt->reply)
163 atomic_inc(&vsock->queued_replies); 206 atomic_inc(&vsock->queued_replies);
164 207
@@ -170,6 +213,47 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
170 return len; 213 return len;
171} 214}
172 215
216static int
217virtio_transport_cancel_pkt(struct vsock_sock *vsk)
218{
219 struct virtio_vsock *vsock;
220 struct virtio_vsock_pkt *pkt, *n;
221 int cnt = 0;
222 LIST_HEAD(freeme);
223
224 vsock = virtio_vsock_get();
225 if (!vsock) {
226 return -ENODEV;
227 }
228
229 spin_lock_bh(&vsock->send_pkt_list_lock);
230 list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) {
231 if (pkt->vsk != vsk)
232 continue;
233 list_move(&pkt->list, &freeme);
234 }
235 spin_unlock_bh(&vsock->send_pkt_list_lock);
236
237 list_for_each_entry_safe(pkt, n, &freeme, list) {
238 if (pkt->reply)
239 cnt++;
240 list_del(&pkt->list);
241 virtio_transport_free_pkt(pkt);
242 }
243
244 if (cnt) {
245 struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX];
246 int new_cnt;
247
248 new_cnt = atomic_sub_return(cnt, &vsock->queued_replies);
249 if (new_cnt + cnt >= virtqueue_get_vring_size(rx_vq) &&
250 new_cnt < virtqueue_get_vring_size(rx_vq))
251 queue_work(virtio_vsock_workqueue, &vsock->rx_work);
252 }
253
254 return 0;
255}
256
173static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) 257static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
174{ 258{
175 int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; 259 int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
@@ -336,7 +420,7 @@ static void virtio_vsock_reset_sock(struct sock *sk)
336static void virtio_vsock_update_guest_cid(struct virtio_vsock *vsock) 420static void virtio_vsock_update_guest_cid(struct virtio_vsock *vsock)
337{ 421{
338 struct virtio_device *vdev = vsock->vdev; 422 struct virtio_device *vdev = vsock->vdev;
339 u64 guest_cid; 423 __le64 guest_cid;
340 424
341 vdev->config->get(vdev, offsetof(struct virtio_vsock_config, guest_cid), 425 vdev->config->get(vdev, offsetof(struct virtio_vsock_config, guest_cid),
342 &guest_cid, sizeof(guest_cid)); 426 &guest_cid, sizeof(guest_cid));
@@ -419,6 +503,7 @@ static struct virtio_transport virtio_transport = {
419 .release = virtio_transport_release, 503 .release = virtio_transport_release,
420 .connect = virtio_transport_connect, 504 .connect = virtio_transport_connect,
421 .shutdown = virtio_transport_shutdown, 505 .shutdown = virtio_transport_shutdown,
506 .cancel_pkt = virtio_transport_cancel_pkt,
422 507
423 .dgram_bind = virtio_transport_dgram_bind, 508 .dgram_bind = virtio_transport_dgram_bind,
424 .dgram_dequeue = virtio_transport_dgram_dequeue, 509 .dgram_dequeue = virtio_transport_dgram_dequeue,
@@ -489,7 +574,8 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
489 vsock->vdev = vdev; 574 vsock->vdev = vdev;
490 575
491 ret = vsock->vdev->config->find_vqs(vsock->vdev, VSOCK_VQ_MAX, 576 ret = vsock->vdev->config->find_vqs(vsock->vdev, VSOCK_VQ_MAX,
492 vsock->vqs, callbacks, names); 577 vsock->vqs, callbacks, names,
578 NULL);
493 if (ret < 0) 579 if (ret < 0)
494 goto out; 580 goto out;
495 581
@@ -510,10 +596,13 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
510 mutex_init(&vsock->event_lock); 596 mutex_init(&vsock->event_lock);
511 spin_lock_init(&vsock->send_pkt_list_lock); 597 spin_lock_init(&vsock->send_pkt_list_lock);
512 INIT_LIST_HEAD(&vsock->send_pkt_list); 598 INIT_LIST_HEAD(&vsock->send_pkt_list);
599 spin_lock_init(&vsock->loopback_list_lock);
600 INIT_LIST_HEAD(&vsock->loopback_list);
513 INIT_WORK(&vsock->rx_work, virtio_transport_rx_work); 601 INIT_WORK(&vsock->rx_work, virtio_transport_rx_work);
514 INIT_WORK(&vsock->tx_work, virtio_transport_tx_work); 602 INIT_WORK(&vsock->tx_work, virtio_transport_tx_work);
515 INIT_WORK(&vsock->event_work, virtio_transport_event_work); 603 INIT_WORK(&vsock->event_work, virtio_transport_event_work);
516 INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work); 604 INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work);
605 INIT_WORK(&vsock->loopback_work, virtio_transport_loopback_work);
517 606
518 mutex_lock(&vsock->rx_lock); 607 mutex_lock(&vsock->rx_lock);
519 virtio_vsock_rx_fill(vsock); 608 virtio_vsock_rx_fill(vsock);
@@ -539,6 +628,7 @@ static void virtio_vsock_remove(struct virtio_device *vdev)
539 struct virtio_vsock *vsock = vdev->priv; 628 struct virtio_vsock *vsock = vdev->priv;
540 struct virtio_vsock_pkt *pkt; 629 struct virtio_vsock_pkt *pkt;
541 630
631 flush_work(&vsock->loopback_work);
542 flush_work(&vsock->rx_work); 632 flush_work(&vsock->rx_work);
543 flush_work(&vsock->tx_work); 633 flush_work(&vsock->tx_work);
544 flush_work(&vsock->event_work); 634 flush_work(&vsock->event_work);
@@ -565,6 +655,15 @@ static void virtio_vsock_remove(struct virtio_device *vdev)
565 } 655 }
566 spin_unlock_bh(&vsock->send_pkt_list_lock); 656 spin_unlock_bh(&vsock->send_pkt_list_lock);
567 657
658 spin_lock_bh(&vsock->loopback_list_lock);
659 while (!list_empty(&vsock->loopback_list)) {
660 pkt = list_first_entry(&vsock->loopback_list,
661 struct virtio_vsock_pkt, list);
662 list_del(&pkt->list);
663 virtio_transport_free_pkt(pkt);
664 }
665 spin_unlock_bh(&vsock->loopback_list_lock);
666
568 mutex_lock(&the_virtio_vsock_mutex); 667 mutex_lock(&the_virtio_vsock_mutex);
569 the_virtio_vsock = NULL; 668 the_virtio_vsock = NULL;
570 vsock_core_exit(); 669 vsock_core_exit();
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index a53b3a16b4f1..af087b44ceea 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -9,6 +9,7 @@
9 */ 9 */
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sched/signal.h>
12#include <linux/ctype.h> 13#include <linux/ctype.h>
13#include <linux/list.h> 14#include <linux/list.h>
14#include <linux/virtio.h> 15#include <linux/virtio.h>
@@ -32,7 +33,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
32 return container_of(t, struct virtio_transport, transport); 33 return container_of(t, struct virtio_transport, transport);
33} 34}
34 35
35struct virtio_vsock_pkt * 36static struct virtio_vsock_pkt *
36virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, 37virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
37 size_t len, 38 size_t len,
38 u32 src_cid, 39 u32 src_cid,
@@ -57,6 +58,7 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
57 pkt->len = len; 58 pkt->len = len;
58 pkt->hdr.len = cpu_to_le32(len); 59 pkt->hdr.len = cpu_to_le32(len);
59 pkt->reply = info->reply; 60 pkt->reply = info->reply;
61 pkt->vsk = info->vsk;
60 62
61 if (info->msg && len > 0) { 63 if (info->msg && len > 0) {
62 pkt->buf = kmalloc(len, GFP_KERNEL); 64 pkt->buf = kmalloc(len, GFP_KERNEL);
@@ -82,7 +84,6 @@ out_pkt:
82 kfree(pkt); 84 kfree(pkt);
83 return NULL; 85 return NULL;
84} 86}
85EXPORT_SYMBOL_GPL(virtio_transport_alloc_pkt);
86 87
87static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, 88static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
88 struct virtio_vsock_pkt_info *info) 89 struct virtio_vsock_pkt_info *info)
@@ -180,6 +181,7 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
180 struct virtio_vsock_pkt_info info = { 181 struct virtio_vsock_pkt_info info = {
181 .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE, 182 .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE,
182 .type = type, 183 .type = type,
184 .vsk = vsk,
183 }; 185 };
184 186
185 return virtio_transport_send_pkt_info(vsk, &info); 187 return virtio_transport_send_pkt_info(vsk, &info);
@@ -519,6 +521,7 @@ int virtio_transport_connect(struct vsock_sock *vsk)
519 struct virtio_vsock_pkt_info info = { 521 struct virtio_vsock_pkt_info info = {
520 .op = VIRTIO_VSOCK_OP_REQUEST, 522 .op = VIRTIO_VSOCK_OP_REQUEST,
521 .type = VIRTIO_VSOCK_TYPE_STREAM, 523 .type = VIRTIO_VSOCK_TYPE_STREAM,
524 .vsk = vsk,
522 }; 525 };
523 526
524 return virtio_transport_send_pkt_info(vsk, &info); 527 return virtio_transport_send_pkt_info(vsk, &info);
@@ -534,6 +537,7 @@ int virtio_transport_shutdown(struct vsock_sock *vsk, int mode)
534 VIRTIO_VSOCK_SHUTDOWN_RCV : 0) | 537 VIRTIO_VSOCK_SHUTDOWN_RCV : 0) |
535 (mode & SEND_SHUTDOWN ? 538 (mode & SEND_SHUTDOWN ?
536 VIRTIO_VSOCK_SHUTDOWN_SEND : 0), 539 VIRTIO_VSOCK_SHUTDOWN_SEND : 0),
540 .vsk = vsk,
537 }; 541 };
538 542
539 return virtio_transport_send_pkt_info(vsk, &info); 543 return virtio_transport_send_pkt_info(vsk, &info);
@@ -560,6 +564,7 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk,
560 .type = VIRTIO_VSOCK_TYPE_STREAM, 564 .type = VIRTIO_VSOCK_TYPE_STREAM,
561 .msg = msg, 565 .msg = msg,
562 .pkt_len = len, 566 .pkt_len = len,
567 .vsk = vsk,
563 }; 568 };
564 569
565 return virtio_transport_send_pkt_info(vsk, &info); 570 return virtio_transport_send_pkt_info(vsk, &info);
@@ -581,6 +586,7 @@ static int virtio_transport_reset(struct vsock_sock *vsk,
581 .op = VIRTIO_VSOCK_OP_RST, 586 .op = VIRTIO_VSOCK_OP_RST,
582 .type = VIRTIO_VSOCK_TYPE_STREAM, 587 .type = VIRTIO_VSOCK_TYPE_STREAM,
583 .reply = !!pkt, 588 .reply = !!pkt,
589 .vsk = vsk,
584 }; 590 };
585 591
586 /* Send RST only if the original pkt is not a RST pkt */ 592 /* Send RST only if the original pkt is not a RST pkt */
@@ -606,9 +612,9 @@ static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
606 return 0; 612 return 0;
607 613
608 pkt = virtio_transport_alloc_pkt(&info, 0, 614 pkt = virtio_transport_alloc_pkt(&info, 0,
609 le32_to_cpu(pkt->hdr.dst_cid), 615 le64_to_cpu(pkt->hdr.dst_cid),
610 le32_to_cpu(pkt->hdr.dst_port), 616 le32_to_cpu(pkt->hdr.dst_port),
611 le32_to_cpu(pkt->hdr.src_cid), 617 le64_to_cpu(pkt->hdr.src_cid),
612 le32_to_cpu(pkt->hdr.src_port)); 618 le32_to_cpu(pkt->hdr.src_port));
613 if (!pkt) 619 if (!pkt)
614 return -ENOMEM; 620 return -ENOMEM;
@@ -619,17 +625,17 @@ static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
619static void virtio_transport_wait_close(struct sock *sk, long timeout) 625static void virtio_transport_wait_close(struct sock *sk, long timeout)
620{ 626{
621 if (timeout) { 627 if (timeout) {
622 DEFINE_WAIT(wait); 628 DEFINE_WAIT_FUNC(wait, woken_wake_function);
629
630 add_wait_queue(sk_sleep(sk), &wait);
623 631
624 do { 632 do {
625 prepare_to_wait(sk_sleep(sk), &wait,
626 TASK_INTERRUPTIBLE);
627 if (sk_wait_event(sk, &timeout, 633 if (sk_wait_event(sk, &timeout,
628 sock_flag(sk, SOCK_DONE))) 634 sock_flag(sk, SOCK_DONE), &wait))
629 break; 635 break;
630 } while (!signal_pending(current) && timeout); 636 } while (!signal_pending(current) && timeout);
631 637
632 finish_wait(sk_sleep(sk), &wait); 638 remove_wait_queue(sk_sleep(sk), &wait);
633 } 639 }
634} 640}
635 641
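The close-wait loop switches from the prepare_to_wait()/finish_wait() pairing to a woken_wake_function wait entry that stays on the socket's wait queue for the whole loop, and sk_wait_event() now receives that entry. With the wait/woken scheme a wakeup that lands between the condition check and the sleep is recorded in the entry and ends the next sleep immediately, so the loop cannot miss it. A condensed kernel-style sketch of the underlying idiom (sk_wait_event() additionally drops the socket lock around the sleep):

/* Sketch of the wait/woken idiom; condition and timeout are illustrative. */
static void wait_for_done(struct sock *sk, long timeout)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);

        add_wait_queue(sk_sleep(sk), &wait);
        while (!sock_flag(sk, SOCK_DONE) && timeout && !signal_pending(current))
                timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
        remove_wait_queue(sk_sleep(sk), &wait);
}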
@@ -823,9 +829,10 @@ virtio_transport_send_response(struct vsock_sock *vsk,
823 struct virtio_vsock_pkt_info info = { 829 struct virtio_vsock_pkt_info info = {
824 .op = VIRTIO_VSOCK_OP_RESPONSE, 830 .op = VIRTIO_VSOCK_OP_RESPONSE,
825 .type = VIRTIO_VSOCK_TYPE_STREAM, 831 .type = VIRTIO_VSOCK_TYPE_STREAM,
826 .remote_cid = le32_to_cpu(pkt->hdr.src_cid), 832 .remote_cid = le64_to_cpu(pkt->hdr.src_cid),
827 .remote_port = le32_to_cpu(pkt->hdr.src_port), 833 .remote_port = le32_to_cpu(pkt->hdr.src_port),
828 .reply = true, 834 .reply = true,
835 .vsk = vsk,
829 }; 836 };
830 837
831 return virtio_transport_send_pkt_info(vsk, &info); 838 return virtio_transport_send_pkt_info(vsk, &info);
@@ -863,9 +870,9 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt)
863 child->sk_state = SS_CONNECTED; 870 child->sk_state = SS_CONNECTED;
864 871
865 vchild = vsock_sk(child); 872 vchild = vsock_sk(child);
866 vsock_addr_init(&vchild->local_addr, le32_to_cpu(pkt->hdr.dst_cid), 873 vsock_addr_init(&vchild->local_addr, le64_to_cpu(pkt->hdr.dst_cid),
867 le32_to_cpu(pkt->hdr.dst_port)); 874 le32_to_cpu(pkt->hdr.dst_port));
868 vsock_addr_init(&vchild->remote_addr, le32_to_cpu(pkt->hdr.src_cid), 875 vsock_addr_init(&vchild->remote_addr, le64_to_cpu(pkt->hdr.src_cid),
869 le32_to_cpu(pkt->hdr.src_port)); 876 le32_to_cpu(pkt->hdr.src_port));
870 877
871 vsock_insert_connected(vchild); 878 vsock_insert_connected(vchild);
@@ -904,9 +911,9 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
904 struct sock *sk; 911 struct sock *sk;
905 bool space_available; 912 bool space_available;
906 913
907 vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid), 914 vsock_addr_init(&src, le64_to_cpu(pkt->hdr.src_cid),
908 le32_to_cpu(pkt->hdr.src_port)); 915 le32_to_cpu(pkt->hdr.src_port));
909 vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid), 916 vsock_addr_init(&dst, le64_to_cpu(pkt->hdr.dst_cid),
910 le32_to_cpu(pkt->hdr.dst_port)); 917 le32_to_cpu(pkt->hdr.dst_port));
911 918
912 trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port, 919 trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port,
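The hdr.src_cid/hdr.dst_cid accessors switch from le32_to_cpu() to le64_to_cpu(): the virtio-vsock header carries CIDs as 64-bit little-endian fields (matching the __le64 guest_cid change earlier), while ports stay 32-bit. Reading a little-endian 64-bit field with a 32-bit conversion silently truncates any CID that needs more than 32 bits. A small standalone demonstration, with portable byte-wise decoders standing in for le32_to_cpu()/le64_to_cpu():

#include <stdint.h>
#include <stdio.h>

/* Portable stand-ins for le32_to_cpu()/le64_to_cpu(): decode the field byte
 * by byte so the example behaves the same on any host endianness. */
static uint32_t get_le32(const uint8_t *p)
{
        return (uint32_t)p[0] | (uint32_t)p[1] << 8 |
               (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
}

static uint64_t get_le64(const uint8_t *p)
{
        return (uint64_t)get_le32(p) | (uint64_t)get_le32(p + 4) << 32;
}

int main(void)
{
        uint64_t cid = 0x100000003ULL;  /* a CID that needs more than 32 bits */
        uint8_t field[8];
        int i;

        for (i = 0; i < 8; i++)
                field[i] = (uint8_t)(cid >> (8 * i));

        printf("64-bit read: %llu\n", (unsigned long long)get_le64(field));
        printf("32-bit read: %llu\n", (unsigned long long)get_le32(field)); /* truncated */
        return 0;
}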
diff --git a/net/vmw_vsock/vmci_transport_notify.c b/net/vmw_vsock/vmci_transport_notify.c
index fd8cf0214d51..1406db4d97d1 100644
--- a/net/vmw_vsock/vmci_transport_notify.c
+++ b/net/vmw_vsock/vmci_transport_notify.c
@@ -662,19 +662,19 @@ static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk)
662 662
663/* Socket control packet based operations. */ 663/* Socket control packet based operations. */
664const struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops = { 664const struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops = {
665 vmci_transport_notify_pkt_socket_init, 665 .socket_init = vmci_transport_notify_pkt_socket_init,
666 vmci_transport_notify_pkt_socket_destruct, 666 .socket_destruct = vmci_transport_notify_pkt_socket_destruct,
667 vmci_transport_notify_pkt_poll_in, 667 .poll_in = vmci_transport_notify_pkt_poll_in,
668 vmci_transport_notify_pkt_poll_out, 668 .poll_out = vmci_transport_notify_pkt_poll_out,
669 vmci_transport_notify_pkt_handle_pkt, 669 .handle_notify_pkt = vmci_transport_notify_pkt_handle_pkt,
670 vmci_transport_notify_pkt_recv_init, 670 .recv_init = vmci_transport_notify_pkt_recv_init,
671 vmci_transport_notify_pkt_recv_pre_block, 671 .recv_pre_block = vmci_transport_notify_pkt_recv_pre_block,
672 vmci_transport_notify_pkt_recv_pre_dequeue, 672 .recv_pre_dequeue = vmci_transport_notify_pkt_recv_pre_dequeue,
673 vmci_transport_notify_pkt_recv_post_dequeue, 673 .recv_post_dequeue = vmci_transport_notify_pkt_recv_post_dequeue,
674 vmci_transport_notify_pkt_send_init, 674 .send_init = vmci_transport_notify_pkt_send_init,
675 vmci_transport_notify_pkt_send_pre_block, 675 .send_pre_block = vmci_transport_notify_pkt_send_pre_block,
676 vmci_transport_notify_pkt_send_pre_enqueue, 676 .send_pre_enqueue = vmci_transport_notify_pkt_send_pre_enqueue,
677 vmci_transport_notify_pkt_send_post_enqueue, 677 .send_post_enqueue = vmci_transport_notify_pkt_send_post_enqueue,
678 vmci_transport_notify_pkt_process_request, 678 .process_request = vmci_transport_notify_pkt_process_request,
679 vmci_transport_notify_pkt_process_negotiate, 679 .process_negotiate = vmci_transport_notify_pkt_process_negotiate,
680}; 680};
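This ops table (and the qstate table below) is converted from positional to designated initializers, so every callback is bound to a named member of struct vmci_transport_notify_ops; reordering or inserting members can no longer silently wire a handler into the wrong slot, and unnamed members default to NULL. A minimal standalone example of the same construct:

#include <stdio.h>

struct ops {
        int  (*init)(void);
        void (*poll_in)(void);
        void (*poll_out)(void);
};

static int  my_init(void)    { return 0; }
static void my_poll_in(void) { puts("in"); }

/* Designated initializers: members are matched by name, not position, and
 * anything not mentioned (poll_out here) is zero-initialized to NULL. */
static const struct ops my_ops = {
        .init    = my_init,
        .poll_in = my_poll_in,
};

int main(void)
{
        if (my_ops.init() == 0 && my_ops.poll_in)
                my_ops.poll_in();
        return 0;
}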
diff --git a/net/vmw_vsock/vmci_transport_notify_qstate.c b/net/vmw_vsock/vmci_transport_notify_qstate.c
index 21e591dafb03..f3a0afc46208 100644
--- a/net/vmw_vsock/vmci_transport_notify_qstate.c
+++ b/net/vmw_vsock/vmci_transport_notify_qstate.c
@@ -420,19 +420,19 @@ vmci_transport_notify_pkt_send_pre_enqueue(
420 420
421/* Socket always on control packet based operations. */ 421/* Socket always on control packet based operations. */
422const struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops = { 422const struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops = {
423 vmci_transport_notify_pkt_socket_init, 423 .socket_init = vmci_transport_notify_pkt_socket_init,
424 vmci_transport_notify_pkt_socket_destruct, 424 .socket_destruct = vmci_transport_notify_pkt_socket_destruct,
425 vmci_transport_notify_pkt_poll_in, 425 .poll_in = vmci_transport_notify_pkt_poll_in,
426 vmci_transport_notify_pkt_poll_out, 426 .poll_out = vmci_transport_notify_pkt_poll_out,
427 vmci_transport_notify_pkt_handle_pkt, 427 .handle_notify_pkt = vmci_transport_notify_pkt_handle_pkt,
428 vmci_transport_notify_pkt_recv_init, 428 .recv_init = vmci_transport_notify_pkt_recv_init,
429 vmci_transport_notify_pkt_recv_pre_block, 429 .recv_pre_block = vmci_transport_notify_pkt_recv_pre_block,
430 vmci_transport_notify_pkt_recv_pre_dequeue, 430 .recv_pre_dequeue = vmci_transport_notify_pkt_recv_pre_dequeue,
431 vmci_transport_notify_pkt_recv_post_dequeue, 431 .recv_post_dequeue = vmci_transport_notify_pkt_recv_post_dequeue,
432 vmci_transport_notify_pkt_send_init, 432 .send_init = vmci_transport_notify_pkt_send_init,
433 vmci_transport_notify_pkt_send_pre_block, 433 .send_pre_block = vmci_transport_notify_pkt_send_pre_block,
434 vmci_transport_notify_pkt_send_pre_enqueue, 434 .send_pre_enqueue = vmci_transport_notify_pkt_send_pre_enqueue,
435 vmci_transport_notify_pkt_send_post_enqueue, 435 .send_post_enqueue = vmci_transport_notify_pkt_send_post_enqueue,
436 vmci_transport_notify_pkt_process_request, 436 .process_request = vmci_transport_notify_pkt_process_request,
437 vmci_transport_notify_pkt_process_negotiate, 437 .process_negotiate = vmci_transport_notify_pkt_process_negotiate,
438}; 438};
diff --git a/net/wimax/stack.c b/net/wimax/stack.c
index 3f816e2971ee..5db731512014 100644
--- a/net/wimax/stack.c
+++ b/net/wimax/stack.c
@@ -572,16 +572,20 @@ struct d_level D_LEVEL[] = {
572size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); 572size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL);
573 573
574 574
575struct genl_family wimax_gnl_family = { 575static const struct genl_multicast_group wimax_gnl_mcgrps[] = {
576 .id = GENL_ID_GENERATE, 576 { .name = "msg", },
577};
578
579struct genl_family wimax_gnl_family __ro_after_init = {
577 .name = "WiMAX", 580 .name = "WiMAX",
578 .version = WIMAX_GNL_VERSION, 581 .version = WIMAX_GNL_VERSION,
579 .hdrsize = 0, 582 .hdrsize = 0,
580 .maxattr = WIMAX_GNL_ATTR_MAX, 583 .maxattr = WIMAX_GNL_ATTR_MAX,
581}; 584 .module = THIS_MODULE,
582 585 .ops = wimax_gnl_ops,
583static const struct genl_multicast_group wimax_gnl_mcgrps[] = { 586 .n_ops = ARRAY_SIZE(wimax_gnl_ops),
584 { .name = "msg", }, 587 .mcgrps = wimax_gnl_mcgrps,
588 .n_mcgrps = ARRAY_SIZE(wimax_gnl_mcgrps),
585}; 589};
586 590
587 591
@@ -596,11 +600,7 @@ int __init wimax_subsys_init(void)
596 d_parse_params(D_LEVEL, D_LEVEL_SIZE, wimax_debug_params, 600 d_parse_params(D_LEVEL, D_LEVEL_SIZE, wimax_debug_params,
597 "wimax.debug"); 601 "wimax.debug");
598 602
599 snprintf(wimax_gnl_family.name, sizeof(wimax_gnl_family.name), 603 result = genl_register_family(&wimax_gnl_family);
600 "WiMAX");
601 result = genl_register_family_with_ops_groups(&wimax_gnl_family,
602 wimax_gnl_ops,
603 wimax_gnl_mcgrps);
604 if (unlikely(result < 0)) { 604 if (unlikely(result < 0)) {
605 pr_err("cannot register generic netlink family: %d\n", result); 605 pr_err("cannot register generic netlink family: %d\n", result);
606 goto error_register_family; 606 goto error_register_family;
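The WiMAX generic netlink family drops the old GENL_ID_GENERATE-style setup (and the runtime snprintf of its name) and becomes a fully static, __ro_after_init genl_family whose ops, multicast groups and owning module are declared in the structure itself and registered with plain genl_register_family(). A condensed kernel-style sketch of that registration shape; every identifier below is illustrative, not part of the WiMAX code:

#define MY_ATTR_MAX 3

static int my_doit(struct sk_buff *skb, struct genl_info *info)
{
        return 0;
}

static const struct genl_ops my_gnl_ops[] = {
        { .cmd = 1, .doit = my_doit, },
};

static const struct genl_multicast_group my_gnl_mcgrps[] = {
        { .name = "events", },
};

static struct genl_family my_gnl_family __ro_after_init = {
        .name     = "my_family",
        .version  = 1,
        .maxattr  = MY_ATTR_MAX,
        .module   = THIS_MODULE,
        .ops      = my_gnl_ops,
        .n_ops    = ARRAY_SIZE(my_gnl_ops),
        .mcgrps   = my_gnl_mcgrps,
        .n_mcgrps = ARRAY_SIZE(my_gnl_mcgrps),
};

static int __init my_subsys_init(void)
{
        /* The family id is assigned at registration time. */
        return genl_register_family(&my_gnl_family);
}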
diff --git a/net/wireless/Makefile b/net/wireless/Makefile
index 4c9e39f04ef8..d06e5015751a 100644
--- a/net/wireless/Makefile
+++ b/net/wireless/Makefile
@@ -11,14 +11,13 @@ obj-$(CONFIG_WEXT_PRIV) += wext-priv.o
11 11
12cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o 12cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o
13cfg80211-y += mlme.o ibss.o sme.o chan.o ethtool.o mesh.o ap.o trace.o ocb.o 13cfg80211-y += mlme.o ibss.o sme.o chan.o ethtool.o mesh.o ap.o trace.o ocb.o
14cfg80211-$(CONFIG_OF) += of.o
14cfg80211-$(CONFIG_CFG80211_DEBUGFS) += debugfs.o 15cfg80211-$(CONFIG_CFG80211_DEBUGFS) += debugfs.o
15cfg80211-$(CONFIG_CFG80211_WEXT) += wext-compat.o wext-sme.o 16cfg80211-$(CONFIG_CFG80211_WEXT) += wext-compat.o wext-sme.o
16cfg80211-$(CONFIG_CFG80211_INTERNAL_REGDB) += regdb.o 17cfg80211-$(CONFIG_CFG80211_INTERNAL_REGDB) += regdb.o
17 18
18CFLAGS_trace.o := -I$(src) 19CFLAGS_trace.o := -I$(src)
19 20
20ccflags-y += -D__CHECK_ENDIAN__
21
22$(obj)/regdb.c: $(src)/db.txt $(src)/genregdb.awk 21$(obj)/regdb.c: $(src)/db.txt $(src)/genregdb.awk
23 @$(AWK) -f $(srctree)/$(src)/genregdb.awk < $< > $@ 22 @$(AWK) -f $(srctree)/$(src)/genregdb.awk < $< > $@
24 23
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 8201e6d7449e..e55e05bc4805 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -210,11 +210,11 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
210 if (WARN_ON(wdev->iftype != NL80211_IFTYPE_P2P_DEVICE)) 210 if (WARN_ON(wdev->iftype != NL80211_IFTYPE_P2P_DEVICE))
211 return; 211 return;
212 212
213 if (!wdev->p2p_started) 213 if (!wdev_running(wdev))
214 return; 214 return;
215 215
216 rdev_stop_p2p_device(rdev, wdev); 216 rdev_stop_p2p_device(rdev, wdev);
217 wdev->p2p_started = false; 217 wdev->is_running = false;
218 218
219 rdev->opencount--; 219 rdev->opencount--;
220 220
@@ -233,11 +233,11 @@ void cfg80211_stop_nan(struct cfg80211_registered_device *rdev,
233 if (WARN_ON(wdev->iftype != NL80211_IFTYPE_NAN)) 233 if (WARN_ON(wdev->iftype != NL80211_IFTYPE_NAN))
234 return; 234 return;
235 235
236 if (!wdev->nan_started) 236 if (!wdev_running(wdev))
237 return; 237 return;
238 238
239 rdev_stop_nan(rdev, wdev); 239 rdev_stop_nan(rdev, wdev);
240 wdev->nan_started = false; 240 wdev->is_running = false;
241 241
242 rdev->opencount--; 242 rdev->opencount--;
243} 243}
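Both stop paths above now test a single wdev_running() helper instead of per-interface-type booleans: wdev->p2p_started and wdev->nan_started collapse into one wdev->is_running flag, so P2P-device and NAN teardown share the same check and future non-netdev interface types only need to maintain one bit. A small standalone illustration of the same reduction (plain C, illustrative types):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative reduction of per-type "started" flags to one running bit. */
struct wdev {
        int  iftype;            /* e.g. P2P_DEVICE or NAN */
        bool is_running;        /* replaces p2p_started + nan_started */
};

static bool wdev_running(const struct wdev *w)
{
        return w->is_running;
}

static void stop_device(struct wdev *w)
{
        if (!wdev_running(w))   /* same check for every non-netdev iftype */
                return;
        w->is_running = false;
        printf("stopped iftype %d\n", w->iftype);
}

int main(void)
{
        struct wdev w = { .iftype = 1, .is_running = true };

        stop_device(&w);        /* stops the device */
        stop_device(&w);        /* already stopped: no-op */
        return 0;
}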
@@ -562,6 +562,21 @@ static int wiphy_verify_combinations(struct wiphy *wiphy)
562 c->limits[j].max > 1)) 562 c->limits[j].max > 1))
563 return -EINVAL; 563 return -EINVAL;
564 564
565 /*
566 * This isn't well-defined right now. If you have an
567 * IBSS interface, then its beacon interval may change
568 * by joining other networks, and nothing prevents it
569 * from doing that.
570 * So technically we probably shouldn't even allow AP
571 * and IBSS in the same interface, but it seems that
572 * some drivers support that, possibly only with fixed
573 * beacon intervals for IBSS.
574 */
575 if (WARN_ON(types & BIT(NL80211_IFTYPE_ADHOC) &&
576 c->beacon_int_min_gcd)) {
577 return -EINVAL;
578 }
579
565 cnt += c->limits[j].max; 580 cnt += c->limits[j].max;
566 /* 581 /*
567 * Don't advertise an unsupported type 582 * Don't advertise an unsupported type
@@ -571,6 +586,11 @@ static int wiphy_verify_combinations(struct wiphy *wiphy)
571 return -EINVAL; 586 return -EINVAL;
572 } 587 }
573 588
589#ifndef CONFIG_WIRELESS_WDS
590 if (WARN_ON(all_iftypes & BIT(NL80211_IFTYPE_WDS)))
591 return -EINVAL;
592#endif
593
574 /* You can't even choose that many! */ 594 /* You can't even choose that many! */
575 if (WARN_ON(cnt < c->max_interfaces)) 595 if (WARN_ON(cnt < c->max_interfaces))
576 return -EINVAL; 596 return -EINVAL;
@@ -606,8 +626,14 @@ int wiphy_register(struct wiphy *wiphy)
606 626
607 if (WARN_ON((wiphy->interface_modes & BIT(NL80211_IFTYPE_NAN)) && 627 if (WARN_ON((wiphy->interface_modes & BIT(NL80211_IFTYPE_NAN)) &&
608 (!rdev->ops->start_nan || !rdev->ops->stop_nan || 628 (!rdev->ops->start_nan || !rdev->ops->stop_nan ||
609 !rdev->ops->add_nan_func || !rdev->ops->del_nan_func))) 629 !rdev->ops->add_nan_func || !rdev->ops->del_nan_func ||
630 !(wiphy->nan_supported_bands & BIT(NL80211_BAND_2GHZ)))))
631 return -EINVAL;
632
633#ifndef CONFIG_WIRELESS_WDS
634 if (WARN_ON(wiphy->interface_modes & BIT(NL80211_IFTYPE_WDS)))
610 return -EINVAL; 635 return -EINVAL;
636#endif
611 637
612 /* 638 /*
613 * if a wiphy has unsupported modes for regulatory channel enforcement, 639 * if a wiphy has unsupported modes for regulatory channel enforcement,
@@ -1117,6 +1143,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
1117 wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr) 1143 wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr)
1118 dev->priv_flags |= IFF_DONT_BRIDGE; 1144 dev->priv_flags |= IFF_DONT_BRIDGE;
1119 1145
1146 INIT_WORK(&wdev->disconnect_wk, cfg80211_autodisconnect_wk);
1147
1120 nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); 1148 nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE);
1121 break; 1149 break;
1122 case NETDEV_GOING_DOWN: 1150 case NETDEV_GOING_DOWN:
@@ -1205,6 +1233,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
1205#ifdef CONFIG_CFG80211_WEXT 1233#ifdef CONFIG_CFG80211_WEXT
1206 kzfree(wdev->wext.keys); 1234 kzfree(wdev->wext.keys);
1207#endif 1235#endif
1236 flush_work(&wdev->disconnect_wk);
1208 } 1237 }
1209 /* 1238 /*
1210 * synchronise (so that we won't find this netdev 1239 * synchronise (so that we won't find this netdev
diff --git a/net/wireless/core.h b/net/wireless/core.h
index f0c0c8a48c92..58ca206982fe 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -228,6 +228,7 @@ struct cfg80211_event {
228 size_t resp_ie_len; 228 size_t resp_ie_len;
229 struct cfg80211_bss *bss; 229 struct cfg80211_bss *bss;
230 int status; /* -1 = failed; 0..65535 = status code */ 230 int status; /* -1 = failed; 0..65535 = status code */
231 enum nl80211_timeout_reason timeout_reason;
231 } cr; 232 } cr;
232 struct { 233 struct {
233 const u8 *req_ie; 234 const u8 *req_ie;
@@ -346,7 +347,7 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
346 const u8 *ssid, int ssid_len, 347 const u8 *ssid, int ssid_len,
347 const u8 *ie, int ie_len, 348 const u8 *ie, int ie_len,
348 const u8 *key, int key_len, int key_idx, 349 const u8 *key, int key_len, int key_idx,
349 const u8 *sae_data, int sae_data_len); 350 const u8 *auth_data, int auth_data_len);
350int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, 351int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
351 struct net_device *dev, 352 struct net_device *dev,
352 struct ieee80211_channel *chan, 353 struct ieee80211_channel *chan,
@@ -388,7 +389,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
388 const u8 *req_ie, size_t req_ie_len, 389 const u8 *req_ie, size_t req_ie_len,
389 const u8 *resp_ie, size_t resp_ie_len, 390 const u8 *resp_ie, size_t resp_ie_len,
390 int status, bool wextev, 391 int status, bool wextev,
391 struct cfg80211_bss *bss); 392 struct cfg80211_bss *bss,
393 enum nl80211_timeout_reason timeout_reason);
392void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, 394void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
393 size_t ie_len, u16 reason, bool from_ap); 395 size_t ie_len, u16 reason, bool from_ap);
394int cfg80211_disconnect(struct cfg80211_registered_device *rdev, 396int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
@@ -400,6 +402,7 @@ void __cfg80211_roamed(struct wireless_dev *wdev,
400 const u8 *resp_ie, size_t resp_ie_len); 402 const u8 *resp_ie, size_t resp_ie_len);
401int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, 403int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
402 struct wireless_dev *wdev); 404 struct wireless_dev *wdev);
405void cfg80211_autodisconnect_wk(struct work_struct *work);
403 406
404/* SME implementation */ 407/* SME implementation */
405void cfg80211_conn_work(struct work_struct *work); 408void cfg80211_conn_work(struct work_struct *work);
@@ -410,6 +413,7 @@ void cfg80211_sme_disassoc(struct wireless_dev *wdev);
410void cfg80211_sme_deauth(struct wireless_dev *wdev); 413void cfg80211_sme_deauth(struct wireless_dev *wdev);
411void cfg80211_sme_auth_timeout(struct wireless_dev *wdev); 414void cfg80211_sme_auth_timeout(struct wireless_dev *wdev);
412void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev); 415void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev);
416void cfg80211_sme_abandon_assoc(struct wireless_dev *wdev);
413 417
414/* internal helpers */ 418/* internal helpers */
415bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher); 419bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher);
@@ -429,6 +433,9 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
429void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev); 433void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev);
430void cfg80211_process_wdev_events(struct wireless_dev *wdev); 434void cfg80211_process_wdev_events(struct wireless_dev *wdev);
431 435
436bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
437 u32 center_freq_khz, u32 bw_khz);
438
432/** 439/**
433 * cfg80211_chandef_dfs_usable - checks if chandef is DFS usable 440 * cfg80211_chandef_dfs_usable - checks if chandef is DFS usable
434 * @wiphy: the wiphy to validate against 441 * @wiphy: the wiphy to validate against
@@ -476,7 +483,7 @@ int ieee80211_get_ratemask(struct ieee80211_supported_band *sband,
476 u32 *mask); 483 u32 *mask);
477 484
478int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev, 485int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev,
479 u32 beacon_int); 486 enum nl80211_iftype iftype, u32 beacon_int);
480 487
481void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev, 488void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev,
482 enum nl80211_iftype iftype, int num); 489 enum nl80211_iftype iftype, int num);
diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c
index 5d453916a417..30fc6eb352bc 100644
--- a/net/wireless/debugfs.c
+++ b/net/wireless/debugfs.c
@@ -17,7 +17,7 @@
17static ssize_t name## _read(struct file *file, char __user *userbuf, \ 17static ssize_t name## _read(struct file *file, char __user *userbuf, \
18 size_t count, loff_t *ppos) \ 18 size_t count, loff_t *ppos) \
19{ \ 19{ \
20 struct wiphy *wiphy= file->private_data; \ 20 struct wiphy *wiphy = file->private_data; \
21 char buf[buflen]; \ 21 char buf[buflen]; \
22 int res; \ 22 int res; \
23 \ 23 \
@@ -29,14 +29,14 @@ static const struct file_operations name## _ops = { \
29 .read = name## _read, \ 29 .read = name## _read, \
30 .open = simple_open, \ 30 .open = simple_open, \
31 .llseek = generic_file_llseek, \ 31 .llseek = generic_file_llseek, \
32}; 32}
33 33
34DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d", 34DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d",
35 wiphy->rts_threshold) 35 wiphy->rts_threshold);
36DEBUGFS_READONLY_FILE(fragmentation_threshold, 20, "%d", 36DEBUGFS_READONLY_FILE(fragmentation_threshold, 20, "%d",
37 wiphy->frag_threshold); 37 wiphy->frag_threshold);
38DEBUGFS_READONLY_FILE(short_retry_limit, 20, "%d", 38DEBUGFS_READONLY_FILE(short_retry_limit, 20, "%d",
39 wiphy->retry_short) 39 wiphy->retry_short);
40DEBUGFS_READONLY_FILE(long_retry_limit, 20, "%d", 40DEBUGFS_READONLY_FILE(long_retry_limit, 20, "%d",
41 wiphy->retry_long); 41 wiphy->retry_long);
42 42
@@ -103,7 +103,7 @@ static const struct file_operations ht40allow_map_ops = {
103}; 103};
104 104
105#define DEBUGFS_ADD(name) \ 105#define DEBUGFS_ADD(name) \
106 debugfs_create_file(#name, S_IRUGO, phyd, &rdev->wiphy, &name## _ops); 106 debugfs_create_file(#name, 0444, phyd, &rdev->wiphy, &name## _ops)
107 107
108void cfg80211_debugfs_rdev_add(struct cfg80211_registered_device *rdev) 108void cfg80211_debugfs_rdev_add(struct cfg80211_registered_device *rdev)
109{ 109{
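Two small cleanups in the debugfs helpers: the DEBUGFS_READONLY_FILE() and DEBUGFS_ADD() macros now end at the closing brace or call so that each use site supplies its own semicolon (avoiding the stray empty declaration a doubled ';' used to create), and S_IRUGO becomes the literal 0444 that checkpatch prefers. A standalone example of the macro-terminator convention:

#include <stdio.h>

struct reader {
        const char *name;
        int value;
};

/* The macro body ends at the closing brace; each use site supplies the
 * terminating semicolon, so "DEFINE_READER(x, 1);" is exactly one
 * declaration and never an extra empty statement. */
#define DEFINE_READER(_name, _value)                    \
static const struct reader _name##_reader = {           \
        .name  = #_name,                                 \
        .value = (_value),                               \
}

DEFINE_READER(rts_threshold, 2347);
DEFINE_READER(retry_short, 7);

int main(void)
{
        printf("%s = %d\n", rts_threshold_reader.name, rts_threshold_reader.value);
        printf("%s = %d\n", retry_short_reader.name, retry_short_reader.value);
        return 0;
}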
diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c
index 71447cf86306..ba0a1f398ce5 100644
--- a/net/wireless/lib80211_crypt_tkip.c
+++ b/net/wireless/lib80211_crypt_tkip.c
@@ -556,7 +556,7 @@ static void michael_mic_hdr(struct sk_buff *skb, u8 * hdr)
556 memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */ 556 memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */
557 memcpy(hdr + ETH_ALEN, hdr11->addr4, ETH_ALEN); /* SA */ 557 memcpy(hdr + ETH_ALEN, hdr11->addr4, ETH_ALEN); /* SA */
558 break; 558 break;
559 case 0: 559 default:
560 memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */ 560 memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */
561 memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */ 561 memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */
562 break; 562 break;
diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c
index fa2066b56f36..2d8518a37eab 100644
--- a/net/wireless/mesh.c
+++ b/net/wireless/mesh.c
@@ -183,6 +183,7 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
183 memcpy(wdev->ssid, setup->mesh_id, setup->mesh_id_len); 183 memcpy(wdev->ssid, setup->mesh_id, setup->mesh_id_len);
184 wdev->mesh_id_len = setup->mesh_id_len; 184 wdev->mesh_id_len = setup->mesh_id_len;
185 wdev->chandef = setup->chandef; 185 wdev->chandef = setup->chandef;
186 wdev->beacon_interval = setup->beacon_interval;
186 } 187 }
187 188
188 return err; 189 return err;
@@ -258,6 +259,7 @@ int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
258 err = rdev_leave_mesh(rdev, dev); 259 err = rdev_leave_mesh(rdev, dev);
259 if (!err) { 260 if (!err) {
260 wdev->mesh_id_len = 0; 261 wdev->mesh_id_len = 0;
262 wdev->beacon_interval = 0;
261 memset(&wdev->chandef, 0, sizeof(wdev->chandef)); 263 memset(&wdev->chandef, 0, sizeof(wdev->chandef));
262 rdev_set_qos_map(rdev, dev, NULL); 264 rdev_set_qos_map(rdev, dev, NULL);
263 } 265 }
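Joining a mesh now records the configured beacon interval in wdev->beacon_interval and leaving clears it, mirroring what other interface types already track; presumably this feeds the reworked cfg80211_validate_beacon_int() (which now takes the interface type) and the per-combination beacon_int_min_gcd advertisement seen in core.c and nl80211.c. Assuming the minimum-GCD rule means "the GCD of all active beacon intervals must stay at or above the advertised minimum", a standalone sketch of that check:

#include <stdint.h>
#include <stdio.h>

static uint32_t gcd32(uint32_t a, uint32_t b)
{
        while (b) {
                uint32_t t = a % b;

                a = b;
                b = t;
        }
        return a;
}

/* Illustrative check: the GCD of all active beacon intervals must not drop
 * below the minimum the driver advertised for the interface combination. */
static int beacon_int_ok(const uint32_t *intervals, int n, uint32_t min_gcd)
{
        uint32_t g = 0;
        int i;

        for (i = 0; i < n; i++)
                g = gcd32(g, intervals[i]);     /* gcd32(0, x) == x */
        return g >= min_gcd;
}

int main(void)
{
        uint32_t a[] = { 100, 200 };    /* TUs; GCD 100 */
        uint32_t b[] = { 100, 150 };    /* GCD 50 */

        printf("%d %d\n", beacon_int_ok(a, 2, 100), beacon_int_ok(b, 2, 100));
        return 0;
}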
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index cbb48e26a871..22b3d9990065 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -48,7 +48,8 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss,
48 /* update current_bss etc., consumes the bss reference */ 48 /* update current_bss etc., consumes the bss reference */
49 __cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, ie, len - ieoffs, 49 __cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, ie, len - ieoffs,
50 status_code, 50 status_code,
51 status_code == WLAN_STATUS_SUCCESS, bss); 51 status_code == WLAN_STATUS_SUCCESS, bss,
52 NL80211_TIMEOUT_UNSPECIFIED);
52} 53}
53EXPORT_SYMBOL(cfg80211_rx_assoc_resp); 54EXPORT_SYMBOL(cfg80211_rx_assoc_resp);
54 55
@@ -149,6 +150,18 @@ void cfg80211_assoc_timeout(struct net_device *dev, struct cfg80211_bss *bss)
149} 150}
150EXPORT_SYMBOL(cfg80211_assoc_timeout); 151EXPORT_SYMBOL(cfg80211_assoc_timeout);
151 152
153void cfg80211_abandon_assoc(struct net_device *dev, struct cfg80211_bss *bss)
154{
155 struct wireless_dev *wdev = dev->ieee80211_ptr;
156 struct wiphy *wiphy = wdev->wiphy;
157
158 cfg80211_sme_abandon_assoc(wdev);
159
160 cfg80211_unhold_bss(bss_from_pub(bss));
161 cfg80211_put_bss(wiphy, bss);
162}
163EXPORT_SYMBOL(cfg80211_abandon_assoc);
164
152void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len) 165void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len)
153{ 166{
154 struct wireless_dev *wdev = dev->ieee80211_ptr; 167 struct wireless_dev *wdev = dev->ieee80211_ptr;
@@ -204,14 +217,14 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
204 const u8 *ssid, int ssid_len, 217 const u8 *ssid, int ssid_len,
205 const u8 *ie, int ie_len, 218 const u8 *ie, int ie_len,
206 const u8 *key, int key_len, int key_idx, 219 const u8 *key, int key_len, int key_idx,
207 const u8 *sae_data, int sae_data_len) 220 const u8 *auth_data, int auth_data_len)
208{ 221{
209 struct wireless_dev *wdev = dev->ieee80211_ptr; 222 struct wireless_dev *wdev = dev->ieee80211_ptr;
210 struct cfg80211_auth_request req = { 223 struct cfg80211_auth_request req = {
211 .ie = ie, 224 .ie = ie,
212 .ie_len = ie_len, 225 .ie_len = ie_len,
213 .sae_data = sae_data, 226 .auth_data = auth_data,
214 .sae_data_len = sae_data_len, 227 .auth_data_len = auth_data_len,
215 .auth_type = auth_type, 228 .auth_type = auth_type,
216 .key = key, 229 .key = key,
217 .key_len = key_len, 230 .key_len = key_len,
@@ -333,6 +346,11 @@ int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev,
333 !ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) 346 !ether_addr_equal(wdev->current_bss->pub.bssid, bssid)))
334 return 0; 347 return 0;
335 348
349 if (ether_addr_equal(wdev->disconnect_bssid, bssid) ||
350 (wdev->current_bss &&
351 ether_addr_equal(wdev->current_bss->pub.bssid, bssid)))
352 wdev->conn_owner_nlportid = 0;
353
336 return rdev_deauth(rdev, dev, &req); 354 return rdev_deauth(rdev, dev, &req);
337} 355}
338 356
@@ -645,8 +663,25 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
645 return err; 663 return err;
646 } 664 }
647 665
648 if (!ether_addr_equal(mgmt->sa, wdev_address(wdev))) 666 if (!ether_addr_equal(mgmt->sa, wdev_address(wdev))) {
649 return -EINVAL; 667 /* Allow random TA to be used with Public Action frames if the
668 * driver has indicated support for this. Otherwise, only allow
669 * the local address to be used.
670 */
671 if (!ieee80211_is_action(mgmt->frame_control) ||
672 mgmt->u.action.category != WLAN_CATEGORY_PUBLIC)
673 return -EINVAL;
674 if (!wdev->current_bss &&
675 !wiphy_ext_feature_isset(
676 &rdev->wiphy,
677 NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA))
678 return -EINVAL;
679 if (wdev->current_bss &&
680 !wiphy_ext_feature_isset(
681 &rdev->wiphy,
682 NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED))
683 return -EINVAL;
684 }
650 685
651 /* Transmit the Action frame as requested by user space */ 686 /* Transmit the Action frame as requested by user space */
652 return rdev_mgmt_tx(rdev, wdev, params, cookie); 687 return rdev_mgmt_tx(rdev, wdev, params, cookie);
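Besides renaming the sae_data fields to the more general auth_data (so FILS authentication can reuse them), the management-TX path above relaxes the source-address check: a transmitter address that differs from the wdev address (for example a randomized one) is accepted only for Public Action frames and only when the driver advertises the matching capability, NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA, or the _CONNECTED variant while associated. A standalone restatement of that gate as a predicate (plain C, no cfg80211 types):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative re-statement of the gating logic above: a non-matching TA is
 * acceptable only for Public Action frames and only if the matching driver
 * capability bit for the current association state is set. */
static bool random_ta_allowed(bool is_public_action, bool connected,
                              bool has_random_ta, bool has_random_ta_connected)
{
        if (!is_public_action)
                return false;
        return connected ? has_random_ta_connected : has_random_ta;
}

int main(void)
{
        printf("%d\n", random_ta_allowed(true,  false, true,  false)); /* 1 */
        printf("%d\n", random_ta_allowed(true,  true,  true,  false)); /* 0 */
        printf("%d\n", random_ta_allowed(false, false, true,  true));  /* 0 */
        return 0;
}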
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index c510810f0b7c..2312dc2ffdb9 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> 4 * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
5 * Copyright 2013-2014 Intel Mobile Communications GmbH 5 * Copyright 2013-2014 Intel Mobile Communications GmbH
6 * Copyright 2015-2016 Intel Deutschland GmbH 6 * Copyright 2015-2017 Intel Deutschland GmbH
7 */ 7 */
8 8
9#include <linux/if.h> 9#include <linux/if.h>
@@ -32,22 +32,8 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
32 struct cfg80211_crypto_settings *settings, 32 struct cfg80211_crypto_settings *settings,
33 int cipher_limit); 33 int cipher_limit);
34 34
35static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
36 struct genl_info *info);
37static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
38 struct genl_info *info);
39
40/* the netlink family */ 35/* the netlink family */
41static struct genl_family nl80211_fam = { 36static struct genl_family nl80211_fam;
42 .id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */
43 .name = NL80211_GENL_NAME, /* have users key off the name instead */
44 .hdrsize = 0, /* no private header */
45 .version = 1, /* no particular meaning now */
46 .maxattr = NL80211_ATTR_MAX,
47 .netnsok = true,
48 .pre_doit = nl80211_pre_doit,
49 .post_doit = nl80211_post_doit,
50};
51 37
52/* multicast groups */ 38/* multicast groups */
53enum nl80211_multicast_groups { 39enum nl80211_multicast_groups {
@@ -357,7 +343,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
357 [NL80211_ATTR_BG_SCAN_PERIOD] = { .type = NLA_U16 }, 343 [NL80211_ATTR_BG_SCAN_PERIOD] = { .type = NLA_U16 },
358 [NL80211_ATTR_WDEV] = { .type = NLA_U64 }, 344 [NL80211_ATTR_WDEV] = { .type = NLA_U64 },
359 [NL80211_ATTR_USER_REG_HINT_TYPE] = { .type = NLA_U32 }, 345 [NL80211_ATTR_USER_REG_HINT_TYPE] = { .type = NLA_U32 },
360 [NL80211_ATTR_SAE_DATA] = { .type = NLA_BINARY, }, 346 [NL80211_ATTR_AUTH_DATA] = { .type = NLA_BINARY, },
361 [NL80211_ATTR_VHT_CAPABILITY] = { .len = NL80211_VHT_CAPABILITY_LEN }, 347 [NL80211_ATTR_VHT_CAPABILITY] = { .len = NL80211_VHT_CAPABILITY_LEN },
362 [NL80211_ATTR_SCAN_FLAGS] = { .type = NLA_U32 }, 348 [NL80211_ATTR_SCAN_FLAGS] = { .type = NLA_U32 },
363 [NL80211_ATTR_P2P_CTWINDOW] = { .type = NLA_U8 }, 349 [NL80211_ATTR_P2P_CTWINDOW] = { .type = NLA_U8 },
@@ -412,8 +398,18 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
412 }, 398 },
413 [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN }, 399 [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN },
414 [NL80211_ATTR_NAN_MASTER_PREF] = { .type = NLA_U8 }, 400 [NL80211_ATTR_NAN_MASTER_PREF] = { .type = NLA_U8 },
415 [NL80211_ATTR_NAN_DUAL] = { .type = NLA_U8 }, 401 [NL80211_ATTR_BANDS] = { .type = NLA_U32 },
416 [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED }, 402 [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED },
403 [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY,
404 .len = FILS_MAX_KEK_LEN },
405 [NL80211_ATTR_FILS_NONCES] = { .len = 2 * FILS_NONCE_LEN },
406 [NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED] = { .type = NLA_FLAG, },
407 [NL80211_ATTR_BSSID] = { .len = ETH_ALEN },
408 [NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] = { .type = NLA_S8 },
409 [NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST] = {
410 .len = sizeof(struct nl80211_bss_select_rssi_adjust)
411 },
412 [NL80211_ATTR_TIMEOUT_REASON] = { .type = NLA_U32 },
417}; 413};
418 414
419/* policy for the key attributes */ 415/* policy for the key attributes */
@@ -435,6 +431,7 @@ nl80211_key_default_policy[NUM_NL80211_KEY_DEFAULT_TYPES] = {
435 [NL80211_KEY_DEFAULT_TYPE_MULTICAST] = { .type = NLA_FLAG }, 431 [NL80211_KEY_DEFAULT_TYPE_MULTICAST] = { .type = NLA_FLAG },
436}; 432};
437 433
434#ifdef CONFIG_PM
438/* policy for WoWLAN attributes */ 435/* policy for WoWLAN attributes */
439static const struct nla_policy 436static const struct nla_policy
440nl80211_wowlan_policy[NUM_NL80211_WOWLAN_TRIG] = { 437nl80211_wowlan_policy[NUM_NL80211_WOWLAN_TRIG] = {
@@ -468,6 +465,7 @@ nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = {
468 [NL80211_WOWLAN_TCP_WAKE_PAYLOAD] = { .len = 1 }, 465 [NL80211_WOWLAN_TCP_WAKE_PAYLOAD] = { .len = 1 },
469 [NL80211_WOWLAN_TCP_WAKE_MASK] = { .len = 1 }, 466 [NL80211_WOWLAN_TCP_WAKE_MASK] = { .len = 1 },
470}; 467};
468#endif /* CONFIG_PM */
471 469
472/* policy for coalesce rule attributes */ 470/* policy for coalesce rule attributes */
473static const struct nla_policy 471static const struct nla_policy
@@ -547,21 +545,18 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
547{ 545{
548 int err; 546 int err;
549 547
550 rtnl_lock();
551
552 if (!cb->args[0]) { 548 if (!cb->args[0]) {
553 err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, 549 err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
554 nl80211_fam.attrbuf, nl80211_fam.maxattr, 550 genl_family_attrbuf(&nl80211_fam),
555 nl80211_policy); 551 nl80211_fam.maxattr, nl80211_policy);
556 if (err) 552 if (err)
557 goto out_unlock; 553 return err;
558 554
559 *wdev = __cfg80211_wdev_from_attrs(sock_net(skb->sk), 555 *wdev = __cfg80211_wdev_from_attrs(
560 nl80211_fam.attrbuf); 556 sock_net(skb->sk),
561 if (IS_ERR(*wdev)) { 557 genl_family_attrbuf(&nl80211_fam));
562 err = PTR_ERR(*wdev); 558 if (IS_ERR(*wdev))
563 goto out_unlock; 559 return PTR_ERR(*wdev);
564 }
565 *rdev = wiphy_to_rdev((*wdev)->wiphy); 560 *rdev = wiphy_to_rdev((*wdev)->wiphy);
566 /* 0 is the first index - add 1 to parse only once */ 561 /* 0 is the first index - add 1 to parse only once */
567 cb->args[0] = (*rdev)->wiphy_idx + 1; 562 cb->args[0] = (*rdev)->wiphy_idx + 1;
@@ -571,10 +566,8 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
571 struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1); 566 struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1);
572 struct wireless_dev *tmp; 567 struct wireless_dev *tmp;
573 568
574 if (!wiphy) { 569 if (!wiphy)
575 err = -ENODEV; 570 return -ENODEV;
576 goto out_unlock;
577 }
578 *rdev = wiphy_to_rdev(wiphy); 571 *rdev = wiphy_to_rdev(wiphy);
579 *wdev = NULL; 572 *wdev = NULL;
580 573
@@ -585,21 +578,11 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
585 } 578 }
586 } 579 }
587 580
588 if (!*wdev) { 581 if (!*wdev)
589 err = -ENODEV; 582 return -ENODEV;
590 goto out_unlock;
591 }
592 } 583 }
593 584
594 return 0; 585 return 0;
595 out_unlock:
596 rtnl_unlock();
597 return err;
598}
599
600static void nl80211_finish_wdev_dump(struct cfg80211_registered_device *rdev)
601{
602 rtnl_unlock();
603} 586}
604 587
605/* IE validation */ 588/* IE validation */
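nl80211_prepare_wdev_dump() no longer takes the RTNL, and the matching nl80211_finish_wdev_dump() unlock helper is removed; instead each dump callback (see the dump_station/dump_mpath/dump_mpp hunks further down) brackets the whole operation with rtnl_lock()/rtnl_unlock() and funnels every exit, including a failing prepare, through one unlock label. That removes the asymmetric "helper locks, caller unlocks" contract that made early-return paths easy to get wrong. The shared attribute buffer is likewise reached through genl_family_attrbuf() now. A minimal standalone sketch of the lock-at-the-caller shape, with a pthread mutex standing in for the RTNL (build with -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

static int prepare(int arg)             /* stand-in for prepare_wdev_dump() */
{
        return arg < 0 ? -1 : 0;        /* no locking inside the helper */
}

/* The caller owns the lock for the whole dump; every exit path, including a
 * failing prepare(), funnels through the single unlock label. */
static int dump(int arg)
{
        int err;

        pthread_mutex_lock(&big_lock);
        err = prepare(arg);
        if (err)
                goto out_unlock;

        printf("dumping %d\n", arg);
        err = 0;
out_unlock:
        pthread_mutex_unlock(&big_lock);
        return err;
}

int main(void)
{
        printf("%d %d\n", dump(3), dump(-1));
        return 0;
}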
@@ -1075,6 +1058,10 @@ static int nl80211_put_iface_combinations(struct wiphy *wiphy,
1075 nla_put_u32(msg, NL80211_IFACE_COMB_RADAR_DETECT_REGIONS, 1058 nla_put_u32(msg, NL80211_IFACE_COMB_RADAR_DETECT_REGIONS,
1076 c->radar_detect_regions))) 1059 c->radar_detect_regions)))
1077 goto nla_put_failure; 1060 goto nla_put_failure;
1061 if (c->beacon_int_min_gcd &&
1062 nla_put_u32(msg, NL80211_IFACE_COMB_BI_MIN_GCD,
1063 c->beacon_int_min_gcd))
1064 goto nla_put_failure;
1078 1065
1079 nla_nest_end(msg, nl_combi); 1066 nla_nest_end(msg, nl_combi);
1080 } 1067 }
@@ -1322,6 +1309,95 @@ nl80211_send_mgmt_stypes(struct sk_buff *msg,
1322 return 0; 1309 return 0;
1323} 1310}
1324 1311
1312#define CMD(op, n) \
1313 do { \
1314 if (rdev->ops->op) { \
1315 i++; \
1316 if (nla_put_u32(msg, i, NL80211_CMD_ ## n)) \
1317 goto nla_put_failure; \
1318 } \
1319 } while (0)
1320
1321static int nl80211_add_commands_unsplit(struct cfg80211_registered_device *rdev,
1322 struct sk_buff *msg)
1323{
1324 int i = 0;
1325
1326 /*
1327 * do *NOT* add anything into this function, new things need to be
1328 * advertised only to new versions of userspace that can deal with
1329 * the split (and they can't possibly care about new features...
1330 */
1331 CMD(add_virtual_intf, NEW_INTERFACE);
1332 CMD(change_virtual_intf, SET_INTERFACE);
1333 CMD(add_key, NEW_KEY);
1334 CMD(start_ap, START_AP);
1335 CMD(add_station, NEW_STATION);
1336 CMD(add_mpath, NEW_MPATH);
1337 CMD(update_mesh_config, SET_MESH_CONFIG);
1338 CMD(change_bss, SET_BSS);
1339 CMD(auth, AUTHENTICATE);
1340 CMD(assoc, ASSOCIATE);
1341 CMD(deauth, DEAUTHENTICATE);
1342 CMD(disassoc, DISASSOCIATE);
1343 CMD(join_ibss, JOIN_IBSS);
1344 CMD(join_mesh, JOIN_MESH);
1345 CMD(set_pmksa, SET_PMKSA);
1346 CMD(del_pmksa, DEL_PMKSA);
1347 CMD(flush_pmksa, FLUSH_PMKSA);
1348 if (rdev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL)
1349 CMD(remain_on_channel, REMAIN_ON_CHANNEL);
1350 CMD(set_bitrate_mask, SET_TX_BITRATE_MASK);
1351 CMD(mgmt_tx, FRAME);
1352 CMD(mgmt_tx_cancel_wait, FRAME_WAIT_CANCEL);
1353 if (rdev->wiphy.flags & WIPHY_FLAG_NETNS_OK) {
1354 i++;
1355 if (nla_put_u32(msg, i, NL80211_CMD_SET_WIPHY_NETNS))
1356 goto nla_put_failure;
1357 }
1358 if (rdev->ops->set_monitor_channel || rdev->ops->start_ap ||
1359 rdev->ops->join_mesh) {
1360 i++;
1361 if (nla_put_u32(msg, i, NL80211_CMD_SET_CHANNEL))
1362 goto nla_put_failure;
1363 }
1364 CMD(set_wds_peer, SET_WDS_PEER);
1365 if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) {
1366 CMD(tdls_mgmt, TDLS_MGMT);
1367 CMD(tdls_oper, TDLS_OPER);
1368 }
1369 if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN)
1370 CMD(sched_scan_start, START_SCHED_SCAN);
1371 CMD(probe_client, PROBE_CLIENT);
1372 CMD(set_noack_map, SET_NOACK_MAP);
1373 if (rdev->wiphy.flags & WIPHY_FLAG_REPORTS_OBSS) {
1374 i++;
1375 if (nla_put_u32(msg, i, NL80211_CMD_REGISTER_BEACONS))
1376 goto nla_put_failure;
1377 }
1378 CMD(start_p2p_device, START_P2P_DEVICE);
1379 CMD(set_mcast_rate, SET_MCAST_RATE);
1380#ifdef CONFIG_NL80211_TESTMODE
1381 CMD(testmode_cmd, TESTMODE);
1382#endif
1383
1384 if (rdev->ops->connect || rdev->ops->auth) {
1385 i++;
1386 if (nla_put_u32(msg, i, NL80211_CMD_CONNECT))
1387 goto nla_put_failure;
1388 }
1389
1390 if (rdev->ops->disconnect || rdev->ops->deauth) {
1391 i++;
1392 if (nla_put_u32(msg, i, NL80211_CMD_DISCONNECT))
1393 goto nla_put_failure;
1394 }
1395
1396 return i;
1397 nla_put_failure:
1398 return -ENOBUFS;
1399}
1400
1325struct nl80211_dump_wiphy_state { 1401struct nl80211_dump_wiphy_state {
1326 s64 filter_wiphy; 1402 s64 filter_wiphy;
1327 long start; 1403 long start;
@@ -1549,68 +1625,9 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
1549 if (!nl_cmds) 1625 if (!nl_cmds)
1550 goto nla_put_failure; 1626 goto nla_put_failure;
1551 1627
1552 i = 0; 1628 i = nl80211_add_commands_unsplit(rdev, msg);
1553#define CMD(op, n) \ 1629 if (i < 0)
1554 do { \ 1630 goto nla_put_failure;
1555 if (rdev->ops->op) { \
1556 i++; \
1557 if (nla_put_u32(msg, i, NL80211_CMD_ ## n)) \
1558 goto nla_put_failure; \
1559 } \
1560 } while (0)
1561
1562 CMD(add_virtual_intf, NEW_INTERFACE);
1563 CMD(change_virtual_intf, SET_INTERFACE);
1564 CMD(add_key, NEW_KEY);
1565 CMD(start_ap, START_AP);
1566 CMD(add_station, NEW_STATION);
1567 CMD(add_mpath, NEW_MPATH);
1568 CMD(update_mesh_config, SET_MESH_CONFIG);
1569 CMD(change_bss, SET_BSS);
1570 CMD(auth, AUTHENTICATE);
1571 CMD(assoc, ASSOCIATE);
1572 CMD(deauth, DEAUTHENTICATE);
1573 CMD(disassoc, DISASSOCIATE);
1574 CMD(join_ibss, JOIN_IBSS);
1575 CMD(join_mesh, JOIN_MESH);
1576 CMD(set_pmksa, SET_PMKSA);
1577 CMD(del_pmksa, DEL_PMKSA);
1578 CMD(flush_pmksa, FLUSH_PMKSA);
1579 if (rdev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL)
1580 CMD(remain_on_channel, REMAIN_ON_CHANNEL);
1581 CMD(set_bitrate_mask, SET_TX_BITRATE_MASK);
1582 CMD(mgmt_tx, FRAME);
1583 CMD(mgmt_tx_cancel_wait, FRAME_WAIT_CANCEL);
1584 if (rdev->wiphy.flags & WIPHY_FLAG_NETNS_OK) {
1585 i++;
1586 if (nla_put_u32(msg, i, NL80211_CMD_SET_WIPHY_NETNS))
1587 goto nla_put_failure;
1588 }
1589 if (rdev->ops->set_monitor_channel || rdev->ops->start_ap ||
1590 rdev->ops->join_mesh) {
1591 i++;
1592 if (nla_put_u32(msg, i, NL80211_CMD_SET_CHANNEL))
1593 goto nla_put_failure;
1594 }
1595 CMD(set_wds_peer, SET_WDS_PEER);
1596 if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) {
1597 CMD(tdls_mgmt, TDLS_MGMT);
1598 CMD(tdls_oper, TDLS_OPER);
1599 }
1600 if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN)
1601 CMD(sched_scan_start, START_SCHED_SCAN);
1602 CMD(probe_client, PROBE_CLIENT);
1603 CMD(set_noack_map, SET_NOACK_MAP);
1604 if (rdev->wiphy.flags & WIPHY_FLAG_REPORTS_OBSS) {
1605 i++;
1606 if (nla_put_u32(msg, i, NL80211_CMD_REGISTER_BEACONS))
1607 goto nla_put_failure;
1608 }
1609 CMD(start_p2p_device, START_P2P_DEVICE);
1610 CMD(set_mcast_rate, SET_MCAST_RATE);
1611#ifdef CONFIG_NL80211_TESTMODE
1612 CMD(testmode_cmd, TESTMODE);
1613#endif
1614 if (state->split) { 1631 if (state->split) {
1615 CMD(crit_proto_start, CRIT_PROTOCOL_START); 1632 CMD(crit_proto_start, CRIT_PROTOCOL_START);
1616 CMD(crit_proto_stop, CRIT_PROTOCOL_STOP); 1633 CMD(crit_proto_stop, CRIT_PROTOCOL_STOP);
@@ -1620,22 +1637,11 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
1620 if (rdev->wiphy.features & 1637 if (rdev->wiphy.features &
1621 NL80211_FEATURE_SUPPORTS_WMM_ADMISSION) 1638 NL80211_FEATURE_SUPPORTS_WMM_ADMISSION)
1622 CMD(add_tx_ts, ADD_TX_TS); 1639 CMD(add_tx_ts, ADD_TX_TS);
1640 CMD(set_multicast_to_unicast, SET_MULTICAST_TO_UNICAST);
1641 CMD(update_connect_params, UPDATE_CONNECT_PARAMS);
1623 } 1642 }
1624 /* add into the if now */
1625#undef CMD 1643#undef CMD
1626 1644
1627 if (rdev->ops->connect || rdev->ops->auth) {
1628 i++;
1629 if (nla_put_u32(msg, i, NL80211_CMD_CONNECT))
1630 goto nla_put_failure;
1631 }
1632
1633 if (rdev->ops->disconnect || rdev->ops->deauth) {
1634 i++;
1635 if (nla_put_u32(msg, i, NL80211_CMD_DISCONNECT))
1636 goto nla_put_failure;
1637 }
1638
1639 nla_nest_end(msg, nl_cmds); 1645 nla_nest_end(msg, nl_cmds);
1640 state->split_start++; 1646 state->split_start++;
1641 if (state->split) 1647 if (state->split)
@@ -1864,6 +1870,10 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
1864 } 1870 }
1865 } 1871 }
1866 1872
1873 if (nla_put_u32(msg, NL80211_ATTR_BANDS,
1874 rdev->wiphy.nan_supported_bands))
1875 goto nla_put_failure;
1876
1867 /* done */ 1877 /* done */
1868 state->split_start = 0; 1878 state->split_start = 0;
1869 break; 1879 break;
@@ -1881,7 +1891,7 @@ static int nl80211_dump_wiphy_parse(struct sk_buff *skb,
1881 struct netlink_callback *cb, 1891 struct netlink_callback *cb,
1882 struct nl80211_dump_wiphy_state *state) 1892 struct nl80211_dump_wiphy_state *state)
1883{ 1893{
1884 struct nlattr **tb = nl80211_fam.attrbuf; 1894 struct nlattr **tb = genl_family_attrbuf(&nl80211_fam);
1885 int ret = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, 1895 int ret = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
1886 tb, nl80211_fam.maxattr, nl80211_policy); 1896 tb, nl80211_fam.maxattr, nl80211_policy);
1887 /* ignore parse errors for backward compatibility */ 1897 /* ignore parse errors for backward compatibility */
@@ -2296,10 +2306,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
2296 nla_for_each_nested(nl_txq_params, 2306 nla_for_each_nested(nl_txq_params,
2297 info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS], 2307 info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS],
2298 rem_txq_params) { 2308 rem_txq_params) {
2299 result = nla_parse(tb, NL80211_TXQ_ATTR_MAX, 2309 result = nla_parse_nested(tb, NL80211_TXQ_ATTR_MAX,
2300 nla_data(nl_txq_params), 2310 nl_txq_params,
2301 nla_len(nl_txq_params), 2311 txq_params_policy);
2302 txq_params_policy);
2303 if (result) 2312 if (result)
2304 return result; 2313 return result;
2305 result = parse_txq_params(tb, &txq_params); 2314 result = parse_txq_params(tb, &txq_params);
@@ -2583,17 +2592,17 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
2583 int filter_wiphy = -1; 2592 int filter_wiphy = -1;
2584 struct cfg80211_registered_device *rdev; 2593 struct cfg80211_registered_device *rdev;
2585 struct wireless_dev *wdev; 2594 struct wireless_dev *wdev;
2595 int ret;
2586 2596
2587 rtnl_lock(); 2597 rtnl_lock();
2588 if (!cb->args[2]) { 2598 if (!cb->args[2]) {
2589 struct nl80211_dump_wiphy_state state = { 2599 struct nl80211_dump_wiphy_state state = {
2590 .filter_wiphy = -1, 2600 .filter_wiphy = -1,
2591 }; 2601 };
2592 int ret;
2593 2602
2594 ret = nl80211_dump_wiphy_parse(skb, cb, &state); 2603 ret = nl80211_dump_wiphy_parse(skb, cb, &state);
2595 if (ret) 2604 if (ret)
2596 return ret; 2605 goto out_unlock;
2597 2606
2598 filter_wiphy = state.filter_wiphy; 2607 filter_wiphy = state.filter_wiphy;
2599 2608
@@ -2638,12 +2647,14 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
2638 wp_idx++; 2647 wp_idx++;
2639 } 2648 }
2640 out: 2649 out:
2641 rtnl_unlock();
2642
2643 cb->args[0] = wp_idx; 2650 cb->args[0] = wp_idx;
2644 cb->args[1] = if_idx; 2651 cb->args[1] = if_idx;
2645 2652
2646 return skb->len; 2653 ret = skb->len;
2654 out_unlock:
2655 rtnl_unlock();
2656
2657 return ret;
2647} 2658}
2648 2659
2649static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info) 2660static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info)
@@ -3549,8 +3560,8 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
3549 sband = rdev->wiphy.bands[band]; 3560 sband = rdev->wiphy.bands[band];
3550 if (sband == NULL) 3561 if (sband == NULL)
3551 return -EINVAL; 3562 return -EINVAL;
3552 err = nla_parse(tb, NL80211_TXRATE_MAX, nla_data(tx_rates), 3563 err = nla_parse_nested(tb, NL80211_TXRATE_MAX, tx_rates,
3553 nla_len(tx_rates), nl80211_txattr_policy); 3564 nl80211_txattr_policy);
3554 if (err) 3565 if (err)
3555 return err; 3566 return err;
3556 if (tb[NL80211_TXRATE_LEGACY]) { 3567 if (tb[NL80211_TXRATE_LEGACY]) {
@@ -3722,6 +3733,49 @@ static int nl80211_parse_beacon(struct nlattr *attrs[],
3722 return 0; 3733 return 0;
3723} 3734}
3724 3735
3736static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params,
3737 const u8 *rates)
3738{
3739 int i;
3740
3741 if (!rates)
3742 return;
3743
3744 for (i = 0; i < rates[1]; i++) {
3745 if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_HT_PHY)
3746 params->ht_required = true;
3747 if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_VHT_PHY)
3748 params->vht_required = true;
3749 }
3750}
3751
3752/*
3753 * Since the nl80211 API didn't include, from the beginning, attributes about
3754 * HT/VHT requirements/capabilities, we parse them out of the IEs for the
3755 * benefit of drivers that rebuild IEs in the firmware.
3756 */
3757static void nl80211_calculate_ap_params(struct cfg80211_ap_settings *params)
3758{
3759 const struct cfg80211_beacon_data *bcn = &params->beacon;
3760 size_t ies_len = bcn->beacon_ies_len;
3761 const u8 *ies = bcn->beacon_ies;
3762 const u8 *rates;
3763 const u8 *cap;
3764
3765 rates = cfg80211_find_ie(WLAN_EID_SUPP_RATES, ies, ies_len);
3766 nl80211_check_ap_rate_selectors(params, rates);
3767
3768 rates = cfg80211_find_ie(WLAN_EID_EXT_SUPP_RATES, ies, ies_len);
3769 nl80211_check_ap_rate_selectors(params, rates);
3770
3771 cap = cfg80211_find_ie(WLAN_EID_HT_CAPABILITY, ies, ies_len);
3772 if (cap && cap[1] >= sizeof(*params->ht_cap))
3773 params->ht_cap = (void *)(cap + 2);
3774 cap = cfg80211_find_ie(WLAN_EID_VHT_CAPABILITY, ies, ies_len);
3775 if (cap && cap[1] >= sizeof(*params->vht_cap))
3776 params->vht_cap = (void *)(cap + 2);
3777}
3778
3725static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev, 3779static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev,
3726 struct cfg80211_ap_settings *params) 3780 struct cfg80211_ap_settings *params)
3727{ 3781{
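nl80211_calculate_ap_params() recovers information the nl80211 API never carried explicitly: it scans the beacon IEs for HT/VHT BSS-membership selectors in the Supported Rates and Extended Supported Rates elements to set ht_required/vht_required, and records pointers to the HT/VHT capability elements, for drivers that rebuild the IEs in firmware. A standalone sketch of the element walk; an element is laid out as { id, length, payload[length] }, and the selector byte and sample rates below are purely illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Return true if the element's payload contains the given selector octet. */
static bool rates_contain_selector(const uint8_t *ie, uint8_t selector)
{
        uint8_t i;

        if (!ie)
                return false;
        for (i = 0; i < ie[1]; i++)
                if (ie[2 + i] == selector)
                        return true;
        return false;
}

int main(void)
{
        /* Supported Rates element: id=1, len=4, three rates plus one selector */
        const uint8_t rates_ie[] = { 0x01, 0x04, 0x82, 0x84, 0x8b, 0x7f };

        printf("selector present: %d\n", rates_contain_selector(rates_ie, 0x7f));
        return 0;
}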
@@ -3756,12 +3810,23 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev,
3756 if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) && 3810 if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) &&
3757 auth_type == NL80211_AUTHTYPE_SAE) 3811 auth_type == NL80211_AUTHTYPE_SAE)
3758 return false; 3812 return false;
3813 if (!wiphy_ext_feature_isset(&rdev->wiphy,
3814 NL80211_EXT_FEATURE_FILS_STA) &&
3815 (auth_type == NL80211_AUTHTYPE_FILS_SK ||
3816 auth_type == NL80211_AUTHTYPE_FILS_SK_PFS ||
3817 auth_type == NL80211_AUTHTYPE_FILS_PK))
3818 return false;
3759 return true; 3819 return true;
3760 case NL80211_CMD_CONNECT: 3820 case NL80211_CMD_CONNECT:
3761 case NL80211_CMD_START_AP: 3821 case NL80211_CMD_START_AP:
3762 /* SAE not supported yet */ 3822 /* SAE not supported yet */
3763 if (auth_type == NL80211_AUTHTYPE_SAE) 3823 if (auth_type == NL80211_AUTHTYPE_SAE)
3764 return false; 3824 return false;
3825 /* FILS not supported yet */
3826 if (auth_type == NL80211_AUTHTYPE_FILS_SK ||
3827 auth_type == NL80211_AUTHTYPE_FILS_SK_PFS ||
3828 auth_type == NL80211_AUTHTYPE_FILS_PK)
3829 return false;
3765 return true; 3830 return true;
3766 default: 3831 default:
3767 return false; 3832 return false;
@@ -3803,7 +3868,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
3803 params.dtim_period = 3868 params.dtim_period =
3804 nla_get_u32(info->attrs[NL80211_ATTR_DTIM_PERIOD]); 3869 nla_get_u32(info->attrs[NL80211_ATTR_DTIM_PERIOD]);
3805 3870
3806 err = cfg80211_validate_beacon_int(rdev, params.beacon_interval); 3871 err = cfg80211_validate_beacon_int(rdev, dev->ieee80211_ptr->iftype,
3872 params.beacon_interval);
3807 if (err) 3873 if (err)
3808 return err; 3874 return err;
3809 3875
@@ -3938,6 +4004,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
3938 return PTR_ERR(params.acl); 4004 return PTR_ERR(params.acl);
3939 } 4005 }
3940 4006
4007 nl80211_calculate_ap_params(&params);
4008
3941 wdev_lock(wdev); 4009 wdev_lock(wdev);
3942 err = rdev_start_ap(rdev, dev, &params); 4010 err = rdev_start_ap(rdev, dev, &params);
3943 if (!err) { 4011 if (!err) {
@@ -4370,9 +4438,10 @@ static int nl80211_dump_station(struct sk_buff *skb,
4370 int sta_idx = cb->args[2]; 4438 int sta_idx = cb->args[2];
4371 int err; 4439 int err;
4372 4440
4441 rtnl_lock();
4373 err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); 4442 err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
4374 if (err) 4443 if (err)
4375 return err; 4444 goto out_err;
4376 4445
4377 if (!wdev->netdev) { 4446 if (!wdev->netdev) {
4378 err = -EINVAL; 4447 err = -EINVAL;
@@ -4407,7 +4476,7 @@ static int nl80211_dump_station(struct sk_buff *skb,
4407 cb->args[2] = sta_idx; 4476 cb->args[2] = sta_idx;
4408 err = skb->len; 4477 err = skb->len;
4409 out_err: 4478 out_err:
4410 nl80211_finish_wdev_dump(rdev);
4479 rtnl_unlock();
4411 4480
4412 return err; 4481 return err;
4413} 4482}
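The dump handlers in this block now take the RTNL themselves and route every
failure through the single out_err label, so the lock is released on exactly one
path. A rough standalone sketch of that single-exit locking idiom, using a
pthread mutex as a stand-in for the RTNL; the function names and error values
are invented for the example.

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    /* Hypothetical dump step: fails when the index is out of range. */
    static int fetch_entry(int idx)
    {
            return idx < 3 ? 0 : -1;
    }

    static int dump(int idx)
    {
            int err;

            pthread_mutex_lock(&lock);      /* plays the role of rtnl_lock() */

            err = fetch_entry(idx);
            if (err)
                    goto out_err;           /* every failure reaches the unlock */

            printf("entry %d dumped\n", idx);
    out_err:
            pthread_mutex_unlock(&lock);    /* plays the role of rtnl_unlock() */
            return err;
    }

    int main(void)
    {
            printf("dump(1) -> %d\n", dump(1));
            printf("dump(5) -> %d\n", dump(5));
            return 0;
    }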
@@ -4587,6 +4656,15 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
4587 break; 4656 break;
4588 } 4657 }
4589 4658
4659 /*
4660 * Older kernel versions ignored this attribute entirely, so don't
4661 * reject attempts to update it but mark it as unused instead so the
4662 * driver won't look at the data.
4663 */
4664 if (statype != CFG80211_STA_AP_CLIENT_UNASSOC &&
4665 statype != CFG80211_STA_TDLS_PEER_SETUP)
4666 params->opmode_notif_used = false;
4667
4590 return 0; 4668 return 0;
4591} 4669}
4592EXPORT_SYMBOL(cfg80211_check_station_change); 4670EXPORT_SYMBOL(cfg80211_check_station_change);
@@ -4826,6 +4904,12 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
4826 params.local_pm = pm; 4904 params.local_pm = pm;
4827 } 4905 }
4828 4906
4907 if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) {
4908 params.opmode_notif_used = true;
4909 params.opmode_notif =
4910 nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]);
4911 }
4912
4829 /* Include parameters for TDLS peer (will check later) */ 4913 /* Include parameters for TDLS peer (will check later) */
4830 err = nl80211_set_station_tdls(info, &params); 4914 err = nl80211_set_station_tdls(info, &params);
4831 if (err) 4915 if (err)
@@ -5178,9 +5262,10 @@ static int nl80211_dump_mpath(struct sk_buff *skb,
5178 int path_idx = cb->args[2]; 5262 int path_idx = cb->args[2];
5179 int err; 5263 int err;
5180 5264
5265 rtnl_lock();
5181 err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); 5266 err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
5182 if (err) 5267 if (err)
5183 return err;
5268 goto out_err;
5184 5269
5185 if (!rdev->ops->dump_mpath) { 5270 if (!rdev->ops->dump_mpath) {
5186 err = -EOPNOTSUPP; 5271 err = -EOPNOTSUPP;
@@ -5213,7 +5298,7 @@ static int nl80211_dump_mpath(struct sk_buff *skb,
5213 cb->args[2] = path_idx; 5298 cb->args[2] = path_idx;
5214 err = skb->len; 5299 err = skb->len;
5215 out_err: 5300 out_err:
5216 nl80211_finish_wdev_dump(rdev);
5301 rtnl_unlock();
5217 return err; 5302 return err;
5218} 5303}
5219 5304
@@ -5373,9 +5458,10 @@ static int nl80211_dump_mpp(struct sk_buff *skb,
5373 int path_idx = cb->args[2]; 5458 int path_idx = cb->args[2];
5374 int err; 5459 int err;
5375 5460
5461 rtnl_lock();
5376 err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); 5462 err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
5377 if (err) 5463 if (err)
5378 return err;
5464 goto out_err;
5379 5465
5380 if (!rdev->ops->dump_mpp) { 5466 if (!rdev->ops->dump_mpp) {
5381 err = -EOPNOTSUPP; 5467 err = -EOPNOTSUPP;
@@ -5408,7 +5494,7 @@ static int nl80211_dump_mpp(struct sk_buff *skb,
5408 cb->args[2] = path_idx; 5494 cb->args[2] = path_idx;
5409 err = skb->len; 5495 err = skb->len;
5410 out_err: 5496 out_err:
5411 nl80211_finish_wdev_dump(rdev);
5497 rtnl_unlock();
5412 return err; 5498 return err;
5413} 5499}
5414 5500
@@ -5873,6 +5959,7 @@ do { \
5873 break; 5959 break;
5874 } 5960 }
5875 cfg->ht_opmode = ht_opmode; 5961 cfg->ht_opmode = ht_opmode;
5962 mask |= (1 << (NL80211_MESHCONF_HT_OPMODE - 1));
5876 } 5963 }
5877 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathToRootTimeout, 5964 FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathToRootTimeout,
5878 1, 65535, mask, 5965 1, 65535, mask,
@@ -6305,9 +6392,8 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info)
6305 6392
6306 nla_for_each_nested(nl_reg_rule, info->attrs[NL80211_ATTR_REG_RULES], 6393 nla_for_each_nested(nl_reg_rule, info->attrs[NL80211_ATTR_REG_RULES],
6307 rem_reg_rules) { 6394 rem_reg_rules) {
6308 r = nla_parse(tb, NL80211_REG_RULE_ATTR_MAX,
6309 nla_data(nl_reg_rule), nla_len(nl_reg_rule),
6310 reg_rule_policy);
6395 r = nla_parse_nested(tb, NL80211_REG_RULE_ATTR_MAX,
6396 nl_reg_rule, reg_rule_policy);
6311 if (r) 6397 if (r)
6312 goto bad_reg; 6398 goto bad_reg;
6313 r = parse_reg_rule(tb, &rd->reg_rules[rule_idx]); 6399 r = parse_reg_rule(tb, &rd->reg_rules[rule_idx]);
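Many hunks in this patch collapse an open-coded nla_parse() over nla_data() and
nla_len() of a nested attribute into one nla_parse_nested() call. A sketch of
what that shorthand amounts to, written against the in-kernel netlink helpers of
this era; it is kernel-only code and not buildable outside a kernel tree.

    #include <net/netlink.h>

    /* Sketch: parsing the payload of a nested attribute. The tb array, maxtype,
     * nla and policy are whatever the call site already had in hand; this is
     * the pattern nla_parse_nested() wraps. */
    static inline int parse_nested_sketch(struct nlattr **tb, int maxtype,
                                          const struct nlattr *nla,
                                          const struct nla_policy *policy)
    {
            return nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy);
    }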
@@ -6374,8 +6460,8 @@ static int parse_bss_select(struct nlattr *nla, struct wiphy *wiphy,
6374 if (!nla_ok(nest, nla_len(nest))) 6460 if (!nla_ok(nest, nla_len(nest)))
6375 return -EINVAL; 6461 return -EINVAL;
6376 6462
6377 err = nla_parse(attr, NL80211_BSS_SELECT_ATTR_MAX, nla_data(nest),
6378 nla_len(nest), nl80211_bss_select_policy);
6463 err = nla_parse_nested(attr, NL80211_BSS_SELECT_ATTR_MAX, nest,
6464 nl80211_bss_select_policy);
6379 if (err) 6465 if (err)
6380 return err; 6466 return err;
6381 6467
@@ -6677,7 +6763,20 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
6677 request->no_cck = 6763 request->no_cck =
6678 nla_get_flag(info->attrs[NL80211_ATTR_TX_NO_CCK_RATE]); 6764 nla_get_flag(info->attrs[NL80211_ATTR_TX_NO_CCK_RATE]);
6679 6765
6680 if (info->attrs[NL80211_ATTR_MAC])
6766 /* Initial implementation used NL80211_ATTR_MAC to set the specific
6767 * BSSID to scan for. This was problematic because that same attribute
6768 * was already used for another purpose (local random MAC address). The
6769 * NL80211_ATTR_BSSID attribute was added to fix this. For backwards
6770 * compatibility with older userspace components, also use the
6771 * NL80211_ATTR_MAC value here if it can be determined to be used for
6772 * the specific BSSID use case instead of the random MAC address
6773 * (NL80211_ATTR_SCAN_FLAGS is used to enable random MAC address use).
6774 */
6775 if (info->attrs[NL80211_ATTR_BSSID])
6776 memcpy(request->bssid,
6777 nla_data(info->attrs[NL80211_ATTR_BSSID]), ETH_ALEN);
6778 else if (!(request->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) &&
6779 info->attrs[NL80211_ATTR_MAC])
6681 memcpy(request->bssid, nla_data(info->attrs[NL80211_ATTR_MAC]), 6780 memcpy(request->bssid, nla_data(info->attrs[NL80211_ATTR_MAC]),
6682 ETH_ALEN); 6781 ETH_ALEN);
6683 else 6782 else
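The backwards-compatibility comment above reduces to a three-way precedence: an
explicit NL80211_ATTR_BSSID wins, the legacy NL80211_ATTR_MAC is honoured only
when the scan flags do not request a random MAC address, and otherwise the
wildcard BSSID is used. A standalone sketch of that selection as a pure
function; the type and function names are invented, only the flag semantics
follow the hunk.

    #include <stdbool.h>
    #include <stdio.h>

    #define FLAG_RANDOM_ADDR 0x1    /* stand-in for NL80211_SCAN_FLAG_RANDOM_ADDR */

    enum bssid_source { FROM_BSSID_ATTR, FROM_LEGACY_MAC_ATTR, WILDCARD };

    /* Mirrors the precedence described in the comment above. */
    static enum bssid_source pick_bssid_source(bool have_bssid_attr,
                                               bool have_mac_attr,
                                               unsigned int scan_flags)
    {
            if (have_bssid_attr)
                    return FROM_BSSID_ATTR;
            if (!(scan_flags & FLAG_RANDOM_ADDR) && have_mac_attr)
                    return FROM_LEGACY_MAC_ATTR;
            return WILDCARD;
    }

    int main(void)
    {
            printf("%d\n", pick_bssid_source(false, true, 0));                /* legacy MAC */
            printf("%d\n", pick_bssid_source(false, true, FLAG_RANDOM_ADDR)); /* wildcard */
            printf("%d\n", pick_bssid_source(true, true, FLAG_RANDOM_ADDR));  /* BSSID attr */
            return 0;
    }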
@@ -6735,13 +6834,10 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans,
6735 6834
6736 /* 6835 /*
6737 * If scan plans are not specified, 6836 * If scan plans are not specified,
6738 * %NL80211_ATTR_SCHED_SCAN_INTERVAL must be specified. In this
6837 * %NL80211_ATTR_SCHED_SCAN_INTERVAL will be specified. In this
6739 * case one scan plan will be set with the specified scan 6838 * case one scan plan will be set with the specified scan
6740 * interval and infinite number of iterations. 6839 * interval and infinite number of iterations.
6741 */ 6840 */
6742 if (!attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL])
6743 return -EINVAL;
6744
6745 interval = nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]); 6841 interval = nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]);
6746 if (!interval) 6842 if (!interval)
6747 return -EINVAL; 6843 return -EINVAL;
@@ -6765,9 +6861,8 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans,
6765 if (WARN_ON(i >= n_plans)) 6861 if (WARN_ON(i >= n_plans))
6766 return -EINVAL; 6862 return -EINVAL;
6767 6863
6768 err = nla_parse(plan, NL80211_SCHED_SCAN_PLAN_MAX,
6769 nla_data(attr), nla_len(attr),
6770 nl80211_plan_policy);
6864 err = nla_parse_nested(plan, NL80211_SCHED_SCAN_PLAN_MAX,
6865 attr, nl80211_plan_policy);
6771 if (err) 6866 if (err)
6772 return err; 6867 return err;
6773 6868
@@ -6811,7 +6906,7 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans,
6811 6906
6812static struct cfg80211_sched_scan_request * 6907static struct cfg80211_sched_scan_request *
6813nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, 6908nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
6814 struct nlattr **attrs)
6909 struct nlattr **attrs, int max_match_sets)
6815{ 6910{
6816 struct cfg80211_sched_scan_request *request; 6911 struct cfg80211_sched_scan_request *request;
6817 struct nlattr *attr; 6912 struct nlattr *attr;
@@ -6856,9 +6951,9 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
6856 tmp) { 6951 tmp) {
6857 struct nlattr *rssi; 6952 struct nlattr *rssi;
6858 6953
6859 err = nla_parse(tb, NL80211_SCHED_SCAN_MATCH_ATTR_MAX,
6860 nla_data(attr), nla_len(attr),
6861 nl80211_match_policy);
6954 err = nla_parse_nested(tb,
6955 NL80211_SCHED_SCAN_MATCH_ATTR_MAX,
6956 attr, nl80211_match_policy);
6862 if (err) 6957 if (err)
6863 return ERR_PTR(err); 6958 return ERR_PTR(err);
6864 /* add other standalone attributes here */ 6959 /* add other standalone attributes here */
@@ -6876,7 +6971,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
6876 if (!n_match_sets && default_match_rssi != NL80211_SCAN_RSSI_THOLD_OFF) 6971 if (!n_match_sets && default_match_rssi != NL80211_SCAN_RSSI_THOLD_OFF)
6877 n_match_sets = 1; 6972 n_match_sets = 1;
6878 6973
6879 if (n_match_sets > wiphy->max_match_sets)
6974 if (n_match_sets > max_match_sets)
6880 return ERR_PTR(-EINVAL); 6975 return ERR_PTR(-EINVAL);
6881 6976
6882 if (attrs[NL80211_ATTR_IE]) 6977 if (attrs[NL80211_ATTR_IE])
@@ -6914,6 +7009,12 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
6914 if (!n_plans || n_plans > wiphy->max_sched_scan_plans) 7009 if (!n_plans || n_plans > wiphy->max_sched_scan_plans)
6915 return ERR_PTR(-EINVAL); 7010 return ERR_PTR(-EINVAL);
6916 7011
7012 if (!wiphy_ext_feature_isset(
7013 wiphy, NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI) &&
7014 (attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] ||
7015 attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]))
7016 return ERR_PTR(-EINVAL);
7017
6917 request = kzalloc(sizeof(*request) 7018 request = kzalloc(sizeof(*request)
6918 + sizeof(*request->ssids) * n_ssids 7019 + sizeof(*request->ssids) * n_ssids
6919 + sizeof(*request->match_sets) * n_match_sets 7020 + sizeof(*request->match_sets) * n_match_sets
@@ -7029,9 +7130,9 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
7029 tmp) { 7130 tmp) {
7030 struct nlattr *ssid, *rssi; 7131 struct nlattr *ssid, *rssi;
7031 7132
7032 err = nla_parse(tb, NL80211_SCHED_SCAN_MATCH_ATTR_MAX,
7033 nla_data(attr), nla_len(attr),
7034 nl80211_match_policy);
7133 err = nla_parse_nested(tb,
7134 NL80211_SCHED_SCAN_MATCH_ATTR_MAX,
7135 attr, nl80211_match_policy);
7035 if (err) 7136 if (err)
7036 goto out_free; 7137 goto out_free;
7037 ssid = tb[NL80211_SCHED_SCAN_MATCH_ATTR_SSID]; 7138 ssid = tb[NL80211_SCHED_SCAN_MATCH_ATTR_SSID];
@@ -7120,6 +7221,26 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
7120 request->delay = 7221 request->delay =
7121 nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_DELAY]); 7222 nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_DELAY]);
7122 7223
7224 if (attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI]) {
7225 request->relative_rssi = nla_get_s8(
7226 attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI]);
7227 request->relative_rssi_set = true;
7228 }
7229
7230 if (request->relative_rssi_set &&
7231 attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]) {
7232 struct nl80211_bss_select_rssi_adjust *rssi_adjust;
7233
7234 rssi_adjust = nla_data(
7235 attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]);
7236 request->rssi_adjust.band = rssi_adjust->band;
7237 request->rssi_adjust.delta = rssi_adjust->delta;
7238 if (!is_band_valid(wiphy, request->rssi_adjust.band)) {
7239 err = -EINVAL;
7240 goto out_free;
7241 }
7242 }
7243
7123 err = nl80211_parse_sched_scan_plans(wiphy, n_plans, request, attrs); 7244 err = nl80211_parse_sched_scan_plans(wiphy, n_plans, request, attrs);
7124 if (err) 7245 if (err)
7125 goto out_free; 7246 goto out_free;
@@ -7150,7 +7271,8 @@ static int nl80211_start_sched_scan(struct sk_buff *skb,
7150 return -EINPROGRESS; 7271 return -EINPROGRESS;
7151 7272
7152 sched_scan_req = nl80211_parse_sched_scan(&rdev->wiphy, wdev, 7273 sched_scan_req = nl80211_parse_sched_scan(&rdev->wiphy, wdev,
7153 info->attrs);
7274 info->attrs,
7275 rdev->wiphy.max_match_sets);
7154 7276
7155 err = PTR_ERR_OR_ZERO(sched_scan_req); 7277 err = PTR_ERR_OR_ZERO(sched_scan_req);
7156 if (err) 7278 if (err)
@@ -7541,9 +7663,12 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb)
7541 int start = cb->args[2], idx = 0; 7663 int start = cb->args[2], idx = 0;
7542 int err; 7664 int err;
7543 7665
7666 rtnl_lock();
7544 err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); 7667 err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
7545 if (err)
7668 if (err) {
7669 rtnl_unlock();
7546 return err; 7670 return err;
7671 }
7547 7672
7548 wdev_lock(wdev); 7673 wdev_lock(wdev);
7549 spin_lock_bh(&rdev->bss_lock); 7674 spin_lock_bh(&rdev->bss_lock);
@@ -7566,7 +7691,7 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb)
7566 wdev_unlock(wdev); 7691 wdev_unlock(wdev);
7567 7692
7568 cb->args[2] = idx; 7693 cb->args[2] = idx;
7569 nl80211_finish_wdev_dump(rdev);
7694 rtnl_unlock();
7570 7695
7571 return skb->len; 7696 return skb->len;
7572} 7697}
@@ -7643,6 +7768,7 @@ static int nl80211_send_survey(struct sk_buff *msg, u32 portid, u32 seq,
7643 7768
7644static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) 7769static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
7645{ 7770{
7771 struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam);
7646 struct survey_info survey; 7772 struct survey_info survey;
7647 struct cfg80211_registered_device *rdev; 7773 struct cfg80211_registered_device *rdev;
7648 struct wireless_dev *wdev; 7774 struct wireless_dev *wdev;
@@ -7650,12 +7776,13 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
7650 int res; 7776 int res;
7651 bool radio_stats; 7777 bool radio_stats;
7652 7778
7779 rtnl_lock();
7653 res = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); 7780 res = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
7654 if (res) 7781 if (res)
7655 return res;
7782 goto out_err;
7656 7783
7657 /* prepare_wdev_dump parsed the attributes */ 7784 /* prepare_wdev_dump parsed the attributes */
7658 radio_stats = nl80211_fam.attrbuf[NL80211_ATTR_SURVEY_RADIO_STATS];
7785 radio_stats = attrbuf[NL80211_ATTR_SURVEY_RADIO_STATS];
7659 7786
7660 if (!wdev->netdev) { 7787 if (!wdev->netdev) {
7661 res = -EINVAL; 7788 res = -EINVAL;
@@ -7693,7 +7820,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
7693 cb->args[2] = survey_idx; 7820 cb->args[2] = survey_idx;
7694 res = skb->len; 7821 res = skb->len;
7695 out_err: 7822 out_err:
7696 nl80211_finish_wdev_dump(rdev);
7823 rtnl_unlock();
7697 return res; 7824 return res;
7698} 7825}
7699 7826
@@ -7708,8 +7835,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
7708 struct cfg80211_registered_device *rdev = info->user_ptr[0]; 7835 struct cfg80211_registered_device *rdev = info->user_ptr[0];
7709 struct net_device *dev = info->user_ptr[1]; 7836 struct net_device *dev = info->user_ptr[1];
7710 struct ieee80211_channel *chan; 7837 struct ieee80211_channel *chan;
7711 const u8 *bssid, *ssid, *ie = NULL, *sae_data = NULL;
7712 int err, ssid_len, ie_len = 0, sae_data_len = 0;
7838 const u8 *bssid, *ssid, *ie = NULL, *auth_data = NULL;
7839 int err, ssid_len, ie_len = 0, auth_data_len = 0;
7713 enum nl80211_auth_type auth_type; 7840 enum nl80211_auth_type auth_type;
7714 struct key_parse key; 7841 struct key_parse key;
7715 bool local_state_change; 7842 bool local_state_change;
@@ -7789,17 +7916,23 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
7789 if (!nl80211_valid_auth_type(rdev, auth_type, NL80211_CMD_AUTHENTICATE)) 7916 if (!nl80211_valid_auth_type(rdev, auth_type, NL80211_CMD_AUTHENTICATE))
7790 return -EINVAL; 7917 return -EINVAL;
7791 7918
7792 if (auth_type == NL80211_AUTHTYPE_SAE &&
7793 !info->attrs[NL80211_ATTR_SAE_DATA])
7919 if ((auth_type == NL80211_AUTHTYPE_SAE ||
7920 auth_type == NL80211_AUTHTYPE_FILS_SK ||
7921 auth_type == NL80211_AUTHTYPE_FILS_SK_PFS ||
7922 auth_type == NL80211_AUTHTYPE_FILS_PK) &&
7923 !info->attrs[NL80211_ATTR_AUTH_DATA])
7794 return -EINVAL; 7924 return -EINVAL;
7795 7925
7796 if (info->attrs[NL80211_ATTR_SAE_DATA]) {
7797 if (auth_type != NL80211_AUTHTYPE_SAE)
7926 if (info->attrs[NL80211_ATTR_AUTH_DATA]) {
7927 if (auth_type != NL80211_AUTHTYPE_SAE &&
7928 auth_type != NL80211_AUTHTYPE_FILS_SK &&
7929 auth_type != NL80211_AUTHTYPE_FILS_SK_PFS &&
7930 auth_type != NL80211_AUTHTYPE_FILS_PK)
7798 return -EINVAL; 7931 return -EINVAL;
7799 sae_data = nla_data(info->attrs[NL80211_ATTR_SAE_DATA]);
7800 sae_data_len = nla_len(info->attrs[NL80211_ATTR_SAE_DATA]);
7932 auth_data = nla_data(info->attrs[NL80211_ATTR_AUTH_DATA]);
7933 auth_data_len = nla_len(info->attrs[NL80211_ATTR_AUTH_DATA]);
7801 /* need to include at least Auth Transaction and Status Code */ 7934 /* need to include at least Auth Transaction and Status Code */
7802 if (sae_data_len < 4)
7935 if (auth_data_len < 4)
7803 return -EINVAL; 7936 return -EINVAL;
7804 } 7937 }
7805 7938
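The renamed auth_data buffer keeps the minimum-length rule of the old SAE data:
per the comment it must at least carry the Auth Transaction and Status Code
fields, i.e. four bytes. A standalone sketch of that check, decoding the two
little-endian 16-bit fields from a made-up buffer.

    #include <stdint.h>
    #include <stdio.h>
    #include <stddef.h>

    /* Auth Transaction (2 bytes LE) + Status Code (2 bytes LE) must be present. */
    static int check_auth_data(const uint8_t *data, size_t len)
    {
            uint16_t trans, status;

            if (len < 4)
                    return -1;      /* mirrors the -EINVAL in the hunk above */

            trans  = (uint16_t)data[0] | ((uint16_t)data[1] << 8);
            status = (uint16_t)data[2] | ((uint16_t)data[3] << 8);
            printf("transaction %u, status %u, %zu extra byte(s)\n",
                   trans, status, len - 4);
            return 0;
    }

    int main(void)
    {
            const uint8_t sample[] = { 0x01, 0x00, 0x00, 0x00, 0xaa }; /* made up */

            return check_auth_data(sample, sizeof(sample)) ? 1 : 0;
    }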
@@ -7816,7 +7949,7 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
7816 err = cfg80211_mlme_auth(rdev, dev, chan, auth_type, bssid, 7949 err = cfg80211_mlme_auth(rdev, dev, chan, auth_type, bssid,
7817 ssid, ssid_len, ie, ie_len, 7950 ssid, ssid_len, ie, ie_len,
7818 key.p.key, key.p.key_len, key.idx, 7951 key.p.key, key.p.key_len, key.idx,
7819 sae_data, sae_data_len);
7952 auth_data, auth_data_len);
7820 wdev_unlock(dev->ieee80211_ptr); 7953 wdev_unlock(dev->ieee80211_ptr);
7821 return err; 7954 return err;
7822} 7955}
@@ -7995,11 +8128,29 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
7995 req.flags |= ASSOC_REQ_USE_RRM; 8128 req.flags |= ASSOC_REQ_USE_RRM;
7996 } 8129 }
7997 8130
8131 if (info->attrs[NL80211_ATTR_FILS_KEK]) {
8132 req.fils_kek = nla_data(info->attrs[NL80211_ATTR_FILS_KEK]);
8133 req.fils_kek_len = nla_len(info->attrs[NL80211_ATTR_FILS_KEK]);
8134 if (!info->attrs[NL80211_ATTR_FILS_NONCES])
8135 return -EINVAL;
8136 req.fils_nonces =
8137 nla_data(info->attrs[NL80211_ATTR_FILS_NONCES]);
8138 }
8139
7998 err = nl80211_crypto_settings(rdev, info, &req.crypto, 1); 8140 err = nl80211_crypto_settings(rdev, info, &req.crypto, 1);
7999 if (!err) { 8141 if (!err) {
8000 wdev_lock(dev->ieee80211_ptr); 8142 wdev_lock(dev->ieee80211_ptr);
8143
8001 err = cfg80211_mlme_assoc(rdev, dev, chan, bssid, 8144 err = cfg80211_mlme_assoc(rdev, dev, chan, bssid,
8002 ssid, ssid_len, &req); 8145 ssid, ssid_len, &req);
8146
8147 if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
8148 dev->ieee80211_ptr->conn_owner_nlportid =
8149 info->snd_portid;
8150 memcpy(dev->ieee80211_ptr->disconnect_bssid,
8151 bssid, ETH_ALEN);
8152 }
8153
8003 wdev_unlock(dev->ieee80211_ptr); 8154 wdev_unlock(dev->ieee80211_ptr);
8004 } 8155 }
8005 8156
@@ -8152,7 +8303,8 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
8152 ibss.beacon_interval = 8303 ibss.beacon_interval =
8153 nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]); 8304 nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]);
8154 8305
8155 err = cfg80211_validate_beacon_int(rdev, ibss.beacon_interval);
8306 err = cfg80211_validate_beacon_int(rdev, NL80211_IFTYPE_ADHOC,
8307 ibss.beacon_interval);
8156 if (err) 8308 if (err)
8157 return err; 8309 return err;
8158 8310
@@ -8477,25 +8629,29 @@ static int nl80211_testmode_dump(struct sk_buff *skb,
8477 * so we need to offset by 1. 8629 * so we need to offset by 1.
8478 */ 8630 */
8479 phy_idx = cb->args[0] - 1; 8631 phy_idx = cb->args[0] - 1;
8632
8633 rdev = cfg80211_rdev_by_wiphy_idx(phy_idx);
8634 if (!rdev) {
8635 err = -ENOENT;
8636 goto out_err;
8637 }
8480 } else { 8638 } else {
8639 struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam);
8640
8481 err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, 8641 err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
8482 nl80211_fam.attrbuf, nl80211_fam.maxattr,
8483 nl80211_policy);
8642 attrbuf, nl80211_fam.maxattr, nl80211_policy);
8484 if (err) 8643 if (err)
8485 goto out_err; 8644 goto out_err;
8486 8645
8487 rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk),
8488 nl80211_fam.attrbuf);
8646 rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk), attrbuf);
8489 if (IS_ERR(rdev)) { 8647 if (IS_ERR(rdev)) {
8490 err = PTR_ERR(rdev); 8648 err = PTR_ERR(rdev);
8491 goto out_err; 8649 goto out_err;
8492 } 8650 }
8493 phy_idx = rdev->wiphy_idx; 8651 phy_idx = rdev->wiphy_idx;
8494 rdev = NULL;
8495 8652
8496 if (nl80211_fam.attrbuf[NL80211_ATTR_TESTDATA])
8497 cb->args[1] =
8498 (long)nl80211_fam.attrbuf[NL80211_ATTR_TESTDATA];
8653 if (attrbuf[NL80211_ATTR_TESTDATA])
8654 cb->args[1] = (long)attrbuf[NL80211_ATTR_TESTDATA];
8499 } 8655 }
8500 8656
8501 if (cb->args[1]) { 8657 if (cb->args[1]) {
@@ -8503,12 +8659,6 @@ static int nl80211_testmode_dump(struct sk_buff *skb,
8503 data_len = nla_len((void *)cb->args[1]); 8659 data_len = nla_len((void *)cb->args[1]);
8504 } 8660 }
8505 8661
8506 rdev = cfg80211_rdev_by_wiphy_idx(phy_idx);
8507 if (!rdev) {
8508 err = -ENOENT;
8509 goto out_err;
8510 }
8511
8512 if (!rdev->ops->testmode_dump) { 8662 if (!rdev->ops->testmode_dump) {
8513 err = -EOPNOTSUPP; 8663 err = -EOPNOTSUPP;
8514 goto out_err; 8664 goto out_err;
@@ -8718,14 +8868,58 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
8718 } 8868 }
8719 8869
8720 wdev_lock(dev->ieee80211_ptr); 8870 wdev_lock(dev->ieee80211_ptr);
8871
8721 err = cfg80211_connect(rdev, dev, &connect, connkeys, 8872 err = cfg80211_connect(rdev, dev, &connect, connkeys,
8722 connect.prev_bssid); 8873 connect.prev_bssid);
8723 wdev_unlock(dev->ieee80211_ptr);
8724 if (err) 8874 if (err)
8725 kzfree(connkeys); 8875 kzfree(connkeys);
8876
8877 if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
8878 dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid;
8879 if (connect.bssid)
8880 memcpy(dev->ieee80211_ptr->disconnect_bssid,
8881 connect.bssid, ETH_ALEN);
8882 else
8883 memset(dev->ieee80211_ptr->disconnect_bssid,
8884 0, ETH_ALEN);
8885 }
8886
8887 wdev_unlock(dev->ieee80211_ptr);
8888
8726 return err; 8889 return err;
8727} 8890}
8728 8891
8892static int nl80211_update_connect_params(struct sk_buff *skb,
8893 struct genl_info *info)
8894{
8895 struct cfg80211_connect_params connect = {};
8896 struct cfg80211_registered_device *rdev = info->user_ptr[0];
8897 struct net_device *dev = info->user_ptr[1];
8898 struct wireless_dev *wdev = dev->ieee80211_ptr;
8899 u32 changed = 0;
8900 int ret;
8901
8902 if (!rdev->ops->update_connect_params)
8903 return -EOPNOTSUPP;
8904
8905 if (info->attrs[NL80211_ATTR_IE]) {
8906 if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
8907 return -EINVAL;
8908 connect.ie = nla_data(info->attrs[NL80211_ATTR_IE]);
8909 connect.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
8910 changed |= UPDATE_ASSOC_IES;
8911 }
8912
8913 wdev_lock(dev->ieee80211_ptr);
8914 if (!wdev->current_bss)
8915 ret = -ENOLINK;
8916 else
8917 ret = rdev_update_connect_params(rdev, dev, &connect, changed);
8918 wdev_unlock(dev->ieee80211_ptr);
8919
8920 return ret;
8921}
8922
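The new nl80211_update_connect_params() forwards only the fields whose bit is
set in 'changed', so the driver can tell which parameters were actually
resupplied. A standalone sketch of that changed-bitmask convention; apart from
UPDATE_ASSOC_IES, every name below is invented for the example.

    #include <stdio.h>
    #include <stddef.h>

    #define UPDATE_ASSOC_IES (1 << 0)       /* only this name comes from the hunk */

    struct conn_params {                    /* invented for the sketch */
            const unsigned char *ie;
            size_t ie_len;
    };

    /* A consumer looks only at fields whose bit is set in 'changed'. */
    static void apply_update(const struct conn_params *p, unsigned int changed)
    {
            if (changed & UPDATE_ASSOC_IES)
                    printf("using %zu byte(s) of updated association IEs\n", p->ie_len);
            else
                    printf("association IEs left untouched\n");
    }

    int main(void)
    {
            const unsigned char ies[] = { 0xdd, 0x05, 0x00, 0x11, 0x22, 0x33, 0x44 };
            struct conn_params p = { .ie = ies, .ie_len = sizeof(ies) };

            apply_update(&p, UPDATE_ASSOC_IES);
            apply_update(&p, 0);
            return 0;
    }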
8729static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info) 8923static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info)
8730{ 8924{
8731 struct cfg80211_registered_device *rdev = info->user_ptr[0]; 8925 struct cfg80211_registered_device *rdev = info->user_ptr[0];
@@ -9278,6 +9472,7 @@ nl80211_attr_cqm_policy[NL80211_ATTR_CQM_MAX + 1] = {
9278 [NL80211_ATTR_CQM_TXE_RATE] = { .type = NLA_U32 }, 9472 [NL80211_ATTR_CQM_TXE_RATE] = { .type = NLA_U32 },
9279 [NL80211_ATTR_CQM_TXE_PKTS] = { .type = NLA_U32 }, 9473 [NL80211_ATTR_CQM_TXE_PKTS] = { .type = NLA_U32 },
9280 [NL80211_ATTR_CQM_TXE_INTVL] = { .type = NLA_U32 }, 9474 [NL80211_ATTR_CQM_TXE_INTVL] = { .type = NLA_U32 },
9475 [NL80211_ATTR_CQM_RSSI_LEVEL] = { .type = NLA_S32 },
9281}; 9476};
9282 9477
9283static int nl80211_set_cqm_txe(struct genl_info *info, 9478static int nl80211_set_cqm_txe(struct genl_info *info,
@@ -9417,7 +9612,9 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info)
9417 setup.beacon_interval = 9612 setup.beacon_interval =
9418 nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]); 9613 nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]);
9419 9614
9420 err = cfg80211_validate_beacon_int(rdev, setup.beacon_interval);
9615 err = cfg80211_validate_beacon_int(rdev,
9616 NL80211_IFTYPE_MESH_POINT,
9617 setup.beacon_interval);
9421 if (err) 9618 if (err)
9422 return err; 9619 return err;
9423 } 9620 }
@@ -9585,6 +9782,20 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg,
9585 if (nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_DELAY, req->delay)) 9782 if (nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_DELAY, req->delay))
9586 return -ENOBUFS; 9783 return -ENOBUFS;
9587 9784
9785 if (req->relative_rssi_set) {
9786 struct nl80211_bss_select_rssi_adjust rssi_adjust;
9787
9788 if (nla_put_s8(msg, NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI,
9789 req->relative_rssi))
9790 return -ENOBUFS;
9791
9792 rssi_adjust.band = req->rssi_adjust.band;
9793 rssi_adjust.delta = req->rssi_adjust.delta;
9794 if (nla_put(msg, NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST,
9795 sizeof(rssi_adjust), &rssi_adjust))
9796 return -ENOBUFS;
9797 }
9798
9588 freqs = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQUENCIES); 9799 freqs = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQUENCIES);
9589 if (!freqs) 9800 if (!freqs)
9590 return -ENOBUFS; 9801 return -ENOBUFS;
@@ -9728,9 +9939,8 @@ static int nl80211_parse_wowlan_tcp(struct cfg80211_registered_device *rdev,
9728 if (!rdev->wiphy.wowlan->tcp) 9939 if (!rdev->wiphy.wowlan->tcp)
9729 return -EINVAL; 9940 return -EINVAL;
9730 9941
9731 err = nla_parse(tb, MAX_NL80211_WOWLAN_TCP,
9732 nla_data(attr), nla_len(attr),
9733 nl80211_wowlan_tcp_policy);
9942 err = nla_parse_nested(tb, MAX_NL80211_WOWLAN_TCP, attr,
9943 nl80211_wowlan_tcp_policy);
9734 if (err) 9944 if (err)
9735 return err; 9945 return err;
9736 9946
@@ -9875,13 +10085,12 @@ static int nl80211_parse_wowlan_nd(struct cfg80211_registered_device *rdev,
9875 goto out; 10085 goto out;
9876 } 10086 }
9877 10087
9878 err = nla_parse(tb, NL80211_ATTR_MAX,
9879 nla_data(attr), nla_len(attr),
9880 nl80211_policy);
10088 err = nla_parse_nested(tb, NL80211_ATTR_MAX, attr, nl80211_policy);
9881 if (err) 10089 if (err)
9882 goto out; 10090 goto out;
9883 10091
9884 trig->nd_config = nl80211_parse_sched_scan(&rdev->wiphy, NULL, tb);
10092 trig->nd_config = nl80211_parse_sched_scan(&rdev->wiphy, NULL, tb,
10093 wowlan->max_nd_match_sets);
9885 err = PTR_ERR_OR_ZERO(trig->nd_config); 10094 err = PTR_ERR_OR_ZERO(trig->nd_config);
9886 if (err) 10095 if (err)
9887 trig->nd_config = NULL; 10096 trig->nd_config = NULL;
@@ -9911,10 +10120,9 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info)
9911 goto set_wakeup; 10120 goto set_wakeup;
9912 } 10121 }
9913 10122
9914 err = nla_parse(tb, MAX_NL80211_WOWLAN_TRIG,
9915 nla_data(info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS]),
9916 nla_len(info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS]),
9917 nl80211_wowlan_policy);
10123 err = nla_parse_nested(tb, MAX_NL80211_WOWLAN_TRIG,
10124 info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS],
10125 nl80211_wowlan_policy);
9918 if (err) 10126 if (err)
9919 return err; 10127 return err;
9920 10128
@@ -9996,8 +10204,8 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info)
9996 rem) { 10204 rem) {
9997 u8 *mask_pat; 10205 u8 *mask_pat;
9998 10206
9999 nla_parse(pat_tb, MAX_NL80211_PKTPAT, nla_data(pat),
10000 nla_len(pat), NULL);
10207 nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat,
10208 NULL);
10001 err = -EINVAL; 10209 err = -EINVAL;
10002 if (!pat_tb[NL80211_PKTPAT_MASK] || 10210 if (!pat_tb[NL80211_PKTPAT_MASK] ||
10003 !pat_tb[NL80211_PKTPAT_PATTERN]) 10211 !pat_tb[NL80211_PKTPAT_PATTERN])
@@ -10207,8 +10415,8 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev,
10207 int rem, pat_len, mask_len, pkt_offset, n_patterns = 0; 10415 int rem, pat_len, mask_len, pkt_offset, n_patterns = 0;
10208 struct nlattr *pat_tb[NUM_NL80211_PKTPAT]; 10416 struct nlattr *pat_tb[NUM_NL80211_PKTPAT];
10209 10417
10210 err = nla_parse(tb, NL80211_ATTR_COALESCE_RULE_MAX, nla_data(rule),
10211 nla_len(rule), nl80211_coalesce_policy);
10418 err = nla_parse_nested(tb, NL80211_ATTR_COALESCE_RULE_MAX, rule,
10419 nl80211_coalesce_policy);
10212 if (err) 10420 if (err)
10213 return err; 10421 return err;
10214 10422
@@ -10246,8 +10454,7 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev,
10246 rem) { 10454 rem) {
10247 u8 *mask_pat; 10455 u8 *mask_pat;
10248 10456
10249 nla_parse(pat_tb, MAX_NL80211_PKTPAT, nla_data(pat),
10250 nla_len(pat), NULL);
10457 nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, NULL);
10251 if (!pat_tb[NL80211_PKTPAT_MASK] || 10458 if (!pat_tb[NL80211_PKTPAT_MASK] ||
10252 !pat_tb[NL80211_PKTPAT_PATTERN]) 10459 !pat_tb[NL80211_PKTPAT_PATTERN])
10253 return -EINVAL; 10460 return -EINVAL;
@@ -10366,10 +10573,9 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info)
10366 if (!info->attrs[NL80211_ATTR_REKEY_DATA]) 10573 if (!info->attrs[NL80211_ATTR_REKEY_DATA])
10367 return -EINVAL; 10574 return -EINVAL;
10368 10575
10369 err = nla_parse(tb, MAX_NL80211_REKEY_DATA,
10370 nla_data(info->attrs[NL80211_ATTR_REKEY_DATA]),
10371 nla_len(info->attrs[NL80211_ATTR_REKEY_DATA]),
10372 nl80211_rekey_policy);
10576 err = nla_parse_nested(tb, MAX_NL80211_REKEY_DATA,
10577 info->attrs[NL80211_ATTR_REKEY_DATA],
10578 nl80211_rekey_policy);
10373 if (err) 10579 if (err)
10374 return err; 10580 return err;
10375 10581
@@ -10518,7 +10724,7 @@ static int nl80211_start_p2p_device(struct sk_buff *skb, struct genl_info *info)
10518 if (wdev->iftype != NL80211_IFTYPE_P2P_DEVICE) 10724 if (wdev->iftype != NL80211_IFTYPE_P2P_DEVICE)
10519 return -EOPNOTSUPP; 10725 return -EOPNOTSUPP;
10520 10726
10521 if (wdev->p2p_started)
10727 if (wdev_running(wdev))
10522 return 0; 10728 return 0;
10523 10729
10524 if (rfkill_blocked(rdev->rfkill)) 10730 if (rfkill_blocked(rdev->rfkill))
@@ -10528,7 +10734,7 @@ static int nl80211_start_p2p_device(struct sk_buff *skb, struct genl_info *info)
10528 if (err) 10734 if (err)
10529 return err; 10735 return err;
10530 10736
10531 wdev->p2p_started = true;
10737 wdev->is_running = true;
10532 rdev->opencount++; 10738 rdev->opencount++;
10533 10739
10534 return 0; 10740 return 0;
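From here on the patch replaces the per-type p2p_started and nan_started
booleans with a single is_running flag consulted through wdev_running(). A
standalone sketch of that consolidation; the struct and enum names are invented,
only the helper's role mirrors the patch.

    #include <stdbool.h>
    #include <stdio.h>

    enum iftype { IFTYPE_P2P_DEVICE, IFTYPE_NAN };  /* invented for the sketch */

    struct wdev_sketch {
            enum iftype iftype;
            bool is_running;        /* replaces separate p2p_started/nan_started */
    };

    static bool wdev_running(const struct wdev_sketch *wdev)
    {
            return wdev->is_running;
    }

    int main(void)
    {
            struct wdev_sketch p2p = { IFTYPE_P2P_DEVICE, true };
            struct wdev_sketch nan = { IFTYPE_NAN, false };

            /* One check now covers both interface types. */
            printf("p2p running: %d\n", wdev_running(&p2p));
            printf("nan running: %d\n", wdev_running(&nan));
            return 0;
    }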
@@ -10560,7 +10766,7 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info)
10560 if (wdev->iftype != NL80211_IFTYPE_NAN) 10766 if (wdev->iftype != NL80211_IFTYPE_NAN)
10561 return -EOPNOTSUPP; 10767 return -EOPNOTSUPP;
10562 10768
10563 if (wdev->nan_started)
10769 if (wdev_running(wdev))
10564 return -EEXIST; 10770 return -EEXIST;
10565 10771
10566 if (rfkill_blocked(rdev->rfkill)) 10772 if (rfkill_blocked(rdev->rfkill))
@@ -10569,21 +10775,28 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info)
10569 if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) 10775 if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF])
10570 return -EINVAL; 10776 return -EINVAL;
10571 10777
10572 if (!info->attrs[NL80211_ATTR_NAN_DUAL])
10573 return -EINVAL;
10574
10575 conf.master_pref = 10778 conf.master_pref =
10576 nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); 10779 nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]);
10577 if (!conf.master_pref) 10780 if (!conf.master_pref)
10578 return -EINVAL; 10781 return -EINVAL;
10579 10782
10580 conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]);
10783 if (info->attrs[NL80211_ATTR_BANDS]) {
10784 u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]);
10785
10786 if (bands & ~(u32)wdev->wiphy->nan_supported_bands)
10787 return -EOPNOTSUPP;
10788
10789 if (bands && !(bands & BIT(NL80211_BAND_2GHZ)))
10790 return -EINVAL;
10791
10792 conf.bands = bands;
10793 }
10581 10794
10582 err = rdev_start_nan(rdev, wdev, &conf); 10795 err = rdev_start_nan(rdev, wdev, &conf);
10583 if (err) 10796 if (err)
10584 return err; 10797 return err;
10585 10798
10586 wdev->nan_started = true;
10799 wdev->is_running = true;
10587 rdev->opencount++; 10800 rdev->opencount++;
10588 10801
10589 return 0; 10802 return 0;
@@ -10638,8 +10851,7 @@ static int handle_nan_filter(struct nlattr *attr_filter,
10638 10851
10639 i = 0; 10852 i = 0;
10640 nla_for_each_nested(attr, attr_filter, rem) { 10853 nla_for_each_nested(attr, attr_filter, rem) {
10641 filter[i].filter = kmemdup(nla_data(attr), nla_len(attr),
10642 GFP_KERNEL);
10854 filter[i].filter = nla_memdup(attr, GFP_KERNEL);
10643 filter[i].len = nla_len(attr); 10855 filter[i].len = nla_len(attr);
10644 i++; 10856 i++;
10645 } 10857 }
@@ -10668,7 +10880,7 @@ static int nl80211_nan_add_func(struct sk_buff *skb,
10668 if (wdev->iftype != NL80211_IFTYPE_NAN) 10880 if (wdev->iftype != NL80211_IFTYPE_NAN)
10669 return -EOPNOTSUPP; 10881 return -EOPNOTSUPP;
10670 10882
10671 if (!wdev->nan_started)
10883 if (!wdev_running(wdev))
10672 return -ENOTCONN; 10884 return -ENOTCONN;
10673 10885
10674 if (!info->attrs[NL80211_ATTR_NAN_FUNC]) 10886 if (!info->attrs[NL80211_ATTR_NAN_FUNC])
@@ -10678,10 +10890,9 @@ static int nl80211_nan_add_func(struct sk_buff *skb,
10678 wdev->owner_nlportid != info->snd_portid) 10890 wdev->owner_nlportid != info->snd_portid)
10679 return -ENOTCONN; 10891 return -ENOTCONN;
10680 10892
10681 err = nla_parse(tb, NL80211_NAN_FUNC_ATTR_MAX,
10682 nla_data(info->attrs[NL80211_ATTR_NAN_FUNC]),
10683 nla_len(info->attrs[NL80211_ATTR_NAN_FUNC]),
10684 nl80211_nan_func_policy);
10893 err = nla_parse_nested(tb, NL80211_NAN_FUNC_ATTR_MAX,
10894 info->attrs[NL80211_ATTR_NAN_FUNC],
10895 nl80211_nan_func_policy);
10685 if (err) 10896 if (err)
10686 return err; 10897 return err;
10687 10898
@@ -10776,9 +10987,9 @@ static int nl80211_nan_add_func(struct sk_buff *skb,
10776 if (tb[NL80211_NAN_FUNC_SRF]) { 10987 if (tb[NL80211_NAN_FUNC_SRF]) {
10777 struct nlattr *srf_tb[NUM_NL80211_NAN_SRF_ATTR]; 10988 struct nlattr *srf_tb[NUM_NL80211_NAN_SRF_ATTR];
10778 10989
10779 err = nla_parse(srf_tb, NL80211_NAN_SRF_ATTR_MAX,
10780 nla_data(tb[NL80211_NAN_FUNC_SRF]),
10781 nla_len(tb[NL80211_NAN_FUNC_SRF]), NULL);
10990 err = nla_parse_nested(srf_tb, NL80211_NAN_SRF_ATTR_MAX,
10991 tb[NL80211_NAN_FUNC_SRF],
10992 nl80211_nan_srf_policy);
10782 if (err) 10993 if (err)
10783 goto out; 10994 goto out;
10784 10995
@@ -10904,7 +11115,7 @@ static int nl80211_nan_del_func(struct sk_buff *skb,
10904 if (wdev->iftype != NL80211_IFTYPE_NAN) 11115 if (wdev->iftype != NL80211_IFTYPE_NAN)
10905 return -EOPNOTSUPP; 11116 return -EOPNOTSUPP;
10906 11117
10907 if (!wdev->nan_started)
11118 if (!wdev_running(wdev))
10908 return -ENOTCONN; 11119 return -ENOTCONN;
10909 11120
10910 if (!info->attrs[NL80211_ATTR_COOKIE]) 11121 if (!info->attrs[NL80211_ATTR_COOKIE])
@@ -10932,7 +11143,7 @@ static int nl80211_nan_change_config(struct sk_buff *skb,
10932 if (wdev->iftype != NL80211_IFTYPE_NAN) 11143 if (wdev->iftype != NL80211_IFTYPE_NAN)
10933 return -EOPNOTSUPP; 11144 return -EOPNOTSUPP;
10934 11145
10935 if (!wdev->nan_started)
11146 if (!wdev_running(wdev))
10936 return -ENOTCONN; 11147 return -ENOTCONN;
10937 11148
10938 if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) { 11149 if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) {
@@ -10944,9 +11155,17 @@ static int nl80211_nan_change_config(struct sk_buff *skb,
10944 changed |= CFG80211_NAN_CONF_CHANGED_PREF; 11155 changed |= CFG80211_NAN_CONF_CHANGED_PREF;
10945 } 11156 }
10946 11157
10947 if (info->attrs[NL80211_ATTR_NAN_DUAL]) {
10948 conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]);
10949 changed |= CFG80211_NAN_CONF_CHANGED_DUAL;
11158 if (info->attrs[NL80211_ATTR_BANDS]) {
11159 u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]);
11160
11161 if (bands & ~(u32)wdev->wiphy->nan_supported_bands)
11162 return -EOPNOTSUPP;
11163
11164 if (bands && !(bands & BIT(NL80211_BAND_2GHZ)))
11165 return -EINVAL;
11166
11167 conf.bands = bands;
11168 changed |= CFG80211_NAN_CONF_CHANGED_BANDS;
10950 } 11169 }
10951 11170
10952 if (!changed) 11171 if (!changed)
@@ -11244,10 +11463,7 @@ static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info)
11244 return -EINVAL; 11463 return -EINVAL;
11245 11464
11246 if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_RUNNING) { 11465 if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_RUNNING) {
11247 if (wdev->netdev &&
11248 !netif_running(wdev->netdev))
11249 return -ENETDOWN;
11250 if (!wdev->netdev && !wdev->p2p_started)
11251 return -ENETDOWN;
11466 if (!wdev_running(wdev))
11467 return -ENETDOWN;
11252 } 11468 }
11253 11469
@@ -11277,6 +11493,7 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
11277 struct cfg80211_registered_device **rdev, 11493 struct cfg80211_registered_device **rdev,
11278 struct wireless_dev **wdev) 11494 struct wireless_dev **wdev)
11279{ 11495{
11496 struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam);
11280 u32 vid, subcmd; 11497 u32 vid, subcmd;
11281 unsigned int i; 11498 unsigned int i;
11282 int vcmd_idx = -1; 11499 int vcmd_idx = -1;
@@ -11284,17 +11501,13 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
11284 void *data = NULL; 11501 void *data = NULL;
11285 unsigned int data_len = 0; 11502 unsigned int data_len = 0;
11286 11503
11287 rtnl_lock();
11288
11289 if (cb->args[0]) { 11504 if (cb->args[0]) {
11290 /* subtract the 1 again here */ 11505 /* subtract the 1 again here */
11291 struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1); 11506 struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1);
11292 struct wireless_dev *tmp; 11507 struct wireless_dev *tmp;
11293 11508
11294 if (!wiphy) {
11295 err = -ENODEV;
11296 goto out_unlock;
11297 }
11509 if (!wiphy)
11510 return -ENODEV;
11298 *rdev = wiphy_to_rdev(wiphy); 11511 *rdev = wiphy_to_rdev(wiphy);
11299 *wdev = NULL; 11512 *wdev = NULL;
11300 11513
@@ -11312,31 +11525,24 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
11312 } 11525 }
11313 11526
11314 err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, 11527 err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
11315 nl80211_fam.attrbuf, nl80211_fam.maxattr, 11528 attrbuf, nl80211_fam.maxattr, nl80211_policy);
11316 nl80211_policy);
11317 if (err) 11529 if (err)
11318 goto out_unlock;
11530 return err;
11319 11531
11320 if (!nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_ID] ||
11321 !nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_SUBCMD]) {
11322 err = -EINVAL;
11323 goto out_unlock;
11324 }
11532 if (!attrbuf[NL80211_ATTR_VENDOR_ID] ||
11533 !attrbuf[NL80211_ATTR_VENDOR_SUBCMD])
11534 return -EINVAL;
11325 11535
11326 *wdev = __cfg80211_wdev_from_attrs(sock_net(skb->sk),
11327 nl80211_fam.attrbuf);
11536 *wdev = __cfg80211_wdev_from_attrs(sock_net(skb->sk), attrbuf);
11328 if (IS_ERR(*wdev)) 11537 if (IS_ERR(*wdev))
11329 *wdev = NULL; 11538 *wdev = NULL;
11330 11539
11331 *rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk),
11332 nl80211_fam.attrbuf);
11333 if (IS_ERR(*rdev)) {
11334 err = PTR_ERR(*rdev);
11335 goto out_unlock;
11336 }
11540 *rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk), attrbuf);
11541 if (IS_ERR(*rdev))
11542 return PTR_ERR(*rdev);
11337 11543
11338 vid = nla_get_u32(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_ID]);
11339 subcmd = nla_get_u32(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_SUBCMD]);
11544 vid = nla_get_u32(attrbuf[NL80211_ATTR_VENDOR_ID]);
11545 subcmd = nla_get_u32(attrbuf[NL80211_ATTR_VENDOR_SUBCMD]);
11340 11546
11341 for (i = 0; i < (*rdev)->wiphy.n_vendor_commands; i++) { 11547 for (i = 0; i < (*rdev)->wiphy.n_vendor_commands; i++) {
11342 const struct wiphy_vendor_command *vcmd; 11548 const struct wiphy_vendor_command *vcmd;
@@ -11346,23 +11552,19 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
11346 if (vcmd->info.vendor_id != vid || vcmd->info.subcmd != subcmd) 11552 if (vcmd->info.vendor_id != vid || vcmd->info.subcmd != subcmd)
11347 continue; 11553 continue;
11348 11554
11349 if (!vcmd->dumpit) {
11350 err = -EOPNOTSUPP;
11351 goto out_unlock;
11352 }
11555 if (!vcmd->dumpit)
11556 return -EOPNOTSUPP;
11353 11557
11354 vcmd_idx = i; 11558 vcmd_idx = i;
11355 break; 11559 break;
11356 } 11560 }
11357 11561
11358 if (vcmd_idx < 0) {
11359 err = -EOPNOTSUPP;
11360 goto out_unlock;
11361 }
11562 if (vcmd_idx < 0)
11563 return -EOPNOTSUPP;
11362 11564
11363 if (nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_DATA]) {
11364 data = nla_data(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_DATA]);
11365 data_len = nla_len(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_DATA]);
11565 if (attrbuf[NL80211_ATTR_VENDOR_DATA]) {
11566 data = nla_data(attrbuf[NL80211_ATTR_VENDOR_DATA]);
11567 data_len = nla_len(attrbuf[NL80211_ATTR_VENDOR_DATA]);
11366 } 11568 }
11367 11569
11368 /* 0 is the first index - add 1 to parse only once */ 11570 /* 0 is the first index - add 1 to parse only once */
@@ -11375,9 +11577,6 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
11375 11577
11376 /* keep rtnl locked in successful case */ 11578 /* keep rtnl locked in successful case */
11377 return 0; 11579 return 0;
11378 out_unlock:
11379 rtnl_unlock();
11380 return err;
11381} 11580}
11382 11581
11383static int nl80211_vendor_cmd_dump(struct sk_buff *skb, 11582static int nl80211_vendor_cmd_dump(struct sk_buff *skb,
@@ -11392,9 +11591,10 @@ static int nl80211_vendor_cmd_dump(struct sk_buff *skb,
11392 int err; 11591 int err;
11393 struct nlattr *vendor_data; 11592 struct nlattr *vendor_data;
11394 11593
11594 rtnl_lock();
11395 err = nl80211_prepare_vendor_dump(skb, cb, &rdev, &wdev); 11595 err = nl80211_prepare_vendor_dump(skb, cb, &rdev, &wdev);
11396 if (err) 11596 if (err)
11397 return err;
11597 goto out;
11398 11598
11399 vcmd_idx = cb->args[2]; 11599 vcmd_idx = cb->args[2];
11400 data = (void *)cb->args[3]; 11600 data = (void *)cb->args[3];
@@ -11403,18 +11603,21 @@ static int nl80211_vendor_cmd_dump(struct sk_buff *skb,
11403 11603
11404 if (vcmd->flags & (WIPHY_VENDOR_CMD_NEED_WDEV | 11604 if (vcmd->flags & (WIPHY_VENDOR_CMD_NEED_WDEV |
11405 WIPHY_VENDOR_CMD_NEED_NETDEV)) { 11605 WIPHY_VENDOR_CMD_NEED_NETDEV)) {
11406 if (!wdev)
11407 return -EINVAL;
11606 if (!wdev) {
11607 err = -EINVAL;
11608 goto out;
11609 }
11408 if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_NETDEV && 11610 if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_NETDEV &&
11409 !wdev->netdev)
11410 return -EINVAL;
11611 !wdev->netdev) {
11612 err = -EINVAL;
11613 goto out;
11614 }
11411 11615
11412 if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_RUNNING) { 11616 if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_RUNNING) {
11413 if (wdev->netdev &&
11414 !netif_running(wdev->netdev))
11415 return -ENETDOWN;
11416 if (!wdev->netdev && !wdev->p2p_started)
11417 return -ENETDOWN;
11617 if (!wdev_running(wdev)) {
11618 err = -ENETDOWN;
11619 goto out;
11620 }
11418 } 11621 }
11419 } 11622 }
11420 11623
@@ -11726,6 +11929,28 @@ static int nl80211_tdls_cancel_channel_switch(struct sk_buff *skb,
11726 return 0; 11929 return 0;
11727} 11930}
11728 11931
11932static int nl80211_set_multicast_to_unicast(struct sk_buff *skb,
11933 struct genl_info *info)
11934{
11935 struct cfg80211_registered_device *rdev = info->user_ptr[0];
11936 struct net_device *dev = info->user_ptr[1];
11937 struct wireless_dev *wdev = dev->ieee80211_ptr;
11938 const struct nlattr *nla;
11939 bool enabled;
11940
11941 if (!rdev->ops->set_multicast_to_unicast)
11942 return -EOPNOTSUPP;
11943
11944 if (wdev->iftype != NL80211_IFTYPE_AP &&
11945 wdev->iftype != NL80211_IFTYPE_P2P_GO)
11946 return -EOPNOTSUPP;
11947
11948 nla = info->attrs[NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED];
11949 enabled = nla_get_flag(nla);
11950
11951 return rdev_set_multicast_to_unicast(rdev, dev, enabled);
11952}
11953
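NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED is a flag attribute, so nla_get_flag()
above simply reports whether it was present. A hedged userspace sketch of
driving the new command with libnl-3, assuming a uapi header that already
carries the constants added by this series and an interface index of 3; the
command is registered with GENL_UNS_ADMIN_PERM, so it needs CAP_NET_ADMIN.

    /* build: cc m2u.c $(pkg-config --cflags --libs libnl-genl-3.0) */
    #include <netlink/netlink.h>
    #include <netlink/genl/genl.h>
    #include <netlink/genl/ctrl.h>
    #include <linux/nl80211.h>

    int main(void)
    {
            struct nl_sock *sk = nl_socket_alloc();
            struct nl_msg *msg;
            int family, err;

            if (!sk || genl_connect(sk))
                    return 1;

            family = genl_ctrl_resolve(sk, NL80211_GENL_NAME);
            if (family < 0)
                    return 1;

            msg = nlmsg_alloc();
            genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
                        NL80211_CMD_SET_MULTICAST_TO_UNICAST, 0);
            nla_put_u32(msg, NL80211_ATTR_IFINDEX, 3);      /* assumed ifindex */
            /* presence of the flag enables the conversion; omit it to disable */
            nla_put_flag(msg, NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED);

            err = nl_send_auto(sk, msg);
            nlmsg_free(msg);
            nl_socket_free(sk);
            return err < 0;
    }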
11729#define NL80211_FLAG_NEED_WIPHY 0x01 11954#define NL80211_FLAG_NEED_WIPHY 0x01
11730#define NL80211_FLAG_NEED_NETDEV 0x02 11955#define NL80211_FLAG_NEED_NETDEV 0x02
11731#define NL80211_FLAG_NEED_RTNL 0x04 11956#define NL80211_FLAG_NEED_RTNL 0x04
@@ -11784,29 +12009,15 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
11784 info->user_ptr[1] = wdev; 12009 info->user_ptr[1] = wdev;
11785 } 12010 }
11786 12011
11787 if (dev) {
11788 if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP &&
11789 !netif_running(dev)) {
11790 if (rtnl)
11791 rtnl_unlock();
11792 return -ENETDOWN;
11793 }
12012 if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP &&
12013 !wdev_running(wdev)) {
12014 if (rtnl)
12015 rtnl_unlock();
12016 return -ENETDOWN;
12017 }
11794 12018
12019 if (dev)
11795 dev_hold(dev); 12020 dev_hold(dev);
11796 } else if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP) {
11797 if (wdev->iftype == NL80211_IFTYPE_P2P_DEVICE &&
11798 !wdev->p2p_started) {
11799 if (rtnl)
11800 rtnl_unlock();
11801 return -ENETDOWN;
11802 }
11803 if (wdev->iftype == NL80211_IFTYPE_NAN &&
11804 !wdev->nan_started) {
11805 if (rtnl)
11806 rtnl_unlock();
11807 return -ENETDOWN;
11808 }
11809 }
11810 12021
11811 info->user_ptr[0] = rdev; 12022 info->user_ptr[0] = rdev;
11812 } 12023 }
@@ -12179,6 +12390,14 @@ static const struct genl_ops nl80211_ops[] = {
12179 NL80211_FLAG_NEED_RTNL, 12390 NL80211_FLAG_NEED_RTNL,
12180 }, 12391 },
12181 { 12392 {
12393 .cmd = NL80211_CMD_UPDATE_CONNECT_PARAMS,
12394 .doit = nl80211_update_connect_params,
12395 .policy = nl80211_policy,
12396 .flags = GENL_ADMIN_PERM,
12397 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
12398 NL80211_FLAG_NEED_RTNL,
12399 },
12400 {
12182 .cmd = NL80211_CMD_DISCONNECT, 12401 .cmd = NL80211_CMD_DISCONNECT,
12183 .doit = nl80211_disconnect, 12402 .doit = nl80211_disconnect,
12184 .policy = nl80211_policy, 12403 .policy = nl80211_policy,
@@ -12599,6 +12818,29 @@ static const struct genl_ops nl80211_ops[] = {
12599 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | 12818 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
12600 NL80211_FLAG_NEED_RTNL, 12819 NL80211_FLAG_NEED_RTNL,
12601 }, 12820 },
12821 {
12822 .cmd = NL80211_CMD_SET_MULTICAST_TO_UNICAST,
12823 .doit = nl80211_set_multicast_to_unicast,
12824 .policy = nl80211_policy,
12825 .flags = GENL_UNS_ADMIN_PERM,
12826 .internal_flags = NL80211_FLAG_NEED_NETDEV |
12827 NL80211_FLAG_NEED_RTNL,
12828 },
12829};
12830
12831static struct genl_family nl80211_fam __ro_after_init = {
12832 .name = NL80211_GENL_NAME, /* have users key off the name instead */
12833 .hdrsize = 0, /* no private header */
12834 .version = 1, /* no particular meaning now */
12835 .maxattr = NL80211_ATTR_MAX,
12836 .netnsok = true,
12837 .pre_doit = nl80211_pre_doit,
12838 .post_doit = nl80211_post_doit,
12839 .module = THIS_MODULE,
12840 .ops = nl80211_ops,
12841 .n_ops = ARRAY_SIZE(nl80211_ops),
12842 .mcgrps = nl80211_mcgrps,
12843 .n_mcgrps = ARRAY_SIZE(nl80211_mcgrps),
12602}; 12844};
12603 12845
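With the family object now static and marked __ro_after_init, nl80211_init()
below registers it with a single genl_register_family() call; the ops and
multicast groups travel inside the structure. A minimal sketch of the same
pattern for a hypothetical family, in kernel-module style (it only builds inside
a kernel tree; every name apart from the struct genl_family fields is invented).

    #include <linux/module.h>
    #include <linux/kernel.h>
    #include <net/genetlink.h>

    static int demo_doit(struct sk_buff *skb, struct genl_info *info)
    {
            return 0;       /* hypothetical no-op command handler */
    }

    static const struct genl_ops demo_ops[] = {
            { .cmd = 1, .doit = demo_doit },
    };

    /* Ops and groups live in the family itself, as in the hunk above. */
    static struct genl_family demo_fam __ro_after_init = {
            .name    = "demo_fam",
            .version = 1,
            .maxattr = 0,
            .module  = THIS_MODULE,
            .ops     = demo_ops,
            .n_ops   = ARRAY_SIZE(demo_ops),
    };

    static int __init demo_init(void)
    {
            return genl_register_family(&demo_fam); /* one call, as in the patch */
    }

    static void __exit demo_exit(void)
    {
            genl_unregister_family(&demo_fam);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");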
12604/* notification functions */ 12846/* notification functions */
@@ -12696,7 +12938,7 @@ static int nl80211_add_scan_req(struct sk_buff *msg,
12696 return -ENOBUFS; 12938 return -ENOBUFS;
12697} 12939}
12698 12940
12699static int nl80211_send_scan_msg(struct sk_buff *msg,
12941static int nl80211_prep_scan_msg(struct sk_buff *msg,
12700 struct cfg80211_registered_device *rdev, 12942 struct cfg80211_registered_device *rdev,
12701 struct wireless_dev *wdev, 12943 struct wireless_dev *wdev,
12702 u32 portid, u32 seq, int flags, 12944 u32 portid, u32 seq, int flags,
@@ -12727,7 +12969,7 @@ static int nl80211_send_scan_msg(struct sk_buff *msg,
12727} 12969}
12728 12970
12729static int 12971static int
12730nl80211_send_sched_scan_msg(struct sk_buff *msg,
12972nl80211_prep_sched_scan_msg(struct sk_buff *msg,
12731 struct cfg80211_registered_device *rdev, 12973 struct cfg80211_registered_device *rdev,
12732 struct net_device *netdev, 12974 struct net_device *netdev,
12733 u32 portid, u32 seq, int flags, u32 cmd) 12975 u32 portid, u32 seq, int flags, u32 cmd)
@@ -12759,7 +13001,7 @@ void nl80211_send_scan_start(struct cfg80211_registered_device *rdev,
12759 if (!msg) 13001 if (!msg)
12760 return; 13002 return;
12761 13003
12762 if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0,
13004 if (nl80211_prep_scan_msg(msg, rdev, wdev, 0, 0, 0,
12763 NL80211_CMD_TRIGGER_SCAN) < 0) { 13005 NL80211_CMD_TRIGGER_SCAN) < 0) {
12764 nlmsg_free(msg); 13006 nlmsg_free(msg);
12765 return; 13007 return;
@@ -12778,7 +13020,7 @@ struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev,
12778 if (!msg) 13020 if (!msg)
12779 return NULL; 13021 return NULL;
12780 13022
12781 if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0,
13023 if (nl80211_prep_scan_msg(msg, rdev, wdev, 0, 0, 0,
12782 aborted ? NL80211_CMD_SCAN_ABORTED : 13024 aborted ? NL80211_CMD_SCAN_ABORTED :
12783 NL80211_CMD_NEW_SCAN_RESULTS) < 0) { 13025 NL80211_CMD_NEW_SCAN_RESULTS) < 0) {
12784 nlmsg_free(msg); 13026 nlmsg_free(msg);
@@ -12788,31 +13030,13 @@ struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev,
12788 return msg; 13030 return msg;
12789} 13031}
12790 13032
12791void nl80211_send_scan_result(struct cfg80211_registered_device *rdev,
12792 struct sk_buff *msg)
12793{
13033/* send message created by nl80211_build_scan_msg() */
13034void nl80211_send_scan_msg(struct cfg80211_registered_device *rdev,
13035 struct sk_buff *msg)
12794 if (!msg)
12795 return;
12796
12797 genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
12798 NL80211_MCGRP_SCAN, GFP_KERNEL);
12799}
12800
12801void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev,
12802 struct net_device *netdev)
12803{ 13036{
12804 struct sk_buff *msg;
12805
12806 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
12807 if (!msg) 13037 if (!msg)
12808 return; 13038 return;
12809 13039
12810 if (nl80211_send_sched_scan_msg(msg, rdev, netdev, 0, 0, 0,
12811 NL80211_CMD_SCHED_SCAN_RESULTS) < 0) {
12812 nlmsg_free(msg);
12813 return;
12814 }
12815
12816 genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, 13040 genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
12817 NL80211_MCGRP_SCAN, GFP_KERNEL); 13041 NL80211_MCGRP_SCAN, GFP_KERNEL);
12818} 13042}
@@ -12826,7 +13050,7 @@ void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev,
12826 if (!msg) 13050 if (!msg)
12827 return; 13051 return;
12828 13052
12829 if (nl80211_send_sched_scan_msg(msg, rdev, netdev, 0, 0, 0, cmd) < 0) {
13053 if (nl80211_prep_sched_scan_msg(msg, rdev, netdev, 0, 0, 0, cmd) < 0) {
12830 nlmsg_free(msg); 13054 nlmsg_free(msg);
12831 return; 13055 return;
12832 } 13056 }
@@ -12928,7 +13152,7 @@ static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev,
12928 struct sk_buff *msg; 13152 struct sk_buff *msg;
12929 void *hdr; 13153 void *hdr;
12930 13154
12931 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
13155 msg = nlmsg_new(100 + len, gfp);
12932 if (!msg) 13156 if (!msg)
12933 return; 13157 return;
12934 13158
@@ -13075,12 +13299,14 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
13075 struct net_device *netdev, const u8 *bssid, 13299 struct net_device *netdev, const u8 *bssid,
13076 const u8 *req_ie, size_t req_ie_len, 13300 const u8 *req_ie, size_t req_ie_len,
13077 const u8 *resp_ie, size_t resp_ie_len, 13301 const u8 *resp_ie, size_t resp_ie_len,
13078 int status, gfp_t gfp)
13302 int status,
13303 enum nl80211_timeout_reason timeout_reason,
13304 gfp_t gfp)
13079{ 13305{
13080 struct sk_buff *msg; 13306 struct sk_buff *msg;
13081 void *hdr; 13307 void *hdr;
13082 13308
13083 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
13309 msg = nlmsg_new(100 + req_ie_len + resp_ie_len, gfp);
13084 if (!msg) 13310 if (!msg)
13085 return; 13311 return;
13086 13312
@@ -13096,7 +13322,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
13096 nla_put_u16(msg, NL80211_ATTR_STATUS_CODE, 13322 nla_put_u16(msg, NL80211_ATTR_STATUS_CODE,
13097 status < 0 ? WLAN_STATUS_UNSPECIFIED_FAILURE : 13323 status < 0 ? WLAN_STATUS_UNSPECIFIED_FAILURE :
13098 status) || 13324 status) ||
13099 (status < 0 && nla_put_flag(msg, NL80211_ATTR_TIMED_OUT)) ||
13325 (status < 0 &&
13326 (nla_put_flag(msg, NL80211_ATTR_TIMED_OUT) ||
13327 nla_put_u32(msg, NL80211_ATTR_TIMEOUT_REASON, timeout_reason))) ||
13100 (req_ie && 13328 (req_ie &&
13101 nla_put(msg, NL80211_ATTR_REQ_IE, req_ie_len, req_ie)) || 13329 nla_put(msg, NL80211_ATTR_REQ_IE, req_ie_len, req_ie)) ||
13102 (resp_ie && 13330 (resp_ie &&
@@ -13122,7 +13350,7 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
13122 struct sk_buff *msg; 13350 struct sk_buff *msg;
13123 void *hdr; 13351 void *hdr;
13124 13352
13125 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
13353 msg = nlmsg_new(100 + req_ie_len + resp_ie_len, gfp);
13126 if (!msg) 13354 if (!msg)
13127 return; 13355 return;
13128 13356
@@ -13159,7 +13387,7 @@ void nl80211_send_disconnected(struct cfg80211_registered_device *rdev,
13159 struct sk_buff *msg; 13387 struct sk_buff *msg;
13160 void *hdr; 13388 void *hdr;
13161 13389
13162 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
13390 msg = nlmsg_new(100 + ie_len, GFP_KERNEL);
13163 if (!msg) 13391 if (!msg)
13164 return; 13392 return;
13165 13393
@@ -13235,7 +13463,7 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr,
13235 13463
13236 trace_cfg80211_notify_new_peer_candidate(dev, addr); 13464 trace_cfg80211_notify_new_peer_candidate(dev, addr);
13237 13465
13238 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
13466 msg = nlmsg_new(100 + ie_len, gfp);
13239 if (!msg) 13467 if (!msg)
13240 return; 13468 return;
13241 13469
@@ -13606,7 +13834,7 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
13606 struct sk_buff *msg; 13834 struct sk_buff *msg;
13607 void *hdr; 13835 void *hdr;
13608 13836
13609 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
13837 msg = nlmsg_new(100 + len, gfp);
13610 if (!msg) 13838 if (!msg)
13611 return -ENOMEM; 13839 return -ENOMEM;
13612 13840
@@ -13650,7 +13878,7 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
13650 13878
13651 trace_cfg80211_mgmt_tx_status(wdev, cookie, ack); 13879 trace_cfg80211_mgmt_tx_status(wdev, cookie, ack);
13652 13880
13653 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
13881 msg = nlmsg_new(100 + len, gfp);
13654 if (!msg) 13882 if (!msg)
13655 return; 13883 return;
13656 13884
@@ -13737,11 +13965,11 @@ static void cfg80211_send_cqm(struct sk_buff *msg, gfp_t gfp)
13737 13965
13738void cfg80211_cqm_rssi_notify(struct net_device *dev, 13966void cfg80211_cqm_rssi_notify(struct net_device *dev,
13739 enum nl80211_cqm_rssi_threshold_event rssi_event, 13967 enum nl80211_cqm_rssi_threshold_event rssi_event,
13740 gfp_t gfp)
13968 s32 rssi_level, gfp_t gfp)
13741{ 13969{
13742 struct sk_buff *msg; 13970 struct sk_buff *msg;
13743 13971
13744 trace_cfg80211_cqm_rssi_notify(dev, rssi_event);
13972 trace_cfg80211_cqm_rssi_notify(dev, rssi_event, rssi_level);
13745 13973
13746 if (WARN_ON(rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW && 13974 if (WARN_ON(rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW &&
13747 rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH)) 13975 rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH))
@@ -13755,6 +13983,10 @@ void cfg80211_cqm_rssi_notify(struct net_device *dev,
13755 rssi_event)) 13983 rssi_event))
13756 goto nla_put_failure; 13984 goto nla_put_failure;
13757 13985
13986 if (rssi_level && nla_put_s32(msg, NL80211_ATTR_CQM_RSSI_LEVEL,
13987 rssi_level))
13988 goto nla_put_failure;
13989
13758 cfg80211_send_cqm(msg, gfp); 13990 cfg80211_send_cqm(msg, gfp);
13759 13991
13760 return; 13992 return;
@@ -14388,19 +14620,25 @@ static int nl80211_netlink_notify(struct notifier_block * nb,
14388 14620
14389 list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list) { 14621 list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list) {
14390 bool schedule_destroy_work = false; 14622 bool schedule_destroy_work = false;
14391 bool schedule_scan_stop = false;
14392 struct cfg80211_sched_scan_request *sched_scan_req = 14623 struct cfg80211_sched_scan_request *sched_scan_req =
14393 rcu_dereference(rdev->sched_scan_req); 14624 rcu_dereference(rdev->sched_scan_req);
14394 14625
14395 if (sched_scan_req && notify->portid && 14626 if (sched_scan_req && notify->portid &&
14396 sched_scan_req->owner_nlportid == notify->portid)
14397 schedule_scan_stop = true;
14627 sched_scan_req->owner_nlportid == notify->portid) {
14628 sched_scan_req->owner_nlportid = 0;
14629
14630 if (rdev->ops->sched_scan_stop &&
14631 rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN)
14632 schedule_work(&rdev->sched_scan_stop_wk);
14633 }
14398 14634
14399 list_for_each_entry_rcu(wdev, &rdev->wiphy.wdev_list, list) { 14635 list_for_each_entry_rcu(wdev, &rdev->wiphy.wdev_list, list) {
14400 cfg80211_mlme_unregister_socket(wdev, notify->portid); 14636 cfg80211_mlme_unregister_socket(wdev, notify->portid);
14401 14637
14402 if (wdev->owner_nlportid == notify->portid) 14638 if (wdev->owner_nlportid == notify->portid)
14403 schedule_destroy_work = true; 14639 schedule_destroy_work = true;
14640 else if (wdev->conn_owner_nlportid == notify->portid)
14641 schedule_work(&wdev->disconnect_wk);
14404 } 14642 }
14405 14643
14406 spin_lock_bh(&rdev->beacon_registrations_lock); 14644 spin_lock_bh(&rdev->beacon_registrations_lock);
@@ -14425,12 +14663,6 @@ static int nl80211_netlink_notify(struct notifier_block * nb,
14425 spin_unlock(&rdev->destroy_list_lock); 14663 spin_unlock(&rdev->destroy_list_lock);
14426 schedule_work(&rdev->destroy_work); 14664 schedule_work(&rdev->destroy_work);
14427 } 14665 }
14428 } else if (schedule_scan_stop) {
14429 sched_scan_req->owner_nlportid = 0;
14430
14431 if (rdev->ops->sched_scan_stop &&
14432 rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN)
14433 schedule_work(&rdev->sched_scan_stop_wk);
14434 } 14666 }
14435 } 14667 }
14436 14668
@@ -14461,7 +14693,7 @@ void cfg80211_ft_event(struct net_device *netdev,
14461 if (!ft_event->target_ap) 14693 if (!ft_event->target_ap)
14462 return; 14694 return;
14463 14695
14464 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 14696 msg = nlmsg_new(100 + ft_event->ric_ies_len, GFP_KERNEL);
14465 if (!msg) 14697 if (!msg)
14466 return; 14698 return;
14467 14699
@@ -14563,12 +14795,11 @@ void nl80211_send_ap_stopped(struct wireless_dev *wdev)
14563 14795
14564/* initialisation/exit functions */ 14796/* initialisation/exit functions */
14565 14797
14566int nl80211_init(void) 14798int __init nl80211_init(void)
14567{ 14799{
14568 int err; 14800 int err;
14569 14801
14570 err = genl_register_family_with_ops_groups(&nl80211_fam, nl80211_ops, 14802 err = genl_register_family(&nl80211_fam);
14571 nl80211_mcgrps);
14572 if (err) 14803 if (err)
14573 return err; 14804 return err;
14574 14805
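The event allocations above switch from NLMSG_DEFAULT_SIZE to sizes derived from the payload (100 + ie_len, 100 + len, 100 + ric_ies_len): roughly 100 bytes for the fixed attributes plus the variable IE or frame data, so small events no longer grab a full default-sized buffer while large IE blobs still fit. The init path likewise moves to genl_register_family(), which picks the ops and multicast groups up from the statically initialized nl80211_fam rather than taking them as arguments. A minimal sketch of the sizing idea (helper name is illustrative only):

	/* illustrative: ~100 bytes of fixed attributes + variable payload */
	static struct sk_buff *example_event_alloc(size_t ie_len, gfp_t gfp)
	{
		return nlmsg_new(100 + ie_len, gfp);
	}
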
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index 7e3821d7fcc5..e488dca87423 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -14,12 +14,10 @@ void nl80211_send_scan_start(struct cfg80211_registered_device *rdev,
14 struct wireless_dev *wdev); 14 struct wireless_dev *wdev);
15struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev, 15struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev,
16 struct wireless_dev *wdev, bool aborted); 16 struct wireless_dev *wdev, bool aborted);
17void nl80211_send_scan_result(struct cfg80211_registered_device *rdev, 17void nl80211_send_scan_msg(struct cfg80211_registered_device *rdev,
18 struct sk_buff *msg); 18 struct sk_buff *msg);
19void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev, 19void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev,
20 struct net_device *netdev, u32 cmd); 20 struct net_device *netdev, u32 cmd);
21void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev,
22 struct net_device *netdev);
23void nl80211_common_reg_change_event(enum nl80211_commands cmd_id, 21void nl80211_common_reg_change_event(enum nl80211_commands cmd_id,
24 struct regulatory_request *request); 22 struct regulatory_request *request);
25 23
@@ -58,7 +56,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
58 struct net_device *netdev, const u8 *bssid, 56 struct net_device *netdev, const u8 *bssid,
59 const u8 *req_ie, size_t req_ie_len, 57 const u8 *req_ie, size_t req_ie_len,
60 const u8 *resp_ie, size_t resp_ie_len, 58 const u8 *resp_ie, size_t resp_ie_len,
61 int status, gfp_t gfp); 59 int status,
60 enum nl80211_timeout_reason timeout_reason,
61 gfp_t gfp);
62void nl80211_send_roamed(struct cfg80211_registered_device *rdev, 62void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
63 struct net_device *netdev, const u8 *bssid, 63 struct net_device *netdev, const u8 *bssid,
64 const u8 *req_ie, size_t req_ie_len, 64 const u8 *req_ie, size_t req_ie_len,
diff --git a/net/wireless/of.c b/net/wireless/of.c
new file mode 100644
index 000000000000..de221f0edca5
--- /dev/null
+++ b/net/wireless/of.c
@@ -0,0 +1,138 @@
1/*
2 * Copyright (C) 2017 Rafał Miłecki <rafal@milecki.pl>
3 *
4 * Permission to use, copy, modify, and/or distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16
17#include <linux/of.h>
18#include <net/cfg80211.h>
19#include "core.h"
20
21static bool wiphy_freq_limits_valid_chan(struct wiphy *wiphy,
22 struct ieee80211_freq_range *freq_limits,
23 unsigned int n_freq_limits,
24 struct ieee80211_channel *chan)
25{
26 u32 bw = MHZ_TO_KHZ(20);
27 int i;
28
29 for (i = 0; i < n_freq_limits; i++) {
30 struct ieee80211_freq_range *limit = &freq_limits[i];
31
32 if (cfg80211_does_bw_fit_range(limit,
33 MHZ_TO_KHZ(chan->center_freq),
34 bw))
35 return true;
36 }
37
38 return false;
39}
40
41static void wiphy_freq_limits_apply(struct wiphy *wiphy,
42 struct ieee80211_freq_range *freq_limits,
43 unsigned int n_freq_limits)
44{
45 enum nl80211_band band;
46 int i;
47
48 if (WARN_ON(!n_freq_limits))
49 return;
50
51 for (band = 0; band < NUM_NL80211_BANDS; band++) {
52 struct ieee80211_supported_band *sband = wiphy->bands[band];
53
54 if (!sband)
55 continue;
56
57 for (i = 0; i < sband->n_channels; i++) {
58 struct ieee80211_channel *chan = &sband->channels[i];
59
60 if (chan->flags & IEEE80211_CHAN_DISABLED)
61 continue;
62
63 if (!wiphy_freq_limits_valid_chan(wiphy, freq_limits,
64 n_freq_limits,
65 chan)) {
66 pr_debug("Disabling freq %d MHz as it's out of OF limits\n",
67 chan->center_freq);
68 chan->flags |= IEEE80211_CHAN_DISABLED;
69 }
70 }
71 }
72}
73
74void wiphy_read_of_freq_limits(struct wiphy *wiphy)
75{
76 struct device *dev = wiphy_dev(wiphy);
77 struct device_node *np;
78 struct property *prop;
79 struct ieee80211_freq_range *freq_limits;
80 unsigned int n_freq_limits;
81 const __be32 *p;
82 int len, i;
83 int err = 0;
84
85 if (!dev)
86 return;
87 np = dev_of_node(dev);
88 if (!np)
89 return;
90
91 prop = of_find_property(np, "ieee80211-freq-limit", &len);
92 if (!prop)
93 return;
94
95 if (!len || len % sizeof(u32) || len / sizeof(u32) % 2) {
96 dev_err(dev, "ieee80211-freq-limit wrong format");
97 return;
98 }
99 n_freq_limits = len / sizeof(u32) / 2;
100
101 freq_limits = kcalloc(n_freq_limits, sizeof(*freq_limits), GFP_KERNEL);
102 if (!freq_limits) {
103 err = -ENOMEM;
104 goto out_kfree;
105 }
106
107 p = NULL;
108 for (i = 0; i < n_freq_limits; i++) {
109 struct ieee80211_freq_range *limit = &freq_limits[i];
110
111 p = of_prop_next_u32(prop, p, &limit->start_freq_khz);
112 if (!p) {
113 err = -EINVAL;
114 goto out_kfree;
115 }
116
117 p = of_prop_next_u32(prop, p, &limit->end_freq_khz);
118 if (!p) {
119 err = -EINVAL;
120 goto out_kfree;
121 }
122
123 if (!limit->start_freq_khz ||
124 !limit->end_freq_khz ||
125 limit->start_freq_khz >= limit->end_freq_khz) {
126 err = -EINVAL;
127 goto out_kfree;
128 }
129 }
130
131 wiphy_freq_limits_apply(wiphy, freq_limits, n_freq_limits);
132
133out_kfree:
134 kfree(freq_limits);
135 if (err)
136 dev_err(dev, "Failed to get limits: %d\n", err);
137}
138EXPORT_SYMBOL(wiphy_read_of_freq_limits);
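The new of.c parses the optional "ieee80211-freq-limit" device-tree property, a list of (start, end) frequency pairs in kHz, and disables any enabled channel whose 20 MHz bandwidth does not fit one of the ranges. A hedged usage sketch (the driver function and band table are hypothetical); the call is intended to happen after the bands are set up and before wiphy_register():

	static int example_driver_setup_wiphy(struct wiphy *wiphy)
	{
		/* hypothetical band table supplied by the driver */
		wiphy->bands[NL80211_BAND_2GHZ] = &example_band_2ghz;

		/* apply "ieee80211-freq-limit" from the device node, if any */
		wiphy_read_of_freq_limits(wiphy);

		return wiphy_register(wiphy);
	}
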
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 11cf83c8ad4f..2f425075ada8 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -490,6 +490,18 @@ static inline int rdev_connect(struct cfg80211_registered_device *rdev,
490 return ret; 490 return ret;
491} 491}
492 492
493static inline int
494rdev_update_connect_params(struct cfg80211_registered_device *rdev,
495 struct net_device *dev,
496 struct cfg80211_connect_params *sme, u32 changed)
497{
498 int ret;
499 trace_rdev_update_connect_params(&rdev->wiphy, dev, sme, changed);
500 ret = rdev->ops->update_connect_params(&rdev->wiphy, dev, sme, changed);
501 trace_rdev_return_int(&rdev->wiphy, ret);
502 return ret;
503}
504
493static inline int rdev_disconnect(struct cfg80211_registered_device *rdev, 505static inline int rdev_disconnect(struct cfg80211_registered_device *rdev,
494 struct net_device *dev, u16 reason_code) 506 struct net_device *dev, u16 reason_code)
495{ 507{
@@ -562,6 +574,18 @@ static inline int rdev_set_wds_peer(struct cfg80211_registered_device *rdev,
562 return ret; 574 return ret;
563} 575}
564 576
577static inline int
578rdev_set_multicast_to_unicast(struct cfg80211_registered_device *rdev,
579 struct net_device *dev,
580 const bool enabled)
581{
582 int ret;
583 trace_rdev_set_multicast_to_unicast(&rdev->wiphy, dev, enabled);
584 ret = rdev->ops->set_multicast_to_unicast(&rdev->wiphy, dev, enabled);
585 trace_rdev_return_int(&rdev->wiphy, ret);
586 return ret;
587}
588
565static inline void rdev_rfkill_poll(struct cfg80211_registered_device *rdev) 589static inline void rdev_rfkill_poll(struct cfg80211_registered_device *rdev)
566{ 590{
567 trace_rdev_rfkill_poll(&rdev->wiphy); 591 trace_rdev_rfkill_poll(&rdev->wiphy);
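Both additions follow the rdev-ops wrapper pattern used throughout this header: trace the call, invoke the driver op, trace the integer return. The wrappers assume the op pointer is set, so callers are expected to check for it first; a hedged sketch of such a caller (function name hypothetical):

	static int example_set_mcast_to_ucast(struct cfg80211_registered_device *rdev,
					      struct net_device *dev, bool enabled)
	{
		/* only invoke the wrapper when the driver implements the op */
		if (!rdev->ops->set_multicast_to_unicast)
			return -EOPNOTSUPP;

		return rdev_set_multicast_to_unicast(rdev, dev, enabled);
	}
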
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 5dbac3749738..753efcd51fa3 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -748,21 +748,6 @@ static bool is_valid_rd(const struct ieee80211_regdomain *rd)
748 return true; 748 return true;
749} 749}
750 750
751static bool reg_does_bw_fit(const struct ieee80211_freq_range *freq_range,
752 u32 center_freq_khz, u32 bw_khz)
753{
754 u32 start_freq_khz, end_freq_khz;
755
756 start_freq_khz = center_freq_khz - (bw_khz/2);
757 end_freq_khz = center_freq_khz + (bw_khz/2);
758
759 if (start_freq_khz >= freq_range->start_freq_khz &&
760 end_freq_khz <= freq_range->end_freq_khz)
761 return true;
762
763 return false;
764}
765
766/** 751/**
767 * freq_in_rule_band - tells us if a frequency is in a frequency band 752 * freq_in_rule_band - tells us if a frequency is in a frequency band
768 * @freq_range: frequency rule we want to query 753 * @freq_range: frequency rule we want to query
@@ -1070,7 +1055,7 @@ freq_reg_info_regd(u32 center_freq,
1070 if (!band_rule_found) 1055 if (!band_rule_found)
1071 band_rule_found = freq_in_rule_band(fr, center_freq); 1056 band_rule_found = freq_in_rule_band(fr, center_freq);
1072 1057
1073 bw_fits = reg_does_bw_fit(fr, center_freq, bw); 1058 bw_fits = cfg80211_does_bw_fit_range(fr, center_freq, bw);
1074 1059
1075 if (band_rule_found && bw_fits) 1060 if (band_rule_found && bw_fits)
1076 return rr; 1061 return rr;
@@ -1138,11 +1123,13 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd
1138 max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); 1123 max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule);
1139 1124
1140 /* If we get a reg_rule we can assume that at least 5Mhz fit */ 1125 /* If we get a reg_rule we can assume that at least 5Mhz fit */
1141 if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), 1126 if (!cfg80211_does_bw_fit_range(freq_range,
1142 MHZ_TO_KHZ(10))) 1127 MHZ_TO_KHZ(chan->center_freq),
1128 MHZ_TO_KHZ(10)))
1143 bw_flags |= IEEE80211_CHAN_NO_10MHZ; 1129 bw_flags |= IEEE80211_CHAN_NO_10MHZ;
1144 if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), 1130 if (!cfg80211_does_bw_fit_range(freq_range,
1145 MHZ_TO_KHZ(20))) 1131 MHZ_TO_KHZ(chan->center_freq),
1132 MHZ_TO_KHZ(20)))
1146 bw_flags |= IEEE80211_CHAN_NO_20MHZ; 1133 bw_flags |= IEEE80211_CHAN_NO_20MHZ;
1147 1134
1148 if (max_bandwidth_khz < MHZ_TO_KHZ(10)) 1135 if (max_bandwidth_khz < MHZ_TO_KHZ(10))
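reg_does_bw_fit() is not removed outright; it is hoisted into cfg80211 as cfg80211_does_bw_fit_range() (added in util.c below) so the regulatory code and the new OF frequency-limit code can share it. A worked example with illustrative numbers only:

	static void example_bw_fit(void)
	{
		/* illustrative rule: 2402-2482 MHz, expressed in kHz */
		struct ieee80211_freq_range rule = {
			.start_freq_khz = MHZ_TO_KHZ(2402),
			.end_freq_khz   = MHZ_TO_KHZ(2482),
		};

		/* 40 MHz centred on 2422 MHz: edges 2402/2442 MHz -> fits */
		WARN_ON(!cfg80211_does_bw_fit_range(&rule, MHZ_TO_KHZ(2422),
						    MHZ_TO_KHZ(40)));

		/* 40 MHz centred on 2472 MHz: upper edge 2492 MHz -> too high */
		WARN_ON(cfg80211_does_bw_fit_range(&rule, MHZ_TO_KHZ(2472),
						   MHZ_TO_KHZ(40)));
	}
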
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 35ad69fd0838..21be56b3128e 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -227,7 +227,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
227 ASSERT_RTNL(); 227 ASSERT_RTNL();
228 228
229 if (rdev->scan_msg) { 229 if (rdev->scan_msg) {
230 nl80211_send_scan_result(rdev, rdev->scan_msg); 230 nl80211_send_scan_msg(rdev, rdev->scan_msg);
231 rdev->scan_msg = NULL; 231 rdev->scan_msg = NULL;
232 return; 232 return;
233 } 233 }
@@ -273,7 +273,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
273 if (!send_message) 273 if (!send_message)
274 rdev->scan_msg = msg; 274 rdev->scan_msg = msg;
275 else 275 else
276 nl80211_send_scan_result(rdev, msg); 276 nl80211_send_scan_msg(rdev, msg);
277} 277}
278 278
279void __cfg80211_scan_done(struct work_struct *wk) 279void __cfg80211_scan_done(struct work_struct *wk)
@@ -321,7 +321,8 @@ void __cfg80211_sched_scan_results(struct work_struct *wk)
321 spin_unlock_bh(&rdev->bss_lock); 321 spin_unlock_bh(&rdev->bss_lock);
322 request->scan_start = jiffies; 322 request->scan_start = jiffies;
323 } 323 }
324 nl80211_send_sched_scan_results(rdev, request->dev); 324 nl80211_send_sched_scan(rdev, request->dev,
325 NL80211_CMD_SCHED_SCAN_RESULTS);
325 } 326 }
326 327
327 rtnl_unlock(); 328 rtnl_unlock();
@@ -1147,7 +1148,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
1147 else 1148 else
1148 rcu_assign_pointer(tmp.pub.beacon_ies, ies); 1149 rcu_assign_pointer(tmp.pub.beacon_ies, ies);
1149 rcu_assign_pointer(tmp.pub.ies, ies); 1150 rcu_assign_pointer(tmp.pub.ies, ies);
1150 1151
1151 memcpy(tmp.pub.bssid, mgmt->bssid, ETH_ALEN); 1152 memcpy(tmp.pub.bssid, mgmt->bssid, ETH_ALEN);
1152 tmp.pub.channel = channel; 1153 tmp.pub.channel = channel;
1153 tmp.pub.scan_width = data->scan_width; 1154 tmp.pub.scan_width = data->scan_width;
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index a77db333927e..b347e63d7aaa 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -34,11 +34,13 @@ struct cfg80211_conn {
34 CFG80211_CONN_SCAN_AGAIN, 34 CFG80211_CONN_SCAN_AGAIN,
35 CFG80211_CONN_AUTHENTICATE_NEXT, 35 CFG80211_CONN_AUTHENTICATE_NEXT,
36 CFG80211_CONN_AUTHENTICATING, 36 CFG80211_CONN_AUTHENTICATING,
37 CFG80211_CONN_AUTH_FAILED, 37 CFG80211_CONN_AUTH_FAILED_TIMEOUT,
38 CFG80211_CONN_ASSOCIATE_NEXT, 38 CFG80211_CONN_ASSOCIATE_NEXT,
39 CFG80211_CONN_ASSOCIATING, 39 CFG80211_CONN_ASSOCIATING,
40 CFG80211_CONN_ASSOC_FAILED, 40 CFG80211_CONN_ASSOC_FAILED,
41 CFG80211_CONN_ASSOC_FAILED_TIMEOUT,
41 CFG80211_CONN_DEAUTH, 42 CFG80211_CONN_DEAUTH,
43 CFG80211_CONN_ABANDON,
42 CFG80211_CONN_CONNECTED, 44 CFG80211_CONN_CONNECTED,
43 } state; 45 } state;
44 u8 bssid[ETH_ALEN], prev_bssid[ETH_ALEN]; 46 u8 bssid[ETH_ALEN], prev_bssid[ETH_ALEN];
@@ -139,7 +141,8 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev)
139 return err; 141 return err;
140} 142}
141 143
142static int cfg80211_conn_do_work(struct wireless_dev *wdev) 144static int cfg80211_conn_do_work(struct wireless_dev *wdev,
145 enum nl80211_timeout_reason *treason)
143{ 146{
144 struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); 147 struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
145 struct cfg80211_connect_params *params; 148 struct cfg80211_connect_params *params;
@@ -170,7 +173,8 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev)
170 NULL, 0, 173 NULL, 0,
171 params->key, params->key_len, 174 params->key, params->key_len,
172 params->key_idx, NULL, 0); 175 params->key_idx, NULL, 0);
173 case CFG80211_CONN_AUTH_FAILED: 176 case CFG80211_CONN_AUTH_FAILED_TIMEOUT:
177 *treason = NL80211_TIMEOUT_AUTH;
174 return -ENOTCONN; 178 return -ENOTCONN;
175 case CFG80211_CONN_ASSOCIATE_NEXT: 179 case CFG80211_CONN_ASSOCIATE_NEXT:
176 if (WARN_ON(!rdev->ops->assoc)) 180 if (WARN_ON(!rdev->ops->assoc))
@@ -197,6 +201,9 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev)
197 WLAN_REASON_DEAUTH_LEAVING, 201 WLAN_REASON_DEAUTH_LEAVING,
198 false); 202 false);
199 return err; 203 return err;
204 case CFG80211_CONN_ASSOC_FAILED_TIMEOUT:
205 *treason = NL80211_TIMEOUT_ASSOC;
206 /* fall through */
200 case CFG80211_CONN_ASSOC_FAILED: 207 case CFG80211_CONN_ASSOC_FAILED:
201 cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, 208 cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid,
202 NULL, 0, 209 NULL, 0,
@@ -206,6 +213,8 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev)
206 cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, 213 cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid,
207 NULL, 0, 214 NULL, 0,
208 WLAN_REASON_DEAUTH_LEAVING, false); 215 WLAN_REASON_DEAUTH_LEAVING, false);
216 /* fall through */
217 case CFG80211_CONN_ABANDON:
209 /* free directly, disconnected event already sent */ 218 /* free directly, disconnected event already sent */
210 cfg80211_sme_free(wdev); 219 cfg80211_sme_free(wdev);
211 return 0; 220 return 0;
@@ -220,6 +229,7 @@ void cfg80211_conn_work(struct work_struct *work)
220 container_of(work, struct cfg80211_registered_device, conn_work); 229 container_of(work, struct cfg80211_registered_device, conn_work);
221 struct wireless_dev *wdev; 230 struct wireless_dev *wdev;
222 u8 bssid_buf[ETH_ALEN], *bssid = NULL; 231 u8 bssid_buf[ETH_ALEN], *bssid = NULL;
232 enum nl80211_timeout_reason treason;
223 233
224 rtnl_lock(); 234 rtnl_lock();
225 235
@@ -241,10 +251,12 @@ void cfg80211_conn_work(struct work_struct *work)
241 memcpy(bssid_buf, wdev->conn->params.bssid, ETH_ALEN); 251 memcpy(bssid_buf, wdev->conn->params.bssid, ETH_ALEN);
242 bssid = bssid_buf; 252 bssid = bssid_buf;
243 } 253 }
244 if (cfg80211_conn_do_work(wdev)) { 254 treason = NL80211_TIMEOUT_UNSPECIFIED;
255 if (cfg80211_conn_do_work(wdev, &treason)) {
245 __cfg80211_connect_result( 256 __cfg80211_connect_result(
246 wdev->netdev, bssid, 257 wdev->netdev, bssid,
247 NULL, 0, NULL, 0, -1, false, NULL); 258 NULL, 0, NULL, 0, -1, false, NULL,
259 treason);
248 } 260 }
249 wdev_unlock(wdev); 261 wdev_unlock(wdev);
250 } 262 }
@@ -349,7 +361,8 @@ void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len)
349 } else if (status_code != WLAN_STATUS_SUCCESS) { 361 } else if (status_code != WLAN_STATUS_SUCCESS) {
350 __cfg80211_connect_result(wdev->netdev, mgmt->bssid, 362 __cfg80211_connect_result(wdev->netdev, mgmt->bssid,
351 NULL, 0, NULL, 0, 363 NULL, 0, NULL, 0,
352 status_code, false, NULL); 364 status_code, false, NULL,
365 NL80211_TIMEOUT_UNSPECIFIED);
353 } else if (wdev->conn->state == CFG80211_CONN_AUTHENTICATING) { 366 } else if (wdev->conn->state == CFG80211_CONN_AUTHENTICATING) {
354 wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT; 367 wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT;
355 schedule_work(&rdev->conn_work); 368 schedule_work(&rdev->conn_work);
@@ -397,7 +410,7 @@ void cfg80211_sme_auth_timeout(struct wireless_dev *wdev)
397 if (!wdev->conn) 410 if (!wdev->conn)
398 return; 411 return;
399 412
400 wdev->conn->state = CFG80211_CONN_AUTH_FAILED; 413 wdev->conn->state = CFG80211_CONN_AUTH_FAILED_TIMEOUT;
401 schedule_work(&rdev->conn_work); 414 schedule_work(&rdev->conn_work);
402} 415}
403 416
@@ -419,7 +432,18 @@ void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev)
419 if (!wdev->conn) 432 if (!wdev->conn)
420 return; 433 return;
421 434
422 wdev->conn->state = CFG80211_CONN_ASSOC_FAILED; 435 wdev->conn->state = CFG80211_CONN_ASSOC_FAILED_TIMEOUT;
436 schedule_work(&rdev->conn_work);
437}
438
439void cfg80211_sme_abandon_assoc(struct wireless_dev *wdev)
440{
441 struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
442
443 if (!wdev->conn)
444 return;
445
446 wdev->conn->state = CFG80211_CONN_ABANDON;
423 schedule_work(&rdev->conn_work); 447 schedule_work(&rdev->conn_work);
424} 448}
425 449
@@ -550,7 +574,9 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev,
550 574
551 /* we're good if we have a matching bss struct */ 575 /* we're good if we have a matching bss struct */
552 if (bss) { 576 if (bss) {
553 err = cfg80211_conn_do_work(wdev); 577 enum nl80211_timeout_reason treason;
578
579 err = cfg80211_conn_do_work(wdev, &treason);
554 cfg80211_put_bss(wdev->wiphy, bss); 580 cfg80211_put_bss(wdev->wiphy, bss);
555 } else { 581 } else {
556 /* otherwise we'll need to scan for the AP first */ 582 /* otherwise we'll need to scan for the AP first */
@@ -647,7 +673,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
647 const u8 *req_ie, size_t req_ie_len, 673 const u8 *req_ie, size_t req_ie_len,
648 const u8 *resp_ie, size_t resp_ie_len, 674 const u8 *resp_ie, size_t resp_ie_len,
649 int status, bool wextev, 675 int status, bool wextev,
650 struct cfg80211_bss *bss) 676 struct cfg80211_bss *bss,
677 enum nl80211_timeout_reason timeout_reason)
651{ 678{
652 struct wireless_dev *wdev = dev->ieee80211_ptr; 679 struct wireless_dev *wdev = dev->ieee80211_ptr;
653 const u8 *country_ie; 680 const u8 *country_ie;
@@ -666,7 +693,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
666 nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev, 693 nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev,
667 bssid, req_ie, req_ie_len, 694 bssid, req_ie, req_ie_len,
668 resp_ie, resp_ie_len, 695 resp_ie, resp_ie_len,
669 status, GFP_KERNEL); 696 status, timeout_reason, GFP_KERNEL);
670 697
671#ifdef CONFIG_CFG80211_WEXT 698#ifdef CONFIG_CFG80211_WEXT
672 if (wextev) { 699 if (wextev) {
@@ -713,6 +740,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
713 kzfree(wdev->connect_keys); 740 kzfree(wdev->connect_keys);
714 wdev->connect_keys = NULL; 741 wdev->connect_keys = NULL;
715 wdev->ssid_len = 0; 742 wdev->ssid_len = 0;
743 wdev->conn_owner_nlportid = 0;
716 if (bss) { 744 if (bss) {
717 cfg80211_unhold_bss(bss_from_pub(bss)); 745 cfg80211_unhold_bss(bss_from_pub(bss));
718 cfg80211_put_bss(wdev->wiphy, bss); 746 cfg80211_put_bss(wdev->wiphy, bss);
@@ -756,7 +784,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
756void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid, 784void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid,
757 struct cfg80211_bss *bss, const u8 *req_ie, 785 struct cfg80211_bss *bss, const u8 *req_ie,
758 size_t req_ie_len, const u8 *resp_ie, 786 size_t req_ie_len, const u8 *resp_ie,
759 size_t resp_ie_len, int status, gfp_t gfp) 787 size_t resp_ie_len, int status, gfp_t gfp,
788 enum nl80211_timeout_reason timeout_reason)
760{ 789{
761 struct wireless_dev *wdev = dev->ieee80211_ptr; 790 struct wireless_dev *wdev = dev->ieee80211_ptr;
762 struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); 791 struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
@@ -796,6 +825,7 @@ void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid,
796 cfg80211_hold_bss(bss_from_pub(bss)); 825 cfg80211_hold_bss(bss_from_pub(bss));
797 ev->cr.bss = bss; 826 ev->cr.bss = bss;
798 ev->cr.status = status; 827 ev->cr.status = status;
828 ev->cr.timeout_reason = timeout_reason;
799 829
800 spin_lock_irqsave(&wdev->event_lock, flags); 830 spin_lock_irqsave(&wdev->event_lock, flags);
801 list_add_tail(&ev->list, &wdev->event_list); 831 list_add_tail(&ev->list, &wdev->event_list);
@@ -941,6 +971,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
941 971
942 wdev->current_bss = NULL; 972 wdev->current_bss = NULL;
943 wdev->ssid_len = 0; 973 wdev->ssid_len = 0;
974 wdev->conn_owner_nlportid = 0;
944 975
945 nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap); 976 nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap);
946 977
@@ -1084,12 +1115,43 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
1084 kzfree(wdev->connect_keys); 1115 kzfree(wdev->connect_keys);
1085 wdev->connect_keys = NULL; 1116 wdev->connect_keys = NULL;
1086 1117
1118 wdev->conn_owner_nlportid = 0;
1119
1087 if (wdev->conn) 1120 if (wdev->conn)
1088 err = cfg80211_sme_disconnect(wdev, reason); 1121 err = cfg80211_sme_disconnect(wdev, reason);
1089 else if (!rdev->ops->disconnect) 1122 else if (!rdev->ops->disconnect)
1090 cfg80211_mlme_down(rdev, dev); 1123 cfg80211_mlme_down(rdev, dev);
1091 else if (wdev->current_bss) 1124 else if (wdev->ssid_len)
1092 err = rdev_disconnect(rdev, dev, reason); 1125 err = rdev_disconnect(rdev, dev, reason);
1093 1126
1094 return err; 1127 return err;
1095} 1128}
1129
1130/*
1131 * Used to clean up after the connection / connection attempt owner socket
1132 * disconnects
1133 */
1134void cfg80211_autodisconnect_wk(struct work_struct *work)
1135{
1136 struct wireless_dev *wdev =
1137 container_of(work, struct wireless_dev, disconnect_wk);
1138 struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
1139
1140 wdev_lock(wdev);
1141
1142 if (wdev->conn_owner_nlportid) {
1143 /*
1144 * Use disconnect_bssid if still connecting and ops->disconnect
1145 * not implemented. Otherwise we can use cfg80211_disconnect.
1146 */
1147 if (rdev->ops->disconnect || wdev->current_bss)
1148 cfg80211_disconnect(rdev, wdev->netdev,
1149 WLAN_REASON_DEAUTH_LEAVING, true);
1150 else
1151 cfg80211_mlme_deauth(rdev, wdev->netdev,
1152 wdev->disconnect_bssid, NULL, 0,
1153 WLAN_REASON_DEAUTH_LEAVING, false);
1154 }
1155
1156 wdev_unlock(wdev);
1157}
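Two threads run through these sme.c changes: connect failures now carry an nl80211_timeout_reason (NL80211_TIMEOUT_UNSPECIFIED by default, NL80211_TIMEOUT_AUTH or NL80211_TIMEOUT_ASSOC on the respective timeouts), and conn_owner_nlportid is cleared on every terminal path so the new autodisconnect work cannot act on a stale owner. A hedged sketch of how a full-MAC driver could report an association timeout through the extended cfg80211_connect_bss() signature above (status -1 marks a timeout; function name hypothetical):

	static void example_report_assoc_timeout(struct net_device *dev,
						 const u8 *bssid)
	{
		/* status == -1 means timed out; the reason says which stage */
		cfg80211_connect_bss(dev, bssid, NULL, NULL, 0, NULL, 0, -1,
				     GFP_KERNEL, NL80211_TIMEOUT_ASSOC);
	}
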
diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
index 14b3f007826d..570a2b67ca10 100644
--- a/net/wireless/sysfs.c
+++ b/net/wireless/sysfs.c
@@ -39,9 +39,11 @@ SHOW_FMT(address_mask, "%pM", wiphy.addr_mask);
39 39
40static ssize_t name_show(struct device *dev, 40static ssize_t name_show(struct device *dev,
41 struct device_attribute *attr, 41 struct device_attribute *attr,
42 char *buf) { 42 char *buf)
43{
43 struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy; 44 struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy;
44 return sprintf(buf, "%s\n", dev_name(&wiphy->dev)); 45
46 return sprintf(buf, "%s\n", wiphy_name(wiphy));
45} 47}
46static DEVICE_ATTR_RO(name); 48static DEVICE_ATTR_RO(name);
47 49
@@ -130,12 +132,10 @@ static int wiphy_resume(struct device *dev)
130 /* Age scan results with time spent in suspend */ 132 /* Age scan results with time spent in suspend */
131 cfg80211_bss_age(rdev, get_seconds() - rdev->suspend_at); 133 cfg80211_bss_age(rdev, get_seconds() - rdev->suspend_at);
132 134
133 if (rdev->ops->resume) { 135 rtnl_lock();
134 rtnl_lock(); 136 if (rdev->wiphy.registered && rdev->ops->resume)
135 if (rdev->wiphy.registered) 137 ret = rdev_resume(rdev);
136 ret = rdev_resume(rdev); 138 rtnl_unlock();
137 rtnl_unlock();
138 }
139 139
140 return ret; 140 return ret;
141} 141}
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index a3d0a91b1e09..776e80cef9b4 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -1281,6 +1281,24 @@ TRACE_EVENT(rdev_connect,
1281 __entry->wpa_versions, __entry->flags, MAC_PR_ARG(prev_bssid)) 1281 __entry->wpa_versions, __entry->flags, MAC_PR_ARG(prev_bssid))
1282); 1282);
1283 1283
1284TRACE_EVENT(rdev_update_connect_params,
1285 TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
1286 struct cfg80211_connect_params *sme, u32 changed),
1287 TP_ARGS(wiphy, netdev, sme, changed),
1288 TP_STRUCT__entry(
1289 WIPHY_ENTRY
1290 NETDEV_ENTRY
1291 __field(u32, changed)
1292 ),
1293 TP_fast_assign(
1294 WIPHY_ASSIGN;
1295 NETDEV_ASSIGN;
1296 __entry->changed = changed;
1297 ),
1298 TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", parameters changed: %u",
1299 WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->changed)
1300);
1301
1284TRACE_EVENT(rdev_set_cqm_rssi_config, 1302TRACE_EVENT(rdev_set_cqm_rssi_config,
1285 TP_PROTO(struct wiphy *wiphy, 1303 TP_PROTO(struct wiphy *wiphy,
1286 struct net_device *netdev, s32 rssi_thold, 1304 struct net_device *netdev, s32 rssi_thold,
@@ -1897,18 +1915,18 @@ TRACE_EVENT(rdev_start_nan,
1897 WIPHY_ENTRY 1915 WIPHY_ENTRY
1898 WDEV_ENTRY 1916 WDEV_ENTRY
1899 __field(u8, master_pref) 1917 __field(u8, master_pref)
1900 __field(u8, dual); 1918 __field(u8, bands);
1901 ), 1919 ),
1902 TP_fast_assign( 1920 TP_fast_assign(
1903 WIPHY_ASSIGN; 1921 WIPHY_ASSIGN;
1904 WDEV_ASSIGN; 1922 WDEV_ASSIGN;
1905 __entry->master_pref = conf->master_pref; 1923 __entry->master_pref = conf->master_pref;
1906 __entry->dual = conf->dual; 1924 __entry->bands = conf->bands;
1907 ), 1925 ),
1908 TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT 1926 TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
1909 ", master preference: %u, dual: %d", 1927 ", master preference: %u, bands: 0x%0x",
1910 WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref, 1928 WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
1911 __entry->dual) 1929 __entry->bands)
1912); 1930);
1913 1931
1914TRACE_EVENT(rdev_nan_change_conf, 1932TRACE_EVENT(rdev_nan_change_conf,
@@ -1919,20 +1937,20 @@ TRACE_EVENT(rdev_nan_change_conf,
1919 WIPHY_ENTRY 1937 WIPHY_ENTRY
1920 WDEV_ENTRY 1938 WDEV_ENTRY
1921 __field(u8, master_pref) 1939 __field(u8, master_pref)
1922 __field(u8, dual); 1940 __field(u8, bands);
1923 __field(u32, changes); 1941 __field(u32, changes);
1924 ), 1942 ),
1925 TP_fast_assign( 1943 TP_fast_assign(
1926 WIPHY_ASSIGN; 1944 WIPHY_ASSIGN;
1927 WDEV_ASSIGN; 1945 WDEV_ASSIGN;
1928 __entry->master_pref = conf->master_pref; 1946 __entry->master_pref = conf->master_pref;
1929 __entry->dual = conf->dual; 1947 __entry->bands = conf->bands;
1930 __entry->changes = changes; 1948 __entry->changes = changes;
1931 ), 1949 ),
1932 TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT 1950 TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
1933 ", master preference: %u, dual: %d, changes: %x", 1951 ", master preference: %u, bands: 0x%0x, changes: %x",
1934 WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref, 1952 WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
1935 __entry->dual, __entry->changes) 1953 __entry->bands, __entry->changes)
1936); 1954);
1937 1955
1938DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan, 1956DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan,
@@ -2472,18 +2490,21 @@ TRACE_EVENT(cfg80211_mgmt_tx_status,
2472 2490
2473TRACE_EVENT(cfg80211_cqm_rssi_notify, 2491TRACE_EVENT(cfg80211_cqm_rssi_notify,
2474 TP_PROTO(struct net_device *netdev, 2492 TP_PROTO(struct net_device *netdev,
2475 enum nl80211_cqm_rssi_threshold_event rssi_event), 2493 enum nl80211_cqm_rssi_threshold_event rssi_event,
2476 TP_ARGS(netdev, rssi_event), 2494 s32 rssi_level),
2495 TP_ARGS(netdev, rssi_event, rssi_level),
2477 TP_STRUCT__entry( 2496 TP_STRUCT__entry(
2478 NETDEV_ENTRY 2497 NETDEV_ENTRY
2479 __field(enum nl80211_cqm_rssi_threshold_event, rssi_event) 2498 __field(enum nl80211_cqm_rssi_threshold_event, rssi_event)
2499 __field(s32, rssi_level)
2480 ), 2500 ),
2481 TP_fast_assign( 2501 TP_fast_assign(
2482 NETDEV_ASSIGN; 2502 NETDEV_ASSIGN;
2483 __entry->rssi_event = rssi_event; 2503 __entry->rssi_event = rssi_event;
2504 __entry->rssi_level = rssi_level;
2484 ), 2505 ),
2485 TP_printk(NETDEV_PR_FMT ", rssi event: %d", 2506 TP_printk(NETDEV_PR_FMT ", rssi event: %d, level: %d",
2486 NETDEV_PR_ARG, __entry->rssi_event) 2507 NETDEV_PR_ARG, __entry->rssi_event, __entry->rssi_level)
2487); 2508);
2488 2509
2489TRACE_EVENT(cfg80211_reg_can_beacon, 2510TRACE_EVENT(cfg80211_reg_can_beacon,
@@ -3030,6 +3051,25 @@ DEFINE_EVENT(wiphy_wdev_evt, rdev_abort_scan,
3030 TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), 3051 TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
3031 TP_ARGS(wiphy, wdev) 3052 TP_ARGS(wiphy, wdev)
3032); 3053);
3054
3055TRACE_EVENT(rdev_set_multicast_to_unicast,
3056 TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
3057 const bool enabled),
3058 TP_ARGS(wiphy, netdev, enabled),
3059 TP_STRUCT__entry(
3060 WIPHY_ENTRY
3061 NETDEV_ENTRY
3062 __field(bool, enabled)
3063 ),
3064 TP_fast_assign(
3065 WIPHY_ASSIGN;
3066 NETDEV_ASSIGN;
3067 __entry->enabled = enabled;
3068 ),
3069 TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", unicast: %s",
3070 WIPHY_PR_ARG, NETDEV_PR_ARG,
3071 BOOL_TO_STR(__entry->enabled))
3072);
3033#endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ 3073#endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
3034 3074
3035#undef TRACE_INCLUDE_PATH 3075#undef TRACE_INCLUDE_PATH
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 659b507b347d..68e5f2ecee1a 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -13,6 +13,7 @@
13#include <net/dsfield.h> 13#include <net/dsfield.h>
14#include <linux/if_vlan.h> 14#include <linux/if_vlan.h>
15#include <linux/mpls.h> 15#include <linux/mpls.h>
16#include <linux/gcd.h>
16#include "core.h" 17#include "core.h"
17#include "rdev-ops.h" 18#include "rdev-ops.h"
18 19
@@ -113,8 +114,7 @@ int ieee80211_frequency_to_channel(int freq)
113} 114}
114EXPORT_SYMBOL(ieee80211_frequency_to_channel); 115EXPORT_SYMBOL(ieee80211_frequency_to_channel);
115 116
116struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy, 117struct ieee80211_channel *ieee80211_get_channel(struct wiphy *wiphy, int freq)
117 int freq)
118{ 118{
119 enum nl80211_band band; 119 enum nl80211_band band;
120 struct ieee80211_supported_band *sband; 120 struct ieee80211_supported_band *sband;
@@ -134,14 +134,13 @@ struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy,
134 134
135 return NULL; 135 return NULL;
136} 136}
137EXPORT_SYMBOL(__ieee80211_get_channel); 137EXPORT_SYMBOL(ieee80211_get_channel);
138 138
139static void set_mandatory_flags_band(struct ieee80211_supported_band *sband, 139static void set_mandatory_flags_band(struct ieee80211_supported_band *sband)
140 enum nl80211_band band)
141{ 140{
142 int i, want; 141 int i, want;
143 142
144 switch (band) { 143 switch (sband->band) {
145 case NL80211_BAND_5GHZ: 144 case NL80211_BAND_5GHZ:
146 want = 3; 145 want = 3;
147 for (i = 0; i < sband->n_bitrates; i++) { 146 for (i = 0; i < sband->n_bitrates; i++) {
@@ -191,6 +190,7 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband,
191 WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e); 190 WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e);
192 break; 191 break;
193 case NUM_NL80211_BANDS: 192 case NUM_NL80211_BANDS:
193 default:
194 WARN_ON(1); 194 WARN_ON(1);
195 break; 195 break;
196 } 196 }
@@ -202,7 +202,7 @@ void ieee80211_set_bitrate_flags(struct wiphy *wiphy)
202 202
203 for (band = 0; band < NUM_NL80211_BANDS; band++) 203 for (band = 0; band < NUM_NL80211_BANDS; band++)
204 if (wiphy->bands[band]) 204 if (wiphy->bands[band])
205 set_mandatory_flags_band(wiphy->bands[band], band); 205 set_mandatory_flags_band(wiphy->bands[band]);
206} 206}
207 207
208bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher) 208bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher)
@@ -618,8 +618,6 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
618 618
619 if (pskb_expand_head(skb, head_need, 0, GFP_ATOMIC)) 619 if (pskb_expand_head(skb, head_need, 0, GFP_ATOMIC))
620 return -ENOMEM; 620 return -ENOMEM;
621
622 skb->truesize += head_need;
623 } 621 }
624 622
625 if (encaps_data) { 623 if (encaps_data) {
@@ -951,7 +949,7 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev)
951 ev->cr.resp_ie, ev->cr.resp_ie_len, 949 ev->cr.resp_ie, ev->cr.resp_ie_len,
952 ev->cr.status, 950 ev->cr.status,
953 ev->cr.status == WLAN_STATUS_SUCCESS, 951 ev->cr.status == WLAN_STATUS_SUCCESS,
954 ev->cr.bss); 952 ev->cr.bss, ev->cr.timeout_reason);
955 break; 953 break;
956 case EVENT_ROAMED: 954 case EVENT_ROAMED:
957 __cfg80211_roamed(wdev, ev->rm.bss, ev->rm.req_ie, 955 __cfg80211_roamed(wdev, ev->rm.bss, ev->rm.req_ie,
@@ -1378,6 +1376,25 @@ static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id)
1378 return false; 1376 return false;
1379} 1377}
1380 1378
1379static size_t skip_ie(const u8 *ies, size_t ielen, size_t pos)
1380{
1381 /* we assume a validly formed IEs buffer */
1382 u8 len = ies[pos + 1];
1383
1384 pos += 2 + len;
1385
1386 /* the IE itself must have 255 bytes for fragments to follow */
1387 if (len < 255)
1388 return pos;
1389
1390 while (pos < ielen && ies[pos] == WLAN_EID_FRAGMENT) {
1391 len = ies[pos + 1];
1392 pos += 2 + len;
1393 }
1394
1395 return pos;
1396}
1397
1381size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, 1398size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen,
1382 const u8 *ids, int n_ids, 1399 const u8 *ids, int n_ids,
1383 const u8 *after_ric, int n_after_ric, 1400 const u8 *after_ric, int n_after_ric,
@@ -1387,14 +1404,14 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen,
1387 1404
1388 while (pos < ielen && ieee80211_id_in_list(ids, n_ids, ies[pos])) { 1405 while (pos < ielen && ieee80211_id_in_list(ids, n_ids, ies[pos])) {
1389 if (ies[pos] == WLAN_EID_RIC_DATA && n_after_ric) { 1406 if (ies[pos] == WLAN_EID_RIC_DATA && n_after_ric) {
1390 pos += 2 + ies[pos + 1]; 1407 pos = skip_ie(ies, ielen, pos);
1391 1408
1392 while (pos < ielen && 1409 while (pos < ielen &&
1393 !ieee80211_id_in_list(after_ric, n_after_ric, 1410 !ieee80211_id_in_list(after_ric, n_after_ric,
1394 ies[pos])) 1411 ies[pos]))
1395 pos += 2 + ies[pos + 1]; 1412 pos = skip_ie(ies, ielen, pos);
1396 } else { 1413 } else {
1397 pos += 2 + ies[pos + 1]; 1414 pos = skip_ie(ies, ielen, pos);
1398 } 1415 }
1399 } 1416 }
1400 1417
@@ -1555,31 +1572,57 @@ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef,
1555} 1572}
1556EXPORT_SYMBOL(ieee80211_chandef_to_operating_class); 1573EXPORT_SYMBOL(ieee80211_chandef_to_operating_class);
1557 1574
1558int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev, 1575static void cfg80211_calculate_bi_data(struct wiphy *wiphy, u32 new_beacon_int,
1559 u32 beacon_int) 1576 u32 *beacon_int_gcd,
1577 bool *beacon_int_different)
1560{ 1578{
1561 struct wireless_dev *wdev; 1579 struct wireless_dev *wdev;
1562 int res = 0;
1563 1580
1564 if (beacon_int < 10 || beacon_int > 10000) 1581 *beacon_int_gcd = 0;
1565 return -EINVAL; 1582 *beacon_int_different = false;
1566 1583
1567 list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { 1584 list_for_each_entry(wdev, &wiphy->wdev_list, list) {
1568 if (!wdev->beacon_interval) 1585 if (!wdev->beacon_interval)
1569 continue; 1586 continue;
1570 if (wdev->beacon_interval != beacon_int) { 1587
1571 res = -EINVAL; 1588 if (!*beacon_int_gcd) {
1572 break; 1589 *beacon_int_gcd = wdev->beacon_interval;
1590 continue;
1573 } 1591 }
1592
1593 if (wdev->beacon_interval == *beacon_int_gcd)
1594 continue;
1595
1596 *beacon_int_different = true;
1597 *beacon_int_gcd = gcd(*beacon_int_gcd, wdev->beacon_interval);
1598 }
1599
1600 if (new_beacon_int && *beacon_int_gcd != new_beacon_int) {
1601 if (*beacon_int_gcd)
1602 *beacon_int_different = true;
1603 *beacon_int_gcd = gcd(*beacon_int_gcd, new_beacon_int);
1574 } 1604 }
1605}
1606
1607int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev,
1608 enum nl80211_iftype iftype, u32 beacon_int)
1609{
1610 /*
1611 * This is just a basic pre-condition check; if interface combinations
1612 * are possible the driver must already be checking those with a call
1613 * to cfg80211_check_combinations(), in which case we'll validate more
1614 * through the cfg80211_calculate_bi_data() call and code in
1615 * cfg80211_iter_combinations().
1616 */
1575 1617
1576 return res; 1618 if (beacon_int < 10 || beacon_int > 10000)
1619 return -EINVAL;
1620
1621 return 0;
1577} 1622}
1578 1623
1579int cfg80211_iter_combinations(struct wiphy *wiphy, 1624int cfg80211_iter_combinations(struct wiphy *wiphy,
1580 const int num_different_channels, 1625 struct iface_combination_params *params,
1581 const u8 radar_detect,
1582 const int iftype_num[NUM_NL80211_IFTYPES],
1583 void (*iter)(const struct ieee80211_iface_combination *c, 1626 void (*iter)(const struct ieee80211_iface_combination *c,
1584 void *data), 1627 void *data),
1585 void *data) 1628 void *data)
@@ -1589,8 +1632,23 @@ int cfg80211_iter_combinations(struct wiphy *wiphy,
1589 int i, j, iftype; 1632 int i, j, iftype;
1590 int num_interfaces = 0; 1633 int num_interfaces = 0;
1591 u32 used_iftypes = 0; 1634 u32 used_iftypes = 0;
1635 u32 beacon_int_gcd;
1636 bool beacon_int_different;
1592 1637
1593 if (radar_detect) { 1638 /*
1639 * This is a bit strange, since the iteration used to rely only on
1640 * the data given by the driver, but here it now relies on context,
1641 * in form of the currently operating interfaces.
1642 * This is OK for all current users, and saves us from having to
1643 * push the GCD calculations into all the drivers.
1644 * In the future, this should probably rely more on data that's in
1645 * cfg80211 already - the only thing not would appear to be any new
1646 * interfaces (while being brought up) and channel/radar data.
1647 */
1648 cfg80211_calculate_bi_data(wiphy, params->new_beacon_int,
1649 &beacon_int_gcd, &beacon_int_different);
1650
1651 if (params->radar_detect) {
1594 rcu_read_lock(); 1652 rcu_read_lock();
1595 regdom = rcu_dereference(cfg80211_regdomain); 1653 regdom = rcu_dereference(cfg80211_regdomain);
1596 if (regdom) 1654 if (regdom)
@@ -1599,8 +1657,8 @@ int cfg80211_iter_combinations(struct wiphy *wiphy,
1599 } 1657 }
1600 1658
1601 for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) { 1659 for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) {
1602 num_interfaces += iftype_num[iftype]; 1660 num_interfaces += params->iftype_num[iftype];
1603 if (iftype_num[iftype] > 0 && 1661 if (params->iftype_num[iftype] > 0 &&
1604 !(wiphy->software_iftypes & BIT(iftype))) 1662 !(wiphy->software_iftypes & BIT(iftype)))
1605 used_iftypes |= BIT(iftype); 1663 used_iftypes |= BIT(iftype);
1606 } 1664 }
@@ -1614,7 +1672,7 @@ int cfg80211_iter_combinations(struct wiphy *wiphy,
1614 1672
1615 if (num_interfaces > c->max_interfaces) 1673 if (num_interfaces > c->max_interfaces)
1616 continue; 1674 continue;
1617 if (num_different_channels > c->num_different_channels) 1675 if (params->num_different_channels > c->num_different_channels)
1618 continue; 1676 continue;
1619 1677
1620 limits = kmemdup(c->limits, sizeof(limits[0]) * c->n_limits, 1678 limits = kmemdup(c->limits, sizeof(limits[0]) * c->n_limits,
@@ -1629,16 +1687,17 @@ int cfg80211_iter_combinations(struct wiphy *wiphy,
1629 all_iftypes |= limits[j].types; 1687 all_iftypes |= limits[j].types;
1630 if (!(limits[j].types & BIT(iftype))) 1688 if (!(limits[j].types & BIT(iftype)))
1631 continue; 1689 continue;
1632 if (limits[j].max < iftype_num[iftype]) 1690 if (limits[j].max < params->iftype_num[iftype])
1633 goto cont; 1691 goto cont;
1634 limits[j].max -= iftype_num[iftype]; 1692 limits[j].max -= params->iftype_num[iftype];
1635 } 1693 }
1636 } 1694 }
1637 1695
1638 if (radar_detect != (c->radar_detect_widths & radar_detect)) 1696 if (params->radar_detect !=
1697 (c->radar_detect_widths & params->radar_detect))
1639 goto cont; 1698 goto cont;
1640 1699
1641 if (radar_detect && c->radar_detect_regions && 1700 if (params->radar_detect && c->radar_detect_regions &&
1642 !(c->radar_detect_regions & BIT(region))) 1701 !(c->radar_detect_regions & BIT(region)))
1643 goto cont; 1702 goto cont;
1644 1703
@@ -1650,6 +1709,14 @@ int cfg80211_iter_combinations(struct wiphy *wiphy,
1650 if ((all_iftypes & used_iftypes) != used_iftypes) 1709 if ((all_iftypes & used_iftypes) != used_iftypes)
1651 goto cont; 1710 goto cont;
1652 1711
1712 if (beacon_int_gcd) {
1713 if (c->beacon_int_min_gcd &&
1714 beacon_int_gcd < c->beacon_int_min_gcd)
1715 goto cont;
1716 if (!c->beacon_int_min_gcd && beacon_int_different)
1717 goto cont;
1718 }
1719
1653 /* This combination covered all interface types and 1720 /* This combination covered all interface types and
1654 * supported the requested numbers, so we're good. 1721 * supported the requested numbers, so we're good.
1655 */ 1722 */
@@ -1672,14 +1739,11 @@ cfg80211_iter_sum_ifcombs(const struct ieee80211_iface_combination *c,
1672} 1739}
1673 1740
1674int cfg80211_check_combinations(struct wiphy *wiphy, 1741int cfg80211_check_combinations(struct wiphy *wiphy,
1675 const int num_different_channels, 1742 struct iface_combination_params *params)
1676 const u8 radar_detect,
1677 const int iftype_num[NUM_NL80211_IFTYPES])
1678{ 1743{
1679 int err, num = 0; 1744 int err, num = 0;
1680 1745
1681 err = cfg80211_iter_combinations(wiphy, num_different_channels, 1746 err = cfg80211_iter_combinations(wiphy, params,
1682 radar_detect, iftype_num,
1683 cfg80211_iter_sum_ifcombs, &num); 1747 cfg80211_iter_sum_ifcombs, &num);
1684 if (err) 1748 if (err)
1685 return err; 1749 return err;
@@ -1781,6 +1845,21 @@ void cfg80211_free_nan_func(struct cfg80211_nan_func *f)
1781} 1845}
1782EXPORT_SYMBOL(cfg80211_free_nan_func); 1846EXPORT_SYMBOL(cfg80211_free_nan_func);
1783 1847
1848bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
1849 u32 center_freq_khz, u32 bw_khz)
1850{
1851 u32 start_freq_khz, end_freq_khz;
1852
1853 start_freq_khz = center_freq_khz - (bw_khz / 2);
1854 end_freq_khz = center_freq_khz + (bw_khz / 2);
1855
1856 if (start_freq_khz >= freq_range->start_freq_khz &&
1857 end_freq_khz <= freq_range->end_freq_khz)
1858 return true;
1859
1860 return false;
1861}
1862
1784/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */ 1863/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */
1785/* Ethernet-II snap header (RFC1042 for most EtherTypes) */ 1864/* Ethernet-II snap header (RFC1042 for most EtherTypes) */
1786const unsigned char rfc1042_header[] __aligned(2) = 1865const unsigned char rfc1042_header[] __aligned(2) =
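Beacon-interval validation changes from "all beaconing interfaces must use the same interval" to a GCD-based rule: cfg80211_calculate_bi_data() folds the intervals of the existing wdevs plus the one being brought up into a GCD and a "different" flag, and cfg80211_iter_combinations() accepts a combination either when its beacon_int_min_gcd is met or, for combinations that do not declare one, only when all intervals are identical. A small illustration with made-up numbers:

	#include <linux/gcd.h>

	/* made-up intervals: 100 and 150 TU give gcd() == 50 and differ */
	static bool example_bi_allowed(u32 beacon_int_min_gcd)
	{
		u32 bi_gcd = gcd(100, 150);	/* 50 */
		bool bi_different = true;	/* 100 != 150 */

		if (beacon_int_min_gcd)		/* combination declares a minimum GCD */
			return bi_gcd >= beacon_int_min_gcd;

		return !bi_different;		/* otherwise intervals must match */
	}
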
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 6250b1cfcde5..1a4db6790e20 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -1119,3 +1119,70 @@ int compat_wext_handle_ioctl(struct net *net, unsigned int cmd,
1119 return ret; 1119 return ret;
1120} 1120}
1121#endif 1121#endif
1122
1123char *iwe_stream_add_event(struct iw_request_info *info, char *stream,
1124 char *ends, struct iw_event *iwe, int event_len)
1125{
1126 int lcp_len = iwe_stream_lcp_len(info);
1127
1128 event_len = iwe_stream_event_len_adjust(info, event_len);
1129
1130 /* Check if it's possible */
1131 if (likely((stream + event_len) < ends)) {
1132 iwe->len = event_len;
1133 /* Beware of alignement issues on 64 bits */
1134 memcpy(stream, (char *) iwe, IW_EV_LCP_PK_LEN);
1135 memcpy(stream + lcp_len, &iwe->u,
1136 event_len - lcp_len);
1137 stream += event_len;
1138 }
1139
1140 return stream;
1141}
1142EXPORT_SYMBOL(iwe_stream_add_event);
1143
1144char *iwe_stream_add_point(struct iw_request_info *info, char *stream,
1145 char *ends, struct iw_event *iwe, char *extra)
1146{
1147 int event_len = iwe_stream_point_len(info) + iwe->u.data.length;
1148 int point_len = iwe_stream_point_len(info);
1149 int lcp_len = iwe_stream_lcp_len(info);
1150
1151 /* Check if it's possible */
1152 if (likely((stream + event_len) < ends)) {
1153 iwe->len = event_len;
1154 memcpy(stream, (char *) iwe, IW_EV_LCP_PK_LEN);
1155 memcpy(stream + lcp_len,
1156 ((char *) &iwe->u) + IW_EV_POINT_OFF,
1157 IW_EV_POINT_PK_LEN - IW_EV_LCP_PK_LEN);
1158 if (iwe->u.data.length && extra)
1159 memcpy(stream + point_len, extra, iwe->u.data.length);
1160 stream += event_len;
1161 }
1162
1163 return stream;
1164}
1165EXPORT_SYMBOL(iwe_stream_add_point);
1166
1167char *iwe_stream_add_value(struct iw_request_info *info, char *event,
1168 char *value, char *ends, struct iw_event *iwe,
1169 int event_len)
1170{
1171 int lcp_len = iwe_stream_lcp_len(info);
1172
1173 /* Don't duplicate LCP */
1174 event_len -= IW_EV_LCP_LEN;
1175
1176 /* Check if it's possible */
1177 if (likely((value + event_len) < ends)) {
1178 /* Add new value */
1179 memcpy(value, &iwe->u, event_len);
1180 value += event_len;
1181 /* Patch LCP */
1182 iwe->len = value - event;
1183 memcpy(event, (char *) iwe, lcp_len);
1184 }
1185
1186 return value;
1187}
1188EXPORT_SYMBOL(iwe_stream_add_value);
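The three iwe_stream helpers above assemble wireless-extensions event streams; their calling convention matches the existing iw_handler API. A hedged example of the typical scan-dump use, adding a BSSID event (function name hypothetical):

	static char *example_add_bssid(struct iw_request_info *info,
				       char *current_ev, char *end_buf,
				       const u8 *bssid)
	{
		struct iw_event iwe = {};

		iwe.cmd = SIOCGIWAP;
		iwe.u.ap_addr.sa_family = ARPHRD_ETHER;
		memcpy(iwe.u.ap_addr.sa_data, bssid, ETH_ALEN);

		return iwe_stream_add_event(info, current_ev, end_buf, &iwe,
					    IW_EV_ADDR_LEN);
	}
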
diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c
index 995163830a61..c434f193f39a 100644
--- a/net/wireless/wext-sme.c
+++ b/net/wireless/wext-sme.c
@@ -105,30 +105,7 @@ int cfg80211_mgd_wext_siwfreq(struct net_device *dev,
105 goto out; 105 goto out;
106 } 106 }
107 107
108
109 wdev->wext.connect.channel = chan; 108 wdev->wext.connect.channel = chan;
110
111 /*
112 * SSID is not set, we just want to switch monitor channel,
113 * this is really just backward compatibility, if the SSID
114 * is set then we use the channel to select the BSS to use
115 * to connect to instead. If we were connected on another
116 * channel we disconnected above and reconnect below.
117 */
118 if (chan && !wdev->wext.connect.ssid_len) {
119 struct cfg80211_chan_def chandef = {
120 .width = NL80211_CHAN_WIDTH_20_NOHT,
121 .center_freq1 = freq,
122 };
123
124 chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq);
125 if (chandef.chan)
126 err = cfg80211_set_monitor_channel(rdev, &chandef);
127 else
128 err = -EINVAL;
129 goto out;
130 }
131
132 err = cfg80211_mgd_wext_connect(rdev, wdev); 109 err = cfg80211_mgd_wext_connect(rdev, wdev);
133 out: 110 out:
134 wdev_unlock(wdev); 111 wdev_unlock(wdev);
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index f83b74d3e2ac..8b911c29860e 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -41,7 +41,7 @@
41#include <linux/capability.h> 41#include <linux/capability.h>
42#include <linux/errno.h> 42#include <linux/errno.h>
43#include <linux/kernel.h> 43#include <linux/kernel.h>
44#include <linux/sched.h> 44#include <linux/sched/signal.h>
45#include <linux/timer.h> 45#include <linux/timer.h>
46#include <linux/string.h> 46#include <linux/string.h>
47#include <linux/net.h> 47#include <linux/net.h>
@@ -51,7 +51,7 @@
51#include <linux/slab.h> 51#include <linux/slab.h>
52#include <net/sock.h> 52#include <net/sock.h>
53#include <net/tcp_states.h> 53#include <net/tcp_states.h>
54#include <asm/uaccess.h> 54#include <linux/uaccess.h>
55#include <linux/fcntl.h> 55#include <linux/fcntl.h>
56#include <linux/termios.h> /* For TIOCINQ/OUTQ */ 56#include <linux/termios.h> /* For TIOCINQ/OUTQ */
57#include <linux/notifier.h> 57#include <linux/notifier.h>
@@ -852,7 +852,8 @@ static int x25_wait_for_data(struct sock *sk, long timeout)
852 return rc; 852 return rc;
853} 853}
854 854
855static int x25_accept(struct socket *sock, struct socket *newsock, int flags) 855static int x25_accept(struct socket *sock, struct socket *newsock, int flags,
856 bool kern)
856{ 857{
857 struct sock *sk = sock->sk; 858 struct sock *sk = sock->sk;
858 struct sock *newsk; 859 struct sock *newsk;
diff --git a/net/x25/sysctl_net_x25.c b/net/x25/sysctl_net_x25.c
index 43239527a205..a06dfe143c67 100644
--- a/net/x25/sysctl_net_x25.c
+++ b/net/x25/sysctl_net_x25.c
@@ -70,7 +70,7 @@ static struct ctl_table x25_table[] = {
70 .mode = 0644, 70 .mode = 0644,
71 .proc_handler = proc_dointvec, 71 .proc_handler = proc_dointvec,
72 }, 72 },
73 { 0, }, 73 { },
74}; 74};
75 75
76void __init x25_register_sysctl(void) 76void __init x25_register_sysctl(void)
diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c
index fd5ffb25873f..bcaa180d6a3f 100644
--- a/net/x25/x25_link.c
+++ b/net/x25/x25_link.c
@@ -29,7 +29,7 @@
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/netdevice.h> 30#include <linux/netdevice.h>
31#include <linux/skbuff.h> 31#include <linux/skbuff.h>
32#include <asm/uaccess.h> 32#include <linux/uaccess.h>
33#include <linux/init.h> 33#include <linux/init.h>
34#include <net/x25.h> 34#include <net/x25.h>
35 35
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index bda1a13628a8..286ed25c1a69 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -4,6 +4,11 @@
4config XFRM 4config XFRM
5 bool 5 bool
6 depends on NET 6 depends on NET
7 select GRO_CELLS
8
9config XFRM_OFFLOAD
10 bool
11 depends on XFRM
7 12
8config XFRM_ALGO 13config XFRM_ALGO
9 tristate 14 tristate
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 6e3f0254d8a1..46bdb4fbed0b 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -19,16 +19,18 @@
19static struct kmem_cache *secpath_cachep __read_mostly; 19static struct kmem_cache *secpath_cachep __read_mostly;
20 20
21static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); 21static DEFINE_SPINLOCK(xfrm_input_afinfo_lock);
22static struct xfrm_input_afinfo __rcu *xfrm_input_afinfo[NPROTO]; 22static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1];
23 23
24int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo) 24static struct gro_cells gro_cells;
25static struct net_device xfrm_napi_dev;
26
27int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo)
25{ 28{
26 int err = 0; 29 int err = 0;
27 30
28 if (unlikely(afinfo == NULL)) 31 if (WARN_ON(afinfo->family >= ARRAY_SIZE(xfrm_input_afinfo)))
29 return -EINVAL;
30 if (unlikely(afinfo->family >= NPROTO))
31 return -EAFNOSUPPORT; 32 return -EAFNOSUPPORT;
33
32 spin_lock_bh(&xfrm_input_afinfo_lock); 34 spin_lock_bh(&xfrm_input_afinfo_lock);
33 if (unlikely(xfrm_input_afinfo[afinfo->family] != NULL)) 35 if (unlikely(xfrm_input_afinfo[afinfo->family] != NULL))
34 err = -EEXIST; 36 err = -EEXIST;
@@ -39,14 +41,10 @@ int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo)
39} 41}
40EXPORT_SYMBOL(xfrm_input_register_afinfo); 42EXPORT_SYMBOL(xfrm_input_register_afinfo);
41 43
42int xfrm_input_unregister_afinfo(struct xfrm_input_afinfo *afinfo) 44int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo)
43{ 45{
44 int err = 0; 46 int err = 0;
45 47
46 if (unlikely(afinfo == NULL))
47 return -EINVAL;
48 if (unlikely(afinfo->family >= NPROTO))
49 return -EAFNOSUPPORT;
50 spin_lock_bh(&xfrm_input_afinfo_lock); 48 spin_lock_bh(&xfrm_input_afinfo_lock);
51 if (likely(xfrm_input_afinfo[afinfo->family] != NULL)) { 49 if (likely(xfrm_input_afinfo[afinfo->family] != NULL)) {
52 if (unlikely(xfrm_input_afinfo[afinfo->family] != afinfo)) 50 if (unlikely(xfrm_input_afinfo[afinfo->family] != afinfo))
@@ -60,12 +58,13 @@ int xfrm_input_unregister_afinfo(struct xfrm_input_afinfo *afinfo)
60} 58}
61EXPORT_SYMBOL(xfrm_input_unregister_afinfo); 59EXPORT_SYMBOL(xfrm_input_unregister_afinfo);
62 60
63static struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family) 61static const struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family)
64{ 62{
65 struct xfrm_input_afinfo *afinfo; 63 const struct xfrm_input_afinfo *afinfo;
66 64
67 if (unlikely(family >= NPROTO)) 65 if (WARN_ON_ONCE(family >= ARRAY_SIZE(xfrm_input_afinfo)))
68 return NULL; 66 return NULL;
67
69 rcu_read_lock(); 68 rcu_read_lock();
70 afinfo = rcu_dereference(xfrm_input_afinfo[family]); 69 afinfo = rcu_dereference(xfrm_input_afinfo[family]);
71 if (unlikely(!afinfo)) 70 if (unlikely(!afinfo))
@@ -73,22 +72,17 @@ static struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family)
73 return afinfo; 72 return afinfo;
74} 73}
75 74
76static void xfrm_input_put_afinfo(struct xfrm_input_afinfo *afinfo)
77{
78 rcu_read_unlock();
79}
80
81static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol, 75static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol,
82 int err) 76 int err)
83{ 77{
84 int ret; 78 int ret;
85 struct xfrm_input_afinfo *afinfo = xfrm_input_get_afinfo(family); 79 const struct xfrm_input_afinfo *afinfo = xfrm_input_get_afinfo(family);
86 80
87 if (!afinfo) 81 if (!afinfo)
88 return -EAFNOSUPPORT; 82 return -EAFNOSUPPORT;
89 83
90 ret = afinfo->callback(skb, protocol, err); 84 ret = afinfo->callback(skb, protocol, err);
91 xfrm_input_put_afinfo(afinfo); 85 rcu_read_unlock();
92 86
93 return ret; 87 return ret;
94} 88}
@@ -111,6 +105,8 @@ struct sec_path *secpath_dup(struct sec_path *src)
111 return NULL; 105 return NULL;
112 106
113 sp->len = 0; 107 sp->len = 0;
108 sp->olen = 0;
109
114 if (src) { 110 if (src) {
115 int i; 111 int i;
116 112
@@ -123,6 +119,24 @@ struct sec_path *secpath_dup(struct sec_path *src)
123} 119}
124EXPORT_SYMBOL(secpath_dup); 120EXPORT_SYMBOL(secpath_dup);
125 121
122int secpath_set(struct sk_buff *skb)
123{
124 struct sec_path *sp;
125
126 /* Allocate new secpath or COW existing one. */
127 if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
128 sp = secpath_dup(skb->sp);
129 if (!sp)
130 return -ENOMEM;
131
132 if (skb->sp)
133 secpath_put(skb->sp);
134 skb->sp = sp;
135 }
136 return 0;
137}
138EXPORT_SYMBOL(secpath_set);
139
126/* Fetch spi and seq from ipsec header */ 140/* Fetch spi and seq from ipsec header */
127 141
128int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq) 142int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq)
@@ -158,6 +172,7 @@ int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq)
158 *seq = *(__be32 *)(skb_transport_header(skb) + offset_seq); 172 *seq = *(__be32 *)(skb_transport_header(skb) + offset_seq);
159 return 0; 173 return 0;
160} 174}
175EXPORT_SYMBOL(xfrm_parse_spi);
161 176
162int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb) 177int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb)
163{ 178{
@@ -192,14 +207,23 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
192 unsigned int family; 207 unsigned int family;
193 int decaps = 0; 208 int decaps = 0;
194 int async = 0; 209 int async = 0;
210 struct xfrm_offload *xo;
211 bool xfrm_gro = false;
195 212
196 /* A negative encap_type indicates async resumption. */
197 if (encap_type < 0) { 213 if (encap_type < 0) {
198 async = 1;
199 x = xfrm_input_state(skb); 214 x = xfrm_input_state(skb);
200 seq = XFRM_SKB_CB(skb)->seq.input.low;
201 family = x->outer_mode->afinfo->family; 215 family = x->outer_mode->afinfo->family;
202 goto resume; 216
217 /* An encap_type of -1 indicates async resumption. */
218 if (encap_type == -1) {
219 async = 1;
220 seq = XFRM_SKB_CB(skb)->seq.input.low;
221 goto resume;
222 }
223 /* encap_type < -1 indicates a GRO call. */
224 encap_type = 0;
225 seq = XFRM_SPI_SKB_CB(skb)->seq;
226 goto lock;
203 } 227 }
204 228
205 daddr = (xfrm_address_t *)(skb_network_header(skb) + 229 daddr = (xfrm_address_t *)(skb_network_header(skb) +
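With this hunk a negative encap_type no longer means only async resumption: -1 keeps the old resume behaviour, while values below -1 mark a packet handed in from the GRO codepath, which re-enters the normal state handling at the new "lock" label with the sequence number taken from XFRM_SPI_SKB_CB. A tiny hypothetical mock of that three-way dispatch:

#include <stdio.h>

/* -1 keeps the old async-resume meaning, anything below -1 is a GRO call */
static const char *classify(int encap_type)
{
        if (encap_type >= 0)
                return "normal receive";
        if (encap_type == -1)
                return "async resume";
        return "GRO receive";
}

int main(void)
{
        int samples[] = { 0, 4, -1, -2 };
        size_t i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("%d -> %s\n", samples[i], classify(samples[i]));
        return 0;
}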
@@ -218,18 +242,10 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
218 break; 242 break;
219 } 243 }
220 244
221 /* Allocate new secpath or COW existing one. */ 245 err = secpath_set(skb);
222 if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { 246 if (err) {
223 struct sec_path *sp; 247 XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
224 248 goto drop;
225 sp = secpath_dup(skb->sp);
226 if (!sp) {
227 XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
228 goto drop;
229 }
230 if (skb->sp)
231 secpath_put(skb->sp);
232 skb->sp = sp;
233 } 249 }
234 250
235 seq = 0; 251 seq = 0;
@@ -253,6 +269,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
253 269
254 skb->sp->xvec[skb->sp->len++] = x; 270 skb->sp->xvec[skb->sp->len++] = x;
255 271
272lock:
256 spin_lock(&x->lock); 273 spin_lock(&x->lock);
257 274
258 if (unlikely(x->km.state != XFRM_STATE_VALID)) { 275 if (unlikely(x->km.state != XFRM_STATE_VALID)) {
@@ -371,10 +388,21 @@ resume:
371 388
372 if (decaps) { 389 if (decaps) {
373 skb_dst_drop(skb); 390 skb_dst_drop(skb);
374 netif_rx(skb); 391 gro_cells_receive(&gro_cells, skb);
375 return 0; 392 return 0;
376 } else { 393 } else {
377 return x->inner_mode->afinfo->transport_finish(skb, async); 394 xo = xfrm_offload(skb);
395 if (xo)
396 xfrm_gro = xo->flags & XFRM_GRO;
397
398 err = x->inner_mode->afinfo->transport_finish(skb, async);
399 if (xfrm_gro) {
400 skb_dst_drop(skb);
401 gro_cells_receive(&gro_cells, skb);
402 return err;
403 }
404
405 return err;
378 } 406 }
379 407
380drop_unlock: 408drop_unlock:
@@ -394,6 +422,13 @@ EXPORT_SYMBOL(xfrm_input_resume);
394 422
395void __init xfrm_input_init(void) 423void __init xfrm_input_init(void)
396{ 424{
425 int err;
426
427 init_dummy_netdev(&xfrm_napi_dev);
428 err = gro_cells_init(&gro_cells, &xfrm_napi_dev);
429 if (err)
430 gro_cells.cells = NULL;
431
397 secpath_cachep = kmem_cache_create("secpath_cache", 432 secpath_cachep = kmem_cache_create("secpath_cache",
398 sizeof(struct sec_path), 433 sizeof(struct sec_path),
399 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, 434 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
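xfrm_input_init() now sets up a dummy netdev and a gro_cells instance for delivering decapsulated packets; if gro_cells_init() fails, the cells pointer is simply left NULL, which (as far as the gro_cells API goes) should make gro_cells_receive() fall back to the ordinary receive path rather than disabling input altogether. A hypothetical mock of that degrade-instead-of-fail pattern:

#include <stdio.h>

struct rx_cells { void *cells; };

static struct rx_cells gro_cells;

static int cells_init(struct rx_cells *g, int simulate_failure)
{
        g->cells = simulate_failure ? NULL : (void *)g;
        return simulate_failure ? -1 : 0;
}

static void cells_receive(struct rx_cells *g, const char *pkt)
{
        if (!g->cells) {                /* fast path never came up */
                printf("slow path for %s\n", pkt);
                return;
        }
        printf("GRO path for %s\n", pkt);
}

int main(void)
{
        if (cells_init(&gro_cells, 1))
                gro_cells.cells = NULL; /* keep going without the fast path */
        cells_receive(&gro_cells, "esp packet");
        return 0;
}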
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 637387bbaaea..8ba29fe58352 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -246,10 +246,8 @@ void xfrm_local_error(struct sk_buff *skb, int mtu)
246 return; 246 return;
247 247
248 afinfo = xfrm_state_get_afinfo(proto); 248 afinfo = xfrm_state_get_afinfo(proto);
249 if (!afinfo) 249 if (afinfo)
250 return; 250 afinfo->local_error(skb, mtu);
251 251 rcu_read_unlock();
252 afinfo->local_error(skb, mtu);
253 xfrm_state_put_afinfo(afinfo);
254} 252}
255EXPORT_SYMBOL_GPL(xfrm_local_error); 253EXPORT_SYMBOL_GPL(xfrm_local_error);
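The xfrm_output.c change is purely structural: the early return on a missing afinfo is replaced by gating the handler call on the lookup result, with a single unlock at the end of the function. A hypothetical mock of that single-exit shape:

#include <stdio.h>

struct handler { void (*local_error)(int mtu); };

static void ipv4_local_error(int mtu) { printf("local error, mtu %d\n", mtu); }

static void read_lock(void)   { printf("read lock\n"); }
static void read_unlock(void) { printf("read unlock\n"); }

static void local_error(const struct handler *h, int mtu)
{
        read_lock();
        if (h)
                h->local_error(mtu);
        read_unlock();  /* single exit: always balanced with read_lock() */
}

int main(void)
{
        const struct handler v4 = { .local_error = ipv4_local_error };

        local_error(&v4, 1400);
        local_error(NULL, 1400);        /* no handler registered: still balanced */
        return 0;
}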
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 5bf7e1bfeac7..236cbbc0ab9c 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -45,7 +45,7 @@ struct xfrm_flo {
45}; 45};
46 46
47static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock); 47static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
48static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO] 48static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
49 __read_mostly; 49 __read_mostly;
50 50
51static struct kmem_cache *xfrm_dst_cache __read_mostly; 51static struct kmem_cache *xfrm_dst_cache __read_mostly;
@@ -103,11 +103,11 @@ bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl
103 return false; 103 return false;
104} 104}
105 105
106static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family) 106static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
107{ 107{
108 struct xfrm_policy_afinfo *afinfo; 108 const struct xfrm_policy_afinfo *afinfo;
109 109
110 if (unlikely(family >= NPROTO)) 110 if (unlikely(family >= ARRAY_SIZE(xfrm_policy_afinfo)))
111 return NULL; 111 return NULL;
112 rcu_read_lock(); 112 rcu_read_lock();
113 afinfo = rcu_dereference(xfrm_policy_afinfo[family]); 113 afinfo = rcu_dereference(xfrm_policy_afinfo[family]);
@@ -116,18 +116,13 @@ static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
116 return afinfo; 116 return afinfo;
117} 117}
118 118
119static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
120{
121 rcu_read_unlock();
122}
123
124static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, 119static inline struct dst_entry *__xfrm_dst_lookup(struct net *net,
125 int tos, int oif, 120 int tos, int oif,
126 const xfrm_address_t *saddr, 121 const xfrm_address_t *saddr,
127 const xfrm_address_t *daddr, 122 const xfrm_address_t *daddr,
128 int family) 123 int family)
129{ 124{
130 struct xfrm_policy_afinfo *afinfo; 125 const struct xfrm_policy_afinfo *afinfo;
131 struct dst_entry *dst; 126 struct dst_entry *dst;
132 127
133 afinfo = xfrm_policy_get_afinfo(family); 128 afinfo = xfrm_policy_get_afinfo(family);
@@ -136,7 +131,7 @@ static inline struct dst_entry *__xfrm_dst_lookup(struct net *net,
136 131
137 dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr); 132 dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr);
138 133
139 xfrm_policy_put_afinfo(afinfo); 134 rcu_read_unlock();
140 135
141 return dst; 136 return dst;
142} 137}
@@ -330,7 +325,7 @@ void xfrm_policy_destroy(struct xfrm_policy *policy)
330} 325}
331EXPORT_SYMBOL(xfrm_policy_destroy); 326EXPORT_SYMBOL(xfrm_policy_destroy);
332 327
333/* Rule must be locked. Release descentant resources, announce 328/* Rule must be locked. Release descendant resources, announce
334 * entry dead. The rule must be unlinked from lists to the moment. 329 * entry dead. The rule must be unlinked from lists to the moment.
335 */ 330 */
336 331
@@ -1248,7 +1243,7 @@ static inline int policy_to_flow_dir(int dir)
1248} 1243}
1249 1244
1250static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, 1245static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
1251 const struct flowi *fl) 1246 const struct flowi *fl, u16 family)
1252{ 1247{
1253 struct xfrm_policy *pol; 1248 struct xfrm_policy *pol;
1254 1249
@@ -1256,8 +1251,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
1256 again: 1251 again:
1257 pol = rcu_dereference(sk->sk_policy[dir]); 1252 pol = rcu_dereference(sk->sk_policy[dir]);
1258 if (pol != NULL) { 1253 if (pol != NULL) {
1259 bool match = xfrm_selector_match(&pol->selector, fl, 1254 bool match = xfrm_selector_match(&pol->selector, fl, family);
1260 sk->sk_family);
1261 int err = 0; 1255 int err = 0;
1262 1256
1263 if (match) { 1257 if (match) {
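xfrm_sk_policy_lookup() now receives the flow's address family from its callers instead of using sk->sk_family, presumably because a dual-stack IPv6 socket can carry IPv4 flows and the selector has to be matched against the family of the flow being looked up. A hypothetical mock of why the two families can disagree:

#include <stdio.h>

struct selector { unsigned short family; };

static int selector_match(const struct selector *sel, unsigned short family)
{
        return sel->family == family;   /* real code also compares addresses */
}

int main(void)
{
        struct selector sel = { .family = 2 };  /* policy selector: AF_INET */
        unsigned short flow_family = 2;         /* IPv4 flow ...             */
        unsigned short sock_family = 10;        /* ... on an AF_INET6 socket */

        printf("match on flow family: %d\n", selector_match(&sel, flow_family));
        printf("match on sock family: %d\n", selector_match(&sel, sock_family));
        return 0;
}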
@@ -1431,12 +1425,12 @@ xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local,
1431 xfrm_address_t *remote, unsigned short family) 1425 xfrm_address_t *remote, unsigned short family)
1432{ 1426{
1433 int err; 1427 int err;
1434 struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); 1428 const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
1435 1429
1436 if (unlikely(afinfo == NULL)) 1430 if (unlikely(afinfo == NULL))
1437 return -EINVAL; 1431 return -EINVAL;
1438 err = afinfo->get_saddr(net, oif, local, remote); 1432 err = afinfo->get_saddr(net, oif, local, remote);
1439 xfrm_policy_put_afinfo(afinfo); 1433 rcu_read_unlock();
1440 return err; 1434 return err;
1441} 1435}
1442 1436
@@ -1538,21 +1532,15 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl,
1538 1532
1539} 1533}
1540 1534
1541/* Check that the bundle accepts the flow and its components are 1535static int xfrm_get_tos(const struct flowi *fl, int family)
1542 * still valid.
1543 */
1544
1545static inline int xfrm_get_tos(const struct flowi *fl, int family)
1546{ 1536{
1547 struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); 1537 const struct xfrm_policy_afinfo *afinfo;
1548 int tos; 1538 int tos = 0;
1549
1550 if (!afinfo)
1551 return -EINVAL;
1552 1539
1553 tos = afinfo->get_tos(fl); 1540 afinfo = xfrm_policy_get_afinfo(family);
1541 tos = afinfo ? afinfo->get_tos(fl) : 0;
1554 1542
1555 xfrm_policy_put_afinfo(afinfo); 1543 rcu_read_unlock();
1556 1544
1557 return tos; 1545 return tos;
1558} 1546}
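xfrm_get_tos() can no longer fail: a missing per-family handler just yields a ToS of 0, and the corresponding error check is removed from xfrm_bundle_create() further down. A hypothetical mock of the defaulting behaviour:

#include <stdio.h>

struct afinfo { int (*get_tos)(int flow_tos); };

static int ipv4_get_tos(int flow_tos)
{
        return flow_tos & 0x1e;         /* made-up per-family extraction */
}

static int get_tos(const struct afinfo *afinfo, int flow_tos)
{
        return afinfo ? afinfo->get_tos(flow_tos) : 0;  /* default, not error */
}

int main(void)
{
        const struct afinfo v4 = { .get_tos = ipv4_get_tos };

        printf("with handler:    %d\n", get_tos(&v4, 0x2f));
        printf("without handler: %d\n", get_tos(NULL, 0x2f));
        return 0;
}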
@@ -1609,7 +1597,7 @@ static const struct flow_cache_ops xfrm_bundle_fc_ops = {
1609 1597
1610static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) 1598static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
1611{ 1599{
1612 struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); 1600 const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
1613 struct dst_ops *dst_ops; 1601 struct dst_ops *dst_ops;
1614 struct xfrm_dst *xdst; 1602 struct xfrm_dst *xdst;
1615 1603
@@ -1638,7 +1626,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
1638 } else 1626 } else
1639 xdst = ERR_PTR(-ENOBUFS); 1627 xdst = ERR_PTR(-ENOBUFS);
1640 1628
1641 xfrm_policy_put_afinfo(afinfo); 1629 rcu_read_unlock();
1642 1630
1643 return xdst; 1631 return xdst;
1644} 1632}
@@ -1646,7 +1634,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
1646static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst, 1634static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
1647 int nfheader_len) 1635 int nfheader_len)
1648{ 1636{
1649 struct xfrm_policy_afinfo *afinfo = 1637 const struct xfrm_policy_afinfo *afinfo =
1650 xfrm_policy_get_afinfo(dst->ops->family); 1638 xfrm_policy_get_afinfo(dst->ops->family);
1651 int err; 1639 int err;
1652 1640
@@ -1655,7 +1643,7 @@ static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
1655 1643
1656 err = afinfo->init_path(path, dst, nfheader_len); 1644 err = afinfo->init_path(path, dst, nfheader_len);
1657 1645
1658 xfrm_policy_put_afinfo(afinfo); 1646 rcu_read_unlock();
1659 1647
1660 return err; 1648 return err;
1661} 1649}
@@ -1663,7 +1651,7 @@ static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
1663static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, 1651static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
1664 const struct flowi *fl) 1652 const struct flowi *fl)
1665{ 1653{
1666 struct xfrm_policy_afinfo *afinfo = 1654 const struct xfrm_policy_afinfo *afinfo =
1667 xfrm_policy_get_afinfo(xdst->u.dst.ops->family); 1655 xfrm_policy_get_afinfo(xdst->u.dst.ops->family);
1668 int err; 1656 int err;
1669 1657
@@ -1672,7 +1660,7 @@ static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
1672 1660
1673 err = afinfo->fill_dst(xdst, dev, fl); 1661 err = afinfo->fill_dst(xdst, dev, fl);
1674 1662
1675 xfrm_policy_put_afinfo(afinfo); 1663 rcu_read_unlock();
1676 1664
1677 return err; 1665 return err;
1678} 1666}
@@ -1705,9 +1693,6 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
1705 xfrm_flowi_addr_get(fl, &saddr, &daddr, family); 1693 xfrm_flowi_addr_get(fl, &saddr, &daddr, family);
1706 1694
1707 tos = xfrm_get_tos(fl, family); 1695 tos = xfrm_get_tos(fl, family);
1708 err = tos;
1709 if (tos < 0)
1710 goto put_states;
1711 1696
1712 dst_hold(dst); 1697 dst_hold(dst);
1713 1698
@@ -2215,7 +2200,7 @@ error:
2215static struct dst_entry *make_blackhole(struct net *net, u16 family, 2200static struct dst_entry *make_blackhole(struct net *net, u16 family,
2216 struct dst_entry *dst_orig) 2201 struct dst_entry *dst_orig)
2217{ 2202{
2218 struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); 2203 const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
2219 struct dst_entry *ret; 2204 struct dst_entry *ret;
2220 2205
2221 if (!afinfo) { 2206 if (!afinfo) {
@@ -2224,7 +2209,7 @@ static struct dst_entry *make_blackhole(struct net *net, u16 family,
2224 } else { 2209 } else {
2225 ret = afinfo->blackhole_route(net, dst_orig); 2210 ret = afinfo->blackhole_route(net, dst_orig);
2226 } 2211 }
2227 xfrm_policy_put_afinfo(afinfo); 2212 rcu_read_unlock();
2228 2213
2229 return ret; 2214 return ret;
2230} 2215}
@@ -2253,7 +2238,7 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
2253 sk = sk_const_to_full_sk(sk); 2238 sk = sk_const_to_full_sk(sk);
2254 if (sk && sk->sk_policy[XFRM_POLICY_OUT]) { 2239 if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
2255 num_pols = 1; 2240 num_pols = 1;
2256 pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl); 2241 pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family);
2257 err = xfrm_expand_policies(fl, family, pols, 2242 err = xfrm_expand_policies(fl, family, pols,
2258 &num_pols, &num_xfrms); 2243 &num_pols, &num_xfrms);
2259 if (err < 0) 2244 if (err < 0)
@@ -2466,7 +2451,7 @@ xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int star
2466int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, 2451int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
2467 unsigned int family, int reverse) 2452 unsigned int family, int reverse)
2468{ 2453{
2469 struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); 2454 const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
2470 int err; 2455 int err;
2471 2456
2472 if (unlikely(afinfo == NULL)) 2457 if (unlikely(afinfo == NULL))
@@ -2474,7 +2459,7 @@ int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
2474 2459
2475 afinfo->decode_session(skb, fl, reverse); 2460 afinfo->decode_session(skb, fl, reverse);
2476 err = security_xfrm_decode_session(skb, &fl->flowi_secid); 2461 err = security_xfrm_decode_session(skb, &fl->flowi_secid);
2477 xfrm_policy_put_afinfo(afinfo); 2462 rcu_read_unlock();
2478 return err; 2463 return err;
2479} 2464}
2480EXPORT_SYMBOL(__xfrm_decode_session); 2465EXPORT_SYMBOL(__xfrm_decode_session);
@@ -2532,7 +2517,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
2532 pol = NULL; 2517 pol = NULL;
2533 sk = sk_to_full_sk(sk); 2518 sk = sk_to_full_sk(sk);
2534 if (sk && sk->sk_policy[dir]) { 2519 if (sk && sk->sk_policy[dir]) {
2535 pol = xfrm_sk_policy_lookup(sk, dir, &fl); 2520 pol = xfrm_sk_policy_lookup(sk, dir, &fl, family);
2536 if (IS_ERR(pol)) { 2521 if (IS_ERR(pol)) {
2537 XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); 2522 XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
2538 return 0; 2523 return 0;
@@ -2742,10 +2727,11 @@ void xfrm_garbage_collect(struct net *net)
2742} 2727}
2743EXPORT_SYMBOL(xfrm_garbage_collect); 2728EXPORT_SYMBOL(xfrm_garbage_collect);
2744 2729
2745static void xfrm_garbage_collect_deferred(struct net *net) 2730void xfrm_garbage_collect_deferred(struct net *net)
2746{ 2731{
2747 flow_cache_flush_deferred(net); 2732 flow_cache_flush_deferred(net);
2748} 2733}
2734EXPORT_SYMBOL(xfrm_garbage_collect_deferred);
2749 2735
2750static void xfrm_init_pmtu(struct dst_entry *dst) 2736static void xfrm_init_pmtu(struct dst_entry *dst)
2751{ 2737{
@@ -2849,22 +2835,52 @@ static unsigned int xfrm_mtu(const struct dst_entry *dst)
2849 return mtu ? : dst_mtu(dst->path); 2835 return mtu ? : dst_mtu(dst->path);
2850} 2836}
2851 2837
2838static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst,
2839 const void *daddr)
2840{
2841 const struct dst_entry *path = dst->path;
2842
2843 for (; dst != path; dst = dst->child) {
2844 const struct xfrm_state *xfrm = dst->xfrm;
2845
2846 if (xfrm->props.mode == XFRM_MODE_TRANSPORT)
2847 continue;
2848 if (xfrm->type->flags & XFRM_TYPE_REMOTE_COADDR)
2849 daddr = xfrm->coaddr;
2850 else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR))
2851 daddr = &xfrm->id.daddr;
2852 }
2853 return daddr;
2854}
2855
2852static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, 2856static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
2853 struct sk_buff *skb, 2857 struct sk_buff *skb,
2854 const void *daddr) 2858 const void *daddr)
2855{ 2859{
2856 return dst->path->ops->neigh_lookup(dst, skb, daddr); 2860 const struct dst_entry *path = dst->path;
2861
2862 if (!skb)
2863 daddr = xfrm_get_dst_nexthop(dst, daddr);
2864 return path->ops->neigh_lookup(path, skb, daddr);
2865}
2866
2867static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
2868{
2869 const struct dst_entry *path = dst->path;
2870
2871 daddr = xfrm_get_dst_nexthop(dst, daddr);
2872 path->ops->confirm_neigh(path, daddr);
2857} 2873}
2858 2874
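The new xfrm_get_dst_nexthop() helper walks the chain of transform dst entries towards the path dst and, for tunnel-mode states, substitutes the tunnel endpoint for the original destination; both xfrm_neigh_lookup() and the new xfrm_confirm_neigh() use it so neighbour operations target the real next hop. A simplified userspace sketch of the walk (the co-address flag handling is omitted, all names hypothetical):

#include <stdio.h>

enum mode { TRANSPORT, TUNNEL };

struct entry {
        enum mode mode;
        const char *tunnel_daddr;       /* only meaningful in tunnel mode */
        struct entry *child;
};

static const char *nexthop(const struct entry *dst, const struct entry *path,
                           const char *daddr)
{
        for (; dst != path; dst = dst->child) {
                if (dst->mode == TRANSPORT)
                        continue;               /* keeps the original daddr */
                daddr = dst->tunnel_daddr;      /* tunnel endpoint is next hop */
        }
        return daddr;
}

int main(void)
{
        struct entry path      = { TRANSPORT, NULL, NULL };
        struct entry tunnel    = { TUNNEL, "192.0.2.1", &path };
        struct entry transport = { TRANSPORT, NULL, &tunnel };

        printf("next hop: %s\n", nexthop(&transport, &path, "198.51.100.7"));
        return 0;
}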
2859int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) 2875int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family)
2860{ 2876{
2861 int err = 0; 2877 int err = 0;
2862 if (unlikely(afinfo == NULL)) 2878
2863 return -EINVAL; 2879 if (WARN_ON(family >= ARRAY_SIZE(xfrm_policy_afinfo)))
2864 if (unlikely(afinfo->family >= NPROTO))
2865 return -EAFNOSUPPORT; 2880 return -EAFNOSUPPORT;
2881
2866 spin_lock(&xfrm_policy_afinfo_lock); 2882 spin_lock(&xfrm_policy_afinfo_lock);
2867 if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL)) 2883 if (unlikely(xfrm_policy_afinfo[family] != NULL))
2868 err = -EEXIST; 2884 err = -EEXIST;
2869 else { 2885 else {
2870 struct dst_ops *dst_ops = afinfo->dst_ops; 2886 struct dst_ops *dst_ops = afinfo->dst_ops;
@@ -2882,9 +2898,9 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
2882 dst_ops->link_failure = xfrm_link_failure; 2898 dst_ops->link_failure = xfrm_link_failure;
2883 if (likely(dst_ops->neigh_lookup == NULL)) 2899 if (likely(dst_ops->neigh_lookup == NULL))
2884 dst_ops->neigh_lookup = xfrm_neigh_lookup; 2900 dst_ops->neigh_lookup = xfrm_neigh_lookup;
2885 if (likely(afinfo->garbage_collect == NULL)) 2901 if (likely(!dst_ops->confirm_neigh))
2886 afinfo->garbage_collect = xfrm_garbage_collect_deferred; 2902 dst_ops->confirm_neigh = xfrm_confirm_neigh;
2887 rcu_assign_pointer(xfrm_policy_afinfo[afinfo->family], afinfo); 2903 rcu_assign_pointer(xfrm_policy_afinfo[family], afinfo);
2888 } 2904 }
2889 spin_unlock(&xfrm_policy_afinfo_lock); 2905 spin_unlock(&xfrm_policy_afinfo_lock);
2890 2906
@@ -2892,34 +2908,24 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
2892} 2908}
2893EXPORT_SYMBOL(xfrm_policy_register_afinfo); 2909EXPORT_SYMBOL(xfrm_policy_register_afinfo);
2894 2910
2895int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo) 2911void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo)
2896{ 2912{
2897 int err = 0; 2913 struct dst_ops *dst_ops = afinfo->dst_ops;
2898 if (unlikely(afinfo == NULL)) 2914 int i;
2899 return -EINVAL; 2915
2900 if (unlikely(afinfo->family >= NPROTO)) 2916 for (i = 0; i < ARRAY_SIZE(xfrm_policy_afinfo); i++) {
2901 return -EAFNOSUPPORT; 2917 if (xfrm_policy_afinfo[i] != afinfo)
2902 spin_lock(&xfrm_policy_afinfo_lock); 2918 continue;
2903 if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) { 2919 RCU_INIT_POINTER(xfrm_policy_afinfo[i], NULL);
2904 if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo)) 2920 break;
2905 err = -EINVAL;
2906 else
2907 RCU_INIT_POINTER(xfrm_policy_afinfo[afinfo->family],
2908 NULL);
2909 } 2921 }
2910 spin_unlock(&xfrm_policy_afinfo_lock);
2911 if (!err) {
2912 struct dst_ops *dst_ops = afinfo->dst_ops;
2913 2922
2914 synchronize_rcu(); 2923 synchronize_rcu();
2915 2924
2916 dst_ops->kmem_cachep = NULL; 2925 dst_ops->kmem_cachep = NULL;
2917 dst_ops->check = NULL; 2926 dst_ops->check = NULL;
2918 dst_ops->negative_advice = NULL; 2927 dst_ops->negative_advice = NULL;
2919 dst_ops->link_failure = NULL; 2928 dst_ops->link_failure = NULL;
2920 afinfo->garbage_collect = NULL;
2921 }
2922 return err;
2923} 2929}
2924EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); 2930EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);
2925 2931
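xfrm_policy_register_afinfo() now takes the family explicitly and xfrm_policy_unregister_afinfo() becomes a void function that scans the table for the matching entry, clears it and synchronizes RCU, instead of validating its argument and propagating errors nobody checked. A hypothetical mock of the new unregister flow:

#include <stdio.h>
#include <stddef.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

struct afinfo { const char *name; };

static const struct afinfo *table[11];

/* no argument validation, no error code: scan, clear, done */
static void unregister_afinfo(const struct afinfo *afinfo)
{
        size_t i;

        for (i = 0; i < ARRAY_SIZE(table); i++) {
                if (table[i] != afinfo)
                        continue;
                table[i] = NULL;        /* RCU_INIT_POINTER(..., NULL) */
                break;
        }
}

int main(void)
{
        static const struct afinfo v4 = { "ipv4" };

        table[2] = &v4;
        unregister_afinfo(&v4);
        printf("slot 2: %s\n", table[2] ? table[2]->name : "empty");
        return 0;
}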
@@ -3062,6 +3068,11 @@ static int __net_init xfrm_net_init(struct net *net)
3062{ 3068{
3063 int rv; 3069 int rv;
3064 3070
3071 /* Initialize the per-net locks here */
3072 spin_lock_init(&net->xfrm.xfrm_state_lock);
3073 spin_lock_init(&net->xfrm.xfrm_policy_lock);
3074 mutex_init(&net->xfrm.xfrm_cfg_mutex);
3075
3065 rv = xfrm_statistics_init(net); 3076 rv = xfrm_statistics_init(net);
3066 if (rv < 0) 3077 if (rv < 0)
3067 goto out_statistics; 3078 goto out_statistics;
@@ -3078,11 +3089,6 @@ static int __net_init xfrm_net_init(struct net *net)
3078 if (rv < 0) 3089 if (rv < 0)
3079 goto out; 3090 goto out;
3080 3091
3081 /* Initialize the per-net locks here */
3082 spin_lock_init(&net->xfrm.xfrm_state_lock);
3083 spin_lock_init(&net->xfrm.xfrm_policy_lock);
3084 mutex_init(&net->xfrm.xfrm_cfg_mutex);
3085
3086 return 0; 3092 return 0;
3087 3093
3088out: 3094out:
@@ -3113,6 +3119,7 @@ static struct pernet_operations __net_initdata xfrm_net_ops = {
3113 3119
3114void __init xfrm_init(void) 3120void __init xfrm_init(void)
3115{ 3121{
3122 flow_cache_hp_init();
3116 register_pernet_subsys(&xfrm_net_ops); 3123 register_pernet_subsys(&xfrm_net_ops);
3117 seqcount_init(&xfrm_policy_hash_generation); 3124 seqcount_init(&xfrm_policy_hash_generation);
3118 xfrm_input_init(); 3125 xfrm_input_init();
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 419bf5d463bd..5a597dbbe564 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -20,7 +20,7 @@
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/cache.h> 21#include <linux/cache.h>
22#include <linux/audit.h> 22#include <linux/audit.h>
23#include <asm/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/ktime.h> 24#include <linux/ktime.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/interrupt.h> 26#include <linux/interrupt.h>
@@ -192,7 +192,7 @@ int xfrm_register_type(const struct xfrm_type *type, unsigned short family)
192 else 192 else
193 err = -EEXIST; 193 err = -EEXIST;
194 spin_unlock_bh(&xfrm_type_lock); 194 spin_unlock_bh(&xfrm_type_lock);
195 xfrm_state_put_afinfo(afinfo); 195 rcu_read_unlock();
196 return err; 196 return err;
197} 197}
198EXPORT_SYMBOL(xfrm_register_type); 198EXPORT_SYMBOL(xfrm_register_type);
@@ -213,7 +213,7 @@ int xfrm_unregister_type(const struct xfrm_type *type, unsigned short family)
213 else 213 else
214 typemap[type->proto] = NULL; 214 typemap[type->proto] = NULL;
215 spin_unlock_bh(&xfrm_type_lock); 215 spin_unlock_bh(&xfrm_type_lock);
216 xfrm_state_put_afinfo(afinfo); 216 rcu_read_unlock();
217 return err; 217 return err;
218} 218}
219EXPORT_SYMBOL(xfrm_unregister_type); 219EXPORT_SYMBOL(xfrm_unregister_type);
@@ -231,17 +231,18 @@ retry:
231 return NULL; 231 return NULL;
232 typemap = afinfo->type_map; 232 typemap = afinfo->type_map;
233 233
234 type = typemap[proto]; 234 type = READ_ONCE(typemap[proto]);
235 if (unlikely(type && !try_module_get(type->owner))) 235 if (unlikely(type && !try_module_get(type->owner)))
236 type = NULL; 236 type = NULL;
237
238 rcu_read_unlock();
239
237 if (!type && !modload_attempted) { 240 if (!type && !modload_attempted) {
238 xfrm_state_put_afinfo(afinfo);
239 request_module("xfrm-type-%d-%d", family, proto); 241 request_module("xfrm-type-%d-%d", family, proto);
240 modload_attempted = 1; 242 modload_attempted = 1;
241 goto retry; 243 goto retry;
242 } 244 }
243 245
244 xfrm_state_put_afinfo(afinfo);
245 return type; 246 return type;
246} 247}
247 248
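xfrm_get_type() (and xfrm_get_mode() below it) now samples the map entry with READ_ONCE() and drops the RCU read section before request_module(), which may sleep; on a miss the whole lookup is retried once after the module-load attempt. A hypothetical mock of that lookup/load/retry shape:

#include <stdio.h>

static const char *lookup(int proto)
{
        return proto == 50 ? "esp" : NULL;      /* pretend only ESP is loaded */
}

static const char *get_type(int proto)
{
        int modload_attempted = 0;
        const char *type;

retry:
        type = lookup(proto);           /* done under the read-side lock */
        /* lock dropped here, before anything that may sleep */
        if (!type && !modload_attempted) {
                printf("request_module(xfrm-type-*-%d)\n", proto);
                modload_attempted = 1;
                goto retry;
        }
        return type;
}

int main(void)
{
        const char *t;

        t = get_type(50);
        printf("proto 50 -> %s\n", t ? t : "(none)");
        t = get_type(51);
        printf("proto 51 -> %s\n", t ? t : "(none)");
        return 0;
}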
@@ -280,7 +281,7 @@ int xfrm_register_mode(struct xfrm_mode *mode, int family)
280 281
281out: 282out:
282 spin_unlock_bh(&xfrm_mode_lock); 283 spin_unlock_bh(&xfrm_mode_lock);
283 xfrm_state_put_afinfo(afinfo); 284 rcu_read_unlock();
284 return err; 285 return err;
285} 286}
286EXPORT_SYMBOL(xfrm_register_mode); 287EXPORT_SYMBOL(xfrm_register_mode);
@@ -308,7 +309,7 @@ int xfrm_unregister_mode(struct xfrm_mode *mode, int family)
308 } 309 }
309 310
310 spin_unlock_bh(&xfrm_mode_lock); 311 spin_unlock_bh(&xfrm_mode_lock);
311 xfrm_state_put_afinfo(afinfo); 312 rcu_read_unlock();
312 return err; 313 return err;
313} 314}
314EXPORT_SYMBOL(xfrm_unregister_mode); 315EXPORT_SYMBOL(xfrm_unregister_mode);
@@ -327,17 +328,17 @@ retry:
327 if (unlikely(afinfo == NULL)) 328 if (unlikely(afinfo == NULL))
328 return NULL; 329 return NULL;
329 330
330 mode = afinfo->mode_map[encap]; 331 mode = READ_ONCE(afinfo->mode_map[encap]);
331 if (unlikely(mode && !try_module_get(mode->owner))) 332 if (unlikely(mode && !try_module_get(mode->owner)))
332 mode = NULL; 333 mode = NULL;
334
335 rcu_read_unlock();
333 if (!mode && !modload_attempted) { 336 if (!mode && !modload_attempted) {
334 xfrm_state_put_afinfo(afinfo);
335 request_module("xfrm-mode-%d-%d", family, encap); 337 request_module("xfrm-mode-%d-%d", family, encap);
336 modload_attempted = 1; 338 modload_attempted = 1;
337 goto retry; 339 goto retry;
338 } 340 }
339 341
340 xfrm_state_put_afinfo(afinfo);
341 return mode; 342 return mode;
342} 343}
343 344
@@ -388,14 +389,6 @@ static void xfrm_state_gc_task(struct work_struct *work)
388 xfrm_state_gc_destroy(x); 389 xfrm_state_gc_destroy(x);
389} 390}
390 391
391static inline unsigned long make_jiffies(long secs)
392{
393 if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
394 return MAX_SCHEDULE_TIMEOUT-1;
395 else
396 return secs*HZ;
397}
398
399static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) 392static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
400{ 393{
401 struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer); 394 struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer);
@@ -417,7 +410,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
417 if (x->xflags & XFRM_SOFT_EXPIRE) { 410 if (x->xflags & XFRM_SOFT_EXPIRE) {
418 /* enter hard expire without soft expire first?! 411 /* enter hard expire without soft expire first?!
419 * setting a new date could trigger this. 412 * setting a new date could trigger this.
420 * workarbound: fix x->curflt.add_time by below: 413 * workaround: fix x->curflt.add_time by below:
421 */ 414 */
422 x->curlft.add_time = now - x->saved_tmo - 1; 415 x->curlft.add_time = now - x->saved_tmo - 1;
423 tmo = x->lft.hard_add_expires_seconds - x->saved_tmo; 416 tmo = x->lft.hard_add_expires_seconds - x->saved_tmo;
@@ -647,26 +640,25 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si)
647} 640}
648EXPORT_SYMBOL(xfrm_sad_getinfo); 641EXPORT_SYMBOL(xfrm_sad_getinfo);
649 642
650static int 643static void
651xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl, 644xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl,
652 const struct xfrm_tmpl *tmpl, 645 const struct xfrm_tmpl *tmpl,
653 const xfrm_address_t *daddr, const xfrm_address_t *saddr, 646 const xfrm_address_t *daddr, const xfrm_address_t *saddr,
654 unsigned short family) 647 unsigned short family)
655{ 648{
656 struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); 649 struct xfrm_state_afinfo *afinfo = xfrm_state_afinfo_get_rcu(family);
650
657 if (!afinfo) 651 if (!afinfo)
658 return -1; 652 return;
653
659 afinfo->init_tempsel(&x->sel, fl); 654 afinfo->init_tempsel(&x->sel, fl);
660 655
661 if (family != tmpl->encap_family) { 656 if (family != tmpl->encap_family) {
662 xfrm_state_put_afinfo(afinfo); 657 afinfo = xfrm_state_afinfo_get_rcu(tmpl->encap_family);
663 afinfo = xfrm_state_get_afinfo(tmpl->encap_family);
664 if (!afinfo) 658 if (!afinfo)
665 return -1; 659 return;
666 } 660 }
667 afinfo->init_temprop(x, tmpl, daddr, saddr); 661 afinfo->init_temprop(x, tmpl, daddr, saddr);
668 xfrm_state_put_afinfo(afinfo);
669 return 0;
670} 662}
671 663
672static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, 664static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark,
@@ -1412,7 +1404,7 @@ int xfrm_state_check_expire(struct xfrm_state *x)
1412 if (x->curlft.bytes >= x->lft.hard_byte_limit || 1404 if (x->curlft.bytes >= x->lft.hard_byte_limit ||
1413 x->curlft.packets >= x->lft.hard_packet_limit) { 1405 x->curlft.packets >= x->lft.hard_packet_limit) {
1414 x->km.state = XFRM_STATE_EXPIRED; 1406 x->km.state = XFRM_STATE_EXPIRED;
1415 tasklet_hrtimer_start(&x->mtimer, ktime_set(0, 0), HRTIMER_MODE_REL); 1407 tasklet_hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL);
1416 return -EINVAL; 1408 return -EINVAL;
1417 } 1409 }
1418 1410
@@ -1482,7 +1474,7 @@ xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
1482 if (afinfo->tmpl_sort) 1474 if (afinfo->tmpl_sort)
1483 err = afinfo->tmpl_sort(dst, src, n); 1475 err = afinfo->tmpl_sort(dst, src, n);
1484 spin_unlock_bh(&net->xfrm.xfrm_state_lock); 1476 spin_unlock_bh(&net->xfrm.xfrm_state_lock);
1485 xfrm_state_put_afinfo(afinfo); 1477 rcu_read_unlock();
1486 return err; 1478 return err;
1487} 1479}
1488EXPORT_SYMBOL(xfrm_tmpl_sort); 1480EXPORT_SYMBOL(xfrm_tmpl_sort);
@@ -1502,7 +1494,7 @@ xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
1502 if (afinfo->state_sort) 1494 if (afinfo->state_sort)
1503 err = afinfo->state_sort(dst, src, n); 1495 err = afinfo->state_sort(dst, src, n);
1504 spin_unlock_bh(&net->xfrm.xfrm_state_lock); 1496 spin_unlock_bh(&net->xfrm.xfrm_state_lock);
1505 xfrm_state_put_afinfo(afinfo); 1497 rcu_read_unlock();
1506 return err; 1498 return err;
1507} 1499}
1508EXPORT_SYMBOL(xfrm_state_sort); 1500EXPORT_SYMBOL(xfrm_state_sort);
@@ -1940,10 +1932,10 @@ EXPORT_SYMBOL(xfrm_unregister_km);
1940int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo) 1932int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo)
1941{ 1933{
1942 int err = 0; 1934 int err = 0;
1943 if (unlikely(afinfo == NULL)) 1935
1944 return -EINVAL; 1936 if (WARN_ON(afinfo->family >= NPROTO))
1945 if (unlikely(afinfo->family >= NPROTO))
1946 return -EAFNOSUPPORT; 1937 return -EAFNOSUPPORT;
1938
1947 spin_lock_bh(&xfrm_state_afinfo_lock); 1939 spin_lock_bh(&xfrm_state_afinfo_lock);
1948 if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL)) 1940 if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL))
1949 err = -EEXIST; 1941 err = -EEXIST;
@@ -1956,14 +1948,14 @@ EXPORT_SYMBOL(xfrm_state_register_afinfo);
1956 1948
1957int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo) 1949int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo)
1958{ 1950{
1959 int err = 0; 1951 int err = 0, family = afinfo->family;
1960 if (unlikely(afinfo == NULL)) 1952
1961 return -EINVAL; 1953 if (WARN_ON(family >= NPROTO))
1962 if (unlikely(afinfo->family >= NPROTO))
1963 return -EAFNOSUPPORT; 1954 return -EAFNOSUPPORT;
1955
1964 spin_lock_bh(&xfrm_state_afinfo_lock); 1956 spin_lock_bh(&xfrm_state_afinfo_lock);
1965 if (likely(xfrm_state_afinfo[afinfo->family] != NULL)) { 1957 if (likely(xfrm_state_afinfo[afinfo->family] != NULL)) {
1966 if (unlikely(xfrm_state_afinfo[afinfo->family] != afinfo)) 1958 if (rcu_access_pointer(xfrm_state_afinfo[family]) != afinfo)
1967 err = -EINVAL; 1959 err = -EINVAL;
1968 else 1960 else
1969 RCU_INIT_POINTER(xfrm_state_afinfo[afinfo->family], NULL); 1961 RCU_INIT_POINTER(xfrm_state_afinfo[afinfo->family], NULL);
@@ -1974,6 +1966,14 @@ int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo)
1974} 1966}
1975EXPORT_SYMBOL(xfrm_state_unregister_afinfo); 1967EXPORT_SYMBOL(xfrm_state_unregister_afinfo);
1976 1968
1969struct xfrm_state_afinfo *xfrm_state_afinfo_get_rcu(unsigned int family)
1970{
1971 if (unlikely(family >= NPROTO))
1972 return NULL;
1973
1974 return rcu_dereference(xfrm_state_afinfo[family]);
1975}
1976
1977struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family) 1977struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family)
1978{ 1978{
1979 struct xfrm_state_afinfo *afinfo; 1979 struct xfrm_state_afinfo *afinfo;
@@ -1986,11 +1986,6 @@ struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family)
1986 return afinfo; 1986 return afinfo;
1987} 1987}
1988 1988
1989void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo)
1990{
1991 rcu_read_unlock();
1992}
1993
1994/* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ 1989/* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */
1995void xfrm_state_delete_tunnel(struct xfrm_state *x) 1990void xfrm_state_delete_tunnel(struct xfrm_state *x)
1996{ 1991{
@@ -2008,16 +2003,13 @@ EXPORT_SYMBOL(xfrm_state_delete_tunnel);
2008 2003
2009int xfrm_state_mtu(struct xfrm_state *x, int mtu) 2004int xfrm_state_mtu(struct xfrm_state *x, int mtu)
2010{ 2005{
2011 int res; 2006 const struct xfrm_type *type = READ_ONCE(x->type);
2012 2007
2013 spin_lock_bh(&x->lock);
2014 if (x->km.state == XFRM_STATE_VALID && 2008 if (x->km.state == XFRM_STATE_VALID &&
2015 x->type && x->type->get_mtu) 2009 type && type->get_mtu)
2016 res = x->type->get_mtu(x, mtu); 2010 return type->get_mtu(x, mtu);
2017 else 2011
2018 res = mtu - x->props.header_len; 2012 return mtu - x->props.header_len;
2019 spin_unlock_bh(&x->lock);
2020 return res;
2021} 2013}
2022 2014
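xfrm_state_mtu() no longer takes x->lock: the type pointer is read once so the NULL check and the get_mtu() call observe the same value, and the header-length fallback is returned directly. A hypothetical mock of the single-sample pattern:

#include <stdio.h>

struct state_type { int (*get_mtu)(int mtu); };

struct state {
        const struct state_type *type;
        int header_len;
        int valid;
};

static int esp_get_mtu(int mtu)
{
        return mtu - 73;                /* made-up transform overhead */
}

static int state_mtu(const struct state *x, int mtu)
{
        const struct state_type *type = x->type;        /* sampled once */

        if (x->valid && type && type->get_mtu)
                return type->get_mtu(mtu);

        return mtu - x->header_len;
}

int main(void)
{
        const struct state_type esp = { .get_mtu = esp_get_mtu };
        struct state x = { .type = &esp, .header_len = 20, .valid = 1 };

        printf("with type:    %d\n", state_mtu(&x, 1500));
        x.type = NULL;
        printf("without type: %d\n", state_mtu(&x, 1500));
        return 0;
}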
2023int __xfrm_init_state(struct xfrm_state *x, bool init_replay) 2015int __xfrm_init_state(struct xfrm_state *x, bool init_replay)
@@ -2036,7 +2028,7 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay)
2036 if (afinfo->init_flags) 2028 if (afinfo->init_flags)
2037 err = afinfo->init_flags(x); 2029 err = afinfo->init_flags(x);
2038 2030
2039 xfrm_state_put_afinfo(afinfo); 2031 rcu_read_unlock();
2040 2032
2041 if (err) 2033 if (err)
2042 goto error; 2034 goto error;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 671a1d0333f0..40a8aa39220d 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -27,7 +27,7 @@
27#include <net/xfrm.h> 27#include <net/xfrm.h>
28#include <net/netlink.h> 28#include <net/netlink.h>
29#include <net/ah.h> 29#include <net/ah.h>
30#include <asm/uaccess.h> 30#include <linux/uaccess.h>
31#if IS_ENABLED(CONFIG_IPV6) 31#if IS_ENABLED(CONFIG_IPV6)
32#include <linux/in6.h> 32#include <linux/in6.h>
33#endif 33#endif
@@ -412,7 +412,14 @@ static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_es
412 up = nla_data(rp); 412 up = nla_data(rp);
413 ulen = xfrm_replay_state_esn_len(up); 413 ulen = xfrm_replay_state_esn_len(up);
414 414
415 if (nla_len(rp) < ulen || xfrm_replay_state_esn_len(replay_esn) != ulen) 415 /* Check the overall length and the internal bitmap length to avoid
416 * potential overflow. */
417 if (nla_len(rp) < ulen ||
418 xfrm_replay_state_esn_len(replay_esn) != ulen ||
419 replay_esn->bmp_len != up->bmp_len)
420 return -EINVAL;
421
422 if (up->replay_window > up->bmp_len * sizeof(__u32) * 8)
416 return -EINVAL; 423 return -EINVAL;
417 424
418 return 0; 425 return 0;
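The extended check in xfrm_replay_verify_len() also requires the userspace-supplied bmp_len to match the kernel's copy and bounds the replay window by the number of bits the bitmap can actually hold, closing an overflow when later code indexes the bitmap by window offset. A stand-alone sketch of the added arithmetic check (hypothetical struct, same formula):

#include <stdio.h>
#include <stdint.h>

struct replay_state_esn {
        uint32_t bmp_len;               /* bitmap length in 32-bit words */
        uint32_t replay_window;         /* window size in bits */
};

static int replay_window_ok(const struct replay_state_esn *up)
{
        /* the window may not exceed the bits the bitmap can actually hold */
        return up->replay_window <= up->bmp_len * sizeof(uint32_t) * 8;
}

int main(void)
{
        struct replay_state_esn ok  = { .bmp_len = 4, .replay_window = 128 };
        struct replay_state_esn bad = { .bmp_len = 4, .replay_window = 129 };

        printf("128 bits over 4 words: %s\n", replay_window_ok(&ok)  ? "ok" : "rejected");
        printf("129 bits over 4 words: %s\n", replay_window_ok(&bad) ? "ok" : "rejected");
        return 0;
}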