author     Liam Girdwood <lrg@slimlogic.co.uk>    2010-11-03 10:11:27 -0400
committer  Liam Girdwood <lrg@slimlogic.co.uk>    2010-11-03 10:11:27 -0400
commit     8f987768eb99631374f4ab0bb19cd062baf1397d (patch)
tree       b89aa5c207f7ba6a688f45657424b937f17ceb8a /net
parent     63f7526f26f0a9291ac3f7a986aa18ebfb61ec19 (diff)
parent     c8ddb2713c624f432fa5fe3c7ecffcdda46ea0d4 (diff)
Merge commit 'v2.6.37-rc1' into for-2.6.37
Diffstat (limited to 'net')
-rw-r--r--net/802/fc.c2
-rw-r--r--net/802/fddi.c12
-rw-r--r--net/802/garp.c18
-rw-r--r--net/802/hippi.c2
-rw-r--r--net/802/stp.c4
-rw-r--r--net/802/tr.c2
-rw-r--r--net/8021q/vlan.c93
-rw-r--r--net/8021q/vlan.h17
-rw-r--r--net/8021q/vlan_core.c117
-rw-r--r--net/8021q/vlan_dev.c13
-rw-r--r--net/9p/client.c240
-rw-r--r--net/9p/protocol.c5
-rw-r--r--net/9p/trans_fd.c2
-rw-r--r--net/9p/trans_rdma.c29
-rw-r--r--net/9p/trans_virtio.c79
-rw-r--r--net/Kconfig3
-rw-r--r--net/Makefile1
-rw-r--r--net/atm/br2684.c12
-rw-r--r--net/atm/clip.c4
-rw-r--r--net/atm/common.c2
-rw-r--r--net/atm/lec.c1
-rw-r--r--net/atm/mpc.c2
-rw-r--r--net/atm/proc.c1
-rw-r--r--net/ax25/Kconfig8
-rw-r--r--net/ax25/af_ax25.c2
-rw-r--r--net/ax25/ax25_ds_timer.c2
-rw-r--r--net/ax25/ax25_route.c4
-rw-r--r--net/bluetooth/af_bluetooth.c114
-rw-r--r--net/bluetooth/cmtp/core.c6
-rw-r--r--net/bluetooth/hci_core.c1
-rw-r--r--net/bluetooth/hci_sysfs.c21
-rw-r--r--net/bluetooth/hidp/core.c8
-rw-r--r--net/bluetooth/l2cap.c122
-rw-r--r--net/bluetooth/lib.c4
-rw-r--r--net/bluetooth/rfcomm/core.c43
-rw-r--r--net/bluetooth/rfcomm/sock.c108
-rw-r--r--net/bluetooth/rfcomm/tty.c8
-rw-r--r--net/bridge/br_device.c8
-rw-r--r--net/bridge/br_if.c29
-rw-r--r--net/bridge/br_input.c4
-rw-r--r--net/bridge/br_netfilter.c138
-rw-r--r--net/bridge/netfilter/ebt_vlan.c25
-rw-r--r--net/bridge/netfilter/ebtables.c15
-rw-r--r--net/caif/caif_dev.c24
-rw-r--r--net/caif/caif_socket.c44
-rw-r--r--net/caif/cfcnfg.c49
-rw-r--r--net/caif/cfctrl.c59
-rw-r--r--net/caif/cfdbgl.c4
-rw-r--r--net/caif/cfdgml.c11
-rw-r--r--net/caif/cffrml.c14
-rw-r--r--net/caif/cfmuxl.c14
-rw-r--r--net/caif/cfpkt_skbuff.c48
-rw-r--r--net/caif/cfrfml.c14
-rw-r--r--net/caif/cfserl.c4
-rw-r--r--net/caif/cfsrvl.c17
-rw-r--r--net/caif/cfutill.c12
-rw-r--r--net/caif/cfveil.c11
-rw-r--r--net/caif/cfvidl.c6
-rw-r--r--net/caif/chnl_net.c47
-rw-r--r--net/can/raw.c37
-rw-r--r--net/ceph/Kconfig28
-rw-r--r--net/ceph/Makefile37
-rw-r--r--net/ceph/armor.c103
-rw-r--r--net/ceph/auth.c259
-rw-r--r--net/ceph/auth_none.c132
-rw-r--r--net/ceph/auth_none.h29
-rw-r--r--net/ceph/auth_x.c688
-rw-r--r--net/ceph/auth_x.h50
-rw-r--r--net/ceph/auth_x_protocol.h90
-rw-r--r--net/ceph/buffer.c68
-rw-r--r--net/ceph/ceph_common.c529
-rw-r--r--net/ceph/ceph_fs.c75
-rw-r--r--net/ceph/ceph_hash.c118
-rw-r--r--net/ceph/ceph_strings.c84
-rw-r--r--net/ceph/crush/crush.c151
-rw-r--r--net/ceph/crush/hash.c149
-rw-r--r--net/ceph/crush/mapper.c609
-rw-r--r--net/ceph/crypto.c412
-rw-r--r--net/ceph/crypto.h48
-rw-r--r--net/ceph/debugfs.c267
-rw-r--r--net/ceph/messenger.c2453
-rw-r--r--net/ceph/mon_client.c1027
-rw-r--r--net/ceph/msgpool.c64
-rw-r--r--net/ceph/osd_client.c1773
-rw-r--r--net/ceph/osdmap.c1128
-rw-r--r--net/ceph/pagelist.c154
-rw-r--r--net/ceph/pagevec.c223
-rw-r--r--net/compat.c10
-rw-r--r--net/core/datagram.c6
-rw-r--r--net/core/dev.c641
-rw-r--r--net/core/dst.c39
-rw-r--r--net/core/ethtool.c97
-rw-r--r--net/core/fib_rules.c37
-rw-r--r--net/core/filter.c14
-rw-r--r--net/core/flow.c82
-rw-r--r--net/core/gen_estimator.c16
-rw-r--r--net/core/iovec.c23
-rw-r--r--net/core/neighbour.c486
-rw-r--r--net/core/net-sysfs.c59
-rw-r--r--net/core/net-sysfs.h4
-rw-r--r--net/core/net-traces.c1
-rw-r--r--net/core/net_namespace.c4
-rw-r--r--net/core/netpoll.c6
-rw-r--r--net/core/pktgen.c49
-rw-r--r--net/core/rtnetlink.c39
-rw-r--r--net/core/skbuff.c112
-rw-r--r--net/core/sock.c19
-rw-r--r--net/core/stream.c8
-rw-r--r--net/core/sysctl_net_core.c3
-rw-r--r--net/core/utils.c15
-rw-r--r--net/dccp/ccid.h86
-rw-r--r--net/dccp/ccids/Kconfig31
-rw-r--r--net/dccp/ccids/ccid2.c310
-rw-r--r--net/dccp/ccids/ccid2.h40
-rw-r--r--net/dccp/ccids/ccid3.c268
-rw-r--r--net/dccp/ccids/ccid3.h51
-rw-r--r--net/dccp/ccids/lib/loss_interval.c2
-rw-r--r--net/dccp/ccids/lib/packet_history.c39
-rw-r--r--net/dccp/ccids/lib/packet_history.h22
-rw-r--r--net/dccp/ccids/lib/tfrc.h1
-rw-r--r--net/dccp/ccids/lib/tfrc_equation.c14
-rw-r--r--net/dccp/dccp.h51
-rw-r--r--net/dccp/feat.c10
-rw-r--r--net/dccp/feat.h1
-rw-r--r--net/dccp/input.c20
-rw-r--r--net/dccp/ipv4.c10
-rw-r--r--net/dccp/ipv6.c10
-rw-r--r--net/dccp/minisocks.c30
-rw-r--r--net/dccp/options.c31
-rw-r--r--net/dccp/output.c227
-rw-r--r--net/dccp/probe.c1
-rw-r--r--net/dccp/proto.c71
-rw-r--r--net/dccp/timer.c27
-rw-r--r--net/decnet/dn_neigh.c13
-rw-r--r--net/decnet/dn_nsp_out.c8
-rw-r--r--net/decnet/dn_route.c3
-rw-r--r--net/econet/af_econet.c6
-rw-r--r--net/ethernet/eth.c8
-rw-r--r--net/ipv4/Kconfig15
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/af_inet.c8
-rw-r--r--net/ipv4/arp.c245
-rw-r--r--net/ipv4/cipso_ipv4.c2
-rw-r--r--net/ipv4/datagram.c7
-rw-r--r--net/ipv4/devinet.c11
-rw-r--r--net/ipv4/fib_frontend.c209
-rw-r--r--net/ipv4/fib_hash.c329
-rw-r--r--net/ipv4/fib_lookup.h11
-rw-r--r--net/ipv4/fib_rules.c13
-rw-r--r--net/ipv4/fib_semantics.c297
-rw-r--r--net/ipv4/fib_trie.c87
-rw-r--r--net/ipv4/gre.c152
-rw-r--r--net/ipv4/icmp.c4
-rw-r--r--net/ipv4/igmp.c34
-rw-r--r--net/ipv4/inet_diag.c2
-rw-r--r--net/ipv4/inet_hashtables.c28
-rw-r--r--net/ipv4/inetpeer.c138
-rw-r--r--net/ipv4/ip_fragment.c6
-rw-r--r--net/ipv4/ip_gre.c250
-rw-r--r--net/ipv4/ip_options.c3
-rw-r--r--net/ipv4/ip_output.c43
-rw-r--r--net/ipv4/ip_sockglue.c13
-rw-r--r--net/ipv4/ipip.c213
-rw-r--r--net/ipv4/ipmr.c428
-rw-r--r--net/ipv4/netfilter/Kconfig6
-rw-r--r--net/ipv4/netfilter/arp_tables.c69
-rw-r--r--net/ipv4/netfilter/arpt_mangle.c2
-rw-r--r--net/ipv4/netfilter/ip_tables.c89
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c31
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c145
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c1
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c28
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c4
-rw-r--r--net/ipv4/netfilter/nf_nat_amanda.c9
-rw-r--r--net/ipv4/netfilter/nf_nat_core.c53
-rw-r--r--net/ipv4/netfilter/nf_nat_ftp.c9
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c53
-rw-r--r--net/ipv4/netfilter/nf_nat_helper.c76
-rw-r--r--net/ipv4/netfilter/nf_nat_irc.c9
-rw-r--r--net/ipv4/netfilter/nf_nat_rule.c17
-rw-r--r--net/ipv4/netfilter/nf_nat_sip.c27
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c6
-rw-r--r--net/ipv4/protocol.c33
-rw-r--r--net/ipv4/raw.c2
-rw-r--r--net/ipv4/route.c274
-rw-r--r--net/ipv4/tcp.c52
-rw-r--r--net/ipv4/tcp_cong.c5
-rw-r--r--net/ipv4/tcp_illinois.c2
-rw-r--r--net/ipv4/tcp_input.c64
-rw-r--r--net/ipv4/tcp_ipv4.c12
-rw-r--r--net/ipv4/tcp_minisocks.c2
-rw-r--r--net/ipv4/tcp_output.c31
-rw-r--r--net/ipv4/tcp_probe.c1
-rw-r--r--net/ipv4/tcp_timer.c68
-rw-r--r--net/ipv4/tcp_veno.c2
-rw-r--r--net/ipv4/tcp_westwood.c2
-rw-r--r--net/ipv4/tunnel4.c44
-rw-r--r--net/ipv4/udp.c50
-rw-r--r--net/ipv4/xfrm4_policy.c6
-rw-r--r--net/ipv4/xfrm4_state.c33
-rw-r--r--net/ipv4/xfrm4_tunnel.c4
-rw-r--r--net/ipv6/addrconf.c35
-rw-r--r--net/ipv6/addrlabel.c10
-rw-r--r--net/ipv6/af_inet6.c9
-rw-r--r--net/ipv6/datagram.c26
-rw-r--r--net/ipv6/exthdrs_core.c4
-rw-r--r--net/ipv6/fib6_rules.c3
-rw-r--r--net/ipv6/ip6_fib.c9
-rw-r--r--net/ipv6/ip6_output.c24
-rw-r--r--net/ipv6/ip6_tunnel.c159
-rw-r--r--net/ipv6/ip6mr.c1
-rw-r--r--net/ipv6/ipv6_sockglue.c27
-rw-r--r--net/ipv6/ndisc.c36
-rw-r--r--net/ipv6/netfilter/Kconfig9
-rw-r--r--net/ipv6/netfilter/Makefile8
-rw-r--r--net/ipv6/netfilter/ip6_tables.c103
-rw-r--r--net/ipv6/netfilter/ip6t_LOG.c157
-rw-r--r--net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c78
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c99
-rw-r--r--net/ipv6/netfilter/nf_defrag_ipv6_hooks.c131
-rw-r--r--net/ipv6/proc.c4
-rw-r--r--net/ipv6/protocol.c34
-rw-r--r--net/ipv6/raw.c14
-rw-r--r--net/ipv6/reassembly.c73
-rw-r--r--net/ipv6/route.c84
-rw-r--r--net/ipv6/sit.c166
-rw-r--r--net/ipv6/tcp_ipv6.c14
-rw-r--r--net/ipv6/tunnel6.c37
-rw-r--r--net/ipv6/udp.c28
-rw-r--r--net/ipv6/xfrm6_policy.c10
-rw-r--r--net/ipv6/xfrm6_state.c33
-rw-r--r--net/ipv6/xfrm6_tunnel.c8
-rw-r--r--net/ipx/Kconfig1
-rw-r--r--net/irda/af_irda.c384
-rw-r--r--net/irda/discovery.c2
-rw-r--r--net/irda/ircomm/ircomm_tty.c4
-rw-r--r--net/irda/iriap.c3
-rw-r--r--net/irda/irlan/irlan_common.c2
-rw-r--r--net/irda/irlan/irlan_eth.c34
-rw-r--r--net/irda/irlan/irlan_event.c2
-rw-r--r--net/irda/irlmp.c2
-rw-r--r--net/irda/irlmp_frame.c2
-rw-r--r--net/irda/irnet/irnet.h2
-rw-r--r--net/irda/irnet/irnet_irda.c22
-rw-r--r--net/irda/irnet/irnet_ppp.c71
-rw-r--r--net/irda/irnet/irnet_ppp.h3
-rw-r--r--net/irda/parameters.c4
-rw-r--r--net/iucv/iucv.c3
-rw-r--r--net/key/af_key.c4
-rw-r--r--net/l2tp/l2tp_core.c53
-rw-r--r--net/l2tp/l2tp_core.h33
-rw-r--r--net/l2tp/l2tp_eth.c3
-rw-r--r--net/l2tp/l2tp_ip.c6
-rw-r--r--net/l2tp/l2tp_ppp.c2
-rw-r--r--net/llc/af_llc.c3
-rw-r--r--net/llc/llc_station.c2
-rw-r--r--net/mac80211/aes_ccm.c6
-rw-r--r--net/mac80211/aes_cmac.c6
-rw-r--r--net/mac80211/agg-rx.c30
-rw-r--r--net/mac80211/agg-tx.c16
-rw-r--r--net/mac80211/cfg.c244
-rw-r--r--net/mac80211/chan.c2
-rw-r--r--net/mac80211/debugfs.c28
-rw-r--r--net/mac80211/debugfs_key.c63
-rw-r--r--net/mac80211/debugfs_netdev.c4
-rw-r--r--net/mac80211/debugfs_sta.c7
-rw-r--r--net/mac80211/driver-ops.h14
-rw-r--r--net/mac80211/driver-trace.h42
-rw-r--r--net/mac80211/ht.c47
-rw-r--r--net/mac80211/ibss.c78
-rw-r--r--net/mac80211/ieee80211_i.h133
-rw-r--r--net/mac80211/iface.c460
-rw-r--r--net/mac80211/key.c168
-rw-r--r--net/mac80211/key.h13
-rw-r--r--net/mac80211/main.c213
-rw-r--r--net/mac80211/mesh.h2
-rw-r--r--net/mac80211/mesh_plink.c17
-rw-r--r--net/mac80211/mlme.c173
-rw-r--r--net/mac80211/offchannel.c26
-rw-r--r--net/mac80211/pm.c2
-rw-r--r--net/mac80211/rate.c15
-rw-r--r--net/mac80211/rc80211_minstrel_debugfs.c1
-rw-r--r--net/mac80211/rc80211_minstrel_ht.c7
-rw-r--r--net/mac80211/rc80211_minstrel_ht_debugfs.c3
-rw-r--r--net/mac80211/rc80211_pid_debugfs.c3
-rw-r--r--net/mac80211/rx.c819
-rw-r--r--net/mac80211/scan.c179
-rw-r--r--net/mac80211/sta_info.c52
-rw-r--r--net/mac80211/sta_info.h24
-rw-r--r--net/mac80211/status.c18
-rw-r--r--net/mac80211/tx.c73
-rw-r--r--net/mac80211/util.c102
-rw-r--r--net/mac80211/wep.c10
-rw-r--r--net/mac80211/work.c39
-rw-r--r--net/mac80211/wpa.c34
-rw-r--r--net/netfilter/Kconfig2
-rw-r--r--net/netfilter/core.c8
-rw-r--r--net/netfilter/ipvs/Kconfig20
-rw-r--r--net/netfilter/ipvs/Makefile10
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c6
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c286
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c818
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c392
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c203
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c292
-rw-r--r--net/netfilter/ipvs/ip_vs_pe.c147
-rw-r--r--net/netfilter/ipvs/ip_vs_pe_sip.c169
-rw-r--r--net/netfilter/ipvs/ip_vs_proto.c8
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_ah_esp.c99
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c27
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c52
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c51
-rw-r--r--net/netfilter/ipvs/ip_vs_sched.c47
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c46
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c690
-rw-r--r--net/netfilter/nf_conntrack_core.c131
-rw-r--r--net/netfilter/nf_conntrack_ecache.c4
-rw-r--r--net/netfilter/nf_conntrack_expect.c68
-rw-r--r--net/netfilter/nf_conntrack_extend.c6
-rw-r--r--net/netfilter/nf_conntrack_netlink.c121
-rw-r--r--net/netfilter/nf_conntrack_proto.c4
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c4
-rw-r--r--net/netfilter/nf_conntrack_sip.c44
-rw-r--r--net/netfilter/nf_conntrack_standalone.c28
-rw-r--r--net/netfilter/nf_log.c2
-rw-r--r--net/netfilter/nf_queue.c2
-rw-r--r--net/netfilter/nf_tproxy_core.c41
-rw-r--r--net/netfilter/x_tables.c12
-rw-r--r--net/netfilter/xt_CT.c1
-rw-r--r--net/netfilter/xt_SECMARK.c35
-rw-r--r--net/netfilter/xt_TPROXY.c368
-rw-r--r--net/netfilter/xt_hashlimit.c15
-rw-r--r--net/netfilter/xt_ipvs.c1
-rw-r--r--net/netfilter/xt_recent.c1
-rw-r--r--net/netfilter/xt_socket.c170
-rw-r--r--net/netlink/af_netlink.c133
-rw-r--r--net/netlink/genetlink.c14
-rw-r--r--net/nonet.c1
-rw-r--r--net/packet/af_packet.c4
-rw-r--r--net/phonet/Kconfig12
-rw-r--r--net/phonet/af_phonet.c17
-rw-r--r--net/phonet/datagram.c13
-rw-r--r--net/phonet/pep.c391
-rw-r--r--net/phonet/pn_dev.c5
-rw-r--r--net/phonet/socket.c289
-rw-r--r--net/rds/af_rds.c26
-rw-r--r--net/rds/bind.c82
-rw-r--r--net/rds/cong.c8
-rw-r--r--net/rds/connection.c159
-rw-r--r--net/rds/ib.c200
-rw-r--r--net/rds/ib.h104
-rw-r--r--net/rds/ib_cm.c184
-rw-r--r--net/rds/ib_rdma.c318
-rw-r--r--net/rds/ib_recv.c549
-rw-r--r--net/rds/ib_send.c682
-rw-r--r--net/rds/ib_stats.c2
-rw-r--r--net/rds/ib_sysctl.c19
-rw-r--r--net/rds/info.c12
-rw-r--r--net/rds/iw.c8
-rw-r--r--net/rds/iw.h15
-rw-r--r--net/rds/iw_cm.c14
-rw-r--r--net/rds/iw_rdma.c8
-rw-r--r--net/rds/iw_recv.c24
-rw-r--r--net/rds/iw_send.c93
-rw-r--r--net/rds/iw_sysctl.c6
-rw-r--r--net/rds/loop.c31
-rw-r--r--net/rds/message.c147
-rw-r--r--net/rds/page.c35
-rw-r--r--net/rds/rdma.c421
-rw-r--r--net/rds/rdma.h85
-rw-r--r--net/rds/rdma_transport.c44
-rw-r--r--net/rds/rdma_transport.h4
-rw-r--r--net/rds/rds.h192
-rw-r--r--net/rds/recv.c14
-rw-r--r--net/rds/send.c552
-rw-r--r--net/rds/stats.c6
-rw-r--r--net/rds/sysctl.c4
-rw-r--r--net/rds/tcp.c12
-rw-r--r--net/rds/tcp.h9
-rw-r--r--net/rds/tcp_connect.c6
-rw-r--r--net/rds/tcp_listen.c10
-rw-r--r--net/rds/tcp_recv.c21
-rw-r--r--net/rds/tcp_send.c72
-rw-r--r--net/rds/threads.c69
-rw-r--r--net/rds/transport.c19
-rw-r--r--net/rds/xlist.h80
-rw-r--r--net/rfkill/core.c1
-rw-r--r--net/rfkill/input.c2
-rw-r--r--net/rose/af_rose.c4
-rw-r--r--net/rose/rose_link.c4
-rw-r--r--net/sched/Kconfig10
-rw-r--r--net/sched/Makefile1
-rw-r--r--net/sched/act_csum.c595
-rw-r--r--net/sched/act_gact.c21
-rw-r--r--net/sched/act_ipt.c14
-rw-r--r--net/sched/act_mirred.c15
-rw-r--r--net/sched/act_nat.c22
-rw-r--r--net/sched/act_police.c21
-rw-r--r--net/sched/act_simple.c11
-rw-r--r--net/sched/act_skbedit.c11
-rw-r--r--net/sched/cls_cgroup.c2
-rw-r--r--net/sched/cls_flow.c74
-rw-r--r--net/sched/cls_u32.c2
-rw-r--r--net/sched/em_meta.c6
-rw-r--r--net/sched/sch_api.c44
-rw-r--r--net/sched/sch_atm.c9
-rw-r--r--net/sched/sch_cbq.c12
-rw-r--r--net/sched/sch_drr.c4
-rw-r--r--net/sched/sch_dsmark.c6
-rw-r--r--net/sched/sch_fifo.c3
-rw-r--r--net/sched/sch_generic.c24
-rw-r--r--net/sched/sch_hfsc.c10
-rw-r--r--net/sched/sch_htb.c12
-rw-r--r--net/sched/sch_mq.c2
-rw-r--r--net/sched/sch_multiq.c3
-rw-r--r--net/sched/sch_netem.c3
-rw-r--r--net/sched/sch_prio.c2
-rw-r--r--net/sched/sch_sfq.c33
-rw-r--r--net/sched/sch_teql.c8
-rw-r--r--net/sctp/associola.c2
-rw-r--r--net/sctp/auth.c8
-rw-r--r--net/sctp/chunk.c2
-rw-r--r--net/sctp/inqueue.c2
-rw-r--r--net/sctp/ipv6.c4
-rw-r--r--net/sctp/objcnt.c5
-rw-r--r--net/sctp/output.c3
-rw-r--r--net/sctp/outqueue.c34
-rw-r--r--net/sctp/probe.c5
-rw-r--r--net/sctp/protocol.c19
-rw-r--r--net/sctp/sm_make_chunk.c2
-rw-r--r--net/sctp/sm_sideeffect.c21
-rw-r--r--net/sctp/sm_statefuns.c66
-rw-r--r--net/sctp/sm_statetable.c42
-rw-r--r--net/sctp/socket.c98
-rw-r--r--net/sctp/transport.c9
-rw-r--r--net/socket.c58
-rw-r--r--net/sunrpc/Kconfig28
-rw-r--r--net/sunrpc/auth.c6
-rw-r--r--net/sunrpc/auth_generic.c2
-rw-r--r--net/sunrpc/auth_gss/Makefile5
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c11
-rw-r--r--net/sunrpc/auth_gss/gss_generic_token.c44
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c12
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seqnum.c2
-rw-r--r--net/sunrpc/auth_gss/gss_mech_switch.c2
-rw-r--r--net/sunrpc/auth_gss/gss_spkm3_mech.c244
-rw-r--r--net/sunrpc/auth_gss/gss_spkm3_seal.c186
-rw-r--r--net/sunrpc/auth_gss/gss_spkm3_token.c267
-rw-r--r--net/sunrpc/auth_gss/gss_spkm3_unseal.c127
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c51
-rw-r--r--net/sunrpc/cache.c305
-rw-r--r--net/sunrpc/clnt.c119
-rw-r--r--net/sunrpc/netns.h19
-rw-r--r--net/sunrpc/rpc_pipe.c69
-rw-r--r--net/sunrpc/rpcb_clnt.c60
-rw-r--r--net/sunrpc/sched.c4
-rw-r--r--net/sunrpc/stats.c43
-rw-r--r--net/sunrpc/sunrpc_syms.c58
-rw-r--r--net/sunrpc/svc.c3
-rw-r--r--net/sunrpc/svc_xprt.c59
-rw-r--r--net/sunrpc/svcauth_unix.c194
-rw-r--r--net/sunrpc/svcsock.c27
-rw-r--r--net/sunrpc/xdr.c61
-rw-r--r--net/sunrpc/xprt.c39
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c2
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c11
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c19
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c82
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c49
-rw-r--r--net/sunrpc/xprtrdma/transport.c25
-rw-r--r--net/sunrpc/xprtrdma/verbs.c22
-rw-r--r--net/sunrpc/xprtsock.c414
-rw-r--r--net/tipc/addr.c7
-rw-r--r--net/tipc/bcast.c51
-rw-r--r--net/tipc/bcast.h3
-rw-r--r--net/tipc/bearer.c42
-rw-r--r--net/tipc/cluster.c21
-rw-r--r--net/tipc/cluster.h2
-rw-r--r--net/tipc/config.c148
-rw-r--r--net/tipc/config.h6
-rw-r--r--net/tipc/core.c38
-rw-r--r--net/tipc/core.h9
-rw-r--r--net/tipc/dbg.c17
-rw-r--r--net/tipc/dbg.h3
-rw-r--r--net/tipc/discover.c44
-rw-r--r--net/tipc/discover.h5
-rw-r--r--net/tipc/eth_media.c48
-rw-r--r--net/tipc/link.c188
-rw-r--r--net/tipc/link.h24
-rw-r--r--net/tipc/msg.c2
-rw-r--r--net/tipc/msg.h6
-rw-r--r--net/tipc/name_distr.c2
-rw-r--r--net/tipc/name_table.c67
-rw-r--r--net/tipc/net.c10
-rw-r--r--net/tipc/node.c73
-rw-r--r--net/tipc/node.h3
-rw-r--r--net/tipc/port.c295
-rw-r--r--net/tipc/port.h4
-rw-r--r--net/tipc/ref.c17
-rw-r--r--net/tipc/ref.h1
-rw-r--r--net/tipc/socket.c83
-rw-r--r--net/tipc/subscr.c77
-rw-r--r--net/tipc/subscr.h2
-rw-r--r--net/tipc/zone.c11
-rw-r--r--net/tipc/zone.h1
-rw-r--r--net/unix/af_unix.c39
-rw-r--r--net/wanrouter/wanmain.c4
-rw-r--r--net/wireless/core.c87
-rw-r--r--net/wireless/core.h34
-rw-r--r--net/wireless/debugfs.c2
-rw-r--r--net/wireless/ibss.c21
-rw-r--r--net/wireless/mlme.c225
-rw-r--r--net/wireless/nl80211.c2189
-rw-r--r--net/wireless/nl80211.h14
-rw-r--r--net/wireless/radiotap.c61
-rw-r--r--net/wireless/reg.c24
-rw-r--r--net/wireless/scan.c12
-rw-r--r--net/wireless/sme.c11
-rw-r--r--net/wireless/sysfs.c18
-rw-r--r--net/wireless/util.c40
-rw-r--r--net/wireless/wext-compat.c45
-rw-r--r--net/wireless/wext-core.c18
-rw-r--r--net/wireless/wext-priv.c2
-rw-r--r--net/wireless/wext-sme.c2
-rw-r--r--net/x25/Kconfig1
-rw-r--r--net/x25/af_x25.c34
-rw-r--r--net/xfrm/xfrm_output.c2
-rw-r--r--net/xfrm/xfrm_policy.c12
-rw-r--r--net/xfrm/xfrm_state.c45
-rw-r--r--net/xfrm/xfrm_user.c4
530 files changed, 29183 insertions, 13549 deletions
diff --git a/net/802/fc.c b/net/802/fc.c
index 34cf1ee014b8..1e49f2d4ea96 100644
--- a/net/802/fc.c
+++ b/net/802/fc.c
@@ -70,7 +70,7 @@ static int fc_header(struct sk_buff *skb, struct net_device *dev,
 	if(daddr)
 	{
 		memcpy(fch->daddr,daddr,dev->addr_len);
-		return(hdr_len);
+		return hdr_len;
 	}
 	return -hdr_len;
 }
diff --git a/net/802/fddi.c b/net/802/fddi.c
index 3ef0ab0a543a..94b3ad08f39a 100644
--- a/net/802/fddi.c
+++ b/net/802/fddi.c
@@ -82,10 +82,10 @@ static int fddi_header(struct sk_buff *skb, struct net_device *dev,
 	if (daddr != NULL)
 	{
 		memcpy(fddi->daddr, daddr, dev->addr_len);
-		return(hl);
+		return hl;
 	}
 
-	return(-hl);
+	return -hl;
 }
 
 
@@ -108,7 +108,7 @@ static int fddi_rebuild_header(struct sk_buff *skb)
 	{
 		printk("%s: Don't know how to resolve type %04X addresses.\n",
 		       skb->dev->name, ntohs(fddi->hdr.llc_snap.ethertype));
-		return(0);
+		return 0;
 	}
 }
 
@@ -162,7 +162,7 @@ __be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
 
 	/* Assume 802.2 SNAP frames, for now */
 
-	return(type);
+	return type;
 }
 
 EXPORT_SYMBOL(fddi_type_trans);
@@ -170,9 +170,9 @@ EXPORT_SYMBOL(fddi_type_trans);
 int fddi_change_mtu(struct net_device *dev, int new_mtu)
 {
 	if ((new_mtu < FDDI_K_SNAP_HLEN) || (new_mtu > FDDI_K_SNAP_DLEN))
-		return(-EINVAL);
+		return -EINVAL;
 	dev->mtu = new_mtu;
-	return(0);
+	return 0;
 }
 EXPORT_SYMBOL(fddi_change_mtu);
 
diff --git a/net/802/garp.c b/net/802/garp.c
index 941f2a324d3a..c1df2dad8c6b 100644
--- a/net/802/garp.c
+++ b/net/802/garp.c
@@ -346,8 +346,8 @@ int garp_request_join(const struct net_device *dev,
 		      const struct garp_application *appl,
 		      const void *data, u8 len, u8 type)
 {
-	struct garp_port *port = dev->garp_port;
-	struct garp_applicant *app = port->applicants[appl->type];
+	struct garp_port *port = rtnl_dereference(dev->garp_port);
+	struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]);
 	struct garp_attr *attr;
 
 	spin_lock_bh(&app->lock);
@@ -366,8 +366,8 @@ void garp_request_leave(const struct net_device *dev,
 		       const struct garp_application *appl,
 		       const void *data, u8 len, u8 type)
 {
-	struct garp_port *port = dev->garp_port;
-	struct garp_applicant *app = port->applicants[appl->type];
+	struct garp_port *port = rtnl_dereference(dev->garp_port);
+	struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]);
 	struct garp_attr *attr;
 
 	spin_lock_bh(&app->lock);
@@ -546,11 +546,11 @@ static int garp_init_port(struct net_device *dev)
 
 static void garp_release_port(struct net_device *dev)
 {
-	struct garp_port *port = dev->garp_port;
+	struct garp_port *port = rtnl_dereference(dev->garp_port);
 	unsigned int i;
 
 	for (i = 0; i <= GARP_APPLICATION_MAX; i++) {
-		if (port->applicants[i])
+		if (rtnl_dereference(port->applicants[i]))
 			return;
 	}
 	rcu_assign_pointer(dev->garp_port, NULL);
@@ -565,7 +565,7 @@ int garp_init_applicant(struct net_device *dev, struct garp_application *appl)
 
 	ASSERT_RTNL();
 
-	if (!dev->garp_port) {
+	if (!rtnl_dereference(dev->garp_port)) {
 		err = garp_init_port(dev);
 		if (err < 0)
 			goto err1;
@@ -601,8 +601,8 @@ EXPORT_SYMBOL_GPL(garp_init_applicant);
 
 void garp_uninit_applicant(struct net_device *dev, struct garp_application *appl)
 {
-	struct garp_port *port = dev->garp_port;
-	struct garp_applicant *app = port->applicants[appl->type];
+	struct garp_port *port = rtnl_dereference(dev->garp_port);
+	struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]);
 
 	ASSERT_RTNL();
 
diff --git a/net/802/hippi.c b/net/802/hippi.c
index cd3e8e929529..91aca8780fd0 100644
--- a/net/802/hippi.c
+++ b/net/802/hippi.c
@@ -152,7 +152,7 @@ int hippi_change_mtu(struct net_device *dev, int new_mtu)
 	if ((new_mtu < 68) || (new_mtu > 65280))
 		return -EINVAL;
 	dev->mtu = new_mtu;
-	return(0);
+	return 0;
 }
 EXPORT_SYMBOL(hippi_change_mtu);
 
diff --git a/net/802/stp.c b/net/802/stp.c
index 53c8f77f0ccd..978c30b1b36b 100644
--- a/net/802/stp.c
+++ b/net/802/stp.c
@@ -21,8 +21,8 @@
 #define GARP_ADDR_MAX	0x2F
 #define GARP_ADDR_RANGE	(GARP_ADDR_MAX - GARP_ADDR_MIN)
 
-static const struct stp_proto *garp_protos[GARP_ADDR_RANGE + 1] __read_mostly;
-static const struct stp_proto *stp_proto __read_mostly;
+static const struct stp_proto __rcu *garp_protos[GARP_ADDR_RANGE + 1] __read_mostly;
+static const struct stp_proto __rcu *stp_proto __read_mostly;
 
 static struct llc_sap *sap __read_mostly;
 static unsigned int sap_registered;
diff --git a/net/802/tr.c b/net/802/tr.c
index 1c6e596074df..5e20cf8a074b 100644
--- a/net/802/tr.c
+++ b/net/802/tr.c
@@ -145,7 +145,7 @@ static int tr_header(struct sk_buff *skb, struct net_device *dev,
 	{
 		memcpy(trh->daddr,daddr,dev->addr_len);
 		tr_source_route(skb, trh, dev);
-		return(hdr_len);
+		return hdr_len;
 	}
 
 	return -hdr_len;
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index a2ad15250575..52077ca22072 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -44,9 +44,6 @@
44 44
45int vlan_net_id __read_mostly; 45int vlan_net_id __read_mostly;
46 46
47/* Our listing of VLAN group(s) */
48static struct hlist_head vlan_group_hash[VLAN_GRP_HASH_SIZE];
49
50const char vlan_fullname[] = "802.1Q VLAN Support"; 47const char vlan_fullname[] = "802.1Q VLAN Support";
51const char vlan_version[] = DRV_VERSION; 48const char vlan_version[] = DRV_VERSION;
52static const char vlan_copyright[] = "Ben Greear <greearb@candelatech.com>"; 49static const char vlan_copyright[] = "Ben Greear <greearb@candelatech.com>";
@@ -59,40 +56,6 @@ static struct packet_type vlan_packet_type __read_mostly = {
59 56
60/* End of global variables definitions. */ 57/* End of global variables definitions. */
61 58
62static inline unsigned int vlan_grp_hashfn(unsigned int idx)
63{
64 return ((idx >> VLAN_GRP_HASH_SHIFT) ^ idx) & VLAN_GRP_HASH_MASK;
65}
66
67/* Must be invoked with RCU read lock (no preempt) */
68static struct vlan_group *__vlan_find_group(struct net_device *real_dev)
69{
70 struct vlan_group *grp;
71 struct hlist_node *n;
72 int hash = vlan_grp_hashfn(real_dev->ifindex);
73
74 hlist_for_each_entry_rcu(grp, n, &vlan_group_hash[hash], hlist) {
75 if (grp->real_dev == real_dev)
76 return grp;
77 }
78
79 return NULL;
80}
81
82/* Find the protocol handler. Assumes VID < VLAN_VID_MASK.
83 *
84 * Must be invoked with RCU read lock (no preempt)
85 */
86struct net_device *__find_vlan_dev(struct net_device *real_dev, u16 vlan_id)
87{
88 struct vlan_group *grp = __vlan_find_group(real_dev);
89
90 if (grp)
91 return vlan_group_get_device(grp, vlan_id);
92
93 return NULL;
94}
95
96static void vlan_group_free(struct vlan_group *grp) 59static void vlan_group_free(struct vlan_group *grp)
97{ 60{
98 int i; 61 int i;
@@ -111,8 +74,6 @@ static struct vlan_group *vlan_group_alloc(struct net_device *real_dev)
111 return NULL; 74 return NULL;
112 75
113 grp->real_dev = real_dev; 76 grp->real_dev = real_dev;
114 hlist_add_head_rcu(&grp->hlist,
115 &vlan_group_hash[vlan_grp_hashfn(real_dev->ifindex)]);
116 return grp; 77 return grp;
117} 78}
118 79
@@ -151,7 +112,7 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
151 112
152 ASSERT_RTNL(); 113 ASSERT_RTNL();
153 114
154 grp = __vlan_find_group(real_dev); 115 grp = rtnl_dereference(real_dev->vlgrp);
155 BUG_ON(!grp); 116 BUG_ON(!grp);
156 117
157 /* Take it out of our own structures, but be sure to interlock with 118 /* Take it out of our own structures, but be sure to interlock with
@@ -173,11 +134,10 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
173 if (grp->nr_vlans == 0) { 134 if (grp->nr_vlans == 0) {
174 vlan_gvrp_uninit_applicant(real_dev); 135 vlan_gvrp_uninit_applicant(real_dev);
175 136
176 if (real_dev->features & NETIF_F_HW_VLAN_RX) 137 rcu_assign_pointer(real_dev->vlgrp, NULL);
138 if (ops->ndo_vlan_rx_register)
177 ops->ndo_vlan_rx_register(real_dev, NULL); 139 ops->ndo_vlan_rx_register(real_dev, NULL);
178 140
179 hlist_del_rcu(&grp->hlist);
180
181 /* Free the group, after all cpu's are done. */ 141 /* Free the group, after all cpu's are done. */
182 call_rcu(&grp->rcu, vlan_rcu_free); 142 call_rcu(&grp->rcu, vlan_rcu_free);
183 } 143 }
@@ -196,18 +156,13 @@ int vlan_check_real_dev(struct net_device *real_dev, u16 vlan_id)
196 return -EOPNOTSUPP; 156 return -EOPNOTSUPP;
197 } 157 }
198 158
199 if ((real_dev->features & NETIF_F_HW_VLAN_RX) && !ops->ndo_vlan_rx_register) {
200 pr_info("8021q: device %s has buggy VLAN hw accel\n", name);
201 return -EOPNOTSUPP;
202 }
203
204 if ((real_dev->features & NETIF_F_HW_VLAN_FILTER) && 159 if ((real_dev->features & NETIF_F_HW_VLAN_FILTER) &&
205 (!ops->ndo_vlan_rx_add_vid || !ops->ndo_vlan_rx_kill_vid)) { 160 (!ops->ndo_vlan_rx_add_vid || !ops->ndo_vlan_rx_kill_vid)) {
206 pr_info("8021q: Device %s has buggy VLAN hw accel\n", name); 161 pr_info("8021q: Device %s has buggy VLAN hw accel\n", name);
207 return -EOPNOTSUPP; 162 return -EOPNOTSUPP;
208 } 163 }
209 164
210 if (__find_vlan_dev(real_dev, vlan_id) != NULL) 165 if (vlan_find_dev(real_dev, vlan_id) != NULL)
211 return -EEXIST; 166 return -EEXIST;
212 167
213 return 0; 168 return 0;
@@ -222,7 +177,7 @@ int register_vlan_dev(struct net_device *dev)
222 struct vlan_group *grp, *ngrp = NULL; 177 struct vlan_group *grp, *ngrp = NULL;
223 int err; 178 int err;
224 179
225 grp = __vlan_find_group(real_dev); 180 grp = rtnl_dereference(real_dev->vlgrp);
226 if (!grp) { 181 if (!grp) {
227 ngrp = grp = vlan_group_alloc(real_dev); 182 ngrp = grp = vlan_group_alloc(real_dev);
228 if (!grp) 183 if (!grp)
@@ -252,8 +207,11 @@ int register_vlan_dev(struct net_device *dev)
252 vlan_group_set_device(grp, vlan_id, dev); 207 vlan_group_set_device(grp, vlan_id, dev);
253 grp->nr_vlans++; 208 grp->nr_vlans++;
254 209
255 if (ngrp && real_dev->features & NETIF_F_HW_VLAN_RX) 210 if (ngrp) {
256 ops->ndo_vlan_rx_register(real_dev, ngrp); 211 if (ops->ndo_vlan_rx_register)
212 ops->ndo_vlan_rx_register(real_dev, ngrp);
213 rcu_assign_pointer(real_dev->vlgrp, ngrp);
214 }
257 if (real_dev->features & NETIF_F_HW_VLAN_FILTER) 215 if (real_dev->features & NETIF_F_HW_VLAN_FILTER)
258 ops->ndo_vlan_rx_add_vid(real_dev, vlan_id); 216 ops->ndo_vlan_rx_add_vid(real_dev, vlan_id);
259 217
@@ -264,7 +222,6 @@ out_uninit_applicant:
264 vlan_gvrp_uninit_applicant(real_dev); 222 vlan_gvrp_uninit_applicant(real_dev);
265out_free_group: 223out_free_group:
266 if (ngrp) { 224 if (ngrp) {
267 hlist_del_rcu(&ngrp->hlist);
268 /* Free the group, after all cpu's are done. */ 225 /* Free the group, after all cpu's are done. */
269 call_rcu(&ngrp->rcu, vlan_rcu_free); 226 call_rcu(&ngrp->rcu, vlan_rcu_free);
270 } 227 }
@@ -321,7 +278,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
321 if (new_dev == NULL) 278 if (new_dev == NULL)
322 return -ENOBUFS; 279 return -ENOBUFS;
323 280
324 new_dev->real_num_tx_queues = real_dev->real_num_tx_queues; 281 netif_copy_real_num_queues(new_dev, real_dev);
325 dev_net_set(new_dev, net); 282 dev_net_set(new_dev, net);
326 /* need 4 bytes for extra VLAN header info, 283 /* need 4 bytes for extra VLAN header info,
327 * hope the underlying device can handle it. 284 * hope the underlying device can handle it.
@@ -428,7 +385,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
428 dev->netdev_ops->ndo_vlan_rx_add_vid(dev, 0); 385 dev->netdev_ops->ndo_vlan_rx_add_vid(dev, 0);
429 } 386 }
430 387
431 grp = __vlan_find_group(dev); 388 grp = rtnl_dereference(dev->vlgrp);
432 if (!grp) 389 if (!grp)
433 goto out; 390 goto out;
434 391
@@ -439,7 +396,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
439 switch (event) { 396 switch (event) {
440 case NETDEV_CHANGE: 397 case NETDEV_CHANGE:
441 /* Propagate real device state to vlan devices */ 398 /* Propagate real device state to vlan devices */
442 for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { 399 for (i = 0; i < VLAN_N_VID; i++) {
443 vlandev = vlan_group_get_device(grp, i); 400 vlandev = vlan_group_get_device(grp, i);
444 if (!vlandev) 401 if (!vlandev)
445 continue; 402 continue;
@@ -450,7 +407,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
450 407
451 case NETDEV_CHANGEADDR: 408 case NETDEV_CHANGEADDR:
452 /* Adjust unicast filters on underlying device */ 409 /* Adjust unicast filters on underlying device */
453 for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { 410 for (i = 0; i < VLAN_N_VID; i++) {
454 vlandev = vlan_group_get_device(grp, i); 411 vlandev = vlan_group_get_device(grp, i);
455 if (!vlandev) 412 if (!vlandev)
456 continue; 413 continue;
@@ -464,7 +421,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
464 break; 421 break;
465 422
466 case NETDEV_CHANGEMTU: 423 case NETDEV_CHANGEMTU:
467 for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { 424 for (i = 0; i < VLAN_N_VID; i++) {
468 vlandev = vlan_group_get_device(grp, i); 425 vlandev = vlan_group_get_device(grp, i);
469 if (!vlandev) 426 if (!vlandev)
470 continue; 427 continue;
@@ -478,7 +435,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
478 435
479 case NETDEV_FEAT_CHANGE: 436 case NETDEV_FEAT_CHANGE:
480 /* Propagate device features to underlying device */ 437 /* Propagate device features to underlying device */
481 for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { 438 for (i = 0; i < VLAN_N_VID; i++) {
482 vlandev = vlan_group_get_device(grp, i); 439 vlandev = vlan_group_get_device(grp, i);
483 if (!vlandev) 440 if (!vlandev)
484 continue; 441 continue;
@@ -490,7 +447,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
490 447
491 case NETDEV_DOWN: 448 case NETDEV_DOWN:
492 /* Put all VLANs for this dev in the down state too. */ 449 /* Put all VLANs for this dev in the down state too. */
493 for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { 450 for (i = 0; i < VLAN_N_VID; i++) {
494 vlandev = vlan_group_get_device(grp, i); 451 vlandev = vlan_group_get_device(grp, i);
495 if (!vlandev) 452 if (!vlandev)
496 continue; 453 continue;
@@ -508,7 +465,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
508 465
509 case NETDEV_UP: 466 case NETDEV_UP:
510 /* Put all VLANs for this dev in the up state too. */ 467 /* Put all VLANs for this dev in the up state too. */
511 for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { 468 for (i = 0; i < VLAN_N_VID; i++) {
512 vlandev = vlan_group_get_device(grp, i); 469 vlandev = vlan_group_get_device(grp, i);
513 if (!vlandev) 470 if (!vlandev)
514 continue; 471 continue;
@@ -525,10 +482,14 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
525 break; 482 break;
526 483
527 case NETDEV_UNREGISTER: 484 case NETDEV_UNREGISTER:
485 /* twiddle thumbs on netns device moves */
486 if (dev->reg_state != NETREG_UNREGISTERING)
487 break;
488
528 /* Delete all VLANs for this dev. */ 489 /* Delete all VLANs for this dev. */
529 grp->killall = 1; 490 grp->killall = 1;
530 491
531 for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { 492 for (i = 0; i < VLAN_N_VID; i++) {
532 vlandev = vlan_group_get_device(grp, i); 493 vlandev = vlan_group_get_device(grp, i);
533 if (!vlandev) 494 if (!vlandev)
534 continue; 495 continue;
@@ -536,7 +497,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
536 /* unregistration of last vlan destroys group, abort 497 /* unregistration of last vlan destroys group, abort
537 * afterwards */ 498 * afterwards */
538 if (grp->nr_vlans == 1) 499 if (grp->nr_vlans == 1)
539 i = VLAN_GROUP_ARRAY_LEN; 500 i = VLAN_N_VID;
540 501
541 unregister_vlan_dev(vlandev, &list); 502 unregister_vlan_dev(vlandev, &list);
542 } 503 }
@@ -742,8 +703,6 @@ err0:
742 703
743static void __exit vlan_cleanup_module(void) 704static void __exit vlan_cleanup_module(void)
744{ 705{
745 unsigned int i;
746
747 vlan_ioctl_set(NULL); 706 vlan_ioctl_set(NULL);
748 vlan_netlink_fini(); 707 vlan_netlink_fini();
749 708
@@ -751,10 +710,6 @@ static void __exit vlan_cleanup_module(void)
751 710
752 dev_remove_pack(&vlan_packet_type); 711 dev_remove_pack(&vlan_packet_type);
753 712
754 /* This table must be empty if there are no module references left. */
755 for (i = 0; i < VLAN_GRP_HASH_SIZE; i++)
756 BUG_ON(!hlist_empty(&vlan_group_hash[i]));
757
758 unregister_pernet_subsys(&vlan_net_ops); 713 unregister_pernet_subsys(&vlan_net_ops);
759 rcu_barrier(); /* Wait for completion of call_rcu()'s */ 714 rcu_barrier(); /* Wait for completion of call_rcu()'s */
760 715
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 8d9503ad01da..db01b3181fdc 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -72,23 +72,6 @@ static inline struct vlan_dev_info *vlan_dev_info(const struct net_device *dev)
 	return netdev_priv(dev);
 }
 
-#define VLAN_GRP_HASH_SHIFT	5
-#define VLAN_GRP_HASH_SIZE	(1 << VLAN_GRP_HASH_SHIFT)
-#define VLAN_GRP_HASH_MASK	(VLAN_GRP_HASH_SIZE - 1)
-
-/* Find a VLAN device by the MAC address of its Ethernet device, and
- * it's VLAN ID. The default configuration is to have VLAN's scope
- * to be box-wide, so the MAC will be ignored. The mac will only be
- * looked at if we are configured to have a separate set of VLANs per
- * each MAC addressable interface. Note that this latter option does
- * NOT follow the spec for VLANs, but may be useful for doing very
- * large quantities of VLAN MUX/DEMUX onto FrameRelay or ATM PVCs.
- *
- * Must be invoked with rcu_read_lock (ie preempt disabled)
- * or with RTNL.
- */
-struct net_device *__find_vlan_dev(struct net_device *real_dev, u16 vlan_id);
-
 /* found in vlan_dev.c */
 int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
 		  struct packet_type *ptype, struct net_device *orig_dev);
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 01ddb0472f86..69b2f79800a5 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -4,50 +4,29 @@
4#include <linux/netpoll.h> 4#include <linux/netpoll.h>
5#include "vlan.h" 5#include "vlan.h"
6 6
7/* VLAN rx hw acceleration helper. This acts like netif_{rx,receive_skb}(). */ 7bool vlan_hwaccel_do_receive(struct sk_buff **skbp)
8int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
9 u16 vlan_tci, int polling)
10{ 8{
9 struct sk_buff *skb = *skbp;
10 u16 vlan_id = skb->vlan_tci & VLAN_VID_MASK;
11 struct net_device *vlan_dev; 11 struct net_device *vlan_dev;
12 u16 vlan_id; 12 struct vlan_rx_stats *rx_stats;
13
14 if (netpoll_rx(skb))
15 return NET_RX_DROP;
16
17 if (skb_bond_should_drop(skb, ACCESS_ONCE(skb->dev->master)))
18 skb->deliver_no_wcard = 1;
19
20 skb->skb_iif = skb->dev->ifindex;
21 __vlan_hwaccel_put_tag(skb, vlan_tci);
22 vlan_id = vlan_tci & VLAN_VID_MASK;
23 vlan_dev = vlan_group_get_device(grp, vlan_id);
24
25 if (vlan_dev)
26 skb->dev = vlan_dev;
27 else if (vlan_id)
28 goto drop;
29
30 return (polling ? netif_receive_skb(skb) : netif_rx(skb));
31 13
32drop: 14 vlan_dev = vlan_find_dev(skb->dev, vlan_id);
33 dev_kfree_skb_any(skb); 15 if (!vlan_dev) {
34 return NET_RX_DROP; 16 if (vlan_id)
35} 17 skb->pkt_type = PACKET_OTHERHOST;
36EXPORT_SYMBOL(__vlan_hwaccel_rx); 18 return false;
37 19 }
38int vlan_hwaccel_do_receive(struct sk_buff *skb)
39{
40 struct net_device *dev = skb->dev;
41 struct vlan_rx_stats *rx_stats;
42 20
43 skb->dev = vlan_dev_info(dev)->real_dev; 21 skb = *skbp = skb_share_check(skb, GFP_ATOMIC);
44 netif_nit_deliver(skb); 22 if (unlikely(!skb))
23 return false;
45 24
46 skb->dev = dev; 25 skb->dev = vlan_dev;
47 skb->priority = vlan_get_ingress_priority(dev, skb->vlan_tci); 26 skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci);
48 skb->vlan_tci = 0; 27 skb->vlan_tci = 0;
49 28
50 rx_stats = this_cpu_ptr(vlan_dev_info(dev)->vlan_rx_stats); 29 rx_stats = this_cpu_ptr(vlan_dev_info(vlan_dev)->vlan_rx_stats);
51 30
52 u64_stats_update_begin(&rx_stats->syncp); 31 u64_stats_update_begin(&rx_stats->syncp);
53 rx_stats->rx_packets++; 32 rx_stats->rx_packets++;
@@ -64,12 +43,13 @@ int vlan_hwaccel_do_receive(struct sk_buff *skb)
64 * This allows the VLAN to have a different MAC than the 43 * This allows the VLAN to have a different MAC than the
65 * underlying device, and still route correctly. */ 44 * underlying device, and still route correctly. */
66 if (!compare_ether_addr(eth_hdr(skb)->h_dest, 45 if (!compare_ether_addr(eth_hdr(skb)->h_dest,
67 dev->dev_addr)) 46 vlan_dev->dev_addr))
68 skb->pkt_type = PACKET_HOST; 47 skb->pkt_type = PACKET_HOST;
69 break; 48 break;
70 } 49 }
71 u64_stats_update_end(&rx_stats->syncp); 50 u64_stats_update_end(&rx_stats->syncp);
72 return 0; 51
52 return true;
73} 53}
74 54
75struct net_device *vlan_dev_real_dev(const struct net_device *dev) 55struct net_device *vlan_dev_real_dev(const struct net_device *dev)
@@ -84,68 +64,27 @@ u16 vlan_dev_vlan_id(const struct net_device *dev)
84} 64}
85EXPORT_SYMBOL(vlan_dev_vlan_id); 65EXPORT_SYMBOL(vlan_dev_vlan_id);
86 66
87static gro_result_t 67/* VLAN rx hw acceleration helper. This acts like netif_{rx,receive_skb}(). */
88vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp, 68int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
89 unsigned int vlan_tci, struct sk_buff *skb) 69 u16 vlan_tci, int polling)
90{ 70{
91 struct sk_buff *p;
92 struct net_device *vlan_dev;
93 u16 vlan_id;
94
95 if (skb_bond_should_drop(skb, ACCESS_ONCE(skb->dev->master)))
96 skb->deliver_no_wcard = 1;
97
98 skb->skb_iif = skb->dev->ifindex;
99 __vlan_hwaccel_put_tag(skb, vlan_tci); 71 __vlan_hwaccel_put_tag(skb, vlan_tci);
100 vlan_id = vlan_tci & VLAN_VID_MASK; 72 return polling ? netif_receive_skb(skb) : netif_rx(skb);
101 vlan_dev = vlan_group_get_device(grp, vlan_id);
102
103 if (vlan_dev)
104 skb->dev = vlan_dev;
105 else if (vlan_id)
106 goto drop;
107
108 for (p = napi->gro_list; p; p = p->next) {
109 NAPI_GRO_CB(p)->same_flow =
110 p->dev == skb->dev && !compare_ether_header(
111 skb_mac_header(p), skb_gro_mac_header(skb));
112 NAPI_GRO_CB(p)->flush = 0;
113 }
114
115 return dev_gro_receive(napi, skb);
116
117drop:
118 return GRO_DROP;
119} 73}
74EXPORT_SYMBOL(__vlan_hwaccel_rx);
120 75
121gro_result_t vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, 76gro_result_t vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
122 unsigned int vlan_tci, struct sk_buff *skb) 77 unsigned int vlan_tci, struct sk_buff *skb)
123{ 78{
124 if (netpoll_rx_on(skb)) 79 __vlan_hwaccel_put_tag(skb, vlan_tci);
125 return vlan_hwaccel_receive_skb(skb, grp, vlan_tci) 80 return napi_gro_receive(napi, skb);
126 ? GRO_DROP : GRO_NORMAL;
127
128 skb_gro_reset_offset(skb);
129
130 return napi_skb_finish(vlan_gro_common(napi, grp, vlan_tci, skb), skb);
131} 81}
132EXPORT_SYMBOL(vlan_gro_receive); 82EXPORT_SYMBOL(vlan_gro_receive);
133 83
134gro_result_t vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp, 84gro_result_t vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
135 unsigned int vlan_tci) 85 unsigned int vlan_tci)
136{ 86{
137 struct sk_buff *skb = napi_frags_skb(napi); 87 __vlan_hwaccel_put_tag(napi->skb, vlan_tci);
138 88 return napi_gro_frags(napi);
139 if (!skb)
140 return GRO_DROP;
141
142 if (netpoll_rx_on(skb)) {
143 skb->protocol = eth_type_trans(skb, skb->dev);
144 return vlan_hwaccel_receive_skb(skb, grp, vlan_tci)
145 ? GRO_DROP : GRO_NORMAL;
146 }
147
148 return napi_frags_finish(napi, skb,
149 vlan_gro_common(napi, grp, vlan_tci, skb));
150} 89}
151EXPORT_SYMBOL(vlan_gro_frags); 90EXPORT_SYMBOL(vlan_gro_frags);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 3d59c9bf8feb..14e3d1fa07a0 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -158,7 +158,7 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
 	vlan_id = vlan_tci & VLAN_VID_MASK;
 
 	rcu_read_lock();
-	vlan_dev = __find_vlan_dev(dev, vlan_id);
+	vlan_dev = vlan_find_dev(dev, vlan_id);
 
 	/* If the VLAN device is defined, we use it.
 	 * If not, and the VID is 0, it is a 802.1p packet (not
@@ -177,8 +177,8 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
 	} else {
 		skb->dev = vlan_dev;
 
-		rx_stats = per_cpu_ptr(vlan_dev_info(skb->dev)->vlan_rx_stats,
-				       smp_processor_id());
+		rx_stats = this_cpu_ptr(vlan_dev_info(skb->dev)->vlan_rx_stats);
+
 		u64_stats_update_begin(&rx_stats->syncp);
 		rx_stats->rx_packets++;
 		rx_stats->rx_bytes += skb->len;
@@ -226,12 +226,14 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
 	}
 
 	netif_rx(skb);
+
 	rcu_read_unlock();
 	return NET_RX_SUCCESS;
 
 err_unlock:
 	rcu_read_unlock();
 err_free:
+	atomic_long_inc(&dev->rx_dropped);
 	kfree_skb(skb);
 	return NET_RX_DROP;
 }
@@ -510,7 +512,8 @@ static int vlan_dev_open(struct net_device *dev)
 	if (vlan->flags & VLAN_FLAG_GVRP)
 		vlan_gvrp_request_join(dev);
 
-	netif_carrier_on(dev);
+	if (netif_carrier_ok(real_dev))
+		netif_carrier_on(dev);
 	return 0;
 
 clear_allmulti:
@@ -842,7 +845,7 @@ static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, st
 		accum.rx_packets += rxpackets;
 		accum.rx_bytes += rxbytes;
 		accum.rx_multicast += rxmulticast;
-		/* rx_errors is an ulong, not protected by syncp */
+		/* rx_errors is ulong, not protected by syncp */
 		accum.rx_errors += p->rx_errors;
 	}
 	stats->rx_packets = accum.rx_packets;
diff --git a/net/9p/client.c b/net/9p/client.c
index dc6f2f26d023..a848bca9fbff 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -61,13 +61,13 @@ static const match_table_t tokens = {
61 61
62inline int p9_is_proto_dotl(struct p9_client *clnt) 62inline int p9_is_proto_dotl(struct p9_client *clnt)
63{ 63{
64 return (clnt->proto_version == p9_proto_2000L); 64 return clnt->proto_version == p9_proto_2000L;
65} 65}
66EXPORT_SYMBOL(p9_is_proto_dotl); 66EXPORT_SYMBOL(p9_is_proto_dotl);
67 67
68inline int p9_is_proto_dotu(struct p9_client *clnt) 68inline int p9_is_proto_dotu(struct p9_client *clnt)
69{ 69{
70 return (clnt->proto_version == p9_proto_2000u); 70 return clnt->proto_version == p9_proto_2000u;
71} 71}
72EXPORT_SYMBOL(p9_is_proto_dotu); 72EXPORT_SYMBOL(p9_is_proto_dotu);
73 73
@@ -331,8 +331,10 @@ static void p9_tag_cleanup(struct p9_client *c)
331 } 331 }
332 } 332 }
333 333
334 if (c->tagpool) 334 if (c->tagpool) {
335 p9_idpool_put(0, c->tagpool); /* free reserved tag 0 */
335 p9_idpool_destroy(c->tagpool); 336 p9_idpool_destroy(c->tagpool);
337 }
336 338
337 /* free requests associated with tags */ 339 /* free requests associated with tags */
338 for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) { 340 for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) {
@@ -448,32 +450,43 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req)
448 return err; 450 return err;
449 } 451 }
450 452
451 if (type == P9_RERROR) { 453 if (type == P9_RERROR || type == P9_RLERROR) {
452 int ecode; 454 int ecode;
453 char *ename;
454 455
455 err = p9pdu_readf(req->rc, c->proto_version, "s?d", 456 if (!p9_is_proto_dotl(c)) {
456 &ename, &ecode); 457 char *ename;
457 if (err) {
458 P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse error%d\n",
459 err);
460 return err;
461 }
462 458
463 if (p9_is_proto_dotu(c) || 459 err = p9pdu_readf(req->rc, c->proto_version, "s?d",
464 p9_is_proto_dotl(c)) 460 &ename, &ecode);
465 err = -ecode; 461 if (err)
462 goto out_err;
466 463
467 if (!err || !IS_ERR_VALUE(err)) 464 if (p9_is_proto_dotu(c))
468 err = p9_errstr2errno(ename, strlen(ename)); 465 err = -ecode;
466
467 if (!err || !IS_ERR_VALUE(err)) {
468 err = p9_errstr2errno(ename, strlen(ename));
469
470 P9_DPRINTK(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", -ecode, ename);
471
472 kfree(ename);
473 }
474 } else {
475 err = p9pdu_readf(req->rc, c->proto_version, "d", &ecode);
476 err = -ecode;
469 477
470 P9_DPRINTK(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", -ecode, ename); 478 P9_DPRINTK(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode);
479 }
471 480
472 kfree(ename);
473 } else 481 } else
474 err = 0; 482 err = 0;
475 483
476 return err; 484 return err;
485
486out_err:
487 P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse error%d\n", err);
488
489 return err;
477} 490}
478 491
479/** 492/**
@@ -566,11 +579,14 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
566 va_start(ap, fmt); 579 va_start(ap, fmt);
567 err = p9pdu_vwritef(req->tc, c->proto_version, fmt, ap); 580 err = p9pdu_vwritef(req->tc, c->proto_version, fmt, ap);
568 va_end(ap); 581 va_end(ap);
582 if (err)
583 goto reterr;
569 p9pdu_finalize(req->tc); 584 p9pdu_finalize(req->tc);
570 585
571 err = c->trans_mod->request(c, req); 586 err = c->trans_mod->request(c, req);
572 if (err < 0) { 587 if (err < 0) {
573 c->status = Disconnected; 588 if (err != -ERESTARTSYS)
589 c->status = Disconnected;
574 goto reterr; 590 goto reterr;
575 } 591 }
576 592
@@ -669,7 +685,7 @@ static void p9_fid_destroy(struct p9_fid *fid)
669 kfree(fid); 685 kfree(fid);
670} 686}
671 687
672int p9_client_version(struct p9_client *c) 688static int p9_client_version(struct p9_client *c)
673{ 689{
674 int err = 0; 690 int err = 0;
675 struct p9_req_t *req; 691 struct p9_req_t *req;
@@ -728,7 +744,6 @@ error:
728 744
729 return err; 745 return err;
730} 746}
731EXPORT_SYMBOL(p9_client_version);
732 747
733struct p9_client *p9_client_create(const char *dev_name, char *options) 748struct p9_client *p9_client_create(const char *dev_name, char *options)
734{ 749{
@@ -885,54 +900,6 @@ error:
885} 900}
886EXPORT_SYMBOL(p9_client_attach); 901EXPORT_SYMBOL(p9_client_attach);
887 902
888struct p9_fid *
889p9_client_auth(struct p9_client *clnt, char *uname, u32 n_uname, char *aname)
890{
891 int err;
892 struct p9_req_t *req;
893 struct p9_qid qid;
894 struct p9_fid *afid;
895
896 P9_DPRINTK(P9_DEBUG_9P, ">>> TAUTH uname %s aname %s\n", uname, aname);
897 err = 0;
898
899 afid = p9_fid_create(clnt);
900 if (IS_ERR(afid)) {
901 err = PTR_ERR(afid);
902 afid = NULL;
903 goto error;
904 }
905
906 req = p9_client_rpc(clnt, P9_TAUTH, "dss?d",
907 afid ? afid->fid : P9_NOFID, uname, aname, n_uname);
908 if (IS_ERR(req)) {
909 err = PTR_ERR(req);
910 goto error;
911 }
912
913 err = p9pdu_readf(req->rc, clnt->proto_version, "Q", &qid);
914 if (err) {
915 p9pdu_dump(1, req->rc);
916 p9_free_req(clnt, req);
917 goto error;
918 }
919
920 P9_DPRINTK(P9_DEBUG_9P, "<<< RAUTH qid %x.%llx.%x\n",
921 qid.type,
922 (unsigned long long)qid.path,
923 qid.version);
924
925 memmove(&afid->qid, &qid, sizeof(struct p9_qid));
926 p9_free_req(clnt, req);
927 return afid;
928
929error:
930 if (afid)
931 p9_fid_destroy(afid);
932 return ERR_PTR(err);
933}
934EXPORT_SYMBOL(p9_client_auth);
935
936struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames, 903struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames,
937 int clone) 904 int clone)
938{ 905{
@@ -944,6 +911,7 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames,
944 int16_t nwqids, count; 911 int16_t nwqids, count;
945 912
946 err = 0; 913 err = 0;
914 wqids = NULL;
947 clnt = oldfid->clnt; 915 clnt = oldfid->clnt;
948 if (clone) { 916 if (clone) {
949 fid = p9_fid_create(clnt); 917 fid = p9_fid_create(clnt);
@@ -994,9 +962,11 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames,
994 else 962 else
995 fid->qid = oldfid->qid; 963 fid->qid = oldfid->qid;
996 964
965 kfree(wqids);
997 return fid; 966 return fid;
998 967
999clunk_fid: 968clunk_fid:
969 kfree(wqids);
1000 p9_client_clunk(fid); 970 p9_client_clunk(fid);
1001 fid = NULL; 971 fid = NULL;
1002 972
@@ -1195,12 +1165,44 @@ int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, char *newname)
1195} 1165}
1196EXPORT_SYMBOL(p9_client_link); 1166EXPORT_SYMBOL(p9_client_link);
1197 1167
1168int p9_client_fsync(struct p9_fid *fid, int datasync)
1169{
1170 int err;
1171 struct p9_client *clnt;
1172 struct p9_req_t *req;
1173
1174 P9_DPRINTK(P9_DEBUG_9P, ">>> TFSYNC fid %d datasync:%d\n",
1175 fid->fid, datasync);
1176 err = 0;
1177 clnt = fid->clnt;
1178
1179 req = p9_client_rpc(clnt, P9_TFSYNC, "dd", fid->fid, datasync);
1180 if (IS_ERR(req)) {
1181 err = PTR_ERR(req);
1182 goto error;
1183 }
1184
1185 P9_DPRINTK(P9_DEBUG_9P, "<<< RFSYNC fid %d\n", fid->fid);
1186
1187 p9_free_req(clnt, req);
1188
1189error:
1190 return err;
1191}
1192EXPORT_SYMBOL(p9_client_fsync);
1193
1198int p9_client_clunk(struct p9_fid *fid) 1194int p9_client_clunk(struct p9_fid *fid)
1199{ 1195{
1200 int err; 1196 int err;
1201 struct p9_client *clnt; 1197 struct p9_client *clnt;
1202 struct p9_req_t *req; 1198 struct p9_req_t *req;
1203 1199
1200 if (!fid) {
1201 P9_EPRINTK(KERN_WARNING, "Trying to clunk with NULL fid\n");
1202 dump_stack();
1203 return 0;
1204 }
1205
1204 P9_DPRINTK(P9_DEBUG_9P, ">>> TCLUNK fid %d\n", fid->fid); 1206 P9_DPRINTK(P9_DEBUG_9P, ">>> TCLUNK fid %d\n", fid->fid);
1205 err = 0; 1207 err = 0;
1206 clnt = fid->clnt; 1208 clnt = fid->clnt;
@@ -1284,16 +1286,13 @@ p9_client_read(struct p9_fid *fid, char *data, char __user *udata, u64 offset,
1284 1286
1285 if (data) { 1287 if (data) {
1286 memmove(data, dataptr, count); 1288 memmove(data, dataptr, count);
1287 } 1289 } else {
1288
1289 if (udata) {
1290 err = copy_to_user(udata, dataptr, count); 1290 err = copy_to_user(udata, dataptr, count);
1291 if (err) { 1291 if (err) {
1292 err = -EFAULT; 1292 err = -EFAULT;
1293 goto free_and_error; 1293 goto free_and_error;
1294 } 1294 }
1295 } 1295 }
1296
1297 p9_free_req(clnt, req); 1296 p9_free_req(clnt, req);
1298 return count; 1297 return count;
1299 1298
@@ -1805,3 +1804,96 @@ error:
1805 1804
1806} 1805}
1807EXPORT_SYMBOL(p9_client_mkdir_dotl); 1806EXPORT_SYMBOL(p9_client_mkdir_dotl);
1807
1808int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status)
1809{
1810 int err;
1811 struct p9_client *clnt;
1812 struct p9_req_t *req;
1813
1814 err = 0;
1815 clnt = fid->clnt;
1816 P9_DPRINTK(P9_DEBUG_9P, ">>> TLOCK fid %d type %i flags %d "
1817 "start %lld length %lld proc_id %d client_id %s\n",
1818 fid->fid, flock->type, flock->flags, flock->start,
1819 flock->length, flock->proc_id, flock->client_id);
1820
1821 req = p9_client_rpc(clnt, P9_TLOCK, "dbdqqds", fid->fid, flock->type,
1822 flock->flags, flock->start, flock->length,
1823 flock->proc_id, flock->client_id);
1824
1825 if (IS_ERR(req))
1826 return PTR_ERR(req);
1827
1828 err = p9pdu_readf(req->rc, clnt->proto_version, "b", status);
1829 if (err) {
1830 p9pdu_dump(1, req->rc);
1831 goto error;
1832 }
1833 P9_DPRINTK(P9_DEBUG_9P, "<<< RLOCK status %i\n", *status);
1834error:
1835 p9_free_req(clnt, req);
1836 return err;
1837
1838}
1839EXPORT_SYMBOL(p9_client_lock_dotl);
1840
1841int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *glock)
1842{
1843 int err;
1844 struct p9_client *clnt;
1845 struct p9_req_t *req;
1846
1847 err = 0;
1848 clnt = fid->clnt;
1849 P9_DPRINTK(P9_DEBUG_9P, ">>> TGETLOCK fid %d, type %i start %lld "
1850 "length %lld proc_id %d client_id %s\n", fid->fid, glock->type,
1851 glock->start, glock->length, glock->proc_id, glock->client_id);
1852
1853 req = p9_client_rpc(clnt, P9_TGETLOCK, "dbqqds", fid->fid, glock->type,
1854 glock->start, glock->length, glock->proc_id, glock->client_id);
1855
1856 if (IS_ERR(req))
1857 return PTR_ERR(req);
1858
1859 err = p9pdu_readf(req->rc, clnt->proto_version, "bqqds", &glock->type,
1860 &glock->start, &glock->length, &glock->proc_id,
1861 &glock->client_id);
1862 if (err) {
1863 p9pdu_dump(1, req->rc);
1864 goto error;
1865 }
1866 P9_DPRINTK(P9_DEBUG_9P, "<<< RGETLOCK type %i start %lld length %lld "
1867 "proc_id %d client_id %s\n", glock->type, glock->start,
1868 glock->length, glock->proc_id, glock->client_id);
1869error:
1870 p9_free_req(clnt, req);
1871 return err;
1872}
1873EXPORT_SYMBOL(p9_client_getlock_dotl);
1874
1875int p9_client_readlink(struct p9_fid *fid, char **target)
1876{
1877 int err;
1878 struct p9_client *clnt;
1879 struct p9_req_t *req;
1880
1881 err = 0;
1882 clnt = fid->clnt;
1883 P9_DPRINTK(P9_DEBUG_9P, ">>> TREADLINK fid %d\n", fid->fid);
1884
1885 req = p9_client_rpc(clnt, P9_TREADLINK, "d", fid->fid);
1886 if (IS_ERR(req))
1887 return PTR_ERR(req);
1888
1889 err = p9pdu_readf(req->rc, clnt->proto_version, "s", target);
1890 if (err) {
1891 p9pdu_dump(1, req->rc);
1892 goto error;
1893 }
1894 P9_DPRINTK(P9_DEBUG_9P, "<<< RREADLINK target %s\n", *target);
1895error:
1896 p9_free_req(clnt, req);
1897 return err;
1898}
1899EXPORT_SYMBOL(p9_client_readlink);
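Note: the dotl helpers added above (TFSYNC, TLOCK, TGETLOCK, TREADLINK) all share one request/response shape: p9_client_rpc() marshals the T-message from a printf-like format string ('b' = u8, 'd' = u32, 'q' = u64, 's' = string), p9pdu_readf() unpacks the R-message with the same format characters, and p9_free_req() releases the slot. A minimal sketch of that shape for a hypothetical operation follows; P9_TFROB and its "dq"/"d" formats are invented for illustration, only the calling pattern mirrors the code above, and the usual 9p client headers are assumed.

/* Sketch only: hypothetical 9P2000.L client op following the pattern of
 * p9_client_fsync()/p9_client_readlink() above.  P9_TFROB and the formats
 * are made up for illustration. */
int p9_client_frob(struct p9_fid *fid, u64 cookie, u32 *result)
{
	struct p9_client *clnt = fid->clnt;
	struct p9_req_t *req;
	int err;

	/* Marshal and send the T-message: 'd' = 32-bit fid, 'q' = 64-bit cookie */
	req = p9_client_rpc(clnt, P9_TFROB, "dq", fid->fid, cookie);
	if (IS_ERR(req))
		return PTR_ERR(req);

	/* Unpack the R-message: a single 32-bit result */
	err = p9pdu_readf(req->rc, clnt->proto_version, "d", result);
	if (err)
		p9pdu_dump(1, req->rc);

	p9_free_req(clnt, req);
	return err;
}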
diff --git a/net/9p/protocol.c b/net/9p/protocol.c
index 3acd3afb20c8..45c15f491401 100644
--- a/net/9p/protocol.c
+++ b/net/9p/protocol.c
@@ -122,9 +122,8 @@ static size_t
122pdu_write_u(struct p9_fcall *pdu, const char __user *udata, size_t size) 122pdu_write_u(struct p9_fcall *pdu, const char __user *udata, size_t size)
123{ 123{
124 size_t len = MIN(pdu->capacity - pdu->size, size); 124 size_t len = MIN(pdu->capacity - pdu->size, size);
125 int err = copy_from_user(&pdu->sdata[pdu->size], udata, len); 125 if (copy_from_user(&pdu->sdata[pdu->size], udata, len))
126 if (err) 126 len = 0;
127 printk(KERN_WARNING "pdu_write_u returning: %d\n", err);
128 127
129 pdu->size += len; 128 pdu->size += len;
130 return size - len; 129 return size - len;
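Note: pdu_write_u() reports the number of bytes it could not take (size - len), so 0 means the whole user buffer was consumed; with the change above a faulting copy_from_user() counts as having written nothing rather than advancing pdu->size past unwritten data. A small userspace model of that convention, with the p9_fcall layout simplified to a plain buffer and all names illustrative only:

#include <stdio.h>
#include <string.h>

struct pdu { char data[16]; size_t size, capacity; };

/* Model of pdu_write_u(): copy at most the remaining capacity and return
 * how many bytes were left over.  A failed copy consumes nothing. */
static size_t pdu_write(struct pdu *p, const char *src, size_t size, int fault)
{
	size_t room = p->capacity - p->size;
	size_t len = room < size ? room : size;

	if (fault)		/* stands in for copy_from_user() failing */
		len = 0;
	else
		memcpy(&p->data[p->size], src, len);

	p->size += len;
	return size - len;	/* bytes NOT written */
}

int main(void)
{
	struct pdu p = { .size = 0, .capacity = 16 };

	printf("%zu\n", pdu_write(&p, "hello", 5, 0));	/* 0: everything fit    */
	printf("%zu\n", pdu_write(&p, "world", 5, 1));	/* 5: fault, wrote none */
	return 0;
}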
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index c85109d809ca..078eb162d9bf 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -222,7 +222,7 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
222 } 222 }
223} 223}
224 224
225static unsigned int 225static int
226p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt) 226p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt)
227{ 227{
228 int ret, n; 228 int ret, n;
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 0ea20c30466c..17c5ba7551a5 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -426,8 +426,10 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
426 426
427 /* Allocate an fcall for the reply */ 427 /* Allocate an fcall for the reply */
428 rpl_context = kmalloc(sizeof *rpl_context, GFP_KERNEL); 428 rpl_context = kmalloc(sizeof *rpl_context, GFP_KERNEL);
429 if (!rpl_context) 429 if (!rpl_context) {
430 err = -ENOMEM;
430 goto err_close; 431 goto err_close;
432 }
431 433
432 /* 434 /*
433 * If the request has a buffer, steal it, otherwise 435 * If the request has a buffer, steal it, otherwise
@@ -445,8 +447,8 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
445 } 447 }
446 rpl_context->rc = req->rc; 448 rpl_context->rc = req->rc;
447 if (!rpl_context->rc) { 449 if (!rpl_context->rc) {
448 kfree(rpl_context); 450 err = -ENOMEM;
449 goto err_close; 451 goto err_free2;
450 } 452 }
451 453
452 /* 454 /*
@@ -458,11 +460,8 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
458 */ 460 */
459 if (atomic_inc_return(&rdma->rq_count) <= rdma->rq_depth) { 461 if (atomic_inc_return(&rdma->rq_count) <= rdma->rq_depth) {
460 err = post_recv(client, rpl_context); 462 err = post_recv(client, rpl_context);
461 if (err) { 463 if (err)
462 kfree(rpl_context->rc); 464 goto err_free1;
463 kfree(rpl_context);
464 goto err_close;
465 }
466 } else 465 } else
467 atomic_dec(&rdma->rq_count); 466 atomic_dec(&rdma->rq_count);
468 467
@@ -471,8 +470,10 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
471 470
472 /* Post the request */ 471 /* Post the request */
473 c = kmalloc(sizeof *c, GFP_KERNEL); 472 c = kmalloc(sizeof *c, GFP_KERNEL);
474 if (!c) 473 if (!c) {
475 goto err_close; 474 err = -ENOMEM;
475 goto err_free1;
476 }
476 c->req = req; 477 c->req = req;
477 478
478 c->busa = ib_dma_map_single(rdma->cm_id->device, 479 c->busa = ib_dma_map_single(rdma->cm_id->device,
@@ -499,9 +500,15 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
499 return ib_post_send(rdma->qp, &wr, &bad_wr); 500 return ib_post_send(rdma->qp, &wr, &bad_wr);
500 501
501 error: 502 error:
503 kfree(c);
504 kfree(rpl_context->rc);
505 kfree(rpl_context);
502 P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n"); 506 P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n");
503 return -EIO; 507 return -EIO;
504 508 err_free1:
509 kfree(rpl_context->rc);
510 err_free2:
511 kfree(rpl_context);
505 err_close: 512 err_close:
506 spin_lock_irqsave(&rdma->req_lock, flags); 513 spin_lock_irqsave(&rdma->req_lock, flags);
507 if (rdma->state < P9_RDMA_CLOSING) { 514 if (rdma->state < P9_RDMA_CLOSING) {
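Note: the rdma_request() rework above replaces per-site kfree() calls with a layered goto ladder: err_free1 frees the reply fcall and then falls through to err_free2, which frees only the context, so each failure point jumps to the label that releases exactly what had been allocated so far. The same idiom in a self-contained userspace form, with generic names not taken from the 9p code:

#include <stdlib.h>

struct ctx { char *buf; };

/* Layered error unwinding: each label frees exactly the resources that
 * were successfully allocated before the jump, so there is one exit path
 * per "depth" and no double frees or leaks. */
static int setup(struct ctx **out, int fail_late)
{
	struct ctx *c;
	int err = -1;

	c = malloc(sizeof(*c));
	if (!c)
		goto err_out;		/* nothing allocated yet */

	c->buf = malloc(64);
	if (!c->buf)
		goto err_free_ctx;	/* only the context exists */

	if (fail_late)
		goto err_free_buf;	/* both allocations exist */

	*out = c;
	return 0;

err_free_buf:
	free(c->buf);
err_free_ctx:
	free(c);
err_out:
	return err;
}

int main(void)
{
	struct ctx *c;

	if (setup(&c, 0))
		return 1;
	free(c->buf);
	free(c);
	return 0;
}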
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index dcfbe99ff81c..c8f3f72ab20e 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -75,6 +75,8 @@ struct virtio_chan {
75 struct p9_client *client; 75 struct p9_client *client;
76 struct virtio_device *vdev; 76 struct virtio_device *vdev;
77 struct virtqueue *vq; 77 struct virtqueue *vq;
78 int ring_bufs_avail;
79 wait_queue_head_t *vc_wq;
78 80
79 /* Scatterlist: can be too big for stack. */ 81 /* Scatterlist: can be too big for stack. */
80 struct scatterlist sg[VIRTQUEUE_NUM]; 82 struct scatterlist sg[VIRTQUEUE_NUM];
@@ -134,16 +136,30 @@ static void req_done(struct virtqueue *vq)
134 struct p9_fcall *rc; 136 struct p9_fcall *rc;
135 unsigned int len; 137 unsigned int len;
136 struct p9_req_t *req; 138 struct p9_req_t *req;
139 unsigned long flags;
137 140
138 P9_DPRINTK(P9_DEBUG_TRANS, ": request done\n"); 141 P9_DPRINTK(P9_DEBUG_TRANS, ": request done\n");
139 142
140 while ((rc = virtqueue_get_buf(chan->vq, &len)) != NULL) { 143 do {
141 P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc); 144 spin_lock_irqsave(&chan->lock, flags);
142 P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag); 145 rc = virtqueue_get_buf(chan->vq, &len);
143 req = p9_tag_lookup(chan->client, rc->tag); 146
144 req->status = REQ_STATUS_RCVD; 147 if (rc != NULL) {
145 p9_client_cb(chan->client, req); 148 if (!chan->ring_bufs_avail) {
146 } 149 chan->ring_bufs_avail = 1;
150 wake_up(chan->vc_wq);
151 }
152 spin_unlock_irqrestore(&chan->lock, flags);
153 P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc);
154 P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n",
155 rc->tag);
156 req = p9_tag_lookup(chan->client, rc->tag);
157 req->status = REQ_STATUS_RCVD;
158 p9_client_cb(chan->client, req);
159 } else {
160 spin_unlock_irqrestore(&chan->lock, flags);
161 }
162 } while (rc != NULL);
147} 163}
148 164
149/** 165/**
@@ -199,23 +215,43 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
199 int in, out; 215 int in, out;
200 struct virtio_chan *chan = client->trans; 216 struct virtio_chan *chan = client->trans;
201 char *rdata = (char *)req->rc+sizeof(struct p9_fcall); 217 char *rdata = (char *)req->rc+sizeof(struct p9_fcall);
218 unsigned long flags;
219 int err;
202 220
203 P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request\n"); 221 P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request\n");
204 222
223req_retry:
224 req->status = REQ_STATUS_SENT;
225
226 spin_lock_irqsave(&chan->lock, flags);
205 out = pack_sg_list(chan->sg, 0, VIRTQUEUE_NUM, req->tc->sdata, 227 out = pack_sg_list(chan->sg, 0, VIRTQUEUE_NUM, req->tc->sdata,
206 req->tc->size); 228 req->tc->size);
207 in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM-out, rdata, 229 in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM-out, rdata,
208 client->msize); 230 client->msize);
209 231
210 req->status = REQ_STATUS_SENT; 232 err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc);
211 233 if (err < 0) {
212 if (virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc) < 0) { 234 if (err == -ENOSPC) {
213 P9_DPRINTK(P9_DEBUG_TRANS, 235 chan->ring_bufs_avail = 0;
214 "9p debug: virtio rpc add_buf returned failure"); 236 spin_unlock_irqrestore(&chan->lock, flags);
215 return -EIO; 237 err = wait_event_interruptible(*chan->vc_wq,
238 chan->ring_bufs_avail);
239 if (err == -ERESTARTSYS)
240 return err;
241
242 P9_DPRINTK(P9_DEBUG_TRANS, "9p:Retry virtio request\n");
243 goto req_retry;
244 } else {
245 spin_unlock_irqrestore(&chan->lock, flags);
246 P9_DPRINTK(P9_DEBUG_TRANS,
247 "9p debug: "
248 "virtio rpc add_buf returned failure");
249 return -EIO;
250 }
216 } 251 }
217 252
218 virtqueue_kick(chan->vq); 253 virtqueue_kick(chan->vq);
254 spin_unlock_irqrestore(&chan->lock, flags);
219 255
220 P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request kicked\n"); 256 P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request kicked\n");
221 return 0; 257 return 0;
@@ -290,14 +326,23 @@ static int p9_virtio_probe(struct virtio_device *vdev)
290 chan->tag_len = tag_len; 326 chan->tag_len = tag_len;
291 err = sysfs_create_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); 327 err = sysfs_create_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr);
292 if (err) { 328 if (err) {
293 kfree(tag); 329 goto out_free_tag;
294 goto out_free_vq;
295 } 330 }
331 chan->vc_wq = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL);
332 if (!chan->vc_wq) {
333 err = -ENOMEM;
334 goto out_free_tag;
335 }
336 init_waitqueue_head(chan->vc_wq);
337 chan->ring_bufs_avail = 1;
338
296 mutex_lock(&virtio_9p_lock); 339 mutex_lock(&virtio_9p_lock);
297 list_add_tail(&chan->chan_list, &virtio_chan_list); 340 list_add_tail(&chan->chan_list, &virtio_chan_list);
298 mutex_unlock(&virtio_9p_lock); 341 mutex_unlock(&virtio_9p_lock);
299 return 0; 342 return 0;
300 343
344out_free_tag:
345 kfree(tag);
301out_free_vq: 346out_free_vq:
302 vdev->config->del_vqs(vdev); 347 vdev->config->del_vqs(vdev);
303 kfree(chan); 348 kfree(chan);
@@ -329,7 +374,8 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args)
329 374
330 mutex_lock(&virtio_9p_lock); 375 mutex_lock(&virtio_9p_lock);
331 list_for_each_entry(chan, &virtio_chan_list, chan_list) { 376 list_for_each_entry(chan, &virtio_chan_list, chan_list) {
332 if (!strncmp(devname, chan->tag, chan->tag_len)) { 377 if (!strncmp(devname, chan->tag, chan->tag_len) &&
378 strlen(devname) == chan->tag_len) {
333 if (!chan->inuse) { 379 if (!chan->inuse) {
334 chan->inuse = true; 380 chan->inuse = true;
335 found = 1; 381 found = 1;
@@ -370,6 +416,7 @@ static void p9_virtio_remove(struct virtio_device *vdev)
370 mutex_unlock(&virtio_9p_lock); 416 mutex_unlock(&virtio_9p_lock);
371 sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); 417 sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr);
372 kfree(chan->tag); 418 kfree(chan->tag);
419 kfree(chan->vc_wq);
373 kfree(chan); 420 kfree(chan);
374 421
375} 422}
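Note: the trans_virtio.c change above turns virtqueue exhaustion (-ENOSPC) into a sleep-and-retry: the request path clears ring_bufs_avail under chan->lock, drops the lock, sleeps in wait_event_interruptible() until the completion callback sets the flag and calls wake_up(), then jumps back to req_retry. Condensed to just that producer/consumer hand-off, the pattern looks roughly like the sketch below; struct chan, try_queue() and the locking granularity are simplified stand-ins, not the actual driver structures.

/* Producer: queue a buffer, sleeping whenever the ring is full. */
static int submit(struct chan *chan, void *buf)
{
	unsigned long flags;
	int err;

retry:
	spin_lock_irqsave(&chan->lock, flags);
	err = try_queue(chan, buf);		/* stand-in for virtqueue_add_buf() */
	if (err == -ENOSPC) {
		chan->ring_bufs_avail = 0;	/* remember the ring is full */
		spin_unlock_irqrestore(&chan->lock, flags);
		err = wait_event_interruptible(*chan->vc_wq,
					       chan->ring_bufs_avail);
		if (err == -ERESTARTSYS)
			return err;		/* interrupted by a signal */
		goto retry;			/* a slot freed up, try again */
	}
	spin_unlock_irqrestore(&chan->lock, flags);
	return err;
}

/* Consumer (completion callback): a buffer came back, so wake any waiters. */
static void completed(struct chan *chan)
{
	unsigned long flags;

	spin_lock_irqsave(&chan->lock, flags);
	if (!chan->ring_bufs_avail) {
		chan->ring_bufs_avail = 1;
		wake_up(chan->vc_wq);
	}
	spin_unlock_irqrestore(&chan->lock, flags);
}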
diff --git a/net/Kconfig b/net/Kconfig
index e330594d3709..55fd82e9ffd9 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -217,7 +217,7 @@ source "net/dns_resolver/Kconfig"
217 217
218config RPS 218config RPS
219 boolean 219 boolean
220 depends on SMP && SYSFS 220 depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
221 default y 221 default y
222 222
223menu "Network testing" 223menu "Network testing"
@@ -293,6 +293,7 @@ source "net/wimax/Kconfig"
293source "net/rfkill/Kconfig" 293source "net/rfkill/Kconfig"
294source "net/9p/Kconfig" 294source "net/9p/Kconfig"
295source "net/caif/Kconfig" 295source "net/caif/Kconfig"
296source "net/ceph/Kconfig"
296 297
297 298
298endif # if NET 299endif # if NET
diff --git a/net/Makefile b/net/Makefile
index ea60fbce9b1b..6b7bfd7f1416 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -68,3 +68,4 @@ obj-$(CONFIG_SYSCTL) += sysctl_net.o
68endif 68endif
69obj-$(CONFIG_WIMAX) += wimax/ 69obj-$(CONFIG_WIMAX) += wimax/
70obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/ 70obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/
71obj-$(CONFIG_CEPH_LIB) += ceph/
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index 651babdfab38..ad2b232a2055 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -399,12 +399,6 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
399 unregister_netdev(net_dev); 399 unregister_netdev(net_dev);
400 free_netdev(net_dev); 400 free_netdev(net_dev);
401 } 401 }
402 read_lock_irq(&devs_lock);
403 if (list_empty(&br2684_devs)) {
404 /* last br2684 device */
405 unregister_atmdevice_notifier(&atm_dev_notifier);
406 }
407 read_unlock_irq(&devs_lock);
408 return; 402 return;
409 } 403 }
410 404
@@ -675,7 +669,6 @@ static int br2684_create(void __user *arg)
675 669
676 if (list_empty(&br2684_devs)) { 670 if (list_empty(&br2684_devs)) {
677 /* 1st br2684 device */ 671 /* 1st br2684 device */
678 register_atmdevice_notifier(&atm_dev_notifier);
679 brdev->number = 1; 672 brdev->number = 1;
680 } else 673 } else
681 brdev->number = BRPRIV(list_entry_brdev(br2684_devs.prev))->number + 1; 674 brdev->number = BRPRIV(list_entry_brdev(br2684_devs.prev))->number + 1;
@@ -815,6 +808,7 @@ static int __init br2684_init(void)
815 return -ENOMEM; 808 return -ENOMEM;
816#endif 809#endif
817 register_atm_ioctl(&br2684_ioctl_ops); 810 register_atm_ioctl(&br2684_ioctl_ops);
811 register_atmdevice_notifier(&atm_dev_notifier);
818 return 0; 812 return 0;
819} 813}
820 814
@@ -830,9 +824,7 @@ static void __exit br2684_exit(void)
830#endif 824#endif
831 825
832 826
833 /* if not already empty */ 827 unregister_atmdevice_notifier(&atm_dev_notifier);
834 if (!list_empty(&br2684_devs))
835 unregister_atmdevice_notifier(&atm_dev_notifier);
836 828
837 while (!list_empty(&br2684_devs)) { 829 while (!list_empty(&br2684_devs)) {
838 net_dev = list_entry_brdev(br2684_devs.next); 830 net_dev = list_entry_brdev(br2684_devs.next);
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 95fdd1185067..ff956d1115bc 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -310,9 +310,9 @@ static int clip_constructor(struct neighbour *neigh)
310 return 0; 310 return 0;
311} 311}
312 312
313static u32 clip_hash(const void *pkey, const struct net_device *dev) 313static u32 clip_hash(const void *pkey, const struct net_device *dev, __u32 rnd)
314{ 314{
315 return jhash_2words(*(u32 *) pkey, dev->ifindex, clip_tbl.hash_rnd); 315 return jhash_2words(*(u32 *) pkey, dev->ifindex, rnd);
316} 316}
317 317
318static struct neigh_table clip_tbl = { 318static struct neigh_table clip_tbl = {
diff --git a/net/atm/common.c b/net/atm/common.c
index 940404a73b3d..1b9c52a02cd3 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -792,7 +792,7 @@ int vcc_getsockopt(struct socket *sock, int level, int optname,
792 default: 792 default:
793 if (level == SOL_SOCKET) 793 if (level == SOL_SOCKET)
794 return -EINVAL; 794 return -EINVAL;
795 break; 795 break;
796 } 796 }
797 if (!vcc->dev || !vcc->dev->ops->getsockopt) 797 if (!vcc->dev || !vcc->dev->ops->getsockopt)
798 return -EINVAL; 798 return -EINVAL;
diff --git a/net/atm/lec.c b/net/atm/lec.c
index d98bde1a0ac8..181d70c73d70 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -220,7 +220,6 @@ static unsigned char *get_tr_dst(unsigned char *packet, unsigned char *rdesc)
220static int lec_open(struct net_device *dev) 220static int lec_open(struct net_device *dev)
221{ 221{
222 netif_start_queue(dev); 222 netif_start_queue(dev);
223 memset(&dev->stats, 0, sizeof(struct net_device_stats));
224 223
225 return 0; 224 return 0;
226} 225}
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 622b471e14e0..74bcc662c3dd 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -778,7 +778,7 @@ static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb)
778 eg->packets_rcvd++; 778 eg->packets_rcvd++;
779 mpc->eg_ops->put(eg); 779 mpc->eg_ops->put(eg);
780 780
781 memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data)); 781 memset(ATM_SKB(new_skb), 0, sizeof(struct atm_skb_data));
782 netif_rx(new_skb); 782 netif_rx(new_skb);
783} 783}
784 784
diff --git a/net/atm/proc.c b/net/atm/proc.c
index 6262aeae398e..f85da0779e5e 100644
--- a/net/atm/proc.c
+++ b/net/atm/proc.c
@@ -38,6 +38,7 @@ static ssize_t proc_dev_atm_read(struct file *file, char __user *buf,
38static const struct file_operations proc_atm_dev_ops = { 38static const struct file_operations proc_atm_dev_ops = {
39 .owner = THIS_MODULE, 39 .owner = THIS_MODULE,
40 .read = proc_dev_atm_read, 40 .read = proc_dev_atm_read,
41 .llseek = noop_llseek,
41}; 42};
42 43
43static void add_stats(struct seq_file *seq, const char *aal, 44static void add_stats(struct seq_file *seq, const char *aal,
diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig
index 2a72aa96a568..705e53ef4af0 100644
--- a/net/ax25/Kconfig
+++ b/net/ax25/Kconfig
@@ -7,7 +7,7 @@ menuconfig HAMRADIO
7 bool "Amateur Radio support" 7 bool "Amateur Radio support"
8 help 8 help
9 If you want to connect your Linux box to an amateur radio, answer Y 9 If you want to connect your Linux box to an amateur radio, answer Y
10 here. You want to read <http://www.tapr.org/tapr/html/pkthome.html> 10 here. You want to read <http://www.tapr.org/>
11 and more specifically about AX.25 on Linux 11 and more specifically about AX.25 on Linux
12 <http://www.linux-ax25.org/>. 12 <http://www.linux-ax25.org/>.
13 13
@@ -42,7 +42,7 @@ config AX25
42 check out the file <file:Documentation/networking/ax25.txt> in the 42 check out the file <file:Documentation/networking/ax25.txt> in the
43 kernel source. More information about digital amateur radio in 43 kernel source. More information about digital amateur radio in
44 general is on the WWW at 44 general is on the WWW at
45 <http://www.tapr.org/tapr/html/pkthome.html>. 45 <http://www.tapr.org/>.
46 46
47 To compile this driver as a module, choose M here: the 47 To compile this driver as a module, choose M here: the
48 module will be called ax25. 48 module will be called ax25.
@@ -89,7 +89,7 @@ config NETROM
89 <http://www.linux-ax25.org>. You also might want to check out the 89 <http://www.linux-ax25.org>. You also might want to check out the
90 file <file:Documentation/networking/ax25.txt>. More information about 90 file <file:Documentation/networking/ax25.txt>. More information about
91 digital amateur radio in general is on the WWW at 91 digital amateur radio in general is on the WWW at
92 <http://www.tapr.org/tapr/html/pkthome.html>. 92 <http://www.tapr.org/>.
93 93
94 To compile this driver as a module, choose M here: the 94 To compile this driver as a module, choose M here: the
95 module will be called netrom. 95 module will be called netrom.
@@ -108,7 +108,7 @@ config ROSE
108 <http://www.linux-ax25.org>. You also might want to check out the 108 <http://www.linux-ax25.org>. You also might want to check out the
109 file <file:Documentation/networking/ax25.txt>. More information about 109 file <file:Documentation/networking/ax25.txt>. More information about
110 digital amateur radio in general is on the WWW at 110 digital amateur radio in general is on the WWW at
111 <http://www.tapr.org/tapr/html/pkthome.html>. 111 <http://www.tapr.org/>.
112 112
113 To compile this driver as a module, choose M here: the 113 To compile this driver as a module, choose M here: the
114 module will be called rose. 114 module will be called rose.
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index cfdfd7e2a172..26eaebf4aaa9 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1103,7 +1103,7 @@ done:
1103out: 1103out:
1104 release_sock(sk); 1104 release_sock(sk);
1105 1105
1106 return 0; 1106 return err;
1107} 1107}
1108 1108
1109/* 1109/*
diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c
index 2ce79df00680..c7d81436213d 100644
--- a/net/ax25/ax25_ds_timer.c
+++ b/net/ax25/ax25_ds_timer.c
@@ -112,8 +112,8 @@ void ax25_ds_heartbeat_expiry(ax25_cb *ax25)
112 if (sk) { 112 if (sk) {
113 sock_hold(sk); 113 sock_hold(sk);
114 ax25_destroy_socket(ax25); 114 ax25_destroy_socket(ax25);
115 sock_put(sk);
116 bh_unlock_sock(sk); 115 bh_unlock_sock(sk);
116 sock_put(sk);
117 } else 117 } else
118 ax25_destroy_socket(ax25); 118 ax25_destroy_socket(ax25);
119 return; 119 return;
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
index 7805945a5fd6..a1690845dc6e 100644
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -412,7 +412,7 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
412{ 412{
413 ax25_uid_assoc *user; 413 ax25_uid_assoc *user;
414 ax25_route *ax25_rt; 414 ax25_route *ax25_rt;
415 int err; 415 int err = 0;
416 416
417 if ((ax25_rt = ax25_get_route(addr, NULL)) == NULL) 417 if ((ax25_rt = ax25_get_route(addr, NULL)) == NULL)
418 return -EHOSTUNREACH; 418 return -EHOSTUNREACH;
@@ -453,7 +453,7 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
453put: 453put:
454 ax25_put_route(ax25_rt); 454 ax25_put_route(ax25_rt);
455 455
456 return 0; 456 return err;
457} 457}
458 458
459struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src, 459struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src,
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 421c45bd1b95..c4cf3f595004 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -265,6 +265,115 @@ int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
265} 265}
266EXPORT_SYMBOL(bt_sock_recvmsg); 266EXPORT_SYMBOL(bt_sock_recvmsg);
267 267
268static long bt_sock_data_wait(struct sock *sk, long timeo)
269{
270 DECLARE_WAITQUEUE(wait, current);
271
272 add_wait_queue(sk_sleep(sk), &wait);
273 for (;;) {
274 set_current_state(TASK_INTERRUPTIBLE);
275
276 if (!skb_queue_empty(&sk->sk_receive_queue))
277 break;
278
279 if (sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN))
280 break;
281
282 if (signal_pending(current) || !timeo)
283 break;
284
285 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
286 release_sock(sk);
287 timeo = schedule_timeout(timeo);
288 lock_sock(sk);
289 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
290 }
291
292 __set_current_state(TASK_RUNNING);
293 remove_wait_queue(sk_sleep(sk), &wait);
294 return timeo;
295}
296
297int bt_sock_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
298 struct msghdr *msg, size_t size, int flags)
299{
300 struct sock *sk = sock->sk;
301 int err = 0;
302 size_t target, copied = 0;
303 long timeo;
304
305 if (flags & MSG_OOB)
306 return -EOPNOTSUPP;
307
308 msg->msg_namelen = 0;
309
310 BT_DBG("sk %p size %zu", sk, size);
311
312 lock_sock(sk);
313
314 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
315 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
316
317 do {
318 struct sk_buff *skb;
319 int chunk;
320
321 skb = skb_dequeue(&sk->sk_receive_queue);
322 if (!skb) {
323 if (copied >= target)
324 break;
325
326 if ((err = sock_error(sk)) != 0)
327 break;
328 if (sk->sk_shutdown & RCV_SHUTDOWN)
329 break;
330
331 err = -EAGAIN;
332 if (!timeo)
333 break;
334
335 timeo = bt_sock_data_wait(sk, timeo);
336
337 if (signal_pending(current)) {
338 err = sock_intr_errno(timeo);
339 goto out;
340 }
341 continue;
342 }
343
344 chunk = min_t(unsigned int, skb->len, size);
345 if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
346 skb_queue_head(&sk->sk_receive_queue, skb);
347 if (!copied)
348 copied = -EFAULT;
349 break;
350 }
351 copied += chunk;
352 size -= chunk;
353
354 sock_recv_ts_and_drops(msg, sk, skb);
355
356 if (!(flags & MSG_PEEK)) {
357 skb_pull(skb, chunk);
358 if (skb->len) {
359 skb_queue_head(&sk->sk_receive_queue, skb);
360 break;
361 }
362 kfree_skb(skb);
363
364 } else {
365 /* put message back and return */
366 skb_queue_head(&sk->sk_receive_queue, skb);
367 break;
368 }
369 } while (size);
370
371out:
372 release_sock(sk);
373 return copied ? : err;
374}
375EXPORT_SYMBOL(bt_sock_stream_recvmsg);
376
268static inline unsigned int bt_accept_poll(struct sock *parent) 377static inline unsigned int bt_accept_poll(struct sock *parent)
269{ 378{
270 struct list_head *p, *n; 379 struct list_head *p, *n;
@@ -297,13 +406,12 @@ unsigned int bt_sock_poll(struct file * file, struct socket *sock, poll_table *w
297 mask |= POLLERR; 406 mask |= POLLERR;
298 407
299 if (sk->sk_shutdown & RCV_SHUTDOWN) 408 if (sk->sk_shutdown & RCV_SHUTDOWN)
300 mask |= POLLRDHUP; 409 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
301 410
302 if (sk->sk_shutdown == SHUTDOWN_MASK) 411 if (sk->sk_shutdown == SHUTDOWN_MASK)
303 mask |= POLLHUP; 412 mask |= POLLHUP;
304 413
305 if (!skb_queue_empty(&sk->sk_receive_queue) || 414 if (!skb_queue_empty(&sk->sk_receive_queue))
306 (sk->sk_shutdown & RCV_SHUTDOWN))
307 mask |= POLLIN | POLLRDNORM; 415 mask |= POLLIN | POLLRDNORM;
308 416
309 if (sk->sk_state == BT_CLOSED) 417 if (sk->sk_state == BT_CLOSED)
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index d4c6af082d48..ec0a1347f933 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -321,14 +321,10 @@ static int cmtp_session(void *arg)
321int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock) 321int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock)
322{ 322{
323 struct cmtp_session *session, *s; 323 struct cmtp_session *session, *s;
324 bdaddr_t src, dst;
325 int i, err; 324 int i, err;
326 325
327 BT_DBG(""); 326 BT_DBG("");
328 327
329 baswap(&src, &bt_sk(sock->sk)->src);
330 baswap(&dst, &bt_sk(sock->sk)->dst);
331
332 session = kzalloc(sizeof(struct cmtp_session), GFP_KERNEL); 328 session = kzalloc(sizeof(struct cmtp_session), GFP_KERNEL);
333 if (!session) 329 if (!session)
334 return -ENOMEM; 330 return -ENOMEM;
@@ -347,7 +343,7 @@ int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock)
347 343
348 BT_DBG("mtu %d", session->mtu); 344 BT_DBG("mtu %d", session->mtu);
349 345
350 sprintf(session->name, "%s", batostr(&dst)); 346 sprintf(session->name, "%s", batostr(&bt_sk(sock->sk)->dst));
351 347
352 session->sock = sock; 348 session->sock = sock;
353 session->state = BT_CONFIG; 349 session->state = BT_CONFIG;
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index c52f091ee6de..bc2a052e518b 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -562,7 +562,6 @@ static int hci_dev_do_close(struct hci_dev *hdev)
562 hci_dev_lock_bh(hdev); 562 hci_dev_lock_bh(hdev);
563 inquiry_cache_flush(hdev); 563 inquiry_cache_flush(hdev);
564 hci_conn_hash_flush(hdev); 564 hci_conn_hash_flush(hdev);
565 hci_blacklist_clear(hdev);
566 hci_dev_unlock_bh(hdev); 565 hci_dev_unlock_bh(hdev);
567 566
568 hci_notify(hdev, HCI_DEV_DOWN); 567 hci_notify(hdev, HCI_DEV_DOWN);
diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index 8fb967beee80..5fce3d6d07b4 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -37,9 +37,7 @@ static ssize_t show_link_type(struct device *dev, struct device_attribute *attr,
37static ssize_t show_link_address(struct device *dev, struct device_attribute *attr, char *buf) 37static ssize_t show_link_address(struct device *dev, struct device_attribute *attr, char *buf)
38{ 38{
39 struct hci_conn *conn = dev_get_drvdata(dev); 39 struct hci_conn *conn = dev_get_drvdata(dev);
40 bdaddr_t bdaddr; 40 return sprintf(buf, "%s\n", batostr(&conn->dst));
41 baswap(&bdaddr, &conn->dst);
42 return sprintf(buf, "%s\n", batostr(&bdaddr));
43} 41}
44 42
45static ssize_t show_link_features(struct device *dev, struct device_attribute *attr, char *buf) 43static ssize_t show_link_features(struct device *dev, struct device_attribute *attr, char *buf)
@@ -196,8 +194,8 @@ static inline char *host_typetostr(int type)
196 switch (type) { 194 switch (type) {
197 case HCI_BREDR: 195 case HCI_BREDR:
198 return "BR/EDR"; 196 return "BR/EDR";
199 case HCI_80211: 197 case HCI_AMP:
200 return "802.11"; 198 return "AMP";
201 default: 199 default:
202 return "UNKNOWN"; 200 return "UNKNOWN";
203 } 201 }
@@ -238,9 +236,7 @@ static ssize_t show_class(struct device *dev, struct device_attribute *attr, cha
238static ssize_t show_address(struct device *dev, struct device_attribute *attr, char *buf) 236static ssize_t show_address(struct device *dev, struct device_attribute *attr, char *buf)
239{ 237{
240 struct hci_dev *hdev = dev_get_drvdata(dev); 238 struct hci_dev *hdev = dev_get_drvdata(dev);
241 bdaddr_t bdaddr; 239 return sprintf(buf, "%s\n", batostr(&hdev->bdaddr));
242 baswap(&bdaddr, &hdev->bdaddr);
243 return sprintf(buf, "%s\n", batostr(&bdaddr));
244} 240}
245 241
246static ssize_t show_features(struct device *dev, struct device_attribute *attr, char *buf) 242static ssize_t show_features(struct device *dev, struct device_attribute *attr, char *buf)
@@ -408,10 +404,8 @@ static int inquiry_cache_show(struct seq_file *f, void *p)
408 404
409 for (e = cache->list; e; e = e->next) { 405 for (e = cache->list; e; e = e->next) {
410 struct inquiry_data *data = &e->data; 406 struct inquiry_data *data = &e->data;
411 bdaddr_t bdaddr;
412 baswap(&bdaddr, &data->bdaddr);
413 seq_printf(f, "%s %d %d %d 0x%.2x%.2x%.2x 0x%.4x %d %d %u\n", 407 seq_printf(f, "%s %d %d %d 0x%.2x%.2x%.2x 0x%.4x %d %d %u\n",
414 batostr(&bdaddr), 408 batostr(&data->bdaddr),
415 data->pscan_rep_mode, data->pscan_period_mode, 409 data->pscan_rep_mode, data->pscan_period_mode,
416 data->pscan_mode, data->dev_class[2], 410 data->pscan_mode, data->dev_class[2],
417 data->dev_class[1], data->dev_class[0], 411 data->dev_class[1], data->dev_class[0],
@@ -445,13 +439,10 @@ static int blacklist_show(struct seq_file *f, void *p)
445 439
446 list_for_each(l, &hdev->blacklist) { 440 list_for_each(l, &hdev->blacklist) {
447 struct bdaddr_list *b; 441 struct bdaddr_list *b;
448 bdaddr_t bdaddr;
449 442
450 b = list_entry(l, struct bdaddr_list, list); 443 b = list_entry(l, struct bdaddr_list, list);
451 444
452 baswap(&bdaddr, &b->bdaddr); 445 seq_printf(f, "%s\n", batostr(&b->bdaddr));
453
454 seq_printf(f, "%s\n", batostr(&bdaddr));
455 } 446 }
456 447
457 hci_dev_unlock_bh(hdev); 448 hci_dev_unlock_bh(hdev);
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index bfe641b7dfaf..c0ee8b3928ed 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -758,7 +758,6 @@ static int hidp_setup_hid(struct hidp_session *session,
758 struct hidp_connadd_req *req) 758 struct hidp_connadd_req *req)
759{ 759{
760 struct hid_device *hid; 760 struct hid_device *hid;
761 bdaddr_t src, dst;
762 int err; 761 int err;
763 762
764 session->rd_data = kzalloc(req->rd_size, GFP_KERNEL); 763 session->rd_data = kzalloc(req->rd_size, GFP_KERNEL);
@@ -781,9 +780,6 @@ static int hidp_setup_hid(struct hidp_session *session,
781 780
782 hid->driver_data = session; 781 hid->driver_data = session;
783 782
784 baswap(&src, &bt_sk(session->ctrl_sock->sk)->src);
785 baswap(&dst, &bt_sk(session->ctrl_sock->sk)->dst);
786
787 hid->bus = BUS_BLUETOOTH; 783 hid->bus = BUS_BLUETOOTH;
788 hid->vendor = req->vendor; 784 hid->vendor = req->vendor;
789 hid->product = req->product; 785 hid->product = req->product;
@@ -791,8 +787,8 @@ static int hidp_setup_hid(struct hidp_session *session,
791 hid->country = req->country; 787 hid->country = req->country;
792 788
793 strncpy(hid->name, req->name, 128); 789 strncpy(hid->name, req->name, 128);
794 strncpy(hid->phys, batostr(&src), 64); 790 strncpy(hid->phys, batostr(&bt_sk(session->ctrl_sock->sk)->src), 64);
795 strncpy(hid->uniq, batostr(&dst), 64); 791 strncpy(hid->uniq, batostr(&bt_sk(session->ctrl_sock->sk)->dst), 64);
796 792
797 hid->dev.parent = hidp_get_device(session); 793 hid->dev.parent = hidp_get_device(session);
798 hid->ll_driver = &hidp_hid_driver; 794 hid->ll_driver = &hidp_hid_driver;
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index fadf26b4ed7c..daa7a988d9a6 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -1008,10 +1008,20 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
1008 goto done; 1008 goto done;
1009 } 1009 }
1010 1010
1011 if (la.l2_psm && __le16_to_cpu(la.l2_psm) < 0x1001 && 1011 if (la.l2_psm) {
1012 !capable(CAP_NET_BIND_SERVICE)) { 1012 __u16 psm = __le16_to_cpu(la.l2_psm);
1013 err = -EACCES; 1013
1014 goto done; 1014 /* PSM must be odd and lsb of upper byte must be 0 */
1015 if ((psm & 0x0101) != 0x0001) {
1016 err = -EINVAL;
1017 goto done;
1018 }
1019
1020 /* Restrict usage of well-known PSMs */
1021 if (psm < 0x1001 && !capable(CAP_NET_BIND_SERVICE)) {
1022 err = -EACCES;
1023 goto done;
1024 }
1015 } 1025 }
1016 1026
1017 write_lock_bh(&l2cap_sk_list.lock); 1027 write_lock_bh(&l2cap_sk_list.lock);
@@ -1190,6 +1200,13 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int al
1190 goto done; 1200 goto done;
1191 } 1201 }
1192 1202
1203 /* PSM must be odd and lsb of upper byte must be 0 */
1204 if ((__le16_to_cpu(la.l2_psm) & 0x0101) != 0x0001 &&
1205 sk->sk_type != SOCK_RAW) {
1206 err = -EINVAL;
1207 goto done;
1208 }
1209
1193 /* Set destination address and psm */ 1210 /* Set destination address and psm */
1194 bacpy(&bt_sk(sk)->dst, &la.l2_bdaddr); 1211 bacpy(&bt_sk(sk)->dst, &la.l2_bdaddr);
1195 l2cap_pi(sk)->psm = la.l2_psm; 1212 l2cap_pi(sk)->psm = la.l2_psm;
@@ -1441,33 +1458,23 @@ static inline void l2cap_do_send(struct sock *sk, struct sk_buff *skb)
1441 1458
1442static void l2cap_streaming_send(struct sock *sk) 1459static void l2cap_streaming_send(struct sock *sk)
1443{ 1460{
1444 struct sk_buff *skb, *tx_skb; 1461 struct sk_buff *skb;
1445 struct l2cap_pinfo *pi = l2cap_pi(sk); 1462 struct l2cap_pinfo *pi = l2cap_pi(sk);
1446 u16 control, fcs; 1463 u16 control, fcs;
1447 1464
1448 while ((skb = sk->sk_send_head)) { 1465 while ((skb = skb_dequeue(TX_QUEUE(sk)))) {
1449 tx_skb = skb_clone(skb, GFP_ATOMIC); 1466 control = get_unaligned_le16(skb->data + L2CAP_HDR_SIZE);
1450
1451 control = get_unaligned_le16(tx_skb->data + L2CAP_HDR_SIZE);
1452 control |= pi->next_tx_seq << L2CAP_CTRL_TXSEQ_SHIFT; 1467 control |= pi->next_tx_seq << L2CAP_CTRL_TXSEQ_SHIFT;
1453 put_unaligned_le16(control, tx_skb->data + L2CAP_HDR_SIZE); 1468 put_unaligned_le16(control, skb->data + L2CAP_HDR_SIZE);
1454 1469
1455 if (pi->fcs == L2CAP_FCS_CRC16) { 1470 if (pi->fcs == L2CAP_FCS_CRC16) {
1456 fcs = crc16(0, (u8 *)tx_skb->data, tx_skb->len - 2); 1471 fcs = crc16(0, (u8 *)skb->data, skb->len - 2);
1457 put_unaligned_le16(fcs, tx_skb->data + tx_skb->len - 2); 1472 put_unaligned_le16(fcs, skb->data + skb->len - 2);
1458 } 1473 }
1459 1474
1460 l2cap_do_send(sk, tx_skb); 1475 l2cap_do_send(sk, skb);
1461 1476
1462 pi->next_tx_seq = (pi->next_tx_seq + 1) % 64; 1477 pi->next_tx_seq = (pi->next_tx_seq + 1) % 64;
1463
1464 if (skb_queue_is_last(TX_QUEUE(sk), skb))
1465 sk->sk_send_head = NULL;
1466 else
1467 sk->sk_send_head = skb_queue_next(TX_QUEUE(sk), skb);
1468
1469 skb = skb_dequeue(TX_QUEUE(sk));
1470 kfree_skb(skb);
1471 } 1478 }
1472} 1479}
1473 1480
@@ -1645,7 +1652,7 @@ static inline int l2cap_skbuff_fromiovec(struct sock *sk, struct msghdr *msg, in
1645 1652
1646 *frag = bt_skb_send_alloc(sk, count, msg->msg_flags & MSG_DONTWAIT, &err); 1653 *frag = bt_skb_send_alloc(sk, count, msg->msg_flags & MSG_DONTWAIT, &err);
1647 if (!*frag) 1654 if (!*frag)
1648 return -EFAULT; 1655 return err;
1649 if (memcpy_fromiovec(skb_put(*frag, count), msg->msg_iov, count)) 1656 if (memcpy_fromiovec(skb_put(*frag, count), msg->msg_iov, count))
1650 return -EFAULT; 1657 return -EFAULT;
1651 1658
@@ -1671,7 +1678,7 @@ static struct sk_buff *l2cap_create_connless_pdu(struct sock *sk, struct msghdr
1671 skb = bt_skb_send_alloc(sk, count + hlen, 1678 skb = bt_skb_send_alloc(sk, count + hlen,
1672 msg->msg_flags & MSG_DONTWAIT, &err); 1679 msg->msg_flags & MSG_DONTWAIT, &err);
1673 if (!skb) 1680 if (!skb)
1674 return ERR_PTR(-ENOMEM); 1681 return ERR_PTR(err);
1675 1682
1676 /* Create L2CAP header */ 1683 /* Create L2CAP header */
1677 lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); 1684 lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
@@ -1700,7 +1707,7 @@ static struct sk_buff *l2cap_create_basic_pdu(struct sock *sk, struct msghdr *ms
1700 skb = bt_skb_send_alloc(sk, count + hlen, 1707 skb = bt_skb_send_alloc(sk, count + hlen,
1701 msg->msg_flags & MSG_DONTWAIT, &err); 1708 msg->msg_flags & MSG_DONTWAIT, &err);
1702 if (!skb) 1709 if (!skb)
1703 return ERR_PTR(-ENOMEM); 1710 return ERR_PTR(err);
1704 1711
1705 /* Create L2CAP header */ 1712 /* Create L2CAP header */
1706 lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); 1713 lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
@@ -1737,7 +1744,7 @@ static struct sk_buff *l2cap_create_iframe_pdu(struct sock *sk, struct msghdr *m
1737 skb = bt_skb_send_alloc(sk, count + hlen, 1744 skb = bt_skb_send_alloc(sk, count + hlen,
1738 msg->msg_flags & MSG_DONTWAIT, &err); 1745 msg->msg_flags & MSG_DONTWAIT, &err);
1739 if (!skb) 1746 if (!skb)
1740 return ERR_PTR(-ENOMEM); 1747 return ERR_PTR(err);
1741 1748
1742 /* Create L2CAP header */ 1749 /* Create L2CAP header */
1743 lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); 1750 lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
@@ -1944,6 +1951,9 @@ static int l2cap_sock_recvmsg(struct kiocb *iocb, struct socket *sock, struct ms
1944 1951
1945 release_sock(sk); 1952 release_sock(sk);
1946 1953
1954 if (sock->type == SOCK_STREAM)
1955 return bt_sock_stream_recvmsg(iocb, sock, msg, len, flags);
1956
1947 return bt_sock_recvmsg(iocb, sock, msg, len, flags); 1957 return bt_sock_recvmsg(iocb, sock, msg, len, flags);
1948} 1958}
1949 1959
@@ -1960,6 +1970,11 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, char __us
1960 1970
1961 switch (optname) { 1971 switch (optname) {
1962 case L2CAP_OPTIONS: 1972 case L2CAP_OPTIONS:
1973 if (sk->sk_state == BT_CONNECTED) {
1974 err = -EINVAL;
1975 break;
1976 }
1977
1963 opts.imtu = l2cap_pi(sk)->imtu; 1978 opts.imtu = l2cap_pi(sk)->imtu;
1964 opts.omtu = l2cap_pi(sk)->omtu; 1979 opts.omtu = l2cap_pi(sk)->omtu;
1965 opts.flush_to = l2cap_pi(sk)->flush_to; 1980 opts.flush_to = l2cap_pi(sk)->flush_to;
@@ -2771,10 +2786,10 @@ static int l2cap_parse_conf_rsp(struct sock *sk, void *rsp, int len, void *data,
2771 case L2CAP_CONF_MTU: 2786 case L2CAP_CONF_MTU:
2772 if (val < L2CAP_DEFAULT_MIN_MTU) { 2787 if (val < L2CAP_DEFAULT_MIN_MTU) {
2773 *result = L2CAP_CONF_UNACCEPT; 2788 *result = L2CAP_CONF_UNACCEPT;
2774 pi->omtu = L2CAP_DEFAULT_MIN_MTU; 2789 pi->imtu = L2CAP_DEFAULT_MIN_MTU;
2775 } else 2790 } else
2776 pi->omtu = val; 2791 pi->imtu = val;
2777 l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, pi->omtu); 2792 l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, pi->imtu);
2778 break; 2793 break;
2779 2794
2780 case L2CAP_CONF_FLUSH_TO: 2795 case L2CAP_CONF_FLUSH_TO:
@@ -2896,7 +2911,7 @@ static inline int l2cap_connect_req(struct l2cap_conn *conn, struct l2cap_cmd_hd
2896 struct l2cap_chan_list *list = &conn->chan_list; 2911 struct l2cap_chan_list *list = &conn->chan_list;
2897 struct l2cap_conn_req *req = (struct l2cap_conn_req *) data; 2912 struct l2cap_conn_req *req = (struct l2cap_conn_req *) data;
2898 struct l2cap_conn_rsp rsp; 2913 struct l2cap_conn_rsp rsp;
2899 struct sock *parent, *uninitialized_var(sk); 2914 struct sock *parent, *sk = NULL;
2900 int result, status = L2CAP_CS_NO_INFO; 2915 int result, status = L2CAP_CS_NO_INFO;
2901 2916
2902 u16 dcid = 0, scid = __le16_to_cpu(req->scid); 2917 u16 dcid = 0, scid = __le16_to_cpu(req->scid);
@@ -3005,7 +3020,7 @@ sendresp:
3005 L2CAP_INFO_REQ, sizeof(info), &info); 3020 L2CAP_INFO_REQ, sizeof(info), &info);
3006 } 3021 }
3007 3022
3008 if (!(l2cap_pi(sk)->conf_state & L2CAP_CONF_REQ_SENT) && 3023 if (sk && !(l2cap_pi(sk)->conf_state & L2CAP_CONF_REQ_SENT) &&
3009 result == L2CAP_CR_SUCCESS) { 3024 result == L2CAP_CR_SUCCESS) {
3010 u8 buf[128]; 3025 u8 buf[128];
3011 l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT; 3026 l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT;
@@ -3071,6 +3086,17 @@ static inline int l2cap_connect_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hd
3071 return 0; 3086 return 0;
3072} 3087}
3073 3088
3089static inline void set_default_fcs(struct l2cap_pinfo *pi)
3090{
3091 /* FCS is enabled only in ERTM or streaming mode, if one or both
3092 * sides request it.
3093 */
3094 if (pi->mode != L2CAP_MODE_ERTM && pi->mode != L2CAP_MODE_STREAMING)
3095 pi->fcs = L2CAP_FCS_NONE;
3096 else if (!(pi->conf_state & L2CAP_CONF_NO_FCS_RECV))
3097 pi->fcs = L2CAP_FCS_CRC16;
3098}
3099
3074static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data) 3100static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data)
3075{ 3101{
3076 struct l2cap_conf_req *req = (struct l2cap_conf_req *) data; 3102 struct l2cap_conf_req *req = (struct l2cap_conf_req *) data;
@@ -3088,14 +3114,8 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr
3088 if (!sk) 3114 if (!sk)
3089 return -ENOENT; 3115 return -ENOENT;
3090 3116
3091 if (sk->sk_state != BT_CONFIG) { 3117 if (sk->sk_state == BT_DISCONN)
3092 struct l2cap_cmd_rej rej;
3093
3094 rej.reason = cpu_to_le16(0x0002);
3095 l2cap_send_cmd(conn, cmd->ident, L2CAP_COMMAND_REJ,
3096 sizeof(rej), &rej);
3097 goto unlock; 3118 goto unlock;
3098 }
3099 3119
3100 /* Reject if config buffer is too small. */ 3120 /* Reject if config buffer is too small. */
3101 len = cmd_len - sizeof(*req); 3121 len = cmd_len - sizeof(*req);
@@ -3135,9 +3155,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr
3135 goto unlock; 3155 goto unlock;
3136 3156
3137 if (l2cap_pi(sk)->conf_state & L2CAP_CONF_INPUT_DONE) { 3157 if (l2cap_pi(sk)->conf_state & L2CAP_CONF_INPUT_DONE) {
3138 if (!(l2cap_pi(sk)->conf_state & L2CAP_CONF_NO_FCS_RECV) || 3158 set_default_fcs(l2cap_pi(sk));
3139 l2cap_pi(sk)->fcs != L2CAP_FCS_NONE)
3140 l2cap_pi(sk)->fcs = L2CAP_FCS_CRC16;
3141 3159
3142 sk->sk_state = BT_CONNECTED; 3160 sk->sk_state = BT_CONNECTED;
3143 3161
@@ -3153,6 +3171,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr
3153 3171
3154 if (!(l2cap_pi(sk)->conf_state & L2CAP_CONF_REQ_SENT)) { 3172 if (!(l2cap_pi(sk)->conf_state & L2CAP_CONF_REQ_SENT)) {
3155 u8 buf[64]; 3173 u8 buf[64];
3174 l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT;
3156 l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, 3175 l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
3157 l2cap_build_conf_req(sk, buf), buf); 3176 l2cap_build_conf_req(sk, buf), buf);
3158 l2cap_pi(sk)->num_conf_req++; 3177 l2cap_pi(sk)->num_conf_req++;
@@ -3225,9 +3244,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr
3225 l2cap_pi(sk)->conf_state |= L2CAP_CONF_INPUT_DONE; 3244 l2cap_pi(sk)->conf_state |= L2CAP_CONF_INPUT_DONE;
3226 3245
3227 if (l2cap_pi(sk)->conf_state & L2CAP_CONF_OUTPUT_DONE) { 3246 if (l2cap_pi(sk)->conf_state & L2CAP_CONF_OUTPUT_DONE) {
3228 if (!(l2cap_pi(sk)->conf_state & L2CAP_CONF_NO_FCS_RECV) || 3247 set_default_fcs(l2cap_pi(sk));
3229 l2cap_pi(sk)->fcs != L2CAP_FCS_NONE)
3230 l2cap_pi(sk)->fcs = L2CAP_FCS_CRC16;
3231 3248
3232 sk->sk_state = BT_CONNECTED; 3249 sk->sk_state = BT_CONNECTED;
3233 l2cap_pi(sk)->next_tx_seq = 0; 3250 l2cap_pi(sk)->next_tx_seq = 0;
@@ -4647,6 +4664,8 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl
4647 4664
4648 if (flags & ACL_START) { 4665 if (flags & ACL_START) {
4649 struct l2cap_hdr *hdr; 4666 struct l2cap_hdr *hdr;
4667 struct sock *sk;
4668 u16 cid;
4650 int len; 4669 int len;
4651 4670
4652 if (conn->rx_len) { 4671 if (conn->rx_len) {
@@ -4657,7 +4676,8 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl
4657 l2cap_conn_unreliable(conn, ECOMM); 4676 l2cap_conn_unreliable(conn, ECOMM);
4658 } 4677 }
4659 4678
4660 if (skb->len < 2) { 4679 /* Start fragment always begin with Basic L2CAP header */
4680 if (skb->len < L2CAP_HDR_SIZE) {
4661 BT_ERR("Frame is too short (len %d)", skb->len); 4681 BT_ERR("Frame is too short (len %d)", skb->len);
4662 l2cap_conn_unreliable(conn, ECOMM); 4682 l2cap_conn_unreliable(conn, ECOMM);
4663 goto drop; 4683 goto drop;
@@ -4665,6 +4685,7 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl
4665 4685
4666 hdr = (struct l2cap_hdr *) skb->data; 4686 hdr = (struct l2cap_hdr *) skb->data;
4667 len = __le16_to_cpu(hdr->len) + L2CAP_HDR_SIZE; 4687 len = __le16_to_cpu(hdr->len) + L2CAP_HDR_SIZE;
4688 cid = __le16_to_cpu(hdr->cid);
4668 4689
4669 if (len == skb->len) { 4690 if (len == skb->len) {
4670 /* Complete frame received */ 4691 /* Complete frame received */
@@ -4681,6 +4702,19 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl
4681 goto drop; 4702 goto drop;
4682 } 4703 }
4683 4704
4705 sk = l2cap_get_chan_by_scid(&conn->chan_list, cid);
4706
4707 if (sk && l2cap_pi(sk)->imtu < len - L2CAP_HDR_SIZE) {
4708 BT_ERR("Frame exceeding recv MTU (len %d, MTU %d)",
4709 len, l2cap_pi(sk)->imtu);
4710 bh_unlock_sock(sk);
4711 l2cap_conn_unreliable(conn, ECOMM);
4712 goto drop;
4713 }
4714
4715 if (sk)
4716 bh_unlock_sock(sk);
4717
4684 /* Allocate skb for the complete frame (with header) */ 4718 /* Allocate skb for the complete frame (with header) */
4685 conn->rx_skb = bt_skb_alloc(len, GFP_ATOMIC); 4719 conn->rx_skb = bt_skb_alloc(len, GFP_ATOMIC);
4686 if (!conn->rx_skb) 4720 if (!conn->rx_skb)
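Note: the bind/connect checks added above encode the L2CAP rule that a PSM's low octet must be odd while the least significant bit of its high octet must be zero, using a single mask: (psm & 0x0101) == 0x0001 tests bit 0 (must be 1) and bit 8 (must be 0) at once. A few worked values, runnable as plain C:

#include <stdio.h>
#include <stdint.h>

/* L2CAP PSM validity: low octet odd (bit 0 set) and bit 8 clear,
 * both checked with one mask as in the patch above. */
static int psm_valid(uint16_t psm)
{
	return (psm & 0x0101) == 0x0001;
}

int main(void)
{
	printf("0x1001 -> %d\n", psm_valid(0x1001));	/* 1: odd, bit 8 clear */
	printf("0x1003 -> %d\n", psm_valid(0x1003));	/* 1: odd, bit 8 clear */
	printf("0x1002 -> %d\n", psm_valid(0x1002));	/* 0: even low octet   */
	printf("0x1101 -> %d\n", psm_valid(0x1101));	/* 0: bit 8 is set     */
	return 0;
}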
diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c
index ad2af5814e40..b826d1bf10df 100644
--- a/net/bluetooth/lib.c
+++ b/net/bluetooth/lib.c
@@ -51,8 +51,8 @@ char *batostr(bdaddr_t *ba)
51 51
52 i ^= 1; 52 i ^= 1;
53 sprintf(str[i], "%2.2X:%2.2X:%2.2X:%2.2X:%2.2X:%2.2X", 53 sprintf(str[i], "%2.2X:%2.2X:%2.2X:%2.2X:%2.2X:%2.2X",
54 ba->b[0], ba->b[1], ba->b[2], 54 ba->b[5], ba->b[4], ba->b[3],
55 ba->b[3], ba->b[4], ba->b[5]); 55 ba->b[2], ba->b[1], ba->b[0]);
56 56
57 return str[i]; 57 return str[i];
58} 58}
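Note: the batostr() fix above prints the address bytes in reverse storage order: bdaddr_t appears to keep the least significant octet first (b[0]), while the conventional textual form lists the most significant octet first, so the string walks b[5] down to b[0]. A quick userspace check of that ordering; the six example bytes are arbitrary:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Address stored least-significant octet first, as bdaddr_t does */
	uint8_t b[6] = { 0x56, 0x34, 0x12, 0x78, 0x9a, 0xbc };
	char str[18];

	/* Print b[5]..b[0], matching the corrected batostr() */
	sprintf(str, "%2.2X:%2.2X:%2.2X:%2.2X:%2.2X:%2.2X",
		b[5], b[4], b[3], b[2], b[1], b[0]);
	printf("%s\n", str);	/* BC:9A:78:12:34:56 */
	return 0;
}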
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index 7dca91bb8c57..39a5d87e33b4 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -113,11 +113,10 @@ static void rfcomm_session_del(struct rfcomm_session *s);
113#define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1) 113#define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1)
114#define __get_rpn_parity(line) (((line) >> 3) & 0x7) 114#define __get_rpn_parity(line) (((line) >> 3) & 0x7)
115 115
116static inline void rfcomm_schedule(uint event) 116static inline void rfcomm_schedule(void)
117{ 117{
118 if (!rfcomm_thread) 118 if (!rfcomm_thread)
119 return; 119 return;
120 //set_bit(event, &rfcomm_event);
121 set_bit(RFCOMM_SCHED_WAKEUP, &rfcomm_event); 120 set_bit(RFCOMM_SCHED_WAKEUP, &rfcomm_event);
122 wake_up_process(rfcomm_thread); 121 wake_up_process(rfcomm_thread);
123} 122}
@@ -179,13 +178,13 @@ static unsigned char rfcomm_crc_table[256] = {
179/* FCS on 2 bytes */ 178/* FCS on 2 bytes */
180static inline u8 __fcs(u8 *data) 179static inline u8 __fcs(u8 *data)
181{ 180{
182 return (0xff - __crc(data)); 181 return 0xff - __crc(data);
183} 182}
184 183
185/* FCS on 3 bytes */ 184/* FCS on 3 bytes */
186static inline u8 __fcs2(u8 *data) 185static inline u8 __fcs2(u8 *data)
187{ 186{
188 return (0xff - rfcomm_crc_table[__crc(data) ^ data[2]]); 187 return 0xff - rfcomm_crc_table[__crc(data) ^ data[2]];
189} 188}
190 189
191/* Check FCS */ 190/* Check FCS */
@@ -203,13 +202,13 @@ static inline int __check_fcs(u8 *data, int type, u8 fcs)
203static void rfcomm_l2state_change(struct sock *sk) 202static void rfcomm_l2state_change(struct sock *sk)
204{ 203{
205 BT_DBG("%p state %d", sk, sk->sk_state); 204 BT_DBG("%p state %d", sk, sk->sk_state);
206 rfcomm_schedule(RFCOMM_SCHED_STATE); 205 rfcomm_schedule();
207} 206}
208 207
209static void rfcomm_l2data_ready(struct sock *sk, int bytes) 208static void rfcomm_l2data_ready(struct sock *sk, int bytes)
210{ 209{
211 BT_DBG("%p bytes %d", sk, bytes); 210 BT_DBG("%p bytes %d", sk, bytes);
212 rfcomm_schedule(RFCOMM_SCHED_RX); 211 rfcomm_schedule();
213} 212}
214 213
215static int rfcomm_l2sock_create(struct socket **sock) 214static int rfcomm_l2sock_create(struct socket **sock)
@@ -255,7 +254,7 @@ static void rfcomm_session_timeout(unsigned long arg)
255 BT_DBG("session %p state %ld", s, s->state); 254 BT_DBG("session %p state %ld", s, s->state);
256 255
257 set_bit(RFCOMM_TIMED_OUT, &s->flags); 256 set_bit(RFCOMM_TIMED_OUT, &s->flags);
258 rfcomm_schedule(RFCOMM_SCHED_TIMEO); 257 rfcomm_schedule();
259} 258}
260 259
261static void rfcomm_session_set_timer(struct rfcomm_session *s, long timeout) 260static void rfcomm_session_set_timer(struct rfcomm_session *s, long timeout)
@@ -283,7 +282,7 @@ static void rfcomm_dlc_timeout(unsigned long arg)
283 282
284 set_bit(RFCOMM_TIMED_OUT, &d->flags); 283 set_bit(RFCOMM_TIMED_OUT, &d->flags);
285 rfcomm_dlc_put(d); 284 rfcomm_dlc_put(d);
286 rfcomm_schedule(RFCOMM_SCHED_TIMEO); 285 rfcomm_schedule();
287} 286}
288 287
289static void rfcomm_dlc_set_timer(struct rfcomm_dlc *d, long timeout) 288static void rfcomm_dlc_set_timer(struct rfcomm_dlc *d, long timeout)
@@ -465,7 +464,7 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
465 case BT_CONFIG: 464 case BT_CONFIG:
466 if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) { 465 if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
467 set_bit(RFCOMM_AUTH_REJECT, &d->flags); 466 set_bit(RFCOMM_AUTH_REJECT, &d->flags);
468 rfcomm_schedule(RFCOMM_SCHED_AUTH); 467 rfcomm_schedule();
469 break; 468 break;
470 } 469 }
471 /* Fall through */ 470 /* Fall through */
@@ -485,7 +484,7 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
485 case BT_CONNECT2: 484 case BT_CONNECT2:
486 if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) { 485 if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
487 set_bit(RFCOMM_AUTH_REJECT, &d->flags); 486 set_bit(RFCOMM_AUTH_REJECT, &d->flags);
488 rfcomm_schedule(RFCOMM_SCHED_AUTH); 487 rfcomm_schedule();
489 break; 488 break;
490 } 489 }
491 /* Fall through */ 490 /* Fall through */
@@ -533,7 +532,7 @@ int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb)
533 skb_queue_tail(&d->tx_queue, skb); 532 skb_queue_tail(&d->tx_queue, skb);
534 533
535 if (!test_bit(RFCOMM_TX_THROTTLED, &d->flags)) 534 if (!test_bit(RFCOMM_TX_THROTTLED, &d->flags))
536 rfcomm_schedule(RFCOMM_SCHED_TX); 535 rfcomm_schedule();
537 return len; 536 return len;
538} 537}
539 538
@@ -545,7 +544,7 @@ void __rfcomm_dlc_throttle(struct rfcomm_dlc *d)
545 d->v24_sig |= RFCOMM_V24_FC; 544 d->v24_sig |= RFCOMM_V24_FC;
546 set_bit(RFCOMM_MSC_PENDING, &d->flags); 545 set_bit(RFCOMM_MSC_PENDING, &d->flags);
547 } 546 }
548 rfcomm_schedule(RFCOMM_SCHED_TX); 547 rfcomm_schedule();
549} 548}
550 549
551void __rfcomm_dlc_unthrottle(struct rfcomm_dlc *d) 550void __rfcomm_dlc_unthrottle(struct rfcomm_dlc *d)
@@ -556,7 +555,7 @@ void __rfcomm_dlc_unthrottle(struct rfcomm_dlc *d)
556 d->v24_sig &= ~RFCOMM_V24_FC; 555 d->v24_sig &= ~RFCOMM_V24_FC;
557 set_bit(RFCOMM_MSC_PENDING, &d->flags); 556 set_bit(RFCOMM_MSC_PENDING, &d->flags);
558 } 557 }
559 rfcomm_schedule(RFCOMM_SCHED_TX); 558 rfcomm_schedule();
560} 559}
561 560
562/* 561/*
@@ -577,7 +576,7 @@ int rfcomm_dlc_set_modem_status(struct rfcomm_dlc *d, u8 v24_sig)
577 d->v24_sig = v24_sig; 576 d->v24_sig = v24_sig;
578 577
579 if (!test_and_set_bit(RFCOMM_MSC_PENDING, &d->flags)) 578 if (!test_and_set_bit(RFCOMM_MSC_PENDING, &d->flags))
580 rfcomm_schedule(RFCOMM_SCHED_TX); 579 rfcomm_schedule();
581 580
582 return 0; 581 return 0;
583} 582}
@@ -816,7 +815,7 @@ static int rfcomm_queue_disc(struct rfcomm_dlc *d)
816 cmd->fcs = __fcs2((u8 *) cmd); 815 cmd->fcs = __fcs2((u8 *) cmd);
817 816
818 skb_queue_tail(&d->tx_queue, skb); 817 skb_queue_tail(&d->tx_queue, skb);
819 rfcomm_schedule(RFCOMM_SCHED_TX); 818 rfcomm_schedule();
820 return 0; 819 return 0;
821} 820}
822 821
@@ -1415,8 +1414,8 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
1415 return 0; 1414 return 0;
1416 1415
1417 if (len == 1) { 1416 if (len == 1) {
1418 /* This is a request, return default settings */ 1417 /* This is a request, return default (according to ETSI TS 07.10) settings */
1419 bit_rate = RFCOMM_RPN_BR_115200; 1418 bit_rate = RFCOMM_RPN_BR_9600;
1420 data_bits = RFCOMM_RPN_DATA_8; 1419 data_bits = RFCOMM_RPN_DATA_8;
1421 stop_bits = RFCOMM_RPN_STOP_1; 1420 stop_bits = RFCOMM_RPN_STOP_1;
1422 parity = RFCOMM_RPN_PARITY_NONE; 1421 parity = RFCOMM_RPN_PARITY_NONE;
@@ -1431,9 +1430,9 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
1431 1430
1432 if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_BITRATE)) { 1431 if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_BITRATE)) {
1433 bit_rate = rpn->bit_rate; 1432 bit_rate = rpn->bit_rate;
1434 if (bit_rate != RFCOMM_RPN_BR_115200) { 1433 if (bit_rate > RFCOMM_RPN_BR_230400) {
1435 BT_DBG("RPN bit rate mismatch 0x%x", bit_rate); 1434 BT_DBG("RPN bit rate mismatch 0x%x", bit_rate);
1436 bit_rate = RFCOMM_RPN_BR_115200; 1435 bit_rate = RFCOMM_RPN_BR_9600;
1437 rpn_mask ^= RFCOMM_RPN_PM_BITRATE; 1436 rpn_mask ^= RFCOMM_RPN_PM_BITRATE;
1438 } 1437 }
1439 } 1438 }
@@ -1698,7 +1697,7 @@ static int rfcomm_recv_frame(struct rfcomm_session *s, struct sk_buff *skb)
1698 break; 1697 break;
1699 1698
1700 default: 1699 default:
1701 BT_ERR("Unknown packet type 0x%02x\n", type); 1700 BT_ERR("Unknown packet type 0x%02x", type);
1702 break; 1701 break;
1703 } 1702 }
1704 kfree_skb(skb); 1703 kfree_skb(skb);
@@ -1884,7 +1883,7 @@ static inline void rfcomm_accept_connection(struct rfcomm_session *s)
1884 * L2CAP MTU minus UIH header and FCS. */ 1883 * L2CAP MTU minus UIH header and FCS. */
1885 s->mtu = min(l2cap_pi(nsock->sk)->omtu, l2cap_pi(nsock->sk)->imtu) - 5; 1884 s->mtu = min(l2cap_pi(nsock->sk)->omtu, l2cap_pi(nsock->sk)->imtu) - 5;
1886 1885
1887 rfcomm_schedule(RFCOMM_SCHED_RX); 1886 rfcomm_schedule();
1888 } else 1887 } else
1889 sock_release(nsock); 1888 sock_release(nsock);
1890} 1889}
@@ -2093,7 +2092,7 @@ static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
2093 2092
2094 rfcomm_session_put(s); 2093 rfcomm_session_put(s);
2095 2094
2096 rfcomm_schedule(RFCOMM_SCHED_AUTH); 2095 rfcomm_schedule();
2097} 2096}
2098 2097
2099static struct hci_cb rfcomm_cb = { 2098static struct hci_cb rfcomm_cb = {
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 44a623275951..aec505f934df 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -82,11 +82,14 @@ static void rfcomm_sk_data_ready(struct rfcomm_dlc *d, struct sk_buff *skb)
82static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) 82static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err)
83{ 83{
84 struct sock *sk = d->owner, *parent; 84 struct sock *sk = d->owner, *parent;
85 unsigned long flags;
86
85 if (!sk) 87 if (!sk)
86 return; 88 return;
87 89
88 BT_DBG("dlc %p state %ld err %d", d, d->state, err); 90 BT_DBG("dlc %p state %ld err %d", d, d->state, err);
89 91
92 local_irq_save(flags);
90 bh_lock_sock(sk); 93 bh_lock_sock(sk);
91 94
92 if (err) 95 if (err)
@@ -108,6 +111,7 @@ static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err)
108 } 111 }
109 112
110 bh_unlock_sock(sk); 113 bh_unlock_sock(sk);
114 local_irq_restore(flags);
111 115
112 if (parent && sock_flag(sk, SOCK_ZAPPED)) { 116 if (parent && sock_flag(sk, SOCK_ZAPPED)) {
113 /* We have to drop DLC lock here, otherwise 117 /* We have to drop DLC lock here, otherwise
@@ -617,121 +621,29 @@ static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
617 return sent; 621 return sent;
618} 622}
619 623
620static long rfcomm_sock_data_wait(struct sock *sk, long timeo)
621{
622 DECLARE_WAITQUEUE(wait, current);
623
624 add_wait_queue(sk_sleep(sk), &wait);
625 for (;;) {
626 set_current_state(TASK_INTERRUPTIBLE);
627
628 if (!skb_queue_empty(&sk->sk_receive_queue) ||
629 sk->sk_err ||
630 (sk->sk_shutdown & RCV_SHUTDOWN) ||
631 signal_pending(current) ||
632 !timeo)
633 break;
634
635 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
636 release_sock(sk);
637 timeo = schedule_timeout(timeo);
638 lock_sock(sk);
639 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
640 }
641
642 __set_current_state(TASK_RUNNING);
643 remove_wait_queue(sk_sleep(sk), &wait);
644 return timeo;
645}
646
647static int rfcomm_sock_recvmsg(struct kiocb *iocb, struct socket *sock, 624static int rfcomm_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
648 struct msghdr *msg, size_t size, int flags) 625 struct msghdr *msg, size_t size, int flags)
649{ 626{
650 struct sock *sk = sock->sk; 627 struct sock *sk = sock->sk;
651 struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc; 628 struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc;
652 int err = 0; 629 int len;
653 size_t target, copied = 0;
654 long timeo;
655 630
656 if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) { 631 if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
657 rfcomm_dlc_accept(d); 632 rfcomm_dlc_accept(d);
658 return 0; 633 return 0;
659 } 634 }
660 635
661 if (flags & MSG_OOB) 636 len = bt_sock_stream_recvmsg(iocb, sock, msg, size, flags);
662 return -EOPNOTSUPP;
663
664 msg->msg_namelen = 0;
665
666 BT_DBG("sk %p size %zu", sk, size);
667 637
668 lock_sock(sk); 638 lock_sock(sk);
639 if (!(flags & MSG_PEEK) && len > 0)
640 atomic_sub(len, &sk->sk_rmem_alloc);
669 641
670 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
671 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
672
673 do {
674 struct sk_buff *skb;
675 int chunk;
676
677 skb = skb_dequeue(&sk->sk_receive_queue);
678 if (!skb) {
679 if (copied >= target)
680 break;
681
682 if ((err = sock_error(sk)) != 0)
683 break;
684 if (sk->sk_shutdown & RCV_SHUTDOWN)
685 break;
686
687 err = -EAGAIN;
688 if (!timeo)
689 break;
690
691 timeo = rfcomm_sock_data_wait(sk, timeo);
692
693 if (signal_pending(current)) {
694 err = sock_intr_errno(timeo);
695 goto out;
696 }
697 continue;
698 }
699
700 chunk = min_t(unsigned int, skb->len, size);
701 if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
702 skb_queue_head(&sk->sk_receive_queue, skb);
703 if (!copied)
704 copied = -EFAULT;
705 break;
706 }
707 copied += chunk;
708 size -= chunk;
709
710 sock_recv_ts_and_drops(msg, sk, skb);
711
712 if (!(flags & MSG_PEEK)) {
713 atomic_sub(chunk, &sk->sk_rmem_alloc);
714
715 skb_pull(skb, chunk);
716 if (skb->len) {
717 skb_queue_head(&sk->sk_receive_queue, skb);
718 break;
719 }
720 kfree_skb(skb);
721
722 } else {
723 /* put message back and return */
724 skb_queue_head(&sk->sk_receive_queue, skb);
725 break;
726 }
727 } while (size);
728
729out:
730 if (atomic_read(&sk->sk_rmem_alloc) <= (sk->sk_rcvbuf >> 2)) 642 if (atomic_read(&sk->sk_rmem_alloc) <= (sk->sk_rcvbuf >> 2))
731 rfcomm_dlc_unthrottle(rfcomm_pi(sk)->dlc); 643 rfcomm_dlc_unthrottle(rfcomm_pi(sk)->dlc);
732
733 release_sock(sk); 644 release_sock(sk);
734 return copied ? : err; 645
646 return len;
735} 647}
736 648
737static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, unsigned int optlen) 649static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, unsigned int optlen)
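Editorial note on the recvmsg rewrite above: once the generic Bluetooth stream receive does the copying, the only RFCOMM-specific work left is receive-memory accounting and unthrottling the remote side when the queue drains. The following standalone sketch is illustrative only (RCVBUF, rmem_alloc and after_receive are invented names, not kernel API); it shows the quarter-of-rcvbuf threshold idea, not the actual sock code.

/* Illustrative sketch of the flow-control step kept in the new
 * recvmsg path: consumed bytes are subtracted from the receive-memory
 * count and the sender is unthrottled once usage falls below a
 * quarter of the receive buffer. */
#include <stdio.h>

#define RCVBUF 4096

static int rmem_alloc = 4000;   /* bytes currently queued */

static void after_receive(int copied)
{
        rmem_alloc -= copied;
        if (rmem_alloc <= RCVBUF / 4)
                printf("unthrottle sender (rmem %d)\n", rmem_alloc);
        else
                printf("keep throttled (rmem %d)\n", rmem_alloc);
}

int main(void)
{
        after_receive(1000);   /* 3000 left: still above threshold */
        after_receive(2200);   /* 800 left: at or below 1024, unthrottle */
        return 0;
}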
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index befc3a52aa04..a9b81f5dacd1 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -183,9 +183,7 @@ static struct device *rfcomm_get_device(struct rfcomm_dev *dev)
183static ssize_t show_address(struct device *tty_dev, struct device_attribute *attr, char *buf) 183static ssize_t show_address(struct device *tty_dev, struct device_attribute *attr, char *buf)
184{ 184{
185 struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); 185 struct rfcomm_dev *dev = dev_get_drvdata(tty_dev);
186 bdaddr_t bdaddr; 186 return sprintf(buf, "%s\n", batostr(&dev->dst));
187 baswap(&bdaddr, &dev->dst);
188 return sprintf(buf, "%s\n", batostr(&bdaddr));
189} 187}
190 188
191static ssize_t show_channel(struct device *tty_dev, struct device_attribute *attr, char *buf) 189static ssize_t show_channel(struct device *tty_dev, struct device_attribute *attr, char *buf)
@@ -844,10 +842,6 @@ static int rfcomm_tty_ioctl(struct tty_struct *tty, struct file *filp, unsigned
844 BT_DBG("TIOCMIWAIT"); 842 BT_DBG("TIOCMIWAIT");
845 break; 843 break;
846 844
847 case TIOCGICOUNT:
848 BT_DBG("TIOCGICOUNT");
849 break;
850
851 case TIOCGSERIAL: 845 case TIOCGSERIAL:
852 BT_ERR("TIOCGSERIAL is not supported"); 846 BT_ERR("TIOCGSERIAL is not supported");
853 return -ENOIOCTLCMD; 847 return -ENOIOCTLCMD;
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index cf09fe591fc2..17cb0b633576 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -212,6 +212,11 @@ static int br_set_tx_csum(struct net_device *dev, u32 data)
212 return 0; 212 return 0;
213} 213}
214 214
215static int br_set_flags(struct net_device *netdev, u32 data)
216{
217 return ethtool_op_set_flags(netdev, data, ETH_FLAG_TXVLAN);
218}
219
215#ifdef CONFIG_NET_POLL_CONTROLLER 220#ifdef CONFIG_NET_POLL_CONTROLLER
216static void br_poll_controller(struct net_device *br_dev) 221static void br_poll_controller(struct net_device *br_dev)
217{ 222{
@@ -304,6 +309,7 @@ static const struct ethtool_ops br_ethtool_ops = {
304 .get_ufo = ethtool_op_get_ufo, 309 .get_ufo = ethtool_op_get_ufo,
305 .set_ufo = ethtool_op_set_ufo, 310 .set_ufo = ethtool_op_set_ufo,
306 .get_flags = ethtool_op_get_flags, 311 .get_flags = ethtool_op_get_flags,
312 .set_flags = br_set_flags,
307}; 313};
308 314
309static const struct net_device_ops br_netdev_ops = { 315static const struct net_device_ops br_netdev_ops = {
@@ -343,5 +349,5 @@ void br_dev_setup(struct net_device *dev)
343 349
344 dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | 350 dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
345 NETIF_F_GSO_MASK | NETIF_F_NO_CSUM | NETIF_F_LLTX | 351 NETIF_F_GSO_MASK | NETIF_F_NO_CSUM | NETIF_F_LLTX |
346 NETIF_F_NETNS_LOCAL | NETIF_F_GSO; 352 NETIF_F_NETNS_LOCAL | NETIF_F_GSO | NETIF_F_HW_VLAN_TX;
347} 353}
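Editorial note on the br_device.c hunk above: the new set_flags ethtool op only has to reject flag bits the bridge does not support. The sketch below models that masking in plain userspace C; FLAG_TXVLAN, SUPPORTED_FLAGS and set_device_flags are illustrative names and values, not the kernel's ethtool interface.

/* Illustrative sketch: accept only flag bits the device supports,
 * mirroring what a restrictive set_flags ethtool op enforces. */
#include <stdio.h>
#include <errno.h>

#define FLAG_TXVLAN  0x0080u           /* hypothetical bit value */
#define SUPPORTED_FLAGS (FLAG_TXVLAN)  /* bits this device allows */

static unsigned int dev_flags;         /* current device flags */

static int set_device_flags(unsigned int requested)
{
        if (requested & ~SUPPORTED_FLAGS)
                return -EINVAL;        /* reject unsupported bits */
        dev_flags = requested;
        return 0;
}

int main(void)
{
        printf("txvlan on : %d\n", set_device_flags(FLAG_TXVLAN)); /* 0 */
        printf("bad bit   : %d\n", set_device_flags(0x1000u));     /* -EINVAL */
        printf("flags now : 0x%04x\n", dev_flags);
        return 0;
}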
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index c03d2c3ff03e..89ad25a76202 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -61,30 +61,27 @@ static int port_cost(struct net_device *dev)
61} 61}
62 62
63 63
64/* 64/* Check for port carrier transistions. */
65 * Check for port carrier transistions.
66 * Called from work queue to allow for calling functions that
67 * might sleep (such as speed check), and to debounce.
68 */
69void br_port_carrier_check(struct net_bridge_port *p) 65void br_port_carrier_check(struct net_bridge_port *p)
70{ 66{
71 struct net_device *dev = p->dev; 67 struct net_device *dev = p->dev;
72 struct net_bridge *br = p->br; 68 struct net_bridge *br = p->br;
73 69
74 if (netif_carrier_ok(dev)) 70 if (netif_running(dev) && netif_carrier_ok(dev))
75 p->path_cost = port_cost(dev); 71 p->path_cost = port_cost(dev);
76 72
77 if (netif_running(br->dev)) { 73 if (!netif_running(br->dev))
78 spin_lock_bh(&br->lock); 74 return;
79 if (netif_carrier_ok(dev)) { 75
80 if (p->state == BR_STATE_DISABLED) 76 spin_lock_bh(&br->lock);
81 br_stp_enable_port(p); 77 if (netif_running(dev) && netif_carrier_ok(dev)) {
82 } else { 78 if (p->state == BR_STATE_DISABLED)
83 if (p->state != BR_STATE_DISABLED) 79 br_stp_enable_port(p);
84 br_stp_disable_port(p); 80 } else {
85 } 81 if (p->state != BR_STATE_DISABLED)
86 spin_unlock_bh(&br->lock); 82 br_stp_disable_port(p);
87 } 83 }
84 spin_unlock_bh(&br->lock);
88} 85}
89 86
90static void release_nbp(struct kobject *kobj) 87static void release_nbp(struct kobject *kobj)
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 826cd5221536..25207a1f182b 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -141,7 +141,7 @@ struct sk_buff *br_handle_frame(struct sk_buff *skb)
141 const unsigned char *dest = eth_hdr(skb)->h_dest; 141 const unsigned char *dest = eth_hdr(skb)->h_dest;
142 int (*rhook)(struct sk_buff *skb); 142 int (*rhook)(struct sk_buff *skb);
143 143
144 if (skb->pkt_type == PACKET_LOOPBACK) 144 if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
145 return skb; 145 return skb;
146 146
147 if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) 147 if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
@@ -159,7 +159,7 @@ struct sk_buff *br_handle_frame(struct sk_buff *skb)
159 goto drop; 159 goto drop;
160 160
161 /* If STP is turned off, then forward */ 161 /* If STP is turned off, then forward */
162 if (p->br->stp_enabled == BR_NO_STP && dest[5] == 0) 162 if (p->br->stp_enabled == BR_NO_STP)
163 goto forward; 163 goto forward;
164 164
165 if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev, 165 if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev,
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 2c911c0759c2..865fd7634b67 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -64,22 +64,24 @@ static int brnf_filter_pppoe_tagged __read_mostly = 0;
64 64
65static inline __be16 vlan_proto(const struct sk_buff *skb) 65static inline __be16 vlan_proto(const struct sk_buff *skb)
66{ 66{
67 return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; 67 if (vlan_tx_tag_present(skb))
68 return skb->protocol;
69 else if (skb->protocol == htons(ETH_P_8021Q))
70 return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
71 else
72 return 0;
68} 73}
69 74
70#define IS_VLAN_IP(skb) \ 75#define IS_VLAN_IP(skb) \
71 (skb->protocol == htons(ETH_P_8021Q) && \ 76 (vlan_proto(skb) == htons(ETH_P_IP) && \
72 vlan_proto(skb) == htons(ETH_P_IP) && \
73 brnf_filter_vlan_tagged) 77 brnf_filter_vlan_tagged)
74 78
75#define IS_VLAN_IPV6(skb) \ 79#define IS_VLAN_IPV6(skb) \
76 (skb->protocol == htons(ETH_P_8021Q) && \ 80 (vlan_proto(skb) == htons(ETH_P_IPV6) && \
77 vlan_proto(skb) == htons(ETH_P_IPV6) &&\
78 brnf_filter_vlan_tagged) 81 brnf_filter_vlan_tagged)
79 82
80#define IS_VLAN_ARP(skb) \ 83#define IS_VLAN_ARP(skb) \
81 (skb->protocol == htons(ETH_P_8021Q) && \ 84 (vlan_proto(skb) == htons(ETH_P_ARP) && \
82 vlan_proto(skb) == htons(ETH_P_ARP) && \
83 brnf_filter_vlan_tagged) 85 brnf_filter_vlan_tagged)
84 86
85static inline __be16 pppoe_proto(const struct sk_buff *skb) 87static inline __be16 pppoe_proto(const struct sk_buff *skb)
@@ -106,7 +108,6 @@ static struct dst_ops fake_dst_ops = {
106 .family = AF_INET, 108 .family = AF_INET,
107 .protocol = cpu_to_be16(ETH_P_IP), 109 .protocol = cpu_to_be16(ETH_P_IP),
108 .update_pmtu = fake_update_pmtu, 110 .update_pmtu = fake_update_pmtu,
109 .entries = ATOMIC_INIT(0),
110}; 111};
111 112
112/* 113/*
@@ -162,8 +163,8 @@ static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb)
162 if (tmp) { 163 if (tmp) {
163 memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info)); 164 memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info));
164 atomic_set(&tmp->use, 1); 165 atomic_set(&tmp->use, 1);
165 nf_bridge_put(nf_bridge);
166 } 166 }
167 nf_bridge_put(nf_bridge);
167 nf_bridge = tmp; 168 nf_bridge = tmp;
168 } 169 }
169 return nf_bridge; 170 return nf_bridge;
@@ -209,6 +210,72 @@ static inline void nf_bridge_update_protocol(struct sk_buff *skb)
209 skb->protocol = htons(ETH_P_PPP_SES); 210 skb->protocol = htons(ETH_P_PPP_SES);
210} 211}
211 212
213/* When handing a packet over to the IP layer
214 * check whether we have a skb that is in the
215 * expected format
216 */
217
218static int br_parse_ip_options(struct sk_buff *skb)
219{
220 struct ip_options *opt;
221 struct iphdr *iph;
222 struct net_device *dev = skb->dev;
223 u32 len;
224
225 iph = ip_hdr(skb);
226 opt = &(IPCB(skb)->opt);
227
228 /* Basic sanity checks */
229 if (iph->ihl < 5 || iph->version != 4)
230 goto inhdr_error;
231
232 if (!pskb_may_pull(skb, iph->ihl*4))
233 goto inhdr_error;
234
235 iph = ip_hdr(skb);
236 if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
237 goto inhdr_error;
238
239 len = ntohs(iph->tot_len);
240 if (skb->len < len) {
241 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
242 goto drop;
243 } else if (len < (iph->ihl*4))
244 goto inhdr_error;
245
246 if (pskb_trim_rcsum(skb, len)) {
247 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
248 goto drop;
249 }
250
251 /* Zero out the CB buffer if no options present */
252 if (iph->ihl == 5) {
253 memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
254 return 0;
255 }
256
257 opt->optlen = iph->ihl*4 - sizeof(struct iphdr);
258 if (ip_options_compile(dev_net(dev), opt, skb))
259 goto inhdr_error;
260
261 /* Check correct handling of SRR option */
262 if (unlikely(opt->srr)) {
263 struct in_device *in_dev = __in_dev_get_rcu(dev);
264 if (in_dev && !IN_DEV_SOURCE_ROUTE(in_dev))
265 goto drop;
266
267 if (ip_options_rcv_srr(skb))
268 goto drop;
269 }
270
271 return 0;
272
273inhdr_error:
274 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
275drop:
276 return -1;
277}
278
212/* Fill in the header for fragmented IP packets handled by 279/* Fill in the header for fragmented IP packets handled by
213 * the IPv4 connection tracking code. 280 * the IPv4 connection tracking code.
214 */ 281 */
@@ -549,7 +616,6 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
549{ 616{
550 struct net_bridge_port *p; 617 struct net_bridge_port *p;
551 struct net_bridge *br; 618 struct net_bridge *br;
552 struct iphdr *iph;
553 __u32 len = nf_bridge_encap_header_len(skb); 619 __u32 len = nf_bridge_encap_header_len(skb);
554 620
555 if (unlikely(!pskb_may_pull(skb, len))) 621 if (unlikely(!pskb_may_pull(skb, len)))
@@ -578,28 +644,9 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
578 644
579 nf_bridge_pull_encap_header_rcsum(skb); 645 nf_bridge_pull_encap_header_rcsum(skb);
580 646
581 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 647 if (br_parse_ip_options(skb))
582 goto inhdr_error; 648 /* Drop invalid packet */
583 649 goto out;
584 iph = ip_hdr(skb);
585 if (iph->ihl < 5 || iph->version != 4)
586 goto inhdr_error;
587
588 if (!pskb_may_pull(skb, 4 * iph->ihl))
589 goto inhdr_error;
590
591 iph = ip_hdr(skb);
592 if (ip_fast_csum((__u8 *) iph, iph->ihl) != 0)
593 goto inhdr_error;
594
595 len = ntohs(iph->tot_len);
596 if (skb->len < len || len < 4 * iph->ihl)
597 goto inhdr_error;
598
599 pskb_trim_rcsum(skb, len);
600
601 /* BUG: Should really parse the IP options here. */
602 memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
603 650
604 nf_bridge_put(skb->nf_bridge); 651 nf_bridge_put(skb->nf_bridge);
605 if (!nf_bridge_alloc(skb)) 652 if (!nf_bridge_alloc(skb))
@@ -614,8 +661,6 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
614 661
615 return NF_STOLEN; 662 return NF_STOLEN;
616 663
617inhdr_error:
618// IP_INC_STATS_BH(IpInHdrErrors);
619out: 664out:
620 return NF_DROP; 665 return NF_DROP;
621} 666}
@@ -759,12 +804,19 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff *skb,
759#if defined(CONFIG_NF_CONNTRACK_IPV4) || defined(CONFIG_NF_CONNTRACK_IPV4_MODULE) 804#if defined(CONFIG_NF_CONNTRACK_IPV4) || defined(CONFIG_NF_CONNTRACK_IPV4_MODULE)
760static int br_nf_dev_queue_xmit(struct sk_buff *skb) 805static int br_nf_dev_queue_xmit(struct sk_buff *skb)
761{ 806{
807 int ret;
808
762 if (skb->nfct != NULL && skb->protocol == htons(ETH_P_IP) && 809 if (skb->nfct != NULL && skb->protocol == htons(ETH_P_IP) &&
763 skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu && 810 skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
764 !skb_is_gso(skb)) 811 !skb_is_gso(skb)) {
765 return ip_fragment(skb, br_dev_queue_push_xmit); 812 if (br_parse_ip_options(skb))
766 else 813 /* Drop invalid packet */
767 return br_dev_queue_push_xmit(skb); 814 return NF_DROP;
815 ret = ip_fragment(skb, br_dev_queue_push_xmit);
816 } else
817 ret = br_dev_queue_push_xmit(skb);
818
819 return ret;
768} 820}
769#else 821#else
770static int br_nf_dev_queue_xmit(struct sk_buff *skb) 822static int br_nf_dev_queue_xmit(struct sk_buff *skb)
@@ -952,15 +1004,22 @@ int __init br_netfilter_init(void)
952{ 1004{
953 int ret; 1005 int ret;
954 1006
955 ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); 1007 ret = dst_entries_init(&fake_dst_ops);
956 if (ret < 0) 1008 if (ret < 0)
957 return ret; 1009 return ret;
1010
1011 ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
1012 if (ret < 0) {
1013 dst_entries_destroy(&fake_dst_ops);
1014 return ret;
1015 }
958#ifdef CONFIG_SYSCTL 1016#ifdef CONFIG_SYSCTL
959 brnf_sysctl_header = register_sysctl_paths(brnf_path, brnf_table); 1017 brnf_sysctl_header = register_sysctl_paths(brnf_path, brnf_table);
960 if (brnf_sysctl_header == NULL) { 1018 if (brnf_sysctl_header == NULL) {
961 printk(KERN_WARNING 1019 printk(KERN_WARNING
962 "br_netfilter: can't register to sysctl.\n"); 1020 "br_netfilter: can't register to sysctl.\n");
963 nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); 1021 nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
1022 dst_entries_destroy(&fake_dst_ops);
964 return -ENOMEM; 1023 return -ENOMEM;
965 } 1024 }
966#endif 1025#endif
@@ -974,4 +1033,5 @@ void br_netfilter_fini(void)
974#ifdef CONFIG_SYSCTL 1033#ifdef CONFIG_SYSCTL
975 unregister_sysctl_table(brnf_sysctl_header); 1034 unregister_sysctl_table(brnf_sysctl_header);
976#endif 1035#endif
1036 dst_entries_destroy(&fake_dst_ops);
977} 1037}
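Editorial note on the br_netfilter.c hunk above: br_parse_ip_options() centralises the IPv4 sanity checks that used to live inline in br_nf_pre_routing(). The standalone sketch below is a simplified model of those checks over a raw buffer (version, header length, total length); checksum verification and option parsing are deliberately omitted, and ipv4_header_sane is an invented name, not the kernel helper.

/* Illustrative sketch of basic IPv4 header validation. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static int ipv4_header_sane(const uint8_t *pkt, size_t len)
{
        size_t ihl, tot_len;

        if (len < 20)                        /* minimum IPv4 header */
                return 0;
        if ((pkt[0] >> 4) != 4)              /* version must be 4 */
                return 0;
        ihl = (size_t)(pkt[0] & 0x0f) * 4;   /* header length in bytes */
        if (ihl < 20 || ihl > len)
                return 0;
        tot_len = ((size_t)pkt[2] << 8) | pkt[3];
        if (tot_len < ihl || tot_len > len)  /* bogus or truncated length */
                return 0;
        return 1;
}

int main(void)
{
        uint8_t hdr[20] = { 0x45, 0, 0, 20 }; /* version 4, ihl 5, len 20 */
        printf("sane: %d\n", ipv4_header_sane(hdr, sizeof(hdr)));
        return 0;
}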
diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c
index 87b53b3a921d..eae67bf0446c 100644
--- a/net/bridge/netfilter/ebt_vlan.c
+++ b/net/bridge/netfilter/ebt_vlan.c
@@ -39,8 +39,6 @@ static bool
39ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par) 39ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par)
40{ 40{
41 const struct ebt_vlan_info *info = par->matchinfo; 41 const struct ebt_vlan_info *info = par->matchinfo;
42 const struct vlan_hdr *fp;
43 struct vlan_hdr _frame;
44 42
45 unsigned short TCI; /* Whole TCI, given from parsed frame */ 43 unsigned short TCI; /* Whole TCI, given from parsed frame */
46 unsigned short id; /* VLAN ID, given from frame TCI */ 44 unsigned short id; /* VLAN ID, given from frame TCI */
@@ -48,9 +46,20 @@ ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par)
48 /* VLAN encapsulated Type/Length field, given from orig frame */ 46 /* VLAN encapsulated Type/Length field, given from orig frame */
49 __be16 encap; 47 __be16 encap;
50 48
51 fp = skb_header_pointer(skb, 0, sizeof(_frame), &_frame); 49 if (vlan_tx_tag_present(skb)) {
52 if (fp == NULL) 50 TCI = vlan_tx_tag_get(skb);
53 return false; 51 encap = skb->protocol;
52 } else {
53 const struct vlan_hdr *fp;
54 struct vlan_hdr _frame;
55
56 fp = skb_header_pointer(skb, 0, sizeof(_frame), &_frame);
57 if (fp == NULL)
58 return false;
59
60 TCI = ntohs(fp->h_vlan_TCI);
61 encap = fp->h_vlan_encapsulated_proto;
62 }
54 63
55 /* Tag Control Information (TCI) consists of the following elements: 64 /* Tag Control Information (TCI) consists of the following elements:
56 * - User_priority. The user_priority field is three bits in length, 65 * - User_priority. The user_priority field is three bits in length,
@@ -59,10 +68,8 @@ ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par)
59 * (CFI) is a single bit flag value. Currently ignored. 68 * (CFI) is a single bit flag value. Currently ignored.
60 * - VLAN Identifier (VID). The VID is encoded as 69 * - VLAN Identifier (VID). The VID is encoded as
61 * an unsigned binary number. */ 70 * an unsigned binary number. */
62 TCI = ntohs(fp->h_vlan_TCI);
63 id = TCI & VLAN_VID_MASK; 71 id = TCI & VLAN_VID_MASK;
64 prio = (TCI >> 13) & 0x7; 72 prio = (TCI >> 13) & 0x7;
65 encap = fp->h_vlan_encapsulated_proto;
66 73
67 /* Checking VLAN Identifier (VID) */ 74 /* Checking VLAN Identifier (VID) */
68 if (GET_BITMASK(EBT_VLAN_ID)) 75 if (GET_BITMASK(EBT_VLAN_ID))
@@ -111,10 +118,10 @@ static int ebt_vlan_mt_check(const struct xt_mtchk_param *par)
111 * 0 - The null VLAN ID. 118 * 0 - The null VLAN ID.
112 * 1 - The default Port VID (PVID) 119 * 1 - The default Port VID (PVID)
113 * 0x0FFF - Reserved for implementation use. 120 * 0x0FFF - Reserved for implementation use.
114 * if_vlan.h: VLAN_GROUP_ARRAY_LEN 4096. */ 121 * if_vlan.h: VLAN_N_VID 4096. */
115 if (GET_BITMASK(EBT_VLAN_ID)) { 122 if (GET_BITMASK(EBT_VLAN_ID)) {
116 if (!!info->id) { /* if id!=0 => check vid range */ 123 if (!!info->id) { /* if id!=0 => check vid range */
117 if (info->id > VLAN_GROUP_ARRAY_LEN) { 124 if (info->id > VLAN_N_VID) {
118 pr_debug("id %d is out of range (1-4096)\n", 125 pr_debug("id %d is out of range (1-4096)\n",
119 info->id); 126 info->id);
120 return -EINVAL; 127 return -EINVAL;
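Editorial note on the ebt_vlan.c hunk above: whichever path supplies the TCI (in-skb tag or in-frame header), the match then splits it exactly as the quoted comment describes. A minimal standalone decode of that layout is sketched below; the sample TCI value and VID_MASK name are illustrative, while the bit positions follow the 802.1Q layout referenced in the comment.

/* Illustrative sketch: splitting a Tag Control Information word
 * into user priority, CFI and VLAN ID. */
#include <stdio.h>
#include <stdint.h>

#define VID_MASK 0x0fffu

int main(void)
{
        uint16_t tci  = 0xa064;             /* example TCI value */
        unsigned prio = (tci >> 13) & 0x7;  /* 3-bit user priority */
        unsigned cfi  = (tci >> 12) & 0x1;  /* canonical format indicator */
        unsigned vid  = tci & VID_MASK;     /* 12-bit VLAN identifier */

        printf("prio %u cfi %u vid %u\n", prio, cfi, vid); /* 5 0 100 */
        return 0;
}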
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index bcc102e3be4d..a1dcf83f0d58 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -124,16 +124,23 @@ ebt_dev_check(const char *entry, const struct net_device *device)
124#define FWINV2(bool,invflg) ((bool) ^ !!(e->invflags & invflg)) 124#define FWINV2(bool,invflg) ((bool) ^ !!(e->invflags & invflg))
125/* process standard matches */ 125/* process standard matches */
126static inline int 126static inline int
127ebt_basic_match(const struct ebt_entry *e, const struct ethhdr *h, 127ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb,
128 const struct net_device *in, const struct net_device *out) 128 const struct net_device *in, const struct net_device *out)
129{ 129{
130 const struct ethhdr *h = eth_hdr(skb);
131 __be16 ethproto;
130 int verdict, i; 132 int verdict, i;
131 133
134 if (vlan_tx_tag_present(skb))
135 ethproto = htons(ETH_P_8021Q);
136 else
137 ethproto = h->h_proto;
138
132 if (e->bitmask & EBT_802_3) { 139 if (e->bitmask & EBT_802_3) {
133 if (FWINV2(ntohs(h->h_proto) >= 1536, EBT_IPROTO)) 140 if (FWINV2(ntohs(ethproto) >= 1536, EBT_IPROTO))
134 return 1; 141 return 1;
135 } else if (!(e->bitmask & EBT_NOPROTO) && 142 } else if (!(e->bitmask & EBT_NOPROTO) &&
136 FWINV2(e->ethproto != h->h_proto, EBT_IPROTO)) 143 FWINV2(e->ethproto != ethproto, EBT_IPROTO))
137 return 1; 144 return 1;
138 145
139 if (FWINV2(ebt_dev_check(e->in, in), EBT_IIN)) 146 if (FWINV2(ebt_dev_check(e->in, in), EBT_IIN))
@@ -213,7 +220,7 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb,
213 base = private->entries; 220 base = private->entries;
214 i = 0; 221 i = 0;
215 while (i < nentries) { 222 while (i < nentries) {
216 if (ebt_basic_match(point, eth_hdr(skb), in, out)) 223 if (ebt_basic_match(point, skb, in, out))
217 goto letscontinue; 224 goto letscontinue;
218 225
219 if (EBT_MATCH_ITERATE(point, ebt_do_match, skb, &acpar) != 0) 226 if (EBT_MATCH_ITERATE(point, ebt_do_match, skb, &acpar) != 0)
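Editorial note on the ebtables.c hunk above: when hardware acceleration has already moved the 802.1Q tag out of the frame and into packet metadata, the on-wire protocol the rules should match is still 0x8100 rather than the inner ethertype. The sketch below models only that selection; struct pkt_meta and effective_ethproto are illustrative, not the kernel's sk_buff layout, and byte-order handling is ignored.

/* Illustrative sketch: choose the ethertype a rule should see. */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define ETHERTYPE_VLAN 0x8100u

struct pkt_meta {
        bool     vlan_tag_in_meta;  /* tag stripped into metadata? */
        uint16_t eth_proto;         /* protocol from the Ethernet header */
};

static uint16_t effective_ethproto(const struct pkt_meta *p)
{
        return p->vlan_tag_in_meta ? ETHERTYPE_VLAN : p->eth_proto;
}

int main(void)
{
        struct pkt_meta plain = { false, 0x0800 };  /* untagged IPv4 */
        struct pkt_meta accel = { true,  0x0800 };  /* tag moved to metadata */

        printf("plain: 0x%04x\n", effective_ethproto(&plain)); /* 0x0800 */
        printf("accel: 0x%04x\n", effective_ethproto(&accel)); /* 0x8100 */
        return 0;
}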
diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
index 0b586e9d1378..b99369a055d1 100644
--- a/net/caif/caif_dev.c
+++ b/net/caif/caif_dev.c
@@ -9,6 +9,8 @@
9 * and Sakari Ailus <sakari.ailus@nokia.com> 9 * and Sakari Ailus <sakari.ailus@nokia.com>
10 */ 10 */
11 11
12#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
13
12#include <linux/version.h> 14#include <linux/version.h>
13#include <linux/module.h> 15#include <linux/module.h>
14#include <linux/kernel.h> 16#include <linux/kernel.h>
@@ -171,7 +173,7 @@ static int receive(struct sk_buff *skb, struct net_device *dev,
171 net = dev_net(dev); 173 net = dev_net(dev);
172 pkt = cfpkt_fromnative(CAIF_DIR_IN, skb); 174 pkt = cfpkt_fromnative(CAIF_DIR_IN, skb);
173 caifd = caif_get(dev); 175 caifd = caif_get(dev);
174 if (!caifd || !caifd->layer.up || !caifd->layer.up->ctrlcmd) 176 if (!caifd || !caifd->layer.up || !caifd->layer.up->receive)
175 return NET_RX_DROP; 177 return NET_RX_DROP;
176 178
177 if (caifd->layer.up->receive(caifd->layer.up, pkt)) 179 if (caifd->layer.up->receive(caifd->layer.up, pkt))
@@ -214,7 +216,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
214 216
215 switch (what) { 217 switch (what) {
216 case NETDEV_REGISTER: 218 case NETDEV_REGISTER:
217 pr_info("CAIF: %s():register %s\n", __func__, dev->name); 219 netdev_info(dev, "register\n");
218 caifd = caif_device_alloc(dev); 220 caifd = caif_device_alloc(dev);
219 if (caifd == NULL) 221 if (caifd == NULL)
220 break; 222 break;
@@ -225,14 +227,13 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
225 break; 227 break;
226 228
227 case NETDEV_UP: 229 case NETDEV_UP:
228 pr_info("CAIF: %s(): up %s\n", __func__, dev->name); 230 netdev_info(dev, "up\n");
229 caifd = caif_get(dev); 231 caifd = caif_get(dev);
230 if (caifd == NULL) 232 if (caifd == NULL)
231 break; 233 break;
232 caifdev = netdev_priv(dev); 234 caifdev = netdev_priv(dev);
233 if (atomic_read(&caifd->state) == NETDEV_UP) { 235 if (atomic_read(&caifd->state) == NETDEV_UP) {
234 pr_info("CAIF: %s():%s already up\n", 236 netdev_info(dev, "already up\n");
235 __func__, dev->name);
236 break; 237 break;
237 } 238 }
238 atomic_set(&caifd->state, what); 239 atomic_set(&caifd->state, what);
@@ -273,7 +274,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
273 caifd = caif_get(dev); 274 caifd = caif_get(dev);
274 if (caifd == NULL) 275 if (caifd == NULL)
275 break; 276 break;
276 pr_info("CAIF: %s():going down %s\n", __func__, dev->name); 277 netdev_info(dev, "going down\n");
277 278
278 if (atomic_read(&caifd->state) == NETDEV_GOING_DOWN || 279 if (atomic_read(&caifd->state) == NETDEV_GOING_DOWN ||
279 atomic_read(&caifd->state) == NETDEV_DOWN) 280 atomic_read(&caifd->state) == NETDEV_DOWN)
@@ -295,11 +296,10 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
295 caifd = caif_get(dev); 296 caifd = caif_get(dev);
296 if (caifd == NULL) 297 if (caifd == NULL)
297 break; 298 break;
298 pr_info("CAIF: %s(): down %s\n", __func__, dev->name); 299 netdev_info(dev, "down\n");
299 if (atomic_read(&caifd->in_use)) 300 if (atomic_read(&caifd->in_use))
300 pr_warning("CAIF: %s(): " 301 netdev_warn(dev,
301 "Unregistering an active CAIF device: %s\n", 302 "Unregistering an active CAIF device\n");
302 __func__, dev->name);
303 cfcnfg_del_phy_layer(get_caif_conf(), &caifd->layer); 303 cfcnfg_del_phy_layer(get_caif_conf(), &caifd->layer);
304 dev_put(dev); 304 dev_put(dev);
305 atomic_set(&caifd->state, what); 305 atomic_set(&caifd->state, what);
@@ -307,7 +307,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
307 307
308 case NETDEV_UNREGISTER: 308 case NETDEV_UNREGISTER:
309 caifd = caif_get(dev); 309 caifd = caif_get(dev);
310 pr_info("CAIF: %s(): unregister %s\n", __func__, dev->name); 310 netdev_info(dev, "unregister\n");
311 atomic_set(&caifd->state, what); 311 atomic_set(&caifd->state, what);
312 caif_device_destroy(dev); 312 caif_device_destroy(dev);
313 break; 313 break;
@@ -391,7 +391,7 @@ static int __init caif_device_init(void)
391 int result; 391 int result;
392 cfg = cfcnfg_create(); 392 cfg = cfcnfg_create();
393 if (!cfg) { 393 if (!cfg) {
394 pr_warning("CAIF: %s(): can't create cfcnfg.\n", __func__); 394 pr_warn("can't create cfcnfg\n");
395 goto err_cfcnfg_create_failed; 395 goto err_cfcnfg_create_failed;
396 } 396 }
397 result = register_pernet_device(&caif_net_ops); 397 result = register_pernet_device(&caif_net_ops);
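Editorial note on the pr_fmt conversion that runs through this and the following CAIF files: defining the format prefix once, before any logging call is expanded, is what lets every pr_warn()/pr_err() drop the hand-written "CAIF: %s():" boilerplate. The sketch below is a simplified stand-in built on printf, not the kernel's printk machinery; MODNAME, log_fmt and log_warn are invented names that merely mimic the shape of the kernel's pr_fmt/pr_warn pair.

/* Illustrative model of the pr_fmt prefixing pattern. */
#include <stdio.h>

#define MODNAME "caif_dev"
#define log_fmt(fmt) MODNAME ":%s(): " fmt, __func__
#define log_warn(fmt, ...) printf("WARN " log_fmt(fmt), ##__VA_ARGS__)

static void bring_up(const char *ifname)
{
        /* Prints: WARN caif_dev:bring_up(): no config for eth0 */
        log_warn("no config for %s\n", ifname);
}

int main(void)
{
        bring_up("eth0");
        return 0;
}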
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 8ce904786116..2eca2dd0000f 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -4,6 +4,8 @@
4 * License terms: GNU General Public License (GPL) version 2 4 * License terms: GNU General Public License (GPL) version 2
5 */ 5 */
6 6
7#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
8
7#include <linux/fs.h> 9#include <linux/fs.h>
8#include <linux/init.h> 10#include <linux/init.h>
9#include <linux/module.h> 11#include <linux/module.h>
@@ -15,7 +17,6 @@
15#include <linux/poll.h> 17#include <linux/poll.h>
16#include <linux/tcp.h> 18#include <linux/tcp.h>
17#include <linux/uaccess.h> 19#include <linux/uaccess.h>
18#include <linux/mutex.h>
19#include <linux/debugfs.h> 20#include <linux/debugfs.h>
20#include <linux/caif/caif_socket.h> 21#include <linux/caif/caif_socket.h>
21#include <asm/atomic.h> 22#include <asm/atomic.h>
@@ -28,9 +29,6 @@
28MODULE_LICENSE("GPL"); 29MODULE_LICENSE("GPL");
29MODULE_ALIAS_NETPROTO(AF_CAIF); 30MODULE_ALIAS_NETPROTO(AF_CAIF);
30 31
31#define CAIF_DEF_SNDBUF (4096*10)
32#define CAIF_DEF_RCVBUF (4096*100)
33
34/* 32/*
35 * CAIF state is re-using the TCP socket states. 33 * CAIF state is re-using the TCP socket states.
36 * caif_states stored in sk_state reflect the state as reported by 34 * caif_states stored in sk_state reflect the state as reported by
@@ -157,9 +155,7 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
157 155
158 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= 156 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
159 (unsigned)sk->sk_rcvbuf && rx_flow_is_on(cf_sk)) { 157 (unsigned)sk->sk_rcvbuf && rx_flow_is_on(cf_sk)) {
160 trace_printk("CAIF: %s():" 158 pr_debug("sending flow OFF (queue len = %d %d)\n",
161 " sending flow OFF (queue len = %d %d)\n",
162 __func__,
163 atomic_read(&cf_sk->sk.sk_rmem_alloc), 159 atomic_read(&cf_sk->sk.sk_rmem_alloc),
164 sk_rcvbuf_lowwater(cf_sk)); 160 sk_rcvbuf_lowwater(cf_sk));
165 set_rx_flow_off(cf_sk); 161 set_rx_flow_off(cf_sk);
@@ -172,9 +168,7 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
172 return err; 168 return err;
173 if (!sk_rmem_schedule(sk, skb->truesize) && rx_flow_is_on(cf_sk)) { 169 if (!sk_rmem_schedule(sk, skb->truesize) && rx_flow_is_on(cf_sk)) {
174 set_rx_flow_off(cf_sk); 170 set_rx_flow_off(cf_sk);
175 trace_printk("CAIF: %s():" 171 pr_debug("sending flow OFF due to rmem_schedule\n");
176 " sending flow OFF due to rmem_schedule\n",
177 __func__);
178 dbfs_atomic_inc(&cnt.num_rx_flow_off); 172 dbfs_atomic_inc(&cnt.num_rx_flow_off);
179 caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ); 173 caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ);
180 } 174 }
@@ -275,8 +269,7 @@ static void caif_ctrl_cb(struct cflayer *layr,
275 break; 269 break;
276 270
277 default: 271 default:
278 pr_debug("CAIF: %s(): Unexpected flow command %d\n", 272 pr_debug("Unexpected flow command %d\n", flow);
279 __func__, flow);
280 } 273 }
281} 274}
282 275
@@ -536,8 +529,7 @@ static int transmit_skb(struct sk_buff *skb, struct caifsock *cf_sk,
536 529
537 /* Slight paranoia, probably not needed. */ 530 /* Slight paranoia, probably not needed. */
538 if (unlikely(loopcnt++ > 1000)) { 531 if (unlikely(loopcnt++ > 1000)) {
539 pr_warning("CAIF: %s(): transmit retries failed," 532 pr_warn("transmit retries failed, error = %d\n", ret);
540 " error = %d\n", __func__, ret);
541 break; 533 break;
542 } 534 }
543 535
@@ -827,6 +819,7 @@ static int caif_connect(struct socket *sock, struct sockaddr *uaddr,
827 long timeo; 819 long timeo;
828 int err; 820 int err;
829 int ifindex, headroom, tailroom; 821 int ifindex, headroom, tailroom;
822 unsigned int mtu;
830 struct net_device *dev; 823 struct net_device *dev;
831 824
832 lock_sock(sk); 825 lock_sock(sk);
@@ -896,15 +889,22 @@ static int caif_connect(struct socket *sock, struct sockaddr *uaddr,
896 cf_sk->sk.sk_state = CAIF_DISCONNECTED; 889 cf_sk->sk.sk_state = CAIF_DISCONNECTED;
897 goto out; 890 goto out;
898 } 891 }
899 dev = dev_get_by_index(sock_net(sk), ifindex); 892
893 err = -ENODEV;
894 rcu_read_lock();
895 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
896 if (!dev) {
897 rcu_read_unlock();
898 goto out;
899 }
900 cf_sk->headroom = LL_RESERVED_SPACE_EXTRA(dev, headroom); 900 cf_sk->headroom = LL_RESERVED_SPACE_EXTRA(dev, headroom);
901 mtu = dev->mtu;
902 rcu_read_unlock();
903
901 cf_sk->tailroom = tailroom; 904 cf_sk->tailroom = tailroom;
902 cf_sk->maxframe = dev->mtu - (headroom + tailroom); 905 cf_sk->maxframe = mtu - (headroom + tailroom);
903 dev_put(dev);
904 if (cf_sk->maxframe < 1) { 906 if (cf_sk->maxframe < 1) {
905 pr_warning("CAIF: %s(): CAIF Interface MTU too small (%d)\n", 907 pr_warn("CAIF Interface MTU too small (%d)\n", dev->mtu);
906 __func__, dev->mtu);
907 err = -ENODEV;
908 goto out; 908 goto out;
909 } 909 }
910 910
@@ -1123,10 +1123,6 @@ static int caif_create(struct net *net, struct socket *sock, int protocol,
1123 /* Store the protocol */ 1123 /* Store the protocol */
1124 sk->sk_protocol = (unsigned char) protocol; 1124 sk->sk_protocol = (unsigned char) protocol;
1125 1125
1126 /* Sendbuf dictates the amount of outbound packets not yet sent */
1127 sk->sk_sndbuf = CAIF_DEF_SNDBUF;
1128 sk->sk_rcvbuf = CAIF_DEF_RCVBUF;
1129
1130 /* 1126 /*
1131 * Lock in order to try to stop someone from opening the socket 1127 * Lock in order to try to stop someone from opening the socket
1132 * too early. 1128 * too early.
diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c
index 1c29189b344d..41adafd18914 100644
--- a/net/caif/cfcnfg.c
+++ b/net/caif/cfcnfg.c
@@ -3,6 +3,9 @@
3 * Author: Sjur Brendeland/sjur.brandeland@stericsson.com 3 * Author: Sjur Brendeland/sjur.brandeland@stericsson.com
4 * License terms: GNU General Public License (GPL) version 2 4 * License terms: GNU General Public License (GPL) version 2
5 */ 5 */
6
7#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
8
6#include <linux/kernel.h> 9#include <linux/kernel.h>
7#include <linux/stddef.h> 10#include <linux/stddef.h>
8#include <linux/slab.h> 11#include <linux/slab.h>
@@ -78,7 +81,7 @@ struct cfcnfg *cfcnfg_create(void)
78 /* Initiate this layer */ 81 /* Initiate this layer */
79 this = kzalloc(sizeof(struct cfcnfg), GFP_ATOMIC); 82 this = kzalloc(sizeof(struct cfcnfg), GFP_ATOMIC);
80 if (!this) { 83 if (!this) {
81 pr_warning("CAIF: %s(): Out of memory\n", __func__); 84 pr_warn("Out of memory\n");
82 return NULL; 85 return NULL;
83 } 86 }
84 this->mux = cfmuxl_create(); 87 this->mux = cfmuxl_create();
@@ -106,7 +109,7 @@ struct cfcnfg *cfcnfg_create(void)
106 layer_set_up(this->ctrl, this); 109 layer_set_up(this->ctrl, this);
107 return this; 110 return this;
108out_of_mem: 111out_of_mem:
109 pr_warning("CAIF: %s(): Out of memory\n", __func__); 112 pr_warn("Out of memory\n");
110 kfree(this->mux); 113 kfree(this->mux);
111 kfree(this->ctrl); 114 kfree(this->ctrl);
112 kfree(this); 115 kfree(this);
@@ -194,7 +197,7 @@ int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer)
194 caif_assert(adap_layer != NULL); 197 caif_assert(adap_layer != NULL);
195 channel_id = adap_layer->id; 198 channel_id = adap_layer->id;
196 if (adap_layer->dn == NULL || channel_id == 0) { 199 if (adap_layer->dn == NULL || channel_id == 0) {
197 pr_err("CAIF: %s():adap_layer->id is 0\n", __func__); 200 pr_err("adap_layer->dn == NULL or adap_layer->id is 0\n");
198 ret = -ENOTCONN; 201 ret = -ENOTCONN;
199 goto end; 202 goto end;
200 } 203 }
@@ -204,9 +207,8 @@ int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer)
204 layer_set_up(servl, NULL); 207 layer_set_up(servl, NULL);
205 ret = cfctrl_linkdown_req(cnfg->ctrl, channel_id, adap_layer); 208 ret = cfctrl_linkdown_req(cnfg->ctrl, channel_id, adap_layer);
206 if (servl == NULL) { 209 if (servl == NULL) {
207 pr_err("CAIF: %s(): PROTOCOL ERROR " 210 pr_err("PROTOCOL ERROR - Error removing service_layer Channel_Id(%d)",
208 "- Error removing service_layer Channel_Id(%d)", 211 channel_id);
209 __func__, channel_id);
210 ret = -EINVAL; 212 ret = -EINVAL;
211 goto end; 213 goto end;
212 } 214 }
@@ -216,18 +218,14 @@ int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer)
216 218
217 phyinfo = cfcnfg_get_phyinfo(cnfg, phyid); 219 phyinfo = cfcnfg_get_phyinfo(cnfg, phyid);
218 if (phyinfo == NULL) { 220 if (phyinfo == NULL) {
219 pr_warning("CAIF: %s(): " 221 pr_warn("No interface to send disconnect to\n");
220 "No interface to send disconnect to\n",
221 __func__);
222 ret = -ENODEV; 222 ret = -ENODEV;
223 goto end; 223 goto end;
224 } 224 }
225 if (phyinfo->id != phyid || 225 if (phyinfo->id != phyid ||
226 phyinfo->phy_layer->id != phyid || 226 phyinfo->phy_layer->id != phyid ||
227 phyinfo->frm_layer->id != phyid) { 227 phyinfo->frm_layer->id != phyid) {
228 pr_err("CAIF: %s(): " 228 pr_err("Inconsistency in phy registration\n");
229 "Inconsistency in phy registration\n",
230 __func__);
231 ret = -EINVAL; 229 ret = -EINVAL;
232 goto end; 230 goto end;
233 } 231 }
@@ -276,21 +274,20 @@ int cfcnfg_add_adaptation_layer(struct cfcnfg *cnfg,
276{ 274{
277 struct cflayer *frml; 275 struct cflayer *frml;
278 if (adap_layer == NULL) { 276 if (adap_layer == NULL) {
279 pr_err("CAIF: %s(): adap_layer is zero", __func__); 277 pr_err("adap_layer is zero\n");
280 return -EINVAL; 278 return -EINVAL;
281 } 279 }
282 if (adap_layer->receive == NULL) { 280 if (adap_layer->receive == NULL) {
283 pr_err("CAIF: %s(): adap_layer->receive is NULL", __func__); 281 pr_err("adap_layer->receive is NULL\n");
284 return -EINVAL; 282 return -EINVAL;
285 } 283 }
286 if (adap_layer->ctrlcmd == NULL) { 284 if (adap_layer->ctrlcmd == NULL) {
287 pr_err("CAIF: %s(): adap_layer->ctrlcmd == NULL", __func__); 285 pr_err("adap_layer->ctrlcmd == NULL\n");
288 return -EINVAL; 286 return -EINVAL;
289 } 287 }
290 frml = cnfg->phy_layers[param->phyid].frm_layer; 288 frml = cnfg->phy_layers[param->phyid].frm_layer;
291 if (frml == NULL) { 289 if (frml == NULL) {
292 pr_err("CAIF: %s(): Specified PHY type does not exist!", 290 pr_err("Specified PHY type does not exist!\n");
293 __func__);
294 return -ENODEV; 291 return -ENODEV;
295 } 292 }
296 caif_assert(param->phyid == cnfg->phy_layers[param->phyid].id); 293 caif_assert(param->phyid == cnfg->phy_layers[param->phyid].id);
@@ -330,9 +327,7 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
330 struct net_device *netdev; 327 struct net_device *netdev;
331 328
332 if (adapt_layer == NULL) { 329 if (adapt_layer == NULL) {
333 pr_debug("CAIF: %s(): link setup response " 330 pr_debug("link setup response but no client exist, send linkdown back\n");
334 "but no client exist, send linkdown back\n",
335 __func__);
336 cfctrl_linkdown_req(cnfg->ctrl, channel_id, NULL); 331 cfctrl_linkdown_req(cnfg->ctrl, channel_id, NULL);
337 return; 332 return;
338 } 333 }
@@ -374,13 +369,11 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
374 servicel = cfdbgl_create(channel_id, &phyinfo->dev_info); 369 servicel = cfdbgl_create(channel_id, &phyinfo->dev_info);
375 break; 370 break;
376 default: 371 default:
377 pr_err("CAIF: %s(): Protocol error. " 372 pr_err("Protocol error. Link setup response - unknown channel type\n");
378 "Link setup response - unknown channel type\n",
379 __func__);
380 return; 373 return;
381 } 374 }
382 if (!servicel) { 375 if (!servicel) {
383 pr_warning("CAIF: %s(): Out of memory\n", __func__); 376 pr_warn("Out of memory\n");
384 return; 377 return;
385 } 378 }
386 layer_set_dn(servicel, cnfg->mux); 379 layer_set_dn(servicel, cnfg->mux);
@@ -418,7 +411,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type,
418 } 411 }
419 } 412 }
420 if (*phyid == 0) { 413 if (*phyid == 0) {
421 pr_err("CAIF: %s(): No Available PHY ID\n", __func__); 414 pr_err("No Available PHY ID\n");
422 return; 415 return;
423 } 416 }
424 417
@@ -427,7 +420,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type,
427 phy_driver = 420 phy_driver =
428 cfserl_create(CFPHYTYPE_FRAG, *phyid, stx); 421 cfserl_create(CFPHYTYPE_FRAG, *phyid, stx);
429 if (!phy_driver) { 422 if (!phy_driver) {
430 pr_warning("CAIF: %s(): Out of memory\n", __func__); 423 pr_warn("Out of memory\n");
431 return; 424 return;
432 } 425 }
433 426
@@ -436,7 +429,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type,
436 phy_driver = NULL; 429 phy_driver = NULL;
437 break; 430 break;
438 default: 431 default:
439 pr_err("CAIF: %s(): %d", __func__, phy_type); 432 pr_err("%d\n", phy_type);
440 return; 433 return;
441 break; 434 break;
442 } 435 }
@@ -455,7 +448,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type,
455 phy_layer->type = phy_type; 448 phy_layer->type = phy_type;
456 frml = cffrml_create(*phyid, fcs); 449 frml = cffrml_create(*phyid, fcs);
457 if (!frml) { 450 if (!frml) {
458 pr_warning("CAIF: %s(): Out of memory\n", __func__); 451 pr_warn("Out of memory\n");
459 return; 452 return;
460 } 453 }
461 cnfg->phy_layers[*phyid].frm_layer = frml; 454 cnfg->phy_layers[*phyid].frm_layer = frml;
diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c
index 563145fdc4c3..08f267a109aa 100644
--- a/net/caif/cfctrl.c
+++ b/net/caif/cfctrl.c
@@ -4,6 +4,8 @@
4 * License terms: GNU General Public License (GPL) version 2 4 * License terms: GNU General Public License (GPL) version 2
5 */ 5 */
6 6
7#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
8
7#include <linux/stddef.h> 9#include <linux/stddef.h>
8#include <linux/spinlock.h> 10#include <linux/spinlock.h>
9#include <linux/slab.h> 11#include <linux/slab.h>
@@ -36,7 +38,7 @@ struct cflayer *cfctrl_create(void)
36 struct cfctrl *this = 38 struct cfctrl *this =
37 kmalloc(sizeof(struct cfctrl), GFP_ATOMIC); 39 kmalloc(sizeof(struct cfctrl), GFP_ATOMIC);
38 if (!this) { 40 if (!this) {
39 pr_warning("CAIF: %s(): Out of memory\n", __func__); 41 pr_warn("Out of memory\n");
40 return NULL; 42 return NULL;
41 } 43 }
42 caif_assert(offsetof(struct cfctrl, serv.layer) == 0); 44 caif_assert(offsetof(struct cfctrl, serv.layer) == 0);
@@ -132,9 +134,7 @@ struct cfctrl_request_info *cfctrl_remove_req(struct cfctrl *ctrl,
132 list_for_each_entry_safe(p, tmp, &ctrl->list, list) { 134 list_for_each_entry_safe(p, tmp, &ctrl->list, list) {
133 if (cfctrl_req_eq(req, p)) { 135 if (cfctrl_req_eq(req, p)) {
134 if (p != first) 136 if (p != first)
135 pr_warning("CAIF: %s(): Requests are not " 137 pr_warn("Requests are not received in order\n");
136 "received in order\n",
137 __func__);
138 138
139 atomic_set(&ctrl->rsp_seq_no, 139 atomic_set(&ctrl->rsp_seq_no,
140 p->sequence_no); 140 p->sequence_no);
@@ -177,7 +177,7 @@ void cfctrl_enum_req(struct cflayer *layer, u8 physlinkid)
177 int ret; 177 int ret;
178 struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN); 178 struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
179 if (!pkt) { 179 if (!pkt) {
180 pr_warning("CAIF: %s(): Out of memory\n", __func__); 180 pr_warn("Out of memory\n");
181 return; 181 return;
182 } 182 }
183 caif_assert(offsetof(struct cfctrl, serv.layer) == 0); 183 caif_assert(offsetof(struct cfctrl, serv.layer) == 0);
@@ -189,8 +189,7 @@ void cfctrl_enum_req(struct cflayer *layer, u8 physlinkid)
189 ret = 189 ret =
190 cfctrl->serv.layer.dn->transmit(cfctrl->serv.layer.dn, pkt); 190 cfctrl->serv.layer.dn->transmit(cfctrl->serv.layer.dn, pkt);
191 if (ret < 0) { 191 if (ret < 0) {
192 pr_err("CAIF: %s(): Could not transmit enum message\n", 192 pr_err("Could not transmit enum message\n");
193 __func__);
194 cfpkt_destroy(pkt); 193 cfpkt_destroy(pkt);
195 } 194 }
196} 195}
@@ -208,7 +207,7 @@ int cfctrl_linkup_request(struct cflayer *layer,
208 char utility_name[16]; 207 char utility_name[16];
209 struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN); 208 struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
210 if (!pkt) { 209 if (!pkt) {
211 pr_warning("CAIF: %s(): Out of memory\n", __func__); 210 pr_warn("Out of memory\n");
212 return -ENOMEM; 211 return -ENOMEM;
213 } 212 }
214 cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_SETUP); 213 cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_SETUP);
@@ -253,13 +252,13 @@ int cfctrl_linkup_request(struct cflayer *layer,
253 param->u.utility.paramlen); 252 param->u.utility.paramlen);
254 break; 253 break;
255 default: 254 default:
256 pr_warning("CAIF: %s():Request setup of bad link type = %d\n", 255 pr_warn("Request setup of bad link type = %d\n",
257 __func__, param->linktype); 256 param->linktype);
258 return -EINVAL; 257 return -EINVAL;
259 } 258 }
260 req = kzalloc(sizeof(*req), GFP_KERNEL); 259 req = kzalloc(sizeof(*req), GFP_KERNEL);
261 if (!req) { 260 if (!req) {
262 pr_warning("CAIF: %s(): Out of memory\n", __func__); 261 pr_warn("Out of memory\n");
263 return -ENOMEM; 262 return -ENOMEM;
264 } 263 }
265 req->client_layer = user_layer; 264 req->client_layer = user_layer;
@@ -276,8 +275,7 @@ int cfctrl_linkup_request(struct cflayer *layer,
276 ret = 275 ret =
277 cfctrl->serv.layer.dn->transmit(cfctrl->serv.layer.dn, pkt); 276 cfctrl->serv.layer.dn->transmit(cfctrl->serv.layer.dn, pkt);
278 if (ret < 0) { 277 if (ret < 0) {
279 pr_err("CAIF: %s(): Could not transmit linksetup request\n", 278 pr_err("Could not transmit linksetup request\n");
280 __func__);
281 cfpkt_destroy(pkt); 279 cfpkt_destroy(pkt);
282 return -ENODEV; 280 return -ENODEV;
283 } 281 }
@@ -291,7 +289,7 @@ int cfctrl_linkdown_req(struct cflayer *layer, u8 channelid,
291 struct cfctrl *cfctrl = container_obj(layer); 289 struct cfctrl *cfctrl = container_obj(layer);
292 struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN); 290 struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
293 if (!pkt) { 291 if (!pkt) {
294 pr_warning("CAIF: %s(): Out of memory\n", __func__); 292 pr_warn("Out of memory\n");
295 return -ENOMEM; 293 return -ENOMEM;
296 } 294 }
297 cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_DESTROY); 295 cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_DESTROY);
@@ -300,8 +298,7 @@ int cfctrl_linkdown_req(struct cflayer *layer, u8 channelid,
300 ret = 298 ret =
301 cfctrl->serv.layer.dn->transmit(cfctrl->serv.layer.dn, pkt); 299 cfctrl->serv.layer.dn->transmit(cfctrl->serv.layer.dn, pkt);
302 if (ret < 0) { 300 if (ret < 0) {
303 pr_err("CAIF: %s(): Could not transmit link-down request\n", 301 pr_err("Could not transmit link-down request\n");
304 __func__);
305 cfpkt_destroy(pkt); 302 cfpkt_destroy(pkt);
306 } 303 }
307 return ret; 304 return ret;
@@ -313,7 +310,7 @@ void cfctrl_sleep_req(struct cflayer *layer)
313 struct cfctrl *cfctrl = container_obj(layer); 310 struct cfctrl *cfctrl = container_obj(layer);
314 struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN); 311 struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
315 if (!pkt) { 312 if (!pkt) {
316 pr_warning("CAIF: %s(): Out of memory\n", __func__); 313 pr_warn("Out of memory\n");
317 return; 314 return;
318 } 315 }
319 cfpkt_addbdy(pkt, CFCTRL_CMD_SLEEP); 316 cfpkt_addbdy(pkt, CFCTRL_CMD_SLEEP);
@@ -330,7 +327,7 @@ void cfctrl_wake_req(struct cflayer *layer)
330 struct cfctrl *cfctrl = container_obj(layer); 327 struct cfctrl *cfctrl = container_obj(layer);
331 struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN); 328 struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
332 if (!pkt) { 329 if (!pkt) {
333 pr_warning("CAIF: %s(): Out of memory\n", __func__); 330 pr_warn("Out of memory\n");
334 return; 331 return;
335 } 332 }
336 cfpkt_addbdy(pkt, CFCTRL_CMD_WAKE); 333 cfpkt_addbdy(pkt, CFCTRL_CMD_WAKE);
@@ -347,7 +344,7 @@ void cfctrl_getstartreason_req(struct cflayer *layer)
347 struct cfctrl *cfctrl = container_obj(layer); 344 struct cfctrl *cfctrl = container_obj(layer);
348 struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN); 345 struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
349 if (!pkt) { 346 if (!pkt) {
350 pr_warning("CAIF: %s(): Out of memory\n", __func__); 347 pr_warn("Out of memory\n");
351 return; 348 return;
352 } 349 }
353 cfpkt_addbdy(pkt, CFCTRL_CMD_START_REASON); 350 cfpkt_addbdy(pkt, CFCTRL_CMD_START_REASON);
@@ -364,12 +361,11 @@ void cfctrl_cancel_req(struct cflayer *layr, struct cflayer *adap_layer)
364 struct cfctrl_request_info *p, *tmp; 361 struct cfctrl_request_info *p, *tmp;
365 struct cfctrl *ctrl = container_obj(layr); 362 struct cfctrl *ctrl = container_obj(layr);
366 spin_lock(&ctrl->info_list_lock); 363 spin_lock(&ctrl->info_list_lock);
367 pr_warning("CAIF: %s(): enter\n", __func__); 364 pr_warn("enter\n");
368 365
369 list_for_each_entry_safe(p, tmp, &ctrl->list, list) { 366 list_for_each_entry_safe(p, tmp, &ctrl->list, list) {
370 if (p->client_layer == adap_layer) { 367 if (p->client_layer == adap_layer) {
371 pr_warning("CAIF: %s(): cancel req :%d\n", __func__, 368 pr_warn("cancel req :%d\n", p->sequence_no);
372 p->sequence_no);
373 list_del(&p->list); 369 list_del(&p->list);
374 kfree(p); 370 kfree(p);
375 } 371 }
@@ -520,9 +516,8 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
520 cfpkt_extr_head(pkt, &param, len); 516 cfpkt_extr_head(pkt, &param, len);
521 break; 517 break;
522 default: 518 default:
523 pr_warning("CAIF: %s(): Request setup " 519 pr_warn("Request setup - invalid link type (%d)\n",
524 "- invalid link type (%d)", 520 serv);
525 __func__, serv);
526 goto error; 521 goto error;
527 } 522 }
528 523
@@ -532,9 +527,7 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
532 527
533 if (CFCTRL_ERR_BIT == (CFCTRL_ERR_BIT & cmdrsp) || 528 if (CFCTRL_ERR_BIT == (CFCTRL_ERR_BIT & cmdrsp) ||
534 cfpkt_erroneous(pkt)) { 529 cfpkt_erroneous(pkt)) {
535 pr_err("CAIF: %s(): Invalid O/E bit or parse " 530 pr_err("Invalid O/E bit or parse error on CAIF control channel\n");
536 "error on CAIF control channel",
537 __func__);
538 cfctrl->res.reject_rsp(cfctrl->serv.layer.up, 531 cfctrl->res.reject_rsp(cfctrl->serv.layer.up,
539 0, 532 0,
540 req ? req->client_layer 533 req ? req->client_layer
@@ -556,8 +549,7 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
556 cfctrl->res.linkdestroy_rsp(cfctrl->serv.layer.up, linkid); 549 cfctrl->res.linkdestroy_rsp(cfctrl->serv.layer.up, linkid);
557 break; 550 break;
558 case CFCTRL_CMD_LINK_ERR: 551 case CFCTRL_CMD_LINK_ERR:
559 pr_err("CAIF: %s(): Frame Error Indication received\n", 552 pr_err("Frame Error Indication received\n");
560 __func__);
561 cfctrl->res.linkerror_ind(); 553 cfctrl->res.linkerror_ind();
562 break; 554 break;
563 case CFCTRL_CMD_ENUM: 555 case CFCTRL_CMD_ENUM:
@@ -576,7 +568,7 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
576 cfctrl->res.radioset_rsp(); 568 cfctrl->res.radioset_rsp();
577 break; 569 break;
578 default: 570 default:
579 pr_err("CAIF: %s(): Unrecognized Control Frame\n", __func__); 571 pr_err("Unrecognized Control Frame\n");
580 goto error; 572 goto error;
581 break; 573 break;
582 } 574 }
@@ -595,8 +587,7 @@ static void cfctrl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
595 case CAIF_CTRLCMD_FLOW_OFF_IND: 587 case CAIF_CTRLCMD_FLOW_OFF_IND:
596 spin_lock(&this->info_list_lock); 588 spin_lock(&this->info_list_lock);
597 if (!list_empty(&this->list)) { 589 if (!list_empty(&this->list)) {
598 pr_debug("CAIF: %s(): Received flow off in " 590 pr_debug("Received flow off in control layer\n");
599 "control layer", __func__);
600 } 591 }
601 spin_unlock(&this->info_list_lock); 592 spin_unlock(&this->info_list_lock);
602 break; 593 break;
@@ -620,7 +611,7 @@ static int handle_loop(struct cfctrl *ctrl, int cmd, struct cfpkt *pkt)
620 if (!ctrl->loop_linkused[linkid]) 611 if (!ctrl->loop_linkused[linkid])
621 goto found; 612 goto found;
622 spin_unlock(&ctrl->loop_linkid_lock); 613 spin_unlock(&ctrl->loop_linkid_lock);
623 pr_err("CAIF: %s(): Out of link-ids\n", __func__); 614 pr_err("Out of link-ids\n");
624 return -EINVAL; 615 return -EINVAL;
625found: 616found:
626 if (!ctrl->loop_linkused[linkid]) 617 if (!ctrl->loop_linkused[linkid])
diff --git a/net/caif/cfdbgl.c b/net/caif/cfdbgl.c
index 676648cac8dd..496fda9ac66f 100644
--- a/net/caif/cfdbgl.c
+++ b/net/caif/cfdbgl.c
@@ -4,6 +4,8 @@
4 * License terms: GNU General Public License (GPL) version 2 4 * License terms: GNU General Public License (GPL) version 2
5 */ 5 */
6 6
7#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
8
7#include <linux/stddef.h> 9#include <linux/stddef.h>
8#include <linux/slab.h> 10#include <linux/slab.h>
9#include <net/caif/caif_layer.h> 11#include <net/caif/caif_layer.h>
@@ -17,7 +19,7 @@ struct cflayer *cfdbgl_create(u8 channel_id, struct dev_info *dev_info)
17{ 19{
18 struct cfsrvl *dbg = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC); 20 struct cfsrvl *dbg = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
19 if (!dbg) { 21 if (!dbg) {
20 pr_warning("CAIF: %s(): Out of memory\n", __func__); 22 pr_warn("Out of memory\n");
21 return NULL; 23 return NULL;
22 } 24 }
23 caif_assert(offsetof(struct cfsrvl, layer) == 0); 25 caif_assert(offsetof(struct cfsrvl, layer) == 0);
diff --git a/net/caif/cfdgml.c b/net/caif/cfdgml.c
index ed9d53aff280..d3ed264ad6c4 100644
--- a/net/caif/cfdgml.c
+++ b/net/caif/cfdgml.c
@@ -4,6 +4,8 @@
4 * License terms: GNU General Public License (GPL) version 2 4 * License terms: GNU General Public License (GPL) version 2
5 */ 5 */
6 6
7#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
8
7#include <linux/stddef.h> 9#include <linux/stddef.h>
8#include <linux/spinlock.h> 10#include <linux/spinlock.h>
9#include <linux/slab.h> 11#include <linux/slab.h>
@@ -26,7 +28,7 @@ struct cflayer *cfdgml_create(u8 channel_id, struct dev_info *dev_info)
26{ 28{
27 struct cfsrvl *dgm = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC); 29 struct cfsrvl *dgm = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
28 if (!dgm) { 30 if (!dgm) {
29 pr_warning("CAIF: %s(): Out of memory\n", __func__); 31 pr_warn("Out of memory\n");
30 return NULL; 32 return NULL;
31 } 33 }
32 caif_assert(offsetof(struct cfsrvl, layer) == 0); 34 caif_assert(offsetof(struct cfsrvl, layer) == 0);
@@ -49,14 +51,14 @@ static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt)
49 caif_assert(layr->ctrlcmd != NULL); 51 caif_assert(layr->ctrlcmd != NULL);
50 52
51 if (cfpkt_extr_head(pkt, &cmd, 1) < 0) { 53 if (cfpkt_extr_head(pkt, &cmd, 1) < 0) {
52 pr_err("CAIF: %s(): Packet is erroneous!\n", __func__); 54 pr_err("Packet is erroneous!\n");
53 cfpkt_destroy(pkt); 55 cfpkt_destroy(pkt);
54 return -EPROTO; 56 return -EPROTO;
55 } 57 }
56 58
57 if ((cmd & DGM_CMD_BIT) == 0) { 59 if ((cmd & DGM_CMD_BIT) == 0) {
58 if (cfpkt_extr_head(pkt, &dgmhdr, 3) < 0) { 60 if (cfpkt_extr_head(pkt, &dgmhdr, 3) < 0) {
59 pr_err("CAIF: %s(): Packet is erroneous!\n", __func__); 61 pr_err("Packet is erroneous!\n");
60 cfpkt_destroy(pkt); 62 cfpkt_destroy(pkt);
61 return -EPROTO; 63 return -EPROTO;
62 } 64 }
@@ -75,8 +77,7 @@ static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt)
75 return 0; 77 return 0;
76 default: 78 default:
77 cfpkt_destroy(pkt); 79 cfpkt_destroy(pkt);
78 pr_info("CAIF: %s(): Unknown datagram control %d (0x%x)\n", 80 pr_info("Unknown datagram control %d (0x%x)\n", cmd, cmd);
79 __func__, cmd, cmd);
80 return -EPROTO; 81 return -EPROTO;
81 } 82 }
82} 83}
diff --git a/net/caif/cffrml.c b/net/caif/cffrml.c
index e86a4ca3b217..a445043931ae 100644
--- a/net/caif/cffrml.c
+++ b/net/caif/cffrml.c
@@ -6,6 +6,8 @@
6 * License terms: GNU General Public License (GPL) version 2 6 * License terms: GNU General Public License (GPL) version 2
7 */ 7 */
8 8
9#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
10
9#include <linux/stddef.h> 11#include <linux/stddef.h>
10#include <linux/spinlock.h> 12#include <linux/spinlock.h>
11#include <linux/slab.h> 13#include <linux/slab.h>
@@ -32,7 +34,7 @@ struct cflayer *cffrml_create(u16 phyid, bool use_fcs)
32{ 34{
33 struct cffrml *this = kmalloc(sizeof(struct cffrml), GFP_ATOMIC); 35 struct cffrml *this = kmalloc(sizeof(struct cffrml), GFP_ATOMIC);
34 if (!this) { 36 if (!this) {
35 pr_warning("CAIF: %s(): Out of memory\n", __func__); 37 pr_warn("Out of memory\n");
36 return NULL; 38 return NULL;
37 } 39 }
38 caif_assert(offsetof(struct cffrml, layer) == 0); 40 caif_assert(offsetof(struct cffrml, layer) == 0);
@@ -83,7 +85,7 @@ static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt)
83 85
84 if (cfpkt_setlen(pkt, len) < 0) { 86 if (cfpkt_setlen(pkt, len) < 0) {
85 ++cffrml_rcv_error; 87 ++cffrml_rcv_error;
86 pr_err("CAIF: %s():Framing length error (%d)\n", __func__, len); 88 pr_err("Framing length error (%d)\n", len);
87 cfpkt_destroy(pkt); 89 cfpkt_destroy(pkt);
88 return -EPROTO; 90 return -EPROTO;
89 } 91 }
@@ -99,14 +101,14 @@ static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt)
99 cfpkt_add_trail(pkt, &tmp, 2); 101 cfpkt_add_trail(pkt, &tmp, 2);
100 ++cffrml_rcv_error; 102 ++cffrml_rcv_error;
101 ++cffrml_rcv_checsum_error; 103 ++cffrml_rcv_checsum_error;
102 pr_info("CAIF: %s(): Frame checksum error " 104 pr_info("Frame checksum error (0x%x != 0x%x)\n",
103 "(0x%x != 0x%x)\n", __func__, hdrchks, pktchks); 105 hdrchks, pktchks);
104 return -EILSEQ; 106 return -EILSEQ;
105 } 107 }
106 } 108 }
107 if (cfpkt_erroneous(pkt)) { 109 if (cfpkt_erroneous(pkt)) {
108 ++cffrml_rcv_error; 110 ++cffrml_rcv_error;
109 pr_err("CAIF: %s(): Packet is erroneous!\n", __func__); 111 pr_err("Packet is erroneous!\n");
110 cfpkt_destroy(pkt); 112 cfpkt_destroy(pkt);
111 return -EPROTO; 113 return -EPROTO;
112 } 114 }
@@ -132,7 +134,7 @@ static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt)
132 cfpkt_add_head(pkt, &tmp, 2); 134 cfpkt_add_head(pkt, &tmp, 2);
133 cfpkt_info(pkt)->hdr_len += 2; 135 cfpkt_info(pkt)->hdr_len += 2;
134 if (cfpkt_erroneous(pkt)) { 136 if (cfpkt_erroneous(pkt)) {
135 pr_err("CAIF: %s(): Packet is erroneous!\n", __func__); 137 pr_err("Packet is erroneous!\n");
136 return -EPROTO; 138 return -EPROTO;
137 } 139 }
138 ret = layr->dn->transmit(layr->dn, pkt); 140 ret = layr->dn->transmit(layr->dn, pkt);
diff --git a/net/caif/cfmuxl.c b/net/caif/cfmuxl.c
index 80c8d332b258..46f34b2e0478 100644
--- a/net/caif/cfmuxl.c
+++ b/net/caif/cfmuxl.c
@@ -3,6 +3,9 @@
3 * Author: Sjur Brendeland/sjur.brandeland@stericsson.com 3 * Author: Sjur Brendeland/sjur.brandeland@stericsson.com
4 * License terms: GNU General Public License (GPL) version 2 4 * License terms: GNU General Public License (GPL) version 2
5 */ 5 */
6
7#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
8
6#include <linux/stddef.h> 9#include <linux/stddef.h>
7#include <linux/spinlock.h> 10#include <linux/spinlock.h>
8#include <linux/slab.h> 11#include <linux/slab.h>
@@ -190,7 +193,7 @@ static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt)
190 u8 id; 193 u8 id;
191 struct cflayer *up; 194 struct cflayer *up;
192 if (cfpkt_extr_head(pkt, &id, 1) < 0) { 195 if (cfpkt_extr_head(pkt, &id, 1) < 0) {
193 pr_err("CAIF: %s(): erroneous Caif Packet\n", __func__); 196 pr_err("erroneous Caif Packet\n");
194 cfpkt_destroy(pkt); 197 cfpkt_destroy(pkt);
195 return -EPROTO; 198 return -EPROTO;
196 } 199 }
@@ -199,8 +202,8 @@ static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt)
199 up = get_up(muxl, id); 202 up = get_up(muxl, id);
200 spin_unlock(&muxl->receive_lock); 203 spin_unlock(&muxl->receive_lock);
201 if (up == NULL) { 204 if (up == NULL) {
202 pr_info("CAIF: %s():Received data on unknown link ID = %d " 205 pr_info("Received data on unknown link ID = %d (0x%x) up == NULL",
203 "(0x%x) up == NULL", __func__, id, id); 206 id, id);
204 cfpkt_destroy(pkt); 207 cfpkt_destroy(pkt);
205 /* 208 /*
206 * Don't return ERROR, since modem misbehaves and sends out 209 * Don't return ERROR, since modem misbehaves and sends out
@@ -223,9 +226,8 @@ static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt)
223 struct caif_payload_info *info = cfpkt_info(pkt); 226 struct caif_payload_info *info = cfpkt_info(pkt);
224 dn = get_dn(muxl, cfpkt_info(pkt)->dev_info); 227 dn = get_dn(muxl, cfpkt_info(pkt)->dev_info);
225 if (dn == NULL) { 228 if (dn == NULL) {
226 pr_warning("CAIF: %s(): Send data on unknown phy " 229 pr_warn("Send data on unknown phy ID = %d (0x%x)\n",
227 "ID = %d (0x%x)\n", 230 info->dev_info->id, info->dev_info->id);
228 __func__, info->dev_info->id, info->dev_info->id);
229 return -ENOTCONN; 231 return -ENOTCONN;
230 } 232 }
231 info->hdr_len += 1; 233 info->hdr_len += 1;
diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c
index c49a6695793a..d7e865e2ff65 100644
--- a/net/caif/cfpkt_skbuff.c
+++ b/net/caif/cfpkt_skbuff.c
@@ -4,6 +4,8 @@
4 * License terms: GNU General Public License (GPL) version 2 4 * License terms: GNU General Public License (GPL) version 2
5 */ 5 */
6 6
7#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
8
7#include <linux/string.h> 9#include <linux/string.h>
8#include <linux/skbuff.h> 10#include <linux/skbuff.h>
9#include <linux/hardirq.h> 11#include <linux/hardirq.h>
@@ -12,11 +14,12 @@
12#define PKT_PREFIX 48 14#define PKT_PREFIX 48
13#define PKT_POSTFIX 2 15#define PKT_POSTFIX 2
14#define PKT_LEN_WHEN_EXTENDING 128 16#define PKT_LEN_WHEN_EXTENDING 128
15#define PKT_ERROR(pkt, errmsg) do { \ 17#define PKT_ERROR(pkt, errmsg) \
16 cfpkt_priv(pkt)->erronous = true; \ 18do { \
17 skb_reset_tail_pointer(&pkt->skb); \ 19 cfpkt_priv(pkt)->erronous = true; \
18 pr_warning("CAIF: " errmsg);\ 20 skb_reset_tail_pointer(&pkt->skb); \
19 } while (0) 21 pr_warn(errmsg); \
22} while (0)
20 23
21struct cfpktq { 24struct cfpktq {
22 struct sk_buff_head head; 25 struct sk_buff_head head;
@@ -130,13 +133,13 @@ int cfpkt_extr_head(struct cfpkt *pkt, void *data, u16 len)
130 return -EPROTO; 133 return -EPROTO;
131 134
132 if (unlikely(len > skb->len)) { 135 if (unlikely(len > skb->len)) {
133 PKT_ERROR(pkt, "cfpkt_extr_head read beyond end of packet\n"); 136 PKT_ERROR(pkt, "read beyond end of packet\n");
134 return -EPROTO; 137 return -EPROTO;
135 } 138 }
136 139
137 if (unlikely(len > skb_headlen(skb))) { 140 if (unlikely(len > skb_headlen(skb))) {
138 if (unlikely(skb_linearize(skb) != 0)) { 141 if (unlikely(skb_linearize(skb) != 0)) {
139 PKT_ERROR(pkt, "cfpkt_extr_head linearize failed\n"); 142 PKT_ERROR(pkt, "linearize failed\n");
140 return -EPROTO; 143 return -EPROTO;
141 } 144 }
142 } 145 }
@@ -156,11 +159,11 @@ int cfpkt_extr_trail(struct cfpkt *pkt, void *dta, u16 len)
156 return -EPROTO; 159 return -EPROTO;
157 160
158 if (unlikely(skb_linearize(skb) != 0)) { 161 if (unlikely(skb_linearize(skb) != 0)) {
159 PKT_ERROR(pkt, "cfpkt_extr_trail linearize failed\n"); 162 PKT_ERROR(pkt, "linearize failed\n");
160 return -EPROTO; 163 return -EPROTO;
161 } 164 }
162 if (unlikely(skb->data + len > skb_tail_pointer(skb))) { 165 if (unlikely(skb->data + len > skb_tail_pointer(skb))) {
163 PKT_ERROR(pkt, "cfpkt_extr_trail read beyond end of packet\n"); 166 PKT_ERROR(pkt, "read beyond end of packet\n");
164 return -EPROTO; 167 return -EPROTO;
165 } 168 }
166 from = skb_tail_pointer(skb) - len; 169 from = skb_tail_pointer(skb) - len;
@@ -202,7 +205,7 @@ int cfpkt_add_body(struct cfpkt *pkt, const void *data, u16 len)
202 205
203 /* Make sure data is writable */ 206 /* Make sure data is writable */
204 if (unlikely(skb_cow_data(skb, addlen, &lastskb) < 0)) { 207 if (unlikely(skb_cow_data(skb, addlen, &lastskb) < 0)) {
205 PKT_ERROR(pkt, "cfpkt_add_body: cow failed\n"); 208 PKT_ERROR(pkt, "cow failed\n");
206 return -EPROTO; 209 return -EPROTO;
207 } 210 }
208 /* 211 /*
@@ -211,8 +214,7 @@ int cfpkt_add_body(struct cfpkt *pkt, const void *data, u16 len)
211 * lengths of the top SKB. 214 * lengths of the top SKB.
212 */ 215 */
213 if (lastskb != skb) { 216 if (lastskb != skb) {
214 pr_warning("CAIF: %s(): Packet is non-linear\n", 217 pr_warn("Packet is non-linear\n");
215 __func__);
216 skb->len += len; 218 skb->len += len;
217 skb->data_len += len; 219 skb->data_len += len;
218 } 220 }
@@ -242,14 +244,14 @@ int cfpkt_add_head(struct cfpkt *pkt, const void *data2, u16 len)
242 if (unlikely(is_erronous(pkt))) 244 if (unlikely(is_erronous(pkt)))
243 return -EPROTO; 245 return -EPROTO;
244 if (unlikely(skb_headroom(skb) < len)) { 246 if (unlikely(skb_headroom(skb) < len)) {
245 PKT_ERROR(pkt, "cfpkt_add_head: no headroom\n"); 247 PKT_ERROR(pkt, "no headroom\n");
246 return -EPROTO; 248 return -EPROTO;
247 } 249 }
248 250
249 /* Make sure data is writable */ 251 /* Make sure data is writable */
250 ret = skb_cow_data(skb, 0, &lastskb); 252 ret = skb_cow_data(skb, 0, &lastskb);
251 if (unlikely(ret < 0)) { 253 if (unlikely(ret < 0)) {
252 PKT_ERROR(pkt, "cfpkt_add_head: cow failed\n"); 254 PKT_ERROR(pkt, "cow failed\n");
253 return ret; 255 return ret;
254 } 256 }
255 257
@@ -283,7 +285,7 @@ inline u16 cfpkt_iterate(struct cfpkt *pkt,
283 if (unlikely(is_erronous(pkt))) 285 if (unlikely(is_erronous(pkt)))
284 return -EPROTO; 286 return -EPROTO;
285 if (unlikely(skb_linearize(&pkt->skb) != 0)) { 287 if (unlikely(skb_linearize(&pkt->skb) != 0)) {
286 PKT_ERROR(pkt, "cfpkt_iterate: linearize failed\n"); 288 PKT_ERROR(pkt, "linearize failed\n");
287 return -EPROTO; 289 return -EPROTO;
288 } 290 }
289 return iter_func(data, pkt->skb.data, cfpkt_getlen(pkt)); 291 return iter_func(data, pkt->skb.data, cfpkt_getlen(pkt));
@@ -309,7 +311,7 @@ int cfpkt_setlen(struct cfpkt *pkt, u16 len)
309 311
310 /* Need to expand SKB */ 312 /* Need to expand SKB */
311 if (unlikely(!cfpkt_pad_trail(pkt, len - skb->len))) 313 if (unlikely(!cfpkt_pad_trail(pkt, len - skb->len)))
312 PKT_ERROR(pkt, "cfpkt_setlen: skb_pad_trail failed\n"); 314 PKT_ERROR(pkt, "skb_pad_trail failed\n");
313 315
314 return cfpkt_getlen(pkt); 316 return cfpkt_getlen(pkt);
315} 317}
@@ -380,8 +382,7 @@ struct cfpkt *cfpkt_split(struct cfpkt *pkt, u16 pos)
380 return NULL; 382 return NULL;
381 383
382 if (skb->data + pos > skb_tail_pointer(skb)) { 384 if (skb->data + pos > skb_tail_pointer(skb)) {
383 PKT_ERROR(pkt, 385 PKT_ERROR(pkt, "trying to split beyond end of packet\n");
384 "cfpkt_split: trying to split beyond end of packet");
385 return NULL; 386 return NULL;
386 } 387 }
387 388
@@ -455,17 +456,17 @@ int cfpkt_raw_append(struct cfpkt *pkt, void **buf, unsigned int buflen)
455 return -EPROTO; 456 return -EPROTO;
456 /* Make sure SKB is writable */ 457 /* Make sure SKB is writable */
457 if (unlikely(skb_cow_data(skb, 0, &lastskb) < 0)) { 458 if (unlikely(skb_cow_data(skb, 0, &lastskb) < 0)) {
458 PKT_ERROR(pkt, "cfpkt_raw_append: skb_cow_data failed\n"); 459 PKT_ERROR(pkt, "skb_cow_data failed\n");
459 return -EPROTO; 460 return -EPROTO;
460 } 461 }
461 462
462 if (unlikely(skb_linearize(skb) != 0)) { 463 if (unlikely(skb_linearize(skb) != 0)) {
463 PKT_ERROR(pkt, "cfpkt_raw_append: linearize failed\n"); 464 PKT_ERROR(pkt, "linearize failed\n");
464 return -EPROTO; 465 return -EPROTO;
465 } 466 }
466 467
467 if (unlikely(skb_tailroom(skb) < buflen)) { 468 if (unlikely(skb_tailroom(skb) < buflen)) {
468 PKT_ERROR(pkt, "cfpkt_raw_append: buffer too short - failed\n"); 469 PKT_ERROR(pkt, "buffer too short - failed\n");
469 return -EPROTO; 470 return -EPROTO;
470 } 471 }
471 472
@@ -483,14 +484,13 @@ int cfpkt_raw_extract(struct cfpkt *pkt, void **buf, unsigned int buflen)
483 return -EPROTO; 484 return -EPROTO;
484 485
485 if (unlikely(buflen > skb->len)) { 486 if (unlikely(buflen > skb->len)) {
486 PKT_ERROR(pkt, "cfpkt_raw_extract: buflen too large " 487 PKT_ERROR(pkt, "buflen too large - failed\n");
487 "- failed\n");
488 return -EPROTO; 488 return -EPROTO;
489 } 489 }
490 490
491 if (unlikely(buflen > skb_headlen(skb))) { 491 if (unlikely(buflen > skb_headlen(skb))) {
492 if (unlikely(skb_linearize(skb) != 0)) { 492 if (unlikely(skb_linearize(skb) != 0)) {
493 PKT_ERROR(pkt, "cfpkt_raw_extract: linearize failed\n"); 493 PKT_ERROR(pkt, "linearize failed\n");
494 return -EPROTO; 494 return -EPROTO;
495 } 495 }
496 } 496 }
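The PKT_ERROR rework above keeps the macro body wrapped in do { ... } while (0). A short sketch, under the assumption of plain C and a hypothetical REPORT() macro, of why that wrapper matters: it turns several statements into a single statement, so the macro plus its trailing semicolon can sit in an unbraced if/else without breaking the else branch:

#include <stdio.h>

/* illustrative macro in the same shape as PKT_ERROR */
#define REPORT(err, msg)		\
do {					\
	(err) = 1;			\
	fputs((msg), stderr);		\
} while (0)

int main(void)
{
	int err = 0;

	/* without do/while(0), the semicolon after the macro would end the
	 * if-branch and this else would be a syntax error */
	if (err)
		REPORT(err, "unexpected\n");
	else
		fputs("no error yet\n", stderr);

	REPORT(err, "forcing the error path\n");
	return err;
}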
diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c
index eb1602022ac0..bde8481e8d25 100644
--- a/net/caif/cfrfml.c
+++ b/net/caif/cfrfml.c
@@ -4,10 +4,12 @@
4 * License terms: GNU General Public License (GPL) version 2 4 * License terms: GNU General Public License (GPL) version 2
5 */ 5 */
6 6
7#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
8
7#include <linux/stddef.h> 9#include <linux/stddef.h>
8#include <linux/spinlock.h> 10#include <linux/spinlock.h>
9#include <linux/slab.h> 11#include <linux/slab.h>
10#include <linux/unaligned/le_byteshift.h> 12#include <asm/unaligned.h>
11#include <net/caif/caif_layer.h> 13#include <net/caif/caif_layer.h>
12#include <net/caif/cfsrvl.h> 14#include <net/caif/cfsrvl.h>
13#include <net/caif/cfpkt.h> 15#include <net/caif/cfpkt.h>
@@ -48,7 +50,7 @@ struct cflayer *cfrfml_create(u8 channel_id, struct dev_info *dev_info,
48 kzalloc(sizeof(struct cfrfml), GFP_ATOMIC); 50 kzalloc(sizeof(struct cfrfml), GFP_ATOMIC);
49 51
50 if (!this) { 52 if (!this) {
51 pr_warning("CAIF: %s(): Out of memory\n", __func__); 53 pr_warn("Out of memory\n");
52 return NULL; 54 return NULL;
53 } 55 }
54 56
@@ -178,9 +180,7 @@ out:
178 cfpkt_destroy(rfml->incomplete_frm); 180 cfpkt_destroy(rfml->incomplete_frm);
179 rfml->incomplete_frm = NULL; 181 rfml->incomplete_frm = NULL;
180 182
181 pr_info("CAIF: %s(): " 183 pr_info("Connection error %d triggered on RFM link\n", err);
182 "Connection error %d triggered on RFM link\n",
183 __func__, err);
184 184
185 /* Trigger connection error upon failure.*/ 185 /* Trigger connection error upon failure.*/
186 layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, 186 layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND,
@@ -280,9 +280,7 @@ static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt)
280out: 280out:
281 281
282 if (err != 0) { 282 if (err != 0) {
283 pr_info("CAIF: %s(): " 283 pr_info("Connection error %d triggered on RFM link\n", err);
284 "Connection error %d triggered on RFM link\n",
285 __func__, err);
286 /* Trigger connection error upon failure.*/ 284 /* Trigger connection error upon failure.*/
287 285
288 layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, 286 layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND,
diff --git a/net/caif/cfserl.c b/net/caif/cfserl.c
index a11fbd68a13d..9297f7dea9d8 100644
--- a/net/caif/cfserl.c
+++ b/net/caif/cfserl.c
@@ -4,6 +4,8 @@
4 * License terms: GNU General Public License (GPL) version 2 4 * License terms: GNU General Public License (GPL) version 2
5 */ 5 */
6 6
7#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
8
7#include <linux/stddef.h> 9#include <linux/stddef.h>
8#include <linux/spinlock.h> 10#include <linux/spinlock.h>
9#include <linux/slab.h> 11#include <linux/slab.h>
@@ -34,7 +36,7 @@ struct cflayer *cfserl_create(int type, int instance, bool use_stx)
34{ 36{
35 struct cfserl *this = kmalloc(sizeof(struct cfserl), GFP_ATOMIC); 37 struct cfserl *this = kmalloc(sizeof(struct cfserl), GFP_ATOMIC);
36 if (!this) { 38 if (!this) {
37 pr_warning("CAIF: %s(): Out of memory\n", __func__); 39 pr_warn("Out of memory\n");
38 return NULL; 40 return NULL;
39 } 41 }
40 caif_assert(offsetof(struct cfserl, layer) == 0); 42 caif_assert(offsetof(struct cfserl, layer) == 0);
diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c
index f40939a91211..ab5e542526bf 100644
--- a/net/caif/cfsrvl.c
+++ b/net/caif/cfsrvl.c
@@ -4,6 +4,8 @@
4 * License terms: GNU General Public License (GPL) version 2 4 * License terms: GNU General Public License (GPL) version 2
5 */ 5 */
6 6
7#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
8
7#include <linux/kernel.h> 9#include <linux/kernel.h>
8#include <linux/types.h> 10#include <linux/types.h>
9#include <linux/errno.h> 11#include <linux/errno.h>
@@ -79,8 +81,7 @@ static void cfservl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
79 layr->up->ctrlcmd(layr->up, ctrl, phyid); 81 layr->up->ctrlcmd(layr->up, ctrl, phyid);
80 break; 82 break;
81 default: 83 default:
82 pr_warning("CAIF: %s(): " 84 pr_warn("Unexpected ctrl in cfsrvl (%d)\n", ctrl);
83 "Unexpected ctrl in cfsrvl (%d)\n", __func__, ctrl);
84 /* We have both modem and phy flow on, send flow on */ 85 /* We have both modem and phy flow on, send flow on */
85 layr->up->ctrlcmd(layr->up, ctrl, phyid); 86 layr->up->ctrlcmd(layr->up, ctrl, phyid);
86 service->phy_flow_on = true; 87 service->phy_flow_on = true;
@@ -107,14 +108,12 @@ static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl)
107 u8 flow_on = SRVL_FLOW_ON; 108 u8 flow_on = SRVL_FLOW_ON;
108 pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE); 109 pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE);
109 if (!pkt) { 110 if (!pkt) {
110 pr_warning("CAIF: %s(): Out of memory\n", 111 pr_warn("Out of memory\n");
111 __func__);
112 return -ENOMEM; 112 return -ENOMEM;
113 } 113 }
114 114
115 if (cfpkt_add_head(pkt, &flow_on, 1) < 0) { 115 if (cfpkt_add_head(pkt, &flow_on, 1) < 0) {
116 pr_err("CAIF: %s(): Packet is erroneous!\n", 116 pr_err("Packet is erroneous!\n");
117 __func__);
118 cfpkt_destroy(pkt); 117 cfpkt_destroy(pkt);
119 return -EPROTO; 118 return -EPROTO;
120 } 119 }
@@ -131,14 +130,12 @@ static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl)
131 u8 flow_off = SRVL_FLOW_OFF; 130 u8 flow_off = SRVL_FLOW_OFF;
132 pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE); 131 pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE);
133 if (!pkt) { 132 if (!pkt) {
134 pr_warning("CAIF: %s(): Out of memory\n", 133 pr_warn("Out of memory\n");
135 __func__);
136 return -ENOMEM; 134 return -ENOMEM;
137 } 135 }
138 136
139 if (cfpkt_add_head(pkt, &flow_off, 1) < 0) { 137 if (cfpkt_add_head(pkt, &flow_off, 1) < 0) {
140 pr_err("CAIF: %s(): Packet is erroneous!\n", 138 pr_err("Packet is erroneous!\n");
141 __func__);
142 cfpkt_destroy(pkt); 139 cfpkt_destroy(pkt);
143 return -EPROTO; 140 return -EPROTO;
144 } 141 }
diff --git a/net/caif/cfutill.c b/net/caif/cfutill.c
index 02795aff57a4..efad410e4c82 100644
--- a/net/caif/cfutill.c
+++ b/net/caif/cfutill.c
@@ -4,6 +4,8 @@
4 * License terms: GNU General Public License (GPL) version 2 4 * License terms: GNU General Public License (GPL) version 2
5 */ 5 */
6 6
7#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
8
7#include <linux/kernel.h> 9#include <linux/kernel.h>
8#include <linux/types.h> 10#include <linux/types.h>
9#include <linux/slab.h> 11#include <linux/slab.h>
@@ -26,7 +28,7 @@ struct cflayer *cfutill_create(u8 channel_id, struct dev_info *dev_info)
26{ 28{
27 struct cfsrvl *util = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC); 29 struct cfsrvl *util = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
28 if (!util) { 30 if (!util) {
29 pr_warning("CAIF: %s(): Out of memory\n", __func__); 31 pr_warn("Out of memory\n");
30 return NULL; 32 return NULL;
31 } 33 }
32 caif_assert(offsetof(struct cfsrvl, layer) == 0); 34 caif_assert(offsetof(struct cfsrvl, layer) == 0);
@@ -47,7 +49,7 @@ static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt)
47 caif_assert(layr->up->receive != NULL); 49 caif_assert(layr->up->receive != NULL);
48 caif_assert(layr->up->ctrlcmd != NULL); 50 caif_assert(layr->up->ctrlcmd != NULL);
49 if (cfpkt_extr_head(pkt, &cmd, 1) < 0) { 51 if (cfpkt_extr_head(pkt, &cmd, 1) < 0) {
50 pr_err("CAIF: %s(): Packet is erroneous!\n", __func__); 52 pr_err("Packet is erroneous!\n");
51 cfpkt_destroy(pkt); 53 cfpkt_destroy(pkt);
52 return -EPROTO; 54 return -EPROTO;
53 } 55 }
@@ -64,16 +66,14 @@ static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt)
64 cfpkt_destroy(pkt); 66 cfpkt_destroy(pkt);
65 return 0; 67 return 0;
66 case UTIL_REMOTE_SHUTDOWN: /* Remote Shutdown Request */ 68 case UTIL_REMOTE_SHUTDOWN: /* Remote Shutdown Request */
67 pr_err("CAIF: %s(): REMOTE SHUTDOWN REQUEST RECEIVED\n", 69 pr_err("REMOTE SHUTDOWN REQUEST RECEIVED\n");
68 __func__);
69 layr->ctrlcmd(layr, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, 0); 70 layr->ctrlcmd(layr, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, 0);
70 service->open = false; 71 service->open = false;
71 cfpkt_destroy(pkt); 72 cfpkt_destroy(pkt);
72 return 0; 73 return 0;
73 default: 74 default:
74 cfpkt_destroy(pkt); 75 cfpkt_destroy(pkt);
75 pr_warning("CAIF: %s(): Unknown service control %d (0x%x)\n", 76 pr_warn("Unknown service control %d (0x%x)\n", cmd, cmd);
76 __func__, cmd, cmd);
77 return -EPROTO; 77 return -EPROTO;
78 } 78 }
79} 79}
diff --git a/net/caif/cfveil.c b/net/caif/cfveil.c
index 77cc09faac9a..3b425b189a99 100644
--- a/net/caif/cfveil.c
+++ b/net/caif/cfveil.c
@@ -4,6 +4,8 @@
4 * License terms: GNU General Public License (GPL) version 2 4 * License terms: GNU General Public License (GPL) version 2
5 */ 5 */
6 6
7#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
8
7#include <linux/stddef.h> 9#include <linux/stddef.h>
8#include <linux/slab.h> 10#include <linux/slab.h>
9#include <net/caif/caif_layer.h> 11#include <net/caif/caif_layer.h>
@@ -25,7 +27,7 @@ struct cflayer *cfvei_create(u8 channel_id, struct dev_info *dev_info)
25{ 27{
26 struct cfsrvl *vei = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC); 28 struct cfsrvl *vei = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
27 if (!vei) { 29 if (!vei) {
28 pr_warning("CAIF: %s(): Out of memory\n", __func__); 30 pr_warn("Out of memory\n");
29 return NULL; 31 return NULL;
30 } 32 }
31 caif_assert(offsetof(struct cfsrvl, layer) == 0); 33 caif_assert(offsetof(struct cfsrvl, layer) == 0);
@@ -47,7 +49,7 @@ static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt)
47 49
48 50
49 if (cfpkt_extr_head(pkt, &cmd, 1) < 0) { 51 if (cfpkt_extr_head(pkt, &cmd, 1) < 0) {
50 pr_err("CAIF: %s(): Packet is erroneous!\n", __func__); 52 pr_err("Packet is erroneous!\n");
51 cfpkt_destroy(pkt); 53 cfpkt_destroy(pkt);
52 return -EPROTO; 54 return -EPROTO;
53 } 55 }
@@ -67,8 +69,7 @@ static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt)
67 cfpkt_destroy(pkt); 69 cfpkt_destroy(pkt);
68 return 0; 70 return 0;
69 default: /* SET RS232 PIN */ 71 default: /* SET RS232 PIN */
70 pr_warning("CAIF: %s():Unknown VEI control packet %d (0x%x)!\n", 72 pr_warn("Unknown VEI control packet %d (0x%x)!\n", cmd, cmd);
71 __func__, cmd, cmd);
72 cfpkt_destroy(pkt); 73 cfpkt_destroy(pkt);
73 return -EPROTO; 74 return -EPROTO;
74 } 75 }
@@ -86,7 +87,7 @@ static int cfvei_transmit(struct cflayer *layr, struct cfpkt *pkt)
86 caif_assert(layr->dn->transmit != NULL); 87 caif_assert(layr->dn->transmit != NULL);
87 88
88 if (cfpkt_add_head(pkt, &tmp, 1) < 0) { 89 if (cfpkt_add_head(pkt, &tmp, 1) < 0) {
89 pr_err("CAIF: %s(): Packet is erroneous!\n", __func__); 90 pr_err("Packet is erroneous!\n");
90 return -EPROTO; 91 return -EPROTO;
91 } 92 }
92 93
diff --git a/net/caif/cfvidl.c b/net/caif/cfvidl.c
index ada6ee2d48f5..bf6fef2a0eff 100644
--- a/net/caif/cfvidl.c
+++ b/net/caif/cfvidl.c
@@ -4,6 +4,8 @@
4 * License terms: GNU General Public License (GPL) version 2 4 * License terms: GNU General Public License (GPL) version 2
5 */ 5 */
6 6
7#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
8
7#include <linux/kernel.h> 9#include <linux/kernel.h>
8#include <linux/types.h> 10#include <linux/types.h>
9#include <linux/slab.h> 11#include <linux/slab.h>
@@ -21,7 +23,7 @@ struct cflayer *cfvidl_create(u8 channel_id, struct dev_info *dev_info)
21{ 23{
22 struct cfsrvl *vid = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC); 24 struct cfsrvl *vid = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
23 if (!vid) { 25 if (!vid) {
24 pr_warning("CAIF: %s(): Out of memory\n", __func__); 26 pr_warn("Out of memory\n");
25 return NULL; 27 return NULL;
26 } 28 }
27 caif_assert(offsetof(struct cfsrvl, layer) == 0); 29 caif_assert(offsetof(struct cfsrvl, layer) == 0);
@@ -38,7 +40,7 @@ static int cfvidl_receive(struct cflayer *layr, struct cfpkt *pkt)
38{ 40{
39 u32 videoheader; 41 u32 videoheader;
40 if (cfpkt_extr_head(pkt, &videoheader, 4) < 0) { 42 if (cfpkt_extr_head(pkt, &videoheader, 4) < 0) {
41 pr_err("CAIF: %s(): Packet is erroneous!\n", __func__); 43 pr_err("Packet is erroneous!\n");
42 cfpkt_destroy(pkt); 44 cfpkt_destroy(pkt);
43 return -EPROTO; 45 return -EPROTO;
44 } 46 }
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 4293e190ec53..84a422c98941 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -5,6 +5,8 @@
5 * License terms: GNU General Public License (GPL) version 2 5 * License terms: GNU General Public License (GPL) version 2
6 */ 6 */
7 7
8#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
9
8#include <linux/version.h> 10#include <linux/version.h>
9#include <linux/fs.h> 11#include <linux/fs.h>
10#include <linux/init.h> 12#include <linux/init.h>
@@ -28,9 +30,6 @@
28#define CONNECT_TIMEOUT (5 * HZ) 30#define CONNECT_TIMEOUT (5 * HZ)
29#define CAIF_NET_DEFAULT_QUEUE_LEN 500 31#define CAIF_NET_DEFAULT_QUEUE_LEN 500
30 32
31#undef pr_debug
32#define pr_debug pr_warning
33
34/*This list is protected by the rtnl lock. */ 33/*This list is protected by the rtnl lock. */
35static LIST_HEAD(chnl_net_list); 34static LIST_HEAD(chnl_net_list);
36 35
@@ -142,8 +141,7 @@ static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow,
142 int phyid) 141 int phyid)
143{ 142{
144 struct chnl_net *priv = container_of(layr, struct chnl_net, chnl); 143 struct chnl_net *priv = container_of(layr, struct chnl_net, chnl);
145 pr_debug("CAIF: %s(): NET flowctrl func called flow: %s\n", 144 pr_debug("NET flowctrl func called flow: %s\n",
146 __func__,
147 flow == CAIF_CTRLCMD_FLOW_ON_IND ? "ON" : 145 flow == CAIF_CTRLCMD_FLOW_ON_IND ? "ON" :
148 flow == CAIF_CTRLCMD_INIT_RSP ? "INIT" : 146 flow == CAIF_CTRLCMD_INIT_RSP ? "INIT" :
149 flow == CAIF_CTRLCMD_FLOW_OFF_IND ? "OFF" : 147 flow == CAIF_CTRLCMD_FLOW_OFF_IND ? "OFF" :
@@ -196,12 +194,12 @@ static int chnl_net_start_xmit(struct sk_buff *skb, struct net_device *dev)
196 priv = netdev_priv(dev); 194 priv = netdev_priv(dev);
197 195
198 if (skb->len > priv->netdev->mtu) { 196 if (skb->len > priv->netdev->mtu) {
199 pr_warning("CAIF: %s(): Size of skb exceeded MTU\n", __func__); 197 pr_warn("Size of skb exceeded MTU\n");
200 return -ENOSPC; 198 return -ENOSPC;
201 } 199 }
202 200
203 if (!priv->flowenabled) { 201 if (!priv->flowenabled) {
204 pr_debug("CAIF: %s(): dropping packets flow off\n", __func__); 202 pr_debug("dropping packets flow off\n");
205 return NETDEV_TX_BUSY; 203 return NETDEV_TX_BUSY;
206 } 204 }
207 205
@@ -237,7 +235,7 @@ static int chnl_net_open(struct net_device *dev)
237 ASSERT_RTNL(); 235 ASSERT_RTNL();
238 priv = netdev_priv(dev); 236 priv = netdev_priv(dev);
239 if (!priv) { 237 if (!priv) {
240 pr_debug("CAIF: %s(): chnl_net_open: no priv\n", __func__); 238 pr_debug("chnl_net_open: no priv\n");
241 return -ENODEV; 239 return -ENODEV;
242 } 240 }
243 241
@@ -246,18 +244,17 @@ static int chnl_net_open(struct net_device *dev)
246 result = caif_connect_client(&priv->conn_req, &priv->chnl, 244 result = caif_connect_client(&priv->conn_req, &priv->chnl,
247 &llifindex, &headroom, &tailroom); 245 &llifindex, &headroom, &tailroom);
248 if (result != 0) { 246 if (result != 0) {
249 pr_debug("CAIF: %s(): err: " 247 pr_debug("err: "
250 "Unable to register and open device," 248 "Unable to register and open device,"
251 " Err:%d\n", 249 " Err:%d\n",
252 __func__, 250 result);
253 result);
254 goto error; 251 goto error;
255 } 252 }
256 253
257 lldev = dev_get_by_index(dev_net(dev), llifindex); 254 lldev = dev_get_by_index(dev_net(dev), llifindex);
258 255
259 if (lldev == NULL) { 256 if (lldev == NULL) {
260 pr_debug("CAIF: %s(): no interface?\n", __func__); 257 pr_debug("no interface?\n");
261 result = -ENODEV; 258 result = -ENODEV;
262 goto error; 259 goto error;
263 } 260 }
@@ -279,9 +276,7 @@ static int chnl_net_open(struct net_device *dev)
279 dev_put(lldev); 276 dev_put(lldev);
280 277
281 if (mtu < 100) { 278 if (mtu < 100) {
282 pr_warning("CAIF: %s(): " 279 pr_warn("CAIF Interface MTU too small (%d)\n", mtu);
283 "CAIF Interface MTU too small (%d)\n",
284 __func__, mtu);
285 result = -ENODEV; 280 result = -ENODEV;
286 goto error; 281 goto error;
287 } 282 }
@@ -296,33 +291,32 @@ static int chnl_net_open(struct net_device *dev)
296 rtnl_lock(); 291 rtnl_lock();
297 292
298 if (result == -ERESTARTSYS) { 293 if (result == -ERESTARTSYS) {
299 pr_debug("CAIF: %s(): wait_event_interruptible" 294 pr_debug("wait_event_interruptible woken by a signal\n");
300 " woken by a signal\n", __func__);
301 result = -ERESTARTSYS; 295 result = -ERESTARTSYS;
302 goto error; 296 goto error;
303 } 297 }
304 298
305 if (result == 0) { 299 if (result == 0) {
306 pr_debug("CAIF: %s(): connect timeout\n", __func__); 300 pr_debug("connect timeout\n");
307 caif_disconnect_client(&priv->chnl); 301 caif_disconnect_client(&priv->chnl);
308 priv->state = CAIF_DISCONNECTED; 302 priv->state = CAIF_DISCONNECTED;
309 pr_debug("CAIF: %s(): state disconnected\n", __func__); 303 pr_debug("state disconnected\n");
310 result = -ETIMEDOUT; 304 result = -ETIMEDOUT;
311 goto error; 305 goto error;
312 } 306 }
313 307
314 if (priv->state != CAIF_CONNECTED) { 308 if (priv->state != CAIF_CONNECTED) {
315 pr_debug("CAIF: %s(): connect failed\n", __func__); 309 pr_debug("connect failed\n");
316 result = -ECONNREFUSED; 310 result = -ECONNREFUSED;
317 goto error; 311 goto error;
318 } 312 }
319 pr_debug("CAIF: %s(): CAIF Netdevice connected\n", __func__); 313 pr_debug("CAIF Netdevice connected\n");
320 return 0; 314 return 0;
321 315
322error: 316error:
323 caif_disconnect_client(&priv->chnl); 317 caif_disconnect_client(&priv->chnl);
324 priv->state = CAIF_DISCONNECTED; 318 priv->state = CAIF_DISCONNECTED;
325 pr_debug("CAIF: %s(): state disconnected\n", __func__); 319 pr_debug("state disconnected\n");
326 return result; 320 return result;
327 321
328} 322}
@@ -413,7 +407,7 @@ static void caif_netlink_parms(struct nlattr *data[],
413 struct caif_connect_request *conn_req) 407 struct caif_connect_request *conn_req)
414{ 408{
415 if (!data) { 409 if (!data) {
416 pr_warning("CAIF: %s: no params data found\n", __func__); 410 pr_warn("no params data found\n");
417 return; 411 return;
418 } 412 }
419 if (data[IFLA_CAIF_IPV4_CONNID]) 413 if (data[IFLA_CAIF_IPV4_CONNID])
@@ -442,8 +436,7 @@ static int ipcaif_newlink(struct net *src_net, struct net_device *dev,
442 436
443 ret = register_netdevice(dev); 437 ret = register_netdevice(dev);
444 if (ret) 438 if (ret)
445 pr_warning("CAIF: %s(): device rtml registration failed\n", 439 pr_warn("device rtml registration failed\n");
446 __func__);
447 return ret; 440 return ret;
448} 441}
449 442
diff --git a/net/can/raw.c b/net/can/raw.c
index a10e3338f084..e88f610fdb7b 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -90,23 +90,39 @@ struct raw_sock {
90 can_err_mask_t err_mask; 90 can_err_mask_t err_mask;
91}; 91};
92 92
93/*
94 * Return pointer to store the extra msg flags for raw_recvmsg().
95 * We use the space of one unsigned int beyond the 'struct sockaddr_can'
96 * in skb->cb.
97 */
98static inline unsigned int *raw_flags(struct sk_buff *skb)
99{
100 BUILD_BUG_ON(sizeof(skb->cb) <= (sizeof(struct sockaddr_can) +
101 sizeof(unsigned int)));
102
103 /* return pointer after struct sockaddr_can */
104 return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]);
105}
106
93static inline struct raw_sock *raw_sk(const struct sock *sk) 107static inline struct raw_sock *raw_sk(const struct sock *sk)
94{ 108{
95 return (struct raw_sock *)sk; 109 return (struct raw_sock *)sk;
96} 110}
97 111
98static void raw_rcv(struct sk_buff *skb, void *data) 112static void raw_rcv(struct sk_buff *oskb, void *data)
99{ 113{
100 struct sock *sk = (struct sock *)data; 114 struct sock *sk = (struct sock *)data;
101 struct raw_sock *ro = raw_sk(sk); 115 struct raw_sock *ro = raw_sk(sk);
102 struct sockaddr_can *addr; 116 struct sockaddr_can *addr;
117 struct sk_buff *skb;
118 unsigned int *pflags;
103 119
104 /* check the received tx sock reference */ 120 /* check the received tx sock reference */
105 if (!ro->recv_own_msgs && skb->sk == sk) 121 if (!ro->recv_own_msgs && oskb->sk == sk)
106 return; 122 return;
107 123
108 /* clone the given skb to be able to enqueue it into the rcv queue */ 124 /* clone the given skb to be able to enqueue it into the rcv queue */
109 skb = skb_clone(skb, GFP_ATOMIC); 125 skb = skb_clone(oskb, GFP_ATOMIC);
110 if (!skb) 126 if (!skb)
111 return; 127 return;
112 128
@@ -123,6 +139,14 @@ static void raw_rcv(struct sk_buff *skb, void *data)
123 addr->can_family = AF_CAN; 139 addr->can_family = AF_CAN;
124 addr->can_ifindex = skb->dev->ifindex; 140 addr->can_ifindex = skb->dev->ifindex;
125 141
142 /* add CAN specific message flags for raw_recvmsg() */
143 pflags = raw_flags(skb);
144 *pflags = 0;
145 if (oskb->sk)
146 *pflags |= MSG_DONTROUTE;
147 if (oskb->sk == sk)
148 *pflags |= MSG_CONFIRM;
149
126 if (sock_queue_rcv_skb(sk, skb) < 0) 150 if (sock_queue_rcv_skb(sk, skb) < 0)
127 kfree_skb(skb); 151 kfree_skb(skb);
128} 152}
@@ -647,12 +671,12 @@ static int raw_sendmsg(struct kiocb *iocb, struct socket *sock,
647 err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); 671 err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
648 if (err < 0) 672 if (err < 0)
649 goto free_skb; 673 goto free_skb;
650 err = sock_tx_timestamp(msg, sk, skb_tx(skb)); 674 err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
651 if (err < 0) 675 if (err < 0)
652 goto free_skb; 676 goto free_skb;
653 677
654 /* to be able to check the received tx sock reference in raw_rcv() */ 678 /* to be able to check the received tx sock reference in raw_rcv() */
655 skb_tx(skb)->prevent_sk_orphan = 1; 679 skb_shinfo(skb)->tx_flags |= SKBTX_DRV_NEEDS_SK_REF;
656 680
657 skb->dev = dev; 681 skb->dev = dev;
658 skb->sk = sk; 682 skb->sk = sk;
@@ -707,6 +731,9 @@ static int raw_recvmsg(struct kiocb *iocb, struct socket *sock,
707 memcpy(msg->msg_name, skb->cb, msg->msg_namelen); 731 memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
708 } 732 }
709 733
734 /* assign the flags that have been recorded in raw_rcv() */
735 msg->msg_flags |= *(raw_flags(skb));
736
710 skb_free_datagram(sk, skb); 737 skb_free_datagram(sk, skb);
711 738
712 return size; 739 return size;
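The new raw_flags() helper above stashes one unsigned int of per-message flags directly after the struct sockaddr_can inside the fixed skb->cb control buffer, and raw_recvmsg() reads it back. A hedged userspace sketch of that layout with stand-in types (the names here are hypothetical, not kernel APIs, and alignment concerns are glossed over as they are for skb->cb use in general):

#include <stdio.h>

struct sockaddr_can_stub { int can_family; int can_ifindex; }; /* stand-in, not <linux/can.h> */

struct fake_skb { char cb[48]; };      /* plays the role of skb->cb */

/* return a pointer to the unsigned int stored right after the sockaddr,
 * mirroring raw_flags() above */
static unsigned int *raw_flags_stub(struct fake_skb *skb)
{
	_Static_assert(sizeof(((struct fake_skb *)0)->cb) >
		       sizeof(struct sockaddr_can_stub) + sizeof(unsigned int),
		       "control buffer too small");
	return (unsigned int *)(&((struct sockaddr_can_stub *)skb->cb)[1]);
}

int main(void)
{
	struct fake_skb skb = { { 0 } };

	*raw_flags_stub(&skb) = 0;
	*raw_flags_stub(&skb) |= 0x4;   /* pretend MSG_DONTROUTE was set */
	printf("stored flags: %#x\n", *raw_flags_stub(&skb));
	return 0;
}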
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
new file mode 100644
index 000000000000..ad424049b0cf
--- /dev/null
+++ b/net/ceph/Kconfig
@@ -0,0 +1,28 @@
1config CEPH_LIB
2 tristate "Ceph core library (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL
4 select LIBCRC32C
5 select CRYPTO_AES
6 select CRYPTO
7 default n
8 help
9 Choose Y or M here to include cephlib, which provides the
10 common functionality to both the Ceph filesystem and
11 to the rados block device (rbd).
12
13 More information at http://ceph.newdream.net/.
14
15 If unsure, say N.
16
17config CEPH_LIB_PRETTYDEBUG
18 bool "Include file:line in ceph debug output"
19 depends on CEPH_LIB
20 default n
21 help
22 If you say Y here, debug output will include a filename and
23 line to aid debugging. This increases kernel size and slows
24 execution slightly when debug call sites are enabled (e.g.,
25 via CONFIG_DYNAMIC_DEBUG).
26
27 If unsure, say N.
28
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
new file mode 100644
index 000000000000..aab1cabb8035
--- /dev/null
+++ b/net/ceph/Makefile
@@ -0,0 +1,37 @@
1#
2# Makefile for CEPH filesystem.
3#
4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_LIB) += libceph.o
8
9libceph-objs := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
10 mon_client.o \
11 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
12 debugfs.o \
13 auth.o auth_none.o \
14 crypto.o armor.o \
15 auth_x.o \
16 ceph_fs.o ceph_strings.o ceph_hash.o \
17 pagevec.o
18
19else
20#Otherwise we were called directly from the command
21# line; invoke the kernel build system.
22
23KERNELDIR ?= /lib/modules/$(shell uname -r)/build
24PWD := $(shell pwd)
25
26default: all
27
28all:
29 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules
30
31modules_install:
32 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules_install
33
34clean:
35 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
36
37endif
diff --git a/net/ceph/armor.c b/net/ceph/armor.c
new file mode 100644
index 000000000000..eb2a666b0be7
--- /dev/null
+++ b/net/ceph/armor.c
@@ -0,0 +1,103 @@
1
2#include <linux/errno.h>
3
4int ceph_armor(char *dst, const char *src, const char *end);
5int ceph_unarmor(char *dst, const char *src, const char *end);
6
7/*
8 * base64 encode/decode.
9 */
10
11static const char *pem_key =
12 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
13
14static int encode_bits(int c)
15{
16 return pem_key[c];
17}
18
19static int decode_bits(char c)
20{
21 if (c >= 'A' && c <= 'Z')
22 return c - 'A';
23 if (c >= 'a' && c <= 'z')
24 return c - 'a' + 26;
25 if (c >= '0' && c <= '9')
26 return c - '0' + 52;
27 if (c == '+')
28 return 62;
29 if (c == '/')
30 return 63;
31 if (c == '=')
32 return 0; /* just non-negative, please */
33 return -EINVAL;
34}
35
36int ceph_armor(char *dst, const char *src, const char *end)
37{
38 int olen = 0;
39 int line = 0;
40
41 while (src < end) {
42 unsigned char a, b, c;
43
44 a = *src++;
45 *dst++ = encode_bits(a >> 2);
46 if (src < end) {
47 b = *src++;
48 *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
49 if (src < end) {
50 c = *src++;
51 *dst++ = encode_bits(((b & 15) << 2) |
52 (c >> 6));
53 *dst++ = encode_bits(c & 63);
54 } else {
55 *dst++ = encode_bits((b & 15) << 2);
56 *dst++ = '=';
57 }
58 } else {
59 *dst++ = encode_bits(((a & 3) << 4));
60 *dst++ = '=';
61 *dst++ = '=';
62 }
63 olen += 4;
64 line += 4;
65 if (line == 64) {
66 line = 0;
67 *(dst++) = '\n';
68 olen++;
69 }
70 }
71 return olen;
72}
73
74int ceph_unarmor(char *dst, const char *src, const char *end)
75{
76 int olen = 0;
77
78 while (src < end) {
79 int a, b, c, d;
80
81 if (src < end && src[0] == '\n')
82 src++;
83 if (src + 4 > end)
84 return -EINVAL;
85 a = decode_bits(src[0]);
86 b = decode_bits(src[1]);
87 c = decode_bits(src[2]);
88 d = decode_bits(src[3]);
89 if (a < 0 || b < 0 || c < 0 || d < 0)
90 return -EINVAL;
91
92 *dst++ = (a << 2) | (b >> 4);
93 if (src[2] == '=')
94 return olen + 1;
95 *dst++ = ((b & 15) << 4) | (c >> 2);
96 if (src[3] == '=')
97 return olen + 2;
98 *dst++ = ((c & 3) << 6) | d;
99 olen += 3;
100 src += 4;
101 }
102 return olen;
103}
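ceph_armor()/ceph_unarmor() above implement a newline-wrapped base64 variant over the prototypes declared at the top of the file. A hypothetical standalone harness (not part of the patch) that links against those two routines and round-trips a short string could look like this:

#include <stdio.h>
#include <string.h>

int ceph_armor(char *dst, const char *src, const char *end);
int ceph_unarmor(char *dst, const char *src, const char *end);

int main(void)
{
	const char msg[] = "ceph base64 test";
	char enc[128], dec[128];
	int elen, dlen;

	elen = ceph_armor(enc, msg, msg + strlen(msg));
	dlen = ceph_unarmor(dec, enc, enc + elen);

	printf("armored to %d bytes: %.*s\n", elen, elen, enc);
	printf("unarmored %d bytes: %.*s\n", dlen, dlen, dec);
	return (dlen == (int)strlen(msg) && !memcmp(dec, msg, dlen)) ? 0 : 1;
}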
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
new file mode 100644
index 000000000000..549c1f43e1d5
--- /dev/null
+++ b/net/ceph/auth.c
@@ -0,0 +1,259 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/module.h>
4#include <linux/err.h>
5#include <linux/slab.h>
6
7#include <linux/ceph/types.h>
8#include <linux/ceph/decode.h>
9#include <linux/ceph/libceph.h>
10#include <linux/ceph/messenger.h>
11#include "auth_none.h"
12#include "auth_x.h"
13
14
15/*
16 * get protocol handler
17 */
18static u32 supported_protocols[] = {
19 CEPH_AUTH_NONE,
20 CEPH_AUTH_CEPHX
21};
22
23static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
24{
25 switch (protocol) {
26 case CEPH_AUTH_NONE:
27 return ceph_auth_none_init(ac);
28 case CEPH_AUTH_CEPHX:
29 return ceph_x_init(ac);
30 default:
31 return -ENOENT;
32 }
33}
34
35/*
36 * setup, teardown.
37 */
38struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
39{
40 struct ceph_auth_client *ac;
41 int ret;
42
43 dout("auth_init name '%s' secret '%s'\n", name, secret);
44
45 ret = -ENOMEM;
46 ac = kzalloc(sizeof(*ac), GFP_NOFS);
47 if (!ac)
48 goto out;
49
50 ac->negotiating = true;
51 if (name)
52 ac->name = name;
53 else
54 ac->name = CEPH_AUTH_NAME_DEFAULT;
55 dout("auth_init name %s secret %s\n", ac->name, secret);
56 ac->secret = secret;
57 return ac;
58
59out:
60 return ERR_PTR(ret);
61}
62
63void ceph_auth_destroy(struct ceph_auth_client *ac)
64{
65 dout("auth_destroy %p\n", ac);
66 if (ac->ops)
67 ac->ops->destroy(ac);
68 kfree(ac);
69}
70
71/*
72 * Reset occurs when reconnecting to the monitor.
73 */
74void ceph_auth_reset(struct ceph_auth_client *ac)
75{
76 dout("auth_reset %p\n", ac);
77 if (ac->ops && !ac->negotiating)
78 ac->ops->reset(ac);
79 ac->negotiating = true;
80}
81
82int ceph_entity_name_encode(const char *name, void **p, void *end)
83{
84 int len = strlen(name);
85
86 if (*p + 2*sizeof(u32) + len > end)
87 return -ERANGE;
88 ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
89 ceph_encode_32(p, len);
90 ceph_encode_copy(p, name, len);
91 return 0;
92}
93
94/*
95 * Initiate protocol negotiation with monitor. Include entity name
96 * and list supported protocols.
97 */
98int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
99{
100 struct ceph_mon_request_header *monhdr = buf;
101 void *p = monhdr + 1, *end = buf + len, *lenp;
102 int i, num;
103 int ret;
104
105 dout("auth_build_hello\n");
106 monhdr->have_version = 0;
107 monhdr->session_mon = cpu_to_le16(-1);
108 monhdr->session_mon_tid = 0;
109
110 ceph_encode_32(&p, 0); /* no protocol, yet */
111
112 lenp = p;
113 p += sizeof(u32);
114
115 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
116 ceph_encode_8(&p, 1);
117 num = ARRAY_SIZE(supported_protocols);
118 ceph_encode_32(&p, num);
119 ceph_decode_need(&p, end, num * sizeof(u32), bad);
120 for (i = 0; i < num; i++)
121 ceph_encode_32(&p, supported_protocols[i]);
122
123 ret = ceph_entity_name_encode(ac->name, &p, end);
124 if (ret < 0)
125 return ret;
126 ceph_decode_need(&p, end, sizeof(u64), bad);
127 ceph_encode_64(&p, ac->global_id);
128
129 ceph_encode_32(&lenp, p - lenp - sizeof(u32));
130 return p - buf;
131
132bad:
133 return -ERANGE;
134}
135
136static int ceph_build_auth_request(struct ceph_auth_client *ac,
137 void *msg_buf, size_t msg_len)
138{
139 struct ceph_mon_request_header *monhdr = msg_buf;
140 void *p = monhdr + 1;
141 void *end = msg_buf + msg_len;
142 int ret;
143
144 monhdr->have_version = 0;
145 monhdr->session_mon = cpu_to_le16(-1);
146 monhdr->session_mon_tid = 0;
147
148 ceph_encode_32(&p, ac->protocol);
149
150 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
151 if (ret < 0) {
152 pr_err("error %d building auth method %s request\n", ret,
153 ac->ops->name);
154 return ret;
155 }
156 dout(" built request %d bytes\n", ret);
157 ceph_encode_32(&p, ret);
158 return p + ret - msg_buf;
159}
160
161/*
162 * Handle auth message from monitor.
163 */
164int ceph_handle_auth_reply(struct ceph_auth_client *ac,
165 void *buf, size_t len,
166 void *reply_buf, size_t reply_len)
167{
168 void *p = buf;
169 void *end = buf + len;
170 int protocol;
171 s32 result;
172 u64 global_id;
173 void *payload, *payload_end;
174 int payload_len;
175 char *result_msg;
176 int result_msg_len;
177 int ret = -EINVAL;
178
179 dout("handle_auth_reply %p %p\n", p, end);
180 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
181 protocol = ceph_decode_32(&p);
182 result = ceph_decode_32(&p);
183 global_id = ceph_decode_64(&p);
184 payload_len = ceph_decode_32(&p);
185 payload = p;
186 p += payload_len;
187 ceph_decode_need(&p, end, sizeof(u32), bad);
188 result_msg_len = ceph_decode_32(&p);
189 result_msg = p;
190 p += result_msg_len;
191 if (p != end)
192 goto bad;
193
194 dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
195 result_msg, global_id, payload_len);
196
197 payload_end = payload + payload_len;
198
199 if (global_id && ac->global_id != global_id) {
200 dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
201 ac->global_id = global_id;
202 }
203
204 if (ac->negotiating) {
205 /* server does not support our protocols? */
206 if (!protocol && result < 0) {
207 ret = result;
208 goto out;
209 }
210 /* set up (new) protocol handler? */
211 if (ac->protocol && ac->protocol != protocol) {
212 ac->ops->destroy(ac);
213 ac->protocol = 0;
214 ac->ops = NULL;
215 }
216 if (ac->protocol != protocol) {
217 ret = ceph_auth_init_protocol(ac, protocol);
218 if (ret) {
219 pr_err("error %d on auth protocol %d init\n",
220 ret, protocol);
221 goto out;
222 }
223 }
224
225 ac->negotiating = false;
226 }
227
228 ret = ac->ops->handle_reply(ac, result, payload, payload_end);
229 if (ret == -EAGAIN) {
230 return ceph_build_auth_request(ac, reply_buf, reply_len);
231 } else if (ret) {
232 pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
233 return ret;
234 }
235 return 0;
236
237bad:
238 pr_err("failed to decode auth msg\n");
239out:
240 return ret;
241}
242
243int ceph_build_auth(struct ceph_auth_client *ac,
244 void *msg_buf, size_t msg_len)
245{
246 if (!ac->protocol)
247 return ceph_auth_build_hello(ac, msg_buf, msg_len);
248 BUG_ON(!ac->ops);
249 if (ac->ops->should_authenticate(ac))
250 return ceph_build_auth_request(ac, msg_buf, msg_len);
251 return 0;
252}
253
254int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
255{
256 if (!ac->ops)
257 return 0;
258 return ac->ops->is_authenticated(ac);
259}
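ceph_auth_build_hello() above uses a common wire-encoding idiom: it remembers where a 32-bit length field belongs (lenp), skips past it, encodes the payload, then backpatches the length once the payload size is known. A userspace sketch of that idiom under simplifying assumptions (little-endian host, simplified put32() helper, protocol numbers shown only as illustrative values):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void put32(unsigned char **p, uint32_t v)
{
	memcpy(*p, &v, sizeof(v));      /* little-endian host assumed for brevity */
	*p += sizeof(v);
}

int main(void)
{
	unsigned char buf[64], *p = buf, *lenp;
	uint32_t payload_len;

	put32(&p, 0);                   /* "no protocol, yet", as in the hello */

	lenp = p;                       /* remember where the length goes */
	p += sizeof(uint32_t);          /* leave room, fill it in later */

	put32(&p, 2);                   /* payload: number of supported protocols */
	put32(&p, 1);                   /* e.g. CEPH_AUTH_NONE (illustrative value) */
	put32(&p, 2);                   /* e.g. CEPH_AUTH_CEPHX (illustrative value) */

	/* backpatch the payload length, like ceph_encode_32(&lenp, ...) above */
	payload_len = (uint32_t)(p - lenp - sizeof(uint32_t));
	put32(&lenp, payload_len);

	printf("hello-style message: %ld bytes total, %u byte payload\n",
	       (long)(p - buf), payload_len);
	return 0;
}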
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
new file mode 100644
index 000000000000..214c2bb43d62
--- /dev/null
+++ b/net/ceph/auth_none.c
@@ -0,0 +1,132 @@
1
2#include <linux/ceph/ceph_debug.h>
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include <linux/ceph/decode.h>
10#include <linux/ceph/auth.h>
11
12#include "auth_none.h"
13
14static void reset(struct ceph_auth_client *ac)
15{
16 struct ceph_auth_none_info *xi = ac->private;
17
18 xi->starting = true;
19 xi->built_authorizer = false;
20}
21
22static void destroy(struct ceph_auth_client *ac)
23{
24 kfree(ac->private);
25 ac->private = NULL;
26}
27
28static int is_authenticated(struct ceph_auth_client *ac)
29{
30 struct ceph_auth_none_info *xi = ac->private;
31
32 return !xi->starting;
33}
34
35static int should_authenticate(struct ceph_auth_client *ac)
36{
37 struct ceph_auth_none_info *xi = ac->private;
38
39 return xi->starting;
40}
41
42/*
43 * the generic auth code decodes the global_id, and we carry no actual
44 * authentication state, so nothing happens here.
45 */
46static int handle_reply(struct ceph_auth_client *ac, int result,
47 void *buf, void *end)
48{
49 struct ceph_auth_none_info *xi = ac->private;
50
51 xi->starting = false;
52 return result;
53}
54
55/*
56 * build an 'authorizer' with our entity_name and global_id. we can
57 * reuse a single static copy since it is identical for all services
58 * we connect to.
59 */
60static int ceph_auth_none_create_authorizer(
61 struct ceph_auth_client *ac, int peer_type,
62 struct ceph_authorizer **a,
63 void **buf, size_t *len,
64 void **reply_buf, size_t *reply_len)
65{
66 struct ceph_auth_none_info *ai = ac->private;
67 struct ceph_none_authorizer *au = &ai->au;
68 void *p, *end;
69 int ret;
70
71 if (!ai->built_authorizer) {
72 p = au->buf;
73 end = p + sizeof(au->buf);
74 ceph_encode_8(&p, 1);
75 ret = ceph_entity_name_encode(ac->name, &p, end - 8);
76 if (ret < 0)
77 goto bad;
78 ceph_decode_need(&p, end, sizeof(u64), bad2);
79 ceph_encode_64(&p, ac->global_id);
80 au->buf_len = p - (void *)au->buf;
81 ai->built_authorizer = true;
82 dout("built authorizer len %d\n", au->buf_len);
83 }
84
85 *a = (struct ceph_authorizer *)au;
86 *buf = au->buf;
87 *len = au->buf_len;
88 *reply_buf = au->reply_buf;
89 *reply_len = sizeof(au->reply_buf);
90 return 0;
91
92bad2:
93 ret = -ERANGE;
94bad:
95 return ret;
96}
97
98static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
99 struct ceph_authorizer *a)
100{
101 /* nothing to do */
102}
103
104static const struct ceph_auth_client_ops ceph_auth_none_ops = {
105 .name = "none",
106 .reset = reset,
107 .destroy = destroy,
108 .is_authenticated = is_authenticated,
109 .should_authenticate = should_authenticate,
110 .handle_reply = handle_reply,
111 .create_authorizer = ceph_auth_none_create_authorizer,
112 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
113};
114
115int ceph_auth_none_init(struct ceph_auth_client *ac)
116{
117 struct ceph_auth_none_info *xi;
118
119 dout("ceph_auth_none_init %p\n", ac);
120 xi = kzalloc(sizeof(*xi), GFP_NOFS);
121 if (!xi)
122 return -ENOMEM;
123
124 xi->starting = true;
125 xi->built_authorizer = false;
126
127 ac->protocol = CEPH_AUTH_NONE;
128 ac->private = xi;
129 ac->ops = &ceph_auth_none_ops;
130 return 0;
131}
132
diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h
new file mode 100644
index 000000000000..ed7d088b1bc9
--- /dev/null
+++ b/net/ceph/auth_none.h
@@ -0,0 +1,29 @@
1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H
3
4#include <linux/slab.h>
5#include <linux/ceph/auth.h>
6
7/*
8 * null security mode.
9 *
10 * we use a single static authorizer that simply encodes our entity name
11 * and global id.
12 */
13
14struct ceph_none_authorizer {
15 char buf[128];
16 int buf_len;
17 char reply_buf[0];
18};
19
20struct ceph_auth_none_info {
21 bool starting;
22 bool built_authorizer;
23 struct ceph_none_authorizer au; /* we only need one; it's static */
24};
25
26extern int ceph_auth_none_init(struct ceph_auth_client *ac);
27
28#endif
29
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
new file mode 100644
index 000000000000..7fd5dfcf6e18
--- /dev/null
+++ b/net/ceph/auth_x.c
@@ -0,0 +1,688 @@
1
2#include <linux/ceph/ceph_debug.h>
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include <linux/ceph/decode.h>
10#include <linux/ceph/auth.h>
11
12#include "crypto.h"
13#include "auth_x.h"
14#include "auth_x_protocol.h"
15
16#define TEMP_TICKET_BUF_LEN 256
17
18static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
19
20static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
21{
22 struct ceph_x_info *xi = ac->private;
23 int need;
24
25 ceph_x_validate_tickets(ac, &need);
26 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
27 ac->want_keys, need, xi->have_keys);
28 return (ac->want_keys & xi->have_keys) == ac->want_keys;
29}
30
31static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
32{
33 struct ceph_x_info *xi = ac->private;
34 int need;
35
36 ceph_x_validate_tickets(ac, &need);
37 dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
38 ac->want_keys, need, xi->have_keys);
39 return need != 0;
40}
41
42static int ceph_x_encrypt_buflen(int ilen)
43{
44 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
45 sizeof(u32);
46}
47
48static int ceph_x_encrypt(struct ceph_crypto_key *secret,
49 void *ibuf, int ilen, void *obuf, size_t olen)
50{
51 struct ceph_x_encrypt_header head = {
52 .struct_v = 1,
53 .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
54 };
55 size_t len = olen - sizeof(u32);
56 int ret;
57
58 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
59 &head, sizeof(head), ibuf, ilen);
60 if (ret)
61 return ret;
62 ceph_encode_32(&obuf, len);
63 return len + sizeof(u32);
64}
65
66static int ceph_x_decrypt(struct ceph_crypto_key *secret,
67 void **p, void *end, void *obuf, size_t olen)
68{
69 struct ceph_x_encrypt_header head;
70 size_t head_len = sizeof(head);
71 int len, ret;
72
73 len = ceph_decode_32(p);
74 if (*p + len > end)
75 return -EINVAL;
76
77 dout("ceph_x_decrypt len %d\n", len);
78 ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
79 *p, len);
80 if (ret)
81 return ret;
82 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
83 return -EPERM;
84 *p += len;
85 return olen;
86}
87
88/*
89 * get existing (or insert new) ticket handler
90 */
91static struct ceph_x_ticket_handler *
92get_ticket_handler(struct ceph_auth_client *ac, int service)
93{
94 struct ceph_x_ticket_handler *th;
95 struct ceph_x_info *xi = ac->private;
96 struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
97
98 while (*p) {
99 parent = *p;
100 th = rb_entry(parent, struct ceph_x_ticket_handler, node);
101 if (service < th->service)
102 p = &(*p)->rb_left;
103 else if (service > th->service)
104 p = &(*p)->rb_right;
105 else
106 return th;
107 }
108
109 /* add it */
110 th = kzalloc(sizeof(*th), GFP_NOFS);
111 if (!th)
112 return ERR_PTR(-ENOMEM);
113 th->service = service;
114 rb_link_node(&th->node, parent, p);
115 rb_insert_color(&th->node, &xi->ticket_handlers);
116 return th;
117}
118
119static void remove_ticket_handler(struct ceph_auth_client *ac,
120 struct ceph_x_ticket_handler *th)
121{
122 struct ceph_x_info *xi = ac->private;
123
124 dout("remove_ticket_handler %p %d\n", th, th->service);
125 rb_erase(&th->node, &xi->ticket_handlers);
126 ceph_crypto_key_destroy(&th->session_key);
127 if (th->ticket_blob)
128 ceph_buffer_put(th->ticket_blob);
129 kfree(th);
130}
131
132static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
133 struct ceph_crypto_key *secret,
134 void *buf, void *end)
135{
136 struct ceph_x_info *xi = ac->private;
137 int num;
138 void *p = buf;
139 int ret;
140 char *dbuf;
141 char *ticket_buf;
142 u8 reply_struct_v;
143
144 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
145 if (!dbuf)
146 return -ENOMEM;
147
148 ret = -ENOMEM;
149 ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
150 if (!ticket_buf)
151 goto out_dbuf;
152
153 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
154 reply_struct_v = ceph_decode_8(&p);
155 if (reply_struct_v != 1)
156 goto bad;
157 num = ceph_decode_32(&p);
158 dout("%d tickets\n", num);
159 while (num--) {
160 int type;
161 u8 tkt_struct_v, blob_struct_v;
162 struct ceph_x_ticket_handler *th;
163 void *dp, *dend;
164 int dlen;
165 char is_enc;
166 struct timespec validity;
167 struct ceph_crypto_key old_key;
168 void *tp, *tpend;
169 struct ceph_timespec new_validity;
170 struct ceph_crypto_key new_session_key;
171 struct ceph_buffer *new_ticket_blob;
172 unsigned long new_expires, new_renew_after;
173 u64 new_secret_id;
174
175 ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
176
177 type = ceph_decode_32(&p);
178 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
179
180 tkt_struct_v = ceph_decode_8(&p);
181 if (tkt_struct_v != 1)
182 goto bad;
183
184 th = get_ticket_handler(ac, type);
185 if (IS_ERR(th)) {
186 ret = PTR_ERR(th);
187 goto out;
188 }
189
190 /* blob for me */
191 dlen = ceph_x_decrypt(secret, &p, end, dbuf,
192 TEMP_TICKET_BUF_LEN);
193 if (dlen <= 0) {
194 ret = dlen;
195 goto out;
196 }
197 dout(" decrypted %d bytes\n", dlen);
198 dend = dbuf + dlen;
199 dp = dbuf;
200
201 tkt_struct_v = ceph_decode_8(&dp);
202 if (tkt_struct_v != 1)
203 goto bad;
204
205 memcpy(&old_key, &th->session_key, sizeof(old_key));
206 ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
207 if (ret)
208 goto out;
209
210 ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
211 ceph_decode_timespec(&validity, &new_validity);
212 new_expires = get_seconds() + validity.tv_sec;
213 new_renew_after = new_expires - (validity.tv_sec / 4);
214 dout(" expires=%lu renew_after=%lu\n", new_expires,
215 new_renew_after);
216
217 /* ticket blob for service */
218 ceph_decode_8_safe(&p, end, is_enc, bad);
219 tp = ticket_buf;
220 if (is_enc) {
221 /* encrypted */
222 dout(" encrypted ticket\n");
223 dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
224 TEMP_TICKET_BUF_LEN);
225 if (dlen < 0) {
226 ret = dlen;
227 goto out;
228 }
229 dlen = ceph_decode_32(&tp);
230 } else {
231 /* unencrypted */
232 ceph_decode_32_safe(&p, end, dlen, bad);
233 ceph_decode_need(&p, end, dlen, bad);
234 ceph_decode_copy(&p, ticket_buf, dlen);
235 }
236 tpend = tp + dlen;
237 dout(" ticket blob is %d bytes\n", dlen);
238 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
239 blob_struct_v = ceph_decode_8(&tp);
240 new_secret_id = ceph_decode_64(&tp);
241 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
242 if (ret)
243 goto out;
244
245 /* all is well, update our ticket */
246 ceph_crypto_key_destroy(&th->session_key);
247 if (th->ticket_blob)
248 ceph_buffer_put(th->ticket_blob);
249 th->session_key = new_session_key;
250 th->ticket_blob = new_ticket_blob;
251 th->validity = new_validity;
252 th->secret_id = new_secret_id;
253 th->expires = new_expires;
254 th->renew_after = new_renew_after;
255 dout(" got ticket service %d (%s) secret_id %lld len %d\n",
256 type, ceph_entity_type_name(type), th->secret_id,
257 (int)th->ticket_blob->vec.iov_len);
258 xi->have_keys |= th->service;
259 }
260
261 ret = 0;
262out:
263 kfree(ticket_buf);
264out_dbuf:
265 kfree(dbuf);
266 return ret;
267
268bad:
269 ret = -EINVAL;
270 goto out;
271}
272
273static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
274 struct ceph_x_ticket_handler *th,
275 struct ceph_x_authorizer *au)
276{
277 int maxlen;
278 struct ceph_x_authorize_a *msg_a;
279 struct ceph_x_authorize_b msg_b;
280 void *p, *end;
281 int ret;
282 int ticket_blob_len =
283 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
284
285 dout("build_authorizer for %s %p\n",
286 ceph_entity_type_name(th->service), au);
287
288 maxlen = sizeof(*msg_a) + sizeof(msg_b) +
289 ceph_x_encrypt_buflen(ticket_blob_len);
290 dout(" need len %d\n", maxlen);
291 if (au->buf && au->buf->alloc_len < maxlen) {
292 ceph_buffer_put(au->buf);
293 au->buf = NULL;
294 }
295 if (!au->buf) {
296 au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
297 if (!au->buf)
298 return -ENOMEM;
299 }
300 au->service = th->service;
301
302 msg_a = au->buf->vec.iov_base;
303 msg_a->struct_v = 1;
304 msg_a->global_id = cpu_to_le64(ac->global_id);
305 msg_a->service_id = cpu_to_le32(th->service);
306 msg_a->ticket_blob.struct_v = 1;
307 msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
308 msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
309 if (ticket_blob_len) {
310 memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
311 th->ticket_blob->vec.iov_len);
312 }
313 dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
314 le64_to_cpu(msg_a->ticket_blob.secret_id));
315
316 p = msg_a + 1;
317 p += ticket_blob_len;
318 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
319
320 get_random_bytes(&au->nonce, sizeof(au->nonce));
321 msg_b.struct_v = 1;
322 msg_b.nonce = cpu_to_le64(au->nonce);
323 ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
324 p, end - p);
325 if (ret < 0)
326 goto out_buf;
327 p += ret;
328 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
329 dout(" built authorizer nonce %llx len %d\n", au->nonce,
330 (int)au->buf->vec.iov_len);
331 BUG_ON(au->buf->vec.iov_len > maxlen);
332 return 0;
333
334out_buf:
335 ceph_buffer_put(au->buf);
336 au->buf = NULL;
337 return ret;
338}
339
340static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
341 void **p, void *end)
342{
343 ceph_decode_need(p, end, 1 + sizeof(u64), bad);
344 ceph_encode_8(p, 1);
345 ceph_encode_64(p, th->secret_id);
346 if (th->ticket_blob) {
347 const char *buf = th->ticket_blob->vec.iov_base;
348 u32 len = th->ticket_blob->vec.iov_len;
349
350 ceph_encode_32_safe(p, end, len, bad);
351 ceph_encode_copy_safe(p, end, buf, len, bad);
352 } else {
353 ceph_encode_32_safe(p, end, 0, bad);
354 }
355
356 return 0;
357bad:
358 return -ERANGE;
359}
360
361static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
362{
363 int want = ac->want_keys;
364 struct ceph_x_info *xi = ac->private;
365 int service;
366
367 *pneed = ac->want_keys & ~(xi->have_keys);
368
369 for (service = 1; service <= want; service <<= 1) {
370 struct ceph_x_ticket_handler *th;
371
372 if (!(ac->want_keys & service))
373 continue;
374
375 if (*pneed & service)
376 continue;
377
378 th = get_ticket_handler(ac, service);
379
380 if (IS_ERR(th)) {
381 *pneed |= service;
382 continue;
383 }
384
385 if (get_seconds() >= th->renew_after)
386 *pneed |= service;
387 if (get_seconds() >= th->expires)
388 xi->have_keys &= ~service;
389 }
390}
391
392
393static int ceph_x_build_request(struct ceph_auth_client *ac,
394 void *buf, void *end)
395{
396 struct ceph_x_info *xi = ac->private;
397 int need;
398 struct ceph_x_request_header *head = buf;
399 int ret;
400 struct ceph_x_ticket_handler *th =
401 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
402
403 if (IS_ERR(th))
404 return PTR_ERR(th);
405
406 ceph_x_validate_tickets(ac, &need);
407
408 dout("build_request want %x have %x need %x\n",
409 ac->want_keys, xi->have_keys, need);
410
411 if (need & CEPH_ENTITY_TYPE_AUTH) {
412 struct ceph_x_authenticate *auth = (void *)(head + 1);
413 void *p = auth + 1;
414 struct ceph_x_challenge_blob tmp;
415 char tmp_enc[40];
416 u64 *u;
417
418 if (p > end)
419 return -ERANGE;
420
421 dout(" get_auth_session_key\n");
422 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
423
424 /* encrypt and hash */
425 get_random_bytes(&auth->client_challenge, sizeof(u64));
426 tmp.client_challenge = auth->client_challenge;
427 tmp.server_challenge = cpu_to_le64(xi->server_challenge);
428 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
429 tmp_enc, sizeof(tmp_enc));
430 if (ret < 0)
431 return ret;
432
433 auth->struct_v = 1;
434 auth->key = 0;
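		/* fold the encrypted challenge into a single 64-bit key by
		 * XORing its 8-byte words together */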
435 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
436 auth->key ^= *(__le64 *)u;
437 dout(" server_challenge %llx client_challenge %llx key %llx\n",
438 xi->server_challenge, le64_to_cpu(auth->client_challenge),
439 le64_to_cpu(auth->key));
440
441 /* now encode the old ticket, if it exists */
442 ret = ceph_x_encode_ticket(th, &p, end);
443 if (ret < 0)
444 return ret;
445
446 return p - buf;
447 }
448
449 if (need) {
450 void *p = head + 1;
451 struct ceph_x_service_ticket_request *req;
452
453 if (p > end)
454 return -ERANGE;
455 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
456
457 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
458 if (ret)
459 return ret;
460 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
461 xi->auth_authorizer.buf->vec.iov_len);
462
463 req = p;
464 req->keys = cpu_to_le32(need);
465 p += sizeof(*req);
466 return p - buf;
467 }
468
469 return 0;
470}
471
472static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
473 void *buf, void *end)
474{
475 struct ceph_x_info *xi = ac->private;
476 struct ceph_x_reply_header *head = buf;
477 struct ceph_x_ticket_handler *th;
478 int len = end - buf;
479 int op;
480 int ret;
481
482 if (result)
483 return result; /* XXX hmm? */
484
485 if (xi->starting) {
486 /* it's a hello */
487 struct ceph_x_server_challenge *sc = buf;
488
489 if (len != sizeof(*sc))
490 return -EINVAL;
491 xi->server_challenge = le64_to_cpu(sc->server_challenge);
492 dout("handle_reply got server challenge %llx\n",
493 xi->server_challenge);
494 xi->starting = false;
495 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
496 return -EAGAIN;
497 }
498
499 op = le16_to_cpu(head->op);
500 result = le32_to_cpu(head->result);
501 dout("handle_reply op %d result %d\n", op, result);
502 switch (op) {
503 case CEPHX_GET_AUTH_SESSION_KEY:
504 /* verify auth key */
505 ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
506 buf + sizeof(*head), end);
507 break;
508
509 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
510 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
511 if (IS_ERR(th))
512 return PTR_ERR(th);
513 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
514 buf + sizeof(*head), end);
515 break;
516
517 default:
518 return -EINVAL;
519 }
520 if (ret)
521 return ret;
522 if (ac->want_keys == xi->have_keys)
523 return 0;
524 return -EAGAIN;
525}
526
527static int ceph_x_create_authorizer(
528 struct ceph_auth_client *ac, int peer_type,
529 struct ceph_authorizer **a,
530 void **buf, size_t *len,
531 void **reply_buf, size_t *reply_len)
532{
533 struct ceph_x_authorizer *au;
534 struct ceph_x_ticket_handler *th;
535 int ret;
536
537 th = get_ticket_handler(ac, peer_type);
538 if (IS_ERR(th))
539 return PTR_ERR(th);
540
541 au = kzalloc(sizeof(*au), GFP_NOFS);
542 if (!au)
543 return -ENOMEM;
544
545 ret = ceph_x_build_authorizer(ac, th, au);
546 if (ret) {
547 kfree(au);
548 return ret;
549 }
550
551 *a = (struct ceph_authorizer *)au;
552 *buf = au->buf->vec.iov_base;
553 *len = au->buf->vec.iov_len;
554 *reply_buf = au->reply_buf;
555 *reply_len = sizeof(au->reply_buf);
556 return 0;
557}
558
559static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
560 struct ceph_authorizer *a, size_t len)
561{
562 struct ceph_x_authorizer *au = (void *)a;
563 struct ceph_x_ticket_handler *th;
564 int ret = 0;
565 struct ceph_x_authorize_reply reply;
566 void *p = au->reply_buf;
567 void *end = p + sizeof(au->reply_buf);
568
569 th = get_ticket_handler(ac, au->service);
570 if (IS_ERR(th))
571 return PTR_ERR(th);
572 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
573 if (ret < 0)
574 return ret;
575 if (ret != sizeof(reply))
576 return -EPERM;
577
578 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
579 ret = -EPERM;
580 else
581 ret = 0;
582 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
583 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
584 return ret;
585}
586
587static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
588 struct ceph_authorizer *a)
589{
590 struct ceph_x_authorizer *au = (void *)a;
591
592 ceph_buffer_put(au->buf);
593 kfree(au);
594}
595
596
597static void ceph_x_reset(struct ceph_auth_client *ac)
598{
599 struct ceph_x_info *xi = ac->private;
600
601 dout("reset\n");
602 xi->starting = true;
603 xi->server_challenge = 0;
604}
605
606static void ceph_x_destroy(struct ceph_auth_client *ac)
607{
608 struct ceph_x_info *xi = ac->private;
609 struct rb_node *p;
610
611 dout("ceph_x_destroy %p\n", ac);
612 ceph_crypto_key_destroy(&xi->secret);
613
614 while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
615 struct ceph_x_ticket_handler *th =
616 rb_entry(p, struct ceph_x_ticket_handler, node);
617 remove_ticket_handler(ac, th);
618 }
619
620 if (xi->auth_authorizer.buf)
621 ceph_buffer_put(xi->auth_authorizer.buf);
622
623 kfree(ac->private);
624 ac->private = NULL;
625}
626
627static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
628 int peer_type)
629{
630 struct ceph_x_ticket_handler *th;
631
632 th = get_ticket_handler(ac, peer_type);
633 if (!IS_ERR(th))
634 remove_ticket_handler(ac, th);
635}
636
637
638static const struct ceph_auth_client_ops ceph_x_ops = {
639 .name = "x",
640 .is_authenticated = ceph_x_is_authenticated,
641 .should_authenticate = ceph_x_should_authenticate,
642 .build_request = ceph_x_build_request,
643 .handle_reply = ceph_x_handle_reply,
644 .create_authorizer = ceph_x_create_authorizer,
645 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
646 .destroy_authorizer = ceph_x_destroy_authorizer,
647 .invalidate_authorizer = ceph_x_invalidate_authorizer,
648 .reset = ceph_x_reset,
649 .destroy = ceph_x_destroy,
650};
651
652
653int ceph_x_init(struct ceph_auth_client *ac)
654{
655 struct ceph_x_info *xi;
656 int ret;
657
658 dout("ceph_x_init %p\n", ac);
659 ret = -ENOMEM;
660 xi = kzalloc(sizeof(*xi), GFP_NOFS);
661 if (!xi)
662 goto out;
663
664 ret = -EINVAL;
665 if (!ac->secret) {
666 pr_err("no secret set (for auth_x protocol)\n");
667 goto out_nomem;
668 }
669
670 ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
671 if (ret)
672 goto out_nomem;
673
674 xi->starting = true;
675 xi->ticket_handlers = RB_ROOT;
676
677 ac->protocol = CEPH_AUTH_CEPHX;
678 ac->private = xi;
679 ac->ops = &ceph_x_ops;
680 return 0;
681
682out_nomem:
683 kfree(xi);
684out:
685 return ret;
686}
687
688
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
new file mode 100644
index 000000000000..e02da7a5c5a1
--- /dev/null
+++ b/net/ceph/auth_x.h
@@ -0,0 +1,50 @@
1#ifndef _FS_CEPH_AUTH_X_H
2#define _FS_CEPH_AUTH_X_H
3
4#include <linux/rbtree.h>
5
6#include <linux/ceph/auth.h>
7
8#include "crypto.h"
9#include "auth_x_protocol.h"
10
11/*
12 * Handle ticket for a single service.
13 */
14struct ceph_x_ticket_handler {
15 struct rb_node node;
16 unsigned service;
17
18 struct ceph_crypto_key session_key;
19 struct ceph_timespec validity;
20
21 u64 secret_id;
22 struct ceph_buffer *ticket_blob;
23
24 unsigned long renew_after, expires;
25};
26
27
28struct ceph_x_authorizer {
29 struct ceph_buffer *buf;
30 unsigned service;
31 u64 nonce;
32 char reply_buf[128]; /* big enough for encrypted blob */
33};
34
35struct ceph_x_info {
36 struct ceph_crypto_key secret;
37
38 bool starting;
39 u64 server_challenge;
40
41 unsigned have_keys;
42 struct rb_root ticket_handlers;
43
44 struct ceph_x_authorizer auth_authorizer;
45};
46
47extern int ceph_x_init(struct ceph_auth_client *ac);
48
49#endif
50
diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h
new file mode 100644
index 000000000000..671d30576c4f
--- /dev/null
+++ b/net/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
1#ifndef __FS_CEPH_AUTH_X_PROTOCOL
2#define __FS_CEPH_AUTH_X_PROTOCOL
3
4#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
5#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
6#define CEPHX_GET_ROTATING_KEY 0x0400
7
8/* common bits */
9struct ceph_x_ticket_blob {
10 __u8 struct_v;
11 __le64 secret_id;
12 __le32 blob_len;
13 char blob[];
14} __attribute__ ((packed));
15
16
17/* common request/reply headers */
18struct ceph_x_request_header {
19 __le16 op;
20} __attribute__ ((packed));
21
22struct ceph_x_reply_header {
23 __le16 op;
24 __le32 result;
25} __attribute__ ((packed));
26
27
28/* authenticate handshake */
29
30/* initial hello (no reply header) */
31struct ceph_x_server_challenge {
32 __u8 struct_v;
33 __le64 server_challenge;
34} __attribute__ ((packed));
35
36struct ceph_x_authenticate {
37 __u8 struct_v;
38 __le64 client_challenge;
39 __le64 key;
40 /* ticket blob */
41} __attribute__ ((packed));
42
43struct ceph_x_service_ticket_request {
44 __u8 struct_v;
45 __le32 keys;
46} __attribute__ ((packed));
47
48struct ceph_x_challenge_blob {
49 __le64 server_challenge;
50 __le64 client_challenge;
51} __attribute__ ((packed));
52
53
54
55/* authorize handshake */
56
57/*
58 * The authorizer consists of two pieces:
59 * a - service id, ticket blob
60 * b - encrypted with session key
61 */
62struct ceph_x_authorize_a {
63 __u8 struct_v;
64 __le64 global_id;
65 __le32 service_id;
66 struct ceph_x_ticket_blob ticket_blob;
67} __attribute__ ((packed));
68
69struct ceph_x_authorize_b {
70 __u8 struct_v;
71 __le64 nonce;
72} __attribute__ ((packed));
73
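/*
 * Rough wire layout of the authorizer as assembled by
 * ceph_x_build_authorizer() (illustrative sketch, not an extra on-wire
 * struct):
 *
 *   [ ceph_x_authorize_a | ticket blob bytes | encrypted ceph_x_authorize_b ]
 *
 * The peer decrypts part b with the shared session key and answers with
 * nonce+1 (ceph_x_authorize_reply below).
 */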
74struct ceph_x_authorize_reply {
75 __u8 struct_v;
76 __le64 nonce_plus_one;
77} __attribute__ ((packed));
78
79
80/*
81 * encryption bundle
82 */
83#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
84
85struct ceph_x_encrypt_header {
86 __u8 struct_v;
87 __le64 magic;
88} __attribute__ ((packed));
89
90#endif
diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c
new file mode 100644
index 000000000000..53d8abfa25d5
--- /dev/null
+++ b/net/ceph/buffer.c
@@ -0,0 +1,68 @@
1
2#include <linux/ceph/ceph_debug.h>
3
4#include <linux/module.h>
5#include <linux/slab.h>
6
7#include <linux/ceph/buffer.h>
8#include <linux/ceph/decode.h>
9
10struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
11{
12 struct ceph_buffer *b;
13
14 b = kmalloc(sizeof(*b), gfp);
15 if (!b)
16 return NULL;
17
18 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
19 if (b->vec.iov_base) {
20 b->is_vmalloc = false;
21 } else {
22 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
23 if (!b->vec.iov_base) {
24 kfree(b);
25 return NULL;
26 }
27 b->is_vmalloc = true;
28 }
29
30 kref_init(&b->kref);
31 b->alloc_len = len;
32 b->vec.iov_len = len;
33 dout("buffer_new %p\n", b);
34 return b;
35}
36EXPORT_SYMBOL(ceph_buffer_new);
37
38void ceph_buffer_release(struct kref *kref)
39{
40 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
41
42 dout("buffer_release %p\n", b);
43 if (b->vec.iov_base) {
44 if (b->is_vmalloc)
45 vfree(b->vec.iov_base);
46 else
47 kfree(b->vec.iov_base);
48 }
49 kfree(b);
50}
51EXPORT_SYMBOL(ceph_buffer_release);
52
53int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
54{
55 size_t len;
56
57 ceph_decode_need(p, end, sizeof(u32), bad);
58 len = ceph_decode_32(p);
59 dout("decode_buffer len %d\n", (int)len);
60 ceph_decode_need(p, end, len, bad);
61 *b = ceph_buffer_new(len, GFP_NOFS);
62 if (!*b)
63 return -ENOMEM;
64 ceph_decode_copy(p, (*b)->vec.iov_base, len);
65 return 0;
66bad:
67 return -EINVAL;
68}
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
new file mode 100644
index 000000000000..f3e4a13fea0c
--- /dev/null
+++ b/net/ceph/ceph_common.c
@@ -0,0 +1,529 @@
1
2#include <linux/ceph/ceph_debug.h>
3#include <linux/backing-dev.h>
4#include <linux/ctype.h>
5#include <linux/fs.h>
6#include <linux/inet.h>
7#include <linux/in6.h>
8#include <linux/module.h>
9#include <linux/mount.h>
10#include <linux/parser.h>
11#include <linux/sched.h>
12#include <linux/seq_file.h>
13#include <linux/slab.h>
14#include <linux/statfs.h>
15#include <linux/string.h>
16
17
18#include <linux/ceph/libceph.h>
19#include <linux/ceph/debugfs.h>
20#include <linux/ceph/decode.h>
21#include <linux/ceph/mon_client.h>
22#include <linux/ceph/auth.h>
23
24
25
26/*
27 * find filename portion of a path (/foo/bar/baz -> baz)
28 */
29const char *ceph_file_part(const char *s, int len)
30{
31 const char *e = s + len;
32
33 while (e != s && *(e-1) != '/')
34 e--;
35 return e;
36}
37EXPORT_SYMBOL(ceph_file_part);
38
39const char *ceph_msg_type_name(int type)
40{
41 switch (type) {
42 case CEPH_MSG_SHUTDOWN: return "shutdown";
43 case CEPH_MSG_PING: return "ping";
44 case CEPH_MSG_AUTH: return "auth";
45 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
46 case CEPH_MSG_MON_MAP: return "mon_map";
47 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
48 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
49 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
50 case CEPH_MSG_STATFS: return "statfs";
51 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
52 case CEPH_MSG_MDS_MAP: return "mds_map";
53 case CEPH_MSG_CLIENT_SESSION: return "client_session";
54 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
55 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
56 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
57 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
58 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
59 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
60 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
61 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
62 case CEPH_MSG_OSD_MAP: return "osd_map";
63 case CEPH_MSG_OSD_OP: return "osd_op";
64 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
65 default: return "unknown";
66 }
67}
68EXPORT_SYMBOL(ceph_msg_type_name);
69
70/*
71 * Initially learn our fsid, or verify an fsid matches.
72 */
73int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
74{
75 if (client->have_fsid) {
76 if (ceph_fsid_compare(&client->fsid, fsid)) {
77 pr_err("bad fsid, had %pU got %pU\n",
78 &client->fsid, fsid);
79 return -1;
80 }
81 } else {
82 pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid);
83 memcpy(&client->fsid, fsid, sizeof(*fsid));
84 ceph_debugfs_client_init(client);
85 client->have_fsid = true;
86 }
87 return 0;
88}
89EXPORT_SYMBOL(ceph_check_fsid);
90
91static int strcmp_null(const char *s1, const char *s2)
92{
93 if (!s1 && !s2)
94 return 0;
95 if (s1 && !s2)
96 return -1;
97 if (!s1 && s2)
98 return 1;
99 return strcmp(s1, s2);
100}
101
102int ceph_compare_options(struct ceph_options *new_opt,
103 struct ceph_client *client)
104{
105 struct ceph_options *opt1 = new_opt;
106 struct ceph_options *opt2 = client->options;
107 int ofs = offsetof(struct ceph_options, mon_addr);
108 int i;
109 int ret;
110
111 ret = memcmp(opt1, opt2, ofs);
112 if (ret)
113 return ret;
114
115 ret = strcmp_null(opt1->name, opt2->name);
116 if (ret)
117 return ret;
118
119 ret = strcmp_null(opt1->secret, opt2->secret);
120 if (ret)
121 return ret;
122
123 /* any matching mon ip implies a match */
124 for (i = 0; i < opt1->num_mon; i++) {
125 if (ceph_monmap_contains(client->monc.monmap,
126 &opt1->mon_addr[i]))
127 return 0;
128 }
129 return -1;
130}
131EXPORT_SYMBOL(ceph_compare_options);
132
133
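/*
 * Parses the printable fsid/uuid form, e.g.
 * "f81d4fae-7dec-11d0-a765-00a0c91e6bf6" (example value only):
 * punctuation is skipped and exactly 16 hex byte pairs must be found.
 */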
134static int parse_fsid(const char *str, struct ceph_fsid *fsid)
135{
136 int i = 0;
137 char tmp[3];
138 int err = -EINVAL;
139 int d;
140
141 dout("parse_fsid '%s'\n", str);
142 tmp[2] = 0;
143 while (*str && i < 16) {
144 if (ispunct(*str)) {
145 str++;
146 continue;
147 }
148 if (!isxdigit(str[0]) || !isxdigit(str[1]))
149 break;
150 tmp[0] = str[0];
151 tmp[1] = str[1];
152 if (sscanf(tmp, "%x", &d) < 1)
153 break;
154 fsid->fsid[i] = d & 0xff;
155 i++;
156 str += 2;
157 }
158
159 if (i == 16)
160 err = 0;
161 dout("parse_fsid ret %d got fsid %pU\n", err, fsid);
162 return err;
163}
164
165/*
166 * ceph options
167 */
168enum {
169 Opt_osdtimeout,
170 Opt_osdkeepalivetimeout,
171 Opt_mount_timeout,
172 Opt_osd_idle_ttl,
173 Opt_last_int,
174 /* int args above */
175 Opt_fsid,
176 Opt_name,
177 Opt_secret,
178 Opt_ip,
179 Opt_last_string,
180 /* string args above */
181 Opt_noshare,
182 Opt_nocrc,
183};
184
185static match_table_t opt_tokens = {
186 {Opt_osdtimeout, "osdtimeout=%d"},
187 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
188 {Opt_mount_timeout, "mount_timeout=%d"},
189 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
190 /* int args above */
191 {Opt_fsid, "fsid=%s"},
192 {Opt_name, "name=%s"},
193 {Opt_secret, "secret=%s"},
194 {Opt_ip, "ip=%s"},
195 /* string args above */
196 {Opt_noshare, "noshare"},
197 {Opt_nocrc, "nocrc"},
198 {-1, NULL}
199};
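
/*
 * Illustrative only: a client might be handed a monitor list in dev_name,
 * e.g. "1.2.3.4:6789,1.2.3.5:6789", and an option string such as
 * "name=admin,secret=<base64 key>,osdtimeout=60,noshare", which
 * ceph_parse_options() below splits on ',' and matches against opt_tokens.
 */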
200
201void ceph_destroy_options(struct ceph_options *opt)
202{
203 dout("destroy_options %p\n", opt);
204 kfree(opt->name);
205 kfree(opt->secret);
206 kfree(opt);
207}
208EXPORT_SYMBOL(ceph_destroy_options);
209
210int ceph_parse_options(struct ceph_options **popt, char *options,
211 const char *dev_name, const char *dev_name_end,
212 int (*parse_extra_token)(char *c, void *private),
213 void *private)
214{
215 struct ceph_options *opt;
216 const char *c;
217 int err = -ENOMEM;
218 substring_t argstr[MAX_OPT_ARGS];
219
220 opt = kzalloc(sizeof(*opt), GFP_KERNEL);
221 if (!opt)
222 return err;
223 opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
224 GFP_KERNEL);
225 if (!opt->mon_addr)
226 goto out;
227
228 dout("parse_options %p options '%s' dev_name '%s'\n", opt, options,
229 dev_name);
230
231 /* start with defaults */
232 opt->flags = CEPH_OPT_DEFAULT;
233 opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
234 opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
235 opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
236 opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
237
238 /* get mon ip(s) */
239 /* ip1[:port1][,ip2[:port2]...] */
240 err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr,
241 CEPH_MAX_MON, &opt->num_mon);
242 if (err < 0)
243 goto out;
244
245 /* parse mount options */
246 while ((c = strsep(&options, ",")) != NULL) {
247 int token, intval, ret;
248 if (!*c)
249 continue;
250 err = -EINVAL;
251 token = match_token((char *)c, opt_tokens, argstr);
252 if (token < 0 && parse_extra_token) {
253 /* extra? */
254 err = parse_extra_token((char *)c, private);
255 if (err < 0) {
256 pr_err("bad option at '%s'\n", c);
257 goto out;
258 }
259 continue;
260 }
261 if (token < Opt_last_int) {
262 ret = match_int(&argstr[0], &intval);
263 if (ret < 0) {
264 pr_err("bad mount option arg (not int) "
265 "at '%s'\n", c);
266 continue;
267 }
268 dout("got int token %d val %d\n", token, intval);
269 } else if (token > Opt_last_int && token < Opt_last_string) {
270 dout("got string token %d val %s\n", token,
271 argstr[0].from);
272 } else {
273 dout("got token %d\n", token);
274 }
275 switch (token) {
276 case Opt_ip:
277 err = ceph_parse_ips(argstr[0].from,
278 argstr[0].to,
279 &opt->my_addr,
280 1, NULL);
281 if (err < 0)
282 goto out;
283 opt->flags |= CEPH_OPT_MYIP;
284 break;
285
286 case Opt_fsid:
287 err = parse_fsid(argstr[0].from, &opt->fsid);
288 if (err == 0)
289 opt->flags |= CEPH_OPT_FSID;
290 break;
291 case Opt_name:
292 opt->name = kstrndup(argstr[0].from,
293 argstr[0].to-argstr[0].from,
294 GFP_KERNEL);
295 break;
296 case Opt_secret:
297 opt->secret = kstrndup(argstr[0].from,
298 argstr[0].to-argstr[0].from,
299 GFP_KERNEL);
300 break;
301
302 /* misc */
303 case Opt_osdtimeout:
304 opt->osd_timeout = intval;
305 break;
306 case Opt_osdkeepalivetimeout:
307 opt->osd_keepalive_timeout = intval;
308 break;
309 case Opt_osd_idle_ttl:
310 opt->osd_idle_ttl = intval;
311 break;
312 case Opt_mount_timeout:
313 opt->mount_timeout = intval;
314 break;
315
316 case Opt_noshare:
317 opt->flags |= CEPH_OPT_NOSHARE;
318 break;
319
320 case Opt_nocrc:
321 opt->flags |= CEPH_OPT_NOCRC;
322 break;
323
324 default:
325 BUG_ON(token);
326 }
327 }
328
329 /* success */
330 *popt = opt;
331 return 0;
332
333out:
334 ceph_destroy_options(opt);
335 return err;
336}
337EXPORT_SYMBOL(ceph_parse_options);
338
339u64 ceph_client_id(struct ceph_client *client)
340{
341 return client->monc.auth->global_id;
342}
343EXPORT_SYMBOL(ceph_client_id);
344
345/*
346 * create a fresh client instance
347 */
348struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
349{
350 struct ceph_client *client;
351 int err = -ENOMEM;
352
353 client = kzalloc(sizeof(*client), GFP_KERNEL);
354 if (client == NULL)
355 return ERR_PTR(-ENOMEM);
356
357 client->private = private;
358 client->options = opt;
359
360 mutex_init(&client->mount_mutex);
361 init_waitqueue_head(&client->auth_wq);
362 client->auth_err = 0;
363
364 client->extra_mon_dispatch = NULL;
365 client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT;
366 client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT;
367
368 client->msgr = NULL;
369
370 /* subsystems */
371 err = ceph_monc_init(&client->monc, client);
372 if (err < 0)
373 goto fail;
374 err = ceph_osdc_init(&client->osdc, client);
375 if (err < 0)
376 goto fail_monc;
377
378 return client;
379
380fail_monc:
381 ceph_monc_stop(&client->monc);
382fail:
383 kfree(client);
384 return ERR_PTR(err);
385}
386EXPORT_SYMBOL(ceph_create_client);
387
388void ceph_destroy_client(struct ceph_client *client)
389{
390 dout("destroy_client %p\n", client);
391
392 /* unmount */
393 ceph_osdc_stop(&client->osdc);
394
395 /*
396 * make sure mds and osd connections close out before destroying
397 * the auth module, which is needed to free those connections'
398 * ceph_authorizers.
399 */
400 ceph_msgr_flush();
401
402 ceph_monc_stop(&client->monc);
403
404 ceph_debugfs_client_cleanup(client);
405
406 if (client->msgr)
407 ceph_messenger_destroy(client->msgr);
408
409 ceph_destroy_options(client->options);
410
411 kfree(client);
412 dout("destroy_client %p done\n", client);
413}
414EXPORT_SYMBOL(ceph_destroy_client);
415
416/*
417 * true if we have the mon and osd maps (and have thus joined the cluster)
418 */
419static int have_mon_and_osd_map(struct ceph_client *client)
420{
421 return client->monc.monmap && client->monc.monmap->epoch &&
422 client->osdc.osdmap && client->osdc.osdmap->epoch;
423}
424
425/*
426 * open session: join the ceph cluster and wait for the mon and osd maps.
427 */
428int __ceph_open_session(struct ceph_client *client, unsigned long started)
429{
430 struct ceph_entity_addr *myaddr = NULL;
431 int err;
432 unsigned long timeout = client->options->mount_timeout * HZ;
433
434 /* initialize the messenger */
435 if (client->msgr == NULL) {
436 if (ceph_test_opt(client, MYIP))
437 myaddr = &client->options->my_addr;
438 client->msgr = ceph_messenger_create(myaddr,
439 client->supported_features,
440 client->required_features);
441 if (IS_ERR(client->msgr)) {
442 client->msgr = NULL;
443 return PTR_ERR(client->msgr);
444 }
445 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
446 }
447
448 /* open session, and wait for mon and osd maps */
449 err = ceph_monc_open_session(&client->monc);
450 if (err < 0)
451 return err;
452
453 while (!have_mon_and_osd_map(client)) {
454 err = -EIO;
455 if (timeout && time_after_eq(jiffies, started + timeout))
456 return err;
457
458 /* wait */
459 dout("mount waiting for mon_map\n");
460 err = wait_event_interruptible_timeout(client->auth_wq,
461 have_mon_and_osd_map(client) || (client->auth_err < 0),
462 timeout);
463 if (err == -EINTR || err == -ERESTARTSYS)
464 return err;
465 if (client->auth_err < 0)
466 return client->auth_err;
467 }
468
469 return 0;
470}
471EXPORT_SYMBOL(__ceph_open_session);
472
473
474int ceph_open_session(struct ceph_client *client)
475{
476 int ret;
477 unsigned long started = jiffies; /* note the start time */
478
479 dout("open_session start\n");
480 mutex_lock(&client->mount_mutex);
481
482 ret = __ceph_open_session(client, started);
483
484 mutex_unlock(&client->mount_mutex);
485 return ret;
486}
487EXPORT_SYMBOL(ceph_open_session);
488
489
490static int __init init_ceph_lib(void)
491{
492 int ret = 0;
493
494 ret = ceph_debugfs_init();
495 if (ret < 0)
496 goto out;
497
498 ret = ceph_msgr_init();
499 if (ret < 0)
500 goto out_debugfs;
501
502 pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n",
503 CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL,
504 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
505 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
506
507 return 0;
508
509out_debugfs:
510 ceph_debugfs_cleanup();
511out:
512 return ret;
513}
514
515static void __exit exit_ceph_lib(void)
516{
517 dout("exit_ceph_lib\n");
518 ceph_msgr_exit();
519 ceph_debugfs_cleanup();
520}
521
522module_init(init_ceph_lib);
523module_exit(exit_ceph_lib);
524
525MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
526MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
527MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
528MODULE_DESCRIPTION("Ceph filesystem for Linux");
529MODULE_LICENSE("GPL");
diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c
new file mode 100644
index 000000000000..a3a3a31d3c37
--- /dev/null
+++ b/net/ceph/ceph_fs.c
@@ -0,0 +1,75 @@
1/*
2 * Some non-inline ceph helpers
3 */
4#include <linux/module.h>
5#include <linux/ceph/types.h>
6
7/*
8 * return true if @layout appears to be valid
9 */
10int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
11{
12 __u32 su = le32_to_cpu(layout->fl_stripe_unit);
13 __u32 sc = le32_to_cpu(layout->fl_stripe_count);
14 __u32 os = le32_to_cpu(layout->fl_object_size);
15
16 /* stripe unit, object size must be non-zero, 64k increment */
17 if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
18 return 0;
19 if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
20 return 0;
21 /* object size must be a multiple of stripe unit */
22 if (os < su || os % su)
23 return 0;
24 /* stripe count must be non-zero */
25 if (!sc)
26 return 0;
27 return 1;
28}
29
30
31int ceph_flags_to_mode(int flags)
32{
33 int mode;
34
35#ifdef O_DIRECTORY /* fixme */
36 if ((flags & O_DIRECTORY) == O_DIRECTORY)
37 return CEPH_FILE_MODE_PIN;
38#endif
39 if ((flags & O_APPEND) == O_APPEND)
40 flags |= O_WRONLY;
41
42 if ((flags & O_ACCMODE) == O_RDWR)
43 mode = CEPH_FILE_MODE_RDWR;
44 else if ((flags & O_ACCMODE) == O_WRONLY)
45 mode = CEPH_FILE_MODE_WR;
46 else
47 mode = CEPH_FILE_MODE_RD;
48
49#ifdef O_LAZY
50 if (flags & O_LAZY)
51 mode |= CEPH_FILE_MODE_LAZY;
52#endif
53
54 return mode;
55}
56EXPORT_SYMBOL(ceph_flags_to_mode);
57
58int ceph_caps_for_mode(int mode)
59{
60 int caps = CEPH_CAP_PIN;
61
62 if (mode & CEPH_FILE_MODE_RD)
63 caps |= CEPH_CAP_FILE_SHARED |
64 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
65 if (mode & CEPH_FILE_MODE_WR)
66 caps |= CEPH_CAP_FILE_EXCL |
67 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
68 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
69 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
70 if (mode & CEPH_FILE_MODE_LAZY)
71 caps |= CEPH_CAP_FILE_LAZYIO;
72
73 return caps;
74}
75EXPORT_SYMBOL(ceph_caps_for_mode);
diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c
new file mode 100644
index 000000000000..815ef8826796
--- /dev/null
+++ b/net/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
1
2#include <linux/ceph/types.h>
3
4/*
5 * Robert Jenkins' hash function.
6 * http://burtleburtle.net/bob/hash/evahash.html
7 * This is in the public domain.
8 */
9#define mix(a, b, c) \
10 do { \
11 a = a - b; a = a - c; a = a ^ (c >> 13); \
12 b = b - c; b = b - a; b = b ^ (a << 8); \
13 c = c - a; c = c - b; c = c ^ (b >> 13); \
14 a = a - b; a = a - c; a = a ^ (c >> 12); \
15 b = b - c; b = b - a; b = b ^ (a << 16); \
16 c = c - a; c = c - b; c = c ^ (b >> 5); \
17 a = a - b; a = a - c; a = a ^ (c >> 3); \
18 b = b - c; b = b - a; b = b ^ (a << 10); \
19 c = c - a; c = c - b; c = c ^ (b >> 15); \
20 } while (0)
21
22unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
23{
24 const unsigned char *k = (const unsigned char *)str;
25 __u32 a, b, c; /* the internal state */
26 __u32 len; /* how many key bytes still need mixing */
27
28 /* Set up the internal state */
29 len = length;
30 a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
31 b = a;
32 c = 0; /* variable initialization of internal state */
33
34 /* handle most of the key */
35 while (len >= 12) {
36 a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
37 ((__u32)k[3] << 24));
38 b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
39 ((__u32)k[7] << 24));
40 c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
41 ((__u32)k[11] << 24));
42 mix(a, b, c);
43 k = k + 12;
44 len = len - 12;
45 }
46
47 /* handle the last 11 bytes */
48 c = c + length;
49 switch (len) { /* all the case statements fall through */
50 case 11:
51 c = c + ((__u32)k[10] << 24);
52 case 10:
53 c = c + ((__u32)k[9] << 16);
54 case 9:
55 c = c + ((__u32)k[8] << 8);
56 /* the first byte of c is reserved for the length */
57 case 8:
58 b = b + ((__u32)k[7] << 24);
59 case 7:
60 b = b + ((__u32)k[6] << 16);
61 case 6:
62 b = b + ((__u32)k[5] << 8);
63 case 5:
64 b = b + k[4];
65 case 4:
66 a = a + ((__u32)k[3] << 24);
67 case 3:
68 a = a + ((__u32)k[2] << 16);
69 case 2:
70 a = a + ((__u32)k[1] << 8);
71 case 1:
72 a = a + k[0];
73 /* case 0: nothing left to add */
74 }
75 mix(a, b, c);
76
77 return c;
78}
79
80/*
81 * linux dcache hash
82 */
83unsigned ceph_str_hash_linux(const char *str, unsigned length)
84{
85 unsigned long hash = 0;
86 unsigned char c;
87
88 while (length--) {
89 c = *str++;
90 hash = (hash + (c << 4) + (c >> 4)) * 11;
91 }
92 return hash;
93}
94
95
96unsigned ceph_str_hash(int type, const char *s, unsigned len)
97{
98 switch (type) {
99 case CEPH_STR_HASH_LINUX:
100 return ceph_str_hash_linux(s, len);
101 case CEPH_STR_HASH_RJENKINS:
102 return ceph_str_hash_rjenkins(s, len);
103 default:
104 return -1;
105 }
106}
107
108const char *ceph_str_hash_name(int type)
109{
110 switch (type) {
111 case CEPH_STR_HASH_LINUX:
112 return "linux";
113 case CEPH_STR_HASH_RJENKINS:
114 return "rjenkins";
115 default:
116 return "unknown";
117 }
118}
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
new file mode 100644
index 000000000000..3fbda04de29c
--- /dev/null
+++ b/net/ceph/ceph_strings.c
@@ -0,0 +1,84 @@
1/*
2 * Ceph string constants
3 */
4#include <linux/module.h>
5#include <linux/ceph/types.h>
6
7const char *ceph_entity_type_name(int type)
8{
9 switch (type) {
10 case CEPH_ENTITY_TYPE_MDS: return "mds";
11 case CEPH_ENTITY_TYPE_OSD: return "osd";
12 case CEPH_ENTITY_TYPE_MON: return "mon";
13 case CEPH_ENTITY_TYPE_CLIENT: return "client";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown";
16 }
17}
18
19const char *ceph_osd_op_name(int op)
20{
21 switch (op) {
22 case CEPH_OSD_OP_READ: return "read";
23 case CEPH_OSD_OP_STAT: return "stat";
24
25 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
26
27 case CEPH_OSD_OP_WRITE: return "write";
28 case CEPH_OSD_OP_DELETE: return "delete";
29 case CEPH_OSD_OP_TRUNCATE: return "truncate";
30 case CEPH_OSD_OP_ZERO: return "zero";
31 case CEPH_OSD_OP_WRITEFULL: return "writefull";
32 case CEPH_OSD_OP_ROLLBACK: return "rollback";
33
34 case CEPH_OSD_OP_APPEND: return "append";
35 case CEPH_OSD_OP_STARTSYNC: return "startsync";
36 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
37 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
38
39 case CEPH_OSD_OP_TMAPUP: return "tmapup";
40 case CEPH_OSD_OP_TMAPGET: return "tmapget";
41 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
42
43 case CEPH_OSD_OP_GETXATTR: return "getxattr";
44 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
45 case CEPH_OSD_OP_SETXATTR: return "setxattr";
46 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
47 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
48 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
49 case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
50
51 case CEPH_OSD_OP_PULL: return "pull";
52 case CEPH_OSD_OP_PUSH: return "push";
53 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
54 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
55 case CEPH_OSD_OP_SCRUB: return "scrub";
56
57 case CEPH_OSD_OP_WRLOCK: return "wrlock";
58 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
59 case CEPH_OSD_OP_RDLOCK: return "rdlock";
60 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
61 case CEPH_OSD_OP_UPLOCK: return "uplock";
62 case CEPH_OSD_OP_DNLOCK: return "dnlock";
63
64 case CEPH_OSD_OP_CALL: return "call";
65
66 case CEPH_OSD_OP_PGLS: return "pgls";
67 }
68 return "???";
69}
70
71
72const char *ceph_pool_op_name(int op)
73{
74 switch (op) {
75 case POOL_OP_CREATE: return "create";
76 case POOL_OP_DELETE: return "delete";
77 case POOL_OP_AUID_CHANGE: return "auid change";
78 case POOL_OP_CREATE_SNAP: return "create snap";
79 case POOL_OP_DELETE_SNAP: return "delete snap";
80 case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
81 case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
82 }
83 return "???";
84}
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
new file mode 100644
index 000000000000..d6ebb13a18a4
--- /dev/null
+++ b/net/ceph/crush/crush.c
@@ -0,0 +1,151 @@
1
2#ifdef __KERNEL__
3# include <linux/slab.h>
4#else
5# include <stdlib.h>
6# include <assert.h>
7# define kfree(x) do { if (x) free(x); } while (0)
8# define BUG_ON(x) assert(!(x))
9#endif
10
11#include <linux/crush/crush.h>
12
13const char *crush_bucket_alg_name(int alg)
14{
15 switch (alg) {
16 case CRUSH_BUCKET_UNIFORM: return "uniform";
17 case CRUSH_BUCKET_LIST: return "list";
18 case CRUSH_BUCKET_TREE: return "tree";
19 case CRUSH_BUCKET_STRAW: return "straw";
20 default: return "unknown";
21 }
22}
23
24/**
25 * crush_get_bucket_item_weight - Get weight of an item in given bucket
26 * @b: bucket pointer
27 * @p: item index in bucket
28 */
29int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
30{
31 if (p >= b->size)
32 return 0;
33
34 switch (b->alg) {
35 case CRUSH_BUCKET_UNIFORM:
36 return ((struct crush_bucket_uniform *)b)->item_weight;
37 case CRUSH_BUCKET_LIST:
38 return ((struct crush_bucket_list *)b)->item_weights[p];
39 case CRUSH_BUCKET_TREE:
40 if (p & 1)
41 return ((struct crush_bucket_tree *)b)->node_weights[p];
42 return 0;
43 case CRUSH_BUCKET_STRAW:
44 return ((struct crush_bucket_straw *)b)->item_weights[p];
45 }
46 return 0;
47}
48
49/**
50 * crush_calc_parents - Calculate parent vectors for the given crush map.
51 * @map: crush_map pointer
52 */
53void crush_calc_parents(struct crush_map *map)
54{
55 int i, b, c;
56
57 for (b = 0; b < map->max_buckets; b++) {
58 if (map->buckets[b] == NULL)
59 continue;
60 for (i = 0; i < map->buckets[b]->size; i++) {
61 c = map->buckets[b]->items[i];
62 BUG_ON(c >= map->max_devices ||
63 c < -map->max_buckets);
64 if (c >= 0)
65 map->device_parents[c] = map->buckets[b]->id;
66 else
67 map->bucket_parents[-1-c] = map->buckets[b]->id;
68 }
69 }
70}
71
72void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
73{
74 kfree(b->h.perm);
75 kfree(b->h.items);
76 kfree(b);
77}
78
79void crush_destroy_bucket_list(struct crush_bucket_list *b)
80{
81 kfree(b->item_weights);
82 kfree(b->sum_weights);
83 kfree(b->h.perm);
84 kfree(b->h.items);
85 kfree(b);
86}
87
88void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
89{
90 kfree(b->node_weights);
91 kfree(b);
92}
93
94void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
95{
96 kfree(b->straws);
97 kfree(b->item_weights);
98 kfree(b->h.perm);
99 kfree(b->h.items);
100 kfree(b);
101}
102
103void crush_destroy_bucket(struct crush_bucket *b)
104{
105 switch (b->alg) {
106 case CRUSH_BUCKET_UNIFORM:
107 crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
108 break;
109 case CRUSH_BUCKET_LIST:
110 crush_destroy_bucket_list((struct crush_bucket_list *)b);
111 break;
112 case CRUSH_BUCKET_TREE:
113 crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
114 break;
115 case CRUSH_BUCKET_STRAW:
116 crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
117 break;
118 }
119}
120
121/**
122 * crush_destroy - Destroy a crush_map
123 * @map: crush_map pointer
124 */
125void crush_destroy(struct crush_map *map)
126{
127 int b;
128
129 /* buckets */
130 if (map->buckets) {
131 for (b = 0; b < map->max_buckets; b++) {
132 if (map->buckets[b] == NULL)
133 continue;
134 crush_destroy_bucket(map->buckets[b]);
135 }
136 kfree(map->buckets);
137 }
138
139 /* rules */
140 if (map->rules) {
141 for (b = 0; b < map->max_rules; b++)
142 kfree(map->rules[b]);
143 kfree(map->rules);
144 }
145
146 kfree(map->bucket_parents);
147 kfree(map->device_parents);
148 kfree(map);
149}
150
151
diff --git a/net/ceph/crush/hash.c b/net/ceph/crush/hash.c
new file mode 100644
index 000000000000..5bb63e37a8a1
--- /dev/null
+++ b/net/ceph/crush/hash.c
@@ -0,0 +1,149 @@
1
2#include <linux/types.h>
3#include <linux/crush/hash.h>
4
5/*
6 * Robert Jenkins' function for mixing 32-bit values
7 * http://burtleburtle.net/bob/hash/evahash.html
8 * a, b = random bits, c = input and output
9 */
10#define crush_hashmix(a, b, c) do { \
11 a = a-b; a = a-c; a = a^(c>>13); \
12 b = b-c; b = b-a; b = b^(a<<8); \
13 c = c-a; c = c-b; c = c^(b>>13); \
14 a = a-b; a = a-c; a = a^(c>>12); \
15 b = b-c; b = b-a; b = b^(a<<16); \
16 c = c-a; c = c-b; c = c^(b>>5); \
17 a = a-b; a = a-c; a = a^(c>>3); \
18 b = b-c; b = b-a; b = b^(a<<10); \
19 c = c-a; c = c-b; c = c^(b>>15); \
20 } while (0)
21
22#define crush_hash_seed 1315423911
23
24static __u32 crush_hash32_rjenkins1(__u32 a)
25{
26 __u32 hash = crush_hash_seed ^ a;
27 __u32 b = a;
28 __u32 x = 231232;
29 __u32 y = 1232;
30 crush_hashmix(b, x, hash);
31 crush_hashmix(y, a, hash);
32 return hash;
33}
34
35static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
36{
37 __u32 hash = crush_hash_seed ^ a ^ b;
38 __u32 x = 231232;
39 __u32 y = 1232;
40 crush_hashmix(a, b, hash);
41 crush_hashmix(x, a, hash);
42 crush_hashmix(b, y, hash);
43 return hash;
44}
45
46static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
47{
48 __u32 hash = crush_hash_seed ^ a ^ b ^ c;
49 __u32 x = 231232;
50 __u32 y = 1232;
51 crush_hashmix(a, b, hash);
52 crush_hashmix(c, x, hash);
53 crush_hashmix(y, a, hash);
54 crush_hashmix(b, x, hash);
55 crush_hashmix(y, c, hash);
56 return hash;
57}
58
59static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
60{
61 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
62 __u32 x = 231232;
63 __u32 y = 1232;
64 crush_hashmix(a, b, hash);
65 crush_hashmix(c, d, hash);
66 crush_hashmix(a, x, hash);
67 crush_hashmix(y, b, hash);
68 crush_hashmix(c, x, hash);
69 crush_hashmix(y, d, hash);
70 return hash;
71}
72
73static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
74 __u32 e)
75{
76 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
77 __u32 x = 231232;
78 __u32 y = 1232;
79 crush_hashmix(a, b, hash);
80 crush_hashmix(c, d, hash);
81 crush_hashmix(e, x, hash);
82 crush_hashmix(y, a, hash);
83 crush_hashmix(b, x, hash);
84 crush_hashmix(y, c, hash);
85 crush_hashmix(d, x, hash);
86 crush_hashmix(y, e, hash);
87 return hash;
88}
89
90
91__u32 crush_hash32(int type, __u32 a)
92{
93 switch (type) {
94 case CRUSH_HASH_RJENKINS1:
95 return crush_hash32_rjenkins1(a);
96 default:
97 return 0;
98 }
99}
100
101__u32 crush_hash32_2(int type, __u32 a, __u32 b)
102{
103 switch (type) {
104 case CRUSH_HASH_RJENKINS1:
105 return crush_hash32_rjenkins1_2(a, b);
106 default:
107 return 0;
108 }
109}
110
111__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
112{
113 switch (type) {
114 case CRUSH_HASH_RJENKINS1:
115 return crush_hash32_rjenkins1_3(a, b, c);
116 default:
117 return 0;
118 }
119}
120
121__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
122{
123 switch (type) {
124 case CRUSH_HASH_RJENKINS1:
125 return crush_hash32_rjenkins1_4(a, b, c, d);
126 default:
127 return 0;
128 }
129}
130
131__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
132{
133 switch (type) {
134 case CRUSH_HASH_RJENKINS1:
135 return crush_hash32_rjenkins1_5(a, b, c, d, e);
136 default:
137 return 0;
138 }
139}
140
141const char *crush_hash_name(int type)
142{
143 switch (type) {
144 case CRUSH_HASH_RJENKINS1:
145 return "rjenkins1";
146 default:
147 return "unknown";
148 }
149}
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
new file mode 100644
index 000000000000..42599e31dcad
--- /dev/null
+++ b/net/ceph/crush/mapper.c
@@ -0,0 +1,609 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include <linux/crush/crush.h>
22#include <linux/crush/hash.h>
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
49
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
68static int bucket_perm_choose(struct crush_bucket *bucket,
69 int x, int r)
70{
71 unsigned pr = r % bucket->size;
72 unsigned i, s;
73
74 /* start a new permutation if @x has changed */
75 if (bucket->perm_x != x || bucket->perm_n == 0) {
76 dprintk("bucket %d new x=%d\n", bucket->id, x);
77 bucket->perm_x = x;
78
79 /* optimize common r=0 case */
80 if (pr == 0) {
81 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
82 bucket->size;
83 bucket->perm[0] = s;
84 bucket->perm_n = 0xffff; /* magic value, see below */
85 goto out;
86 }
87
88 for (i = 0; i < bucket->size; i++)
89 bucket->perm[i] = i;
90 bucket->perm_n = 0;
91 } else if (bucket->perm_n == 0xffff) {
92 /* clean up after the r=0 case above */
93 for (i = 1; i < bucket->size; i++)
94 bucket->perm[i] = i;
95 bucket->perm[bucket->perm[0]] = 0;
96 bucket->perm_n = 1;
97 }
98
99 /* calculate permutation up to pr */
100 for (i = 0; i < bucket->perm_n; i++)
101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
102 while (bucket->perm_n <= pr) {
103 unsigned p = bucket->perm_n;
104 /* no point in swapping the final entry */
105 if (p < bucket->size - 1) {
106 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
107 (bucket->size - p);
108 if (i) {
109 unsigned t = bucket->perm[p + i];
110 bucket->perm[p + i] = bucket->perm[p];
111 bucket->perm[p] = t;
112 }
113 dprintk(" perm_choose swap %d with %d\n", p, p+i);
114 }
115 bucket->perm_n++;
116 }
117 for (i = 0; i < bucket->size; i++)
118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
119
120 s = bucket->perm[pr];
121out:
122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
123 bucket->size, x, r, pr, s);
124 return bucket->items[s];
125}
126
127/* uniform */
128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
129 int x, int r)
130{
131 return bucket_perm_choose(&bucket->h, x, r);
132}
133
134/* list */
135static int bucket_list_choose(struct crush_bucket_list *bucket,
136 int x, int r)
137{
138 int i;
139
140 for (i = bucket->h.size-1; i >= 0; i--) {
141 __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
142 r, bucket->h.id);
143 w &= 0xffff;
144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
145 "sw %x rand %llx",
146 i, x, r, bucket->h.items[i], bucket->item_weights[i],
147 bucket->sum_weights[i], w);
148 w *= bucket->sum_weights[i];
149 w = w >> 16;
150 /*dprintk(" scaled %llx\n", w);*/
151 if (w < bucket->item_weights[i])
152 return bucket->h.items[i];
153 }
154
155 BUG_ON(1);
156 return 0;
157}
158
159
160/* (binary) tree */
161static int height(int n)
162{
163 int h = 0;
164 while ((n & 1) == 0) {
165 h++;
166 n = n >> 1;
167 }
168 return h;
169}
170
171static int left(int x)
172{
173 int h = height(x);
174 return x - (1 << (h-1));
175}
176
177static int right(int x)
178{
179 int h = height(x);
180 return x + (1 << (h-1));
181}
182
183static int terminal(int x)
184{
185 return x & 1;
186}
187
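/*
 * The tree bucket is an implicit binary tree laid out in node_weights[]:
 * odd indices are leaves (leaf n holds item n >> 1), even indices are
 * interior nodes, and the root is num_nodes >> 1.  Sketch, assuming a
 * 4-item bucket with num_nodes == 8: root 4, children 2 and 6, leaves
 * 1/3/5/7 mapping to items 0..3.
 */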
188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
189 int x, int r)
190{
191 int n, l;
192 __u32 w;
193 __u64 t;
194
195 /* start at root */
196 n = bucket->num_nodes >> 1;
197
198 while (!terminal(n)) {
199 /* pick point in [0, w) */
200 w = bucket->node_weights[n];
201 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
202 bucket->h.id) * (__u64)w;
203 t = t >> 32;
204
205 /* descend to the left or right? */
206 l = left(n);
207 if (t < bucket->node_weights[l])
208 n = l;
209 else
210 n = right(n);
211 }
212
213 return bucket->h.items[n >> 1];
214}
215
216
217/* straw */
218
219static int bucket_straw_choose(struct crush_bucket_straw *bucket,
220 int x, int r)
221{
222 int i;
223 int high = 0;
224 __u64 high_draw = 0;
225 __u64 draw;
226
227 for (i = 0; i < bucket->h.size; i++) {
228 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
229 draw &= 0xffff;
230 draw *= bucket->straws[i];
231 if (i == 0 || draw > high_draw) {
232 high = i;
233 high_draw = draw;
234 }
235 }
236 return bucket->h.items[high];
237}
238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{
241 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
245 x, r);
246 case CRUSH_BUCKET_LIST:
247 return bucket_list_choose((struct crush_bucket_list *)in,
248 x, r);
249 case CRUSH_BUCKET_TREE:
250 return bucket_tree_choose((struct crush_bucket_tree *)in,
251 x, r);
252 case CRUSH_BUCKET_STRAW:
253 return bucket_straw_choose((struct crush_bucket_straw *)in,
254 x, r);
255 default:
256 BUG_ON(1);
257 return in->items[0];
258 }
259}
260
261/*
262 * true if device is marked "out" of the cluster
263 * (failed or fully offloaded)
264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{
267 if (weight[item] >= 0x10000)
268 return 0;
269 if (weight[item] == 0)
270 return 1;
271 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
272 < weight[item])
273 return 0;
274 return 1;
275}
276
277/**
278 * crush_choose - choose numrep distinct items of given type
279 * @map: the crush_map
280 * @bucket: the bucket we are choosing an item from
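 * @weight: vector of per-device weights (0 = out, 0x10000 = fully "in")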
281 * @x: crush input value
282 * @numrep: the number of items to choose
283 * @type: the type of item to choose
284 * @out: pointer to output vector
285 * @outpos: our position in that vector
286 * @firstn: true if choosing "first n" items, false if choosing "indep"
287 * @recurse_to_leaf: true if we want one device under each item of given type
288 * @out2: second output vector for leaf items (if @recurse_to_leaf)
289 */
290static int crush_choose(struct crush_map *map,
291 struct crush_bucket *bucket,
292 __u32 *weight,
293 int x, int numrep, int type,
294 int *out, int outpos,
295 int firstn, int recurse_to_leaf,
296 int *out2)
297{
298 int rep;
299 int ftotal, flocal;
300 int retry_descent, retry_bucket, skip_rep;
301 struct crush_bucket *in = bucket;
302 int r;
303 int i;
304 int item = 0;
305 int itemtype;
306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */
308
309 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
310 bucket->id, x, outpos, numrep);
311
312 for (rep = outpos; rep < numrep; rep++) {
313 /* keep trying until we get a non-out, non-colliding item */
314 ftotal = 0;
315 skip_rep = 0;
316 do {
317 retry_descent = 0;
318 in = bucket; /* initial bucket */
319
320 /* choose through intervening buckets */
321 flocal = 0;
322 do {
323 collide = 0;
324 retry_bucket = 0;
325 r = rep;
326 if (in->alg == CRUSH_BUCKET_UNIFORM) {
327 /* be careful */
328 if (firstn || numrep >= in->size)
329 /* r' = r + f_total */
330 r += ftotal;
331 else if (in->size % numrep == 0)
332 /* r'=r+(n+1)*f_local */
333 r += (numrep+1) *
334 (flocal+ftotal);
335 else
336 /* r' = r + n*f_local */
337 r += numrep * (flocal+ftotal);
338 } else {
339 if (firstn)
340 /* r' = r + f_total */
341 r += ftotal;
342 else
343 /* r' = r + n*f_local */
344 r += numrep * (flocal+ftotal);
345 }
346
347 /* bucket choose */
348 if (in->size == 0) {
349 reject = 1;
350 goto reject;
351 }
352 if (flocal >= (in->size>>1) &&
353 flocal > orig_tries)
354 item = bucket_perm_choose(in, x, r);
355 else
356 item = crush_bucket_choose(in, x, r);
357 BUG_ON(item >= map->max_devices);
358
359 /* desired type? */
360 if (item < 0)
361 itemtype = map->buckets[-1-item]->type;
362 else
363 itemtype = 0;
364 dprintk(" item %d type %d\n", item, itemtype);
365
366 /* keep going? */
367 if (itemtype != type) {
368 BUG_ON(item >= 0 ||
369 (-1-item) >= map->max_buckets);
370 in = map->buckets[-1-item];
371 retry_bucket = 1;
372 continue;
373 }
374
375 /* collision? */
376 for (i = 0; i < outpos; i++) {
377 if (out[i] == item) {
378 collide = 1;
379 break;
380 }
381 }
382
383 reject = 0;
384 if (recurse_to_leaf) {
385 if (item < 0) {
386 if (crush_choose(map,
387 map->buckets[-1-item],
388 weight,
389 x, outpos+1, 0,
390 out2, outpos,
391 firstn, 0,
392 NULL) <= outpos)
393 /* didn't get leaf */
394 reject = 1;
395 } else {
396 /* we already have a leaf! */
397 out2[outpos] = item;
398 }
399 }
400
401 if (!reject) {
402 /* out? */
403 if (itemtype == 0)
404 reject = is_out(map, weight,
405 item, x);
406 else
407 reject = 0;
408 }
409
410reject:
411 if (reject || collide) {
412 ftotal++;
413 flocal++;
414
415 if (collide && flocal < 3)
416 /* retry locally a few times */
417 retry_bucket = 1;
418 else if (flocal < in->size + orig_tries)
419 /* exhaustive bucket search */
420 retry_bucket = 1;
421 else if (ftotal < 20)
422 /* then retry descent */
423 retry_descent = 1;
424 else
425 /* else give up */
426 skip_rep = 1;
427 dprintk(" reject %d collide %d "
428 "ftotal %d flocal %d\n",
429 reject, collide, ftotal,
430 flocal);
431 }
432 } while (retry_bucket);
433 } while (retry_descent);
434
435 if (skip_rep) {
436 dprintk("skip rep\n");
437 continue;
438 }
439
440 dprintk("CHOOSE got %d\n", item);
441 out[outpos] = item;
442 outpos++;
443 }
444
445 dprintk("CHOOSE returns %d\n", outpos);
446 return outpos;
447}
448
449
450/**
451 * crush_do_rule - calculate a mapping with the given input and rule
452 * @map: the crush_map
453 * @ruleno: the rule id
454 * @x: hash input
455 * @result: pointer to result vector
456 * @result_max: maximum result size
457 * @force: force initial replica choice; -1 for none
458 */
459int crush_do_rule(struct crush_map *map,
460 int ruleno, int x, int *result, int result_max,
461 int force, __u32 *weight)
462{
463 int result_len;
464 int force_context[CRUSH_MAX_DEPTH];
465 int force_pos = -1;
466 int a[CRUSH_MAX_SET];
467 int b[CRUSH_MAX_SET];
468 int c[CRUSH_MAX_SET];
469 int recurse_to_leaf;
470 int *w;
471 int wsize = 0;
472 int *o;
473 int osize;
474 int *tmp;
475 struct crush_rule *rule;
476 int step;
477 int i, j;
478 int numrep;
479 int firstn;
480 int rc = -1;
481
482 BUG_ON(ruleno >= map->max_rules);
483
484 rule = map->rules[ruleno];
485 result_len = 0;
486 w = a;
487 o = b;
488
489 /*
490 * determine hierarchical context of force, if any. note
491 * that this may or may not correspond to the specific types
492 * referenced by the crush rule.
493 */
494 if (force >= 0) {
495 if (force >= map->max_devices ||
496 map->device_parents[force] == 0) {
497 /*dprintk("CRUSH: forcefed device dne\n");*/
498 rc = -1; /* force fed device dne */
499 goto out;
500 }
501 if (!is_out(map, weight, force, x)) {
502 while (1) {
503 force_context[++force_pos] = force;
504 if (force >= 0)
505 force = map->device_parents[force];
506 else
507 force = map->bucket_parents[-1-force];
508 if (force == 0)
509 break;
510 }
511 }
512 }
513
514 for (step = 0; step < rule->len; step++) {
515 firstn = 0;
516 switch (rule->steps[step].op) {
517 case CRUSH_RULE_TAKE:
518 w[0] = rule->steps[step].arg1;
519 if (force_pos >= 0) {
520 BUG_ON(force_context[force_pos] != w[0]);
521 force_pos--;
522 }
523 wsize = 1;
524 break;
525
526 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
527 case CRUSH_RULE_CHOOSE_FIRSTN:
528 firstn = 1;
529 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
530 case CRUSH_RULE_CHOOSE_INDEP:
531 BUG_ON(wsize == 0);
532
533 recurse_to_leaf =
534 rule->steps[step].op ==
535 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
536 rule->steps[step].op ==
537 CRUSH_RULE_CHOOSE_LEAF_INDEP;
538
539 /* reset output */
540 osize = 0;
541
542 for (i = 0; i < wsize; i++) {
543 /*
544 * see CRUSH_N, CRUSH_N_MINUS macros.
545 * basically, numrep <= 0 means relative to
546 * the provided result_max
547 */
548 numrep = rule->steps[step].arg1;
549 if (numrep <= 0) {
550 numrep += result_max;
551 if (numrep <= 0)
552 continue;
553 }
554 j = 0;
555 if (osize == 0 && force_pos >= 0) {
556 /* skip any intermediate types */
557 while (force_pos &&
558 force_context[force_pos] < 0 &&
559 rule->steps[step].arg2 !=
560 map->buckets[-1 -
561 force_context[force_pos]]->type)
562 force_pos--;
563 o[osize] = force_context[force_pos];
564 if (recurse_to_leaf)
565 c[osize] = force_context[0];
566 j++;
567 force_pos--;
568 }
569 osize += crush_choose(map,
570 map->buckets[-1-w[i]],
571 weight,
572 x, numrep,
573 rule->steps[step].arg2,
574 o+osize, j,
575 firstn,
576 recurse_to_leaf, c+osize);
577 }
578
579 if (recurse_to_leaf)
580 /* copy final _leaf_ values to output set */
581 memcpy(o, c, osize*sizeof(*o));
582
583 /* swap o and w arrays */
584 tmp = o;
585 o = w;
586 w = tmp;
587 wsize = osize;
588 break;
589
590
591 case CRUSH_RULE_EMIT:
592 for (i = 0; i < wsize && result_len < result_max; i++) {
593 result[result_len] = w[i];
594 result_len++;
595 }
596 wsize = 0;
597 break;
598
599 default:
600 BUG_ON(1);
601 }
602 }
603 rc = result_len;
604
605out:
606 return rc;
607}
608
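/*
 * Illustrative only (values are assumptions, not taken from a real map):
 * a typical replicated rule runs as three steps interpreted by the loop
 * above, e.g.
 *
 *   { CRUSH_RULE_TAKE,               arg1 = <root bucket id> }
 *   { CRUSH_RULE_CHOOSE_LEAF_FIRSTN, arg1 = 0 (i.e. result_max),
 *                                    arg2 = <"host" type id> }
 *   { CRUSH_RULE_EMIT }
 *
 * which takes the root, picks result_max distinct hosts with one leaf
 * device under each, then emits those leaves into @result.
 */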
609
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
new file mode 100644
index 000000000000..7b505b0c983f
--- /dev/null
+++ b/net/ceph/crypto.c
@@ -0,0 +1,412 @@
1
2#include <linux/ceph/ceph_debug.h>
3
4#include <linux/err.h>
5#include <linux/scatterlist.h>
6#include <linux/slab.h>
7#include <crypto/hash.h>
8
9#include <linux/ceph/decode.h>
10#include "crypto.h"
11
12int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
13{
14 if (*p + sizeof(u16) + sizeof(key->created) +
15 sizeof(u16) + key->len > end)
16 return -ERANGE;
17 ceph_encode_16(p, key->type);
18 ceph_encode_copy(p, &key->created, sizeof(key->created));
19 ceph_encode_16(p, key->len);
20 ceph_encode_copy(p, key->key, key->len);
21 return 0;
22}
23
24int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
25{
26 ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
27 key->type = ceph_decode_16(p);
28 ceph_decode_copy(p, &key->created, sizeof(key->created));
29 key->len = ceph_decode_16(p);
30 ceph_decode_need(p, end, key->len, bad);
31 key->key = kmalloc(key->len, GFP_NOFS);
32 if (!key->key)
33 return -ENOMEM;
34 ceph_decode_copy(p, key->key, key->len);
35 return 0;
36
37bad:
38 dout("failed to decode crypto key\n");
39 return -EINVAL;
40}
41
42int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
43{
44 int inlen = strlen(inkey);
45 int blen = inlen * 3 / 4;
46 void *buf, *p;
47 int ret;
48
49 dout("crypto_key_unarmor %s\n", inkey);
50 buf = kmalloc(blen, GFP_NOFS);
51 if (!buf)
52 return -ENOMEM;
53 blen = ceph_unarmor(buf, inkey, inkey+inlen);
54 if (blen < 0) {
55 kfree(buf);
56 return blen;
57 }
58
59 p = buf;
60 ret = ceph_crypto_key_decode(key, &p, p + blen);
61 kfree(buf);
62 if (ret)
63 return ret;
64 dout("crypto_key_unarmor key %p type %d len %d\n", key,
65 key->type, key->len);
66 return 0;
67}
68
69
70
71#define AES_KEY_SIZE 16
72
73static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
74{
75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
76}
77
78static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
79
80static int ceph_aes_encrypt(const void *key, int key_len,
81 void *dst, size_t *dst_len,
82 const void *src, size_t src_len)
83{
84 struct scatterlist sg_in[2], sg_out[1];
85 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
86 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
87 int ret;
88 void *iv;
89 int ivsize;
90 size_t zero_padding = (0x10 - (src_len & 0x0f));
91 char pad[16];
92
93 if (IS_ERR(tfm))
94 return PTR_ERR(tfm);
95
96 memset(pad, zero_padding, zero_padding);
97
98 *dst_len = src_len + zero_padding;
99
100 crypto_blkcipher_setkey((void *)tfm, key, key_len);
101 sg_init_table(sg_in, 2);
102 sg_set_buf(&sg_in[0], src, src_len);
103 sg_set_buf(&sg_in[1], pad, zero_padding);
104 sg_init_table(sg_out, 1);
105 sg_set_buf(sg_out, dst, *dst_len);
106 iv = crypto_blkcipher_crt(tfm)->iv;
107 ivsize = crypto_blkcipher_ivsize(tfm);
108
109 memcpy(iv, aes_iv, ivsize);
110 /*
111 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
112 key, key_len, 1);
113 print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
114 src, src_len, 1);
115 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
116 pad, zero_padding, 1);
117 */
118 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
119 src_len + zero_padding);
120 crypto_free_blkcipher(tfm);
121 if (ret < 0)
122 pr_err("ceph_aes_crypt failed %d\n", ret);
123 /*
124 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
125 dst, *dst_len, 1);
126 */
127	return ret;
128}
129
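A small standalone sketch (illustration only, not part of the patch) of the PKCS#7-style padding ceph_aes_encrypt() applies above: the plaintext is always padded up to the next 16-byte AES block, each pad byte carries the pad length, and ceph_aes_decrypt() later strips it by inspecting the final byte. The pad_len helper is hypothetical.

#include <stdio.h>
#include <string.h>

/* same expression as zero_padding in ceph_aes_encrypt(): always 1..16 */
static size_t pad_len(size_t src_len)
{
	return 0x10 - (src_len & 0x0f);
}

int main(void)
{
	const size_t lens[] = { 1, 15, 16, 17 };
	size_t i;

	for (i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) {
		char pad[16];
		size_t p = pad_len(lens[i]);

		memset(pad, (int)p, p);	/* pad bytes carry the pad length */
		printf("src_len=%zu pad=%zu total=%zu\n",
		       lens[i], p, lens[i] + p);
	}
	return 0;
}
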
130static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
131 size_t *dst_len,
132 const void *src1, size_t src1_len,
133 const void *src2, size_t src2_len)
134{
135 struct scatterlist sg_in[3], sg_out[1];
136 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
137 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
138 int ret;
139 void *iv;
140 int ivsize;
141 size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
142 char pad[16];
143
144 if (IS_ERR(tfm))
145 return PTR_ERR(tfm);
146
147 memset(pad, zero_padding, zero_padding);
148
149 *dst_len = src1_len + src2_len + zero_padding;
150
151 crypto_blkcipher_setkey((void *)tfm, key, key_len);
152 sg_init_table(sg_in, 3);
153 sg_set_buf(&sg_in[0], src1, src1_len);
154 sg_set_buf(&sg_in[1], src2, src2_len);
155 sg_set_buf(&sg_in[2], pad, zero_padding);
156 sg_init_table(sg_out, 1);
157 sg_set_buf(sg_out, dst, *dst_len);
158 iv = crypto_blkcipher_crt(tfm)->iv;
159 ivsize = crypto_blkcipher_ivsize(tfm);
160
161 memcpy(iv, aes_iv, ivsize);
162 /*
163 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
164 key, key_len, 1);
165 print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
166 src1, src1_len, 1);
167 print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
168 src2, src2_len, 1);
169 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
170 pad, zero_padding, 1);
171 */
172 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
173 src1_len + src2_len + zero_padding);
174 crypto_free_blkcipher(tfm);
175 if (ret < 0)
176 pr_err("ceph_aes_crypt2 failed %d\n", ret);
177 /*
178 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
179 dst, *dst_len, 1);
180 */
181	return ret;
182}
183
184static int ceph_aes_decrypt(const void *key, int key_len,
185 void *dst, size_t *dst_len,
186 const void *src, size_t src_len)
187{
188 struct scatterlist sg_in[1], sg_out[2];
189 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
190 struct blkcipher_desc desc = { .tfm = tfm };
191 char pad[16];
192 void *iv;
193 int ivsize;
194 int ret;
195 int last_byte;
196
197 if (IS_ERR(tfm))
198 return PTR_ERR(tfm);
199
200 crypto_blkcipher_setkey((void *)tfm, key, key_len);
201 sg_init_table(sg_in, 1);
202 sg_init_table(sg_out, 2);
203 sg_set_buf(sg_in, src, src_len);
204 sg_set_buf(&sg_out[0], dst, *dst_len);
205 sg_set_buf(&sg_out[1], pad, sizeof(pad));
206
207 iv = crypto_blkcipher_crt(tfm)->iv;
208 ivsize = crypto_blkcipher_ivsize(tfm);
209
210 memcpy(iv, aes_iv, ivsize);
211
212 /*
213 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
214 key, key_len, 1);
215 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
216 src, src_len, 1);
217 */
218
219 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
220 crypto_free_blkcipher(tfm);
221 if (ret < 0) {
222 pr_err("ceph_aes_decrypt failed %d\n", ret);
223 return ret;
224 }
225
226 if (src_len <= *dst_len)
227 last_byte = ((char *)dst)[src_len - 1];
228 else
229 last_byte = pad[src_len - *dst_len - 1];
230 if (last_byte <= 16 && src_len >= last_byte) {
231 *dst_len = src_len - last_byte;
232 } else {
233 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
234 last_byte, (int)src_len);
235 return -EPERM; /* bad padding */
236 }
237 /*
238 print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
239 dst, *dst_len, 1);
240 */
241 return 0;
242}
243
244static int ceph_aes_decrypt2(const void *key, int key_len,
245 void *dst1, size_t *dst1_len,
246 void *dst2, size_t *dst2_len,
247 const void *src, size_t src_len)
248{
249 struct scatterlist sg_in[1], sg_out[3];
250 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
251 struct blkcipher_desc desc = { .tfm = tfm };
252 char pad[16];
253 void *iv;
254 int ivsize;
255 int ret;
256 int last_byte;
257
258 if (IS_ERR(tfm))
259 return PTR_ERR(tfm);
260
261 sg_init_table(sg_in, 1);
262 sg_set_buf(sg_in, src, src_len);
263 sg_init_table(sg_out, 3);
264 sg_set_buf(&sg_out[0], dst1, *dst1_len);
265 sg_set_buf(&sg_out[1], dst2, *dst2_len);
266 sg_set_buf(&sg_out[2], pad, sizeof(pad));
267
268 crypto_blkcipher_setkey((void *)tfm, key, key_len);
269 iv = crypto_blkcipher_crt(tfm)->iv;
270 ivsize = crypto_blkcipher_ivsize(tfm);
271
272 memcpy(iv, aes_iv, ivsize);
273
274 /*
275 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
276 key, key_len, 1);
277 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
278 src, src_len, 1);
279 */
280
281 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
282 crypto_free_blkcipher(tfm);
283 if (ret < 0) {
284 pr_err("ceph_aes_decrypt failed %d\n", ret);
285 return ret;
286 }
287
288 if (src_len <= *dst1_len)
289 last_byte = ((char *)dst1)[src_len - 1];
290 else if (src_len <= *dst1_len + *dst2_len)
291 last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
292 else
293 last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
294 if (last_byte <= 16 && src_len >= last_byte) {
295 src_len -= last_byte;
296 } else {
297 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
298 last_byte, (int)src_len);
299 return -EPERM; /* bad padding */
300 }
301
302 if (src_len < *dst1_len) {
303 *dst1_len = src_len;
304 *dst2_len = 0;
305 } else {
306 *dst2_len = src_len - *dst1_len;
307 }
308 /*
309 print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
310 dst1, *dst1_len, 1);
311 print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
312 dst2, *dst2_len, 1);
313 */
314
315 return 0;
316}
317
318
319int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
320 const void *src, size_t src_len)
321{
322 switch (secret->type) {
323 case CEPH_CRYPTO_NONE:
324 if (*dst_len < src_len)
325 return -ERANGE;
326 memcpy(dst, src, src_len);
327 *dst_len = src_len;
328 return 0;
329
330 case CEPH_CRYPTO_AES:
331 return ceph_aes_decrypt(secret->key, secret->len, dst,
332 dst_len, src, src_len);
333
334 default:
335 return -EINVAL;
336 }
337}
338
339int ceph_decrypt2(struct ceph_crypto_key *secret,
340 void *dst1, size_t *dst1_len,
341 void *dst2, size_t *dst2_len,
342 const void *src, size_t src_len)
343{
344 size_t t;
345
346 switch (secret->type) {
347 case CEPH_CRYPTO_NONE:
348 if (*dst1_len + *dst2_len < src_len)
349 return -ERANGE;
350 t = min(*dst1_len, src_len);
351 memcpy(dst1, src, t);
352 *dst1_len = t;
353 src += t;
354 src_len -= t;
355 if (src_len) {
356 t = min(*dst2_len, src_len);
357 memcpy(dst2, src, t);
358 *dst2_len = t;
359 }
360 return 0;
361
362 case CEPH_CRYPTO_AES:
363 return ceph_aes_decrypt2(secret->key, secret->len,
364 dst1, dst1_len, dst2, dst2_len,
365 src, src_len);
366
367 default:
368 return -EINVAL;
369 }
370}
371
372int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
373 const void *src, size_t src_len)
374{
375 switch (secret->type) {
376 case CEPH_CRYPTO_NONE:
377 if (*dst_len < src_len)
378 return -ERANGE;
379 memcpy(dst, src, src_len);
380 *dst_len = src_len;
381 return 0;
382
383 case CEPH_CRYPTO_AES:
384 return ceph_aes_encrypt(secret->key, secret->len, dst,
385 dst_len, src, src_len);
386
387 default:
388 return -EINVAL;
389 }
390}
391
392int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
393 const void *src1, size_t src1_len,
394 const void *src2, size_t src2_len)
395{
396 switch (secret->type) {
397 case CEPH_CRYPTO_NONE:
398 if (*dst_len < src1_len + src2_len)
399 return -ERANGE;
400 memcpy(dst, src1, src1_len);
401 memcpy(dst + src1_len, src2, src2_len);
402 *dst_len = src1_len + src2_len;
403 return 0;
404
405 case CEPH_CRYPTO_AES:
406 return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
407 src1, src1_len, src2, src2_len);
408
409 default:
410 return -EINVAL;
411 }
412}
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h
new file mode 100644
index 000000000000..f9eccace592b
--- /dev/null
+++ b/net/ceph/crypto.h
@@ -0,0 +1,48 @@
1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H
3
4#include <linux/ceph/types.h>
5#include <linux/ceph/buffer.h>
6
7/*
8 * cryptographic secret
9 */
10struct ceph_crypto_key {
11 int type;
12 struct ceph_timespec created;
13 int len;
14 void *key;
15};
16
17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
18{
19 kfree(key->key);
20}
21
22extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
23 void **p, void *end);
24extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
25 void **p, void *end);
26extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
27
28/* crypto.c */
29extern int ceph_decrypt(struct ceph_crypto_key *secret,
30 void *dst, size_t *dst_len,
31 const void *src, size_t src_len);
32extern int ceph_encrypt(struct ceph_crypto_key *secret,
33 void *dst, size_t *dst_len,
34 const void *src, size_t src_len);
35extern int ceph_decrypt2(struct ceph_crypto_key *secret,
36 void *dst1, size_t *dst1_len,
37 void *dst2, size_t *dst2_len,
38 const void *src, size_t src_len);
39extern int ceph_encrypt2(struct ceph_crypto_key *secret,
40 void *dst, size_t *dst_len,
41 const void *src1, size_t src1_len,
42 const void *src2, size_t src2_len);
43
44/* armor.c */
45extern int ceph_armor(char *dst, const char *src, const char *end);
46extern int ceph_unarmor(char *dst, const char *src, const char *end);
47
48#endif
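A minimal kernel-context sketch (illustration only, not part of the patch) of the calling convention declared above, using the CEPH_CRYPTO_NONE key type, which crypto.c implements as a plain copy. It assumes the declarations above plus the usual kernel headers; the function name is hypothetical, and the error values follow the code above (-ERANGE when a destination buffer is too small).

static int crypto_none_roundtrip_sketch(void)
{
	struct ceph_crypto_key key = { .type = CEPH_CRYPTO_NONE };
	const char msg[] = "hello";
	char enc[32], dec[32];
	size_t enc_len = sizeof(enc), dec_len = sizeof(dec);
	int ret;

	ret = ceph_encrypt(&key, enc, &enc_len, msg, sizeof(msg));
	if (ret)
		return ret;		/* -ERANGE if enc[] were too small */
	ret = ceph_decrypt(&key, dec, &dec_len, enc, enc_len);
	if (ret)
		return ret;
	return memcmp(dec, msg, dec_len) ? -EINVAL : 0;
}
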
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
new file mode 100644
index 000000000000..27d4ea315d12
--- /dev/null
+++ b/net/ceph/debugfs.c
@@ -0,0 +1,267 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/device.h>
4#include <linux/slab.h>
5#include <linux/module.h>
6#include <linux/ctype.h>
7#include <linux/debugfs.h>
8#include <linux/seq_file.h>
9
10#include <linux/ceph/libceph.h>
11#include <linux/ceph/mon_client.h>
12#include <linux/ceph/auth.h>
13#include <linux/ceph/debugfs.h>
14
15#ifdef CONFIG_DEBUG_FS
16
17/*
18 * Implement /sys/kernel/debug/ceph fun
19 *
20 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
21 * .../osdmap - current osdmap
22 * .../monmap - current monmap
23 * .../osdc - active osd requests
24 * .../monc - mon client state
25 * .../dentry_lru - dump contents of dentry lru
26 * .../caps - expose cap (reservation) stats
27 * .../bdi - symlink to ../../bdi/something
28 */
29
30static struct dentry *ceph_debugfs_dir;
31
32static int monmap_show(struct seq_file *s, void *p)
33{
34 int i;
35 struct ceph_client *client = s->private;
36
37 if (client->monc.monmap == NULL)
38 return 0;
39
40 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
41 for (i = 0; i < client->monc.monmap->num_mon; i++) {
42 struct ceph_entity_inst *inst =
43 &client->monc.monmap->mon_inst[i];
44
45 seq_printf(s, "\t%s%lld\t%s\n",
46 ENTITY_NAME(inst->name),
47 ceph_pr_addr(&inst->addr.in_addr));
48 }
49 return 0;
50}
51
52static int osdmap_show(struct seq_file *s, void *p)
53{
54 int i;
55 struct ceph_client *client = s->private;
56 struct rb_node *n;
57
58 if (client->osdc.osdmap == NULL)
59 return 0;
60 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
61 seq_printf(s, "flags%s%s\n",
62 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
63 " NEARFULL" : "",
64 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
65 " FULL" : "");
66 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
67 struct ceph_pg_pool_info *pool =
68 rb_entry(n, struct ceph_pg_pool_info, node);
69 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
70 pool->id, pool->v.pg_num, pool->pg_num_mask,
71 pool->v.lpg_num, pool->lpg_num_mask);
72 }
73 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
74 struct ceph_entity_addr *addr =
75 &client->osdc.osdmap->osd_addr[i];
76 int state = client->osdc.osdmap->osd_state[i];
77 char sb[64];
78
79 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
80 i, ceph_pr_addr(&addr->in_addr),
81 ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
82 ceph_osdmap_state_str(sb, sizeof(sb), state));
83 }
84 return 0;
85}
86
87static int monc_show(struct seq_file *s, void *p)
88{
89 struct ceph_client *client = s->private;
90 struct ceph_mon_generic_request *req;
91 struct ceph_mon_client *monc = &client->monc;
92 struct rb_node *rp;
93
94 mutex_lock(&monc->mutex);
95
96 if (monc->have_mdsmap)
97 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
98 if (monc->have_osdmap)
99 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
100 if (monc->want_next_osdmap)
101 seq_printf(s, "want next osdmap\n");
102
103 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
104 __u16 op;
105 req = rb_entry(rp, struct ceph_mon_generic_request, node);
106 op = le16_to_cpu(req->request->hdr.type);
107 if (op == CEPH_MSG_STATFS)
108 seq_printf(s, "%lld statfs\n", req->tid);
109 else
110 seq_printf(s, "%lld unknown\n", req->tid);
111 }
112
113 mutex_unlock(&monc->mutex);
114 return 0;
115}
116
117static int osdc_show(struct seq_file *s, void *pp)
118{
119 struct ceph_client *client = s->private;
120 struct ceph_osd_client *osdc = &client->osdc;
121 struct rb_node *p;
122
123 mutex_lock(&osdc->request_mutex);
124 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
125 struct ceph_osd_request *req;
126 struct ceph_osd_request_head *head;
127 struct ceph_osd_op *op;
128 int num_ops;
129 int opcode, olen;
130 int i;
131
132 req = rb_entry(p, struct ceph_osd_request, r_node);
133
134 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
135 req->r_osd ? req->r_osd->o_osd : -1,
136 le32_to_cpu(req->r_pgid.pool),
137 le16_to_cpu(req->r_pgid.ps));
138
139 head = req->r_request->front.iov_base;
140 op = (void *)(head + 1);
141
142 num_ops = le16_to_cpu(head->num_ops);
143 olen = le32_to_cpu(head->object_len);
144 seq_printf(s, "%.*s", olen,
145 (const char *)(head->ops + num_ops));
146
147 if (req->r_reassert_version.epoch)
148 seq_printf(s, "\t%u'%llu",
149 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
150 le64_to_cpu(req->r_reassert_version.version));
151 else
152 seq_printf(s, "\t");
153
154 for (i = 0; i < num_ops; i++) {
155 opcode = le16_to_cpu(op->op);
156 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
157 op++;
158 }
159
160 seq_printf(s, "\n");
161 }
162 mutex_unlock(&osdc->request_mutex);
163 return 0;
164}
165
166CEPH_DEFINE_SHOW_FUNC(monmap_show)
167CEPH_DEFINE_SHOW_FUNC(osdmap_show)
168CEPH_DEFINE_SHOW_FUNC(monc_show)
169CEPH_DEFINE_SHOW_FUNC(osdc_show)
170
171int ceph_debugfs_init(void)
172{
173 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
174 if (!ceph_debugfs_dir)
175 return -ENOMEM;
176 return 0;
177}
178
179void ceph_debugfs_cleanup(void)
180{
181 debugfs_remove(ceph_debugfs_dir);
182}
183
184int ceph_debugfs_client_init(struct ceph_client *client)
185{
186 int ret = -ENOMEM;
187 char name[80];
188
189 snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
190 client->monc.auth->global_id);
191
192 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
193 if (!client->debugfs_dir)
194 goto out;
195
196 client->monc.debugfs_file = debugfs_create_file("monc",
197 0600,
198 client->debugfs_dir,
199 client,
200 &monc_show_fops);
201 if (!client->monc.debugfs_file)
202 goto out;
203
204 client->osdc.debugfs_file = debugfs_create_file("osdc",
205 0600,
206 client->debugfs_dir,
207 client,
208 &osdc_show_fops);
209 if (!client->osdc.debugfs_file)
210 goto out;
211
212 client->debugfs_monmap = debugfs_create_file("monmap",
213 0600,
214 client->debugfs_dir,
215 client,
216 &monmap_show_fops);
217 if (!client->debugfs_monmap)
218 goto out;
219
220 client->debugfs_osdmap = debugfs_create_file("osdmap",
221 0600,
222 client->debugfs_dir,
223 client,
224 &osdmap_show_fops);
225 if (!client->debugfs_osdmap)
226 goto out;
227
228 return 0;
229
230out:
231 ceph_debugfs_client_cleanup(client);
232 return ret;
233}
234
235void ceph_debugfs_client_cleanup(struct ceph_client *client)
236{
237 debugfs_remove(client->debugfs_osdmap);
238 debugfs_remove(client->debugfs_monmap);
239 debugfs_remove(client->osdc.debugfs_file);
240 debugfs_remove(client->monc.debugfs_file);
241 debugfs_remove(client->debugfs_dir);
242}
243
244#else /* CONFIG_DEBUG_FS */
245
246int ceph_debugfs_init(void)
247{
248 return 0;
249}
250
251void ceph_debugfs_cleanup(void)
252{
253}
254
255int ceph_debugfs_client_init(struct ceph_client *client)
256{
257 return 0;
258}
259
260void ceph_debugfs_client_cleanup(struct ceph_client *client)
261{
262}
263
264#endif /* CONFIG_DEBUG_FS */
265
266EXPORT_SYMBOL(ceph_debugfs_init);
267EXPORT_SYMBOL(ceph_debugfs_cleanup);
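A kernel-context sketch (illustration only, not part of the patch) of how the hooks above are meant to nest: the module-wide directory is created once, each client hangs its files underneath it, and ceph_debugfs_client_cleanup() is safe to call on a partially initialized client because debugfs_remove() ignores NULL dentries. The function name is hypothetical.

static int debugfs_usage_sketch(struct ceph_client *client)
{
	int ret;

	ret = ceph_debugfs_init();		/* /sys/kernel/debug/ceph */
	if (ret)
		return ret;

	ret = ceph_debugfs_client_init(client);	/* ceph/<fsid>.client<id>/ */
	if (ret) {
		ceph_debugfs_cleanup();
		return ret;
	}

	/* ... client runs; monc/osdc/monmap/osdmap files are readable ... */

	ceph_debugfs_client_cleanup(client);
	ceph_debugfs_cleanup();
	return 0;
}
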
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
new file mode 100644
index 000000000000..0e8157ee5d43
--- /dev/null
+++ b/net/ceph/messenger.c
@@ -0,0 +1,2453 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/crc32c.h>
4#include <linux/ctype.h>
5#include <linux/highmem.h>
6#include <linux/inet.h>
7#include <linux/kthread.h>
8#include <linux/net.h>
9#include <linux/slab.h>
10#include <linux/socket.h>
11#include <linux/string.h>
12#include <linux/bio.h>
13#include <linux/blkdev.h>
14#include <net/tcp.h>
15
16#include <linux/ceph/libceph.h>
17#include <linux/ceph/messenger.h>
18#include <linux/ceph/decode.h>
19#include <linux/ceph/pagelist.h>
20
21/*
22 * Ceph uses the messenger to exchange ceph_msg messages with other
23 * hosts in the system. The messenger provides ordered and reliable
24 * delivery. We tolerate TCP disconnects by reconnecting (with
25 * exponential backoff) in the case of a fault (disconnection, bad
26 * crc, protocol error). Acks allow sent messages to be discarded by
27 * the sender.
28 */
29
30/* static tag bytes (protocol control messages) */
31static char tag_msg = CEPH_MSGR_TAG_MSG;
32static char tag_ack = CEPH_MSGR_TAG_ACK;
33static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
34
35#ifdef CONFIG_LOCKDEP
36static struct lock_class_key socket_class;
37#endif
38
39
40static void queue_con(struct ceph_connection *con);
41static void con_work(struct work_struct *);
42static void ceph_fault(struct ceph_connection *con);
43
44/*
45 * nicely render a sockaddr as a string.
46 */
47#define MAX_ADDR_STR 20
48#define MAX_ADDR_STR_LEN 60
49static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
50static DEFINE_SPINLOCK(addr_str_lock);
51static int last_addr_str;
52
53const char *ceph_pr_addr(const struct sockaddr_storage *ss)
54{
55 int i;
56 char *s;
57 struct sockaddr_in *in4 = (void *)ss;
58 struct sockaddr_in6 *in6 = (void *)ss;
59
60 spin_lock(&addr_str_lock);
61 i = last_addr_str++;
62 if (last_addr_str == MAX_ADDR_STR)
63 last_addr_str = 0;
64 spin_unlock(&addr_str_lock);
65 s = addr_str[i];
66
67 switch (ss->ss_family) {
68 case AF_INET:
69 snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
70 (unsigned int)ntohs(in4->sin_port));
71 break;
72
73 case AF_INET6:
74 snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
75 (unsigned int)ntohs(in6->sin6_port));
76 break;
77
78 default:
79 sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
80 }
81
82 return s;
83}
84EXPORT_SYMBOL(ceph_pr_addr);
85
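A standalone sketch (illustration only, not part of the patch) of the round-robin slot scheme ceph_pr_addr() uses above: the returned pointer refers to one of MAX_ADDR_STR static buffers and is recycled after that many further calls, so it should be consumed promptly (typically inside a single dout/pr_* call). The names below are hypothetical, and the kernel's spinlock is omitted.

#include <stdio.h>

#define NSLOTS 20	/* mirrors MAX_ADDR_STR */

static char slots[NSLOTS][60];
static int next_slot;

static char *claim_slot(void)
{
	int i = next_slot++;

	if (next_slot == NSLOTS)
		next_slot = 0;
	return slots[i];
}

int main(void)
{
	char *first = claim_slot();
	int n;

	for (n = 1; n < NSLOTS; n++)
		claim_slot();
	/* the very next claim wraps around to the first buffer again */
	printf("%s\n", first == claim_slot() ? "recycled" : "distinct");
	return 0;
}
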
86static void encode_my_addr(struct ceph_messenger *msgr)
87{
88 memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
89 ceph_encode_addr(&msgr->my_enc_addr);
90}
91
92/*
93 * work queue for all reading and writing to/from the socket.
94 */
95struct workqueue_struct *ceph_msgr_wq;
96
97int ceph_msgr_init(void)
98{
99 ceph_msgr_wq = create_workqueue("ceph-msgr");
100	if (!ceph_msgr_wq) {	/* create_workqueue() returns NULL on failure */
101		int ret = -ENOMEM;
102 pr_err("msgr_init failed to create workqueue: %d\n", ret);
103 ceph_msgr_wq = NULL;
104 return ret;
105 }
106 return 0;
107}
108EXPORT_SYMBOL(ceph_msgr_init);
109
110void ceph_msgr_exit(void)
111{
112 destroy_workqueue(ceph_msgr_wq);
113}
114EXPORT_SYMBOL(ceph_msgr_exit);
115
116void ceph_msgr_flush(void)
117{
118 flush_workqueue(ceph_msgr_wq);
119}
120EXPORT_SYMBOL(ceph_msgr_flush);
121
122
123/*
124 * socket callback functions
125 */
126
127/* data available on socket, or listen socket received a connect */
128static void ceph_data_ready(struct sock *sk, int count_unused)
129{
130 struct ceph_connection *con =
131 (struct ceph_connection *)sk->sk_user_data;
132 if (sk->sk_state != TCP_CLOSE_WAIT) {
133 dout("ceph_data_ready on %p state = %lu, queueing work\n",
134 con, con->state);
135 queue_con(con);
136 }
137}
138
139/* socket has buffer space for writing */
140static void ceph_write_space(struct sock *sk)
141{
142 struct ceph_connection *con =
143 (struct ceph_connection *)sk->sk_user_data;
144
145 /* only queue to workqueue if there is data we want to write. */
146 if (test_bit(WRITE_PENDING, &con->state)) {
147 dout("ceph_write_space %p queueing write work\n", con);
148 queue_con(con);
149 } else {
150 dout("ceph_write_space %p nothing to write\n", con);
151 }
152
153 /* since we have our own write_space, clear the SOCK_NOSPACE flag */
154 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
155}
156
157/* socket's state has changed */
158static void ceph_state_change(struct sock *sk)
159{
160 struct ceph_connection *con =
161 (struct ceph_connection *)sk->sk_user_data;
162
163 dout("ceph_state_change %p state = %lu sk_state = %u\n",
164 con, con->state, sk->sk_state);
165
166 if (test_bit(CLOSED, &con->state))
167 return;
168
169 switch (sk->sk_state) {
170 case TCP_CLOSE:
171 dout("ceph_state_change TCP_CLOSE\n");
172 case TCP_CLOSE_WAIT:
173 dout("ceph_state_change TCP_CLOSE_WAIT\n");
174 if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
175 if (test_bit(CONNECTING, &con->state))
176 con->error_msg = "connection failed";
177 else
178 con->error_msg = "socket closed";
179 queue_con(con);
180 }
181 break;
182 case TCP_ESTABLISHED:
183 dout("ceph_state_change TCP_ESTABLISHED\n");
184 queue_con(con);
185 break;
186 }
187}
188
189/*
190 * set up socket callbacks
191 */
192static void set_sock_callbacks(struct socket *sock,
193 struct ceph_connection *con)
194{
195 struct sock *sk = sock->sk;
196 sk->sk_user_data = (void *)con;
197 sk->sk_data_ready = ceph_data_ready;
198 sk->sk_write_space = ceph_write_space;
199 sk->sk_state_change = ceph_state_change;
200}
201
202
203/*
204 * socket helpers
205 */
206
207/*
208 * initiate connection to a remote socket.
209 */
210static struct socket *ceph_tcp_connect(struct ceph_connection *con)
211{
212 struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
213 struct socket *sock;
214 int ret;
215
216 BUG_ON(con->sock);
217 ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
218 IPPROTO_TCP, &sock);
219 if (ret)
220 return ERR_PTR(ret);
221 con->sock = sock;
222 sock->sk->sk_allocation = GFP_NOFS;
223
224#ifdef CONFIG_LOCKDEP
225 lockdep_set_class(&sock->sk->sk_lock, &socket_class);
226#endif
227
228 set_sock_callbacks(sock, con);
229
230 dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr));
231
232 ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
233 O_NONBLOCK);
234 if (ret == -EINPROGRESS) {
235 dout("connect %s EINPROGRESS sk_state = %u\n",
236 ceph_pr_addr(&con->peer_addr.in_addr),
237 sock->sk->sk_state);
238 ret = 0;
239 }
240 if (ret < 0) {
241 pr_err("connect %s error %d\n",
242 ceph_pr_addr(&con->peer_addr.in_addr), ret);
243 sock_release(sock);
244 con->sock = NULL;
245 con->error_msg = "connect error";
246 }
247
248 if (ret < 0)
249 return ERR_PTR(ret);
250 return sock;
251}
252
253static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
254{
255 struct kvec iov = {buf, len};
256 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
257
258 return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
259}
260
261/*
262 * write something. @more is true if caller will be sending more data
263 * shortly.
264 */
265static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
266 size_t kvlen, size_t len, int more)
267{
268 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
269
270 if (more)
271 msg.msg_flags |= MSG_MORE;
272 else
273 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
274
275 return kernel_sendmsg(sock, &msg, iov, kvlen, len);
276}
277
278
279/*
280 * Shutdown/close the socket for the given connection.
281 */
282static int con_close_socket(struct ceph_connection *con)
283{
284 int rc;
285
286 dout("con_close_socket on %p sock %p\n", con, con->sock);
287 if (!con->sock)
288 return 0;
289 set_bit(SOCK_CLOSED, &con->state);
290 rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
291 sock_release(con->sock);
292 con->sock = NULL;
293 clear_bit(SOCK_CLOSED, &con->state);
294 return rc;
295}
296
297/*
298 * Reset a connection. Discard all incoming and outgoing messages
299 * and clear *_seq state.
300 */
301static void ceph_msg_remove(struct ceph_msg *msg)
302{
303 list_del_init(&msg->list_head);
304 ceph_msg_put(msg);
305}
306static void ceph_msg_remove_list(struct list_head *head)
307{
308 while (!list_empty(head)) {
309 struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
310 list_head);
311 ceph_msg_remove(msg);
312 }
313}
314
315static void reset_connection(struct ceph_connection *con)
316{
317 /* reset connection, out_queue, msg_ and connect_seq */
318 /* discard existing out_queue and msg_seq */
319 ceph_msg_remove_list(&con->out_queue);
320 ceph_msg_remove_list(&con->out_sent);
321
322 if (con->in_msg) {
323 ceph_msg_put(con->in_msg);
324 con->in_msg = NULL;
325 }
326
327 con->connect_seq = 0;
328 con->out_seq = 0;
329 if (con->out_msg) {
330 ceph_msg_put(con->out_msg);
331 con->out_msg = NULL;
332 }
333 con->out_keepalive_pending = false;
334 con->in_seq = 0;
335 con->in_seq_acked = 0;
336}
337
338/*
339 * mark a peer down. drop any open connections.
340 */
341void ceph_con_close(struct ceph_connection *con)
342{
343 dout("con_close %p peer %s\n", con,
344 ceph_pr_addr(&con->peer_addr.in_addr));
345 set_bit(CLOSED, &con->state); /* in case there's queued work */
346 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
347 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
348 clear_bit(KEEPALIVE_PENDING, &con->state);
349 clear_bit(WRITE_PENDING, &con->state);
350 mutex_lock(&con->mutex);
351 reset_connection(con);
352 con->peer_global_seq = 0;
353 cancel_delayed_work(&con->work);
354 mutex_unlock(&con->mutex);
355 queue_con(con);
356}
357EXPORT_SYMBOL(ceph_con_close);
358
359/*
360 * Reopen a closed connection, with a new peer address.
361 */
362void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
363{
364 dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
365 set_bit(OPENING, &con->state);
366 clear_bit(CLOSED, &con->state);
367 memcpy(&con->peer_addr, addr, sizeof(*addr));
368 con->delay = 0; /* reset backoff memory */
369 queue_con(con);
370}
371EXPORT_SYMBOL(ceph_con_open);
372
373/*
374 * return true if this connection ever successfully opened
375 */
376bool ceph_con_opened(struct ceph_connection *con)
377{
378 return con->connect_seq > 0;
379}
380
381/*
382 * generic get/put
383 */
384struct ceph_connection *ceph_con_get(struct ceph_connection *con)
385{
386 dout("con_get %p nref = %d -> %d\n", con,
387 atomic_read(&con->nref), atomic_read(&con->nref) + 1);
388 if (atomic_inc_not_zero(&con->nref))
389 return con;
390 return NULL;
391}
392
393void ceph_con_put(struct ceph_connection *con)
394{
395 dout("con_put %p nref = %d -> %d\n", con,
396 atomic_read(&con->nref), atomic_read(&con->nref) - 1);
397 BUG_ON(atomic_read(&con->nref) == 0);
398 if (atomic_dec_and_test(&con->nref)) {
399 BUG_ON(con->sock);
400 kfree(con);
401 }
402}
403
404/*
405 * initialize a new connection.
406 */
407void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
408{
409 dout("con_init %p\n", con);
410 memset(con, 0, sizeof(*con));
411 atomic_set(&con->nref, 1);
412 con->msgr = msgr;
413 mutex_init(&con->mutex);
414 INIT_LIST_HEAD(&con->out_queue);
415 INIT_LIST_HEAD(&con->out_sent);
416 INIT_DELAYED_WORK(&con->work, con_work);
417}
418EXPORT_SYMBOL(ceph_con_init);
419
420
421/*
422 * We maintain a global counter to order connection attempts. Get
423 * a unique seq greater than @gt.
424 */
425static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
426{
427 u32 ret;
428
429 spin_lock(&msgr->global_seq_lock);
430 if (msgr->global_seq < gt)
431 msgr->global_seq = gt;
432 ret = ++msgr->global_seq;
433 spin_unlock(&msgr->global_seq_lock);
434 return ret;
435}
436
437
438/*
439 * Prepare footer for currently outgoing message, and finish things
440 * off. Assumes out_kvec* are already valid.. we just add on to the end.
441 */
442static void prepare_write_message_footer(struct ceph_connection *con, int v)
443{
444 struct ceph_msg *m = con->out_msg;
445
446 dout("prepare_write_message_footer %p\n", con);
447 con->out_kvec_is_msg = true;
448 con->out_kvec[v].iov_base = &m->footer;
449 con->out_kvec[v].iov_len = sizeof(m->footer);
450 con->out_kvec_bytes += sizeof(m->footer);
451 con->out_kvec_left++;
452 con->out_more = m->more_to_follow;
453 con->out_msg_done = true;
454}
455
456/*
457 * Prepare headers for the next outgoing message.
458 */
459static void prepare_write_message(struct ceph_connection *con)
460{
461 struct ceph_msg *m;
462 int v = 0;
463
464 con->out_kvec_bytes = 0;
465 con->out_kvec_is_msg = true;
466 con->out_msg_done = false;
467
468 /* Sneak an ack in there first? If we can get it into the same
469 * TCP packet that's a good thing. */
470 if (con->in_seq > con->in_seq_acked) {
471 con->in_seq_acked = con->in_seq;
472 con->out_kvec[v].iov_base = &tag_ack;
473 con->out_kvec[v++].iov_len = 1;
474 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
475 con->out_kvec[v].iov_base = &con->out_temp_ack;
476 con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
477 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
478 }
479
480 m = list_first_entry(&con->out_queue,
481 struct ceph_msg, list_head);
482 con->out_msg = m;
483 if (test_bit(LOSSYTX, &con->state)) {
484 list_del_init(&m->list_head);
485 } else {
486 /* put message on sent list */
487 ceph_msg_get(m);
488 list_move_tail(&m->list_head, &con->out_sent);
489 }
490
491 /*
492 * only assign outgoing seq # if we haven't sent this message
493	 * yet. if it is requeued, resend with its original seq.
494 */
495 if (m->needs_out_seq) {
496 m->hdr.seq = cpu_to_le64(++con->out_seq);
497 m->needs_out_seq = false;
498 }
499
500 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
501 m, con->out_seq, le16_to_cpu(m->hdr.type),
502 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
503 le32_to_cpu(m->hdr.data_len),
504 m->nr_pages);
505 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
506
507 /* tag + hdr + front + middle */
508 con->out_kvec[v].iov_base = &tag_msg;
509 con->out_kvec[v++].iov_len = 1;
510 con->out_kvec[v].iov_base = &m->hdr;
511 con->out_kvec[v++].iov_len = sizeof(m->hdr);
512 con->out_kvec[v++] = m->front;
513 if (m->middle)
514 con->out_kvec[v++] = m->middle->vec;
515 con->out_kvec_left = v;
516 con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
517 (m->middle ? m->middle->vec.iov_len : 0);
518 con->out_kvec_cur = con->out_kvec;
519
520 /* fill in crc (except data pages), footer */
521 con->out_msg->hdr.crc =
522 cpu_to_le32(crc32c(0, (void *)&m->hdr,
523 sizeof(m->hdr) - sizeof(m->hdr.crc)));
524 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
525 con->out_msg->footer.front_crc =
526 cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
527 if (m->middle)
528 con->out_msg->footer.middle_crc =
529 cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
530 m->middle->vec.iov_len));
531 else
532 con->out_msg->footer.middle_crc = 0;
533 con->out_msg->footer.data_crc = 0;
534 dout("prepare_write_message front_crc %u data_crc %u\n",
535 le32_to_cpu(con->out_msg->footer.front_crc),
536 le32_to_cpu(con->out_msg->footer.middle_crc));
537
538 /* is there a data payload? */
539 if (le32_to_cpu(m->hdr.data_len) > 0) {
540 /* initialize page iterator */
541 con->out_msg_pos.page = 0;
542 if (m->pages)
543 con->out_msg_pos.page_pos =
544 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
545 else
546 con->out_msg_pos.page_pos = 0;
547 con->out_msg_pos.data_pos = 0;
548 con->out_msg_pos.did_page_crc = 0;
549 con->out_more = 1; /* data + footer will follow */
550 } else {
551 /* no, queue up footer too and be done */
552 prepare_write_message_footer(con, v);
553 }
554
555 set_bit(WRITE_PENDING, &con->state);
556}
557
558/*
559 * Prepare an ack.
560 */
561static void prepare_write_ack(struct ceph_connection *con)
562{
563 dout("prepare_write_ack %p %llu -> %llu\n", con,
564 con->in_seq_acked, con->in_seq);
565 con->in_seq_acked = con->in_seq;
566
567 con->out_kvec[0].iov_base = &tag_ack;
568 con->out_kvec[0].iov_len = 1;
569 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
570 con->out_kvec[1].iov_base = &con->out_temp_ack;
571 con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
572 con->out_kvec_left = 2;
573 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
574 con->out_kvec_cur = con->out_kvec;
575 con->out_more = 1; /* more will follow.. eventually.. */
576 set_bit(WRITE_PENDING, &con->state);
577}
578
579/*
580 * Prepare to write keepalive byte.
581 */
582static void prepare_write_keepalive(struct ceph_connection *con)
583{
584 dout("prepare_write_keepalive %p\n", con);
585 con->out_kvec[0].iov_base = &tag_keepalive;
586 con->out_kvec[0].iov_len = 1;
587 con->out_kvec_left = 1;
588 con->out_kvec_bytes = 1;
589 con->out_kvec_cur = con->out_kvec;
590 set_bit(WRITE_PENDING, &con->state);
591}
592
593/*
594 * Connection negotiation.
595 */
596
597static void prepare_connect_authorizer(struct ceph_connection *con)
598{
599 void *auth_buf;
600 int auth_len = 0;
601 int auth_protocol = 0;
602
603 mutex_unlock(&con->mutex);
604 if (con->ops->get_authorizer)
605 con->ops->get_authorizer(con, &auth_buf, &auth_len,
606 &auth_protocol, &con->auth_reply_buf,
607 &con->auth_reply_buf_len,
608 con->auth_retry);
609 mutex_lock(&con->mutex);
610
611 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
612 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
613
614 con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
615 con->out_kvec[con->out_kvec_left].iov_len = auth_len;
616 con->out_kvec_left++;
617 con->out_kvec_bytes += auth_len;
618}
619
620/*
621 * We connected to a peer and are saying hello.
622 */
623static void prepare_write_banner(struct ceph_messenger *msgr,
624 struct ceph_connection *con)
625{
626 int len = strlen(CEPH_BANNER);
627
628 con->out_kvec[0].iov_base = CEPH_BANNER;
629 con->out_kvec[0].iov_len = len;
630 con->out_kvec[1].iov_base = &msgr->my_enc_addr;
631 con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
632 con->out_kvec_left = 2;
633 con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
634 con->out_kvec_cur = con->out_kvec;
635 con->out_more = 0;
636 set_bit(WRITE_PENDING, &con->state);
637}
638
639static void prepare_write_connect(struct ceph_messenger *msgr,
640 struct ceph_connection *con,
641 int after_banner)
642{
643 unsigned global_seq = get_global_seq(con->msgr, 0);
644 int proto;
645
646 switch (con->peer_name.type) {
647 case CEPH_ENTITY_TYPE_MON:
648 proto = CEPH_MONC_PROTOCOL;
649 break;
650 case CEPH_ENTITY_TYPE_OSD:
651 proto = CEPH_OSDC_PROTOCOL;
652 break;
653 case CEPH_ENTITY_TYPE_MDS:
654 proto = CEPH_MDSC_PROTOCOL;
655 break;
656 default:
657 BUG();
658 }
659
660 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
661 con->connect_seq, global_seq, proto);
662
663 con->out_connect.features = cpu_to_le64(msgr->supported_features);
664 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
665 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
666 con->out_connect.global_seq = cpu_to_le32(global_seq);
667 con->out_connect.protocol_version = cpu_to_le32(proto);
668 con->out_connect.flags = 0;
669
670 if (!after_banner) {
671 con->out_kvec_left = 0;
672 con->out_kvec_bytes = 0;
673 }
674 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
675 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
676 con->out_kvec_left++;
677 con->out_kvec_bytes += sizeof(con->out_connect);
678 con->out_kvec_cur = con->out_kvec;
679 con->out_more = 0;
680 set_bit(WRITE_PENDING, &con->state);
681
682 prepare_connect_authorizer(con);
683}
684
685
686/*
687 * write as much of pending kvecs to the socket as we can.
688 * 1 -> done
689 * 0 -> socket full, but more to do
690 * <0 -> error
691 */
692static int write_partial_kvec(struct ceph_connection *con)
693{
694 int ret;
695
696 dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
697 while (con->out_kvec_bytes > 0) {
698 ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
699 con->out_kvec_left, con->out_kvec_bytes,
700 con->out_more);
701 if (ret <= 0)
702 goto out;
703 con->out_kvec_bytes -= ret;
704 if (con->out_kvec_bytes == 0)
705 break; /* done */
706 while (ret > 0) {
707 if (ret >= con->out_kvec_cur->iov_len) {
708 ret -= con->out_kvec_cur->iov_len;
709 con->out_kvec_cur++;
710 con->out_kvec_left--;
711 } else {
712 con->out_kvec_cur->iov_len -= ret;
713 con->out_kvec_cur->iov_base += ret;
714 ret = 0;
715 break;
716 }
717 }
718 }
719 con->out_kvec_left = 0;
720 con->out_kvec_is_msg = false;
721 ret = 1;
722out:
723 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
724 con->out_kvec_bytes, con->out_kvec_left, ret);
725 return ret; /* done! */
726}
727
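A standalone sketch (illustration only, not part of the patch) of the cursor arithmetic in the inner while (ret > 0) loop above: after a short send, whole leading kvecs are consumed first, then the current kvec's base/len are adjusted by the remainder. The types and names below are hypothetical stand-ins for struct kvec and the out_kvec_* fields.

#include <stdio.h>

struct kv { const char *base; size_t len; };

static void advance(struct kv **cur, int *left, size_t sent)
{
	while (sent > 0) {
		if (sent >= (*cur)->len) {	/* whole kvec consumed */
			sent -= (*cur)->len;
			(*cur)++;
			(*left)--;
		} else {			/* partial: shrink in place */
			(*cur)->len -= sent;
			(*cur)->base += sent;
			sent = 0;
		}
	}
}

int main(void)
{
	struct kv vec[3] = { { "one", 3 }, { "two", 3 }, { "three", 5 } };
	struct kv *cur = vec;
	int left = 3;

	advance(&cur, &left, 4);	/* short send: "one" plus 1 byte of "two" */
	printf("left=%d next=\"%.*s\"\n", left, (int)cur->len, cur->base);
	return 0;
}
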
728#ifdef CONFIG_BLOCK
729static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
730{
731 if (!bio) {
732 *iter = NULL;
733 *seg = 0;
734 return;
735 }
736 *iter = bio;
737 *seg = bio->bi_idx;
738}
739
740static void iter_bio_next(struct bio **bio_iter, int *seg)
741{
742 if (*bio_iter == NULL)
743 return;
744
745 BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
746
747 (*seg)++;
748 if (*seg == (*bio_iter)->bi_vcnt)
749 init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
750}
751#endif
752
753/*
754 * Write as much message data payload as we can. If we finish, queue
755 * up the footer.
756 * 1 -> done, footer is now queued in out_kvec[].
757 * 0 -> socket full, but more to do
758 * <0 -> error
759 */
760static int write_partial_msg_pages(struct ceph_connection *con)
761{
762 struct ceph_msg *msg = con->out_msg;
763 unsigned data_len = le32_to_cpu(msg->hdr.data_len);
764 size_t len;
765 int crc = con->msgr->nocrc;
766 int ret;
767 int total_max_write;
768 int in_trail = 0;
769 size_t trail_len = (msg->trail ? msg->trail->length : 0);
770
771 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
772 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
773 con->out_msg_pos.page_pos);
774
775#ifdef CONFIG_BLOCK
776 if (msg->bio && !msg->bio_iter)
777 init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
778#endif
779
780 while (data_len > con->out_msg_pos.data_pos) {
781 struct page *page = NULL;
782 void *kaddr = NULL;
783 int max_write = PAGE_SIZE;
784 int page_shift = 0;
785
786 total_max_write = data_len - trail_len -
787 con->out_msg_pos.data_pos;
788
789 /*
790 * if we are calculating the data crc (the default), we need
791 * to map the page. if our pages[] has been revoked, use the
792 * zero page.
793 */
794
795 /* have we reached the trail part of the data? */
796 if (con->out_msg_pos.data_pos >= data_len - trail_len) {
797 in_trail = 1;
798
799 total_max_write = data_len - con->out_msg_pos.data_pos;
800
801 page = list_first_entry(&msg->trail->head,
802 struct page, lru);
803 if (crc)
804 kaddr = kmap(page);
805 max_write = PAGE_SIZE;
806 } else if (msg->pages) {
807 page = msg->pages[con->out_msg_pos.page];
808 if (crc)
809 kaddr = kmap(page);
810 } else if (msg->pagelist) {
811 page = list_first_entry(&msg->pagelist->head,
812 struct page, lru);
813 if (crc)
814 kaddr = kmap(page);
815#ifdef CONFIG_BLOCK
816 } else if (msg->bio) {
817 struct bio_vec *bv;
818
819 bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg);
820 page = bv->bv_page;
821 page_shift = bv->bv_offset;
822 if (crc)
823 kaddr = kmap(page) + page_shift;
824 max_write = bv->bv_len;
825#endif
826 } else {
827 page = con->msgr->zero_page;
828 if (crc)
829 kaddr = page_address(con->msgr->zero_page);
830 }
831 len = min_t(int, max_write - con->out_msg_pos.page_pos,
832 total_max_write);
833
834 if (crc && !con->out_msg_pos.did_page_crc) {
835 void *base = kaddr + con->out_msg_pos.page_pos;
836 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
837
838 BUG_ON(kaddr == NULL);
839 con->out_msg->footer.data_crc =
840 cpu_to_le32(crc32c(tmpcrc, base, len));
841 con->out_msg_pos.did_page_crc = 1;
842 }
843 ret = kernel_sendpage(con->sock, page,
844 con->out_msg_pos.page_pos + page_shift,
845 len,
846 MSG_DONTWAIT | MSG_NOSIGNAL |
847 MSG_MORE);
848
849 if (crc &&
850 (msg->pages || msg->pagelist || msg->bio || in_trail))
851 kunmap(page);
852
853 if (ret <= 0)
854 goto out;
855
856 con->out_msg_pos.data_pos += ret;
857 con->out_msg_pos.page_pos += ret;
858 if (ret == len) {
859 con->out_msg_pos.page_pos = 0;
860 con->out_msg_pos.page++;
861 con->out_msg_pos.did_page_crc = 0;
862 if (in_trail)
863 list_move_tail(&page->lru,
864 &msg->trail->head);
865 else if (msg->pagelist)
866 list_move_tail(&page->lru,
867 &msg->pagelist->head);
868#ifdef CONFIG_BLOCK
869 else if (msg->bio)
870 iter_bio_next(&msg->bio_iter, &msg->bio_seg);
871#endif
872 }
873 }
874
875 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
876
877 /* prepare and queue up footer, too */
878 if (!crc)
879 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
880 con->out_kvec_bytes = 0;
881 con->out_kvec_left = 0;
882 con->out_kvec_cur = con->out_kvec;
883 prepare_write_message_footer(con, 0);
884 ret = 1;
885out:
886 return ret;
887}
888
889/*
890 * write some zeros
891 */
892static int write_partial_skip(struct ceph_connection *con)
893{
894 int ret;
895
896 while (con->out_skip > 0) {
897 struct kvec iov = {
898 .iov_base = page_address(con->msgr->zero_page),
899 .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
900 };
901
902 ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
903 if (ret <= 0)
904 goto out;
905 con->out_skip -= ret;
906 }
907 ret = 1;
908out:
909 return ret;
910}
911
912/*
913 * Prepare to read connection handshake, or an ack.
914 */
915static void prepare_read_banner(struct ceph_connection *con)
916{
917 dout("prepare_read_banner %p\n", con);
918 con->in_base_pos = 0;
919}
920
921static void prepare_read_connect(struct ceph_connection *con)
922{
923 dout("prepare_read_connect %p\n", con);
924 con->in_base_pos = 0;
925}
926
927static void prepare_read_ack(struct ceph_connection *con)
928{
929 dout("prepare_read_ack %p\n", con);
930 con->in_base_pos = 0;
931}
932
933static void prepare_read_tag(struct ceph_connection *con)
934{
935 dout("prepare_read_tag %p\n", con);
936 con->in_base_pos = 0;
937 con->in_tag = CEPH_MSGR_TAG_READY;
938}
939
940/*
941 * Prepare to read a message.
942 */
943static int prepare_read_message(struct ceph_connection *con)
944{
945 dout("prepare_read_message %p\n", con);
946 BUG_ON(con->in_msg != NULL);
947 con->in_base_pos = 0;
948 con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
949 return 0;
950}
951
952
953static int read_partial(struct ceph_connection *con,
954 int *to, int size, void *object)
955{
956 *to += size;
957 while (con->in_base_pos < *to) {
958 int left = *to - con->in_base_pos;
959 int have = size - left;
960 int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
961 if (ret <= 0)
962 return ret;
963 con->in_base_pos += ret;
964 }
965 return 1;
966}
967
968
969/*
970 * Read all or part of the connect-side handshake on a new connection
971 */
972static int read_partial_banner(struct ceph_connection *con)
973{
974 int ret, to = 0;
975
976 dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
977
978 /* peer's banner */
979 ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
980 if (ret <= 0)
981 goto out;
982 ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
983 &con->actual_peer_addr);
984 if (ret <= 0)
985 goto out;
986 ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
987 &con->peer_addr_for_me);
988 if (ret <= 0)
989 goto out;
990out:
991 return ret;
992}
993
994static int read_partial_connect(struct ceph_connection *con)
995{
996 int ret, to = 0;
997
998 dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
999
1000 ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
1001 if (ret <= 0)
1002 goto out;
1003 ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
1004 con->auth_reply_buf);
1005 if (ret <= 0)
1006 goto out;
1007
1008 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
1009 con, (int)con->in_reply.tag,
1010 le32_to_cpu(con->in_reply.connect_seq),
1011 le32_to_cpu(con->in_reply.global_seq));
1012out:
1013 return ret;
1014
1015}
1016
1017/*
1018 * Verify the hello banner looks okay.
1019 */
1020static int verify_hello(struct ceph_connection *con)
1021{
1022 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
1023 pr_err("connect to %s got bad banner\n",
1024 ceph_pr_addr(&con->peer_addr.in_addr));
1025 con->error_msg = "protocol error, bad banner";
1026 return -1;
1027 }
1028 return 0;
1029}
1030
1031static bool addr_is_blank(struct sockaddr_storage *ss)
1032{
1033 switch (ss->ss_family) {
1034 case AF_INET:
1035 return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
1036 case AF_INET6:
1037 return
1038 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
1039 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
1040 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
1041 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
1042 }
1043 return false;
1044}
1045
1046static int addr_port(struct sockaddr_storage *ss)
1047{
1048 switch (ss->ss_family) {
1049 case AF_INET:
1050 return ntohs(((struct sockaddr_in *)ss)->sin_port);
1051 case AF_INET6:
1052 return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
1053 }
1054 return 0;
1055}
1056
1057static void addr_set_port(struct sockaddr_storage *ss, int p)
1058{
1059 switch (ss->ss_family) {
1060 case AF_INET:
1061 ((struct sockaddr_in *)ss)->sin_port = htons(p);
1062 case AF_INET6:
1063 ((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
1064 }
1065}
1066
1067/*
1068 * Parse an ip[:port] list into an addr array. Use the default
1069 * monitor port if a port isn't specified.
1070 */
1071int ceph_parse_ips(const char *c, const char *end,
1072 struct ceph_entity_addr *addr,
1073 int max_count, int *count)
1074{
1075 int i;
1076 const char *p = c;
1077
1078 dout("parse_ips on '%.*s'\n", (int)(end-c), c);
1079 for (i = 0; i < max_count; i++) {
1080 const char *ipend;
1081 struct sockaddr_storage *ss = &addr[i].in_addr;
1082 struct sockaddr_in *in4 = (void *)ss;
1083 struct sockaddr_in6 *in6 = (void *)ss;
1084 int port;
1085 char delim = ',';
1086
1087 if (*p == '[') {
1088 delim = ']';
1089 p++;
1090 }
1091
1092 memset(ss, 0, sizeof(*ss));
1093 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1094 delim, &ipend))
1095 ss->ss_family = AF_INET;
1096 else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1097 delim, &ipend))
1098 ss->ss_family = AF_INET6;
1099 else
1100 goto bad;
1101 p = ipend;
1102
1103 if (delim == ']') {
1104 if (*p != ']') {
1105 dout("missing matching ']'\n");
1106 goto bad;
1107 }
1108 p++;
1109 }
1110
1111 /* port? */
1112 if (p < end && *p == ':') {
1113 port = 0;
1114 p++;
1115 while (p < end && *p >= '0' && *p <= '9') {
1116 port = (port * 10) + (*p - '0');
1117 p++;
1118 }
1119 if (port > 65535 || port == 0)
1120 goto bad;
1121 } else {
1122 port = CEPH_MON_PORT;
1123 }
1124
1125 addr_set_port(ss, port);
1126
1127 dout("parse_ips got %s\n", ceph_pr_addr(ss));
1128
1129 if (p == end)
1130 break;
1131 if (*p != ',')
1132 goto bad;
1133 p++;
1134 }
1135
1136 if (p != end)
1137 goto bad;
1138
1139 if (count)
1140 *count = i + 1;
1141 return 0;
1142
1143bad:
1144 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
1145 return -EINVAL;
1146}
1147EXPORT_SYMBOL(ceph_parse_ips);
1148
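A kernel-context sketch (illustration only, not part of the patch) of the address-list format ceph_parse_ips() accepts above: comma-separated ip[:port] entries, IPv6 addresses in square brackets, and CEPH_MON_PORT filled in when no port is given. The function name and the literal list are hypothetical.

static int parse_mon_addrs_sketch(void)
{
	static const char spec[] = "192.168.0.1:6789,[::1],10.0.0.2";
	struct ceph_entity_addr addrs[3];
	int count = 0;
	int ret;

	ret = ceph_parse_ips(spec, spec + sizeof(spec) - 1, addrs,
			     ARRAY_SIZE(addrs), &count);
	if (ret)
		return ret;	/* -EINVAL on a malformed entry */

	/* count == 3; the last two entries default to CEPH_MON_PORT */
	return 0;
}
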
1149static int process_banner(struct ceph_connection *con)
1150{
1151 dout("process_banner on %p\n", con);
1152
1153 if (verify_hello(con) < 0)
1154 return -1;
1155
1156 ceph_decode_addr(&con->actual_peer_addr);
1157 ceph_decode_addr(&con->peer_addr_for_me);
1158
1159 /*
1160 * Make sure the other end is who we wanted. note that the other
1161 * end may not yet know their ip address, so if it's 0.0.0.0, give
1162 * them the benefit of the doubt.
1163 */
1164 if (memcmp(&con->peer_addr, &con->actual_peer_addr,
1165 sizeof(con->peer_addr)) != 0 &&
1166 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1167 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1168 pr_warning("wrong peer, want %s/%d, got %s/%d\n",
1169 ceph_pr_addr(&con->peer_addr.in_addr),
1170 (int)le32_to_cpu(con->peer_addr.nonce),
1171 ceph_pr_addr(&con->actual_peer_addr.in_addr),
1172 (int)le32_to_cpu(con->actual_peer_addr.nonce));
1173 con->error_msg = "wrong peer at address";
1174 return -1;
1175 }
1176
1177 /*
1178 * did we learn our address?
1179 */
1180 if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
1181 int port = addr_port(&con->msgr->inst.addr.in_addr);
1182
1183 memcpy(&con->msgr->inst.addr.in_addr,
1184 &con->peer_addr_for_me.in_addr,
1185 sizeof(con->peer_addr_for_me.in_addr));
1186 addr_set_port(&con->msgr->inst.addr.in_addr, port);
1187 encode_my_addr(con->msgr);
1188 dout("process_banner learned my addr is %s\n",
1189 ceph_pr_addr(&con->msgr->inst.addr.in_addr));
1190 }
1191
1192 set_bit(NEGOTIATING, &con->state);
1193 prepare_read_connect(con);
1194 return 0;
1195}
1196
1197static void fail_protocol(struct ceph_connection *con)
1198{
1199 reset_connection(con);
1200 set_bit(CLOSED, &con->state); /* in case there's queued work */
1201
1202 mutex_unlock(&con->mutex);
1203 if (con->ops->bad_proto)
1204 con->ops->bad_proto(con);
1205 mutex_lock(&con->mutex);
1206}
1207
1208static int process_connect(struct ceph_connection *con)
1209{
1210 u64 sup_feat = con->msgr->supported_features;
1211 u64 req_feat = con->msgr->required_features;
1212 u64 server_feat = le64_to_cpu(con->in_reply.features);
1213
1214 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
1215
1216 switch (con->in_reply.tag) {
1217 case CEPH_MSGR_TAG_FEATURES:
1218 pr_err("%s%lld %s feature set mismatch,"
1219 " my %llx < server's %llx, missing %llx\n",
1220 ENTITY_NAME(con->peer_name),
1221 ceph_pr_addr(&con->peer_addr.in_addr),
1222 sup_feat, server_feat, server_feat & ~sup_feat);
1223 con->error_msg = "missing required protocol features";
1224 fail_protocol(con);
1225 return -1;
1226
1227 case CEPH_MSGR_TAG_BADPROTOVER:
1228 pr_err("%s%lld %s protocol version mismatch,"
1229 " my %d != server's %d\n",
1230 ENTITY_NAME(con->peer_name),
1231 ceph_pr_addr(&con->peer_addr.in_addr),
1232 le32_to_cpu(con->out_connect.protocol_version),
1233 le32_to_cpu(con->in_reply.protocol_version));
1234 con->error_msg = "protocol version mismatch";
1235 fail_protocol(con);
1236 return -1;
1237
1238 case CEPH_MSGR_TAG_BADAUTHORIZER:
1239 con->auth_retry++;
1240 dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
1241 con->auth_retry);
1242 if (con->auth_retry == 2) {
1243 con->error_msg = "connect authorization failure";
1244 reset_connection(con);
1245 set_bit(CLOSED, &con->state);
1246 return -1;
1247 }
1248 con->auth_retry = 1;
1249 prepare_write_connect(con->msgr, con, 0);
1250 prepare_read_connect(con);
1251 break;
1252
1253 case CEPH_MSGR_TAG_RESETSESSION:
1254 /*
1255 * If we connected with a large connect_seq but the peer
1256 * has no record of a session with us (no connection, or
1257	 * connect_seq == 0), they will send RESETSESSION to indicate
1258 * that they must have reset their session, and may have
1259 * dropped messages.
1260 */
1261 dout("process_connect got RESET peer seq %u\n",
1262 le32_to_cpu(con->in_connect.connect_seq));
1263 pr_err("%s%lld %s connection reset\n",
1264 ENTITY_NAME(con->peer_name),
1265 ceph_pr_addr(&con->peer_addr.in_addr));
1266 reset_connection(con);
1267 prepare_write_connect(con->msgr, con, 0);
1268 prepare_read_connect(con);
1269
1270 /* Tell ceph about it. */
1271 mutex_unlock(&con->mutex);
1272 pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
1273 if (con->ops->peer_reset)
1274 con->ops->peer_reset(con);
1275 mutex_lock(&con->mutex);
1276 break;
1277
1278 case CEPH_MSGR_TAG_RETRY_SESSION:
1279 /*
1280 * If we sent a smaller connect_seq than the peer has, try
1281 * again with a larger value.
1282 */
1283 dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
1284 le32_to_cpu(con->out_connect.connect_seq),
1285 le32_to_cpu(con->in_connect.connect_seq));
1286 con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
1287 prepare_write_connect(con->msgr, con, 0);
1288 prepare_read_connect(con);
1289 break;
1290
1291 case CEPH_MSGR_TAG_RETRY_GLOBAL:
1292 /*
1293 * If we sent a smaller global_seq than the peer has, try
1294 * again with a larger value.
1295 */
1296 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
1297 con->peer_global_seq,
1298 le32_to_cpu(con->in_connect.global_seq));
1299 get_global_seq(con->msgr,
1300 le32_to_cpu(con->in_connect.global_seq));
1301 prepare_write_connect(con->msgr, con, 0);
1302 prepare_read_connect(con);
1303 break;
1304
1305 case CEPH_MSGR_TAG_READY:
1306 if (req_feat & ~server_feat) {
1307 pr_err("%s%lld %s protocol feature mismatch,"
1308 " my required %llx > server's %llx, need %llx\n",
1309 ENTITY_NAME(con->peer_name),
1310 ceph_pr_addr(&con->peer_addr.in_addr),
1311 req_feat, server_feat, req_feat & ~server_feat);
1312 con->error_msg = "missing required protocol features";
1313 fail_protocol(con);
1314 return -1;
1315 }
1316 clear_bit(CONNECTING, &con->state);
1317 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1318 con->connect_seq++;
1319 con->peer_features = server_feat;
1320 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1321 con->peer_global_seq,
1322 le32_to_cpu(con->in_reply.connect_seq),
1323 con->connect_seq);
1324 WARN_ON(con->connect_seq !=
1325 le32_to_cpu(con->in_reply.connect_seq));
1326
1327 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1328 set_bit(LOSSYTX, &con->state);
1329
1330 prepare_read_tag(con);
1331 break;
1332
1333 case CEPH_MSGR_TAG_WAIT:
1334 /*
1335 * If there is a connection race (we are opening
1336 * connections to each other), one of us may just have
1337 * to WAIT. This shouldn't happen if we are the
1338 * client.
1339 */
1340 pr_err("process_connect peer connecting WAIT\n");
1341
1342 default:
1343 pr_err("connect protocol error, will retry\n");
1344 con->error_msg = "protocol error, garbage tag during connect";
1345 return -1;
1346 }
1347 return 0;
1348}
1349
1350
1351/*
1352 * read (part of) an ack
1353 */
1354static int read_partial_ack(struct ceph_connection *con)
1355{
1356 int to = 0;
1357
1358 return read_partial(con, &to, sizeof(con->in_temp_ack),
1359 &con->in_temp_ack);
1360}
1361
1362
1363/*
1364 * We can finally discard anything that's been acked.
1365 */
1366static void process_ack(struct ceph_connection *con)
1367{
1368 struct ceph_msg *m;
1369 u64 ack = le64_to_cpu(con->in_temp_ack);
1370 u64 seq;
1371
1372 while (!list_empty(&con->out_sent)) {
1373 m = list_first_entry(&con->out_sent, struct ceph_msg,
1374 list_head);
1375 seq = le64_to_cpu(m->hdr.seq);
1376 if (seq > ack)
1377 break;
1378 dout("got ack for seq %llu type %d at %p\n", seq,
1379 le16_to_cpu(m->hdr.type), m);
1380 ceph_msg_remove(m);
1381 }
1382 prepare_read_tag(con);
1383}
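
/*
 * Illustrative sketch (not part of messenger.c): the ack handling in
 * process_ack() above, reduced to a self-contained user-space model that
 * drops everything with a sequence number at or below the ack.  All names
 * below are hypothetical.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long sent[] = { 10, 11, 12, 13, 14 };	/* seqs awaiting ack */
	unsigned long long ack = 12;	/* peer has received everything up to 12 */
	unsigned int i, kept = 0;

	for (i = 0; i < sizeof(sent) / sizeof(sent[0]); i++) {
		if (sent[i] <= ack)
			continue;	/* acked: safe to drop, like ceph_msg_remove() */
		sent[kept++] = sent[i];
	}
	printf("still unacked: %u messages (first seq %llu)\n",
	       kept, kept ? sent[0] : 0ULL);
	return 0;
}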
1384
1385
1386
1387
1388static int read_partial_message_section(struct ceph_connection *con,
1389 struct kvec *section,
1390 unsigned int sec_len, u32 *crc)
1391{
1392 int ret, left;
1393
1394 BUG_ON(!section);
1395
1396 while (section->iov_len < sec_len) {
1397 BUG_ON(section->iov_base == NULL);
1398 left = sec_len - section->iov_len;
1399 ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
1400 section->iov_len, left);
1401 if (ret <= 0)
1402 return ret;
1403 section->iov_len += ret;
1404 if (section->iov_len == sec_len)
1405 *crc = crc32c(0, section->iov_base,
1406 section->iov_len);
1407 }
1408
1409 return 1;
1410}
1411
1412static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1413 struct ceph_msg_header *hdr,
1414 int *skip);
1415
1416
1417static int read_partial_message_pages(struct ceph_connection *con,
1418 struct page **pages,
1419 unsigned data_len, int datacrc)
1420{
1421 void *p;
1422 int ret;
1423 int left;
1424
1425 left = min((int)(data_len - con->in_msg_pos.data_pos),
1426 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1427 /* (page) data */
1428 BUG_ON(pages == NULL);
1429 p = kmap(pages[con->in_msg_pos.page]);
1430 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1431 left);
1432 if (ret > 0 && datacrc)
1433 con->in_data_crc =
1434 crc32c(con->in_data_crc,
1435 p + con->in_msg_pos.page_pos, ret);
1436 kunmap(pages[con->in_msg_pos.page]);
1437 if (ret <= 0)
1438 return ret;
1439 con->in_msg_pos.data_pos += ret;
1440 con->in_msg_pos.page_pos += ret;
1441 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1442 con->in_msg_pos.page_pos = 0;
1443 con->in_msg_pos.page++;
1444 }
1445
1446 return ret;
1447}
1448
1449#ifdef CONFIG_BLOCK
1450static int read_partial_message_bio(struct ceph_connection *con,
1451 struct bio **bio_iter, int *bio_seg,
1452 unsigned data_len, int datacrc)
1453{
1454 struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg);
1455 void *p;
1456 int ret, left;
1457
1458 if (IS_ERR(bv))
1459 return PTR_ERR(bv);
1460
1461 left = min((int)(data_len - con->in_msg_pos.data_pos),
1462 (int)(bv->bv_len - con->in_msg_pos.page_pos));
1463
1464 p = kmap(bv->bv_page) + bv->bv_offset;
1465
1466 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1467 left);
1468 if (ret > 0 && datacrc)
1469 con->in_data_crc =
1470 crc32c(con->in_data_crc,
1471 p + con->in_msg_pos.page_pos, ret);
1472 kunmap(bv->bv_page);
1473 if (ret <= 0)
1474 return ret;
1475 con->in_msg_pos.data_pos += ret;
1476 con->in_msg_pos.page_pos += ret;
1477 if (con->in_msg_pos.page_pos == bv->bv_len) {
1478 con->in_msg_pos.page_pos = 0;
1479 iter_bio_next(bio_iter, bio_seg);
1480 }
1481
1482 return ret;
1483}
1484#endif
1485
1486/*
1487 * read (part of) a message.
1488 */
1489static int read_partial_message(struct ceph_connection *con)
1490{
1491 struct ceph_msg *m = con->in_msg;
1492 int ret;
1493 int to, left;
1494 unsigned front_len, middle_len, data_len, data_off;
1495 int datacrc = con->msgr->nocrc;
1496 int skip;
1497 u64 seq;
1498
1499 dout("read_partial_message con %p msg %p\n", con, m);
1500
1501 /* header */
1502 while (con->in_base_pos < sizeof(con->in_hdr)) {
1503 left = sizeof(con->in_hdr) - con->in_base_pos;
1504 ret = ceph_tcp_recvmsg(con->sock,
1505 (char *)&con->in_hdr + con->in_base_pos,
1506 left);
1507 if (ret <= 0)
1508 return ret;
1509 con->in_base_pos += ret;
1510 if (con->in_base_pos == sizeof(con->in_hdr)) {
1511 u32 crc = crc32c(0, (void *)&con->in_hdr,
1512 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1513 if (crc != le32_to_cpu(con->in_hdr.crc)) {
1514 pr_err("read_partial_message bad hdr "
1515 " crc %u != expected %u\n",
1516 crc, con->in_hdr.crc);
1517 return -EBADMSG;
1518 }
1519 }
1520 }
1521 front_len = le32_to_cpu(con->in_hdr.front_len);
1522 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1523 return -EIO;
1524 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1525 if (middle_len > CEPH_MSG_MAX_DATA_LEN)
1526 return -EIO;
1527 data_len = le32_to_cpu(con->in_hdr.data_len);
1528 if (data_len > CEPH_MSG_MAX_DATA_LEN)
1529 return -EIO;
1530 data_off = le16_to_cpu(con->in_hdr.data_off);
1531
1532 /* verify seq# */
1533 seq = le64_to_cpu(con->in_hdr.seq);
1534 if ((s64)seq - (s64)con->in_seq < 1) {
1535 pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
1536 ENTITY_NAME(con->peer_name),
1537 ceph_pr_addr(&con->peer_addr.in_addr),
1538 seq, con->in_seq + 1);
1539 con->in_base_pos = -front_len - middle_len - data_len -
1540 sizeof(m->footer);
1541 con->in_tag = CEPH_MSGR_TAG_READY;
1542 con->in_seq++;
1543 return 0;
1544 } else if ((s64)seq - (s64)con->in_seq > 1) {
1545 pr_err("read_partial_message bad seq %lld expected %lld\n",
1546 seq, con->in_seq + 1);
1547 con->error_msg = "bad message sequence # for incoming message";
1548 return -EBADMSG;
1549 }
1550
1551 /* allocate message? */
1552 if (!con->in_msg) {
1553 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1554 con->in_hdr.front_len, con->in_hdr.data_len);
1555 skip = 0;
1556 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1557 if (skip) {
1558 /* skip this message */
1559 dout("alloc_msg said skip message\n");
1560 BUG_ON(con->in_msg);
1561 con->in_base_pos = -front_len - middle_len - data_len -
1562 sizeof(m->footer);
1563 con->in_tag = CEPH_MSGR_TAG_READY;
1564 con->in_seq++;
1565 return 0;
1566 }
1567 if (!con->in_msg) {
1568 con->error_msg =
1569 "error allocating memory for incoming message";
1570 return -ENOMEM;
1571 }
1572 m = con->in_msg;
1573 m->front.iov_len = 0; /* haven't read it yet */
1574 if (m->middle)
1575 m->middle->vec.iov_len = 0;
1576
1577 con->in_msg_pos.page = 0;
1578 if (m->pages)
1579 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1580 else
1581 con->in_msg_pos.page_pos = 0;
1582 con->in_msg_pos.data_pos = 0;
1583 }
1584
1585 /* front */
1586 ret = read_partial_message_section(con, &m->front, front_len,
1587 &con->in_front_crc);
1588 if (ret <= 0)
1589 return ret;
1590
1591 /* middle */
1592 if (m->middle) {
1593 ret = read_partial_message_section(con, &m->middle->vec,
1594 middle_len,
1595 &con->in_middle_crc);
1596 if (ret <= 0)
1597 return ret;
1598 }
1599#ifdef CONFIG_BLOCK
1600 if (m->bio && !m->bio_iter)
1601 init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
1602#endif
1603
1604 /* (page) data */
1605 while (con->in_msg_pos.data_pos < data_len) {
1606 if (m->pages) {
1607 ret = read_partial_message_pages(con, m->pages,
1608 data_len, datacrc);
1609 if (ret <= 0)
1610 return ret;
1611#ifdef CONFIG_BLOCK
1612 } else if (m->bio) {
1613
1614 ret = read_partial_message_bio(con,
1615 &m->bio_iter, &m->bio_seg,
1616 data_len, datacrc);
1617 if (ret <= 0)
1618 return ret;
1619#endif
1620 } else {
1621 BUG_ON(1);
1622 }
1623 }
1624
1625 /* footer */
1626 to = sizeof(m->hdr) + sizeof(m->footer);
1627 while (con->in_base_pos < to) {
1628 left = to - con->in_base_pos;
1629 ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
1630 (con->in_base_pos - sizeof(m->hdr)),
1631 left);
1632 if (ret <= 0)
1633 return ret;
1634 con->in_base_pos += ret;
1635 }
1636 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
1637 m, front_len, m->footer.front_crc, middle_len,
1638 m->footer.middle_crc, data_len, m->footer.data_crc);
1639
1640 /* crc ok? */
1641 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
1642 pr_err("read_partial_message %p front crc %u != exp. %u\n",
1643 m, con->in_front_crc, m->footer.front_crc);
1644 return -EBADMSG;
1645 }
1646 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
1647 pr_err("read_partial_message %p middle crc %u != exp %u\n",
1648 m, con->in_middle_crc, m->footer.middle_crc);
1649 return -EBADMSG;
1650 }
1651 if (datacrc &&
1652 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1653 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1654 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
1655 con->in_data_crc, le32_to_cpu(m->footer.data_crc));
1656 return -EBADMSG;
1657 }
1658
1659 return 1; /* done! */
1660}
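
/*
 * Reader's-eye summary of the framing consumed by read_partial_message()
 * above (field widths and struct layouts live in the ceph headers and are
 * not restated here):
 *
 *   ceph_msg_header   front_len, middle_len, data_len, data_off, seq,
 *                     type, and a crc over the header itself
 *   front             front_len bytes, checked against footer.front_crc
 *   middle            middle_len bytes (optional), checked against
 *                     footer.middle_crc
 *   data              data_len bytes read into pages or a bio; crc checked
 *                     unless CEPH_MSG_FOOTER_NOCRC is set
 *   ceph_msg_footer   front_crc, middle_crc, data_crc, flags
 */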
1661
1662/*
1663 * Process message. This happens in the worker thread. The callback should
1664 * be careful not to do anything that waits on other incoming messages or it
1665 * may deadlock.
1666 */
1667static void process_message(struct ceph_connection *con)
1668{
1669 struct ceph_msg *msg;
1670
1671 msg = con->in_msg;
1672 con->in_msg = NULL;
1673
1674 /* if first message, set peer_name */
1675 if (con->peer_name.type == 0)
1676 con->peer_name = msg->hdr.src;
1677
1678 con->in_seq++;
1679 mutex_unlock(&con->mutex);
1680
1681 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1682 msg, le64_to_cpu(msg->hdr.seq),
1683 ENTITY_NAME(msg->hdr.src),
1684 le16_to_cpu(msg->hdr.type),
1685 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1686 le32_to_cpu(msg->hdr.front_len),
1687 le32_to_cpu(msg->hdr.data_len),
1688 con->in_front_crc, con->in_middle_crc, con->in_data_crc);
1689 con->ops->dispatch(con, msg);
1690
1691 mutex_lock(&con->mutex);
1692 prepare_read_tag(con);
1693}
1694
1695
1696/*
1697 * Write something to the socket. Called in a worker thread when the
1698 * socket appears to be writeable and we have something ready to send.
1699 */
1700static int try_write(struct ceph_connection *con)
1701{
1702 struct ceph_messenger *msgr = con->msgr;
1703 int ret = 1;
1704
1705 dout("try_write start %p state %lu nref %d\n", con, con->state,
1706 atomic_read(&con->nref));
1707
1708more:
1709 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1710
1711 /* open the socket first? */
1712 if (con->sock == NULL) {
1713 /*
1714 * if we were STANDBY and are reconnecting _this_
1715 * connection, bump connect_seq now. Always bump
1716 * global_seq.
1717 */
1718 if (test_and_clear_bit(STANDBY, &con->state))
1719 con->connect_seq++;
1720
1721 prepare_write_banner(msgr, con);
1722 prepare_write_connect(msgr, con, 1);
1723 prepare_read_banner(con);
1724 set_bit(CONNECTING, &con->state);
1725 clear_bit(NEGOTIATING, &con->state);
1726
1727 BUG_ON(con->in_msg);
1728 con->in_tag = CEPH_MSGR_TAG_READY;
1729 dout("try_write initiating connect on %p new state %lu\n",
1730 con, con->state);
1731 con->sock = ceph_tcp_connect(con);
1732 if (IS_ERR(con->sock)) {
1733 con->sock = NULL;
1734 con->error_msg = "connect error";
1735 ret = -1;
1736 goto out;
1737 }
1738 }
1739
1740more_kvec:
1741 /* kvec data queued? */
1742 if (con->out_skip) {
1743 ret = write_partial_skip(con);
1744 if (ret <= 0)
1745 goto done;
1746 if (ret < 0) {
1747 dout("try_write write_partial_skip err %d\n", ret);
1748 goto done;
1749 }
1750 }
1751 if (con->out_kvec_left) {
1752 ret = write_partial_kvec(con);
1753 if (ret <= 0)
1754 goto done;
1755 }
1756
1757 /* msg pages? */
1758 if (con->out_msg) {
1759 if (con->out_msg_done) {
1760 ceph_msg_put(con->out_msg);
1761 con->out_msg = NULL; /* we're done with this one */
1762 goto do_next;
1763 }
1764
1765 ret = write_partial_msg_pages(con);
1766 if (ret == 1)
1767 goto more_kvec; /* we need to send the footer, too! */
1768 if (ret == 0)
1769 goto done;
1770 if (ret < 0) {
1771 dout("try_write write_partial_msg_pages err %d\n",
1772 ret);
1773 goto done;
1774 }
1775 }
1776
1777do_next:
1778 if (!test_bit(CONNECTING, &con->state)) {
1779 /* is anything else pending? */
1780 if (!list_empty(&con->out_queue)) {
1781 prepare_write_message(con);
1782 goto more;
1783 }
1784 if (con->in_seq > con->in_seq_acked) {
1785 prepare_write_ack(con);
1786 goto more;
1787 }
1788 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
1789 prepare_write_keepalive(con);
1790 goto more;
1791 }
1792 }
1793
1794 /* Nothing to do! */
1795 clear_bit(WRITE_PENDING, &con->state);
1796 dout("try_write nothing else to write.\n");
1797done:
1798 ret = 0;
1799out:
1800 dout("try_write done on %p\n", con);
1801 return ret;
1802}
1803
1804
1805
1806/*
1807 * Read what we can from the socket.
1808 */
1809static int try_read(struct ceph_connection *con)
1810{
1811 int ret = -1;
1812
1813 if (!con->sock)
1814 return 0;
1815
1816 if (test_bit(STANDBY, &con->state))
1817 return 0;
1818
1819 dout("try_read start on %p\n", con);
1820
1821more:
1822 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
1823 con->in_base_pos);
1824 if (test_bit(CONNECTING, &con->state)) {
1825 if (!test_bit(NEGOTIATING, &con->state)) {
1826 dout("try_read connecting\n");
1827 ret = read_partial_banner(con);
1828 if (ret <= 0)
1829 goto done;
1830 if (process_banner(con) < 0) {
1831 ret = -1;
1832 goto out;
1833 }
1834 }
1835 ret = read_partial_connect(con);
1836 if (ret <= 0)
1837 goto done;
1838 if (process_connect(con) < 0) {
1839 ret = -1;
1840 goto out;
1841 }
1842 goto more;
1843 }
1844
1845 if (con->in_base_pos < 0) {
1846 /*
1847 * skipping + discarding content.
1848 *
1849 * FIXME: there must be a better way to do this!
1850 */
1851 static char buf[1024];
1852 int skip = min(1024, -con->in_base_pos);
1853 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
1854 ret = ceph_tcp_recvmsg(con->sock, buf, skip);
1855 if (ret <= 0)
1856 goto done;
1857 con->in_base_pos += ret;
1858 if (con->in_base_pos)
1859 goto more;
1860 }
1861 if (con->in_tag == CEPH_MSGR_TAG_READY) {
1862 /*
1863 * what's next?
1864 */
1865 ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
1866 if (ret <= 0)
1867 goto done;
1868 dout("try_read got tag %d\n", (int)con->in_tag);
1869 switch (con->in_tag) {
1870 case CEPH_MSGR_TAG_MSG:
1871 prepare_read_message(con);
1872 break;
1873 case CEPH_MSGR_TAG_ACK:
1874 prepare_read_ack(con);
1875 break;
1876 case CEPH_MSGR_TAG_CLOSE:
1877 set_bit(CLOSED, &con->state); /* fixme */
1878 goto done;
1879 default:
1880 goto bad_tag;
1881 }
1882 }
1883 if (con->in_tag == CEPH_MSGR_TAG_MSG) {
1884 ret = read_partial_message(con);
1885 if (ret <= 0) {
1886 switch (ret) {
1887 case -EBADMSG:
1888 con->error_msg = "bad crc";
1889 ret = -EIO;
1890 goto out;
1891 case -EIO:
1892 con->error_msg = "io error";
1893 goto out;
1894 default:
1895 goto done;
1896 }
1897 }
1898 if (con->in_tag == CEPH_MSGR_TAG_READY)
1899 goto more;
1900 process_message(con);
1901 goto more;
1902 }
1903 if (con->in_tag == CEPH_MSGR_TAG_ACK) {
1904 ret = read_partial_ack(con);
1905 if (ret <= 0)
1906 goto done;
1907 process_ack(con);
1908 goto more;
1909 }
1910
1911done:
1912 ret = 0;
1913out:
1914 dout("try_read done on %p\n", con);
1915 return ret;
1916
1917bad_tag:
1918 pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
1919 con->error_msg = "protocol error, garbage tag";
1920 ret = -1;
1921 goto out;
1922}
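
/*
 * Illustrative sketch (not part of messenger.c): the "negative in_base_pos"
 * bookkeeping that try_read() above uses to drain a skipped message 1024
 * bytes at a time.  The socket read is simulated and all names here are
 * hypothetical.
 */
#include <stdio.h>

static int fake_recv(char *buf, int len)
{
	(void)buf;
	return len < 300 ? len : 300;	/* pretend each recv returns at most 300 bytes */
}

int main(void)
{
	static char buf[1024];
	int in_base_pos = -2500;	/* 2500 bytes of a skipped message still unread */

	while (in_base_pos < 0) {
		int skip = -in_base_pos < 1024 ? -in_base_pos : 1024;
		int ret = fake_recv(buf, skip);

		if (ret <= 0)
			break;		/* the real code retries when the socket is readable again */
		in_base_pos += ret;	/* climbs toward zero as the unwanted bytes are drained */
	}
	printf("bytes left to skip: %d\n", -in_base_pos);
	return 0;
}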
1923
1924
1925/*
1926 * Atomically queue work on a connection. Bump @con reference to
1927 * avoid races with connection teardown.
1928 *
1929 * There is some trickery going on with QUEUED and BUSY because we
1930 * only want a _single_ thread operating on each connection at any
1931 * point in time, but we want to use all available CPUs.
1932 *
1933 * The worker thread only proceeds if it can atomically set BUSY. It
 1934 * clears QUEUED and does its thing. When it thinks it's done, it
 1935 * clears BUSY, then rechecks QUEUED; if it's set again, it loops
1936 * (tries again to set BUSY).
1937 *
 1938 * To queue work, we first set QUEUED and _then_, if BUSY isn't set,
 1939 * try to queue the work struct. If that fails (the work is already
 1940 * queued, or a worker is BUSY), we give up, but leave QUEUED set so
 1941 * that the worker thread will loop again if necessary.
1942 */
1943static void queue_con(struct ceph_connection *con)
1944{
1945 if (test_bit(DEAD, &con->state)) {
1946 dout("queue_con %p ignoring: DEAD\n",
1947 con);
1948 return;
1949 }
1950
1951 if (!con->ops->get(con)) {
1952 dout("queue_con %p ref count 0\n", con);
1953 return;
1954 }
1955
1956 set_bit(QUEUED, &con->state);
1957 if (test_bit(BUSY, &con->state)) {
1958 dout("queue_con %p - already BUSY\n", con);
1959 con->ops->put(con);
1960 } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
1961 dout("queue_con %p - already queued\n", con);
1962 con->ops->put(con);
1963 } else {
1964 dout("queue_con %p\n", con);
1965 }
1966}
1967
1968/*
1969 * Do some work on a connection. Drop a connection ref when we're done.
1970 */
1971static void con_work(struct work_struct *work)
1972{
1973 struct ceph_connection *con = container_of(work, struct ceph_connection,
1974 work.work);
1975 int backoff = 0;
1976
1977more:
1978 if (test_and_set_bit(BUSY, &con->state) != 0) {
1979 dout("con_work %p BUSY already set\n", con);
1980 goto out;
1981 }
1982 dout("con_work %p start, clearing QUEUED\n", con);
1983 clear_bit(QUEUED, &con->state);
1984
1985 mutex_lock(&con->mutex);
1986
1987 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1988 dout("con_work CLOSED\n");
1989 con_close_socket(con);
1990 goto done;
1991 }
1992 if (test_and_clear_bit(OPENING, &con->state)) {
1993 /* reopen w/ new peer */
1994 dout("con_work OPENING\n");
1995 con_close_socket(con);
1996 }
1997
1998 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1999 try_read(con) < 0 ||
2000 try_write(con) < 0) {
2001 mutex_unlock(&con->mutex);
2002 backoff = 1;
2003 ceph_fault(con); /* error/fault path */
2004 goto done_unlocked;
2005 }
2006
2007done:
2008 mutex_unlock(&con->mutex);
2009
2010done_unlocked:
2011 clear_bit(BUSY, &con->state);
2012 dout("con->state=%lu\n", con->state);
2013 if (test_bit(QUEUED, &con->state)) {
2014 if (!backoff || test_bit(OPENING, &con->state)) {
2015 dout("con_work %p QUEUED reset, looping\n", con);
2016 goto more;
2017 }
2018 dout("con_work %p QUEUED reset, but just faulted\n", con);
2019 clear_bit(QUEUED, &con->state);
2020 }
2021 dout("con_work %p done\n", con);
2022
2023out:
2024 con->ops->put(con);
2025}
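
/*
 * Illustrative sketch (not part of messenger.c): the QUEUED/BUSY handshake
 * described above queue_con(), reduced to a self-contained user-space model.
 * The struct, functions, and flags here are hypothetical; only the ordering
 * mirrors queue_con()/con_work().
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct conn_model {
	atomic_bool queued;	/* work pending on the connection */
	atomic_bool busy;	/* a worker is operating on the connection */
};

/* producer side, as in queue_con(): mark pending, then try to kick a worker */
static void model_queue(struct conn_model *c)
{
	atomic_store(&c->queued, true);
	if (atomic_load(&c->busy))
		return;		/* the active worker will recheck QUEUED and loop */
	/* otherwise hand the work item to the workqueue here */
}

/* worker side, as in con_work(): only one worker per connection proceeds */
static void model_work(struct conn_model *c)
{
again:
	if (atomic_exchange(&c->busy, true))
		return;		/* someone else is already BUSY */
	atomic_store(&c->queued, false);
	/* ... one pass of try_read()/try_write() would go here ... */
	atomic_store(&c->busy, false);
	if (atomic_load(&c->queued))
		goto again;	/* more work arrived while we were running */
}

int main(void)
{
	struct conn_model c;

	atomic_init(&c.queued, false);
	atomic_init(&c.busy, false);
	model_queue(&c);
	model_work(&c);
	printf("queued=%d busy=%d\n", atomic_load(&c.queued), atomic_load(&c.busy));
	return 0;
}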
2026
2027
2028/*
2029 * Generic error/fault handler. A retry mechanism is used with
2030 * exponential backoff
2031 */
2032static void ceph_fault(struct ceph_connection *con)
2033{
2034 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
2035 ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
2036 dout("fault %p state %lu to peer %s\n",
2037 con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
2038
2039 if (test_bit(LOSSYTX, &con->state)) {
2040 dout("fault on LOSSYTX channel\n");
2041 goto out;
2042 }
2043
2044 mutex_lock(&con->mutex);
2045 if (test_bit(CLOSED, &con->state))
2046 goto out_unlock;
2047
2048 con_close_socket(con);
2049
2050 if (con->in_msg) {
2051 ceph_msg_put(con->in_msg);
2052 con->in_msg = NULL;
2053 }
2054
2055 /* Requeue anything that hasn't been acked */
2056 list_splice_init(&con->out_sent, &con->out_queue);
2057
2058 /* If there are no messages in the queue, place the connection
2059 * in a STANDBY state (i.e., don't try to reconnect just yet). */
2060 if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
2061 dout("fault setting STANDBY\n");
2062 set_bit(STANDBY, &con->state);
2063 } else {
2064 /* retry after a delay. */
2065 if (con->delay == 0)
2066 con->delay = BASE_DELAY_INTERVAL;
2067 else if (con->delay < MAX_DELAY_INTERVAL)
2068 con->delay *= 2;
2069 dout("fault queueing %p delay %lu\n", con, con->delay);
2070 con->ops->get(con);
2071 if (queue_delayed_work(ceph_msgr_wq, &con->work,
2072 round_jiffies_relative(con->delay)) == 0)
2073 con->ops->put(con);
2074 }
2075
2076out_unlock:
2077 mutex_unlock(&con->mutex);
2078out:
2079 /*
2080 * in case we faulted due to authentication, invalidate our
2081 * current tickets so that we can get new ones.
2082 */
2083 if (con->auth_retry && con->ops->invalidate_authorizer) {
2084 dout("calling invalidate_authorizer()\n");
2085 con->ops->invalidate_authorizer(con);
2086 }
2087
2088 if (con->ops->fault)
2089 con->ops->fault(con);
2090}
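
/*
 * Illustrative sketch (not part of messenger.c): the retry-delay doubling
 * performed by ceph_fault() above.  BASE_DELAY_INTERVAL and
 * MAX_DELAY_INTERVAL are defined earlier in this file; the values below are
 * placeholders chosen only for the printout.
 */
#include <stdio.h>

#define MODEL_BASE_DELAY	1
#define MODEL_MAX_DELAY		64

static unsigned long next_delay(unsigned long delay)
{
	if (delay == 0)
		return MODEL_BASE_DELAY;
	if (delay < MODEL_MAX_DELAY)
		return delay * 2;
	return delay;		/* at or past the cap: keep retrying at this interval */
}

int main(void)
{
	unsigned long d = 0;
	int i;

	/* prints: 1 2 4 8 16 32 64 64 */
	for (i = 0; i < 8; i++) {
		d = next_delay(d);
		printf("%lu ", d);
	}
	printf("\n");
	return 0;
}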
2091
2092
2093
2094/*
2095 * create a new messenger instance
2096 */
2097struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
2098 u32 supported_features,
2099 u32 required_features)
2100{
2101 struct ceph_messenger *msgr;
2102
2103 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
2104 if (msgr == NULL)
2105 return ERR_PTR(-ENOMEM);
2106
2107 msgr->supported_features = supported_features;
2108 msgr->required_features = required_features;
2109
2110 spin_lock_init(&msgr->global_seq_lock);
2111
2112 /* the zero page is needed if a request is "canceled" while the message
2113 * is being written over the socket */
2114 msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
2115 if (!msgr->zero_page) {
2116 kfree(msgr);
2117 return ERR_PTR(-ENOMEM);
2118 }
2119 kmap(msgr->zero_page);
2120
2121 if (myaddr)
2122 msgr->inst.addr = *myaddr;
2123
2124 /* select a random nonce */
2125 msgr->inst.addr.type = 0;
2126 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
2127 encode_my_addr(msgr);
2128
2129 dout("messenger_create %p\n", msgr);
2130 return msgr;
2131}
2132EXPORT_SYMBOL(ceph_messenger_create);
2133
2134void ceph_messenger_destroy(struct ceph_messenger *msgr)
2135{
2136 dout("destroy %p\n", msgr);
2137 kunmap(msgr->zero_page);
2138 __free_page(msgr->zero_page);
2139 kfree(msgr);
2140 dout("destroyed messenger %p\n", msgr);
2141}
2142EXPORT_SYMBOL(ceph_messenger_destroy);
2143
2144/*
2145 * Queue up an outgoing message on the given connection.
2146 */
2147void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
2148{
2149 if (test_bit(CLOSED, &con->state)) {
2150 dout("con_send %p closed, dropping %p\n", con, msg);
2151 ceph_msg_put(msg);
2152 return;
2153 }
2154
2155 /* set src+dst */
2156 msg->hdr.src = con->msgr->inst.name;
2157
2158 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
2159
2160 msg->needs_out_seq = true;
2161
2162 /* queue */
2163 mutex_lock(&con->mutex);
2164 BUG_ON(!list_empty(&msg->list_head));
2165 list_add_tail(&msg->list_head, &con->out_queue);
2166 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
2167 ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
2168 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
2169 le32_to_cpu(msg->hdr.front_len),
2170 le32_to_cpu(msg->hdr.middle_len),
2171 le32_to_cpu(msg->hdr.data_len));
2172 mutex_unlock(&con->mutex);
2173
2174 /* if there wasn't anything waiting to send before, queue
2175 * new work */
2176 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2177 queue_con(con);
2178}
2179EXPORT_SYMBOL(ceph_con_send);
2180
2181/*
2182 * Revoke a message that was previously queued for send
2183 */
2184void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
2185{
2186 mutex_lock(&con->mutex);
2187 if (!list_empty(&msg->list_head)) {
2188 dout("con_revoke %p msg %p - was on queue\n", con, msg);
2189 list_del_init(&msg->list_head);
2190 ceph_msg_put(msg);
2191 msg->hdr.seq = 0;
2192 }
2193 if (con->out_msg == msg) {
2194 dout("con_revoke %p msg %p - was sending\n", con, msg);
2195 con->out_msg = NULL;
2196 if (con->out_kvec_is_msg) {
2197 con->out_skip = con->out_kvec_bytes;
2198 con->out_kvec_is_msg = false;
2199 }
2200 ceph_msg_put(msg);
2201 msg->hdr.seq = 0;
2202 }
2203 mutex_unlock(&con->mutex);
2204}
2205
2206/*
2207 * Revoke a message that we may be reading data into
2208 */
2209void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2210{
2211 mutex_lock(&con->mutex);
2212 if (con->in_msg && con->in_msg == msg) {
2213 unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
2214 unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
2215 unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
2216
2217 /* skip rest of message */
2218 dout("con_revoke_pages %p msg %p revoked\n", con, msg);
2219 con->in_base_pos = con->in_base_pos -
2220 sizeof(struct ceph_msg_header) -
2221 front_len -
2222 middle_len -
2223 data_len -
2224 sizeof(struct ceph_msg_footer);
2225 ceph_msg_put(con->in_msg);
2226 con->in_msg = NULL;
2227 con->in_tag = CEPH_MSGR_TAG_READY;
2228 con->in_seq++;
2229 } else {
2230 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2231 con, con->in_msg, msg);
2232 }
2233 mutex_unlock(&con->mutex);
2234}
2235
2236/*
2237 * Queue a keepalive byte to ensure the tcp connection is alive.
2238 */
2239void ceph_con_keepalive(struct ceph_connection *con)
2240{
2241 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
2242 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2243 queue_con(con);
2244}
2245EXPORT_SYMBOL(ceph_con_keepalive);
2246
2247
2248/*
 2249 * Construct a new message with the given type and front size.
 2250 * The new msg has a ref count of 1.
2251 */
2252struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
2253{
2254 struct ceph_msg *m;
2255
2256 m = kmalloc(sizeof(*m), flags);
2257 if (m == NULL)
2258 goto out;
2259 kref_init(&m->kref);
2260 INIT_LIST_HEAD(&m->list_head);
2261
2262 m->hdr.tid = 0;
2263 m->hdr.type = cpu_to_le16(type);
2264 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2265 m->hdr.version = 0;
2266 m->hdr.front_len = cpu_to_le32(front_len);
2267 m->hdr.middle_len = 0;
2268 m->hdr.data_len = 0;
2269 m->hdr.data_off = 0;
2270 m->hdr.reserved = 0;
2271 m->footer.front_crc = 0;
2272 m->footer.middle_crc = 0;
2273 m->footer.data_crc = 0;
2274 m->footer.flags = 0;
2275 m->front_max = front_len;
2276 m->front_is_vmalloc = false;
2277 m->more_to_follow = false;
2278 m->pool = NULL;
2279
2280 /* front */
2281 if (front_len) {
2282 if (front_len > PAGE_CACHE_SIZE) {
2283 m->front.iov_base = __vmalloc(front_len, flags,
2284 PAGE_KERNEL);
2285 m->front_is_vmalloc = true;
2286 } else {
2287 m->front.iov_base = kmalloc(front_len, flags);
2288 }
2289 if (m->front.iov_base == NULL) {
2290 pr_err("msg_new can't allocate %d bytes\n",
2291 front_len);
2292 goto out2;
2293 }
2294 } else {
2295 m->front.iov_base = NULL;
2296 }
2297 m->front.iov_len = front_len;
2298
2299 /* middle */
2300 m->middle = NULL;
2301
2302 /* data */
2303 m->nr_pages = 0;
2304 m->pages = NULL;
2305 m->pagelist = NULL;
2306 m->bio = NULL;
2307 m->bio_iter = NULL;
2308 m->bio_seg = 0;
2309 m->trail = NULL;
2310
2311 dout("ceph_msg_new %p front %d\n", m, front_len);
2312 return m;
2313
2314out2:
2315 ceph_msg_put(m);
2316out:
2317 pr_err("msg_new can't create type %d front %d\n", type, front_len);
2318 return NULL;
2319}
2320EXPORT_SYMBOL(ceph_msg_new);
2321
2322/*
2323 * Allocate "middle" portion of a message, if it is needed and wasn't
2324 * allocated by alloc_msg. This allows us to read a small fixed-size
2325 * per-type header in the front and then gracefully fail (i.e.,
2326 * propagate the error to the caller based on info in the front) when
2327 * the middle is too large.
2328 */
2329static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2330{
2331 int type = le16_to_cpu(msg->hdr.type);
2332 int middle_len = le32_to_cpu(msg->hdr.middle_len);
2333
2334 dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
2335 ceph_msg_type_name(type), middle_len);
2336 BUG_ON(!middle_len);
2337 BUG_ON(msg->middle);
2338
2339 msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
2340 if (!msg->middle)
2341 return -ENOMEM;
2342 return 0;
2343}
2344
2345/*
2346 * Generic message allocator, for incoming messages.
2347 */
2348static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2349 struct ceph_msg_header *hdr,
2350 int *skip)
2351{
2352 int type = le16_to_cpu(hdr->type);
2353 int front_len = le32_to_cpu(hdr->front_len);
2354 int middle_len = le32_to_cpu(hdr->middle_len);
2355 struct ceph_msg *msg = NULL;
2356 int ret;
2357
2358 if (con->ops->alloc_msg) {
2359 mutex_unlock(&con->mutex);
2360 msg = con->ops->alloc_msg(con, hdr, skip);
2361 mutex_lock(&con->mutex);
2362 if (!msg || *skip)
2363 return NULL;
2364 }
2365 if (!msg) {
2366 *skip = 0;
2367 msg = ceph_msg_new(type, front_len, GFP_NOFS);
2368 if (!msg) {
2369 pr_err("unable to allocate msg type %d len %d\n",
2370 type, front_len);
2371 return NULL;
2372 }
2373 }
2374 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2375
2376 if (middle_len && !msg->middle) {
2377 ret = ceph_alloc_middle(con, msg);
2378 if (ret < 0) {
2379 ceph_msg_put(msg);
2380 return NULL;
2381 }
2382 }
2383
2384 return msg;
2385}
2386
2387
2388/*
2389 * Free a generically kmalloc'd message.
2390 */
2391void ceph_msg_kfree(struct ceph_msg *m)
2392{
2393 dout("msg_kfree %p\n", m);
2394 if (m->front_is_vmalloc)
2395 vfree(m->front.iov_base);
2396 else
2397 kfree(m->front.iov_base);
2398 kfree(m);
2399}
2400
2401/*
2402 * Drop a msg ref. Destroy as needed.
2403 */
2404void ceph_msg_last_put(struct kref *kref)
2405{
2406 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
2407
2408 dout("ceph_msg_put last one on %p\n", m);
2409 WARN_ON(!list_empty(&m->list_head));
2410
2411 /* drop middle, data, if any */
2412 if (m->middle) {
2413 ceph_buffer_put(m->middle);
2414 m->middle = NULL;
2415 }
2416 m->nr_pages = 0;
2417 m->pages = NULL;
2418
2419 if (m->pagelist) {
2420 ceph_pagelist_release(m->pagelist);
2421 kfree(m->pagelist);
2422 m->pagelist = NULL;
2423 }
2424
2425 m->trail = NULL;
2426
2427 if (m->pool)
2428 ceph_msgpool_put(m->pool, m);
2429 else
2430 ceph_msg_kfree(m);
2431}
2432EXPORT_SYMBOL(ceph_msg_last_put);
2433
2434void ceph_msg_dump(struct ceph_msg *msg)
2435{
2436 pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
2437 msg->front_max, msg->nr_pages);
2438 print_hex_dump(KERN_DEBUG, "header: ",
2439 DUMP_PREFIX_OFFSET, 16, 1,
2440 &msg->hdr, sizeof(msg->hdr), true);
2441 print_hex_dump(KERN_DEBUG, " front: ",
2442 DUMP_PREFIX_OFFSET, 16, 1,
2443 msg->front.iov_base, msg->front.iov_len, true);
2444 if (msg->middle)
2445 print_hex_dump(KERN_DEBUG, "middle: ",
2446 DUMP_PREFIX_OFFSET, 16, 1,
2447 msg->middle->vec.iov_base,
2448 msg->middle->vec.iov_len, true);
2449 print_hex_dump(KERN_DEBUG, "footer: ",
2450 DUMP_PREFIX_OFFSET, 16, 1,
2451 &msg->footer, sizeof(msg->footer), true);
2452}
2453EXPORT_SYMBOL(ceph_msg_dump);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
new file mode 100644
index 000000000000..8a079399174a
--- /dev/null
+++ b/net/ceph/mon_client.c
@@ -0,0 +1,1027 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/module.h>
4#include <linux/types.h>
5#include <linux/slab.h>
6#include <linux/random.h>
7#include <linux/sched.h>
8
9#include <linux/ceph/mon_client.h>
10#include <linux/ceph/libceph.h>
11#include <linux/ceph/decode.h>
12
13#include <linux/ceph/auth.h>
14
15/*
16 * Interact with Ceph monitor cluster. Handle requests for new map
17 * versions, and periodically resend as needed. Also implement
18 * statfs() and umount().
19 *
 20 * A small cluster of Ceph "monitors" is responsible for managing critical
21 * cluster configuration and state information. An odd number (e.g., 3, 5)
22 * of cmon daemons use a modified version of the Paxos part-time parliament
23 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
24 * list of clients who have mounted the file system.
25 *
26 * We maintain an open, active session with a monitor at all times in order to
27 * receive timely MDSMap updates. We periodically send a keepalive byte on the
28 * TCP socket to ensure we detect a failure. If the connection does break, we
29 * randomly hunt for a new monitor. Once the connection is reestablished, we
30 * resend any outstanding requests.
31 */
32
33static const struct ceph_connection_operations mon_con_ops;
34
35static int __validate_auth(struct ceph_mon_client *monc);
36
37/*
38 * Decode a monmap blob (e.g., during mount).
39 */
40struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
41{
42 struct ceph_monmap *m = NULL;
43 int i, err = -EINVAL;
44 struct ceph_fsid fsid;
45 u32 epoch, num_mon;
46 u16 version;
47 u32 len;
48
49 ceph_decode_32_safe(&p, end, len, bad);
50 ceph_decode_need(&p, end, len, bad);
51
52 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
53
54 ceph_decode_16_safe(&p, end, version, bad);
55
56 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
57 ceph_decode_copy(&p, &fsid, sizeof(fsid));
58 epoch = ceph_decode_32(&p);
59
60 num_mon = ceph_decode_32(&p);
61 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
62
63 if (num_mon >= CEPH_MAX_MON)
64 goto bad;
65 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
66 if (m == NULL)
67 return ERR_PTR(-ENOMEM);
68 m->fsid = fsid;
69 m->epoch = epoch;
70 m->num_mon = num_mon;
71 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
72 for (i = 0; i < num_mon; i++)
73 ceph_decode_addr(&m->mon_inst[i].addr);
74
75 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
76 m->num_mon);
77 for (i = 0; i < m->num_mon; i++)
78 dout("monmap_decode mon%d is %s\n", i,
79 ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
80 return m;
81
82bad:
83 dout("monmap_decode failed with %d\n", err);
84 kfree(m);
85 return ERR_PTR(err);
86}
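
/*
 * Field order consumed by ceph_monmap_decode() above, as read directly off
 * the decode calls (exact struct sizes are defined in the ceph headers and
 * are not asserted here):
 *
 *   __le32 len                          length of the remaining payload
 *   __le16 version
 *   struct ceph_fsid fsid
 *   __le32 epoch
 *   __le32 num_mon                      must be < CEPH_MAX_MON
 *   struct ceph_entity_inst mon_inst[num_mon]
 */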
87
88/*
89 * return true if *addr is included in the monmap.
90 */
91int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
92{
93 int i;
94
95 for (i = 0; i < m->num_mon; i++)
96 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
97 return 1;
98 return 0;
99}
100
101/*
102 * Send an auth request.
103 */
104static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
105{
106 monc->pending_auth = 1;
107 monc->m_auth->front.iov_len = len;
108 monc->m_auth->hdr.front_len = cpu_to_le32(len);
109 ceph_con_revoke(monc->con, monc->m_auth);
110 ceph_msg_get(monc->m_auth); /* keep our ref */
111 ceph_con_send(monc->con, monc->m_auth);
112}
113
114/*
115 * Close monitor session, if any.
116 */
117static void __close_session(struct ceph_mon_client *monc)
118{
119 if (monc->con) {
120 dout("__close_session closing mon%d\n", monc->cur_mon);
121 ceph_con_revoke(monc->con, monc->m_auth);
122 ceph_con_close(monc->con);
123 monc->cur_mon = -1;
124 monc->pending_auth = 0;
125 ceph_auth_reset(monc->auth);
126 }
127}
128
129/*
130 * Open a session with a (new) monitor.
131 */
132static int __open_session(struct ceph_mon_client *monc)
133{
134 char r;
135 int ret;
136
137 if (monc->cur_mon < 0) {
138 get_random_bytes(&r, 1);
139 monc->cur_mon = r % monc->monmap->num_mon;
140 dout("open_session num=%d r=%d -> mon%d\n",
141 monc->monmap->num_mon, r, monc->cur_mon);
142 monc->sub_sent = 0;
143 monc->sub_renew_after = jiffies; /* i.e., expired */
144 monc->want_next_osdmap = !!monc->want_next_osdmap;
145
146 dout("open_session mon%d opening\n", monc->cur_mon);
147 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
148 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
149 ceph_con_open(monc->con,
150 &monc->monmap->mon_inst[monc->cur_mon].addr);
151
 152		/* initiate authentication handshake */
153 ret = ceph_auth_build_hello(monc->auth,
154 monc->m_auth->front.iov_base,
155 monc->m_auth->front_max);
156 __send_prepared_auth_request(monc, ret);
157 } else {
158 dout("open_session mon%d already open\n", monc->cur_mon);
159 }
160 return 0;
161}
162
163static bool __sub_expired(struct ceph_mon_client *monc)
164{
165 return time_after_eq(jiffies, monc->sub_renew_after);
166}
167
168/*
169 * Reschedule delayed work timer.
170 */
171static void __schedule_delayed(struct ceph_mon_client *monc)
172{
173 unsigned delay;
174
175 if (monc->cur_mon < 0 || __sub_expired(monc))
176 delay = 10 * HZ;
177 else
178 delay = 20 * HZ;
179 dout("__schedule_delayed after %u\n", delay);
180 schedule_delayed_work(&monc->delayed_work, delay);
181}
182
183/*
184 * Send subscribe request for mdsmap and/or osdmap.
185 */
186static void __send_subscribe(struct ceph_mon_client *monc)
187{
188 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
189 (unsigned)monc->sub_sent, __sub_expired(monc),
190 monc->want_next_osdmap);
191 if ((__sub_expired(monc) && !monc->sub_sent) ||
192 monc->want_next_osdmap == 1) {
193 struct ceph_msg *msg = monc->m_subscribe;
194 struct ceph_mon_subscribe_item *i;
195 void *p, *end;
196 int num;
197
198 p = msg->front.iov_base;
199 end = p + msg->front_max;
200
201 num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
202 ceph_encode_32(&p, num);
203
204 if (monc->want_next_osdmap) {
205 dout("__send_subscribe to 'osdmap' %u\n",
206 (unsigned)monc->have_osdmap);
207 ceph_encode_string(&p, end, "osdmap", 6);
208 i = p;
209 i->have = cpu_to_le64(monc->have_osdmap);
210 i->onetime = 1;
211 p += sizeof(*i);
212 monc->want_next_osdmap = 2; /* requested */
213 }
214 if (monc->want_mdsmap) {
215 dout("__send_subscribe to 'mdsmap' %u+\n",
216 (unsigned)monc->have_mdsmap);
217 ceph_encode_string(&p, end, "mdsmap", 6);
218 i = p;
219 i->have = cpu_to_le64(monc->have_mdsmap);
220 i->onetime = 0;
221 p += sizeof(*i);
222 }
223 ceph_encode_string(&p, end, "monmap", 6);
224 i = p;
225 i->have = 0;
226 i->onetime = 0;
227 p += sizeof(*i);
228
229 msg->front.iov_len = p - msg->front.iov_base;
230 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
231 ceph_con_revoke(monc->con, msg);
232 ceph_con_send(monc->con, ceph_msg_get(msg));
233
234 monc->sub_sent = jiffies | 1; /* never 0 */
235 }
236}
237
238static void handle_subscribe_ack(struct ceph_mon_client *monc,
239 struct ceph_msg *msg)
240{
241 unsigned seconds;
242 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
243
244 if (msg->front.iov_len < sizeof(*h))
245 goto bad;
246 seconds = le32_to_cpu(h->duration);
247
248 mutex_lock(&monc->mutex);
249 if (monc->hunting) {
250 pr_info("mon%d %s session established\n",
251 monc->cur_mon,
252 ceph_pr_addr(&monc->con->peer_addr.in_addr));
253 monc->hunting = false;
254 }
255 dout("handle_subscribe_ack after %d seconds\n", seconds);
256 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
257 monc->sub_sent = 0;
258 mutex_unlock(&monc->mutex);
259 return;
260bad:
261 pr_err("got corrupt subscribe-ack msg\n");
262 ceph_msg_dump(msg);
263}
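
/*
 * Illustrative sketch (not part of mon_client.c): the renewal point computed
 * in handle_subscribe_ack() above, i.e. resubscribe roughly halfway through
 * the duration granted by the monitor.  HZ and the jiffies values below are
 * stand-ins for the printout only.
 */
#include <stdio.h>

int main(void)
{
	unsigned long hz = 100;		/* stand-in for HZ */
	unsigned long sub_sent = 5000;	/* jiffies value when the request was sent */
	unsigned seconds = 300;		/* duration granted by the monitor */
	unsigned long renew_after = sub_sent + (seconds >> 1) * hz - 1;

	printf("renew at jiffy %lu (%lu s after sending)\n",
	       renew_after, (renew_after - sub_sent + 1) / hz);
	return 0;
}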
264
265/*
266 * Keep track of which maps we have
267 */
268int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
269{
270 mutex_lock(&monc->mutex);
271 monc->have_mdsmap = got;
272 mutex_unlock(&monc->mutex);
273 return 0;
274}
275EXPORT_SYMBOL(ceph_monc_got_mdsmap);
276
277int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
278{
279 mutex_lock(&monc->mutex);
280 monc->have_osdmap = got;
281 monc->want_next_osdmap = 0;
282 mutex_unlock(&monc->mutex);
283 return 0;
284}
285
286/*
287 * Register interest in the next osdmap
288 */
289void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
290{
291 dout("request_next_osdmap have %u\n", monc->have_osdmap);
292 mutex_lock(&monc->mutex);
293 if (!monc->want_next_osdmap)
294 monc->want_next_osdmap = 1;
295 if (monc->want_next_osdmap < 2)
296 __send_subscribe(monc);
297 mutex_unlock(&monc->mutex);
298}
299
300/*
 301 * Open a session with the monitor cluster, allocating the connection if needed.
302 */
303int ceph_monc_open_session(struct ceph_mon_client *monc)
304{
305 if (!monc->con) {
306 monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
307 if (!monc->con)
308 return -ENOMEM;
309 ceph_con_init(monc->client->msgr, monc->con);
310 monc->con->private = monc;
311 monc->con->ops = &mon_con_ops;
312 }
313
314 mutex_lock(&monc->mutex);
315 __open_session(monc);
316 __schedule_delayed(monc);
317 mutex_unlock(&monc->mutex);
318 return 0;
319}
320EXPORT_SYMBOL(ceph_monc_open_session);
321
322/*
 323 * The monitor responds with a mount ack to indicate mount success.  The
324 * included client ticket allows the client to talk to MDSs and OSDs.
325 */
326static void ceph_monc_handle_map(struct ceph_mon_client *monc,
327 struct ceph_msg *msg)
328{
329 struct ceph_client *client = monc->client;
330 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
331 void *p, *end;
332
333 mutex_lock(&monc->mutex);
334
335 dout("handle_monmap\n");
336 p = msg->front.iov_base;
337 end = p + msg->front.iov_len;
338
339 monmap = ceph_monmap_decode(p, end);
340 if (IS_ERR(monmap)) {
341 pr_err("problem decoding monmap, %d\n",
342 (int)PTR_ERR(monmap));
343 goto out;
344 }
345
346 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
347 kfree(monmap);
348 goto out;
349 }
350
351 client->monc.monmap = monmap;
352 kfree(old);
353
354out:
355 mutex_unlock(&monc->mutex);
356 wake_up_all(&client->auth_wq);
357}
358
359/*
360 * generic requests (e.g., statfs, poolop)
361 */
362static struct ceph_mon_generic_request *__lookup_generic_req(
363 struct ceph_mon_client *monc, u64 tid)
364{
365 struct ceph_mon_generic_request *req;
366 struct rb_node *n = monc->generic_request_tree.rb_node;
367
368 while (n) {
369 req = rb_entry(n, struct ceph_mon_generic_request, node);
370 if (tid < req->tid)
371 n = n->rb_left;
372 else if (tid > req->tid)
373 n = n->rb_right;
374 else
375 return req;
376 }
377 return NULL;
378}
379
380static void __insert_generic_request(struct ceph_mon_client *monc,
381 struct ceph_mon_generic_request *new)
382{
383 struct rb_node **p = &monc->generic_request_tree.rb_node;
384 struct rb_node *parent = NULL;
385 struct ceph_mon_generic_request *req = NULL;
386
387 while (*p) {
388 parent = *p;
389 req = rb_entry(parent, struct ceph_mon_generic_request, node);
390 if (new->tid < req->tid)
391 p = &(*p)->rb_left;
392 else if (new->tid > req->tid)
393 p = &(*p)->rb_right;
394 else
395 BUG();
396 }
397
398 rb_link_node(&new->node, parent, p);
399 rb_insert_color(&new->node, &monc->generic_request_tree);
400}
401
402static void release_generic_request(struct kref *kref)
403{
404 struct ceph_mon_generic_request *req =
405 container_of(kref, struct ceph_mon_generic_request, kref);
406
407 if (req->reply)
408 ceph_msg_put(req->reply);
409 if (req->request)
410 ceph_msg_put(req->request);
411
412 kfree(req);
413}
414
415static void put_generic_request(struct ceph_mon_generic_request *req)
416{
417 kref_put(&req->kref, release_generic_request);
418}
419
420static void get_generic_request(struct ceph_mon_generic_request *req)
421{
422 kref_get(&req->kref);
423}
424
425static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
426 struct ceph_msg_header *hdr,
427 int *skip)
428{
429 struct ceph_mon_client *monc = con->private;
430 struct ceph_mon_generic_request *req;
431 u64 tid = le64_to_cpu(hdr->tid);
432 struct ceph_msg *m;
433
434 mutex_lock(&monc->mutex);
435 req = __lookup_generic_req(monc, tid);
436 if (!req) {
437 dout("get_generic_reply %lld dne\n", tid);
438 *skip = 1;
439 m = NULL;
440 } else {
441 dout("get_generic_reply %lld got %p\n", tid, req->reply);
442 m = ceph_msg_get(req->reply);
443 /*
444 * we don't need to track the connection reading into
445 * this reply because we only have one open connection
446 * at a time, ever.
447 */
448 }
449 mutex_unlock(&monc->mutex);
450 return m;
451}
452
453static int do_generic_request(struct ceph_mon_client *monc,
454 struct ceph_mon_generic_request *req)
455{
456 int err;
457
458 /* register request */
459 mutex_lock(&monc->mutex);
460 req->tid = ++monc->last_tid;
461 req->request->hdr.tid = cpu_to_le64(req->tid);
462 __insert_generic_request(monc, req);
463 monc->num_generic_requests++;
464 ceph_con_send(monc->con, ceph_msg_get(req->request));
465 mutex_unlock(&monc->mutex);
466
467 err = wait_for_completion_interruptible(&req->completion);
468
469 mutex_lock(&monc->mutex);
470 rb_erase(&req->node, &monc->generic_request_tree);
471 monc->num_generic_requests--;
472 mutex_unlock(&monc->mutex);
473
474 if (!err)
475 err = req->result;
476 return err;
477}
478
479/*
480 * statfs
481 */
482static void handle_statfs_reply(struct ceph_mon_client *monc,
483 struct ceph_msg *msg)
484{
485 struct ceph_mon_generic_request *req;
486 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
487 u64 tid = le64_to_cpu(msg->hdr.tid);
488
489 if (msg->front.iov_len != sizeof(*reply))
490 goto bad;
491 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
492
493 mutex_lock(&monc->mutex);
494 req = __lookup_generic_req(monc, tid);
495 if (req) {
496 *(struct ceph_statfs *)req->buf = reply->st;
497 req->result = 0;
498 get_generic_request(req);
499 }
500 mutex_unlock(&monc->mutex);
501 if (req) {
502 complete_all(&req->completion);
503 put_generic_request(req);
504 }
505 return;
506
507bad:
508 pr_err("corrupt generic reply, tid %llu\n", tid);
509 ceph_msg_dump(msg);
510}
511
512/*
513 * Do a synchronous statfs().
514 */
515int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
516{
517 struct ceph_mon_generic_request *req;
518 struct ceph_mon_statfs *h;
519 int err;
520
521 req = kzalloc(sizeof(*req), GFP_NOFS);
522 if (!req)
523 return -ENOMEM;
524
525 kref_init(&req->kref);
526 req->buf = buf;
527 req->buf_len = sizeof(*buf);
528 init_completion(&req->completion);
529
530 err = -ENOMEM;
531 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
532 if (!req->request)
533 goto out;
534 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
535 if (!req->reply)
536 goto out;
537
538 /* fill out request */
539 h = req->request->front.iov_base;
540 h->monhdr.have_version = 0;
541 h->monhdr.session_mon = cpu_to_le16(-1);
542 h->monhdr.session_mon_tid = 0;
543 h->fsid = monc->monmap->fsid;
544
545 err = do_generic_request(monc, req);
546
547out:
548 kref_put(&req->kref, release_generic_request);
549 return err;
550}
551EXPORT_SYMBOL(ceph_monc_do_statfs);
552
553/*
554 * pool ops
555 */
556static int get_poolop_reply_buf(const char *src, size_t src_len,
557 char *dst, size_t dst_len)
558{
559 u32 buf_len;
560
561 if (src_len != sizeof(u32) + dst_len)
562 return -EINVAL;
563
564 buf_len = le32_to_cpu(*(u32 *)src);
565 if (buf_len != dst_len)
566 return -EINVAL;
567
568 memcpy(dst, src + sizeof(u32), dst_len);
569 return 0;
570}
571
572static void handle_poolop_reply(struct ceph_mon_client *monc,
573 struct ceph_msg *msg)
574{
575 struct ceph_mon_generic_request *req;
576 struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
577 u64 tid = le64_to_cpu(msg->hdr.tid);
578
579 if (msg->front.iov_len < sizeof(*reply))
580 goto bad;
581 dout("handle_poolop_reply %p tid %llu\n", msg, tid);
582
583 mutex_lock(&monc->mutex);
584 req = __lookup_generic_req(monc, tid);
585 if (req) {
586 if (req->buf_len &&
587 get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
588 msg->front.iov_len - sizeof(*reply),
589 req->buf, req->buf_len) < 0) {
590 mutex_unlock(&monc->mutex);
591 goto bad;
592 }
593 req->result = le32_to_cpu(reply->reply_code);
594 get_generic_request(req);
595 }
596 mutex_unlock(&monc->mutex);
597 if (req) {
598 complete(&req->completion);
599 put_generic_request(req);
600 }
601 return;
602
603bad:
604 pr_err("corrupt generic reply, tid %llu\n", tid);
605 ceph_msg_dump(msg);
606}
607
608/*
609 * Do a synchronous pool op.
610 */
611int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
612 u32 pool, u64 snapid,
613 char *buf, int len)
614{
615 struct ceph_mon_generic_request *req;
616 struct ceph_mon_poolop *h;
617 int err;
618
619 req = kzalloc(sizeof(*req), GFP_NOFS);
620 if (!req)
621 return -ENOMEM;
622
623 kref_init(&req->kref);
624 req->buf = buf;
625 req->buf_len = len;
626 init_completion(&req->completion);
627
628 err = -ENOMEM;
629 req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
630 if (!req->request)
631 goto out;
632 req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
633 if (!req->reply)
634 goto out;
635
636 /* fill out request */
637 req->request->hdr.version = cpu_to_le16(2);
638 h = req->request->front.iov_base;
639 h->monhdr.have_version = 0;
640 h->monhdr.session_mon = cpu_to_le16(-1);
641 h->monhdr.session_mon_tid = 0;
642 h->fsid = monc->monmap->fsid;
643 h->pool = cpu_to_le32(pool);
644 h->op = cpu_to_le32(op);
645 h->auid = 0;
646 h->snapid = cpu_to_le64(snapid);
647 h->name_len = 0;
648
649 err = do_generic_request(monc, req);
650
651out:
652 kref_put(&req->kref, release_generic_request);
653 return err;
654}
655
656int ceph_monc_create_snapid(struct ceph_mon_client *monc,
657 u32 pool, u64 *snapid)
658{
659 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
660 pool, 0, (char *)snapid, sizeof(*snapid));
661
662}
663EXPORT_SYMBOL(ceph_monc_create_snapid);
664
665int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
666 u32 pool, u64 snapid)
667{
668 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
669 pool, snapid, 0, 0);
670
671}
672
673/*
674 * Resend pending generic requests.
675 */
676static void __resend_generic_request(struct ceph_mon_client *monc)
677{
678 struct ceph_mon_generic_request *req;
679 struct rb_node *p;
680
681 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
682 req = rb_entry(p, struct ceph_mon_generic_request, node);
683 ceph_con_revoke(monc->con, req->request);
684 ceph_con_send(monc->con, ceph_msg_get(req->request));
685 }
686}
687
688/*
689 * Delayed work. If we haven't mounted yet, retry. Otherwise,
690 * renew/retry subscription as needed (in case it is timing out, or we
691 * got an ENOMEM). And keep the monitor connection alive.
692 */
693static void delayed_work(struct work_struct *work)
694{
695 struct ceph_mon_client *monc =
696 container_of(work, struct ceph_mon_client, delayed_work.work);
697
698 dout("monc delayed_work\n");
699 mutex_lock(&monc->mutex);
700 if (monc->hunting) {
701 __close_session(monc);
702 __open_session(monc); /* continue hunting */
703 } else {
704 ceph_con_keepalive(monc->con);
705
706 __validate_auth(monc);
707
708 if (monc->auth->ops->is_authenticated(monc->auth))
709 __send_subscribe(monc);
710 }
711 __schedule_delayed(monc);
712 mutex_unlock(&monc->mutex);
713}
714
715/*
716 * On startup, we build a temporary monmap populated with the IPs
717 * provided by mount(2).
718 */
719static int build_initial_monmap(struct ceph_mon_client *monc)
720{
721 struct ceph_options *opt = monc->client->options;
722 struct ceph_entity_addr *mon_addr = opt->mon_addr;
723 int num_mon = opt->num_mon;
724 int i;
725
726 /* build initial monmap */
727 monc->monmap = kzalloc(sizeof(*monc->monmap) +
728 num_mon*sizeof(monc->monmap->mon_inst[0]),
729 GFP_KERNEL);
730 if (!monc->monmap)
731 return -ENOMEM;
732 for (i = 0; i < num_mon; i++) {
733 monc->monmap->mon_inst[i].addr = mon_addr[i];
734 monc->monmap->mon_inst[i].addr.nonce = 0;
735 monc->monmap->mon_inst[i].name.type =
736 CEPH_ENTITY_TYPE_MON;
737 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
738 }
739 monc->monmap->num_mon = num_mon;
740 monc->have_fsid = false;
741 return 0;
742}
743
744int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
745{
746 int err = 0;
747
748 dout("init\n");
749 memset(monc, 0, sizeof(*monc));
750 monc->client = cl;
751 monc->monmap = NULL;
752 mutex_init(&monc->mutex);
753
754 err = build_initial_monmap(monc);
755 if (err)
756 goto out;
757
758 monc->con = NULL;
759
760 /* authentication */
761 monc->auth = ceph_auth_init(cl->options->name,
762 cl->options->secret);
763 if (IS_ERR(monc->auth))
764 return PTR_ERR(monc->auth);
765 monc->auth->want_keys =
766 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
767 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
768
769 /* msgs */
770 err = -ENOMEM;
771 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
772 sizeof(struct ceph_mon_subscribe_ack),
773 GFP_NOFS);
774 if (!monc->m_subscribe_ack)
775 goto out_monmap;
776
777 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
778 if (!monc->m_subscribe)
779 goto out_subscribe_ack;
780
781 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
782 if (!monc->m_auth_reply)
783 goto out_subscribe;
784
785 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
786 monc->pending_auth = 0;
787 if (!monc->m_auth)
788 goto out_auth_reply;
789
790 monc->cur_mon = -1;
791 monc->hunting = true;
792 monc->sub_renew_after = jiffies;
793 monc->sub_sent = 0;
794
795 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
796 monc->generic_request_tree = RB_ROOT;
797 monc->num_generic_requests = 0;
798 monc->last_tid = 0;
799
800 monc->have_mdsmap = 0;
801 monc->have_osdmap = 0;
802 monc->want_next_osdmap = 1;
803 return 0;
804
805out_auth_reply:
806 ceph_msg_put(monc->m_auth_reply);
807out_subscribe:
808 ceph_msg_put(monc->m_subscribe);
809out_subscribe_ack:
810 ceph_msg_put(monc->m_subscribe_ack);
811out_monmap:
812 kfree(monc->monmap);
813out:
814 return err;
815}
816EXPORT_SYMBOL(ceph_monc_init);
817
818void ceph_monc_stop(struct ceph_mon_client *monc)
819{
820 dout("stop\n");
821 cancel_delayed_work_sync(&monc->delayed_work);
822
823 mutex_lock(&monc->mutex);
824 __close_session(monc);
825 if (monc->con) {
826 monc->con->private = NULL;
827 monc->con->ops->put(monc->con);
828 monc->con = NULL;
829 }
830 mutex_unlock(&monc->mutex);
831
832 ceph_auth_destroy(monc->auth);
833
834 ceph_msg_put(monc->m_auth);
835 ceph_msg_put(monc->m_auth_reply);
836 ceph_msg_put(monc->m_subscribe);
837 ceph_msg_put(monc->m_subscribe_ack);
838
839 kfree(monc->monmap);
840}
841EXPORT_SYMBOL(ceph_monc_stop);
842
843static void handle_auth_reply(struct ceph_mon_client *monc,
844 struct ceph_msg *msg)
845{
846 int ret;
847 int was_auth = 0;
848
849 mutex_lock(&monc->mutex);
850 if (monc->auth->ops)
851 was_auth = monc->auth->ops->is_authenticated(monc->auth);
852 monc->pending_auth = 0;
853 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
854 msg->front.iov_len,
855 monc->m_auth->front.iov_base,
856 monc->m_auth->front_max);
857 if (ret < 0) {
858 monc->client->auth_err = ret;
859 wake_up_all(&monc->client->auth_wq);
860 } else if (ret > 0) {
861 __send_prepared_auth_request(monc, ret);
862 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
863 dout("authenticated, starting session\n");
864
865 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
866 monc->client->msgr->inst.name.num =
867 cpu_to_le64(monc->auth->global_id);
868
869 __send_subscribe(monc);
870 __resend_generic_request(monc);
871 }
872 mutex_unlock(&monc->mutex);
873}
874
875static int __validate_auth(struct ceph_mon_client *monc)
876{
877 int ret;
878
879 if (monc->pending_auth)
880 return 0;
881
882 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
883 monc->m_auth->front_max);
884 if (ret <= 0)
885 return ret; /* either an error, or no need to authenticate */
886 __send_prepared_auth_request(monc, ret);
887 return 0;
888}
889
890int ceph_monc_validate_auth(struct ceph_mon_client *monc)
891{
892 int ret;
893
894 mutex_lock(&monc->mutex);
895 ret = __validate_auth(monc);
896 mutex_unlock(&monc->mutex);
897 return ret;
898}
899EXPORT_SYMBOL(ceph_monc_validate_auth);
900
901/*
902 * handle incoming message
903 */
904static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
905{
906 struct ceph_mon_client *monc = con->private;
907 int type = le16_to_cpu(msg->hdr.type);
908
909 if (!monc)
910 return;
911
912 switch (type) {
913 case CEPH_MSG_AUTH_REPLY:
914 handle_auth_reply(monc, msg);
915 break;
916
917 case CEPH_MSG_MON_SUBSCRIBE_ACK:
918 handle_subscribe_ack(monc, msg);
919 break;
920
921 case CEPH_MSG_STATFS_REPLY:
922 handle_statfs_reply(monc, msg);
923 break;
924
925 case CEPH_MSG_POOLOP_REPLY:
926 handle_poolop_reply(monc, msg);
927 break;
928
929 case CEPH_MSG_MON_MAP:
930 ceph_monc_handle_map(monc, msg);
931 break;
932
933 case CEPH_MSG_OSD_MAP:
934 ceph_osdc_handle_map(&monc->client->osdc, msg);
935 break;
936
937 default:
938 /* can the chained handler handle it? */
939 if (monc->client->extra_mon_dispatch &&
940 monc->client->extra_mon_dispatch(monc->client, msg) == 0)
941 break;
942
943 pr_err("received unknown message type %d %s\n", type,
944 ceph_msg_type_name(type));
945 }
946 ceph_msg_put(msg);
947}
948
949/*
950 * Allocate memory for incoming message
951 */
952static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
953 struct ceph_msg_header *hdr,
954 int *skip)
955{
956 struct ceph_mon_client *monc = con->private;
957 int type = le16_to_cpu(hdr->type);
958 int front_len = le32_to_cpu(hdr->front_len);
959 struct ceph_msg *m = NULL;
960
961 *skip = 0;
962
963 switch (type) {
964 case CEPH_MSG_MON_SUBSCRIBE_ACK:
965 m = ceph_msg_get(monc->m_subscribe_ack);
966 break;
967 case CEPH_MSG_POOLOP_REPLY:
968 case CEPH_MSG_STATFS_REPLY:
969 return get_generic_reply(con, hdr, skip);
970 case CEPH_MSG_AUTH_REPLY:
971 m = ceph_msg_get(monc->m_auth_reply);
972 break;
973 case CEPH_MSG_MON_MAP:
974 case CEPH_MSG_MDS_MAP:
975 case CEPH_MSG_OSD_MAP:
976 m = ceph_msg_new(type, front_len, GFP_NOFS);
977 break;
978 }
979
980 if (!m) {
981 pr_info("alloc_msg unknown type %d\n", type);
982 *skip = 1;
983 }
984 return m;
985}
986
987/*
988 * If the monitor connection resets, pick a new monitor and resubmit
989 * any pending requests.
990 */
991static void mon_fault(struct ceph_connection *con)
992{
993 struct ceph_mon_client *monc = con->private;
994
995 if (!monc)
996 return;
997
998 dout("mon_fault\n");
999 mutex_lock(&monc->mutex);
1000 if (!con->private)
1001 goto out;
1002
1003 if (monc->con && !monc->hunting)
1004 pr_info("mon%d %s session lost, "
1005 "hunting for new mon\n", monc->cur_mon,
1006 ceph_pr_addr(&monc->con->peer_addr.in_addr));
1007
1008 __close_session(monc);
1009 if (!monc->hunting) {
1010 /* start hunting */
1011 monc->hunting = true;
1012 __open_session(monc);
1013 } else {
1014 /* already hunting, let's wait a bit */
1015 __schedule_delayed(monc);
1016 }
1017out:
1018 mutex_unlock(&monc->mutex);
1019}
1020
1021static const struct ceph_connection_operations mon_con_ops = {
1022 .get = ceph_con_get,
1023 .put = ceph_con_put,
1024 .dispatch = dispatch,
1025 .fault = mon_fault,
1026 .alloc_msg = mon_alloc_msg,
1027};
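The default case in dispatch() above defers unrecognized monitor messages to client->extra_mon_dispatch and only logs an error when that hook is missing or declines the message (non-zero return). A minimal sketch of such a chained handler, using the signature visible at the call site; example_mon_dispatch() and example_handle_mds_map() are illustrative names, not part of this commit:

	static int example_mon_dispatch(struct ceph_client *client,
					struct ceph_msg *msg)
	{
		switch (le16_to_cpu(msg->hdr.type)) {
		case CEPH_MSG_MDS_MAP:
			/* hypothetical handler; take a ref if the msg must outlive dispatch() */
			example_handle_mds_map(client, msg);
			return 0;	/* handled; dispatch() will not warn */
		default:
			return -1;	/* not ours; dispatch() logs it as unknown */
		}
	}

	/* at client setup time, before messages can arrive: */
	client->extra_mon_dispatch = example_mon_dispatch;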
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c
new file mode 100644
index 000000000000..d5f2d97ac05c
--- /dev/null
+++ b/net/ceph/msgpool.c
@@ -0,0 +1,64 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/err.h>
4#include <linux/sched.h>
5#include <linux/types.h>
6#include <linux/vmalloc.h>
7
8#include <linux/ceph/msgpool.h>
9
10static void *alloc_fn(gfp_t gfp_mask, void *arg)
11{
12 struct ceph_msgpool *pool = arg;
13 void *p;
14
15 p = ceph_msg_new(0, pool->front_len, gfp_mask);
16 if (!p)
17 pr_err("msgpool %s alloc failed\n", pool->name);
18 return p;
19}
20
21static void free_fn(void *element, void *arg)
22{
23 ceph_msg_put(element);
24}
25
26int ceph_msgpool_init(struct ceph_msgpool *pool,
27 int front_len, int size, bool blocking, const char *name)
28{
29 pool->front_len = front_len;
30 pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
31 if (!pool->pool)
32 return -ENOMEM;
33 pool->name = name;
34 return 0;
35}
36
37void ceph_msgpool_destroy(struct ceph_msgpool *pool)
38{
39 mempool_destroy(pool->pool);
40}
41
42struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
43 int front_len)
44{
45 if (front_len > pool->front_len) {
46 pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
47 pool->name, front_len, pool->front_len);
48 WARN_ON(1);
49
50 /* try to alloc a fresh message */
51 return ceph_msg_new(0, front_len, GFP_NOFS);
52 }
53
54 return mempool_alloc(pool->pool, GFP_NOFS);
55}
56
57void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
58{
59 /* reset msg front_len; user may have changed it */
60 msg->front.iov_len = pool->front_len;
61 msg->hdr.front_len = cpu_to_le32(pool->front_len);
62
63 kref_init(&msg->kref); /* retake single ref */
64}
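ceph_msgpool_put() above does not free the message; it restores front_len and re-initializes the kref so the element goes back into the mempool ready for reuse. A minimal usage sketch of this pool API (illustrative only: the 512-byte front length, pool size of 4, and the "example" name are arbitrary, and error handling is trimmed):

	static int msgpool_example(void)
	{
		struct ceph_msgpool pool;
		struct ceph_msg *msg;
		int err;

		err = ceph_msgpool_init(&pool, 512, 4, true, "example");
		if (err < 0)
			return err;		/* -ENOMEM: preallocation failed */

		msg = ceph_msgpool_get(&pool, 512);	/* hands back a pooled msg */
		/* ... fill msg->front.iov_base and use the message ... */
		ceph_msgpool_put(&pool, msg);	/* reset it and return it to the pool */

		ceph_msgpool_destroy(&pool);
		return 0;
	}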
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
new file mode 100644
index 000000000000..79391994b3ed
--- /dev/null
+++ b/net/ceph/osd_client.c
@@ -0,0 +1,1773 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/module.h>
4#include <linux/err.h>
5#include <linux/highmem.h>
6#include <linux/mm.h>
7#include <linux/pagemap.h>
8#include <linux/slab.h>
9#include <linux/uaccess.h>
10#ifdef CONFIG_BLOCK
11#include <linux/bio.h>
12#endif
13
14#include <linux/ceph/libceph.h>
15#include <linux/ceph/osd_client.h>
16#include <linux/ceph/messenger.h>
17#include <linux/ceph/decode.h>
18#include <linux/ceph/auth.h>
19#include <linux/ceph/pagelist.h>
20
21#define OSD_OP_FRONT_LEN 4096
22#define OSD_OPREPLY_FRONT_LEN 512
23
24static const struct ceph_connection_operations osd_con_ops;
25static int __kick_requests(struct ceph_osd_client *osdc,
26 struct ceph_osd *kickosd);
27
28static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
29
30static int op_needs_trail(int op)
31{
32 switch (op) {
33 case CEPH_OSD_OP_GETXATTR:
34 case CEPH_OSD_OP_SETXATTR:
35 case CEPH_OSD_OP_CMPXATTR:
36 case CEPH_OSD_OP_CALL:
37 return 1;
38 default:
39 return 0;
40 }
41}
42
43static int op_has_extent(int op)
44{
45 return (op == CEPH_OSD_OP_READ ||
46 op == CEPH_OSD_OP_WRITE);
47}
48
49void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
50 struct ceph_file_layout *layout,
51 u64 snapid,
52 u64 off, u64 *plen, u64 *bno,
53 struct ceph_osd_request *req,
54 struct ceph_osd_req_op *op)
55{
56 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59
60 reqhead->snapid = cpu_to_le64(snapid);
61
62 /* object extent? */
63 ceph_calc_file_object_mapping(layout, off, plen, bno,
64 &objoff, &objlen);
65 if (*plen < orig_len)
66 dout(" skipping last %llu, final file extent %llu~%llu\n",
67 orig_len - *plen, off, *plen);
68
69 if (op_has_extent(op->op)) {
70 op->extent.offset = objoff;
71 op->extent.length = objlen;
72 }
73 req->r_num_pages = calc_pages_for(off, *plen);
74 if (op->op == CEPH_OSD_OP_WRITE)
75 op->payload_len = *plen;
76
77 dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
78 *bno, objoff, objlen, req->r_num_pages);
79
80}
81EXPORT_SYMBOL(ceph_calc_raw_layout);
82
83/*
84 * Implement client access to distributed object storage cluster.
85 *
86 * All data objects are stored within a cluster/cloud of OSDs, or
87 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
88 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
89 * remote daemons serving up and coordinating consistent and safe
90 * access to storage.
91 *
92 * Cluster membership and the mapping of data objects onto storage devices
93 * are described by the osd map.
94 *
95 * We keep track of pending OSD requests (read, write), resubmit
96 * requests to different OSDs when the cluster topology/data layout
97 * change, or retry the affected requests when the communications
98 * channel with an OSD is reset.
99 */
100
101/*
102 * calculate the mapping of a file extent onto an object, and fill out the
103 * request accordingly. shorten extent as necessary if it crosses an
104 * object boundary.
105 *
106 * fill osd op in request message.
107 */
108static void calc_layout(struct ceph_osd_client *osdc,
109 struct ceph_vino vino,
110 struct ceph_file_layout *layout,
111 u64 off, u64 *plen,
112 struct ceph_osd_request *req,
113 struct ceph_osd_req_op *op)
114{
115 u64 bno;
116
117 ceph_calc_raw_layout(osdc, layout, vino.snap, off,
118 plen, &bno, req, op);
119
120 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
121 req->r_oid_len = strlen(req->r_oid);
122}
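/*
 * Worked example (illustrative, not part of this commit): for vino.ino
 * 0x10002 and object number bno 3, the sprintf() above yields the oid
 * "10002.00000003": the inode in hex, a '.', then the object number as
 * eight zero-padded hex digits.
 */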
123
124/*
125 * requests
126 */
127void ceph_osdc_release_request(struct kref *kref)
128{
129 struct ceph_osd_request *req = container_of(kref,
130 struct ceph_osd_request,
131 r_kref);
132
133 if (req->r_request)
134 ceph_msg_put(req->r_request);
135 if (req->r_reply)
136 ceph_msg_put(req->r_reply);
137 if (req->r_con_filling_msg) {
138 dout("release_request revoking pages %p from con %p\n",
139 req->r_pages, req->r_con_filling_msg);
140 ceph_con_revoke_message(req->r_con_filling_msg,
141 req->r_reply);
142 ceph_con_put(req->r_con_filling_msg);
143 }
144 if (req->r_own_pages)
145 ceph_release_page_vector(req->r_pages,
146 req->r_num_pages);
147#ifdef CONFIG_BLOCK
148 if (req->r_bio)
149 bio_put(req->r_bio);
150#endif
151 ceph_put_snap_context(req->r_snapc);
152 if (req->r_trail) {
153 ceph_pagelist_release(req->r_trail);
154 kfree(req->r_trail);
155 }
156 if (req->r_mempool)
157 mempool_free(req, req->r_osdc->req_mempool);
158 else
159 kfree(req);
160}
161EXPORT_SYMBOL(ceph_osdc_release_request);
162
163static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail)
164{
165 int i = 0;
166
167 if (needs_trail)
168 *needs_trail = 0;
169 while (ops[i].op) {
170 if (needs_trail && op_needs_trail(ops[i].op))
171 *needs_trail = 1;
172 i++;
173 }
174
175 return i;
176}
177
178struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
179 int flags,
180 struct ceph_snap_context *snapc,
181 struct ceph_osd_req_op *ops,
182 bool use_mempool,
183 gfp_t gfp_flags,
184 struct page **pages,
185 struct bio *bio)
186{
187 struct ceph_osd_request *req;
188 struct ceph_msg *msg;
189 int needs_trail;
190 int num_op = get_num_ops(ops, &needs_trail);
191 size_t msg_size = sizeof(struct ceph_osd_request_head);
192
193 msg_size += num_op*sizeof(struct ceph_osd_op);
194
195 if (use_mempool) {
196 req = mempool_alloc(osdc->req_mempool, gfp_flags);
197 memset(req, 0, sizeof(*req));
198 } else {
199 req = kzalloc(sizeof(*req), gfp_flags);
200 }
201 if (req == NULL)
202 return NULL;
203
204 req->r_osdc = osdc;
205 req->r_mempool = use_mempool;
206
207 kref_init(&req->r_kref);
208 init_completion(&req->r_completion);
209 init_completion(&req->r_safe_completion);
210 INIT_LIST_HEAD(&req->r_unsafe_item);
211 req->r_flags = flags;
212
213 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
214
215 /* create reply message */
216 if (use_mempool)
217 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
218 else
219 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
220 OSD_OPREPLY_FRONT_LEN, gfp_flags);
221 if (!msg) {
222 ceph_osdc_put_request(req);
223 return NULL;
224 }
225 req->r_reply = msg;
226
227 /* allocate space for the trailing data */
228 if (needs_trail) {
229 req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags);
230 if (!req->r_trail) {
231 ceph_osdc_put_request(req);
232 return NULL;
233 }
234 ceph_pagelist_init(req->r_trail);
235 }
236 /* create request message; allow space for oid */
237 msg_size += 40;
238 if (snapc)
239 msg_size += sizeof(u64) * snapc->num_snaps;
240 if (use_mempool)
241 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
242 else
243 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags);
244 if (!msg) {
245 ceph_osdc_put_request(req);
246 return NULL;
247 }
248
249 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
250 memset(msg->front.iov_base, 0, msg->front.iov_len);
251
252 req->r_request = msg;
253 req->r_pages = pages;
254#ifdef CONFIG_BLOCK
255 if (bio) {
256 req->r_bio = bio;
257 bio_get(req->r_bio);
258 }
259#endif
260
261 return req;
262}
263EXPORT_SYMBOL(ceph_osdc_alloc_request);
264
265static void osd_req_encode_op(struct ceph_osd_request *req,
266 struct ceph_osd_op *dst,
267 struct ceph_osd_req_op *src)
268{
269 dst->op = cpu_to_le16(src->op);
270
271 switch (dst->op) {
272 case CEPH_OSD_OP_READ:
273 case CEPH_OSD_OP_WRITE:
274 dst->extent.offset =
275 cpu_to_le64(src->extent.offset);
276 dst->extent.length =
277 cpu_to_le64(src->extent.length);
278 dst->extent.truncate_size =
279 cpu_to_le64(src->extent.truncate_size);
280 dst->extent.truncate_seq =
281 cpu_to_le32(src->extent.truncate_seq);
282 break;
283
284 case CEPH_OSD_OP_GETXATTR:
285 case CEPH_OSD_OP_SETXATTR:
286 case CEPH_OSD_OP_CMPXATTR:
287 BUG_ON(!req->r_trail);
288
289 dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
290 dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
291 dst->xattr.cmp_op = src->xattr.cmp_op;
292 dst->xattr.cmp_mode = src->xattr.cmp_mode;
293 ceph_pagelist_append(req->r_trail, src->xattr.name,
294 src->xattr.name_len);
295 ceph_pagelist_append(req->r_trail, src->xattr.val,
296 src->xattr.value_len);
297 break;
298 case CEPH_OSD_OP_CALL:
299 BUG_ON(!req->r_trail);
300
301 dst->cls.class_len = src->cls.class_len;
302 dst->cls.method_len = src->cls.method_len;
303 dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
304
305 ceph_pagelist_append(req->r_trail, src->cls.class_name,
306 src->cls.class_len);
307 ceph_pagelist_append(req->r_trail, src->cls.method_name,
308 src->cls.method_len);
309 ceph_pagelist_append(req->r_trail, src->cls.indata,
310 src->cls.indata_len);
311 break;
312 case CEPH_OSD_OP_ROLLBACK:
313 dst->snap.snapid = cpu_to_le64(src->snap.snapid);
314 break;
315 case CEPH_OSD_OP_STARTSYNC:
316 break;
317 default:
318 pr_err("unrecognized osd opcode %d\n", dst->op);
319 WARN_ON(1);
320 break;
321 }
322 dst->payload_len = cpu_to_le32(src->payload_len);
323}
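/*
 * Illustrative trail layout (not part of this commit): for a
 * CEPH_OSD_OP_CALL with class "rbd" and method "snap_list", the encoding
 * above sets cls.class_len = 3 and cls.method_len = 9, then appends
 * "rbd", "snap_list", and finally the indata bytes to req->r_trail,
 * in that order.
 */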
324
325/*
326 * build new request AND message
327 *
328 */
329void ceph_osdc_build_request(struct ceph_osd_request *req,
330 u64 off, u64 *plen,
331 struct ceph_osd_req_op *src_ops,
332 struct ceph_snap_context *snapc,
333 struct timespec *mtime,
334 const char *oid,
335 int oid_len)
336{
337 struct ceph_msg *msg = req->r_request;
338 struct ceph_osd_request_head *head;
339 struct ceph_osd_req_op *src_op;
340 struct ceph_osd_op *op;
341 void *p;
342 int num_op = get_num_ops(src_ops, NULL);
343 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
344 int flags = req->r_flags;
345 u64 data_len = 0;
346 int i;
347
348 head = msg->front.iov_base;
349 op = (void *)(head + 1);
350 p = (void *)(op + num_op);
351
352 req->r_snapc = ceph_get_snap_context(snapc);
353
354 head->client_inc = cpu_to_le32(1); /* always, for now. */
355 head->flags = cpu_to_le32(flags);
356 if (flags & CEPH_OSD_FLAG_WRITE)
357 ceph_encode_timespec(&head->mtime, mtime);
358 head->num_ops = cpu_to_le16(num_op);
359
360
361 /* fill in oid */
362 head->object_len = cpu_to_le32(oid_len);
363 memcpy(p, oid, oid_len);
364 p += oid_len;
365
366 src_op = src_ops;
367 while (src_op->op) {
368 osd_req_encode_op(req, op, src_op);
369 src_op++;
370 op++;
371 }
372
373 if (req->r_trail)
374 data_len += req->r_trail->length;
375
376 if (snapc) {
377 head->snap_seq = cpu_to_le64(snapc->seq);
378 head->num_snaps = cpu_to_le32(snapc->num_snaps);
379 for (i = 0; i < snapc->num_snaps; i++) {
380 put_unaligned_le64(snapc->snaps[i], p);
381 p += sizeof(u64);
382 }
383 }
384
385 if (flags & CEPH_OSD_FLAG_WRITE) {
386 req->r_request->hdr.data_off = cpu_to_le16(off);
387 req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len);
388 } else if (data_len) {
389 req->r_request->hdr.data_off = 0;
390 req->r_request->hdr.data_len = cpu_to_le32(data_len);
391 }
392
393 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
394 msg_size = p - msg->front.iov_base;
395 msg->front.iov_len = msg_size;
396 msg->hdr.front_len = cpu_to_le32(msg_size);
397 return;
398}
399EXPORT_SYMBOL(ceph_osdc_build_request);
400
401/*
402 * build new request AND message, calculate layout, and adjust file
403 * extent as needed.
404 *
405 * if the file was recently truncated, we include information about its
406 * old and new size so that the object can be updated appropriately. (we
407 * avoid synchronously deleting truncated objects because it's slow.)
408 *
409 * if @do_sync, include a 'startsync' command so that the osd will flush
410 * data quickly.
411 */
412struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
413 struct ceph_file_layout *layout,
414 struct ceph_vino vino,
415 u64 off, u64 *plen,
416 int opcode, int flags,
417 struct ceph_snap_context *snapc,
418 int do_sync,
419 u32 truncate_seq,
420 u64 truncate_size,
421 struct timespec *mtime,
422 bool use_mempool, int num_reply)
423{
424 struct ceph_osd_req_op ops[3];
425 struct ceph_osd_request *req;
426
427 ops[0].op = opcode;
428 ops[0].extent.truncate_seq = truncate_seq;
429 ops[0].extent.truncate_size = truncate_size;
430 ops[0].payload_len = 0;
431
432 if (do_sync) {
433 ops[1].op = CEPH_OSD_OP_STARTSYNC;
434 ops[1].payload_len = 0;
435 ops[2].op = 0;
436 } else
437 ops[1].op = 0;
438
439 req = ceph_osdc_alloc_request(osdc, flags,
440 snapc, ops,
441 use_mempool,
442 GFP_NOFS, NULL, NULL);
443 if (IS_ERR(req))
444 return req;
445
446 /* calculate max write size */
447 calc_layout(osdc, vino, layout, off, plen, req, ops);
448 req->r_file_layout = *layout; /* keep a copy */
449
450 ceph_osdc_build_request(req, off, plen, ops,
451 snapc,
452 mtime,
453 req->r_oid, req->r_oid_len);
454
455 return req;
456}
457EXPORT_SYMBOL(ceph_osdc_new_request);
458
459/*
460 * We keep osd requests in an rbtree, sorted by ->r_tid.
461 */
462static void __insert_request(struct ceph_osd_client *osdc,
463 struct ceph_osd_request *new)
464{
465 struct rb_node **p = &osdc->requests.rb_node;
466 struct rb_node *parent = NULL;
467 struct ceph_osd_request *req = NULL;
468
469 while (*p) {
470 parent = *p;
471 req = rb_entry(parent, struct ceph_osd_request, r_node);
472 if (new->r_tid < req->r_tid)
473 p = &(*p)->rb_left;
474 else if (new->r_tid > req->r_tid)
475 p = &(*p)->rb_right;
476 else
477 BUG();
478 }
479
480 rb_link_node(&new->r_node, parent, p);
481 rb_insert_color(&new->r_node, &osdc->requests);
482}
483
484static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
485 u64 tid)
486{
487 struct ceph_osd_request *req;
488 struct rb_node *n = osdc->requests.rb_node;
489
490 while (n) {
491 req = rb_entry(n, struct ceph_osd_request, r_node);
492 if (tid < req->r_tid)
493 n = n->rb_left;
494 else if (tid > req->r_tid)
495 n = n->rb_right;
496 else
497 return req;
498 }
499 return NULL;
500}
501
502static struct ceph_osd_request *
503__lookup_request_ge(struct ceph_osd_client *osdc,
504 u64 tid)
505{
506 struct ceph_osd_request *req;
507 struct rb_node *n = osdc->requests.rb_node;
508
509 while (n) {
510 req = rb_entry(n, struct ceph_osd_request, r_node);
511 if (tid < req->r_tid) {
512 if (!n->rb_left)
513 return req;
514 n = n->rb_left;
515 } else if (tid > req->r_tid) {
516 n = n->rb_right;
517 } else {
518 return req;
519 }
520 }
521 return NULL;
522}
523
524
525/*
526 * If the osd connection drops, we need to resubmit all requests.
527 */
528static void osd_reset(struct ceph_connection *con)
529{
530 struct ceph_osd *osd = con->private;
531 struct ceph_osd_client *osdc;
532
533 if (!osd)
534 return;
535 dout("osd_reset osd%d\n", osd->o_osd);
536 osdc = osd->o_osdc;
537 down_read(&osdc->map_sem);
538 kick_requests(osdc, osd);
539 up_read(&osdc->map_sem);
540}
541
542/*
543 * Track open sessions with osds.
544 */
545static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
546{
547 struct ceph_osd *osd;
548
549 osd = kzalloc(sizeof(*osd), GFP_NOFS);
550 if (!osd)
551 return NULL;
552
553 atomic_set(&osd->o_ref, 1);
554 osd->o_osdc = osdc;
555 INIT_LIST_HEAD(&osd->o_requests);
556 INIT_LIST_HEAD(&osd->o_osd_lru);
557 osd->o_incarnation = 1;
558
559 ceph_con_init(osdc->client->msgr, &osd->o_con);
560 osd->o_con.private = osd;
561 osd->o_con.ops = &osd_con_ops;
562 osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
563
564 INIT_LIST_HEAD(&osd->o_keepalive_item);
565 return osd;
566}
567
568static struct ceph_osd *get_osd(struct ceph_osd *osd)
569{
570 if (atomic_inc_not_zero(&osd->o_ref)) {
571 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
572 atomic_read(&osd->o_ref));
573 return osd;
574 } else {
575 dout("get_osd %p FAIL\n", osd);
576 return NULL;
577 }
578}
579
580static void put_osd(struct ceph_osd *osd)
581{
582 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
583 atomic_read(&osd->o_ref) - 1);
584 if (atomic_dec_and_test(&osd->o_ref)) {
585 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
586
587 if (osd->o_authorizer)
588 ac->ops->destroy_authorizer(ac, osd->o_authorizer);
589 kfree(osd);
590 }
591}
592
593/*
594 * remove an osd from our map
595 */
596static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
597{
598 dout("__remove_osd %p\n", osd);
599 BUG_ON(!list_empty(&osd->o_requests));
600 rb_erase(&osd->o_node, &osdc->osds);
601 list_del_init(&osd->o_osd_lru);
602 ceph_con_close(&osd->o_con);
603 put_osd(osd);
604}
605
606static void __move_osd_to_lru(struct ceph_osd_client *osdc,
607 struct ceph_osd *osd)
608{
609 dout("__move_osd_to_lru %p\n", osd);
610 BUG_ON(!list_empty(&osd->o_osd_lru));
611 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
612 osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
613}
614
615static void __remove_osd_from_lru(struct ceph_osd *osd)
616{
617 dout("__remove_osd_from_lru %p\n", osd);
618 if (!list_empty(&osd->o_osd_lru))
619 list_del_init(&osd->o_osd_lru);
620}
621
622static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
623{
624 struct ceph_osd *osd, *nosd;
625
626 dout("__remove_old_osds %p\n", osdc);
627 mutex_lock(&osdc->request_mutex);
628 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
629 if (!remove_all && time_before(jiffies, osd->lru_ttl))
630 break;
631 __remove_osd(osdc, osd);
632 }
633 mutex_unlock(&osdc->request_mutex);
634}
635
636/*
637 * reset osd connect
638 */
639static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
640{
641 struct ceph_osd_request *req;
642 int ret = 0;
643
644 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
645 if (list_empty(&osd->o_requests)) {
646 __remove_osd(osdc, osd);
647 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
648 &osd->o_con.peer_addr,
649 sizeof(osd->o_con.peer_addr)) == 0 &&
650 !ceph_con_opened(&osd->o_con)) {
651 dout(" osd addr hasn't changed and connection never opened,"
652 " letting msgr retry");
653 /* touch each r_stamp for handle_timeout()'s benefit */
654 list_for_each_entry(req, &osd->o_requests, r_osd_item)
655 req->r_stamp = jiffies;
656 ret = -EAGAIN;
657 } else {
658 ceph_con_close(&osd->o_con);
659 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
660 osd->o_incarnation++;
661 }
662 return ret;
663}
664
665static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
666{
667 struct rb_node **p = &osdc->osds.rb_node;
668 struct rb_node *parent = NULL;
669 struct ceph_osd *osd = NULL;
670
671 while (*p) {
672 parent = *p;
673 osd = rb_entry(parent, struct ceph_osd, o_node);
674 if (new->o_osd < osd->o_osd)
675 p = &(*p)->rb_left;
676 else if (new->o_osd > osd->o_osd)
677 p = &(*p)->rb_right;
678 else
679 BUG();
680 }
681
682 rb_link_node(&new->o_node, parent, p);
683 rb_insert_color(&new->o_node, &osdc->osds);
684}
685
686static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
687{
688 struct ceph_osd *osd;
689 struct rb_node *n = osdc->osds.rb_node;
690
691 while (n) {
692 osd = rb_entry(n, struct ceph_osd, o_node);
693 if (o < osd->o_osd)
694 n = n->rb_left;
695 else if (o > osd->o_osd)
696 n = n->rb_right;
697 else
698 return osd;
699 }
700 return NULL;
701}
702
703static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
704{
705 schedule_delayed_work(&osdc->timeout_work,
706 osdc->client->options->osd_keepalive_timeout * HZ);
707}
708
709static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
710{
711 cancel_delayed_work(&osdc->timeout_work);
712}
713
714/*
715 * Register request, assign tid. If this is the first request, set up
716 * the timeout event.
717 */
718static void register_request(struct ceph_osd_client *osdc,
719 struct ceph_osd_request *req)
720{
721 mutex_lock(&osdc->request_mutex);
722 req->r_tid = ++osdc->last_tid;
723 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
724 INIT_LIST_HEAD(&req->r_req_lru_item);
725
726 dout("register_request %p tid %lld\n", req, req->r_tid);
727 __insert_request(osdc, req);
728 ceph_osdc_get_request(req);
729 osdc->num_requests++;
730
731 if (osdc->num_requests == 1) {
732 dout(" first request, scheduling timeout\n");
733 __schedule_osd_timeout(osdc);
734 }
735 mutex_unlock(&osdc->request_mutex);
736}
737
738/*
739 * called under osdc->request_mutex
740 */
741static void __unregister_request(struct ceph_osd_client *osdc,
742 struct ceph_osd_request *req)
743{
744 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
745 rb_erase(&req->r_node, &osdc->requests);
746 osdc->num_requests--;
747
748 if (req->r_osd) {
749 /* make sure the original request isn't in flight. */
750 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
751
752 list_del_init(&req->r_osd_item);
753 if (list_empty(&req->r_osd->o_requests))
754 __move_osd_to_lru(osdc, req->r_osd);
755 req->r_osd = NULL;
756 }
757
758 ceph_osdc_put_request(req);
759
760 list_del_init(&req->r_req_lru_item);
761 if (osdc->num_requests == 0) {
762 dout(" no requests, canceling timeout\n");
763 __cancel_osd_timeout(osdc);
764 }
765}
766
767/*
768 * Cancel a previously queued request message
769 */
770static void __cancel_request(struct ceph_osd_request *req)
771{
772 if (req->r_sent && req->r_osd) {
773 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
774 req->r_sent = 0;
775 }
776 list_del_init(&req->r_req_lru_item);
777}
778
779/*
780 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
781 * (as needed), and set the request r_osd appropriately. If there is
782 * no up osd, set r_osd to NULL.
783 *
784 * Return 0 if unchanged, 1 if changed, or negative on error.
785 *
786 * Caller should hold map_sem for read and request_mutex.
787 */
788static int __map_osds(struct ceph_osd_client *osdc,
789 struct ceph_osd_request *req)
790{
791 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
792 struct ceph_pg pgid;
793 int acting[CEPH_PG_MAX_SIZE];
794 int o = -1, num = 0;
795 int err;
796
797 dout("map_osds %p tid %lld\n", req, req->r_tid);
798 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
799 &req->r_file_layout, osdc->osdmap);
800 if (err)
801 return err;
802 pgid = reqhead->layout.ol_pgid;
803 req->r_pgid = pgid;
804
805 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
806 if (err > 0) {
807 o = acting[0];
808 num = err;
809 }
810
811 if ((req->r_osd && req->r_osd->o_osd == o &&
812 req->r_sent >= req->r_osd->o_incarnation &&
813 req->r_num_pg_osds == num &&
814 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
815 (req->r_osd == NULL && o == -1))
816 return 0; /* no change */
817
818 dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
819 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
820 req->r_osd ? req->r_osd->o_osd : -1);
821
822 /* record full pg acting set */
823 memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
824 req->r_num_pg_osds = num;
825
826 if (req->r_osd) {
827 __cancel_request(req);
828 list_del_init(&req->r_osd_item);
829 req->r_osd = NULL;
830 }
831
832 req->r_osd = __lookup_osd(osdc, o);
833 if (!req->r_osd && o >= 0) {
834 err = -ENOMEM;
835 req->r_osd = create_osd(osdc);
836 if (!req->r_osd)
837 goto out;
838
839 dout("map_osds osd %p is osd%d\n", req->r_osd, o);
840 req->r_osd->o_osd = o;
841 req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
842 __insert_osd(osdc, req->r_osd);
843
844 ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
845 }
846
847 if (req->r_osd) {
848 __remove_osd_from_lru(req->r_osd);
849 list_add(&req->r_osd_item, &req->r_osd->o_requests);
850 }
851 err = 1; /* osd or pg changed */
852
853out:
854 return err;
855}
856
857/*
858 * caller should hold map_sem (for read) and request_mutex
859 */
860static int __send_request(struct ceph_osd_client *osdc,
861 struct ceph_osd_request *req)
862{
863 struct ceph_osd_request_head *reqhead;
864 int err;
865
866 err = __map_osds(osdc, req);
867 if (err < 0)
868 return err;
869 if (req->r_osd == NULL) {
870 dout("send_request %p no up osds in pg\n", req);
871 ceph_monc_request_next_osdmap(&osdc->client->monc);
872 return 0;
873 }
874
875 dout("send_request %p tid %llu to osd%d flags %d\n",
876 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
877
878 reqhead = req->r_request->front.iov_base;
879 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
880 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
881 reqhead->reassert_version = req->r_reassert_version;
882
883 req->r_stamp = jiffies;
884 list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
885
886 ceph_msg_get(req->r_request); /* send consumes a ref */
887 ceph_con_send(&req->r_osd->o_con, req->r_request);
888 req->r_sent = req->r_osd->o_incarnation;
889 return 0;
890}
891
892/*
893 * Timeout callback, called every N seconds when 1 or more osd
894 * requests have been active for more than N seconds. When this
895 * happens, we ping all OSDs with requests who have timed out to
896 * ensure any communications channel reset is detected. Reset the
897 * request timeouts another N seconds in the future as we go.
898 * Reschedule the timeout event another N seconds in future (unless
899 * there are no open requests).
900 */
901static void handle_timeout(struct work_struct *work)
902{
903 struct ceph_osd_client *osdc =
904 container_of(work, struct ceph_osd_client, timeout_work.work);
905 struct ceph_osd_request *req, *last_req = NULL;
906 struct ceph_osd *osd;
907 unsigned long timeout = osdc->client->options->osd_timeout * HZ;
908 unsigned long keepalive =
909 osdc->client->options->osd_keepalive_timeout * HZ;
910 unsigned long last_stamp = 0;
911 struct rb_node *p;
912 struct list_head slow_osds;
913
914 dout("timeout\n");
915 down_read(&osdc->map_sem);
916
917 ceph_monc_request_next_osdmap(&osdc->client->monc);
918
919 mutex_lock(&osdc->request_mutex);
920 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
921 req = rb_entry(p, struct ceph_osd_request, r_node);
922
923 if (req->r_resend) {
924 int err;
925
926 dout("osdc resending prev failed %lld\n", req->r_tid);
927 err = __send_request(osdc, req);
928 if (err)
929 dout("osdc failed again on %lld\n", req->r_tid);
930 else
931 req->r_resend = false;
932 continue;
933 }
934 }
935
936 /*
937 * reset osds that appear to be _really_ unresponsive. this
938 * is a failsafe measure.. we really shouldn't be getting to
939 * this point if the system is working properly. the monitors
940 * should mark the osd as failed and we should find out about
941 * it from an updated osd map.
942 */
943 while (timeout && !list_empty(&osdc->req_lru)) {
944 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
945 r_req_lru_item);
946
947 if (time_before(jiffies, req->r_stamp + timeout))
948 break;
949
950 BUG_ON(req == last_req && req->r_stamp == last_stamp);
951 last_req = req;
952 last_stamp = req->r_stamp;
953
954 osd = req->r_osd;
955 BUG_ON(!osd);
956 pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
957 req->r_tid, osd->o_osd);
958 __kick_requests(osdc, osd);
959 }
960
961 /*
962 * ping osds that are a bit slow. this ensures that if there
963 * is a break in the TCP connection we will notice, and reopen
964 * a connection with that osd (from the fault callback).
965 */
966 INIT_LIST_HEAD(&slow_osds);
967 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
968 if (time_before(jiffies, req->r_stamp + keepalive))
969 break;
970
971 osd = req->r_osd;
972 BUG_ON(!osd);
973 dout(" tid %llu is slow, will send keepalive on osd%d\n",
974 req->r_tid, osd->o_osd);
975 list_move_tail(&osd->o_keepalive_item, &slow_osds);
976 }
977 while (!list_empty(&slow_osds)) {
978 osd = list_entry(slow_osds.next, struct ceph_osd,
979 o_keepalive_item);
980 list_del_init(&osd->o_keepalive_item);
981 ceph_con_keepalive(&osd->o_con);
982 }
983
984 __schedule_osd_timeout(osdc);
985 mutex_unlock(&osdc->request_mutex);
986
987 up_read(&osdc->map_sem);
988}
989
990static void handle_osds_timeout(struct work_struct *work)
991{
992 struct ceph_osd_client *osdc =
993 container_of(work, struct ceph_osd_client,
994 osds_timeout_work.work);
995 unsigned long delay =
996 osdc->client->options->osd_idle_ttl * HZ >> 2;
997
998 dout("osds timeout\n");
999 down_read(&osdc->map_sem);
1000 remove_old_osds(osdc, 0);
1001 up_read(&osdc->map_sem);
1002
1003 schedule_delayed_work(&osdc->osds_timeout_work,
1004 round_jiffies_relative(delay));
1005}
1006
1007/*
1008 * handle osd op reply. either call the callback if it is specified,
1009 * or do the completion to wake up the waiting thread.
1010 */
1011static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1012 struct ceph_connection *con)
1013{
1014 struct ceph_osd_reply_head *rhead = msg->front.iov_base;
1015 struct ceph_osd_request *req;
1016 u64 tid;
1017 int numops, object_len, flags;
1018 s32 result;
1019
1020 tid = le64_to_cpu(msg->hdr.tid);
1021 if (msg->front.iov_len < sizeof(*rhead))
1022 goto bad;
1023 numops = le32_to_cpu(rhead->num_ops);
1024 object_len = le32_to_cpu(rhead->object_len);
1025 result = le32_to_cpu(rhead->result);
1026 if (msg->front.iov_len != sizeof(*rhead) + object_len +
1027 numops * sizeof(struct ceph_osd_op))
1028 goto bad;
1029 dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
1030
1031 /* lookup */
1032 mutex_lock(&osdc->request_mutex);
1033 req = __lookup_request(osdc, tid);
1034 if (req == NULL) {
1035 dout("handle_reply tid %llu dne\n", tid);
1036 mutex_unlock(&osdc->request_mutex);
1037 return;
1038 }
1039 ceph_osdc_get_request(req);
1040 flags = le32_to_cpu(rhead->flags);
1041
1042 /*
1043 * if this connection filled our message, drop our reference now, to
1044 * avoid a (safe but slower) revoke later.
1045 */
1046 if (req->r_con_filling_msg == con && req->r_reply == msg) {
1047 dout(" dropping con_filling_msg ref %p\n", con);
1048 req->r_con_filling_msg = NULL;
1049 ceph_con_put(con);
1050 }
1051
1052 if (!req->r_got_reply) {
1053 unsigned bytes;
1054
1055 req->r_result = le32_to_cpu(rhead->result);
1056 bytes = le32_to_cpu(msg->hdr.data_len);
1057 dout("handle_reply result %d bytes %d\n", req->r_result,
1058 bytes);
1059 if (req->r_result == 0)
1060 req->r_result = bytes;
1061
1062 /* in case this is a write and we need to replay, */
1063 req->r_reassert_version = rhead->reassert_version;
1064
1065 req->r_got_reply = 1;
1066 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
1067 dout("handle_reply tid %llu dup ack\n", tid);
1068 mutex_unlock(&osdc->request_mutex);
1069 goto done;
1070 }
1071
1072 dout("handle_reply tid %llu flags %d\n", tid, flags);
1073
1074 /* either this is a read, or we got the safe response */
1075 if (result < 0 ||
1076 (flags & CEPH_OSD_FLAG_ONDISK) ||
1077 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
1078 __unregister_request(osdc, req);
1079
1080 mutex_unlock(&osdc->request_mutex);
1081
1082 if (req->r_callback)
1083 req->r_callback(req, msg);
1084 else
1085 complete_all(&req->r_completion);
1086
1087 if (flags & CEPH_OSD_FLAG_ONDISK) {
1088 if (req->r_safe_callback)
1089 req->r_safe_callback(req, msg);
1090 complete_all(&req->r_safe_completion); /* fsync waiter */
1091 }
1092
1093done:
1094 ceph_osdc_put_request(req);
1095 return;
1096
1097bad:
1098 pr_err("corrupt osd_op_reply got %d %d expected %d\n",
1099 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
1100 (int)sizeof(*rhead));
1101 ceph_msg_dump(msg);
1102}
1103
1104
1105static int __kick_requests(struct ceph_osd_client *osdc,
1106 struct ceph_osd *kickosd)
1107{
1108 struct ceph_osd_request *req;
1109 struct rb_node *p, *n;
1110 int needmap = 0;
1111 int err;
1112
1113 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
1114 if (kickosd) {
1115 err = __reset_osd(osdc, kickosd);
1116 if (err == -EAGAIN)
1117 return 1;
1118 } else {
1119 for (p = rb_first(&osdc->osds); p; p = n) {
1120 struct ceph_osd *osd =
1121 rb_entry(p, struct ceph_osd, o_node);
1122
1123 n = rb_next(p);
1124 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
1125 memcmp(&osd->o_con.peer_addr,
1126 ceph_osd_addr(osdc->osdmap,
1127 osd->o_osd),
1128 sizeof(struct ceph_entity_addr)) != 0)
1129 __reset_osd(osdc, osd);
1130 }
1131 }
1132
1133 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
1134 req = rb_entry(p, struct ceph_osd_request, r_node);
1135
1136 if (req->r_resend) {
1137 dout(" r_resend set on tid %llu\n", req->r_tid);
1138 __cancel_request(req);
1139 goto kick;
1140 }
1141 if (req->r_osd && kickosd == req->r_osd) {
1142 __cancel_request(req);
1143 goto kick;
1144 }
1145
1146 err = __map_osds(osdc, req);
1147 if (err == 0)
1148 continue; /* no change */
1149 if (err < 0) {
1150 /*
1151 * FIXME: really, we should set the request
1152 * error and fail if this isn't a 'nofail'
1153 * request, but that's a fair bit more
1154 * complicated to do. So retry!
1155 */
1156 dout(" setting r_resend on %llu\n", req->r_tid);
1157 req->r_resend = true;
1158 continue;
1159 }
1160 if (req->r_osd == NULL) {
1161 dout("tid %llu maps to no valid osd\n", req->r_tid);
1162 needmap++; /* request a newer map */
1163 continue;
1164 }
1165
1166kick:
1167 dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
1168 req->r_osd ? req->r_osd->o_osd : -1);
1169 req->r_flags |= CEPH_OSD_FLAG_RETRY;
1170 err = __send_request(osdc, req);
1171 if (err) {
1172 dout(" setting r_resend on %llu\n", req->r_tid);
1173 req->r_resend = true;
1174 }
1175 }
1176
1177 return needmap;
1178}
1179
1180/*
1181 * Resubmit osd requests whose osd or osd address has changed. Request
1182 * a new osd map if osds are down, or we are otherwise unable to determine
1183 * how to direct a request.
1184 *
1185 * Close connections to down osds.
1186 *
1187 * If @who is specified, resubmit requests for that specific osd.
1188 *
1189 * Caller should hold map_sem for read and request_mutex.
1190 */
1191static void kick_requests(struct ceph_osd_client *osdc,
1192 struct ceph_osd *kickosd)
1193{
1194 int needmap;
1195
1196 mutex_lock(&osdc->request_mutex);
1197 needmap = __kick_requests(osdc, kickosd);
1198 mutex_unlock(&osdc->request_mutex);
1199
1200 if (needmap) {
1201 dout("%d requests for down osds, need new map\n", needmap);
1202 ceph_monc_request_next_osdmap(&osdc->client->monc);
1203 }
1204
1205}
1206/*
1207 * Process updated osd map.
1208 *
1209 * The message contains any number of incremental and full maps, normally
1210 * indicating some sort of topology change in the cluster. Kick requests
1211 * off to different OSDs as needed.
1212 */
1213void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1214{
1215 void *p, *end, *next;
1216 u32 nr_maps, maplen;
1217 u32 epoch;
1218 struct ceph_osdmap *newmap = NULL, *oldmap;
1219 int err;
1220 struct ceph_fsid fsid;
1221
1222 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
1223 p = msg->front.iov_base;
1224 end = p + msg->front.iov_len;
1225
1226 /* verify fsid */
1227 ceph_decode_need(&p, end, sizeof(fsid), bad);
1228 ceph_decode_copy(&p, &fsid, sizeof(fsid));
1229 if (ceph_check_fsid(osdc->client, &fsid) < 0)
1230 return;
1231
1232 down_write(&osdc->map_sem);
1233
1234 /* incremental maps */
1235 ceph_decode_32_safe(&p, end, nr_maps, bad);
1236 dout(" %d inc maps\n", nr_maps);
1237 while (nr_maps > 0) {
1238 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1239 epoch = ceph_decode_32(&p);
1240 maplen = ceph_decode_32(&p);
1241 ceph_decode_need(&p, end, maplen, bad);
1242 next = p + maplen;
1243 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
1244 dout("applying incremental map %u len %d\n",
1245 epoch, maplen);
1246 newmap = osdmap_apply_incremental(&p, next,
1247 osdc->osdmap,
1248 osdc->client->msgr);
1249 if (IS_ERR(newmap)) {
1250 err = PTR_ERR(newmap);
1251 goto bad;
1252 }
1253 BUG_ON(!newmap);
1254 if (newmap != osdc->osdmap) {
1255 ceph_osdmap_destroy(osdc->osdmap);
1256 osdc->osdmap = newmap;
1257 }
1258 } else {
1259 dout("ignoring incremental map %u len %d\n",
1260 epoch, maplen);
1261 }
1262 p = next;
1263 nr_maps--;
1264 }
1265 if (newmap)
1266 goto done;
1267
1268 /* full maps */
1269 ceph_decode_32_safe(&p, end, nr_maps, bad);
1270 dout(" %d full maps\n", nr_maps);
1271 while (nr_maps) {
1272 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1273 epoch = ceph_decode_32(&p);
1274 maplen = ceph_decode_32(&p);
1275 ceph_decode_need(&p, end, maplen, bad);
1276 if (nr_maps > 1) {
1277 dout("skipping non-latest full map %u len %d\n",
1278 epoch, maplen);
1279 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
1280 dout("skipping full map %u len %d, "
1281 "older than our %u\n", epoch, maplen,
1282 osdc->osdmap->epoch);
1283 } else {
1284 dout("taking full map %u len %d\n", epoch, maplen);
1285 newmap = osdmap_decode(&p, p+maplen);
1286 if (IS_ERR(newmap)) {
1287 err = PTR_ERR(newmap);
1288 goto bad;
1289 }
1290 BUG_ON(!newmap);
1291 oldmap = osdc->osdmap;
1292 osdc->osdmap = newmap;
1293 if (oldmap)
1294 ceph_osdmap_destroy(oldmap);
1295 }
1296 p += maplen;
1297 nr_maps--;
1298 }
1299
1300done:
1301 downgrade_write(&osdc->map_sem);
1302 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
1303 if (newmap)
1304 kick_requests(osdc, NULL);
1305 up_read(&osdc->map_sem);
1306 wake_up_all(&osdc->client->auth_wq);
1307 return;
1308
1309bad:
1310 pr_err("osdc handle_map corrupt msg\n");
1311 ceph_msg_dump(msg);
1312 up_write(&osdc->map_sem);
1313 return;
1314}
1315
1316/*
1317 * Register request, send initial attempt.
1318 */
1319int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1320 struct ceph_osd_request *req,
1321 bool nofail)
1322{
1323 int rc = 0;
1324
1325 req->r_request->pages = req->r_pages;
1326 req->r_request->nr_pages = req->r_num_pages;
1327#ifdef CONFIG_BLOCK
1328 req->r_request->bio = req->r_bio;
1329#endif
1330 req->r_request->trail = req->r_trail;
1331
1332 register_request(osdc, req);
1333
1334 down_read(&osdc->map_sem);
1335 mutex_lock(&osdc->request_mutex);
1336 /*
1337 * a racing kick_requests() may have sent the message for us
1338 * while we dropped request_mutex above, so only send now if
1339 * the request still hasn't been touched yet.
1340 */
1341 if (req->r_sent == 0) {
1342 rc = __send_request(osdc, req);
1343 if (rc) {
1344 if (nofail) {
1345 dout("osdc_start_request failed send, "
1346 " marking %lld\n", req->r_tid);
1347 req->r_resend = true;
1348 rc = 0;
1349 } else {
1350 __unregister_request(osdc, req);
1351 }
1352 }
1353 }
1354 mutex_unlock(&osdc->request_mutex);
1355 up_read(&osdc->map_sem);
1356 return rc;
1357}
1358EXPORT_SYMBOL(ceph_osdc_start_request);
1359
1360/*
1361 * wait for a request to complete
1362 */
1363int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1364 struct ceph_osd_request *req)
1365{
1366 int rc;
1367
1368 rc = wait_for_completion_interruptible(&req->r_completion);
1369 if (rc < 0) {
1370 mutex_lock(&osdc->request_mutex);
1371 __cancel_request(req);
1372 __unregister_request(osdc, req);
1373 mutex_unlock(&osdc->request_mutex);
1374 dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
1375 return rc;
1376 }
1377
1378 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1379 return req->r_result;
1380}
1381EXPORT_SYMBOL(ceph_osdc_wait_request);
1382
1383/*
1384 * sync - wait for all in-flight requests to flush. avoid starvation.
1385 */
1386void ceph_osdc_sync(struct ceph_osd_client *osdc)
1387{
1388 struct ceph_osd_request *req;
1389 u64 last_tid, next_tid = 0;
1390
1391 mutex_lock(&osdc->request_mutex);
1392 last_tid = osdc->last_tid;
1393 while (1) {
1394 req = __lookup_request_ge(osdc, next_tid);
1395 if (!req)
1396 break;
1397 if (req->r_tid > last_tid)
1398 break;
1399
1400 next_tid = req->r_tid + 1;
1401 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
1402 continue;
1403
1404 ceph_osdc_get_request(req);
1405 mutex_unlock(&osdc->request_mutex);
1406 dout("sync waiting on tid %llu (last is %llu)\n",
1407 req->r_tid, last_tid);
1408 wait_for_completion(&req->r_safe_completion);
1409 mutex_lock(&osdc->request_mutex);
1410 ceph_osdc_put_request(req);
1411 }
1412 mutex_unlock(&osdc->request_mutex);
1413 dout("sync done (thru tid %llu)\n", last_tid);
1414}
1415EXPORT_SYMBOL(ceph_osdc_sync);
1416
1417/*
1418 * init, shutdown
1419 */
1420int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1421{
1422 int err;
1423
1424 dout("init\n");
1425 osdc->client = client;
1426 osdc->osdmap = NULL;
1427 init_rwsem(&osdc->map_sem);
1428 init_completion(&osdc->map_waiters);
1429 osdc->last_requested_map = 0;
1430 mutex_init(&osdc->request_mutex);
1431 osdc->last_tid = 0;
1432 osdc->osds = RB_ROOT;
1433 INIT_LIST_HEAD(&osdc->osd_lru);
1434 osdc->requests = RB_ROOT;
1435 INIT_LIST_HEAD(&osdc->req_lru);
1436 osdc->num_requests = 0;
1437 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
1438 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1439
1440 schedule_delayed_work(&osdc->osds_timeout_work,
1441 round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ));
1442
1443 err = -ENOMEM;
1444 osdc->req_mempool = mempool_create_kmalloc_pool(10,
1445 sizeof(struct ceph_osd_request));
1446 if (!osdc->req_mempool)
1447 goto out;
1448
1449 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
1450 "osd_op");
1451 if (err < 0)
1452 goto out_mempool;
1453 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1454 OSD_OPREPLY_FRONT_LEN, 10, true,
1455 "osd_op_reply");
1456 if (err < 0)
1457 goto out_msgpool;
1458 return 0;
1459
1460out_msgpool:
1461 ceph_msgpool_destroy(&osdc->msgpool_op);
1462out_mempool:
1463 mempool_destroy(osdc->req_mempool);
1464out:
1465 return err;
1466}
1467EXPORT_SYMBOL(ceph_osdc_init);
1468
1469void ceph_osdc_stop(struct ceph_osd_client *osdc)
1470{
1471 cancel_delayed_work_sync(&osdc->timeout_work);
1472 cancel_delayed_work_sync(&osdc->osds_timeout_work);
1473 if (osdc->osdmap) {
1474 ceph_osdmap_destroy(osdc->osdmap);
1475 osdc->osdmap = NULL;
1476 }
1477 remove_old_osds(osdc, 1);
1478 mempool_destroy(osdc->req_mempool);
1479 ceph_msgpool_destroy(&osdc->msgpool_op);
1480 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1481}
1482EXPORT_SYMBOL(ceph_osdc_stop);
1483
1484/*
1485 * Read some contiguous pages. If we cross a stripe boundary, shorten
1486 * *plen. Return number of bytes read, or error.
1487 */
1488int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1489 struct ceph_vino vino, struct ceph_file_layout *layout,
1490 u64 off, u64 *plen,
1491 u32 truncate_seq, u64 truncate_size,
1492 struct page **pages, int num_pages)
1493{
1494 struct ceph_osd_request *req;
1495 int rc = 0;
1496
1497 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1498 vino.snap, off, *plen);
1499 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1500 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1501 NULL, 0, truncate_seq, truncate_size, NULL,
1502 false, 1);
1503 if (!req)
1504 return -ENOMEM;
1505
1506 /* it may be a short read due to an object boundary */
1507 req->r_pages = pages;
1508
1509 dout("readpages final extent is %llu~%llu (%d pages)\n",
1510 off, *plen, req->r_num_pages);
1511
1512 rc = ceph_osdc_start_request(osdc, req, false);
1513 if (!rc)
1514 rc = ceph_osdc_wait_request(osdc, req);
1515
1516 ceph_osdc_put_request(req);
1517 dout("readpages result %d\n", rc);
1518 return rc;
1519}
1520EXPORT_SYMBOL(ceph_osdc_readpages);
1521
1522/*
1523 * do a synchronous write on N pages
1524 */
1525int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1526 struct ceph_file_layout *layout,
1527 struct ceph_snap_context *snapc,
1528 u64 off, u64 len,
1529 u32 truncate_seq, u64 truncate_size,
1530 struct timespec *mtime,
1531 struct page **pages, int num_pages,
1532 int flags, int do_sync, bool nofail)
1533{
1534 struct ceph_osd_request *req;
1535 int rc = 0;
1536
1537 BUG_ON(vino.snap != CEPH_NOSNAP);
1538 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1539 CEPH_OSD_OP_WRITE,
1540 flags | CEPH_OSD_FLAG_ONDISK |
1541 CEPH_OSD_FLAG_WRITE,
1542 snapc, do_sync,
1543 truncate_seq, truncate_size, mtime,
1544 nofail, 1);
1545 if (!req)
1546 return -ENOMEM;
1547
1548 /* it may be a short write due to an object boundary */
1549 req->r_pages = pages;
1550 dout("writepages %llu~%llu (%d pages)\n", off, len,
1551 req->r_num_pages);
1552
1553 rc = ceph_osdc_start_request(osdc, req, nofail);
1554 if (!rc)
1555 rc = ceph_osdc_wait_request(osdc, req);
1556
1557 ceph_osdc_put_request(req);
1558 if (rc == 0)
1559 rc = len;
1560 dout("writepages result %d\n", rc);
1561 return rc;
1562}
1563EXPORT_SYMBOL(ceph_osdc_writepages);
1564
1565/*
1566 * handle incoming message
1567 */
1568static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1569{
1570 struct ceph_osd *osd = con->private;
1571 struct ceph_osd_client *osdc;
1572 int type = le16_to_cpu(msg->hdr.type);
1573
1574 if (!osd)
1575 goto out;
1576 osdc = osd->o_osdc;
1577
1578 switch (type) {
1579 case CEPH_MSG_OSD_MAP:
1580 ceph_osdc_handle_map(osdc, msg);
1581 break;
1582 case CEPH_MSG_OSD_OPREPLY:
1583 handle_reply(osdc, msg, con);
1584 break;
1585
1586 default:
1587 pr_err("received unknown message type %d %s\n", type,
1588 ceph_msg_type_name(type));
1589 }
1590out:
1591 ceph_msg_put(msg);
1592}
1593
1594/*
1595 * lookup and return message for incoming reply. set up reply message
1596 * pages.
1597 */
1598static struct ceph_msg *get_reply(struct ceph_connection *con,
1599 struct ceph_msg_header *hdr,
1600 int *skip)
1601{
1602 struct ceph_osd *osd = con->private;
1603 struct ceph_osd_client *osdc = osd->o_osdc;
1604 struct ceph_msg *m;
1605 struct ceph_osd_request *req;
1606 int front = le32_to_cpu(hdr->front_len);
1607 int data_len = le32_to_cpu(hdr->data_len);
1608 u64 tid;
1609
1610 tid = le64_to_cpu(hdr->tid);
1611 mutex_lock(&osdc->request_mutex);
1612 req = __lookup_request(osdc, tid);
1613 if (!req) {
1614 *skip = 1;
1615 m = NULL;
1616 pr_info("get_reply unknown tid %llu from osd%d\n", tid,
1617 osd->o_osd);
1618 goto out;
1619 }
1620
1621 if (req->r_con_filling_msg) {
1622 dout("get_reply revoking msg %p from old con %p\n",
1623 req->r_reply, req->r_con_filling_msg);
1624 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1625 ceph_con_put(req->r_con_filling_msg);
1626 req->r_con_filling_msg = NULL;
1627 }
1628
1629 if (front > req->r_reply->front.iov_len) {
1630 pr_warning("get_reply front %d > preallocated %d\n",
1631 front, (int)req->r_reply->front.iov_len);
1632 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
1633 if (!m)
1634 goto out;
1635 ceph_msg_put(req->r_reply);
1636 req->r_reply = m;
1637 }
1638 m = ceph_msg_get(req->r_reply);
1639
1640 if (data_len > 0) {
1641 unsigned data_off = le16_to_cpu(hdr->data_off);
1642 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1643
1644 if (unlikely(req->r_num_pages < want)) {
1645 pr_warning("tid %lld reply %d > expected %d pages\n",
1646 tid, want, req->r_num_pages);
1647 *skip = 1;
1648 ceph_msg_put(m);
1649 m = NULL;
1650 goto out;
1651 }
1652 m->pages = req->r_pages;
1653 m->nr_pages = req->r_num_pages;
1654#ifdef CONFIG_BLOCK
1655 m->bio = req->r_bio;
1656#endif
1657 }
1658 *skip = 0;
1659 req->r_con_filling_msg = ceph_con_get(con);
1660 dout("get_reply tid %lld %p\n", tid, m);
1661
1662out:
1663 mutex_unlock(&osdc->request_mutex);
1664 return m;
1665
1666}
1667
1668static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1669 struct ceph_msg_header *hdr,
1670 int *skip)
1671{
1672 struct ceph_osd *osd = con->private;
1673 int type = le16_to_cpu(hdr->type);
1674 int front = le32_to_cpu(hdr->front_len);
1675
1676 switch (type) {
1677 case CEPH_MSG_OSD_MAP:
1678 return ceph_msg_new(type, front, GFP_NOFS);
1679 case CEPH_MSG_OSD_OPREPLY:
1680 return get_reply(con, hdr, skip);
1681 default:
1682 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
1683 osd->o_osd);
1684 *skip = 1;
1685 return NULL;
1686 }
1687}
1688
1689/*
1690 * Wrappers to refcount containing ceph_osd struct
1691 */
1692static struct ceph_connection *get_osd_con(struct ceph_connection *con)
1693{
1694 struct ceph_osd *osd = con->private;
1695 if (get_osd(osd))
1696 return con;
1697 return NULL;
1698}
1699
1700static void put_osd_con(struct ceph_connection *con)
1701{
1702 struct ceph_osd *osd = con->private;
1703 put_osd(osd);
1704}
1705
1706/*
1707 * authentication
1708 */
1709static int get_authorizer(struct ceph_connection *con,
1710 void **buf, int *len, int *proto,
1711 void **reply_buf, int *reply_len, int force_new)
1712{
1713 struct ceph_osd *o = con->private;
1714 struct ceph_osd_client *osdc = o->o_osdc;
1715 struct ceph_auth_client *ac = osdc->client->monc.auth;
1716 int ret = 0;
1717
1718 if (force_new && o->o_authorizer) {
1719 ac->ops->destroy_authorizer(ac, o->o_authorizer);
1720 o->o_authorizer = NULL;
1721 }
1722 if (o->o_authorizer == NULL) {
1723 ret = ac->ops->create_authorizer(
1724 ac, CEPH_ENTITY_TYPE_OSD,
1725 &o->o_authorizer,
1726 &o->o_authorizer_buf,
1727 &o->o_authorizer_buf_len,
1728 &o->o_authorizer_reply_buf,
1729 &o->o_authorizer_reply_buf_len);
1730 if (ret)
1731 return ret;
1732 }
1733
1734 *proto = ac->protocol;
1735 *buf = o->o_authorizer_buf;
1736 *len = o->o_authorizer_buf_len;
1737 *reply_buf = o->o_authorizer_reply_buf;
1738 *reply_len = o->o_authorizer_reply_buf_len;
1739 return 0;
1740}
1741
1742
1743static int verify_authorizer_reply(struct ceph_connection *con, int len)
1744{
1745 struct ceph_osd *o = con->private;
1746 struct ceph_osd_client *osdc = o->o_osdc;
1747 struct ceph_auth_client *ac = osdc->client->monc.auth;
1748
1749 return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
1750}
1751
1752static int invalidate_authorizer(struct ceph_connection *con)
1753{
1754 struct ceph_osd *o = con->private;
1755 struct ceph_osd_client *osdc = o->o_osdc;
1756 struct ceph_auth_client *ac = osdc->client->monc.auth;
1757
1758 if (ac->ops->invalidate_authorizer)
1759 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
1760
1761 return ceph_monc_validate_auth(&osdc->client->monc);
1762}
1763
1764static const struct ceph_connection_operations osd_con_ops = {
1765 .get = get_osd_con,
1766 .put = put_osd_con,
1767 .dispatch = dispatch,
1768 .get_authorizer = get_authorizer,
1769 .verify_authorizer_reply = verify_authorizer_reply,
1770 .invalidate_authorizer = invalidate_authorizer,
1771 .alloc_msg = alloc_msg,
1772 .fault = osd_reset,
1773};
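handle_reply() above calls req->r_callback(req, msg) when a callback is set and otherwise completes r_completion, which is what ceph_osdc_wait_request() waits on; ceph_osdc_readpages() and ceph_osdc_writepages() show the synchronous pattern. A sketch of the asynchronous pattern (example_read_done() is an illustrative name; dropping the request reference in the callback assumes the caller keeps no further use for it):

	static void example_read_done(struct ceph_osd_request *req,
				      struct ceph_msg *msg)
	{
		/* r_result is the byte count on success or a negative errno */
		dout("async read done, result %d\n", req->r_result);
		ceph_osdc_put_request(req);	/* drop the caller's reference */
	}

	/* ... after ceph_osdc_new_request() returns a request ... */
	req->r_callback = example_read_done;
	rc = ceph_osdc_start_request(osdc, req, false);
	if (rc < 0)
		ceph_osdc_put_request(req);	/* submit failed; drop our ref now */
	/* on success, completion arrives in example_read_done() */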
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
new file mode 100644
index 000000000000..d73f3f6efa36
--- /dev/null
+++ b/net/ceph/osdmap.c
@@ -0,0 +1,1128 @@
1
2#include <linux/ceph/ceph_debug.h>
3
4#include <linux/module.h>
5#include <linux/slab.h>
6#include <asm/div64.h>
7
8#include <linux/ceph/libceph.h>
9#include <linux/ceph/osdmap.h>
10#include <linux/ceph/decode.h>
11#include <linux/crush/hash.h>
12#include <linux/crush/mapper.h>
13
14char *ceph_osdmap_state_str(char *str, int len, int state)
15{
16 int flag = 0;
17
18 if (!len)
19 goto done;
20
21 *str = '\0';
22 if (state) {
23 if (state & CEPH_OSD_EXISTS) {
24 snprintf(str, len, "exists");
25 flag = 1;
26 }
27 if (state & CEPH_OSD_UP) {
28 snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
29 "up");
30 flag = 1;
31 }
32 } else {
33 snprintf(str, len, "doesn't exist");
34 }
35done:
36 return str;
37}
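/*
 * Example output (illustrative): state = CEPH_OSD_EXISTS | CEPH_OSD_UP
 * produces "exists, up"; state = 0 produces "doesn't exist".
 */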
38
39/* maps */
40
41static int calc_bits_of(unsigned t)
42{
43 int b = 0;
44 while (t) {
45 t = t >> 1;
46 b++;
47 }
48 return b;
49}
50
51/*
52 * the foo_mask is the smallest value 2^n-1 that is >= foo.
53 */
54static void calc_pg_masks(struct ceph_pg_pool_info *pi)
55{
56 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
57 pi->pgp_num_mask =
58 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
59 pi->lpg_num_mask =
60 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
61 pi->lpgp_num_mask =
62 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
63}
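/*
 * Worked example (illustrative): for pg_num = 12, pg_num - 1 = 11 (0b1011),
 * so calc_bits_of() returns 4 and pg_num_mask = (1 << 4) - 1 = 15, the
 * smallest 2^n - 1 that is >= 12.
 */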
64
65/*
66 * decode crush map
67 */
68static int crush_decode_uniform_bucket(void **p, void *end,
69 struct crush_bucket_uniform *b)
70{
71 dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
72 ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
73 b->item_weight = ceph_decode_32(p);
74 return 0;
75bad:
76 return -EINVAL;
77}
78
79static int crush_decode_list_bucket(void **p, void *end,
80 struct crush_bucket_list *b)
81{
82 int j;
83 dout("crush_decode_list_bucket %p to %p\n", *p, end);
84 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
85 if (b->item_weights == NULL)
86 return -ENOMEM;
87 b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
88 if (b->sum_weights == NULL)
89 return -ENOMEM;
90 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
91 for (j = 0; j < b->h.size; j++) {
92 b->item_weights[j] = ceph_decode_32(p);
93 b->sum_weights[j] = ceph_decode_32(p);
94 }
95 return 0;
96bad:
97 return -EINVAL;
98}
99
100static int crush_decode_tree_bucket(void **p, void *end,
101 struct crush_bucket_tree *b)
102{
103 int j;
104 dout("crush_decode_tree_bucket %p to %p\n", *p, end);
105 ceph_decode_32_safe(p, end, b->num_nodes, bad);
106 b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
107 if (b->node_weights == NULL)
108 return -ENOMEM;
109 ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
110 for (j = 0; j < b->num_nodes; j++)
111 b->node_weights[j] = ceph_decode_32(p);
112 return 0;
113bad:
114 return -EINVAL;
115}
116
117static int crush_decode_straw_bucket(void **p, void *end,
118 struct crush_bucket_straw *b)
119{
120 int j;
121 dout("crush_decode_straw_bucket %p to %p\n", *p, end);
122 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
123 if (b->item_weights == NULL)
124 return -ENOMEM;
125 b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
126 if (b->straws == NULL)
127 return -ENOMEM;
128 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
129 for (j = 0; j < b->h.size; j++) {
130 b->item_weights[j] = ceph_decode_32(p);
131 b->straws[j] = ceph_decode_32(p);
132 }
133 return 0;
134bad:
135 return -EINVAL;
136}
137
138static struct crush_map *crush_decode(void *pbyval, void *end)
139{
140 struct crush_map *c;
141 int err = -EINVAL;
142 int i, j;
143 void **p = &pbyval;
144 void *start = pbyval;
145 u32 magic;
146
147 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
148
149 c = kzalloc(sizeof(*c), GFP_NOFS);
150 if (c == NULL)
151 return ERR_PTR(-ENOMEM);
152
153 ceph_decode_need(p, end, 4*sizeof(u32), bad);
154 magic = ceph_decode_32(p);
155 if (magic != CRUSH_MAGIC) {
156 pr_err("crush_decode magic %x != current %x\n",
157 (unsigned)magic, (unsigned)CRUSH_MAGIC);
158 goto bad;
159 }
160 c->max_buckets = ceph_decode_32(p);
161 c->max_rules = ceph_decode_32(p);
162 c->max_devices = ceph_decode_32(p);
163
164 c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
165 if (c->device_parents == NULL)
166 goto badmem;
167 c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
168 if (c->bucket_parents == NULL)
169 goto badmem;
170
171 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
172 if (c->buckets == NULL)
173 goto badmem;
174 c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
175 if (c->rules == NULL)
176 goto badmem;
177
178 /* buckets */
179 for (i = 0; i < c->max_buckets; i++) {
180 int size = 0;
181 u32 alg;
182 struct crush_bucket *b;
183
184 ceph_decode_32_safe(p, end, alg, bad);
185 if (alg == 0) {
186 c->buckets[i] = NULL;
187 continue;
188 }
189 dout("crush_decode bucket %d off %x %p to %p\n",
190 i, (int)(*p-start), *p, end);
191
192 switch (alg) {
193 case CRUSH_BUCKET_UNIFORM:
194 size = sizeof(struct crush_bucket_uniform);
195 break;
196 case CRUSH_BUCKET_LIST:
197 size = sizeof(struct crush_bucket_list);
198 break;
199 case CRUSH_BUCKET_TREE:
200 size = sizeof(struct crush_bucket_tree);
201 break;
202 case CRUSH_BUCKET_STRAW:
203 size = sizeof(struct crush_bucket_straw);
204 break;
205 default:
206 err = -EINVAL;
207 goto bad;
208 }
209 BUG_ON(size == 0);
210 b = c->buckets[i] = kzalloc(size, GFP_NOFS);
211 if (b == NULL)
212 goto badmem;
213
214 ceph_decode_need(p, end, 4*sizeof(u32), bad);
215 b->id = ceph_decode_32(p);
216 b->type = ceph_decode_16(p);
217 b->alg = ceph_decode_8(p);
218 b->hash = ceph_decode_8(p);
219 b->weight = ceph_decode_32(p);
220 b->size = ceph_decode_32(p);
221
222 dout("crush_decode bucket size %d off %x %p to %p\n",
223 b->size, (int)(*p-start), *p, end);
224
225 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
226 if (b->items == NULL)
227 goto badmem;
228 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
229 if (b->perm == NULL)
230 goto badmem;
231 b->perm_n = 0;
232
233 ceph_decode_need(p, end, b->size*sizeof(u32), bad);
234 for (j = 0; j < b->size; j++)
235 b->items[j] = ceph_decode_32(p);
236
237 switch (b->alg) {
238 case CRUSH_BUCKET_UNIFORM:
239 err = crush_decode_uniform_bucket(p, end,
240 (struct crush_bucket_uniform *)b);
241 if (err < 0)
242 goto bad;
243 break;
244 case CRUSH_BUCKET_LIST:
245 err = crush_decode_list_bucket(p, end,
246 (struct crush_bucket_list *)b);
247 if (err < 0)
248 goto bad;
249 break;
250 case CRUSH_BUCKET_TREE:
251 err = crush_decode_tree_bucket(p, end,
252 (struct crush_bucket_tree *)b);
253 if (err < 0)
254 goto bad;
255 break;
256 case CRUSH_BUCKET_STRAW:
257 err = crush_decode_straw_bucket(p, end,
258 (struct crush_bucket_straw *)b);
259 if (err < 0)
260 goto bad;
261 break;
262 }
263 }
264
265 /* rules */
266 dout("rule vec is %p\n", c->rules);
267 for (i = 0; i < c->max_rules; i++) {
268 u32 yes;
269 struct crush_rule *r;
270
271 ceph_decode_32_safe(p, end, yes, bad);
272 if (!yes) {
273 dout("crush_decode NO rule %d off %x %p to %p\n",
274 i, (int)(*p-start), *p, end);
275 c->rules[i] = NULL;
276 continue;
277 }
278
279 dout("crush_decode rule %d off %x %p to %p\n",
280 i, (int)(*p-start), *p, end);
281
282 /* len */
283 ceph_decode_32_safe(p, end, yes, bad);
284#if BITS_PER_LONG == 32
285 err = -EINVAL;
286 if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
287 goto bad;
288#endif
289 r = c->rules[i] = kmalloc(sizeof(*r) +
290 yes*sizeof(struct crush_rule_step),
291 GFP_NOFS);
292 if (r == NULL)
293 goto badmem;
294 dout(" rule %d is at %p\n", i, r);
295 r->len = yes;
296 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
297 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
298 for (j = 0; j < r->len; j++) {
299 r->steps[j].op = ceph_decode_32(p);
300 r->steps[j].arg1 = ceph_decode_32(p);
301 r->steps[j].arg2 = ceph_decode_32(p);
302 }
303 }
304
305 /* ignore trailing name maps. */
306
307 dout("crush_decode success\n");
308 return c;
309
310badmem:
311 err = -ENOMEM;
312bad:
313 dout("crush_decode fail %d\n", err);
314 crush_destroy(c);
315 return ERR_PTR(err);
316}
317
318/*
319 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
320 * to a set of osds)
321 */
322static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
323{
324 u64 a = *(u64 *)&l;
325 u64 b = *(u64 *)&r;
326
327 if (a < b)
328 return -1;
329 if (a > b)
330 return 1;
331 return 0;
332}
333
334static int __insert_pg_mapping(struct ceph_pg_mapping *new,
335 struct rb_root *root)
336{
337 struct rb_node **p = &root->rb_node;
338 struct rb_node *parent = NULL;
339 struct ceph_pg_mapping *pg = NULL;
340 int c;
341
342 while (*p) {
343 parent = *p;
344 pg = rb_entry(parent, struct ceph_pg_mapping, node);
345 c = pgid_cmp(new->pgid, pg->pgid);
346 if (c < 0)
347 p = &(*p)->rb_left;
348 else if (c > 0)
349 p = &(*p)->rb_right;
350 else
351 return -EEXIST;
352 }
353
354 rb_link_node(&new->node, parent, p);
355 rb_insert_color(&new->node, root);
356 return 0;
357}
358
359static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
360 struct ceph_pg pgid)
361{
362 struct rb_node *n = root->rb_node;
363 struct ceph_pg_mapping *pg;
364 int c;
365
366 while (n) {
367 pg = rb_entry(n, struct ceph_pg_mapping, node);
368 c = pgid_cmp(pgid, pg->pgid);
369 if (c < 0)
370 n = n->rb_left;
371 else if (c > 0)
372 n = n->rb_right;
373 else
374 return pg;
375 }
376 return NULL;
377}
378
379/*
380 * rbtree of pg pool info
381 */
382static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
383{
384 struct rb_node **p = &root->rb_node;
385 struct rb_node *parent = NULL;
386 struct ceph_pg_pool_info *pi = NULL;
387
388 while (*p) {
389 parent = *p;
390 pi = rb_entry(parent, struct ceph_pg_pool_info, node);
391 if (new->id < pi->id)
392 p = &(*p)->rb_left;
393 else if (new->id > pi->id)
394 p = &(*p)->rb_right;
395 else
396 return -EEXIST;
397 }
398
399 rb_link_node(&new->node, parent, p);
400 rb_insert_color(&new->node, root);
401 return 0;
402}
403
404static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
405{
406 struct ceph_pg_pool_info *pi;
407 struct rb_node *n = root->rb_node;
408
409 while (n) {
410 pi = rb_entry(n, struct ceph_pg_pool_info, node);
411 if (id < pi->id)
412 n = n->rb_left;
413 else if (id > pi->id)
414 n = n->rb_right;
415 else
416 return pi;
417 }
418 return NULL;
419}
420
421int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
422{
423 struct rb_node *rbp;
424
425 for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
426 struct ceph_pg_pool_info *pi =
427 rb_entry(rbp, struct ceph_pg_pool_info, node);
428 if (pi->name && strcmp(pi->name, name) == 0)
429 return pi->id;
430 }
431 return -ENOENT;
432}
433EXPORT_SYMBOL(ceph_pg_poolid_by_name);
434
435static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
436{
437 rb_erase(&pi->node, root);
438 kfree(pi->name);
439 kfree(pi);
440}
441
442static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
443{
444 unsigned n, m;
445
446 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
447 calc_pg_masks(pi);
448
449 /* num_snaps * snap_info_t */
450 n = le32_to_cpu(pi->v.num_snaps);
451 while (n--) {
452 ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
453 sizeof(struct ceph_timespec), bad);
454 *p += sizeof(u64) + /* key */
455 1 + sizeof(u64) + /* u8, snapid */
456 sizeof(struct ceph_timespec);
457 m = ceph_decode_32(p); /* snap name */
458 *p += m;
459 }
460
461 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
462 return 0;
463
464bad:
465 return -EINVAL;
466}
467
468static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
469{
470 struct ceph_pg_pool_info *pi;
471 u32 num, len, pool;
472
473 ceph_decode_32_safe(p, end, num, bad);
474 dout(" %d pool names\n", num);
475 while (num--) {
476 ceph_decode_32_safe(p, end, pool, bad);
477 ceph_decode_32_safe(p, end, len, bad);
478 dout(" pool %d len %d\n", pool, len);
479 pi = __lookup_pg_pool(&map->pg_pools, pool);
480 if (pi) {
481 kfree(pi->name);
482 pi->name = kmalloc(len + 1, GFP_NOFS);
483 if (pi->name) {
484 memcpy(pi->name, *p, len);
485 pi->name[len] = '\0';
486 dout(" name is %s\n", pi->name);
487 }
488 }
489 *p += len;
490 }
491 return 0;
492
493bad:
494 return -EINVAL;
495}
496
497/*
498 * osd map
499 */
500void ceph_osdmap_destroy(struct ceph_osdmap *map)
501{
502 dout("osdmap_destroy %p\n", map);
503 if (map->crush)
504 crush_destroy(map->crush);
505 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
506 struct ceph_pg_mapping *pg =
507 rb_entry(rb_first(&map->pg_temp),
508 struct ceph_pg_mapping, node);
509 rb_erase(&pg->node, &map->pg_temp);
510 kfree(pg);
511 }
512 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
513 struct ceph_pg_pool_info *pi =
514 rb_entry(rb_first(&map->pg_pools),
515 struct ceph_pg_pool_info, node);
516 __remove_pg_pool(&map->pg_pools, pi);
517 }
518 kfree(map->osd_state);
519 kfree(map->osd_weight);
520 kfree(map->osd_addr);
521 kfree(map);
522}
523
524/*
525 * adjust max osd value. reallocate arrays.
526 */
527static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
528{
529 u8 *state;
530 struct ceph_entity_addr *addr;
531 u32 *weight;
532
533 state = kcalloc(max, sizeof(*state), GFP_NOFS);
534 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
535 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
536 if (state == NULL || addr == NULL || weight == NULL) {
537 kfree(state);
538 kfree(addr);
539 kfree(weight);
540 return -ENOMEM;
541 }
542
543 /* copy old? */
544 if (map->osd_state) {
545 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
546 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
547 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
548 kfree(map->osd_state);
549 kfree(map->osd_addr);
550 kfree(map->osd_weight);
551 }
552
553 map->osd_state = state;
554 map->osd_weight = weight;
555 map->osd_addr = addr;
556 map->max_osd = max;
557 return 0;
558}
559
560/*
561 * decode a full map.
562 */
563struct ceph_osdmap *osdmap_decode(void **p, void *end)
564{
565 struct ceph_osdmap *map;
566 u16 version;
567 u32 len, max, i;
568 u8 ev;
569 int err = -EINVAL;
570 void *start = *p;
571 struct ceph_pg_pool_info *pi;
572
573 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
574
575 map = kzalloc(sizeof(*map), GFP_NOFS);
576 if (map == NULL)
577 return ERR_PTR(-ENOMEM);
578 map->pg_temp = RB_ROOT;
579
580 ceph_decode_16_safe(p, end, version, bad);
581 if (version > CEPH_OSDMAP_VERSION) {
582 pr_warning("got unknown v %d > %d of osdmap\n", version,
583 CEPH_OSDMAP_VERSION);
584 goto bad;
585 }
586
587 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
588 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
589 map->epoch = ceph_decode_32(p);
590 ceph_decode_copy(p, &map->created, sizeof(map->created));
591 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
592
593 ceph_decode_32_safe(p, end, max, bad);
594 while (max--) {
595 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
596 pi = kzalloc(sizeof(*pi), GFP_NOFS);
597 if (!pi)
598 goto bad;
599 pi->id = ceph_decode_32(p);
600 ev = ceph_decode_8(p); /* encoding version */
601 if (ev > CEPH_PG_POOL_VERSION) {
602 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
603 ev, CEPH_PG_POOL_VERSION);
604 kfree(pi);
605 goto bad;
606 }
607 err = __decode_pool(p, end, pi);
608 if (err < 0)
609 goto bad;
610 __insert_pg_pool(&map->pg_pools, pi);
611 }
612
613 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
614 goto bad;
615
616 ceph_decode_32_safe(p, end, map->pool_max, bad);
617
618 ceph_decode_32_safe(p, end, map->flags, bad);
619
620 max = ceph_decode_32(p);
621
622 /* (re)alloc osd arrays */
623 err = osdmap_set_max_osd(map, max);
624 if (err < 0)
625 goto bad;
626 dout("osdmap_decode max_osd = %d\n", map->max_osd);
627
628 /* osds */
629 err = -EINVAL;
630 ceph_decode_need(p, end, 3*sizeof(u32) +
631 map->max_osd*(1 + sizeof(*map->osd_weight) +
632 sizeof(*map->osd_addr)), bad);
633 *p += 4; /* skip length field (should match max) */
634 ceph_decode_copy(p, map->osd_state, map->max_osd);
635
636 *p += 4; /* skip length field (should match max) */
637 for (i = 0; i < map->max_osd; i++)
638 map->osd_weight[i] = ceph_decode_32(p);
639
640 *p += 4; /* skip length field (should match max) */
641 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
642 for (i = 0; i < map->max_osd; i++)
643 ceph_decode_addr(&map->osd_addr[i]);
644
645 /* pg_temp */
646 ceph_decode_32_safe(p, end, len, bad);
647 for (i = 0; i < len; i++) {
648 int n, j;
649 struct ceph_pg pgid;
650 struct ceph_pg_mapping *pg;
651
652 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
653 ceph_decode_copy(p, &pgid, sizeof(pgid));
654 n = ceph_decode_32(p);
655 ceph_decode_need(p, end, n * sizeof(u32), bad);
656 err = -ENOMEM;
657 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
658 if (!pg)
659 goto bad;
660 pg->pgid = pgid;
661 pg->len = n;
662 for (j = 0; j < n; j++)
663 pg->osds[j] = ceph_decode_32(p);
664
665 err = __insert_pg_mapping(pg, &map->pg_temp);
666 if (err)
667 goto bad;
668 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
669 }
670
671 /* crush */
672 ceph_decode_32_safe(p, end, len, bad);
673 dout("osdmap_decode crush len %d from off 0x%x\n", len,
674 (int)(*p - start));
675 ceph_decode_need(p, end, len, bad);
676 map->crush = crush_decode(*p, end);
677 *p += len;
678 if (IS_ERR(map->crush)) {
679 err = PTR_ERR(map->crush);
680 map->crush = NULL;
681 goto bad;
682 }
683
684 /* ignore the rest of the map */
685 *p = end;
686
687 dout("osdmap_decode done %p %p\n", *p, end);
688 return map;
689
690bad:
691 dout("osdmap_decode fail\n");
692 ceph_osdmap_destroy(map);
693 return ERR_PTR(err);
694}
695
696/*
697 * decode and apply an incremental map update.
698 */
699struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
700 struct ceph_osdmap *map,
701 struct ceph_messenger *msgr)
702{
703 struct crush_map *newcrush = NULL;
704 struct ceph_fsid fsid;
705 u32 epoch = 0;
706 struct ceph_timespec modified;
707 u32 len, pool;
708 __s32 new_pool_max, new_flags, max;
709 void *start = *p;
710 int err = -EINVAL;
711 u16 version;
712 struct rb_node *rbp;
713
714 ceph_decode_16_safe(p, end, version, bad);
715 if (version > CEPH_OSDMAP_INC_VERSION) {
716 pr_warning("got unknown v %d > %d of inc osdmap\n", version,
717 CEPH_OSDMAP_INC_VERSION);
718 goto bad;
719 }
720
721 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
722 bad);
723 ceph_decode_copy(p, &fsid, sizeof(fsid));
724 epoch = ceph_decode_32(p);
725 BUG_ON(epoch != map->epoch+1);
726 ceph_decode_copy(p, &modified, sizeof(modified));
727 new_pool_max = ceph_decode_32(p);
728 new_flags = ceph_decode_32(p);
729
730 /* full map? */
731 ceph_decode_32_safe(p, end, len, bad);
732 if (len > 0) {
733 dout("apply_incremental full map len %d, %p to %p\n",
734 len, *p, end);
735 return osdmap_decode(p, min(*p+len, end));
736 }
737
738 /* new crush? */
739 ceph_decode_32_safe(p, end, len, bad);
740 if (len > 0) {
741 dout("apply_incremental new crush map len %d, %p to %p\n",
742 len, *p, end);
743 newcrush = crush_decode(*p, min(*p+len, end));
744 if (IS_ERR(newcrush))
745 return ERR_CAST(newcrush);
746 *p += len;
747 }
748
749 /* new flags? */
750 if (new_flags >= 0)
751 map->flags = new_flags;
752 if (new_pool_max >= 0)
753 map->pool_max = new_pool_max;
754
755 ceph_decode_need(p, end, 5*sizeof(u32), bad);
756
757 /* new max? */
758 max = ceph_decode_32(p);
759 if (max >= 0) {
760 err = osdmap_set_max_osd(map, max);
761 if (err < 0)
762 goto bad;
763 }
764
765 map->epoch++;
766 map->modified = modified;
767 if (newcrush) {
768 if (map->crush)
769 crush_destroy(map->crush);
770 map->crush = newcrush;
771 newcrush = NULL;
772 }
773
774 /* new_pool */
775 ceph_decode_32_safe(p, end, len, bad);
776 while (len--) {
777 __u8 ev;
778 struct ceph_pg_pool_info *pi;
779
780 ceph_decode_32_safe(p, end, pool, bad);
781 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
782 ev = ceph_decode_8(p); /* encoding version */
783 if (ev > CEPH_PG_POOL_VERSION) {
784 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
785 ev, CEPH_PG_POOL_VERSION);
786 goto bad;
787 }
788 pi = __lookup_pg_pool(&map->pg_pools, pool);
789 if (!pi) {
790 pi = kzalloc(sizeof(*pi), GFP_NOFS);
791 if (!pi) {
792 err = -ENOMEM;
793 goto bad;
794 }
795 pi->id = pool;
796 __insert_pg_pool(&map->pg_pools, pi);
797 }
798 err = __decode_pool(p, end, pi);
799 if (err < 0)
800 goto bad;
801 }
802 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
803 goto bad;
804
805 /* old_pool */
806 ceph_decode_32_safe(p, end, len, bad);
807 while (len--) {
808 struct ceph_pg_pool_info *pi;
809
810 ceph_decode_32_safe(p, end, pool, bad);
811 pi = __lookup_pg_pool(&map->pg_pools, pool);
812 if (pi)
813 __remove_pg_pool(&map->pg_pools, pi);
814 }
815
816 /* new_up */
817 err = -EINVAL;
818 ceph_decode_32_safe(p, end, len, bad);
819 while (len--) {
820 u32 osd;
821 struct ceph_entity_addr addr;
822 ceph_decode_32_safe(p, end, osd, bad);
823 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
824 ceph_decode_addr(&addr);
825 pr_info("osd%d up\n", osd);
826 BUG_ON(osd >= map->max_osd);
827 map->osd_state[osd] |= CEPH_OSD_UP;
828 map->osd_addr[osd] = addr;
829 }
830
831 /* new_down */
832 ceph_decode_32_safe(p, end, len, bad);
833 while (len--) {
834 u32 osd;
835 ceph_decode_32_safe(p, end, osd, bad);
836 (*p)++; /* clean flag */
837 pr_info("osd%d down\n", osd);
838 if (osd < map->max_osd)
839 map->osd_state[osd] &= ~CEPH_OSD_UP;
840 }
841
842 /* new_weight */
843 ceph_decode_32_safe(p, end, len, bad);
844 while (len--) {
845 u32 osd, off;
846 ceph_decode_need(p, end, sizeof(u32)*2, bad);
847 osd = ceph_decode_32(p);
848 off = ceph_decode_32(p);
849 pr_info("osd%d weight 0x%x %s\n", osd, off,
850 off == CEPH_OSD_IN ? "(in)" :
851 (off == CEPH_OSD_OUT ? "(out)" : ""));
852 if (osd < map->max_osd)
853 map->osd_weight[osd] = off;
854 }
855
856 /* new_pg_temp */
857 rbp = rb_first(&map->pg_temp);
858 ceph_decode_32_safe(p, end, len, bad);
859 while (len--) {
860 struct ceph_pg_mapping *pg;
861 int j;
862 struct ceph_pg pgid;
863 u32 pglen;
864 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
865 ceph_decode_copy(p, &pgid, sizeof(pgid));
866 pglen = ceph_decode_32(p);
867
868 /* remove any? */
869 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
870 node)->pgid, pgid) <= 0) {
871 struct ceph_pg_mapping *cur =
872 rb_entry(rbp, struct ceph_pg_mapping, node);
873
874 rbp = rb_next(rbp);
875 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
876 rb_erase(&cur->node, &map->pg_temp);
877 kfree(cur);
878 }
879
880 if (pglen) {
881 /* insert */
882 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
883 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
884 if (!pg) {
885 err = -ENOMEM;
886 goto bad;
887 }
888 pg->pgid = pgid;
889 pg->len = pglen;
890 for (j = 0; j < pglen; j++)
891 pg->osds[j] = ceph_decode_32(p);
892 err = __insert_pg_mapping(pg, &map->pg_temp);
893 if (err) {
894 kfree(pg);
895 goto bad;
896 }
897 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
898 pglen);
899 }
900 }
901 while (rbp) {
902 struct ceph_pg_mapping *cur =
903 rb_entry(rbp, struct ceph_pg_mapping, node);
904
905 rbp = rb_next(rbp);
906 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
907 rb_erase(&cur->node, &map->pg_temp);
908 kfree(cur);
909 }
910
911 /* ignore the rest */
912 *p = end;
913 return map;
914
915bad:
916 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
917 epoch, (int)(*p - start), *p, start, end);
918 print_hex_dump(KERN_DEBUG, "osdmap: ",
919 DUMP_PREFIX_OFFSET, 16, 1,
920 start, end - start, true);
921 if (newcrush)
922 crush_destroy(newcrush);
923 return ERR_PTR(err);
924}
925
926
927
928
929/*
930 * calculate file layout from given offset, length.
931 * fill in correct oid, logical length, and object extent
932 * offset, length.
933 *
934 * for now, we write only a single su, until we can
935 * pass a stride back to the caller.
936 */
937void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
938 u64 off, u64 *plen,
939 u64 *ono,
940 u64 *oxoff, u64 *oxlen)
941{
942 u32 osize = le32_to_cpu(layout->fl_object_size);
943 u32 su = le32_to_cpu(layout->fl_stripe_unit);
944 u32 sc = le32_to_cpu(layout->fl_stripe_count);
945 u32 bl, stripeno, stripepos, objsetno;
946 u32 su_per_object;
947 u64 t, su_offset;
948
949 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
950 osize, su);
951 su_per_object = osize / su;
952 dout("osize %u / su %u = su_per_object %u\n", osize, su,
953 su_per_object);
954
955 BUG_ON((su & ~PAGE_MASK) != 0);
956 /* bl = *off / su; */
957 t = off;
958 do_div(t, su);
959 bl = t;
960 dout("off %llu / su %u = bl %u\n", off, su, bl);
961
962 stripeno = bl / sc;
963 stripepos = bl % sc;
964 objsetno = stripeno / su_per_object;
965
966 *ono = objsetno * sc + stripepos;
967 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
968
969 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
970 t = off;
971 su_offset = do_div(t, su);
972 *oxoff = su_offset + (stripeno % su_per_object) * su;
973
974 /*
975 * Calculate the length of the extent being written to the selected
976 * object. This is the minimum of the full length requested (plen) or
977 * the remainder of the current stripe being written to.
978 */
979 *oxlen = min_t(u64, *plen, su - su_offset);
980 *plen = *oxlen;
981
982 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
983}
984EXPORT_SYMBOL(ceph_calc_file_object_mapping);
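A worked example of the mapping above, using assumed layout values rather than anything defined in this patch: take object_size = 4 MB, stripe_unit su = 64 KB and stripe_count sc = 4, so su_per_object = 64. For off = 10 MB and *plen = 100 KB: bl = 10 MB / 64 KB = 160, stripeno = 160 / 4 = 40, stripepos = 160 % 4 = 0, objsetno = 40 / 64 = 0, so *ono = 0 * 4 + 0 = 0; su_offset = 10 MB % 64 KB = 0, so *oxoff = (40 % 64) * 64 KB = 2560 KB and *oxlen = min(100 KB, 64 KB) = 64 KB, i.e. the request is clipped to a single stripe unit, exactly as the comment above the function says.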
985
986/*
987 * calculate an object layout (i.e. pgid) from an oid,
988 * file_layout, and osdmap
989 */
990int ceph_calc_object_layout(struct ceph_object_layout *ol,
991 const char *oid,
992 struct ceph_file_layout *fl,
993 struct ceph_osdmap *osdmap)
994{
995 unsigned num, num_mask;
996 struct ceph_pg pgid;
997 s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
998 int poolid = le32_to_cpu(fl->fl_pg_pool);
999 struct ceph_pg_pool_info *pool;
1000 unsigned ps;
1001
1002 BUG_ON(!osdmap);
1003
1004 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
1005 if (!pool)
1006 return -EIO;
1007 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
1008 if (preferred >= 0) {
1009 ps += preferred;
1010 num = le32_to_cpu(pool->v.lpg_num);
1011 num_mask = pool->lpg_num_mask;
1012 } else {
1013 num = le32_to_cpu(pool->v.pg_num);
1014 num_mask = pool->pg_num_mask;
1015 }
1016
1017 pgid.ps = cpu_to_le16(ps);
1018 pgid.preferred = cpu_to_le16(preferred);
1019 pgid.pool = fl->fl_pg_pool;
1020 if (preferred >= 0)
1021 dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
1022 (int)preferred);
1023 else
1024 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
1025
1026 ol->ol_pgid = pgid;
1027 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
1028 return 0;
1029}
1030EXPORT_SYMBOL(ceph_calc_object_layout);
1031
1032/*
1033 * Calculate raw osd vector for the given pgid. Return pointer to osd
1034 * array, or NULL on failure.
1035 */
1036static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1037 int *osds, int *num)
1038{
1039 struct ceph_pg_mapping *pg;
1040 struct ceph_pg_pool_info *pool;
1041 int ruleno;
1042 unsigned poolid, ps, pps;
1043 int preferred;
1044
1045 /* pg_temp? */
1046 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1047 if (pg) {
1048 *num = pg->len;
1049 return pg->osds;
1050 }
1051
1052 /* crush */
1053 poolid = le32_to_cpu(pgid.pool);
1054 ps = le16_to_cpu(pgid.ps);
1055 preferred = (s16)le16_to_cpu(pgid.preferred);
1056
1057 /* don't forcefeed bad device ids to crush */
1058 if (preferred >= osdmap->max_osd ||
1059 preferred >= osdmap->crush->max_devices)
1060 preferred = -1;
1061
1062 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
1063 if (!pool)
1064 return NULL;
1065 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
1066 pool->v.type, pool->v.size);
1067 if (ruleno < 0) {
1068 pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
1069 poolid, pool->v.crush_ruleset, pool->v.type,
1070 pool->v.size);
1071 return NULL;
1072 }
1073
1074 if (preferred >= 0)
1075 pps = ceph_stable_mod(ps,
1076 le32_to_cpu(pool->v.lpgp_num),
1077 pool->lpgp_num_mask);
1078 else
1079 pps = ceph_stable_mod(ps,
1080 le32_to_cpu(pool->v.pgp_num),
1081 pool->pgp_num_mask);
1082 pps += poolid;
1083 *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
1084 min_t(int, pool->v.size, *num),
1085 preferred, osdmap->osd_weight);
1086 return osds;
1087}
1088
1089/*
1090 * Return acting set for given pgid.
1091 */
1092int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1093 int *acting)
1094{
1095 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1096 int i, o, num = CEPH_PG_MAX_SIZE;
1097
1098 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1099 if (!osds)
1100 return -1;
1101
1102 /* primary is first up osd */
1103 o = 0;
1104 for (i = 0; i < num; i++)
1105 if (ceph_osd_is_up(osdmap, osds[i]))
1106 acting[o++] = osds[i];
1107 return o;
1108}
1109
1110/*
1111 * Return primary osd for given pgid, or -1 if none.
1112 */
1113int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1114{
1115 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1116 int i, num = CEPH_PG_MAX_SIZE;
1117
1118 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1119 if (!osds)
1120 return -1;
1121
1122 /* primary is first up osd */
1123 for (i = 0; i < num; i++)
1124 if (ceph_osd_is_up(osdmap, osds[i]))
1125 return osds[i];
1126 return -1;
1127}
1128EXPORT_SYMBOL(ceph_calc_pg_primary);
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
new file mode 100644
index 000000000000..13cb409a7bba
--- /dev/null
+++ b/net/ceph/pagelist.c
@@ -0,0 +1,154 @@
1
2#include <linux/module.h>
3#include <linux/gfp.h>
4#include <linux/pagemap.h>
5#include <linux/highmem.h>
6#include <linux/ceph/pagelist.h>
7
8static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
9{
10 if (pl->mapped_tail) {
11 struct page *page = list_entry(pl->head.prev, struct page, lru);
12 kunmap(page);
13 pl->mapped_tail = NULL;
14 }
15}
16
17int ceph_pagelist_release(struct ceph_pagelist *pl)
18{
19 ceph_pagelist_unmap_tail(pl);
20 while (!list_empty(&pl->head)) {
21 struct page *page = list_first_entry(&pl->head, struct page,
22 lru);
23 list_del(&page->lru);
24 __free_page(page);
25 }
26 ceph_pagelist_free_reserve(pl);
27 return 0;
28}
29EXPORT_SYMBOL(ceph_pagelist_release);
30
31static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
32{
33 struct page *page;
34
35 if (!pl->num_pages_free) {
36 page = __page_cache_alloc(GFP_NOFS);
37 } else {
38 page = list_first_entry(&pl->free_list, struct page, lru);
39 list_del(&page->lru);
40 --pl->num_pages_free;
41 }
42 if (!page)
43 return -ENOMEM;
44 pl->room += PAGE_SIZE;
45 ceph_pagelist_unmap_tail(pl);
46 list_add_tail(&page->lru, &pl->head);
47 pl->mapped_tail = kmap(page);
48 return 0;
49}
50
51int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
52{
53 while (pl->room < len) {
54 size_t bit = pl->room;
55 int ret;
56
57 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
58 buf, bit);
59 pl->length += bit;
60 pl->room -= bit;
61 buf += bit;
62 len -= bit;
63 ret = ceph_pagelist_addpage(pl);
64 if (ret)
65 return ret;
66 }
67
68 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
69 pl->length += len;
70 pl->room -= len;
71 return 0;
72}
73EXPORT_SYMBOL(ceph_pagelist_append);
74
75/**
76 * Allocate enough pages for a pagelist to append the given amount
77 * of data without allocating.
78 * Returns: 0 on success, -ENOMEM on error.
79 */
80int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space)
81{
82 if (space <= pl->room)
83 return 0;
84 space -= pl->room;
85 space = (space + PAGE_SIZE - 1) >> PAGE_SHIFT; /* conv to num pages */
86
87 while (space > pl->num_pages_free) {
88 struct page *page = __page_cache_alloc(GFP_NOFS);
89 if (!page)
90 return -ENOMEM;
91 list_add_tail(&page->lru, &pl->free_list);
92 ++pl->num_pages_free;
93 }
94 return 0;
95}
96EXPORT_SYMBOL(ceph_pagelist_reserve);
97
98/**
99 * Free any pages that have been preallocated.
100 */
101int ceph_pagelist_free_reserve(struct ceph_pagelist *pl)
102{
103 while (!list_empty(&pl->free_list)) {
104 struct page *page = list_first_entry(&pl->free_list,
105 struct page, lru);
106 list_del(&page->lru);
107 __free_page(page);
108 --pl->num_pages_free;
109 }
110 BUG_ON(pl->num_pages_free);
111 return 0;
112}
113EXPORT_SYMBOL(ceph_pagelist_free_reserve);
114
115/**
116 * Create a truncation point.
117 */
118void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
119 struct ceph_pagelist_cursor *c)
120{
121 c->pl = pl;
122 c->page_lru = pl->head.prev;
123 c->room = pl->room;
124}
125EXPORT_SYMBOL(ceph_pagelist_set_cursor);
126
127/**
128 * Truncate a pagelist to the given point. Move extra pages to reserve.
129 * This won't sleep.
130 * Returns: 0 on success,
131 * -EINVAL if the pagelist doesn't match the trunc point pagelist
132 */
133int ceph_pagelist_truncate(struct ceph_pagelist *pl,
134 struct ceph_pagelist_cursor *c)
135{
136 struct page *page;
137
138 if (pl != c->pl)
139 return -EINVAL;
140 ceph_pagelist_unmap_tail(pl);
141 while (pl->head.prev != c->page_lru) {
142 page = list_entry(pl->head.prev, struct page, lru);
143 list_del(&page->lru); /* remove from pagelist */
144 list_add_tail(&page->lru, &pl->free_list); /* add to reserve */
145 ++pl->num_pages_free;
146 }
147 pl->room = c->room;
148 if (!list_empty(&pl->head)) {
149 page = list_entry(pl->head.prev, struct page, lru);
150 pl->mapped_tail = kmap(page);
151 }
152 return 0;
153}
154EXPORT_SYMBOL(ceph_pagelist_truncate);
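A minimal caller-side sketch of how the reserve, cursor and truncate helpers above fit together. It assumes an already-initialized pagelist (ceph_pagelist_init() from <linux/ceph/pagelist.h>); the names example_fill_pagelist, buf and buf_len are placeholders, not part of this patch:

	/*
	 * Illustrative only: reserve room up front, remember a truncation
	 * point, append, and roll the pagelist back if the append fails.
	 */
	static int example_fill_pagelist(struct ceph_pagelist *pl,
					 const void *buf, size_t buf_len)
	{
		struct ceph_pagelist_cursor trunc;
		int err;

		err = ceph_pagelist_reserve(pl, buf_len);	/* may allocate and sleep */
		if (err)
			return err;
		ceph_pagelist_set_cursor(pl, &trunc);		/* remember the current tail */
		err = ceph_pagelist_append(pl, buf, buf_len);	/* draws on the reserve */
		if (err)
			ceph_pagelist_truncate(pl, &trunc);	/* extra pages go back to reserve */
		return err;
	}

The caller still owns the pagelist and eventually frees the data pages and any remaining reserve with ceph_pagelist_release().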
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
new file mode 100644
index 000000000000..54caf0687155
--- /dev/null
+++ b/net/ceph/pagevec.c
@@ -0,0 +1,223 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/module.h>
4#include <linux/sched.h>
5#include <linux/slab.h>
6#include <linux/file.h>
7#include <linux/namei.h>
8#include <linux/writeback.h>
9
10#include <linux/ceph/libceph.h>
11
12/*
13 * build a vector of user pages
14 */
15struct page **ceph_get_direct_page_vector(const char __user *data,
16 int num_pages,
17 loff_t off, size_t len)
18{
19 struct page **pages;
20 int rc;
21
22 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
23 if (!pages)
24 return ERR_PTR(-ENOMEM);
25
26 down_read(&current->mm->mmap_sem);
27 rc = get_user_pages(current, current->mm, (unsigned long)data,
28 num_pages, 0, 0, pages, NULL);
29 up_read(&current->mm->mmap_sem);
30 if (rc < 0)
31 goto fail;
32 return pages;
33
34fail:
35 kfree(pages);
36 return ERR_PTR(rc);
37}
38EXPORT_SYMBOL(ceph_get_direct_page_vector);
39
40void ceph_put_page_vector(struct page **pages, int num_pages)
41{
42 int i;
43
44 for (i = 0; i < num_pages; i++)
45 put_page(pages[i]);
46 kfree(pages);
47}
48EXPORT_SYMBOL(ceph_put_page_vector);
49
50void ceph_release_page_vector(struct page **pages, int num_pages)
51{
52 int i;
53
54 for (i = 0; i < num_pages; i++)
55 __free_pages(pages[i], 0);
56 kfree(pages);
57}
58EXPORT_SYMBOL(ceph_release_page_vector);
59
60/*
61 * allocate a vector of new pages
62 */
63struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
64{
65 struct page **pages;
66 int i;
67
68 pages = kmalloc(sizeof(*pages) * num_pages, flags);
69 if (!pages)
70 return ERR_PTR(-ENOMEM);
71 for (i = 0; i < num_pages; i++) {
72 pages[i] = __page_cache_alloc(flags);
73 if (pages[i] == NULL) {
74 ceph_release_page_vector(pages, i);
75 return ERR_PTR(-ENOMEM);
76 }
77 }
78 return pages;
79}
80EXPORT_SYMBOL(ceph_alloc_page_vector);
81
82/*
83 * copy user data into a page vector
84 */
85int ceph_copy_user_to_page_vector(struct page **pages,
86 const char __user *data,
87 loff_t off, size_t len)
88{
89 int i = 0;
90 int po = off & ~PAGE_CACHE_MASK;
91 int left = len;
92 int l, bad;
93
94 while (left > 0) {
95 l = min_t(int, PAGE_CACHE_SIZE-po, left);
96 bad = copy_from_user(page_address(pages[i]) + po, data, l);
97 if (bad == l)
98 return -EFAULT;
99 data += l - bad;
100 left -= l - bad;
101 po += l - bad;
102 if (po == PAGE_CACHE_SIZE) {
103 po = 0;
104 i++;
105 }
106 }
107 return len;
108}
109EXPORT_SYMBOL(ceph_copy_user_to_page_vector);
110
111int ceph_copy_to_page_vector(struct page **pages,
112 const char *data,
113 loff_t off, size_t len)
114{
115 int i = 0;
116 size_t po = off & ~PAGE_CACHE_MASK;
117 size_t left = len;
118 size_t l;
119
120 while (left > 0) {
121 l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
122 memcpy(page_address(pages[i]) + po, data, l);
123 data += l;
124 left -= l;
125 po += l;
126 if (po == PAGE_CACHE_SIZE) {
127 po = 0;
128 i++;
129 }
130 }
131 return len;
132}
133EXPORT_SYMBOL(ceph_copy_to_page_vector);
134
135int ceph_copy_from_page_vector(struct page **pages,
136 char *data,
137 loff_t off, size_t len)
138{
139 int i = 0;
140 size_t po = off & ~PAGE_CACHE_MASK;
141 size_t left = len;
142 size_t l;
143
144 while (left > 0) {
145 l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
146 memcpy(data, page_address(pages[i]) + po, l);
147 data += l;
148 left -= l;
149 po += l;
150 if (po == PAGE_CACHE_SIZE) {
151 po = 0;
152 i++;
153 }
154 }
155 return len;
156}
157EXPORT_SYMBOL(ceph_copy_from_page_vector);
158
159/*
160 * copy user data from a page vector into a user pointer
161 */
162int ceph_copy_page_vector_to_user(struct page **pages,
163 char __user *data,
164 loff_t off, size_t len)
165{
166 int i = 0;
167 int po = off & ~PAGE_CACHE_MASK;
168 int left = len;
169 int l, bad;
170
171 while (left > 0) {
172 l = min_t(int, left, PAGE_CACHE_SIZE-po);
173 bad = copy_to_user(data, page_address(pages[i]) + po, l);
174 if (bad == l)
175 return -EFAULT;
176 data += l - bad;
177 left -= l - bad;
178 if (po) {
179 po += l - bad;
180 if (po == PAGE_CACHE_SIZE)
181 po = 0;
182 }
183 i++;
184 }
185 return len;
186}
187EXPORT_SYMBOL(ceph_copy_page_vector_to_user);
188
189/*
190 * Zero an extent within a page vector. Offset is relative to the
191 * start of the first page.
192 */
193void ceph_zero_page_vector_range(int off, int len, struct page **pages)
194{
195 int i = off >> PAGE_CACHE_SHIFT;
196
197 off &= ~PAGE_CACHE_MASK;
198
199 dout("zero_page_vector_page %u~%u\n", off, len);
200
201 /* leading partial page? */
202 if (off) {
203 int end = min((int)PAGE_CACHE_SIZE, off + len);
204 dout("zeroing %d %p head from %d\n", i, pages[i],
205 (int)off);
206 zero_user_segment(pages[i], off, end);
207 len -= (end - off);
208 i++;
209 }
210 while (len >= PAGE_CACHE_SIZE) {
211 dout("zeroing %d %p len=%d\n", i, pages[i], len);
212 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
213 len -= PAGE_CACHE_SIZE;
214 i++;
215 }
216 /* trailing partial page? */
217 if (len) {
218 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
219 zero_user_segment(pages[i], 0, len);
220 }
221}
222EXPORT_SYMBOL(ceph_zero_page_vector_range);
223
diff --git a/net/compat.c b/net/compat.c
index 63d260e81472..3649d5895361 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -41,10 +41,12 @@ static inline int iov_from_user_compat_to_kern(struct iovec *kiov,
41 compat_size_t len; 41 compat_size_t len;
42 42
43 if (get_user(len, &uiov32->iov_len) || 43 if (get_user(len, &uiov32->iov_len) ||
44 get_user(buf, &uiov32->iov_base)) { 44 get_user(buf, &uiov32->iov_base))
45 tot_len = -EFAULT; 45 return -EFAULT;
46 break; 46
47 } 47 if (len > INT_MAX - tot_len)
48 len = INT_MAX - tot_len;
49
48 tot_len += len; 50 tot_len += len;
49 kiov->iov_base = compat_ptr(buf); 51 kiov->iov_base = compat_ptr(buf);
50 kiov->iov_len = (__kernel_size_t) len; 52 kiov->iov_len = (__kernel_size_t) len;
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 251997a95483..cd1e039c8755 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -243,6 +243,7 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
243 unlock_sock_fast(sk, slow); 243 unlock_sock_fast(sk, slow);
244 244
245 /* skb is now orphaned, can be freed outside of locked section */ 245 /* skb is now orphaned, can be freed outside of locked section */
246 trace_kfree_skb(skb, skb_free_datagram_locked);
246 __kfree_skb(skb); 247 __kfree_skb(skb);
247} 248}
248EXPORT_SYMBOL(skb_free_datagram_locked); 249EXPORT_SYMBOL(skb_free_datagram_locked);
@@ -746,13 +747,12 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
746 if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) 747 if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
747 mask |= POLLERR; 748 mask |= POLLERR;
748 if (sk->sk_shutdown & RCV_SHUTDOWN) 749 if (sk->sk_shutdown & RCV_SHUTDOWN)
749 mask |= POLLRDHUP; 750 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
750 if (sk->sk_shutdown == SHUTDOWN_MASK) 751 if (sk->sk_shutdown == SHUTDOWN_MASK)
751 mask |= POLLHUP; 752 mask |= POLLHUP;
752 753
753 /* readable? */ 754 /* readable? */
754 if (!skb_queue_empty(&sk->sk_receive_queue) || 755 if (!skb_queue_empty(&sk->sk_receive_queue))
755 (sk->sk_shutdown & RCV_SHUTDOWN))
756 mask |= POLLIN | POLLRDNORM; 756 mask |= POLLIN | POLLRDNORM;
757 757
758 /* Connection-based need to check for termination and startup */ 758 /* Connection-based need to check for termination and startup */
diff --git a/net/core/dev.c b/net/core/dev.c
index 1ae654391442..35dfb8318483 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -128,7 +128,10 @@
128#include <linux/jhash.h> 128#include <linux/jhash.h>
129#include <linux/random.h> 129#include <linux/random.h>
130#include <trace/events/napi.h> 130#include <trace/events/napi.h>
131#include <trace/events/net.h>
132#include <trace/events/skb.h>
131#include <linux/pci.h> 133#include <linux/pci.h>
134#include <linux/inetdevice.h>
132 135
133#include "net-sysfs.h" 136#include "net-sysfs.h"
134 137
@@ -371,6 +374,14 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
371 * --ANK (980803) 374 * --ANK (980803)
372 */ 375 */
373 376
377static inline struct list_head *ptype_head(const struct packet_type *pt)
378{
379 if (pt->type == htons(ETH_P_ALL))
380 return &ptype_all;
381 else
382 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
383}
384
374/** 385/**
375 * dev_add_pack - add packet handler 386 * dev_add_pack - add packet handler
376 * @pt: packet type declaration 387 * @pt: packet type declaration
@@ -386,16 +397,11 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
386 397
387void dev_add_pack(struct packet_type *pt) 398void dev_add_pack(struct packet_type *pt)
388{ 399{
389 int hash; 400 struct list_head *head = ptype_head(pt);
390 401
391 spin_lock_bh(&ptype_lock); 402 spin_lock(&ptype_lock);
392 if (pt->type == htons(ETH_P_ALL)) 403 list_add_rcu(&pt->list, head);
393 list_add_rcu(&pt->list, &ptype_all); 404 spin_unlock(&ptype_lock);
394 else {
395 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
396 list_add_rcu(&pt->list, &ptype_base[hash]);
397 }
398 spin_unlock_bh(&ptype_lock);
399} 405}
400EXPORT_SYMBOL(dev_add_pack); 406EXPORT_SYMBOL(dev_add_pack);
401 407
@@ -414,15 +420,10 @@ EXPORT_SYMBOL(dev_add_pack);
414 */ 420 */
415void __dev_remove_pack(struct packet_type *pt) 421void __dev_remove_pack(struct packet_type *pt)
416{ 422{
417 struct list_head *head; 423 struct list_head *head = ptype_head(pt);
418 struct packet_type *pt1; 424 struct packet_type *pt1;
419 425
420 spin_lock_bh(&ptype_lock); 426 spin_lock(&ptype_lock);
421
422 if (pt->type == htons(ETH_P_ALL))
423 head = &ptype_all;
424 else
425 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
426 427
427 list_for_each_entry(pt1, head, list) { 428 list_for_each_entry(pt1, head, list) {
428 if (pt == pt1) { 429 if (pt == pt1) {
@@ -433,7 +434,7 @@ void __dev_remove_pack(struct packet_type *pt)
433 434
434 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); 435 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
435out: 436out:
436 spin_unlock_bh(&ptype_lock); 437 spin_unlock(&ptype_lock);
437} 438}
438EXPORT_SYMBOL(__dev_remove_pack); 439EXPORT_SYMBOL(__dev_remove_pack);
439 440
@@ -1484,8 +1485,9 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1484 skb_orphan(skb); 1485 skb_orphan(skb);
1485 nf_reset(skb); 1486 nf_reset(skb);
1486 1487
1487 if (!(dev->flags & IFF_UP) || 1488 if (unlikely(!(dev->flags & IFF_UP) ||
1488 (skb->len > (dev->mtu + dev->hard_header_len))) { 1489 (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
1490 atomic_long_inc(&dev->rx_dropped);
1489 kfree_skb(skb); 1491 kfree_skb(skb);
1490 return NET_RX_DROP; 1492 return NET_RX_DROP;
1491 } 1493 }
@@ -1553,21 +1555,56 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1553 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues 1555 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1554 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed. 1556
1555 */ 1557 */
1556void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) 1558int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1557{ 1559{
1558 unsigned int real_num = dev->real_num_tx_queues; 1560 if (txq < 1 || txq > dev->num_tx_queues)
1561 return -EINVAL;
1559 1562
1560 if (unlikely(txq > dev->num_tx_queues)) 1563 if (dev->reg_state == NETREG_REGISTERED) {
1561 ; 1564 ASSERT_RTNL();
1562 else if (txq > real_num) 1565
1563 dev->real_num_tx_queues = txq; 1566 if (txq < dev->real_num_tx_queues)
1564 else if (txq < real_num) { 1567 qdisc_reset_all_tx_gt(dev, txq);
1565 dev->real_num_tx_queues = txq;
1566 qdisc_reset_all_tx_gt(dev, txq);
1567 } 1568 }
1569
1570 dev->real_num_tx_queues = txq;
1571 return 0;
1568} 1572}
1569EXPORT_SYMBOL(netif_set_real_num_tx_queues); 1573EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1570 1574
1575#ifdef CONFIG_RPS
1576/**
1577 * netif_set_real_num_rx_queues - set actual number of RX queues used
1578 * @dev: Network device
1579 * @rxq: Actual number of RX queues
1580 *
1581 * This must be called either with the rtnl_lock held or before
1582 * registration of the net device. Returns 0 on success, or a
1583 * negative error code. If called before registration, it always
1584 * succeeds.
1585 */
1586int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1587{
1588 int rc;
1589
1590 if (rxq < 1 || rxq > dev->num_rx_queues)
1591 return -EINVAL;
1592
1593 if (dev->reg_state == NETREG_REGISTERED) {
1594 ASSERT_RTNL();
1595
1596 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1597 rxq);
1598 if (rc)
1599 return rc;
1600 }
1601
1602 dev->real_num_rx_queues = rxq;
1603 return 0;
1604}
1605EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1606#endif
1607
1571static inline void __netif_reschedule(struct Qdisc *q) 1608static inline void __netif_reschedule(struct Qdisc *q)
1572{ 1609{
1573 struct softnet_data *sd; 1610 struct softnet_data *sd;
@@ -1648,10 +1685,10 @@ EXPORT_SYMBOL(netif_device_attach);
1648 1685
1649static bool can_checksum_protocol(unsigned long features, __be16 protocol) 1686static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1650{ 1687{
1651 return ((features & NETIF_F_GEN_CSUM) || 1688 return ((features & NETIF_F_NO_CSUM) ||
1652 ((features & NETIF_F_IP_CSUM) && 1689 ((features & NETIF_F_V4_CSUM) &&
1653 protocol == htons(ETH_P_IP)) || 1690 protocol == htons(ETH_P_IP)) ||
1654 ((features & NETIF_F_IPV6_CSUM) && 1691 ((features & NETIF_F_V6_CSUM) &&
1655 protocol == htons(ETH_P_IPV6)) || 1692 protocol == htons(ETH_P_IPV6)) ||
1656 ((features & NETIF_F_FCOE_CRC) && 1693 ((features & NETIF_F_FCOE_CRC) &&
1657 protocol == htons(ETH_P_FCOE))); 1694 protocol == htons(ETH_P_FCOE)));
@@ -1659,17 +1696,18 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1659 1696
1660static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) 1697static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1661{ 1698{
1662 if (can_checksum_protocol(dev->features, skb->protocol)) 1699 __be16 protocol = skb->protocol;
1663 return true; 1700 int features = dev->features;
1664 1701
1665 if (skb->protocol == htons(ETH_P_8021Q)) { 1702 if (vlan_tx_tag_present(skb)) {
1703 features &= dev->vlan_features;
1704 } else if (protocol == htons(ETH_P_8021Q)) {
1666 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; 1705 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1667 if (can_checksum_protocol(dev->features & dev->vlan_features, 1706 protocol = veh->h_vlan_encapsulated_proto;
1668 veh->h_vlan_encapsulated_proto)) 1707 features &= dev->vlan_features;
1669 return true;
1670 } 1708 }
1671 1709
1672 return false; 1710 return can_checksum_protocol(features, protocol);
1673} 1711}
1674 1712
1675/** 1713/**
@@ -1758,6 +1796,16 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1758 __be16 type = skb->protocol; 1796 __be16 type = skb->protocol;
1759 int err; 1797 int err;
1760 1798
1799 if (type == htons(ETH_P_8021Q)) {
1800 struct vlan_ethhdr *veh;
1801
1802 if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
1803 return ERR_PTR(-EINVAL);
1804
1805 veh = (struct vlan_ethhdr *)skb->data;
1806 type = veh->h_vlan_encapsulated_proto;
1807 }
1808
1761 skb_reset_mac_header(skb); 1809 skb_reset_mac_header(skb);
1762 skb->mac_len = skb->network_header - skb->mac_header; 1810 skb->mac_len = skb->network_header - skb->mac_header;
1763 __skb_pull(skb, skb->mac_len); 1811 __skb_pull(skb, skb->mac_len);
@@ -1902,14 +1950,14 @@ static int dev_gso_segment(struct sk_buff *skb)
1902 1950
1903/* 1951/*
1904 * Try to orphan skb early, right before transmission by the device. 1952 * Try to orphan skb early, right before transmission by the device.
1905 * We cannot orphan skb if tx timestamp is requested, since 1953 * We cannot orphan skb if tx timestamp is requested or the sk-reference
1906 * drivers need to call skb_tstamp_tx() to send the timestamp. 1954 * is needed on driver level for other reasons, e.g. see net/can/raw.c
1907 */ 1955 */
1908static inline void skb_orphan_try(struct sk_buff *skb) 1956static inline void skb_orphan_try(struct sk_buff *skb)
1909{ 1957{
1910 struct sock *sk = skb->sk; 1958 struct sock *sk = skb->sk;
1911 1959
1912 if (sk && !skb_tx(skb)->flags) { 1960 if (sk && !skb_shinfo(skb)->tx_flags) {
1913 /* skb_tx_hash() won't be able to get sk. 1961
1914 * We copy sk_hash into skb->rxhash 1962 * We copy sk_hash into skb->rxhash
1915 */ 1963 */
@@ -1929,9 +1977,14 @@ static inline void skb_orphan_try(struct sk_buff *skb)
1929static inline int skb_needs_linearize(struct sk_buff *skb, 1977static inline int skb_needs_linearize(struct sk_buff *skb,
1930 struct net_device *dev) 1978 struct net_device *dev)
1931{ 1979{
1980 int features = dev->features;
1981
1982 if (skb->protocol == htons(ETH_P_8021Q) || vlan_tx_tag_present(skb))
1983 features &= dev->vlan_features;
1984
1932 return skb_is_nonlinear(skb) && 1985 return skb_is_nonlinear(skb) &&
1933 ((skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) || 1986 ((skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST)) ||
1934 (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) || 1987 (skb_shinfo(skb)->nr_frags && (!(features & NETIF_F_SG) ||
1935 illegal_highdma(dev, skb)))); 1988 illegal_highdma(dev, skb))));
1936} 1989}
1937 1990
@@ -1954,6 +2007,15 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1954 2007
1955 skb_orphan_try(skb); 2008 skb_orphan_try(skb);
1956 2009
2010 if (vlan_tx_tag_present(skb) &&
2011 !(dev->features & NETIF_F_HW_VLAN_TX)) {
2012 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2013 if (unlikely(!skb))
2014 goto out;
2015
2016 skb->vlan_tci = 0;
2017 }
2018
1957 if (netif_needs_gso(dev, skb)) { 2019 if (netif_needs_gso(dev, skb)) {
1958 if (unlikely(dev_gso_segment(skb))) 2020 if (unlikely(dev_gso_segment(skb)))
1959 goto out_kfree_skb; 2021 goto out_kfree_skb;
@@ -1978,6 +2040,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1978 } 2040 }
1979 2041
1980 rc = ops->ndo_start_xmit(skb, dev); 2042 rc = ops->ndo_start_xmit(skb, dev);
2043 trace_net_dev_xmit(skb, rc);
1981 if (rc == NETDEV_TX_OK) 2044 if (rc == NETDEV_TX_OK)
1982 txq_trans_update(txq); 2045 txq_trans_update(txq);
1983 return rc; 2046 return rc;
@@ -1998,6 +2061,7 @@ gso:
1998 skb_dst_drop(nskb); 2061 skb_dst_drop(nskb);
1999 2062
2000 rc = ops->ndo_start_xmit(nskb, dev); 2063 rc = ops->ndo_start_xmit(nskb, dev);
2064 trace_net_dev_xmit(nskb, rc);
2001 if (unlikely(rc != NETDEV_TX_OK)) { 2065 if (unlikely(rc != NETDEV_TX_OK)) {
2002 if (rc & ~NETDEV_TX_MASK) 2066 if (rc & ~NETDEV_TX_MASK)
2003 goto out_kfree_gso_skb; 2067 goto out_kfree_gso_skb;
@@ -2015,6 +2079,7 @@ out_kfree_gso_skb:
2015 skb->destructor = DEV_GSO_CB(skb)->destructor; 2079 skb->destructor = DEV_GSO_CB(skb)->destructor;
2016out_kfree_skb: 2080out_kfree_skb:
2017 kfree_skb(skb); 2081 kfree_skb(skb);
2082out:
2018 return rc; 2083 return rc;
2019} 2084}
2020 2085
@@ -2058,16 +2123,16 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2058 struct sk_buff *skb) 2123 struct sk_buff *skb)
2059{ 2124{
2060 int queue_index; 2125 int queue_index;
2061 struct sock *sk = skb->sk; 2126 const struct net_device_ops *ops = dev->netdev_ops;
2062 2127
2063 queue_index = sk_tx_queue_get(sk); 2128 if (ops->ndo_select_queue) {
2064 if (queue_index < 0) { 2129 queue_index = ops->ndo_select_queue(dev, skb);
2065 const struct net_device_ops *ops = dev->netdev_ops; 2130 queue_index = dev_cap_txqueue(dev, queue_index);
2131 } else {
2132 struct sock *sk = skb->sk;
2133 queue_index = sk_tx_queue_get(sk);
2134 if (queue_index < 0) {
2066 2135
2067 if (ops->ndo_select_queue) {
2068 queue_index = ops->ndo_select_queue(dev, skb);
2069 queue_index = dev_cap_txqueue(dev, queue_index);
2070 } else {
2071 queue_index = 0; 2136 queue_index = 0;
2072 if (dev->real_num_tx_queues > 1) 2137 if (dev->real_num_tx_queues > 1)
2073 queue_index = skb_tx_hash(dev, skb); 2138 queue_index = skb_tx_hash(dev, skb);
@@ -2143,6 +2208,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2143 return rc; 2208 return rc;
2144} 2209}
2145 2210
2211static DEFINE_PER_CPU(int, xmit_recursion);
2212#define RECURSION_LIMIT 10
2213
2146/** 2214/**
2147 * dev_queue_xmit - transmit a buffer 2215 * dev_queue_xmit - transmit a buffer
2148 * @skb: buffer to transmit 2216 * @skb: buffer to transmit
@@ -2186,6 +2254,7 @@ int dev_queue_xmit(struct sk_buff *skb)
2186#ifdef CONFIG_NET_CLS_ACT 2254#ifdef CONFIG_NET_CLS_ACT
2187 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); 2255 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2188#endif 2256#endif
2257 trace_net_dev_queue(skb);
2189 if (q->enqueue) { 2258 if (q->enqueue) {
2190 rc = __dev_xmit_skb(skb, q, dev, txq); 2259 rc = __dev_xmit_skb(skb, q, dev, txq);
2191 goto out; 2260 goto out;
@@ -2208,10 +2277,15 @@ int dev_queue_xmit(struct sk_buff *skb)
2208 2277
2209 if (txq->xmit_lock_owner != cpu) { 2278 if (txq->xmit_lock_owner != cpu) {
2210 2279
2280 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2281 goto recursion_alert;
2282
2211 HARD_TX_LOCK(dev, txq, cpu); 2283 HARD_TX_LOCK(dev, txq, cpu);
2212 2284
2213 if (!netif_tx_queue_stopped(txq)) { 2285 if (!netif_tx_queue_stopped(txq)) {
2286 __this_cpu_inc(xmit_recursion);
2214 rc = dev_hard_start_xmit(skb, dev, txq); 2287 rc = dev_hard_start_xmit(skb, dev, txq);
2288 __this_cpu_dec(xmit_recursion);
2215 if (dev_xmit_complete(rc)) { 2289 if (dev_xmit_complete(rc)) {
2216 HARD_TX_UNLOCK(dev, txq); 2290 HARD_TX_UNLOCK(dev, txq);
2217 goto out; 2291 goto out;
@@ -2223,7 +2297,9 @@ int dev_queue_xmit(struct sk_buff *skb)
2223 "queue packet!\n", dev->name); 2297 "queue packet!\n", dev->name);
2224 } else { 2298 } else {
2225 /* Recursion is detected! It is possible, 2299 /* Recursion is detected! It is possible,
2226 * unfortunately */ 2300 * unfortunately
2301 */
2302recursion_alert:
2227 if (net_ratelimit()) 2303 if (net_ratelimit())
2228 printk(KERN_CRIT "Dead loop on virtual device " 2304 printk(KERN_CRIT "Dead loop on virtual device "
2229 "%s, fix it urgently!\n", dev->name); 2305 "%s, fix it urgently!\n", dev->name);
@@ -2259,69 +2335,44 @@ static inline void ____napi_schedule(struct softnet_data *sd,
2259 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 2335 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2260} 2336}
2261 2337
2262#ifdef CONFIG_RPS
2263
2264/* One global table that all flow-based protocols share. */
2265struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
2266EXPORT_SYMBOL(rps_sock_flow_table);
2267
2268/* 2338/*
2269 * get_rps_cpu is called from netif_receive_skb and returns the target 2339 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2270 * CPU from the RPS map of the receiving queue for a given skb. 2340 * and src/dst port numbers. Returns a non-zero hash number on success
2271 * rcu_read_lock must be held on entry. 2341 * and 0 on failure.
2272 */ 2342 */
2273static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, 2343__u32 __skb_get_rxhash(struct sk_buff *skb)
2274 struct rps_dev_flow **rflowp)
2275{ 2344{
2345 int nhoff, hash = 0, poff;
2276 struct ipv6hdr *ip6; 2346 struct ipv6hdr *ip6;
2277 struct iphdr *ip; 2347 struct iphdr *ip;
2278 struct netdev_rx_queue *rxqueue;
2279 struct rps_map *map;
2280 struct rps_dev_flow_table *flow_table;
2281 struct rps_sock_flow_table *sock_flow_table;
2282 int cpu = -1;
2283 u8 ip_proto; 2348 u8 ip_proto;
2284 u16 tcpu;
2285 u32 addr1, addr2, ihl; 2349 u32 addr1, addr2, ihl;
2286 union { 2350 union {
2287 u32 v32; 2351 u32 v32;
2288 u16 v16[2]; 2352 u16 v16[2];
2289 } ports; 2353 } ports;
2290 2354
2291 if (skb_rx_queue_recorded(skb)) { 2355 nhoff = skb_network_offset(skb);
2292 u16 index = skb_get_rx_queue(skb);
2293 if (unlikely(index >= dev->num_rx_queues)) {
2294 WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
2295 "on queue %u, but number of RX queues is %u\n",
2296 dev->name, index, dev->num_rx_queues);
2297 goto done;
2298 }
2299 rxqueue = dev->_rx + index;
2300 } else
2301 rxqueue = dev->_rx;
2302
2303 if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
2304 goto done;
2305
2306 if (skb->rxhash)
2307 goto got_hash; /* Skip hash computation on packet header */
2308 2356
2309 switch (skb->protocol) { 2357 switch (skb->protocol) {
2310 case __constant_htons(ETH_P_IP): 2358 case __constant_htons(ETH_P_IP):
2311 if (!pskb_may_pull(skb, sizeof(*ip))) 2359 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2312 goto done; 2360 goto done;
2313 2361
2314 ip = (struct iphdr *) skb->data; 2362 ip = (struct iphdr *) (skb->data + nhoff);
2315 ip_proto = ip->protocol; 2363 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2364 ip_proto = 0;
2365 else
2366 ip_proto = ip->protocol;
2316 addr1 = (__force u32) ip->saddr; 2367 addr1 = (__force u32) ip->saddr;
2317 addr2 = (__force u32) ip->daddr; 2368 addr2 = (__force u32) ip->daddr;
2318 ihl = ip->ihl; 2369 ihl = ip->ihl;
2319 break; 2370 break;
2320 case __constant_htons(ETH_P_IPV6): 2371 case __constant_htons(ETH_P_IPV6):
2321 if (!pskb_may_pull(skb, sizeof(*ip6))) 2372 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2322 goto done; 2373 goto done;
2323 2374
2324 ip6 = (struct ipv6hdr *) skb->data; 2375 ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2325 ip_proto = ip6->nexthdr; 2376 ip_proto = ip6->nexthdr;
2326 addr1 = (__force u32) ip6->saddr.s6_addr32[3]; 2377 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2327 addr2 = (__force u32) ip6->daddr.s6_addr32[3]; 2378 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
@@ -2330,33 +2381,81 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2330 default: 2381 default:
2331 goto done; 2382 goto done;
2332 } 2383 }
2333 switch (ip_proto) { 2384
2334 case IPPROTO_TCP: 2385 ports.v32 = 0;
2335 case IPPROTO_UDP: 2386 poff = proto_ports_offset(ip_proto);
2336 case IPPROTO_DCCP: 2387 if (poff >= 0) {
2337 case IPPROTO_ESP: 2388 nhoff += ihl * 4 + poff;
2338 case IPPROTO_AH: 2389 if (pskb_may_pull(skb, nhoff + 4)) {
2339 case IPPROTO_SCTP: 2390 ports.v32 = * (__force u32 *) (skb->data + nhoff);
2340 case IPPROTO_UDPLITE:
2341 if (pskb_may_pull(skb, (ihl * 4) + 4)) {
2342 ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
2343 if (ports.v16[1] < ports.v16[0]) 2391 if (ports.v16[1] < ports.v16[0])
2344 swap(ports.v16[0], ports.v16[1]); 2392 swap(ports.v16[0], ports.v16[1]);
2345 break;
2346 } 2393 }
2347 default:
2348 ports.v32 = 0;
2349 break;
2350 } 2394 }
2351 2395
2352 /* get a consistent hash (same value on both flow directions) */ 2396 /* get a consistent hash (same value on both flow directions) */
2353 if (addr2 < addr1) 2397 if (addr2 < addr1)
2354 swap(addr1, addr2); 2398 swap(addr1, addr2);
2355 skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2356 if (!skb->rxhash)
2357 skb->rxhash = 1;
2358 2399
2359got_hash: 2400 hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2401 if (!hash)
2402 hash = 1;
2403
2404done:
2405 return hash;
2406}
2407EXPORT_SYMBOL(__skb_get_rxhash);
2408
2409#ifdef CONFIG_RPS
2410
2411/* One global table that all flow-based protocols share. */
2412struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2413EXPORT_SYMBOL(rps_sock_flow_table);
2414
2415/*
2416 * get_rps_cpu is called from netif_receive_skb and returns the target
2417 * CPU from the RPS map of the receiving queue for a given skb.
2418 * rcu_read_lock must be held on entry.
2419 */
2420static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2421 struct rps_dev_flow **rflowp)
2422{
2423 struct netdev_rx_queue *rxqueue;
2424 struct rps_map *map;
2425 struct rps_dev_flow_table *flow_table;
2426 struct rps_sock_flow_table *sock_flow_table;
2427 int cpu = -1;
2428 u16 tcpu;
2429
2430 if (skb_rx_queue_recorded(skb)) {
2431 u16 index = skb_get_rx_queue(skb);
2432 if (unlikely(index >= dev->real_num_rx_queues)) {
2433 WARN_ONCE(dev->real_num_rx_queues > 1,
2434 "%s received packet on queue %u, but number "
2435 "of RX queues is %u\n",
2436 dev->name, index, dev->real_num_rx_queues);
2437 goto done;
2438 }
2439 rxqueue = dev->_rx + index;
2440 } else
2441 rxqueue = dev->_rx;
2442
2443 map = rcu_dereference(rxqueue->rps_map);
2444 if (map) {
2445 if (map->len == 1) {
2446 tcpu = map->cpus[0];
2447 if (cpu_online(tcpu))
2448 cpu = tcpu;
2449 goto done;
2450 }
2451 } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2452 goto done;
2453 }
2454
2455 skb_reset_network_header(skb);
2456 if (!skb_get_rxhash(skb))
2457 goto done;
2458
2360 flow_table = rcu_dereference(rxqueue->rps_flow_table); 2459 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2361 sock_flow_table = rcu_dereference(rps_sock_flow_table); 2460 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2362 if (flow_table && sock_flow_table) { 2461 if (flow_table && sock_flow_table) {
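For reference, a minimal user-space sketch of the direction-independent hashing that __skb_get_rxhash() performs in the hunk above: addresses and ports are put into a canonical order before mixing, so the two directions of one flow hash identically, and 0 is reserved to mean "no hash computed". The mixer below is only a stand-in for the kernel's jhash_3words()/hashrnd, and all names are illustrative.

#include <stdint.h>
#include <stdio.h>

/* stand-in mixer; the kernel uses jhash_3words() seeded with hashrnd */
static uint32_t mix3(uint32_t a, uint32_t b, uint32_t c, uint32_t seed)
{
        uint32_t h = seed ^ a;

        h = (h ^ b) * 0x9e3779b1u;
        h ^= h >> 16;
        h = (h ^ c) * 0x85ebca6bu;
        h ^= h >> 13;
        return h;
}

static uint32_t flow_hash(uint32_t saddr, uint32_t daddr,
                          uint16_t sport, uint16_t dport, uint32_t seed)
{
        uint32_t a1 = saddr, a2 = daddr, h;
        uint16_t p1 = sport, p2 = dport;

        if (a2 < a1) { uint32_t t = a1; a1 = a2; a2 = t; }  /* canonical address order */
        if (p2 < p1) { uint16_t t = p1; p1 = p2; p2 = t; }  /* canonical port order */

        h = mix3(a1, a2, ((uint32_t)p1 << 16) | p2, seed);
        return h ? h : 1;       /* 0 is reserved for "hash not computed" */
}

int main(void)
{
        uint32_t seed = 0x2545f491;

        /* both directions of the same flow print the same value */
        printf("%08x\n", flow_hash(0x0a000001, 0x0a000002, 40000, 80, seed));
        printf("%08x\n", flow_hash(0x0a000002, 0x0a000001, 80, 40000, seed));
        return 0;
}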
@@ -2396,7 +2495,6 @@ got_hash:
2396 } 2495 }
2397 } 2496 }
2398 2497
2399 map = rcu_dereference(rxqueue->rps_map);
2400 if (map) { 2498 if (map) {
2401 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; 2499 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2402 2500
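The CPU pick in the hunk above uses a multiply-shift instead of a modulo: (hash * len) >> 32 maps a 32-bit hash roughly uniformly onto indexes 0..len-1 with a single multiplication. A small stand-alone check of that mapping (illustrative code, not the kernel's):

#include <stdint.h>
#include <stdio.h>

static unsigned int pick_index(uint32_t hash, unsigned int len)
{
        return (unsigned int)(((uint64_t)hash * len) >> 32);
}

int main(void)
{
        unsigned int counts[4] = { 0 };
        uint32_t h = 1;
        unsigned int i;

        /* crude uniformity check over a pseudo-random "rxhash" stream */
        for (i = 0; i < 1000000; i++) {
                h = h * 1664525u + 1013904223u;  /* LCG stand-in for skb->rxhash */
                counts[pick_index(h, 4)]++;
        }
        for (i = 0; i < 4; i++)
                printf("bucket %u: %u\n", i, counts[i]);
        return 0;
}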
@@ -2482,6 +2580,7 @@ enqueue:
2482 2580
2483 local_irq_restore(flags); 2581 local_irq_restore(flags);
2484 2582
2583 atomic_long_inc(&skb->dev->rx_dropped);
2485 kfree_skb(skb); 2584 kfree_skb(skb);
2486 return NET_RX_DROP; 2585 return NET_RX_DROP;
2487} 2586}
@@ -2512,6 +2611,7 @@ int netif_rx(struct sk_buff *skb)
2512 if (netdev_tstamp_prequeue) 2611 if (netdev_tstamp_prequeue)
2513 net_timestamp_check(skb); 2612 net_timestamp_check(skb);
2514 2613
2614 trace_netif_rx(skb);
2515#ifdef CONFIG_RPS 2615#ifdef CONFIG_RPS
2516 { 2616 {
2517 struct rps_dev_flow voidflow, *rflow = &voidflow; 2617 struct rps_dev_flow voidflow, *rflow = &voidflow;
@@ -2571,6 +2671,7 @@ static void net_tx_action(struct softirq_action *h)
2571 clist = clist->next; 2671 clist = clist->next;
2572 2672
2573 WARN_ON(atomic_read(&skb->users)); 2673 WARN_ON(atomic_read(&skb->users));
2674 trace_kfree_skb(skb, net_tx_action);
2574 __kfree_skb(skb); 2675 __kfree_skb(skb);
2575 } 2676 }
2576 } 2677 }
@@ -2636,11 +2737,10 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2636 * the ingress scheduler, you just cant add policies on ingress. 2737 * the ingress scheduler, you just cant add policies on ingress.
2637 * 2738 *
2638 */ 2739 */
2639static int ing_filter(struct sk_buff *skb) 2740static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2640{ 2741{
2641 struct net_device *dev = skb->dev; 2742 struct net_device *dev = skb->dev;
2642 u32 ttl = G_TC_RTTL(skb->tc_verd); 2743 u32 ttl = G_TC_RTTL(skb->tc_verd);
2643 struct netdev_queue *rxq;
2644 int result = TC_ACT_OK; 2744 int result = TC_ACT_OK;
2645 struct Qdisc *q; 2745 struct Qdisc *q;
2646 2746
@@ -2654,8 +2754,6 @@ static int ing_filter(struct sk_buff *skb)
2654 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); 2754 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2655 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); 2755 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2656 2756
2657 rxq = &dev->rx_queue;
2658
2659 q = rxq->qdisc; 2757 q = rxq->qdisc;
2660 if (q != &noop_qdisc) { 2758 if (q != &noop_qdisc) {
2661 spin_lock(qdisc_lock(q)); 2759 spin_lock(qdisc_lock(q));
@@ -2671,7 +2769,9 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2671 struct packet_type **pt_prev, 2769 struct packet_type **pt_prev,
2672 int *ret, struct net_device *orig_dev) 2770 int *ret, struct net_device *orig_dev)
2673{ 2771{
2674 if (skb->dev->rx_queue.qdisc == &noop_qdisc) 2772 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
2773
2774 if (!rxq || rxq->qdisc == &noop_qdisc)
2675 goto out; 2775 goto out;
2676 2776
2677 if (*pt_prev) { 2777 if (*pt_prev) {
@@ -2679,7 +2779,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2679 *pt_prev = NULL; 2779 *pt_prev = NULL;
2680 } 2780 }
2681 2781
2682 switch (ing_filter(skb)) { 2782 switch (ing_filter(skb, rxq)) {
2683 case TC_ACT_SHOT: 2783 case TC_ACT_SHOT:
2684 case TC_ACT_STOLEN: 2784 case TC_ACT_STOLEN:
2685 kfree_skb(skb); 2785 kfree_skb(skb);
@@ -2692,33 +2792,6 @@ out:
2692} 2792}
2693#endif 2793#endif
2694 2794
2695/*
2696 * netif_nit_deliver - deliver received packets to network taps
2697 * @skb: buffer
2698 *
2699 * This function is used to deliver incoming packets to network
2700 * taps. It should be used when the normal netif_receive_skb path
2701 * is bypassed, for example because of VLAN acceleration.
2702 */
2703void netif_nit_deliver(struct sk_buff *skb)
2704{
2705 struct packet_type *ptype;
2706
2707 if (list_empty(&ptype_all))
2708 return;
2709
2710 skb_reset_network_header(skb);
2711 skb_reset_transport_header(skb);
2712 skb->mac_len = skb->network_header - skb->mac_header;
2713
2714 rcu_read_lock();
2715 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2716 if (!ptype->dev || ptype->dev == skb->dev)
2717 deliver_skb(skb, ptype, skb->dev);
2718 }
2719 rcu_read_unlock();
2720}
2721
2722/** 2795/**
2723 * netdev_rx_handler_register - register receive handler 2796 * netdev_rx_handler_register - register receive handler
2724 * @dev: device to register a handler for 2797 * @dev: device to register a handler for
@@ -2828,8 +2901,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
2828 if (!netdev_tstamp_prequeue) 2901 if (!netdev_tstamp_prequeue)
2829 net_timestamp_check(skb); 2902 net_timestamp_check(skb);
2830 2903
2831 if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb)) 2904 trace_netif_receive_skb(skb);
2832 return NET_RX_SUCCESS;
2833 2905
2834 /* if we've gotten here through NAPI, check netpoll */ 2906 /* if we've gotten here through NAPI, check netpoll */
2835 if (netpoll_receive_skb(skb)) 2907 if (netpoll_receive_skb(skb))
@@ -2843,8 +2915,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
2843 * be delivered to pkt handlers that are exact matches. Also 2915 * be delivered to pkt handlers that are exact matches. Also
2844 * the deliver_no_wcard flag will be set. If packet handlers 2916 * the deliver_no_wcard flag will be set. If packet handlers
2845 * are sensitive to duplicate packets these skbs will need to 2917 * are sensitive to duplicate packets these skbs will need to
2846 * be dropped at the handler. The vlan accel path may have 2918 * be dropped at the handler.
2847 * already set the deliver_no_wcard flag.
2848 */ 2919 */
2849 null_or_orig = NULL; 2920 null_or_orig = NULL;
2850 orig_dev = skb->dev; 2921 orig_dev = skb->dev;
@@ -2903,6 +2974,18 @@ ncls:
2903 goto out; 2974 goto out;
2904 } 2975 }
2905 2976
2977 if (vlan_tx_tag_present(skb)) {
2978 if (pt_prev) {
2979 ret = deliver_skb(skb, pt_prev, orig_dev);
2980 pt_prev = NULL;
2981 }
2982 if (vlan_hwaccel_do_receive(&skb)) {
2983 ret = __netif_receive_skb(skb);
2984 goto out;
2985 } else if (unlikely(!skb))
2986 goto out;
2987 }
2988
2906 /* 2989 /*
2907 * Make sure frames received on VLAN interfaces stacked on 2990 * Make sure frames received on VLAN interfaces stacked on
2908 * bonding interfaces still make their way to any base bonding 2991 * bonding interfaces still make their way to any base bonding
@@ -2930,6 +3013,7 @@ ncls:
2930 if (pt_prev) { 3013 if (pt_prev) {
2931 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 3014 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2932 } else { 3015 } else {
3016 atomic_long_inc(&skb->dev->rx_dropped);
2933 kfree_skb(skb); 3017 kfree_skb(skb);
2934 /* Jamal, now you will not able to escape explaining 3018 /* Jamal, now you will not able to escape explaining
2935 * me how you were going to use this. :-) 3019 * me how you were going to use this. :-)
@@ -3050,7 +3134,7 @@ out:
3050 return netif_receive_skb(skb); 3134 return netif_receive_skb(skb);
3051} 3135}
3052 3136
3053static void napi_gro_flush(struct napi_struct *napi) 3137inline void napi_gro_flush(struct napi_struct *napi)
3054{ 3138{
3055 struct sk_buff *skb, *next; 3139 struct sk_buff *skb, *next;
3056 3140
@@ -3063,6 +3147,7 @@ static void napi_gro_flush(struct napi_struct *napi)
3063 napi->gro_count = 0; 3147 napi->gro_count = 0;
3064 napi->gro_list = NULL; 3148 napi->gro_list = NULL;
3065} 3149}
3150EXPORT_SYMBOL(napi_gro_flush);
3066 3151
3067enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 3152enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3068{ 3153{
@@ -3077,7 +3162,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3077 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) 3162 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3078 goto normal; 3163 goto normal;
3079 3164
3080 if (skb_is_gso(skb) || skb_has_frags(skb)) 3165 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3081 goto normal; 3166 goto normal;
3082 3167
3083 rcu_read_lock(); 3168 rcu_read_lock();
@@ -3143,7 +3228,7 @@ pull:
3143 put_page(skb_shinfo(skb)->frags[0].page); 3228 put_page(skb_shinfo(skb)->frags[0].page);
3144 memmove(skb_shinfo(skb)->frags, 3229 memmove(skb_shinfo(skb)->frags,
3145 skb_shinfo(skb)->frags + 1, 3230 skb_shinfo(skb)->frags + 1,
3146 --skb_shinfo(skb)->nr_frags); 3231 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3147 } 3232 }
3148 } 3233 }
3149 3234
@@ -3156,16 +3241,19 @@ normal:
3156} 3241}
3157EXPORT_SYMBOL(dev_gro_receive); 3242EXPORT_SYMBOL(dev_gro_receive);
3158 3243
3159static gro_result_t 3244static inline gro_result_t
3160__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 3245__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3161{ 3246{
3162 struct sk_buff *p; 3247 struct sk_buff *p;
3163 3248
3164 for (p = napi->gro_list; p; p = p->next) { 3249 for (p = napi->gro_list; p; p = p->next) {
3165 NAPI_GRO_CB(p)->same_flow = 3250 unsigned long diffs;
3166 (p->dev == skb->dev) && 3251
3167 !compare_ether_header(skb_mac_header(p), 3252 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3253 diffs |= p->vlan_tci ^ skb->vlan_tci;
3254 diffs |= compare_ether_header(skb_mac_header(p),
3168 skb_gro_mac_header(skb)); 3255 skb_gro_mac_header(skb));
3256 NAPI_GRO_CB(p)->same_flow = !diffs;
3169 NAPI_GRO_CB(p)->flush = 0; 3257 NAPI_GRO_CB(p)->flush = 0;
3170 } 3258 }
3171 3259
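The rewritten GRO loop above replaces a chain of comparisons with the accumulate-the-differences idiom: each field is XORed against its counterpart, the results are ORed into diffs, and same_flow is simply !diffs. A user-space illustration with field names chosen to echo the kernel code but not identical to it:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct flow_id {
        const void *dev;        /* receiving device, as in NAPI's p->dev */
        uint16_t vlan_tci;
        unsigned char mac[12];  /* destination + source MAC, 2 * ETH_ALEN */
};

static int same_flow(const struct flow_id *a, const struct flow_id *b)
{
        unsigned long diffs;

        diffs  = (unsigned long)a->dev ^ (unsigned long)b->dev;
        diffs |= a->vlan_tci ^ b->vlan_tci;
        diffs |= memcmp(a->mac, b->mac, sizeof(a->mac)) != 0;
        return !diffs;
}

int main(void)
{
        struct flow_id x = { (void *)0x1000, 5,
                             { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 } };
        struct flow_id y = x;

        printf("%d\n", same_flow(&x, &y));      /* 1: identical headers */
        y.vlan_tci = 6;
        printf("%d\n", same_flow(&x, &y));      /* 0: VLAN tag differs */
        return 0;
}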
@@ -3218,14 +3306,14 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3218} 3306}
3219EXPORT_SYMBOL(napi_gro_receive); 3307EXPORT_SYMBOL(napi_gro_receive);
3220 3308
3221void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) 3309static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3222{ 3310{
3223 __skb_pull(skb, skb_headlen(skb)); 3311 __skb_pull(skb, skb_headlen(skb));
3224 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); 3312 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3313 skb->vlan_tci = 0;
3225 3314
3226 napi->skb = skb; 3315 napi->skb = skb;
3227} 3316}
3228EXPORT_SYMBOL(napi_reuse_skb);
3229 3317
3230struct sk_buff *napi_get_frags(struct napi_struct *napi) 3318struct sk_buff *napi_get_frags(struct napi_struct *napi)
3231{ 3319{
@@ -4845,7 +4933,7 @@ static void rollback_registered_many(struct list_head *head)
4845 dev = list_first_entry(head, struct net_device, unreg_list); 4933 dev = list_first_entry(head, struct net_device, unreg_list);
4846 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); 4934 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4847 4935
4848 synchronize_net(); 4936 rcu_barrier();
4849 4937
4850 list_for_each_entry(dev, head, unreg_list) 4938 list_for_each_entry(dev, head, unreg_list)
4851 dev_put(dev); 4939 dev_put(dev);
@@ -4859,21 +4947,6 @@ static void rollback_registered(struct net_device *dev)
4859 rollback_registered_many(&single); 4947 rollback_registered_many(&single);
4860} 4948}
4861 4949
4862static void __netdev_init_queue_locks_one(struct net_device *dev,
4863 struct netdev_queue *dev_queue,
4864 void *_unused)
4865{
4866 spin_lock_init(&dev_queue->_xmit_lock);
4867 netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4868 dev_queue->xmit_lock_owner = -1;
4869}
4870
4871static void netdev_init_queue_locks(struct net_device *dev)
4872{
4873 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4874 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4875}
4876
4877unsigned long netdev_fix_features(unsigned long features, const char *name) 4950unsigned long netdev_fix_features(unsigned long features, const char *name)
4878{ 4951{
4879 /* Fix illegal SG+CSUM combinations. */ 4952 /* Fix illegal SG+CSUM combinations. */
@@ -4941,6 +5014,66 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
4941} 5014}
4942EXPORT_SYMBOL(netif_stacked_transfer_operstate); 5015EXPORT_SYMBOL(netif_stacked_transfer_operstate);
4943 5016
5017static int netif_alloc_rx_queues(struct net_device *dev)
5018{
5019#ifdef CONFIG_RPS
5020 unsigned int i, count = dev->num_rx_queues;
5021 struct netdev_rx_queue *rx;
5022
5023 BUG_ON(count < 1);
5024
5025 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5026 if (!rx) {
5027 pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5028 return -ENOMEM;
5029 }
5030 dev->_rx = rx;
5031
5032 /*
5033 * Set a pointer to first element in the array which holds the
5034 * reference count.
5035 */
5036 for (i = 0; i < count; i++)
5037 rx[i].first = rx;
5038#endif
5039 return 0;
5040}
5041
5042static int netif_alloc_netdev_queues(struct net_device *dev)
5043{
5044 unsigned int count = dev->num_tx_queues;
5045 struct netdev_queue *tx;
5046
5047 BUG_ON(count < 1);
5048
5049 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5050 if (!tx) {
5051 pr_err("netdev: Unable to allocate %u tx queues.\n",
5052 count);
5053 return -ENOMEM;
5054 }
5055 dev->_tx = tx;
5056 return 0;
5057}
5058
5059static void netdev_init_one_queue(struct net_device *dev,
5060 struct netdev_queue *queue,
5061 void *_unused)
5062{
5063 queue->dev = dev;
5064
5065 /* Initialize queue lock */
5066 spin_lock_init(&queue->_xmit_lock);
5067 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5068 queue->xmit_lock_owner = -1;
5069}
5070
5071static void netdev_init_queues(struct net_device *dev)
5072{
5073 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5074 spin_lock_init(&dev->tx_global_lock);
5075}
5076
4944/** 5077/**
4945 * register_netdevice - register a network device 5078 * register_netdevice - register a network device
4946 * @dev: device to register 5079 * @dev: device to register
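netif_alloc_rx_queues() above keeps the layout in which every element of the queue array points back at element 0, where the shared reference count lives, so any queue can reach the shared state without knowing its own index. A stand-alone sketch of that layout (the types and the count field are illustrative, not the kernel's):

#include <stdio.h>
#include <stdlib.h>

struct rx_queue {
        struct rx_queue *first; /* always points at queue[0] */
        int count;              /* meaningful only in queue[0] */
};

static struct rx_queue *alloc_queues(unsigned int n)
{
        struct rx_queue *rx = calloc(n, sizeof(*rx));
        unsigned int i;

        if (!rx)
                return NULL;
        for (i = 0; i < n; i++)
                rx[i].first = rx;
        rx[0].count = 1;
        return rx;
}

int main(void)
{
        struct rx_queue *rx = alloc_queues(8);

        if (!rx)
                return 1;
        /* any queue reaches the shared counter through ->first */
        rx[5].first->count++;
        printf("count = %d\n", rx[0].count);    /* 2 */
        free(rx);
        return 0;
}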
@@ -4974,28 +5107,19 @@ int register_netdevice(struct net_device *dev)
4974 5107
4975 spin_lock_init(&dev->addr_list_lock); 5108 spin_lock_init(&dev->addr_list_lock);
4976 netdev_set_addr_lockdep_class(dev); 5109 netdev_set_addr_lockdep_class(dev);
4977 netdev_init_queue_locks(dev);
4978 5110
4979 dev->iflink = -1; 5111 dev->iflink = -1;
4980 5112
4981#ifdef CONFIG_RPS 5113 ret = netif_alloc_rx_queues(dev);
4982 if (!dev->num_rx_queues) { 5114 if (ret)
4983 /* 5115 goto out;
4984 * Allocate a single RX queue if driver never called
4985 * alloc_netdev_mq
4986 */
4987 5116
4988 dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL); 5117 ret = netif_alloc_netdev_queues(dev);
4989 if (!dev->_rx) { 5118 if (ret)
4990 ret = -ENOMEM; 5119 goto out;
4991 goto out; 5120
4992 } 5121 netdev_init_queues(dev);
4993 5122
4994 dev->_rx->first = dev->_rx;
4995 atomic_set(&dev->_rx->count, 1);
4996 dev->num_rx_queues = 1;
4997 }
4998#endif
4999 /* Init, if this function is available */ 5123 /* Init, if this function is available */
5000 if (dev->netdev_ops->ndo_init) { 5124 if (dev->netdev_ops->ndo_init) {
5001 ret = dev->netdev_ops->ndo_init(dev); 5125 ret = dev->netdev_ops->ndo_init(dev);
@@ -5035,6 +5159,12 @@ int register_netdevice(struct net_device *dev)
5035 if (dev->features & NETIF_F_SG) 5159 if (dev->features & NETIF_F_SG)
5036 dev->features |= NETIF_F_GSO; 5160 dev->features |= NETIF_F_GSO;
5037 5161
5162 /* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5163 * vlan_dev_init() will do the dev->features check, so these features
5164 * are enabled only if supported by underlying device.
5165 */
5166 dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5167
5038 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); 5168 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5039 ret = notifier_to_errno(ret); 5169 ret = notifier_to_errno(ret);
5040 if (ret) 5170 if (ret)
@@ -5105,9 +5235,6 @@ int init_dummy_netdev(struct net_device *dev)
5105 */ 5235 */
5106 dev->reg_state = NETREG_DUMMY; 5236 dev->reg_state = NETREG_DUMMY;
5107 5237
5108 /* initialize the ref count */
5109 atomic_set(&dev->refcnt, 1);
5110
5111 /* NAPI wants this */ 5238 /* NAPI wants this */
5112 INIT_LIST_HEAD(&dev->napi_list); 5239 INIT_LIST_HEAD(&dev->napi_list);
5113 5240
@@ -5115,6 +5242,11 @@ int init_dummy_netdev(struct net_device *dev)
5115 set_bit(__LINK_STATE_PRESENT, &dev->state); 5242 set_bit(__LINK_STATE_PRESENT, &dev->state);
5116 set_bit(__LINK_STATE_START, &dev->state); 5243 set_bit(__LINK_STATE_START, &dev->state);
5117 5244
5245 /* Note : We dont allocate pcpu_refcnt for dummy devices,
5246 * because users of this 'device' dont need to change
5247 * its refcount.
5248 */
5249
5118 return 0; 5250 return 0;
5119} 5251}
5120EXPORT_SYMBOL_GPL(init_dummy_netdev); 5252EXPORT_SYMBOL_GPL(init_dummy_netdev);
@@ -5156,6 +5288,16 @@ out:
5156} 5288}
5157EXPORT_SYMBOL(register_netdev); 5289EXPORT_SYMBOL(register_netdev);
5158 5290
5291int netdev_refcnt_read(const struct net_device *dev)
5292{
5293 int i, refcnt = 0;
5294
5295 for_each_possible_cpu(i)
5296 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5297 return refcnt;
5298}
5299EXPORT_SYMBOL(netdev_refcnt_read);
5300
5159/* 5301/*
5160 * netdev_wait_allrefs - wait until all references are gone. 5302 * netdev_wait_allrefs - wait until all references are gone.
5161 * 5303 *
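netdev_refcnt_read() above replaces one contended atomic with per-CPU counters that are only summed when somebody needs the total. A user-space analogue with one counter slot per thread (pthreads stand in for CPUs; this is a sketch, not the kernel's percpu API):

#include <pthread.h>
#include <stdio.h>

#define NTHREADS 4

static long refcnt[NTHREADS];   /* one slot per "cpu" */

static void *worker(void *arg)
{
        long id = (long)arg;

        for (int i = 0; i < 100000; i++)
                refcnt[id]++;   /* private slot: no atomic, no contention */
        return NULL;
}

static long refcnt_read(void)
{
        long sum = 0;

        for (int i = 0; i < NTHREADS; i++)
                sum += refcnt[i];  /* a concurrent reader may see a slightly stale total */
        return sum;
}

int main(void)
{
        pthread_t t[NTHREADS];

        for (long i = 0; i < NTHREADS; i++)
                pthread_create(&t[i], NULL, worker, (void *)i);
        for (int i = 0; i < NTHREADS; i++)
                pthread_join(t[i], NULL);
        printf("total refs: %ld\n", refcnt_read());     /* 400000 */
        return 0;
}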
@@ -5170,11 +5312,14 @@ EXPORT_SYMBOL(register_netdev);
5170static void netdev_wait_allrefs(struct net_device *dev) 5312static void netdev_wait_allrefs(struct net_device *dev)
5171{ 5313{
5172 unsigned long rebroadcast_time, warning_time; 5314 unsigned long rebroadcast_time, warning_time;
5315 int refcnt;
5173 5316
5174 linkwatch_forget_dev(dev); 5317 linkwatch_forget_dev(dev);
5175 5318
5176 rebroadcast_time = warning_time = jiffies; 5319 rebroadcast_time = warning_time = jiffies;
5177 while (atomic_read(&dev->refcnt) != 0) { 5320 refcnt = netdev_refcnt_read(dev);
5321
5322 while (refcnt != 0) {
5178 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { 5323 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5179 rtnl_lock(); 5324 rtnl_lock();
5180 5325
@@ -5201,11 +5346,13 @@ static void netdev_wait_allrefs(struct net_device *dev)
5201 5346
5202 msleep(250); 5347 msleep(250);
5203 5348
5349 refcnt = netdev_refcnt_read(dev);
5350
5204 if (time_after(jiffies, warning_time + 10 * HZ)) { 5351 if (time_after(jiffies, warning_time + 10 * HZ)) {
5205 printk(KERN_EMERG "unregister_netdevice: " 5352 printk(KERN_EMERG "unregister_netdevice: "
5206 "waiting for %s to become free. Usage " 5353 "waiting for %s to become free. Usage "
5207 "count = %d\n", 5354 "count = %d\n",
5208 dev->name, atomic_read(&dev->refcnt)); 5355 dev->name, refcnt);
5209 warning_time = jiffies; 5356 warning_time = jiffies;
5210 } 5357 }
5211 } 5358 }
@@ -5263,9 +5410,9 @@ void netdev_run_todo(void)
5263 netdev_wait_allrefs(dev); 5410 netdev_wait_allrefs(dev);
5264 5411
5265 /* paranoia */ 5412 /* paranoia */
5266 BUG_ON(atomic_read(&dev->refcnt)); 5413 BUG_ON(netdev_refcnt_read(dev));
5267 WARN_ON(dev->ip_ptr); 5414 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5268 WARN_ON(dev->ip6_ptr); 5415 WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5269 WARN_ON(dev->dn_ptr); 5416 WARN_ON(dev->dn_ptr);
5270 5417
5271 if (dev->destructor) 5418 if (dev->destructor)
@@ -5342,30 +5489,34 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5342 5489
5343 if (ops->ndo_get_stats64) { 5490 if (ops->ndo_get_stats64) {
5344 memset(storage, 0, sizeof(*storage)); 5491 memset(storage, 0, sizeof(*storage));
5345 return ops->ndo_get_stats64(dev, storage); 5492 ops->ndo_get_stats64(dev, storage);
5346 } 5493 } else if (ops->ndo_get_stats) {
5347 if (ops->ndo_get_stats) {
5348 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); 5494 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5349 return storage; 5495 } else {
5496 netdev_stats_to_stats64(storage, &dev->stats);
5497 dev_txq_stats_fold(dev, storage);
5350 } 5498 }
5351 netdev_stats_to_stats64(storage, &dev->stats); 5499 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5352 dev_txq_stats_fold(dev, storage);
5353 return storage; 5500 return storage;
5354} 5501}
5355EXPORT_SYMBOL(dev_get_stats); 5502EXPORT_SYMBOL(dev_get_stats);
5356 5503
5357static void netdev_init_one_queue(struct net_device *dev, 5504struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5358 struct netdev_queue *queue,
5359 void *_unused)
5360{ 5505{
5361 queue->dev = dev; 5506 struct netdev_queue *queue = dev_ingress_queue(dev);
5362}
5363 5507
5364static void netdev_init_queues(struct net_device *dev) 5508#ifdef CONFIG_NET_CLS_ACT
5365{ 5509 if (queue)
5366 netdev_init_one_queue(dev, &dev->rx_queue, NULL); 5510 return queue;
5367 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); 5511 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5368 spin_lock_init(&dev->tx_global_lock); 5512 if (!queue)
5513 return NULL;
5514 netdev_init_one_queue(dev, queue, NULL);
5515 queue->qdisc = &noop_qdisc;
5516 queue->qdisc_sleeping = &noop_qdisc;
5517 rcu_assign_pointer(dev->ingress_queue, queue);
5518#endif
5519 return queue;
5369} 5520}
5370 5521
5371/** 5522/**
@@ -5382,17 +5533,18 @@ static void netdev_init_queues(struct net_device *dev)
5382struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, 5533struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5383 void (*setup)(struct net_device *), unsigned int queue_count) 5534 void (*setup)(struct net_device *), unsigned int queue_count)
5384{ 5535{
5385 struct netdev_queue *tx;
5386 struct net_device *dev; 5536 struct net_device *dev;
5387 size_t alloc_size; 5537 size_t alloc_size;
5388 struct net_device *p; 5538 struct net_device *p;
5389#ifdef CONFIG_RPS
5390 struct netdev_rx_queue *rx;
5391 int i;
5392#endif
5393 5539
5394 BUG_ON(strlen(name) >= sizeof(dev->name)); 5540 BUG_ON(strlen(name) >= sizeof(dev->name));
5395 5541
5542 if (queue_count < 1) {
5543 pr_err("alloc_netdev: Unable to allocate device "
5544 "with zero queues.\n");
5545 return NULL;
5546 }
5547
5396 alloc_size = sizeof(struct net_device); 5548 alloc_size = sizeof(struct net_device);
5397 if (sizeof_priv) { 5549 if (sizeof_priv) {
5398 /* ensure 32-byte alignment of private area */ 5550 /* ensure 32-byte alignment of private area */
@@ -5408,55 +5560,31 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5408 return NULL; 5560 return NULL;
5409 } 5561 }
5410 5562
5411 tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5412 if (!tx) {
5413 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5414 "tx qdiscs.\n");
5415 goto free_p;
5416 }
5417
5418#ifdef CONFIG_RPS
5419 rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5420 if (!rx) {
5421 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5422 "rx queues.\n");
5423 goto free_tx;
5424 }
5425
5426 atomic_set(&rx->count, queue_count);
5427
5428 /*
5429 * Set a pointer to first element in the array which holds the
5430 * reference count.
5431 */
5432 for (i = 0; i < queue_count; i++)
5433 rx[i].first = rx;
5434#endif
5435
5436 dev = PTR_ALIGN(p, NETDEV_ALIGN); 5563 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5437 dev->padded = (char *)dev - (char *)p; 5564 dev->padded = (char *)dev - (char *)p;
5438 5565
5566 dev->pcpu_refcnt = alloc_percpu(int);
5567 if (!dev->pcpu_refcnt)
5568 goto free_p;
5569
5439 if (dev_addr_init(dev)) 5570 if (dev_addr_init(dev))
5440 goto free_rx; 5571 goto free_pcpu;
5441 5572
5442 dev_mc_init(dev); 5573 dev_mc_init(dev);
5443 dev_uc_init(dev); 5574 dev_uc_init(dev);
5444 5575
5445 dev_net_set(dev, &init_net); 5576 dev_net_set(dev, &init_net);
5446 5577
5447 dev->_tx = tx;
5448 dev->num_tx_queues = queue_count; 5578 dev->num_tx_queues = queue_count;
5449 dev->real_num_tx_queues = queue_count; 5579 dev->real_num_tx_queues = queue_count;
5450 5580
5451#ifdef CONFIG_RPS 5581#ifdef CONFIG_RPS
5452 dev->_rx = rx;
5453 dev->num_rx_queues = queue_count; 5582 dev->num_rx_queues = queue_count;
5583 dev->real_num_rx_queues = queue_count;
5454#endif 5584#endif
5455 5585
5456 dev->gso_max_size = GSO_MAX_SIZE; 5586 dev->gso_max_size = GSO_MAX_SIZE;
5457 5587
5458 netdev_init_queues(dev);
5459
5460 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list); 5588 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5461 dev->ethtool_ntuple_list.count = 0; 5589 dev->ethtool_ntuple_list.count = 0;
5462 INIT_LIST_HEAD(&dev->napi_list); 5590 INIT_LIST_HEAD(&dev->napi_list);
@@ -5467,12 +5595,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5467 strcpy(dev->name, name); 5595 strcpy(dev->name, name);
5468 return dev; 5596 return dev;
5469 5597
5470free_rx: 5598free_pcpu:
5471#ifdef CONFIG_RPS 5599 free_percpu(dev->pcpu_refcnt);
5472 kfree(rx);
5473free_tx:
5474#endif
5475 kfree(tx);
5476free_p: 5600free_p:
5477 kfree(p); 5601 kfree(p);
5478 return NULL; 5602 return NULL;
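The reworked error path above follows the usual kernel goto-unwind pattern: each successful allocation gains a label that releases it, and a later failure jumps to the label matching the last step that succeeded, so cleanups run once, in reverse order. A minimal stand-alone version with plain malloc/free (names are illustrative):

#include <stdlib.h>

struct device {
        int *refcnt;
        char *name;
};

static struct device *device_alloc(void)
{
        struct device *dev = malloc(sizeof(*dev));

        if (!dev)
                return NULL;

        dev->refcnt = calloc(1, sizeof(*dev->refcnt));
        if (!dev->refcnt)
                goto free_dev;

        dev->name = malloc(16);
        if (!dev->name)
                goto free_refcnt;

        return dev;

free_refcnt:
        free(dev->refcnt);
free_dev:
        free(dev);
        return NULL;
}

int main(void)
{
        struct device *dev = device_alloc();

        if (dev) {
                free(dev->name);
                free(dev->refcnt);
                free(dev);
        }
        return 0;
}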
@@ -5495,6 +5619,8 @@ void free_netdev(struct net_device *dev)
5495 5619
5496 kfree(dev->_tx); 5620 kfree(dev->_tx);
5497 5621
5622 kfree(rcu_dereference_raw(dev->ingress_queue));
5623
5498 /* Flush device addresses */ 5624 /* Flush device addresses */
5499 dev_addr_flush(dev); 5625 dev_addr_flush(dev);
5500 5626
@@ -5504,6 +5630,9 @@ void free_netdev(struct net_device *dev)
5504 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) 5630 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5505 netif_napi_del(p); 5631 netif_napi_del(p);
5506 5632
5633 free_percpu(dev->pcpu_refcnt);
5634 dev->pcpu_refcnt = NULL;
5635
5507 /* Compatibility with error handling in drivers */ 5636 /* Compatibility with error handling in drivers */
5508 if (dev->reg_state == NETREG_UNINITIALIZED) { 5637 if (dev->reg_state == NETREG_UNINITIALIZED) {
5509 kfree((char *)dev - dev->padded); 5638 kfree((char *)dev - dev->padded);
@@ -5658,6 +5787,10 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
5658 5787
5659 /* Notify protocols, that we are about to destroy 5788 /* Notify protocols, that we are about to destroy
5660 this device. They should clean all the things. 5789 this device. They should clean all the things.
5790
5791 Note that dev->reg_state stays at NETREG_REGISTERED.
5792 This is wanted because this way 8021q and macvlan know
5793 the device is just moving and can keep their slaves up.
5661 */ 5794 */
5662 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 5795 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5663 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); 5796 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
diff --git a/net/core/dst.c b/net/core/dst.c
index 6c41b1fac3db..8abe628b79f1 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -168,7 +168,7 @@ void *dst_alloc(struct dst_ops *ops)
168{ 168{
169 struct dst_entry *dst; 169 struct dst_entry *dst;
170 170
171 if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) { 171 if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
172 if (ops->gc(ops)) 172 if (ops->gc(ops))
173 return NULL; 173 return NULL;
174 } 174 }
@@ -183,7 +183,7 @@ void *dst_alloc(struct dst_ops *ops)
183#if RT_CACHE_DEBUG >= 2 183#if RT_CACHE_DEBUG >= 2
184 atomic_inc(&dst_total); 184 atomic_inc(&dst_total);
185#endif 185#endif
186 atomic_inc(&ops->entries); 186 dst_entries_add(ops, 1);
187 return dst; 187 return dst;
188} 188}
189EXPORT_SYMBOL(dst_alloc); 189EXPORT_SYMBOL(dst_alloc);
@@ -228,15 +228,15 @@ again:
228 child = dst->child; 228 child = dst->child;
229 229
230 dst->hh = NULL; 230 dst->hh = NULL;
231 if (hh && atomic_dec_and_test(&hh->hh_refcnt)) 231 if (hh)
232 kfree(hh); 232 hh_cache_put(hh);
233 233
234 if (neigh) { 234 if (neigh) {
235 dst->neighbour = NULL; 235 dst->neighbour = NULL;
236 neigh_release(neigh); 236 neigh_release(neigh);
237 } 237 }
238 238
239 atomic_dec(&dst->ops->entries); 239 dst_entries_add(dst->ops, -1);
240 240
241 if (dst->ops->destroy) 241 if (dst->ops->destroy)
242 dst->ops->destroy(dst); 242 dst->ops->destroy(dst);
@@ -271,13 +271,40 @@ void dst_release(struct dst_entry *dst)
271 if (dst) { 271 if (dst) {
272 int newrefcnt; 272 int newrefcnt;
273 273
274 smp_mb__before_atomic_dec();
275 newrefcnt = atomic_dec_return(&dst->__refcnt); 274 newrefcnt = atomic_dec_return(&dst->__refcnt);
276 WARN_ON(newrefcnt < 0); 275 WARN_ON(newrefcnt < 0);
276 if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) {
277 dst = dst_destroy(dst);
278 if (dst)
279 __dst_free(dst);
280 }
277 } 281 }
278} 282}
279EXPORT_SYMBOL(dst_release); 283EXPORT_SYMBOL(dst_release);
280 284
285/**
286 * skb_dst_set_noref - sets skb dst, without a reference
287 * @skb: buffer
288 * @dst: dst entry
289 *
290 * Sets skb dst, assuming a reference was not taken on dst
291 * skb_dst_drop() should not dst_release() this dst
292 */
293void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
294{
295 WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
296 /* If dst not in cache, we must take a reference, because
297 * dst_release() will destroy dst as soon as its refcount becomes zero
298 */
299 if (unlikely(dst->flags & DST_NOCACHE)) {
300 dst_hold(dst);
301 skb_dst_set(skb, dst);
302 } else {
303 skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
304 }
305}
306EXPORT_SYMBOL(skb_dst_set_noref);
307
281/* Dirty hack. We did it in 2.2 (in __dst_free), 308/* Dirty hack. We did it in 2.2 (in __dst_free),
282 * we have _very_ good reasons not to repeat 309 * we have _very_ good reasons not to repeat
283 * this mistake in 2.3, but we have no choice 310 * this mistake in 2.3, but we have no choice
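skb_dst_set_noref() above relies on dst entries being at least word aligned, so the low bit of the stored pointer is free to record that no reference was taken (SKB_DST_NOREF); the drop path then knows whether a release is owed. A user-space sketch of that tagged-pointer idea, with illustrative names rather than the kernel's:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define DST_NOREF_BIT 1UL

struct dst { int refcnt; };

static uintptr_t make_ref(struct dst *d, int take_ref)
{
        if (take_ref)
                d->refcnt++;
        return (uintptr_t)d | (take_ref ? 0 : DST_NOREF_BIT);
}

static struct dst *ref_ptr(uintptr_t refdst)
{
        return (struct dst *)(refdst & ~DST_NOREF_BIT);
}

static void drop_ref(uintptr_t refdst)
{
        if (!(refdst & DST_NOREF_BIT))
                ref_ptr(refdst)->refcnt--;      /* only release what was taken */
}

int main(void)
{
        struct dst *d = calloc(1, sizeof(*d));
        uintptr_t a = make_ref(d, 1);   /* counted reference */
        uintptr_t b = make_ref(d, 0);   /* borrowed, RCU-style reference */

        printf("refcnt = %d\n", d->refcnt);     /* 1 */
        drop_ref(b);
        drop_ref(a);
        printf("refcnt = %d\n", d->refcnt);     /* 0 */
        free(d);
        return 0;
}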
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 7a85367b3c2f..956a9f4971cb 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -19,6 +19,7 @@
19#include <linux/netdevice.h> 19#include <linux/netdevice.h>
20#include <linux/bitops.h> 20#include <linux/bitops.h>
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/vmalloc.h>
22#include <linux/slab.h> 23#include <linux/slab.h>
23 24
24/* 25/*
@@ -131,7 +132,8 @@ EXPORT_SYMBOL(ethtool_op_set_ufo);
131 * NETIF_F_xxx values in include/linux/netdevice.h 132 * NETIF_F_xxx values in include/linux/netdevice.h
132 */ 133 */
133static const u32 flags_dup_features = 134static const u32 flags_dup_features =
134 (ETH_FLAG_LRO | ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH); 135 (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | ETH_FLAG_NTUPLE |
136 ETH_FLAG_RXHASH);
135 137
136u32 ethtool_op_get_flags(struct net_device *dev) 138u32 ethtool_op_get_flags(struct net_device *dev)
137{ 139{
@@ -205,18 +207,24 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
205 struct ethtool_drvinfo info; 207 struct ethtool_drvinfo info;
206 const struct ethtool_ops *ops = dev->ethtool_ops; 208 const struct ethtool_ops *ops = dev->ethtool_ops;
207 209
208 if (!ops->get_drvinfo)
209 return -EOPNOTSUPP;
210
211 memset(&info, 0, sizeof(info)); 210 memset(&info, 0, sizeof(info));
212 info.cmd = ETHTOOL_GDRVINFO; 211 info.cmd = ETHTOOL_GDRVINFO;
213 ops->get_drvinfo(dev, &info); 212 if (ops && ops->get_drvinfo) {
213 ops->get_drvinfo(dev, &info);
214 } else if (dev->dev.parent && dev->dev.parent->driver) {
215 strlcpy(info.bus_info, dev_name(dev->dev.parent),
216 sizeof(info.bus_info));
217 strlcpy(info.driver, dev->dev.parent->driver->name,
218 sizeof(info.driver));
219 } else {
220 return -EOPNOTSUPP;
221 }
214 222
215 /* 223 /*
216 * this method of obtaining string set info is deprecated; 224 * this method of obtaining string set info is deprecated;
217 * Use ETHTOOL_GSSET_INFO instead. 225 * Use ETHTOOL_GSSET_INFO instead.
218 */ 226 */
219 if (ops->get_sset_count) { 227 if (ops && ops->get_sset_count) {
220 int rc; 228 int rc;
221 229
222 rc = ops->get_sset_count(dev, ETH_SS_TEST); 230 rc = ops->get_sset_count(dev, ETH_SS_TEST);
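The change above lets ETHTOOL_GDRVINFO succeed even when the driver provides no get_drvinfo() method, by falling back to generic information from the device model (bus address and driver name). A reduced model of that fallback chain, with made-up structures standing in for net_device and ethtool_ops:

#include <stdio.h>
#include <string.h>

struct drvinfo { char driver[32]; char bus_info[32]; };

struct dev_ops { void (*get_drvinfo)(struct drvinfo *info); };

struct device {
        const struct dev_ops *ops;
        const char *parent_name;        /* e.g. a PCI address */
        const char *driver_name;        /* e.g. "e1000e" */
};

static int get_drvinfo(const struct device *dev, struct drvinfo *info)
{
        memset(info, 0, sizeof(*info));
        if (dev->ops && dev->ops->get_drvinfo) {
                dev->ops->get_drvinfo(info);
        } else if (dev->parent_name && dev->driver_name) {
                snprintf(info->bus_info, sizeof(info->bus_info), "%s", dev->parent_name);
                snprintf(info->driver, sizeof(info->driver), "%s", dev->driver_name);
        } else {
                return -1;      /* -EOPNOTSUPP in the kernel */
        }
        return 0;
}

int main(void)
{
        struct device dev = { NULL, "0000:00:19.0", "e1000e" };
        struct drvinfo info;

        if (!get_drvinfo(&dev, &info))
                printf("%s @ %s\n", info.driver, info.bus_info);
        return 0;
}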
@@ -229,9 +237,9 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
229 if (rc >= 0) 237 if (rc >= 0)
230 info.n_priv_flags = rc; 238 info.n_priv_flags = rc;
231 } 239 }
232 if (ops->get_regs_len) 240 if (ops && ops->get_regs_len)
233 info.regdump_len = ops->get_regs_len(dev); 241 info.regdump_len = ops->get_regs_len(dev);
234 if (ops->get_eeprom_len) 242 if (ops && ops->get_eeprom_len)
235 info.eedump_len = ops->get_eeprom_len(dev); 243 info.eedump_len = ops->get_eeprom_len(dev);
236 244
237 if (copy_to_user(useraddr, &info, sizeof(info))) 245 if (copy_to_user(useraddr, &info, sizeof(info)))
@@ -348,7 +356,7 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
348 if (info.cmd == ETHTOOL_GRXCLSRLALL) { 356 if (info.cmd == ETHTOOL_GRXCLSRLALL) {
349 if (info.rule_cnt > 0) { 357 if (info.rule_cnt > 0) {
350 if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) 358 if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32))
351 rule_buf = kmalloc(info.rule_cnt * sizeof(u32), 359 rule_buf = kzalloc(info.rule_cnt * sizeof(u32),
352 GFP_USER); 360 GFP_USER);
353 if (!rule_buf) 361 if (!rule_buf)
354 return -ENOMEM; 362 return -ENOMEM;
@@ -397,7 +405,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
397 (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index)) 405 (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index))
398 return -ENOMEM; 406 return -ENOMEM;
399 full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size; 407 full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size;
400 indir = kmalloc(full_size, GFP_USER); 408 indir = kzalloc(full_size, GFP_USER);
401 if (!indir) 409 if (!indir)
402 return -ENOMEM; 410 return -ENOMEM;
403 411
@@ -479,6 +487,38 @@ static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list,
479 list->count++; 487 list->count++;
480} 488}
481 489
490/*
491 * ethtool does not (or did not) set masks for flow parameters that are
492 * not specified, so if both value and mask are 0 then this must be
493 * treated as equivalent to a mask with all bits set. Implement that
494 * here rather than in drivers.
495 */
496static void rx_ntuple_fix_masks(struct ethtool_rx_ntuple_flow_spec *fs)
497{
498 struct ethtool_tcpip4_spec *entry = &fs->h_u.tcp_ip4_spec;
499 struct ethtool_tcpip4_spec *mask = &fs->m_u.tcp_ip4_spec;
500
501 if (fs->flow_type != TCP_V4_FLOW &&
502 fs->flow_type != UDP_V4_FLOW &&
503 fs->flow_type != SCTP_V4_FLOW)
504 return;
505
506 if (!(entry->ip4src | mask->ip4src))
507 mask->ip4src = htonl(0xffffffff);
508 if (!(entry->ip4dst | mask->ip4dst))
509 mask->ip4dst = htonl(0xffffffff);
510 if (!(entry->psrc | mask->psrc))
511 mask->psrc = htons(0xffff);
512 if (!(entry->pdst | mask->pdst))
513 mask->pdst = htons(0xffff);
514 if (!(entry->tos | mask->tos))
515 mask->tos = 0xff;
516 if (!(fs->vlan_tag | fs->vlan_tag_mask))
517 fs->vlan_tag_mask = 0xffff;
518 if (!(fs->data | fs->data_mask))
519 fs->data_mask = 0xffffffffffffffffULL;
520}
521
482static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev, 522static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev,
483 void __user *useraddr) 523 void __user *useraddr)
484{ 524{
@@ -493,6 +533,8 @@ static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev,
493 if (copy_from_user(&cmd, useraddr, sizeof(cmd))) 533 if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
494 return -EFAULT; 534 return -EFAULT;
495 535
536 rx_ntuple_fix_masks(&cmd.fs);
537
496 /* 538 /*
497 * Cache filter in dev struct for GET operation only if 539 * Cache filter in dev struct for GET operation only if
498 * the underlying driver doesn't have its own GET operation, and 540 * the underlying driver doesn't have its own GET operation, and
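rx_ntuple_fix_masks() above canonicalizes filters from user space: a field whose value and mask are both zero was never specified, so its mask is rewritten to all ones before the driver sees it. A two-field model of the same fix-up:

#include <stdint.h>
#include <stdio.h>

struct match { uint32_t ip4src, ip4src_mask; uint16_t psrc, psrc_mask; };

static void fix_masks(struct match *m)
{
        if (!(m->ip4src | m->ip4src_mask))
                m->ip4src_mask = 0xffffffffu;
        if (!(m->psrc | m->psrc_mask))
                m->psrc_mask = 0xffffu;
}

int main(void)
{
        struct match m = { 0, 0, 80, 0 };  /* src port given, src IP left unspecified */

        fix_masks(&m);
        printf("ip4src mask %08x, psrc mask %04x\n", m.ip4src_mask, m.psrc_mask);
        /* ip4src mask ffffffff, psrc mask 0000 */
        return 0;
}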
@@ -538,7 +580,7 @@ static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr)
538 580
539 gstrings.len = ret; 581 gstrings.len = ret;
540 582
541 data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); 583 data = kzalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
542 if (!data) 584 if (!data)
543 return -ENOMEM; 585 return -ENOMEM;
544 586
@@ -667,19 +709,19 @@ static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr)
667 break; 709 break;
668 case IP_USER_FLOW: 710 case IP_USER_FLOW:
669 sprintf(p, "\tSrc IP addr: 0x%x\n", 711 sprintf(p, "\tSrc IP addr: 0x%x\n",
670 fsc->fs.h_u.raw_ip4_spec.ip4src); 712 fsc->fs.h_u.usr_ip4_spec.ip4src);
671 p += ETH_GSTRING_LEN; 713 p += ETH_GSTRING_LEN;
672 num_strings++; 714 num_strings++;
673 sprintf(p, "\tSrc IP mask: 0x%x\n", 715 sprintf(p, "\tSrc IP mask: 0x%x\n",
674 fsc->fs.m_u.raw_ip4_spec.ip4src); 716 fsc->fs.m_u.usr_ip4_spec.ip4src);
675 p += ETH_GSTRING_LEN; 717 p += ETH_GSTRING_LEN;
676 num_strings++; 718 num_strings++;
677 sprintf(p, "\tDest IP addr: 0x%x\n", 719 sprintf(p, "\tDest IP addr: 0x%x\n",
678 fsc->fs.h_u.raw_ip4_spec.ip4dst); 720 fsc->fs.h_u.usr_ip4_spec.ip4dst);
679 p += ETH_GSTRING_LEN; 721 p += ETH_GSTRING_LEN;
680 num_strings++; 722 num_strings++;
681 sprintf(p, "\tDest IP mask: 0x%x\n", 723 sprintf(p, "\tDest IP mask: 0x%x\n",
682 fsc->fs.m_u.raw_ip4_spec.ip4dst); 724 fsc->fs.m_u.usr_ip4_spec.ip4dst);
683 p += ETH_GSTRING_LEN; 725 p += ETH_GSTRING_LEN;
684 num_strings++; 726 num_strings++;
685 break; 727 break;
@@ -775,7 +817,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
775 if (regs.len > reglen) 817 if (regs.len > reglen)
776 regs.len = reglen; 818 regs.len = reglen;
777 819
778 regbuf = kmalloc(reglen, GFP_USER); 820 regbuf = vmalloc(reglen);
779 if (!regbuf) 821 if (!regbuf)
780 return -ENOMEM; 822 return -ENOMEM;
781 823
@@ -790,7 +832,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
790 ret = 0; 832 ret = 0;
791 833
792 out: 834 out:
793 kfree(regbuf); 835 vfree(regbuf);
794 return ret; 836 return ret;
795} 837}
796 838
@@ -1175,8 +1217,11 @@ static int ethtool_set_gro(struct net_device *dev, char __user *useraddr)
1175 return -EFAULT; 1217 return -EFAULT;
1176 1218
1177 if (edata.data) { 1219 if (edata.data) {
1178 if (!dev->ethtool_ops->get_rx_csum || 1220 u32 rxcsum = dev->ethtool_ops->get_rx_csum ?
1179 !dev->ethtool_ops->get_rx_csum(dev)) 1221 dev->ethtool_ops->get_rx_csum(dev) :
1222 ethtool_op_get_rx_csum(dev);
1223
1224 if (!rxcsum)
1180 return -EINVAL; 1225 return -EINVAL;
1181 dev->features |= NETIF_F_GRO; 1226 dev->features |= NETIF_F_GRO;
1182 } else 1227 } else
@@ -1402,14 +1447,22 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
1402 if (!dev || !netif_device_present(dev)) 1447 if (!dev || !netif_device_present(dev))
1403 return -ENODEV; 1448 return -ENODEV;
1404 1449
1405 if (!dev->ethtool_ops)
1406 return -EOPNOTSUPP;
1407
1408 if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd))) 1450 if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
1409 return -EFAULT; 1451 return -EFAULT;
1410 1452
1453 if (!dev->ethtool_ops) {
1454 /* ETHTOOL_GDRVINFO does not require any driver support.
1455 * It is also unprivileged and does not change anything,
1456 * so we can take a shortcut to it. */
1457 if (ethcmd == ETHTOOL_GDRVINFO)
1458 return ethtool_get_drvinfo(dev, useraddr);
1459 else
1460 return -EOPNOTSUPP;
1461 }
1462
1411 /* Allow some commands to be done by anyone */ 1463 /* Allow some commands to be done by anyone */
1412 switch (ethcmd) { 1464 switch (ethcmd) {
1465 case ETHTOOL_GSET:
1413 case ETHTOOL_GDRVINFO: 1466 case ETHTOOL_GDRVINFO:
1414 case ETHTOOL_GMSGLVL: 1467 case ETHTOOL_GMSGLVL:
1415 case ETHTOOL_GCOALESCE: 1468 case ETHTOOL_GCOALESCE:
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 42e84e08a1be..82a4369ae150 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -144,7 +144,7 @@ fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net)
144} 144}
145EXPORT_SYMBOL_GPL(fib_rules_register); 145EXPORT_SYMBOL_GPL(fib_rules_register);
146 146
147void fib_rules_cleanup_ops(struct fib_rules_ops *ops) 147static void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
148{ 148{
149 struct fib_rule *rule, *tmp; 149 struct fib_rule *rule, *tmp;
150 150
@@ -153,7 +153,6 @@ void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
153 fib_rule_put(rule); 153 fib_rule_put(rule);
154 } 154 }
155} 155}
156EXPORT_SYMBOL_GPL(fib_rules_cleanup_ops);
157 156
158static void fib_rules_put_rcu(struct rcu_head *head) 157static void fib_rules_put_rcu(struct rcu_head *head)
159{ 158{
@@ -182,7 +181,8 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
182{ 181{
183 int ret = 0; 182 int ret = 0;
184 183
185 if (rule->iifindex && (rule->iifindex != fl->iif)) 184 if (rule->iifindex && (rule->iifindex != fl->iif) &&
185 !(fl->flags & FLOWI_FLAG_MATCH_ANY_IIF))
186 goto out; 186 goto out;
187 187
188 if (rule->oifindex && (rule->oifindex != fl->oif)) 188 if (rule->oifindex && (rule->oifindex != fl->oif))
@@ -225,9 +225,12 @@ jumped:
225 err = ops->action(rule, fl, flags, arg); 225 err = ops->action(rule, fl, flags, arg);
226 226
227 if (err != -EAGAIN) { 227 if (err != -EAGAIN) {
228 fib_rule_get(rule); 228 if ((arg->flags & FIB_LOOKUP_NOREF) ||
229 arg->rule = rule; 229 likely(atomic_inc_not_zero(&rule->refcnt))) {
230 goto out; 230 arg->rule = rule;
231 goto out;
232 }
233 break;
231 } 234 }
232 } 235 }
233 236
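The lookup above now takes a rule reference only while the refcount is still non-zero (atomic_inc_not_zero), so a rule that is concurrently being freed can never be resurrected. The same pattern written with C11 atomics, as a sketch:

#include <stdatomic.h>
#include <stdio.h>

struct rule { atomic_int refcnt; };

static int get_rule_if_live(struct rule *r)
{
        int old = atomic_load(&r->refcnt);

        while (old != 0) {
                if (atomic_compare_exchange_weak(&r->refcnt, &old, old + 1))
                        return 1;       /* reference taken */
                /* CAS failure reloaded 'old'; loop and retry */
        }
        return 0;                       /* already dying: caller must not use it */
}

int main(void)
{
        struct rule live = { 2 }, dying = { 0 };
        int got_live = get_rule_if_live(&live);
        int got_dying = get_rule_if_live(&dying);

        printf("live: %d, refcnt now %d\n", got_live, atomic_load(&live.refcnt));    /* 1, 3 */
        printf("dying: %d, refcnt now %d\n", got_dying, atomic_load(&dying.refcnt)); /* 0, 0 */
        return 0;
}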
@@ -348,12 +351,12 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
348 351
349 list_for_each_entry(r, &ops->rules_list, list) { 352 list_for_each_entry(r, &ops->rules_list, list) {
350 if (r->pref == rule->target) { 353 if (r->pref == rule->target) {
351 rule->ctarget = r; 354 RCU_INIT_POINTER(rule->ctarget, r);
352 break; 355 break;
353 } 356 }
354 } 357 }
355 358
356 if (rule->ctarget == NULL) 359 if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
357 unresolved = 1; 360 unresolved = 1;
358 } else if (rule->action == FR_ACT_GOTO) 361 } else if (rule->action == FR_ACT_GOTO)
359 goto errout_free; 362 goto errout_free;
@@ -370,6 +373,11 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
370 373
371 fib_rule_get(rule); 374 fib_rule_get(rule);
372 375
376 if (last)
377 list_add_rcu(&rule->list, &last->list);
378 else
379 list_add_rcu(&rule->list, &ops->rules_list);
380
373 if (ops->unresolved_rules) { 381 if (ops->unresolved_rules) {
374 /* 382 /*
375 * There are unresolved goto rules in the list, check if 383 * There are unresolved goto rules in the list, check if
@@ -378,7 +386,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
378 list_for_each_entry(r, &ops->rules_list, list) { 386 list_for_each_entry(r, &ops->rules_list, list) {
379 if (r->action == FR_ACT_GOTO && 387 if (r->action == FR_ACT_GOTO &&
380 r->target == rule->pref) { 388 r->target == rule->pref) {
381 BUG_ON(r->ctarget != NULL); 389 BUG_ON(rtnl_dereference(r->ctarget) != NULL);
382 rcu_assign_pointer(r->ctarget, rule); 390 rcu_assign_pointer(r->ctarget, rule);
383 if (--ops->unresolved_rules == 0) 391 if (--ops->unresolved_rules == 0)
384 break; 392 break;
@@ -392,11 +400,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
392 if (unresolved) 400 if (unresolved)
393 ops->unresolved_rules++; 401 ops->unresolved_rules++;
394 402
395 if (last)
396 list_add_rcu(&rule->list, &last->list);
397 else
398 list_add_rcu(&rule->list, &ops->rules_list);
399
400 notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid); 403 notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid);
401 flush_route_cache(ops); 404 flush_route_cache(ops);
402 rules_ops_put(ops); 405 rules_ops_put(ops);
@@ -484,14 +487,13 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
484 */ 487 */
485 if (ops->nr_goto_rules > 0) { 488 if (ops->nr_goto_rules > 0) {
486 list_for_each_entry(tmp, &ops->rules_list, list) { 489 list_for_each_entry(tmp, &ops->rules_list, list) {
487 if (tmp->ctarget == rule) { 490 if (rtnl_dereference(tmp->ctarget) == rule) {
488 rcu_assign_pointer(tmp->ctarget, NULL); 491 rcu_assign_pointer(tmp->ctarget, NULL);
489 ops->unresolved_rules++; 492 ops->unresolved_rules++;
490 } 493 }
491 } 494 }
492 } 495 }
493 496
494 synchronize_rcu();
495 notify_rule_change(RTM_DELRULE, rule, ops, nlh, 497 notify_rule_change(RTM_DELRULE, rule, ops, nlh,
496 NETLINK_CB(skb).pid); 498 NETLINK_CB(skb).pid);
497 fib_rule_put(rule); 499 fib_rule_put(rule);
@@ -543,7 +545,8 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
543 frh->action = rule->action; 545 frh->action = rule->action;
544 frh->flags = rule->flags; 546 frh->flags = rule->flags;
545 547
546 if (rule->action == FR_ACT_GOTO && rule->ctarget == NULL) 548 if (rule->action == FR_ACT_GOTO &&
549 rcu_dereference_raw(rule->ctarget) == NULL)
547 frh->flags |= FIB_RULE_UNRESOLVED; 550 frh->flags |= FIB_RULE_UNRESOLVED;
548 551
549 if (rule->iifname[0]) { 552 if (rule->iifname[0]) {
diff --git a/net/core/filter.c b/net/core/filter.c
index 52b051f82a01..7beaec36b541 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -89,8 +89,8 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
89 rcu_read_lock_bh(); 89 rcu_read_lock_bh();
90 filter = rcu_dereference_bh(sk->sk_filter); 90 filter = rcu_dereference_bh(sk->sk_filter);
91 if (filter) { 91 if (filter) {
92 unsigned int pkt_len = sk_run_filter(skb, filter->insns, 92 unsigned int pkt_len = sk_run_filter(skb, filter->insns, filter->len);
93 filter->len); 93
94 err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM; 94 err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
95 } 95 }
96 rcu_read_unlock_bh(); 96 rcu_read_unlock_bh();
@@ -638,10 +638,9 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
638 return err; 638 return err;
639 } 639 }
640 640
641 rcu_read_lock_bh(); 641 old_fp = rcu_dereference_protected(sk->sk_filter,
642 old_fp = rcu_dereference_bh(sk->sk_filter); 642 sock_owned_by_user(sk));
643 rcu_assign_pointer(sk->sk_filter, fp); 643 rcu_assign_pointer(sk->sk_filter, fp);
644 rcu_read_unlock_bh();
645 644
646 if (old_fp) 645 if (old_fp)
647 sk_filter_delayed_uncharge(sk, old_fp); 646 sk_filter_delayed_uncharge(sk, old_fp);
@@ -654,14 +653,13 @@ int sk_detach_filter(struct sock *sk)
654 int ret = -ENOENT; 653 int ret = -ENOENT;
655 struct sk_filter *filter; 654 struct sk_filter *filter;
656 655
657 rcu_read_lock_bh(); 656 filter = rcu_dereference_protected(sk->sk_filter,
658 filter = rcu_dereference_bh(sk->sk_filter); 657 sock_owned_by_user(sk));
659 if (filter) { 658 if (filter) {
660 rcu_assign_pointer(sk->sk_filter, NULL); 659 rcu_assign_pointer(sk->sk_filter, NULL);
661 sk_filter_delayed_uncharge(sk, filter); 660 sk_filter_delayed_uncharge(sk, filter);
662 ret = 0; 661 ret = 0;
663 } 662 }
664 rcu_read_unlock_bh();
665 return ret; 663 return ret;
666} 664}
667EXPORT_SYMBOL_GPL(sk_detach_filter); 665EXPORT_SYMBOL_GPL(sk_detach_filter);
diff --git a/net/core/flow.c b/net/core/flow.c
index f67dcbfe54ef..127c8a7ffd61 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -53,8 +53,7 @@ struct flow_flush_info {
53 53
54struct flow_cache { 54struct flow_cache {
55 u32 hash_shift; 55 u32 hash_shift;
56 unsigned long order; 56 struct flow_cache_percpu __percpu *percpu;
57 struct flow_cache_percpu *percpu;
58 struct notifier_block hotcpu_notifier; 57 struct notifier_block hotcpu_notifier;
59 int low_watermark; 58 int low_watermark;
60 int high_watermark; 59 int high_watermark;
@@ -64,7 +63,7 @@ struct flow_cache {
64atomic_t flow_cache_genid = ATOMIC_INIT(0); 63atomic_t flow_cache_genid = ATOMIC_INIT(0);
65EXPORT_SYMBOL(flow_cache_genid); 64EXPORT_SYMBOL(flow_cache_genid);
66static struct flow_cache flow_cache_global; 65static struct flow_cache flow_cache_global;
67static struct kmem_cache *flow_cachep; 66static struct kmem_cache *flow_cachep __read_mostly;
68 67
69static DEFINE_SPINLOCK(flow_cache_gc_lock); 68static DEFINE_SPINLOCK(flow_cache_gc_lock);
70static LIST_HEAD(flow_cache_gc_list); 69static LIST_HEAD(flow_cache_gc_list);
@@ -177,15 +176,11 @@ static u32 flow_hash_code(struct flow_cache *fc,
177{ 176{
178 u32 *k = (u32 *) key; 177 u32 *k = (u32 *) key;
179 178
180 return (jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd) 179 return jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd)
181 & (flow_cache_hash_size(fc) - 1)); 180 & (flow_cache_hash_size(fc) - 1);
182} 181}
183 182
184#if (BITS_PER_LONG == 64) 183typedef unsigned long flow_compare_t;
185typedef u64 flow_compare_t;
186#else
187typedef u32 flow_compare_t;
188#endif
189 184
190/* I hear what you're saying, use memcmp. But memcmp cannot make 185/* I hear what you're saying, use memcmp. But memcmp cannot make
191 * important assumptions that we can here, such as alignment and 186 * important assumptions that we can here, such as alignment and
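The switch to a plain unsigned long flow_compare_t above preserves the word-at-a-time key comparison that this comment defends: the flow key is long aligned and a whole number of words, so it can be walked as an array of longs rather than compared byte-wise with memcmp. A stand-alone illustration with a made-up key layout:

#include <stdio.h>

typedef unsigned long flow_compare_t;

struct flow_key {
        unsigned long src, dst;
        unsigned long proto, ports;
};

static int flow_key_cmp(const struct flow_key *a, const struct flow_key *b)
{
        const flow_compare_t *ka = (const flow_compare_t *)a;
        const flow_compare_t *kb = (const flow_compare_t *)b;
        size_t n = sizeof(*a) / sizeof(flow_compare_t);

        /* the shape check the kernel performs with BUILD_BUG_ON() */
        _Static_assert(sizeof(struct flow_key) % sizeof(flow_compare_t) == 0,
                       "key must be a whole number of words");

        while (n--)
                if (*ka++ != *kb++)
                        return 1;       /* keys differ */
        return 0;                       /* keys equal */
}

int main(void)
{
        struct flow_key a = { 1, 2, 6, 0x1f90 }, b = a;

        printf("%d\n", flow_key_cmp(&a, &b));   /* 0 */
        b.ports = 0x0050;
        printf("%d\n", flow_key_cmp(&a, &b));   /* 1 */
        return 0;
}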
@@ -357,62 +352,73 @@ void flow_cache_flush(void)
357 put_online_cpus(); 352 put_online_cpus();
358} 353}
359 354
360static void __init flow_cache_cpu_prepare(struct flow_cache *fc, 355static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
361 struct flow_cache_percpu *fcp)
362{ 356{
363 fcp->hash_table = (struct hlist_head *) 357 struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
364 __get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order); 358 size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc);
365 if (!fcp->hash_table)
366 panic("NET: failed to allocate flow cache order %lu\n", fc->order);
367 359
368 fcp->hash_rnd_recalc = 1; 360 if (!fcp->hash_table) {
369 fcp->hash_count = 0; 361 fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
370 tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0); 362 if (!fcp->hash_table) {
363 pr_err("NET: failed to allocate flow cache sz %zu\n", sz);
364 return -ENOMEM;
365 }
366 fcp->hash_rnd_recalc = 1;
367 fcp->hash_count = 0;
368 tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
369 }
370 return 0;
371} 371}
372 372
373static int flow_cache_cpu(struct notifier_block *nfb, 373static int __cpuinit flow_cache_cpu(struct notifier_block *nfb,
374 unsigned long action, 374 unsigned long action,
375 void *hcpu) 375 void *hcpu)
376{ 376{
377 struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier); 377 struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier);
378 int cpu = (unsigned long) hcpu; 378 int res, cpu = (unsigned long) hcpu;
379 struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); 379 struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
380 380
381 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) 381 switch (action) {
382 case CPU_UP_PREPARE:
383 case CPU_UP_PREPARE_FROZEN:
384 res = flow_cache_cpu_prepare(fc, cpu);
385 if (res)
386 return notifier_from_errno(res);
387 break;
388 case CPU_DEAD:
389 case CPU_DEAD_FROZEN:
382 __flow_cache_shrink(fc, fcp, 0); 390 __flow_cache_shrink(fc, fcp, 0);
391 break;
392 }
383 return NOTIFY_OK; 393 return NOTIFY_OK;
384} 394}
385 395
386static int flow_cache_init(struct flow_cache *fc) 396static int __init flow_cache_init(struct flow_cache *fc)
387{ 397{
388 unsigned long order;
389 int i; 398 int i;
390 399
391 fc->hash_shift = 10; 400 fc->hash_shift = 10;
392 fc->low_watermark = 2 * flow_cache_hash_size(fc); 401 fc->low_watermark = 2 * flow_cache_hash_size(fc);
393 fc->high_watermark = 4 * flow_cache_hash_size(fc); 402 fc->high_watermark = 4 * flow_cache_hash_size(fc);
394 403
395 for (order = 0;
396 (PAGE_SIZE << order) <
397 (sizeof(struct hlist_head)*flow_cache_hash_size(fc));
398 order++)
399 /* NOTHING */;
400 fc->order = order;
401 fc->percpu = alloc_percpu(struct flow_cache_percpu); 404 fc->percpu = alloc_percpu(struct flow_cache_percpu);
405 if (!fc->percpu)
406 return -ENOMEM;
402 407
403 setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, 408 for_each_online_cpu(i) {
404 (unsigned long) fc); 409 if (flow_cache_cpu_prepare(fc, i))
405 fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; 410 return -ENOMEM;
406 add_timer(&fc->rnd_timer); 411 }
407
408 for_each_possible_cpu(i)
409 flow_cache_cpu_prepare(fc, per_cpu_ptr(fc->percpu, i));
410
411 fc->hotcpu_notifier = (struct notifier_block){ 412 fc->hotcpu_notifier = (struct notifier_block){
412 .notifier_call = flow_cache_cpu, 413 .notifier_call = flow_cache_cpu,
413 }; 414 };
414 register_hotcpu_notifier(&fc->hotcpu_notifier); 415 register_hotcpu_notifier(&fc->hotcpu_notifier);
415 416
417 setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
418 (unsigned long) fc);
419 fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
420 add_timer(&fc->rnd_timer);
421
416 return 0; 422 return 0;
417} 423}
418 424
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 9fbe7f7429b0..7c2373321b74 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -232,7 +232,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
232 est->last_packets = bstats->packets; 232 est->last_packets = bstats->packets;
233 est->avpps = rate_est->pps<<10; 233 est->avpps = rate_est->pps<<10;
234 234
235 spin_lock(&est_tree_lock); 235 spin_lock_bh(&est_tree_lock);
236 if (!elist[idx].timer.function) { 236 if (!elist[idx].timer.function) {
237 INIT_LIST_HEAD(&elist[idx].list); 237 INIT_LIST_HEAD(&elist[idx].list);
238 setup_timer(&elist[idx].timer, est_timer, idx); 238 setup_timer(&elist[idx].timer, est_timer, idx);
@@ -243,7 +243,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
243 243
244 list_add_rcu(&est->list, &elist[idx].list); 244 list_add_rcu(&est->list, &elist[idx].list);
245 gen_add_node(est); 245 gen_add_node(est);
246 spin_unlock(&est_tree_lock); 246 spin_unlock_bh(&est_tree_lock);
247 247
248 return 0; 248 return 0;
249} 249}
@@ -270,18 +270,18 @@ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
270{ 270{
271 struct gen_estimator *e; 271 struct gen_estimator *e;
272 272
273 spin_lock(&est_tree_lock); 273 spin_lock_bh(&est_tree_lock);
274 while ((e = gen_find_node(bstats, rate_est))) { 274 while ((e = gen_find_node(bstats, rate_est))) {
275 rb_erase(&e->node, &est_root); 275 rb_erase(&e->node, &est_root);
276 276
277 write_lock_bh(&est_lock); 277 write_lock(&est_lock);
278 e->bstats = NULL; 278 e->bstats = NULL;
279 write_unlock_bh(&est_lock); 279 write_unlock(&est_lock);
280 280
281 list_del_rcu(&e->list); 281 list_del_rcu(&e->list);
282 call_rcu(&e->e_rcu, __gen_kill_estimator); 282 call_rcu(&e->e_rcu, __gen_kill_estimator);
283 } 283 }
284 spin_unlock(&est_tree_lock); 284 spin_unlock_bh(&est_tree_lock);
285} 285}
286EXPORT_SYMBOL(gen_kill_estimator); 286EXPORT_SYMBOL(gen_kill_estimator);
287 287
@@ -320,9 +320,9 @@ bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
320 320
321 ASSERT_RTNL(); 321 ASSERT_RTNL();
322 322
323 spin_lock(&est_tree_lock); 323 spin_lock_bh(&est_tree_lock);
324 res = gen_find_node(bstats, rate_est) != NULL; 324 res = gen_find_node(bstats, rate_est) != NULL;
325 spin_unlock(&est_tree_lock); 325 spin_unlock_bh(&est_tree_lock);
326 326
327 return res; 327 return res;
328} 328}
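
The gen_estimator.c hunk above moves est_tree_lock to the _bh lock variants because the estimator tree is also walked from the timer (softirq) path, and it correspondingly drops the _bh suffix on the inner est_lock, which is now only taken with bottom halves already disabled by the outer lock. The nesting it establishes looks roughly like this (sketch with placeholder lock names, not the literal code):

    spin_lock_bh(&tree_lock);       /* outer lock also disables bottom halves    */
    write_lock(&entry_lock);        /* plain variant suffices under the _bh lock */
    /* ... modify the entry ... */
    write_unlock(&entry_lock);
    spin_unlock_bh(&tree_lock);
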
diff --git a/net/core/iovec.c b/net/core/iovec.c
index 1cd98df412df..c40f27e7d208 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -37,11 +37,13 @@
37 37
38int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode) 38int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode)
39{ 39{
40 int size, err, ct; 40 int size, ct, err;
41 41
42 if (m->msg_namelen) { 42 if (m->msg_namelen) {
43 if (mode == VERIFY_READ) { 43 if (mode == VERIFY_READ) {
44 err = move_addr_to_kernel(m->msg_name, m->msg_namelen, 44 void __user *namep;
45 namep = (void __user __force *) m->msg_name;
46 err = move_addr_to_kernel(namep, m->msg_namelen,
45 address); 47 address);
46 if (err < 0) 48 if (err < 0)
47 return err; 49 return err;
@@ -52,21 +54,20 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address,
52 } 54 }
53 55
54 size = m->msg_iovlen * sizeof(struct iovec); 56 size = m->msg_iovlen * sizeof(struct iovec);
55 if (copy_from_user(iov, m->msg_iov, size)) 57 if (copy_from_user(iov, (void __user __force *) m->msg_iov, size))
56 return -EFAULT; 58 return -EFAULT;
57 59
58 m->msg_iov = iov; 60 m->msg_iov = iov;
59 err = 0; 61 err = 0;
60 62
61 for (ct = 0; ct < m->msg_iovlen; ct++) { 63 for (ct = 0; ct < m->msg_iovlen; ct++) {
62 err += iov[ct].iov_len; 64 size_t len = iov[ct].iov_len;
63 /* 65
64 * Goal is not to verify user data, but to prevent returning 66 if (len > INT_MAX - err) {
65 * negative value, which is interpreted as errno. 67 len = INT_MAX - err;
66 * Overflow is still possible, but it is harmless. 68 iov[ct].iov_len = len;
67 */ 69 }
68 if (err < 0) 70 err += len;
69 return -EMSGSIZE;
70 } 71 }
71 72
72 return err; 73 return err;
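
In the iovec.c hunk above, verify_iovec() no longer rejects oversized vectors with -EMSGSIZE; each iov_len is instead clamped so the running int total can never pass INT_MAX, and the msg_name/msg_iov casts only add __user __force annotations for sparse. A sketch of the clamping rule, with an illustrative numeric case in the comments:

    size_t len = iov[ct].iov_len;
    if (len > INT_MAX - err) {
            len = INT_MAX - err;            /* e.g. 100 requested, 10 left -> 10 */
            iov[ct].iov_len = len;          /* trim the offending segment        */
    }
    err += len;                             /* total saturates at INT_MAX        */
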
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index a4e0a7482c2b..8cc8f9a79db9 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -122,7 +122,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh)
122 122
123unsigned long neigh_rand_reach_time(unsigned long base) 123unsigned long neigh_rand_reach_time(unsigned long base)
124{ 124{
125 return (base ? (net_random() % base) + (base >> 1) : 0); 125 return base ? (net_random() % base) + (base >> 1) : 0;
126} 126}
127EXPORT_SYMBOL(neigh_rand_reach_time); 127EXPORT_SYMBOL(neigh_rand_reach_time);
128 128
@@ -131,15 +131,20 @@ static int neigh_forced_gc(struct neigh_table *tbl)
131{ 131{
132 int shrunk = 0; 132 int shrunk = 0;
133 int i; 133 int i;
134 struct neigh_hash_table *nht;
134 135
135 NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); 136 NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
136 137
137 write_lock_bh(&tbl->lock); 138 write_lock_bh(&tbl->lock);
138 for (i = 0; i <= tbl->hash_mask; i++) { 139 nht = rcu_dereference_protected(tbl->nht,
139 struct neighbour *n, **np; 140 lockdep_is_held(&tbl->lock));
141 for (i = 0; i <= nht->hash_mask; i++) {
142 struct neighbour *n;
143 struct neighbour __rcu **np;
140 144
141 np = &tbl->hash_buckets[i]; 145 np = &nht->hash_buckets[i];
142 while ((n = *np) != NULL) { 146 while ((n = rcu_dereference_protected(*np,
147 lockdep_is_held(&tbl->lock))) != NULL) {
143 /* Neighbour record may be discarded if: 148 /* Neighbour record may be discarded if:
144 * - nobody refers to it. 149 * - nobody refers to it.
145 * - it is not permanent 150 * - it is not permanent
@@ -147,7 +152,9 @@ static int neigh_forced_gc(struct neigh_table *tbl)
147 write_lock(&n->lock); 152 write_lock(&n->lock);
148 if (atomic_read(&n->refcnt) == 1 && 153 if (atomic_read(&n->refcnt) == 1 &&
149 !(n->nud_state & NUD_PERMANENT)) { 154 !(n->nud_state & NUD_PERMANENT)) {
150 *np = n->next; 155 rcu_assign_pointer(*np,
156 rcu_dereference_protected(n->next,
157 lockdep_is_held(&tbl->lock)));
151 n->dead = 1; 158 n->dead = 1;
152 shrunk = 1; 159 shrunk = 1;
153 write_unlock(&n->lock); 160 write_unlock(&n->lock);
@@ -199,16 +206,24 @@ static void pneigh_queue_purge(struct sk_buff_head *list)
199static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) 206static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
200{ 207{
201 int i; 208 int i;
209 struct neigh_hash_table *nht;
202 210
203 for (i = 0; i <= tbl->hash_mask; i++) { 211 nht = rcu_dereference_protected(tbl->nht,
204 struct neighbour *n, **np = &tbl->hash_buckets[i]; 212 lockdep_is_held(&tbl->lock));
205 213
206 while ((n = *np) != NULL) { 214 for (i = 0; i <= nht->hash_mask; i++) {
215 struct neighbour *n;
216 struct neighbour __rcu **np = &nht->hash_buckets[i];
217
218 while ((n = rcu_dereference_protected(*np,
219 lockdep_is_held(&tbl->lock))) != NULL) {
207 if (dev && n->dev != dev) { 220 if (dev && n->dev != dev) {
208 np = &n->next; 221 np = &n->next;
209 continue; 222 continue;
210 } 223 }
211 *np = n->next; 224 rcu_assign_pointer(*np,
225 rcu_dereference_protected(n->next,
226 lockdep_is_held(&tbl->lock)));
212 write_lock(&n->lock); 227 write_lock(&n->lock);
213 neigh_del_timer(n); 228 neigh_del_timer(n);
214 n->dead = 1; 229 n->dead = 1;
@@ -279,6 +294,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
279 294
280 skb_queue_head_init(&n->arp_queue); 295 skb_queue_head_init(&n->arp_queue);
281 rwlock_init(&n->lock); 296 rwlock_init(&n->lock);
297 seqlock_init(&n->ha_lock);
282 n->updated = n->used = now; 298 n->updated = n->used = now;
283 n->nud_state = NUD_NONE; 299 n->nud_state = NUD_NONE;
284 n->output = neigh_blackhole; 300 n->output = neigh_blackhole;
@@ -297,64 +313,86 @@ out_entries:
297 goto out; 313 goto out;
298} 314}
299 315
300static struct neighbour **neigh_hash_alloc(unsigned int entries) 316static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries)
301{ 317{
302 unsigned long size = entries * sizeof(struct neighbour *); 318 size_t size = entries * sizeof(struct neighbour *);
303 struct neighbour **ret; 319 struct neigh_hash_table *ret;
320 struct neighbour **buckets;
304 321
305 if (size <= PAGE_SIZE) { 322 ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
306 ret = kzalloc(size, GFP_ATOMIC); 323 if (!ret)
307 } else { 324 return NULL;
308 ret = (struct neighbour **) 325 if (size <= PAGE_SIZE)
309 __get_free_pages(GFP_ATOMIC|__GFP_ZERO, get_order(size)); 326 buckets = kzalloc(size, GFP_ATOMIC);
327 else
328 buckets = (struct neighbour **)
329 __get_free_pages(GFP_ATOMIC | __GFP_ZERO,
330 get_order(size));
331 if (!buckets) {
332 kfree(ret);
333 return NULL;
310 } 334 }
335 rcu_assign_pointer(ret->hash_buckets, buckets);
336 ret->hash_mask = entries - 1;
337 get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd));
311 return ret; 338 return ret;
312} 339}
313 340
314static void neigh_hash_free(struct neighbour **hash, unsigned int entries) 341static void neigh_hash_free_rcu(struct rcu_head *head)
315{ 342{
316 unsigned long size = entries * sizeof(struct neighbour *); 343 struct neigh_hash_table *nht = container_of(head,
344 struct neigh_hash_table,
345 rcu);
346 size_t size = (nht->hash_mask + 1) * sizeof(struct neighbour *);
347 struct neighbour **buckets = nht->hash_buckets;
317 348
318 if (size <= PAGE_SIZE) 349 if (size <= PAGE_SIZE)
319 kfree(hash); 350 kfree(buckets);
320 else 351 else
321 free_pages((unsigned long)hash, get_order(size)); 352 free_pages((unsigned long)buckets, get_order(size));
353 kfree(nht);
322} 354}
323 355
324static void neigh_hash_grow(struct neigh_table *tbl, unsigned long new_entries) 356static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
357 unsigned long new_entries)
325{ 358{
326 struct neighbour **new_hash, **old_hash; 359 unsigned int i, hash;
327 unsigned int i, new_hash_mask, old_entries; 360 struct neigh_hash_table *new_nht, *old_nht;
328 361
329 NEIGH_CACHE_STAT_INC(tbl, hash_grows); 362 NEIGH_CACHE_STAT_INC(tbl, hash_grows);
330 363
331 BUG_ON(!is_power_of_2(new_entries)); 364 BUG_ON(!is_power_of_2(new_entries));
332 new_hash = neigh_hash_alloc(new_entries); 365 old_nht = rcu_dereference_protected(tbl->nht,
333 if (!new_hash) 366 lockdep_is_held(&tbl->lock));
334 return; 367 new_nht = neigh_hash_alloc(new_entries);
335 368 if (!new_nht)
336 old_entries = tbl->hash_mask + 1; 369 return old_nht;
337 new_hash_mask = new_entries - 1;
338 old_hash = tbl->hash_buckets;
339 370
340 get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); 371 for (i = 0; i <= old_nht->hash_mask; i++) {
341 for (i = 0; i < old_entries; i++) {
342 struct neighbour *n, *next; 372 struct neighbour *n, *next;
343 373
344 for (n = old_hash[i]; n; n = next) { 374 for (n = rcu_dereference_protected(old_nht->hash_buckets[i],
345 unsigned int hash_val = tbl->hash(n->primary_key, n->dev); 375 lockdep_is_held(&tbl->lock));
346 376 n != NULL;
347 hash_val &= new_hash_mask; 377 n = next) {
348 next = n->next; 378 hash = tbl->hash(n->primary_key, n->dev,
349 379 new_nht->hash_rnd);
350 n->next = new_hash[hash_val]; 380
351 new_hash[hash_val] = n; 381 hash &= new_nht->hash_mask;
382 next = rcu_dereference_protected(n->next,
383 lockdep_is_held(&tbl->lock));
384
385 rcu_assign_pointer(n->next,
386 rcu_dereference_protected(
387 new_nht->hash_buckets[hash],
388 lockdep_is_held(&tbl->lock)));
389 rcu_assign_pointer(new_nht->hash_buckets[hash], n);
352 } 390 }
353 } 391 }
354 tbl->hash_buckets = new_hash;
355 tbl->hash_mask = new_hash_mask;
356 392
357 neigh_hash_free(old_hash, old_entries); 393 rcu_assign_pointer(tbl->nht, new_nht);
394 call_rcu(&old_nht->rcu, neigh_hash_free_rcu);
395 return new_nht;
358} 396}
359 397
360struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, 398struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
@@ -363,19 +401,26 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
363 struct neighbour *n; 401 struct neighbour *n;
364 int key_len = tbl->key_len; 402 int key_len = tbl->key_len;
365 u32 hash_val; 403 u32 hash_val;
404 struct neigh_hash_table *nht;
366 405
367 NEIGH_CACHE_STAT_INC(tbl, lookups); 406 NEIGH_CACHE_STAT_INC(tbl, lookups);
368 407
369 read_lock_bh(&tbl->lock); 408 rcu_read_lock_bh();
370 hash_val = tbl->hash(pkey, dev); 409 nht = rcu_dereference_bh(tbl->nht);
371 for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) { 410 hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask;
411
412 for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
413 n != NULL;
414 n = rcu_dereference_bh(n->next)) {
372 if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) { 415 if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
373 neigh_hold(n); 416 if (!atomic_inc_not_zero(&n->refcnt))
417 n = NULL;
374 NEIGH_CACHE_STAT_INC(tbl, hits); 418 NEIGH_CACHE_STAT_INC(tbl, hits);
375 break; 419 break;
376 } 420 }
377 } 421 }
378 read_unlock_bh(&tbl->lock); 422
423 rcu_read_unlock_bh();
379 return n; 424 return n;
380} 425}
381EXPORT_SYMBOL(neigh_lookup); 426EXPORT_SYMBOL(neigh_lookup);
@@ -386,20 +431,27 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
386 struct neighbour *n; 431 struct neighbour *n;
387 int key_len = tbl->key_len; 432 int key_len = tbl->key_len;
388 u32 hash_val; 433 u32 hash_val;
434 struct neigh_hash_table *nht;
389 435
390 NEIGH_CACHE_STAT_INC(tbl, lookups); 436 NEIGH_CACHE_STAT_INC(tbl, lookups);
391 437
392 read_lock_bh(&tbl->lock); 438 rcu_read_lock_bh();
393 hash_val = tbl->hash(pkey, NULL); 439 nht = rcu_dereference_bh(tbl->nht);
394 for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) { 440 hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) & nht->hash_mask;
441
442 for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
443 n != NULL;
444 n = rcu_dereference_bh(n->next)) {
395 if (!memcmp(n->primary_key, pkey, key_len) && 445 if (!memcmp(n->primary_key, pkey, key_len) &&
396 net_eq(dev_net(n->dev), net)) { 446 net_eq(dev_net(n->dev), net)) {
397 neigh_hold(n); 447 if (!atomic_inc_not_zero(&n->refcnt))
448 n = NULL;
398 NEIGH_CACHE_STAT_INC(tbl, hits); 449 NEIGH_CACHE_STAT_INC(tbl, hits);
399 break; 450 break;
400 } 451 }
401 } 452 }
402 read_unlock_bh(&tbl->lock); 453
454 rcu_read_unlock_bh();
403 return n; 455 return n;
404} 456}
405EXPORT_SYMBOL(neigh_lookup_nodev); 457EXPORT_SYMBOL(neigh_lookup_nodev);
@@ -411,6 +463,7 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
411 int key_len = tbl->key_len; 463 int key_len = tbl->key_len;
412 int error; 464 int error;
413 struct neighbour *n1, *rc, *n = neigh_alloc(tbl); 465 struct neighbour *n1, *rc, *n = neigh_alloc(tbl);
466 struct neigh_hash_table *nht;
414 467
415 if (!n) { 468 if (!n) {
416 rc = ERR_PTR(-ENOBUFS); 469 rc = ERR_PTR(-ENOBUFS);
@@ -437,18 +490,24 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
437 n->confirmed = jiffies - (n->parms->base_reachable_time << 1); 490 n->confirmed = jiffies - (n->parms->base_reachable_time << 1);
438 491
439 write_lock_bh(&tbl->lock); 492 write_lock_bh(&tbl->lock);
493 nht = rcu_dereference_protected(tbl->nht,
494 lockdep_is_held(&tbl->lock));
440 495
441 if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1)) 496 if (atomic_read(&tbl->entries) > (nht->hash_mask + 1))
442 neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1); 497 nht = neigh_hash_grow(tbl, (nht->hash_mask + 1) << 1);
443 498
444 hash_val = tbl->hash(pkey, dev) & tbl->hash_mask; 499 hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask;
445 500
446 if (n->parms->dead) { 501 if (n->parms->dead) {
447 rc = ERR_PTR(-EINVAL); 502 rc = ERR_PTR(-EINVAL);
448 goto out_tbl_unlock; 503 goto out_tbl_unlock;
449 } 504 }
450 505
451 for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) { 506 for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],
507 lockdep_is_held(&tbl->lock));
508 n1 != NULL;
509 n1 = rcu_dereference_protected(n1->next,
510 lockdep_is_held(&tbl->lock))) {
452 if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { 511 if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
453 neigh_hold(n1); 512 neigh_hold(n1);
454 rc = n1; 513 rc = n1;
@@ -456,10 +515,12 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
456 } 515 }
457 } 516 }
458 517
459 n->next = tbl->hash_buckets[hash_val];
460 tbl->hash_buckets[hash_val] = n;
461 n->dead = 0; 518 n->dead = 0;
462 neigh_hold(n); 519 neigh_hold(n);
520 rcu_assign_pointer(n->next,
521 rcu_dereference_protected(nht->hash_buckets[hash_val],
522 lockdep_is_held(&tbl->lock)));
523 rcu_assign_pointer(nht->hash_buckets[hash_val], n);
463 write_unlock_bh(&tbl->lock); 524 write_unlock_bh(&tbl->lock);
464 NEIGH_PRINTK2("neigh %p is created.\n", n); 525 NEIGH_PRINTK2("neigh %p is created.\n", n);
465 rc = n; 526 rc = n;
@@ -616,6 +677,12 @@ static inline void neigh_parms_put(struct neigh_parms *parms)
616 neigh_parms_destroy(parms); 677 neigh_parms_destroy(parms);
617} 678}
618 679
680static void neigh_destroy_rcu(struct rcu_head *head)
681{
682 struct neighbour *neigh = container_of(head, struct neighbour, rcu);
683
684 kmem_cache_free(neigh->tbl->kmem_cachep, neigh);
685}
619/* 686/*
620 * neighbour must already be out of the table; 687 * neighbour must already be out of the table;
621 * 688 *
@@ -643,8 +710,7 @@ void neigh_destroy(struct neighbour *neigh)
643 write_seqlock_bh(&hh->hh_lock); 710 write_seqlock_bh(&hh->hh_lock);
644 hh->hh_output = neigh_blackhole; 711 hh->hh_output = neigh_blackhole;
645 write_sequnlock_bh(&hh->hh_lock); 712 write_sequnlock_bh(&hh->hh_lock);
646 if (atomic_dec_and_test(&hh->hh_refcnt)) 713 hh_cache_put(hh);
647 kfree(hh);
648 } 714 }
649 715
650 skb_queue_purge(&neigh->arp_queue); 716 skb_queue_purge(&neigh->arp_queue);
@@ -655,7 +721,7 @@ void neigh_destroy(struct neighbour *neigh)
655 NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); 721 NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh);
656 722
657 atomic_dec(&neigh->tbl->entries); 723 atomic_dec(&neigh->tbl->entries);
658 kmem_cache_free(neigh->tbl->kmem_cachep, neigh); 724 call_rcu(&neigh->rcu, neigh_destroy_rcu);
659} 725}
660EXPORT_SYMBOL(neigh_destroy); 726EXPORT_SYMBOL(neigh_destroy);
661 727
@@ -696,12 +762,16 @@ static void neigh_connect(struct neighbour *neigh)
696static void neigh_periodic_work(struct work_struct *work) 762static void neigh_periodic_work(struct work_struct *work)
697{ 763{
698 struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); 764 struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
699 struct neighbour *n, **np; 765 struct neighbour *n;
766 struct neighbour __rcu **np;
700 unsigned int i; 767 unsigned int i;
768 struct neigh_hash_table *nht;
701 769
702 NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); 770 NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
703 771
704 write_lock_bh(&tbl->lock); 772 write_lock_bh(&tbl->lock);
773 nht = rcu_dereference_protected(tbl->nht,
774 lockdep_is_held(&tbl->lock));
705 775
706 /* 776 /*
707 * periodically recompute ReachableTime from random function 777 * periodically recompute ReachableTime from random function
@@ -715,10 +785,11 @@ static void neigh_periodic_work(struct work_struct *work)
715 neigh_rand_reach_time(p->base_reachable_time); 785 neigh_rand_reach_time(p->base_reachable_time);
716 } 786 }
717 787
718 for (i = 0 ; i <= tbl->hash_mask; i++) { 788 for (i = 0 ; i <= nht->hash_mask; i++) {
719 np = &tbl->hash_buckets[i]; 789 np = &nht->hash_buckets[i];
720 790
721 while ((n = *np) != NULL) { 791 while ((n = rcu_dereference_protected(*np,
792 lockdep_is_held(&tbl->lock))) != NULL) {
722 unsigned int state; 793 unsigned int state;
723 794
724 write_lock(&n->lock); 795 write_lock(&n->lock);
@@ -766,9 +837,9 @@ next_elt:
766static __inline__ int neigh_max_probes(struct neighbour *n) 837static __inline__ int neigh_max_probes(struct neighbour *n)
767{ 838{
768 struct neigh_parms *p = n->parms; 839 struct neigh_parms *p = n->parms;
769 return (n->nud_state & NUD_PROBE ? 840 return (n->nud_state & NUD_PROBE) ?
770 p->ucast_probes : 841 p->ucast_probes :
771 p->ucast_probes + p->app_probes + p->mcast_probes); 842 p->ucast_probes + p->app_probes + p->mcast_probes;
772} 843}
773 844
774static void neigh_invalidate(struct neighbour *neigh) 845static void neigh_invalidate(struct neighbour *neigh)
@@ -945,7 +1016,7 @@ out_unlock_bh:
945} 1016}
946EXPORT_SYMBOL(__neigh_event_send); 1017EXPORT_SYMBOL(__neigh_event_send);
947 1018
948static void neigh_update_hhs(struct neighbour *neigh) 1019static void neigh_update_hhs(const struct neighbour *neigh)
949{ 1020{
950 struct hh_cache *hh; 1021 struct hh_cache *hh;
951 void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *) 1022 void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *)
@@ -1081,7 +1152,9 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
1081 } 1152 }
1082 1153
1083 if (lladdr != neigh->ha) { 1154 if (lladdr != neigh->ha) {
1155 write_seqlock(&neigh->ha_lock);
1084 memcpy(&neigh->ha, lladdr, dev->addr_len); 1156 memcpy(&neigh->ha, lladdr, dev->addr_len);
1157 write_sequnlock(&neigh->ha_lock);
1085 neigh_update_hhs(neigh); 1158 neigh_update_hhs(neigh);
1086 if (!(new & NUD_CONNECTED)) 1159 if (!(new & NUD_CONNECTED))
1087 neigh->confirmed = jiffies - 1160 neigh->confirmed = jiffies -
@@ -1139,44 +1212,73 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl,
1139} 1212}
1140EXPORT_SYMBOL(neigh_event_ns); 1213EXPORT_SYMBOL(neigh_event_ns);
1141 1214
1215static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst,
1216 __be16 protocol)
1217{
1218 struct hh_cache *hh;
1219
1220 smp_rmb(); /* paired with smp_wmb() in neigh_hh_init() */
1221 for (hh = n->hh; hh; hh = hh->hh_next) {
1222 if (hh->hh_type == protocol) {
1223 atomic_inc(&hh->hh_refcnt);
1224 if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
1225 hh_cache_put(hh);
1226 return true;
1227 }
1228 }
1229 return false;
1230}
1231
1232/* called with read_lock_bh(&n->lock); */
1142static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, 1233static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst,
1143 __be16 protocol) 1234 __be16 protocol)
1144{ 1235{
1145 struct hh_cache *hh; 1236 struct hh_cache *hh;
1146 struct net_device *dev = dst->dev; 1237 struct net_device *dev = dst->dev;
1147 1238
1148 for (hh = n->hh; hh; hh = hh->hh_next) 1239 if (likely(neigh_hh_lookup(n, dst, protocol)))
1149 if (hh->hh_type == protocol) 1240 return;
1150 break;
1151 1241
1152 if (!hh && (hh = kzalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) { 1242 /* slow path */
1153 seqlock_init(&hh->hh_lock); 1243 hh = kzalloc(sizeof(*hh), GFP_ATOMIC);
1154 hh->hh_type = protocol; 1244 if (!hh)
1155 atomic_set(&hh->hh_refcnt, 0); 1245 return;
1156 hh->hh_next = NULL;
1157 1246
1158 if (dev->header_ops->cache(n, hh)) { 1247 seqlock_init(&hh->hh_lock);
1159 kfree(hh); 1248 hh->hh_type = protocol;
1160 hh = NULL; 1249 atomic_set(&hh->hh_refcnt, 2);
1161 } else { 1250
1162 atomic_inc(&hh->hh_refcnt); 1251 if (dev->header_ops->cache(n, hh)) {
1163 hh->hh_next = n->hh; 1252 kfree(hh);
1164 n->hh = hh; 1253 return;
1165 if (n->nud_state & NUD_CONNECTED)
1166 hh->hh_output = n->ops->hh_output;
1167 else
1168 hh->hh_output = n->ops->output;
1169 }
1170 } 1254 }
1171 if (hh) { 1255
1172 atomic_inc(&hh->hh_refcnt); 1256 write_lock_bh(&n->lock);
1173 dst->hh = hh; 1257
1258 /* must check if another thread already did the insert */
1259 if (neigh_hh_lookup(n, dst, protocol)) {
1260 kfree(hh);
1261 goto end;
1174 } 1262 }
1263
1264 if (n->nud_state & NUD_CONNECTED)
1265 hh->hh_output = n->ops->hh_output;
1266 else
1267 hh->hh_output = n->ops->output;
1268
1269 hh->hh_next = n->hh;
1270 smp_wmb(); /* paired with smp_rmb() in neigh_hh_lookup() */
1271 n->hh = hh;
1272
1273 if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
1274 hh_cache_put(hh);
1275end:
1276 write_unlock_bh(&n->lock);
1175} 1277}
1176 1278
1177/* This function can be used in contexts, where only old dev_queue_xmit 1279/* This function can be used in contexts, where only old dev_queue_xmit
1178 worked, f.e. if you want to override normal output path (eql, shaper), 1280 * worked, f.e. if you want to override normal output path (eql, shaper),
1179 but resolution is not made yet. 1281 * but resolution is not made yet.
1180 */ 1282 */
1181 1283
1182int neigh_compat_output(struct sk_buff *skb) 1284int neigh_compat_output(struct sk_buff *skb)
@@ -1210,19 +1312,19 @@ int neigh_resolve_output(struct sk_buff *skb)
1210 if (!neigh_event_send(neigh, skb)) { 1312 if (!neigh_event_send(neigh, skb)) {
1211 int err; 1313 int err;
1212 struct net_device *dev = neigh->dev; 1314 struct net_device *dev = neigh->dev;
1213 if (dev->header_ops->cache && !dst->hh) { 1315 unsigned int seq;
1214 write_lock_bh(&neigh->lock); 1316
1215 if (!dst->hh) 1317 if (dev->header_ops->cache &&
1216 neigh_hh_init(neigh, dst, dst->ops->protocol); 1318 !dst->hh &&
1217 err = dev_hard_header(skb, dev, ntohs(skb->protocol), 1319 !(dst->flags & DST_NOCACHE))
1218 neigh->ha, NULL, skb->len); 1320 neigh_hh_init(neigh, dst, dst->ops->protocol);
1219 write_unlock_bh(&neigh->lock); 1321
1220 } else { 1322 do {
1221 read_lock_bh(&neigh->lock); 1323 seq = read_seqbegin(&neigh->ha_lock);
1222 err = dev_hard_header(skb, dev, ntohs(skb->protocol), 1324 err = dev_hard_header(skb, dev, ntohs(skb->protocol),
1223 neigh->ha, NULL, skb->len); 1325 neigh->ha, NULL, skb->len);
1224 read_unlock_bh(&neigh->lock); 1326 } while (read_seqretry(&neigh->ha_lock, seq));
1225 } 1327
1226 if (err >= 0) 1328 if (err >= 0)
1227 rc = neigh->ops->queue_xmit(skb); 1329 rc = neigh->ops->queue_xmit(skb);
1228 else 1330 else
@@ -1248,13 +1350,16 @@ int neigh_connected_output(struct sk_buff *skb)
1248 struct dst_entry *dst = skb_dst(skb); 1350 struct dst_entry *dst = skb_dst(skb);
1249 struct neighbour *neigh = dst->neighbour; 1351 struct neighbour *neigh = dst->neighbour;
1250 struct net_device *dev = neigh->dev; 1352 struct net_device *dev = neigh->dev;
1353 unsigned int seq;
1251 1354
1252 __skb_pull(skb, skb_network_offset(skb)); 1355 __skb_pull(skb, skb_network_offset(skb));
1253 1356
1254 read_lock_bh(&neigh->lock); 1357 do {
1255 err = dev_hard_header(skb, dev, ntohs(skb->protocol), 1358 seq = read_seqbegin(&neigh->ha_lock);
1256 neigh->ha, NULL, skb->len); 1359 err = dev_hard_header(skb, dev, ntohs(skb->protocol),
1257 read_unlock_bh(&neigh->lock); 1360 neigh->ha, NULL, skb->len);
1361 } while (read_seqretry(&neigh->ha_lock, seq));
1362
1258 if (err >= 0) 1363 if (err >= 0)
1259 err = neigh->ops->queue_xmit(skb); 1364 err = neigh->ops->queue_xmit(skb);
1260 else { 1365 else {
@@ -1436,17 +1541,14 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)
1436 panic("cannot create neighbour proc dir entry"); 1541 panic("cannot create neighbour proc dir entry");
1437#endif 1542#endif
1438 1543
1439 tbl->hash_mask = 1; 1544 tbl->nht = neigh_hash_alloc(8);
1440 tbl->hash_buckets = neigh_hash_alloc(tbl->hash_mask + 1);
1441 1545
1442 phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); 1546 phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
1443 tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); 1547 tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL);
1444 1548
1445 if (!tbl->hash_buckets || !tbl->phash_buckets) 1549 if (!tbl->nht || !tbl->phash_buckets)
1446 panic("cannot allocate neighbour cache hashes"); 1550 panic("cannot allocate neighbour cache hashes");
1447 1551
1448 get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
1449
1450 rwlock_init(&tbl->lock); 1552 rwlock_init(&tbl->lock);
1451 INIT_DELAYED_WORK_DEFERRABLE(&tbl->gc_work, neigh_periodic_work); 1553 INIT_DELAYED_WORK_DEFERRABLE(&tbl->gc_work, neigh_periodic_work);
1452 schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time); 1554 schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time);
@@ -1486,8 +1588,7 @@ int neigh_table_clear(struct neigh_table *tbl)
1486 struct neigh_table **tp; 1588 struct neigh_table **tp;
1487 1589
1488 /* It is not clean... Fix it to unload IPv6 module safely */ 1590 /* It is not clean... Fix it to unload IPv6 module safely */
1489 cancel_delayed_work(&tbl->gc_work); 1591 cancel_delayed_work_sync(&tbl->gc_work);
1490 flush_scheduled_work();
1491 del_timer_sync(&tbl->proxy_timer); 1592 del_timer_sync(&tbl->proxy_timer);
1492 pneigh_queue_purge(&tbl->proxy_queue); 1593 pneigh_queue_purge(&tbl->proxy_queue);
1493 neigh_ifdown(tbl, NULL); 1594 neigh_ifdown(tbl, NULL);
@@ -1502,8 +1603,8 @@ int neigh_table_clear(struct neigh_table *tbl)
1502 } 1603 }
1503 write_unlock(&neigh_tbl_lock); 1604 write_unlock(&neigh_tbl_lock);
1504 1605
1505 neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1); 1606 call_rcu(&tbl->nht->rcu, neigh_hash_free_rcu);
1506 tbl->hash_buckets = NULL; 1607 tbl->nht = NULL;
1507 1608
1508 kfree(tbl->phash_buckets); 1609 kfree(tbl->phash_buckets);
1509 tbl->phash_buckets = NULL; 1610 tbl->phash_buckets = NULL;
@@ -1529,6 +1630,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1529 struct net_device *dev = NULL; 1630 struct net_device *dev = NULL;
1530 int err = -EINVAL; 1631 int err = -EINVAL;
1531 1632
1633 ASSERT_RTNL();
1532 if (nlmsg_len(nlh) < sizeof(*ndm)) 1634 if (nlmsg_len(nlh) < sizeof(*ndm))
1533 goto out; 1635 goto out;
1534 1636
@@ -1538,7 +1640,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1538 1640
1539 ndm = nlmsg_data(nlh); 1641 ndm = nlmsg_data(nlh);
1540 if (ndm->ndm_ifindex) { 1642 if (ndm->ndm_ifindex) {
1541 dev = dev_get_by_index(net, ndm->ndm_ifindex); 1643 dev = __dev_get_by_index(net, ndm->ndm_ifindex);
1542 if (dev == NULL) { 1644 if (dev == NULL) {
1543 err = -ENODEV; 1645 err = -ENODEV;
1544 goto out; 1646 goto out;
@@ -1554,34 +1656,31 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1554 read_unlock(&neigh_tbl_lock); 1656 read_unlock(&neigh_tbl_lock);
1555 1657
1556 if (nla_len(dst_attr) < tbl->key_len) 1658 if (nla_len(dst_attr) < tbl->key_len)
1557 goto out_dev_put; 1659 goto out;
1558 1660
1559 if (ndm->ndm_flags & NTF_PROXY) { 1661 if (ndm->ndm_flags & NTF_PROXY) {
1560 err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); 1662 err = pneigh_delete(tbl, net, nla_data(dst_attr), dev);
1561 goto out_dev_put; 1663 goto out;
1562 } 1664 }
1563 1665
1564 if (dev == NULL) 1666 if (dev == NULL)
1565 goto out_dev_put; 1667 goto out;
1566 1668
1567 neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); 1669 neigh = neigh_lookup(tbl, nla_data(dst_attr), dev);
1568 if (neigh == NULL) { 1670 if (neigh == NULL) {
1569 err = -ENOENT; 1671 err = -ENOENT;
1570 goto out_dev_put; 1672 goto out;
1571 } 1673 }
1572 1674
1573 err = neigh_update(neigh, NULL, NUD_FAILED, 1675 err = neigh_update(neigh, NULL, NUD_FAILED,
1574 NEIGH_UPDATE_F_OVERRIDE | 1676 NEIGH_UPDATE_F_OVERRIDE |
1575 NEIGH_UPDATE_F_ADMIN); 1677 NEIGH_UPDATE_F_ADMIN);
1576 neigh_release(neigh); 1678 neigh_release(neigh);
1577 goto out_dev_put; 1679 goto out;
1578 } 1680 }
1579 read_unlock(&neigh_tbl_lock); 1681 read_unlock(&neigh_tbl_lock);
1580 err = -EAFNOSUPPORT; 1682 err = -EAFNOSUPPORT;
1581 1683
1582out_dev_put:
1583 if (dev)
1584 dev_put(dev);
1585out: 1684out:
1586 return err; 1685 return err;
1587} 1686}
@@ -1595,6 +1694,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1595 struct net_device *dev = NULL; 1694 struct net_device *dev = NULL;
1596 int err; 1695 int err;
1597 1696
1697 ASSERT_RTNL();
1598 err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); 1698 err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
1599 if (err < 0) 1699 if (err < 0)
1600 goto out; 1700 goto out;
@@ -1605,14 +1705,14 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1605 1705
1606 ndm = nlmsg_data(nlh); 1706 ndm = nlmsg_data(nlh);
1607 if (ndm->ndm_ifindex) { 1707 if (ndm->ndm_ifindex) {
1608 dev = dev_get_by_index(net, ndm->ndm_ifindex); 1708 dev = __dev_get_by_index(net, ndm->ndm_ifindex);
1609 if (dev == NULL) { 1709 if (dev == NULL) {
1610 err = -ENODEV; 1710 err = -ENODEV;
1611 goto out; 1711 goto out;
1612 } 1712 }
1613 1713
1614 if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) 1714 if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len)
1615 goto out_dev_put; 1715 goto out;
1616 } 1716 }
1617 1717
1618 read_lock(&neigh_tbl_lock); 1718 read_lock(&neigh_tbl_lock);
@@ -1626,7 +1726,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1626 read_unlock(&neigh_tbl_lock); 1726 read_unlock(&neigh_tbl_lock);
1627 1727
1628 if (nla_len(tb[NDA_DST]) < tbl->key_len) 1728 if (nla_len(tb[NDA_DST]) < tbl->key_len)
1629 goto out_dev_put; 1729 goto out;
1630 dst = nla_data(tb[NDA_DST]); 1730 dst = nla_data(tb[NDA_DST]);
1631 lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; 1731 lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
1632 1732
@@ -1639,29 +1739,29 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1639 pn->flags = ndm->ndm_flags; 1739 pn->flags = ndm->ndm_flags;
1640 err = 0; 1740 err = 0;
1641 } 1741 }
1642 goto out_dev_put; 1742 goto out;
1643 } 1743 }
1644 1744
1645 if (dev == NULL) 1745 if (dev == NULL)
1646 goto out_dev_put; 1746 goto out;
1647 1747
1648 neigh = neigh_lookup(tbl, dst, dev); 1748 neigh = neigh_lookup(tbl, dst, dev);
1649 if (neigh == NULL) { 1749 if (neigh == NULL) {
1650 if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { 1750 if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
1651 err = -ENOENT; 1751 err = -ENOENT;
1652 goto out_dev_put; 1752 goto out;
1653 } 1753 }
1654 1754
1655 neigh = __neigh_lookup_errno(tbl, dst, dev); 1755 neigh = __neigh_lookup_errno(tbl, dst, dev);
1656 if (IS_ERR(neigh)) { 1756 if (IS_ERR(neigh)) {
1657 err = PTR_ERR(neigh); 1757 err = PTR_ERR(neigh);
1658 goto out_dev_put; 1758 goto out;
1659 } 1759 }
1660 } else { 1760 } else {
1661 if (nlh->nlmsg_flags & NLM_F_EXCL) { 1761 if (nlh->nlmsg_flags & NLM_F_EXCL) {
1662 err = -EEXIST; 1762 err = -EEXIST;
1663 neigh_release(neigh); 1763 neigh_release(neigh);
1664 goto out_dev_put; 1764 goto out;
1665 } 1765 }
1666 1766
1667 if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) 1767 if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
@@ -1674,15 +1774,11 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1674 } else 1774 } else
1675 err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); 1775 err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
1676 neigh_release(neigh); 1776 neigh_release(neigh);
1677 goto out_dev_put; 1777 goto out;
1678 } 1778 }
1679 1779
1680 read_unlock(&neigh_tbl_lock); 1780 read_unlock(&neigh_tbl_lock);
1681 err = -EAFNOSUPPORT; 1781 err = -EAFNOSUPPORT;
1682
1683out_dev_put:
1684 if (dev)
1685 dev_put(dev);
1686out: 1782out:
1687 return err; 1783 return err;
1688} 1784}
@@ -1748,18 +1844,22 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
1748 unsigned long now = jiffies; 1844 unsigned long now = jiffies;
1749 unsigned int flush_delta = now - tbl->last_flush; 1845 unsigned int flush_delta = now - tbl->last_flush;
1750 unsigned int rand_delta = now - tbl->last_rand; 1846 unsigned int rand_delta = now - tbl->last_rand;
1751 1847 struct neigh_hash_table *nht;
1752 struct ndt_config ndc = { 1848 struct ndt_config ndc = {
1753 .ndtc_key_len = tbl->key_len, 1849 .ndtc_key_len = tbl->key_len,
1754 .ndtc_entry_size = tbl->entry_size, 1850 .ndtc_entry_size = tbl->entry_size,
1755 .ndtc_entries = atomic_read(&tbl->entries), 1851 .ndtc_entries = atomic_read(&tbl->entries),
1756 .ndtc_last_flush = jiffies_to_msecs(flush_delta), 1852 .ndtc_last_flush = jiffies_to_msecs(flush_delta),
1757 .ndtc_last_rand = jiffies_to_msecs(rand_delta), 1853 .ndtc_last_rand = jiffies_to_msecs(rand_delta),
1758 .ndtc_hash_rnd = tbl->hash_rnd,
1759 .ndtc_hash_mask = tbl->hash_mask,
1760 .ndtc_proxy_qlen = tbl->proxy_queue.qlen, 1854 .ndtc_proxy_qlen = tbl->proxy_queue.qlen,
1761 }; 1855 };
1762 1856
1857 rcu_read_lock_bh();
1858 nht = rcu_dereference_bh(tbl->nht);
1859 ndc.ndtc_hash_rnd = nht->hash_rnd;
1860 ndc.ndtc_hash_mask = nht->hash_mask;
1861 rcu_read_unlock_bh();
1862
1763 NLA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc); 1863 NLA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc);
1764 } 1864 }
1765 1865
@@ -2056,10 +2156,14 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
2056 2156
2057 read_lock_bh(&neigh->lock); 2157 read_lock_bh(&neigh->lock);
2058 ndm->ndm_state = neigh->nud_state; 2158 ndm->ndm_state = neigh->nud_state;
2059 if ((neigh->nud_state & NUD_VALID) && 2159 if (neigh->nud_state & NUD_VALID) {
2060 nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, neigh->ha) < 0) { 2160 char haddr[MAX_ADDR_LEN];
2061 read_unlock_bh(&neigh->lock); 2161
2062 goto nla_put_failure; 2162 neigh_ha_snapshot(haddr, neigh, neigh->dev);
2163 if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) {
2164 read_unlock_bh(&neigh->lock);
2165 goto nla_put_failure;
2166 }
2063 } 2167 }
2064 2168
2065 ci.ndm_used = jiffies_to_clock_t(now - neigh->used); 2169 ci.ndm_used = jiffies_to_clock_t(now - neigh->used);
@@ -2087,18 +2191,23 @@ static void neigh_update_notify(struct neighbour *neigh)
2087static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, 2191static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
2088 struct netlink_callback *cb) 2192 struct netlink_callback *cb)
2089{ 2193{
2090 struct net * net = sock_net(skb->sk); 2194 struct net *net = sock_net(skb->sk);
2091 struct neighbour *n; 2195 struct neighbour *n;
2092 int rc, h, s_h = cb->args[1]; 2196 int rc, h, s_h = cb->args[1];
2093 int idx, s_idx = idx = cb->args[2]; 2197 int idx, s_idx = idx = cb->args[2];
2198 struct neigh_hash_table *nht;
2094 2199
2095 read_lock_bh(&tbl->lock); 2200 rcu_read_lock_bh();
2096 for (h = 0; h <= tbl->hash_mask; h++) { 2201 nht = rcu_dereference_bh(tbl->nht);
2202
2203 for (h = 0; h <= nht->hash_mask; h++) {
2097 if (h < s_h) 2204 if (h < s_h)
2098 continue; 2205 continue;
2099 if (h > s_h) 2206 if (h > s_h)
2100 s_idx = 0; 2207 s_idx = 0;
2101 for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next) { 2208 for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0;
2209 n != NULL;
2210 n = rcu_dereference_bh(n->next)) {
2102 if (!net_eq(dev_net(n->dev), net)) 2211 if (!net_eq(dev_net(n->dev), net))
2103 continue; 2212 continue;
2104 if (idx < s_idx) 2213 if (idx < s_idx)
@@ -2107,17 +2216,16 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
2107 cb->nlh->nlmsg_seq, 2216 cb->nlh->nlmsg_seq,
2108 RTM_NEWNEIGH, 2217 RTM_NEWNEIGH,
2109 NLM_F_MULTI) <= 0) { 2218 NLM_F_MULTI) <= 0) {
2110 read_unlock_bh(&tbl->lock);
2111 rc = -1; 2219 rc = -1;
2112 goto out; 2220 goto out;
2113 } 2221 }
2114 next: 2222next:
2115 idx++; 2223 idx++;
2116 } 2224 }
2117 } 2225 }
2118 read_unlock_bh(&tbl->lock);
2119 rc = skb->len; 2226 rc = skb->len;
2120out: 2227out:
2228 rcu_read_unlock_bh();
2121 cb->args[1] = h; 2229 cb->args[1] = h;
2122 cb->args[2] = idx; 2230 cb->args[2] = idx;
2123 return rc; 2231 return rc;
@@ -2150,15 +2258,22 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
2150void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) 2258void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie)
2151{ 2259{
2152 int chain; 2260 int chain;
2261 struct neigh_hash_table *nht;
2153 2262
2154 read_lock_bh(&tbl->lock); 2263 rcu_read_lock_bh();
2155 for (chain = 0; chain <= tbl->hash_mask; chain++) { 2264 nht = rcu_dereference_bh(tbl->nht);
2265
2266 read_lock(&tbl->lock); /* avoid resizes */
2267 for (chain = 0; chain <= nht->hash_mask; chain++) {
2156 struct neighbour *n; 2268 struct neighbour *n;
2157 2269
2158 for (n = tbl->hash_buckets[chain]; n; n = n->next) 2270 for (n = rcu_dereference_bh(nht->hash_buckets[chain]);
2271 n != NULL;
2272 n = rcu_dereference_bh(n->next))
2159 cb(n, cookie); 2273 cb(n, cookie);
2160 } 2274 }
2161 read_unlock_bh(&tbl->lock); 2275 read_unlock(&tbl->lock);
2276 rcu_read_unlock_bh();
2162} 2277}
2163EXPORT_SYMBOL(neigh_for_each); 2278EXPORT_SYMBOL(neigh_for_each);
2164 2279
@@ -2167,18 +2282,25 @@ void __neigh_for_each_release(struct neigh_table *tbl,
2167 int (*cb)(struct neighbour *)) 2282 int (*cb)(struct neighbour *))
2168{ 2283{
2169 int chain; 2284 int chain;
2285 struct neigh_hash_table *nht;
2170 2286
2171 for (chain = 0; chain <= tbl->hash_mask; chain++) { 2287 nht = rcu_dereference_protected(tbl->nht,
2172 struct neighbour *n, **np; 2288 lockdep_is_held(&tbl->lock));
2289 for (chain = 0; chain <= nht->hash_mask; chain++) {
2290 struct neighbour *n;
2291 struct neighbour __rcu **np;
2173 2292
2174 np = &tbl->hash_buckets[chain]; 2293 np = &nht->hash_buckets[chain];
2175 while ((n = *np) != NULL) { 2294 while ((n = rcu_dereference_protected(*np,
2295 lockdep_is_held(&tbl->lock))) != NULL) {
2176 int release; 2296 int release;
2177 2297
2178 write_lock(&n->lock); 2298 write_lock(&n->lock);
2179 release = cb(n); 2299 release = cb(n);
2180 if (release) { 2300 if (release) {
2181 *np = n->next; 2301 rcu_assign_pointer(*np,
2302 rcu_dereference_protected(n->next,
2303 lockdep_is_held(&tbl->lock)));
2182 n->dead = 1; 2304 n->dead = 1;
2183 } else 2305 } else
2184 np = &n->next; 2306 np = &n->next;
@@ -2196,13 +2318,13 @@ static struct neighbour *neigh_get_first(struct seq_file *seq)
2196{ 2318{
2197 struct neigh_seq_state *state = seq->private; 2319 struct neigh_seq_state *state = seq->private;
2198 struct net *net = seq_file_net(seq); 2320 struct net *net = seq_file_net(seq);
2199 struct neigh_table *tbl = state->tbl; 2321 struct neigh_hash_table *nht = state->nht;
2200 struct neighbour *n = NULL; 2322 struct neighbour *n = NULL;
2201 int bucket = state->bucket; 2323 int bucket = state->bucket;
2202 2324
2203 state->flags &= ~NEIGH_SEQ_IS_PNEIGH; 2325 state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
2204 for (bucket = 0; bucket <= tbl->hash_mask; bucket++) { 2326 for (bucket = 0; bucket <= nht->hash_mask; bucket++) {
2205 n = tbl->hash_buckets[bucket]; 2327 n = rcu_dereference_bh(nht->hash_buckets[bucket]);
2206 2328
2207 while (n) { 2329 while (n) {
2208 if (!net_eq(dev_net(n->dev), net)) 2330 if (!net_eq(dev_net(n->dev), net))
@@ -2219,8 +2341,8 @@ static struct neighbour *neigh_get_first(struct seq_file *seq)
2219 break; 2341 break;
2220 if (n->nud_state & ~NUD_NOARP) 2342 if (n->nud_state & ~NUD_NOARP)
2221 break; 2343 break;
2222 next: 2344next:
2223 n = n->next; 2345 n = rcu_dereference_bh(n->next);
2224 } 2346 }
2225 2347
2226 if (n) 2348 if (n)
@@ -2237,14 +2359,14 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
2237{ 2359{
2238 struct neigh_seq_state *state = seq->private; 2360 struct neigh_seq_state *state = seq->private;
2239 struct net *net = seq_file_net(seq); 2361 struct net *net = seq_file_net(seq);
2240 struct neigh_table *tbl = state->tbl; 2362 struct neigh_hash_table *nht = state->nht;
2241 2363
2242 if (state->neigh_sub_iter) { 2364 if (state->neigh_sub_iter) {
2243 void *v = state->neigh_sub_iter(state, n, pos); 2365 void *v = state->neigh_sub_iter(state, n, pos);
2244 if (v) 2366 if (v)
2245 return n; 2367 return n;
2246 } 2368 }
2247 n = n->next; 2369 n = rcu_dereference_bh(n->next);
2248 2370
2249 while (1) { 2371 while (1) {
2250 while (n) { 2372 while (n) {
@@ -2261,17 +2383,17 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
2261 2383
2262 if (n->nud_state & ~NUD_NOARP) 2384 if (n->nud_state & ~NUD_NOARP)
2263 break; 2385 break;
2264 next: 2386next:
2265 n = n->next; 2387 n = rcu_dereference_bh(n->next);
2266 } 2388 }
2267 2389
2268 if (n) 2390 if (n)
2269 break; 2391 break;
2270 2392
2271 if (++state->bucket > tbl->hash_mask) 2393 if (++state->bucket > nht->hash_mask)
2272 break; 2394 break;
2273 2395
2274 n = tbl->hash_buckets[state->bucket]; 2396 n = rcu_dereference_bh(nht->hash_buckets[state->bucket]);
2275 } 2397 }
2276 2398
2277 if (n && pos) 2399 if (n && pos)
@@ -2369,7 +2491,7 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos)
2369} 2491}
2370 2492
2371void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) 2493void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags)
2372 __acquires(tbl->lock) 2494 __acquires(rcu_bh)
2373{ 2495{
2374 struct neigh_seq_state *state = seq->private; 2496 struct neigh_seq_state *state = seq->private;
2375 2497
@@ -2377,7 +2499,8 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl
2377 state->bucket = 0; 2499 state->bucket = 0;
2378 state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); 2500 state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
2379 2501
2380 read_lock_bh(&tbl->lock); 2502 rcu_read_lock_bh();
2503 state->nht = rcu_dereference_bh(tbl->nht);
2381 2504
2382 return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; 2505 return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN;
2383} 2506}
@@ -2411,12 +2534,9 @@ out:
2411EXPORT_SYMBOL(neigh_seq_next); 2534EXPORT_SYMBOL(neigh_seq_next);
2412 2535
2413void neigh_seq_stop(struct seq_file *seq, void *v) 2536void neigh_seq_stop(struct seq_file *seq, void *v)
2414 __releases(tbl->lock) 2537 __releases(rcu_bh)
2415{ 2538{
2416 struct neigh_seq_state *state = seq->private; 2539 rcu_read_unlock_bh();
2417 struct neigh_table *tbl = state->tbl;
2418
2419 read_unlock_bh(&tbl->lock);
2420} 2540}
2421EXPORT_SYMBOL(neigh_seq_stop); 2541EXPORT_SYMBOL(neigh_seq_stop);
2422 2542
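
The neighbour.c changes above fold the hash state (buckets, mask, random seed) into an RCU-managed struct neigh_hash_table, so lookups run under rcu_read_lock_bh() instead of tbl->lock, take references with atomic_inc_not_zero() so a dying entry is treated as a miss, and free both entries and old tables through call_rcu(). The hardware address is additionally protected by the new ha_lock seqlock, letting transmit paths copy it without taking neigh->lock. A condensed sketch of the two read-side patterns; match() and the local variables are illustrative placeholders:

    /* 1) RCU hash lookup with a conditional refcount take. */
    rcu_read_lock_bh();
    nht = rcu_dereference_bh(tbl->nht);
    for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
         n != NULL;
         n = rcu_dereference_bh(n->next)) {
            if (match(n)) {                           /* placeholder predicate     */
                    if (!atomic_inc_not_zero(&n->refcnt))
                            n = NULL;                 /* refcnt hit 0: being freed */
                    break;
            }
    }
    rcu_read_unlock_bh();

    /* 2) Seqlock read of neigh->ha: retry if an update raced with the copy. */
    do {
            seq = read_seqbegin(&neigh->ha_lock);
            err = dev_hard_header(skb, dev, ntohs(skb->protocol),
                                  neigh->ha, NULL, skb->len);
    } while (read_seqretry(&neigh->ha_lock, seq));
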
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index af4dfbadf2a0..a5ff5a89f376 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -515,7 +515,7 @@ static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr,
515 return attribute->store(queue, attribute, buf, count); 515 return attribute->store(queue, attribute, buf, count);
516} 516}
517 517
518static struct sysfs_ops rx_queue_sysfs_ops = { 518static const struct sysfs_ops rx_queue_sysfs_ops = {
519 .show = rx_queue_attr_show, 519 .show = rx_queue_attr_show,
520 .store = rx_queue_attr_store, 520 .store = rx_queue_attr_store,
521}; 521};
@@ -598,7 +598,8 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
598 } 598 }
599 599
600 spin_lock(&rps_map_lock); 600 spin_lock(&rps_map_lock);
601 old_map = queue->rps_map; 601 old_map = rcu_dereference_protected(queue->rps_map,
602 lockdep_is_held(&rps_map_lock));
602 rcu_assign_pointer(queue->rps_map, map); 603 rcu_assign_pointer(queue->rps_map, map);
603 spin_unlock(&rps_map_lock); 604 spin_unlock(&rps_map_lock);
604 605
@@ -677,7 +678,8 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
677 table = NULL; 678 table = NULL;
678 679
679 spin_lock(&rps_dev_flow_lock); 680 spin_lock(&rps_dev_flow_lock);
680 old_table = queue->rps_flow_table; 681 old_table = rcu_dereference_protected(queue->rps_flow_table,
682 lockdep_is_held(&rps_dev_flow_lock));
681 rcu_assign_pointer(queue->rps_flow_table, table); 683 rcu_assign_pointer(queue->rps_flow_table, table);
682 spin_unlock(&rps_dev_flow_lock); 684 spin_unlock(&rps_dev_flow_lock);
683 685
@@ -705,13 +707,17 @@ static void rx_queue_release(struct kobject *kobj)
705{ 707{
706 struct netdev_rx_queue *queue = to_rx_queue(kobj); 708 struct netdev_rx_queue *queue = to_rx_queue(kobj);
707 struct netdev_rx_queue *first = queue->first; 709 struct netdev_rx_queue *first = queue->first;
710 struct rps_map *map;
711 struct rps_dev_flow_table *flow_table;
708 712
709 if (queue->rps_map)
710 call_rcu(&queue->rps_map->rcu, rps_map_release);
711 713
712 if (queue->rps_flow_table) 714 map = rcu_dereference_raw(queue->rps_map);
713 call_rcu(&queue->rps_flow_table->rcu, 715 if (map)
714 rps_dev_flow_table_release); 716 call_rcu(&map->rcu, rps_map_release);
717
718 flow_table = rcu_dereference_raw(queue->rps_flow_table);
719 if (flow_table)
720 call_rcu(&flow_table->rcu, rps_dev_flow_table_release);
715 721
716 if (atomic_dec_and_test(&first->count)) 722 if (atomic_dec_and_test(&first->count))
717 kfree(first); 723 kfree(first);
@@ -726,6 +732,7 @@ static struct kobj_type rx_queue_ktype = {
726static int rx_queue_add_kobject(struct net_device *net, int index) 732static int rx_queue_add_kobject(struct net_device *net, int index)
727{ 733{
728 struct netdev_rx_queue *queue = net->_rx + index; 734 struct netdev_rx_queue *queue = net->_rx + index;
735 struct netdev_rx_queue *first = queue->first;
729 struct kobject *kobj = &queue->kobj; 736 struct kobject *kobj = &queue->kobj;
730 int error = 0; 737 int error = 0;
731 738
@@ -738,38 +745,43 @@ static int rx_queue_add_kobject(struct net_device *net, int index)
738 } 745 }
739 746
740 kobject_uevent(kobj, KOBJ_ADD); 747 kobject_uevent(kobj, KOBJ_ADD);
748 atomic_inc(&first->count);
741 749
742 return error; 750 return error;
743} 751}
744 752
745static int rx_queue_register_kobjects(struct net_device *net) 753int
754net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
746{ 755{
747 int i; 756 int i;
748 int error = 0; 757 int error = 0;
749 758
750 net->queues_kset = kset_create_and_add("queues", 759 for (i = old_num; i < new_num; i++) {
751 NULL, &net->dev.kobj);
752 if (!net->queues_kset)
753 return -ENOMEM;
754 for (i = 0; i < net->num_rx_queues; i++) {
755 error = rx_queue_add_kobject(net, i); 760 error = rx_queue_add_kobject(net, i);
756 if (error) 761 if (error) {
762 new_num = old_num;
757 break; 763 break;
764 }
758 } 765 }
759 766
760 if (error) 767 while (--i >= new_num)
761 while (--i >= 0) 768 kobject_put(&net->_rx[i].kobj);
762 kobject_put(&net->_rx[i].kobj);
763 769
764 return error; 770 return error;
765} 771}
766 772
767static void rx_queue_remove_kobjects(struct net_device *net) 773static int rx_queue_register_kobjects(struct net_device *net)
768{ 774{
769 int i; 775 net->queues_kset = kset_create_and_add("queues",
776 NULL, &net->dev.kobj);
777 if (!net->queues_kset)
778 return -ENOMEM;
779 return net_rx_queue_update_kobjects(net, 0, net->real_num_rx_queues);
780}
770 781
771 for (i = 0; i < net->num_rx_queues; i++) 782static void rx_queue_remove_kobjects(struct net_device *net)
772 kobject_put(&net->_rx[i].kobj); 783{
784 net_rx_queue_update_kobjects(net, net->real_num_rx_queues, 0);
773 kset_unregister(net->queues_kset); 785 kset_unregister(net->queues_kset);
774} 786}
775#endif /* CONFIG_RPS */ 787#endif /* CONFIG_RPS */
@@ -789,12 +801,13 @@ static const void *net_netlink_ns(struct sock *sk)
789 return sock_net(sk); 801 return sock_net(sk);
790} 802}
791 803
792static struct kobj_ns_type_operations net_ns_type_operations = { 804struct kobj_ns_type_operations net_ns_type_operations = {
793 .type = KOBJ_NS_TYPE_NET, 805 .type = KOBJ_NS_TYPE_NET,
794 .current_ns = net_current_ns, 806 .current_ns = net_current_ns,
795 .netlink_ns = net_netlink_ns, 807 .netlink_ns = net_netlink_ns,
796 .initial_ns = net_initial_ns, 808 .initial_ns = net_initial_ns,
797}; 809};
810EXPORT_SYMBOL_GPL(net_ns_type_operations);
798 811
799static void net_kobj_ns_exit(struct net *net) 812static void net_kobj_ns_exit(struct net *net)
800{ 813{
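
The net-sysfs.c hunk above is mostly about annotations and reference handling: the rps_map/rps_flow_table updaters now fetch the old pointer with rcu_dereference_protected(..., lockdep_is_held(...)) so lockdep can verify the writer really holds the lock, and the rx-queue kobjects are created and torn down through net_rx_queue_update_kobjects() keyed to real_num_rx_queues. The writer-side RCU idiom looks like this (lock, field, and callback names are placeholders):

    spin_lock(&map_lock);
    old = rcu_dereference_protected(queue->map,       /* safe: map_lock is held */
                                    lockdep_is_held(&map_lock));
    rcu_assign_pointer(queue->map, new);
    spin_unlock(&map_lock);
    if (old)
            call_rcu(&old->rcu, map_free_rcu);        /* free after a grace period */
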
diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h
index 805555e8b187..778e1571548d 100644
--- a/net/core/net-sysfs.h
+++ b/net/core/net-sysfs.h
@@ -4,4 +4,8 @@
4int netdev_kobject_init(void); 4int netdev_kobject_init(void);
5int netdev_register_kobject(struct net_device *); 5int netdev_register_kobject(struct net_device *);
6void netdev_unregister_kobject(struct net_device *); 6void netdev_unregister_kobject(struct net_device *);
7#ifdef CONFIG_RPS
8int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num);
9#endif
10
7#endif 11#endif
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index afa6380ed88a..7f1bb2aba03b 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -26,6 +26,7 @@
26 26
27#define CREATE_TRACE_POINTS 27#define CREATE_TRACE_POINTS
28#include <trace/events/skb.h> 28#include <trace/events/skb.h>
29#include <trace/events/net.h>
29#include <trace/events/napi.h> 30#include <trace/events/napi.h>
30 31
31EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); 32EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index c988e685433a..3f860261c5ee 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -42,7 +42,9 @@ static int net_assign_generic(struct net *net, int id, void *data)
42 BUG_ON(!mutex_is_locked(&net_mutex)); 42 BUG_ON(!mutex_is_locked(&net_mutex));
43 BUG_ON(id == 0); 43 BUG_ON(id == 0);
44 44
45 ng = old_ng = net->gen; 45 old_ng = rcu_dereference_protected(net->gen,
46 lockdep_is_held(&net_mutex));
47 ng = old_ng;
46 if (old_ng->len >= id) 48 if (old_ng->len >= id)
47 goto assign; 49 goto assign;
48 50
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 537e01afd81b..4e98ffac3af0 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -288,11 +288,11 @@ static int netpoll_owner_active(struct net_device *dev)
288 return 0; 288 return 0;
289} 289}
290 290
291void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) 291void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
292 struct net_device *dev)
292{ 293{
293 int status = NETDEV_TX_BUSY; 294 int status = NETDEV_TX_BUSY;
294 unsigned long tries; 295 unsigned long tries;
295 struct net_device *dev = np->dev;
296 const struct net_device_ops *ops = dev->netdev_ops; 296 const struct net_device_ops *ops = dev->netdev_ops;
297 /* It is up to the caller to keep npinfo alive. */ 297 /* It is up to the caller to keep npinfo alive. */
298 struct netpoll_info *npinfo = np->dev->npinfo; 298 struct netpoll_info *npinfo = np->dev->npinfo;
@@ -346,7 +346,7 @@ void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
346 schedule_delayed_work(&npinfo->tx_work,0); 346 schedule_delayed_work(&npinfo->tx_work,0);
347 } 347 }
348} 348}
349EXPORT_SYMBOL(netpoll_send_skb); 349EXPORT_SYMBOL(netpoll_send_skb_on_dev);
350 350
351void netpoll_send_udp(struct netpoll *np, const char *msg, int len) 351void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
352{ 352{
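
netpoll_send_skb() becomes netpoll_send_skb_on_dev() above, taking the output device explicitly so callers such as bonding can steer the skb to a slave; presumably the old name survives as a thin wrapper along these lines (an assumption, not shown in this hunk):

    static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
    {
            netpoll_send_skb_on_dev(np, skb, np->dev);
    }
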
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 10a1ea72010d..fbce4b05a53e 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -729,16 +729,14 @@ static int hex32_arg(const char __user *user_buffer, unsigned long maxlen,
729 *num = 0; 729 *num = 0;
730 730
731 for (; i < maxlen; i++) { 731 for (; i < maxlen; i++) {
732 int value;
732 char c; 733 char c;
733 *num <<= 4; 734 *num <<= 4;
734 if (get_user(c, &user_buffer[i])) 735 if (get_user(c, &user_buffer[i]))
735 return -EFAULT; 736 return -EFAULT;
736 if ((c >= '0') && (c <= '9')) 737 value = hex_to_bin(c);
737 *num |= c - '0'; 738 if (value >= 0)
738 else if ((c >= 'a') && (c <= 'f')) 739 *num |= value;
739 *num |= c - 'a' + 10;
740 else if ((c >= 'A') && (c <= 'F'))
741 *num |= c - 'A' + 10;
742 else 740 else
743 break; 741 break;
744 } 742 }
@@ -773,10 +771,10 @@ done:
773static unsigned long num_arg(const char __user * user_buffer, 771static unsigned long num_arg(const char __user * user_buffer,
774 unsigned long maxlen, unsigned long *num) 772 unsigned long maxlen, unsigned long *num)
775{ 773{
776 int i = 0; 774 int i;
777 *num = 0; 775 *num = 0;
778 776
779 for (; i < maxlen; i++) { 777 for (i = 0; i < maxlen; i++) {
780 char c; 778 char c;
781 if (get_user(c, &user_buffer[i])) 779 if (get_user(c, &user_buffer[i]))
782 return -EFAULT; 780 return -EFAULT;
@@ -791,9 +789,9 @@ static unsigned long num_arg(const char __user * user_buffer,
791 789
792static int strn_len(const char __user * user_buffer, unsigned int maxlen) 790static int strn_len(const char __user * user_buffer, unsigned int maxlen)
793{ 791{
794 int i = 0; 792 int i;
795 793
796 for (; i < maxlen; i++) { 794 for (i = 0; i < maxlen; i++) {
797 char c; 795 char c;
798 if (get_user(c, &user_buffer[i])) 796 if (get_user(c, &user_buffer[i]))
799 return -EFAULT; 797 return -EFAULT;
@@ -848,7 +846,7 @@ static ssize_t pktgen_if_write(struct file *file,
848{ 846{
849 struct seq_file *seq = file->private_data; 847 struct seq_file *seq = file->private_data;
850 struct pktgen_dev *pkt_dev = seq->private; 848 struct pktgen_dev *pkt_dev = seq->private;
851 int i = 0, max, len; 849 int i, max, len;
852 char name[16], valstr[32]; 850 char name[16], valstr[32];
853 unsigned long value = 0; 851 unsigned long value = 0;
854 char *pg_result = NULL; 852 char *pg_result = NULL;
@@ -862,13 +860,13 @@ static ssize_t pktgen_if_write(struct file *file,
862 return -EINVAL; 860 return -EINVAL;
863 } 861 }
864 862
865 max = count - i; 863 max = count;
866 tmp = count_trail_chars(&user_buffer[i], max); 864 tmp = count_trail_chars(user_buffer, max);
867 if (tmp < 0) { 865 if (tmp < 0) {
868 pr_warning("illegal format\n"); 866 pr_warning("illegal format\n");
869 return tmp; 867 return tmp;
870 } 868 }
871 i += tmp; 869 i = tmp;
872 870
873 /* Read variable name */ 871 /* Read variable name */
874 872
@@ -889,10 +887,11 @@ static ssize_t pktgen_if_write(struct file *file,
889 i += len; 887 i += len;
890 888
891 if (debug) { 889 if (debug) {
892 char tb[count + 1]; 890 size_t copy = min(count, 1023);
893 if (copy_from_user(tb, user_buffer, count)) 891 char tb[copy + 1];
892 if (copy_from_user(tb, user_buffer, copy))
894 return -EFAULT; 893 return -EFAULT;
895 tb[count] = 0; 894 tb[copy] = 0;
896 printk(KERN_DEBUG "pktgen: %s,%lu buffer -:%s:-\n", name, 895 printk(KERN_DEBUG "pktgen: %s,%lu buffer -:%s:-\n", name,
897 (unsigned long)count, tb); 896 (unsigned long)count, tb);
898 } 897 }
@@ -1766,7 +1765,7 @@ static ssize_t pktgen_thread_write(struct file *file,
1766{ 1765{
1767 struct seq_file *seq = file->private_data; 1766 struct seq_file *seq = file->private_data;
1768 struct pktgen_thread *t = seq->private; 1767 struct pktgen_thread *t = seq->private;
1769 int i = 0, max, len, ret; 1768 int i, max, len, ret;
1770 char name[40]; 1769 char name[40];
1771 char *pg_result; 1770 char *pg_result;
1772 1771
@@ -1775,12 +1774,12 @@ static ssize_t pktgen_thread_write(struct file *file,
1775 return -EINVAL; 1774 return -EINVAL;
1776 } 1775 }
1777 1776
1778 max = count - i; 1777 max = count;
1779 len = count_trail_chars(&user_buffer[i], max); 1778 len = count_trail_chars(user_buffer, max);
1780 if (len < 0) 1779 if (len < 0)
1781 return len; 1780 return len;
1782 1781
1783 i += len; 1782 i = len;
1784 1783
1785 /* Read variable name */ 1784 /* Read variable name */
1786 1785
@@ -1977,7 +1976,7 @@ static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev,
1977 const char *ifname) 1976 const char *ifname)
1978{ 1977{
1979 char b[IFNAMSIZ+5]; 1978 char b[IFNAMSIZ+5];
1980 int i = 0; 1979 int i;
1981 1980
1982 for (i = 0; ifname[i] != '@'; i++) { 1981 for (i = 0; ifname[i] != '@'; i++) {
1983 if (i == IFNAMSIZ) 1982 if (i == IFNAMSIZ)
@@ -2521,8 +2520,8 @@ static void free_SAs(struct pktgen_dev *pkt_dev)
2521{ 2520{
2522 if (pkt_dev->cflows) { 2521 if (pkt_dev->cflows) {
2523 /* let go of the SAs if we have them */ 2522 /* let go of the SAs if we have them */
2524 int i = 0; 2523 int i;
2525 for (; i < pkt_dev->cflows; i++) { 2524 for (i = 0; i < pkt_dev->cflows; i++) {
2526 struct xfrm_state *x = pkt_dev->flows[i].x; 2525 struct xfrm_state *x = pkt_dev->flows[i].x;
2527 if (x) { 2526 if (x) {
2528 xfrm_state_put(x); 2527 xfrm_state_put(x);
@@ -3907,8 +3906,6 @@ static void __exit pg_cleanup(void)
3907{ 3906{
3908 struct pktgen_thread *t; 3907 struct pktgen_thread *t;
3909 struct list_head *q, *n; 3908 struct list_head *q, *n;
3910 wait_queue_head_t queue;
3911 init_waitqueue_head(&queue);
3912 3909
3913 /* Stop all interfaces & threads */ 3910 /* Stop all interfaces & threads */
3914 3911
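
For reference, the hex32_arg() hunk above replaces the open-coded hex-digit ladder with the generic hex_to_bin() helper (lib/hexdump.c, returns -1 for non-hex input). Below is a minimal userspace sketch of the resulting parsing loop, with hex_to_bin() redefined locally so the example is self-contained and the shift reordered so it only happens for valid digits:

#include <stdio.h>

/* local stand-in for the kernel's hex_to_bin() */
static int hex_to_bin(char ch)
{
	if (ch >= '0' && ch <= '9')
		return ch - '0';
	if (ch >= 'a' && ch <= 'f')
		return ch - 'a' + 10;
	if (ch >= 'A' && ch <= 'F')
		return ch - 'A' + 10;
	return -1;
}

/* accumulate four bits per digit, stop at the first non-hex byte */
static int hex32_arg(const char *buf, unsigned long maxlen, unsigned long *num)
{
	unsigned long i;

	*num = 0;
	for (i = 0; i < maxlen; i++) {
		int value = hex_to_bin(buf[i]);

		if (value < 0)
			break;
		*num = (*num << 4) | value;
	}
	return (int)i;	/* digits consumed, as in the pktgen version */
}

int main(void)
{
	unsigned long v;
	int n = hex32_arg("00d0b7a1zz", 8, &v);

	/* prints: consumed 8 digits, value 0xd0b7a1 */
	printf("consumed %d digits, value 0x%lx\n", n, v);
	return 0;
}
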
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f78d821bd935..8121268ddbdd 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -299,14 +299,6 @@ static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
299 unregister_netdevice_many(&list_kill); 299 unregister_netdevice_many(&list_kill);
300} 300}
301 301
302void rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
303{
304 rtnl_lock();
305 __rtnl_kill_links(net, ops);
306 rtnl_unlock();
307}
308EXPORT_SYMBOL_GPL(rtnl_kill_links);
309
310/** 302/**
311 * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. 303 * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
312 * @ops: struct rtnl_link_ops * to unregister 304 * @ops: struct rtnl_link_ops * to unregister
@@ -612,36 +604,7 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
612 604
613static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b) 605static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)
614{ 606{
615 struct rtnl_link_stats64 a; 607 memcpy(v, b, sizeof(*b));
616
617 a.rx_packets = b->rx_packets;
618 a.tx_packets = b->tx_packets;
619 a.rx_bytes = b->rx_bytes;
620 a.tx_bytes = b->tx_bytes;
621 a.rx_errors = b->rx_errors;
622 a.tx_errors = b->tx_errors;
623 a.rx_dropped = b->rx_dropped;
624 a.tx_dropped = b->tx_dropped;
625
626 a.multicast = b->multicast;
627 a.collisions = b->collisions;
628
629 a.rx_length_errors = b->rx_length_errors;
630 a.rx_over_errors = b->rx_over_errors;
631 a.rx_crc_errors = b->rx_crc_errors;
632 a.rx_frame_errors = b->rx_frame_errors;
633 a.rx_fifo_errors = b->rx_fifo_errors;
634 a.rx_missed_errors = b->rx_missed_errors;
635
636 a.tx_aborted_errors = b->tx_aborted_errors;
637 a.tx_carrier_errors = b->tx_carrier_errors;
638 a.tx_fifo_errors = b->tx_fifo_errors;
639 a.tx_heartbeat_errors = b->tx_heartbeat_errors;
640 a.tx_window_errors = b->tx_window_errors;
641
642 a.rx_compressed = b->rx_compressed;
643 a.tx_compressed = b->tx_compressed;
644 memcpy(v, &a, sizeof(a));
645} 608}
646 609
647/* All VF info */ 610/* All VF info */
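
The copy_rtnl_link_stats64() hunk collapses the member-by-member copy into one memcpy(): the removed code copied every field of the struct anyway, so a flat copy of sizeof(*b) bytes into the attribute payload is equivalent. A small sketch of the pattern, using a trimmed stand-in struct rather than the real rtnl_link_stats64 layout; memcpy() is used instead of a struct assignment because the destination payload may be unaligned:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct demo_stats64 {	/* stand-in, not the real rtnl_link_stats64 */
	uint64_t rx_packets;
	uint64_t tx_packets;
	uint64_t rx_bytes;
	uint64_t tx_bytes;
};

/* 'v' stands for the netlink attribute payload, possibly unaligned */
static void copy_demo_stats64(void *v, const struct demo_stats64 *b)
{
	memcpy(v, b, sizeof(*b));
}

int main(void)
{
	struct demo_stats64 src = { 10, 20, 3000, 4000 }, out;
	unsigned char payload[sizeof(src)];

	copy_demo_stats64(payload, &src);
	memcpy(&out, payload, sizeof(out));
	printf("rx_packets=%llu tx_bytes=%llu\n",
	       (unsigned long long)out.rx_packets,
	       (unsigned long long)out.tx_bytes);
	return 0;
}
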
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3a2513f0d0c3..104f8444754a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -202,8 +202,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
202 skb->data = data; 202 skb->data = data;
203 skb_reset_tail_pointer(skb); 203 skb_reset_tail_pointer(skb);
204 skb->end = skb->tail + size; 204 skb->end = skb->tail + size;
205 kmemcheck_annotate_bitfield(skb, flags1);
206 kmemcheck_annotate_bitfield(skb, flags2);
207#ifdef NET_SKBUFF_DATA_USES_OFFSET 205#ifdef NET_SKBUFF_DATA_USES_OFFSET
208 skb->mac_header = ~0U; 206 skb->mac_header = ~0U;
209#endif 207#endif
@@ -249,10 +247,9 @@ EXPORT_SYMBOL(__alloc_skb);
249struct sk_buff *__netdev_alloc_skb(struct net_device *dev, 247struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
250 unsigned int length, gfp_t gfp_mask) 248 unsigned int length, gfp_t gfp_mask)
251{ 249{
252 int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
253 struct sk_buff *skb; 250 struct sk_buff *skb;
254 251
255 skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node); 252 skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE);
256 if (likely(skb)) { 253 if (likely(skb)) {
257 skb_reserve(skb, NET_SKB_PAD); 254 skb_reserve(skb, NET_SKB_PAD);
258 skb->dev = dev; 255 skb->dev = dev;
@@ -261,16 +258,6 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
261} 258}
262EXPORT_SYMBOL(__netdev_alloc_skb); 259EXPORT_SYMBOL(__netdev_alloc_skb);
263 260
264struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
265{
266 int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
267 struct page *page;
268
269 page = alloc_pages_node(node, gfp_mask, 0);
270 return page;
271}
272EXPORT_SYMBOL(__netdev_alloc_page);
273
274void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, 261void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
275 int size) 262 int size)
276{ 263{
@@ -340,7 +327,7 @@ static void skb_release_data(struct sk_buff *skb)
340 put_page(skb_shinfo(skb)->frags[i].page); 327 put_page(skb_shinfo(skb)->frags[i].page);
341 } 328 }
342 329
343 if (skb_has_frags(skb)) 330 if (skb_has_frag_list(skb))
344 skb_drop_fraglist(skb); 331 skb_drop_fraglist(skb);
345 332
346 kfree(skb->head); 333 kfree(skb->head);
@@ -466,6 +453,7 @@ void consume_skb(struct sk_buff *skb)
466 smp_rmb(); 453 smp_rmb();
467 else if (likely(!atomic_dec_and_test(&skb->users))) 454 else if (likely(!atomic_dec_and_test(&skb->users)))
468 return; 455 return;
456 trace_consume_skb(skb);
469 __kfree_skb(skb); 457 __kfree_skb(skb);
470} 458}
471EXPORT_SYMBOL(consume_skb); 459EXPORT_SYMBOL(consume_skb);
@@ -685,16 +673,10 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
685 673
686struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) 674struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
687{ 675{
688 int headerlen = skb->data - skb->head; 676 int headerlen = skb_headroom(skb);
689 /* 677 unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len;
690 * Allocate the copy buffer 678 struct sk_buff *n = alloc_skb(size, gfp_mask);
691 */ 679
692 struct sk_buff *n;
693#ifdef NET_SKBUFF_DATA_USES_OFFSET
694 n = alloc_skb(skb->end + skb->data_len, gfp_mask);
695#else
696 n = alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask);
697#endif
698 if (!n) 680 if (!n)
699 return NULL; 681 return NULL;
700 682
@@ -726,20 +708,14 @@ EXPORT_SYMBOL(skb_copy);
726 708
727struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) 709struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
728{ 710{
729 /* 711 unsigned int size = skb_end_pointer(skb) - skb->head;
730 * Allocate the copy buffer 712 struct sk_buff *n = alloc_skb(size, gfp_mask);
731 */ 713
732 struct sk_buff *n;
733#ifdef NET_SKBUFF_DATA_USES_OFFSET
734 n = alloc_skb(skb->end, gfp_mask);
735#else
736 n = alloc_skb(skb->end - skb->head, gfp_mask);
737#endif
738 if (!n) 714 if (!n)
739 goto out; 715 goto out;
740 716
741 /* Set the data pointer */ 717 /* Set the data pointer */
742 skb_reserve(n, skb->data - skb->head); 718 skb_reserve(n, skb_headroom(skb));
743 /* Set the tail pointer and length */ 719 /* Set the tail pointer and length */
744 skb_put(n, skb_headlen(skb)); 720 skb_put(n, skb_headlen(skb));
745 /* Copy the bytes */ 721 /* Copy the bytes */
@@ -759,7 +735,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
759 skb_shinfo(n)->nr_frags = i; 735 skb_shinfo(n)->nr_frags = i;
760 } 736 }
761 737
762 if (skb_has_frags(skb)) { 738 if (skb_has_frag_list(skb)) {
763 skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; 739 skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
764 skb_clone_fraglist(n); 740 skb_clone_fraglist(n);
765 } 741 }
@@ -791,12 +767,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
791{ 767{
792 int i; 768 int i;
793 u8 *data; 769 u8 *data;
794#ifdef NET_SKBUFF_DATA_USES_OFFSET 770 int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail;
795 int size = nhead + skb->end + ntail;
796#else
797 int size = nhead + (skb->end - skb->head) + ntail;
798#endif
799 long off; 771 long off;
772 bool fastpath;
800 773
801 BUG_ON(nhead < 0); 774 BUG_ON(nhead < 0);
802 775
@@ -810,23 +783,36 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
810 goto nodata; 783 goto nodata;
811 784
812 /* Copy only real data... and, alas, header. This should be 785 /* Copy only real data... and, alas, header. This should be
813 * optimized for the cases when header is void. */ 786 * optimized for the cases when header is void.
814#ifdef NET_SKBUFF_DATA_USES_OFFSET 787 */
815 memcpy(data + nhead, skb->head, skb->tail); 788 memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
816#else 789
817 memcpy(data + nhead, skb->head, skb->tail - skb->head); 790 memcpy((struct skb_shared_info *)(data + size),
818#endif 791 skb_shinfo(skb),
819 memcpy(data + size, skb_end_pointer(skb),
820 offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); 792 offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
821 793
822 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 794 /* Check if we can avoid taking references on fragments if we own
823 get_page(skb_shinfo(skb)->frags[i].page); 795 * the last reference on skb->head. (see skb_release_data())
796 */
797 if (!skb->cloned)
798 fastpath = true;
799 else {
800 int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1;
824 801
825 if (skb_has_frags(skb)) 802 fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
826 skb_clone_fraglist(skb); 803 }
827 804
828 skb_release_data(skb); 805 if (fastpath) {
806 kfree(skb->head);
807 } else {
808 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
809 get_page(skb_shinfo(skb)->frags[i].page);
810
811 if (skb_has_frag_list(skb))
812 skb_clone_fraglist(skb);
829 813
814 skb_release_data(skb);
815 }
830 off = (data + nhead) - skb->head; 816 off = (data + nhead) - skb->head;
831 817
832 skb->head = data; 818 skb->head = data;
@@ -1099,7 +1085,7 @@ drop_pages:
1099 for (; i < nfrags; i++) 1085 for (; i < nfrags; i++)
1100 put_page(skb_shinfo(skb)->frags[i].page); 1086 put_page(skb_shinfo(skb)->frags[i].page);
1101 1087
1102 if (skb_has_frags(skb)) 1088 if (skb_has_frag_list(skb))
1103 skb_drop_fraglist(skb); 1089 skb_drop_fraglist(skb);
1104 goto done; 1090 goto done;
1105 } 1091 }
@@ -1194,7 +1180,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
1194 /* Optimization: no fragments, no reasons to preestimate 1180 /* Optimization: no fragments, no reasons to preestimate
1195 * size of pulled pages. Superb. 1181 * size of pulled pages. Superb.
1196 */ 1182 */
1197 if (!skb_has_frags(skb)) 1183 if (!skb_has_frag_list(skb))
1198 goto pull_pages; 1184 goto pull_pages;
1199 1185
1200 /* Estimate size of pulled pages. */ 1186 /* Estimate size of pulled pages. */
@@ -2323,7 +2309,7 @@ next_skb:
2323 st->frag_data = NULL; 2309 st->frag_data = NULL;
2324 } 2310 }
2325 2311
2326 if (st->root_skb == st->cur_skb && skb_has_frags(st->root_skb)) { 2312 if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
2327 st->cur_skb = skb_shinfo(st->root_skb)->frag_list; 2313 st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
2328 st->frag_idx = 0; 2314 st->frag_idx = 0;
2329 goto next_skb; 2315 goto next_skb;
@@ -2573,6 +2559,10 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)
2573 __copy_skb_header(nskb, skb); 2559 __copy_skb_header(nskb, skb);
2574 nskb->mac_len = skb->mac_len; 2560 nskb->mac_len = skb->mac_len;
2575 2561
2562 /* nskb and skb might have different headroom */
2563 if (nskb->ip_summed == CHECKSUM_PARTIAL)
2564 nskb->csum_start += skb_headroom(nskb) - headroom;
2565
2576 skb_reset_mac_header(nskb); 2566 skb_reset_mac_header(nskb);
2577 skb_set_network_header(nskb, skb->mac_len); 2567 skb_set_network_header(nskb, skb->mac_len);
2578 nskb->transport_header = (nskb->network_header + 2568 nskb->transport_header = (nskb->network_header +
@@ -2703,7 +2693,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2703 return -E2BIG; 2693 return -E2BIG;
2704 2694
2705 headroom = skb_headroom(p); 2695 headroom = skb_headroom(p);
2706 nskb = netdev_alloc_skb(p->dev, headroom + skb_gro_offset(p)); 2696 nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC);
2707 if (unlikely(!nskb)) 2697 if (unlikely(!nskb))
2708 return -ENOMEM; 2698 return -ENOMEM;
2709 2699
@@ -2889,7 +2879,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
2889 return -ENOMEM; 2879 return -ENOMEM;
2890 2880
2891 /* Easy case. Most of packets will go this way. */ 2881 /* Easy case. Most of packets will go this way. */
2892 if (!skb_has_frags(skb)) { 2882 if (!skb_has_frag_list(skb)) {
2893 /* A little of trouble, not enough of space for trailer. 2883 /* A little of trouble, not enough of space for trailer.
2894 * This should not happen, when stack is tuned to generate 2884 * This should not happen, when stack is tuned to generate
2895 * good frames. OK, on miss we reallocate and reserve even more 2885 * good frames. OK, on miss we reallocate and reserve even more
@@ -2924,7 +2914,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
2924 2914
2925 if (skb1->next == NULL && tailbits) { 2915 if (skb1->next == NULL && tailbits) {
2926 if (skb_shinfo(skb1)->nr_frags || 2916 if (skb_shinfo(skb1)->nr_frags ||
2927 skb_has_frags(skb1) || 2917 skb_has_frag_list(skb1) ||
2928 skb_tailroom(skb1) < tailbits) 2918 skb_tailroom(skb1) < tailbits)
2929 ntail = tailbits + 128; 2919 ntail = tailbits + 128;
2930 } 2920 }
@@ -2933,7 +2923,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
2933 skb_cloned(skb1) || 2923 skb_cloned(skb1) ||
2934 ntail || 2924 ntail ||
2935 skb_shinfo(skb1)->nr_frags || 2925 skb_shinfo(skb1)->nr_frags ||
2936 skb_has_frags(skb1)) { 2926 skb_has_frag_list(skb1)) {
2937 struct sk_buff *skb2; 2927 struct sk_buff *skb2;
2938 2928
2939 /* Fuck, we are miserable poor guys... */ 2929 /* Fuck, we are miserable poor guys... */
@@ -3016,7 +3006,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
3016 } else { 3006 } else {
3017 /* 3007 /*
3018 * no hardware time stamps available, 3008 * no hardware time stamps available,
3019 * so keep the skb_shared_tx and only 3009 * so keep the shared tx_flags and only
3020 * store software time stamp 3010 * store software time stamp
3021 */ 3011 */
3022 skb->tstamp = ktime_get_real(); 3012 skb->tstamp = ktime_get_real();
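
The pskb_expand_head() hunk adds a fast path: when the skb holds the only reference on its data, the old head can simply be kfree()d instead of taking page references, cloning the frag list and going through skb_release_data(). A rough userspace model of the ownership test follows; SKB_DATAREF_SHIFT is 16 in the kernel headers, and the exact dataref accounting described in the comment is this sketch's reading of the hunk, not authoritative:

#include <stdbool.h>
#include <stdio.h>

#define SKB_DATAREF_SHIFT 16	/* value used by the kernel's skbuff.h */

/* dataref packs payload-only references in its upper half when skb->nohdr
 * is set, so the "last owner" value is (1 << SKB_DATAREF_SHIFT) + 1 in that
 * case and 1 otherwise, matching the delta computed in the hunk above. */
static bool head_can_be_freed(bool cloned, bool nohdr, unsigned int dataref)
{
	if (!cloned)
		return true;
	return dataref == (nohdr ? (1u << SKB_DATAREF_SHIFT) + 1 : 1);
}

int main(void)
{
	printf("%d\n", head_can_be_freed(false, false, 0));             /* 1: never cloned */
	printf("%d\n", head_can_be_freed(true,  false, 1));             /* 1: last reference */
	printf("%d\n", head_can_be_freed(true,  false, 2));             /* 0: data still shared */
	printf("%d\n", head_can_be_freed(true,  true, (1u << 16) + 1)); /* 1: last hdr+payload ref */
	return 0;
}
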
diff --git a/net/core/sock.c b/net/core/sock.c
index b05b9b6ddb87..3eed5424e659 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1078,8 +1078,11 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
1078#ifdef CONFIG_CGROUPS 1078#ifdef CONFIG_CGROUPS
1079void sock_update_classid(struct sock *sk) 1079void sock_update_classid(struct sock *sk)
1080{ 1080{
1081 u32 classid = task_cls_classid(current); 1081 u32 classid;
1082 1082
1083 rcu_read_lock(); /* doing current task, which cannot vanish. */
1084 classid = task_cls_classid(current);
1085 rcu_read_unlock();
1083 if (classid && classid != sk->sk_classid) 1086 if (classid && classid != sk->sk_classid)
1084 sk->sk_classid = classid; 1087 sk->sk_classid = classid;
1085} 1088}
@@ -1222,7 +1225,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1222 sock_reset_flag(newsk, SOCK_DONE); 1225 sock_reset_flag(newsk, SOCK_DONE);
1223 skb_queue_head_init(&newsk->sk_error_queue); 1226 skb_queue_head_init(&newsk->sk_error_queue);
1224 1227
1225 filter = newsk->sk_filter; 1228 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1226 if (filter != NULL) 1229 if (filter != NULL)
1227 sk_filter_charge(newsk, filter); 1230 sk_filter_charge(newsk, filter);
1228 1231
@@ -1351,9 +1354,9 @@ int sock_i_uid(struct sock *sk)
1351{ 1354{
1352 int uid; 1355 int uid;
1353 1356
1354 read_lock(&sk->sk_callback_lock); 1357 read_lock_bh(&sk->sk_callback_lock);
1355 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0; 1358 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1356 read_unlock(&sk->sk_callback_lock); 1359 read_unlock_bh(&sk->sk_callback_lock);
1357 return uid; 1360 return uid;
1358} 1361}
1359EXPORT_SYMBOL(sock_i_uid); 1362EXPORT_SYMBOL(sock_i_uid);
@@ -1362,9 +1365,9 @@ unsigned long sock_i_ino(struct sock *sk)
1362{ 1365{
1363 unsigned long ino; 1366 unsigned long ino;
1364 1367
1365 read_lock(&sk->sk_callback_lock); 1368 read_lock_bh(&sk->sk_callback_lock);
1366 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 1369 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1367 read_unlock(&sk->sk_callback_lock); 1370 read_unlock_bh(&sk->sk_callback_lock);
1368 return ino; 1371 return ino;
1369} 1372}
1370EXPORT_SYMBOL(sock_i_ino); 1373EXPORT_SYMBOL(sock_i_ino);
@@ -1557,6 +1560,8 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1557EXPORT_SYMBOL(sock_alloc_send_skb); 1560EXPORT_SYMBOL(sock_alloc_send_skb);
1558 1561
1559static void __lock_sock(struct sock *sk) 1562static void __lock_sock(struct sock *sk)
1563 __releases(&sk->sk_lock.slock)
1564 __acquires(&sk->sk_lock.slock)
1560{ 1565{
1561 DEFINE_WAIT(wait); 1566 DEFINE_WAIT(wait);
1562 1567
@@ -1573,6 +1578,8 @@ static void __lock_sock(struct sock *sk)
1573} 1578}
1574 1579
1575static void __release_sock(struct sock *sk) 1580static void __release_sock(struct sock *sk)
1581 __releases(&sk->sk_lock.slock)
1582 __acquires(&sk->sk_lock.slock)
1576{ 1583{
1577 struct sk_buff *skb = sk->sk_backlog.head; 1584 struct sk_buff *skb = sk->sk_backlog.head;
1578 1585
diff --git a/net/core/stream.c b/net/core/stream.c
index d959e0f41528..f5df85dcd20b 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -141,10 +141,10 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
141 141
142 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 142 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
143 sk->sk_write_pending++; 143 sk->sk_write_pending++;
144 sk_wait_event(sk, &current_timeo, !sk->sk_err && 144 sk_wait_event(sk, &current_timeo, sk->sk_err ||
145 !(sk->sk_shutdown & SEND_SHUTDOWN) && 145 (sk->sk_shutdown & SEND_SHUTDOWN) ||
146 sk_stream_memory_free(sk) && 146 (sk_stream_memory_free(sk) &&
147 vm_wait); 147 !vm_wait));
148 sk->sk_write_pending--; 148 sk->sk_write_pending--;
149 149
150 if (vm_wait) { 150 if (vm_wait) {
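
The sk_stream_wait_memory() change is not a mechanical De Morgan rewrite: sk_wait_event() only stops sleeping once its condition evaluates true, and the new condition (error, shutdown, or memory available while not in vm_wait backoff) becomes true in cases where the old one never could. Enumerating the four inputs makes the behavioural difference visible; this is purely an illustration of the two predicates as written in the hunk:

#include <stdio.h>

int main(void)
{
	/* print every input combination where old and new conditions differ */
	for (int err = 0; err <= 1; err++)
	for (int shut = 0; shut <= 1; shut++)
	for (int memfree = 0; memfree <= 1; memfree++)
	for (int vm_wait = 0; vm_wait <= 1; vm_wait++) {
		int old_cond = !err && !shut && memfree && vm_wait;
		int new_cond = err || shut || (memfree && !vm_wait);

		if (old_cond != new_cond)
			printf("err=%d shut=%d memfree=%d vm_wait=%d: old=%d new=%d\n",
			       err, shut, memfree, vm_wait, old_cond, new_cond);
	}
	return 0;
}
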
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 01eee5d984be..385b6095fdc4 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -34,7 +34,8 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,
34 34
35 mutex_lock(&sock_flow_mutex); 35 mutex_lock(&sock_flow_mutex);
36 36
37 orig_sock_table = rps_sock_flow_table; 37 orig_sock_table = rcu_dereference_protected(rps_sock_flow_table,
38 lockdep_is_held(&sock_flow_mutex));
38 size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; 39 size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;
39 40
40 ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); 41 ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
diff --git a/net/core/utils.c b/net/core/utils.c
index f41854470539..5fea0ab21902 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -75,7 +75,7 @@ __be32 in_aton(const char *str)
75 str++; 75 str++;
76 } 76 }
77 } 77 }
78 return(htonl(l)); 78 return htonl(l);
79} 79}
80EXPORT_SYMBOL(in_aton); 80EXPORT_SYMBOL(in_aton);
81 81
@@ -92,18 +92,19 @@ EXPORT_SYMBOL(in_aton);
92 92
93static inline int xdigit2bin(char c, int delim) 93static inline int xdigit2bin(char c, int delim)
94{ 94{
95 int val;
96
95 if (c == delim || c == '\0') 97 if (c == delim || c == '\0')
96 return IN6PTON_DELIM; 98 return IN6PTON_DELIM;
97 if (c == ':') 99 if (c == ':')
98 return IN6PTON_COLON_MASK; 100 return IN6PTON_COLON_MASK;
99 if (c == '.') 101 if (c == '.')
100 return IN6PTON_DOT; 102 return IN6PTON_DOT;
101 if (c >= '0' && c <= '9') 103
102 return (IN6PTON_XDIGIT | IN6PTON_DIGIT| (c - '0')); 104 val = hex_to_bin(c);
103 if (c >= 'a' && c <= 'f') 105 if (val >= 0)
104 return (IN6PTON_XDIGIT | (c - 'a' + 10)); 106 return val | IN6PTON_XDIGIT | (val < 10 ? IN6PTON_DIGIT : 0);
105 if (c >= 'A' && c <= 'F') 107
106 return (IN6PTON_XDIGIT | (c - 'A' + 10));
107 if (delim == -1) 108 if (delim == -1)
108 return IN6PTON_DELIM; 109 return IN6PTON_DELIM;
109 return IN6PTON_UNKNOWN; 110 return IN6PTON_UNKNOWN;
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index 6df6f8ac9636..75c3582a7678 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -62,22 +62,18 @@ struct ccid_operations {
62 void (*ccid_hc_tx_exit)(struct sock *sk); 62 void (*ccid_hc_tx_exit)(struct sock *sk);
63 void (*ccid_hc_rx_packet_recv)(struct sock *sk, 63 void (*ccid_hc_rx_packet_recv)(struct sock *sk,
64 struct sk_buff *skb); 64 struct sk_buff *skb);
65 int (*ccid_hc_rx_parse_options)(struct sock *sk, 65 int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt,
66 unsigned char option, 66 u8 opt, u8 *val, u8 len);
67 unsigned char len, u16 idx,
68 unsigned char* value);
69 int (*ccid_hc_rx_insert_options)(struct sock *sk, 67 int (*ccid_hc_rx_insert_options)(struct sock *sk,
70 struct sk_buff *skb); 68 struct sk_buff *skb);
71 void (*ccid_hc_tx_packet_recv)(struct sock *sk, 69 void (*ccid_hc_tx_packet_recv)(struct sock *sk,
72 struct sk_buff *skb); 70 struct sk_buff *skb);
73 int (*ccid_hc_tx_parse_options)(struct sock *sk, 71 int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt,
74 unsigned char option, 72 u8 opt, u8 *val, u8 len);
75 unsigned char len, u16 idx,
76 unsigned char* value);
77 int (*ccid_hc_tx_send_packet)(struct sock *sk, 73 int (*ccid_hc_tx_send_packet)(struct sock *sk,
78 struct sk_buff *skb); 74 struct sk_buff *skb);
79 void (*ccid_hc_tx_packet_sent)(struct sock *sk, 75 void (*ccid_hc_tx_packet_sent)(struct sock *sk,
80 int more, unsigned int len); 76 unsigned int len);
81 void (*ccid_hc_rx_get_info)(struct sock *sk, 77 void (*ccid_hc_rx_get_info)(struct sock *sk,
82 struct tcp_info *info); 78 struct tcp_info *info);
83 void (*ccid_hc_tx_get_info)(struct sock *sk, 79 void (*ccid_hc_tx_get_info)(struct sock *sk,
@@ -138,20 +134,48 @@ static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp)
138extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); 134extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
139extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); 135extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
140 136
137/*
138 * Congestion control of queued data packets via CCID decision.
139 *
140 * The TX CCID performs its congestion-control by indicating whether and when a
141 * queued packet may be sent, using the return code of ccid_hc_tx_send_packet().
142 * The following modes are supported via the symbolic constants below:
143 * - timer-based pacing (CCID returns a delay value in milliseconds);
144 * - autonomous dequeueing (CCID internally schedules dccps_xmitlet).
145 */
146
147enum ccid_dequeueing_decision {
148 CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */
149 CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */
150 CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */
151 CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */
152 CCID_PACKET_ERR = 0xF0000, /* error condition */
153};
154
155static inline int ccid_packet_dequeue_eval(const int return_code)
156{
157 if (return_code < 0)
158 return CCID_PACKET_ERR;
159 if (return_code == 0)
160 return CCID_PACKET_SEND_AT_ONCE;
161 if (return_code <= CCID_PACKET_DELAY_MAX)
162 return CCID_PACKET_DELAY;
163 return return_code;
164}
165
141static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, 166static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
142 struct sk_buff *skb) 167 struct sk_buff *skb)
143{ 168{
144 int rc = 0;
145 if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) 169 if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL)
146 rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); 170 return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
147 return rc; 171 return CCID_PACKET_SEND_AT_ONCE;
148} 172}
149 173
150static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, 174static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
151 int more, unsigned int len) 175 unsigned int len)
152{ 176{
153 if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) 177 if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL)
154 ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len); 178 ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len);
155} 179}
156 180
157static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, 181static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
@@ -168,27 +192,31 @@ static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
168 ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); 192 ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb);
169} 193}
170 194
195/**
196 * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver
197 * @pkt: type of packet that @opt appears on (RFC 4340, 5.1)
198 * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3)
199 * @val: value of @opt
200 * @len: length of @val in bytes
201 */
171static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, 202static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
172 unsigned char option, 203 u8 pkt, u8 opt, u8 *val, u8 len)
173 unsigned char len, u16 idx,
174 unsigned char* value)
175{ 204{
176 int rc = 0; 205 if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL)
177 if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL) 206 return 0;
178 rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx, 207 return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len);
179 value);
180 return rc;
181} 208}
182 209
210/**
211 * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender
212 * Arguments are analogous to ccid_hc_tx_parse_options()
213 */
183static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, 214static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
184 unsigned char option, 215 u8 pkt, u8 opt, u8 *val, u8 len)
185 unsigned char len, u16 idx,
186 unsigned char* value)
187{ 216{
188 int rc = 0; 217 if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL)
189 if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL) 218 return 0;
190 rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value); 219 return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len);
191 return rc;
192} 220}
193 221
194static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, 222static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
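
The new enum ccid_dequeueing_decision and ccid_packet_dequeue_eval() give the dequeueing code one convention for interpreting ccid_hc_tx_send_packet(): send immediately, delay by a bounded number of milliseconds, or leave the packet queued until the CCID schedules dccps_xmitlet itself. A self-contained sketch of how a caller might branch on that convention, reusing the enum and helper exactly as introduced above; the actions in each branch are placeholders, not the actual dccp_write_xmit() logic:

#include <stdio.h>

enum ccid_dequeueing_decision {
	CCID_PACKET_SEND_AT_ONCE =	 0x00000,  /* "green light": no delay */
	CCID_PACKET_DELAY_MAX =		 0x0FFFF,  /* maximum delay in msecs */
	CCID_PACKET_DELAY =		 0x10000,  /* CCID msec-delay mode */
	CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000,  /* CCID autonomous mode */
	CCID_PACKET_ERR =		 0xF0000,  /* error condition */
};

static int ccid_packet_dequeue_eval(const int return_code)
{
	if (return_code < 0)
		return CCID_PACKET_ERR;
	if (return_code == 0)
		return CCID_PACKET_SEND_AT_ONCE;
	if (return_code <= CCID_PACKET_DELAY_MAX)
		return CCID_PACKET_DELAY;
	return return_code;
}

/* illustrative transmit-path reaction to the CCID's return code */
static void handle_tx_decision(int rc)
{
	switch (ccid_packet_dequeue_eval(rc)) {
	case CCID_PACKET_SEND_AT_ONCE:
		printf("transmit the packet now\n");
		break;
	case CCID_PACKET_DELAY:
		printf("re-arm the xmit timer for %d msec\n", rc);
		break;
	case CCID_PACKET_WILL_DEQUEUE_LATER:
		printf("leave packet queued; CCID will schedule dccps_xmitlet\n");
		break;
	default:
		printf("error: drop the packet\n");
	}
}

int main(void)
{
	handle_tx_decision(0);				/* send at once */
	handle_tx_decision(40);				/* pace: 40 ms delay */
	handle_tx_decision(CCID_PACKET_WILL_DEQUEUE_LATER);
	handle_tx_decision(-22);			/* e.g. -EINVAL */
	return 0;
}
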
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
index 8408398cd44e..0581143cb800 100644
--- a/net/dccp/ccids/Kconfig
+++ b/net/dccp/ccids/Kconfig
@@ -47,37 +47,6 @@ config IP_DCCP_CCID3_DEBUG
47 47
48 If in doubt, say N. 48 If in doubt, say N.
49 49
50config IP_DCCP_CCID3_RTO
51 int "Use higher bound for nofeedback timer"
52 default 100
53 depends on IP_DCCP_CCID3 && EXPERIMENTAL
54 ---help---
55 Use higher lower bound for nofeedback timer expiration.
56
57 The TFRC nofeedback timer normally expires after the maximum of 4
58 RTTs and twice the current send interval (RFC 3448, 4.3). On LANs
59 with a small RTT this can mean a high processing load and reduced
60 performance, since then the nofeedback timer is triggered very
61 frequently.
62
63 This option enables to set a higher lower bound for the nofeedback
64 value. Values in units of milliseconds can be set here.
65
66 A value of 0 disables this feature by enforcing the value specified
67 in RFC 3448. The following values have been suggested as bounds for
68 experimental use:
69 * 16-20ms to match the typical multimedia inter-frame interval
70 * 100ms as a reasonable compromise [default]
71 * 1000ms corresponds to the lower TCP RTO bound (RFC 2988, 2.4)
72
73 The default of 100ms is a compromise between a large value for
74 efficient DCCP implementations, and a small value to avoid disrupting
75 the network in times of congestion.
76
77 The purpose of the nofeedback timer is to slow DCCP down when there
78 is serious network congestion: experimenting with larger values should
79 therefore not be performed on WANs.
80
81config IP_DCCP_TFRC_LIB 50config IP_DCCP_TFRC_LIB
82 def_bool y if IP_DCCP_CCID3 51 def_bool y if IP_DCCP_CCID3
83 52
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index 9b3ae9922be1..6576eae9e779 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -25,59 +25,14 @@
25 */ 25 */
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include "../feat.h" 27#include "../feat.h"
28#include "../ccid.h"
29#include "../dccp.h"
30#include "ccid2.h" 28#include "ccid2.h"
31 29
32 30
33#ifdef CONFIG_IP_DCCP_CCID2_DEBUG 31#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
34static int ccid2_debug; 32static int ccid2_debug;
35#define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a) 33#define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a)
36
37static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hc)
38{
39 int len = 0;
40 int pipe = 0;
41 struct ccid2_seq *seqp = hc->tx_seqh;
42
43 /* there is data in the chain */
44 if (seqp != hc->tx_seqt) {
45 seqp = seqp->ccid2s_prev;
46 len++;
47 if (!seqp->ccid2s_acked)
48 pipe++;
49
50 while (seqp != hc->tx_seqt) {
51 struct ccid2_seq *prev = seqp->ccid2s_prev;
52
53 len++;
54 if (!prev->ccid2s_acked)
55 pipe++;
56
57 /* packets are sent sequentially */
58 BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq,
59 prev->ccid2s_seq ) >= 0);
60 BUG_ON(time_before(seqp->ccid2s_sent,
61 prev->ccid2s_sent));
62
63 seqp = prev;
64 }
65 }
66
67 BUG_ON(pipe != hc->tx_pipe);
68 ccid2_pr_debug("len of chain=%d\n", len);
69
70 do {
71 seqp = seqp->ccid2s_prev;
72 len++;
73 } while (seqp != hc->tx_seqh);
74
75 ccid2_pr_debug("total len=%d\n", len);
76 BUG_ON(len != hc->tx_seqbufc * CCID2_SEQBUF_LEN);
77}
78#else 34#else
79#define ccid2_pr_debug(format, a...) 35#define ccid2_pr_debug(format, a...)
80#define ccid2_hc_tx_check_sanity(hc)
81#endif 36#endif
82 37
83static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc) 38static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc)
@@ -123,12 +78,9 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc)
123 78
124static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) 79static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
125{ 80{
126 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); 81 if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk)))
127 82 return CCID_PACKET_WILL_DEQUEUE_LATER;
128 if (hc->tx_pipe < hc->tx_cwnd) 83 return CCID_PACKET_SEND_AT_ONCE;
129 return 0;
130
131 return 1; /* XXX CCID should dequeue when ready instead of polling */
132} 84}
133 85
134static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) 86static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
@@ -156,19 +108,11 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
156 dp->dccps_l_ack_ratio = val; 108 dp->dccps_l_ack_ratio = val;
157} 109}
158 110
159static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hc, long val)
160{
161 ccid2_pr_debug("change SRTT to %ld\n", val);
162 hc->tx_srtt = val;
163}
164
165static void ccid2_start_rto_timer(struct sock *sk);
166
167static void ccid2_hc_tx_rto_expire(unsigned long data) 111static void ccid2_hc_tx_rto_expire(unsigned long data)
168{ 112{
169 struct sock *sk = (struct sock *)data; 113 struct sock *sk = (struct sock *)data;
170 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); 114 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
171 long s; 115 const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
172 116
173 bh_lock_sock(sk); 117 bh_lock_sock(sk);
174 if (sock_owned_by_user(sk)) { 118 if (sock_owned_by_user(sk)) {
@@ -178,23 +122,17 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
178 122
179 ccid2_pr_debug("RTO_EXPIRE\n"); 123 ccid2_pr_debug("RTO_EXPIRE\n");
180 124
181 ccid2_hc_tx_check_sanity(hc);
182
183 /* back-off timer */ 125 /* back-off timer */
184 hc->tx_rto <<= 1; 126 hc->tx_rto <<= 1;
185 127 if (hc->tx_rto > DCCP_RTO_MAX)
186 s = hc->tx_rto / HZ; 128 hc->tx_rto = DCCP_RTO_MAX;
187 if (s > 60)
188 hc->tx_rto = 60 * HZ;
189
190 ccid2_start_rto_timer(sk);
191 129
192 /* adjust pipe, cwnd etc */ 130 /* adjust pipe, cwnd etc */
193 hc->tx_ssthresh = hc->tx_cwnd / 2; 131 hc->tx_ssthresh = hc->tx_cwnd / 2;
194 if (hc->tx_ssthresh < 2) 132 if (hc->tx_ssthresh < 2)
195 hc->tx_ssthresh = 2; 133 hc->tx_ssthresh = 2;
196 hc->tx_cwnd = 1; 134 hc->tx_cwnd = 1;
197 hc->tx_pipe = 0; 135 hc->tx_pipe = 0;
198 136
199 /* clear state about stuff we sent */ 137 /* clear state about stuff we sent */
200 hc->tx_seqt = hc->tx_seqh; 138 hc->tx_seqt = hc->tx_seqh;
@@ -204,23 +142,18 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
204 hc->tx_rpseq = 0; 142 hc->tx_rpseq = 0;
205 hc->tx_rpdupack = -1; 143 hc->tx_rpdupack = -1;
206 ccid2_change_l_ack_ratio(sk, 1); 144 ccid2_change_l_ack_ratio(sk, 1);
207 ccid2_hc_tx_check_sanity(hc); 145
146 /* if we were blocked before, we may now send cwnd=1 packet */
147 if (sender_was_blocked)
148 tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
149 /* restart backed-off timer */
150 sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
208out: 151out:
209 bh_unlock_sock(sk); 152 bh_unlock_sock(sk);
210 sock_put(sk); 153 sock_put(sk);
211} 154}
212 155
213static void ccid2_start_rto_timer(struct sock *sk) 156static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
214{
215 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
216
217 ccid2_pr_debug("setting RTO timeout=%ld\n", hc->tx_rto);
218
219 BUG_ON(timer_pending(&hc->tx_rtotimer));
220 sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
221}
222
223static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
224{ 157{
225 struct dccp_sock *dp = dccp_sk(sk); 158 struct dccp_sock *dp = dccp_sk(sk);
226 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); 159 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
@@ -230,7 +163,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
230 163
231 hc->tx_seqh->ccid2s_seq = dp->dccps_gss; 164 hc->tx_seqh->ccid2s_seq = dp->dccps_gss;
232 hc->tx_seqh->ccid2s_acked = 0; 165 hc->tx_seqh->ccid2s_acked = 0;
233 hc->tx_seqh->ccid2s_sent = jiffies; 166 hc->tx_seqh->ccid2s_sent = ccid2_time_stamp;
234 167
235 next = hc->tx_seqh->ccid2s_next; 168 next = hc->tx_seqh->ccid2s_next;
236 /* check if we need to alloc more space */ 169 /* check if we need to alloc more space */
@@ -296,23 +229,20 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
296 } 229 }
297#endif 230#endif
298 231
299 /* setup RTO timer */ 232 sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
300 if (!timer_pending(&hc->tx_rtotimer))
301 ccid2_start_rto_timer(sk);
302 233
303#ifdef CONFIG_IP_DCCP_CCID2_DEBUG 234#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
304 do { 235 do {
305 struct ccid2_seq *seqp = hc->tx_seqt; 236 struct ccid2_seq *seqp = hc->tx_seqt;
306 237
307 while (seqp != hc->tx_seqh) { 238 while (seqp != hc->tx_seqh) {
308 ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n", 239 ccid2_pr_debug("out seq=%llu acked=%d time=%u\n",
309 (unsigned long long)seqp->ccid2s_seq, 240 (unsigned long long)seqp->ccid2s_seq,
310 seqp->ccid2s_acked, seqp->ccid2s_sent); 241 seqp->ccid2s_acked, seqp->ccid2s_sent);
311 seqp = seqp->ccid2s_next; 242 seqp = seqp->ccid2s_next;
312 } 243 }
313 } while (0); 244 } while (0);
314 ccid2_pr_debug("=========\n"); 245 ccid2_pr_debug("=========\n");
315 ccid2_hc_tx_check_sanity(hc);
316#endif 246#endif
317} 247}
318 248
@@ -378,17 +308,87 @@ out_invalid_option:
378 return -1; 308 return -1;
379} 309}
380 310
381static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) 311/**
312 * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
313 * This code is almost identical with TCP's tcp_rtt_estimator(), since
314 * - it has a higher sampling frequency (recommended by RFC 1323),
315 * - the RTO does not collapse into RTT due to RTTVAR going towards zero,
316 * - it is simple (cf. more complex proposals such as Eifel timer or research
317 * which suggests that the gain should be set according to window size),
318 * - in tests it was found to work well with CCID2 [gerrit].
319 */
320static void ccid2_rtt_estimator(struct sock *sk, const long mrtt)
382{ 321{
383 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); 322 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
323 long m = mrtt ? : 1;
384 324
385 sk_stop_timer(sk, &hc->tx_rtotimer); 325 if (hc->tx_srtt == 0) {
386 ccid2_pr_debug("deleted RTO timer\n"); 326 /* First measurement m */
327 hc->tx_srtt = m << 3;
328 hc->tx_mdev = m << 1;
329
330 hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk));
331 hc->tx_rttvar = hc->tx_mdev_max;
332
333 hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
334 } else {
335 /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */
336 m -= (hc->tx_srtt >> 3);
337 hc->tx_srtt += m;
338
339 /* Similarly, update scaled mdev with regard to |m| */
340 if (m < 0) {
341 m = -m;
342 m -= (hc->tx_mdev >> 2);
343 /*
344 * This neutralises RTO increase when RTT < SRTT - mdev
345 * (see P. Sarolahti, A. Kuznetsov,"Congestion Control
346 * in Linux TCP", USENIX 2002, pp. 49-62).
347 */
348 if (m > 0)
349 m >>= 3;
350 } else {
351 m -= (hc->tx_mdev >> 2);
352 }
353 hc->tx_mdev += m;
354
355 if (hc->tx_mdev > hc->tx_mdev_max) {
356 hc->tx_mdev_max = hc->tx_mdev;
357 if (hc->tx_mdev_max > hc->tx_rttvar)
358 hc->tx_rttvar = hc->tx_mdev_max;
359 }
360
361 /*
362 * Decay RTTVAR at most once per flight, exploiting that
363 * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2)
364 * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1)
365 * GAR is a useful bound for FlightSize = pipe.
366 * AWL is probably too low here, as it over-estimates pipe.
367 */
368 if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) {
369 if (hc->tx_mdev_max < hc->tx_rttvar)
370 hc->tx_rttvar -= (hc->tx_rttvar -
371 hc->tx_mdev_max) >> 2;
372 hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
373 hc->tx_mdev_max = tcp_rto_min(sk);
374 }
375 }
376
377 /*
378 * Set RTO from SRTT and RTTVAR
379 * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms.
380 * This agrees with RFC 4341, 5:
381 * "Because DCCP does not retransmit data, DCCP does not require
382 * TCP's recommended minimum timeout of one second".
383 */
384 hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar;
385
386 if (hc->tx_rto > DCCP_RTO_MAX)
387 hc->tx_rto = DCCP_RTO_MAX;
387} 388}
388 389
389static inline void ccid2_new_ack(struct sock *sk, 390static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
390 struct ccid2_seq *seqp, 391 unsigned int *maxincr)
391 unsigned int *maxincr)
392{ 392{
393 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); 393 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
394 394
@@ -402,93 +402,27 @@ static inline void ccid2_new_ack(struct sock *sk,
402 hc->tx_cwnd += 1; 402 hc->tx_cwnd += 1;
403 hc->tx_packets_acked = 0; 403 hc->tx_packets_acked = 0;
404 } 404 }
405 405 /*
406 /* update RTO */ 406 * FIXME: RTT is sampled several times per acknowledgment (for each
407 if (hc->tx_srtt == -1 || 407 * entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
408 time_after(jiffies, hc->tx_lastrtt + hc->tx_srtt)) { 408 * This causes the RTT to be over-estimated, since the older entries
409 unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent; 409 * in the Ack Vector have earlier sending times.
410 int s; 410 * The cleanest solution is to not use the ccid2s_sent field at all
411 411 * and instead use DCCP timestamps: requires changes in other places.
412 /* first measurement */ 412 */
413 if (hc->tx_srtt == -1) { 413 ccid2_rtt_estimator(sk, ccid2_time_stamp - seqp->ccid2s_sent);
414 ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n",
415 r, jiffies,
416 (unsigned long long)seqp->ccid2s_seq);
417 ccid2_change_srtt(hc, r);
418 hc->tx_rttvar = r >> 1;
419 } else {
420 /* RTTVAR */
421 long tmp = hc->tx_srtt - r;
422 long srtt;
423
424 if (tmp < 0)
425 tmp *= -1;
426
427 tmp >>= 2;
428 hc->tx_rttvar *= 3;
429 hc->tx_rttvar >>= 2;
430 hc->tx_rttvar += tmp;
431
432 /* SRTT */
433 srtt = hc->tx_srtt;
434 srtt *= 7;
435 srtt >>= 3;
436 tmp = r >> 3;
437 srtt += tmp;
438 ccid2_change_srtt(hc, srtt);
439 }
440 s = hc->tx_rttvar << 2;
441 /* clock granularity is 1 when based on jiffies */
442 if (!s)
443 s = 1;
444 hc->tx_rto = hc->tx_srtt + s;
445
446 /* must be at least a second */
447 s = hc->tx_rto / HZ;
448 /* DCCP doesn't require this [but I like it cuz my code sux] */
449#if 1
450 if (s < 1)
451 hc->tx_rto = HZ;
452#endif
453 /* max 60 seconds */
454 if (s > 60)
455 hc->tx_rto = HZ * 60;
456
457 hc->tx_lastrtt = jiffies;
458
459 ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n",
460 hc->tx_srtt, hc->tx_rttvar,
461 hc->tx_rto, HZ, r);
462 }
463
464 /* we got a new ack, so re-start RTO timer */
465 ccid2_hc_tx_kill_rto_timer(sk);
466 ccid2_start_rto_timer(sk);
467}
468
469static void ccid2_hc_tx_dec_pipe(struct sock *sk)
470{
471 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
472
473 if (hc->tx_pipe == 0)
474 DCCP_BUG("pipe == 0");
475 else
476 hc->tx_pipe--;
477
478 if (hc->tx_pipe == 0)
479 ccid2_hc_tx_kill_rto_timer(sk);
480} 414}
481 415
482static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) 416static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
483{ 417{
484 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); 418 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
485 419
486 if (time_before(seqp->ccid2s_sent, hc->tx_last_cong)) { 420 if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) {
487 ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); 421 ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
488 return; 422 return;
489 } 423 }
490 424
491 hc->tx_last_cong = jiffies; 425 hc->tx_last_cong = ccid2_time_stamp;
492 426
493 hc->tx_cwnd = hc->tx_cwnd / 2 ? : 1U; 427 hc->tx_cwnd = hc->tx_cwnd / 2 ? : 1U;
494 hc->tx_ssthresh = max(hc->tx_cwnd, 2U); 428 hc->tx_ssthresh = max(hc->tx_cwnd, 2U);
@@ -502,6 +436,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
502{ 436{
503 struct dccp_sock *dp = dccp_sk(sk); 437 struct dccp_sock *dp = dccp_sk(sk);
504 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); 438 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
439 const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
505 u64 ackno, seqno; 440 u64 ackno, seqno;
506 struct ccid2_seq *seqp; 441 struct ccid2_seq *seqp;
507 unsigned char *vector; 442 unsigned char *vector;
@@ -510,7 +445,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
510 int done = 0; 445 int done = 0;
511 unsigned int maxincr = 0; 446 unsigned int maxincr = 0;
512 447
513 ccid2_hc_tx_check_sanity(hc);
514 /* check reverse path congestion */ 448 /* check reverse path congestion */
515 seqno = DCCP_SKB_CB(skb)->dccpd_seq; 449 seqno = DCCP_SKB_CB(skb)->dccpd_seq;
516 450
@@ -620,7 +554,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
620 seqp->ccid2s_acked = 1; 554 seqp->ccid2s_acked = 1;
621 ccid2_pr_debug("Got ack for %llu\n", 555 ccid2_pr_debug("Got ack for %llu\n",
622 (unsigned long long)seqp->ccid2s_seq); 556 (unsigned long long)seqp->ccid2s_seq);
623 ccid2_hc_tx_dec_pipe(sk); 557 hc->tx_pipe--;
624 } 558 }
625 if (seqp == hc->tx_seqt) { 559 if (seqp == hc->tx_seqt) {
626 done = 1; 560 done = 1;
@@ -677,7 +611,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
677 * one ack vector. 611 * one ack vector.
678 */ 612 */
679 ccid2_congestion_event(sk, seqp); 613 ccid2_congestion_event(sk, seqp);
680 ccid2_hc_tx_dec_pipe(sk); 614 hc->tx_pipe--;
681 } 615 }
682 if (seqp == hc->tx_seqt) 616 if (seqp == hc->tx_seqt)
683 break; 617 break;
@@ -695,7 +629,15 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
695 hc->tx_seqt = hc->tx_seqt->ccid2s_next; 629 hc->tx_seqt = hc->tx_seqt->ccid2s_next;
696 } 630 }
697 631
698 ccid2_hc_tx_check_sanity(hc); 632 /* restart RTO timer if not all outstanding data has been acked */
633 if (hc->tx_pipe == 0)
634 sk_stop_timer(sk, &hc->tx_rtotimer);
635 else
636 sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
637
638 /* check if incoming Acks allow pending packets to be sent */
639 if (sender_was_blocked && !ccid2_cwnd_network_limited(hc))
640 tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
699} 641}
700 642
701static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) 643static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
@@ -707,12 +649,8 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
707 /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ 649 /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */
708 hc->tx_ssthresh = ~0U; 650 hc->tx_ssthresh = ~0U;
709 651
710 /* 652 /* Use larger initial windows (RFC 4341, section 5). */
711 * RFC 4341, 5: "The cwnd parameter is initialized to at most four 653 hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache);
712 * packets for new connections, following the rules from [RFC3390]".
713 * We need to convert the bytes of RFC3390 into the packets of RFC 4341.
714 */
715 hc->tx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U);
716 654
717 /* Make sure that Ack Ratio is enabled and within bounds. */ 655 /* Make sure that Ack Ratio is enabled and within bounds. */
718 max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2); 656 max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2);
@@ -723,15 +661,11 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
723 if (ccid2_hc_tx_alloc_seq(hc)) 661 if (ccid2_hc_tx_alloc_seq(hc))
724 return -ENOMEM; 662 return -ENOMEM;
725 663
726 hc->tx_rto = 3 * HZ; 664 hc->tx_rto = DCCP_TIMEOUT_INIT;
727 ccid2_change_srtt(hc, -1);
728 hc->tx_rttvar = -1;
729 hc->tx_rpdupack = -1; 665 hc->tx_rpdupack = -1;
730 hc->tx_last_cong = jiffies; 666 hc->tx_last_cong = ccid2_time_stamp;
731 setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, 667 setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire,
732 (unsigned long)sk); 668 (unsigned long)sk);
733
734 ccid2_hc_tx_check_sanity(hc);
735 return 0; 669 return 0;
736} 670}
737 671
@@ -740,7 +674,7 @@ static void ccid2_hc_tx_exit(struct sock *sk)
740 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); 674 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
741 int i; 675 int i;
742 676
743 ccid2_hc_tx_kill_rto_timer(sk); 677 sk_stop_timer(sk, &hc->tx_rtotimer);
744 678
745 for (i = 0; i < hc->tx_seqbufc; i++) 679 for (i = 0; i < hc->tx_seqbufc; i++)
746 kfree(hc->tx_seqbuf[i]); 680 kfree(hc->tx_seqbuf[i]);
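
ccid2_rtt_estimator() above mirrors TCP's fixed-point implementation of RFC 2988 (SRTT scaled by 2^3, mdev by 2^2). The same update rules in plain floating point may be easier to follow than the scaled version; the 200 ms minimum comes from the comment in the hunk, while the 64 s maximum is assumed to correspond to DCCP_RTO_MAX:

#include <stdio.h>

struct rtt_est {
	double srtt;	/* smoothed RTT, seconds */
	double rttvar;	/* RTT variation, seconds */
	double rto;	/* retransmission timeout, seconds */
	int initialized;
};

/* RFC 2988: alpha = 1/8, beta = 1/4; RTO = SRTT + 4*RTTVAR */
static void rtt_sample(struct rtt_est *e, double m /* measured RTT, s */)
{
	if (!e->initialized) {
		e->srtt = m;		/* first sample: SRTT = R */
		e->rttvar = m / 2;	/*               RTTVAR = R/2 */
		e->initialized = 1;
	} else {
		/* RTTVAR is updated with the old SRTT, then SRTT itself */
		e->rttvar = 0.75 * e->rttvar +
			    0.25 * (m > e->srtt ? m - e->srtt : e->srtt - m);
		e->srtt = 0.875 * e->srtt + 0.125 * m;
	}
	e->rto = e->srtt + 4 * e->rttvar;
	if (e->rto < 0.2)	/* minimum RTO of 200 ms */
		e->rto = 0.2;
	if (e->rto > 64.0)	/* assumed DCCP_RTO_MAX of 64 s */
		e->rto = 64.0;
}

int main(void)
{
	struct rtt_est e = { 0 };
	double samples[] = { 0.100, 0.120, 0.080, 0.300, 0.090 };

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		rtt_sample(&e, samples[i]);
		printf("m=%.3f  srtt=%.3f rttvar=%.3f rto=%.3f\n",
		       samples[i], e.srtt, e.rttvar, e.rto);
	}
	return 0;
}
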
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
index 1ec6a30103bb..25cb6b216eda 100644
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -18,18 +18,23 @@
18#ifndef _DCCP_CCID2_H_ 18#ifndef _DCCP_CCID2_H_
19#define _DCCP_CCID2_H_ 19#define _DCCP_CCID2_H_
20 20
21#include <linux/dccp.h>
22#include <linux/timer.h> 21#include <linux/timer.h>
23#include <linux/types.h> 22#include <linux/types.h>
24#include "../ccid.h" 23#include "../ccid.h"
24#include "../dccp.h"
25
26/*
27 * CCID-2 timestamping faces the same issues as TCP timestamping.
28 * Hence we reuse/share as much of the code as possible.
29 */
30#define ccid2_time_stamp tcp_time_stamp
31
25/* NUMDUPACK parameter from RFC 4341, p. 6 */ 32/* NUMDUPACK parameter from RFC 4341, p. 6 */
26#define NUMDUPACK 3 33#define NUMDUPACK 3
27 34
28struct sock;
29
30struct ccid2_seq { 35struct ccid2_seq {
31 u64 ccid2s_seq; 36 u64 ccid2s_seq;
32 unsigned long ccid2s_sent; 37 u32 ccid2s_sent;
33 int ccid2s_acked; 38 int ccid2s_acked;
34 struct ccid2_seq *ccid2s_prev; 39 struct ccid2_seq *ccid2s_prev;
35 struct ccid2_seq *ccid2s_next; 40 struct ccid2_seq *ccid2s_next;
@@ -42,7 +47,12 @@ struct ccid2_seq {
42 * struct ccid2_hc_tx_sock - CCID2 TX half connection 47 * struct ccid2_hc_tx_sock - CCID2 TX half connection
43 * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 48 * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
44 * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465) 49 * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465)
45 * @tx_lastrtt: time RTT was last measured 50 * @tx_srtt: smoothed RTT estimate, scaled by 2^3
51 * @tx_mdev: smoothed RTT variation, scaled by 2^2
52 * @tx_mdev_max: maximum of @mdev during one flight
53 * @tx_rttvar: moving average/maximum of @mdev_max
54 * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988)
55 * @tx_rtt_seq: to decay RTTVAR at most once per flight
46 * @tx_rpseq: last consecutive seqno 56 * @tx_rpseq: last consecutive seqno
47 * @tx_rpdupack: dupacks since rpseq 57 * @tx_rpdupack: dupacks since rpseq
48 */ 58 */
@@ -55,17 +65,27 @@ struct ccid2_hc_tx_sock {
55 int tx_seqbufc; 65 int tx_seqbufc;
56 struct ccid2_seq *tx_seqh; 66 struct ccid2_seq *tx_seqh;
57 struct ccid2_seq *tx_seqt; 67 struct ccid2_seq *tx_seqt;
58 long tx_rto; 68
59 long tx_srtt; 69 /* RTT measurement: variables/principles are the same as in TCP */
60 long tx_rttvar; 70 u32 tx_srtt,
61 unsigned long tx_lastrtt; 71 tx_mdev,
72 tx_mdev_max,
73 tx_rttvar,
74 tx_rto;
75 u64 tx_rtt_seq:48;
62 struct timer_list tx_rtotimer; 76 struct timer_list tx_rtotimer;
77
63 u64 tx_rpseq; 78 u64 tx_rpseq;
64 int tx_rpdupack; 79 int tx_rpdupack;
65 unsigned long tx_last_cong; 80 u32 tx_last_cong;
66 u64 tx_high_ack; 81 u64 tx_high_ack;
67}; 82};
68 83
84static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc)
85{
86 return hc->tx_pipe >= hc->tx_cwnd;
87}
88
69struct ccid2_hc_rx_sock { 89struct ccid2_hc_rx_sock {
70 int rx_data; 90 int rx_data;
71}; 91};
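
ccid2_hc_tx_init() (in the ccid2.c hunk above) now derives the initial cwnd from rfc3390_bytes_to_packets(dccps_mss_cache) instead of the open-coded clamp. RFC 3390 sets the initial window to min(4*MSS, max(2*MSS, 4380 bytes)); expressed in packets for DCCP (RFC 4341, section 5) that works out to 4, 3 or 2 packets depending on the MSS. A sketch of that conversion follows; the thresholds are this sketch's packet-based reading of RFC 3390, assumed to match the kernel helper rather than copied from it:

#include <stdio.h>

/* 4 packets for an MSS of up to 1095 bytes, 3 up to 2190, 2 beyond that */
static unsigned int initial_cwnd_packets(unsigned int mss)
{
	if (mss <= 1095)
		return 4;
	if (mss <= 2190)
		return 3;
	return 2;
}

int main(void)
{
	unsigned int sizes[] = { 536, 1095, 1460, 2190, 4096 };

	for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("mss=%4u -> initial cwnd = %u packets\n",
		       sizes[i], initial_cwnd_packets(sizes[i]));
	return 0;
}
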
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 95f752986497..3d604e1349c0 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -54,7 +54,6 @@ static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
54 [TFRC_SSTATE_NO_SENT] = "NO_SENT", 54 [TFRC_SSTATE_NO_SENT] = "NO_SENT",
55 [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", 55 [TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
56 [TFRC_SSTATE_FBACK] = "FBACK", 56 [TFRC_SSTATE_FBACK] = "FBACK",
57 [TFRC_SSTATE_TERM] = "TERM",
58 }; 57 };
59 58
60 return ccid3_state_names[state]; 59 return ccid3_state_names[state];
@@ -91,19 +90,16 @@ static inline u64 rfc3390_initial_rate(struct sock *sk)
91 return scaled_div(w_init << 6, hc->tx_rtt); 90 return scaled_div(w_init << 6, hc->tx_rtt);
92} 91}
93 92
94/* 93/**
95 * Recalculate t_ipi and delta (should be called whenever X changes) 94 * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst
95 * This respects the granularity of X_inst (64 * bytes/second).
96 */ 96 */
97static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc) 97static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc)
98{ 98{
99 /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */
100 hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x); 99 hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x);
101 100
102 /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ 101 ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi,
103 hc->tx_delta = min_t(u32, hc->tx_t_ipi / 2, TFRC_OPSYS_HALF_TIME_GRAN); 102 hc->tx_s, (unsigned)(hc->tx_x >> 6));
104
105 ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", hc->tx_t_ipi,
106 hc->tx_delta, hc->tx_s, (unsigned)(hc->tx_x >> 6));
107} 103}
108 104
109static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now) 105static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now)
@@ -211,16 +207,19 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
211 ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk, 207 ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk,
212 ccid3_tx_state_name(hc->tx_state)); 208 ccid3_tx_state_name(hc->tx_state));
213 209
210 /* Ignore and do not restart after leaving the established state */
211 if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
212 goto out;
213
214 /* Reset feedback state to "no feedback received" */
214 if (hc->tx_state == TFRC_SSTATE_FBACK) 215 if (hc->tx_state == TFRC_SSTATE_FBACK)
215 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); 216 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
216 else if (hc->tx_state != TFRC_SSTATE_NO_FBACK)
217 goto out;
218 217
219 /* 218 /*
220 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 219 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
220 * RTO is 0 if and only if no feedback has been received yet.
221 */ 221 */
222 if (hc->tx_t_rto == 0 || /* no feedback received yet */ 222 if (hc->tx_t_rto == 0 || hc->tx_p == 0) {
223 hc->tx_p == 0) {
224 223
225 /* halve send rate directly */ 224 /* halve send rate directly */
226 hc->tx_x = max(hc->tx_x / 2, 225 hc->tx_x = max(hc->tx_x / 2,
@@ -256,7 +255,7 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
256 * Set new timeout for the nofeedback timer. 255 * Set new timeout for the nofeedback timer.
257 * See comments in packet_recv() regarding the value of t_RTO. 256 * See comments in packet_recv() regarding the value of t_RTO.
258 */ 257 */
259 if (unlikely(hc->tx_t_rto == 0)) /* no feedback yet */ 258 if (unlikely(hc->tx_t_rto == 0)) /* no feedback received yet */
260 t_nfb = TFRC_INITIAL_TIMEOUT; 259 t_nfb = TFRC_INITIAL_TIMEOUT;
261 else 260 else
262 t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi); 261 t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi);
@@ -269,11 +268,11 @@ out:
269 sock_put(sk); 268 sock_put(sk);
270} 269}
271 270
272/* 271/**
273 * returns 272 * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets
274 * > 0: delay (in msecs) that should pass before actually sending 273 * @skb: next packet candidate to send on @sk
275 * = 0: can send immediately 274 * This function uses the convention of ccid_packet_dequeue_eval() and
276 * < 0: error condition; do not send packet 275 * returns a millisecond-delay value between 0 and t_mbi = 64000 msec.
277 */ 276 */
278static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) 277static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
279{ 278{
@@ -290,8 +289,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
290 if (unlikely(skb->len == 0)) 289 if (unlikely(skb->len == 0))
291 return -EBADMSG; 290 return -EBADMSG;
292 291
293 switch (hc->tx_state) { 292 if (hc->tx_state == TFRC_SSTATE_NO_SENT) {
294 case TFRC_SSTATE_NO_SENT:
295 sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies + 293 sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies +
296 usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); 294 usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
297 hc->tx_last_win_count = 0; 295 hc->tx_last_win_count = 0;
@@ -326,27 +324,22 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
326 ccid3_update_send_interval(hc); 324 ccid3_update_send_interval(hc);
327 325
328 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); 326 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
329 break; 327
330 case TFRC_SSTATE_NO_FBACK: 328 } else {
331 case TFRC_SSTATE_FBACK:
332 delay = ktime_us_delta(hc->tx_t_nom, now); 329 delay = ktime_us_delta(hc->tx_t_nom, now);
333 ccid3_pr_debug("delay=%ld\n", (long)delay); 330 ccid3_pr_debug("delay=%ld\n", (long)delay);
334 /* 331 /*
335 * Scheduling of packet transmissions [RFC 3448, 4.6] 332 * Scheduling of packet transmissions (RFC 5348, 8.3)
336 * 333 *
337 * if (t_now > t_nom - delta) 334 * if (t_now > t_nom - delta)
338 * // send the packet now 335 * // send the packet now
339 * else 336 * else
340 * // send the packet in (t_nom - t_now) milliseconds. 337 * // send the packet in (t_nom - t_now) milliseconds.
341 */ 338 */
342 if (delay - (s64)hc->tx_delta >= 1000) 339 if (delay >= TFRC_T_DELTA)
343 return (u32)delay / 1000L; 340 return (u32)delay / USEC_PER_MSEC;
344 341
345 ccid3_hc_tx_update_win_count(hc, now); 342 ccid3_hc_tx_update_win_count(hc, now);
346 break;
347 case TFRC_SSTATE_TERM:
348 DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk);
349 return -EINVAL;
350 } 343 }
351 344
352 /* prepare to send now (add options etc.) */ 345 /* prepare to send now (add options etc.) */
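[Illustrative note] The send-time check in the hunk above reduces to a small rule: if the nominal send time t_nom lies more than t_delta in the future, report the remaining delay in whole milliseconds, otherwise send at once. A minimal user-space sketch, assuming HZ >= 500 so that t_delta is one millisecond; tfrc_send_delay_ms, t_nom_us and now_us are illustrative names, not kernel symbols:

    #include <stdint.h>

    #define T_DELTA_US 1000         /* assumes HZ >= 500, i.e. t_delta = 1 ms */

    /* Returns 0 for "send at once", otherwise the remaining delay in msec. */
    static uint32_t tfrc_send_delay_ms(int64_t t_nom_us, int64_t now_us)
    {
            int64_t delay = t_nom_us - now_us;

            if (delay >= T_DELTA_US)
                    return (uint32_t)(delay / 1000);
            return 0;
    }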
@@ -355,11 +348,10 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
355 348
356 /* set the nominal send time for the next following packet */ 349 /* set the nominal send time for the next following packet */
357 hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi); 350 hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi);
358 return 0; 351 return CCID_PACKET_SEND_AT_ONCE;
359} 352}
360 353
361static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, 354static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len)
362 unsigned int len)
363{ 355{
364 struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); 356 struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
365 357
@@ -372,48 +364,34 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
372static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) 364static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
373{ 365{
374 struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); 366 struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
375 struct ccid3_options_received *opt_recv; 367 struct tfrc_tx_hist_entry *acked;
376 ktime_t now; 368 ktime_t now;
377 unsigned long t_nfb; 369 unsigned long t_nfb;
378 u32 pinv, r_sample; 370 u32 r_sample;
379 371
380 /* we are only interested in ACKs */ 372 /* we are only interested in ACKs */
381 if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || 373 if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
382 DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) 374 DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
383 return; 375 return;
384 /* ... and only in the established state */
385 if (hc->tx_state != TFRC_SSTATE_FBACK &&
386 hc->tx_state != TFRC_SSTATE_NO_FBACK)
387 return;
388
389 opt_recv = &hc->tx_options_received;
390 now = ktime_get_real();
391
392 /* Estimate RTT from history if ACK number is valid */
393 r_sample = tfrc_tx_hist_rtt(hc->tx_hist,
394 DCCP_SKB_CB(skb)->dccpd_ack_seq, now);
395 if (r_sample == 0) {
396 DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk,
397 dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type),
398 (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq);
399 return;
400 }
401
402 /* Update receive rate in units of 64 * bytes/second */
403 hc->tx_x_recv = opt_recv->ccid3or_receive_rate;
404 hc->tx_x_recv <<= 6;
405
406 /* Update loss event rate (which is scaled by 1e6) */
407 pinv = opt_recv->ccid3or_loss_event_rate;
408 if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */
409 hc->tx_p = 0;
410 else /* can not exceed 100% */
411 hc->tx_p = scaled_div(1, pinv);
412 /* 376 /*
413 * Validate new RTT sample and update moving average 377 * Locate the acknowledged packet in the TX history.
378 *
379 * Returning "entry not found" here can for instance happen when
380 * - the host has not sent out anything (e.g. a passive server),
381 * - the Ack is outdated (packet with higher Ack number was received),
382 * - it is a bogus Ack (for a packet not sent on this connection).
414 */ 383 */
415 r_sample = dccp_sample_rtt(sk, r_sample); 384 acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb));
385 if (acked == NULL)
386 return;
387 /* For the sake of RTT sampling, ignore/remove all older entries */
388 tfrc_tx_hist_purge(&acked->next);
389
390 /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */
391 now = ktime_get_real();
392 r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp));
416 hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9); 393 hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9);
394
417 /* 395 /*
418 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 396 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
419 */ 397 */
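[Illustrative note] The RTT update in the hunk above is the q = 0.9 moving average of RFC 3448, 4.3. A sketch of what the call with weight 9 computes, assuming tfrc_ewma() keeps its usual weight-out-of-ten fixed-point form; rtt_ewma_us is an illustrative name:

    #include <stdint.h>

    /* Fixed-point moving average with q = 0.9, i.e. weight 9 out of 10. */
    static uint32_t rtt_ewma_us(uint32_t avg_us, uint32_t sample_us)
    {
            return avg_us ? (9 * avg_us + 1 * sample_us) / 10 : sample_us;
    }
    /* e.g. avg = 100000 us, sample = 120000 us  ->  new average 102000 us */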
@@ -461,13 +439,12 @@ done_computing_x:
461 sk->sk_write_space(sk); 439 sk->sk_write_space(sk);
462 440
463 /* 441 /*
464 * Update timeout interval for the nofeedback timer. 442 * Update timeout interval for the nofeedback timer. In order to control
465 * We use a configuration option to increase the lower bound. 443 * rate halving on networks with very low RTTs (<= 1 ms), use per-route
466 * This can help avoid triggering the nofeedback timer too 444 * tunable RTAX_RTO_MIN value as the lower bound.
467 * often ('spinning') on LANs with small RTTs.
468 */ 445 */
469 hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt, (CONFIG_IP_DCCP_CCID3_RTO * 446 hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt,
470 (USEC_PER_SEC / 1000))); 447 USEC_PER_SEC/HZ * tcp_rto_min(sk));
471 /* 448 /*
472 * Schedule no feedback timer to expire in 449 * Schedule no feedback timer to expire in
473 * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) 450 * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
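[Illustrative note] Taken together, the two hunks above set t_RTO = max(4 * RTT, RTO_MIN) and then arm the nofeedback timer for max(t_RTO, 2 * t_ipi). A standalone sketch with purely illustrative numbers, assuming the per-route RTAX_RTO_MIN has already been converted to microseconds:

    #include <stdint.h>

    /* Illustrative helper (not a kernel symbol); all arguments in usec. */
    static uint32_t nofeedback_timeout_us(uint32_t rtt, uint32_t rto_min,
                                          uint32_t t_ipi)
    {
            uint32_t t_rto = 4 * rtt > rto_min ? 4 * rtt : rto_min;

            return t_rto > 2 * t_ipi ? t_rto : 2 * t_ipi;
    }
    /* e.g. rtt = 20000, rto_min = 200000, t_ipi = 10000  ->  timer = 200000 us */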
@@ -482,66 +459,41 @@ done_computing_x:
482 jiffies + usecs_to_jiffies(t_nfb)); 459 jiffies + usecs_to_jiffies(t_nfb));
483} 460}
484 461
485static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, 462static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
486 unsigned char len, u16 idx, 463 u8 option, u8 *optval, u8 optlen)
487 unsigned char *value)
488{ 464{
489 int rc = 0;
490 const struct dccp_sock *dp = dccp_sk(sk);
491 struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); 465 struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
492 struct ccid3_options_received *opt_recv;
493 __be32 opt_val; 466 __be32 opt_val;
494 467
495 opt_recv = &hc->tx_options_received;
496
497 if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
498 opt_recv->ccid3or_seqno = dp->dccps_gsr;
499 opt_recv->ccid3or_loss_event_rate = ~0;
500 opt_recv->ccid3or_loss_intervals_idx = 0;
501 opt_recv->ccid3or_loss_intervals_len = 0;
502 opt_recv->ccid3or_receive_rate = 0;
503 }
504
505 switch (option) { 468 switch (option) {
469 case TFRC_OPT_RECEIVE_RATE:
506 case TFRC_OPT_LOSS_EVENT_RATE: 470 case TFRC_OPT_LOSS_EVENT_RATE:
507 if (unlikely(len != 4)) { 471 /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */
508 DCCP_WARN("%s(%p), invalid len %d " 472 if (packet_type == DCCP_PKT_DATA)
509 "for TFRC_OPT_LOSS_EVENT_RATE\n", 473 break;
510 dccp_role(sk), sk, len); 474 if (unlikely(optlen != 4)) {
511 rc = -EINVAL; 475 DCCP_WARN("%s(%p), invalid len %d for %u\n",
512 } else { 476 dccp_role(sk), sk, optlen, option);
513 opt_val = get_unaligned((__be32 *)value); 477 return -EINVAL;
514 opt_recv->ccid3or_loss_event_rate = ntohl(opt_val);
515 ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
516 dccp_role(sk), sk,
517 opt_recv->ccid3or_loss_event_rate);
518 } 478 }
519 break; 479 opt_val = ntohl(get_unaligned((__be32 *)optval));
520 case TFRC_OPT_LOSS_INTERVALS: 480
521 opt_recv->ccid3or_loss_intervals_idx = idx; 481 if (option == TFRC_OPT_RECEIVE_RATE) {
522 opt_recv->ccid3or_loss_intervals_len = len; 482 /* Receive Rate is kept in units of 64 bytes/second */
523 ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n", 483 hc->tx_x_recv = opt_val;
524 dccp_role(sk), sk, 484 hc->tx_x_recv <<= 6;
525 opt_recv->ccid3or_loss_intervals_idx, 485
526 opt_recv->ccid3or_loss_intervals_len);
527 break;
528 case TFRC_OPT_RECEIVE_RATE:
529 if (unlikely(len != 4)) {
530 DCCP_WARN("%s(%p), invalid len %d "
531 "for TFRC_OPT_RECEIVE_RATE\n",
532 dccp_role(sk), sk, len);
533 rc = -EINVAL;
534 } else {
535 opt_val = get_unaligned((__be32 *)value);
536 opt_recv->ccid3or_receive_rate = ntohl(opt_val);
537 ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", 486 ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
538 dccp_role(sk), sk, 487 dccp_role(sk), sk, opt_val);
539 opt_recv->ccid3or_receive_rate); 488 } else {
489 /* Update the fixpoint Loss Event Rate fraction */
490 hc->tx_p = tfrc_invert_loss_event_rate(opt_val);
491
492 ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
493 dccp_role(sk), sk, opt_val);
540 } 494 }
541 break;
542 } 495 }
543 496 return 0;
544 return rc;
545} 497}
546 498
547static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) 499static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
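[Illustrative note] The option parser above keeps Receive Rate in units of 64 bytes/second: the 32-bit wire value (bytes/second) is widened and left-shifted by 6 before being stored in X_recv. A minimal sketch of that scaling; scale_x_recv is an illustrative name:

    #include <stdint.h>

    /* Wire option carries bytes/second; the sender stores X_recv scaled by 64. */
    static uint64_t scale_x_recv(uint32_t receive_rate)
    {
            return (uint64_t)receive_rate << 6;
    }
    /* e.g. a Receive Rate option of 125000 bytes/s is stored as 8000000 */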
@@ -559,42 +511,36 @@ static void ccid3_hc_tx_exit(struct sock *sk)
559{ 511{
560 struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); 512 struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
561 513
562 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
563 sk_stop_timer(sk, &hc->tx_no_feedback_timer); 514 sk_stop_timer(sk, &hc->tx_no_feedback_timer);
564
565 tfrc_tx_hist_purge(&hc->tx_hist); 515 tfrc_tx_hist_purge(&hc->tx_hist);
566} 516}
567 517
568static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) 518static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
569{ 519{
570 struct ccid3_hc_tx_sock *hc; 520 info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto;
571 521 info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt;
572 /* Listen socks doesn't have a private CCID block */
573 if (sk->sk_state == DCCP_LISTEN)
574 return;
575
576 hc = ccid3_hc_tx_sk(sk);
577 info->tcpi_rto = hc->tx_t_rto;
578 info->tcpi_rtt = hc->tx_rtt;
579} 522}
580 523
581static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, 524static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
582 u32 __user *optval, int __user *optlen) 525 u32 __user *optval, int __user *optlen)
583{ 526{
584 const struct ccid3_hc_tx_sock *hc; 527 const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
528 struct tfrc_tx_info tfrc;
585 const void *val; 529 const void *val;
586 530
587 /* Listen socks doesn't have a private CCID block */
588 if (sk->sk_state == DCCP_LISTEN)
589 return -EINVAL;
590
591 hc = ccid3_hc_tx_sk(sk);
592 switch (optname) { 531 switch (optname) {
593 case DCCP_SOCKOPT_CCID_TX_INFO: 532 case DCCP_SOCKOPT_CCID_TX_INFO:
594 if (len < sizeof(hc->tx_tfrc)) 533 if (len < sizeof(tfrc))
595 return -EINVAL; 534 return -EINVAL;
596 len = sizeof(hc->tx_tfrc); 535 tfrc.tfrctx_x = hc->tx_x;
597 val = &hc->tx_tfrc; 536 tfrc.tfrctx_x_recv = hc->tx_x_recv;
537 tfrc.tfrctx_x_calc = hc->tx_x_calc;
538 tfrc.tfrctx_rtt = hc->tx_rtt;
539 tfrc.tfrctx_p = hc->tx_p;
540 tfrc.tfrctx_rto = hc->tx_t_rto;
541 tfrc.tfrctx_ipi = hc->tx_t_ipi;
542 len = sizeof(tfrc);
543 val = &tfrc;
598 break; 544 break;
599 default: 545 default:
600 return -ENOPROTOOPT; 546 return -ENOPROTOOPT;
@@ -624,7 +570,6 @@ static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
624 static const char *const ccid3_rx_state_names[] = { 570 static const char *const ccid3_rx_state_names[] = {
625 [TFRC_RSTATE_NO_DATA] = "NO_DATA", 571 [TFRC_RSTATE_NO_DATA] = "NO_DATA",
626 [TFRC_RSTATE_DATA] = "DATA", 572 [TFRC_RSTATE_DATA] = "DATA",
627 [TFRC_RSTATE_TERM] = "TERM",
628 }; 573 };
629 574
630 return ccid3_rx_state_names[state]; 575 return ccid3_rx_state_names[state];
@@ -650,14 +595,9 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
650{ 595{
651 struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); 596 struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
652 struct dccp_sock *dp = dccp_sk(sk); 597 struct dccp_sock *dp = dccp_sk(sk);
653 ktime_t now; 598 ktime_t now = ktime_get_real();
654 s64 delta = 0; 599 s64 delta = 0;
655 600
656 if (unlikely(hc->rx_state == TFRC_RSTATE_TERM))
657 return;
658
659 now = ktime_get_real();
660
661 switch (fbtype) { 601 switch (fbtype) {
662 case CCID3_FBACK_INITIAL: 602 case CCID3_FBACK_INITIAL:
663 hc->rx_x_recv = 0; 603 hc->rx_x_recv = 0;
@@ -701,14 +641,12 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
701 641
702static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) 642static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
703{ 643{
704 const struct ccid3_hc_rx_sock *hc; 644 const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
705 __be32 x_recv, pinv; 645 __be32 x_recv, pinv;
706 646
707 if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) 647 if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
708 return 0; 648 return 0;
709 649
710 hc = ccid3_hc_rx_sk(sk);
711
712 if (dccp_packet_without_ack(skb)) 650 if (dccp_packet_without_ack(skb))
713 return 0; 651 return 0;
714 652
@@ -749,10 +687,11 @@ static u32 ccid3_first_li(struct sock *sk)
749 x_recv = scaled_div32(hc->rx_bytes_recv, delta); 687 x_recv = scaled_div32(hc->rx_bytes_recv, delta);
750 if (x_recv == 0) { /* would also trigger divide-by-zero */ 688 if (x_recv == 0) { /* would also trigger divide-by-zero */
751 DCCP_WARN("X_recv==0\n"); 689 DCCP_WARN("X_recv==0\n");
752 if ((x_recv = hc->rx_x_recv) == 0) { 690 if (hc->rx_x_recv == 0) {
753 DCCP_BUG("stored value of X_recv is zero"); 691 DCCP_BUG("stored value of X_recv is zero");
754 return ~0U; 692 return ~0U;
755 } 693 }
694 x_recv = hc->rx_x_recv;
756 } 695 }
757 696
758 fval = scaled_div(hc->rx_s, hc->rx_rtt); 697 fval = scaled_div(hc->rx_s, hc->rx_rtt);
@@ -862,46 +801,31 @@ static void ccid3_hc_rx_exit(struct sock *sk)
862{ 801{
863 struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); 802 struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
864 803
865 ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);
866
867 tfrc_rx_hist_purge(&hc->rx_hist); 804 tfrc_rx_hist_purge(&hc->rx_hist);
868 tfrc_lh_cleanup(&hc->rx_li_hist); 805 tfrc_lh_cleanup(&hc->rx_li_hist);
869} 806}
870 807
871static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) 808static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
872{ 809{
873 const struct ccid3_hc_rx_sock *hc; 810 info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state;
874
875 /* Listen socks doesn't have a private CCID block */
876 if (sk->sk_state == DCCP_LISTEN)
877 return;
878
879 hc = ccid3_hc_rx_sk(sk);
880 info->tcpi_ca_state = hc->rx_state;
881 info->tcpi_options |= TCPI_OPT_TIMESTAMPS; 811 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
882 info->tcpi_rcv_rtt = hc->rx_rtt; 812 info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rx_rtt;
883} 813}
884 814
885static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, 815static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
886 u32 __user *optval, int __user *optlen) 816 u32 __user *optval, int __user *optlen)
887{ 817{
888 const struct ccid3_hc_rx_sock *hc; 818 const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
889 struct tfrc_rx_info rx_info; 819 struct tfrc_rx_info rx_info;
890 const void *val; 820 const void *val;
891 821
892 /* Listen socks doesn't have a private CCID block */
893 if (sk->sk_state == DCCP_LISTEN)
894 return -EINVAL;
895
896 hc = ccid3_hc_rx_sk(sk);
897 switch (optname) { 822 switch (optname) {
898 case DCCP_SOCKOPT_CCID_RX_INFO: 823 case DCCP_SOCKOPT_CCID_RX_INFO:
899 if (len < sizeof(rx_info)) 824 if (len < sizeof(rx_info))
900 return -EINVAL; 825 return -EINVAL;
901 rx_info.tfrcrx_x_recv = hc->rx_x_recv; 826 rx_info.tfrcrx_x_recv = hc->rx_x_recv;
902 rx_info.tfrcrx_rtt = hc->rx_rtt; 827 rx_info.tfrcrx_rtt = hc->rx_rtt;
903 rx_info.tfrcrx_p = hc->rx_pinv == 0 ? ~0U : 828 rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hc->rx_pinv);
904 scaled_div(1, hc->rx_pinv);
905 len = sizeof(rx_info); 829 len = sizeof(rx_info);
906 val = &rx_info; 830 val = &rx_info;
907 break; 831 break;
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 032635776653..1a9933c29672 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -42,35 +42,36 @@
42#include "lib/tfrc.h" 42#include "lib/tfrc.h"
43#include "../ccid.h" 43#include "../ccid.h"
44 44
45/* Two seconds as per RFC 3448 4.2 */ 45/* Two seconds as per RFC 5348, 4.2 */
46#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) 46#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC)
47 47
48/* In usecs - half the scheduling granularity as per RFC3448 4.6 */
49#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ))
50
51/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ 48/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
52#define TFRC_T_MBI 64 49#define TFRC_T_MBI 64
53 50
51/*
52 * The t_delta parameter (RFC 5348, 8.3): delays of less than %USEC_PER_MSEC are
53 * rounded down to 0, since sk_reset_timer() here uses millisecond granularity.
54 * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse
55 * resolution of HZ < 500 means that the error is below one timer tick (t_gran)
56 * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ).
57 */
58#if (HZ >= 500)
59# define TFRC_T_DELTA USEC_PER_MSEC
60#else
61# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ))
62#endif
63
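[Illustrative note] Worked values for the TFRC_T_DELTA rule just added, assuming USEC_PER_SEC = 1000000 and USEC_PER_MSEC = 1000 as in <linux/time.h>; tfrc_t_delta_us is an illustrative helper, not a kernel symbol:

    /* Mirrors the #if rule above; result in usec. */
    static unsigned int tfrc_t_delta_us(unsigned int hz)
    {
            return hz >= 500 ? 1000 : 1000000 / (2 * hz);
    }
    /* tfrc_t_delta_us(1000) == 1000, tfrc_t_delta_us(250) == 2000,
     * tfrc_t_delta_us(100)  == 5000                                  */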
54enum ccid3_options { 64enum ccid3_options {
55 TFRC_OPT_LOSS_EVENT_RATE = 192, 65 TFRC_OPT_LOSS_EVENT_RATE = 192,
56 TFRC_OPT_LOSS_INTERVALS = 193, 66 TFRC_OPT_LOSS_INTERVALS = 193,
57 TFRC_OPT_RECEIVE_RATE = 194, 67 TFRC_OPT_RECEIVE_RATE = 194,
58}; 68};
59 69
60struct ccid3_options_received {
61 u64 ccid3or_seqno:48,
62 ccid3or_loss_intervals_idx:16;
63 u16 ccid3or_loss_intervals_len;
64 u32 ccid3or_loss_event_rate;
65 u32 ccid3or_receive_rate;
66};
67
68/* TFRC sender states */ 70/* TFRC sender states */
69enum ccid3_hc_tx_states { 71enum ccid3_hc_tx_states {
70 TFRC_SSTATE_NO_SENT = 1, 72 TFRC_SSTATE_NO_SENT = 1,
71 TFRC_SSTATE_NO_FBACK, 73 TFRC_SSTATE_NO_FBACK,
72 TFRC_SSTATE_FBACK, 74 TFRC_SSTATE_FBACK,
73 TFRC_SSTATE_TERM,
74}; 75};
75 76
76/** 77/**
@@ -90,19 +91,16 @@ enum ccid3_hc_tx_states {
90 * @tx_no_feedback_timer: Handle to no feedback timer 91 * @tx_no_feedback_timer: Handle to no feedback timer
91 * @tx_t_ld: Time last doubled during slow start 92 * @tx_t_ld: Time last doubled during slow start
92 * @tx_t_nom: Nominal send time of next packet 93 * @tx_t_nom: Nominal send time of next packet
93 * @tx_delta: Send timer delta (RFC 3448, 4.6) in usecs
94 * @tx_hist: Packet history 94 * @tx_hist: Packet history
95 * @tx_options_received: Parsed set of retrieved options
96 */ 95 */
97struct ccid3_hc_tx_sock { 96struct ccid3_hc_tx_sock {
98 struct tfrc_tx_info tx_tfrc; 97 u64 tx_x;
99#define tx_x tx_tfrc.tfrctx_x 98 u64 tx_x_recv;
100#define tx_x_recv tx_tfrc.tfrctx_x_recv 99 u32 tx_x_calc;
101#define tx_x_calc tx_tfrc.tfrctx_x_calc 100 u32 tx_rtt;
102#define tx_rtt tx_tfrc.tfrctx_rtt 101 u32 tx_p;
103#define tx_p tx_tfrc.tfrctx_p 102 u32 tx_t_rto;
104#define tx_t_rto tx_tfrc.tfrctx_rto 103 u32 tx_t_ipi;
105#define tx_t_ipi tx_tfrc.tfrctx_ipi
106 u16 tx_s; 104 u16 tx_s;
107 enum ccid3_hc_tx_states tx_state:8; 105 enum ccid3_hc_tx_states tx_state:8;
108 u8 tx_last_win_count; 106 u8 tx_last_win_count;
@@ -110,9 +108,7 @@ struct ccid3_hc_tx_sock {
110 struct timer_list tx_no_feedback_timer; 108 struct timer_list tx_no_feedback_timer;
111 ktime_t tx_t_ld; 109 ktime_t tx_t_ld;
112 ktime_t tx_t_nom; 110 ktime_t tx_t_nom;
113 u32 tx_delta;
114 struct tfrc_tx_hist_entry *tx_hist; 111 struct tfrc_tx_hist_entry *tx_hist;
115 struct ccid3_options_received tx_options_received;
116}; 112};
117 113
118static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) 114static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
@@ -126,21 +122,16 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
126enum ccid3_hc_rx_states { 122enum ccid3_hc_rx_states {
127 TFRC_RSTATE_NO_DATA = 1, 123 TFRC_RSTATE_NO_DATA = 1,
128 TFRC_RSTATE_DATA, 124 TFRC_RSTATE_DATA,
129 TFRC_RSTATE_TERM = 127,
130}; 125};
131 126
132/** 127/**
133 * struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket 128 * struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
134 * @rx_x_recv: Receiver estimate of send rate (RFC 3448 4.3)
135 * @rx_rtt: Receiver estimate of rtt (non-standard)
136 * @rx_p: Current loss event rate (RFC 3448 5.4)
137 * @rx_last_counter: Tracks window counter (RFC 4342, 8.1) 129 * @rx_last_counter: Tracks window counter (RFC 4342, 8.1)
138 * @rx_state: Receiver state, one of %ccid3_hc_rx_states 130 * @rx_state: Receiver state, one of %ccid3_hc_rx_states
139 * @rx_bytes_recv: Total sum of DCCP payload bytes 131 * @rx_bytes_recv: Total sum of DCCP payload bytes
140 * @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 4.3) 132 * @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 4.3)
141 * @rx_rtt: Receiver estimate of RTT 133 * @rx_rtt: Receiver estimate of RTT
142 * @rx_tstamp_last_feedback: Time at which last feedback was sent 134 * @rx_tstamp_last_feedback: Time at which last feedback was sent
143 * @rx_tstamp_last_ack: Time at which last feedback was sent
144 * @rx_hist: Packet history (loss detection + RTT sampling) 135 * @rx_hist: Packet history (loss detection + RTT sampling)
145 * @rx_li_hist: Loss Interval database 136 * @rx_li_hist: Loss Interval database
146 * @rx_s: Received packet size in bytes 137 * @rx_s: Received packet size in bytes
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index 8fc3cbf79071..497723c4d4bb 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -116,7 +116,7 @@ u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
116 cur->li_length = len; 116 cur->li_length = len;
117 tfrc_lh_calc_i_mean(lh); 117 tfrc_lh_calc_i_mean(lh);
118 118
119 return (lh->i_mean < old_i_mean); 119 return lh->i_mean < old_i_mean;
120} 120}
121 121
122/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ 122/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index 3a4f414e94a0..de8fe294bf0b 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -38,18 +38,6 @@
38#include "packet_history.h" 38#include "packet_history.h"
39#include "../../dccp.h" 39#include "../../dccp.h"
40 40
41/**
42 * tfrc_tx_hist_entry - Simple singly-linked TX history list
43 * @next: next oldest entry (LIFO order)
44 * @seqno: sequence number of this entry
45 * @stamp: send time of packet with sequence number @seqno
46 */
47struct tfrc_tx_hist_entry {
48 struct tfrc_tx_hist_entry *next;
49 u64 seqno;
50 ktime_t stamp;
51};
52
53/* 41/*
54 * Transmitter History Routines 42 * Transmitter History Routines
55 */ 43 */
@@ -71,15 +59,6 @@ void tfrc_tx_packet_history_exit(void)
71 } 59 }
72} 60}
73 61
74static struct tfrc_tx_hist_entry *
75 tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
76{
77 while (head != NULL && head->seqno != seqno)
78 head = head->next;
79
80 return head;
81}
82
83int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) 62int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno)
84{ 63{
85 struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); 64 struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any());
@@ -107,24 +86,6 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp)
107 *headp = NULL; 86 *headp = NULL;
108} 87}
109 88
110u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno,
111 const ktime_t now)
112{
113 u32 rtt = 0;
114 struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno);
115
116 if (packet != NULL) {
117 rtt = ktime_us_delta(now, packet->stamp);
118 /*
119 * Garbage-collect older (irrelevant) entries:
120 */
121 tfrc_tx_hist_purge(&packet->next);
122 }
123
124 return rtt;
125}
126
127
128/* 89/*
129 * Receiver History Routines 90 * Receiver History Routines
130 */ 91 */
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index 7df6c5299999..7ee4a9d9d335 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -40,12 +40,28 @@
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include "tfrc.h" 41#include "tfrc.h"
42 42
43struct tfrc_tx_hist_entry; 43/**
44 * tfrc_tx_hist_entry - Simple singly-linked TX history list
45 * @next: next oldest entry (LIFO order)
46 * @seqno: sequence number of this entry
47 * @stamp: send time of packet with sequence number @seqno
48 */
49struct tfrc_tx_hist_entry {
50 struct tfrc_tx_hist_entry *next;
51 u64 seqno;
52 ktime_t stamp;
53};
54
55static inline struct tfrc_tx_hist_entry *
56 tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
57{
58 while (head != NULL && head->seqno != seqno)
59 head = head->next;
60 return head;
61}
44 62
45extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); 63extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
46extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); 64extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
47extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head,
48 const u64 seqno, const ktime_t now);
49 65
50/* Subtraction a-b modulo-16, respects circular wrap-around */ 66/* Subtraction a-b modulo-16, respects circular wrap-around */
51#define SUB16(a, b) (((a) + 16 - (b)) & 0xF) 67#define SUB16(a, b) (((a) + 16 - (b)) & 0xF)
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
index 01bb48e96c2e..f8ee3f549770 100644
--- a/net/dccp/ccids/lib/tfrc.h
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -57,6 +57,7 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
57 57
58extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); 58extern u32 tfrc_calc_x(u16 s, u32 R, u32 p);
59extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); 59extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
60extern u32 tfrc_invert_loss_event_rate(u32 loss_event_rate);
60 61
61extern int tfrc_tx_packet_history_init(void); 62extern int tfrc_tx_packet_history_init(void);
62extern void tfrc_tx_packet_history_exit(void); 63extern void tfrc_tx_packet_history_exit(void);
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index 22ca1cf0eb55..a052a4377e26 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -687,3 +687,17 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
687 index = tfrc_binsearch(fvalue, 0); 687 index = tfrc_binsearch(fvalue, 0);
688 return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; 688 return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
689} 689}
690
691/**
692 * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100%
693 * When @loss_event_rate is large, there is a chance that p is truncated to 0.
694 * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0.
695 */
696u32 tfrc_invert_loss_event_rate(u32 loss_event_rate)
697{
698 if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */
699 return 0;
700 if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */
701 return 1000000;
702 return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P);
703}
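[Illustrative note] A user-space sketch of the inversion added above, assuming scaled_div(a, b) computes (a * 1000000) / b; SMALLEST_P below is an illustrative floor, not the kernel's TFRC_SMALLEST_P value:

    #include <stdint.h>
    #include <limits.h>

    #define SMALLEST_P 40           /* illustrative floor, not the kernel constant */

    static uint32_t invert_loss_event_rate(uint32_t pinv)
    {
            uint32_t p;

            if (pinv == UINT_MAX)   /* "no loss seen so far", RFC 4342, 8.5 */
                    return 0;
            if (pinv == 0)          /* map 1/0 onto 100% */
                    return 1000000;
            p = 1000000 / pinv;
            return p > SMALLEST_P ? p : SMALLEST_P;
    }
    /* e.g. pinv = 100 (one loss event per 100 packets)  ->  p = 10000 = 1% */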
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 3ccef1b70fee..a8ed459508b2 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -153,18 +153,27 @@ static inline u64 max48(const u64 seq1, const u64 seq2)
153} 153}
154 154
155/** 155/**
156 * dccp_loss_free - Evaluates condition for data loss from RFC 4340, 7.7.1 156 * dccp_loss_count - Approximate the number of lost data packets in a burst loss
157 * @s1: start sequence number 157 * @s1: last known sequence number before the loss ('hole')
158 * @s2: end sequence number 158 * @s2: first sequence number seen after the 'hole'
159 * @ndp: NDP count on packet with sequence number @s2 159 * @ndp: NDP count on packet with sequence number @s2
160 * Returns true if the sequence range s1...s2 has no data loss.
161 */ 160 */
162static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp) 161static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp)
163{ 162{
164 s64 delta = dccp_delta_seqno(s1, s2); 163 s64 delta = dccp_delta_seqno(s1, s2);
165 164
166 WARN_ON(delta < 0); 165 WARN_ON(delta < 0);
167 return (u64)delta <= ndp + 1; 166 delta -= ndp + 1;
167
168 return delta > 0 ? delta : 0;
169}
170
171/**
172 * dccp_loss_free - Evaluate condition for data loss from RFC 4340, 7.7.1
173 */
174static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp)
175{
176 return dccp_loss_count(s1, s2, ndp) == 0;
168} 177}
169 178
170enum { 179enum {
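[Illustrative note] The hunk above turns the boolean loss check into a count of presumably lost packets. A minimal user-space sketch that ignores the 48-bit sequence-number wrap handled by dccp_delta_seqno() in the kernel:

    #include <stdint.h>
    #include <stdbool.h>

    static uint64_t loss_count(uint64_t s1, uint64_t s2, uint64_t ndp)
    {
            int64_t delta = (int64_t)(s2 - s1) - (int64_t)(ndp + 1);

            return delta > 0 ? (uint64_t)delta : 0;
    }

    static bool loss_free(uint64_t s1, uint64_t s2, uint64_t ndp)
    {
            return loss_count(s1, s2, ndp) == 0;
    }
    /* e.g. s1 = 100, s2 = 105, ndp = 2: two data packets are presumed lost */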
@@ -234,8 +243,9 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
234extern void dccp_send_sync(struct sock *sk, const u64 seq, 243extern void dccp_send_sync(struct sock *sk, const u64 seq,
235 const enum dccp_pkt_type pkt_type); 244 const enum dccp_pkt_type pkt_type);
236 245
237extern void dccp_write_xmit(struct sock *sk, int block); 246extern void dccp_write_xmit(struct sock *sk);
238extern void dccp_write_space(struct sock *sk); 247extern void dccp_write_space(struct sock *sk);
248extern void dccp_flush_write_queue(struct sock *sk, long *time_budget);
239 249
240extern void dccp_init_xmit_timers(struct sock *sk); 250extern void dccp_init_xmit_timers(struct sock *sk);
241static inline void dccp_clear_xmit_timers(struct sock *sk) 251static inline void dccp_clear_xmit_timers(struct sock *sk)
@@ -246,7 +256,6 @@ static inline void dccp_clear_xmit_timers(struct sock *sk)
246extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu); 256extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu);
247 257
248extern const char *dccp_packet_name(const int type); 258extern const char *dccp_packet_name(const int type);
249extern const char *dccp_state_name(const int state);
250 259
251extern void dccp_set_state(struct sock *sk, const int state); 260extern void dccp_set_state(struct sock *sk, const int state);
252extern void dccp_done(struct sock *sk); 261extern void dccp_done(struct sock *sk);
@@ -415,6 +424,23 @@ static inline void dccp_update_gsr(struct sock *sk, u64 seq)
415 dp->dccps_gsr = seq; 424 dp->dccps_gsr = seq;
416 /* Sequence validity window depends on remote Sequence Window (7.5.1) */ 425 /* Sequence validity window depends on remote Sequence Window (7.5.1) */
417 dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); 426 dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4);
427 /*
428 * Adjust SWL so that it is not below ISR. In contrast to RFC 4340,
429 * 7.5.1 we perform this check beyond the initial handshake: W/W' are
430 * always > 32, so for the first W/W' packets in the lifetime of a
431 * connection we always have to adjust SWL.
432 * A second reason why we are doing this is that the window depends on
433 * the feature-remote value of Sequence Window: nothing stops the peer
434 * from updating this value while we are busy adjusting SWL for the
435 * first W packets (we would have to count from scratch again then).
436 * Therefore it is safer to always make sure that the Sequence Window
437 * is not artificially extended by a peer who grows SWL downwards by
438 * continually updating the feature-remote Sequence-Window.
439 * If sequence numbers wrap it is bad luck. But that will take a while
440 * (48 bit), and this measure prevents Sequence-number attacks.
441 */
442 if (before48(dp->dccps_swl, dp->dccps_isr))
443 dp->dccps_swl = dp->dccps_isr;
418 dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); 444 dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4);
419} 445}
420 446
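[Illustrative note] The clamp added above enforces SWL := max(GSR + 1 - floor(W/4), ISR) on every update, not only during the handshake. A simplified sketch on plain 64-bit integers; the kernel uses the 48-bit modular helpers ADD48/SUB48/before48 instead:

    #include <stdint.h>

    static uint64_t seq_window_low(uint64_t gsr, uint64_t isr, uint64_t seq_win)
    {
            int64_t swl = (int64_t)gsr + 1 - (int64_t)(seq_win / 4);

            return swl < (int64_t)isr ? isr : (uint64_t)swl;
    }
    /* e.g. gsr = isr = 0 and seq_win = 100: the raw SWL would drop below ISR,
     * so it is pinned at ISR until roughly W/4 packets have been received.   */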
@@ -425,14 +451,16 @@ static inline void dccp_update_gss(struct sock *sk, u64 seq)
425 dp->dccps_gss = seq; 451 dp->dccps_gss = seq;
426 /* Ack validity window depends on local Sequence Window value (7.5.1) */ 452 /* Ack validity window depends on local Sequence Window value (7.5.1) */
427 dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); 453 dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win);
454 /* Adjust AWL so that it is not below ISS - see comment above for SWL */
455 if (before48(dp->dccps_awl, dp->dccps_iss))
456 dp->dccps_awl = dp->dccps_iss;
428 dp->dccps_awh = dp->dccps_gss; 457 dp->dccps_awh = dp->dccps_gss;
429} 458}
430 459
431static inline int dccp_ack_pending(const struct sock *sk) 460static inline int dccp_ack_pending(const struct sock *sk)
432{ 461{
433 const struct dccp_sock *dp = dccp_sk(sk); 462 const struct dccp_sock *dp = dccp_sk(sk);
434 return dp->dccps_timestamp_echo != 0 || 463 return (dp->dccps_hc_rx_ackvec != NULL &&
435 (dp->dccps_hc_rx_ackvec != NULL &&
436 dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) || 464 dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) ||
437 inet_csk_ack_scheduled(sk); 465 inet_csk_ack_scheduled(sk);
438} 466}
@@ -449,7 +477,6 @@ extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*);
449extern int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed); 477extern int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed);
450extern u32 dccp_timestamp(void); 478extern u32 dccp_timestamp(void);
451extern void dccp_timestamping_init(void); 479extern void dccp_timestamping_init(void);
452extern int dccp_insert_option_timestamp(struct sk_buff *skb);
453extern int dccp_insert_option(struct sk_buff *skb, unsigned char option, 480extern int dccp_insert_option(struct sk_buff *skb, unsigned char option,
454 const void *value, unsigned char len); 481 const void *value, unsigned char len);
455 482
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index df7dd26cf07e..568def952722 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -730,16 +730,6 @@ int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
730 0, list, len); 730 0, list, len);
731} 731}
732 732
733/* Analogous to dccp_feat_register_sp(), but for non-negotiable values */
734int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val)
735{
736 /* any changes must be registered before establishing the connection */
737 if (sk->sk_state != DCCP_CLOSED)
738 return -EISCONN;
739 if (dccp_feat_type(feat) != FEAT_NN)
740 return -EINVAL;
741 return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val);
742}
743 733
744/* 734/*
745 * Tracking features whose value depend on the choice of CCID 735 * Tracking features whose value depend on the choice of CCID
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index f96721619def..e56a4e5e634e 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -111,7 +111,6 @@ extern int dccp_feat_init(struct sock *sk);
111extern void dccp_feat_initialise_sysctls(void); 111extern void dccp_feat_initialise_sysctls(void);
112extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, 112extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
113 u8 const *list, u8 len); 113 u8 const *list, u8 len);
114extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val);
115extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, 114extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
116 u8 mand, u8 opt, u8 feat, u8 *val, u8 len); 115 u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
117extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); 116extern int dccp_feat_clone_list(struct list_head const *, struct list_head *);
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 10c957a88f4f..265985370fa1 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -259,7 +259,7 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
259 sysctl_dccp_sync_ratelimit))) 259 sysctl_dccp_sync_ratelimit)))
260 return 0; 260 return 0;
261 261
262 DCCP_WARN("DCCP: Step 6 failed for %s packet, " 262 DCCP_WARN("Step 6 failed for %s packet, "
263 "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and " 263 "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and "
264 "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), " 264 "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), "
265 "sending SYNC...\n", dccp_packet_name(dh->dccph_type), 265 "sending SYNC...\n", dccp_packet_name(dh->dccph_type),
@@ -441,20 +441,14 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
441 kfree_skb(sk->sk_send_head); 441 kfree_skb(sk->sk_send_head);
442 sk->sk_send_head = NULL; 442 sk->sk_send_head = NULL;
443 443
444 dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
445 dccp_update_gsr(sk, dp->dccps_isr);
446 /* 444 /*
447 * SWL and AWL are initially adjusted so that they are not less than 445 * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect
448 * the initial Sequence Numbers received and sent, respectively: 446 * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH
449 * SWL := max(GSR + 1 - floor(W/4), ISR), 447 * is done as part of activating the feature values below, since
450 * AWL := max(GSS - W' + 1, ISS). 448 * these settings depend on the local/remote Sequence Window
451 * These adjustments MUST be applied only at the beginning of the 449 * features, which were undefined or not confirmed until now.
452 * connection.
453 *
454 * AWL was adjusted in dccp_v4_connect -acme
455 */ 450 */
456 dccp_set_seqno(&dp->dccps_swl, 451 dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
457 max48(dp->dccps_swl, dp->dccps_isr));
458 452
459 dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); 453 dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
460 454
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index d4a166f0f391..3f69ea114829 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -392,7 +392,7 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
392 392
393 newsk = dccp_create_openreq_child(sk, req, skb); 393 newsk = dccp_create_openreq_child(sk, req, skb);
394 if (newsk == NULL) 394 if (newsk == NULL)
395 goto exit; 395 goto exit_nonewsk;
396 396
397 sk_setup_caps(newsk, dst); 397 sk_setup_caps(newsk, dst);
398 398
@@ -409,16 +409,20 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
409 409
410 dccp_sync_mss(newsk, dst_mtu(dst)); 410 dccp_sync_mss(newsk, dst_mtu(dst));
411 411
412 if (__inet_inherit_port(sk, newsk) < 0) {
413 sock_put(newsk);
414 goto exit;
415 }
412 __inet_hash_nolisten(newsk, NULL); 416 __inet_hash_nolisten(newsk, NULL);
413 __inet_inherit_port(sk, newsk);
414 417
415 return newsk; 418 return newsk;
416 419
417exit_overflow: 420exit_overflow:
418 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 421 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
422exit_nonewsk:
423 dst_release(dst);
419exit: 424exit:
420 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); 425 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
421 dst_release(dst);
422 return NULL; 426 return NULL;
423} 427}
424 428
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 6e3f32575df7..dca711df9b60 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -564,7 +564,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
564 564
565 newsk = dccp_create_openreq_child(sk, req, skb); 565 newsk = dccp_create_openreq_child(sk, req, skb);
566 if (newsk == NULL) 566 if (newsk == NULL)
567 goto out; 567 goto out_nonewsk;
568 568
569 /* 569 /*
570 * No need to charge this sock to the relevant IPv6 refcnt debug socks 570 * No need to charge this sock to the relevant IPv6 refcnt debug socks
@@ -632,18 +632,22 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
632 newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; 632 newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
633 newinet->inet_rcv_saddr = LOOPBACK4_IPV6; 633 newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
634 634
635 if (__inet_inherit_port(sk, newsk) < 0) {
636 sock_put(newsk);
637 goto out;
638 }
635 __inet6_hash(newsk, NULL); 639 __inet6_hash(newsk, NULL);
636 __inet_inherit_port(sk, newsk);
637 640
638 return newsk; 641 return newsk;
639 642
640out_overflow: 643out_overflow:
641 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 644 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
645out_nonewsk:
646 dst_release(dst);
642out: 647out:
643 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); 648 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
644 if (opt != NULL && opt != np->opt) 649 if (opt != NULL && opt != np->opt)
645 sock_kfree_s(sk, opt, opt->tot_len); 650 sock_kfree_s(sk, opt, opt->tot_len);
646 dst_release(dst);
647 return NULL; 651 return NULL;
648} 652}
649 653
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 128b089d3aef..d7041a0963af 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -121,30 +121,18 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
121 * 121 *
122 * Choose S.ISS (initial seqno) or set from Init Cookies 122 * Choose S.ISS (initial seqno) or set from Init Cookies
123 * Initialize S.GAR := S.ISS 123 * Initialize S.GAR := S.ISS
124 * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies 124 * Set S.ISR, S.GSR from packet (or Init Cookies)
125 */ 125 *
126 newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss; 126 * Setting AWL/AWH and SWL/SWH happens as part of the feature
127 dccp_update_gss(newsk, dreq->dreq_iss); 127 * activation below, as these windows all depend on the local
128 128 * and remote Sequence Window feature values (7.5.2).
129 newdp->dccps_isr = dreq->dreq_isr;
130 dccp_update_gsr(newsk, dreq->dreq_isr);
131
132 /*
133 * SWL and AWL are initially adjusted so that they are not less than
134 * the initial Sequence Numbers received and sent, respectively:
135 * SWL := max(GSR + 1 - floor(W/4), ISR),
136 * AWL := max(GSS - W' + 1, ISS).
137 * These adjustments MUST be applied only at the beginning of the
138 * connection.
139 */ 129 */
140 dccp_set_seqno(&newdp->dccps_swl, 130 newdp->dccps_gss = newdp->dccps_iss = dreq->dreq_iss;
141 max48(newdp->dccps_swl, newdp->dccps_isr)); 131 newdp->dccps_gar = newdp->dccps_iss;
142 dccp_set_seqno(&newdp->dccps_awl, 132 newdp->dccps_gsr = newdp->dccps_isr = dreq->dreq_isr;
143 max48(newdp->dccps_awl, newdp->dccps_iss));
144 133
145 /* 134 /*
146 * Activate features after initialising the sequence numbers, 135 * Activate features: initialise CCIDs, sequence windows etc.
147 * since CCID initialisation may depend on GSS, ISR, ISS etc.
148 */ 136 */
149 if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { 137 if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) {
150 /* It is still raw copy of parent, so invalidate 138 /* It is still raw copy of parent, so invalidate
diff --git a/net/dccp/options.c b/net/dccp/options.c
index bfda087bd90d..cd3061813009 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -96,18 +96,11 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
96 } 96 }
97 97
98 /* 98 /*
99 * CCID-Specific Options (from RFC 4340, sec. 10.3):
100 *
101 * Option numbers 128 through 191 are for options sent from the
102 * HC-Sender to the HC-Receiver; option numbers 192 through 255
103 * are for options sent from the HC-Receiver to the HC-Sender.
104 *
105 * CCID-specific options are ignored during connection setup, as 99 * CCID-specific options are ignored during connection setup, as
106 * negotiation may still be in progress (see RFC 4340, 10.3). 100 * negotiation may still be in progress (see RFC 4340, 10.3).
107 * The same applies to Ack Vectors, as these depend on the CCID. 101 * The same applies to Ack Vectors, as these depend on the CCID.
108 *
109 */ 102 */
110 if (dreq != NULL && (opt >= 128 || 103 if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC ||
111 opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) 104 opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1))
112 goto ignore_option; 105 goto ignore_option;
113 106
@@ -170,6 +163,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
170 dccp_role(sk), ntohl(opt_val), 163 dccp_role(sk), ntohl(opt_val),
171 (unsigned long long) 164 (unsigned long long)
172 DCCP_SKB_CB(skb)->dccpd_ack_seq); 165 DCCP_SKB_CB(skb)->dccpd_ack_seq);
166 /* schedule an Ack in case this sender is quiescent */
167 inet_csk_schedule_ack(sk);
173 break; 168 break;
174 case DCCPO_TIMESTAMP_ECHO: 169 case DCCPO_TIMESTAMP_ECHO:
175 if (len != 4 && len != 6 && len != 8) 170 if (len != 4 && len != 6 && len != 8)
@@ -226,23 +221,15 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
226 dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", 221 dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n",
227 dccp_role(sk), elapsed_time); 222 dccp_role(sk), elapsed_time);
228 break; 223 break;
229 case 128 ... 191: { 224 case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC:
230 const u16 idx = value - options;
231
232 if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, 225 if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
233 opt, len, idx, 226 pkt_type, opt, value, len))
234 value) != 0)
235 goto out_invalid_option; 227 goto out_invalid_option;
236 }
237 break; 228 break;
238 case 192 ... 255: { 229 case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC:
239 const u16 idx = value - options;
240
241 if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, 230 if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
242 opt, len, idx, 231 pkt_type, opt, value, len))
243 value) != 0)
244 goto out_invalid_option; 232 goto out_invalid_option;
245 }
246 break; 233 break;
247 default: 234 default:
248 DCCP_CRIT("DCCP(%p): option %d(len=%d) not " 235 DCCP_CRIT("DCCP(%p): option %d(len=%d) not "
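[Illustrative note] The hunk above replaces the literal case ranges 128 ... 191 and 192 ... 255 with named constants; the presumed numbering, matching the removed RFC 4340, 10.3 comment, is sketched below (names shortened, values inferred from the replaced cases):

    enum {
            MIN_RX_CCID_SPECIFIC = 128,     /* sent by the HC-sender, parsed by the RX CCID */
            MAX_RX_CCID_SPECIFIC = 191,
            MIN_TX_CCID_SPECIFIC = 192,     /* sent by the HC-receiver, parsed by the TX CCID */
            MAX_TX_CCID_SPECIFIC = 255,
    };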
@@ -384,7 +371,7 @@ int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed_time)
384 371
385EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time); 372EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time);
386 373
387int dccp_insert_option_timestamp(struct sk_buff *skb) 374static int dccp_insert_option_timestamp(struct sk_buff *skb)
388{ 375{
389 __be32 now = htonl(dccp_timestamp()); 376 __be32 now = htonl(dccp_timestamp());
390 /* yes this will overflow but that is the point as we want a 377 /* yes this will overflow but that is the point as we want a
@@ -393,8 +380,6 @@ int dccp_insert_option_timestamp(struct sk_buff *skb)
393 return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now)); 380 return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now));
394} 381}
395 382
396EXPORT_SYMBOL_GPL(dccp_insert_option_timestamp);
397
398static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp, 383static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
399 struct dccp_request_sock *dreq, 384 struct dccp_request_sock *dreq,
400 struct sk_buff *skb) 385 struct sk_buff *skb)
diff --git a/net/dccp/output.c b/net/dccp/output.c
index aadbdb58758b..45b91853f5ae 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -209,108 +209,150 @@ void dccp_write_space(struct sock *sk)
209} 209}
210 210
211/** 211/**
212 * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet 212 * dccp_wait_for_ccid - Await CCID send permission
213 * @sk: socket to wait for 213 * @sk: socket to wait for
214 * @skb: current skb to pass on for waiting 214 * @delay: timeout in jiffies
215 * @delay: sleep timeout in milliseconds (> 0) 215 * This is used by CCIDs which need to delay the send time in process context.
216 * This function is called by default when the socket is closed, and
217 * when a non-zero linger time is set on the socket. For consistency
218 */ 216 */
219static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay) 217static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay)
220{ 218{
221 struct dccp_sock *dp = dccp_sk(sk);
222 DEFINE_WAIT(wait); 219 DEFINE_WAIT(wait);
223 unsigned long jiffdelay; 220 long remaining;
224 int rc; 221
222 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
223 sk->sk_write_pending++;
224 release_sock(sk);
225
226 remaining = schedule_timeout(delay);
227
228 lock_sock(sk);
229 sk->sk_write_pending--;
230 finish_wait(sk_sleep(sk), &wait);
231
232 if (signal_pending(current) || sk->sk_err)
233 return -1;
234 return remaining;
235}
236
237/**
238 * dccp_xmit_packet - Send data packet under control of CCID
239 * Transmits next-queued payload and informs CCID to account for the packet.
240 */
241static void dccp_xmit_packet(struct sock *sk)
242{
243 int err, len;
244 struct dccp_sock *dp = dccp_sk(sk);
245 struct sk_buff *skb = skb_dequeue(&sk->sk_write_queue);
246
247 if (unlikely(skb == NULL))
248 return;
249 len = skb->len;
225 250
226 do { 251 if (sk->sk_state == DCCP_PARTOPEN) {
227 dccp_pr_debug("delayed send by %d msec\n", delay); 252 const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD;
228 jiffdelay = msecs_to_jiffies(delay); 253 /*
254 * See 8.1.5 - Handshake Completion.
255 *
256 * For robustness we resend Confirm options until the client has
257 * entered OPEN. During the initial feature negotiation, the MPS
258 * is smaller than usual, reduced by the Change/Confirm options.
259 */
260 if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
261 DCCP_WARN("Payload too large (%d) for featneg.\n", len);
262 dccp_send_ack(sk);
263 dccp_feat_list_purge(&dp->dccps_featneg);
264 }
229 265
230 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 266 inet_csk_schedule_ack(sk);
267 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
268 inet_csk(sk)->icsk_rto,
269 DCCP_RTO_MAX);
270 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
271 } else if (dccp_ack_pending(sk)) {
272 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
273 } else {
274 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA;
275 }
231 276
232 sk->sk_write_pending++; 277 err = dccp_transmit_skb(sk, skb);
233 release_sock(sk); 278 if (err)
234 schedule_timeout(jiffdelay); 279 dccp_pr_debug("transmit_skb() returned err=%d\n", err);
235 lock_sock(sk); 280 /*
236 sk->sk_write_pending--; 281 * Register this one as sent even if an error occurred. To the remote
282 * end a local packet drop is indistinguishable from network loss, i.e.
283 * any local drop will eventually be reported via receiver feedback.
284 */
285 ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len);
286}
237 287
238 if (sk->sk_err) 288/**
239 goto do_error; 289 * dccp_flush_write_queue - Drain queue at end of connection
240 if (signal_pending(current)) 290 * Since dccp_sendmsg queues packets without waiting for them to be sent, it may
241 goto do_interrupted; 291 * happen that the TX queue is not empty at the end of a connection. We give the
292 * HC-sender CCID a grace period of up to @time_budget jiffies. If this function
293 * returns with a non-empty write queue, it will be purged later.
294 */
295void dccp_flush_write_queue(struct sock *sk, long *time_budget)
296{
297 struct dccp_sock *dp = dccp_sk(sk);
298 struct sk_buff *skb;
299 long delay, rc;
242 300
301 while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) {
243 rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); 302 rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
244 } while ((delay = rc) > 0); 303
245out: 304 switch (ccid_packet_dequeue_eval(rc)) {
246 finish_wait(sk_sleep(sk), &wait); 305 case CCID_PACKET_WILL_DEQUEUE_LATER:
247 return rc; 306 /*
248 307 * If the CCID determines when to send, the next sending
249do_error: 308 * time is unknown or the CCID may not even send again
250 rc = -EPIPE; 309 * (e.g. remote host crashes or lost Ack packets).
251 goto out; 310 */
252do_interrupted: 311 DCCP_WARN("CCID did not manage to send all packets\n");
253 rc = -EINTR; 312 return;
254 goto out; 313 case CCID_PACKET_DELAY:
314 delay = msecs_to_jiffies(rc);
315 if (delay > *time_budget)
316 return;
317 rc = dccp_wait_for_ccid(sk, delay);
318 if (rc < 0)
319 return;
320 *time_budget -= (delay - rc);
321 /* check again if we can send now */
322 break;
323 case CCID_PACKET_SEND_AT_ONCE:
324 dccp_xmit_packet(sk);
325 break;
326 case CCID_PACKET_ERR:
327 skb_dequeue(&sk->sk_write_queue);
328 kfree_skb(skb);
329 dccp_pr_debug("packet discarded due to err=%ld\n", rc);
330 }
331 }
255} 332}
256 333
257void dccp_write_xmit(struct sock *sk, int block) 334void dccp_write_xmit(struct sock *sk)
258{ 335{
259 struct dccp_sock *dp = dccp_sk(sk); 336 struct dccp_sock *dp = dccp_sk(sk);
260 struct sk_buff *skb; 337 struct sk_buff *skb;
261 338
262 while ((skb = skb_peek(&sk->sk_write_queue))) { 339 while ((skb = skb_peek(&sk->sk_write_queue))) {
263 int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); 340 int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
264
265 if (err > 0) {
266 if (!block) {
267 sk_reset_timer(sk, &dp->dccps_xmit_timer,
268 msecs_to_jiffies(err)+jiffies);
269 break;
270 } else
271 err = dccp_wait_for_ccid(sk, skb, err);
272 if (err && err != -EINTR)
273 DCCP_BUG("err=%d after dccp_wait_for_ccid", err);
274 }
275 341
276 skb_dequeue(&sk->sk_write_queue); 342 switch (ccid_packet_dequeue_eval(rc)) {
277 if (err == 0) { 343 case CCID_PACKET_WILL_DEQUEUE_LATER:
278 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); 344 return;
279 const int len = skb->len; 345 case CCID_PACKET_DELAY:
280 346 sk_reset_timer(sk, &dp->dccps_xmit_timer,
281 if (sk->sk_state == DCCP_PARTOPEN) { 347 jiffies + msecs_to_jiffies(rc));
282 const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; 348 return;
283 /* 349 case CCID_PACKET_SEND_AT_ONCE:
284 * See 8.1.5 - Handshake Completion. 350 dccp_xmit_packet(sk);
285 * 351 break;
286 * For robustness we resend Confirm options until the client has 352 case CCID_PACKET_ERR:
287 * entered OPEN. During the initial feature negotiation, the MPS 353 skb_dequeue(&sk->sk_write_queue);
288 * is smaller than usual, reduced by the Change/Confirm options.
289 */
290 if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
291 DCCP_WARN("Payload too large (%d) for featneg.\n", len);
292 dccp_send_ack(sk);
293 dccp_feat_list_purge(&dp->dccps_featneg);
294 }
295
296 inet_csk_schedule_ack(sk);
297 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
298 inet_csk(sk)->icsk_rto,
299 DCCP_RTO_MAX);
300 dcb->dccpd_type = DCCP_PKT_DATAACK;
301 } else if (dccp_ack_pending(sk))
302 dcb->dccpd_type = DCCP_PKT_DATAACK;
303 else
304 dcb->dccpd_type = DCCP_PKT_DATA;
305
306 err = dccp_transmit_skb(sk, skb);
307 ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len);
308 if (err)
309 DCCP_BUG("err=%d after ccid_hc_tx_packet_sent",
310 err);
311 } else {
312 dccp_pr_debug("packet discarded due to err=%d\n", err);
313 kfree_skb(skb); 354 kfree_skb(skb);
355 dccp_pr_debug("packet discarded due to err=%d\n", rc);
314 } 356 }
315 } 357 }
316} 358}
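[Illustrative note] Both dccp_write_xmit() and dccp_flush_write_queue() above dispatch on ccid_packet_dequeue_eval(). One plausible shape of that evaluator, given that CCID send_packet() returns a millisecond delay bounded by t_mbi = 64000 msec; the enum names and numeric ranges below are assumptions, the real definitions live in net/dccp/ccid.h:

    enum pkt_verdict {
            PKT_SEND_AT_ONCE,
            PKT_DELAY,
            PKT_WILL_DEQUEUE_LATER,
            PKT_ERR,
    };

    /* Hypothetical evaluator; delay_max_ms would be t_mbi = 64000. */
    static enum pkt_verdict packet_dequeue_eval(int rc, int delay_max_ms)
    {
            if (rc < 0)
                    return PKT_ERR;                 /* discard the queued packet   */
            if (rc == 0)
                    return PKT_SEND_AT_ONCE;        /* CCID grants immediate send  */
            if (rc <= delay_max_ms)
                    return PKT_DELAY;               /* rc is a delay in msec       */
            return PKT_WILL_DEQUEUE_LATER;          /* CCID will trigger TX itself */
    }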
@@ -474,8 +516,9 @@ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code)
474/* 516/*
475 * Do all connect socket setups that can be done AF independent. 517 * Do all connect socket setups that can be done AF independent.
476 */ 518 */
477static inline void dccp_connect_init(struct sock *sk) 519int dccp_connect(struct sock *sk)
478{ 520{
521 struct sk_buff *skb;
479 struct dccp_sock *dp = dccp_sk(sk); 522 struct dccp_sock *dp = dccp_sk(sk);
480 struct dst_entry *dst = __sk_dst_get(sk); 523 struct dst_entry *dst = __sk_dst_get(sk);
481 struct inet_connection_sock *icsk = inet_csk(sk); 524 struct inet_connection_sock *icsk = inet_csk(sk);
@@ -485,22 +528,12 @@ static inline void dccp_connect_init(struct sock *sk)
485 528
486 dccp_sync_mss(sk, dst_mtu(dst)); 529 dccp_sync_mss(sk, dst_mtu(dst));
487 530
488 /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
489 dp->dccps_gar = dp->dccps_iss;
490
491 icsk->icsk_retransmits = 0;
492}
493
494int dccp_connect(struct sock *sk)
495{
496 struct sk_buff *skb;
497 struct inet_connection_sock *icsk = inet_csk(sk);
498
499 /* do not connect if feature negotiation setup fails */ 531 /* do not connect if feature negotiation setup fails */
500 if (dccp_feat_finalise_settings(dccp_sk(sk))) 532 if (dccp_feat_finalise_settings(dccp_sk(sk)))
501 return -EPROTO; 533 return -EPROTO;
502 534
503 dccp_connect_init(sk); 535 /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
536 dp->dccps_gar = dp->dccps_iss;
504 537
505 skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); 538 skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation);
506 if (unlikely(skb == NULL)) 539 if (unlikely(skb == NULL))
@@ -516,6 +549,7 @@ int dccp_connect(struct sock *sk)
516 DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); 549 DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
517 550
518 /* Timer for repeating the REQUEST until an answer. */ 551 /* Timer for repeating the REQUEST until an answer. */
552 icsk->icsk_retransmits = 0;
519 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 553 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
520 icsk->icsk_rto, DCCP_RTO_MAX); 554 icsk->icsk_rto, DCCP_RTO_MAX);
521 return 0; 555 return 0;
@@ -630,7 +664,6 @@ void dccp_send_close(struct sock *sk, const int active)
630 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; 664 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE;
631 665
632 if (active) { 666 if (active) {
633 dccp_write_xmit(sk, 1);
634 dccp_skb_entail(sk, skb); 667 dccp_skb_entail(sk, skb);
635 dccp_transmit_skb(sk, skb_clone(skb, prio)); 668 dccp_transmit_skb(sk, skb_clone(skb, prio));
636 /* 669 /*
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
index 078e48d442fd..33d0e6297c21 100644
--- a/net/dccp/probe.c
+++ b/net/dccp/probe.c
@@ -149,6 +149,7 @@ static const struct file_operations dccpprobe_fops = {
149 .owner = THIS_MODULE, 149 .owner = THIS_MODULE,
150 .open = dccpprobe_open, 150 .open = dccpprobe_open,
151 .read = dccpprobe_read, 151 .read = dccpprobe_read,
152 .llseek = noop_llseek,
152}; 153};
153 154
154static __init int dccpprobe_init(void) 155static __init int dccpprobe_init(void)
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 096250d1323b..ef343d53fcea 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -50,6 +50,30 @@ EXPORT_SYMBOL_GPL(dccp_hashinfo);
50/* the maximum queue length for tx in packets. 0 is no limit */ 50/* the maximum queue length for tx in packets. 0 is no limit */
51int sysctl_dccp_tx_qlen __read_mostly = 5; 51int sysctl_dccp_tx_qlen __read_mostly = 5;
52 52
53#ifdef CONFIG_IP_DCCP_DEBUG
54static const char *dccp_state_name(const int state)
55{
56 static const char *const dccp_state_names[] = {
57 [DCCP_OPEN] = "OPEN",
58 [DCCP_REQUESTING] = "REQUESTING",
59 [DCCP_PARTOPEN] = "PARTOPEN",
60 [DCCP_LISTEN] = "LISTEN",
61 [DCCP_RESPOND] = "RESPOND",
62 [DCCP_CLOSING] = "CLOSING",
63 [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ",
64 [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE",
65 [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ",
66 [DCCP_TIME_WAIT] = "TIME_WAIT",
67 [DCCP_CLOSED] = "CLOSED",
68 };
69
70 if (state >= DCCP_MAX_STATES)
71 return "INVALID STATE!";
72 else
73 return dccp_state_names[state];
74}
75#endif
76
53void dccp_set_state(struct sock *sk, const int state) 77void dccp_set_state(struct sock *sk, const int state)
54{ 78{
55 const int oldstate = sk->sk_state; 79 const int oldstate = sk->sk_state;
@@ -146,30 +170,6 @@ const char *dccp_packet_name(const int type)
146 170
147EXPORT_SYMBOL_GPL(dccp_packet_name); 171EXPORT_SYMBOL_GPL(dccp_packet_name);
148 172
149const char *dccp_state_name(const int state)
150{
151 static const char *const dccp_state_names[] = {
152 [DCCP_OPEN] = "OPEN",
153 [DCCP_REQUESTING] = "REQUESTING",
154 [DCCP_PARTOPEN] = "PARTOPEN",
155 [DCCP_LISTEN] = "LISTEN",
156 [DCCP_RESPOND] = "RESPOND",
157 [DCCP_CLOSING] = "CLOSING",
158 [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ",
159 [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE",
160 [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ",
161 [DCCP_TIME_WAIT] = "TIME_WAIT",
162 [DCCP_CLOSED] = "CLOSED",
163 };
164
165 if (state >= DCCP_MAX_STATES)
166 return "INVALID STATE!";
167 else
168 return dccp_state_names[state];
169}
170
171EXPORT_SYMBOL_GPL(dccp_state_name);
172
173int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) 173int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
174{ 174{
175 struct dccp_sock *dp = dccp_sk(sk); 175 struct dccp_sock *dp = dccp_sk(sk);
@@ -726,7 +726,13 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
726 goto out_discard; 726 goto out_discard;
727 727
728 skb_queue_tail(&sk->sk_write_queue, skb); 728 skb_queue_tail(&sk->sk_write_queue, skb);
729 dccp_write_xmit(sk,0); 729 /*
730 * The xmit_timer is set if the TX CCID is rate-based and will expire
731 * when congestion control permits to release further packets into the
732 * network. Window-based CCIDs do not use this timer.
733 */
734 if (!timer_pending(&dp->dccps_xmit_timer))
735 dccp_write_xmit(sk);
730out_release: 736out_release:
731 release_sock(sk); 737 release_sock(sk);
732 return rc ? : len; 738 return rc ? : len;
@@ -944,16 +950,29 @@ void dccp_close(struct sock *sk, long timeout)
944 950
945 if (data_was_unread) { 951 if (data_was_unread) {
946 /* Unread data was tossed, send an appropriate Reset Code */ 952 /* Unread data was tossed, send an appropriate Reset Code */
947 DCCP_WARN("DCCP: ABORT -- %u bytes unread\n", data_was_unread); 953 DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread);
948 dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); 954 dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
949 dccp_set_state(sk, DCCP_CLOSED); 955 dccp_set_state(sk, DCCP_CLOSED);
950 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { 956 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
951 /* Check zero linger _after_ checking for unread data. */ 957 /* Check zero linger _after_ checking for unread data. */
952 sk->sk_prot->disconnect(sk, 0); 958 sk->sk_prot->disconnect(sk, 0);
953 } else if (sk->sk_state != DCCP_CLOSED) { 959 } else if (sk->sk_state != DCCP_CLOSED) {
960 /*
961 * Normal connection termination. May need to wait if there are
962 * still packets in the TX queue that are delayed by the CCID.
963 */
964 dccp_flush_write_queue(sk, &timeout);
954 dccp_terminate_connection(sk); 965 dccp_terminate_connection(sk);
955 } 966 }
956 967
968 /*
969 * Flush write queue. This may be necessary in several cases:
970 * - we have been closed by the peer but still have application data;
971 * - abortive termination (unread data or zero linger time),
972 * - normal termination but queue could not be flushed within time limit
973 */
974 __skb_queue_purge(&sk->sk_write_queue);
975
957 sk_stream_wait_close(sk, timeout); 976 sk_stream_wait_close(sk, timeout);
958 977
959adjudge_to_death: 978adjudge_to_death:
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index 1a9aa05d4dc4..7587870b7040 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -237,32 +237,35 @@ out:
237 sock_put(sk); 237 sock_put(sk);
238} 238}
239 239
240/* Transmit-delay timer: used by the CCIDs to delay actual send time */ 240/**
241static void dccp_write_xmit_timer(unsigned long data) 241 * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface
242 * See the comments above %ccid_dequeueing_decision for supported modes.
243 */
244static void dccp_write_xmitlet(unsigned long data)
242{ 245{
243 struct sock *sk = (struct sock *)data; 246 struct sock *sk = (struct sock *)data;
244 struct dccp_sock *dp = dccp_sk(sk);
245 247
246 bh_lock_sock(sk); 248 bh_lock_sock(sk);
247 if (sock_owned_by_user(sk)) 249 if (sock_owned_by_user(sk))
248 sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1); 250 sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1);
249 else 251 else
250 dccp_write_xmit(sk, 0); 252 dccp_write_xmit(sk);
251 bh_unlock_sock(sk); 253 bh_unlock_sock(sk);
252 sock_put(sk);
253} 254}
254 255
255static void dccp_init_write_xmit_timer(struct sock *sk) 256static void dccp_write_xmit_timer(unsigned long data)
256{ 257{
257 struct dccp_sock *dp = dccp_sk(sk); 258 dccp_write_xmitlet(data);
258 259 sock_put((struct sock *)data);
259 setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
260 (unsigned long)sk);
261} 260}
262 261
263void dccp_init_xmit_timers(struct sock *sk) 262void dccp_init_xmit_timers(struct sock *sk)
264{ 263{
265 dccp_init_write_xmit_timer(sk); 264 struct dccp_sock *dp = dccp_sk(sk);
265
266 tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk);
267 setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
268 (unsigned long)sk);
266 inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, 269 inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
267 &dccp_keepalive_timer); 270 &dccp_keepalive_timer);
268} 271}
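The net/dccp/timer.c hunk above splits the old write-xmit timer into a tasklet body (dccp_write_xmitlet) plus a thin timer wrapper, so the same dequeueing code can be driven either by tasklet_schedule() or by the delay timer armed for rate-based CCIDs. A minimal sketch of that trigger split, using hypothetical my_* names rather than the real DCCP structures (this is not part of the patch):

#include <linux/interrupt.h>
#include <net/sock.h>

/*
 * Sketch only (my_sock/my_sk are assumed): rate-based CCIDs arm the
 * delay timer, everything else kicks the tasklet so the shared xmitlet
 * body runs in softirq context.
 */
static void my_kick_tx(struct sock *sk, long delay_ms)
{
	struct my_sock *msk = my_sk(sk);	/* hypothetical container */

	if (delay_ms > 0)
		sk_reset_timer(sk, &msk->xmit_timer,
			       jiffies + msecs_to_jiffies(delay_ms));
	else
		tasklet_schedule(&msk->xmitlet);
}

Both paths end up in the xmitlet; only the timer path needs the extra sock_put() in its wrapper, because sk_reset_timer() takes a socket reference that must be dropped when the timer fires.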
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index 0363bb95cc7d..a085dbcf5c7f 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -48,7 +48,6 @@
48#include <net/dn_neigh.h> 48#include <net/dn_neigh.h>
49#include <net/dn_route.h> 49#include <net/dn_route.h>
50 50
51static u32 dn_neigh_hash(const void *pkey, const struct net_device *dev);
52static int dn_neigh_construct(struct neighbour *); 51static int dn_neigh_construct(struct neighbour *);
53static void dn_long_error_report(struct neighbour *, struct sk_buff *); 52static void dn_long_error_report(struct neighbour *, struct sk_buff *);
54static void dn_short_error_report(struct neighbour *, struct sk_buff *); 53static void dn_short_error_report(struct neighbour *, struct sk_buff *);
@@ -93,6 +92,13 @@ static const struct neigh_ops dn_phase3_ops = {
93 .queue_xmit = dev_queue_xmit 92 .queue_xmit = dev_queue_xmit
94}; 93};
95 94
95static u32 dn_neigh_hash(const void *pkey,
96 const struct net_device *dev,
97 __u32 hash_rnd)
98{
99 return jhash_2words(*(__u16 *)pkey, 0, hash_rnd);
100}
101
96struct neigh_table dn_neigh_table = { 102struct neigh_table dn_neigh_table = {
97 .family = PF_DECnet, 103 .family = PF_DECnet,
98 .entry_size = sizeof(struct dn_neigh), 104 .entry_size = sizeof(struct dn_neigh),
@@ -122,11 +128,6 @@ struct neigh_table dn_neigh_table = {
122 .gc_thresh3 = 1024, 128 .gc_thresh3 = 1024,
123}; 129};
124 130
125static u32 dn_neigh_hash(const void *pkey, const struct net_device *dev)
126{
127 return jhash_2words(*(__u16 *)pkey, 0, dn_neigh_table.hash_rnd);
128}
129
130static int dn_neigh_construct(struct neighbour *neigh) 131static int dn_neigh_construct(struct neighbour *neigh)
131{ 132{
132 struct net_device *dev = neigh->dev; 133 struct net_device *dev = neigh->dev;
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
index baeb1eaf011b..2ef115277bea 100644
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -693,22 +693,22 @@ void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg)
693 aux = scp->accessdata.acc_userl; 693 aux = scp->accessdata.acc_userl;
694 *skb_put(skb, 1) = aux; 694 *skb_put(skb, 1) = aux;
695 if (aux > 0) 695 if (aux > 0)
696 memcpy(skb_put(skb, aux), scp->accessdata.acc_user, aux); 696 memcpy(skb_put(skb, aux), scp->accessdata.acc_user, aux);
697 697
698 aux = scp->accessdata.acc_passl; 698 aux = scp->accessdata.acc_passl;
699 *skb_put(skb, 1) = aux; 699 *skb_put(skb, 1) = aux;
700 if (aux > 0) 700 if (aux > 0)
701 memcpy(skb_put(skb, aux), scp->accessdata.acc_pass, aux); 701 memcpy(skb_put(skb, aux), scp->accessdata.acc_pass, aux);
702 702
703 aux = scp->accessdata.acc_accl; 703 aux = scp->accessdata.acc_accl;
704 *skb_put(skb, 1) = aux; 704 *skb_put(skb, 1) = aux;
705 if (aux > 0) 705 if (aux > 0)
706 memcpy(skb_put(skb, aux), scp->accessdata.acc_acc, aux); 706 memcpy(skb_put(skb, aux), scp->accessdata.acc_acc, aux);
707 707
708 aux = (__u8)le16_to_cpu(scp->conndata_out.opt_optl); 708 aux = (__u8)le16_to_cpu(scp->conndata_out.opt_optl);
709 *skb_put(skb, 1) = aux; 709 *skb_put(skb, 1) = aux;
710 if (aux > 0) 710 if (aux > 0)
711 memcpy(skb_put(skb,aux), scp->conndata_out.opt_data, aux); 711 memcpy(skb_put(skb, aux), scp->conndata_out.opt_data, aux);
712 712
713 scp->persist = dn_nsp_persist(sk); 713 scp->persist = dn_nsp_persist(sk);
714 scp->persist_fxn = dn_nsp_retrans_conninit; 714 scp->persist_fxn = dn_nsp_retrans_conninit;
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 6585ea6d1182..df0f3e54ff8a 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -132,7 +132,6 @@ static struct dst_ops dn_dst_ops = {
132 .negative_advice = dn_dst_negative_advice, 132 .negative_advice = dn_dst_negative_advice,
133 .link_failure = dn_dst_link_failure, 133 .link_failure = dn_dst_link_failure,
134 .update_pmtu = dn_dst_update_pmtu, 134 .update_pmtu = dn_dst_update_pmtu,
135 .entries = ATOMIC_INIT(0),
136}; 135};
137 136
138static __inline__ unsigned dn_hash(__le16 src, __le16 dst) 137static __inline__ unsigned dn_hash(__le16 src, __le16 dst)
@@ -1758,6 +1757,7 @@ void __init dn_route_init(void)
1758 dn_dst_ops.kmem_cachep = 1757 dn_dst_ops.kmem_cachep =
1759 kmem_cache_create("dn_dst_cache", sizeof(struct dn_route), 0, 1758 kmem_cache_create("dn_dst_cache", sizeof(struct dn_route), 0,
1760 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1759 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1760 dst_entries_init(&dn_dst_ops);
1761 setup_timer(&dn_route_timer, dn_dst_check_expire, 0); 1761 setup_timer(&dn_route_timer, dn_dst_check_expire, 0);
1762 dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ; 1762 dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ;
1763 add_timer(&dn_route_timer); 1763 add_timer(&dn_route_timer);
@@ -1816,5 +1816,6 @@ void __exit dn_route_cleanup(void)
1816 dn_run_flush(0); 1816 dn_run_flush(0);
1817 1817
1818 proc_net_remove(&init_net, "decnet_cache"); 1818 proc_net_remove(&init_net, "decnet_cache");
1819 dst_entries_destroy(&dn_dst_ops);
1819} 1820}
1820 1821
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index dc54bd0d083b..f8c1ae4b41f0 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -392,7 +392,7 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock,
392 dev_queue_xmit(skb); 392 dev_queue_xmit(skb);
393 dev_put(dev); 393 dev_put(dev);
394 mutex_unlock(&econet_mutex); 394 mutex_unlock(&econet_mutex);
395 return(len); 395 return len;
396 396
397 out_free: 397 out_free:
398 kfree_skb(skb); 398 kfree_skb(skb);
@@ -637,7 +637,7 @@ static int econet_create(struct net *net, struct socket *sock, int protocol,
637 eo->num = protocol; 637 eo->num = protocol;
638 638
639 econet_insert_socket(&econet_sklist, sk); 639 econet_insert_socket(&econet_sklist, sk);
640 return(0); 640 return 0;
641out: 641out:
642 return err; 642 return err;
643} 643}
@@ -1009,7 +1009,6 @@ static int __init aun_udp_initialise(void)
1009 struct sockaddr_in sin; 1009 struct sockaddr_in sin;
1010 1010
1011 skb_queue_head_init(&aun_queue); 1011 skb_queue_head_init(&aun_queue);
1012 spin_lock_init(&aun_queue_lock);
1013 setup_timer(&ab_cleanup_timer, ab_cleanup, 0); 1012 setup_timer(&ab_cleanup_timer, ab_cleanup, 0);
1014 ab_cleanup_timer.expires = jiffies + (HZ*2); 1013 ab_cleanup_timer.expires = jiffies + (HZ*2);
1015 add_timer(&ab_cleanup_timer); 1014 add_timer(&ab_cleanup_timer);
@@ -1167,7 +1166,6 @@ static int __init econet_proto_init(void)
1167 goto out; 1166 goto out;
1168 sock_register(&econet_family_ops); 1167 sock_register(&econet_family_ops);
1169#ifdef CONFIG_ECONET_AUNUDP 1168#ifdef CONFIG_ECONET_AUNUDP
1170 spin_lock_init(&aun_queue_lock);
1171 aun_udp_initialise(); 1169 aun_udp_initialise();
1172#endif 1170#endif
1173#ifdef CONFIG_ECONET_NATIVE 1171#ifdef CONFIG_ECONET_NATIVE
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 215c83986a9d..f00ef2f1d814 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -367,7 +367,7 @@ struct net_device *alloc_etherdev_mq(int sizeof_priv, unsigned int queue_count)
367EXPORT_SYMBOL(alloc_etherdev_mq); 367EXPORT_SYMBOL(alloc_etherdev_mq);
368 368
369static size_t _format_mac_addr(char *buf, int buflen, 369static size_t _format_mac_addr(char *buf, int buflen,
370 const unsigned char *addr, int len) 370 const unsigned char *addr, int len)
371{ 371{
372 int i; 372 int i;
373 char *cp = buf; 373 char *cp = buf;
@@ -376,7 +376,7 @@ static size_t _format_mac_addr(char *buf, int buflen,
376 cp += scnprintf(cp, buflen - (cp - buf), "%02x", addr[i]); 376 cp += scnprintf(cp, buflen - (cp - buf), "%02x", addr[i]);
377 if (i == len - 1) 377 if (i == len - 1)
378 break; 378 break;
379 cp += strlcpy(cp, ":", buflen - (cp - buf)); 379 cp += scnprintf(cp, buflen - (cp - buf), ":");
380 } 380 }
381 return cp - buf; 381 return cp - buf;
382} 382}
@@ -386,7 +386,7 @@ ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
386 size_t l; 386 size_t l;
387 387
388 l = _format_mac_addr(buf, PAGE_SIZE, addr, len); 388 l = _format_mac_addr(buf, PAGE_SIZE, addr, len);
389 l += strlcpy(buf + l, "\n", PAGE_SIZE - l); 389 l += scnprintf(buf + l, PAGE_SIZE - l, "\n");
390 return ((ssize_t) l); 390 return (ssize_t)l;
391} 391}
392EXPORT_SYMBOL(sysfs_format_mac); 392EXPORT_SYMBOL(sysfs_format_mac);
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 7c3a7d191249..9e95d7fb6d5a 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -46,7 +46,7 @@ config IP_ADVANCED_ROUTER
46 rp_filter on use: 46 rp_filter on use:
47 47
48 echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter 48 echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
49 and 49 or
50 echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter 50 echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
51 51
52 Note that some distributions enable it in startup scripts. 52 Note that some distributions enable it in startup scripts.
@@ -84,7 +84,7 @@ config IP_FIB_TRIE
84 84
85 An experimental study of compression methods for dynamic tries 85 An experimental study of compression methods for dynamic tries
86 Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. 86 Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
87 http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ 87 <http://www.csc.kth.se/~snilsson/software/dyntrie2/>
88 88
89endchoice 89endchoice
90 90
@@ -215,8 +215,15 @@ config NET_IPIP
215 be inserted in and removed from the running kernel whenever you 215 be inserted in and removed from the running kernel whenever you
216 want). Most people won't need this and can say N. 216 want). Most people won't need this and can say N.
217 217
218config NET_IPGRE_DEMUX
219 tristate "IP: GRE demultiplexer"
220 help
221 This is helper module to demultiplex GRE packets on GRE version field criteria.
222 Required by ip_gre and pptp modules.
223
218config NET_IPGRE 224config NET_IPGRE
219 tristate "IP: GRE tunnels over IP" 225 tristate "IP: GRE tunnels over IP"
226 depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
220 help 227 help
221 Tunneling means encapsulating data of one protocol type within 228 Tunneling means encapsulating data of one protocol type within
222 another protocol and sending it over a channel that understands the 229 another protocol and sending it over a channel that understands the
@@ -412,7 +419,7 @@ config INET_XFRM_MODE_BEET
412 If unsure, say Y. 419 If unsure, say Y.
413 420
414config INET_LRO 421config INET_LRO
415 bool "Large Receive Offload (ipv4/tcp)" 422 tristate "Large Receive Offload (ipv4/tcp)"
416 default y 423 default y
417 ---help--- 424 ---help---
418 Support for Large Receive Offload (ipv4/tcp). 425 Support for Large Receive Offload (ipv4/tcp).
@@ -555,7 +562,7 @@ config TCP_CONG_VENO
555 distinguishing to circumvent the difficult judgment of the packet loss 562 distinguishing to circumvent the difficult judgment of the packet loss
556 type. TCP Veno cuts down less congestion window in response to random 563 type. TCP Veno cuts down less congestion window in response to random
557 loss packets. 564 loss packets.
558 See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf 565 See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
559 566
560config TCP_CONG_YEAH 567config TCP_CONG_YEAH
561 tristate "YeAH TCP" 568 tristate "YeAH TCP"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 80ff87ce43aa..4978d22f9a75 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_PROC_FS) += proc.o
20obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o 20obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
21obj-$(CONFIG_IP_MROUTE) += ipmr.o 21obj-$(CONFIG_IP_MROUTE) += ipmr.o
22obj-$(CONFIG_NET_IPIP) += ipip.o 22obj-$(CONFIG_NET_IPIP) += ipip.o
23obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
23obj-$(CONFIG_NET_IPGRE) += ip_gre.o 24obj-$(CONFIG_NET_IPGRE) += ip_gre.o
24obj-$(CONFIG_SYN_COOKIES) += syncookies.o 25obj-$(CONFIG_SYN_COOKIES) += syncookies.o
25obj-$(CONFIG_INET_AH) += ah4.o 26obj-$(CONFIG_INET_AH) += ah4.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6a1100c25a9f..f581f77d1097 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -227,18 +227,16 @@ EXPORT_SYMBOL(inet_ehash_secret);
227 227
228/* 228/*
229 * inet_ehash_secret must be set exactly once 229 * inet_ehash_secret must be set exactly once
230 * Instead of using a dedicated spinlock, we (ab)use inetsw_lock
231 */ 230 */
232void build_ehash_secret(void) 231void build_ehash_secret(void)
233{ 232{
234 u32 rnd; 233 u32 rnd;
234
235 do { 235 do {
236 get_random_bytes(&rnd, sizeof(rnd)); 236 get_random_bytes(&rnd, sizeof(rnd));
237 } while (rnd == 0); 237 } while (rnd == 0);
238 spin_lock_bh(&inetsw_lock); 238
239 if (!inet_ehash_secret) 239 cmpxchg(&inet_ehash_secret, 0, rnd);
240 inet_ehash_secret = rnd;
241 spin_unlock_bh(&inetsw_lock);
242} 240}
243EXPORT_SYMBOL(build_ehash_secret); 241EXPORT_SYMBOL(build_ehash_secret);
244 242
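The build_ehash_secret() rewrite above drops the borrowed inetsw_lock and instead relies on cmpxchg() for its set-exactly-once guarantee. A small sketch of the same idiom, using a hypothetical my_secret variable rather than the real inet_ehash_secret:

#include <linux/random.h>
#include <asm/system.h>		/* cmpxchg(); header location is arch-specific */

static u32 my_secret;		/* 0 doubles as "not yet initialised" */

static void my_init_secret(void)
{
	u32 rnd;

	do {
		get_random_bytes(&rnd, sizeof(rnd));
	} while (rnd == 0);

	/* Only the first caller to observe 0 installs its value; any
	 * concurrent or later caller leaves the existing secret intact. */
	cmpxchg(&my_secret, 0, rnd);
}

Because the reserved value 0 is never produced by the loop, a reader that sees a non-zero secret knows it is final.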
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 96c1955b3e2f..d8e540c5b071 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -55,7 +55,7 @@
55 * Stuart Cheshire : Metricom and grat arp fixes 55 * Stuart Cheshire : Metricom and grat arp fixes
56 * *** FOR 2.1 clean this up *** 56 * *** FOR 2.1 clean this up ***
57 * Lawrence V. Stefani: (08/12/96) Added FDDI support. 57 * Lawrence V. Stefani: (08/12/96) Added FDDI support.
58 * Alan Cox : Took the AP1000 nasty FDDI hack and 58 * Alan Cox : Took the AP1000 nasty FDDI hack and
59 * folded into the mainstream FDDI code. 59 * folded into the mainstream FDDI code.
60 * Ack spit, Linus how did you allow that 60 * Ack spit, Linus how did you allow that
61 * one in... 61 * one in...
@@ -120,14 +120,14 @@ EXPORT_SYMBOL(clip_tbl_hook);
120#endif 120#endif
121 121
122#include <asm/system.h> 122#include <asm/system.h>
123#include <asm/uaccess.h> 123#include <linux/uaccess.h>
124 124
125#include <linux/netfilter_arp.h> 125#include <linux/netfilter_arp.h>
126 126
127/* 127/*
128 * Interface to generic neighbour cache. 128 * Interface to generic neighbour cache.
129 */ 129 */
130static u32 arp_hash(const void *pkey, const struct net_device *dev); 130static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd);
131static int arp_constructor(struct neighbour *neigh); 131static int arp_constructor(struct neighbour *neigh);
132static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); 132static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
133static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); 133static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -161,7 +161,7 @@ static const struct neigh_ops arp_direct_ops = {
161 .queue_xmit = dev_queue_xmit, 161 .queue_xmit = dev_queue_xmit,
162}; 162};
163 163
164const struct neigh_ops arp_broken_ops = { 164static const struct neigh_ops arp_broken_ops = {
165 .family = AF_INET, 165 .family = AF_INET,
166 .solicit = arp_solicit, 166 .solicit = arp_solicit,
167 .error_report = arp_error_report, 167 .error_report = arp_error_report,
@@ -170,35 +170,34 @@ const struct neigh_ops arp_broken_ops = {
170 .hh_output = dev_queue_xmit, 170 .hh_output = dev_queue_xmit,
171 .queue_xmit = dev_queue_xmit, 171 .queue_xmit = dev_queue_xmit,
172}; 172};
173EXPORT_SYMBOL(arp_broken_ops);
174 173
175struct neigh_table arp_tbl = { 174struct neigh_table arp_tbl = {
176 .family = AF_INET, 175 .family = AF_INET,
177 .entry_size = sizeof(struct neighbour) + 4, 176 .entry_size = sizeof(struct neighbour) + 4,
178 .key_len = 4, 177 .key_len = 4,
179 .hash = arp_hash, 178 .hash = arp_hash,
180 .constructor = arp_constructor, 179 .constructor = arp_constructor,
181 .proxy_redo = parp_redo, 180 .proxy_redo = parp_redo,
182 .id = "arp_cache", 181 .id = "arp_cache",
183 .parms = { 182 .parms = {
184 .tbl = &arp_tbl, 183 .tbl = &arp_tbl,
185 .base_reachable_time = 30 * HZ, 184 .base_reachable_time = 30 * HZ,
186 .retrans_time = 1 * HZ, 185 .retrans_time = 1 * HZ,
187 .gc_staletime = 60 * HZ, 186 .gc_staletime = 60 * HZ,
188 .reachable_time = 30 * HZ, 187 .reachable_time = 30 * HZ,
189 .delay_probe_time = 5 * HZ, 188 .delay_probe_time = 5 * HZ,
190 .queue_len = 3, 189 .queue_len = 3,
191 .ucast_probes = 3, 190 .ucast_probes = 3,
192 .mcast_probes = 3, 191 .mcast_probes = 3,
193 .anycast_delay = 1 * HZ, 192 .anycast_delay = 1 * HZ,
194 .proxy_delay = (8 * HZ) / 10, 193 .proxy_delay = (8 * HZ) / 10,
195 .proxy_qlen = 64, 194 .proxy_qlen = 64,
196 .locktime = 1 * HZ, 195 .locktime = 1 * HZ,
197 }, 196 },
198 .gc_interval = 30 * HZ, 197 .gc_interval = 30 * HZ,
199 .gc_thresh1 = 128, 198 .gc_thresh1 = 128,
200 .gc_thresh2 = 512, 199 .gc_thresh2 = 512,
201 .gc_thresh3 = 1024, 200 .gc_thresh3 = 1024,
202}; 201};
203EXPORT_SYMBOL(arp_tbl); 202EXPORT_SYMBOL(arp_tbl);
204 203
@@ -226,14 +225,16 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
226} 225}
227 226
228 227
229static u32 arp_hash(const void *pkey, const struct net_device *dev) 228static u32 arp_hash(const void *pkey,
229 const struct net_device *dev,
230 __u32 hash_rnd)
230{ 231{
231 return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd); 232 return jhash_2words(*(u32 *)pkey, dev->ifindex, hash_rnd);
232} 233}
233 234
234static int arp_constructor(struct neighbour *neigh) 235static int arp_constructor(struct neighbour *neigh)
235{ 236{
236 __be32 addr = *(__be32*)neigh->primary_key; 237 __be32 addr = *(__be32 *)neigh->primary_key;
237 struct net_device *dev = neigh->dev; 238 struct net_device *dev = neigh->dev;
238 struct in_device *in_dev; 239 struct in_device *in_dev;
239 struct neigh_parms *parms; 240 struct neigh_parms *parms;
@@ -296,16 +297,19 @@ static int arp_constructor(struct neighbour *neigh)
296 neigh->ops = &arp_broken_ops; 297 neigh->ops = &arp_broken_ops;
297 neigh->output = neigh->ops->output; 298 neigh->output = neigh->ops->output;
298 return 0; 299 return 0;
300#else
301 break;
299#endif 302#endif
300 ;} 303 }
301#endif 304#endif
302 if (neigh->type == RTN_MULTICAST) { 305 if (neigh->type == RTN_MULTICAST) {
303 neigh->nud_state = NUD_NOARP; 306 neigh->nud_state = NUD_NOARP;
304 arp_mc_map(addr, neigh->ha, dev, 1); 307 arp_mc_map(addr, neigh->ha, dev, 1);
305 } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { 308 } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
306 neigh->nud_state = NUD_NOARP; 309 neigh->nud_state = NUD_NOARP;
307 memcpy(neigh->ha, dev->dev_addr, dev->addr_len); 310 memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
308 } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) { 311 } else if (neigh->type == RTN_BROADCAST ||
312 (dev->flags & IFF_POINTOPOINT)) {
309 neigh->nud_state = NUD_NOARP; 313 neigh->nud_state = NUD_NOARP;
310 memcpy(neigh->ha, dev->broadcast, dev->addr_len); 314 memcpy(neigh->ha, dev->broadcast, dev->addr_len);
311 } 315 }
@@ -315,7 +319,7 @@ static int arp_constructor(struct neighbour *neigh)
315 else 319 else
316 neigh->ops = &arp_generic_ops; 320 neigh->ops = &arp_generic_ops;
317 321
318 if (neigh->nud_state&NUD_VALID) 322 if (neigh->nud_state & NUD_VALID)
319 neigh->output = neigh->ops->connected_output; 323 neigh->output = neigh->ops->connected_output;
320 else 324 else
321 neigh->output = neigh->ops->output; 325 neigh->output = neigh->ops->output;
@@ -334,7 +338,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
334 __be32 saddr = 0; 338 __be32 saddr = 0;
335 u8 *dst_ha = NULL; 339 u8 *dst_ha = NULL;
336 struct net_device *dev = neigh->dev; 340 struct net_device *dev = neigh->dev;
337 __be32 target = *(__be32*)neigh->primary_key; 341 __be32 target = *(__be32 *)neigh->primary_key;
338 int probes = atomic_read(&neigh->probes); 342 int probes = atomic_read(&neigh->probes);
339 struct in_device *in_dev; 343 struct in_device *in_dev;
340 344
@@ -347,7 +351,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
347 switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { 351 switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
348 default: 352 default:
349 case 0: /* By default announce any local IP */ 353 case 0: /* By default announce any local IP */
350 if (skb && inet_addr_type(dev_net(dev), ip_hdr(skb)->saddr) == RTN_LOCAL) 354 if (skb && inet_addr_type(dev_net(dev),
355 ip_hdr(skb)->saddr) == RTN_LOCAL)
351 saddr = ip_hdr(skb)->saddr; 356 saddr = ip_hdr(skb)->saddr;
352 break; 357 break;
353 case 1: /* Restrict announcements of saddr in same subnet */ 358 case 1: /* Restrict announcements of saddr in same subnet */
@@ -369,16 +374,21 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
369 if (!saddr) 374 if (!saddr)
370 saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); 375 saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
371 376
372 if ((probes -= neigh->parms->ucast_probes) < 0) { 377 probes -= neigh->parms->ucast_probes;
373 if (!(neigh->nud_state&NUD_VALID)) 378 if (probes < 0) {
374 printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n"); 379 if (!(neigh->nud_state & NUD_VALID))
380 printk(KERN_DEBUG
381 "trying to ucast probe in NUD_INVALID\n");
375 dst_ha = neigh->ha; 382 dst_ha = neigh->ha;
376 read_lock_bh(&neigh->lock); 383 read_lock_bh(&neigh->lock);
377 } else if ((probes -= neigh->parms->app_probes) < 0) { 384 } else {
385 probes -= neigh->parms->app_probes;
386 if (probes < 0) {
378#ifdef CONFIG_ARPD 387#ifdef CONFIG_ARPD
379 neigh_app_ns(neigh); 388 neigh_app_ns(neigh);
380#endif 389#endif
381 return; 390 return;
391 }
382 } 392 }
383 393
384 arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, 394 arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
@@ -451,7 +461,8 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
451 * is allowed to use this function, it is scheduled to be removed. --ANK 461 * is allowed to use this function, it is scheduled to be removed. --ANK
452 */ 462 */
453 463
454static int arp_set_predefined(int addr_hint, unsigned char * haddr, __be32 paddr, struct net_device * dev) 464static int arp_set_predefined(int addr_hint, unsigned char *haddr,
465 __be32 paddr, struct net_device *dev)
455{ 466{
456 switch (addr_hint) { 467 switch (addr_hint) {
457 case RTN_LOCAL: 468 case RTN_LOCAL:
@@ -483,17 +494,16 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
483 494
484 paddr = skb_rtable(skb)->rt_gateway; 495 paddr = skb_rtable(skb)->rt_gateway;
485 496
486 if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, paddr, dev)) 497 if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
498 paddr, dev))
487 return 0; 499 return 0;
488 500
489 n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); 501 n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
490 502
491 if (n) { 503 if (n) {
492 n->used = jiffies; 504 n->used = jiffies;
493 if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) { 505 if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) {
494 read_lock_bh(&n->lock); 506 neigh_ha_snapshot(haddr, n, dev);
495 memcpy(haddr, n->ha, dev->addr_len);
496 read_unlock_bh(&n->lock);
497 neigh_release(n); 507 neigh_release(n);
498 return 0; 508 return 0;
499 } 509 }
@@ -515,13 +525,14 @@ int arp_bind_neighbour(struct dst_entry *dst)
515 return -EINVAL; 525 return -EINVAL;
516 if (n == NULL) { 526 if (n == NULL) {
517 __be32 nexthop = ((struct rtable *)dst)->rt_gateway; 527 __be32 nexthop = ((struct rtable *)dst)->rt_gateway;
518 if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT)) 528 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
519 nexthop = 0; 529 nexthop = 0;
520 n = __neigh_lookup_errno( 530 n = __neigh_lookup_errno(
521#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) 531#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
522 dev->type == ARPHRD_ATM ? clip_tbl_hook : 532 dev->type == ARPHRD_ATM ?
533 clip_tbl_hook :
523#endif 534#endif
524 &arp_tbl, &nexthop, dev); 535 &arp_tbl, &nexthop, dev);
525 if (IS_ERR(n)) 536 if (IS_ERR(n))
526 return PTR_ERR(n); 537 return PTR_ERR(n);
527 dst->neighbour = n; 538 dst->neighbour = n;
@@ -543,8 +554,8 @@ static inline int arp_fwd_proxy(struct in_device *in_dev,
543 554
544 if (!IN_DEV_PROXY_ARP(in_dev)) 555 if (!IN_DEV_PROXY_ARP(in_dev))
545 return 0; 556 return 0;
546 557 imi = IN_DEV_MEDIUM_ID(in_dev);
547 if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0) 558 if (imi == 0)
548 return 1; 559 return 1;
549 if (imi == -1) 560 if (imi == -1)
550 return 0; 561 return 0;
@@ -555,7 +566,7 @@ static inline int arp_fwd_proxy(struct in_device *in_dev,
555 if (out_dev) 566 if (out_dev)
556 omi = IN_DEV_MEDIUM_ID(out_dev); 567 omi = IN_DEV_MEDIUM_ID(out_dev);
557 568
558 return (omi != imi && omi != -1); 569 return omi != imi && omi != -1;
559} 570}
560 571
561/* 572/*
@@ -685,7 +696,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
685 arp->ar_pln = 4; 696 arp->ar_pln = 4;
686 arp->ar_op = htons(type); 697 arp->ar_op = htons(type);
687 698
688 arp_ptr=(unsigned char *)(arp+1); 699 arp_ptr = (unsigned char *)(arp + 1);
689 700
690 memcpy(arp_ptr, src_hw, dev->addr_len); 701 memcpy(arp_ptr, src_hw, dev->addr_len);
691 arp_ptr += dev->addr_len; 702 arp_ptr += dev->addr_len;
@@ -735,9 +746,8 @@ void arp_send(int type, int ptype, __be32 dest_ip,
735 746
736 skb = arp_create(type, ptype, dest_ip, dev, src_ip, 747 skb = arp_create(type, ptype, dest_ip, dev, src_ip,
737 dest_hw, src_hw, target_hw); 748 dest_hw, src_hw, target_hw);
738 if (skb == NULL) { 749 if (skb == NULL)
739 return; 750 return;
740 }
741 751
742 arp_xmit(skb); 752 arp_xmit(skb);
743} 753}
@@ -815,7 +825,7 @@ static int arp_process(struct sk_buff *skb)
815/* 825/*
816 * Extract fields 826 * Extract fields
817 */ 827 */
818 arp_ptr= (unsigned char *)(arp+1); 828 arp_ptr = (unsigned char *)(arp + 1);
819 sha = arp_ptr; 829 sha = arp_ptr;
820 arp_ptr += dev->addr_len; 830 arp_ptr += dev->addr_len;
821 memcpy(&sip, arp_ptr, 4); 831 memcpy(&sip, arp_ptr, 4);
@@ -869,16 +879,17 @@ static int arp_process(struct sk_buff *skb)
869 addr_type = rt->rt_type; 879 addr_type = rt->rt_type;
870 880
871 if (addr_type == RTN_LOCAL) { 881 if (addr_type == RTN_LOCAL) {
872 int dont_send = 0; 882 int dont_send;
873 883
874 if (!dont_send) 884 dont_send = arp_ignore(in_dev, sip, tip);
875 dont_send |= arp_ignore(in_dev,sip,tip);
876 if (!dont_send && IN_DEV_ARPFILTER(in_dev)) 885 if (!dont_send && IN_DEV_ARPFILTER(in_dev))
877 dont_send |= arp_filter(sip,tip,dev); 886 dont_send |= arp_filter(sip, tip, dev);
878 if (!dont_send) { 887 if (!dont_send) {
879 n = neigh_event_ns(&arp_tbl, sha, &sip, dev); 888 n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
880 if (n) { 889 if (n) {
881 arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); 890 arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
891 dev, tip, sha, dev->dev_addr,
892 sha);
882 neigh_release(n); 893 neigh_release(n);
883 } 894 }
884 } 895 }
@@ -887,8 +898,7 @@ static int arp_process(struct sk_buff *skb)
887 if (addr_type == RTN_UNICAST && 898 if (addr_type == RTN_UNICAST &&
888 (arp_fwd_proxy(in_dev, dev, rt) || 899 (arp_fwd_proxy(in_dev, dev, rt) ||
889 arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || 900 arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
890 pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) 901 pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) {
891 {
892 n = neigh_event_ns(&arp_tbl, sha, &sip, dev); 902 n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
893 if (n) 903 if (n)
894 neigh_release(n); 904 neigh_release(n);
@@ -896,9 +906,12 @@ static int arp_process(struct sk_buff *skb)
896 if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || 906 if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
897 skb->pkt_type == PACKET_HOST || 907 skb->pkt_type == PACKET_HOST ||
898 in_dev->arp_parms->proxy_delay == 0) { 908 in_dev->arp_parms->proxy_delay == 0) {
899 arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); 909 arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
910 dev, tip, sha, dev->dev_addr,
911 sha);
900 } else { 912 } else {
901 pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); 913 pneigh_enqueue(&arp_tbl,
914 in_dev->arp_parms, skb);
902 return 0; 915 return 0;
903 } 916 }
904 goto out; 917 goto out;
@@ -939,7 +952,8 @@ static int arp_process(struct sk_buff *skb)
939 if (arp->ar_op != htons(ARPOP_REPLY) || 952 if (arp->ar_op != htons(ARPOP_REPLY) ||
940 skb->pkt_type != PACKET_HOST) 953 skb->pkt_type != PACKET_HOST)
941 state = NUD_STALE; 954 state = NUD_STALE;
942 neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0); 955 neigh_update(n, sha, state,
956 override ? NEIGH_UPDATE_F_OVERRIDE : 0);
943 neigh_release(n); 957 neigh_release(n);
944 } 958 }
945 959
@@ -975,7 +989,8 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
975 arp->ar_pln != 4) 989 arp->ar_pln != 4)
976 goto freeskb; 990 goto freeskb;
977 991
978 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) 992 skb = skb_share_check(skb, GFP_ATOMIC);
993 if (skb == NULL)
979 goto out_of_mem; 994 goto out_of_mem;
980 995
981 memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); 996 memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
@@ -1019,7 +1034,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
1019 return -EINVAL; 1034 return -EINVAL;
1020 if (!dev && (r->arp_flags & ATF_COM)) { 1035 if (!dev && (r->arp_flags & ATF_COM)) {
1021 dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, 1036 dev = dev_getbyhwaddr(net, r->arp_ha.sa_family,
1022 r->arp_ha.sa_data); 1037 r->arp_ha.sa_data);
1023 if (!dev) 1038 if (!dev)
1024 return -ENODEV; 1039 return -ENODEV;
1025 } 1040 }
@@ -1033,7 +1048,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
1033} 1048}
1034 1049
1035static int arp_req_set(struct net *net, struct arpreq *r, 1050static int arp_req_set(struct net *net, struct arpreq *r,
1036 struct net_device * dev) 1051 struct net_device *dev)
1037{ 1052{
1038 __be32 ip; 1053 __be32 ip;
1039 struct neighbour *neigh; 1054 struct neighbour *neigh;
@@ -1046,10 +1061,11 @@ static int arp_req_set(struct net *net, struct arpreq *r,
1046 if (r->arp_flags & ATF_PERM) 1061 if (r->arp_flags & ATF_PERM)
1047 r->arp_flags |= ATF_COM; 1062 r->arp_flags |= ATF_COM;
1048 if (dev == NULL) { 1063 if (dev == NULL) {
1049 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, 1064 struct flowi fl = { .nl_u.ip4_u = { .daddr = ip,
1050 .tos = RTO_ONLINK } } }; 1065 .tos = RTO_ONLINK } };
1051 struct rtable * rt; 1066 struct rtable *rt;
1052 if ((err = ip_route_output_key(net, &rt, &fl)) != 0) 1067 err = ip_route_output_key(net, &rt, &fl);
1068 if (err != 0)
1053 return err; 1069 return err;
1054 dev = rt->dst.dev; 1070 dev = rt->dst.dev;
1055 ip_rt_put(rt); 1071 ip_rt_put(rt);
@@ -1083,9 +1099,9 @@ static int arp_req_set(struct net *net, struct arpreq *r,
1083 unsigned state = NUD_STALE; 1099 unsigned state = NUD_STALE;
1084 if (r->arp_flags & ATF_PERM) 1100 if (r->arp_flags & ATF_PERM)
1085 state = NUD_PERMANENT; 1101 state = NUD_PERMANENT;
1086 err = neigh_update(neigh, (r->arp_flags&ATF_COM) ? 1102 err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
1087 r->arp_ha.sa_data : NULL, state, 1103 r->arp_ha.sa_data : NULL, state,
1088 NEIGH_UPDATE_F_OVERRIDE| 1104 NEIGH_UPDATE_F_OVERRIDE |
1089 NEIGH_UPDATE_F_ADMIN); 1105 NEIGH_UPDATE_F_ADMIN);
1090 neigh_release(neigh); 1106 neigh_release(neigh);
1091 } 1107 }
@@ -1094,12 +1110,12 @@ static int arp_req_set(struct net *net, struct arpreq *r,
1094 1110
1095static unsigned arp_state_to_flags(struct neighbour *neigh) 1111static unsigned arp_state_to_flags(struct neighbour *neigh)
1096{ 1112{
1097 unsigned flags = 0;
1098 if (neigh->nud_state&NUD_PERMANENT) 1113 if (neigh->nud_state&NUD_PERMANENT)
1099 flags = ATF_PERM|ATF_COM; 1114 return ATF_PERM | ATF_COM;
1100 else if (neigh->nud_state&NUD_VALID) 1115 else if (neigh->nud_state&NUD_VALID)
1101 flags = ATF_COM; 1116 return ATF_COM;
1102 return flags; 1117 else
1118 return 0;
1103} 1119}
1104 1120
1105/* 1121/*
@@ -1142,7 +1158,7 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r,
1142} 1158}
1143 1159
1144static int arp_req_delete(struct net *net, struct arpreq *r, 1160static int arp_req_delete(struct net *net, struct arpreq *r,
1145 struct net_device * dev) 1161 struct net_device *dev)
1146{ 1162{
1147 int err; 1163 int err;
1148 __be32 ip; 1164 __be32 ip;
@@ -1153,10 +1169,11 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
1153 1169
1154 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; 1170 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
1155 if (dev == NULL) { 1171 if (dev == NULL) {
1156 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, 1172 struct flowi fl = { .nl_u.ip4_u = { .daddr = ip,
1157 .tos = RTO_ONLINK } } }; 1173 .tos = RTO_ONLINK } };
1158 struct rtable * rt; 1174 struct rtable *rt;
1159 if ((err = ip_route_output_key(net, &rt, &fl)) != 0) 1175 err = ip_route_output_key(net, &rt, &fl);
1176 if (err != 0)
1160 return err; 1177 return err;
1161 dev = rt->dst.dev; 1178 dev = rt->dst.dev;
1162 ip_rt_put(rt); 1179 ip_rt_put(rt);
@@ -1166,7 +1183,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
1166 err = -ENXIO; 1183 err = -ENXIO;
1167 neigh = neigh_lookup(&arp_tbl, &ip, dev); 1184 neigh = neigh_lookup(&arp_tbl, &ip, dev);
1168 if (neigh) { 1185 if (neigh) {
1169 if (neigh->nud_state&~NUD_NOARP) 1186 if (neigh->nud_state & ~NUD_NOARP)
1170 err = neigh_update(neigh, NULL, NUD_FAILED, 1187 err = neigh_update(neigh, NULL, NUD_FAILED,
1171 NEIGH_UPDATE_F_OVERRIDE| 1188 NEIGH_UPDATE_F_OVERRIDE|
1172 NEIGH_UPDATE_F_ADMIN); 1189 NEIGH_UPDATE_F_ADMIN);
@@ -1186,24 +1203,24 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1186 struct net_device *dev = NULL; 1203 struct net_device *dev = NULL;
1187 1204
1188 switch (cmd) { 1205 switch (cmd) {
1189 case SIOCDARP: 1206 case SIOCDARP:
1190 case SIOCSARP: 1207 case SIOCSARP:
1191 if (!capable(CAP_NET_ADMIN)) 1208 if (!capable(CAP_NET_ADMIN))
1192 return -EPERM; 1209 return -EPERM;
1193 case SIOCGARP: 1210 case SIOCGARP:
1194 err = copy_from_user(&r, arg, sizeof(struct arpreq)); 1211 err = copy_from_user(&r, arg, sizeof(struct arpreq));
1195 if (err) 1212 if (err)
1196 return -EFAULT; 1213 return -EFAULT;
1197 break; 1214 break;
1198 default: 1215 default:
1199 return -EINVAL; 1216 return -EINVAL;
1200 } 1217 }
1201 1218
1202 if (r.arp_pa.sa_family != AF_INET) 1219 if (r.arp_pa.sa_family != AF_INET)
1203 return -EPFNOSUPPORT; 1220 return -EPFNOSUPPORT;
1204 1221
1205 if (!(r.arp_flags & ATF_PUBL) && 1222 if (!(r.arp_flags & ATF_PUBL) &&
1206 (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB))) 1223 (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB)))
1207 return -EINVAL; 1224 return -EINVAL;
1208 if (!(r.arp_flags & ATF_NETMASK)) 1225 if (!(r.arp_flags & ATF_NETMASK))
1209 ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr = 1226 ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
@@ -1211,7 +1228,8 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1211 rtnl_lock(); 1228 rtnl_lock();
1212 if (r.arp_dev[0]) { 1229 if (r.arp_dev[0]) {
1213 err = -ENODEV; 1230 err = -ENODEV;
1214 if ((dev = __dev_get_by_name(net, r.arp_dev)) == NULL) 1231 dev = __dev_get_by_name(net, r.arp_dev);
1232 if (dev == NULL)
1215 goto out; 1233 goto out;
1216 1234
1217 /* Mmmm... It is wrong... ARPHRD_NETROM==0 */ 1235 /* Mmmm... It is wrong... ARPHRD_NETROM==0 */
@@ -1243,7 +1261,8 @@ out:
1243 return err; 1261 return err;
1244} 1262}
1245 1263
1246static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) 1264static int arp_netdev_event(struct notifier_block *this, unsigned long event,
1265 void *ptr)
1247{ 1266{
1248 struct net_device *dev = ptr; 1267 struct net_device *dev = ptr;
1249 1268
@@ -1311,12 +1330,13 @@ static char *ax2asc2(ax25_address *a, char *buf)
1311 for (n = 0, s = buf; n < 6; n++) { 1330 for (n = 0, s = buf; n < 6; n++) {
1312 c = (a->ax25_call[n] >> 1) & 0x7F; 1331 c = (a->ax25_call[n] >> 1) & 0x7F;
1313 1332
1314 if (c != ' ') *s++ = c; 1333 if (c != ' ')
1334 *s++ = c;
1315 } 1335 }
1316 1336
1317 *s++ = '-'; 1337 *s++ = '-';
1318 1338 n = (a->ax25_call[6] >> 1) & 0x0F;
1319 if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) { 1339 if (n > 9) {
1320 *s++ = '1'; 1340 *s++ = '1';
1321 n -= 10; 1341 n -= 10;
1322 } 1342 }
@@ -1325,10 +1345,9 @@ static char *ax2asc2(ax25_address *a, char *buf)
1325 *s++ = '\0'; 1345 *s++ = '\0';
1326 1346
1327 if (*buf == '\0' || *buf == '-') 1347 if (*buf == '\0' || *buf == '-')
1328 return "*"; 1348 return "*";
1329 1349
1330 return buf; 1350 return buf;
1331
1332} 1351}
1333#endif /* CONFIG_AX25 */ 1352#endif /* CONFIG_AX25 */
1334 1353
@@ -1408,10 +1427,10 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
1408/* ------------------------------------------------------------------------ */ 1427/* ------------------------------------------------------------------------ */
1409 1428
1410static const struct seq_operations arp_seq_ops = { 1429static const struct seq_operations arp_seq_ops = {
1411 .start = arp_seq_start, 1430 .start = arp_seq_start,
1412 .next = neigh_seq_next, 1431 .next = neigh_seq_next,
1413 .stop = neigh_seq_stop, 1432 .stop = neigh_seq_stop,
1414 .show = arp_seq_show, 1433 .show = arp_seq_show,
1415}; 1434};
1416 1435
1417static int arp_seq_open(struct inode *inode, struct file *file) 1436static int arp_seq_open(struct inode *inode, struct file *file)
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 3a92a76ae41d..094e150c6260 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -9,7 +9,7 @@
9 * 9 *
10 * The CIPSO draft specification can be found in the kernel's Documentation 10 * The CIPSO draft specification can be found in the kernel's Documentation
11 * directory as well as the following URL: 11 * directory as well as the following URL:
12 * http://netlabel.sourceforge.net/files/draft-ietf-cipso-ipsecurity-01.txt 12 * http://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt
13 * The FIPS-188 specification can be found at the following URL: 13 * The FIPS-188 specification can be found at the following URL:
14 * http://www.itl.nist.gov/fipspubs/fip188.htm 14 * http://www.itl.nist.gov/fipspubs/fip188.htm
15 * 15 *
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index f0550941df7b..174be6caa5c8 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -62,14 +62,17 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
62 } 62 }
63 if (!inet->inet_saddr) 63 if (!inet->inet_saddr)
64 inet->inet_saddr = rt->rt_src; /* Update source address */ 64 inet->inet_saddr = rt->rt_src; /* Update source address */
65 if (!inet->inet_rcv_saddr) 65 if (!inet->inet_rcv_saddr) {
66 inet->inet_rcv_saddr = rt->rt_src; 66 inet->inet_rcv_saddr = rt->rt_src;
67 if (sk->sk_prot->rehash)
68 sk->sk_prot->rehash(sk);
69 }
67 inet->inet_daddr = rt->rt_dst; 70 inet->inet_daddr = rt->rt_dst;
68 inet->inet_dport = usin->sin_port; 71 inet->inet_dport = usin->sin_port;
69 sk->sk_state = TCP_ESTABLISHED; 72 sk->sk_state = TCP_ESTABLISHED;
70 inet->inet_id = jiffies; 73 inet->inet_id = jiffies;
71 74
72 sk_dst_set(sk, &rt->dst); 75 sk_dst_set(sk, &rt->dst);
73 return(0); 76 return 0;
74} 77}
75EXPORT_SYMBOL(ip4_datagram_connect); 78EXPORT_SYMBOL(ip4_datagram_connect);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index da14c49284f4..dc94b0316b78 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -209,7 +209,7 @@ static void inetdev_destroy(struct in_device *in_dev)
209 inet_free_ifa(ifa); 209 inet_free_ifa(ifa);
210 } 210 }
211 211
212 dev->ip_ptr = NULL; 212 rcu_assign_pointer(dev->ip_ptr, NULL);
213 213
214 devinet_sysctl_unregister(in_dev); 214 devinet_sysctl_unregister(in_dev);
215 neigh_parms_release(&arp_tbl, in_dev->arp_parms); 215 neigh_parms_release(&arp_tbl, in_dev->arp_parms);
@@ -403,6 +403,9 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
403 return inet_insert_ifa(ifa); 403 return inet_insert_ifa(ifa);
404} 404}
405 405
406/* Caller must hold RCU or RTNL :
407 * We dont take a reference on found in_device
408 */
406struct in_device *inetdev_by_index(struct net *net, int ifindex) 409struct in_device *inetdev_by_index(struct net *net, int ifindex)
407{ 410{
408 struct net_device *dev; 411 struct net_device *dev;
@@ -411,7 +414,7 @@ struct in_device *inetdev_by_index(struct net *net, int ifindex)
411 rcu_read_lock(); 414 rcu_read_lock();
412 dev = dev_get_by_index_rcu(net, ifindex); 415 dev = dev_get_by_index_rcu(net, ifindex);
413 if (dev) 416 if (dev)
414 in_dev = in_dev_get(dev); 417 in_dev = rcu_dereference_rtnl(dev->ip_ptr);
415 rcu_read_unlock(); 418 rcu_read_unlock();
416 return in_dev; 419 return in_dev;
417} 420}
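The new comment above inetdev_by_index() states that callers must hold RCU or RTNL, since the lookup no longer takes a reference on the found in_device. A hypothetical caller sketch (my_ifindex_has_addrs is not in the tree) showing the RCU-protected pattern:

#include <linux/inetdevice.h>
#include <linux/rcupdate.h>

static bool my_ifindex_has_addrs(struct net *net, int ifindex)
{
	struct in_device *in_dev;
	bool ret = false;

	rcu_read_lock();
	in_dev = inetdev_by_index(net, ifindex);	/* no reference taken */
	if (in_dev)
		ret = in_dev->ifa_list != NULL;
	rcu_read_unlock();	/* in_dev must not be used past this point */

	return ret;
}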
@@ -453,8 +456,6 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
453 goto errout; 456 goto errout;
454 } 457 }
455 458
456 __in_dev_put(in_dev);
457
458 for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; 459 for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
459 ifap = &ifa->ifa_next) { 460 ifap = &ifa->ifa_next) {
460 if (tb[IFA_LOCAL] && 461 if (tb[IFA_LOCAL] &&
@@ -1059,7 +1060,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1059 switch (event) { 1060 switch (event) {
1060 case NETDEV_REGISTER: 1061 case NETDEV_REGISTER:
1061 printk(KERN_DEBUG "inetdev_event: bug\n"); 1062 printk(KERN_DEBUG "inetdev_event: bug\n");
1062 dev->ip_ptr = NULL; 1063 rcu_assign_pointer(dev->ip_ptr, NULL);
1063 break; 1064 break;
1064 case NETDEV_UP: 1065 case NETDEV_UP:
1065 if (!inetdev_valid_mtu(dev->mtu)) 1066 if (!inetdev_valid_mtu(dev->mtu))
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index a43968918350..eb6f69a8f27a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -147,35 +147,43 @@ static void fib_flush(struct net *net)
147 rt_cache_flush(net, -1); 147 rt_cache_flush(net, -1);
148} 148}
149 149
150/* 150/**
151 * Find the first device with a given source address. 151 * __ip_dev_find - find the first device with a given source address.
152 * @net: the net namespace
153 * @addr: the source address
154 * @devref: if true, take a reference on the found device
155 *
156 * If a caller uses devref=false, it should be protected by RCU, or RTNL
152 */ 157 */
153 158struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
154struct net_device * ip_dev_find(struct net *net, __be32 addr)
155{ 159{
156 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; 160 struct flowi fl = {
157 struct fib_result res; 161 .nl_u = {
162 .ip4_u = {
163 .daddr = addr
164 }
165 },
166 .flags = FLOWI_FLAG_MATCH_ANY_IIF
167 };
168 struct fib_result res = { 0 };
158 struct net_device *dev = NULL; 169 struct net_device *dev = NULL;
159 struct fib_table *local_table;
160
161#ifdef CONFIG_IP_MULTIPLE_TABLES
162 res.r = NULL;
163#endif
164 170
165 local_table = fib_get_table(net, RT_TABLE_LOCAL); 171 rcu_read_lock();
166 if (!local_table || fib_table_lookup(local_table, &fl, &res)) 172 if (fib_lookup(net, &fl, &res)) {
173 rcu_read_unlock();
167 return NULL; 174 return NULL;
175 }
168 if (res.type != RTN_LOCAL) 176 if (res.type != RTN_LOCAL)
169 goto out; 177 goto out;
170 dev = FIB_RES_DEV(res); 178 dev = FIB_RES_DEV(res);
171 179
172 if (dev) 180 if (dev && devref)
173 dev_hold(dev); 181 dev_hold(dev);
174out: 182out:
175 fib_res_put(&res); 183 rcu_read_unlock();
176 return dev; 184 return dev;
177} 185}
178EXPORT_SYMBOL(ip_dev_find); 186EXPORT_SYMBOL(__ip_dev_find);
179 187
180/* 188/*
181 * Find address type as if only "dev" was present in the system. If 189 * Find address type as if only "dev" was present in the system. If
@@ -202,11 +210,12 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
202 local_table = fib_get_table(net, RT_TABLE_LOCAL); 210 local_table = fib_get_table(net, RT_TABLE_LOCAL);
203 if (local_table) { 211 if (local_table) {
204 ret = RTN_UNICAST; 212 ret = RTN_UNICAST;
205 if (!fib_table_lookup(local_table, &fl, &res)) { 213 rcu_read_lock();
214 if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
206 if (!dev || dev == res.fi->fib_dev) 215 if (!dev || dev == res.fi->fib_dev)
207 ret = res.type; 216 ret = res.type;
208 fib_res_put(&res);
209 } 217 }
218 rcu_read_unlock();
210 } 219 }
211 return ret; 220 return ret;
212} 221}
@@ -220,37 +229,41 @@ EXPORT_SYMBOL(inet_addr_type);
220unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, 229unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
221 __be32 addr) 230 __be32 addr)
222{ 231{
223 return __inet_dev_addr_type(net, dev, addr); 232 return __inet_dev_addr_type(net, dev, addr);
224} 233}
225EXPORT_SYMBOL(inet_dev_addr_type); 234EXPORT_SYMBOL(inet_dev_addr_type);
226 235
227/* Given (packet source, input interface) and optional (dst, oif, tos): 236/* Given (packet source, input interface) and optional (dst, oif, tos):
228 - (main) check, that source is valid i.e. not broadcast or our local 237 * - (main) check, that source is valid i.e. not broadcast or our local
229 address. 238 * address.
230 - figure out what "logical" interface this packet arrived 239 * - figure out what "logical" interface this packet arrived
231 and calculate "specific destination" address. 240 * and calculate "specific destination" address.
232 - check, that packet arrived from expected physical interface. 241 * - check, that packet arrived from expected physical interface.
242 * called with rcu_read_lock()
233 */ 243 */
234
235int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, 244int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
236 struct net_device *dev, __be32 *spec_dst, 245 struct net_device *dev, __be32 *spec_dst,
237 u32 *itag, u32 mark) 246 u32 *itag, u32 mark)
238{ 247{
239 struct in_device *in_dev; 248 struct in_device *in_dev;
240 struct flowi fl = { .nl_u = { .ip4_u = 249 struct flowi fl = {
241 { .daddr = src, 250 .nl_u = {
242 .saddr = dst, 251 .ip4_u = {
243 .tos = tos } }, 252 .daddr = src,
244 .mark = mark, 253 .saddr = dst,
245 .iif = oif }; 254 .tos = tos
246 255 }
256 },
257 .mark = mark,
258 .iif = oif
259 };
247 struct fib_result res; 260 struct fib_result res;
248 int no_addr, rpf, accept_local; 261 int no_addr, rpf, accept_local;
262 bool dev_match;
249 int ret; 263 int ret;
250 struct net *net; 264 struct net *net;
251 265
252 no_addr = rpf = accept_local = 0; 266 no_addr = rpf = accept_local = 0;
253 rcu_read_lock();
254 in_dev = __in_dev_get_rcu(dev); 267 in_dev = __in_dev_get_rcu(dev);
255 if (in_dev) { 268 if (in_dev) {
256 no_addr = in_dev->ifa_list == NULL; 269 no_addr = in_dev->ifa_list == NULL;
@@ -259,7 +272,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
259 if (mark && !IN_DEV_SRC_VMARK(in_dev)) 272 if (mark && !IN_DEV_SRC_VMARK(in_dev))
260 fl.mark = 0; 273 fl.mark = 0;
261 } 274 }
262 rcu_read_unlock();
263 275
264 if (in_dev == NULL) 276 if (in_dev == NULL)
265 goto e_inval; 277 goto e_inval;
@@ -269,21 +281,29 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
269 goto last_resort; 281 goto last_resort;
270 if (res.type != RTN_UNICAST) { 282 if (res.type != RTN_UNICAST) {
271 if (res.type != RTN_LOCAL || !accept_local) 283 if (res.type != RTN_LOCAL || !accept_local)
272 goto e_inval_res; 284 goto e_inval;
273 } 285 }
274 *spec_dst = FIB_RES_PREFSRC(res); 286 *spec_dst = FIB_RES_PREFSRC(res);
275 fib_combine_itag(itag, &res); 287 fib_combine_itag(itag, &res);
288 dev_match = false;
289
276#ifdef CONFIG_IP_ROUTE_MULTIPATH 290#ifdef CONFIG_IP_ROUTE_MULTIPATH
277 if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1) 291 for (ret = 0; ret < res.fi->fib_nhs; ret++) {
292 struct fib_nh *nh = &res.fi->fib_nh[ret];
293
294 if (nh->nh_dev == dev) {
295 dev_match = true;
296 break;
297 }
298 }
278#else 299#else
279 if (FIB_RES_DEV(res) == dev) 300 if (FIB_RES_DEV(res) == dev)
301 dev_match = true;
280#endif 302#endif
281 { 303 if (dev_match) {
282 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 304 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
283 fib_res_put(&res);
284 return ret; 305 return ret;
285 } 306 }
286 fib_res_put(&res);
287 if (no_addr) 307 if (no_addr)
288 goto last_resort; 308 goto last_resort;
289 if (rpf == 1) 309 if (rpf == 1)
@@ -296,7 +316,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
296 *spec_dst = FIB_RES_PREFSRC(res); 316 *spec_dst = FIB_RES_PREFSRC(res);
297 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 317 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
298 } 318 }
299 fib_res_put(&res);
300 } 319 }
301 return ret; 320 return ret;
302 321
@@ -307,8 +326,6 @@ last_resort:
307 *itag = 0; 326 *itag = 0;
308 return 0; 327 return 0;
309 328
310e_inval_res:
311 fib_res_put(&res);
312e_inval: 329e_inval:
313 return -EINVAL; 330 return -EINVAL;
314e_rpf: 331e_rpf:
@@ -461,9 +478,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
461} 478}
462 479
463/* 480/*
464 * Handle IP routing ioctl calls. These are used to manipulate the routing tables 481 * Handle IP routing ioctl calls.
482 * These are used to manipulate the routing tables
465 */ 483 */
466
467int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) 484int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
468{ 485{
469 struct fib_config cfg; 486 struct fib_config cfg;
@@ -507,7 +524,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
507 return -EINVAL; 524 return -EINVAL;
508} 525}
509 526
510const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = { 527const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
511 [RTA_DST] = { .type = NLA_U32 }, 528 [RTA_DST] = { .type = NLA_U32 },
512 [RTA_SRC] = { .type = NLA_U32 }, 529 [RTA_SRC] = { .type = NLA_U32 },
513 [RTA_IIF] = { .type = NLA_U32 }, 530 [RTA_IIF] = { .type = NLA_U32 },
@@ -521,7 +538,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
521}; 538};
522 539
523static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, 540static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
524 struct nlmsghdr *nlh, struct fib_config *cfg) 541 struct nlmsghdr *nlh, struct fib_config *cfg)
525{ 542{
526 struct nlattr *attr; 543 struct nlattr *attr;
527 int err, remaining; 544 int err, remaining;
@@ -676,12 +693,11 @@ out:
676} 693}
677 694
678/* Prepare and feed intra-kernel routing request. 695/* Prepare and feed intra-kernel routing request.
679 Really, it should be netlink message, but :-( netlink 696 * Really, it should be netlink message, but :-( netlink
680 can be not configured, so that we feed it directly 697 * can be not configured, so that we feed it directly
681 to fib engine. It is legal, because all events occur 698 * to fib engine. It is legal, because all events occur
682 only when netlink is already locked. 699 * only when netlink is already locked.
683 */ 700 */
684
685static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) 701static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
686{ 702{
687 struct net *net = dev_net(ifa->ifa_dev->dev); 703 struct net *net = dev_net(ifa->ifa_dev->dev);
@@ -727,9 +743,9 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
727 struct in_ifaddr *prim = ifa; 743 struct in_ifaddr *prim = ifa;
728 __be32 mask = ifa->ifa_mask; 744 __be32 mask = ifa->ifa_mask;
729 __be32 addr = ifa->ifa_local; 745 __be32 addr = ifa->ifa_local;
730 __be32 prefix = ifa->ifa_address&mask; 746 __be32 prefix = ifa->ifa_address & mask;
731 747
732 if (ifa->ifa_flags&IFA_F_SECONDARY) { 748 if (ifa->ifa_flags & IFA_F_SECONDARY) {
733 prim = inet_ifa_byprefix(in_dev, prefix, mask); 749 prim = inet_ifa_byprefix(in_dev, prefix, mask);
734 if (prim == NULL) { 750 if (prim == NULL) {
735 printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n"); 751 printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
@@ -739,22 +755,24 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
739 755
740 fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); 756 fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
741 757
742 if (!(dev->flags&IFF_UP)) 758 if (!(dev->flags & IFF_UP))
743 return; 759 return;
744 760
745 /* Add broadcast address, if it is explicitly assigned. */ 761 /* Add broadcast address, if it is explicitly assigned. */
746 if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) 762 if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
747 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); 763 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
748 764
749 if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) && 765 if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
750 (prefix != addr || ifa->ifa_prefixlen < 32)) { 766 (prefix != addr || ifa->ifa_prefixlen < 32)) {
751 fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : 767 fib_magic(RTM_NEWROUTE,
752 RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim); 768 dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
769 prefix, ifa->ifa_prefixlen, prim);
753 770
754 /* Add network specific broadcasts, when it takes a sense */ 771 /* Add network specific broadcasts, when it takes a sense */
755 if (ifa->ifa_prefixlen < 31) { 772 if (ifa->ifa_prefixlen < 31) {
756 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim); 773 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
757 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim); 774 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
775 32, prim);
758 } 776 }
759 } 777 }
760} 778}
@@ -765,17 +783,18 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
765 struct net_device *dev = in_dev->dev; 783 struct net_device *dev = in_dev->dev;
766 struct in_ifaddr *ifa1; 784 struct in_ifaddr *ifa1;
767 struct in_ifaddr *prim = ifa; 785 struct in_ifaddr *prim = ifa;
768 __be32 brd = ifa->ifa_address|~ifa->ifa_mask; 786 __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
769 __be32 any = ifa->ifa_address&ifa->ifa_mask; 787 __be32 any = ifa->ifa_address & ifa->ifa_mask;
770#define LOCAL_OK 1 788#define LOCAL_OK 1
771#define BRD_OK 2 789#define BRD_OK 2
772#define BRD0_OK 4 790#define BRD0_OK 4
773#define BRD1_OK 8 791#define BRD1_OK 8
774 unsigned ok = 0; 792 unsigned ok = 0;
775 793
776 if (!(ifa->ifa_flags&IFA_F_SECONDARY)) 794 if (!(ifa->ifa_flags & IFA_F_SECONDARY))
777 fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : 795 fib_magic(RTM_DELROUTE,
778 RTN_UNICAST, any, ifa->ifa_prefixlen, prim); 796 dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
797 any, ifa->ifa_prefixlen, prim);
779 else { 798 else {
780 prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); 799 prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
781 if (prim == NULL) { 800 if (prim == NULL) {
@@ -785,9 +804,9 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
785 } 804 }
786 805
787 /* Deletion is more complicated than add. 806 /* Deletion is more complicated than add.
788 We should take care of not to delete too much :-) 807 * We should take care of not to delete too much :-)
789 808 *
790 Scan address list to be sure that addresses are really gone. 809 * Scan address list to be sure that addresses are really gone.
791 */ 810 */
792 811
793 for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { 812 for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
@@ -801,23 +820,23 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
801 ok |= BRD0_OK; 820 ok |= BRD0_OK;
802 } 821 }
803 822
804 if (!(ok&BRD_OK)) 823 if (!(ok & BRD_OK))
805 fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); 824 fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
806 if (!(ok&BRD1_OK)) 825 if (!(ok & BRD1_OK))
807 fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); 826 fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
808 if (!(ok&BRD0_OK)) 827 if (!(ok & BRD0_OK))
809 fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); 828 fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
810 if (!(ok&LOCAL_OK)) { 829 if (!(ok & LOCAL_OK)) {
811 fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); 830 fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
812 831
813 /* Check, that this local address finally disappeared. */ 832 /* Check, that this local address finally disappeared. */
814 if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { 833 if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
815 /* And the last, but not the least thing. 834 /* And the last, but not the least thing.
816 We must flush stray FIB entries. 835 * We must flush stray FIB entries.
817 836 *
818 First of all, we scan fib_info list searching 837 * First of all, we scan fib_info list searching
819 for stray nexthop entries, then ignite fib_flush. 838 * for stray nexthop entries, then ignite fib_flush.
820 */ 839 */
821 if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local)) 840 if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
822 fib_flush(dev_net(dev)); 841 fib_flush(dev_net(dev));
823 } 842 }
@@ -828,14 +847,20 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
828#undef BRD1_OK 847#undef BRD1_OK
829} 848}
830 849
831static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb ) 850static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
832{ 851{
833 852
834 struct fib_result res; 853 struct fib_result res;
835 struct flowi fl = { .mark = frn->fl_mark, 854 struct flowi fl = {
836 .nl_u = { .ip4_u = { .daddr = frn->fl_addr, 855 .mark = frn->fl_mark,
837 .tos = frn->fl_tos, 856 .nl_u = {
838 .scope = frn->fl_scope } } }; 857 .ip4_u = {
858 .daddr = frn->fl_addr,
859 .tos = frn->fl_tos,
860 .scope = frn->fl_scope
861 }
862 }
863 };
839 864
840#ifdef CONFIG_IP_MULTIPLE_TABLES 865#ifdef CONFIG_IP_MULTIPLE_TABLES
841 res.r = NULL; 866 res.r = NULL;
@@ -846,15 +871,16 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
846 local_bh_disable(); 871 local_bh_disable();
847 872
848 frn->tb_id = tb->tb_id; 873 frn->tb_id = tb->tb_id;
849 frn->err = fib_table_lookup(tb, &fl, &res); 874 rcu_read_lock();
875 frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF);
850 876
851 if (!frn->err) { 877 if (!frn->err) {
852 frn->prefixlen = res.prefixlen; 878 frn->prefixlen = res.prefixlen;
853 frn->nh_sel = res.nh_sel; 879 frn->nh_sel = res.nh_sel;
854 frn->type = res.type; 880 frn->type = res.type;
855 frn->scope = res.scope; 881 frn->scope = res.scope;
856 fib_res_put(&res);
857 } 882 }
883 rcu_read_unlock();
858 local_bh_enable(); 884 local_bh_enable();
859 } 885 }
860} 886}
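
The hunk above converts nl_fib_lookup() to a referenceless lookup: fib_table_lookup() is called with FIB_LOOKUP_NOREF under rcu_read_lock(), the fields needed for the reply are copied out before rcu_read_unlock(), and the old fib_res_put() disappears. The sketch below only models that "copy out before dropping the read side" discipline with a plain pointer; rcu_dereference() and the real locking rules are simplified away.

    #include <stdio.h>

    struct result { int prefixlen, nh_sel, type, scope; };
    struct reply  { int prefixlen, nh_sel, type, scope; };

    /* Pretend this pointer is only valid while the "read side" is held. */
    static struct result *lookup(void)
    {
        static struct result r = { 24, 0, 1, 0 };
        return &r;
    }

    static void fill_reply(struct reply *frn)
    {
        /* rcu_read_lock() in the kernel */
        struct result *res = lookup();

        /* Copy everything needed while the result is still protected;
         * it must not be dereferenced after the read side is dropped. */
        frn->prefixlen = res->prefixlen;
        frn->nh_sel    = res->nh_sel;
        frn->type      = res->type;
        frn->scope     = res->scope;
        /* rcu_read_unlock() in the kernel */
    }

    int main(void)
    {
        struct reply frn;
        fill_reply(&frn);
        printf("prefixlen=%d type=%d\n", frn.prefixlen, frn.type);
        return 0;
    }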
@@ -883,8 +909,8 @@ static void nl_fib_input(struct sk_buff *skb)
883 909
884 nl_fib_lookup(frn, tb); 910 nl_fib_lookup(frn, tb);
885 911
886 pid = NETLINK_CB(skb).pid; /* pid of sending process */ 912 pid = NETLINK_CB(skb).pid; /* pid of sending process */
887 NETLINK_CB(skb).pid = 0; /* from kernel */ 913 NETLINK_CB(skb).pid = 0; /* from kernel */
888 NETLINK_CB(skb).dst_group = 0; /* unicast */ 914 NETLINK_CB(skb).dst_group = 0; /* unicast */
889 netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT); 915 netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
890} 916}
@@ -931,7 +957,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
931 fib_del_ifaddr(ifa); 957 fib_del_ifaddr(ifa);
932 if (ifa->ifa_dev->ifa_list == NULL) { 958 if (ifa->ifa_dev->ifa_list == NULL) {
933 /* Last address was deleted from this interface. 959 /* Last address was deleted from this interface.
934 Disable IP. 960 * Disable IP.
935 */ 961 */
936 fib_disable_ip(dev, 1, 0); 962 fib_disable_ip(dev, 1, 0);
937 } else { 963 } else {
@@ -990,16 +1016,15 @@ static struct notifier_block fib_netdev_notifier = {
990static int __net_init ip_fib_net_init(struct net *net) 1016static int __net_init ip_fib_net_init(struct net *net)
991{ 1017{
992 int err; 1018 int err;
993 unsigned int i; 1019 size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
994 1020
995 net->ipv4.fib_table_hash = kzalloc( 1021 /* Avoid false sharing : Use at least a full cache line */
996 sizeof(struct hlist_head)*FIB_TABLE_HASHSZ, GFP_KERNEL); 1022 size = max_t(size_t, size, L1_CACHE_BYTES);
1023
1024 net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
997 if (net->ipv4.fib_table_hash == NULL) 1025 if (net->ipv4.fib_table_hash == NULL)
998 return -ENOMEM; 1026 return -ENOMEM;
999 1027
1000 for (i = 0; i < FIB_TABLE_HASHSZ; i++)
1001 INIT_HLIST_HEAD(&net->ipv4.fib_table_hash[i]);
1002
1003 err = fib4_rules_init(net); 1028 err = fib4_rules_init(net);
1004 if (err < 0) 1029 if (err < 0)
1005 goto fail; 1030 goto fail;
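
The ip_fib_net_init() hunk above rounds the table-hash allocation up to at least one cache line to avoid false sharing, and drops the explicit INIT_HLIST_HEAD() loop because zeroed memory already reads as empty hlist heads (an empty head is an all-NULL pointer). A userspace approximation, assuming a 64-byte cache line where the kernel would use L1_CACHE_BYTES:

    #include <stdio.h>
    #include <stdlib.h>

    #define CACHE_LINE   64          /* stand-in for L1_CACHE_BYTES   */
    #define TABLE_HASHSZ 2           /* stand-in for FIB_TABLE_HASHSZ */

    struct hlist_head { void *first; };   /* empty list == NULL first */

    int main(void)
    {
        size_t size = sizeof(struct hlist_head) * TABLE_HASHSZ;

        if (size < CACHE_LINE)       /* max_t(size_t, size, L1_CACHE_BYTES) */
            size = CACHE_LINE;

        /* calloc() zero-fills, so every bucket is already an empty list. */
        struct hlist_head *hash = calloc(1, size);
        if (!hash)
            return 1;

        printf("allocated %zu bytes, bucket 0 empty: %d\n",
               size, hash->first == NULL);
        free(hash);
        return 0;
    }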
@@ -1027,7 +1052,7 @@ static void ip_fib_net_exit(struct net *net)
1027 hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) { 1052 hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
1028 hlist_del(node); 1053 hlist_del(node);
1029 fib_table_flush(tb); 1054 fib_table_flush(tb);
1030 kfree(tb); 1055 fib_free_table(tb);
1031 } 1056 }
1032 } 1057 }
1033 kfree(net->ipv4.fib_table_hash); 1058 kfree(net->ipv4.fib_table_hash);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 4ed7e0dea1bc..b3acb0417b21 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -54,36 +54,37 @@ struct fib_node {
54 struct fib_alias fn_embedded_alias; 54 struct fib_alias fn_embedded_alias;
55}; 55};
56 56
57struct fn_zone { 57#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
58 struct fn_zone *fz_next; /* Next not empty zone */
59 struct hlist_head *fz_hash; /* Hash table pointer */
60 int fz_nent; /* Number of entries */
61 58
62 int fz_divisor; /* Hash divisor */ 59struct fn_zone {
60 struct fn_zone __rcu *fz_next; /* Next not empty zone */
61 struct hlist_head __rcu *fz_hash; /* Hash table pointer */
62 seqlock_t fz_lock;
63 u32 fz_hashmask; /* (fz_divisor - 1) */ 63 u32 fz_hashmask; /* (fz_divisor - 1) */
64#define FZ_HASHMASK(fz) ((fz)->fz_hashmask)
65 64
66 int fz_order; /* Zone order */ 65 u8 fz_order; /* Zone order (0..32) */
67 __be32 fz_mask; 66 u8 fz_revorder; /* 32 - fz_order */
67 __be32 fz_mask; /* inet_make_mask(order) */
68#define FZ_MASK(fz) ((fz)->fz_mask) 68#define FZ_MASK(fz) ((fz)->fz_mask)
69};
70 69
71/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask 70 struct hlist_head fz_embedded_hash[EMBEDDED_HASH_SIZE];
72 * can be cheaper than memory lookup, so that FZ_* macros are used. 71
73 */ 72 int fz_nent; /* Number of entries */
73 int fz_divisor; /* Hash size (mask+1) */
74};
74 75
75struct fn_hash { 76struct fn_hash {
76 struct fn_zone *fn_zones[33]; 77 struct fn_zone *fn_zones[33];
77 struct fn_zone *fn_zone_list; 78 struct fn_zone __rcu *fn_zone_list;
78}; 79};
79 80
80static inline u32 fn_hash(__be32 key, struct fn_zone *fz) 81static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
81{ 82{
82 u32 h = ntohl(key)>>(32 - fz->fz_order); 83 u32 h = ntohl(key) >> fz->fz_revorder;
83 h ^= (h>>20); 84 h ^= (h>>20);
84 h ^= (h>>10); 85 h ^= (h>>10);
85 h ^= (h>>5); 86 h ^= (h>>5);
86 h &= FZ_HASHMASK(fz); 87 h &= fz->fz_hashmask;
87 return h; 88 return h;
88} 89}
89 90
@@ -92,7 +93,6 @@ static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
92 return dst & FZ_MASK(fz); 93 return dst & FZ_MASK(fz);
93} 94}
94 95
95static DEFINE_RWLOCK(fib_hash_lock);
96static unsigned int fib_hash_genid; 96static unsigned int fib_hash_genid;
97 97
98#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head)) 98#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
@@ -101,12 +101,11 @@ static struct hlist_head *fz_hash_alloc(int divisor)
101{ 101{
102 unsigned long size = divisor * sizeof(struct hlist_head); 102 unsigned long size = divisor * sizeof(struct hlist_head);
103 103
104 if (size <= PAGE_SIZE) { 104 if (size <= PAGE_SIZE)
105 return kzalloc(size, GFP_KERNEL); 105 return kzalloc(size, GFP_KERNEL);
106 } else { 106
107 return (struct hlist_head *) 107 return (struct hlist_head *)
108 __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size)); 108 __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
109 }
110} 109}
111 110
112/* The fib hash lock must be held when this is called. */ 111/* The fib hash lock must be held when this is called. */
@@ -123,10 +122,11 @@ static inline void fn_rebuild_zone(struct fn_zone *fz,
123 hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) { 122 hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
124 struct hlist_head *new_head; 123 struct hlist_head *new_head;
125 124
126 hlist_del(&f->fn_hash); 125 hlist_del_rcu(&f->fn_hash);
127 126
128 new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; 127 new_head = rcu_dereference_protected(fz->fz_hash, 1) +
129 hlist_add_head(&f->fn_hash, new_head); 128 fn_hash(f->fn_key, fz);
129 hlist_add_head_rcu(&f->fn_hash, new_head);
130 } 130 }
131 } 131 }
132} 132}
@@ -147,14 +147,14 @@ static void fn_rehash_zone(struct fn_zone *fz)
147 int old_divisor, new_divisor; 147 int old_divisor, new_divisor;
148 u32 new_hashmask; 148 u32 new_hashmask;
149 149
150 old_divisor = fz->fz_divisor; 150 new_divisor = old_divisor = fz->fz_divisor;
151 151
152 switch (old_divisor) { 152 switch (old_divisor) {
153 case 16: 153 case EMBEDDED_HASH_SIZE:
154 new_divisor = 256; 154 new_divisor *= EMBEDDED_HASH_SIZE;
155 break; 155 break;
156 case 256: 156 case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE:
157 new_divisor = 1024; 157 new_divisor *= (EMBEDDED_HASH_SIZE/2);
158 break; 158 break;
159 default: 159 default:
160 if ((old_divisor << 1) > FZ_MAX_DIVISOR) { 160 if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
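
The rehash schedule above is now written in terms of EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head)), i.e. the number of buckets that fit in the heads embedded in struct fn_zone. The sketch below just prints the resulting divisor progression; it assumes a 64-byte cache line and 8-byte pointers, so EMBEDDED_HASH_SIZE comes out as 8, and other configurations give a different but analogous sequence.

    #include <stdio.h>

    #define CACHE_LINE         64
    #define PTR_SIZE           8
    #define EMBEDDED_HASH_SIZE (CACHE_LINE / PTR_SIZE)   /* 8 here           */
    #define FZ_MAX_DIVISOR     (1 << 20)                 /* illustrative cap */

    /* Mirror the switch in fn_rehash_zone(): two tuned first steps,
     * then plain doubling until the cap would be exceeded. */
    static int next_divisor(int old)
    {
        if (old == EMBEDDED_HASH_SIZE)
            return old * EMBEDDED_HASH_SIZE;
        if (old == EMBEDDED_HASH_SIZE * EMBEDDED_HASH_SIZE)
            return old * (EMBEDDED_HASH_SIZE / 2);
        if ((old << 1) > FZ_MAX_DIVISOR)
            return old;                                  /* stop growing */
        return old << 1;
    }

    int main(void)
    {
        for (int d = EMBEDDED_HASH_SIZE; d < 4096; d = next_divisor(d))
            printf("%d ", d);      /* prints: 8 64 256 512 1024 2048 */
        printf("\n");
        return 0;
    }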
@@ -175,31 +175,55 @@ static void fn_rehash_zone(struct fn_zone *fz)
175 ht = fz_hash_alloc(new_divisor); 175 ht = fz_hash_alloc(new_divisor);
176 176
177 if (ht) { 177 if (ht) {
178 write_lock_bh(&fib_hash_lock); 178 struct fn_zone nfz;
179 old_ht = fz->fz_hash; 179
180 fz->fz_hash = ht; 180 memcpy(&nfz, fz, sizeof(nfz));
181
182 write_seqlock_bh(&fz->fz_lock);
183 old_ht = rcu_dereference_protected(fz->fz_hash, 1);
184 RCU_INIT_POINTER(nfz.fz_hash, ht);
185 nfz.fz_hashmask = new_hashmask;
186 nfz.fz_divisor = new_divisor;
187 fn_rebuild_zone(&nfz, old_ht, old_divisor);
188 fib_hash_genid++;
189 rcu_assign_pointer(fz->fz_hash, ht);
181 fz->fz_hashmask = new_hashmask; 190 fz->fz_hashmask = new_hashmask;
182 fz->fz_divisor = new_divisor; 191 fz->fz_divisor = new_divisor;
183 fn_rebuild_zone(fz, old_ht, old_divisor); 192 write_sequnlock_bh(&fz->fz_lock);
184 fib_hash_genid++;
185 write_unlock_bh(&fib_hash_lock);
186 193
187 fz_hash_free(old_ht, old_divisor); 194 if (old_ht != fz->fz_embedded_hash) {
195 synchronize_rcu();
196 fz_hash_free(old_ht, old_divisor);
197 }
188 } 198 }
189} 199}
190 200
191static inline void fn_free_node(struct fib_node * f) 201static void fn_free_node_rcu(struct rcu_head *head)
192{ 202{
203 struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu);
204
193 kmem_cache_free(fn_hash_kmem, f); 205 kmem_cache_free(fn_hash_kmem, f);
194} 206}
195 207
208static inline void fn_free_node(struct fib_node *f)
209{
210 call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu);
211}
212
213static void fn_free_alias_rcu(struct rcu_head *head)
214{
215 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
216
217 kmem_cache_free(fn_alias_kmem, fa);
218}
219
196static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f) 220static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
197{ 221{
198 fib_release_info(fa->fa_info); 222 fib_release_info(fa->fa_info);
199 if (fa == &f->fn_embedded_alias) 223 if (fa == &f->fn_embedded_alias)
200 fa->fa_info = NULL; 224 fa->fa_info = NULL;
201 else 225 else
202 kmem_cache_free(fn_alias_kmem, fa); 226 call_rcu(&fa->rcu, fn_free_alias_rcu);
203} 227}
204 228
205static struct fn_zone * 229static struct fn_zone *
@@ -210,68 +234,71 @@ fn_new_zone(struct fn_hash *table, int z)
210 if (!fz) 234 if (!fz)
211 return NULL; 235 return NULL;
212 236
213 if (z) { 237 seqlock_init(&fz->fz_lock);
214 fz->fz_divisor = 16; 238 fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
215 } else { 239 fz->fz_hashmask = fz->fz_divisor - 1;
216 fz->fz_divisor = 1; 240 RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash);
217 }
218 fz->fz_hashmask = (fz->fz_divisor - 1);
219 fz->fz_hash = fz_hash_alloc(fz->fz_divisor);
220 if (!fz->fz_hash) {
221 kfree(fz);
222 return NULL;
223 }
224 fz->fz_order = z; 241 fz->fz_order = z;
242 fz->fz_revorder = 32 - z;
225 fz->fz_mask = inet_make_mask(z); 243 fz->fz_mask = inet_make_mask(z);
226 244
227 /* Find the first not empty zone with more specific mask */ 245 /* Find the first not empty zone with more specific mask */
228 for (i=z+1; i<=32; i++) 246 for (i = z + 1; i <= 32; i++)
229 if (table->fn_zones[i]) 247 if (table->fn_zones[i])
230 break; 248 break;
231 write_lock_bh(&fib_hash_lock); 249 if (i > 32) {
232 if (i>32) {
233 /* No more specific masks, we are the first. */ 250 /* No more specific masks, we are the first. */
234 fz->fz_next = table->fn_zone_list; 251 rcu_assign_pointer(fz->fz_next,
235 table->fn_zone_list = fz; 252 rtnl_dereference(table->fn_zone_list));
253 rcu_assign_pointer(table->fn_zone_list, fz);
236 } else { 254 } else {
237 fz->fz_next = table->fn_zones[i]->fz_next; 255 rcu_assign_pointer(fz->fz_next,
238 table->fn_zones[i]->fz_next = fz; 256 rtnl_dereference(table->fn_zones[i]->fz_next));
257 rcu_assign_pointer(table->fn_zones[i]->fz_next, fz);
239 } 258 }
240 table->fn_zones[z] = fz; 259 table->fn_zones[z] = fz;
241 fib_hash_genid++; 260 fib_hash_genid++;
242 write_unlock_bh(&fib_hash_lock);
243 return fz; 261 return fz;
244} 262}
245 263
246int fib_table_lookup(struct fib_table *tb, 264int fib_table_lookup(struct fib_table *tb,
247 const struct flowi *flp, struct fib_result *res) 265 const struct flowi *flp, struct fib_result *res,
266 int fib_flags)
248{ 267{
249 int err; 268 int err;
250 struct fn_zone *fz; 269 struct fn_zone *fz;
251 struct fn_hash *t = (struct fn_hash *)tb->tb_data; 270 struct fn_hash *t = (struct fn_hash *)tb->tb_data;
252 271
253 read_lock(&fib_hash_lock); 272 rcu_read_lock();
254 for (fz = t->fn_zone_list; fz; fz = fz->fz_next) { 273 for (fz = rcu_dereference(t->fn_zone_list);
274 fz != NULL;
275 fz = rcu_dereference(fz->fz_next)) {
255 struct hlist_head *head; 276 struct hlist_head *head;
256 struct hlist_node *node; 277 struct hlist_node *node;
257 struct fib_node *f; 278 struct fib_node *f;
258 __be32 k = fz_key(flp->fl4_dst, fz); 279 __be32 k;
280 unsigned int seq;
259 281
260 head = &fz->fz_hash[fn_hash(k, fz)]; 282 do {
261 hlist_for_each_entry(f, node, head, fn_hash) { 283 seq = read_seqbegin(&fz->fz_lock);
262 if (f->fn_key != k) 284 k = fz_key(flp->fl4_dst, fz);
263 continue; 285
286 head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz);
287 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
288 if (f->fn_key != k)
289 continue;
264 290
265 err = fib_semantic_match(&f->fn_alias, 291 err = fib_semantic_match(&f->fn_alias,
266 flp, res, 292 flp, res,
267 fz->fz_order); 293 fz->fz_order, fib_flags);
268 if (err <= 0) 294 if (err <= 0)
269 goto out; 295 goto out;
270 } 296 }
297 } while (read_seqretry(&fz->fz_lock, seq));
271 } 298 }
272 err = 1; 299 err = 1;
273out: 300out:
274 read_unlock(&fib_hash_lock); 301 rcu_read_unlock();
275 return err; 302 return err;
276} 303}
277 304
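
fib_table_lookup() above now walks the zone hash under rcu_read_lock() and uses the zone's seqlock purely to detect a concurrent rehash: if the sequence changed while the bucket was scanned, the whole zone is retried. Below is a minimal single-threaded model of the read_seqbegin()/read_seqretry() pattern; the writer bumps the counter to odd before the update and back to even afterwards, and a real implementation additionally needs memory barriers, which are omitted here.

    #include <stdio.h>

    static unsigned seq;      /* even: stable, odd: writer in progress */
    static int data;          /* the state a rehash would replace      */

    static unsigned read_begin(void)  { return seq; }
    static int read_retry(unsigned s) { return (s & 1) || s != seq; }

    static void write_update(int v)
    {
        seq++;        /* odd: readers that sample now will retry */
        data = v;
        seq++;        /* even again: the new state is visible    */
    }

    static int read_data(void)
    {
        unsigned s;
        int v;

        do {
            s = read_begin();
            v = data;             /* scan the bucket here */
        } while (read_retry(s));
        return v;
    }

    int main(void)
    {
        write_update(42);
        printf("%d\n", read_data());   /* 42 */
        return 0;
    }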
@@ -285,6 +312,7 @@ void fib_table_select_default(struct fib_table *tb,
285 struct fib_info *last_resort; 312 struct fib_info *last_resort;
286 struct fn_hash *t = (struct fn_hash *)tb->tb_data; 313 struct fn_hash *t = (struct fn_hash *)tb->tb_data;
287 struct fn_zone *fz = t->fn_zones[0]; 314 struct fn_zone *fz = t->fn_zones[0];
315 struct hlist_head *head;
288 316
289 if (fz == NULL) 317 if (fz == NULL)
290 return; 318 return;
@@ -293,11 +321,12 @@ void fib_table_select_default(struct fib_table *tb,
293 last_resort = NULL; 321 last_resort = NULL;
294 order = -1; 322 order = -1;
295 323
296 read_lock(&fib_hash_lock); 324 rcu_read_lock();
297 hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) { 325 head = rcu_dereference(fz->fz_hash);
326 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
298 struct fib_alias *fa; 327 struct fib_alias *fa;
299 328
300 list_for_each_entry(fa, &f->fn_alias, fa_list) { 329 list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
301 struct fib_info *next_fi = fa->fa_info; 330 struct fib_info *next_fi = fa->fa_info;
302 331
303 if (fa->fa_scope != res->scope || 332 if (fa->fa_scope != res->scope ||
@@ -309,7 +338,8 @@ void fib_table_select_default(struct fib_table *tb,
309 if (!next_fi->fib_nh[0].nh_gw || 338 if (!next_fi->fib_nh[0].nh_gw ||
310 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) 339 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
311 continue; 340 continue;
312 fa->fa_state |= FA_S_ACCESSED; 341
342 fib_alias_accessed(fa);
313 343
314 if (fi == NULL) { 344 if (fi == NULL) {
315 if (next_fi != res->fi) 345 if (next_fi != res->fi)
@@ -341,25 +371,25 @@ void fib_table_select_default(struct fib_table *tb,
341 fib_result_assign(res, last_resort); 371 fib_result_assign(res, last_resort);
342 tb->tb_default = last_idx; 372 tb->tb_default = last_idx;
343out: 373out:
344 read_unlock(&fib_hash_lock); 374 rcu_read_unlock();
345} 375}
346 376
347/* Insert node F to FZ. */ 377/* Insert node F to FZ. */
348static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f) 378static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
349{ 379{
350 struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; 380 struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz);
351 381
352 hlist_add_head(&f->fn_hash, head); 382 hlist_add_head_rcu(&f->fn_hash, head);
353} 383}
354 384
355/* Return the node in FZ matching KEY. */ 385/* Return the node in FZ matching KEY. */
356static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) 386static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
357{ 387{
358 struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)]; 388 struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz);
359 struct hlist_node *node; 389 struct hlist_node *node;
360 struct fib_node *f; 390 struct fib_node *f;
361 391
362 hlist_for_each_entry(f, node, head, fn_hash) { 392 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
363 if (f->fn_key == key) 393 if (f->fn_key == key)
364 return f; 394 return f;
365 } 395 }
@@ -367,6 +397,17 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
367 return NULL; 397 return NULL;
368} 398}
369 399
400
401static struct fib_alias *fib_fast_alloc(struct fib_node *f)
402{
403 struct fib_alias *fa = &f->fn_embedded_alias;
404
405 if (fa->fa_info != NULL)
406 fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
407 return fa;
408}
409
410/* Caller must hold RTNL. */
370int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) 411int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
371{ 412{
372 struct fn_hash *table = (struct fn_hash *) tb->tb_data; 413 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
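
fib_fast_alloc() above centralises the choice that fib_table_insert() previously open-coded: reuse the alias embedded in the fib_node if it is still free, otherwise fall back to the slab cache. A userspace analogue, with malloc() standing in for kmem_cache_alloc():

    #include <stdio.h>
    #include <stdlib.h>

    struct alias { void *info; };                 /* info != NULL => slot in use */
    struct node  { struct alias embedded; };

    static struct alias *fast_alloc(struct node *n)
    {
        struct alias *a = &n->embedded;

        if (a->info != NULL)                  /* embedded slot already taken */
            a = malloc(sizeof(*a));           /* kmem_cache_alloc() stand-in */
        return a;
    }

    int main(void)
    {
        struct node n = { .embedded = { .info = NULL } };

        struct alias *first  = fast_alloc(&n);   /* gets the embedded slot */
        first->info = &n;                        /* mark it used           */
        struct alias *second = fast_alloc(&n);   /* falls back to the heap */

        printf("first embedded: %d, second embedded: %d\n",
               first == &n.embedded, second == &n.embedded);
        if (second != &n.embedded)
            free(second);
        return 0;
    }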
@@ -451,7 +492,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
451 } 492 }
452 493
453 if (cfg->fc_nlflags & NLM_F_REPLACE) { 494 if (cfg->fc_nlflags & NLM_F_REPLACE) {
454 struct fib_info *fi_drop;
455 u8 state; 495 u8 state;
456 496
457 fa = fa_first; 497 fa = fa_first;
@@ -460,21 +500,25 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
460 err = 0; 500 err = 0;
461 goto out; 501 goto out;
462 } 502 }
463 write_lock_bh(&fib_hash_lock); 503 err = -ENOBUFS;
464 fi_drop = fa->fa_info; 504 new_fa = fib_fast_alloc(f);
465 fa->fa_info = fi; 505 if (new_fa == NULL)
466 fa->fa_type = cfg->fc_type; 506 goto out;
467 fa->fa_scope = cfg->fc_scope; 507
508 new_fa->fa_tos = fa->fa_tos;
509 new_fa->fa_info = fi;
510 new_fa->fa_type = cfg->fc_type;
511 new_fa->fa_scope = cfg->fc_scope;
468 state = fa->fa_state; 512 state = fa->fa_state;
469 fa->fa_state &= ~FA_S_ACCESSED; 513 new_fa->fa_state = state & ~FA_S_ACCESSED;
470 fib_hash_genid++; 514 fib_hash_genid++;
471 write_unlock_bh(&fib_hash_lock); 515 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
472 516
473 fib_release_info(fi_drop); 517 fn_free_alias(fa, f);
474 if (state & FA_S_ACCESSED) 518 if (state & FA_S_ACCESSED)
475 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); 519 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
476 rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id, 520 rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
477 &cfg->fc_nlinfo, NLM_F_REPLACE); 521 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
478 return 0; 522 return 0;
479 } 523 }
480 524
@@ -506,12 +550,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
506 f = new_f; 550 f = new_f;
507 } 551 }
508 552
509 new_fa = &f->fn_embedded_alias; 553 new_fa = fib_fast_alloc(f);
510 if (new_fa->fa_info != NULL) { 554 if (new_fa == NULL)
511 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); 555 goto out;
512 if (new_fa == NULL) 556
513 goto out;
514 }
515 new_fa->fa_info = fi; 557 new_fa->fa_info = fi;
516 new_fa->fa_tos = tos; 558 new_fa->fa_tos = tos;
517 new_fa->fa_type = cfg->fc_type; 559 new_fa->fa_type = cfg->fc_type;
@@ -522,13 +564,11 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
522 * Insert new entry to the list. 564 * Insert new entry to the list.
523 */ 565 */
524 566
525 write_lock_bh(&fib_hash_lock);
526 if (new_f) 567 if (new_f)
527 fib_insert_node(fz, new_f); 568 fib_insert_node(fz, new_f);
528 list_add_tail(&new_fa->fa_list, 569 list_add_tail_rcu(&new_fa->fa_list,
529 (fa ? &fa->fa_list : &f->fn_alias)); 570 (fa ? &fa->fa_list : &f->fn_alias));
530 fib_hash_genid++; 571 fib_hash_genid++;
531 write_unlock_bh(&fib_hash_lock);
532 572
533 if (new_f) 573 if (new_f)
534 fz->fz_nent++; 574 fz->fz_nent++;
@@ -603,14 +643,12 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
603 tb->tb_id, &cfg->fc_nlinfo, 0); 643 tb->tb_id, &cfg->fc_nlinfo, 0);
604 644
605 kill_fn = 0; 645 kill_fn = 0;
606 write_lock_bh(&fib_hash_lock); 646 list_del_rcu(&fa->fa_list);
607 list_del(&fa->fa_list);
608 if (list_empty(&f->fn_alias)) { 647 if (list_empty(&f->fn_alias)) {
609 hlist_del(&f->fn_hash); 648 hlist_del_rcu(&f->fn_hash);
610 kill_fn = 1; 649 kill_fn = 1;
611 } 650 }
612 fib_hash_genid++; 651 fib_hash_genid++;
613 write_unlock_bh(&fib_hash_lock);
614 652
615 if (fa->fa_state & FA_S_ACCESSED) 653 if (fa->fa_state & FA_S_ACCESSED)
616 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); 654 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
@@ -627,7 +665,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
627 665
628static int fn_flush_list(struct fn_zone *fz, int idx) 666static int fn_flush_list(struct fn_zone *fz, int idx)
629{ 667{
630 struct hlist_head *head = &fz->fz_hash[idx]; 668 struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx;
631 struct hlist_node *node, *n; 669 struct hlist_node *node, *n;
632 struct fib_node *f; 670 struct fib_node *f;
633 int found = 0; 671 int found = 0;
@@ -641,14 +679,12 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
641 struct fib_info *fi = fa->fa_info; 679 struct fib_info *fi = fa->fa_info;
642 680
643 if (fi && (fi->fib_flags&RTNH_F_DEAD)) { 681 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
644 write_lock_bh(&fib_hash_lock); 682 list_del_rcu(&fa->fa_list);
645 list_del(&fa->fa_list);
646 if (list_empty(&f->fn_alias)) { 683 if (list_empty(&f->fn_alias)) {
647 hlist_del(&f->fn_hash); 684 hlist_del_rcu(&f->fn_hash);
648 kill_f = 1; 685 kill_f = 1;
649 } 686 }
650 fib_hash_genid++; 687 fib_hash_genid++;
651 write_unlock_bh(&fib_hash_lock);
652 688
653 fn_free_alias(fa, f); 689 fn_free_alias(fa, f);
654 found++; 690 found++;
@@ -662,13 +698,16 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
662 return found; 698 return found;
663} 699}
664 700
701/* caller must hold RTNL. */
665int fib_table_flush(struct fib_table *tb) 702int fib_table_flush(struct fib_table *tb)
666{ 703{
667 struct fn_hash *table = (struct fn_hash *) tb->tb_data; 704 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
668 struct fn_zone *fz; 705 struct fn_zone *fz;
669 int found = 0; 706 int found = 0;
670 707
671 for (fz = table->fn_zone_list; fz; fz = fz->fz_next) { 708 for (fz = rtnl_dereference(table->fn_zone_list);
709 fz != NULL;
710 fz = rtnl_dereference(fz->fz_next)) {
672 int i; 711 int i;
673 712
674 for (i = fz->fz_divisor - 1; i >= 0; i--) 713 for (i = fz->fz_divisor - 1; i >= 0; i--)
@@ -677,6 +716,24 @@ int fib_table_flush(struct fib_table *tb)
677 return found; 716 return found;
678} 717}
679 718
719void fib_free_table(struct fib_table *tb)
720{
721 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
722 struct fn_zone *fz, *next;
723
724 next = table->fn_zone_list;
725 while (next != NULL) {
726 fz = next;
727 next = fz->fz_next;
728
729 if (fz->fz_hash != fz->fz_embedded_hash)
730 fz_hash_free(fz->fz_hash, fz->fz_divisor);
731
732 kfree(fz);
733 }
734
735 kfree(tb);
736}
680 737
681static inline int 738static inline int
682fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, 739fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
@@ -690,10 +747,10 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
690 747
691 s_i = cb->args[4]; 748 s_i = cb->args[4];
692 i = 0; 749 i = 0;
693 hlist_for_each_entry(f, node, head, fn_hash) { 750 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
694 struct fib_alias *fa; 751 struct fib_alias *fa;
695 752
696 list_for_each_entry(fa, &f->fn_alias, fa_list) { 753 list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
697 if (i < s_i) 754 if (i < s_i)
698 goto next; 755 goto next;
699 756
@@ -711,7 +768,7 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
711 cb->args[4] = i; 768 cb->args[4] = i;
712 return -1; 769 return -1;
713 } 770 }
714 next: 771next:
715 i++; 772 i++;
716 } 773 }
717 } 774 }
@@ -725,14 +782,15 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
725 struct fn_zone *fz) 782 struct fn_zone *fz)
726{ 783{
727 int h, s_h; 784 int h, s_h;
785 struct hlist_head *head = rcu_dereference(fz->fz_hash);
728 786
729 if (fz->fz_hash == NULL) 787 if (head == NULL)
730 return skb->len; 788 return skb->len;
731 s_h = cb->args[3]; 789 s_h = cb->args[3];
732 for (h = s_h; h < fz->fz_divisor; h++) { 790 for (h = s_h; h < fz->fz_divisor; h++) {
733 if (hlist_empty(&fz->fz_hash[h])) 791 if (hlist_empty(head + h))
734 continue; 792 continue;
735 if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h]) < 0) { 793 if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) {
736 cb->args[3] = h; 794 cb->args[3] = h;
737 return -1; 795 return -1;
738 } 796 }
@@ -746,23 +804,26 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
746int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, 804int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
747 struct netlink_callback *cb) 805 struct netlink_callback *cb)
748{ 806{
749 int m, s_m; 807 int m = 0, s_m;
750 struct fn_zone *fz; 808 struct fn_zone *fz;
751 struct fn_hash *table = (struct fn_hash *)tb->tb_data; 809 struct fn_hash *table = (struct fn_hash *)tb->tb_data;
752 810
753 s_m = cb->args[2]; 811 s_m = cb->args[2];
754 read_lock(&fib_hash_lock); 812 rcu_read_lock();
755 for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) { 813 for (fz = rcu_dereference(table->fn_zone_list);
756 if (m < s_m) continue; 814 fz != NULL;
815 fz = rcu_dereference(fz->fz_next), m++) {
816 if (m < s_m)
817 continue;
757 if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { 818 if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
758 cb->args[2] = m; 819 cb->args[2] = m;
759 read_unlock(&fib_hash_lock); 820 rcu_read_unlock();
760 return -1; 821 return -1;
761 } 822 }
762 memset(&cb->args[3], 0, 823 memset(&cb->args[3], 0,
763 sizeof(cb->args) - 3*sizeof(cb->args[0])); 824 sizeof(cb->args) - 3*sizeof(cb->args[0]));
764 } 825 }
765 read_unlock(&fib_hash_lock); 826 rcu_read_unlock();
766 cb->args[2] = m; 827 cb->args[2] = m;
767 return skb->len; 828 return skb->len;
768} 829}
@@ -825,14 +886,15 @@ static struct fib_alias *fib_get_first(struct seq_file *seq)
825 iter->genid = fib_hash_genid; 886 iter->genid = fib_hash_genid;
826 iter->valid = 1; 887 iter->valid = 1;
827 888
828 for (iter->zone = table->fn_zone_list; iter->zone; 889 for (iter->zone = rcu_dereference(table->fn_zone_list);
829 iter->zone = iter->zone->fz_next) { 890 iter->zone != NULL;
891 iter->zone = rcu_dereference(iter->zone->fz_next)) {
830 int maxslot; 892 int maxslot;
831 893
832 if (!iter->zone->fz_nent) 894 if (!iter->zone->fz_nent)
833 continue; 895 continue;
834 896
835 iter->hash_head = iter->zone->fz_hash; 897 iter->hash_head = rcu_dereference(iter->zone->fz_hash);
836 maxslot = iter->zone->fz_divisor; 898 maxslot = iter->zone->fz_divisor;
837 899
838 for (iter->bucket = 0; iter->bucket < maxslot; 900 for (iter->bucket = 0; iter->bucket < maxslot;
@@ -911,13 +973,13 @@ static struct fib_alias *fib_get_next(struct seq_file *seq)
911 } 973 }
912 } 974 }
913 975
914 iter->zone = iter->zone->fz_next; 976 iter->zone = rcu_dereference(iter->zone->fz_next);
915 977
916 if (!iter->zone) 978 if (!iter->zone)
917 goto out; 979 goto out;
918 980
919 iter->bucket = 0; 981 iter->bucket = 0;
920 iter->hash_head = iter->zone->fz_hash; 982 iter->hash_head = rcu_dereference(iter->zone->fz_hash);
921 983
922 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { 984 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
923 list_for_each_entry(fa, &fn->fn_alias, fa_list) { 985 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
@@ -950,11 +1012,11 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
950} 1012}
951 1013
952static void *fib_seq_start(struct seq_file *seq, loff_t *pos) 1014static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
953 __acquires(fib_hash_lock) 1015 __acquires(RCU)
954{ 1016{
955 void *v = NULL; 1017 void *v = NULL;
956 1018
957 read_lock(&fib_hash_lock); 1019 rcu_read_lock();
958 if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN)) 1020 if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
959 v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 1021 v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
960 return v; 1022 return v;
@@ -967,15 +1029,16 @@ static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
967} 1029}
968 1030
969static void fib_seq_stop(struct seq_file *seq, void *v) 1031static void fib_seq_stop(struct seq_file *seq, void *v)
970 __releases(fib_hash_lock) 1032 __releases(RCU)
971{ 1033{
972 read_unlock(&fib_hash_lock); 1034 rcu_read_unlock();
973} 1035}
974 1036
975static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi) 1037static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
976{ 1038{
977 static const unsigned type2flags[RTN_MAX + 1] = { 1039 static const unsigned type2flags[RTN_MAX + 1] = {
978 [7] = RTF_REJECT, [8] = RTF_REJECT, 1040 [7] = RTF_REJECT,
1041 [8] = RTF_REJECT,
979 }; 1042 };
980 unsigned flags = type2flags[type]; 1043 unsigned flags = type2flags[type];
981 1044
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 637b133973bd..a29edf2219c8 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -12,17 +12,22 @@ struct fib_alias {
12 u8 fa_type; 12 u8 fa_type;
13 u8 fa_scope; 13 u8 fa_scope;
14 u8 fa_state; 14 u8 fa_state;
15#ifdef CONFIG_IP_FIB_TRIE
16 struct rcu_head rcu; 15 struct rcu_head rcu;
17#endif
18}; 16};
19 17
20#define FA_S_ACCESSED 0x01 18#define FA_S_ACCESSED 0x01
21 19
20/* Dont write on fa_state unless needed, to keep it shared on all cpus */
21static inline void fib_alias_accessed(struct fib_alias *fa)
22{
23 if (!(fa->fa_state & FA_S_ACCESSED))
24 fa->fa_state |= FA_S_ACCESSED;
25}
26
22/* Exported by fib_semantics.c */ 27/* Exported by fib_semantics.c */
23extern int fib_semantic_match(struct list_head *head, 28extern int fib_semantic_match(struct list_head *head,
24 const struct flowi *flp, 29 const struct flowi *flp,
25 struct fib_result *res, int prefixlen); 30 struct fib_result *res, int prefixlen, int fib_flags);
26extern void fib_release_info(struct fib_info *); 31extern void fib_release_info(struct fib_info *);
27extern struct fib_info *fib_create_info(struct fib_config *cfg); 32extern struct fib_info *fib_create_info(struct fib_config *cfg);
28extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); 33extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
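
fib_alias_accessed() above only writes FA_S_ACCESSED when the bit is not already set, so hot-path lookups do not keep dirtying a cache line that is otherwise read-mostly and shared across CPUs. The pattern itself is trivial:

    #include <stdio.h>

    #define FA_S_ACCESSED 0x01

    /* Skip the store (and the resulting cache-line ownership transfer)
     * when the flag is already set. */
    static void alias_accessed(unsigned char *state)
    {
        if (!(*state & FA_S_ACCESSED))
            *state |= FA_S_ACCESSED;
    }

    int main(void)
    {
        unsigned char state = 0;
        alias_accessed(&state);   /* first hit: one write         */
        alias_accessed(&state);   /* later hits: read-only check  */
        printf("state=%#x\n", state);
        return 0;
    }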
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 76daeb5ff564..7981a24f5c7b 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -6,7 +6,7 @@
6 * IPv4 Forwarding Information Base: policy rules. 6 * IPv4 Forwarding Information Base: policy rules.
7 * 7 *
8 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 8 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 * Thomas Graf <tgraf@suug.ch> 9 * Thomas Graf <tgraf@suug.ch>
10 * 10 *
11 * This program is free software; you can redistribute it and/or 11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License 12 * modify it under the terms of the GNU General Public License
@@ -14,7 +14,7 @@
14 * 2 of the License, or (at your option) any later version. 14 * 2 of the License, or (at your option) any later version.
15 * 15 *
16 * Fixes: 16 * Fixes:
17 * Rani Assaf : local_rule cannot be deleted 17 * Rani Assaf : local_rule cannot be deleted
18 * Marc Boucher : routing by fwmark 18 * Marc Boucher : routing by fwmark
19 */ 19 */
20 20
@@ -32,8 +32,7 @@
32#include <net/ip_fib.h> 32#include <net/ip_fib.h>
33#include <net/fib_rules.h> 33#include <net/fib_rules.h>
34 34
35struct fib4_rule 35struct fib4_rule {
36{
37 struct fib_rule common; 36 struct fib_rule common;
38 u8 dst_len; 37 u8 dst_len;
39 u8 src_len; 38 u8 src_len;
@@ -58,6 +57,7 @@ int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res)
58{ 57{
59 struct fib_lookup_arg arg = { 58 struct fib_lookup_arg arg = {
60 .result = res, 59 .result = res,
60 .flags = FIB_LOOKUP_NOREF,
61 }; 61 };
62 int err; 62 int err;
63 63
@@ -91,10 +91,11 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
91 goto errout; 91 goto errout;
92 } 92 }
93 93
94 if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL) 94 tbl = fib_get_table(rule->fr_net, rule->table);
95 if (!tbl)
95 goto errout; 96 goto errout;
96 97
97 err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result); 98 err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result, arg->flags);
98 if (err > 0) 99 if (err > 0)
99 err = -EAGAIN; 100 err = -EAGAIN;
100errout: 101errout:
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 20f09c5b31e8..3e0da3ef6116 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -60,21 +60,30 @@ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
60 60
61static DEFINE_SPINLOCK(fib_multipath_lock); 61static DEFINE_SPINLOCK(fib_multipath_lock);
62 62
63#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \ 63#define for_nexthops(fi) { \
64for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) 64 int nhsel; const struct fib_nh *nh; \
65 65 for (nhsel = 0, nh = (fi)->fib_nh; \
66#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \ 66 nhsel < (fi)->fib_nhs; \
67for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++) 67 nh++, nhsel++)
68
69#define change_nexthops(fi) { \
70 int nhsel; struct fib_nh *nexthop_nh; \
71 for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
72 nhsel < (fi)->fib_nhs; \
73 nexthop_nh++, nhsel++)
68 74
69#else /* CONFIG_IP_ROUTE_MULTIPATH */ 75#else /* CONFIG_IP_ROUTE_MULTIPATH */
70 76
71/* Hope, that gcc will optimize it to get rid of dummy loop */ 77/* Hope, that gcc will optimize it to get rid of dummy loop */
72 78
73#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \ 79#define for_nexthops(fi) { \
74for (nhsel=0; nhsel < 1; nhsel++) 80 int nhsel; const struct fib_nh *nh = (fi)->fib_nh; \
81 for (nhsel = 0; nhsel < 1; nhsel++)
75 82
76#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ 83#define change_nexthops(fi) { \
77for (nhsel=0; nhsel < 1; nhsel++) 84 int nhsel; \
85 struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
86 for (nhsel = 0; nhsel < 1; nhsel++)
78 87
79#endif /* CONFIG_IP_ROUTE_MULTIPATH */ 88#endif /* CONFIG_IP_ROUTE_MULTIPATH */
80 89
@@ -86,63 +95,70 @@ static const struct
86 int error; 95 int error;
87 u8 scope; 96 u8 scope;
88} fib_props[RTN_MAX + 1] = { 97} fib_props[RTN_MAX + 1] = {
89 { 98 [RTN_UNSPEC] = {
90 .error = 0, 99 .error = 0,
91 .scope = RT_SCOPE_NOWHERE, 100 .scope = RT_SCOPE_NOWHERE,
92 }, /* RTN_UNSPEC */ 101 },
93 { 102 [RTN_UNICAST] = {
94 .error = 0, 103 .error = 0,
95 .scope = RT_SCOPE_UNIVERSE, 104 .scope = RT_SCOPE_UNIVERSE,
96 }, /* RTN_UNICAST */ 105 },
97 { 106 [RTN_LOCAL] = {
98 .error = 0, 107 .error = 0,
99 .scope = RT_SCOPE_HOST, 108 .scope = RT_SCOPE_HOST,
100 }, /* RTN_LOCAL */ 109 },
101 { 110 [RTN_BROADCAST] = {
102 .error = 0, 111 .error = 0,
103 .scope = RT_SCOPE_LINK, 112 .scope = RT_SCOPE_LINK,
104 }, /* RTN_BROADCAST */ 113 },
105 { 114 [RTN_ANYCAST] = {
106 .error = 0, 115 .error = 0,
107 .scope = RT_SCOPE_LINK, 116 .scope = RT_SCOPE_LINK,
108 }, /* RTN_ANYCAST */ 117 },
109 { 118 [RTN_MULTICAST] = {
110 .error = 0, 119 .error = 0,
111 .scope = RT_SCOPE_UNIVERSE, 120 .scope = RT_SCOPE_UNIVERSE,
112 }, /* RTN_MULTICAST */ 121 },
113 { 122 [RTN_BLACKHOLE] = {
114 .error = -EINVAL, 123 .error = -EINVAL,
115 .scope = RT_SCOPE_UNIVERSE, 124 .scope = RT_SCOPE_UNIVERSE,
116 }, /* RTN_BLACKHOLE */ 125 },
117 { 126 [RTN_UNREACHABLE] = {
118 .error = -EHOSTUNREACH, 127 .error = -EHOSTUNREACH,
119 .scope = RT_SCOPE_UNIVERSE, 128 .scope = RT_SCOPE_UNIVERSE,
120 }, /* RTN_UNREACHABLE */ 129 },
121 { 130 [RTN_PROHIBIT] = {
122 .error = -EACCES, 131 .error = -EACCES,
123 .scope = RT_SCOPE_UNIVERSE, 132 .scope = RT_SCOPE_UNIVERSE,
124 }, /* RTN_PROHIBIT */ 133 },
125 { 134 [RTN_THROW] = {
126 .error = -EAGAIN, 135 .error = -EAGAIN,
127 .scope = RT_SCOPE_UNIVERSE, 136 .scope = RT_SCOPE_UNIVERSE,
128 }, /* RTN_THROW */ 137 },
129 { 138 [RTN_NAT] = {
130 .error = -EINVAL, 139 .error = -EINVAL,
131 .scope = RT_SCOPE_NOWHERE, 140 .scope = RT_SCOPE_NOWHERE,
132 }, /* RTN_NAT */ 141 },
133 { 142 [RTN_XRESOLVE] = {
134 .error = -EINVAL, 143 .error = -EINVAL,
135 .scope = RT_SCOPE_NOWHERE, 144 .scope = RT_SCOPE_NOWHERE,
136 }, /* RTN_XRESOLVE */ 145 },
137}; 146};
138 147
139 148
140/* Release a nexthop info record */ 149/* Release a nexthop info record */
141 150
151static void free_fib_info_rcu(struct rcu_head *head)
152{
153 struct fib_info *fi = container_of(head, struct fib_info, rcu);
154
155 kfree(fi);
156}
157
142void free_fib_info(struct fib_info *fi) 158void free_fib_info(struct fib_info *fi)
143{ 159{
144 if (fi->fib_dead == 0) { 160 if (fi->fib_dead == 0) {
145 printk(KERN_WARNING "Freeing alive fib_info %p\n", fi); 161 pr_warning("Freeing alive fib_info %p\n", fi);
146 return; 162 return;
147 } 163 }
148 change_nexthops(fi) { 164 change_nexthops(fi) {
@@ -152,7 +168,7 @@ void free_fib_info(struct fib_info *fi)
152 } endfor_nexthops(fi); 168 } endfor_nexthops(fi);
153 fib_info_cnt--; 169 fib_info_cnt--;
154 release_net(fi->fib_net); 170 release_net(fi->fib_net);
155 kfree(fi); 171 call_rcu(&fi->rcu, free_fib_info_rcu);
156} 172}
157 173
158void fib_release_info(struct fib_info *fi) 174void fib_release_info(struct fib_info *fi)
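
The fib_props[] table in the hunk above is rewritten with designated initializers, tying every entry to its RTN_* index instead of relying on comment-annotated ordering; any index left out simply defaults to zero. A compact illustration of the same C idiom, with made-up enum and values:

    #include <stdio.h>

    enum { RTN_UNSPEC, RTN_UNICAST, RTN_UNREACHABLE, RTN_MAX = RTN_UNREACHABLE };

    static const struct {
        int error;
        int scope;
    } props[RTN_MAX + 1] = {
        [RTN_UNSPEC]      = { .error = 0,    .scope = 0 },
        [RTN_UNICAST]     = { .error = 0,    .scope = 1 },
        [RTN_UNREACHABLE] = { .error = -113 /* illustrative -EHOSTUNREACH */,
                              .scope = 1 },
    };

    int main(void)
    {
        /* The index, not the position inside the braces, picks the slot. */
        printf("unreachable -> error %d\n", props[RTN_UNREACHABLE].error);
        return 0;
    }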
@@ -173,7 +189,7 @@ void fib_release_info(struct fib_info *fi)
173 spin_unlock_bh(&fib_info_lock); 189 spin_unlock_bh(&fib_info_lock);
174} 190}
175 191
176static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) 192static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
177{ 193{
178 const struct fib_nh *onh = ofi->fib_nh; 194 const struct fib_nh *onh = ofi->fib_nh;
179 195
@@ -187,7 +203,7 @@ static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *
187#ifdef CONFIG_NET_CLS_ROUTE 203#ifdef CONFIG_NET_CLS_ROUTE
188 nh->nh_tclassid != onh->nh_tclassid || 204 nh->nh_tclassid != onh->nh_tclassid ||
189#endif 205#endif
190 ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) 206 ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
191 return -1; 207 return -1;
192 onh++; 208 onh++;
193 } endfor_nexthops(fi); 209 } endfor_nexthops(fi);
@@ -238,7 +254,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
238 nfi->fib_priority == fi->fib_priority && 254 nfi->fib_priority == fi->fib_priority &&
239 memcmp(nfi->fib_metrics, fi->fib_metrics, 255 memcmp(nfi->fib_metrics, fi->fib_metrics,
240 sizeof(fi->fib_metrics)) == 0 && 256 sizeof(fi->fib_metrics)) == 0 &&
241 ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 && 257 ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
242 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) 258 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
243 return fi; 259 return fi;
244 } 260 }
@@ -247,9 +263,8 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
247} 263}
248 264
249/* Check, that the gateway is already configured. 265/* Check, that the gateway is already configured.
250 Used only by redirect accept routine. 266 * Used only by redirect accept routine.
251 */ 267 */
252
253int ip_fib_check_default(__be32 gw, struct net_device *dev) 268int ip_fib_check_default(__be32 gw, struct net_device *dev)
254{ 269{
255 struct hlist_head *head; 270 struct hlist_head *head;
@@ -264,7 +279,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev)
264 hlist_for_each_entry(nh, node, head, nh_hash) { 279 hlist_for_each_entry(nh, node, head, nh_hash) {
265 if (nh->nh_dev == dev && 280 if (nh->nh_dev == dev &&
266 nh->nh_gw == gw && 281 nh->nh_gw == gw &&
267 !(nh->nh_flags&RTNH_F_DEAD)) { 282 !(nh->nh_flags & RTNH_F_DEAD)) {
268 spin_unlock(&fib_info_lock); 283 spin_unlock(&fib_info_lock);
269 return 0; 284 return 0;
270 } 285 }
@@ -362,10 +377,10 @@ int fib_detect_death(struct fib_info *fi, int order,
362 } 377 }
363 if (state == NUD_REACHABLE) 378 if (state == NUD_REACHABLE)
364 return 0; 379 return 0;
365 if ((state&NUD_VALID) && order != dflt) 380 if ((state & NUD_VALID) && order != dflt)
366 return 0; 381 return 0;
367 if ((state&NUD_VALID) || 382 if ((state & NUD_VALID) ||
368 (*last_idx<0 && order > dflt)) { 383 (*last_idx < 0 && order > dflt)) {
369 *last_resort = fi; 384 *last_resort = fi;
370 *last_idx = order; 385 *last_idx = order;
371 } 386 }
@@ -476,75 +491,76 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
476 491
477 492
478/* 493/*
479 Picture 494 * Picture
480 ------- 495 * -------
481 496 *
482 Semantics of nexthop is very messy by historical reasons. 497 * Semantics of nexthop is very messy by historical reasons.
483 We have to take into account, that: 498 * We have to take into account, that:
484 a) gateway can be actually local interface address, 499 * a) gateway can be actually local interface address,
485 so that gatewayed route is direct. 500 * so that gatewayed route is direct.
486 b) gateway must be on-link address, possibly 501 * b) gateway must be on-link address, possibly
487 described not by an ifaddr, but also by a direct route. 502 * described not by an ifaddr, but also by a direct route.
488 c) If both gateway and interface are specified, they should not 503 * c) If both gateway and interface are specified, they should not
489 contradict. 504 * contradict.
490 d) If we use tunnel routes, gateway could be not on-link. 505 * d) If we use tunnel routes, gateway could be not on-link.
491 506 *
492 Attempt to reconcile all of these (alas, self-contradictory) conditions 507 * Attempt to reconcile all of these (alas, self-contradictory) conditions
493 results in pretty ugly and hairy code with obscure logic. 508 * results in pretty ugly and hairy code with obscure logic.
494 509 *
495 I chose to generalized it instead, so that the size 510 * I chose to generalized it instead, so that the size
496 of code does not increase practically, but it becomes 511 * of code does not increase practically, but it becomes
497 much more general. 512 * much more general.
498 Every prefix is assigned a "scope" value: "host" is local address, 513 * Every prefix is assigned a "scope" value: "host" is local address,
499 "link" is direct route, 514 * "link" is direct route,
500 [ ... "site" ... "interior" ... ] 515 * [ ... "site" ... "interior" ... ]
501 and "universe" is true gateway route with global meaning. 516 * and "universe" is true gateway route with global meaning.
502 517 *
503 Every prefix refers to a set of "nexthop"s (gw, oif), 518 * Every prefix refers to a set of "nexthop"s (gw, oif),
504 where gw must have narrower scope. This recursion stops 519 * where gw must have narrower scope. This recursion stops
505 when gw has LOCAL scope or if "nexthop" is declared ONLINK, 520 * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
506 which means that gw is forced to be on link. 521 * which means that gw is forced to be on link.
507 522 *
508 Code is still hairy, but now it is apparently logically 523 * Code is still hairy, but now it is apparently logically
509 consistent and very flexible. F.e. as by-product it allows 524 * consistent and very flexible. F.e. as by-product it allows
510 to co-exists in peace independent exterior and interior 525 * to co-exists in peace independent exterior and interior
511 routing processes. 526 * routing processes.
512 527 *
513 Normally it looks as following. 528 * Normally it looks as following.
514 529 *
515 {universe prefix} -> (gw, oif) [scope link] 530 * {universe prefix} -> (gw, oif) [scope link]
516 | 531 * |
517 |-> {link prefix} -> (gw, oif) [scope local] 532 * |-> {link prefix} -> (gw, oif) [scope local]
518 | 533 * |
519 |-> {local prefix} (terminal node) 534 * |-> {local prefix} (terminal node)
520 */ 535 */
521
522static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, 536static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
523 struct fib_nh *nh) 537 struct fib_nh *nh)
524{ 538{
525 int err; 539 int err;
526 struct net *net; 540 struct net *net;
541 struct net_device *dev;
527 542
528 net = cfg->fc_nlinfo.nl_net; 543 net = cfg->fc_nlinfo.nl_net;
529 if (nh->nh_gw) { 544 if (nh->nh_gw) {
530 struct fib_result res; 545 struct fib_result res;
531 546
532 if (nh->nh_flags&RTNH_F_ONLINK) { 547 if (nh->nh_flags & RTNH_F_ONLINK) {
533 struct net_device *dev;
534 548
535 if (cfg->fc_scope >= RT_SCOPE_LINK) 549 if (cfg->fc_scope >= RT_SCOPE_LINK)
536 return -EINVAL; 550 return -EINVAL;
537 if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST) 551 if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
538 return -EINVAL; 552 return -EINVAL;
539 if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL) 553 dev = __dev_get_by_index(net, nh->nh_oif);
554 if (!dev)
540 return -ENODEV; 555 return -ENODEV;
541 if (!(dev->flags&IFF_UP)) 556 if (!(dev->flags & IFF_UP))
542 return -ENETDOWN; 557 return -ENETDOWN;
543 nh->nh_dev = dev; 558 nh->nh_dev = dev;
544 dev_hold(dev); 559 dev_hold(dev);
545 nh->nh_scope = RT_SCOPE_LINK; 560 nh->nh_scope = RT_SCOPE_LINK;
546 return 0; 561 return 0;
547 } 562 }
563 rcu_read_lock();
548 { 564 {
549 struct flowi fl = { 565 struct flowi fl = {
550 .nl_u = { 566 .nl_u = {
@@ -559,50 +575,53 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
559 /* It is not necessary, but requires a bit of thinking */ 575 /* It is not necessary, but requires a bit of thinking */
560 if (fl.fl4_scope < RT_SCOPE_LINK) 576 if (fl.fl4_scope < RT_SCOPE_LINK)
561 fl.fl4_scope = RT_SCOPE_LINK; 577 fl.fl4_scope = RT_SCOPE_LINK;
562 if ((err = fib_lookup(net, &fl, &res)) != 0) 578 err = fib_lookup(net, &fl, &res);
579 if (err) {
580 rcu_read_unlock();
563 return err; 581 return err;
582 }
564 } 583 }
565 err = -EINVAL; 584 err = -EINVAL;
566 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) 585 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
567 goto out; 586 goto out;
568 nh->nh_scope = res.scope; 587 nh->nh_scope = res.scope;
569 nh->nh_oif = FIB_RES_OIF(res); 588 nh->nh_oif = FIB_RES_OIF(res);
570 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL) 589 nh->nh_dev = dev = FIB_RES_DEV(res);
590 if (!dev)
571 goto out; 591 goto out;
572 dev_hold(nh->nh_dev); 592 dev_hold(dev);
573 err = -ENETDOWN; 593 err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
574 if (!(nh->nh_dev->flags & IFF_UP))
575 goto out;
576 err = 0;
577out:
578 fib_res_put(&res);
579 return err;
580 } else { 594 } else {
581 struct in_device *in_dev; 595 struct in_device *in_dev;
582 596
583 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK)) 597 if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
584 return -EINVAL; 598 return -EINVAL;
585 599
600 rcu_read_lock();
601 err = -ENODEV;
586 in_dev = inetdev_by_index(net, nh->nh_oif); 602 in_dev = inetdev_by_index(net, nh->nh_oif);
587 if (in_dev == NULL) 603 if (in_dev == NULL)
588 return -ENODEV; 604 goto out;
589 if (!(in_dev->dev->flags&IFF_UP)) { 605 err = -ENETDOWN;
590 in_dev_put(in_dev); 606 if (!(in_dev->dev->flags & IFF_UP))
591 return -ENETDOWN; 607 goto out;
592 }
593 nh->nh_dev = in_dev->dev; 608 nh->nh_dev = in_dev->dev;
594 dev_hold(nh->nh_dev); 609 dev_hold(nh->nh_dev);
595 nh->nh_scope = RT_SCOPE_HOST; 610 nh->nh_scope = RT_SCOPE_HOST;
596 in_dev_put(in_dev); 611 err = 0;
597 } 612 }
598 return 0; 613out:
614 rcu_read_unlock();
615 return err;
599} 616}
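The hunk above reworks fib_check_nh() so the gateway lookup runs entirely under one rcu_read_lock()/rcu_read_unlock() pair, with every failure funnelled through a single exit instead of the old per-branch fib_res_put()/return sequences. A minimal sketch of that single-exit shape, using hypothetical helpers rather than the real FIB calls:

static int check_nexthop_sketch(u32 gw)
{
	int err;

	rcu_read_lock();
	err = -ENODEV;
	if (!sketch_find_device(gw))		/* hypothetical helper */
		goto out;
	err = -ENETDOWN;
	if (!sketch_device_is_up(gw))		/* hypothetical helper */
		goto out;
	err = 0;
out:
	rcu_read_unlock();			/* released exactly once */
	return err;
}

The point of the shape is that no early return can leak the read-side critical section once the lookup no longer takes a reference on the result.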
600 617
601static inline unsigned int fib_laddr_hashfn(__be32 val) 618static inline unsigned int fib_laddr_hashfn(__be32 val)
602{ 619{
603 unsigned int mask = (fib_hash_size - 1); 620 unsigned int mask = (fib_hash_size - 1);
604 621
605 return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask; 622 return ((__force u32)val ^
623 ((__force u32)val >> 7) ^
624 ((__force u32)val >> 14)) & mask;
606} 625}
607 626
608static struct hlist_head *fib_hash_alloc(int bytes) 627static struct hlist_head *fib_hash_alloc(int bytes)
@@ -611,7 +630,8 @@ static struct hlist_head *fib_hash_alloc(int bytes)
611 return kzalloc(bytes, GFP_KERNEL); 630 return kzalloc(bytes, GFP_KERNEL);
612 else 631 else
613 return (struct hlist_head *) 632 return (struct hlist_head *)
614 __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes)); 633 __get_free_pages(GFP_KERNEL | __GFP_ZERO,
634 get_order(bytes));
615} 635}
616 636
617static void fib_hash_free(struct hlist_head *hash, int bytes) 637static void fib_hash_free(struct hlist_head *hash, int bytes)
@@ -806,7 +826,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
806 goto failure; 826 goto failure;
807 } else { 827 } else {
808 change_nexthops(fi) { 828 change_nexthops(fi) {
809 if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0) 829 err = fib_check_nh(cfg, fi, nexthop_nh);
830 if (err != 0)
810 goto failure; 831 goto failure;
811 } endfor_nexthops(fi) 832 } endfor_nexthops(fi)
812 } 833 }
@@ -819,7 +840,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
819 } 840 }
820 841
821link_it: 842link_it:
822 if ((ofi = fib_find_info(fi)) != NULL) { 843 ofi = fib_find_info(fi);
844 if (ofi) {
823 fi->fib_dead = 1; 845 fi->fib_dead = 1;
824 free_fib_info(fi); 846 free_fib_info(fi);
825 ofi->fib_treeref++; 847 ofi->fib_treeref++;
@@ -864,7 +886,7 @@ failure:
864 886
865/* Note! fib_semantic_match intentionally uses RCU list functions. */ 887/* Note! fib_semantic_match intentionally uses RCU list functions. */
866int fib_semantic_match(struct list_head *head, const struct flowi *flp, 888int fib_semantic_match(struct list_head *head, const struct flowi *flp,
867 struct fib_result *res, int prefixlen) 889 struct fib_result *res, int prefixlen, int fib_flags)
868{ 890{
869 struct fib_alias *fa; 891 struct fib_alias *fa;
870 int nh_sel = 0; 892 int nh_sel = 0;
@@ -879,7 +901,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
879 if (fa->fa_scope < flp->fl4_scope) 901 if (fa->fa_scope < flp->fl4_scope)
880 continue; 902 continue;
881 903
882 fa->fa_state |= FA_S_ACCESSED; 904 fib_alias_accessed(fa);
883 905
884 err = fib_props[fa->fa_type].error; 906 err = fib_props[fa->fa_type].error;
885 if (err == 0) { 907 if (err == 0) {
@@ -895,7 +917,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
895 case RTN_ANYCAST: 917 case RTN_ANYCAST:
896 case RTN_MULTICAST: 918 case RTN_MULTICAST:
897 for_nexthops(fi) { 919 for_nexthops(fi) {
898 if (nh->nh_flags&RTNH_F_DEAD) 920 if (nh->nh_flags & RTNH_F_DEAD)
899 continue; 921 continue;
900 if (!flp->oif || flp->oif == nh->nh_oif) 922 if (!flp->oif || flp->oif == nh->nh_oif)
901 break; 923 break;
@@ -906,16 +928,15 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
906 goto out_fill_res; 928 goto out_fill_res;
907 } 929 }
908#else 930#else
909 if (nhsel < 1) { 931 if (nhsel < 1)
910 goto out_fill_res; 932 goto out_fill_res;
911 }
912#endif 933#endif
913 endfor_nexthops(fi); 934 endfor_nexthops(fi);
914 continue; 935 continue;
915 936
916 default: 937 default:
917 printk(KERN_WARNING "fib_semantic_match bad type %#x\n", 938 pr_warning("fib_semantic_match bad type %#x\n",
918 fa->fa_type); 939 fa->fa_type);
919 return -EINVAL; 940 return -EINVAL;
920 } 941 }
921 } 942 }
@@ -929,7 +950,8 @@ out_fill_res:
929 res->type = fa->fa_type; 950 res->type = fa->fa_type;
930 res->scope = fa->fa_scope; 951 res->scope = fa->fa_scope;
931 res->fi = fa->fa_info; 952 res->fi = fa->fa_info;
932 atomic_inc(&res->fi->fib_clntref); 953 if (!(fib_flags & FIB_LOOKUP_NOREF))
954 atomic_inc(&res->fi->fib_clntref);
933 return 0; 955 return 0;
934} 956}
935 957
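The new fib_flags argument lets a caller that already runs under rcu_read_lock() pass FIB_LOOKUP_NOREF and skip the atomic refcount bump on the returned fib_info. A hedged sketch of such a caller; the surrounding function name and the pr_debug() use are illustrative:

static int lookup_noref_sketch(struct fib_table *tb, const struct flowi *flp)
{
	struct fib_result res;
	int err;

	rcu_read_lock();
	err = fib_table_lookup(tb, flp, &res, FIB_LOOKUP_NOREF);
	if (err == 0) {
		/* res.fi is only guaranteed valid inside this RCU
		 * read-side section; with FIB_LOOKUP_NOREF there is no
		 * reference to drop afterwards. */
		pr_debug("matched route, scope %d\n", res.scope);
	}
	rcu_read_unlock();
	return err;
}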
@@ -1028,10 +1050,10 @@ nla_put_failure:
1028} 1050}
1029 1051
1030/* 1052/*
1031 Update FIB if: 1053 * Update FIB if:
1032 - local address disappeared -> we must delete all the entries 1054 * - local address disappeared -> we must delete all the entries
1033 referring to it. 1055 * referring to it.
1034 - device went down -> we must shutdown all nexthops going via it. 1056 * - device went down -> we must shutdown all nexthops going via it.
1035 */ 1057 */
1036int fib_sync_down_addr(struct net *net, __be32 local) 1058int fib_sync_down_addr(struct net *net, __be32 local)
1037{ 1059{
@@ -1078,7 +1100,7 @@ int fib_sync_down_dev(struct net_device *dev, int force)
1078 prev_fi = fi; 1100 prev_fi = fi;
1079 dead = 0; 1101 dead = 0;
1080 change_nexthops(fi) { 1102 change_nexthops(fi) {
1081 if (nexthop_nh->nh_flags&RTNH_F_DEAD) 1103 if (nexthop_nh->nh_flags & RTNH_F_DEAD)
1082 dead++; 1104 dead++;
1083 else if (nexthop_nh->nh_dev == dev && 1105 else if (nexthop_nh->nh_dev == dev &&
1084 nexthop_nh->nh_scope != scope) { 1106 nexthop_nh->nh_scope != scope) {
@@ -1110,10 +1132,9 @@ int fib_sync_down_dev(struct net_device *dev, int force)
1110#ifdef CONFIG_IP_ROUTE_MULTIPATH 1132#ifdef CONFIG_IP_ROUTE_MULTIPATH
1111 1133
1112/* 1134/*
1113 Dead device goes up. We wake up dead nexthops. 1135 * Dead device goes up. We wake up dead nexthops.
1114 It makes sense only on multipath routes. 1136 * It makes sense only on multipath routes.
1115 */ 1137 */
1116
1117int fib_sync_up(struct net_device *dev) 1138int fib_sync_up(struct net_device *dev)
1118{ 1139{
1119 struct fib_info *prev_fi; 1140 struct fib_info *prev_fi;
@@ -1123,7 +1144,7 @@ int fib_sync_up(struct net_device *dev)
1123 struct fib_nh *nh; 1144 struct fib_nh *nh;
1124 int ret; 1145 int ret;
1125 1146
1126 if (!(dev->flags&IFF_UP)) 1147 if (!(dev->flags & IFF_UP))
1127 return 0; 1148 return 0;
1128 1149
1129 prev_fi = NULL; 1150 prev_fi = NULL;
@@ -1142,12 +1163,12 @@ int fib_sync_up(struct net_device *dev)
1142 prev_fi = fi; 1163 prev_fi = fi;
1143 alive = 0; 1164 alive = 0;
1144 change_nexthops(fi) { 1165 change_nexthops(fi) {
1145 if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) { 1166 if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
1146 alive++; 1167 alive++;
1147 continue; 1168 continue;
1148 } 1169 }
1149 if (nexthop_nh->nh_dev == NULL || 1170 if (nexthop_nh->nh_dev == NULL ||
1150 !(nexthop_nh->nh_dev->flags&IFF_UP)) 1171 !(nexthop_nh->nh_dev->flags & IFF_UP))
1151 continue; 1172 continue;
1152 if (nexthop_nh->nh_dev != dev || 1173 if (nexthop_nh->nh_dev != dev ||
1153 !__in_dev_get_rtnl(dev)) 1174 !__in_dev_get_rtnl(dev))
@@ -1169,10 +1190,9 @@ int fib_sync_up(struct net_device *dev)
1169} 1190}
1170 1191
1171/* 1192/*
1172 The algorithm is suboptimal, but it provides really 1193 * The algorithm is suboptimal, but it provides really
1173 fair weighted route distribution. 1194 * fair weighted route distribution.
1174 */ 1195 */
1175
1176void fib_select_multipath(const struct flowi *flp, struct fib_result *res) 1196void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1177{ 1197{
1178 struct fib_info *fi = res->fi; 1198 struct fib_info *fi = res->fi;
@@ -1182,7 +1202,7 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1182 if (fi->fib_power <= 0) { 1202 if (fi->fib_power <= 0) {
1183 int power = 0; 1203 int power = 0;
1184 change_nexthops(fi) { 1204 change_nexthops(fi) {
1185 if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) { 1205 if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
1186 power += nexthop_nh->nh_weight; 1206 power += nexthop_nh->nh_weight;
1187 nexthop_nh->nh_power = nexthop_nh->nh_weight; 1207 nexthop_nh->nh_power = nexthop_nh->nh_weight;
1188 } 1208 }
@@ -1198,15 +1218,16 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1198 1218
1199 1219
1200 /* w should be random number [0..fi->fib_power-1], 1220 /* w should be random number [0..fi->fib_power-1],
1201 it is pretty bad approximation. 1221 * it is pretty bad approximation.
1202 */ 1222 */
1203 1223
1204 w = jiffies % fi->fib_power; 1224 w = jiffies % fi->fib_power;
1205 1225
1206 change_nexthops(fi) { 1226 change_nexthops(fi) {
1207 if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) && 1227 if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
1208 nexthop_nh->nh_power) { 1228 nexthop_nh->nh_power) {
1209 if ((w -= nexthop_nh->nh_power) <= 0) { 1229 w -= nexthop_nh->nh_power;
1230 if (w <= 0) {
1210 nexthop_nh->nh_power--; 1231 nexthop_nh->nh_power--;
1211 fi->fib_power--; 1232 fi->fib_power--;
1212 res->nh_sel = nhsel; 1233 res->nh_sel = nhsel;
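The multipath selection above drains one unit of "power" from the chosen next hop on every pick, refilling all hops from their weights once fib_power reaches zero, so over a full cycle each hop is used in proportion to its weight. A small stand-alone C program (userspace, not kernel code) showing the same arithmetic:

#include <stdio.h>

struct hop { int weight, power; };

static int select_hop(struct hop *nh, int n, int *fib_power, unsigned long seed)
{
	int i, w;

	if (*fib_power <= 0) {			/* refill, as change_nexthops() does */
		*fib_power = 0;
		for (i = 0; i < n; i++) {
			nh[i].power = nh[i].weight;
			*fib_power += nh[i].weight;
		}
	}
	w = seed % *fib_power;			/* the kernel uses jiffies % fi->fib_power */
	for (i = 0; i < n; i++) {
		if (!nh[i].power)
			continue;
		w -= nh[i].power;
		if (w <= 0) {
			nh[i].power--;
			(*fib_power)--;
			return i;
		}
	}
	return 0;				/* not reached with consistent state */
}

int main(void)
{
	struct hop nh[2] = { { .weight = 3 }, { .weight = 1 } };
	int fib_power = 0, picks[2] = { 0, 0 }, i;

	for (i = 0; i < 4000; i++)
		picks[select_hop(nh, 2, &fib_power, (unsigned long)i)]++;
	printf("hop0=%d hop1=%d (expect a 3:1 split)\n", picks[0], picks[1]);
	return 0;
}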
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 79d057a939ba..200eb538fbb3 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -16,7 +16,7 @@
16 * 16 *
17 * An experimental study of compression methods for dynamic tries 17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. 18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ 19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 * 20 *
21 * 21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson 22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
@@ -186,7 +186,7 @@ static inline struct tnode *node_parent_rcu(struct node *node)
186{ 186{
187 struct tnode *ret = node_parent(node); 187 struct tnode *ret = node_parent(node);
188 188
189 return rcu_dereference(ret); 189 return rcu_dereference_rtnl(ret);
190} 190}
191 191
192/* Same as rcu_assign_pointer 192/* Same as rcu_assign_pointer
@@ -209,9 +209,7 @@ static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
209{ 209{
210 struct node *ret = tnode_get_child(tn, i); 210 struct node *ret = tnode_get_child(tn, i);
211 211
212 return rcu_dereference_check(ret, 212 return rcu_dereference_rtnl(ret);
213 rcu_read_lock_held() ||
214 lockdep_rtnl_is_held());
215} 213}
216 214
217static inline int tnode_child_length(const struct tnode *tn) 215static inline int tnode_child_length(const struct tnode *tn)
@@ -457,8 +455,8 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
457 tn->empty_children = 1<<bits; 455 tn->empty_children = 1<<bits;
458 } 456 }
459 457
460 pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode), 458 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
461 (unsigned long) (sizeof(struct node) << bits)); 459 sizeof(struct node) << bits);
462 return tn; 460 return tn;
463} 461}
464 462
@@ -607,11 +605,10 @@ static struct node *resize(struct trie *t, struct tnode *tn)
607 605
608 /* Keep root node larger */ 606 /* Keep root node larger */
609 607
610 if (!node_parent((struct node*) tn)) { 608 if (!node_parent((struct node *)tn)) {
611 inflate_threshold_use = inflate_threshold_root; 609 inflate_threshold_use = inflate_threshold_root;
612 halve_threshold_use = halve_threshold_root; 610 halve_threshold_use = halve_threshold_root;
613 } 611 } else {
614 else {
615 inflate_threshold_use = inflate_threshold; 612 inflate_threshold_use = inflate_threshold;
616 halve_threshold_use = halve_threshold; 613 halve_threshold_use = halve_threshold;
617 } 614 }
@@ -637,7 +634,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
637 check_tnode(tn); 634 check_tnode(tn);
638 635
639 /* Return if at least one inflate is run */ 636 /* Return if at least one inflate is run */
640 if( max_work != MAX_WORK) 637 if (max_work != MAX_WORK)
641 return (struct node *) tn; 638 return (struct node *) tn;
642 639
643 /* 640 /*
@@ -964,9 +961,7 @@ fib_find_node(struct trie *t, u32 key)
964 struct node *n; 961 struct node *n;
965 962
966 pos = 0; 963 pos = 0;
967 n = rcu_dereference_check(t->trie, 964 n = rcu_dereference_rtnl(t->trie);
968 rcu_read_lock_held() ||
969 lockdep_rtnl_is_held());
970 965
971 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 966 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
972 tn = (struct tnode *) n; 967 tn = (struct tnode *) n;
@@ -1347,7 +1342,7 @@ err:
1347/* should be called with rcu_read_lock */ 1342/* should be called with rcu_read_lock */
1348static int check_leaf(struct trie *t, struct leaf *l, 1343static int check_leaf(struct trie *t, struct leaf *l,
1349 t_key key, const struct flowi *flp, 1344 t_key key, const struct flowi *flp,
1350 struct fib_result *res) 1345 struct fib_result *res, int fib_flags)
1351{ 1346{
1352 struct leaf_info *li; 1347 struct leaf_info *li;
1353 struct hlist_head *hhead = &l->list; 1348 struct hlist_head *hhead = &l->list;
@@ -1361,7 +1356,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
1361 if (l->key != (key & ntohl(mask))) 1356 if (l->key != (key & ntohl(mask)))
1362 continue; 1357 continue;
1363 1358
1364 err = fib_semantic_match(&li->falh, flp, res, plen); 1359 err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags);
1365 1360
1366#ifdef CONFIG_IP_FIB_TRIE_STATS 1361#ifdef CONFIG_IP_FIB_TRIE_STATS
1367 if (err <= 0) 1362 if (err <= 0)
@@ -1377,7 +1372,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
1377} 1372}
1378 1373
1379int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, 1374int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1380 struct fib_result *res) 1375 struct fib_result *res, int fib_flags)
1381{ 1376{
1382 struct trie *t = (struct trie *) tb->tb_data; 1377 struct trie *t = (struct trie *) tb->tb_data;
1383 int ret; 1378 int ret;
@@ -1389,8 +1384,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1389 t_key cindex = 0; 1384 t_key cindex = 0;
1390 int current_prefix_length = KEYLENGTH; 1385 int current_prefix_length = KEYLENGTH;
1391 struct tnode *cn; 1386 struct tnode *cn;
1392 t_key node_prefix, key_prefix, pref_mismatch; 1387 t_key pref_mismatch;
1393 int mp;
1394 1388
1395 rcu_read_lock(); 1389 rcu_read_lock();
1396 1390
@@ -1404,7 +1398,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1404 1398
1405 /* Just a leaf? */ 1399 /* Just a leaf? */
1406 if (IS_LEAF(n)) { 1400 if (IS_LEAF(n)) {
1407 ret = check_leaf(t, (struct leaf *)n, key, flp, res); 1401 ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
1408 goto found; 1402 goto found;
1409 } 1403 }
1410 1404
@@ -1429,7 +1423,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1429 } 1423 }
1430 1424
1431 if (IS_LEAF(n)) { 1425 if (IS_LEAF(n)) {
1432 ret = check_leaf(t, (struct leaf *)n, key, flp, res); 1426 ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
1433 if (ret > 0) 1427 if (ret > 0)
1434 goto backtrace; 1428 goto backtrace;
1435 goto found; 1429 goto found;
@@ -1505,10 +1499,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1505 * matching prefix. 1499 * matching prefix.
1506 */ 1500 */
1507 1501
1508 node_prefix = mask_pfx(cn->key, cn->pos); 1502 pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
1509 key_prefix = mask_pfx(key, cn->pos);
1510 pref_mismatch = key_prefix^node_prefix;
1511 mp = 0;
1512 1503
1513 /* 1504 /*
1514 * In short: If skipped bits in this node do not match 1505 * In short: If skipped bits in this node do not match
@@ -1516,13 +1507,9 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1516 * state directly. 1507 * state directly.
1517 */ 1508 */
1518 if (pref_mismatch) { 1509 if (pref_mismatch) {
1519 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { 1510 int mp = KEYLENGTH - fls(pref_mismatch);
1520 mp++;
1521 pref_mismatch = pref_mismatch << 1;
1522 }
1523 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1524 1511
1525 if (key_prefix != 0) 1512 if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
1526 goto backtrace; 1513 goto backtrace;
1527 1514
1528 if (current_prefix_length >= cn->pos) 1515 if (current_prefix_length >= cn->pos)
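The rewritten mismatch check computes the number of leading zero bits of pref_mismatch as KEYLENGTH - fls(pref_mismatch) instead of shifting one bit at a time. A quick userspace check that the two forms agree; fls() is emulated here with the GCC builtin:

#include <assert.h>

#define KEYLENGTH 32

static int fls32(unsigned int x)		/* behaves like the kernel's fls() */
{
	return x ? 32 - __builtin_clz(x) : 0;
}

static int leading_zeros_loop(unsigned int x)	/* the removed loop; x must be nonzero */
{
	int mp = 0;

	while (!(x & (1u << (KEYLENGTH - 1)))) {
		mp++;
		x <<= 1;
	}
	return mp;
}

int main(void)
{
	unsigned int x;

	for (x = 1; x; x <<= 1)
		assert(leading_zeros_loop(x) == KEYLENGTH - fls32(x));
	assert(KEYLENGTH - fls32(0x00f00000u) == 8);
	return 0;
}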
@@ -1746,14 +1733,14 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1746 1733
1747 /* Node empty, walk back up to parent */ 1734 /* Node empty, walk back up to parent */
1748 c = (struct node *) p; 1735 c = (struct node *) p;
1749 } while ( (p = node_parent_rcu(c)) != NULL); 1736 } while ((p = node_parent_rcu(c)) != NULL);
1750 1737
1751 return NULL; /* Root of trie */ 1738 return NULL; /* Root of trie */
1752} 1739}
1753 1740
1754static struct leaf *trie_firstleaf(struct trie *t) 1741static struct leaf *trie_firstleaf(struct trie *t)
1755{ 1742{
1756 struct tnode *n = (struct tnode *) rcu_dereference(t->trie); 1743 struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1757 1744
1758 if (!n) 1745 if (!n)
1759 return NULL; 1746 return NULL;
@@ -1810,6 +1797,11 @@ int fib_table_flush(struct fib_table *tb)
1810 return found; 1797 return found;
1811} 1798}
1812 1799
1800void fib_free_table(struct fib_table *tb)
1801{
1802 kfree(tb);
1803}
1804
1813void fib_table_select_default(struct fib_table *tb, 1805void fib_table_select_default(struct fib_table *tb,
1814 const struct flowi *flp, 1806 const struct flowi *flp,
1815 struct fib_result *res) 1807 struct fib_result *res)
@@ -1851,7 +1843,8 @@ void fib_table_select_default(struct fib_table *tb,
1851 if (!next_fi->fib_nh[0].nh_gw || 1843 if (!next_fi->fib_nh[0].nh_gw ||
1852 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) 1844 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1853 continue; 1845 continue;
1854 fa->fa_state |= FA_S_ACCESSED; 1846
1847 fib_alias_accessed(fa);
1855 1848
1856 if (fi == NULL) { 1849 if (fi == NULL) {
1857 if (next_fi != res->fi) 1850 if (next_fi != res->fi)
@@ -2039,14 +2032,14 @@ struct fib_trie_iter {
2039 struct seq_net_private p; 2032 struct seq_net_private p;
2040 struct fib_table *tb; 2033 struct fib_table *tb;
2041 struct tnode *tnode; 2034 struct tnode *tnode;
2042 unsigned index; 2035 unsigned int index;
2043 unsigned depth; 2036 unsigned int depth;
2044}; 2037};
2045 2038
2046static struct node *fib_trie_get_next(struct fib_trie_iter *iter) 2039static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2047{ 2040{
2048 struct tnode *tn = iter->tnode; 2041 struct tnode *tn = iter->tnode;
2049 unsigned cindex = iter->index; 2042 unsigned int cindex = iter->index;
2050 struct tnode *p; 2043 struct tnode *p;
2051 2044
2052 /* A single entry routing table */ 2045 /* A single entry routing table */
@@ -2155,7 +2148,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2155 */ 2148 */
2156static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) 2149static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2157{ 2150{
2158 unsigned i, max, pointers, bytes, avdepth; 2151 unsigned int i, max, pointers, bytes, avdepth;
2159 2152
2160 if (stat->leaves) 2153 if (stat->leaves)
2161 avdepth = stat->totdepth*100 / stat->leaves; 2154 avdepth = stat->totdepth*100 / stat->leaves;
@@ -2352,7 +2345,8 @@ static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2352 2345
2353static void seq_indent(struct seq_file *seq, int n) 2346static void seq_indent(struct seq_file *seq, int n)
2354{ 2347{
2355 while (n-- > 0) seq_puts(seq, " "); 2348 while (n-- > 0)
2349 seq_puts(seq, " ");
2356} 2350}
2357 2351
2358static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s) 2352static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
@@ -2384,7 +2378,7 @@ static const char *const rtn_type_names[__RTN_MAX] = {
2384 [RTN_XRESOLVE] = "XRESOLVE", 2378 [RTN_XRESOLVE] = "XRESOLVE",
2385}; 2379};
2386 2380
2387static inline const char *rtn_type(char *buf, size_t len, unsigned t) 2381static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2388{ 2382{
2389 if (t < __RTN_MAX && rtn_type_names[t]) 2383 if (t < __RTN_MAX && rtn_type_names[t])
2390 return rtn_type_names[t]; 2384 return rtn_type_names[t];
@@ -2540,13 +2534,12 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v)
2540 rcu_read_unlock(); 2534 rcu_read_unlock();
2541} 2535}
2542 2536
2543static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) 2537static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2544{ 2538{
2545 static unsigned type2flags[RTN_MAX + 1] = { 2539 unsigned int flags = 0;
2546 [7] = RTF_REJECT, [8] = RTF_REJECT,
2547 };
2548 unsigned flags = type2flags[type];
2549 2540
2541 if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2542 flags = RTF_REJECT;
2550 if (fi && fi->fib_nh->nh_gw) 2543 if (fi && fi->fib_nh->nh_gw)
2551 flags |= RTF_GATEWAY; 2544 flags |= RTF_GATEWAY;
2552 if (mask == htonl(0xFFFFFFFF)) 2545 if (mask == htonl(0xFFFFFFFF))
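The removed type2flags lookup table only populated entries [7] and [8], which are exactly RTN_UNREACHABLE and RTN_PROHIBIT, so the explicit comparison is behaviour-preserving while making the intent readable. A small userspace check of that equivalence:

#include <assert.h>

enum { RTN_UNREACHABLE = 7, RTN_PROHIBIT = 8, RTN_MAX = 11 };
#define RTF_REJECT 0x0200

int main(void)
{
	static const unsigned int type2flags[RTN_MAX + 1] = {
		[7] = RTF_REJECT, [8] = RTF_REJECT,
	};
	int type;

	for (type = 0; type <= RTN_MAX; type++) {
		unsigned int new_flags =
			(type == RTN_UNREACHABLE || type == RTN_PROHIBIT) ?
			RTF_REJECT : 0;
		assert(type2flags[type] == new_flags);
	}
	return 0;
}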
@@ -2558,7 +2551,7 @@ static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2558/* 2551/*
2559 * This outputs /proc/net/route. 2552 * This outputs /proc/net/route.
2560 * The format of the file is not supposed to be changed 2553 * The format of the file is not supposed to be changed
2561 * and needs to be same as fib_hash output to avoid breaking 2554 * and needs to be same as fib_hash output to avoid breaking
2562 * legacy utilities 2555 * legacy utilities
2563 */ 2556 */
2564static int fib_route_seq_show(struct seq_file *seq, void *v) 2557static int fib_route_seq_show(struct seq_file *seq, void *v)
@@ -2583,7 +2576,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
2583 2576
2584 list_for_each_entry_rcu(fa, &li->falh, fa_list) { 2577 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2585 const struct fib_info *fi = fa->fa_info; 2578 const struct fib_info *fi = fa->fa_info;
2586 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi); 2579 unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2587 int len; 2580 int len;
2588 2581
2589 if (fa->fa_type == RTN_BROADCAST 2582 if (fa->fa_type == RTN_BROADCAST
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
new file mode 100644
index 000000000000..c6933f2ea310
--- /dev/null
+++ b/net/ipv4/gre.c
@@ -0,0 +1,152 @@
1/*
2 * GRE over IPv4 demultiplexer driver
3 *
4 * Authors: Dmitry Kozlov (xeb@mail.ru)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 */
12
13#include <linux/module.h>
14#include <linux/kernel.h>
15#include <linux/kmod.h>
16#include <linux/skbuff.h>
17#include <linux/in.h>
18#include <linux/netdevice.h>
19#include <linux/version.h>
20#include <linux/spinlock.h>
21#include <net/protocol.h>
22#include <net/gre.h>
23
24
25static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
26static DEFINE_SPINLOCK(gre_proto_lock);
27
28int gre_add_protocol(const struct gre_protocol *proto, u8 version)
29{
30 if (version >= GREPROTO_MAX)
31 goto err_out;
32
33 spin_lock(&gre_proto_lock);
34 if (gre_proto[version])
35 goto err_out_unlock;
36
37 rcu_assign_pointer(gre_proto[version], proto);
38 spin_unlock(&gre_proto_lock);
39 return 0;
40
41err_out_unlock:
42 spin_unlock(&gre_proto_lock);
43err_out:
44 return -1;
45}
46EXPORT_SYMBOL_GPL(gre_add_protocol);
47
48int gre_del_protocol(const struct gre_protocol *proto, u8 version)
49{
50 if (version >= GREPROTO_MAX)
51 goto err_out;
52
53 spin_lock(&gre_proto_lock);
54 if (rcu_dereference_protected(gre_proto[version],
55 lockdep_is_held(&gre_proto_lock)) != proto)
56 goto err_out_unlock;
57 rcu_assign_pointer(gre_proto[version], NULL);
58 spin_unlock(&gre_proto_lock);
59 synchronize_rcu();
60 return 0;
61
62err_out_unlock:
63 spin_unlock(&gre_proto_lock);
64err_out:
65 return -1;
66}
67EXPORT_SYMBOL_GPL(gre_del_protocol);
68
69static int gre_rcv(struct sk_buff *skb)
70{
71 const struct gre_protocol *proto;
72 u8 ver;
73 int ret;
74
75 if (!pskb_may_pull(skb, 12))
76 goto drop;
77
78 ver = skb->data[1]&0x7f;
79 if (ver >= GREPROTO_MAX)
80 goto drop;
81
82 rcu_read_lock();
83 proto = rcu_dereference(gre_proto[ver]);
84 if (!proto || !proto->handler)
85 goto drop_unlock;
86 ret = proto->handler(skb);
87 rcu_read_unlock();
88 return ret;
89
90drop_unlock:
91 rcu_read_unlock();
92drop:
93 kfree_skb(skb);
94 return NET_RX_DROP;
95}
96
97static void gre_err(struct sk_buff *skb, u32 info)
98{
99 const struct gre_protocol *proto;
100 u8 ver;
101
102 if (!pskb_may_pull(skb, 12))
103 goto drop;
104
105 ver = skb->data[1]&0x7f;
106 if (ver >= GREPROTO_MAX)
107 goto drop;
108
109 rcu_read_lock();
110 proto = rcu_dereference(gre_proto[ver]);
111 if (!proto || !proto->err_handler)
112 goto drop_unlock;
113 proto->err_handler(skb, info);
114 rcu_read_unlock();
115 return;
116
117drop_unlock:
118 rcu_read_unlock();
119drop:
120 kfree_skb(skb);
121}
122
123static const struct net_protocol net_gre_protocol = {
124 .handler = gre_rcv,
125 .err_handler = gre_err,
126 .netns_ok = 1,
127};
128
129static int __init gre_init(void)
130{
131 pr_info("GRE over IPv4 demultiplexor driver");
132
133 if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
134 pr_err("gre: can't add protocol\n");
135 return -EAGAIN;
136 }
137
138 return 0;
139}
140
141static void __exit gre_exit(void)
142{
143 inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
144}
145
146module_init(gre_init);
147module_exit(gre_exit);
148
149MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
150MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
151MODULE_LICENSE("GPL");
152
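The new net/ipv4/gre.c only dispatches on the GRE version field; actual protocol handling is supplied by modules that register a struct gre_protocol for their version. A hedged sketch of such a consumer claiming version 1 (GREPROTO_PPTP); the handler body and module boilerplate are illustrative, not taken from an in-tree driver:

#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/gre.h>

static int pptp_gre_rcv_sketch(struct sk_buff *skb)
{
	/* parse the version-1 GRE header, hand the payload up, ... */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static const struct gre_protocol pptp_gre_proto_sketch = {
	.handler	= pptp_gre_rcv_sketch,
};

static int __init sketch_init(void)
{
	if (gre_add_protocol(&pptp_gre_proto_sketch, GREPROTO_PPTP) < 0)
		return -EAGAIN;
	return 0;
}

static void __exit sketch_exit(void)
{
	gre_del_protocol(&pptp_gre_proto_sketch, GREPROTO_PPTP);
}

module_init(sketch_init);
module_exit(sketch_exit);
MODULE_LICENSE("GPL");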
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index a0d847c7cba5..96bc7f9475a3 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -379,7 +379,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
379 inet->tos = ip_hdr(skb)->tos; 379 inet->tos = ip_hdr(skb)->tos;
380 daddr = ipc.addr = rt->rt_src; 380 daddr = ipc.addr = rt->rt_src;
381 ipc.opt = NULL; 381 ipc.opt = NULL;
382 ipc.shtx.flags = 0; 382 ipc.tx_flags = 0;
383 if (icmp_param->replyopts.optlen) { 383 if (icmp_param->replyopts.optlen) {
384 ipc.opt = &icmp_param->replyopts; 384 ipc.opt = &icmp_param->replyopts;
385 if (ipc.opt->srr) 385 if (ipc.opt->srr)
@@ -538,7 +538,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
538 inet_sk(sk)->tos = tos; 538 inet_sk(sk)->tos = tos;
539 ipc.addr = iph->saddr; 539 ipc.addr = iph->saddr;
540 ipc.opt = &icmp_param.replyopts; 540 ipc.opt = &icmp_param.replyopts;
541 ipc.shtx.flags = 0; 541 ipc.tx_flags = 0;
542 542
543 { 543 {
544 struct flowi fl = { 544 struct flowi fl = {
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index a1ad0e7180d2..c8877c6c7216 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -856,6 +856,18 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
856 igmpv3_clear_delrec(in_dev); 856 igmpv3_clear_delrec(in_dev);
857 } else if (len < 12) { 857 } else if (len < 12) {
858 return; /* ignore bogus packet; freed by caller */ 858 return; /* ignore bogus packet; freed by caller */
859 } else if (IGMP_V1_SEEN(in_dev)) {
860 /* This is a v3 query with v1 queriers present */
861 max_delay = IGMP_Query_Response_Interval;
862 group = 0;
863 } else if (IGMP_V2_SEEN(in_dev)) {
864 /* this is a v3 query with v2 queriers present;
865 * Interpretation of the max_delay code is problematic here.
866 * A real v2 host would use ih_code directly, while v3 has a
867 * different encoding. We use the v3 encoding as more likely
868 * to be intended in a v3 query.
869 */
870 max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
859 } else { /* v3 */ 871 } else { /* v3 */
860 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) 872 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
861 return; 873 return;
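The new "v2 queriers present" branch decodes the IGMPv3 Max Resp Code with IGMPV3_MRC(): values below 128 are literal tenths of a second, larger values use a 4-bit mantissa and 3-bit exponent. A small userspace illustration of that decoding; the formula is written out here to follow the IGMPV3_MRC encoding rather than copied from the header:

#include <stdio.h>

static unsigned int igmpv3_mrc(unsigned int code)
{
	if (code < 128)
		return code;
	return ((code & 0x0f) | 0x10) << (((code >> 4) & 0x07) + 3);
}

int main(void)
{
	/* 0x64 = 100 -> 10.0 s; 0xf8 -> (8 | 16) << 10 = 24576 tenths */
	printf("code 0x64 -> %u tenths of a second\n", igmpv3_mrc(0x64));
	printf("code 0xf8 -> %u tenths of a second\n", igmpv3_mrc(0xf8));
	return 0;
}

max_delay above is then this decoded value scaled to jiffies via HZ/IGMP_TIMER_SCALE.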
@@ -1257,14 +1269,14 @@ void ip_mc_rejoin_group(struct ip_mc_list *im)
1257 if (im->multiaddr == IGMP_ALL_HOSTS) 1269 if (im->multiaddr == IGMP_ALL_HOSTS)
1258 return; 1270 return;
1259 1271
1260 if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { 1272 /* a failover is happening and switches
1261 igmp_mod_timer(im, IGMP_Initial_Report_Delay); 1273 * must be notified immediately */
1262 return; 1274 if (IGMP_V1_SEEN(in_dev))
1263 } 1275 igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT);
1264 /* else, v3 */ 1276 else if (IGMP_V2_SEEN(in_dev))
1265 im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : 1277 igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT);
1266 IGMP_Unsolicited_Report_Count; 1278 else
1267 igmp_ifc_event(in_dev); 1279 igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT);
1268#endif 1280#endif
1269} 1281}
1270EXPORT_SYMBOL(ip_mc_rejoin_group); 1282EXPORT_SYMBOL(ip_mc_rejoin_group);
@@ -1406,6 +1418,7 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
1406 write_unlock_bh(&in_dev->mc_list_lock); 1418 write_unlock_bh(&in_dev->mc_list_lock);
1407} 1419}
1408 1420
1421/* RTNL is locked */
1409static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) 1422static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
1410{ 1423{
1411 struct flowi fl = { .nl_u = { .ip4_u = 1424 struct flowi fl = { .nl_u = { .ip4_u =
@@ -1416,15 +1429,12 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
1416 1429
1417 if (imr->imr_ifindex) { 1430 if (imr->imr_ifindex) {
1418 idev = inetdev_by_index(net, imr->imr_ifindex); 1431 idev = inetdev_by_index(net, imr->imr_ifindex);
1419 if (idev)
1420 __in_dev_put(idev);
1421 return idev; 1432 return idev;
1422 } 1433 }
1423 if (imr->imr_address.s_addr) { 1434 if (imr->imr_address.s_addr) {
1424 dev = ip_dev_find(net, imr->imr_address.s_addr); 1435 dev = __ip_dev_find(net, imr->imr_address.s_addr, false);
1425 if (!dev) 1436 if (!dev)
1426 return NULL; 1437 return NULL;
1427 dev_put(dev);
1428 } 1438 }
1429 1439
1430 if (!dev && !ip_route_output_key(net, &rt, &fl)) { 1440 if (!dev && !ip_route_output_key(net, &rt, &fl)) {
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e5fa2ddce320..ba8042665849 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -425,7 +425,7 @@ static int inet_diag_bc_run(const void *bc, int len,
425 bc += op->no; 425 bc += op->no;
426 } 426 }
427 } 427 }
428 return (len == 0); 428 return len == 0;
429} 429}
430 430
431static int valid_cc(const void *bc, int len, int cc) 431static int valid_cc(const void *bc, int len, int cc)
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index fb7ad5a21ff3..1b344f30b463 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -101,19 +101,43 @@ void inet_put_port(struct sock *sk)
101} 101}
102EXPORT_SYMBOL(inet_put_port); 102EXPORT_SYMBOL(inet_put_port);
103 103
104void __inet_inherit_port(struct sock *sk, struct sock *child) 104int __inet_inherit_port(struct sock *sk, struct sock *child)
105{ 105{
106 struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; 106 struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
107 const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num, 107 unsigned short port = inet_sk(child)->inet_num;
108 const int bhash = inet_bhashfn(sock_net(sk), port,
108 table->bhash_size); 109 table->bhash_size);
109 struct inet_bind_hashbucket *head = &table->bhash[bhash]; 110 struct inet_bind_hashbucket *head = &table->bhash[bhash];
110 struct inet_bind_bucket *tb; 111 struct inet_bind_bucket *tb;
111 112
112 spin_lock(&head->lock); 113 spin_lock(&head->lock);
113 tb = inet_csk(sk)->icsk_bind_hash; 114 tb = inet_csk(sk)->icsk_bind_hash;
115 if (tb->port != port) {
116 /* NOTE: using tproxy and redirecting skbs to a proxy
117 * on a different listener port breaks the assumption
118 * that the listener socket's icsk_bind_hash is the same
119 * as that of the child socket. We have to look up or
120 * create a new bind bucket for the child here. */
121 struct hlist_node *node;
122 inet_bind_bucket_for_each(tb, node, &head->chain) {
123 if (net_eq(ib_net(tb), sock_net(sk)) &&
124 tb->port == port)
125 break;
126 }
127 if (!node) {
128 tb = inet_bind_bucket_create(table->bind_bucket_cachep,
129 sock_net(sk), head, port);
130 if (!tb) {
131 spin_unlock(&head->lock);
132 return -ENOMEM;
133 }
134 }
135 }
114 sk_add_bind_node(child, &tb->owners); 136 sk_add_bind_node(child, &tb->owners);
115 inet_csk(child)->icsk_bind_hash = tb; 137 inet_csk(child)->icsk_bind_hash = tb;
116 spin_unlock(&head->lock); 138 spin_unlock(&head->lock);
139
140 return 0;
117} 141}
118EXPORT_SYMBOL_GPL(__inet_inherit_port); 142EXPORT_SYMBOL_GPL(__inet_inherit_port);
119 143
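Because __inet_inherit_port() can now fail when it has to create a fresh bind bucket for a tproxy-redirected child, callers have to check the return value and tear the child socket down on error. A hedged fragment of what an accepting path looks like after this change; the surrounding label and variable names are illustrative:

	if (__inet_inherit_port(sk, newsk) < 0) {
		sock_put(newsk);
		goto exit;		/* the protocol's usual error path */
	}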
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 9ffa24b9a804..9e94d7cf4f8a 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -72,18 +72,19 @@ static struct kmem_cache *peer_cachep __read_mostly;
72#define node_height(x) x->avl_height 72#define node_height(x) x->avl_height
73 73
74#define peer_avl_empty ((struct inet_peer *)&peer_fake_node) 74#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
75#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node)
75static const struct inet_peer peer_fake_node = { 76static const struct inet_peer peer_fake_node = {
76 .avl_left = peer_avl_empty, 77 .avl_left = peer_avl_empty_rcu,
77 .avl_right = peer_avl_empty, 78 .avl_right = peer_avl_empty_rcu,
78 .avl_height = 0 79 .avl_height = 0
79}; 80};
80 81
81static struct { 82static struct {
82 struct inet_peer *root; 83 struct inet_peer __rcu *root;
83 spinlock_t lock; 84 spinlock_t lock;
84 int total; 85 int total;
85} peers = { 86} peers = {
86 .root = peer_avl_empty, 87 .root = peer_avl_empty_rcu,
87 .lock = __SPIN_LOCK_UNLOCKED(peers.lock), 88 .lock = __SPIN_LOCK_UNLOCKED(peers.lock),
88 .total = 0, 89 .total = 0,
89}; 90};
@@ -156,11 +157,14 @@ static void unlink_from_unused(struct inet_peer *p)
156 */ 157 */
157#define lookup(_daddr, _stack) \ 158#define lookup(_daddr, _stack) \
158({ \ 159({ \
159 struct inet_peer *u, **v; \ 160 struct inet_peer *u; \
161 struct inet_peer __rcu **v; \
160 \ 162 \
161 stackptr = _stack; \ 163 stackptr = _stack; \
162 *stackptr++ = &peers.root; \ 164 *stackptr++ = &peers.root; \
163 for (u = peers.root; u != peer_avl_empty; ) { \ 165 for (u = rcu_dereference_protected(peers.root, \
166 lockdep_is_held(&peers.lock)); \
167 u != peer_avl_empty; ) { \
164 if (_daddr == u->v4daddr) \ 168 if (_daddr == u->v4daddr) \
165 break; \ 169 break; \
166 if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \ 170 if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \
@@ -168,7 +172,8 @@ static void unlink_from_unused(struct inet_peer *p)
168 else \ 172 else \
169 v = &u->avl_right; \ 173 v = &u->avl_right; \
170 *stackptr++ = v; \ 174 *stackptr++ = v; \
171 u = *v; \ 175 u = rcu_dereference_protected(*v, \
176 lockdep_is_held(&peers.lock)); \
172 } \ 177 } \
173 u; \ 178 u; \
174}) 179})
@@ -209,13 +214,17 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr)
209/* Called with local BH disabled and the pool lock held. */ 214/* Called with local BH disabled and the pool lock held. */
210#define lookup_rightempty(start) \ 215#define lookup_rightempty(start) \
211({ \ 216({ \
212 struct inet_peer *u, **v; \ 217 struct inet_peer *u; \
218 struct inet_peer __rcu **v; \
213 *stackptr++ = &start->avl_left; \ 219 *stackptr++ = &start->avl_left; \
214 v = &start->avl_left; \ 220 v = &start->avl_left; \
215 for (u = *v; u->avl_right != peer_avl_empty; ) { \ 221 for (u = rcu_dereference_protected(*v, \
222 lockdep_is_held(&peers.lock)); \
223 u->avl_right != peer_avl_empty_rcu; ) { \
216 v = &u->avl_right; \ 224 v = &u->avl_right; \
217 *stackptr++ = v; \ 225 *stackptr++ = v; \
218 u = *v; \ 226 u = rcu_dereference_protected(*v, \
227 lockdep_is_held(&peers.lock)); \
219 } \ 228 } \
220 u; \ 229 u; \
221}) 230})
@@ -224,74 +233,86 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr)
224 * Variable names are the proof of operation correctness. 233 * Variable names are the proof of operation correctness.
225 * Look into mm/map_avl.c for more detail description of the ideas. 234 * Look into mm/map_avl.c for more detail description of the ideas.
226 */ 235 */
227static void peer_avl_rebalance(struct inet_peer **stack[], 236static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
228 struct inet_peer ***stackend) 237 struct inet_peer __rcu ***stackend)
229{ 238{
230 struct inet_peer **nodep, *node, *l, *r; 239 struct inet_peer __rcu **nodep;
240 struct inet_peer *node, *l, *r;
231 int lh, rh; 241 int lh, rh;
232 242
233 while (stackend > stack) { 243 while (stackend > stack) {
234 nodep = *--stackend; 244 nodep = *--stackend;
235 node = *nodep; 245 node = rcu_dereference_protected(*nodep,
236 l = node->avl_left; 246 lockdep_is_held(&peers.lock));
237 r = node->avl_right; 247 l = rcu_dereference_protected(node->avl_left,
248 lockdep_is_held(&peers.lock));
249 r = rcu_dereference_protected(node->avl_right,
250 lockdep_is_held(&peers.lock));
238 lh = node_height(l); 251 lh = node_height(l);
239 rh = node_height(r); 252 rh = node_height(r);
240 if (lh > rh + 1) { /* l: RH+2 */ 253 if (lh > rh + 1) { /* l: RH+2 */
241 struct inet_peer *ll, *lr, *lrl, *lrr; 254 struct inet_peer *ll, *lr, *lrl, *lrr;
242 int lrh; 255 int lrh;
243 ll = l->avl_left; 256 ll = rcu_dereference_protected(l->avl_left,
244 lr = l->avl_right; 257 lockdep_is_held(&peers.lock));
258 lr = rcu_dereference_protected(l->avl_right,
259 lockdep_is_held(&peers.lock));
245 lrh = node_height(lr); 260 lrh = node_height(lr);
246 if (lrh <= node_height(ll)) { /* ll: RH+1 */ 261 if (lrh <= node_height(ll)) { /* ll: RH+1 */
247 node->avl_left = lr; /* lr: RH or RH+1 */ 262 RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */
248 node->avl_right = r; /* r: RH */ 263 RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
249 node->avl_height = lrh + 1; /* RH+1 or RH+2 */ 264 node->avl_height = lrh + 1; /* RH+1 or RH+2 */
250 l->avl_left = ll; /* ll: RH+1 */ 265 RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH+1 */
251 l->avl_right = node; /* node: RH+1 or RH+2 */ 266 RCU_INIT_POINTER(l->avl_right, node); /* node: RH+1 or RH+2 */
252 l->avl_height = node->avl_height + 1; 267 l->avl_height = node->avl_height + 1;
253 *nodep = l; 268 RCU_INIT_POINTER(*nodep, l);
254 } else { /* ll: RH, lr: RH+1 */ 269 } else { /* ll: RH, lr: RH+1 */
255 lrl = lr->avl_left; /* lrl: RH or RH-1 */ 270 lrl = rcu_dereference_protected(lr->avl_left,
256 lrr = lr->avl_right; /* lrr: RH or RH-1 */ 271 lockdep_is_held(&peers.lock)); /* lrl: RH or RH-1 */
257 node->avl_left = lrr; /* lrr: RH or RH-1 */ 272 lrr = rcu_dereference_protected(lr->avl_right,
258 node->avl_right = r; /* r: RH */ 273 lockdep_is_held(&peers.lock)); /* lrr: RH or RH-1 */
274 RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */
275 RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
259 node->avl_height = rh + 1; /* node: RH+1 */ 276 node->avl_height = rh + 1; /* node: RH+1 */
260 l->avl_left = ll; /* ll: RH */ 277 RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH */
261 l->avl_right = lrl; /* lrl: RH or RH-1 */ 278 RCU_INIT_POINTER(l->avl_right, lrl); /* lrl: RH or RH-1 */
262 l->avl_height = rh + 1; /* l: RH+1 */ 279 l->avl_height = rh + 1; /* l: RH+1 */
263 lr->avl_left = l; /* l: RH+1 */ 280 RCU_INIT_POINTER(lr->avl_left, l); /* l: RH+1 */
264 lr->avl_right = node; /* node: RH+1 */ 281 RCU_INIT_POINTER(lr->avl_right, node); /* node: RH+1 */
265 lr->avl_height = rh + 2; 282 lr->avl_height = rh + 2;
266 *nodep = lr; 283 RCU_INIT_POINTER(*nodep, lr);
267 } 284 }
268 } else if (rh > lh + 1) { /* r: LH+2 */ 285 } else if (rh > lh + 1) { /* r: LH+2 */
269 struct inet_peer *rr, *rl, *rlr, *rll; 286 struct inet_peer *rr, *rl, *rlr, *rll;
270 int rlh; 287 int rlh;
271 rr = r->avl_right; 288 rr = rcu_dereference_protected(r->avl_right,
272 rl = r->avl_left; 289 lockdep_is_held(&peers.lock));
290 rl = rcu_dereference_protected(r->avl_left,
291 lockdep_is_held(&peers.lock));
273 rlh = node_height(rl); 292 rlh = node_height(rl);
274 if (rlh <= node_height(rr)) { /* rr: LH+1 */ 293 if (rlh <= node_height(rr)) { /* rr: LH+1 */
275 node->avl_right = rl; /* rl: LH or LH+1 */ 294 RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */
276 node->avl_left = l; /* l: LH */ 295 RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
277 node->avl_height = rlh + 1; /* LH+1 or LH+2 */ 296 node->avl_height = rlh + 1; /* LH+1 or LH+2 */
278 r->avl_right = rr; /* rr: LH+1 */ 297 RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH+1 */
279 r->avl_left = node; /* node: LH+1 or LH+2 */ 298 RCU_INIT_POINTER(r->avl_left, node); /* node: LH+1 or LH+2 */
280 r->avl_height = node->avl_height + 1; 299 r->avl_height = node->avl_height + 1;
281 *nodep = r; 300 RCU_INIT_POINTER(*nodep, r);
282 } else { /* rr: RH, rl: RH+1 */ 301 } else { /* rr: RH, rl: RH+1 */
283 rlr = rl->avl_right; /* rlr: LH or LH-1 */ 302 rlr = rcu_dereference_protected(rl->avl_right,
284 rll = rl->avl_left; /* rll: LH or LH-1 */ 303 lockdep_is_held(&peers.lock)); /* rlr: LH or LH-1 */
285 node->avl_right = rll; /* rll: LH or LH-1 */ 304 rll = rcu_dereference_protected(rl->avl_left,
286 node->avl_left = l; /* l: LH */ 305 lockdep_is_held(&peers.lock)); /* rll: LH or LH-1 */
306 RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */
307 RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
287 node->avl_height = lh + 1; /* node: LH+1 */ 308 node->avl_height = lh + 1; /* node: LH+1 */
288 r->avl_right = rr; /* rr: LH */ 309 RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH */
289 r->avl_left = rlr; /* rlr: LH or LH-1 */ 310 RCU_INIT_POINTER(r->avl_left, rlr); /* rlr: LH or LH-1 */
290 r->avl_height = lh + 1; /* r: LH+1 */ 311 r->avl_height = lh + 1; /* r: LH+1 */
291 rl->avl_right = r; /* r: LH+1 */ 312 RCU_INIT_POINTER(rl->avl_right, r); /* r: LH+1 */
292 rl->avl_left = node; /* node: LH+1 */ 313 RCU_INIT_POINTER(rl->avl_left, node); /* node: LH+1 */
293 rl->avl_height = lh + 2; 314 rl->avl_height = lh + 2;
294 *nodep = rl; 315 RCU_INIT_POINTER(*nodep, rl);
295 } 316 }
296 } else { 317 } else {
297 node->avl_height = (lh > rh ? lh : rh) + 1; 318 node->avl_height = (lh > rh ? lh : rh) + 1;
@@ -303,10 +324,10 @@ static void peer_avl_rebalance(struct inet_peer **stack[],
303#define link_to_pool(n) \ 324#define link_to_pool(n) \
304do { \ 325do { \
305 n->avl_height = 1; \ 326 n->avl_height = 1; \
306 n->avl_left = peer_avl_empty; \ 327 n->avl_left = peer_avl_empty_rcu; \
307 n->avl_right = peer_avl_empty; \ 328 n->avl_right = peer_avl_empty_rcu; \
308 smp_wmb(); /* lockless readers can catch us now */ \ 329 /* lockless readers can catch us now */ \
309 **--stackptr = n; \ 330 rcu_assign_pointer(**--stackptr, n); \
310 peer_avl_rebalance(stack, stackptr); \ 331 peer_avl_rebalance(stack, stackptr); \
311} while (0) 332} while (0)
312 333
@@ -330,24 +351,25 @@ static void unlink_from_pool(struct inet_peer *p)
330 * We use refcnt=-1 to alert lockless readers this entry is deleted. 351 * We use refcnt=-1 to alert lockless readers this entry is deleted.
331 */ 352 */
332 if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { 353 if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
333 struct inet_peer **stack[PEER_MAXDEPTH]; 354 struct inet_peer __rcu **stack[PEER_MAXDEPTH];
334 struct inet_peer ***stackptr, ***delp; 355 struct inet_peer __rcu ***stackptr, ***delp;
335 if (lookup(p->v4daddr, stack) != p) 356 if (lookup(p->v4daddr, stack) != p)
336 BUG(); 357 BUG();
337 delp = stackptr - 1; /* *delp[0] == p */ 358 delp = stackptr - 1; /* *delp[0] == p */
338 if (p->avl_left == peer_avl_empty) { 359 if (p->avl_left == peer_avl_empty_rcu) {
339 *delp[0] = p->avl_right; 360 *delp[0] = p->avl_right;
340 --stackptr; 361 --stackptr;
341 } else { 362 } else {
342 /* look for a node to insert instead of p */ 363 /* look for a node to insert instead of p */
343 struct inet_peer *t; 364 struct inet_peer *t;
344 t = lookup_rightempty(p); 365 t = lookup_rightempty(p);
345 BUG_ON(*stackptr[-1] != t); 366 BUG_ON(rcu_dereference_protected(*stackptr[-1],
367 lockdep_is_held(&peers.lock)) != t);
346 **--stackptr = t->avl_left; 368 **--stackptr = t->avl_left;
347 /* t is removed, t->v4daddr > x->v4daddr for any 369 /* t is removed, t->v4daddr > x->v4daddr for any
348 * x in p->avl_left subtree. 370 * x in p->avl_left subtree.
349 * Put t in the old place of p. */ 371 * Put t in the old place of p. */
350 *delp[0] = t; 372 RCU_INIT_POINTER(*delp[0], t);
351 t->avl_left = p->avl_left; 373 t->avl_left = p->avl_left;
352 t->avl_right = p->avl_right; 374 t->avl_right = p->avl_right;
353 t->avl_height = p->avl_height; 375 t->avl_height = p->avl_height;
@@ -414,7 +436,7 @@ static int cleanup_once(unsigned long ttl)
414struct inet_peer *inet_getpeer(__be32 daddr, int create) 436struct inet_peer *inet_getpeer(__be32 daddr, int create)
415{ 437{
416 struct inet_peer *p; 438 struct inet_peer *p;
417 struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr; 439 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
418 440
419 /* Look up for the address quickly, lockless. 441 /* Look up for the address quickly, lockless.
420 * Because of a concurrent writer, we might not find an existing entry. 442 * Because of a concurrent writer, we might not find an existing entry.
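The inetpeer changes above are mostly one idiom applied many times: pointers that lockless readers follow are annotated __rcu, updaters that hold peers.lock read them with rcu_dereference_protected(..., lockdep_is_held(...)) and publish with rcu_assign_pointer() or RCU_INIT_POINTER(). A minimal sketch of that idiom on a made-up singly linked list; the types and names are illustrative, not the inetpeer AVL tree:

#include <linux/spinlock.h>
#include <linux/rcupdate.h>

struct item {
	int			key;
	struct item __rcu	*next;
};

static struct item __rcu *item_list;
static DEFINE_SPINLOCK(item_lock);

static void item_insert(struct item *n)
{
	spin_lock_bh(&item_lock);
	/* writer side: the lock is held, so _protected() is the right accessor */
	rcu_assign_pointer(n->next,
			   rcu_dereference_protected(item_list,
					lockdep_is_held(&item_lock)));
	rcu_assign_pointer(item_list, n);	/* publish to lockless readers */
	spin_unlock_bh(&item_lock);
}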
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b7c41654dde5..168440834ade 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -116,11 +116,11 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a)
116 struct ip4_create_arg *arg = a; 116 struct ip4_create_arg *arg = a;
117 117
118 qp = container_of(q, struct ipq, q); 118 qp = container_of(q, struct ipq, q);
119 return (qp->id == arg->iph->id && 119 return qp->id == arg->iph->id &&
120 qp->saddr == arg->iph->saddr && 120 qp->saddr == arg->iph->saddr &&
121 qp->daddr == arg->iph->daddr && 121 qp->daddr == arg->iph->daddr &&
122 qp->protocol == arg->iph->protocol && 122 qp->protocol == arg->iph->protocol &&
123 qp->user == arg->user); 123 qp->user == arg->user;
124} 124}
125 125
126/* Memory Tracking Functions. */ 126/* Memory Tracking Functions. */
@@ -542,7 +542,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
542 /* If the first fragment is fragmented itself, we split 542 /* If the first fragment is fragmented itself, we split
543 * it to two chunks: the first with data and paged part 543 * it to two chunks: the first with data and paged part
544 * and the second, holding only fragments. */ 544 * and the second, holding only fragments. */
545 if (skb_has_frags(head)) { 545 if (skb_has_frag_list(head)) {
546 struct sk_buff *clone; 546 struct sk_buff *clone;
547 int i, plen = 0; 547 int i, plen = 0;
548 548
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 945b20a5ad50..70ff77f02eee 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -44,8 +44,9 @@
44#include <net/net_namespace.h> 44#include <net/net_namespace.h>
45#include <net/netns/generic.h> 45#include <net/netns/generic.h>
46#include <net/rtnetlink.h> 46#include <net/rtnetlink.h>
47#include <net/gre.h>
47 48
48#ifdef CONFIG_IPV6 49#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
49#include <net/ipv6.h> 50#include <net/ipv6.h>
50#include <net/ip6_fib.h> 51#include <net/ip6_fib.h>
51#include <net/ip6_route.h> 52#include <net/ip6_route.h>
@@ -63,13 +64,13 @@
63 We cannot track such dead loops during route installation, 64 We cannot track such dead loops during route installation,
64 it is infeasible task. The most general solutions would be 65 it is infeasible task. The most general solutions would be
65 to keep skb->encapsulation counter (sort of local ttl), 66 to keep skb->encapsulation counter (sort of local ttl),
66 and silently drop packet when it expires. It is the best 67 solution, but it supposes maintaining new variable in ALL
67 solution, but it supposes maintaining new variable in ALL 68 skb, even if no tunneling is used.
68 skb, even if no tunneling is used. 69 skb, even if no tunneling is used.
69 70
70 Current solution: HARD_TX_LOCK lock breaks dead loops. 71 Current solution: xmit_recursion breaks dead loops. This is a percpu
71 72 counter, since when we enter the first ndo_xmit(), cpu migration is
72 73 forbidden. We force an exit if this counter reaches RECURSION_LIMIT
73 74
74 2. Networking dead loops would not kill routers, but would really 75 2. Networking dead loops would not kill routers, but would really
75 kill network. IP hop limit plays role of "t->recursion" in this case, 76 kill network. IP hop limit plays role of "t->recursion" in this case,
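The updated comment refers to the per-cpu xmit_recursion counter in the core transmit path. A hedged sketch of what such a guard looks like; the helper name and the limit value are illustrative, and the real check lives in net/core/dev.c:

static DEFINE_PER_CPU(int, xmit_recursion);
#define XMIT_RECURSION_LIMIT	3		/* illustrative value */

static int xmit_one_guarded(struct sk_buff *skb, struct net_device *dev,
			    struct netdev_queue *txq)
{
	int rc;

	if (__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT) {
		kfree_skb(skb);			/* break the dead loop */
		return NET_XMIT_DROP;
	}
	__this_cpu_inc(xmit_recursion);
	rc = dev_hard_start_xmit(skb, dev, txq);
	__this_cpu_dec(xmit_recursion);
	return rc;
}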
@@ -128,7 +129,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev);
128 129
129static int ipgre_net_id __read_mostly; 130static int ipgre_net_id __read_mostly;
130struct ipgre_net { 131struct ipgre_net {
131 struct ip_tunnel *tunnels[4][HASH_SIZE]; 132 struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
132 133
133 struct net_device *fb_tunnel_dev; 134 struct net_device *fb_tunnel_dev;
134}; 135};
@@ -158,13 +159,40 @@ struct ipgre_net {
158#define tunnels_l tunnels[1] 159#define tunnels_l tunnels[1]
159#define tunnels_wc tunnels[0] 160#define tunnels_wc tunnels[0]
160/* 161/*
161 * Locking : hash tables are protected by RCU and a spinlock 162 * Locking : hash tables are protected by RCU and RTNL
162 */ 163 */
163static DEFINE_SPINLOCK(ipgre_lock);
164 164
165#define for_each_ip_tunnel_rcu(start) \ 165#define for_each_ip_tunnel_rcu(start) \
166 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) 166 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
167 167
168/* often modified stats are per cpu, other are shared (netdev->stats) */
169struct pcpu_tstats {
170 unsigned long rx_packets;
171 unsigned long rx_bytes;
172 unsigned long tx_packets;
173 unsigned long tx_bytes;
174};
175
176static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
177{
178 struct pcpu_tstats sum = { 0 };
179 int i;
180
181 for_each_possible_cpu(i) {
182 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
183
184 sum.rx_packets += tstats->rx_packets;
185 sum.rx_bytes += tstats->rx_bytes;
186 sum.tx_packets += tstats->tx_packets;
187 sum.tx_bytes += tstats->tx_bytes;
188 }
189 dev->stats.rx_packets = sum.rx_packets;
190 dev->stats.rx_bytes = sum.rx_bytes;
191 dev->stats.tx_packets = sum.tx_packets;
192 dev->stats.tx_bytes = sum.tx_bytes;
193 return &dev->stats;
194}
195
168/* Given src, dst and key, find appropriate for input tunnel. */ 196/* Given src, dst and key, find appropriate for input tunnel. */
169 197
170static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, 198static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
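ipgre_get_stats() above folds the per-cpu counters back into dev->stats for readers; the other half of the conversion is allocating dev->tstats per device and letting the hot paths touch only the local CPU's counters. A hedged sketch, assuming the net_device tstats member that accompanies this series; the function names are illustrative:

static int tunnel_init_sketch(struct net_device *dev)
{
	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;
	return 0;
}

static void tunnel_free_sketch(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}

In the receive path the update is then just this_cpu_ptr(dev->tstats) followed by plain increments, as the ipgre_rcv() hunk further down shows.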
@@ -173,8 +201,8 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
173{ 201{
174 struct net *net = dev_net(dev); 202 struct net *net = dev_net(dev);
175 int link = dev->ifindex; 203 int link = dev->ifindex;
176 unsigned h0 = HASH(remote); 204 unsigned int h0 = HASH(remote);
177 unsigned h1 = HASH(key); 205 unsigned int h1 = HASH(key);
178 struct ip_tunnel *t, *cand = NULL; 206 struct ip_tunnel *t, *cand = NULL;
179 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 207 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
180 int dev_type = (gre_proto == htons(ETH_P_TEB)) ? 208 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
@@ -289,13 +317,13 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
289 return NULL; 317 return NULL;
290} 318}
291 319
292static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign, 320static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
293 struct ip_tunnel_parm *parms) 321 struct ip_tunnel_parm *parms)
294{ 322{
295 __be32 remote = parms->iph.daddr; 323 __be32 remote = parms->iph.daddr;
296 __be32 local = parms->iph.saddr; 324 __be32 local = parms->iph.saddr;
297 __be32 key = parms->i_key; 325 __be32 key = parms->i_key;
298 unsigned h = HASH(key); 326 unsigned int h = HASH(key);
299 int prio = 0; 327 int prio = 0;
300 328
301 if (local) 329 if (local)
@@ -308,7 +336,7 @@ static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
308 return &ign->tunnels[prio][h]; 336 return &ign->tunnels[prio][h];
309} 337}
310 338
311static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign, 339static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
312 struct ip_tunnel *t) 340 struct ip_tunnel *t)
313{ 341{
314 return __ipgre_bucket(ign, &t->parms); 342 return __ipgre_bucket(ign, &t->parms);
@@ -316,23 +344,22 @@ static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
316 344
317static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t) 345static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
318{ 346{
319 struct ip_tunnel **tp = ipgre_bucket(ign, t); 347 struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
320 348
321 spin_lock_bh(&ipgre_lock); 349 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
322 t->next = *tp;
323 rcu_assign_pointer(*tp, t); 350 rcu_assign_pointer(*tp, t);
324 spin_unlock_bh(&ipgre_lock);
325} 351}
326 352
327static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) 353static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
328{ 354{
329 struct ip_tunnel **tp; 355 struct ip_tunnel __rcu **tp;
330 356 struct ip_tunnel *iter;
331 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) { 357
332 if (t == *tp) { 358 for (tp = ipgre_bucket(ign, t);
333 spin_lock_bh(&ipgre_lock); 359 (iter = rtnl_dereference(*tp)) != NULL;
334 *tp = t->next; 360 tp = &iter->next) {
335 spin_unlock_bh(&ipgre_lock); 361 if (t == iter) {
362 rcu_assign_pointer(*tp, t->next);
336 break; 363 break;
337 } 364 }
338 } 365 }
@@ -346,10 +373,13 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
346 __be32 local = parms->iph.saddr; 373 __be32 local = parms->iph.saddr;
347 __be32 key = parms->i_key; 374 __be32 key = parms->i_key;
348 int link = parms->link; 375 int link = parms->link;
349 struct ip_tunnel *t, **tp; 376 struct ip_tunnel *t;
377 struct ip_tunnel __rcu **tp;
350 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 378 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
351 379
352 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) 380 for (tp = __ipgre_bucket(ign, parms);
381 (t = rtnl_dereference(*tp)) != NULL;
382 tp = &t->next)
353 if (local == t->parms.iph.saddr && 383 if (local == t->parms.iph.saddr &&
354 remote == t->parms.iph.daddr && 384 remote == t->parms.iph.daddr &&
355 key == t->parms.i_key && 385 key == t->parms.i_key &&
@@ -360,7 +390,7 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
360 return t; 390 return t;
361} 391}
362 392
363static struct ip_tunnel * ipgre_tunnel_locate(struct net *net, 393static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
364 struct ip_tunnel_parm *parms, int create) 394 struct ip_tunnel_parm *parms, int create)
365{ 395{
366 struct ip_tunnel *t, *nt; 396 struct ip_tunnel *t, *nt;
@@ -582,7 +612,7 @@ static int ipgre_rcv(struct sk_buff *skb)
582 if ((tunnel = ipgre_tunnel_lookup(skb->dev, 612 if ((tunnel = ipgre_tunnel_lookup(skb->dev,
583 iph->saddr, iph->daddr, key, 613 iph->saddr, iph->daddr, key,
584 gre_proto))) { 614 gre_proto))) {
585 struct net_device_stats *stats = &tunnel->dev->stats; 615 struct pcpu_tstats *tstats;
586 616
587 secpath_reset(skb); 617 secpath_reset(skb);
588 618
@@ -606,22 +636,22 @@ static int ipgre_rcv(struct sk_buff *skb)
606 /* Looped back packet, drop it! */ 636 /* Looped back packet, drop it! */
607 if (skb_rtable(skb)->fl.iif == 0) 637 if (skb_rtable(skb)->fl.iif == 0)
608 goto drop; 638 goto drop;
609 stats->multicast++; 639 tunnel->dev->stats.multicast++;
610 skb->pkt_type = PACKET_BROADCAST; 640 skb->pkt_type = PACKET_BROADCAST;
611 } 641 }
612#endif 642#endif
613 643
614 if (((flags&GRE_CSUM) && csum) || 644 if (((flags&GRE_CSUM) && csum) ||
615 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { 645 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
616 stats->rx_crc_errors++; 646 tunnel->dev->stats.rx_crc_errors++;
617 stats->rx_errors++; 647 tunnel->dev->stats.rx_errors++;
618 goto drop; 648 goto drop;
619 } 649 }
620 if (tunnel->parms.i_flags&GRE_SEQ) { 650 if (tunnel->parms.i_flags&GRE_SEQ) {
621 if (!(flags&GRE_SEQ) || 651 if (!(flags&GRE_SEQ) ||
622 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { 652 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
623 stats->rx_fifo_errors++; 653 tunnel->dev->stats.rx_fifo_errors++;
624 stats->rx_errors++; 654 tunnel->dev->stats.rx_errors++;
625 goto drop; 655 goto drop;
626 } 656 }
627 tunnel->i_seqno = seqno + 1; 657 tunnel->i_seqno = seqno + 1;
@@ -630,8 +660,8 @@ static int ipgre_rcv(struct sk_buff *skb)
630 /* Warning: All skb pointers will be invalidated! */ 660 /* Warning: All skb pointers will be invalidated! */
631 if (tunnel->dev->type == ARPHRD_ETHER) { 661 if (tunnel->dev->type == ARPHRD_ETHER) {
632 if (!pskb_may_pull(skb, ETH_HLEN)) { 662 if (!pskb_may_pull(skb, ETH_HLEN)) {
633 stats->rx_length_errors++; 663 tunnel->dev->stats.rx_length_errors++;
634 stats->rx_errors++; 664 tunnel->dev->stats.rx_errors++;
635 goto drop; 665 goto drop;
636 } 666 }
637 667
@@ -640,14 +670,19 @@ static int ipgre_rcv(struct sk_buff *skb)
640 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 670 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
641 } 671 }
642 672
643 skb_tunnel_rx(skb, tunnel->dev); 673 tstats = this_cpu_ptr(tunnel->dev->tstats);
674 tstats->rx_packets++;
675 tstats->rx_bytes += skb->len;
676
677 __skb_tunnel_rx(skb, tunnel->dev);
644 678
645 skb_reset_network_header(skb); 679 skb_reset_network_header(skb);
646 ipgre_ecn_decapsulate(iph, skb); 680 ipgre_ecn_decapsulate(iph, skb);
647 681
648 netif_rx(skb); 682 netif_rx(skb);
683
649 rcu_read_unlock(); 684 rcu_read_unlock();
650 return(0); 685 return 0;
651 } 686 }
652 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 687 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
653 688
@@ -655,20 +690,19 @@ drop:
655 rcu_read_unlock(); 690 rcu_read_unlock();
656drop_nolock: 691drop_nolock:
657 kfree_skb(skb); 692 kfree_skb(skb);
658 return(0); 693 return 0;
659} 694}
660 695
661static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 696static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
662{ 697{
663 struct ip_tunnel *tunnel = netdev_priv(dev); 698 struct ip_tunnel *tunnel = netdev_priv(dev);
664 struct net_device_stats *stats = &dev->stats; 699 struct pcpu_tstats *tstats;
665 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
666 struct iphdr *old_iph = ip_hdr(skb); 700 struct iphdr *old_iph = ip_hdr(skb);
667 struct iphdr *tiph; 701 struct iphdr *tiph;
668 u8 tos; 702 u8 tos;
669 __be16 df; 703 __be16 df;
670 struct rtable *rt; /* Route to the other host */ 704 struct rtable *rt; /* Route to the other host */
671 struct net_device *tdev; /* Device to other host */ 705 struct net_device *tdev; /* Device to other host */
672 struct iphdr *iph; /* Our new IP header */ 706 struct iphdr *iph; /* Our new IP header */
673 unsigned int max_headroom; /* The extra header space needed */ 707 unsigned int max_headroom; /* The extra header space needed */
674 int gre_hlen; 708 int gre_hlen;
@@ -690,7 +724,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
690 /* NBMA tunnel */ 724 /* NBMA tunnel */
691 725
692 if (skb_dst(skb) == NULL) { 726 if (skb_dst(skb) == NULL) {
693 stats->tx_fifo_errors++; 727 dev->stats.tx_fifo_errors++;
694 goto tx_error; 728 goto tx_error;
695 } 729 }
696 730
@@ -699,7 +733,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
699 if ((dst = rt->rt_gateway) == 0) 733 if ((dst = rt->rt_gateway) == 0)
700 goto tx_error_icmp; 734 goto tx_error_icmp;
701 } 735 }
702#ifdef CONFIG_IPV6 736#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
703 else if (skb->protocol == htons(ETH_P_IPV6)) { 737 else if (skb->protocol == htons(ETH_P_IPV6)) {
704 struct in6_addr *addr6; 738 struct in6_addr *addr6;
705 int addr_type; 739 int addr_type;
@@ -736,14 +770,20 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
736 } 770 }
737 771
738 { 772 {
739 struct flowi fl = { .oif = tunnel->parms.link, 773 struct flowi fl = {
740 .nl_u = { .ip4_u = 774 .oif = tunnel->parms.link,
741 { .daddr = dst, 775 .nl_u = {
742 .saddr = tiph->saddr, 776 .ip4_u = {
743 .tos = RT_TOS(tos) } }, 777 .daddr = dst,
744 .proto = IPPROTO_GRE }; 778 .saddr = tiph->saddr,
779 .tos = RT_TOS(tos)
780 }
781 },
782 .proto = IPPROTO_GRE
783 }
784;
745 if (ip_route_output_key(dev_net(dev), &rt, &fl)) { 785 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
746 stats->tx_carrier_errors++; 786 dev->stats.tx_carrier_errors++;
747 goto tx_error; 787 goto tx_error;
748 } 788 }
749 } 789 }
@@ -751,7 +791,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
751 791
752 if (tdev == dev) { 792 if (tdev == dev) {
753 ip_rt_put(rt); 793 ip_rt_put(rt);
754 stats->collisions++; 794 dev->stats.collisions++;
755 goto tx_error; 795 goto tx_error;
756 } 796 }
757 797
@@ -774,7 +814,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
774 goto tx_error; 814 goto tx_error;
775 } 815 }
776 } 816 }
777#ifdef CONFIG_IPV6 817#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
778 else if (skb->protocol == htons(ETH_P_IPV6)) { 818 else if (skb->protocol == htons(ETH_P_IPV6)) {
779 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); 819 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
780 820
@@ -814,7 +854,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
814 dev->needed_headroom = max_headroom; 854 dev->needed_headroom = max_headroom;
815 if (!new_skb) { 855 if (!new_skb) {
816 ip_rt_put(rt); 856 ip_rt_put(rt);
817 txq->tx_dropped++; 857 dev->stats.tx_dropped++;
818 dev_kfree_skb(skb); 858 dev_kfree_skb(skb);
819 return NETDEV_TX_OK; 859 return NETDEV_TX_OK;
820 } 860 }
@@ -850,7 +890,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
850 if ((iph->ttl = tiph->ttl) == 0) { 890 if ((iph->ttl = tiph->ttl) == 0) {
851 if (skb->protocol == htons(ETH_P_IP)) 891 if (skb->protocol == htons(ETH_P_IP))
852 iph->ttl = old_iph->ttl; 892 iph->ttl = old_iph->ttl;
853#ifdef CONFIG_IPV6 893#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
854 else if (skb->protocol == htons(ETH_P_IPV6)) 894 else if (skb->protocol == htons(ETH_P_IPV6))
855 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit; 895 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
856#endif 896#endif
@@ -881,15 +921,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
881 } 921 }
882 922
883 nf_reset(skb); 923 nf_reset(skb);
884 924 tstats = this_cpu_ptr(dev->tstats);
885 IPTUNNEL_XMIT(); 925 __IPTUNNEL_XMIT(tstats, &dev->stats);
886 return NETDEV_TX_OK; 926 return NETDEV_TX_OK;
887 927
888tx_error_icmp: 928tx_error_icmp:
889 dst_link_failure(skb); 929 dst_link_failure(skb);
890 930
891tx_error: 931tx_error:
892 stats->tx_errors++; 932 dev->stats.tx_errors++;
893 dev_kfree_skb(skb); 933 dev_kfree_skb(skb);
894 return NETDEV_TX_OK; 934 return NETDEV_TX_OK;
895} 935}
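
This transmit hunk, together with the receive hunk above it, splits the statistics in two: rarely hit error counters stay in the shared dev->stats, while the per-packet rx/tx counters move into per-CPU memory (dev->tstats) that each CPU bumps locklessly and that .ndo_get_stats folds together on demand. The ipip.c portion of this merge further down shows the full pcpu_tstats/ipip_get_stats version of the same code; a condensed sketch of the pattern follows, where pcpu_tstats matches the patch and the demo_* names are illustrative:

#include <linux/netdevice.h>
#include <linux/percpu.h>

struct pcpu_tstats {			/* defined per driver, as in the patch */
	unsigned long rx_packets;
	unsigned long rx_bytes;
	unsigned long tx_packets;
	unsigned long tx_bytes;
};

static int demo_init(struct net_device *dev)
{
	dev->tstats = alloc_percpu(struct pcpu_tstats);
	return dev->tstats ? 0 : -ENOMEM;
}

/* Hot path: no lock, no atomics; each CPU only touches its own slot. */
static void demo_count_rx(struct net_device *dev, unsigned int len)
{
	struct pcpu_tstats *tstats = this_cpu_ptr(dev->tstats);

	tstats->rx_packets++;
	tstats->rx_bytes += len;
}

/* .ndo_get_stats: fold the per-CPU slots into dev->stats when asked. */
static struct net_device_stats *demo_get_stats(struct net_device *dev)
{
	struct pcpu_tstats sum = { 0 };
	int i;

	for_each_possible_cpu(i) {
		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);

		sum.rx_packets += tstats->rx_packets;
		sum.rx_bytes   += tstats->rx_bytes;
		sum.tx_packets += tstats->tx_packets;
		sum.tx_bytes   += tstats->tx_bytes;
	}
	dev->stats.rx_packets = sum.rx_packets;
	dev->stats.rx_bytes   = sum.rx_bytes;
	dev->stats.tx_packets = sum.tx_packets;
	dev->stats.tx_bytes   = sum.tx_bytes;
	return &dev->stats;
}

/* The per-CPU memory must outlive the last user of the device, which is
 * why the patch frees it from the netdev destructor (ipgre_dev_free). */
static void demo_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}
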
@@ -909,13 +949,19 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
909 /* Guess output device to choose reasonable mtu and needed_headroom */ 949 /* Guess output device to choose reasonable mtu and needed_headroom */
910 950
911 if (iph->daddr) { 951 if (iph->daddr) {
912 struct flowi fl = { .oif = tunnel->parms.link, 952 struct flowi fl = {
913 .nl_u = { .ip4_u = 953 .oif = tunnel->parms.link,
914 { .daddr = iph->daddr, 954 .nl_u = {
915 .saddr = iph->saddr, 955 .ip4_u = {
916 .tos = RT_TOS(iph->tos) } }, 956 .daddr = iph->daddr,
917 .proto = IPPROTO_GRE }; 957 .saddr = iph->saddr,
958 .tos = RT_TOS(iph->tos)
959 }
960 },
961 .proto = IPPROTO_GRE
962 };
918 struct rtable *rt; 963 struct rtable *rt;
964
919 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { 965 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
920 tdev = rt->dst.dev; 966 tdev = rt->dst.dev;
921 ip_rt_put(rt); 967 ip_rt_put(rt);
@@ -1012,7 +1058,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1012 break; 1058 break;
1013 } 1059 }
1014 } else { 1060 } else {
1015 unsigned nflags = 0; 1061 unsigned int nflags = 0;
1016 1062
1017 t = netdev_priv(dev); 1063 t = netdev_priv(dev);
1018 1064
@@ -1026,6 +1072,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1026 break; 1072 break;
1027 } 1073 }
1028 ipgre_tunnel_unlink(ign, t); 1074 ipgre_tunnel_unlink(ign, t);
1075 synchronize_net();
1029 t->parms.iph.saddr = p.iph.saddr; 1076 t->parms.iph.saddr = p.iph.saddr;
1030 t->parms.iph.daddr = p.iph.daddr; 1077 t->parms.iph.daddr = p.iph.daddr;
1031 t->parms.i_key = p.i_key; 1078 t->parms.i_key = p.i_key;
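
The synchronize_net() added in this hunk sits between unlinking the tunnel and rewriting its addresses and keys. With the old spinlock gone, a lockless reader that found the tunnel through the hash chain just before the unlink could otherwise watch the lookup keys change underneath it; waiting out one RCU grace period before the rewrite (and the later re-link) closes that window. The shape of such an update, reduced to a toy single-entry chain with made-up names:

#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/netdevice.h>	/* synchronize_net() */
#include <linux/rtnetlink.h>

struct demo_tunnel {
	struct demo_tunnel __rcu *next;
	__be32 daddr;			/* lookup key RCU readers match on */
};

static struct demo_tunnel __rcu *demo_chain;

/* Writer path, RTNL held: re-key an entry that readers may still hold. */
static void demo_rekey(struct demo_tunnel *t, __be32 new_daddr)
{
	rcu_assign_pointer(demo_chain, NULL);	/* 1. unhook (one-entry chain) */
	synchronize_net();			/* 2. wait for current readers */
	t->daddr = new_daddr;			/* 3. safe to rewrite the key  */
	rcu_assign_pointer(demo_chain, t);	/* 4. publish under the new key */
}
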
@@ -1125,7 +1172,7 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1125 1172
1126static int ipgre_header(struct sk_buff *skb, struct net_device *dev, 1173static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1127 unsigned short type, 1174 unsigned short type,
1128 const void *daddr, const void *saddr, unsigned len) 1175 const void *daddr, const void *saddr, unsigned int len)
1129{ 1176{
1130 struct ip_tunnel *t = netdev_priv(dev); 1177 struct ip_tunnel *t = netdev_priv(dev);
1131 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); 1178 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
@@ -1167,13 +1214,19 @@ static int ipgre_open(struct net_device *dev)
1167 struct ip_tunnel *t = netdev_priv(dev); 1214 struct ip_tunnel *t = netdev_priv(dev);
1168 1215
1169 if (ipv4_is_multicast(t->parms.iph.daddr)) { 1216 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1170 struct flowi fl = { .oif = t->parms.link, 1217 struct flowi fl = {
1171 .nl_u = { .ip4_u = 1218 .oif = t->parms.link,
1172 { .daddr = t->parms.iph.daddr, 1219 .nl_u = {
1173 .saddr = t->parms.iph.saddr, 1220 .ip4_u = {
1174 .tos = RT_TOS(t->parms.iph.tos) } }, 1221 .daddr = t->parms.iph.daddr,
1175 .proto = IPPROTO_GRE }; 1222 .saddr = t->parms.iph.saddr,
1223 .tos = RT_TOS(t->parms.iph.tos)
1224 }
1225 },
1226 .proto = IPPROTO_GRE
1227 };
1176 struct rtable *rt; 1228 struct rtable *rt;
1229
1177 if (ip_route_output_key(dev_net(dev), &rt, &fl)) 1230 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1178 return -EADDRNOTAVAIL; 1231 return -EADDRNOTAVAIL;
1179 dev = rt->dst.dev; 1232 dev = rt->dst.dev;
@@ -1193,10 +1246,8 @@ static int ipgre_close(struct net_device *dev)
1193 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { 1246 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1194 struct in_device *in_dev; 1247 struct in_device *in_dev;
1195 in_dev = inetdev_by_index(dev_net(dev), t->mlink); 1248 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1196 if (in_dev) { 1249 if (in_dev)
1197 ip_mc_dec_group(in_dev, t->parms.iph.daddr); 1250 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1198 in_dev_put(in_dev);
1199 }
1200 } 1251 }
1201 return 0; 1252 return 0;
1202} 1253}
@@ -1213,12 +1264,19 @@ static const struct net_device_ops ipgre_netdev_ops = {
1213 .ndo_start_xmit = ipgre_tunnel_xmit, 1264 .ndo_start_xmit = ipgre_tunnel_xmit,
1214 .ndo_do_ioctl = ipgre_tunnel_ioctl, 1265 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1215 .ndo_change_mtu = ipgre_tunnel_change_mtu, 1266 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1267 .ndo_get_stats = ipgre_get_stats,
1216}; 1268};
1217 1269
1270static void ipgre_dev_free(struct net_device *dev)
1271{
1272 free_percpu(dev->tstats);
1273 free_netdev(dev);
1274}
1275
1218static void ipgre_tunnel_setup(struct net_device *dev) 1276static void ipgre_tunnel_setup(struct net_device *dev)
1219{ 1277{
1220 dev->netdev_ops = &ipgre_netdev_ops; 1278 dev->netdev_ops = &ipgre_netdev_ops;
1221 dev->destructor = free_netdev; 1279 dev->destructor = ipgre_dev_free;
1222 1280
1223 dev->type = ARPHRD_IPGRE; 1281 dev->type = ARPHRD_IPGRE;
1224 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; 1282 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
@@ -1256,6 +1314,10 @@ static int ipgre_tunnel_init(struct net_device *dev)
1256 } else 1314 } else
1257 dev->header_ops = &ipgre_header_ops; 1315 dev->header_ops = &ipgre_header_ops;
1258 1316
1317 dev->tstats = alloc_percpu(struct pcpu_tstats);
1318 if (!dev->tstats)
1319 return -ENOMEM;
1320
1259 return 0; 1321 return 0;
1260} 1322}
1261 1323
@@ -1263,7 +1325,6 @@ static void ipgre_fb_tunnel_init(struct net_device *dev)
1263{ 1325{
1264 struct ip_tunnel *tunnel = netdev_priv(dev); 1326 struct ip_tunnel *tunnel = netdev_priv(dev);
1265 struct iphdr *iph = &tunnel->parms.iph; 1327 struct iphdr *iph = &tunnel->parms.iph;
1266 struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1267 1328
1268 tunnel->dev = dev; 1329 tunnel->dev = dev;
1269 strcpy(tunnel->parms.name, dev->name); 1330 strcpy(tunnel->parms.name, dev->name);
@@ -1274,14 +1335,12 @@ static void ipgre_fb_tunnel_init(struct net_device *dev)
1274 tunnel->hlen = sizeof(struct iphdr) + 4; 1335 tunnel->hlen = sizeof(struct iphdr) + 4;
1275 1336
1276 dev_hold(dev); 1337 dev_hold(dev);
1277 ign->tunnels_wc[0] = tunnel;
1278} 1338}
1279 1339
1280 1340
1281static const struct net_protocol ipgre_protocol = { 1341static const struct gre_protocol ipgre_protocol = {
1282 .handler = ipgre_rcv, 1342 .handler = ipgre_rcv,
1283 .err_handler = ipgre_err, 1343 .err_handler = ipgre_err,
1284 .netns_ok = 1,
1285}; 1344};
1286 1345
1287static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) 1346static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
@@ -1291,11 +1350,13 @@ static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1291 for (prio = 0; prio < 4; prio++) { 1350 for (prio = 0; prio < 4; prio++) {
1292 int h; 1351 int h;
1293 for (h = 0; h < HASH_SIZE; h++) { 1352 for (h = 0; h < HASH_SIZE; h++) {
1294 struct ip_tunnel *t = ign->tunnels[prio][h]; 1353 struct ip_tunnel *t;
1354
1355 t = rtnl_dereference(ign->tunnels[prio][h]);
1295 1356
1296 while (t != NULL) { 1357 while (t != NULL) {
1297 unregister_netdevice_queue(t->dev, head); 1358 unregister_netdevice_queue(t->dev, head);
1298 t = t->next; 1359 t = rtnl_dereference(t->next);
1299 } 1360 }
1300 } 1361 }
1301 } 1362 }
@@ -1320,10 +1381,12 @@ static int __net_init ipgre_init_net(struct net *net)
1320 if ((err = register_netdev(ign->fb_tunnel_dev))) 1381 if ((err = register_netdev(ign->fb_tunnel_dev)))
1321 goto err_reg_dev; 1382 goto err_reg_dev;
1322 1383
1384 rcu_assign_pointer(ign->tunnels_wc[0],
1385 netdev_priv(ign->fb_tunnel_dev));
1323 return 0; 1386 return 0;
1324 1387
1325err_reg_dev: 1388err_reg_dev:
1326 free_netdev(ign->fb_tunnel_dev); 1389 ipgre_dev_free(ign->fb_tunnel_dev);
1327err_alloc_dev: 1390err_alloc_dev:
1328 return err; 1391 return err;
1329} 1392}
@@ -1441,6 +1504,10 @@ static int ipgre_tap_init(struct net_device *dev)
1441 1504
1442 ipgre_tunnel_bind_dev(dev); 1505 ipgre_tunnel_bind_dev(dev);
1443 1506
1507 dev->tstats = alloc_percpu(struct pcpu_tstats);
1508 if (!dev->tstats)
1509 return -ENOMEM;
1510
1444 return 0; 1511 return 0;
1445} 1512}
1446 1513
@@ -1451,6 +1518,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = {
1451 .ndo_set_mac_address = eth_mac_addr, 1518 .ndo_set_mac_address = eth_mac_addr,
1452 .ndo_validate_addr = eth_validate_addr, 1519 .ndo_validate_addr = eth_validate_addr,
1453 .ndo_change_mtu = ipgre_tunnel_change_mtu, 1520 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1521 .ndo_get_stats = ipgre_get_stats,
1454}; 1522};
1455 1523
1456static void ipgre_tap_setup(struct net_device *dev) 1524static void ipgre_tap_setup(struct net_device *dev)
@@ -1459,7 +1527,7 @@ static void ipgre_tap_setup(struct net_device *dev)
1459 ether_setup(dev); 1527 ether_setup(dev);
1460 1528
1461 dev->netdev_ops = &ipgre_tap_netdev_ops; 1529 dev->netdev_ops = &ipgre_tap_netdev_ops;
1462 dev->destructor = free_netdev; 1530 dev->destructor = ipgre_dev_free;
1463 1531
1464 dev->iflink = 0; 1532 dev->iflink = 0;
1465 dev->features |= NETIF_F_NETNS_LOCAL; 1533 dev->features |= NETIF_F_NETNS_LOCAL;
@@ -1487,6 +1555,10 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nla
1487 if (!tb[IFLA_MTU]) 1555 if (!tb[IFLA_MTU])
1488 dev->mtu = mtu; 1556 dev->mtu = mtu;
1489 1557
1558 /* Can use a lockless transmit, unless we generate output sequences */
1559 if (!(nt->parms.o_flags & GRE_SEQ))
1560 dev->features |= NETIF_F_LLTX;
1561
1490 err = register_netdevice(dev); 1562 err = register_netdevice(dev);
1491 if (err) 1563 if (err)
1492 goto out; 1564 goto out;
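
NETIF_F_LLTX tells the core not to take HARD_TX_LOCK around ndo_start_xmit(). With stats now per-CPU, the only shared state the GRE transmit path still mutates is the output sequence number, so the patch simply leaves LLTX off whenever GRE_SEQ is requested. A driver that wanted lockless transmit even with a sequence counter would have to make that counter safe itself; the sketch below is purely illustrative and is not what this patch does:

#include <linux/atomic.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

struct demo_priv {
	atomic_t o_seqno;	/* shared by all CPUs once NETIF_F_LLTX is set */
};

/* With NETIF_F_LLTX the core no longer serialises calls to this hook,
 * so anything that is not per-CPU must be made safe explicitly. */
static netdev_tx_t demo_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct demo_priv *p = netdev_priv(dev);
	u32 seq = (u32)atomic_inc_return(&p->o_seqno);

	/* ... build the tunnel header using 'seq', then transmit ... */
	(void)seq;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}
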
@@ -1522,7 +1594,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1522 t = nt; 1594 t = nt;
1523 1595
1524 if (dev->type != ARPHRD_ETHER) { 1596 if (dev->type != ARPHRD_ETHER) {
1525 unsigned nflags = 0; 1597 unsigned int nflags = 0;
1526 1598
1527 if (ipv4_is_multicast(p.iph.daddr)) 1599 if (ipv4_is_multicast(p.iph.daddr))
1528 nflags = IFF_BROADCAST; 1600 nflags = IFF_BROADCAST;
@@ -1663,7 +1735,7 @@ static int __init ipgre_init(void)
1663 if (err < 0) 1735 if (err < 0)
1664 return err; 1736 return err;
1665 1737
1666 err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE); 1738 err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1667 if (err < 0) { 1739 if (err < 0) {
1668 printk(KERN_INFO "ipgre init: can't add protocol\n"); 1740 printk(KERN_INFO "ipgre init: can't add protocol\n");
1669 goto add_proto_failed; 1741 goto add_proto_failed;
@@ -1683,7 +1755,7 @@ out:
1683tap_ops_failed: 1755tap_ops_failed:
1684 rtnl_link_unregister(&ipgre_link_ops); 1756 rtnl_link_unregister(&ipgre_link_ops);
1685rtnl_link_failed: 1757rtnl_link_failed:
1686 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); 1758 gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1687add_proto_failed: 1759add_proto_failed:
1688 unregister_pernet_device(&ipgre_net_ops); 1760 unregister_pernet_device(&ipgre_net_ops);
1689 goto out; 1761 goto out;
@@ -1693,7 +1765,7 @@ static void __exit ipgre_fini(void)
1693{ 1765{
1694 rtnl_link_unregister(&ipgre_tap_ops); 1766 rtnl_link_unregister(&ipgre_tap_ops);
1695 rtnl_link_unregister(&ipgre_link_ops); 1767 rtnl_link_unregister(&ipgre_link_ops);
1696 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) 1768 if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1697 printk(KERN_INFO "ipgre close: can't remove protocol\n"); 1769 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1698 unregister_pernet_device(&ipgre_net_ops); 1770 unregister_pernet_device(&ipgre_net_ops);
1699} 1771}
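
The closing hunks retire the old inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) registration in favour of the shared GRE demultiplexer added in the same 2.6.37 merge (net/ipv4/gre.c): ip_gre now claims only the GREPROTO_CISCO slot, which leaves room for other GRE versions such as PPTP on IP protocol 47. A hedged sketch of a module hooking into that demux; the handler bodies and demo_* names are placeholders, while the gre_protocol fields mirror the ones visible in the diff:

#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/gre.h>

static int demo_gre_rcv(struct sk_buff *skb)
{
	/* consume (or hand off) one GRE packet of our version */
	kfree_skb(skb);
	return 0;
}

static void demo_gre_err(struct sk_buff *skb, u32 info)
{
	/* ICMP error feedback for our GRE version */
}

static const struct gre_protocol demo_proto = {
	.handler	= demo_gre_rcv,
	.err_handler	= demo_gre_err,
};

static int __init demo_init(void)
{
	/* one handler per GRE version field; ip_gre itself owns GREPROTO_CISCO,
	 * so registering for a slot that is already taken fails */
	return gre_add_protocol(&demo_proto, GREPROTO_CISCO);
}

static void __exit demo_exit(void)
{
	gre_del_protocol(&demo_proto, GREPROTO_CISCO);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
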
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index ba9836c488ed..1906fa35860c 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -466,7 +466,7 @@ error:
466 } 466 }
467 return -EINVAL; 467 return -EINVAL;
468} 468}
469 469EXPORT_SYMBOL(ip_options_compile);
470 470
471/* 471/*
472 * Undo all the changes done by ip_options_compile(). 472 * Undo all the changes done by ip_options_compile().
@@ -646,3 +646,4 @@ int ip_options_rcv_srr(struct sk_buff *skb)
646 } 646 }
647 return 0; 647 return 0;
648} 648}
649EXPORT_SYMBOL(ip_options_rcv_srr);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 04b69896df5f..439d2a34ee44 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -487,10 +487,9 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
487 * LATER: this step can be merged to real generation of fragments, 487 * LATER: this step can be merged to real generation of fragments,
488 * we can switch to copy when see the first bad fragment. 488 * we can switch to copy when see the first bad fragment.
489 */ 489 */
490 if (skb_has_frags(skb)) { 490 if (skb_has_frag_list(skb)) {
491 struct sk_buff *frag; 491 struct sk_buff *frag, *frag2;
492 int first_len = skb_pagelen(skb); 492 int first_len = skb_pagelen(skb);
493 int truesizes = 0;
494 493
495 if (first_len - hlen > mtu || 494 if (first_len - hlen > mtu ||
496 ((first_len - hlen) & 7) || 495 ((first_len - hlen) & 7) ||
@@ -503,18 +502,18 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
503 if (frag->len > mtu || 502 if (frag->len > mtu ||
504 ((frag->len & 7) && frag->next) || 503 ((frag->len & 7) && frag->next) ||
505 skb_headroom(frag) < hlen) 504 skb_headroom(frag) < hlen)
506 goto slow_path; 505 goto slow_path_clean;
507 506
508 /* Partially cloned skb? */ 507 /* Partially cloned skb? */
509 if (skb_shared(frag)) 508 if (skb_shared(frag))
510 goto slow_path; 509 goto slow_path_clean;
511 510
512 BUG_ON(frag->sk); 511 BUG_ON(frag->sk);
513 if (skb->sk) { 512 if (skb->sk) {
514 frag->sk = skb->sk; 513 frag->sk = skb->sk;
515 frag->destructor = sock_wfree; 514 frag->destructor = sock_wfree;
516 } 515 }
517 truesizes += frag->truesize; 516 skb->truesize -= frag->truesize;
518 } 517 }
519 518
520 /* Everything is OK. Generate! */ 519 /* Everything is OK. Generate! */
@@ -524,7 +523,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
524 frag = skb_shinfo(skb)->frag_list; 523 frag = skb_shinfo(skb)->frag_list;
525 skb_frag_list_init(skb); 524 skb_frag_list_init(skb);
526 skb->data_len = first_len - skb_headlen(skb); 525 skb->data_len = first_len - skb_headlen(skb);
527 skb->truesize -= truesizes;
528 skb->len = first_len; 526 skb->len = first_len;
529 iph->tot_len = htons(first_len); 527 iph->tot_len = htons(first_len);
530 iph->frag_off = htons(IP_MF); 528 iph->frag_off = htons(IP_MF);
@@ -576,6 +574,15 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
576 } 574 }
577 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 575 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
578 return err; 576 return err;
577
578slow_path_clean:
579 skb_walk_frags(skb, frag2) {
580 if (frag2 == frag)
581 break;
582 frag2->sk = NULL;
583 frag2->destructor = NULL;
584 skb->truesize += frag2->truesize;
585 }
579 } 586 }
580 587
581slow_path: 588slow_path:
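
The ip_fragment() change replaces the deferred truesize accounting (a local truesizes subtracted only once the fast path was committed) with per-fragment accounting, which is why the new slow_path_clean label has to undo both the socket-ownership assignment and the truesize transfer for every fragment already processed before bailing out. The same check, commit and unwind shape, lifted into a stand-alone helper with an illustrative name:

#include <linux/skbuff.h>
#include <net/sock.h>

/*
 * Hand frag-list members to the owning socket for a fast path; if any
 * fragment turns out to be unsuitable, undo exactly what was already
 * done and report failure so the caller can fall back to a slow path.
 */
static int demo_adopt_frags(struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *frag, *frag2;

	skb_walk_frags(skb, frag) {
		if (frag->len > mtu || skb_shared(frag))
			goto undo;
		if (skb->sk) {
			frag->sk = skb->sk;
			frag->destructor = sock_wfree;
		}
		skb->truesize -= frag->truesize;   /* accounting moves now */
	}
	return 0;

undo:
	skb_walk_frags(skb, frag2) {
		if (frag2 == frag)
			break;			/* stop at the failing one */
		frag2->sk = NULL;
		frag2->destructor = NULL;
		skb->truesize += frag2->truesize;
	}
	return -EINVAL;
}
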
@@ -837,10 +844,9 @@ int ip_append_data(struct sock *sk,
837 inet->cork.length = 0; 844 inet->cork.length = 0;
838 sk->sk_sndmsg_page = NULL; 845 sk->sk_sndmsg_page = NULL;
839 sk->sk_sndmsg_off = 0; 846 sk->sk_sndmsg_off = 0;
840 if ((exthdrlen = rt->dst.header_len) != 0) { 847 exthdrlen = rt->dst.header_len;
841 length += exthdrlen; 848 length += exthdrlen;
842 transhdrlen += exthdrlen; 849 transhdrlen += exthdrlen;
843 }
844 } else { 850 } else {
845 rt = (struct rtable *)inet->cork.dst; 851 rt = (struct rtable *)inet->cork.dst;
846 if (inet->cork.flags & IPCORK_OPT) 852 if (inet->cork.flags & IPCORK_OPT)
@@ -927,16 +933,19 @@ alloc_new_skb:
927 !(rt->dst.dev->features&NETIF_F_SG)) 933 !(rt->dst.dev->features&NETIF_F_SG))
928 alloclen = mtu; 934 alloclen = mtu;
929 else 935 else
930 alloclen = datalen + fragheaderlen; 936 alloclen = fraglen;
931 937
932 /* The last fragment gets additional space at tail. 938 /* The last fragment gets additional space at tail.
933 * Note, with MSG_MORE we overallocate on fragments, 939 * Note, with MSG_MORE we overallocate on fragments,
934 * because we have no idea what fragment will be 940 * because we have no idea what fragment will be
935 * the last. 941 * the last.
936 */ 942 */
937 if (datalen == length + fraggap) 943 if (datalen == length + fraggap) {
938 alloclen += rt->dst.trailer_len; 944 alloclen += rt->dst.trailer_len;
939 945 /* make sure mtu is not reached */
946 if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
947 datalen -= ALIGN(rt->dst.trailer_len, 8);
948 }
940 if (transhdrlen) { 949 if (transhdrlen) {
941 skb = sock_alloc_send_skb(sk, 950 skb = sock_alloc_send_skb(sk,
942 alloclen + hh_len + 15, 951 alloclen + hh_len + 15,
@@ -953,7 +962,7 @@ alloc_new_skb:
953 else 962 else
954 /* only the initial fragment is 963 /* only the initial fragment is
955 time stamped */ 964 time stamped */
956 ipc->shtx.flags = 0; 965 ipc->tx_flags = 0;
957 } 966 }
958 if (skb == NULL) 967 if (skb == NULL)
959 goto error; 968 goto error;
@@ -964,7 +973,7 @@ alloc_new_skb:
964 skb->ip_summed = csummode; 973 skb->ip_summed = csummode;
965 skb->csum = 0; 974 skb->csum = 0;
966 skb_reserve(skb, hh_len); 975 skb_reserve(skb, hh_len);
967 *skb_tx(skb) = ipc->shtx; 976 skb_shinfo(skb)->tx_flags = ipc->tx_flags;
968 977
969 /* 978 /*
970 * Find where to start putting bytes. 979 * Find where to start putting bytes.
@@ -1384,7 +1393,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1384 1393
1385 daddr = ipc.addr = rt->rt_src; 1394 daddr = ipc.addr = rt->rt_src;
1386 ipc.opt = NULL; 1395 ipc.opt = NULL;
1387 ipc.shtx.flags = 0; 1396 ipc.tx_flags = 0;
1388 1397
1389 if (replyopts.opt.optlen) { 1398 if (replyopts.opt.optlen) {
1390 ipc.opt = &replyopts.opt; 1399 ipc.opt = &replyopts.opt;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 6c40a8c46e79..3948c86e59ca 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -238,7 +238,7 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
238 but receiver should be enough clever f.e. to forward mtrace requests, 238 but receiver should be enough clever f.e. to forward mtrace requests,
239 sent to multicast group to reach destination designated router. 239 sent to multicast group to reach destination designated router.
240 */ 240 */
241struct ip_ra_chain *ip_ra_chain; 241struct ip_ra_chain __rcu *ip_ra_chain;
242static DEFINE_SPINLOCK(ip_ra_lock); 242static DEFINE_SPINLOCK(ip_ra_lock);
243 243
244 244
@@ -253,7 +253,8 @@ static void ip_ra_destroy_rcu(struct rcu_head *head)
253int ip_ra_control(struct sock *sk, unsigned char on, 253int ip_ra_control(struct sock *sk, unsigned char on,
254 void (*destructor)(struct sock *)) 254 void (*destructor)(struct sock *))
255{ 255{
256 struct ip_ra_chain *ra, *new_ra, **rap; 256 struct ip_ra_chain *ra, *new_ra;
257 struct ip_ra_chain __rcu **rap;
257 258
258 if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) 259 if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
259 return -EINVAL; 260 return -EINVAL;
@@ -261,7 +262,10 @@ int ip_ra_control(struct sock *sk, unsigned char on,
261 new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; 262 new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
262 263
263 spin_lock_bh(&ip_ra_lock); 264 spin_lock_bh(&ip_ra_lock);
264 for (rap = &ip_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { 265 for (rap = &ip_ra_chain;
266 (ra = rcu_dereference_protected(*rap,
267 lockdep_is_held(&ip_ra_lock))) != NULL;
268 rap = &ra->next) {
265 if (ra->sk == sk) { 269 if (ra->sk == sk) {
266 if (on) { 270 if (on) {
267 spin_unlock_bh(&ip_ra_lock); 271 spin_unlock_bh(&ip_ra_lock);
@@ -1129,6 +1133,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1129 case IP_HDRINCL: 1133 case IP_HDRINCL:
1130 val = inet->hdrincl; 1134 val = inet->hdrincl;
1131 break; 1135 break;
1136 case IP_NODEFRAG:
1137 val = inet->nodefrag;
1138 break;
1132 case IP_MTU_DISCOVER: 1139 case IP_MTU_DISCOVER:
1133 val = inet->pmtudisc; 1140 val = inet->pmtudisc;
1134 break; 1141 break;
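
The ip_ra_chain hunks show the sparse-checkable half of an RCU conversion: the chain is annotated __rcu, readers keep using rcu_dereference(), and the update path, which holds ip_ra_lock rather than rcu_read_lock(), states that fact with rcu_dereference_protected(..., lockdep_is_held(&ip_ra_lock)) so that both sparse and lockdep can verify the claim. A minimal version of the annotation pattern with made-up names:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct ra_entry {
	struct ra_entry __rcu *next;
	int val;
};

static struct ra_entry __rcu *ra_list;
static DEFINE_SPINLOCK(ra_lock);

/* Reader: rcu_read_lock() is all that is needed. */
static int ra_contains(int val)
{
	struct ra_entry *e;
	int found = 0;

	rcu_read_lock();
	for (e = rcu_dereference(ra_list); e; e = rcu_dereference(e->next)) {
		if (e->val == val) {
			found = 1;
			break;
		}
	}
	rcu_read_unlock();
	return found;
}

/* Writer: serialised by ra_lock; say so, so sparse/lockdep can check it. */
static int ra_add(int val)
{
	struct ra_entry *e, *new_e = kmalloc(sizeof(*new_e), GFP_KERNEL);
	struct ra_entry __rcu **pp;

	if (!new_e)
		return -ENOMEM;
	new_e->val = val;
	rcu_assign_pointer(new_e->next, NULL);

	spin_lock(&ra_lock);
	for (pp = &ra_list;
	     (e = rcu_dereference_protected(*pp,
					    lockdep_is_held(&ra_lock))) != NULL;
	     pp = &e->next)
		;				/* walk to the tail */
	rcu_assign_pointer(*pp, new_e);
	spin_unlock(&ra_lock);
	return 0;
}
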
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index ec036731a70b..cd300aaee78f 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -122,31 +122,59 @@
122 122
123static int ipip_net_id __read_mostly; 123static int ipip_net_id __read_mostly;
124struct ipip_net { 124struct ipip_net {
125 struct ip_tunnel *tunnels_r_l[HASH_SIZE]; 125 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
126 struct ip_tunnel *tunnels_r[HASH_SIZE]; 126 struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
127 struct ip_tunnel *tunnels_l[HASH_SIZE]; 127 struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
128 struct ip_tunnel *tunnels_wc[1]; 128 struct ip_tunnel __rcu *tunnels_wc[1];
129 struct ip_tunnel **tunnels[4]; 129 struct ip_tunnel __rcu **tunnels[4];
130 130
131 struct net_device *fb_tunnel_dev; 131 struct net_device *fb_tunnel_dev;
132}; 132};
133 133
134static void ipip_tunnel_init(struct net_device *dev); 134static int ipip_tunnel_init(struct net_device *dev);
135static void ipip_tunnel_setup(struct net_device *dev); 135static void ipip_tunnel_setup(struct net_device *dev);
136static void ipip_dev_free(struct net_device *dev);
136 137
137/* 138/*
138 * Locking : hash tables are protected by RCU and a spinlock 139 * Locking : hash tables are protected by RCU and RTNL
139 */ 140 */
140static DEFINE_SPINLOCK(ipip_lock);
141 141
142#define for_each_ip_tunnel_rcu(start) \ 142#define for_each_ip_tunnel_rcu(start) \
143 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) 143 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
144 144
145/* often modified stats are per cpu, other are shared (netdev->stats) */
146struct pcpu_tstats {
147 unsigned long rx_packets;
148 unsigned long rx_bytes;
149 unsigned long tx_packets;
150 unsigned long tx_bytes;
151};
152
153static struct net_device_stats *ipip_get_stats(struct net_device *dev)
154{
155 struct pcpu_tstats sum = { 0 };
156 int i;
157
158 for_each_possible_cpu(i) {
159 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
160
161 sum.rx_packets += tstats->rx_packets;
162 sum.rx_bytes += tstats->rx_bytes;
163 sum.tx_packets += tstats->tx_packets;
164 sum.tx_bytes += tstats->tx_bytes;
165 }
166 dev->stats.rx_packets = sum.rx_packets;
167 dev->stats.rx_bytes = sum.rx_bytes;
168 dev->stats.tx_packets = sum.tx_packets;
169 dev->stats.tx_bytes = sum.tx_bytes;
170 return &dev->stats;
171}
172
145static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, 173static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
146 __be32 remote, __be32 local) 174 __be32 remote, __be32 local)
147{ 175{
148 unsigned h0 = HASH(remote); 176 unsigned int h0 = HASH(remote);
149 unsigned h1 = HASH(local); 177 unsigned int h1 = HASH(local);
150 struct ip_tunnel *t; 178 struct ip_tunnel *t;
151 struct ipip_net *ipn = net_generic(net, ipip_net_id); 179 struct ipip_net *ipn = net_generic(net, ipip_net_id);
152 180
@@ -169,12 +197,12 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
169 return NULL; 197 return NULL;
170} 198}
171 199
172static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn, 200static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn,
173 struct ip_tunnel_parm *parms) 201 struct ip_tunnel_parm *parms)
174{ 202{
175 __be32 remote = parms->iph.daddr; 203 __be32 remote = parms->iph.daddr;
176 __be32 local = parms->iph.saddr; 204 __be32 local = parms->iph.saddr;
177 unsigned h = 0; 205 unsigned int h = 0;
178 int prio = 0; 206 int prio = 0;
179 207
180 if (remote) { 208 if (remote) {
@@ -188,7 +216,7 @@ static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn,
188 return &ipn->tunnels[prio][h]; 216 return &ipn->tunnels[prio][h];
189} 217}
190 218
191static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn, 219static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn,
192 struct ip_tunnel *t) 220 struct ip_tunnel *t)
193{ 221{
194 return __ipip_bucket(ipn, &t->parms); 222 return __ipip_bucket(ipn, &t->parms);
@@ -196,13 +224,14 @@ static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn,
196 224
197static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) 225static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
198{ 226{
199 struct ip_tunnel **tp; 227 struct ip_tunnel __rcu **tp;
200 228 struct ip_tunnel *iter;
201 for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) { 229
202 if (t == *tp) { 230 for (tp = ipip_bucket(ipn, t);
203 spin_lock_bh(&ipip_lock); 231 (iter = rtnl_dereference(*tp)) != NULL;
204 *tp = t->next; 232 tp = &iter->next) {
205 spin_unlock_bh(&ipip_lock); 233 if (t == iter) {
234 rcu_assign_pointer(*tp, t->next);
206 break; 235 break;
207 } 236 }
208 } 237 }
@@ -210,12 +239,10 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
210 239
211static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) 240static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
212{ 241{
213 struct ip_tunnel **tp = ipip_bucket(ipn, t); 242 struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t);
214 243
215 spin_lock_bh(&ipip_lock); 244 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
216 t->next = *tp;
217 rcu_assign_pointer(*tp, t); 245 rcu_assign_pointer(*tp, t);
218 spin_unlock_bh(&ipip_lock);
219} 246}
220 247
221static struct ip_tunnel * ipip_tunnel_locate(struct net *net, 248static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
@@ -223,12 +250,15 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
223{ 250{
224 __be32 remote = parms->iph.daddr; 251 __be32 remote = parms->iph.daddr;
225 __be32 local = parms->iph.saddr; 252 __be32 local = parms->iph.saddr;
226 struct ip_tunnel *t, **tp, *nt; 253 struct ip_tunnel *t, *nt;
254 struct ip_tunnel __rcu **tp;
227 struct net_device *dev; 255 struct net_device *dev;
228 char name[IFNAMSIZ]; 256 char name[IFNAMSIZ];
229 struct ipip_net *ipn = net_generic(net, ipip_net_id); 257 struct ipip_net *ipn = net_generic(net, ipip_net_id);
230 258
231 for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) { 259 for (tp = __ipip_bucket(ipn, parms);
260 (t = rtnl_dereference(*tp)) != NULL;
261 tp = &t->next) {
232 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) 262 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
233 return t; 263 return t;
234 } 264 }
@@ -238,7 +268,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
238 if (parms->name[0]) 268 if (parms->name[0])
239 strlcpy(name, parms->name, IFNAMSIZ); 269 strlcpy(name, parms->name, IFNAMSIZ);
240 else 270 else
241 sprintf(name, "tunl%%d"); 271 strcpy(name, "tunl%d");
242 272
243 dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); 273 dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
244 if (dev == NULL) 274 if (dev == NULL)
@@ -254,7 +284,8 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
254 nt = netdev_priv(dev); 284 nt = netdev_priv(dev);
255 nt->parms = *parms; 285 nt->parms = *parms;
256 286
257 ipip_tunnel_init(dev); 287 if (ipip_tunnel_init(dev) < 0)
288 goto failed_free;
258 289
259 if (register_netdevice(dev) < 0) 290 if (register_netdevice(dev) < 0)
260 goto failed_free; 291 goto failed_free;
@@ -264,20 +295,19 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
264 return nt; 295 return nt;
265 296
266failed_free: 297failed_free:
267 free_netdev(dev); 298 ipip_dev_free(dev);
268 return NULL; 299 return NULL;
269} 300}
270 301
302/* called with RTNL */
271static void ipip_tunnel_uninit(struct net_device *dev) 303static void ipip_tunnel_uninit(struct net_device *dev)
272{ 304{
273 struct net *net = dev_net(dev); 305 struct net *net = dev_net(dev);
274 struct ipip_net *ipn = net_generic(net, ipip_net_id); 306 struct ipip_net *ipn = net_generic(net, ipip_net_id);
275 307
276 if (dev == ipn->fb_tunnel_dev) { 308 if (dev == ipn->fb_tunnel_dev)
277 spin_lock_bh(&ipip_lock); 309 rcu_assign_pointer(ipn->tunnels_wc[0], NULL);
278 ipn->tunnels_wc[0] = NULL; 310 else
279 spin_unlock_bh(&ipip_lock);
280 } else
281 ipip_tunnel_unlink(ipn, netdev_priv(dev)); 311 ipip_tunnel_unlink(ipn, netdev_priv(dev));
282 dev_put(dev); 312 dev_put(dev);
283} 313}
@@ -359,8 +389,10 @@ static int ipip_rcv(struct sk_buff *skb)
359 const struct iphdr *iph = ip_hdr(skb); 389 const struct iphdr *iph = ip_hdr(skb);
360 390
361 rcu_read_lock(); 391 rcu_read_lock();
362 if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev), 392 tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
363 iph->saddr, iph->daddr)) != NULL) { 393 if (tunnel != NULL) {
394 struct pcpu_tstats *tstats;
395
364 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 396 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
365 rcu_read_unlock(); 397 rcu_read_unlock();
366 kfree_skb(skb); 398 kfree_skb(skb);
@@ -374,10 +406,16 @@ static int ipip_rcv(struct sk_buff *skb)
374 skb->protocol = htons(ETH_P_IP); 406 skb->protocol = htons(ETH_P_IP);
375 skb->pkt_type = PACKET_HOST; 407 skb->pkt_type = PACKET_HOST;
376 408
377 skb_tunnel_rx(skb, tunnel->dev); 409 tstats = this_cpu_ptr(tunnel->dev->tstats);
410 tstats->rx_packets++;
411 tstats->rx_bytes += skb->len;
412
413 __skb_tunnel_rx(skb, tunnel->dev);
378 414
379 ipip_ecn_decapsulate(iph, skb); 415 ipip_ecn_decapsulate(iph, skb);
416
380 netif_rx(skb); 417 netif_rx(skb);
418
381 rcu_read_unlock(); 419 rcu_read_unlock();
382 return 0; 420 return 0;
383 } 421 }
@@ -394,13 +432,12 @@ static int ipip_rcv(struct sk_buff *skb)
394static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 432static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
395{ 433{
396 struct ip_tunnel *tunnel = netdev_priv(dev); 434 struct ip_tunnel *tunnel = netdev_priv(dev);
397 struct net_device_stats *stats = &dev->stats; 435 struct pcpu_tstats *tstats;
398 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
399 struct iphdr *tiph = &tunnel->parms.iph; 436 struct iphdr *tiph = &tunnel->parms.iph;
400 u8 tos = tunnel->parms.iph.tos; 437 u8 tos = tunnel->parms.iph.tos;
401 __be16 df = tiph->frag_off; 438 __be16 df = tiph->frag_off;
402 struct rtable *rt; /* Route to the other host */ 439 struct rtable *rt; /* Route to the other host */
403 struct net_device *tdev; /* Device to other host */ 440 struct net_device *tdev; /* Device to other host */
404 struct iphdr *old_iph = ip_hdr(skb); 441 struct iphdr *old_iph = ip_hdr(skb);
405 struct iphdr *iph; /* Our new IP header */ 442 struct iphdr *iph; /* Our new IP header */
406 unsigned int max_headroom; /* The extra header space needed */ 443 unsigned int max_headroom; /* The extra header space needed */
@@ -410,13 +447,13 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
410 if (skb->protocol != htons(ETH_P_IP)) 447 if (skb->protocol != htons(ETH_P_IP))
411 goto tx_error; 448 goto tx_error;
412 449
413 if (tos&1) 450 if (tos & 1)
414 tos = old_iph->tos; 451 tos = old_iph->tos;
415 452
416 if (!dst) { 453 if (!dst) {
417 /* NBMA tunnel */ 454 /* NBMA tunnel */
418 if ((rt = skb_rtable(skb)) == NULL) { 455 if ((rt = skb_rtable(skb)) == NULL) {
419 stats->tx_fifo_errors++; 456 dev->stats.tx_fifo_errors++;
420 goto tx_error; 457 goto tx_error;
421 } 458 }
422 if ((dst = rt->rt_gateway) == 0) 459 if ((dst = rt->rt_gateway) == 0)
@@ -424,14 +461,20 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
424 } 461 }
425 462
426 { 463 {
427 struct flowi fl = { .oif = tunnel->parms.link, 464 struct flowi fl = {
428 .nl_u = { .ip4_u = 465 .oif = tunnel->parms.link,
429 { .daddr = dst, 466 .nl_u = {
430 .saddr = tiph->saddr, 467 .ip4_u = {
431 .tos = RT_TOS(tos) } }, 468 .daddr = dst,
432 .proto = IPPROTO_IPIP }; 469 .saddr = tiph->saddr,
470 .tos = RT_TOS(tos)
471 }
472 },
473 .proto = IPPROTO_IPIP
474 };
475
433 if (ip_route_output_key(dev_net(dev), &rt, &fl)) { 476 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
434 stats->tx_carrier_errors++; 477 dev->stats.tx_carrier_errors++;
435 goto tx_error_icmp; 478 goto tx_error_icmp;
436 } 479 }
437 } 480 }
@@ -439,7 +482,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
439 482
440 if (tdev == dev) { 483 if (tdev == dev) {
441 ip_rt_put(rt); 484 ip_rt_put(rt);
442 stats->collisions++; 485 dev->stats.collisions++;
443 goto tx_error; 486 goto tx_error;
444 } 487 }
445 488
@@ -449,7 +492,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
449 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); 492 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
450 493
451 if (mtu < 68) { 494 if (mtu < 68) {
452 stats->collisions++; 495 dev->stats.collisions++;
453 ip_rt_put(rt); 496 ip_rt_put(rt);
454 goto tx_error; 497 goto tx_error;
455 } 498 }
@@ -485,7 +528,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
485 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); 528 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
486 if (!new_skb) { 529 if (!new_skb) {
487 ip_rt_put(rt); 530 ip_rt_put(rt);
488 txq->tx_dropped++; 531 dev->stats.tx_dropped++;
489 dev_kfree_skb(skb); 532 dev_kfree_skb(skb);
490 return NETDEV_TX_OK; 533 return NETDEV_TX_OK;
491 } 534 }
@@ -522,14 +565,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
522 iph->ttl = old_iph->ttl; 565 iph->ttl = old_iph->ttl;
523 566
524 nf_reset(skb); 567 nf_reset(skb);
525 568 tstats = this_cpu_ptr(dev->tstats);
526 IPTUNNEL_XMIT(); 569 __IPTUNNEL_XMIT(tstats, &dev->stats);
527 return NETDEV_TX_OK; 570 return NETDEV_TX_OK;
528 571
529tx_error_icmp: 572tx_error_icmp:
530 dst_link_failure(skb); 573 dst_link_failure(skb);
531tx_error: 574tx_error:
532 stats->tx_errors++; 575 dev->stats.tx_errors++;
533 dev_kfree_skb(skb); 576 dev_kfree_skb(skb);
534 return NETDEV_TX_OK; 577 return NETDEV_TX_OK;
535} 578}
@@ -544,13 +587,19 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
544 iph = &tunnel->parms.iph; 587 iph = &tunnel->parms.iph;
545 588
546 if (iph->daddr) { 589 if (iph->daddr) {
547 struct flowi fl = { .oif = tunnel->parms.link, 590 struct flowi fl = {
548 .nl_u = { .ip4_u = 591 .oif = tunnel->parms.link,
549 { .daddr = iph->daddr, 592 .nl_u = {
550 .saddr = iph->saddr, 593 .ip4_u = {
551 .tos = RT_TOS(iph->tos) } }, 594 .daddr = iph->daddr,
552 .proto = IPPROTO_IPIP }; 595 .saddr = iph->saddr,
596 .tos = RT_TOS(iph->tos)
597 }
598 },
599 .proto = IPPROTO_IPIP
600 };
553 struct rtable *rt; 601 struct rtable *rt;
602
554 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { 603 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
555 tdev = rt->dst.dev; 604 tdev = rt->dst.dev;
556 ip_rt_put(rt); 605 ip_rt_put(rt);
@@ -627,6 +676,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
627 } 676 }
628 t = netdev_priv(dev); 677 t = netdev_priv(dev);
629 ipip_tunnel_unlink(ipn, t); 678 ipip_tunnel_unlink(ipn, t);
679 synchronize_net();
630 t->parms.iph.saddr = p.iph.saddr; 680 t->parms.iph.saddr = p.iph.saddr;
631 t->parms.iph.daddr = p.iph.daddr; 681 t->parms.iph.daddr = p.iph.daddr;
632 memcpy(dev->dev_addr, &p.iph.saddr, 4); 682 memcpy(dev->dev_addr, &p.iph.saddr, 4);
@@ -696,13 +746,19 @@ static const struct net_device_ops ipip_netdev_ops = {
696 .ndo_start_xmit = ipip_tunnel_xmit, 746 .ndo_start_xmit = ipip_tunnel_xmit,
697 .ndo_do_ioctl = ipip_tunnel_ioctl, 747 .ndo_do_ioctl = ipip_tunnel_ioctl,
698 .ndo_change_mtu = ipip_tunnel_change_mtu, 748 .ndo_change_mtu = ipip_tunnel_change_mtu,
699 749 .ndo_get_stats = ipip_get_stats,
700}; 750};
701 751
752static void ipip_dev_free(struct net_device *dev)
753{
754 free_percpu(dev->tstats);
755 free_netdev(dev);
756}
757
702static void ipip_tunnel_setup(struct net_device *dev) 758static void ipip_tunnel_setup(struct net_device *dev)
703{ 759{
704 dev->netdev_ops = &ipip_netdev_ops; 760 dev->netdev_ops = &ipip_netdev_ops;
705 dev->destructor = free_netdev; 761 dev->destructor = ipip_dev_free;
706 762
707 dev->type = ARPHRD_TUNNEL; 763 dev->type = ARPHRD_TUNNEL;
708 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); 764 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
@@ -711,10 +767,11 @@ static void ipip_tunnel_setup(struct net_device *dev)
711 dev->iflink = 0; 767 dev->iflink = 0;
712 dev->addr_len = 4; 768 dev->addr_len = 4;
713 dev->features |= NETIF_F_NETNS_LOCAL; 769 dev->features |= NETIF_F_NETNS_LOCAL;
770 dev->features |= NETIF_F_LLTX;
714 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; 771 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
715} 772}
716 773
717static void ipip_tunnel_init(struct net_device *dev) 774static int ipip_tunnel_init(struct net_device *dev)
718{ 775{
719 struct ip_tunnel *tunnel = netdev_priv(dev); 776 struct ip_tunnel *tunnel = netdev_priv(dev);
720 777
@@ -725,9 +782,15 @@ static void ipip_tunnel_init(struct net_device *dev)
725 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); 782 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
726 783
727 ipip_tunnel_bind_dev(dev); 784 ipip_tunnel_bind_dev(dev);
785
786 dev->tstats = alloc_percpu(struct pcpu_tstats);
787 if (!dev->tstats)
788 return -ENOMEM;
789
790 return 0;
728} 791}
729 792
730static void __net_init ipip_fb_tunnel_init(struct net_device *dev) 793static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
731{ 794{
732 struct ip_tunnel *tunnel = netdev_priv(dev); 795 struct ip_tunnel *tunnel = netdev_priv(dev);
733 struct iphdr *iph = &tunnel->parms.iph; 796 struct iphdr *iph = &tunnel->parms.iph;
@@ -740,11 +803,16 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev)
740 iph->protocol = IPPROTO_IPIP; 803 iph->protocol = IPPROTO_IPIP;
741 iph->ihl = 5; 804 iph->ihl = 5;
742 805
806 dev->tstats = alloc_percpu(struct pcpu_tstats);
807 if (!dev->tstats)
808 return -ENOMEM;
809
743 dev_hold(dev); 810 dev_hold(dev);
744 ipn->tunnels_wc[0] = tunnel; 811 rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
812 return 0;
745} 813}
746 814
747static struct xfrm_tunnel ipip_handler = { 815static struct xfrm_tunnel ipip_handler __read_mostly = {
748 .handler = ipip_rcv, 816 .handler = ipip_rcv,
749 .err_handler = ipip_err, 817 .err_handler = ipip_err,
750 .priority = 1, 818 .priority = 1,
@@ -760,11 +828,12 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
760 for (prio = 1; prio < 4; prio++) { 828 for (prio = 1; prio < 4; prio++) {
761 int h; 829 int h;
762 for (h = 0; h < HASH_SIZE; h++) { 830 for (h = 0; h < HASH_SIZE; h++) {
763 struct ip_tunnel *t = ipn->tunnels[prio][h]; 831 struct ip_tunnel *t;
764 832
833 t = rtnl_dereference(ipn->tunnels[prio][h]);
765 while (t != NULL) { 834 while (t != NULL) {
766 unregister_netdevice_queue(t->dev, head); 835 unregister_netdevice_queue(t->dev, head);
767 t = t->next; 836 t = rtnl_dereference(t->next);
768 } 837 }
769 } 838 }
770 } 839 }
@@ -789,7 +858,9 @@ static int __net_init ipip_init_net(struct net *net)
789 } 858 }
790 dev_net_set(ipn->fb_tunnel_dev, net); 859 dev_net_set(ipn->fb_tunnel_dev, net);
791 860
792 ipip_fb_tunnel_init(ipn->fb_tunnel_dev); 861 err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
862 if (err)
863 goto err_reg_dev;
793 864
794 if ((err = register_netdev(ipn->fb_tunnel_dev))) 865 if ((err = register_netdev(ipn->fb_tunnel_dev)))
795 goto err_reg_dev; 866 goto err_reg_dev;
@@ -797,7 +868,7 @@ static int __net_init ipip_init_net(struct net *net)
797 return 0; 868 return 0;
798 869
799err_reg_dev: 870err_reg_dev:
800 free_netdev(ipn->fb_tunnel_dev); 871 ipip_dev_free(ipn->fb_tunnel_dev);
801err_alloc_dev: 872err_alloc_dev:
802 /* nothing */ 873 /* nothing */
803 return err; 874 return err;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 179fcab866fc..86dd5691af46 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -75,7 +75,7 @@ struct mr_table {
75 struct net *net; 75 struct net *net;
76#endif 76#endif
77 u32 id; 77 u32 id;
78 struct sock *mroute_sk; 78 struct sock __rcu *mroute_sk;
79 struct timer_list ipmr_expire_timer; 79 struct timer_list ipmr_expire_timer;
80 struct list_head mfc_unres_queue; 80 struct list_head mfc_unres_queue;
81 struct list_head mfc_cache_array[MFC_LINES]; 81 struct list_head mfc_cache_array[MFC_LINES];
@@ -98,7 +98,7 @@ struct ipmr_result {
98}; 98};
99 99
100/* Big lock, protecting vif table, mrt cache and mroute socket state. 100/* Big lock, protecting vif table, mrt cache and mroute socket state.
101 Note that the changes are semaphored via rtnl_lock. 101 * Note that the changes are semaphored via rtnl_lock.
102 */ 102 */
103 103
104static DEFINE_RWLOCK(mrt_lock); 104static DEFINE_RWLOCK(mrt_lock);
@@ -113,11 +113,11 @@ static DEFINE_RWLOCK(mrt_lock);
113static DEFINE_SPINLOCK(mfc_unres_lock); 113static DEFINE_SPINLOCK(mfc_unres_lock);
114 114
115/* We return to original Alan's scheme. Hash table of resolved 115/* We return to original Alan's scheme. Hash table of resolved
116 entries is changed only in process context and protected 116 * entries is changed only in process context and protected
117 with weak lock mrt_lock. Queue of unresolved entries is protected 117 * with weak lock mrt_lock. Queue of unresolved entries is protected
118 with strong spinlock mfc_unres_lock. 118 * with strong spinlock mfc_unres_lock.
119 119 *
120 In this case data path is free of exclusive locks at all. 120 * In this case data path is free of exclusive locks at all.
121 */ 121 */
122 122
123static struct kmem_cache *mrt_cachep __read_mostly; 123static struct kmem_cache *mrt_cachep __read_mostly;
@@ -396,9 +396,9 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
396 set_fs(KERNEL_DS); 396 set_fs(KERNEL_DS);
397 err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); 397 err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
398 set_fs(oldfs); 398 set_fs(oldfs);
399 } else 399 } else {
400 err = -EOPNOTSUPP; 400 err = -EOPNOTSUPP;
401 401 }
402 dev = NULL; 402 dev = NULL;
403 403
404 if (err == 0 && 404 if (err == 0 &&
@@ -495,7 +495,8 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
495 dev->iflink = 0; 495 dev->iflink = 0;
496 496
497 rcu_read_lock(); 497 rcu_read_lock();
498 if ((in_dev = __in_dev_get_rcu(dev)) == NULL) { 498 in_dev = __in_dev_get_rcu(dev);
499 if (!in_dev) {
499 rcu_read_unlock(); 500 rcu_read_unlock();
500 goto failure; 501 goto failure;
501 } 502 }
@@ -552,9 +553,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
552 mrt->mroute_reg_vif_num = -1; 553 mrt->mroute_reg_vif_num = -1;
553#endif 554#endif
554 555
555 if (vifi+1 == mrt->maxvif) { 556 if (vifi + 1 == mrt->maxvif) {
556 int tmp; 557 int tmp;
557 for (tmp=vifi-1; tmp>=0; tmp--) { 558
559 for (tmp = vifi - 1; tmp >= 0; tmp--) {
558 if (VIF_EXISTS(mrt, tmp)) 560 if (VIF_EXISTS(mrt, tmp))
559 break; 561 break;
560 } 562 }
@@ -565,25 +567,33 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
565 567
566 dev_set_allmulti(dev, -1); 568 dev_set_allmulti(dev, -1);
567 569
568 if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) { 570 in_dev = __in_dev_get_rtnl(dev);
571 if (in_dev) {
569 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; 572 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
570 ip_rt_multicast_event(in_dev); 573 ip_rt_multicast_event(in_dev);
571 } 574 }
572 575
573 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify) 576 if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
574 unregister_netdevice_queue(dev, head); 577 unregister_netdevice_queue(dev, head);
575 578
576 dev_put(dev); 579 dev_put(dev);
577 return 0; 580 return 0;
578} 581}
579 582
580static inline void ipmr_cache_free(struct mfc_cache *c) 583static void ipmr_cache_free_rcu(struct rcu_head *head)
581{ 584{
585 struct mfc_cache *c = container_of(head, struct mfc_cache, rcu);
586
582 kmem_cache_free(mrt_cachep, c); 587 kmem_cache_free(mrt_cachep, c);
583} 588}
584 589
590static inline void ipmr_cache_free(struct mfc_cache *c)
591{
592 call_rcu(&c->rcu, ipmr_cache_free_rcu);
593}
594
585/* Destroy an unresolved cache entry, killing queued skbs 595/* Destroy an unresolved cache entry, killing queued skbs
586 and reporting error to netlink readers. 596 * and reporting error to netlink readers.
587 */ 597 */
588 598
589static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) 599static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
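
These ipmr hunks prepare mfc_cache for lockless lookup: ipmr_cache_free() now defers the actual kmem_cache_free() through call_rcu(), so that ipmr_cache_find(), converted further down to list_for_each_entry_rcu() under rcu_read_lock(), can keep using an entry a writer has just unlinked until the grace period ends. The generic unlink-then-call_rcu shape, with illustrative names and a plain kmalloc'd object:

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct demo_cache {
	struct list_head list;
	struct rcu_head rcu;
	int key;
};

static LIST_HEAD(demo_cache_list);
static DEFINE_SPINLOCK(demo_cache_lock);

/* Writer: insert under the lock, publish with the _rcu list primitive. */
static int demo_add(int key)
{
	struct demo_cache *c = kmalloc(sizeof(*c), GFP_KERNEL);

	if (!c)
		return -ENOMEM;
	c->key = key;
	spin_lock(&demo_cache_lock);
	list_add_rcu(&c->list, &demo_cache_list);
	spin_unlock(&demo_cache_lock);
	return 0;
}

/* Reader: call under rcu_read_lock(); no lock on the hot path. */
static struct demo_cache *demo_find(int key)
{
	struct demo_cache *c;

	list_for_each_entry_rcu(c, &demo_cache_list, list)
		if (c->key == key)
			return c;
	return NULL;
}

static void demo_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct demo_cache, rcu));
}

/* Writer: unlink now, free only after all current readers are done. */
static void demo_del(struct demo_cache *c)
{
	spin_lock(&demo_cache_lock);
	list_del_rcu(&c->list);
	spin_unlock(&demo_cache_lock);
	call_rcu(&c->rcu, demo_free_rcu);
}
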
@@ -605,8 +615,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
605 memset(&e->msg, 0, sizeof(e->msg)); 615 memset(&e->msg, 0, sizeof(e->msg));
606 616
607 rtnl_unicast(skb, net, NETLINK_CB(skb).pid); 617 rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
608 } else 618 } else {
609 kfree_skb(skb); 619 kfree_skb(skb);
620 }
610 } 621 }
611 622
612 ipmr_cache_free(c); 623 ipmr_cache_free(c);
@@ -724,13 +735,13 @@ static int vif_add(struct net *net, struct mr_table *mrt,
724 case 0: 735 case 0:
725 if (vifc->vifc_flags == VIFF_USE_IFINDEX) { 736 if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
726 dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex); 737 dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
727 if (dev && dev->ip_ptr == NULL) { 738 if (dev && __in_dev_get_rtnl(dev) == NULL) {
728 dev_put(dev); 739 dev_put(dev);
729 return -EADDRNOTAVAIL; 740 return -EADDRNOTAVAIL;
730 } 741 }
731 } else 742 } else {
732 dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); 743 dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
733 744 }
734 if (!dev) 745 if (!dev)
735 return -EADDRNOTAVAIL; 746 return -EADDRNOTAVAIL;
736 err = dev_set_allmulti(dev, 1); 747 err = dev_set_allmulti(dev, 1);
@@ -743,16 +754,16 @@ static int vif_add(struct net *net, struct mr_table *mrt,
743 return -EINVAL; 754 return -EINVAL;
744 } 755 }
745 756
746 if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) { 757 in_dev = __in_dev_get_rtnl(dev);
758 if (!in_dev) {
747 dev_put(dev); 759 dev_put(dev);
748 return -EADDRNOTAVAIL; 760 return -EADDRNOTAVAIL;
749 } 761 }
750 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; 762 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
751 ip_rt_multicast_event(in_dev); 763 ip_rt_multicast_event(in_dev);
752 764
753 /* 765 /* Fill in the VIF structures */
754 * Fill in the VIF structures 766
755 */
756 v->rate_limit = vifc->vifc_rate_limit; 767 v->rate_limit = vifc->vifc_rate_limit;
757 v->local = vifc->vifc_lcl_addr.s_addr; 768 v->local = vifc->vifc_lcl_addr.s_addr;
758 v->remote = vifc->vifc_rmt_addr.s_addr; 769 v->remote = vifc->vifc_rmt_addr.s_addr;
@@ -765,14 +776,14 @@ static int vif_add(struct net *net, struct mr_table *mrt,
765 v->pkt_in = 0; 776 v->pkt_in = 0;
766 v->pkt_out = 0; 777 v->pkt_out = 0;
767 v->link = dev->ifindex; 778 v->link = dev->ifindex;
768 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER)) 779 if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER))
769 v->link = dev->iflink; 780 v->link = dev->iflink;
770 781
771 /* And finish update writing critical data */ 782 /* And finish update writing critical data */
772 write_lock_bh(&mrt_lock); 783 write_lock_bh(&mrt_lock);
773 v->dev = dev; 784 v->dev = dev;
774#ifdef CONFIG_IP_PIMSM 785#ifdef CONFIG_IP_PIMSM
775 if (v->flags&VIFF_REGISTER) 786 if (v->flags & VIFF_REGISTER)
776 mrt->mroute_reg_vif_num = vifi; 787 mrt->mroute_reg_vif_num = vifi;
777#endif 788#endif
778 if (vifi+1 > mrt->maxvif) 789 if (vifi+1 > mrt->maxvif)
@@ -781,6 +792,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
781 return 0; 792 return 0;
782} 793}
783 794
795/* called with rcu_read_lock() */
784static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, 796static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
785 __be32 origin, 797 __be32 origin,
786 __be32 mcastgrp) 798 __be32 mcastgrp)
@@ -788,7 +800,7 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
788 int line = MFC_HASH(mcastgrp, origin); 800 int line = MFC_HASH(mcastgrp, origin);
789 struct mfc_cache *c; 801 struct mfc_cache *c;
790 802
791 list_for_each_entry(c, &mrt->mfc_cache_array[line], list) { 803 list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
792 if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp) 804 if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
793 return c; 805 return c;
794 } 806 }
@@ -801,19 +813,20 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
801static struct mfc_cache *ipmr_cache_alloc(void) 813static struct mfc_cache *ipmr_cache_alloc(void)
802{ 814{
803 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); 815 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
804 if (c == NULL) 816
805 return NULL; 817 if (c)
806 c->mfc_un.res.minvif = MAXVIFS; 818 c->mfc_un.res.minvif = MAXVIFS;
807 return c; 819 return c;
808} 820}
809 821
810static struct mfc_cache *ipmr_cache_alloc_unres(void) 822static struct mfc_cache *ipmr_cache_alloc_unres(void)
811{ 823{
812 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); 824 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
813 if (c == NULL) 825
814 return NULL; 826 if (c) {
815 skb_queue_head_init(&c->mfc_un.unres.unresolved); 827 skb_queue_head_init(&c->mfc_un.unres.unresolved);
816 c->mfc_un.unres.expires = jiffies + 10*HZ; 828 c->mfc_un.unres.expires = jiffies + 10*HZ;
829 }
817 return c; 830 return c;
818} 831}
819 832
@@ -827,17 +840,15 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
827 struct sk_buff *skb; 840 struct sk_buff *skb;
828 struct nlmsgerr *e; 841 struct nlmsgerr *e;
829 842
830 /* 843 /* Play the pending entries through our router */
831 * Play the pending entries through our router
832 */
833 844
834 while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { 845 while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
835 if (ip_hdr(skb)->version == 0) { 846 if (ip_hdr(skb)->version == 0) {
836 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); 847 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
837 848
838 if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) { 849 if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
839 nlh->nlmsg_len = (skb_tail_pointer(skb) - 850 nlh->nlmsg_len = skb_tail_pointer(skb) -
840 (u8 *)nlh); 851 (u8 *)nlh;
841 } else { 852 } else {
842 nlh->nlmsg_type = NLMSG_ERROR; 853 nlh->nlmsg_type = NLMSG_ERROR;
843 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); 854 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
@@ -848,8 +859,9 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
848 } 859 }
849 860
850 rtnl_unicast(skb, net, NETLINK_CB(skb).pid); 861 rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
851 } else 862 } else {
852 ip_mr_forward(net, mrt, skb, c, 0); 863 ip_mr_forward(net, mrt, skb, c, 0);
864 }
853 } 865 }
854} 866}
855 867
@@ -867,6 +879,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
867 const int ihl = ip_hdrlen(pkt); 879 const int ihl = ip_hdrlen(pkt);
868 struct igmphdr *igmp; 880 struct igmphdr *igmp;
869 struct igmpmsg *msg; 881 struct igmpmsg *msg;
882 struct sock *mroute_sk;
870 int ret; 883 int ret;
871 884
872#ifdef CONFIG_IP_PIMSM 885#ifdef CONFIG_IP_PIMSM
@@ -882,9 +895,9 @@ static int ipmr_cache_report(struct mr_table *mrt,
882#ifdef CONFIG_IP_PIMSM 895#ifdef CONFIG_IP_PIMSM
883 if (assert == IGMPMSG_WHOLEPKT) { 896 if (assert == IGMPMSG_WHOLEPKT) {
884 /* Ugly, but we have no choice with this interface. 897 /* Ugly, but we have no choice with this interface.
885 Duplicate old header, fix ihl, length etc. 898 * Duplicate old header, fix ihl, length etc.
886 And all this only to mangle msg->im_msgtype and 899 * And all this only to mangle msg->im_msgtype and
887 to set msg->im_mbz to "mbz" :-) 900 * to set msg->im_mbz to "mbz" :-)
888 */ 901 */
889 skb_push(skb, sizeof(struct iphdr)); 902 skb_push(skb, sizeof(struct iphdr));
890 skb_reset_network_header(skb); 903 skb_reset_network_header(skb);
@@ -901,39 +914,38 @@ static int ipmr_cache_report(struct mr_table *mrt,
901#endif 914#endif
902 { 915 {
903 916
904 /* 917 /* Copy the IP header */
905 * Copy the IP header
906 */
907 918
908 skb->network_header = skb->tail; 919 skb->network_header = skb->tail;
909 skb_put(skb, ihl); 920 skb_put(skb, ihl);
910 skb_copy_to_linear_data(skb, pkt->data, ihl); 921 skb_copy_to_linear_data(skb, pkt->data, ihl);
911 ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ 922 ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */
912 msg = (struct igmpmsg *)skb_network_header(skb); 923 msg = (struct igmpmsg *)skb_network_header(skb);
913 msg->im_vif = vifi; 924 msg->im_vif = vifi;
914 skb_dst_set(skb, dst_clone(skb_dst(pkt))); 925 skb_dst_set(skb, dst_clone(skb_dst(pkt)));
915 926
916 /* 927 /* Add our header */
917 * Add our header
918 */
919 928
920 igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); 929 igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
921 igmp->type = 930 igmp->type =
922 msg->im_msgtype = assert; 931 msg->im_msgtype = assert;
923 igmp->code = 0; 932 igmp->code = 0;
924 ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ 933 ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
925 skb->transport_header = skb->network_header; 934 skb->transport_header = skb->network_header;
926 } 935 }
927 936
928 if (mrt->mroute_sk == NULL) { 937 rcu_read_lock();
938 mroute_sk = rcu_dereference(mrt->mroute_sk);
939 if (mroute_sk == NULL) {
940 rcu_read_unlock();
929 kfree_skb(skb); 941 kfree_skb(skb);
930 return -EINVAL; 942 return -EINVAL;
931 } 943 }
932 944
933 /* 945 /* Deliver to mrouted */
934 * Deliver to mrouted 946
935 */ 947 ret = sock_queue_rcv_skb(mroute_sk, skb);
936 ret = sock_queue_rcv_skb(mrt->mroute_sk, skb); 948 rcu_read_unlock();
937 if (ret < 0) { 949 if (ret < 0) {
938 if (net_ratelimit()) 950 if (net_ratelimit())
939 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); 951 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
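
This hunk replaces the mrt_lock-protected access to the mrouted control socket with an RCU-protected pointer read. Below is a sketch of the publish/consume pattern it relies on; demo_sk, demo_deliver() and demo_set_socket() are hypothetical names, and the writer side is assumed to be serialized by RTNL as in the later setsockopt hunks.

#include <linux/rcupdate.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <net/sock.h>

static struct sock __rcu *demo_sk;	/* readers: RCU, writers: RTNL */

/* Packet path: look the socket up and queue the skb without mrt_lock. */
static int demo_deliver(struct sk_buff *skb)
{
	struct sock *sk;
	int ret;

	rcu_read_lock();
	sk = rcu_dereference(demo_sk);
	if (!sk) {
		rcu_read_unlock();
		kfree_skb(skb);
		return -EINVAL;
	}
	ret = sock_queue_rcv_skb(sk, skb);
	rcu_read_unlock();
	return ret;
}

/* Control path: publish or clear the socket while holding RTNL. */
static void demo_set_socket(struct sock *sk)
{
	ASSERT_RTNL();
	rcu_assign_pointer(demo_sk, sk);
}
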
@@ -965,9 +977,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
965 } 977 }
966 978
967 if (!found) { 979 if (!found) {
968 /* 980 /* Create a new entry if allowable */
969 * Create a new entry if allowable
970 */
971 981
972 if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || 982 if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
973 (c = ipmr_cache_alloc_unres()) == NULL) { 983 (c = ipmr_cache_alloc_unres()) == NULL) {
@@ -977,16 +987,14 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
977 return -ENOBUFS; 987 return -ENOBUFS;
978 } 988 }
979 989
980 /* 990 /* Fill in the new cache entry */
981 * Fill in the new cache entry 991
982 */
983 c->mfc_parent = -1; 992 c->mfc_parent = -1;
984 c->mfc_origin = iph->saddr; 993 c->mfc_origin = iph->saddr;
985 c->mfc_mcastgrp = iph->daddr; 994 c->mfc_mcastgrp = iph->daddr;
986 995
987 /* 996 /* Reflect first query at mrouted. */
988 * Reflect first query at mrouted. 997
989 */
990 err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); 998 err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
991 if (err < 0) { 999 if (err < 0) {
992 /* If the report failed throw the cache entry 1000 /* If the report failed throw the cache entry
@@ -1006,10 +1014,9 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
1006 mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires); 1014 mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
1007 } 1015 }
1008 1016
1009 /* 1017 /* See if we can append the packet */
1010 * See if we can append the packet 1018
1011 */ 1019 if (c->mfc_un.unres.unresolved.qlen > 3) {
1012 if (c->mfc_un.unres.unresolved.qlen>3) {
1013 kfree_skb(skb); 1020 kfree_skb(skb);
1014 err = -ENOBUFS; 1021 err = -ENOBUFS;
1015 } else { 1022 } else {
@@ -1035,9 +1042,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
1035 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) { 1042 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
1036 if (c->mfc_origin == mfc->mfcc_origin.s_addr && 1043 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
1037 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { 1044 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
1038 write_lock_bh(&mrt_lock); 1045 list_del_rcu(&c->list);
1039 list_del(&c->list);
1040 write_unlock_bh(&mrt_lock);
1041 1046
1042 ipmr_cache_free(c); 1047 ipmr_cache_free(c);
1043 return 0; 1048 return 0;
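
Here the MFC delete path drops the write_lock_bh()/write_unlock_bh() pair and uses list_del_rcu(); readers walking the hash chain under rcu_read_lock() may still hold a pointer to the unlinked entry, so the actual kfree has to be deferred past a grace period. The sketch below uses a generic call_rcu() callback for that deferral; in the patch itself the deferral is presumably handled inside ipmr_cache_free(), which is not shown in this excerpt, and the demo_* names are invented.

#include <linux/kernel.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_cache {
	struct list_head list;
	struct rcu_head rcu;
	/* ... payload ... */
};

static void demo_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct demo_cache, rcu));
}

/* Caller is the single writer (e.g. serialized by RTNL). */
static void demo_delete(struct demo_cache *c)
{
	list_del_rcu(&c->list);			/* unlink; readers may still see c */
	call_rcu(&c->rcu, demo_free_rcu);	/* free after all readers are done */
}
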
@@ -1090,9 +1095,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1090 if (!mrtsock) 1095 if (!mrtsock)
1091 c->mfc_flags |= MFC_STATIC; 1096 c->mfc_flags |= MFC_STATIC;
1092 1097
1093 write_lock_bh(&mrt_lock); 1098 list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
1094 list_add(&c->list, &mrt->mfc_cache_array[line]);
1095 write_unlock_bh(&mrt_lock);
1096 1099
1097 /* 1100 /*
1098 * Check to see if we resolved a queued list. If so we 1101 * Check to see if we resolved a queued list. If so we
@@ -1130,26 +1133,21 @@ static void mroute_clean_tables(struct mr_table *mrt)
1130 LIST_HEAD(list); 1133 LIST_HEAD(list);
1131 struct mfc_cache *c, *next; 1134 struct mfc_cache *c, *next;
1132 1135
1133 /* 1136 /* Shut down all active vif entries */
1134 * Shut down all active vif entries 1137
1135 */
1136 for (i = 0; i < mrt->maxvif; i++) { 1138 for (i = 0; i < mrt->maxvif; i++) {
1137 if (!(mrt->vif_table[i].flags&VIFF_STATIC)) 1139 if (!(mrt->vif_table[i].flags & VIFF_STATIC))
1138 vif_delete(mrt, i, 0, &list); 1140 vif_delete(mrt, i, 0, &list);
1139 } 1141 }
1140 unregister_netdevice_many(&list); 1142 unregister_netdevice_many(&list);
1141 1143
1142 /* 1144 /* Wipe the cache */
1143 * Wipe the cache 1145
1144 */
1145 for (i = 0; i < MFC_LINES; i++) { 1146 for (i = 0; i < MFC_LINES; i++) {
1146 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { 1147 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
1147 if (c->mfc_flags&MFC_STATIC) 1148 if (c->mfc_flags & MFC_STATIC)
1148 continue; 1149 continue;
1149 write_lock_bh(&mrt_lock); 1150 list_del_rcu(&c->list);
1150 list_del(&c->list);
1151 write_unlock_bh(&mrt_lock);
1152
1153 ipmr_cache_free(c); 1151 ipmr_cache_free(c);
1154 } 1152 }
1155 } 1153 }
@@ -1164,6 +1162,9 @@ static void mroute_clean_tables(struct mr_table *mrt)
1164 } 1162 }
1165} 1163}
1166 1164
1165/* called from ip_ra_control(), before an RCU grace period,
1166 * we dont need to call synchronize_rcu() here
1167 */
1167static void mrtsock_destruct(struct sock *sk) 1168static void mrtsock_destruct(struct sock *sk)
1168{ 1169{
1169 struct net *net = sock_net(sk); 1170 struct net *net = sock_net(sk);
@@ -1171,13 +1172,9 @@ static void mrtsock_destruct(struct sock *sk)
1171 1172
1172 rtnl_lock(); 1173 rtnl_lock();
1173 ipmr_for_each_table(mrt, net) { 1174 ipmr_for_each_table(mrt, net) {
1174 if (sk == mrt->mroute_sk) { 1175 if (sk == rtnl_dereference(mrt->mroute_sk)) {
1175 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; 1176 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
1176 1177 rcu_assign_pointer(mrt->mroute_sk, NULL);
1177 write_lock_bh(&mrt_lock);
1178 mrt->mroute_sk = NULL;
1179 write_unlock_bh(&mrt_lock);
1180
1181 mroute_clean_tables(mrt); 1178 mroute_clean_tables(mrt);
1182 } 1179 }
1183 } 1180 }
@@ -1204,7 +1201,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1204 return -ENOENT; 1201 return -ENOENT;
1205 1202
1206 if (optname != MRT_INIT) { 1203 if (optname != MRT_INIT) {
1207 if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN)) 1204 if (sk != rcu_dereference_raw(mrt->mroute_sk) &&
1205 !capable(CAP_NET_ADMIN))
1208 return -EACCES; 1206 return -EACCES;
1209 } 1207 }
1210 1208
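
The permission check above now reads mrt->mroute_sk with rcu_dereference_raw() rather than a plain pointer load; elsewhere in the patch the same field is read with rcu_dereference() (under rcu_read_lock()) or rtnl_dereference() (under RTNL). A small sketch of when each accessor fits; demo_sk, demo_is_control_socket() and have_rtnl are illustrative.

#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/rtnetlink.h>
#include <net/sock.h>

static struct sock __rcu *demo_sk;

static bool demo_is_control_socket(struct sock *sk, bool have_rtnl)
{
	if (have_rtnl)
		/* Writer side: RTNL already excludes concurrent updates. */
		return sk == rtnl_dereference(demo_sk);

	/* Plain pointer comparison only; the value is never dereferenced,
	 * so the unchecked accessor is sufficient here.
	 */
	return sk == rcu_dereference_raw(demo_sk);
}
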
@@ -1217,23 +1215,20 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1217 return -ENOPROTOOPT; 1215 return -ENOPROTOOPT;
1218 1216
1219 rtnl_lock(); 1217 rtnl_lock();
1220 if (mrt->mroute_sk) { 1218 if (rtnl_dereference(mrt->mroute_sk)) {
1221 rtnl_unlock(); 1219 rtnl_unlock();
1222 return -EADDRINUSE; 1220 return -EADDRINUSE;
1223 } 1221 }
1224 1222
1225 ret = ip_ra_control(sk, 1, mrtsock_destruct); 1223 ret = ip_ra_control(sk, 1, mrtsock_destruct);
1226 if (ret == 0) { 1224 if (ret == 0) {
1227 write_lock_bh(&mrt_lock); 1225 rcu_assign_pointer(mrt->mroute_sk, sk);
1228 mrt->mroute_sk = sk;
1229 write_unlock_bh(&mrt_lock);
1230
1231 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; 1226 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
1232 } 1227 }
1233 rtnl_unlock(); 1228 rtnl_unlock();
1234 return ret; 1229 return ret;
1235 case MRT_DONE: 1230 case MRT_DONE:
1236 if (sk != mrt->mroute_sk) 1231 if (sk != rcu_dereference_raw(mrt->mroute_sk))
1237 return -EACCES; 1232 return -EACCES;
1238 return ip_ra_control(sk, 0, NULL); 1233 return ip_ra_control(sk, 0, NULL);
1239 case MRT_ADD_VIF: 1234 case MRT_ADD_VIF:
@@ -1246,7 +1241,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1246 return -ENFILE; 1241 return -ENFILE;
1247 rtnl_lock(); 1242 rtnl_lock();
1248 if (optname == MRT_ADD_VIF) { 1243 if (optname == MRT_ADD_VIF) {
1249 ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk); 1244 ret = vif_add(net, mrt, &vif,
1245 sk == rtnl_dereference(mrt->mroute_sk));
1250 } else { 1246 } else {
1251 ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL); 1247 ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
1252 } 1248 }
@@ -1267,7 +1263,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1267 if (optname == MRT_DEL_MFC) 1263 if (optname == MRT_DEL_MFC)
1268 ret = ipmr_mfc_delete(mrt, &mfc); 1264 ret = ipmr_mfc_delete(mrt, &mfc);
1269 else 1265 else
1270 ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk); 1266 ret = ipmr_mfc_add(net, mrt, &mfc,
1267 sk == rtnl_dereference(mrt->mroute_sk));
1271 rtnl_unlock(); 1268 rtnl_unlock();
1272 return ret; 1269 return ret;
1273 /* 1270 /*
@@ -1276,7 +1273,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1276 case MRT_ASSERT: 1273 case MRT_ASSERT:
1277 { 1274 {
1278 int v; 1275 int v;
1279 if (get_user(v,(int __user *)optval)) 1276 if (get_user(v, (int __user *)optval))
1280 return -EFAULT; 1277 return -EFAULT;
1281 mrt->mroute_do_assert = (v) ? 1 : 0; 1278 mrt->mroute_do_assert = (v) ? 1 : 0;
1282 return 0; 1279 return 0;
@@ -1286,7 +1283,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1286 { 1283 {
1287 int v; 1284 int v;
1288 1285
1289 if (get_user(v,(int __user *)optval)) 1286 if (get_user(v, (int __user *)optval))
1290 return -EFAULT; 1287 return -EFAULT;
1291 v = (v) ? 1 : 0; 1288 v = (v) ? 1 : 0;
1292 1289
@@ -1309,14 +1306,16 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1309 return -EINVAL; 1306 return -EINVAL;
1310 if (get_user(v, (u32 __user *)optval)) 1307 if (get_user(v, (u32 __user *)optval))
1311 return -EFAULT; 1308 return -EFAULT;
1312 if (sk == mrt->mroute_sk)
1313 return -EBUSY;
1314 1309
1315 rtnl_lock(); 1310 rtnl_lock();
1316 ret = 0; 1311 ret = 0;
1317 if (!ipmr_new_table(net, v)) 1312 if (sk == rtnl_dereference(mrt->mroute_sk)) {
1318 ret = -ENOMEM; 1313 ret = -EBUSY;
1319 raw_sk(sk)->ipmr_table = v; 1314 } else {
1315 if (!ipmr_new_table(net, v))
1316 ret = -ENOMEM;
1317 raw_sk(sk)->ipmr_table = v;
1318 }
1320 rtnl_unlock(); 1319 rtnl_unlock();
1321 return ret; 1320 return ret;
1322 } 1321 }
@@ -1347,9 +1346,9 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
1347 1346
1348 if (optname != MRT_VERSION && 1347 if (optname != MRT_VERSION &&
1349#ifdef CONFIG_IP_PIMSM 1348#ifdef CONFIG_IP_PIMSM
1350 optname!=MRT_PIM && 1349 optname != MRT_PIM &&
1351#endif 1350#endif
1352 optname!=MRT_ASSERT) 1351 optname != MRT_ASSERT)
1353 return -ENOPROTOOPT; 1352 return -ENOPROTOOPT;
1354 1353
1355 if (get_user(olr, optlen)) 1354 if (get_user(olr, optlen))
@@ -1416,19 +1415,19 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1416 if (copy_from_user(&sr, arg, sizeof(sr))) 1415 if (copy_from_user(&sr, arg, sizeof(sr)))
1417 return -EFAULT; 1416 return -EFAULT;
1418 1417
1419 read_lock(&mrt_lock); 1418 rcu_read_lock();
1420 c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); 1419 c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
1421 if (c) { 1420 if (c) {
1422 sr.pktcnt = c->mfc_un.res.pkt; 1421 sr.pktcnt = c->mfc_un.res.pkt;
1423 sr.bytecnt = c->mfc_un.res.bytes; 1422 sr.bytecnt = c->mfc_un.res.bytes;
1424 sr.wrong_if = c->mfc_un.res.wrong_if; 1423 sr.wrong_if = c->mfc_un.res.wrong_if;
1425 read_unlock(&mrt_lock); 1424 rcu_read_unlock();
1426 1425
1427 if (copy_to_user(arg, &sr, sizeof(sr))) 1426 if (copy_to_user(arg, &sr, sizeof(sr)))
1428 return -EFAULT; 1427 return -EFAULT;
1429 return 0; 1428 return 0;
1430 } 1429 }
1431 read_unlock(&mrt_lock); 1430 rcu_read_unlock();
1432 return -EADDRNOTAVAIL; 1431 return -EADDRNOTAVAIL;
1433 default: 1432 default:
1434 return -ENOIOCTLCMD; 1433 return -ENOIOCTLCMD;
@@ -1465,7 +1464,7 @@ static struct notifier_block ip_mr_notifier = {
1465}; 1464};
1466 1465
1467/* 1466/*
1468 * Encapsulate a packet by attaching a valid IPIP header to it. 1467 * Encapsulate a packet by attaching a valid IPIP header to it.
1469 * This avoids tunnel drivers and other mess and gives us the speed so 1468 * This avoids tunnel drivers and other mess and gives us the speed so
1470 * important for multicast video. 1469 * important for multicast video.
1471 */ 1470 */
@@ -1480,7 +1479,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1480 skb_reset_network_header(skb); 1479 skb_reset_network_header(skb);
1481 iph = ip_hdr(skb); 1480 iph = ip_hdr(skb);
1482 1481
1483 iph->version = 4; 1482 iph->version = 4;
1484 iph->tos = old_iph->tos; 1483 iph->tos = old_iph->tos;
1485 iph->ttl = old_iph->ttl; 1484 iph->ttl = old_iph->ttl;
1486 iph->frag_off = 0; 1485 iph->frag_off = 0;
@@ -1498,7 +1497,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1498 1497
1499static inline int ipmr_forward_finish(struct sk_buff *skb) 1498static inline int ipmr_forward_finish(struct sk_buff *skb)
1500{ 1499{
1501 struct ip_options * opt = &(IPCB(skb)->opt); 1500 struct ip_options *opt = &(IPCB(skb)->opt);
1502 1501
1503 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); 1502 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1504 1503
@@ -1535,22 +1534,34 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1535 } 1534 }
1536#endif 1535#endif
1537 1536
1538 if (vif->flags&VIFF_TUNNEL) { 1537 if (vif->flags & VIFF_TUNNEL) {
1539 struct flowi fl = { .oif = vif->link, 1538 struct flowi fl = {
1540 .nl_u = { .ip4_u = 1539 .oif = vif->link,
1541 { .daddr = vif->remote, 1540 .nl_u = {
1542 .saddr = vif->local, 1541 .ip4_u = {
1543 .tos = RT_TOS(iph->tos) } }, 1542 .daddr = vif->remote,
1544 .proto = IPPROTO_IPIP }; 1543 .saddr = vif->local,
1544 .tos = RT_TOS(iph->tos)
1545 }
1546 },
1547 .proto = IPPROTO_IPIP
1548 };
1549
1545 if (ip_route_output_key(net, &rt, &fl)) 1550 if (ip_route_output_key(net, &rt, &fl))
1546 goto out_free; 1551 goto out_free;
1547 encap = sizeof(struct iphdr); 1552 encap = sizeof(struct iphdr);
1548 } else { 1553 } else {
1549 struct flowi fl = { .oif = vif->link, 1554 struct flowi fl = {
1550 .nl_u = { .ip4_u = 1555 .oif = vif->link,
1551 { .daddr = iph->daddr, 1556 .nl_u = {
1552 .tos = RT_TOS(iph->tos) } }, 1557 .ip4_u = {
1553 .proto = IPPROTO_IPIP }; 1558 .daddr = iph->daddr,
1559 .tos = RT_TOS(iph->tos)
1560 }
1561 },
1562 .proto = IPPROTO_IPIP
1563 };
1564
1554 if (ip_route_output_key(net, &rt, &fl)) 1565 if (ip_route_output_key(net, &rt, &fl))
1555 goto out_free; 1566 goto out_free;
1556 } 1567 }
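
The flowi initializers above are only reformatted, not functionally changed: each nested level of the routing key gets its own designated-initializer block instead of being squeezed onto wrapped lines. A generic C illustration of the style; demo_key is a made-up type, not the kernel's struct flowi, and the values are arbitrary.

struct demo_key {
	int oif;
	struct {
		unsigned int daddr;
		unsigned int saddr;
		unsigned char tos;
	} ip4;
	unsigned char proto;
};

/* Fields that are not named are implicitly zero-initialized. */
static const struct demo_key demo = {
	.oif	= 2,
	.ip4	= {
		.daddr	= 0xe0000001,	/* 224.0.0.1, host order, for illustration */
		.tos	= 0,
	},
	.proto	= 4,			/* IPPROTO_IPIP */
};
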
@@ -1559,8 +1570,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1559 1570
1560 if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { 1571 if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
1561 /* Do not fragment multicasts. Alas, IPv4 does not 1572 /* Do not fragment multicasts. Alas, IPv4 does not
1562 allow to send ICMP, so that packets will disappear 1573 * allow to send ICMP, so that packets will disappear
1563 to blackhole. 1574 * to blackhole.
1564 */ 1575 */
1565 1576
1566 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 1577 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
@@ -1583,7 +1594,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1583 ip_decrease_ttl(ip_hdr(skb)); 1594 ip_decrease_ttl(ip_hdr(skb));
1584 1595
1585 /* FIXME: forward and output firewalls used to be called here. 1596 /* FIXME: forward and output firewalls used to be called here.
1586 * What do we do with netfilter? -- RR */ 1597 * What do we do with netfilter? -- RR
1598 */
1587 if (vif->flags & VIFF_TUNNEL) { 1599 if (vif->flags & VIFF_TUNNEL) {
1588 ip_encap(skb, vif->local, vif->remote); 1600 ip_encap(skb, vif->local, vif->remote);
1589 /* FIXME: extra output firewall step used to be here. --RR */ 1601 /* FIXME: extra output firewall step used to be here. --RR */
@@ -1644,15 +1656,15 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1644 1656
1645 if (skb_rtable(skb)->fl.iif == 0) { 1657 if (skb_rtable(skb)->fl.iif == 0) {
1646 /* It is our own packet, looped back. 1658 /* It is our own packet, looped back.
1647 Very complicated situation... 1659 * Very complicated situation...
1648 1660 *
1649 The best workaround until routing daemons will be 1661 * The best workaround until routing daemons will be
1650 fixed is not to redistribute packet, if it was 1662 * fixed is not to redistribute packet, if it was
1651 send through wrong interface. It means, that 1663 * send through wrong interface. It means, that
1652 multicast applications WILL NOT work for 1664 * multicast applications WILL NOT work for
1653 (S,G), which have default multicast route pointing 1665 * (S,G), which have default multicast route pointing
1654 to wrong oif. In any case, it is not a good 1666 * to wrong oif. In any case, it is not a good
1655 idea to use multicasting applications on router. 1667 * idea to use multicasting applications on router.
1656 */ 1668 */
1657 goto dont_forward; 1669 goto dont_forward;
1658 } 1670 }
@@ -1662,9 +1674,9 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1662 1674
1663 if (true_vifi >= 0 && mrt->mroute_do_assert && 1675 if (true_vifi >= 0 && mrt->mroute_do_assert &&
1664 /* pimsm uses asserts, when switching from RPT to SPT, 1676 /* pimsm uses asserts, when switching from RPT to SPT,
1665 so that we cannot check that packet arrived on an oif. 1677 * so that we cannot check that packet arrived on an oif.
1666 It is bad, but otherwise we would need to move pretty 1678 * It is bad, but otherwise we would need to move pretty
1667 large chunk of pimd to kernel. Ough... --ANK 1679 * large chunk of pimd to kernel. Ough... --ANK
1668 */ 1680 */
1669 (mrt->mroute_do_pim || 1681 (mrt->mroute_do_pim ||
1670 cache->mfc_un.res.ttls[true_vifi] < 255) && 1682 cache->mfc_un.res.ttls[true_vifi] < 255) &&
@@ -1682,10 +1694,12 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1682 /* 1694 /*
1683 * Forward the frame 1695 * Forward the frame
1684 */ 1696 */
1685 for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) { 1697 for (ct = cache->mfc_un.res.maxvif - 1;
1698 ct >= cache->mfc_un.res.minvif; ct--) {
1686 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { 1699 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1687 if (psend != -1) { 1700 if (psend != -1) {
1688 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1701 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1702
1689 if (skb2) 1703 if (skb2)
1690 ipmr_queue_xmit(net, mrt, skb2, cache, 1704 ipmr_queue_xmit(net, mrt, skb2, cache,
1691 psend); 1705 psend);
@@ -1696,6 +1710,7 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1696 if (psend != -1) { 1710 if (psend != -1) {
1697 if (local) { 1711 if (local) {
1698 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1712 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1713
1699 if (skb2) 1714 if (skb2)
1700 ipmr_queue_xmit(net, mrt, skb2, cache, psend); 1715 ipmr_queue_xmit(net, mrt, skb2, cache, psend);
1701 } else { 1716 } else {
@@ -1713,6 +1728,7 @@ dont_forward:
1713 1728
1714/* 1729/*
1715 * Multicast packets for forwarding arrive here 1730 * Multicast packets for forwarding arrive here
1731 * Called with rcu_read_lock();
1716 */ 1732 */
1717 1733
1718int ip_mr_input(struct sk_buff *skb) 1734int ip_mr_input(struct sk_buff *skb)
@@ -1724,9 +1740,9 @@ int ip_mr_input(struct sk_buff *skb)
1724 int err; 1740 int err;
1725 1741
1726 /* Packet is looped back after forward, it should not be 1742 /* Packet is looped back after forward, it should not be
1727 forwarded second time, but still can be delivered locally. 1743 * forwarded second time, but still can be delivered locally.
1728 */ 1744 */
1729 if (IPCB(skb)->flags&IPSKB_FORWARDED) 1745 if (IPCB(skb)->flags & IPSKB_FORWARDED)
1730 goto dont_forward; 1746 goto dont_forward;
1731 1747
1732 err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt); 1748 err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
@@ -1736,28 +1752,28 @@ int ip_mr_input(struct sk_buff *skb)
1736 } 1752 }
1737 1753
1738 if (!local) { 1754 if (!local) {
1739 if (IPCB(skb)->opt.router_alert) { 1755 if (IPCB(skb)->opt.router_alert) {
1740 if (ip_call_ra_chain(skb)) 1756 if (ip_call_ra_chain(skb))
1741 return 0; 1757 return 0;
1742 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){ 1758 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
1743 /* IGMPv1 (and broken IGMPv2 implementations sort of 1759 /* IGMPv1 (and broken IGMPv2 implementations sort of
1744 Cisco IOS <= 11.2(8)) do not put router alert 1760 * Cisco IOS <= 11.2(8)) do not put router alert
1745 option to IGMP packets destined to routable 1761 * option to IGMP packets destined to routable
1746 groups. It is very bad, because it means 1762 * groups. It is very bad, because it means
1747 that we can forward NO IGMP messages. 1763 * that we can forward NO IGMP messages.
1748 */ 1764 */
1749 read_lock(&mrt_lock); 1765 struct sock *mroute_sk;
1750 if (mrt->mroute_sk) { 1766
1751 nf_reset(skb); 1767 mroute_sk = rcu_dereference(mrt->mroute_sk);
1752 raw_rcv(mrt->mroute_sk, skb); 1768 if (mroute_sk) {
1753 read_unlock(&mrt_lock); 1769 nf_reset(skb);
1754 return 0; 1770 raw_rcv(mroute_sk, skb);
1755 } 1771 return 0;
1756 read_unlock(&mrt_lock); 1772 }
1757 } 1773 }
1758 } 1774 }
1759 1775
1760 read_lock(&mrt_lock); 1776 /* already under rcu_read_lock() */
1761 cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); 1777 cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1762 1778
1763 /* 1779 /*
@@ -1769,13 +1785,12 @@ int ip_mr_input(struct sk_buff *skb)
1769 if (local) { 1785 if (local) {
1770 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1786 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1771 ip_local_deliver(skb); 1787 ip_local_deliver(skb);
1772 if (skb2 == NULL) { 1788 if (skb2 == NULL)
1773 read_unlock(&mrt_lock);
1774 return -ENOBUFS; 1789 return -ENOBUFS;
1775 }
1776 skb = skb2; 1790 skb = skb2;
1777 } 1791 }
1778 1792
1793 read_lock(&mrt_lock);
1779 vif = ipmr_find_vif(mrt, skb->dev); 1794 vif = ipmr_find_vif(mrt, skb->dev);
1780 if (vif >= 0) { 1795 if (vif >= 0) {
1781 int err2 = ipmr_cache_unresolved(mrt, vif, skb); 1796 int err2 = ipmr_cache_unresolved(mrt, vif, skb);
@@ -1788,8 +1803,8 @@ int ip_mr_input(struct sk_buff *skb)
1788 return -ENODEV; 1803 return -ENODEV;
1789 } 1804 }
1790 1805
1806 read_lock(&mrt_lock);
1791 ip_mr_forward(net, mrt, skb, cache, local); 1807 ip_mr_forward(net, mrt, skb, cache, local);
1792
1793 read_unlock(&mrt_lock); 1808 read_unlock(&mrt_lock);
1794 1809
1795 if (local) 1810 if (local)
@@ -1805,6 +1820,7 @@ dont_forward:
1805} 1820}
1806 1821
1807#ifdef CONFIG_IP_PIMSM 1822#ifdef CONFIG_IP_PIMSM
1823/* called with rcu_read_lock() */
1808static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, 1824static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1809 unsigned int pimlen) 1825 unsigned int pimlen)
1810{ 1826{
@@ -1813,10 +1829,10 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1813 1829
1814 encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); 1830 encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1815 /* 1831 /*
1816 Check that: 1832 * Check that:
1817 a. packet is really destinted to a multicast group 1833 * a. packet is really sent to a multicast group
1818 b. packet is not a NULL-REGISTER 1834 * b. packet is not a NULL-REGISTER
1819 c. packet is not truncated 1835 * c. packet is not truncated
1820 */ 1836 */
1821 if (!ipv4_is_multicast(encap->daddr) || 1837 if (!ipv4_is_multicast(encap->daddr) ||
1822 encap->tot_len == 0 || 1838 encap->tot_len == 0 ||
@@ -1826,26 +1842,23 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1826 read_lock(&mrt_lock); 1842 read_lock(&mrt_lock);
1827 if (mrt->mroute_reg_vif_num >= 0) 1843 if (mrt->mroute_reg_vif_num >= 0)
1828 reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev; 1844 reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
1829 if (reg_dev)
1830 dev_hold(reg_dev);
1831 read_unlock(&mrt_lock); 1845 read_unlock(&mrt_lock);
1832 1846
1833 if (reg_dev == NULL) 1847 if (reg_dev == NULL)
1834 return 1; 1848 return 1;
1835 1849
1836 skb->mac_header = skb->network_header; 1850 skb->mac_header = skb->network_header;
1837 skb_pull(skb, (u8*)encap - skb->data); 1851 skb_pull(skb, (u8 *)encap - skb->data);
1838 skb_reset_network_header(skb); 1852 skb_reset_network_header(skb);
1839 skb->protocol = htons(ETH_P_IP); 1853 skb->protocol = htons(ETH_P_IP);
1840 skb->ip_summed = 0; 1854 skb->ip_summed = CHECKSUM_NONE;
1841 skb->pkt_type = PACKET_HOST; 1855 skb->pkt_type = PACKET_HOST;
1842 1856
1843 skb_tunnel_rx(skb, reg_dev); 1857 skb_tunnel_rx(skb, reg_dev);
1844 1858
1845 netif_rx(skb); 1859 netif_rx(skb);
1846 dev_put(reg_dev);
1847 1860
1848 return 0; 1861 return NET_RX_SUCCESS;
1849} 1862}
1850#endif 1863#endif
1851 1864
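
With __pim_rcv() now documented as running under rcu_read_lock(), the dev_hold()/dev_put() pair around reg_dev is dropped: the patch relies on the device staying valid for the duration of the read-side section, since netdevice teardown includes an RCU grace period before the structure is freed. A hedged sketch of that pattern; demo_rx() and the WARN_ON_ONCE check are illustrative, not part of the patch.

#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/rcupdate.h>
#include <linux/skbuff.h>

/* Caller holds rcu_read_lock(); dev was looked up from a protected table
 * and is only used inside this read-side section.
 */
static int demo_rx(struct sk_buff *skb, struct net_device *dev)
{
	WARN_ON_ONCE(!rcu_read_lock_held());

	skb->dev = dev;		/* no dev_hold()/dev_put() pair needed here */
	return netif_rx(skb);
}
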
@@ -1854,7 +1867,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1854 * Handle IGMP messages of PIMv1 1867 * Handle IGMP messages of PIMv1
1855 */ 1868 */
1856 1869
1857int pim_rcv_v1(struct sk_buff * skb) 1870int pim_rcv_v1(struct sk_buff *skb)
1858{ 1871{
1859 struct igmphdr *pim; 1872 struct igmphdr *pim;
1860 struct net *net = dev_net(skb->dev); 1873 struct net *net = dev_net(skb->dev);
@@ -1881,7 +1894,7 @@ drop:
1881#endif 1894#endif
1882 1895
1883#ifdef CONFIG_IP_PIMSM_V2 1896#ifdef CONFIG_IP_PIMSM_V2
1884static int pim_rcv(struct sk_buff * skb) 1897static int pim_rcv(struct sk_buff *skb)
1885{ 1898{
1886 struct pimreghdr *pim; 1899 struct pimreghdr *pim;
1887 struct net *net = dev_net(skb->dev); 1900 struct net *net = dev_net(skb->dev);
@@ -1891,8 +1904,8 @@ static int pim_rcv(struct sk_buff * skb)
1891 goto drop; 1904 goto drop;
1892 1905
1893 pim = (struct pimreghdr *)skb_transport_header(skb); 1906 pim = (struct pimreghdr *)skb_transport_header(skb);
1894 if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || 1907 if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
1895 (pim->flags&PIM_NULL_REGISTER) || 1908 (pim->flags & PIM_NULL_REGISTER) ||
1896 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && 1909 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1897 csum_fold(skb_checksum(skb, 0, skb->len, 0)))) 1910 csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1898 goto drop; 1911 goto drop;
@@ -1958,28 +1971,33 @@ int ipmr_get_route(struct net *net,
1958 if (mrt == NULL) 1971 if (mrt == NULL)
1959 return -ENOENT; 1972 return -ENOENT;
1960 1973
1961 read_lock(&mrt_lock); 1974 rcu_read_lock();
1962 cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst); 1975 cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
1963 1976
1964 if (cache == NULL) { 1977 if (cache == NULL) {
1965 struct sk_buff *skb2; 1978 struct sk_buff *skb2;
1966 struct iphdr *iph; 1979 struct iphdr *iph;
1967 struct net_device *dev; 1980 struct net_device *dev;
1968 int vif; 1981 int vif = -1;
1969 1982
1970 if (nowait) { 1983 if (nowait) {
1971 read_unlock(&mrt_lock); 1984 rcu_read_unlock();
1972 return -EAGAIN; 1985 return -EAGAIN;
1973 } 1986 }
1974 1987
1975 dev = skb->dev; 1988 dev = skb->dev;
1976 if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) { 1989 read_lock(&mrt_lock);
1990 if (dev)
1991 vif = ipmr_find_vif(mrt, dev);
1992 if (vif < 0) {
1977 read_unlock(&mrt_lock); 1993 read_unlock(&mrt_lock);
1994 rcu_read_unlock();
1978 return -ENODEV; 1995 return -ENODEV;
1979 } 1996 }
1980 skb2 = skb_clone(skb, GFP_ATOMIC); 1997 skb2 = skb_clone(skb, GFP_ATOMIC);
1981 if (!skb2) { 1998 if (!skb2) {
1982 read_unlock(&mrt_lock); 1999 read_unlock(&mrt_lock);
2000 rcu_read_unlock();
1983 return -ENOMEM; 2001 return -ENOMEM;
1984 } 2002 }
1985 2003
@@ -1992,13 +2010,16 @@ int ipmr_get_route(struct net *net,
1992 iph->version = 0; 2010 iph->version = 0;
1993 err = ipmr_cache_unresolved(mrt, vif, skb2); 2011 err = ipmr_cache_unresolved(mrt, vif, skb2);
1994 read_unlock(&mrt_lock); 2012 read_unlock(&mrt_lock);
2013 rcu_read_unlock();
1995 return err; 2014 return err;
1996 } 2015 }
1997 2016
1998 if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) 2017 read_lock(&mrt_lock);
2018 if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
1999 cache->mfc_flags |= MFC_NOTIFY; 2019 cache->mfc_flags |= MFC_NOTIFY;
2000 err = __ipmr_fill_mroute(mrt, skb, cache, rtm); 2020 err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
2001 read_unlock(&mrt_lock); 2021 read_unlock(&mrt_lock);
2022 rcu_read_unlock();
2002 return err; 2023 return err;
2003} 2024}
2004 2025
@@ -2050,14 +2071,14 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2050 s_h = cb->args[1]; 2071 s_h = cb->args[1];
2051 s_e = cb->args[2]; 2072 s_e = cb->args[2];
2052 2073
2053 read_lock(&mrt_lock); 2074 rcu_read_lock();
2054 ipmr_for_each_table(mrt, net) { 2075 ipmr_for_each_table(mrt, net) {
2055 if (t < s_t) 2076 if (t < s_t)
2056 goto next_table; 2077 goto next_table;
2057 if (t > s_t) 2078 if (t > s_t)
2058 s_h = 0; 2079 s_h = 0;
2059 for (h = s_h; h < MFC_LINES; h++) { 2080 for (h = s_h; h < MFC_LINES; h++) {
2060 list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) { 2081 list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
2061 if (e < s_e) 2082 if (e < s_e)
2062 goto next_entry; 2083 goto next_entry;
2063 if (ipmr_fill_mroute(mrt, skb, 2084 if (ipmr_fill_mroute(mrt, skb,
@@ -2075,7 +2096,7 @@ next_table:
2075 t++; 2096 t++;
2076 } 2097 }
2077done: 2098done:
2078 read_unlock(&mrt_lock); 2099 rcu_read_unlock();
2079 2100
2080 cb->args[2] = e; 2101 cb->args[2] = e;
2081 cb->args[1] = h; 2102 cb->args[1] = h;
@@ -2086,7 +2107,8 @@ done:
2086 2107
2087#ifdef CONFIG_PROC_FS 2108#ifdef CONFIG_PROC_FS
2088/* 2109/*
2089 * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif 2110 * The /proc interfaces to multicast routing :
2111 * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
2090 */ 2112 */
2091struct ipmr_vif_iter { 2113struct ipmr_vif_iter {
2092 struct seq_net_private p; 2114 struct seq_net_private p;
@@ -2208,14 +2230,14 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
2208 struct mr_table *mrt = it->mrt; 2230 struct mr_table *mrt = it->mrt;
2209 struct mfc_cache *mfc; 2231 struct mfc_cache *mfc;
2210 2232
2211 read_lock(&mrt_lock); 2233 rcu_read_lock();
2212 for (it->ct = 0; it->ct < MFC_LINES; it->ct++) { 2234 for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
2213 it->cache = &mrt->mfc_cache_array[it->ct]; 2235 it->cache = &mrt->mfc_cache_array[it->ct];
2214 list_for_each_entry(mfc, it->cache, list) 2236 list_for_each_entry_rcu(mfc, it->cache, list)
2215 if (pos-- == 0) 2237 if (pos-- == 0)
2216 return mfc; 2238 return mfc;
2217 } 2239 }
2218 read_unlock(&mrt_lock); 2240 rcu_read_unlock();
2219 2241
2220 spin_lock_bh(&mfc_unres_lock); 2242 spin_lock_bh(&mfc_unres_lock);
2221 it->cache = &mrt->mfc_unres_queue; 2243 it->cache = &mrt->mfc_unres_queue;
@@ -2274,7 +2296,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2274 } 2296 }
2275 2297
2276 /* exhausted cache_array, show unresolved */ 2298 /* exhausted cache_array, show unresolved */
2277 read_unlock(&mrt_lock); 2299 rcu_read_unlock();
2278 it->cache = &mrt->mfc_unres_queue; 2300 it->cache = &mrt->mfc_unres_queue;
2279 it->ct = 0; 2301 it->ct = 0;
2280 2302
@@ -2282,7 +2304,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2282 if (!list_empty(it->cache)) 2304 if (!list_empty(it->cache))
2283 return list_first_entry(it->cache, struct mfc_cache, list); 2305 return list_first_entry(it->cache, struct mfc_cache, list);
2284 2306
2285 end_of_list: 2307end_of_list:
2286 spin_unlock_bh(&mfc_unres_lock); 2308 spin_unlock_bh(&mfc_unres_lock);
2287 it->cache = NULL; 2309 it->cache = NULL;
2288 2310
@@ -2297,7 +2319,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
2297 if (it->cache == &mrt->mfc_unres_queue) 2319 if (it->cache == &mrt->mfc_unres_queue)
2298 spin_unlock_bh(&mfc_unres_lock); 2320 spin_unlock_bh(&mfc_unres_lock);
2299 else if (it->cache == &mrt->mfc_cache_array[it->ct]) 2321 else if (it->cache == &mrt->mfc_cache_array[it->ct])
2300 read_unlock(&mrt_lock); 2322 rcu_read_unlock();
2301} 2323}
2302 2324
2303static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) 2325static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
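
The /proc iterator hunks keep their structure, but the lock held across the ->start/->next/->stop cycle for the resolved cache is now rcu_read_lock(): the idx helper returns with the read-side section still open, ->next closes it when it falls through to the unresolved queue, and ->stop closes it if iteration ends inside the hash array. Below is a much simpler skeleton of an RCU-protected seq_file iterator (without the hand-off to a second, spinlock-protected queue that ipmr also does); the demo_* callbacks are placeholders and the actual entry walking is elided.

#include <linux/rcupdate.h>
#include <linux/seq_file.h>

static void *demo_seq_start(struct seq_file *seq, loff_t *pos)
{
	rcu_read_lock();		/* stays held between callbacks */
	return NULL;			/* return first entry, or NULL if empty */
}

static void *demo_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;			/* return next entry, or NULL when done */
}

static void demo_seq_stop(struct seq_file *seq, void *v)
{
	rcu_read_unlock();		/* matches demo_seq_start() */
}
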
@@ -2323,7 +2345,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
2323 mfc->mfc_un.res.bytes, 2345 mfc->mfc_un.res.bytes,
2324 mfc->mfc_un.res.wrong_if); 2346 mfc->mfc_un.res.wrong_if);
2325 for (n = mfc->mfc_un.res.minvif; 2347 for (n = mfc->mfc_un.res.minvif;
2326 n < mfc->mfc_un.res.maxvif; n++ ) { 2348 n < mfc->mfc_un.res.maxvif; n++) {
2327 if (VIF_EXISTS(mrt, n) && 2349 if (VIF_EXISTS(mrt, n) &&
2328 mfc->mfc_un.res.ttls[n] < 255) 2350 mfc->mfc_un.res.ttls[n] < 255)
2329 seq_printf(seq, 2351 seq_printf(seq,
@@ -2421,7 +2443,7 @@ int __init ip_mr_init(void)
2421 2443
2422 mrt_cachep = kmem_cache_create("ip_mrt_cache", 2444 mrt_cachep = kmem_cache_create("ip_mrt_cache",
2423 sizeof(struct mfc_cache), 2445 sizeof(struct mfc_cache),
2424 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, 2446 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
2425 NULL); 2447 NULL);
2426 if (!mrt_cachep) 2448 if (!mrt_cachep)
2427 return -ENOMEM; 2449 return -ENOMEM;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 1833bdbf9805..babd1a2bae5f 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -147,7 +147,7 @@ config IP_NF_TARGET_ULOG
147 which can only be viewed through syslog. 147 which can only be viewed through syslog.
148 148
149 The appropriate userspace logging daemon (ulogd) may be obtained from 149 The appropriate userspace logging daemon (ulogd) may be obtained from
150 <http://www.gnumonks.org/projects/ulogd/> 150 <http://www.netfilter.org/projects/ulogd/index.html>
151 151
152 To compile it as a module, choose M here. If unsure, say N. 152 To compile it as a module, choose M here. If unsure, say N.
153 153
@@ -324,10 +324,10 @@ config IP_NF_TARGET_ECN
324 324
325config IP_NF_TARGET_TTL 325config IP_NF_TARGET_TTL
326 tristate '"TTL" target support' 326 tristate '"TTL" target support'
327 depends on NETFILTER_ADVANCED 327 depends on NETFILTER_ADVANCED && IP_NF_MANGLE
328 select NETFILTER_XT_TARGET_HL 328 select NETFILTER_XT_TARGET_HL
329 ---help--- 329 ---help---
330 This is a backwards-compat option for the user's convenience 330 This is a backwards-compatible option for the user's convenience
331 (e.g. when running oldconfig). It selects 331 (e.g. when running oldconfig). It selects
332 CONFIG_NETFILTER_XT_TARGET_HL. 332 CONFIG_NETFILTER_XT_TARGET_HL.
333 333
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 6bccba31d132..3cad2591ace0 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -72,7 +72,7 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
72 for (i = 0; i < len; i++) 72 for (i = 0; i < len; i++)
73 ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i]; 73 ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i];
74 74
75 return (ret != 0); 75 return ret != 0;
76} 76}
77 77
78/* 78/*
@@ -228,7 +228,7 @@ arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
228 return NF_DROP; 228 return NF_DROP;
229} 229}
230 230
231static inline const struct arpt_entry_target * 231static inline const struct xt_entry_target *
232arpt_get_target_c(const struct arpt_entry *e) 232arpt_get_target_c(const struct arpt_entry *e)
233{ 233{
234 return arpt_get_target((struct arpt_entry *)e); 234 return arpt_get_target((struct arpt_entry *)e);
@@ -282,7 +282,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
282 282
283 arp = arp_hdr(skb); 283 arp = arp_hdr(skb);
284 do { 284 do {
285 const struct arpt_entry_target *t; 285 const struct xt_entry_target *t;
286 286
287 if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { 287 if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
288 e = arpt_next_entry(e); 288 e = arpt_next_entry(e);
@@ -297,10 +297,10 @@ unsigned int arpt_do_table(struct sk_buff *skb,
297 if (!t->u.kernel.target->target) { 297 if (!t->u.kernel.target->target) {
298 int v; 298 int v;
299 299
300 v = ((struct arpt_standard_target *)t)->verdict; 300 v = ((struct xt_standard_target *)t)->verdict;
301 if (v < 0) { 301 if (v < 0) {
302 /* Pop from stack? */ 302 /* Pop from stack? */
303 if (v != ARPT_RETURN) { 303 if (v != XT_RETURN) {
304 verdict = (unsigned)(-v) - 1; 304 verdict = (unsigned)(-v) - 1;
305 break; 305 break;
306 } 306 }
@@ -332,7 +332,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
332 /* Target might have changed stuff. */ 332 /* Target might have changed stuff. */
333 arp = arp_hdr(skb); 333 arp = arp_hdr(skb);
334 334
335 if (verdict == ARPT_CONTINUE) 335 if (verdict == XT_CONTINUE)
336 e = arpt_next_entry(e); 336 e = arpt_next_entry(e);
337 else 337 else
338 /* Verdict */ 338 /* Verdict */
@@ -377,7 +377,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
377 e->counters.pcnt = pos; 377 e->counters.pcnt = pos;
378 378
379 for (;;) { 379 for (;;) {
380 const struct arpt_standard_target *t 380 const struct xt_standard_target *t
381 = (void *)arpt_get_target_c(e); 381 = (void *)arpt_get_target_c(e);
382 int visited = e->comefrom & (1 << hook); 382 int visited = e->comefrom & (1 << hook);
383 383
@@ -392,13 +392,13 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
392 /* Unconditional return/END. */ 392 /* Unconditional return/END. */
393 if ((e->target_offset == sizeof(struct arpt_entry) && 393 if ((e->target_offset == sizeof(struct arpt_entry) &&
394 (strcmp(t->target.u.user.name, 394 (strcmp(t->target.u.user.name,
395 ARPT_STANDARD_TARGET) == 0) && 395 XT_STANDARD_TARGET) == 0) &&
396 t->verdict < 0 && unconditional(&e->arp)) || 396 t->verdict < 0 && unconditional(&e->arp)) ||
397 visited) { 397 visited) {
398 unsigned int oldpos, size; 398 unsigned int oldpos, size;
399 399
400 if ((strcmp(t->target.u.user.name, 400 if ((strcmp(t->target.u.user.name,
401 ARPT_STANDARD_TARGET) == 0) && 401 XT_STANDARD_TARGET) == 0) &&
402 t->verdict < -NF_MAX_VERDICT - 1) { 402 t->verdict < -NF_MAX_VERDICT - 1) {
403 duprintf("mark_source_chains: bad " 403 duprintf("mark_source_chains: bad "
404 "negative verdict (%i)\n", 404 "negative verdict (%i)\n",
@@ -433,7 +433,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
433 int newpos = t->verdict; 433 int newpos = t->verdict;
434 434
435 if (strcmp(t->target.u.user.name, 435 if (strcmp(t->target.u.user.name,
436 ARPT_STANDARD_TARGET) == 0 && 436 XT_STANDARD_TARGET) == 0 &&
437 newpos >= 0) { 437 newpos >= 0) {
438 if (newpos > newinfo->size - 438 if (newpos > newinfo->size -
439 sizeof(struct arpt_entry)) { 439 sizeof(struct arpt_entry)) {
@@ -464,14 +464,14 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
464 464
465static inline int check_entry(const struct arpt_entry *e, const char *name) 465static inline int check_entry(const struct arpt_entry *e, const char *name)
466{ 466{
467 const struct arpt_entry_target *t; 467 const struct xt_entry_target *t;
468 468
469 if (!arp_checkentry(&e->arp)) { 469 if (!arp_checkentry(&e->arp)) {
470 duprintf("arp_tables: arp check failed %p %s.\n", e, name); 470 duprintf("arp_tables: arp check failed %p %s.\n", e, name);
471 return -EINVAL; 471 return -EINVAL;
472 } 472 }
473 473
474 if (e->target_offset + sizeof(struct arpt_entry_target) > e->next_offset) 474 if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset)
475 return -EINVAL; 475 return -EINVAL;
476 476
477 t = arpt_get_target_c(e); 477 t = arpt_get_target_c(e);
@@ -483,7 +483,7 @@ static inline int check_entry(const struct arpt_entry *e, const char *name)
483 483
484static inline int check_target(struct arpt_entry *e, const char *name) 484static inline int check_target(struct arpt_entry *e, const char *name)
485{ 485{
486 struct arpt_entry_target *t = arpt_get_target(e); 486 struct xt_entry_target *t = arpt_get_target(e);
487 int ret; 487 int ret;
488 struct xt_tgchk_param par = { 488 struct xt_tgchk_param par = {
489 .table = name, 489 .table = name,
@@ -506,7 +506,7 @@ static inline int check_target(struct arpt_entry *e, const char *name)
506static inline int 506static inline int
507find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) 507find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
508{ 508{
509 struct arpt_entry_target *t; 509 struct xt_entry_target *t;
510 struct xt_target *target; 510 struct xt_target *target;
511 int ret; 511 int ret;
512 512
@@ -536,7 +536,7 @@ out:
536 536
537static bool check_underflow(const struct arpt_entry *e) 537static bool check_underflow(const struct arpt_entry *e)
538{ 538{
539 const struct arpt_entry_target *t; 539 const struct xt_entry_target *t;
540 unsigned int verdict; 540 unsigned int verdict;
541 541
542 if (!unconditional(&e->arp)) 542 if (!unconditional(&e->arp))
@@ -544,7 +544,7 @@ static bool check_underflow(const struct arpt_entry *e)
544 t = arpt_get_target_c(e); 544 t = arpt_get_target_c(e);
545 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) 545 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
546 return false; 546 return false;
547 verdict = ((struct arpt_standard_target *)t)->verdict; 547 verdict = ((struct xt_standard_target *)t)->verdict;
548 verdict = -verdict - 1; 548 verdict = -verdict - 1;
549 return verdict == NF_DROP || verdict == NF_ACCEPT; 549 return verdict == NF_DROP || verdict == NF_ACCEPT;
550} 550}
@@ -566,7 +566,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
566 } 566 }
567 567
568 if (e->next_offset 568 if (e->next_offset
569 < sizeof(struct arpt_entry) + sizeof(struct arpt_entry_target)) { 569 < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) {
570 duprintf("checking: element %p size %u\n", 570 duprintf("checking: element %p size %u\n",
571 e, e->next_offset); 571 e, e->next_offset);
572 return -EINVAL; 572 return -EINVAL;
@@ -598,7 +598,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
598static inline void cleanup_entry(struct arpt_entry *e) 598static inline void cleanup_entry(struct arpt_entry *e)
599{ 599{
600 struct xt_tgdtor_param par; 600 struct xt_tgdtor_param par;
601 struct arpt_entry_target *t; 601 struct xt_entry_target *t;
602 602
603 t = arpt_get_target(e); 603 t = arpt_get_target(e);
604 par.target = t->u.kernel.target; 604 par.target = t->u.kernel.target;
@@ -735,6 +735,7 @@ static void get_counters(const struct xt_table_info *t,
735 if (cpu == curcpu) 735 if (cpu == curcpu)
736 continue; 736 continue;
737 i = 0; 737 i = 0;
738 local_bh_disable();
738 xt_info_wrlock(cpu); 739 xt_info_wrlock(cpu);
739 xt_entry_foreach(iter, t->entries[cpu], t->size) { 740 xt_entry_foreach(iter, t->entries[cpu], t->size) {
740 ADD_COUNTER(counters[i], iter->counters.bcnt, 741 ADD_COUNTER(counters[i], iter->counters.bcnt,
@@ -742,6 +743,7 @@ static void get_counters(const struct xt_table_info *t,
742 ++i; 743 ++i;
743 } 744 }
744 xt_info_wrunlock(cpu); 745 xt_info_wrunlock(cpu);
746 local_bh_enable();
745 } 747 }
746 put_cpu(); 748 put_cpu();
747} 749}
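
The get_counters() change wraps the remote-CPU counter lock in an explicit local_bh_disable()/local_bh_enable() pair. Without asserting the exact scenario the kernel fix guards against, the sketch below shows the general shape of summing per-cpu counters with bottom halves kept off while a counter lock is held; the demo_* names are invented, the per-cpu locks are assumed to be spin_lock_init()ed at module init, and spin_lock_bh() would normally be the shorthand for the explicit pairing.

#include <linux/types.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>

struct demo_counter {
	spinlock_t lock;
	u64 packets;
	u64 bytes;
};

static DEFINE_PER_CPU(struct demo_counter, demo_counters);

static void demo_sum(u64 *packets, u64 *bytes)
{
	int cpu;

	*packets = 0;
	*bytes = 0;
	for_each_possible_cpu(cpu) {
		struct demo_counter *c = &per_cpu(demo_counters, cpu);

		local_bh_disable();	/* softirqs also touch these counters */
		spin_lock(&c->lock);
		*packets += c->packets;
		*bytes += c->bytes;
		spin_unlock(&c->lock);
		local_bh_enable();
	}
}
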
@@ -792,7 +794,7 @@ static int copy_entries_to_user(unsigned int total_size,
792 /* FIXME: use iterator macros --RR */ 794 /* FIXME: use iterator macros --RR */
793 /* ... then go back and fix counters and names */ 795 /* ... then go back and fix counters and names */
794 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ 796 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
795 const struct arpt_entry_target *t; 797 const struct xt_entry_target *t;
796 798
797 e = (struct arpt_entry *)(loc_cpu_entry + off); 799 e = (struct arpt_entry *)(loc_cpu_entry + off);
798 if (copy_to_user(userptr + off 800 if (copy_to_user(userptr + off
@@ -805,7 +807,7 @@ static int copy_entries_to_user(unsigned int total_size,
805 807
806 t = arpt_get_target_c(e); 808 t = arpt_get_target_c(e);
807 if (copy_to_user(userptr + off + e->target_offset 809 if (copy_to_user(userptr + off + e->target_offset
808 + offsetof(struct arpt_entry_target, 810 + offsetof(struct xt_entry_target,
809 u.user.name), 811 u.user.name),
810 t->u.kernel.target->name, 812 t->u.kernel.target->name,
811 strlen(t->u.kernel.target->name)+1) != 0) { 813 strlen(t->u.kernel.target->name)+1) != 0) {
@@ -842,7 +844,7 @@ static int compat_calc_entry(const struct arpt_entry *e,
842 const struct xt_table_info *info, 844 const struct xt_table_info *info,
843 const void *base, struct xt_table_info *newinfo) 845 const void *base, struct xt_table_info *newinfo)
844{ 846{
845 const struct arpt_entry_target *t; 847 const struct xt_entry_target *t;
846 unsigned int entry_offset; 848 unsigned int entry_offset;
847 int off, i, ret; 849 int off, i, ret;
848 850
@@ -893,7 +895,7 @@ static int compat_table_info(const struct xt_table_info *info,
893static int get_info(struct net *net, void __user *user, 895static int get_info(struct net *net, void __user *user,
894 const int *len, int compat) 896 const int *len, int compat)
895{ 897{
896 char name[ARPT_TABLE_MAXNAMELEN]; 898 char name[XT_TABLE_MAXNAMELEN];
897 struct xt_table *t; 899 struct xt_table *t;
898 int ret; 900 int ret;
899 901
@@ -906,7 +908,7 @@ static int get_info(struct net *net, void __user *user,
906 if (copy_from_user(name, user, sizeof(name)) != 0) 908 if (copy_from_user(name, user, sizeof(name)) != 0)
907 return -EFAULT; 909 return -EFAULT;
908 910
909 name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; 911 name[XT_TABLE_MAXNAMELEN-1] = '\0';
910#ifdef CONFIG_COMPAT 912#ifdef CONFIG_COMPAT
911 if (compat) 913 if (compat)
912 xt_compat_lock(NFPROTO_ARP); 914 xt_compat_lock(NFPROTO_ARP);
@@ -1202,7 +1204,7 @@ static int do_add_counters(struct net *net, const void __user *user,
1202#ifdef CONFIG_COMPAT 1204#ifdef CONFIG_COMPAT
1203static inline void compat_release_entry(struct compat_arpt_entry *e) 1205static inline void compat_release_entry(struct compat_arpt_entry *e)
1204{ 1206{
1205 struct arpt_entry_target *t; 1207 struct xt_entry_target *t;
1206 1208
1207 t = compat_arpt_get_target(e); 1209 t = compat_arpt_get_target(e);
1208 module_put(t->u.kernel.target->me); 1210 module_put(t->u.kernel.target->me);
@@ -1218,7 +1220,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
1218 const unsigned int *underflows, 1220 const unsigned int *underflows,
1219 const char *name) 1221 const char *name)
1220{ 1222{
1221 struct arpt_entry_target *t; 1223 struct xt_entry_target *t;
1222 struct xt_target *target; 1224 struct xt_target *target;
1223 unsigned int entry_offset; 1225 unsigned int entry_offset;
1224 int ret, off, h; 1226 int ret, off, h;
@@ -1286,7 +1288,7 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
1286 unsigned int *size, const char *name, 1288 unsigned int *size, const char *name,
1287 struct xt_table_info *newinfo, unsigned char *base) 1289 struct xt_table_info *newinfo, unsigned char *base)
1288{ 1290{
1289 struct arpt_entry_target *t; 1291 struct xt_entry_target *t;
1290 struct xt_target *target; 1292 struct xt_target *target;
1291 struct arpt_entry *de; 1293 struct arpt_entry *de;
1292 unsigned int origsize; 1294 unsigned int origsize;
@@ -1418,6 +1420,9 @@ static int translate_compat_table(const char *name,
1418 if (ret != 0) 1420 if (ret != 0)
1419 break; 1421 break;
1420 ++i; 1422 ++i;
1423 if (strcmp(arpt_get_target(iter1)->u.user.name,
1424 XT_ERROR_TARGET) == 0)
1425 ++newinfo->stacksize;
1421 } 1426 }
1422 if (ret) { 1427 if (ret) {
1423 /* 1428 /*
@@ -1469,7 +1474,7 @@ out_unlock:
1469} 1474}
1470 1475
1471struct compat_arpt_replace { 1476struct compat_arpt_replace {
1472 char name[ARPT_TABLE_MAXNAMELEN]; 1477 char name[XT_TABLE_MAXNAMELEN];
1473 u32 valid_hooks; 1478 u32 valid_hooks;
1474 u32 num_entries; 1479 u32 num_entries;
1475 u32 size; 1480 u32 size;
@@ -1562,7 +1567,7 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
1562 struct xt_counters *counters, 1567 struct xt_counters *counters,
1563 unsigned int i) 1568 unsigned int i)
1564{ 1569{
1565 struct arpt_entry_target *t; 1570 struct xt_entry_target *t;
1566 struct compat_arpt_entry __user *ce; 1571 struct compat_arpt_entry __user *ce;
1567 u_int16_t target_offset, next_offset; 1572 u_int16_t target_offset, next_offset;
1568 compat_uint_t origsize; 1573 compat_uint_t origsize;
@@ -1623,7 +1628,7 @@ static int compat_copy_entries_to_user(unsigned int total_size,
1623} 1628}
1624 1629
1625struct compat_arpt_get_entries { 1630struct compat_arpt_get_entries {
1626 char name[ARPT_TABLE_MAXNAMELEN]; 1631 char name[XT_TABLE_MAXNAMELEN];
1627 compat_uint_t size; 1632 compat_uint_t size;
1628 struct compat_arpt_entry entrytable[0]; 1633 struct compat_arpt_entry entrytable[0];
1629}; 1634};
@@ -1823,7 +1828,7 @@ void arpt_unregister_table(struct xt_table *table)
1823/* The built-in targets: standard (NULL) and error. */ 1828/* The built-in targets: standard (NULL) and error. */
1824static struct xt_target arpt_builtin_tg[] __read_mostly = { 1829static struct xt_target arpt_builtin_tg[] __read_mostly = {
1825 { 1830 {
1826 .name = ARPT_STANDARD_TARGET, 1831 .name = XT_STANDARD_TARGET,
1827 .targetsize = sizeof(int), 1832 .targetsize = sizeof(int),
1828 .family = NFPROTO_ARP, 1833 .family = NFPROTO_ARP,
1829#ifdef CONFIG_COMPAT 1834#ifdef CONFIG_COMPAT
@@ -1833,9 +1838,9 @@ static struct xt_target arpt_builtin_tg[] __read_mostly = {
1833#endif 1838#endif
1834 }, 1839 },
1835 { 1840 {
1836 .name = ARPT_ERROR_TARGET, 1841 .name = XT_ERROR_TARGET,
1837 .target = arpt_error, 1842 .target = arpt_error,
1838 .targetsize = ARPT_FUNCTION_MAXNAMELEN, 1843 .targetsize = XT_FUNCTION_MAXNAMELEN,
1839 .family = NFPROTO_ARP, 1844 .family = NFPROTO_ARP,
1840 }, 1845 },
1841}; 1846};
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index e1be7dd1171b..b8ddcc480ed9 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -63,7 +63,7 @@ static int checkentry(const struct xt_tgchk_param *par)
63 return false; 63 return false;
64 64
65 if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT && 65 if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
66 mangle->target != ARPT_CONTINUE) 66 mangle->target != XT_CONTINUE)
67 return false; 67 return false;
68 return true; 68 return true;
69} 69}
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index c439721b165a..d31b007a6d80 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -186,7 +186,7 @@ static inline bool unconditional(const struct ipt_ip *ip)
186} 186}
187 187
188/* for const-correctness */ 188/* for const-correctness */
189static inline const struct ipt_entry_target * 189static inline const struct xt_entry_target *
190ipt_get_target_c(const struct ipt_entry *e) 190ipt_get_target_c(const struct ipt_entry *e)
191{ 191{
192 return ipt_get_target((struct ipt_entry *)e); 192 return ipt_get_target((struct ipt_entry *)e);
@@ -230,9 +230,9 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
230 const char *hookname, const char **chainname, 230 const char *hookname, const char **chainname,
231 const char **comment, unsigned int *rulenum) 231 const char **comment, unsigned int *rulenum)
232{ 232{
233 const struct ipt_standard_target *t = (void *)ipt_get_target_c(s); 233 const struct xt_standard_target *t = (void *)ipt_get_target_c(s);
234 234
235 if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) { 235 if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
236 /* Head of user chain: ERROR target with chainname */ 236 /* Head of user chain: ERROR target with chainname */
237 *chainname = t->target.data; 237 *chainname = t->target.data;
238 (*rulenum) = 0; 238 (*rulenum) = 0;
@@ -241,7 +241,7 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
241 241
242 if (s->target_offset == sizeof(struct ipt_entry) && 242 if (s->target_offset == sizeof(struct ipt_entry) &&
243 strcmp(t->target.u.kernel.target->name, 243 strcmp(t->target.u.kernel.target->name,
244 IPT_STANDARD_TARGET) == 0 && 244 XT_STANDARD_TARGET) == 0 &&
245 t->verdict < 0 && 245 t->verdict < 0 &&
246 unconditional(&s->ip)) { 246 unconditional(&s->ip)) {
247 /* Tail of chains: STANDARD target (return/policy) */ 247 /* Tail of chains: STANDARD target (return/policy) */
@@ -346,7 +346,7 @@ ipt_do_table(struct sk_buff *skb,
346 get_entry(table_base, private->underflow[hook])); 346 get_entry(table_base, private->underflow[hook]));
347 347
348 do { 348 do {
349 const struct ipt_entry_target *t; 349 const struct xt_entry_target *t;
350 const struct xt_entry_match *ematch; 350 const struct xt_entry_match *ematch;
351 351
352 IP_NF_ASSERT(e); 352 IP_NF_ASSERT(e);
@@ -380,10 +380,10 @@ ipt_do_table(struct sk_buff *skb,
380 if (!t->u.kernel.target->target) { 380 if (!t->u.kernel.target->target) {
381 int v; 381 int v;
382 382
383 v = ((struct ipt_standard_target *)t)->verdict; 383 v = ((struct xt_standard_target *)t)->verdict;
384 if (v < 0) { 384 if (v < 0) {
385 /* Pop from stack? */ 385 /* Pop from stack? */
386 if (v != IPT_RETURN) { 386 if (v != XT_RETURN) {
387 verdict = (unsigned)(-v) - 1; 387 verdict = (unsigned)(-v) - 1;
388 break; 388 break;
389 } 389 }
@@ -421,7 +421,7 @@ ipt_do_table(struct sk_buff *skb,
421 verdict = t->u.kernel.target->target(skb, &acpar); 421 verdict = t->u.kernel.target->target(skb, &acpar);
422 /* Target might have changed stuff. */ 422 /* Target might have changed stuff. */
423 ip = ip_hdr(skb); 423 ip = ip_hdr(skb);
424 if (verdict == IPT_CONTINUE) 424 if (verdict == XT_CONTINUE)
425 e = ipt_next_entry(e); 425 e = ipt_next_entry(e);
426 else 426 else
427 /* Verdict */ 427 /* Verdict */
@@ -461,7 +461,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
461 e->counters.pcnt = pos; 461 e->counters.pcnt = pos;
462 462
463 for (;;) { 463 for (;;) {
464 const struct ipt_standard_target *t 464 const struct xt_standard_target *t
465 = (void *)ipt_get_target_c(e); 465 = (void *)ipt_get_target_c(e);
466 int visited = e->comefrom & (1 << hook); 466 int visited = e->comefrom & (1 << hook);
467 467
@@ -475,13 +475,13 @@ mark_source_chains(const struct xt_table_info *newinfo,
475 /* Unconditional return/END. */ 475 /* Unconditional return/END. */
476 if ((e->target_offset == sizeof(struct ipt_entry) && 476 if ((e->target_offset == sizeof(struct ipt_entry) &&
477 (strcmp(t->target.u.user.name, 477 (strcmp(t->target.u.user.name,
478 IPT_STANDARD_TARGET) == 0) && 478 XT_STANDARD_TARGET) == 0) &&
479 t->verdict < 0 && unconditional(&e->ip)) || 479 t->verdict < 0 && unconditional(&e->ip)) ||
480 visited) { 480 visited) {
481 unsigned int oldpos, size; 481 unsigned int oldpos, size;
482 482
483 if ((strcmp(t->target.u.user.name, 483 if ((strcmp(t->target.u.user.name,
484 IPT_STANDARD_TARGET) == 0) && 484 XT_STANDARD_TARGET) == 0) &&
485 t->verdict < -NF_MAX_VERDICT - 1) { 485 t->verdict < -NF_MAX_VERDICT - 1) {
486 duprintf("mark_source_chains: bad " 486 duprintf("mark_source_chains: bad "
487 "negative verdict (%i)\n", 487 "negative verdict (%i)\n",
@@ -524,7 +524,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
524 int newpos = t->verdict; 524 int newpos = t->verdict;
525 525
526 if (strcmp(t->target.u.user.name, 526 if (strcmp(t->target.u.user.name,
527 IPT_STANDARD_TARGET) == 0 && 527 XT_STANDARD_TARGET) == 0 &&
528 newpos >= 0) { 528 newpos >= 0) {
529 if (newpos > newinfo->size - 529 if (newpos > newinfo->size -
530 sizeof(struct ipt_entry)) { 530 sizeof(struct ipt_entry)) {
@@ -552,7 +552,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
552 return 1; 552 return 1;
553} 553}
554 554
555static void cleanup_match(struct ipt_entry_match *m, struct net *net) 555static void cleanup_match(struct xt_entry_match *m, struct net *net)
556{ 556{
557 struct xt_mtdtor_param par; 557 struct xt_mtdtor_param par;
558 558
@@ -568,14 +568,14 @@ static void cleanup_match(struct ipt_entry_match *m, struct net *net)
568static int 568static int
569check_entry(const struct ipt_entry *e, const char *name) 569check_entry(const struct ipt_entry *e, const char *name)
570{ 570{
571 const struct ipt_entry_target *t; 571 const struct xt_entry_target *t;
572 572
573 if (!ip_checkentry(&e->ip)) { 573 if (!ip_checkentry(&e->ip)) {
574 duprintf("ip check failed %p %s.\n", e, par->match->name); 574 duprintf("ip check failed %p %s.\n", e, par->match->name);
575 return -EINVAL; 575 return -EINVAL;
576 } 576 }
577 577
578 if (e->target_offset + sizeof(struct ipt_entry_target) > 578 if (e->target_offset + sizeof(struct xt_entry_target) >
579 e->next_offset) 579 e->next_offset)
580 return -EINVAL; 580 return -EINVAL;
581 581
@@ -587,7 +587,7 @@ check_entry(const struct ipt_entry *e, const char *name)
587} 587}
588 588
589static int 589static int
590check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par) 590check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
591{ 591{
592 const struct ipt_ip *ip = par->entryinfo; 592 const struct ipt_ip *ip = par->entryinfo;
593 int ret; 593 int ret;
@@ -605,7 +605,7 @@ check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
605} 605}
606 606
607static int 607static int
608find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par) 608find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
609{ 609{
610 struct xt_match *match; 610 struct xt_match *match;
611 int ret; 611 int ret;
@@ -630,7 +630,7 @@ err:
630 630
631static int check_target(struct ipt_entry *e, struct net *net, const char *name) 631static int check_target(struct ipt_entry *e, struct net *net, const char *name)
632{ 632{
633 struct ipt_entry_target *t = ipt_get_target(e); 633 struct xt_entry_target *t = ipt_get_target(e);
634 struct xt_tgchk_param par = { 634 struct xt_tgchk_param par = {
635 .net = net, 635 .net = net,
636 .table = name, 636 .table = name,
@@ -656,7 +656,7 @@ static int
656find_check_entry(struct ipt_entry *e, struct net *net, const char *name, 656find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
657 unsigned int size) 657 unsigned int size)
658{ 658{
659 struct ipt_entry_target *t; 659 struct xt_entry_target *t;
660 struct xt_target *target; 660 struct xt_target *target;
661 int ret; 661 int ret;
662 unsigned int j; 662 unsigned int j;
@@ -707,7 +707,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
707 707
708static bool check_underflow(const struct ipt_entry *e) 708static bool check_underflow(const struct ipt_entry *e)
709{ 709{
710 const struct ipt_entry_target *t; 710 const struct xt_entry_target *t;
711 unsigned int verdict; 711 unsigned int verdict;
712 712
713 if (!unconditional(&e->ip)) 713 if (!unconditional(&e->ip))
@@ -715,7 +715,7 @@ static bool check_underflow(const struct ipt_entry *e)
715 t = ipt_get_target_c(e); 715 t = ipt_get_target_c(e);
716 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) 716 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
717 return false; 717 return false;
718 verdict = ((struct ipt_standard_target *)t)->verdict; 718 verdict = ((struct xt_standard_target *)t)->verdict;
719 verdict = -verdict - 1; 719 verdict = -verdict - 1;
720 return verdict == NF_DROP || verdict == NF_ACCEPT; 720 return verdict == NF_DROP || verdict == NF_ACCEPT;
721} 721}
@@ -738,7 +738,7 @@ check_entry_size_and_hooks(struct ipt_entry *e,
738 } 738 }
739 739
740 if (e->next_offset 740 if (e->next_offset
741 < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) { 741 < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) {
742 duprintf("checking: element %p size %u\n", 742 duprintf("checking: element %p size %u\n",
743 e, e->next_offset); 743 e, e->next_offset);
744 return -EINVAL; 744 return -EINVAL;
@@ -771,7 +771,7 @@ static void
771cleanup_entry(struct ipt_entry *e, struct net *net) 771cleanup_entry(struct ipt_entry *e, struct net *net)
772{ 772{
773 struct xt_tgdtor_param par; 773 struct xt_tgdtor_param par;
774 struct ipt_entry_target *t; 774 struct xt_entry_target *t;
775 struct xt_entry_match *ematch; 775 struct xt_entry_match *ematch;
776 776
777 /* Cleanup all matches */ 777 /* Cleanup all matches */
@@ -909,6 +909,7 @@ get_counters(const struct xt_table_info *t,
909 if (cpu == curcpu) 909 if (cpu == curcpu)
910 continue; 910 continue;
911 i = 0; 911 i = 0;
912 local_bh_disable();
912 xt_info_wrlock(cpu); 913 xt_info_wrlock(cpu);
913 xt_entry_foreach(iter, t->entries[cpu], t->size) { 914 xt_entry_foreach(iter, t->entries[cpu], t->size) {
914 ADD_COUNTER(counters[i], iter->counters.bcnt, 915 ADD_COUNTER(counters[i], iter->counters.bcnt,
@@ -916,6 +917,7 @@ get_counters(const struct xt_table_info *t,
916 ++i; /* macro does multi eval of i */ 917 ++i; /* macro does multi eval of i */
917 } 918 }
918 xt_info_wrunlock(cpu); 919 xt_info_wrunlock(cpu);
920 local_bh_enable();
919 } 921 }
920 put_cpu(); 922 put_cpu();
921} 923}
@@ -970,8 +972,8 @@ copy_entries_to_user(unsigned int total_size,
970 /* ... then go back and fix counters and names */ 972 /* ... then go back and fix counters and names */
971 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ 973 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
972 unsigned int i; 974 unsigned int i;
973 const struct ipt_entry_match *m; 975 const struct xt_entry_match *m;
974 const struct ipt_entry_target *t; 976 const struct xt_entry_target *t;
975 977
976 e = (struct ipt_entry *)(loc_cpu_entry + off); 978 e = (struct ipt_entry *)(loc_cpu_entry + off);
977 if (copy_to_user(userptr + off 979 if (copy_to_user(userptr + off
@@ -988,7 +990,7 @@ copy_entries_to_user(unsigned int total_size,
988 m = (void *)e + i; 990 m = (void *)e + i;
989 991
990 if (copy_to_user(userptr + off + i 992 if (copy_to_user(userptr + off + i
991 + offsetof(struct ipt_entry_match, 993 + offsetof(struct xt_entry_match,
992 u.user.name), 994 u.user.name),
993 m->u.kernel.match->name, 995 m->u.kernel.match->name,
994 strlen(m->u.kernel.match->name)+1) 996 strlen(m->u.kernel.match->name)+1)
@@ -1000,7 +1002,7 @@ copy_entries_to_user(unsigned int total_size,
1000 1002
1001 t = ipt_get_target_c(e); 1003 t = ipt_get_target_c(e);
1002 if (copy_to_user(userptr + off + e->target_offset 1004 if (copy_to_user(userptr + off + e->target_offset
1003 + offsetof(struct ipt_entry_target, 1005 + offsetof(struct xt_entry_target,
1004 u.user.name), 1006 u.user.name),
1005 t->u.kernel.target->name, 1007 t->u.kernel.target->name,
1006 strlen(t->u.kernel.target->name)+1) != 0) { 1008 strlen(t->u.kernel.target->name)+1) != 0) {
@@ -1038,7 +1040,7 @@ static int compat_calc_entry(const struct ipt_entry *e,
1038 const void *base, struct xt_table_info *newinfo) 1040 const void *base, struct xt_table_info *newinfo)
1039{ 1041{
1040 const struct xt_entry_match *ematch; 1042 const struct xt_entry_match *ematch;
1041 const struct ipt_entry_target *t; 1043 const struct xt_entry_target *t;
1042 unsigned int entry_offset; 1044 unsigned int entry_offset;
1043 int off, i, ret; 1045 int off, i, ret;
1044 1046
@@ -1090,7 +1092,7 @@ static int compat_table_info(const struct xt_table_info *info,
1090static int get_info(struct net *net, void __user *user, 1092static int get_info(struct net *net, void __user *user,
1091 const int *len, int compat) 1093 const int *len, int compat)
1092{ 1094{
1093 char name[IPT_TABLE_MAXNAMELEN]; 1095 char name[XT_TABLE_MAXNAMELEN];
1094 struct xt_table *t; 1096 struct xt_table *t;
1095 int ret; 1097 int ret;
1096 1098
@@ -1103,7 +1105,7 @@ static int get_info(struct net *net, void __user *user,
1103 if (copy_from_user(name, user, sizeof(name)) != 0) 1105 if (copy_from_user(name, user, sizeof(name)) != 0)
1104 return -EFAULT; 1106 return -EFAULT;
1105 1107
1106 name[IPT_TABLE_MAXNAMELEN-1] = '\0'; 1108 name[XT_TABLE_MAXNAMELEN-1] = '\0';
1107#ifdef CONFIG_COMPAT 1109#ifdef CONFIG_COMPAT
1108 if (compat) 1110 if (compat)
1109 xt_compat_lock(AF_INET); 1111 xt_compat_lock(AF_INET);
@@ -1398,14 +1400,14 @@ do_add_counters(struct net *net, const void __user *user,
1398 1400
1399#ifdef CONFIG_COMPAT 1401#ifdef CONFIG_COMPAT
1400struct compat_ipt_replace { 1402struct compat_ipt_replace {
1401 char name[IPT_TABLE_MAXNAMELEN]; 1403 char name[XT_TABLE_MAXNAMELEN];
1402 u32 valid_hooks; 1404 u32 valid_hooks;
1403 u32 num_entries; 1405 u32 num_entries;
1404 u32 size; 1406 u32 size;
1405 u32 hook_entry[NF_INET_NUMHOOKS]; 1407 u32 hook_entry[NF_INET_NUMHOOKS];
1406 u32 underflow[NF_INET_NUMHOOKS]; 1408 u32 underflow[NF_INET_NUMHOOKS];
1407 u32 num_counters; 1409 u32 num_counters;
1408 compat_uptr_t counters; /* struct ipt_counters * */ 1410 compat_uptr_t counters; /* struct xt_counters * */
1409 struct compat_ipt_entry entries[0]; 1411 struct compat_ipt_entry entries[0];
1410}; 1412};
1411 1413
@@ -1414,7 +1416,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
1414 unsigned int *size, struct xt_counters *counters, 1416 unsigned int *size, struct xt_counters *counters,
1415 unsigned int i) 1417 unsigned int i)
1416{ 1418{
1417 struct ipt_entry_target *t; 1419 struct xt_entry_target *t;
1418 struct compat_ipt_entry __user *ce; 1420 struct compat_ipt_entry __user *ce;
1419 u_int16_t target_offset, next_offset; 1421 u_int16_t target_offset, next_offset;
1420 compat_uint_t origsize; 1422 compat_uint_t origsize;
@@ -1449,7 +1451,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
1449} 1451}
1450 1452
1451static int 1453static int
1452compat_find_calc_match(struct ipt_entry_match *m, 1454compat_find_calc_match(struct xt_entry_match *m,
1453 const char *name, 1455 const char *name,
1454 const struct ipt_ip *ip, 1456 const struct ipt_ip *ip,
1455 unsigned int hookmask, 1457 unsigned int hookmask,
@@ -1471,7 +1473,7 @@ compat_find_calc_match(struct ipt_entry_match *m,
1471 1473
1472static void compat_release_entry(struct compat_ipt_entry *e) 1474static void compat_release_entry(struct compat_ipt_entry *e)
1473{ 1475{
1474 struct ipt_entry_target *t; 1476 struct xt_entry_target *t;
1475 struct xt_entry_match *ematch; 1477 struct xt_entry_match *ematch;
1476 1478
1477 /* Cleanup all matches */ 1479 /* Cleanup all matches */
@@ -1492,7 +1494,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
1492 const char *name) 1494 const char *name)
1493{ 1495{
1494 struct xt_entry_match *ematch; 1496 struct xt_entry_match *ematch;
1495 struct ipt_entry_target *t; 1497 struct xt_entry_target *t;
1496 struct xt_target *target; 1498 struct xt_target *target;
1497 unsigned int entry_offset; 1499 unsigned int entry_offset;
1498 unsigned int j; 1500 unsigned int j;
@@ -1574,7 +1576,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
1574 unsigned int *size, const char *name, 1576 unsigned int *size, const char *name,
1575 struct xt_table_info *newinfo, unsigned char *base) 1577 struct xt_table_info *newinfo, unsigned char *base)
1576{ 1578{
1577 struct ipt_entry_target *t; 1579 struct xt_entry_target *t;
1578 struct xt_target *target; 1580 struct xt_target *target;
1579 struct ipt_entry *de; 1581 struct ipt_entry *de;
1580 unsigned int origsize; 1582 unsigned int origsize;
@@ -1749,6 +1751,9 @@ translate_compat_table(struct net *net,
1749 if (ret != 0) 1751 if (ret != 0)
1750 break; 1752 break;
1751 ++i; 1753 ++i;
1754 if (strcmp(ipt_get_target(iter1)->u.user.name,
1755 XT_ERROR_TARGET) == 0)
1756 ++newinfo->stacksize;
1752 } 1757 }
1753 if (ret) { 1758 if (ret) {
1754 /* 1759 /*
@@ -1879,7 +1884,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
1879} 1884}
1880 1885
1881struct compat_ipt_get_entries { 1886struct compat_ipt_get_entries {
1882 char name[IPT_TABLE_MAXNAMELEN]; 1887 char name[XT_TABLE_MAXNAMELEN];
1883 compat_uint_t size; 1888 compat_uint_t size;
1884 struct compat_ipt_entry entrytable[0]; 1889 struct compat_ipt_entry entrytable[0];
1885}; 1890};
@@ -2034,7 +2039,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2034 2039
2035 case IPT_SO_GET_REVISION_MATCH: 2040 case IPT_SO_GET_REVISION_MATCH:
2036 case IPT_SO_GET_REVISION_TARGET: { 2041 case IPT_SO_GET_REVISION_TARGET: {
2037 struct ipt_get_revision rev; 2042 struct xt_get_revision rev;
2038 int target; 2043 int target;
2039 2044
2040 if (*len != sizeof(rev)) { 2045 if (*len != sizeof(rev)) {
@@ -2171,7 +2176,7 @@ static int icmp_checkentry(const struct xt_mtchk_param *par)
2171 2176
2172static struct xt_target ipt_builtin_tg[] __read_mostly = { 2177static struct xt_target ipt_builtin_tg[] __read_mostly = {
2173 { 2178 {
2174 .name = IPT_STANDARD_TARGET, 2179 .name = XT_STANDARD_TARGET,
2175 .targetsize = sizeof(int), 2180 .targetsize = sizeof(int),
2176 .family = NFPROTO_IPV4, 2181 .family = NFPROTO_IPV4,
2177#ifdef CONFIG_COMPAT 2182#ifdef CONFIG_COMPAT
@@ -2181,9 +2186,9 @@ static struct xt_target ipt_builtin_tg[] __read_mostly = {
2181#endif 2186#endif
2182 }, 2187 },
2183 { 2188 {
2184 .name = IPT_ERROR_TARGET, 2189 .name = XT_ERROR_TARGET,
2185 .target = ipt_error, 2190 .target = ipt_error,
2186 .targetsize = IPT_FUNCTION_MAXNAMELEN, 2191 .targetsize = XT_FUNCTION_MAXNAMELEN,
2187 .family = NFPROTO_IPV4, 2192 .family = NFPROTO_IPV4,
2188 }, 2193 },
2189}; 2194};
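
The ip_tables.c hunks above are mostly a mechanical rename of the IPv4-private names (ipt_entry_target, ipt_standard_target, IPT_STANDARD_TARGET, IPT_ERROR_TARGET, IPT_CONTINUE, IPT_RETURN, IPT_TABLE_MAXNAMELEN) to the shared xt_* equivalents, plus two behavioural fixes: get_counters() now disables bottom halves around xt_info_wrlock(), and translate_compat_table() counts XT_ERROR_TARGET entries (the markers at the head of user-defined chains) toward newinfo->stacksize so compat-loaded rulesets get a jump stack deep enough for their chains. The sketch below is illustrative only (the helper name is invented); it shows how a verdict is read out of a rule's standard target, the layout the renamed structures describe and that check_underflow() above relies on.

/* Illustrative sketch, not part of the patch: the target record sits
 * target_offset bytes past the entry; a standard target's verdict is
 * either a jump offset (>= 0) or a built-in verdict encoded as
 * -verdict - 1, cf. check_underflow() above. */
static int example_builtin_verdict(const struct ipt_entry *e)
{
        const struct xt_standard_target *t =
                (const void *)((const char *)e + e->target_offset);

        if (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) != 0)
                return -1;              /* not a standard target */
        if (t->verdict >= 0)
                return -1;              /* a jump within the table, not a verdict */
        return -t->verdict - 1;         /* NF_ACCEPT, NF_DROP, ... */
}
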
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 3a43cf36db87..1e26a4897655 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -29,6 +29,7 @@
29#include <net/netfilter/nf_conntrack.h> 29#include <net/netfilter/nf_conntrack.h>
30#include <net/net_namespace.h> 30#include <net/net_namespace.h>
31#include <net/checksum.h> 31#include <net/checksum.h>
32#include <net/ip.h>
32 33
33#define CLUSTERIP_VERSION "0.8" 34#define CLUSTERIP_VERSION "0.8"
34 35
@@ -231,24 +232,22 @@ clusterip_hashfn(const struct sk_buff *skb,
231{ 232{
232 const struct iphdr *iph = ip_hdr(skb); 233 const struct iphdr *iph = ip_hdr(skb);
233 unsigned long hashval; 234 unsigned long hashval;
234 u_int16_t sport, dport; 235 u_int16_t sport = 0, dport = 0;
235 const u_int16_t *ports; 236 int poff;
236 237
237 switch (iph->protocol) { 238 poff = proto_ports_offset(iph->protocol);
238 case IPPROTO_TCP: 239 if (poff >= 0) {
239 case IPPROTO_UDP: 240 const u_int16_t *ports;
240 case IPPROTO_UDPLITE: 241 u16 _ports[2];
241 case IPPROTO_SCTP: 242
242 case IPPROTO_DCCP: 243 ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports);
243 case IPPROTO_ICMP: 244 if (ports) {
244 ports = (const void *)iph+iph->ihl*4; 245 sport = ports[0];
245 sport = ports[0]; 246 dport = ports[1];
246 dport = ports[1]; 247 }
247 break; 248 } else {
248 default:
249 if (net_ratelimit()) 249 if (net_ratelimit())
250 pr_info("unknown protocol %u\n", iph->protocol); 250 pr_info("unknown protocol %u\n", iph->protocol);
251 sport = dport = 0;
252 } 251 }
253 252
254 switch (config->hash_mode) { 253 switch (config->hash_mode) {
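
The CLUSTERIP change above replaces the hand-written protocol switch with proto_ports_offset(), so every protocol that helper knows to carry a 16-bit port pair at a fixed offset (TCP, UDP, UDP-Lite, SCTP, DCCP, ...) is hashed uniformly, and skb_header_pointer() copies the pair out safely even when the packet data is non-linear. A minimal sketch of the same pattern, with an invented helper name:

/* Sketch only: fetch the L4 port pair generically for hashing.
 * proto_ports_offset() returns the offset of the port pair inside the
 * transport header, or a negative value for port-less protocols. */
static u32 example_l4_ports(const struct sk_buff *skb, const struct iphdr *iph)
{
        int poff = proto_ports_offset(iph->protocol);
        __be16 _ports[2];
        const __be16 *ports;

        if (poff < 0)
                return 0;                       /* nothing to hash */

        ports = skb_header_pointer(skb, iph->ihl * 4 + poff,
                                   sizeof(_ports), _ports);
        if (!ports)
                return 0;                       /* truncated packet */

        return ((u32)ntohs(ports[0]) << 16) | ntohs(ports[1]);
}
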
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 915fc17d7ce2..72ffc8fda2e9 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -24,16 +24,15 @@
24#include <linux/netfilter/x_tables.h> 24#include <linux/netfilter/x_tables.h>
25#include <linux/netfilter_ipv4/ipt_LOG.h> 25#include <linux/netfilter_ipv4/ipt_LOG.h>
26#include <net/netfilter/nf_log.h> 26#include <net/netfilter/nf_log.h>
27#include <net/netfilter/xt_log.h>
27 28
28MODULE_LICENSE("GPL"); 29MODULE_LICENSE("GPL");
29MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); 30MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
30MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog"); 31MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog");
31 32
32/* Use lock to serialize, so printks don't overlap */
33static DEFINE_SPINLOCK(log_lock);
34
35/* One level of recursion won't kill us */ 33/* One level of recursion won't kill us */
36static void dump_packet(const struct nf_loginfo *info, 34static void dump_packet(struct sbuff *m,
35 const struct nf_loginfo *info,
37 const struct sk_buff *skb, 36 const struct sk_buff *skb,
38 unsigned int iphoff) 37 unsigned int iphoff)
39{ 38{
@@ -48,32 +47,32 @@ static void dump_packet(const struct nf_loginfo *info,
48 47
49 ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); 48 ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
50 if (ih == NULL) { 49 if (ih == NULL) {
51 printk("TRUNCATED"); 50 sb_add(m, "TRUNCATED");
52 return; 51 return;
53 } 52 }
54 53
55 /* Important fields: 54 /* Important fields:
56 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ 55 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
57 /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ 56 /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
58 printk("SRC=%pI4 DST=%pI4 ", 57 sb_add(m, "SRC=%pI4 DST=%pI4 ",
59 &ih->saddr, &ih->daddr); 58 &ih->saddr, &ih->daddr);
60 59
61 /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ 60 /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
62 printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", 61 sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
63 ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, 62 ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
64 ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); 63 ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
65 64
66 /* Max length: 6 "CE DF MF " */ 65 /* Max length: 6 "CE DF MF " */
67 if (ntohs(ih->frag_off) & IP_CE) 66 if (ntohs(ih->frag_off) & IP_CE)
68 printk("CE "); 67 sb_add(m, "CE ");
69 if (ntohs(ih->frag_off) & IP_DF) 68 if (ntohs(ih->frag_off) & IP_DF)
70 printk("DF "); 69 sb_add(m, "DF ");
71 if (ntohs(ih->frag_off) & IP_MF) 70 if (ntohs(ih->frag_off) & IP_MF)
72 printk("MF "); 71 sb_add(m, "MF ");
73 72
74 /* Max length: 11 "FRAG:65535 " */ 73 /* Max length: 11 "FRAG:65535 " */
75 if (ntohs(ih->frag_off) & IP_OFFSET) 74 if (ntohs(ih->frag_off) & IP_OFFSET)
76 printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); 75 sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
77 76
78 if ((logflags & IPT_LOG_IPOPT) && 77 if ((logflags & IPT_LOG_IPOPT) &&
79 ih->ihl * 4 > sizeof(struct iphdr)) { 78 ih->ihl * 4 > sizeof(struct iphdr)) {
@@ -85,15 +84,15 @@ static void dump_packet(const struct nf_loginfo *info,
85 op = skb_header_pointer(skb, iphoff+sizeof(_iph), 84 op = skb_header_pointer(skb, iphoff+sizeof(_iph),
86 optsize, _opt); 85 optsize, _opt);
87 if (op == NULL) { 86 if (op == NULL) {
88 printk("TRUNCATED"); 87 sb_add(m, "TRUNCATED");
89 return; 88 return;
90 } 89 }
91 90
92 /* Max length: 127 "OPT (" 15*4*2chars ") " */ 91 /* Max length: 127 "OPT (" 15*4*2chars ") " */
93 printk("OPT ("); 92 sb_add(m, "OPT (");
94 for (i = 0; i < optsize; i++) 93 for (i = 0; i < optsize; i++)
95 printk("%02X", op[i]); 94 sb_add(m, "%02X", op[i]);
96 printk(") "); 95 sb_add(m, ") ");
97 } 96 }
98 97
99 switch (ih->protocol) { 98 switch (ih->protocol) {
@@ -102,7 +101,7 @@ static void dump_packet(const struct nf_loginfo *info,
102 const struct tcphdr *th; 101 const struct tcphdr *th;
103 102
104 /* Max length: 10 "PROTO=TCP " */ 103 /* Max length: 10 "PROTO=TCP " */
105 printk("PROTO=TCP "); 104 sb_add(m, "PROTO=TCP ");
106 105
107 if (ntohs(ih->frag_off) & IP_OFFSET) 106 if (ntohs(ih->frag_off) & IP_OFFSET)
108 break; 107 break;
@@ -111,41 +110,41 @@ static void dump_packet(const struct nf_loginfo *info,
111 th = skb_header_pointer(skb, iphoff + ih->ihl * 4, 110 th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
112 sizeof(_tcph), &_tcph); 111 sizeof(_tcph), &_tcph);
113 if (th == NULL) { 112 if (th == NULL) {
114 printk("INCOMPLETE [%u bytes] ", 113 sb_add(m, "INCOMPLETE [%u bytes] ",
115 skb->len - iphoff - ih->ihl*4); 114 skb->len - iphoff - ih->ihl*4);
116 break; 115 break;
117 } 116 }
118 117
119 /* Max length: 20 "SPT=65535 DPT=65535 " */ 118 /* Max length: 20 "SPT=65535 DPT=65535 " */
120 printk("SPT=%u DPT=%u ", 119 sb_add(m, "SPT=%u DPT=%u ",
121 ntohs(th->source), ntohs(th->dest)); 120 ntohs(th->source), ntohs(th->dest));
122 /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ 121 /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
123 if (logflags & IPT_LOG_TCPSEQ) 122 if (logflags & IPT_LOG_TCPSEQ)
124 printk("SEQ=%u ACK=%u ", 123 sb_add(m, "SEQ=%u ACK=%u ",
125 ntohl(th->seq), ntohl(th->ack_seq)); 124 ntohl(th->seq), ntohl(th->ack_seq));
126 /* Max length: 13 "WINDOW=65535 " */ 125 /* Max length: 13 "WINDOW=65535 " */
127 printk("WINDOW=%u ", ntohs(th->window)); 126 sb_add(m, "WINDOW=%u ", ntohs(th->window));
128 /* Max length: 9 "RES=0x3F " */ 127 /* Max length: 9 "RES=0x3F " */
129 printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); 128 sb_add(m, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
130 /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ 129 /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
131 if (th->cwr) 130 if (th->cwr)
132 printk("CWR "); 131 sb_add(m, "CWR ");
133 if (th->ece) 132 if (th->ece)
134 printk("ECE "); 133 sb_add(m, "ECE ");
135 if (th->urg) 134 if (th->urg)
136 printk("URG "); 135 sb_add(m, "URG ");
137 if (th->ack) 136 if (th->ack)
138 printk("ACK "); 137 sb_add(m, "ACK ");
139 if (th->psh) 138 if (th->psh)
140 printk("PSH "); 139 sb_add(m, "PSH ");
141 if (th->rst) 140 if (th->rst)
142 printk("RST "); 141 sb_add(m, "RST ");
143 if (th->syn) 142 if (th->syn)
144 printk("SYN "); 143 sb_add(m, "SYN ");
145 if (th->fin) 144 if (th->fin)
146 printk("FIN "); 145 sb_add(m, "FIN ");
147 /* Max length: 11 "URGP=65535 " */ 146 /* Max length: 11 "URGP=65535 " */
148 printk("URGP=%u ", ntohs(th->urg_ptr)); 147 sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
149 148
150 if ((logflags & IPT_LOG_TCPOPT) && 149 if ((logflags & IPT_LOG_TCPOPT) &&
151 th->doff * 4 > sizeof(struct tcphdr)) { 150 th->doff * 4 > sizeof(struct tcphdr)) {
@@ -158,15 +157,15 @@ static void dump_packet(const struct nf_loginfo *info,
158 iphoff+ih->ihl*4+sizeof(_tcph), 157 iphoff+ih->ihl*4+sizeof(_tcph),
159 optsize, _opt); 158 optsize, _opt);
160 if (op == NULL) { 159 if (op == NULL) {
161 printk("TRUNCATED"); 160 sb_add(m, "TRUNCATED");
162 return; 161 return;
163 } 162 }
164 163
165 /* Max length: 127 "OPT (" 15*4*2chars ") " */ 164 /* Max length: 127 "OPT (" 15*4*2chars ") " */
166 printk("OPT ("); 165 sb_add(m, "OPT (");
167 for (i = 0; i < optsize; i++) 166 for (i = 0; i < optsize; i++)
168 printk("%02X", op[i]); 167 sb_add(m, "%02X", op[i]);
169 printk(") "); 168 sb_add(m, ") ");
170 } 169 }
171 break; 170 break;
172 } 171 }
@@ -177,9 +176,9 @@ static void dump_packet(const struct nf_loginfo *info,
177 176
178 if (ih->protocol == IPPROTO_UDP) 177 if (ih->protocol == IPPROTO_UDP)
179 /* Max length: 10 "PROTO=UDP " */ 178 /* Max length: 10 "PROTO=UDP " */
180 printk("PROTO=UDP " ); 179 sb_add(m, "PROTO=UDP " );
181 else /* Max length: 14 "PROTO=UDPLITE " */ 180 else /* Max length: 14 "PROTO=UDPLITE " */
182 printk("PROTO=UDPLITE "); 181 sb_add(m, "PROTO=UDPLITE ");
183 182
184 if (ntohs(ih->frag_off) & IP_OFFSET) 183 if (ntohs(ih->frag_off) & IP_OFFSET)
185 break; 184 break;
@@ -188,13 +187,13 @@ static void dump_packet(const struct nf_loginfo *info,
188 uh = skb_header_pointer(skb, iphoff+ih->ihl*4, 187 uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
189 sizeof(_udph), &_udph); 188 sizeof(_udph), &_udph);
190 if (uh == NULL) { 189 if (uh == NULL) {
191 printk("INCOMPLETE [%u bytes] ", 190 sb_add(m, "INCOMPLETE [%u bytes] ",
192 skb->len - iphoff - ih->ihl*4); 191 skb->len - iphoff - ih->ihl*4);
193 break; 192 break;
194 } 193 }
195 194
196 /* Max length: 20 "SPT=65535 DPT=65535 " */ 195 /* Max length: 20 "SPT=65535 DPT=65535 " */
197 printk("SPT=%u DPT=%u LEN=%u ", 196 sb_add(m, "SPT=%u DPT=%u LEN=%u ",
198 ntohs(uh->source), ntohs(uh->dest), 197 ntohs(uh->source), ntohs(uh->dest),
199 ntohs(uh->len)); 198 ntohs(uh->len));
200 break; 199 break;
@@ -221,7 +220,7 @@ static void dump_packet(const struct nf_loginfo *info,
221 [ICMP_ADDRESSREPLY] = 12 }; 220 [ICMP_ADDRESSREPLY] = 12 };
222 221
223 /* Max length: 11 "PROTO=ICMP " */ 222 /* Max length: 11 "PROTO=ICMP " */
224 printk("PROTO=ICMP "); 223 sb_add(m, "PROTO=ICMP ");
225 224
226 if (ntohs(ih->frag_off) & IP_OFFSET) 225 if (ntohs(ih->frag_off) & IP_OFFSET)
227 break; 226 break;
@@ -230,19 +229,19 @@ static void dump_packet(const struct nf_loginfo *info,
230 ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, 229 ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
231 sizeof(_icmph), &_icmph); 230 sizeof(_icmph), &_icmph);
232 if (ich == NULL) { 231 if (ich == NULL) {
233 printk("INCOMPLETE [%u bytes] ", 232 sb_add(m, "INCOMPLETE [%u bytes] ",
234 skb->len - iphoff - ih->ihl*4); 233 skb->len - iphoff - ih->ihl*4);
235 break; 234 break;
236 } 235 }
237 236
238 /* Max length: 18 "TYPE=255 CODE=255 " */ 237 /* Max length: 18 "TYPE=255 CODE=255 " */
239 printk("TYPE=%u CODE=%u ", ich->type, ich->code); 238 sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
240 239
241 /* Max length: 25 "INCOMPLETE [65535 bytes] " */ 240 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
242 if (ich->type <= NR_ICMP_TYPES && 241 if (ich->type <= NR_ICMP_TYPES &&
243 required_len[ich->type] && 242 required_len[ich->type] &&
244 skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { 243 skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
245 printk("INCOMPLETE [%u bytes] ", 244 sb_add(m, "INCOMPLETE [%u bytes] ",
246 skb->len - iphoff - ih->ihl*4); 245 skb->len - iphoff - ih->ihl*4);
247 break; 246 break;
248 } 247 }
@@ -251,35 +250,35 @@ static void dump_packet(const struct nf_loginfo *info,
251 case ICMP_ECHOREPLY: 250 case ICMP_ECHOREPLY:
252 case ICMP_ECHO: 251 case ICMP_ECHO:
253 /* Max length: 19 "ID=65535 SEQ=65535 " */ 252 /* Max length: 19 "ID=65535 SEQ=65535 " */
254 printk("ID=%u SEQ=%u ", 253 sb_add(m, "ID=%u SEQ=%u ",
255 ntohs(ich->un.echo.id), 254 ntohs(ich->un.echo.id),
256 ntohs(ich->un.echo.sequence)); 255 ntohs(ich->un.echo.sequence));
257 break; 256 break;
258 257
259 case ICMP_PARAMETERPROB: 258 case ICMP_PARAMETERPROB:
260 /* Max length: 14 "PARAMETER=255 " */ 259 /* Max length: 14 "PARAMETER=255 " */
261 printk("PARAMETER=%u ", 260 sb_add(m, "PARAMETER=%u ",
262 ntohl(ich->un.gateway) >> 24); 261 ntohl(ich->un.gateway) >> 24);
263 break; 262 break;
264 case ICMP_REDIRECT: 263 case ICMP_REDIRECT:
265 /* Max length: 24 "GATEWAY=255.255.255.255 " */ 264 /* Max length: 24 "GATEWAY=255.255.255.255 " */
266 printk("GATEWAY=%pI4 ", &ich->un.gateway); 265 sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
267 /* Fall through */ 266 /* Fall through */
268 case ICMP_DEST_UNREACH: 267 case ICMP_DEST_UNREACH:
269 case ICMP_SOURCE_QUENCH: 268 case ICMP_SOURCE_QUENCH:
270 case ICMP_TIME_EXCEEDED: 269 case ICMP_TIME_EXCEEDED:
271 /* Max length: 3+maxlen */ 270 /* Max length: 3+maxlen */
272 if (!iphoff) { /* Only recurse once. */ 271 if (!iphoff) { /* Only recurse once. */
273 printk("["); 272 sb_add(m, "[");
274 dump_packet(info, skb, 273 dump_packet(m, info, skb,
275 iphoff + ih->ihl*4+sizeof(_icmph)); 274 iphoff + ih->ihl*4+sizeof(_icmph));
276 printk("] "); 275 sb_add(m, "] ");
277 } 276 }
278 277
279 /* Max length: 10 "MTU=65535 " */ 278 /* Max length: 10 "MTU=65535 " */
280 if (ich->type == ICMP_DEST_UNREACH && 279 if (ich->type == ICMP_DEST_UNREACH &&
281 ich->code == ICMP_FRAG_NEEDED) 280 ich->code == ICMP_FRAG_NEEDED)
282 printk("MTU=%u ", ntohs(ich->un.frag.mtu)); 281 sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu));
283 } 282 }
284 break; 283 break;
285 } 284 }
@@ -292,19 +291,19 @@ static void dump_packet(const struct nf_loginfo *info,
292 break; 291 break;
293 292
294 /* Max length: 9 "PROTO=AH " */ 293 /* Max length: 9 "PROTO=AH " */
295 printk("PROTO=AH "); 294 sb_add(m, "PROTO=AH ");
296 295
297 /* Max length: 25 "INCOMPLETE [65535 bytes] " */ 296 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
298 ah = skb_header_pointer(skb, iphoff+ih->ihl*4, 297 ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
299 sizeof(_ahdr), &_ahdr); 298 sizeof(_ahdr), &_ahdr);
300 if (ah == NULL) { 299 if (ah == NULL) {
301 printk("INCOMPLETE [%u bytes] ", 300 sb_add(m, "INCOMPLETE [%u bytes] ",
302 skb->len - iphoff - ih->ihl*4); 301 skb->len - iphoff - ih->ihl*4);
303 break; 302 break;
304 } 303 }
305 304
306 /* Length: 15 "SPI=0xF1234567 " */ 305 /* Length: 15 "SPI=0xF1234567 " */
307 printk("SPI=0x%x ", ntohl(ah->spi)); 306 sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
308 break; 307 break;
309 } 308 }
310 case IPPROTO_ESP: { 309 case IPPROTO_ESP: {
@@ -312,7 +311,7 @@ static void dump_packet(const struct nf_loginfo *info,
312 const struct ip_esp_hdr *eh; 311 const struct ip_esp_hdr *eh;
313 312
314 /* Max length: 10 "PROTO=ESP " */ 313 /* Max length: 10 "PROTO=ESP " */
315 printk("PROTO=ESP "); 314 sb_add(m, "PROTO=ESP ");
316 315
317 if (ntohs(ih->frag_off) & IP_OFFSET) 316 if (ntohs(ih->frag_off) & IP_OFFSET)
318 break; 317 break;
@@ -321,25 +320,25 @@ static void dump_packet(const struct nf_loginfo *info,
321 eh = skb_header_pointer(skb, iphoff+ih->ihl*4, 320 eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
322 sizeof(_esph), &_esph); 321 sizeof(_esph), &_esph);
323 if (eh == NULL) { 322 if (eh == NULL) {
324 printk("INCOMPLETE [%u bytes] ", 323 sb_add(m, "INCOMPLETE [%u bytes] ",
325 skb->len - iphoff - ih->ihl*4); 324 skb->len - iphoff - ih->ihl*4);
326 break; 325 break;
327 } 326 }
328 327
329 /* Length: 15 "SPI=0xF1234567 " */ 328 /* Length: 15 "SPI=0xF1234567 " */
330 printk("SPI=0x%x ", ntohl(eh->spi)); 329 sb_add(m, "SPI=0x%x ", ntohl(eh->spi));
331 break; 330 break;
332 } 331 }
333 /* Max length: 10 "PROTO 255 " */ 332 /* Max length: 10 "PROTO 255 " */
334 default: 333 default:
335 printk("PROTO=%u ", ih->protocol); 334 sb_add(m, "PROTO=%u ", ih->protocol);
336 } 335 }
337 336
338 /* Max length: 15 "UID=4294967295 " */ 337 /* Max length: 15 "UID=4294967295 " */
339 if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { 338 if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
340 read_lock_bh(&skb->sk->sk_callback_lock); 339 read_lock_bh(&skb->sk->sk_callback_lock);
341 if (skb->sk->sk_socket && skb->sk->sk_socket->file) 340 if (skb->sk->sk_socket && skb->sk->sk_socket->file)
342 printk("UID=%u GID=%u ", 341 sb_add(m, "UID=%u GID=%u ",
343 skb->sk->sk_socket->file->f_cred->fsuid, 342 skb->sk->sk_socket->file->f_cred->fsuid,
344 skb->sk->sk_socket->file->f_cred->fsgid); 343 skb->sk->sk_socket->file->f_cred->fsgid);
345 read_unlock_bh(&skb->sk->sk_callback_lock); 344 read_unlock_bh(&skb->sk->sk_callback_lock);
@@ -347,7 +346,7 @@ static void dump_packet(const struct nf_loginfo *info,
347 346
348 /* Max length: 16 "MARK=0xFFFFFFFF " */ 347 /* Max length: 16 "MARK=0xFFFFFFFF " */
349 if (!iphoff && skb->mark) 348 if (!iphoff && skb->mark)
350 printk("MARK=0x%x ", skb->mark); 349 sb_add(m, "MARK=0x%x ", skb->mark);
351 350
352 /* Proto Max log string length */ 351 /* Proto Max log string length */
353 /* IP: 40+46+6+11+127 = 230 */ 352 /* IP: 40+46+6+11+127 = 230 */
@@ -364,7 +363,8 @@ static void dump_packet(const struct nf_loginfo *info,
364 /* maxlen = 230+ 91 + 230 + 252 = 803 */ 363 /* maxlen = 230+ 91 + 230 + 252 = 803 */
365} 364}
366 365
367static void dump_mac_header(const struct nf_loginfo *info, 366static void dump_mac_header(struct sbuff *m,
367 const struct nf_loginfo *info,
368 const struct sk_buff *skb) 368 const struct sk_buff *skb)
369{ 369{
370 struct net_device *dev = skb->dev; 370 struct net_device *dev = skb->dev;
@@ -378,7 +378,7 @@ static void dump_mac_header(const struct nf_loginfo *info,
378 378
379 switch (dev->type) { 379 switch (dev->type) {
380 case ARPHRD_ETHER: 380 case ARPHRD_ETHER:
381 printk("MACSRC=%pM MACDST=%pM MACPROTO=%04x ", 381 sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
382 eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, 382 eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
383 ntohs(eth_hdr(skb)->h_proto)); 383 ntohs(eth_hdr(skb)->h_proto));
384 return; 384 return;
@@ -387,17 +387,17 @@ static void dump_mac_header(const struct nf_loginfo *info,
387 } 387 }
388 388
389fallback: 389fallback:
390 printk("MAC="); 390 sb_add(m, "MAC=");
391 if (dev->hard_header_len && 391 if (dev->hard_header_len &&
392 skb->mac_header != skb->network_header) { 392 skb->mac_header != skb->network_header) {
393 const unsigned char *p = skb_mac_header(skb); 393 const unsigned char *p = skb_mac_header(skb);
394 unsigned int i; 394 unsigned int i;
395 395
396 printk("%02x", *p++); 396 sb_add(m, "%02x", *p++);
397 for (i = 1; i < dev->hard_header_len; i++, p++) 397 for (i = 1; i < dev->hard_header_len; i++, p++)
398 printk(":%02x", *p); 398 sb_add(m, ":%02x", *p);
399 } 399 }
400 printk(" "); 400 sb_add(m, " ");
401} 401}
402 402
403static struct nf_loginfo default_loginfo = { 403static struct nf_loginfo default_loginfo = {
@@ -419,11 +419,12 @@ ipt_log_packet(u_int8_t pf,
419 const struct nf_loginfo *loginfo, 419 const struct nf_loginfo *loginfo,
420 const char *prefix) 420 const char *prefix)
421{ 421{
422 struct sbuff *m = sb_open();
423
422 if (!loginfo) 424 if (!loginfo)
423 loginfo = &default_loginfo; 425 loginfo = &default_loginfo;
424 426
425 spin_lock_bh(&log_lock); 427 sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
426 printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
427 prefix, 428 prefix,
428 in ? in->name : "", 429 in ? in->name : "",
429 out ? out->name : ""); 430 out ? out->name : "");
@@ -434,20 +435,20 @@ ipt_log_packet(u_int8_t pf,
434 435
435 physindev = skb->nf_bridge->physindev; 436 physindev = skb->nf_bridge->physindev;
436 if (physindev && in != physindev) 437 if (physindev && in != physindev)
437 printk("PHYSIN=%s ", physindev->name); 438 sb_add(m, "PHYSIN=%s ", physindev->name);
438 physoutdev = skb->nf_bridge->physoutdev; 439 physoutdev = skb->nf_bridge->physoutdev;
439 if (physoutdev && out != physoutdev) 440 if (physoutdev && out != physoutdev)
440 printk("PHYSOUT=%s ", physoutdev->name); 441 sb_add(m, "PHYSOUT=%s ", physoutdev->name);
441 } 442 }
442#endif 443#endif
443 444
444 /* MAC logging for input path only. */ 445 /* MAC logging for input path only. */
445 if (in && !out) 446 if (in && !out)
446 dump_mac_header(loginfo, skb); 447 dump_mac_header(m, loginfo, skb);
448
449 dump_packet(m, loginfo, skb, 0);
447 450
448 dump_packet(loginfo, skb, 0); 451 sb_close(m);
449 printk("\n");
450 spin_unlock_bh(&log_lock);
451} 452}
452 453
453static unsigned int 454static unsigned int
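
The LOG target conversion above drops the global log_lock and the stream of small printk() calls in favour of an sbuff: each invocation opens a buffer, appends the whole log line with sb_add(), and sb_close() emits it in one piece, so messages from different CPUs can no longer interleave mid-line. A condensed sketch of the call pattern, assuming the sbuff helpers from <net/netfilter/xt_log.h> (function name invented):

/* Sketch of the sb_open()/sb_add()/sb_close() pattern used above. */
static void example_log_packet(const struct sk_buff *skb, const char *prefix)
{
        struct sbuff *m = sb_open();

        sb_add(m, "%sLEN=%u ", prefix, skb->len);
        if (skb->mark)
                sb_add(m, "MARK=0x%x ", skb->mark);

        sb_close(m);            /* prints the assembled line in one go */
}
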
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index b254dafaf429..43eec80c0e7c 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -112,6 +112,7 @@ static void send_reset(struct sk_buff *oldskb, int hook)
112 /* ip_route_me_harder expects skb->dst to be set */ 112 /* ip_route_me_harder expects skb->dst to be set */
113 skb_dst_set_noref(nskb, skb_dst(oldskb)); 113 skb_dst_set_noref(nskb, skb_dst(oldskb));
114 114
115 nskb->protocol = htons(ETH_P_IP);
115 if (ip_route_me_harder(nskb, addr_type)) 116 if (ip_route_me_harder(nskb, addr_type))
116 goto free_nskb; 117 goto free_nskb;
117 118
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 244f7cb08d68..37f8adb68c79 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -11,6 +11,7 @@
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/percpu.h> 13#include <linux/percpu.h>
14#include <linux/security.h>
14#include <net/net_namespace.h> 15#include <net/net_namespace.h>
15 16
16#include <linux/netfilter.h> 17#include <linux/netfilter.h>
@@ -87,6 +88,29 @@ static void ct_seq_stop(struct seq_file *s, void *v)
87 rcu_read_unlock(); 88 rcu_read_unlock();
88} 89}
89 90
91#ifdef CONFIG_NF_CONNTRACK_SECMARK
92static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
93{
94 int ret;
95 u32 len;
96 char *secctx;
97
98 ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
99 if (ret)
100 return ret;
101
102 ret = seq_printf(s, "secctx=%s ", secctx);
103
104 security_release_secctx(secctx, len);
105 return ret;
106}
107#else
108static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
109{
110 return 0;
111}
112#endif
113
90static int ct_seq_show(struct seq_file *s, void *v) 114static int ct_seq_show(struct seq_file *s, void *v)
91{ 115{
92 struct nf_conntrack_tuple_hash *hash = v; 116 struct nf_conntrack_tuple_hash *hash = v;
@@ -148,10 +172,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
148 goto release; 172 goto release;
149#endif 173#endif
150 174
151#ifdef CONFIG_NF_CONNTRACK_SECMARK 175 if (ct_show_secctx(s, ct))
152 if (seq_printf(s, "secmark=%u ", ct->secmark))
153 goto release; 176 goto release;
154#endif
155 177
156 if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) 178 if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
157 goto release; 179 goto release;
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index eab8de32f200..f3a9b42b16c6 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -66,9 +66,11 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
66 const struct net_device *out, 66 const struct net_device *out,
67 int (*okfn)(struct sk_buff *)) 67 int (*okfn)(struct sk_buff *))
68{ 68{
69 struct sock *sk = skb->sk;
69 struct inet_sock *inet = inet_sk(skb->sk); 70 struct inet_sock *inet = inet_sk(skb->sk);
70 71
71 if (inet && inet->nodefrag) 72 if (sk && (sk->sk_family == PF_INET) &&
73 inet->nodefrag)
72 return NF_ACCEPT; 74 return NF_ACCEPT;
73 75
74#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 76#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
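
The defrag hook fix above matters because skb->sk is not guaranteed to be an IPv4 socket at this point; calling inet_sk() on a socket of another family and reading nodefrag from it would look at unrelated memory. The guard on its own, as a sketch:

/* Sketch: only trust inet_sk() after checking the socket family. */
struct sock *sk = skb->sk;

if (sk && sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
        return NF_ACCEPT;       /* this socket opted out of defragmentation */
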
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index c31b87668250..0f23b3f06df0 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -44,9 +44,16 @@ static unsigned int help(struct sk_buff *skb,
44 44
45 /* Try to get same port: if not, try to change it. */ 45 /* Try to get same port: if not, try to change it. */
46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { 46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
47 int ret;
48
47 exp->tuple.dst.u.tcp.port = htons(port); 49 exp->tuple.dst.u.tcp.port = htons(port);
48 if (nf_ct_expect_related(exp) == 0) 50 ret = nf_ct_expect_related(exp);
51 if (ret == 0)
52 break;
53 else if (ret != -EBUSY) {
54 port = 0;
49 break; 55 break;
56 }
50 } 57 }
51 58
52 if (port == 0) 59 if (port == 0)
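
The Amanda helper above introduces a retry loop that the FTP, IRC, SIP and H.323 helpers below repeat: nf_ct_expect_related() can fail for reasons other than a port collision, and only -EBUSY is worth retrying with the next port; any other error aborts the scan instead of walking the remaining port space pointlessly. A hypothetical helper (not in the patch) capturing the decision logic:

/* Hypothetical: scan for a usable expectation port, treating -EBUSY as
 * "taken, try the next one" and any other error as fatal.  Returns the
 * port in host order, or 0 on failure. */
static u16 example_find_expect_port(struct nf_conntrack_expect *exp, u16 start)
{
        u16 port;

        for (port = start; port != 0; port++) {
                int ret;

                exp->tuple.dst.u.tcp.port = htons(port);
                ret = nf_ct_expect_related(exp);
                if (ret == 0)
                        return port;    /* expectation registered */
                if (ret != -EBUSY)
                        return 0;       /* hard error: stop scanning */
                /* -EBUSY: this port is already expected, keep going */
        }
        return 0;                       /* wrapped around without success */
}
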
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 8c8632d9b93c..295c97431e43 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -38,7 +38,7 @@ static DEFINE_SPINLOCK(nf_nat_lock);
38static struct nf_conntrack_l3proto *l3proto __read_mostly; 38static struct nf_conntrack_l3proto *l3proto __read_mostly;
39 39
40#define MAX_IP_NAT_PROTO 256 40#define MAX_IP_NAT_PROTO 256
41static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] 41static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO]
42 __read_mostly; 42 __read_mostly;
43 43
44static inline const struct nf_nat_protocol * 44static inline const struct nf_nat_protocol *
@@ -47,7 +47,7 @@ __nf_nat_proto_find(u_int8_t protonum)
47 return rcu_dereference(nf_nat_protos[protonum]); 47 return rcu_dereference(nf_nat_protos[protonum]);
48} 48}
49 49
50const struct nf_nat_protocol * 50static const struct nf_nat_protocol *
51nf_nat_proto_find_get(u_int8_t protonum) 51nf_nat_proto_find_get(u_int8_t protonum)
52{ 52{
53 const struct nf_nat_protocol *p; 53 const struct nf_nat_protocol *p;
@@ -60,14 +60,12 @@ nf_nat_proto_find_get(u_int8_t protonum)
60 60
61 return p; 61 return p;
62} 62}
63EXPORT_SYMBOL_GPL(nf_nat_proto_find_get);
64 63
65void 64static void
66nf_nat_proto_put(const struct nf_nat_protocol *p) 65nf_nat_proto_put(const struct nf_nat_protocol *p)
67{ 66{
68 module_put(p->me); 67 module_put(p->me);
69} 68}
70EXPORT_SYMBOL_GPL(nf_nat_proto_put);
71 69
72/* We keep an extra hash for each conntrack, for fast searching. */ 70/* We keep an extra hash for each conntrack, for fast searching. */
73static inline unsigned int 71static inline unsigned int
@@ -262,11 +260,17 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
262 proto = __nf_nat_proto_find(orig_tuple->dst.protonum); 260 proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
263 261
264 /* Only bother mapping if it's not already in range and unique */ 262 /* Only bother mapping if it's not already in range and unique */
265 if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM) && 263 if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
266 (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) || 264 if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) {
267 proto->in_range(tuple, maniptype, &range->min, &range->max)) && 265 if (proto->in_range(tuple, maniptype, &range->min,
268 !nf_nat_used_tuple(tuple, ct)) 266 &range->max) &&
269 goto out; 267 (range->min.all == range->max.all ||
268 !nf_nat_used_tuple(tuple, ct)))
269 goto out;
270 } else if (!nf_nat_used_tuple(tuple, ct)) {
271 goto out;
272 }
273 }
270 274
271 /* Last change: get protocol to try to obtain unique tuple. */ 275 /* Last change: get protocol to try to obtain unique tuple. */
272 proto->unique_tuple(tuple, range, maniptype, ct); 276 proto->unique_tuple(tuple, range, maniptype, ct);
@@ -458,6 +462,18 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
458 return 0; 462 return 0;
459 } 463 }
460 464
465 if (manip == IP_NAT_MANIP_SRC)
466 statusbit = IPS_SRC_NAT;
467 else
468 statusbit = IPS_DST_NAT;
469
470 /* Invert if this is reply dir. */
471 if (dir == IP_CT_DIR_REPLY)
472 statusbit ^= IPS_NAT_MASK;
473
474 if (!(ct->status & statusbit))
475 return 1;
476
461 pr_debug("icmp_reply_translation: translating error %p manip %u " 477 pr_debug("icmp_reply_translation: translating error %p manip %u "
462 "dir %s\n", skb, manip, 478 "dir %s\n", skb, manip,
463 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); 479 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
@@ -492,20 +508,9 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
492 508
493 /* Change outer to look the reply to an incoming packet 509 /* Change outer to look the reply to an incoming packet
494 * (proto 0 means don't invert per-proto part). */ 510 * (proto 0 means don't invert per-proto part). */
495 if (manip == IP_NAT_MANIP_SRC) 511 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
496 statusbit = IPS_SRC_NAT; 512 if (!manip_pkt(0, skb, 0, &target, manip))
497 else 513 return 0;
498 statusbit = IPS_DST_NAT;
499
500 /* Invert if this is reply dir. */
501 if (dir == IP_CT_DIR_REPLY)
502 statusbit ^= IPS_NAT_MASK;
503
504 if (ct->status & statusbit) {
505 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
506 if (!manip_pkt(0, skb, 0, &target, manip))
507 return 0;
508 }
509 514
510 return 1; 515 return 1;
511} 516}
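
Three things change in nf_nat_core.c above: the per-protocol array gains an __rcu annotation and its find/put helpers become static, get_unique_tuple() skips the nf_nat_used_tuple() scan when the rule pins the per-proto value to a single choice (min == max), and nf_nat_icmp_reply_translation() now tests the NAT status bit up front so connections that were never NATed in that direction pass through untouched. The hoisted test, shown on its own with the same fields the function uses:

/* Sketch of the early exit added above: pick the status bit matching the
 * requested manipulation, flip it for the reply direction, and accept the
 * packet unchanged if the connection was never NATed that way. */
unsigned long statusbit;

statusbit = (manip == IP_NAT_MANIP_SRC) ? IPS_SRC_NAT : IPS_DST_NAT;
if (dir == IP_CT_DIR_REPLY)
        statusbit ^= IPS_NAT_MASK;      /* SRC <-> DST swap for replies */
if (!(ct->status & statusbit))
        return 1;                       /* nothing to rewrite: accept */
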
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c
index 86e0e84ff0a0..dc73abb3fe27 100644
--- a/net/ipv4/netfilter/nf_nat_ftp.c
+++ b/net/ipv4/netfilter/nf_nat_ftp.c
@@ -79,9 +79,16 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
79 79
80 /* Try to get same port: if not, try to change it. */ 80 /* Try to get same port: if not, try to change it. */
81 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { 81 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
82 int ret;
83
82 exp->tuple.dst.u.tcp.port = htons(port); 84 exp->tuple.dst.u.tcp.port = htons(port);
83 if (nf_ct_expect_related(exp) == 0) 85 ret = nf_ct_expect_related(exp);
86 if (ret == 0)
87 break;
88 else if (ret != -EBUSY) {
89 port = 0;
84 break; 90 break;
91 }
85 } 92 }
86 93
87 if (port == 0) 94 if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 5045196d853c..790f3160e012 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -222,13 +222,24 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
222 /* Try to get a pair of ports. */ 222 /* Try to get a pair of ports. */
223 for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port); 223 for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port);
224 nated_port != 0; nated_port += 2) { 224 nated_port != 0; nated_port += 2) {
225 int ret;
226
225 rtp_exp->tuple.dst.u.udp.port = htons(nated_port); 227 rtp_exp->tuple.dst.u.udp.port = htons(nated_port);
226 if (nf_ct_expect_related(rtp_exp) == 0) { 228 ret = nf_ct_expect_related(rtp_exp);
229 if (ret == 0) {
227 rtcp_exp->tuple.dst.u.udp.port = 230 rtcp_exp->tuple.dst.u.udp.port =
228 htons(nated_port + 1); 231 htons(nated_port + 1);
229 if (nf_ct_expect_related(rtcp_exp) == 0) 232 ret = nf_ct_expect_related(rtcp_exp);
233 if (ret == 0)
234 break;
235 else if (ret != -EBUSY) {
236 nf_ct_unexpect_related(rtp_exp);
237 nated_port = 0;
230 break; 238 break;
231 nf_ct_unexpect_related(rtp_exp); 239 }
240 } else if (ret != -EBUSY) {
241 nated_port = 0;
242 break;
232 } 243 }
233 } 244 }
234 245
@@ -284,9 +295,16 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
284 295
285 /* Try to get same port: if not, try to change it. */ 296 /* Try to get same port: if not, try to change it. */
286 for (; nated_port != 0; nated_port++) { 297 for (; nated_port != 0; nated_port++) {
298 int ret;
299
287 exp->tuple.dst.u.tcp.port = htons(nated_port); 300 exp->tuple.dst.u.tcp.port = htons(nated_port);
288 if (nf_ct_expect_related(exp) == 0) 301 ret = nf_ct_expect_related(exp);
302 if (ret == 0)
303 break;
304 else if (ret != -EBUSY) {
305 nated_port = 0;
289 break; 306 break;
307 }
290 } 308 }
291 309
292 if (nated_port == 0) { /* No port available */ 310 if (nated_port == 0) { /* No port available */
@@ -334,9 +352,16 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
334 352
335 /* Try to get same port: if not, try to change it. */ 353 /* Try to get same port: if not, try to change it. */
336 for (; nated_port != 0; nated_port++) { 354 for (; nated_port != 0; nated_port++) {
355 int ret;
356
337 exp->tuple.dst.u.tcp.port = htons(nated_port); 357 exp->tuple.dst.u.tcp.port = htons(nated_port);
338 if (nf_ct_expect_related(exp) == 0) 358 ret = nf_ct_expect_related(exp);
359 if (ret == 0)
339 break; 360 break;
361 else if (ret != -EBUSY) {
362 nated_port = 0;
363 break;
364 }
340 } 365 }
341 366
342 if (nated_port == 0) { /* No port available */ 367 if (nated_port == 0) { /* No port available */
@@ -418,9 +443,16 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
418 443
419 /* Try to get same port: if not, try to change it. */ 444 /* Try to get same port: if not, try to change it. */
420 for (; nated_port != 0; nated_port++) { 445 for (; nated_port != 0; nated_port++) {
446 int ret;
447
421 exp->tuple.dst.u.tcp.port = htons(nated_port); 448 exp->tuple.dst.u.tcp.port = htons(nated_port);
422 if (nf_ct_expect_related(exp) == 0) 449 ret = nf_ct_expect_related(exp);
450 if (ret == 0)
451 break;
452 else if (ret != -EBUSY) {
453 nated_port = 0;
423 break; 454 break;
455 }
424 } 456 }
425 457
426 if (nated_port == 0) { /* No port available */ 458 if (nated_port == 0) { /* No port available */
@@ -500,9 +532,16 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
500 532
501 /* Try to get same port: if not, try to change it. */ 533 /* Try to get same port: if not, try to change it. */
502 for (nated_port = ntohs(port); nated_port != 0; nated_port++) { 534 for (nated_port = ntohs(port); nated_port != 0; nated_port++) {
535 int ret;
536
503 exp->tuple.dst.u.tcp.port = htons(nated_port); 537 exp->tuple.dst.u.tcp.port = htons(nated_port);
504 if (nf_ct_expect_related(exp) == 0) 538 ret = nf_ct_expect_related(exp);
539 if (ret == 0)
505 break; 540 break;
541 else if (ret != -EBUSY) {
542 nated_port = 0;
543 break;
544 }
506 } 545 }
507 546
508 if (nated_port == 0) { /* No port available */ 547 if (nated_port == 0) { /* No port available */
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index 4a0c6b548eee..31427fb57aa8 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -153,6 +153,35 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
153} 153}
154EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); 154EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
155 155
156static void nf_nat_csum(struct sk_buff *skb, struct iphdr *iph, void *data,
157 int datalen, __sum16 *check, int oldlen)
158{
159 struct rtable *rt = skb_rtable(skb);
160
161 if (skb->ip_summed != CHECKSUM_PARTIAL) {
162 if (!(rt->rt_flags & RTCF_LOCAL) &&
163 skb->dev->features & NETIF_F_V4_CSUM) {
164 skb->ip_summed = CHECKSUM_PARTIAL;
165 skb->csum_start = skb_headroom(skb) +
166 skb_network_offset(skb) +
167 iph->ihl * 4;
168 skb->csum_offset = (void *)check - data;
169 *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
170 datalen, iph->protocol, 0);
171 } else {
172 *check = 0;
173 *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
174 datalen, iph->protocol,
175 csum_partial(data, datalen,
176 0));
177 if (iph->protocol == IPPROTO_UDP && !*check)
178 *check = CSUM_MANGLED_0;
179 }
180 } else
181 inet_proto_csum_replace2(check, skb,
182 htons(oldlen), htons(datalen), 1);
183}
184
156/* Generic function for mangling variable-length address changes inside 185/* Generic function for mangling variable-length address changes inside
157 * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX 186 * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
158 * command in FTP). 187 * command in FTP).
@@ -169,7 +198,6 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
169 const char *rep_buffer, 198 const char *rep_buffer,
170 unsigned int rep_len, bool adjust) 199 unsigned int rep_len, bool adjust)
171{ 200{
172 struct rtable *rt = skb_rtable(skb);
173 struct iphdr *iph; 201 struct iphdr *iph;
174 struct tcphdr *tcph; 202 struct tcphdr *tcph;
175 int oldlen, datalen; 203 int oldlen, datalen;
@@ -192,26 +220,7 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
192 match_offset, match_len, rep_buffer, rep_len); 220 match_offset, match_len, rep_buffer, rep_len);
193 221
194 datalen = skb->len - iph->ihl*4; 222 datalen = skb->len - iph->ihl*4;
195 if (skb->ip_summed != CHECKSUM_PARTIAL) { 223 nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen);
196 if (!(rt->rt_flags & RTCF_LOCAL) &&
197 skb->dev->features & NETIF_F_V4_CSUM) {
198 skb->ip_summed = CHECKSUM_PARTIAL;
199 skb->csum_start = skb_headroom(skb) +
200 skb_network_offset(skb) +
201 iph->ihl * 4;
202 skb->csum_offset = offsetof(struct tcphdr, check);
203 tcph->check = ~tcp_v4_check(datalen,
204 iph->saddr, iph->daddr, 0);
205 } else {
206 tcph->check = 0;
207 tcph->check = tcp_v4_check(datalen,
208 iph->saddr, iph->daddr,
209 csum_partial(tcph,
210 datalen, 0));
211 }
212 } else
213 inet_proto_csum_replace2(&tcph->check, skb,
214 htons(oldlen), htons(datalen), 1);
215 224
216 if (adjust && rep_len != match_len) 225 if (adjust && rep_len != match_len)
217 nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq, 226 nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
@@ -240,7 +249,6 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
240 const char *rep_buffer, 249 const char *rep_buffer,
241 unsigned int rep_len) 250 unsigned int rep_len)
242{ 251{
243 struct rtable *rt = skb_rtable(skb);
244 struct iphdr *iph; 252 struct iphdr *iph;
245 struct udphdr *udph; 253 struct udphdr *udph;
246 int datalen, oldlen; 254 int datalen, oldlen;
@@ -274,29 +282,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
274 if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL) 282 if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
275 return 1; 283 return 1;
276 284
277 if (skb->ip_summed != CHECKSUM_PARTIAL) { 285 nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen);
278 if (!(rt->rt_flags & RTCF_LOCAL) &&
279 skb->dev->features & NETIF_F_V4_CSUM) {
280 skb->ip_summed = CHECKSUM_PARTIAL;
281 skb->csum_start = skb_headroom(skb) +
282 skb_network_offset(skb) +
283 iph->ihl * 4;
284 skb->csum_offset = offsetof(struct udphdr, check);
285 udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
286 datalen, IPPROTO_UDP,
287 0);
288 } else {
289 udph->check = 0;
290 udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
291 datalen, IPPROTO_UDP,
292 csum_partial(udph,
293 datalen, 0));
294 if (!udph->check)
295 udph->check = CSUM_MANGLED_0;
296 }
297 } else
298 inet_proto_csum_replace2(&udph->check, skb,
299 htons(oldlen), htons(datalen), 1);
300 286
301 return 1; 287 return 1;
302} 288}
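
The nf_nat_helper.c hunks fold the checksum fix-up that was duplicated in the TCP and UDP packet manglers into a single nf_nat_csum() helper: when the route is not local and the device advertises NETIF_F_V4_CSUM it sets up CHECKSUM_PARTIAL for hardware offload, otherwise it folds a full software checksum (mapping a zero UDP checksum to CSUM_MANGLED_0), and a packet that is already CHECKSUM_PARTIAL only needs the incremental length adjustment. A simplified, self-contained sketch of that decision (the can_offload parameter stands in for the route/feature test):

/* Sketch, assuming the same semantics as nf_nat_csum() above. */
static void example_fix_csum(struct sk_buff *skb, struct iphdr *iph,
                             void *l4hdr, int datalen, __sum16 *check,
                             int oldlen, bool can_offload)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                /* Hardware will finish it; only the length changed. */
                inet_proto_csum_replace2(check, skb,
                                         htons(oldlen), htons(datalen), 1);
                return;
        }

        if (can_offload) {
                skb->ip_summed = CHECKSUM_PARTIAL;
                skb->csum_start = skb_headroom(skb) +
                                  skb_network_offset(skb) + iph->ihl * 4;
                skb->csum_offset = (void *)check - l4hdr;
                *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
                                            datalen, iph->protocol, 0);
        } else {
                *check = 0;
                *check = csum_tcpudp_magic(iph->saddr, iph->daddr, datalen,
                                           iph->protocol,
                                           csum_partial(l4hdr, datalen, 0));
                if (iph->protocol == IPPROTO_UDP && !*check)
                        *check = CSUM_MANGLED_0; /* 0 means "no checksum" for UDP */
        }
}
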
diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c
index ea83a886b03e..535e1a802356 100644
--- a/net/ipv4/netfilter/nf_nat_irc.c
+++ b/net/ipv4/netfilter/nf_nat_irc.c
@@ -45,9 +45,16 @@ static unsigned int help(struct sk_buff *skb,
45 45
46 /* Try to get same port: if not, try to change it. */ 46 /* Try to get same port: if not, try to change it. */
47 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { 47 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
48 int ret;
49
48 exp->tuple.dst.u.tcp.port = htons(port); 50 exp->tuple.dst.u.tcp.port = htons(port);
49 if (nf_ct_expect_related(exp) == 0) 51 ret = nf_ct_expect_related(exp);
52 if (ret == 0)
53 break;
54 else if (ret != -EBUSY) {
55 port = 0;
50 break; 56 break;
57 }
51 } 58 }
52 59
53 if (port == 0) 60 if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index ebbd319f62f5..21c30426480b 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -106,16 +106,15 @@ alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
 {
 	/* Force range to this IP; let proto decide mapping for
 	   per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
-	   Use reply in case it's already been mangled (eg local packet).
 	 */
-	__be32 ip
-		= (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
-		   ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip
-		   : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
-	struct nf_nat_range range
-		= { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } };
-
-	pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, &ip);
+	struct nf_nat_range range;
+
+	range.flags = 0;
+	pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
+		 HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ?
+		 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
+		 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
+
 	return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
 }
 
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index 11b538deaaec..e40cf7816fdb 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -307,9 +307,16 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
 	exp->expectfn = ip_nat_sip_expected;
 
 	for (; port != 0; port++) {
+		int ret;
+
 		exp->tuple.dst.u.udp.port = htons(port);
-		if (nf_ct_expect_related(exp) == 0)
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		else if (ret != -EBUSY) {
+			port = 0;
 			break;
+		}
 	}
 
 	if (port == 0)
@@ -480,13 +487,25 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
 	/* Try to get same pair of ports: if not, try to change them. */
 	for (port = ntohs(rtp_exp->tuple.dst.u.udp.port);
 	     port != 0; port += 2) {
+		int ret;
+
 		rtp_exp->tuple.dst.u.udp.port = htons(port);
-		if (nf_ct_expect_related(rtp_exp) != 0)
+		ret = nf_ct_expect_related(rtp_exp);
+		if (ret == -EBUSY)
 			continue;
+		else if (ret < 0) {
+			port = 0;
+			break;
+		}
 		rtcp_exp->tuple.dst.u.udp.port = htons(port + 1);
-		if (nf_ct_expect_related(rtcp_exp) == 0)
+		ret = nf_ct_expect_related(rtcp_exp);
+		if (ret == 0)
 			break;
-		nf_ct_unexpect_related(rtp_exp);
+		else if (ret != -EBUSY) {
+			nf_ct_unexpect_related(rtp_exp);
+			port = 0;
+			break;
+		}
 	}
 
 	if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 1679e2c0963d..ee5f419d0a56 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -893,13 +893,15 @@ static void fast_csum(__sum16 *csum,
 	unsigned char s[4];
 
 	if (offset & 1) {
-		s[0] = s[2] = 0;
+		s[0] = ~0;
 		s[1] = ~*optr;
+		s[2] = 0;
 		s[3] = *nptr;
 	} else {
-		s[1] = s[3] = 0;
 		s[0] = ~*optr;
+		s[1] = ~0;
 		s[2] = *nptr;
+		s[3] = 0;
 	}
 
 	*csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum)));
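
fast_csum() above does an incremental ones'-complement checksum update: instead of resumming the whole SNMP header it feeds the complement of the old byte and the new byte, placed at the right even/odd position within a 16-bit word, into csum_partial(). Below is a minimal userspace sketch of the same idea using the RFC 1624 (eqn. 3) update formula for a whole 16-bit field; csum_update16() is an invented name, not the kernel's csum_partial()/csum_fold() helpers.

/*
 * Incremental Internet-checksum update: HC' = ~(~HC + ~m + m'),
 * recomputing a header checksum after one 16-bit field changes.
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t csum_update16(uint16_t check, uint16_t old_val, uint16_t new_val)
{
	uint32_t sum = (uint16_t)~check + (uint16_t)~old_val + new_val;

	while (sum >> 16)			/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* Example: a field changes from 0x1234 to 0xabcd under checksum 0xbeef */
	uint16_t check = 0xbeef;
	uint16_t updated = csum_update16(check, 0x1234, 0xabcd);

	printf("old check 0x%04x -> new check 0x%04x\n", check, updated);
	return 0;
}
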
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index f2d297351405..9ae5c01cd0b2 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -28,8 +28,7 @@
 #include <linux/spinlock.h>
 #include <net/protocol.h>
 
-const struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp;
-static DEFINE_SPINLOCK(inet_proto_lock);
+const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
 
 /*
  * Add a protocol handler to the hash tables
@@ -37,20 +36,10 @@ static DEFINE_SPINLOCK(inet_proto_lock);
 
 int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
 {
-	int hash, ret;
+	int hash = protocol & (MAX_INET_PROTOS - 1);
 
-	hash = protocol & (MAX_INET_PROTOS - 1);
-
-	spin_lock_bh(&inet_proto_lock);
-	if (inet_protos[hash]) {
-		ret = -1;
-	} else {
-		inet_protos[hash] = prot;
-		ret = 0;
-	}
-	spin_unlock_bh(&inet_proto_lock);
-
-	return ret;
+	return !cmpxchg((const struct net_protocol **)&inet_protos[hash],
+			NULL, prot) ? 0 : -1;
 }
 EXPORT_SYMBOL(inet_add_protocol);
 
@@ -60,18 +49,10 @@ EXPORT_SYMBOL(inet_add_protocol);
 
 int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
 {
-	int hash, ret;
-
-	hash = protocol & (MAX_INET_PROTOS - 1);
+	int ret, hash = protocol & (MAX_INET_PROTOS - 1);
 
-	spin_lock_bh(&inet_proto_lock);
-	if (inet_protos[hash] == prot) {
-		inet_protos[hash] = NULL;
-		ret = 0;
-	} else {
-		ret = -1;
-	}
-	spin_unlock_bh(&inet_proto_lock);
+	ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash],
		       prot, NULL) == prot) ? 0 : -1;
 
 	synchronize_net();
 
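
The protocol.c change drops inet_proto_lock entirely and installs or removes a handler with a single cmpxchg() on the table slot: registration succeeds only if the slot was NULL, removal only if it still holds the expected handler, with synchronize_net() providing the grace period for readers. A userspace analogue of that compare-and-swap discipline with C11 atomics is sketched below; the handler table and names are invented for illustration, and the RCU/grace-period half is deliberately omitted.

/*
 * Lock-free slot registration via compare-and-swap, mirroring the
 * cmpxchg()-based inet_add_protocol()/inet_del_protocol() above.
 */
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

#define MAX_PROTOS 256

typedef void (*proto_handler)(void);

static _Atomic(proto_handler) protos[MAX_PROTOS];

static int add_protocol(proto_handler h, unsigned char protocol)
{
	proto_handler expected = NULL;

	/* succeeds only if the slot was empty, like cmpxchg(..., NULL, prot) */
	return atomic_compare_exchange_strong(&protos[protocol], &expected, h)
		? 0 : -1;
}

static int del_protocol(proto_handler h, unsigned char protocol)
{
	proto_handler expected = h;

	/* clears the slot only if it still points at our handler */
	return atomic_compare_exchange_strong(&protos[protocol], &expected, NULL)
		? 0 : -1;
}

static void dummy_handler(void) { }

int main(void)
{
	printf("add: %d\n", add_protocol(dummy_handler, 6));		/* 0 */
	printf("add again: %d\n", add_protocol(dummy_handler, 6));	/* -1 */
	printf("del: %d\n", del_protocol(dummy_handler, 6));		/* 0 */
	return 0;
}
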
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 009a7b2aa1ef..1f85ef289895 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -505,7 +505,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 	ipc.addr = inet->inet_saddr;
 	ipc.opt = NULL;
-	ipc.shtx.flags = 0;
+	ipc.tx_flags = 0;
 	ipc.oif = sk->sk_bound_dev_if;
 
 	if (msg->msg_controllen) {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3f56b6e6c6aa..987bf9adb318 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -159,7 +159,6 @@ static struct dst_ops ipv4_dst_ops = {
159 .link_failure = ipv4_link_failure, 159 .link_failure = ipv4_link_failure,
160 .update_pmtu = ip_rt_update_pmtu, 160 .update_pmtu = ip_rt_update_pmtu,
161 .local_out = __ip_local_out, 161 .local_out = __ip_local_out,
162 .entries = ATOMIC_INIT(0),
163}; 162};
164 163
165#define ECN_OR_COST(class) TC_PRIO_##class 164#define ECN_OR_COST(class) TC_PRIO_##class
@@ -199,7 +198,7 @@ const __u8 ip_tos2prio[16] = {
199 */ 198 */
200 199
201struct rt_hash_bucket { 200struct rt_hash_bucket {
202 struct rtable *chain; 201 struct rtable __rcu *chain;
203}; 202};
204 203
205#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ 204#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
@@ -281,7 +280,7 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
281 struct rtable *r = NULL; 280 struct rtable *r = NULL;
282 281
283 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { 282 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
284 if (!rt_hash_table[st->bucket].chain) 283 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
285 continue; 284 continue;
286 rcu_read_lock_bh(); 285 rcu_read_lock_bh();
287 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); 286 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
@@ -301,17 +300,17 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
301{ 300{
302 struct rt_cache_iter_state *st = seq->private; 301 struct rt_cache_iter_state *st = seq->private;
303 302
304 r = r->dst.rt_next; 303 r = rcu_dereference_bh(r->dst.rt_next);
305 while (!r) { 304 while (!r) {
306 rcu_read_unlock_bh(); 305 rcu_read_unlock_bh();
307 do { 306 do {
308 if (--st->bucket < 0) 307 if (--st->bucket < 0)
309 return NULL; 308 return NULL;
310 } while (!rt_hash_table[st->bucket].chain); 309 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
311 rcu_read_lock_bh(); 310 rcu_read_lock_bh();
312 r = rt_hash_table[st->bucket].chain; 311 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
313 } 312 }
314 return rcu_dereference_bh(r); 313 return r;
315} 314}
316 315
317static struct rtable *rt_cache_get_next(struct seq_file *seq, 316static struct rtable *rt_cache_get_next(struct seq_file *seq,
@@ -466,7 +465,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
466 465
467 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " 466 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
468 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", 467 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
469 atomic_read(&ipv4_dst_ops.entries), 468 dst_entries_get_slow(&ipv4_dst_ops),
470 st->in_hit, 469 st->in_hit,
471 st->in_slow_tot, 470 st->in_slow_tot,
472 st->in_slow_mc, 471 st->in_slow_mc,
@@ -722,19 +721,23 @@ static void rt_do_flush(int process_context)
722 for (i = 0; i <= rt_hash_mask; i++) { 721 for (i = 0; i <= rt_hash_mask; i++) {
723 if (process_context && need_resched()) 722 if (process_context && need_resched())
724 cond_resched(); 723 cond_resched();
725 rth = rt_hash_table[i].chain; 724 rth = rcu_dereference_raw(rt_hash_table[i].chain);
726 if (!rth) 725 if (!rth)
727 continue; 726 continue;
728 727
729 spin_lock_bh(rt_hash_lock_addr(i)); 728 spin_lock_bh(rt_hash_lock_addr(i));
730#ifdef CONFIG_NET_NS 729#ifdef CONFIG_NET_NS
731 { 730 {
732 struct rtable ** prev, * p; 731 struct rtable __rcu **prev;
732 struct rtable *p;
733 733
734 rth = rt_hash_table[i].chain; 734 rth = rcu_dereference_protected(rt_hash_table[i].chain,
735 lockdep_is_held(rt_hash_lock_addr(i)));
735 736
736 /* defer releasing the head of the list after spin_unlock */ 737 /* defer releasing the head of the list after spin_unlock */
737 for (tail = rth; tail; tail = tail->dst.rt_next) 738 for (tail = rth; tail;
739 tail = rcu_dereference_protected(tail->dst.rt_next,
740 lockdep_is_held(rt_hash_lock_addr(i))))
738 if (!rt_is_expired(tail)) 741 if (!rt_is_expired(tail))
739 break; 742 break;
740 if (rth != tail) 743 if (rth != tail)
@@ -742,8 +745,12 @@ static void rt_do_flush(int process_context)
742 745
743 /* call rt_free on entries after the tail requiring flush */ 746 /* call rt_free on entries after the tail requiring flush */
744 prev = &rt_hash_table[i].chain; 747 prev = &rt_hash_table[i].chain;
745 for (p = *prev; p; p = next) { 748 for (p = rcu_dereference_protected(*prev,
746 next = p->dst.rt_next; 749 lockdep_is_held(rt_hash_lock_addr(i)));
750 p != NULL;
751 p = next) {
752 next = rcu_dereference_protected(p->dst.rt_next,
753 lockdep_is_held(rt_hash_lock_addr(i)));
747 if (!rt_is_expired(p)) { 754 if (!rt_is_expired(p)) {
748 prev = &p->dst.rt_next; 755 prev = &p->dst.rt_next;
749 } else { 756 } else {
@@ -753,14 +760,15 @@ static void rt_do_flush(int process_context)
753 } 760 }
754 } 761 }
755#else 762#else
756 rth = rt_hash_table[i].chain; 763 rth = rcu_dereference_protected(rt_hash_table[i].chain,
757 rt_hash_table[i].chain = NULL; 764 lockdep_is_held(rt_hash_lock_addr(i)));
765 rcu_assign_pointer(rt_hash_table[i].chain, NULL);
758 tail = NULL; 766 tail = NULL;
759#endif 767#endif
760 spin_unlock_bh(rt_hash_lock_addr(i)); 768 spin_unlock_bh(rt_hash_lock_addr(i));
761 769
762 for (; rth != tail; rth = next) { 770 for (; rth != tail; rth = next) {
763 next = rth->dst.rt_next; 771 next = rcu_dereference_protected(rth->dst.rt_next, 1);
764 rt_free(rth); 772 rt_free(rth);
765 } 773 }
766 } 774 }
@@ -791,7 +799,7 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
791 while (aux != rth) { 799 while (aux != rth) {
792 if (compare_hash_inputs(&aux->fl, &rth->fl)) 800 if (compare_hash_inputs(&aux->fl, &rth->fl))
793 return 0; 801 return 0;
794 aux = aux->dst.rt_next; 802 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
795 } 803 }
796 return ONE; 804 return ONE;
797} 805}
@@ -800,7 +808,8 @@ static void rt_check_expire(void)
800{ 808{
801 static unsigned int rover; 809 static unsigned int rover;
802 unsigned int i = rover, goal; 810 unsigned int i = rover, goal;
803 struct rtable *rth, **rthp; 811 struct rtable *rth;
812 struct rtable __rcu **rthp;
804 unsigned long samples = 0; 813 unsigned long samples = 0;
805 unsigned long sum = 0, sum2 = 0; 814 unsigned long sum = 0, sum2 = 0;
806 unsigned long delta; 815 unsigned long delta;
@@ -826,11 +835,12 @@ static void rt_check_expire(void)
826 835
827 samples++; 836 samples++;
828 837
829 if (*rthp == NULL) 838 if (rcu_dereference_raw(*rthp) == NULL)
830 continue; 839 continue;
831 length = 0; 840 length = 0;
832 spin_lock_bh(rt_hash_lock_addr(i)); 841 spin_lock_bh(rt_hash_lock_addr(i));
833 while ((rth = *rthp) != NULL) { 842 while ((rth = rcu_dereference_protected(*rthp,
843 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
834 prefetch(rth->dst.rt_next); 844 prefetch(rth->dst.rt_next);
835 if (rt_is_expired(rth)) { 845 if (rt_is_expired(rth)) {
836 *rthp = rth->dst.rt_next; 846 *rthp = rth->dst.rt_next;
@@ -942,9 +952,11 @@ static int rt_garbage_collect(struct dst_ops *ops)
942 static unsigned long last_gc; 952 static unsigned long last_gc;
943 static int rover; 953 static int rover;
944 static int equilibrium; 954 static int equilibrium;
945 struct rtable *rth, **rthp; 955 struct rtable *rth;
956 struct rtable __rcu **rthp;
946 unsigned long now = jiffies; 957 unsigned long now = jiffies;
947 int goal; 958 int goal;
959 int entries = dst_entries_get_fast(&ipv4_dst_ops);
948 960
949 /* 961 /*
950 * Garbage collection is pretty expensive, 962 * Garbage collection is pretty expensive,
@@ -954,28 +966,28 @@ static int rt_garbage_collect(struct dst_ops *ops)
954 RT_CACHE_STAT_INC(gc_total); 966 RT_CACHE_STAT_INC(gc_total);
955 967
956 if (now - last_gc < ip_rt_gc_min_interval && 968 if (now - last_gc < ip_rt_gc_min_interval &&
957 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) { 969 entries < ip_rt_max_size) {
958 RT_CACHE_STAT_INC(gc_ignored); 970 RT_CACHE_STAT_INC(gc_ignored);
959 goto out; 971 goto out;
960 } 972 }
961 973
974 entries = dst_entries_get_slow(&ipv4_dst_ops);
962 /* Calculate number of entries, which we want to expire now. */ 975 /* Calculate number of entries, which we want to expire now. */
963 goal = atomic_read(&ipv4_dst_ops.entries) - 976 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
964 (ip_rt_gc_elasticity << rt_hash_log);
965 if (goal <= 0) { 977 if (goal <= 0) {
966 if (equilibrium < ipv4_dst_ops.gc_thresh) 978 if (equilibrium < ipv4_dst_ops.gc_thresh)
967 equilibrium = ipv4_dst_ops.gc_thresh; 979 equilibrium = ipv4_dst_ops.gc_thresh;
968 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; 980 goal = entries - equilibrium;
969 if (goal > 0) { 981 if (goal > 0) {
970 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); 982 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
971 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; 983 goal = entries - equilibrium;
972 } 984 }
973 } else { 985 } else {
974 /* We are in dangerous area. Try to reduce cache really 986 /* We are in dangerous area. Try to reduce cache really
975 * aggressively. 987 * aggressively.
976 */ 988 */
977 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); 989 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
978 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal; 990 equilibrium = entries - goal;
979 } 991 }
980 992
981 if (now - last_gc >= ip_rt_gc_min_interval) 993 if (now - last_gc >= ip_rt_gc_min_interval)
@@ -995,7 +1007,8 @@ static int rt_garbage_collect(struct dst_ops *ops)
995 k = (k + 1) & rt_hash_mask; 1007 k = (k + 1) & rt_hash_mask;
996 rthp = &rt_hash_table[k].chain; 1008 rthp = &rt_hash_table[k].chain;
997 spin_lock_bh(rt_hash_lock_addr(k)); 1009 spin_lock_bh(rt_hash_lock_addr(k));
998 while ((rth = *rthp) != NULL) { 1010 while ((rth = rcu_dereference_protected(*rthp,
1011 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
999 if (!rt_is_expired(rth) && 1012 if (!rt_is_expired(rth) &&
1000 !rt_may_expire(rth, tmo, expire)) { 1013 !rt_may_expire(rth, tmo, expire)) {
1001 tmo >>= 1; 1014 tmo >>= 1;
@@ -1032,14 +1045,16 @@ static int rt_garbage_collect(struct dst_ops *ops)
1032 expire >>= 1; 1045 expire >>= 1;
1033#if RT_CACHE_DEBUG >= 2 1046#if RT_CACHE_DEBUG >= 2
1034 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, 1047 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1035 atomic_read(&ipv4_dst_ops.entries), goal, i); 1048 dst_entries_get_fast(&ipv4_dst_ops), goal, i);
1036#endif 1049#endif
1037 1050
1038 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) 1051 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1039 goto out; 1052 goto out;
1040 } while (!in_softirq() && time_before_eq(jiffies, now)); 1053 } while (!in_softirq() && time_before_eq(jiffies, now));
1041 1054
1042 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) 1055 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1056 goto out;
1057 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1043 goto out; 1058 goto out;
1044 if (net_ratelimit()) 1059 if (net_ratelimit())
1045 printk(KERN_WARNING "dst cache overflow\n"); 1060 printk(KERN_WARNING "dst cache overflow\n");
@@ -1049,11 +1064,12 @@ static int rt_garbage_collect(struct dst_ops *ops)
1049work_done: 1064work_done:
1050 expire += ip_rt_gc_min_interval; 1065 expire += ip_rt_gc_min_interval;
1051 if (expire > ip_rt_gc_timeout || 1066 if (expire > ip_rt_gc_timeout ||
1052 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) 1067 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1068 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1053 expire = ip_rt_gc_timeout; 1069 expire = ip_rt_gc_timeout;
1054#if RT_CACHE_DEBUG >= 2 1070#if RT_CACHE_DEBUG >= 2
1055 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, 1071 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1056 atomic_read(&ipv4_dst_ops.entries), goal, rover); 1072 dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
1057#endif 1073#endif
1058out: return 0; 1074out: return 0;
1059} 1075}
@@ -1068,7 +1084,7 @@ static int slow_chain_length(const struct rtable *head)
1068 1084
1069 while (rth) { 1085 while (rth) {
1070 length += has_noalias(head, rth); 1086 length += has_noalias(head, rth);
1071 rth = rth->dst.rt_next; 1087 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1072 } 1088 }
1073 return length >> FRACT_BITS; 1089 return length >> FRACT_BITS;
1074} 1090}
@@ -1076,9 +1092,9 @@ static int slow_chain_length(const struct rtable *head)
1076static int rt_intern_hash(unsigned hash, struct rtable *rt, 1092static int rt_intern_hash(unsigned hash, struct rtable *rt,
1077 struct rtable **rp, struct sk_buff *skb, int ifindex) 1093 struct rtable **rp, struct sk_buff *skb, int ifindex)
1078{ 1094{
1079 struct rtable *rth, **rthp; 1095 struct rtable *rth, *cand;
1096 struct rtable __rcu **rthp, **candp;
1080 unsigned long now; 1097 unsigned long now;
1081 struct rtable *cand, **candp;
1082 u32 min_score; 1098 u32 min_score;
1083 int chain_length; 1099 int chain_length;
1084 int attempts = !in_softirq(); 1100 int attempts = !in_softirq();
@@ -1102,30 +1118,31 @@ restart:
1102 * Note that we do rt_free on this new route entry, so that 1118 * Note that we do rt_free on this new route entry, so that
1103 * once its refcount hits zero, we are still able to reap it 1119 * once its refcount hits zero, we are still able to reap it
1104 * (Thanks Alexey) 1120 * (Thanks Alexey)
1105 * Note also the rt_free uses call_rcu. We don't actually 1121 * Note: To avoid expensive rcu stuff for this uncached dst,
1106 * need rcu protection here, this is just our path to get 1122 * we set DST_NOCACHE so that dst_release() can free dst without
1107 * on the route gc list. 1123 * waiting a grace period.
1108 */ 1124 */
1109 1125
1126 rt->dst.flags |= DST_NOCACHE;
1110 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 1127 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1111 int err = arp_bind_neighbour(&rt->dst); 1128 int err = arp_bind_neighbour(&rt->dst);
1112 if (err) { 1129 if (err) {
1113 if (net_ratelimit()) 1130 if (net_ratelimit())
1114 printk(KERN_WARNING 1131 printk(KERN_WARNING
1115 "Neighbour table failure & not caching routes.\n"); 1132 "Neighbour table failure & not caching routes.\n");
1116 rt_drop(rt); 1133 ip_rt_put(rt);
1117 return err; 1134 return err;
1118 } 1135 }
1119 } 1136 }
1120 1137
1121 rt_free(rt);
1122 goto skip_hashing; 1138 goto skip_hashing;
1123 } 1139 }
1124 1140
1125 rthp = &rt_hash_table[hash].chain; 1141 rthp = &rt_hash_table[hash].chain;
1126 1142
1127 spin_lock_bh(rt_hash_lock_addr(hash)); 1143 spin_lock_bh(rt_hash_lock_addr(hash));
1128 while ((rth = *rthp) != NULL) { 1144 while ((rth = rcu_dereference_protected(*rthp,
1145 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1129 if (rt_is_expired(rth)) { 1146 if (rt_is_expired(rth)) {
1130 *rthp = rth->dst.rt_next; 1147 *rthp = rth->dst.rt_next;
1131 rt_free(rth); 1148 rt_free(rth);
@@ -1231,7 +1248,7 @@ restart:
1231 } 1248 }
1232 1249
1233 if (net_ratelimit()) 1250 if (net_ratelimit())
1234 printk(KERN_WARNING "Neighbour table overflow.\n"); 1251 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1235 rt_drop(rt); 1252 rt_drop(rt);
1236 return -ENOBUFS; 1253 return -ENOBUFS;
1237 } 1254 }
@@ -1268,18 +1285,11 @@ skip_hashing:
1268 1285
1269void rt_bind_peer(struct rtable *rt, int create) 1286void rt_bind_peer(struct rtable *rt, int create)
1270{ 1287{
1271 static DEFINE_SPINLOCK(rt_peer_lock);
1272 struct inet_peer *peer; 1288 struct inet_peer *peer;
1273 1289
1274 peer = inet_getpeer(rt->rt_dst, create); 1290 peer = inet_getpeer(rt->rt_dst, create);
1275 1291
1276 spin_lock_bh(&rt_peer_lock); 1292 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1277 if (rt->peer == NULL) {
1278 rt->peer = peer;
1279 peer = NULL;
1280 }
1281 spin_unlock_bh(&rt_peer_lock);
1282 if (peer)
1283 inet_putpeer(peer); 1293 inet_putpeer(peer);
1284} 1294}
1285 1295
@@ -1328,12 +1338,14 @@ EXPORT_SYMBOL(__ip_select_ident);
1328 1338
1329static void rt_del(unsigned hash, struct rtable *rt) 1339static void rt_del(unsigned hash, struct rtable *rt)
1330{ 1340{
1331 struct rtable **rthp, *aux; 1341 struct rtable __rcu **rthp;
1342 struct rtable *aux;
1332 1343
1333 rthp = &rt_hash_table[hash].chain; 1344 rthp = &rt_hash_table[hash].chain;
1334 spin_lock_bh(rt_hash_lock_addr(hash)); 1345 spin_lock_bh(rt_hash_lock_addr(hash));
1335 ip_rt_put(rt); 1346 ip_rt_put(rt);
1336 while ((aux = *rthp) != NULL) { 1347 while ((aux = rcu_dereference_protected(*rthp,
1348 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1337 if (aux == rt || rt_is_expired(aux)) { 1349 if (aux == rt || rt_is_expired(aux)) {
1338 *rthp = aux->dst.rt_next; 1350 *rthp = aux->dst.rt_next;
1339 rt_free(aux); 1351 rt_free(aux);
@@ -1350,7 +1362,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1350{ 1362{
1351 int i, k; 1363 int i, k;
1352 struct in_device *in_dev = __in_dev_get_rcu(dev); 1364 struct in_device *in_dev = __in_dev_get_rcu(dev);
1353 struct rtable *rth, **rthp; 1365 struct rtable *rth;
1366 struct rtable __rcu **rthp;
1354 __be32 skeys[2] = { saddr, 0 }; 1367 __be32 skeys[2] = { saddr, 0 };
1355 int ikeys[2] = { dev->ifindex, 0 }; 1368 int ikeys[2] = { dev->ifindex, 0 };
1356 struct netevent_redirect netevent; 1369 struct netevent_redirect netevent;
@@ -1383,7 +1396,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1383 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], 1396 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1384 rt_genid(net)); 1397 rt_genid(net));
1385 1398
1386 rthp=&rt_hash_table[hash].chain; 1399 rthp = &rt_hash_table[hash].chain;
1387 1400
1388 while ((rth = rcu_dereference(*rthp)) != NULL) { 1401 while ((rth = rcu_dereference(*rthp)) != NULL) {
1389 struct rtable *rt; 1402 struct rtable *rt;
@@ -1779,12 +1792,15 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
1779 1792
1780 if (rt->fl.iif == 0) 1793 if (rt->fl.iif == 0)
1781 src = rt->rt_src; 1794 src = rt->rt_src;
1782 else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) { 1795 else {
1783 src = FIB_RES_PREFSRC(res); 1796 rcu_read_lock();
1784 fib_res_put(&res); 1797 if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
1785 } else 1798 src = FIB_RES_PREFSRC(res);
1786 src = inet_select_addr(rt->dst.dev, rt->rt_gateway, 1799 else
1800 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1787 RT_SCOPE_UNIVERSE); 1801 RT_SCOPE_UNIVERSE);
1802 rcu_read_unlock();
1803 }
1788 memcpy(addr, &src, 4); 1804 memcpy(addr, &src, 4);
1789} 1805}
1790 1806
@@ -2087,6 +2103,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
2087 * Such approach solves two big problems: 2103 * Such approach solves two big problems:
2088 * 1. Not simplex devices are handled properly. 2104 * 1. Not simplex devices are handled properly.
2089 * 2. IP spoofing attempts are filtered with 100% of guarantee. 2105 * 2. IP spoofing attempts are filtered with 100% of guarantee.
2106 * called with rcu_read_lock()
2090 */ 2107 */
2091 2108
2092static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2109static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2108,7 +2125,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2108 unsigned hash; 2125 unsigned hash;
2109 __be32 spec_dst; 2126 __be32 spec_dst;
2110 int err = -EINVAL; 2127 int err = -EINVAL;
2111 int free_res = 0;
2112 struct net * net = dev_net(dev); 2128 struct net * net = dev_net(dev);
2113 2129
2114 /* IP on this device is disabled. */ 2130 /* IP on this device is disabled. */
@@ -2124,7 +2140,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2124 ipv4_is_loopback(saddr)) 2140 ipv4_is_loopback(saddr))
2125 goto martian_source; 2141 goto martian_source;
2126 2142
2127 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0)) 2143 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2128 goto brd_input; 2144 goto brd_input;
2129 2145
2130 /* Accept zero addresses only to limited broadcast; 2146 /* Accept zero addresses only to limited broadcast;
@@ -2133,19 +2149,18 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2133 if (ipv4_is_zeronet(saddr)) 2149 if (ipv4_is_zeronet(saddr))
2134 goto martian_source; 2150 goto martian_source;
2135 2151
2136 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) || 2152 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2137 ipv4_is_loopback(daddr))
2138 goto martian_destination; 2153 goto martian_destination;
2139 2154
2140 /* 2155 /*
2141 * Now we are ready to route packet. 2156 * Now we are ready to route packet.
2142 */ 2157 */
2143 if ((err = fib_lookup(net, &fl, &res)) != 0) { 2158 err = fib_lookup(net, &fl, &res);
2159 if (err != 0) {
2144 if (!IN_DEV_FORWARD(in_dev)) 2160 if (!IN_DEV_FORWARD(in_dev))
2145 goto e_hostunreach; 2161 goto e_hostunreach;
2146 goto no_route; 2162 goto no_route;
2147 } 2163 }
2148 free_res = 1;
2149 2164
2150 RT_CACHE_STAT_INC(in_slow_tot); 2165 RT_CACHE_STAT_INC(in_slow_tot);
2151 2166
@@ -2154,8 +2169,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2154 2169
2155 if (res.type == RTN_LOCAL) { 2170 if (res.type == RTN_LOCAL) {
2156 err = fib_validate_source(saddr, daddr, tos, 2171 err = fib_validate_source(saddr, daddr, tos,
2157 net->loopback_dev->ifindex, 2172 net->loopback_dev->ifindex,
2158 dev, &spec_dst, &itag, skb->mark); 2173 dev, &spec_dst, &itag, skb->mark);
2159 if (err < 0) 2174 if (err < 0)
2160 goto martian_source_keep_err; 2175 goto martian_source_keep_err;
2161 if (err) 2176 if (err)
@@ -2170,9 +2185,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2170 goto martian_destination; 2185 goto martian_destination;
2171 2186
2172 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); 2187 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2173done:
2174 if (free_res)
2175 fib_res_put(&res);
2176out: return err; 2188out: return err;
2177 2189
2178brd_input: 2190brd_input:
@@ -2232,7 +2244,7 @@ local_input:
2232 rth->rt_type = res.type; 2244 rth->rt_type = res.type;
2233 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); 2245 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2234 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); 2246 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2235 goto done; 2247 goto out;
2236 2248
2237no_route: 2249no_route:
2238 RT_CACHE_STAT_INC(in_no_route); 2250 RT_CACHE_STAT_INC(in_no_route);
@@ -2255,21 +2267,21 @@ martian_destination:
2255 2267
2256e_hostunreach: 2268e_hostunreach:
2257 err = -EHOSTUNREACH; 2269 err = -EHOSTUNREACH;
2258 goto done; 2270 goto out;
2259 2271
2260e_inval: 2272e_inval:
2261 err = -EINVAL; 2273 err = -EINVAL;
2262 goto done; 2274 goto out;
2263 2275
2264e_nobufs: 2276e_nobufs:
2265 err = -ENOBUFS; 2277 err = -ENOBUFS;
2266 goto done; 2278 goto out;
2267 2279
2268martian_source: 2280martian_source:
2269 err = -EINVAL; 2281 err = -EINVAL;
2270martian_source_keep_err: 2282martian_source_keep_err:
2271 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); 2283 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2272 goto done; 2284 goto out;
2273} 2285}
2274 2286
2275int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2287int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2355,6 +2367,7 @@ skip_cache:
2355} 2367}
2356EXPORT_SYMBOL(ip_route_input_common); 2368EXPORT_SYMBOL(ip_route_input_common);
2357 2369
2370/* called with rcu_read_lock() */
2358static int __mkroute_output(struct rtable **result, 2371static int __mkroute_output(struct rtable **result,
2359 struct fib_result *res, 2372 struct fib_result *res,
2360 const struct flowi *fl, 2373 const struct flowi *fl,
@@ -2365,53 +2378,47 @@ static int __mkroute_output(struct rtable **result,
2365 struct rtable *rth; 2378 struct rtable *rth;
2366 struct in_device *in_dev; 2379 struct in_device *in_dev;
2367 u32 tos = RT_FL_TOS(oldflp); 2380 u32 tos = RT_FL_TOS(oldflp);
2368 int err = 0;
2369 2381
2370 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) 2382 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
2371 return -EINVAL; 2383 return -EINVAL;
2372 2384
2373 if (fl->fl4_dst == htonl(0xFFFFFFFF)) 2385 if (ipv4_is_lbcast(fl->fl4_dst))
2374 res->type = RTN_BROADCAST; 2386 res->type = RTN_BROADCAST;
2375 else if (ipv4_is_multicast(fl->fl4_dst)) 2387 else if (ipv4_is_multicast(fl->fl4_dst))
2376 res->type = RTN_MULTICAST; 2388 res->type = RTN_MULTICAST;
2377 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst)) 2389 else if (ipv4_is_zeronet(fl->fl4_dst))
2378 return -EINVAL; 2390 return -EINVAL;
2379 2391
2380 if (dev_out->flags & IFF_LOOPBACK) 2392 if (dev_out->flags & IFF_LOOPBACK)
2381 flags |= RTCF_LOCAL; 2393 flags |= RTCF_LOCAL;
2382 2394
2383 /* get work reference to inet device */ 2395 in_dev = __in_dev_get_rcu(dev_out);
2384 in_dev = in_dev_get(dev_out);
2385 if (!in_dev) 2396 if (!in_dev)
2386 return -EINVAL; 2397 return -EINVAL;
2387 2398
2388 if (res->type == RTN_BROADCAST) { 2399 if (res->type == RTN_BROADCAST) {
2389 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2400 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2390 if (res->fi) { 2401 res->fi = NULL;
2391 fib_info_put(res->fi);
2392 res->fi = NULL;
2393 }
2394 } else if (res->type == RTN_MULTICAST) { 2402 } else if (res->type == RTN_MULTICAST) {
2395 flags |= RTCF_MULTICAST|RTCF_LOCAL; 2403 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2396 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, 2404 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2397 oldflp->proto)) 2405 oldflp->proto))
2398 flags &= ~RTCF_LOCAL; 2406 flags &= ~RTCF_LOCAL;
2399 /* If multicast route do not exist use 2407 /* If multicast route do not exist use
2400 default one, but do not gateway in this case. 2408 * default one, but do not gateway in this case.
2401 Yes, it is hack. 2409 * Yes, it is hack.
2402 */ 2410 */
2403 if (res->fi && res->prefixlen < 4) { 2411 if (res->fi && res->prefixlen < 4)
2404 fib_info_put(res->fi);
2405 res->fi = NULL; 2412 res->fi = NULL;
2406 }
2407 } 2413 }
2408 2414
2409 2415
2410 rth = dst_alloc(&ipv4_dst_ops); 2416 rth = dst_alloc(&ipv4_dst_ops);
2411 if (!rth) { 2417 if (!rth)
2412 err = -ENOBUFS; 2418 return -ENOBUFS;
2413 goto cleanup; 2419
2414 } 2420 in_dev_hold(in_dev);
2421 rth->idev = in_dev;
2415 2422
2416 atomic_set(&rth->dst.__refcnt, 1); 2423 atomic_set(&rth->dst.__refcnt, 1);
2417 rth->dst.flags= DST_HOST; 2424 rth->dst.flags= DST_HOST;
@@ -2432,7 +2439,6 @@ static int __mkroute_output(struct rtable **result,
2432 cache entry */ 2439 cache entry */
2433 rth->dst.dev = dev_out; 2440 rth->dst.dev = dev_out;
2434 dev_hold(dev_out); 2441 dev_hold(dev_out);
2435 rth->idev = in_dev_get(dev_out);
2436 rth->rt_gateway = fl->fl4_dst; 2442 rth->rt_gateway = fl->fl4_dst;
2437 rth->rt_spec_dst= fl->fl4_src; 2443 rth->rt_spec_dst= fl->fl4_src;
2438 2444
@@ -2467,15 +2473,11 @@ static int __mkroute_output(struct rtable **result,
2467 rt_set_nexthop(rth, res, 0); 2473 rt_set_nexthop(rth, res, 0);
2468 2474
2469 rth->rt_flags = flags; 2475 rth->rt_flags = flags;
2470
2471 *result = rth; 2476 *result = rth;
2472 cleanup: 2477 return 0;
2473 /* release work reference to inet device */
2474 in_dev_put(in_dev);
2475
2476 return err;
2477} 2478}
2478 2479
2480/* called with rcu_read_lock() */
2479static int ip_mkroute_output(struct rtable **rp, 2481static int ip_mkroute_output(struct rtable **rp,
2480 struct fib_result *res, 2482 struct fib_result *res,
2481 const struct flowi *fl, 2483 const struct flowi *fl,
@@ -2497,6 +2499,7 @@ static int ip_mkroute_output(struct rtable **rp,
2497 2499
2498/* 2500/*
2499 * Major route resolver routine. 2501 * Major route resolver routine.
2502 * called with rcu_read_lock();
2500 */ 2503 */
2501 2504
2502static int ip_route_output_slow(struct net *net, struct rtable **rp, 2505static int ip_route_output_slow(struct net *net, struct rtable **rp,
@@ -2515,9 +2518,8 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2515 .iif = net->loopback_dev->ifindex, 2518 .iif = net->loopback_dev->ifindex,
2516 .oif = oldflp->oif }; 2519 .oif = oldflp->oif };
2517 struct fib_result res; 2520 struct fib_result res;
2518 unsigned flags = 0; 2521 unsigned int flags = 0;
2519 struct net_device *dev_out = NULL; 2522 struct net_device *dev_out = NULL;
2520 int free_res = 0;
2521 int err; 2523 int err;
2522 2524
2523 2525
@@ -2543,9 +2545,9 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2543 2545
2544 if (oldflp->oif == 0 && 2546 if (oldflp->oif == 0 &&
2545 (ipv4_is_multicast(oldflp->fl4_dst) || 2547 (ipv4_is_multicast(oldflp->fl4_dst) ||
2546 oldflp->fl4_dst == htonl(0xFFFFFFFF))) { 2548 ipv4_is_lbcast(oldflp->fl4_dst))) {
2547 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2549 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2548 dev_out = ip_dev_find(net, oldflp->fl4_src); 2550 dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
2549 if (dev_out == NULL) 2551 if (dev_out == NULL)
2550 goto out; 2552 goto out;
2551 2553
@@ -2570,29 +2572,24 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2570 2572
2571 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { 2573 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2572 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2574 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2573 dev_out = ip_dev_find(net, oldflp->fl4_src); 2575 if (!__ip_dev_find(net, oldflp->fl4_src, false))
2574 if (dev_out == NULL)
2575 goto out; 2576 goto out;
2576 dev_put(dev_out);
2577 dev_out = NULL;
2578 } 2577 }
2579 } 2578 }
2580 2579
2581 2580
2582 if (oldflp->oif) { 2581 if (oldflp->oif) {
2583 dev_out = dev_get_by_index(net, oldflp->oif); 2582 dev_out = dev_get_by_index_rcu(net, oldflp->oif);
2584 err = -ENODEV; 2583 err = -ENODEV;
2585 if (dev_out == NULL) 2584 if (dev_out == NULL)
2586 goto out; 2585 goto out;
2587 2586
2588 /* RACE: Check return value of inet_select_addr instead. */ 2587 /* RACE: Check return value of inet_select_addr instead. */
2589 if (__in_dev_get_rtnl(dev_out) == NULL) { 2588 if (rcu_dereference(dev_out->ip_ptr) == NULL)
2590 dev_put(dev_out);
2591 goto out; /* Wrong error code */ 2589 goto out; /* Wrong error code */
2592 }
2593 2590
2594 if (ipv4_is_local_multicast(oldflp->fl4_dst) || 2591 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2595 oldflp->fl4_dst == htonl(0xFFFFFFFF)) { 2592 ipv4_is_lbcast(oldflp->fl4_dst)) {
2596 if (!fl.fl4_src) 2593 if (!fl.fl4_src)
2597 fl.fl4_src = inet_select_addr(dev_out, 0, 2594 fl.fl4_src = inet_select_addr(dev_out, 0,
2598 RT_SCOPE_LINK); 2595 RT_SCOPE_LINK);
@@ -2612,10 +2609,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2612 fl.fl4_dst = fl.fl4_src; 2609 fl.fl4_dst = fl.fl4_src;
2613 if (!fl.fl4_dst) 2610 if (!fl.fl4_dst)
2614 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); 2611 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2615 if (dev_out)
2616 dev_put(dev_out);
2617 dev_out = net->loopback_dev; 2612 dev_out = net->loopback_dev;
2618 dev_hold(dev_out);
2619 fl.oif = net->loopback_dev->ifindex; 2613 fl.oif = net->loopback_dev->ifindex;
2620 res.type = RTN_LOCAL; 2614 res.type = RTN_LOCAL;
2621 flags |= RTCF_LOCAL; 2615 flags |= RTCF_LOCAL;
@@ -2649,23 +2643,15 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2649 res.type = RTN_UNICAST; 2643 res.type = RTN_UNICAST;
2650 goto make_route; 2644 goto make_route;
2651 } 2645 }
2652 if (dev_out)
2653 dev_put(dev_out);
2654 err = -ENETUNREACH; 2646 err = -ENETUNREACH;
2655 goto out; 2647 goto out;
2656 } 2648 }
2657 free_res = 1;
2658 2649
2659 if (res.type == RTN_LOCAL) { 2650 if (res.type == RTN_LOCAL) {
2660 if (!fl.fl4_src) 2651 if (!fl.fl4_src)
2661 fl.fl4_src = fl.fl4_dst; 2652 fl.fl4_src = fl.fl4_dst;
2662 if (dev_out)
2663 dev_put(dev_out);
2664 dev_out = net->loopback_dev; 2653 dev_out = net->loopback_dev;
2665 dev_hold(dev_out);
2666 fl.oif = dev_out->ifindex; 2654 fl.oif = dev_out->ifindex;
2667 if (res.fi)
2668 fib_info_put(res.fi);
2669 res.fi = NULL; 2655 res.fi = NULL;
2670 flags |= RTCF_LOCAL; 2656 flags |= RTCF_LOCAL;
2671 goto make_route; 2657 goto make_route;
@@ -2682,28 +2668,21 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2682 if (!fl.fl4_src) 2668 if (!fl.fl4_src)
2683 fl.fl4_src = FIB_RES_PREFSRC(res); 2669 fl.fl4_src = FIB_RES_PREFSRC(res);
2684 2670
2685 if (dev_out)
2686 dev_put(dev_out);
2687 dev_out = FIB_RES_DEV(res); 2671 dev_out = FIB_RES_DEV(res);
2688 dev_hold(dev_out);
2689 fl.oif = dev_out->ifindex; 2672 fl.oif = dev_out->ifindex;
2690 2673
2691 2674
2692make_route: 2675make_route:
2693 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); 2676 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2694 2677
2695
2696 if (free_res)
2697 fib_res_put(&res);
2698 if (dev_out)
2699 dev_put(dev_out);
2700out: return err; 2678out: return err;
2701} 2679}
2702 2680
2703int __ip_route_output_key(struct net *net, struct rtable **rp, 2681int __ip_route_output_key(struct net *net, struct rtable **rp,
2704 const struct flowi *flp) 2682 const struct flowi *flp)
2705{ 2683{
2706 unsigned hash; 2684 unsigned int hash;
2685 int res;
2707 struct rtable *rth; 2686 struct rtable *rth;
2708 2687
2709 if (!rt_caching(net)) 2688 if (!rt_caching(net))
@@ -2734,10 +2713,18 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
2734 rcu_read_unlock_bh(); 2713 rcu_read_unlock_bh();
2735 2714
2736slow_output: 2715slow_output:
2737 return ip_route_output_slow(net, rp, flp); 2716 rcu_read_lock();
2717 res = ip_route_output_slow(net, rp, flp);
2718 rcu_read_unlock();
2719 return res;
2738} 2720}
2739EXPORT_SYMBOL_GPL(__ip_route_output_key); 2721EXPORT_SYMBOL_GPL(__ip_route_output_key);
2740 2722
2723static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2724{
2725 return NULL;
2726}
2727
2741static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) 2728static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2742{ 2729{
2743} 2730}
@@ -2746,9 +2733,8 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
2746 .family = AF_INET, 2733 .family = AF_INET,
2747 .protocol = cpu_to_be16(ETH_P_IP), 2734 .protocol = cpu_to_be16(ETH_P_IP),
2748 .destroy = ipv4_dst_destroy, 2735 .destroy = ipv4_dst_destroy,
2749 .check = ipv4_dst_check, 2736 .check = ipv4_blackhole_dst_check,
2750 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2737 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2751 .entries = ATOMIC_INIT(0),
2752}; 2738};
2753 2739
2754 2740
@@ -2793,7 +2779,7 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
2793 2779
2794 dst_release(&(*rp)->dst); 2780 dst_release(&(*rp)->dst);
2795 *rp = rt; 2781 *rp = rt;
2796 return (rt ? 0 : -ENOMEM); 2782 return rt ? 0 : -ENOMEM;
2797} 2783}
2798 2784
2799int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, 2785int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
@@ -3318,6 +3304,12 @@ int __init ip_rt_init(void)
3318 3304
3319 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; 3305 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3320 3306
3307 if (dst_entries_init(&ipv4_dst_ops) < 0)
3308 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3309
3310 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3311 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3312
3321 rt_hash_table = (struct rt_hash_bucket *) 3313 rt_hash_table = (struct rt_hash_bucket *)
3322 alloc_large_system_hash("IP route cache", 3314 alloc_large_system_hash("IP route cache",
3323 sizeof(struct rt_hash_bucket), 3315 sizeof(struct rt_hash_bucket),
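
Most of the route.c churn above annotates the hash-chain pointers as __rcu and replaces bare loads with rcu_dereference()/rcu_dereference_protected()/rcu_dereference_raw(), so readers always see a consistently published chain while writers still modify it under the per-bucket lock. The sketch below is a userspace analogue of just the publish/read side using C11 release/acquire atomics (rcu_assign_pointer() roughly corresponds to the release store, rcu_dereference() to the ordered load; acquire is used here for simplicity). It is not kernel RCU: grace periods and reclamation are left out, and a single writer is assumed.

/*
 * Publish fully initialised nodes with a release store; readers traverse
 * the list after an acquire load of the head.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	int key;
	struct node *next;
};

static _Atomic(struct node *) head;

static void publish(int key)
{
	struct node *n = malloc(sizeof(*n));

	n->key = key;
	/* link to the current head; single writer assumed in this sketch */
	n->next = atomic_load_explicit(&head, memory_order_relaxed);
	/* release store: readers that see 'n' also see its initialised fields */
	atomic_store_explicit(&head, n, memory_order_release);
}

static int lookup(int key)
{
	struct node *n = atomic_load_explicit(&head, memory_order_acquire);

	for (; n; n = n->next)
		if (n->key == key)
			return 1;
	return 0;
}

int main(void)
{
	publish(10);
	publish(20);
	printf("lookup(10) = %d, lookup(30) = %d\n", lookup(10), lookup(30));
	return 0;
}
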
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 176e11aaea77..1664a0590bb8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -386,8 +386,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
386 */ 386 */
387 387
388 mask = 0; 388 mask = 0;
389 if (sk->sk_err)
390 mask = POLLERR;
391 389
392 /* 390 /*
393 * POLLHUP is certainly not done right. But poll() doesn't 391 * POLLHUP is certainly not done right. But poll() doesn't
@@ -451,11 +449,17 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
451 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) 449 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
452 mask |= POLLOUT | POLLWRNORM; 450 mask |= POLLOUT | POLLWRNORM;
453 } 451 }
454 } 452 } else
453 mask |= POLLOUT | POLLWRNORM;
455 454
456 if (tp->urg_data & TCP_URG_VALID) 455 if (tp->urg_data & TCP_URG_VALID)
457 mask |= POLLPRI; 456 mask |= POLLPRI;
458 } 457 }
458 /* This barrier is coupled with smp_wmb() in tcp_reset() */
459 smp_rmb();
460 if (sk->sk_err)
461 mask |= POLLERR;
462
459 return mask; 463 return mask;
460} 464}
461EXPORT_SYMBOL(tcp_poll); 465EXPORT_SYMBOL(tcp_poll);
@@ -939,7 +943,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
939 sg = sk->sk_route_caps & NETIF_F_SG; 943 sg = sk->sk_route_caps & NETIF_F_SG;
940 944
941 while (--iovlen >= 0) { 945 while (--iovlen >= 0) {
942 int seglen = iov->iov_len; 946 size_t seglen = iov->iov_len;
943 unsigned char __user *from = iov->iov_base; 947 unsigned char __user *from = iov->iov_base;
944 948
945 iov++; 949 iov++;
@@ -2011,11 +2015,8 @@ adjudge_to_death:
2011 } 2015 }
2012 } 2016 }
2013 if (sk->sk_state != TCP_CLOSE) { 2017 if (sk->sk_state != TCP_CLOSE) {
2014 int orphan_count = percpu_counter_read_positive(
2015 sk->sk_prot->orphan_count);
2016
2017 sk_mem_reclaim(sk); 2018 sk_mem_reclaim(sk);
2018 if (tcp_too_many_orphans(sk, orphan_count)) { 2019 if (tcp_too_many_orphans(sk, 0)) {
2019 if (net_ratelimit()) 2020 if (net_ratelimit())
2020 printk(KERN_INFO "TCP: too many of orphaned " 2021 printk(KERN_INFO "TCP: too many of orphaned "
2021 "sockets\n"); 2022 "sockets\n");
@@ -2391,7 +2392,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2391 err = tp->af_specific->md5_parse(sk, optval, optlen); 2392 err = tp->af_specific->md5_parse(sk, optval, optlen);
2392 break; 2393 break;
2393#endif 2394#endif
2394 2395 case TCP_USER_TIMEOUT:
2396 /* Cap the max timeout in ms TCP will retry/retrans
2397 * before giving up and aborting (ETIMEDOUT) a connection.
2398 */
2399 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2400 break;
2395 default: 2401 default:
2396 err = -ENOPROTOOPT; 2402 err = -ENOPROTOOPT;
2397 break; 2403 break;
@@ -2610,6 +2616,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2610 case TCP_THIN_DUPACK: 2616 case TCP_THIN_DUPACK:
2611 val = tp->thin_dupack; 2617 val = tp->thin_dupack;
2612 break; 2618 break;
2619
2620 case TCP_USER_TIMEOUT:
2621 val = jiffies_to_msecs(icsk->icsk_user_timeout);
2622 break;
2613 default: 2623 default:
2614 return -ENOPROTOOPT; 2624 return -ENOPROTOOPT;
2615 } 2625 }
@@ -3212,7 +3222,7 @@ void __init tcp_init(void)
3212{ 3222{
3213 struct sk_buff *skb = NULL; 3223 struct sk_buff *skb = NULL;
3214 unsigned long nr_pages, limit; 3224 unsigned long nr_pages, limit;
3215 int order, i, max_share; 3225 int i, max_share, cnt;
3216 unsigned long jiffy = jiffies; 3226 unsigned long jiffy = jiffies;
3217 3227
3218 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); 3228 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
@@ -3261,22 +3271,12 @@ void __init tcp_init(void)
3261 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); 3271 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
3262 } 3272 }
3263 3273
3264 /* Try to be a bit smarter and adjust defaults depending 3274
3265 * on available memory. 3275 cnt = tcp_hashinfo.ehash_mask + 1;
3266 */ 3276
3267 for (order = 0; ((1 << order) << PAGE_SHIFT) < 3277 tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3268 (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); 3278 sysctl_tcp_max_orphans = cnt / 2;
3269 order++) 3279 sysctl_max_syn_backlog = max(128, cnt / 256);
3270 ;
3271 if (order >= 4) {
3272 tcp_death_row.sysctl_max_tw_buckets = 180000;
3273 sysctl_tcp_max_orphans = 4096 << (order - 4);
3274 sysctl_max_syn_backlog = 1024;
3275 } else if (order < 3) {
3276 tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
3277 sysctl_tcp_max_orphans >>= (3 - order);
3278 sysctl_max_syn_backlog = 128;
3279 }
3280 3280
3281 /* Set the pressure threshold to be a fraction of global memory that 3281 /* Set the pressure threshold to be a fraction of global memory that
3282 * is up to 1/2 at 256 MB, decreasing toward zero with the amount of 3282 * is up to 1/2 at 256 MB, decreasing toward zero with the amount of
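
Besides the tcp_poll()/orphan changes, the tcp.c hunks above introduce the TCP_USER_TIMEOUT socket option, which caps (in milliseconds) how long transmitted data may stay unacknowledged before the connection is aborted with ETIMEDOUT. A short userspace usage sketch follows; it assumes headers that define TCP_USER_TIMEOUT, i.e. a 2.6.37-or-later kernel with matching <netinet/tcp.h> or <linux/tcp.h>.

/*
 * Set and read back TCP_USER_TIMEOUT on a TCP socket.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	unsigned int timeout_ms = 30000;	/* abort after 30 s of unacked data */
	unsigned int readback = 0;
	socklen_t len = sizeof(readback);

	if (setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT,
		       &timeout_ms, sizeof(timeout_ms)) < 0)
		perror("setsockopt(TCP_USER_TIMEOUT)");

	if (getsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &readback, &len) == 0)
		printf("TCP_USER_TIMEOUT = %u ms\n", readback);

	close(fd);
	return 0;
}
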
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 0ec9bd0ae94f..850c737e08e2 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -196,10 +196,10 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
 int tcp_set_allowed_congestion_control(char *val)
 {
 	struct tcp_congestion_ops *ca;
-	char *clone, *name;
+	char *saved_clone, *clone, *name;
 	int ret = 0;
 
-	clone = kstrdup(val, GFP_USER);
+	saved_clone = clone = kstrdup(val, GFP_USER);
 	if (!clone)
 		return -ENOMEM;
 
@@ -226,6 +226,7 @@ int tcp_set_allowed_congestion_control(char *val)
 	}
 out:
 	spin_unlock(&tcp_cong_list_lock);
+	kfree(saved_clone);
 
 	return ret;
 }
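
The tcp_cong.c fix above is the classic strsep() leak: strsep() advances the cursor it is handed, so by the end of parsing the original kstrdup() pointer is gone and nothing was ever kfree()d; the patch keeps saved_clone around solely so it can be freed. A userspace illustration of the same pattern with strdup()/strsep():

/*
 * strsep() walks the cursor forward, so keep the pointer returned by
 * strdup() separately and free that one.
 */
#define _DEFAULT_SOURCE		/* for strsep() on glibc */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char *saved_clone, *clone, *name;

	saved_clone = clone = strdup("reno cubic vegas");
	if (!clone)
		return 1;

	while ((name = strsep(&clone, " ")) != NULL)
		printf("token: %s\n", name);

	/* 'clone' is now NULL; only 'saved_clone' still points at the buffer */
	free(saved_clone);
	return 0;
}
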
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 1eba160b72dc..00ca688d8964 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -6,7 +6,7 @@
  * The algorithm is described in:
  * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm
  *  for High-Speed Networks"
- * http://www.ews.uiuc.edu/~shaoliu/papersandslides/liubassri06perf.pdf
+ * http://www.ifp.illinois.edu/~srikant/Papers/liubassri06perf.pdf
  *
  * Implemented from description in paper and ns-2 simulation.
  * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org>
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e663b78a2ef6..3357f69e353d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -182,7 +182,7 @@ static void tcp_incr_quickack(struct sock *sk)
182 icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); 182 icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
183} 183}
184 184
185void tcp_enter_quickack_mode(struct sock *sk) 185static void tcp_enter_quickack_mode(struct sock *sk)
186{ 186{
187 struct inet_connection_sock *icsk = inet_csk(sk); 187 struct inet_connection_sock *icsk = inet_csk(sk);
188 tcp_incr_quickack(sk); 188 tcp_incr_quickack(sk);
@@ -428,10 +428,10 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss);
428 * 428 *
429 * The algorithm for RTT estimation w/o timestamps is based on 429 * The algorithm for RTT estimation w/o timestamps is based on
430 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. 430 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
431 * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps> 431 * <http://public.lanl.gov/radiant/pubs.html#DRS>
432 * 432 *
433 * More detail on this code can be found at 433 * More detail on this code can be found at
434 * <http://www.psc.edu/~jheffner/senior_thesis.ps>, 434 * <http://staff.psc.edu/jheffner/>,
435 * though this reference is out of date. A new paper 435 * though this reference is out of date. A new paper
436 * is pending. 436 * is pending.
437 */ 437 */
@@ -805,25 +805,12 @@ void tcp_update_metrics(struct sock *sk)
805 } 805 }
806} 806}
807 807
808/* Numbers are taken from RFC3390.
809 *
810 * John Heffner states:
811 *
812 * The RFC specifies a window of no more than 4380 bytes
813 * unless 2*MSS > 4380. Reading the pseudocode in the RFC
814 * is a bit misleading because they use a clamp at 4380 bytes
815 * rather than use a multiplier in the relevant range.
816 */
817__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) 808__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
818{ 809{
819 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 810 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
820 811
821 if (!cwnd) { 812 if (!cwnd)
822 if (tp->mss_cache > 1460) 813 cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
823 cwnd = 2;
824 else
825 cwnd = (tp->mss_cache > 1095) ? 3 : 4;
826 }
827 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 814 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
828} 815}
829 816
@@ -2314,7 +2301,7 @@ static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
2314 2301
2315static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) 2302static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
2316{ 2303{
2317 return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); 2304 return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
2318} 2305}
2319 2306
2320static inline int tcp_head_timedout(struct sock *sk) 2307static inline int tcp_head_timedout(struct sock *sk)
@@ -2508,7 +2495,7 @@ static void tcp_timeout_skbs(struct sock *sk)
2508/* Mark head of queue up as lost. With RFC3517 SACK, the packets is 2495/* Mark head of queue up as lost. With RFC3517 SACK, the packets is
2509 * is against sacked "cnt", otherwise it's against facked "cnt" 2496 * is against sacked "cnt", otherwise it's against facked "cnt"
2510 */ 2497 */
2511static void tcp_mark_head_lost(struct sock *sk, int packets) 2498static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2512{ 2499{
2513 struct tcp_sock *tp = tcp_sk(sk); 2500 struct tcp_sock *tp = tcp_sk(sk);
2514 struct sk_buff *skb; 2501 struct sk_buff *skb;
@@ -2516,13 +2503,13 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
2516 int err; 2503 int err;
2517 unsigned int mss; 2504 unsigned int mss;
2518 2505
2519 if (packets == 0)
2520 return;
2521
2522 WARN_ON(packets > tp->packets_out); 2506 WARN_ON(packets > tp->packets_out);
2523 if (tp->lost_skb_hint) { 2507 if (tp->lost_skb_hint) {
2524 skb = tp->lost_skb_hint; 2508 skb = tp->lost_skb_hint;
2525 cnt = tp->lost_cnt_hint; 2509 cnt = tp->lost_cnt_hint;
2510 /* Head already handled? */
2511 if (mark_head && skb != tcp_write_queue_head(sk))
2512 return;
2526 } else { 2513 } else {
2527 skb = tcp_write_queue_head(sk); 2514 skb = tcp_write_queue_head(sk);
2528 cnt = 0; 2515 cnt = 0;
@@ -2545,7 +2532,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
2545 cnt += tcp_skb_pcount(skb); 2532 cnt += tcp_skb_pcount(skb);
2546 2533
2547 if (cnt > packets) { 2534 if (cnt > packets) {
2548 if (tcp_is_sack(tp) || (oldcnt >= packets)) 2535 if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
2536 (oldcnt >= packets))
2549 break; 2537 break;
2550 2538
2551 mss = skb_shinfo(skb)->gso_size; 2539 mss = skb_shinfo(skb)->gso_size;
@@ -2556,6 +2544,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
2556 } 2544 }
2557 2545
2558 tcp_skb_mark_lost(tp, skb); 2546 tcp_skb_mark_lost(tp, skb);
2547
2548 if (mark_head)
2549 break;
2559 } 2550 }
2560 tcp_verify_left_out(tp); 2551 tcp_verify_left_out(tp);
2561} 2552}
@@ -2567,17 +2558,18 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
2567 struct tcp_sock *tp = tcp_sk(sk); 2558 struct tcp_sock *tp = tcp_sk(sk);
2568 2559
2569 if (tcp_is_reno(tp)) { 2560 if (tcp_is_reno(tp)) {
2570 tcp_mark_head_lost(sk, 1); 2561 tcp_mark_head_lost(sk, 1, 1);
2571 } else if (tcp_is_fack(tp)) { 2562 } else if (tcp_is_fack(tp)) {
2572 int lost = tp->fackets_out - tp->reordering; 2563 int lost = tp->fackets_out - tp->reordering;
2573 if (lost <= 0) 2564 if (lost <= 0)
2574 lost = 1; 2565 lost = 1;
2575 tcp_mark_head_lost(sk, lost); 2566 tcp_mark_head_lost(sk, lost, 0);
2576 } else { 2567 } else {
2577 int sacked_upto = tp->sacked_out - tp->reordering; 2568 int sacked_upto = tp->sacked_out - tp->reordering;
2578 if (sacked_upto < fast_rexmit) 2569 if (sacked_upto >= 0)
2579 sacked_upto = fast_rexmit; 2570 tcp_mark_head_lost(sk, sacked_upto, 0);
2580 tcp_mark_head_lost(sk, sacked_upto); 2571 else if (fast_rexmit)
2572 tcp_mark_head_lost(sk, 1, 1);
2581 } 2573 }
2582 2574
2583 tcp_timeout_skbs(sk); 2575 tcp_timeout_skbs(sk);
@@ -2886,7 +2878,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
2886 icsk->icsk_mtup.probe_size; 2878 icsk->icsk_mtup.probe_size;
2887 tp->snd_cwnd_cnt = 0; 2879 tp->snd_cwnd_cnt = 0;
2888 tp->snd_cwnd_stamp = tcp_time_stamp; 2880 tp->snd_cwnd_stamp = tcp_time_stamp;
2889 tp->rcv_ssthresh = tcp_current_ssthresh(sk); 2881 tp->snd_ssthresh = tcp_current_ssthresh(sk);
2890 2882
2891 icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; 2883 icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
2892 icsk->icsk_mtup.probe_size = 0; 2884 icsk->icsk_mtup.probe_size = 0;
@@ -2983,7 +2975,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
2983 before(tp->snd_una, tp->high_seq) && 2975 before(tp->snd_una, tp->high_seq) &&
2984 icsk->icsk_ca_state != TCP_CA_Open && 2976 icsk->icsk_ca_state != TCP_CA_Open &&
2985 tp->fackets_out > tp->reordering) { 2977 tp->fackets_out > tp->reordering) {
2986 tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering); 2978 tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
2987 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS); 2979 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
2988 } 2980 }
2989 2981
@@ -3411,8 +3403,8 @@ static void tcp_ack_probe(struct sock *sk)
3411 3403
3412static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) 3404static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
3413{ 3405{
3414 return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || 3406 return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
3415 inet_csk(sk)->icsk_ca_state != TCP_CA_Open); 3407 inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
3416} 3408}
3417 3409
3418static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) 3410static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
@@ -3429,9 +3421,9 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp,
3429 const u32 ack, const u32 ack_seq, 3421 const u32 ack, const u32 ack_seq,
3430 const u32 nwin) 3422 const u32 nwin)
3431{ 3423{
3432 return (after(ack, tp->snd_una) || 3424 return after(ack, tp->snd_una) ||
3433 after(ack_seq, tp->snd_wl1) || 3425 after(ack_seq, tp->snd_wl1) ||
3434 (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd)); 3426 (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
3435} 3427}
3436 3428
3437/* Update our send window. 3429/* Update our send window.
@@ -4048,6 +4040,8 @@ static void tcp_reset(struct sock *sk)
4048 default: 4040 default:
4049 sk->sk_err = ECONNRESET; 4041 sk->sk_err = ECONNRESET;
4050 } 4042 }
4043 /* This barrier is coupled with smp_rmb() in tcp_poll() */
4044 smp_wmb();
4051 4045
4052 if (!sock_flag(sk, SOCK_DEAD)) 4046 if (!sock_flag(sk, SOCK_DEAD))
4053 sk->sk_error_report(sk); 4047 sk->sk_error_report(sk);
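
Note on the tcp_input.c hunks above: tcp_mark_head_lost() now takes a mark_head flag, so the Reno and fast-retransmit cases stop after marking the first un-SACKed segment, while the non-FACK SACK branch of tcp_update_scoreboard() marks sacked_out - reordering segments from the head when that count is non-negative and otherwise marks at most the head segment when a fast retransmit is pending. A minimal userspace toy of that decision (standalone C; the helper name is invented for illustration, it is not a kernel function):

#include <stdio.h>

static int packets_to_mark(int sacked_out, int reordering, int fast_rexmit)
{
	int sacked_upto = sacked_out - reordering;

	if (sacked_upto >= 0)
		return sacked_upto;          /* mark that many segments from the head */
	return fast_rexmit ? 1 : 0;          /* else at most the head segment */
}

int main(void)
{
	printf("%d\n", packets_to_mark(5, 3, 0)); /* 2 */
	printf("%d\n", packets_to_mark(1, 3, 1)); /* 1 */
	printf("%d\n", packets_to_mark(1, 3, 0)); /* 0 */
	return 0;
}
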
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 020766292bb0..8f8527d41682 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1422,7 +1422,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1422 1422
1423 newsk = tcp_create_openreq_child(sk, req, skb); 1423 newsk = tcp_create_openreq_child(sk, req, skb);
1424 if (!newsk) 1424 if (!newsk)
1425 goto exit; 1425 goto exit_nonewsk;
1426 1426
1427 newsk->sk_gso_type = SKB_GSO_TCPV4; 1427 newsk->sk_gso_type = SKB_GSO_TCPV4;
1428 sk_setup_caps(newsk, dst); 1428 sk_setup_caps(newsk, dst);
@@ -1469,16 +1469,20 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1469 } 1469 }
1470#endif 1470#endif
1471 1471
1472 if (__inet_inherit_port(sk, newsk) < 0) {
1473 sock_put(newsk);
1474 goto exit;
1475 }
1472 __inet_hash_nolisten(newsk, NULL); 1476 __inet_hash_nolisten(newsk, NULL);
1473 __inet_inherit_port(sk, newsk);
1474 1477
1475 return newsk; 1478 return newsk;
1476 1479
1477exit_overflow: 1480exit_overflow:
1478 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1481 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1482exit_nonewsk:
1483 dst_release(dst);
1479exit: 1484exit:
1480 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); 1485 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1481 dst_release(dst);
1482 return NULL; 1486 return NULL;
1483} 1487}
1484EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1488EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
@@ -2571,7 +2575,6 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2571 2575
2572 return tcp_gro_receive(head, skb); 2576 return tcp_gro_receive(head, skb);
2573} 2577}
2574EXPORT_SYMBOL(tcp4_gro_receive);
2575 2578
2576int tcp4_gro_complete(struct sk_buff *skb) 2579int tcp4_gro_complete(struct sk_buff *skb)
2577{ 2580{
@@ -2584,7 +2587,6 @@ int tcp4_gro_complete(struct sk_buff *skb)
2584 2587
2585 return tcp_gro_complete(skb); 2588 return tcp_gro_complete(skb);
2586} 2589}
2587EXPORT_SYMBOL(tcp4_gro_complete);
2588 2590
2589struct proto tcp_prot = { 2591struct proto tcp_prot = {
2590 .name = "TCP", 2592 .name = "TCP",
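
The tcp_v4_syn_recv_sock() hunk above reorders the child-socket setup so __inet_inherit_port() runs, and can fail, before the socket is hashed; on failure the new socket is released with sock_put() and the dst is dropped on the shared exit_nonewsk path. (No separate example is given here; the corrected ordering is fully visible in the hunk itself.)
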
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f25b56cb85cb..43cf901d7659 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -55,7 +55,7 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
55 return 1; 55 return 1;
56 if (after(end_seq, s_win) && before(seq, e_win)) 56 if (after(end_seq, s_win) && before(seq, e_win))
57 return 1; 57 return 1;
58 return (seq == e_win && seq == end_seq); 58 return seq == e_win && seq == end_seq;
59} 59}
60 60
61/* 61/*
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index de3bd8458588..05b1ecf36763 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -224,16 +224,10 @@ void tcp_select_initial_window(int __space, __u32 mss,
224 } 224 }
225 } 225 }
226 226
227 /* Set initial window to value enough for senders, 227 /* Set initial window to value enough for senders, following RFC5681. */
228 * following RFC2414. Senders, not following this RFC,
229 * will be satisfied with 2.
230 */
231 if (mss > (1 << *rcv_wscale)) { 228 if (mss > (1 << *rcv_wscale)) {
232 int init_cwnd = 4; 229 int init_cwnd = rfc3390_bytes_to_packets(mss);
233 if (mss > 1460 * 3) 230
234 init_cwnd = 2;
235 else if (mss > 1460)
236 init_cwnd = 3;
237 /* when initializing use the value from init_rcv_wnd 231 /* when initializing use the value from init_rcv_wnd
238 * rather than the default from above 232 * rather than the default from above
239 */ 233 */
@@ -1376,9 +1370,9 @@ static inline int tcp_nagle_check(const struct tcp_sock *tp,
1376 const struct sk_buff *skb, 1370 const struct sk_buff *skb,
1377 unsigned mss_now, int nonagle) 1371 unsigned mss_now, int nonagle)
1378{ 1372{
1379 return (skb->len < mss_now && 1373 return skb->len < mss_now &&
1380 ((nonagle & TCP_NAGLE_CORK) || 1374 ((nonagle & TCP_NAGLE_CORK) ||
1381 (!nonagle && tp->packets_out && tcp_minshall_check(tp)))); 1375 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
1382} 1376}
1383 1377
1384/* Return non-zero if the Nagle test allows this packet to be 1378/* Return non-zero if the Nagle test allows this packet to be
@@ -1449,10 +1443,10 @@ int tcp_may_send_now(struct sock *sk)
1449 struct tcp_sock *tp = tcp_sk(sk); 1443 struct tcp_sock *tp = tcp_sk(sk);
1450 struct sk_buff *skb = tcp_send_head(sk); 1444 struct sk_buff *skb = tcp_send_head(sk);
1451 1445
1452 return (skb && 1446 return skb &&
1453 tcp_snd_test(sk, skb, tcp_current_mss(sk), 1447 tcp_snd_test(sk, skb, tcp_current_mss(sk),
1454 (tcp_skb_is_last(sk, skb) ? 1448 (tcp_skb_is_last(sk, skb) ?
1455 tp->nonagle : TCP_NAGLE_PUSH))); 1449 tp->nonagle : TCP_NAGLE_PUSH));
1456} 1450}
1457 1451
1458/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet 1452/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
@@ -2429,6 +2423,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2429 __u8 rcv_wscale; 2423 __u8 rcv_wscale;
2430 /* Set this up on the first call only */ 2424 /* Set this up on the first call only */
2431 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); 2425 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
2426
 2427 /* limit the window selection if the user enforces a smaller rx buffer */
2428 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2429 (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
2430 req->window_clamp = tcp_full_space(sk);
2431
2432 /* tcp_full_space because it is guaranteed to be the first packet */ 2432 /* tcp_full_space because it is guaranteed to be the first packet */
2433 tcp_select_initial_window(tcp_full_space(sk), 2433 tcp_select_initial_window(tcp_full_space(sk),
2434 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), 2434 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
@@ -2555,6 +2555,11 @@ static void tcp_connect_init(struct sock *sk)
2555 2555
2556 tcp_initialize_rcv_mss(sk); 2556 tcp_initialize_rcv_mss(sk);
2557 2557
 2558 /* limit the window selection if the user enforces a smaller rx buffer */
2559 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2560 (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
2561 tp->window_clamp = tcp_full_space(sk);
2562
2558 tcp_select_initial_window(tcp_full_space(sk), 2563 tcp_select_initial_window(tcp_full_space(sk),
2559 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), 2564 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
2560 &tp->rcv_wnd, 2565 &tp->rcv_wnd,
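
tcp_select_initial_window() above now derives the advertised initial window from rfc3390_bytes_to_packets() instead of the hard-coded 2/3/4 ladder. RFC 3390 caps the initial window at min(4*MSS, max(2*MSS, 4380 bytes)); the sketch below renders that arithmetic as standalone C. It is independent of the kernel helper, which may round the segment count slightly differently at the boundaries:

#include <stdio.h>

/* RFC 3390 initial window in bytes: min(4*MSS, max(2*MSS, 4380)). */
static unsigned int rfc3390_bytes(unsigned int mss)
{
	unsigned int iw = 2 * mss > 4380 ? 2 * mss : 4380;

	return iw < 4 * mss ? iw : 4 * mss;
}

int main(void)
{
	unsigned int mss[] = { 536, 1460, 4380 };
	int i;

	for (i = 0; i < 3; i++)
		printf("mss=%u -> %u bytes (~%u segments)\n",
		       mss[i], rfc3390_bytes(mss[i]),
		       (rfc3390_bytes(mss[i]) + mss[i] - 1) / mss[i]);
	return 0;   /* prints roughly 4, 3 and 2 segments respectively */
}
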
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index f8efada580e8..6211e2114173 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -214,6 +214,7 @@ static const struct file_operations tcpprobe_fops = {
214 .owner = THIS_MODULE, 214 .owner = THIS_MODULE,
215 .open = tcpprobe_open, 215 .open = tcpprobe_open,
216 .read = tcpprobe_read, 216 .read = tcpprobe_read,
217 .llseek = noop_llseek,
217}; 218};
218 219
219static __init int tcpprobe_init(void) 220static __init int tcpprobe_init(void)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 808bb920c9f5..74a6aa003657 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -66,18 +66,18 @@ static void tcp_write_err(struct sock *sk)
66static int tcp_out_of_resources(struct sock *sk, int do_reset) 66static int tcp_out_of_resources(struct sock *sk, int do_reset)
67{ 67{
68 struct tcp_sock *tp = tcp_sk(sk); 68 struct tcp_sock *tp = tcp_sk(sk);
69 int orphans = percpu_counter_read_positive(&tcp_orphan_count); 69 int shift = 0;
70 70
71 /* If peer does not open window for long time, or did not transmit 71 /* If peer does not open window for long time, or did not transmit
72 * anything for long time, penalize it. */ 72 * anything for long time, penalize it. */
73 if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) 73 if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
74 orphans <<= 1; 74 shift++;
75 75
76 /* If some dubious ICMP arrived, penalize even more. */ 76 /* If some dubious ICMP arrived, penalize even more. */
77 if (sk->sk_err_soft) 77 if (sk->sk_err_soft)
78 orphans <<= 1; 78 shift++;
79 79
80 if (tcp_too_many_orphans(sk, orphans)) { 80 if (tcp_too_many_orphans(sk, shift)) {
81 if (net_ratelimit()) 81 if (net_ratelimit())
82 printk(KERN_INFO "Out of socket memory\n"); 82 printk(KERN_INFO "Out of socket memory\n");
83 83
@@ -135,13 +135,16 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
135 135
136/* This function calculates a "timeout" which is equivalent to the timeout of a 136/* This function calculates a "timeout" which is equivalent to the timeout of a
137 * TCP connection after "boundary" unsuccessful, exponentially backed-off 137 * TCP connection after "boundary" unsuccessful, exponentially backed-off
138 * retransmissions with an initial RTO of TCP_RTO_MIN. 138 * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if
 139 * the syn_set flag is set.
139 */ 140 */
140static bool retransmits_timed_out(struct sock *sk, 141static bool retransmits_timed_out(struct sock *sk,
141 unsigned int boundary) 142 unsigned int boundary,
143 unsigned int timeout,
144 bool syn_set)
142{ 145{
143 unsigned int timeout, linear_backoff_thresh; 146 unsigned int linear_backoff_thresh, start_ts;
144 unsigned int start_ts; 147 unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
145 148
146 if (!inet_csk(sk)->icsk_retransmits) 149 if (!inet_csk(sk)->icsk_retransmits)
147 return false; 150 return false;
@@ -151,14 +154,15 @@ static bool retransmits_timed_out(struct sock *sk,
151 else 154 else
152 start_ts = tcp_sk(sk)->retrans_stamp; 155 start_ts = tcp_sk(sk)->retrans_stamp;
153 156
154 linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN); 157 if (likely(timeout == 0)) {
155 158 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
156 if (boundary <= linear_backoff_thresh)
157 timeout = ((2 << boundary) - 1) * TCP_RTO_MIN;
158 else
159 timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN +
160 (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
161 159
160 if (boundary <= linear_backoff_thresh)
161 timeout = ((2 << boundary) - 1) * rto_base;
162 else
163 timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
164 (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
165 }
162 return (tcp_time_stamp - start_ts) >= timeout; 166 return (tcp_time_stamp - start_ts) >= timeout;
163} 167}
164 168
@@ -167,14 +171,15 @@ static int tcp_write_timeout(struct sock *sk)
167{ 171{
168 struct inet_connection_sock *icsk = inet_csk(sk); 172 struct inet_connection_sock *icsk = inet_csk(sk);
169 int retry_until; 173 int retry_until;
170 bool do_reset; 174 bool do_reset, syn_set = 0;
171 175
172 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { 176 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
173 if (icsk->icsk_retransmits) 177 if (icsk->icsk_retransmits)
174 dst_negative_advice(sk); 178 dst_negative_advice(sk);
175 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; 179 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
180 syn_set = 1;
176 } else { 181 } else {
177 if (retransmits_timed_out(sk, sysctl_tcp_retries1)) { 182 if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
178 /* Black hole detection */ 183 /* Black hole detection */
179 tcp_mtu_probing(icsk, sk); 184 tcp_mtu_probing(icsk, sk);
180 185
@@ -187,14 +192,15 @@ static int tcp_write_timeout(struct sock *sk)
187 192
188 retry_until = tcp_orphan_retries(sk, alive); 193 retry_until = tcp_orphan_retries(sk, alive);
189 do_reset = alive || 194 do_reset = alive ||
190 !retransmits_timed_out(sk, retry_until); 195 !retransmits_timed_out(sk, retry_until, 0, 0);
191 196
192 if (tcp_out_of_resources(sk, do_reset)) 197 if (tcp_out_of_resources(sk, do_reset))
193 return 1; 198 return 1;
194 } 199 }
195 } 200 }
196 201
197 if (retransmits_timed_out(sk, retry_until)) { 202 if (retransmits_timed_out(sk, retry_until,
203 syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) {
198 /* Has it gone just too far? */ 204 /* Has it gone just too far? */
199 tcp_write_err(sk); 205 tcp_write_err(sk);
200 return 1; 206 return 1;
@@ -361,18 +367,19 @@ void tcp_retransmit_timer(struct sock *sk)
361 if (icsk->icsk_retransmits == 0) { 367 if (icsk->icsk_retransmits == 0) {
362 int mib_idx; 368 int mib_idx;
363 369
364 if (icsk->icsk_ca_state == TCP_CA_Disorder) { 370 if (icsk->icsk_ca_state == TCP_CA_Recovery) {
365 if (tcp_is_sack(tp))
366 mib_idx = LINUX_MIB_TCPSACKFAILURES;
367 else
368 mib_idx = LINUX_MIB_TCPRENOFAILURES;
369 } else if (icsk->icsk_ca_state == TCP_CA_Recovery) {
370 if (tcp_is_sack(tp)) 371 if (tcp_is_sack(tp))
371 mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; 372 mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
372 else 373 else
373 mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; 374 mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
374 } else if (icsk->icsk_ca_state == TCP_CA_Loss) { 375 } else if (icsk->icsk_ca_state == TCP_CA_Loss) {
375 mib_idx = LINUX_MIB_TCPLOSSFAILURES; 376 mib_idx = LINUX_MIB_TCPLOSSFAILURES;
377 } else if ((icsk->icsk_ca_state == TCP_CA_Disorder) ||
378 tp->sacked_out) {
379 if (tcp_is_sack(tp))
380 mib_idx = LINUX_MIB_TCPSACKFAILURES;
381 else
382 mib_idx = LINUX_MIB_TCPRENOFAILURES;
376 } else { 383 } else {
377 mib_idx = LINUX_MIB_TCPTIMEOUTS; 384 mib_idx = LINUX_MIB_TCPTIMEOUTS;
378 } 385 }
@@ -436,7 +443,7 @@ out_reset_timer:
436 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); 443 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
437 } 444 }
438 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); 445 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
439 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1)) 446 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
440 __sk_dst_reset(sk); 447 __sk_dst_reset(sk);
441 448
442out:; 449out:;
@@ -556,7 +563,14 @@ static void tcp_keepalive_timer (unsigned long data)
556 elapsed = keepalive_time_elapsed(tp); 563 elapsed = keepalive_time_elapsed(tp);
557 564
558 if (elapsed >= keepalive_time_when(tp)) { 565 if (elapsed >= keepalive_time_when(tp)) {
559 if (icsk->icsk_probes_out >= keepalive_probes(tp)) { 566 /* If the TCP_USER_TIMEOUT option is enabled, use that
567 * to determine when to timeout instead.
568 */
569 if ((icsk->icsk_user_timeout != 0 &&
570 elapsed >= icsk->icsk_user_timeout &&
571 icsk->icsk_probes_out > 0) ||
572 (icsk->icsk_user_timeout == 0 &&
573 icsk->icsk_probes_out >= keepalive_probes(tp))) {
560 tcp_send_active_reset(sk, GFP_ATOMIC); 574 tcp_send_active_reset(sk, GFP_ATOMIC);
561 tcp_write_err(sk); 575 tcp_write_err(sk);
562 goto out; 576 goto out;
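
retransmits_timed_out() above now parameterizes the base RTO (TCP_TIMEOUT_INIT for a SYN series, TCP_RTO_MIN otherwise) and accepts a caller-supplied timeout that bypasses the computed bound entirely. The computed bound is the sum of an exponentially backed-off series, going linear once individual RTOs would exceed TCP_RTO_MAX. A standalone rendering of that formula in plain C; the 200 ms / 3 s / 120 s constants are the conventional values and are assumed here rather than taken from kernel headers:

#include <stdio.h>

static unsigned int ilog2_u(unsigned int v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

/* same shape as the formula in retransmits_timed_out() above */
static unsigned int backoff_timeout_ms(unsigned int boundary,
				       unsigned int rto_base,
				       unsigned int rto_max)
{
	unsigned int thresh = ilog2_u(rto_max / rto_base);

	if (boundary <= thresh)
		return ((2 << boundary) - 1) * rto_base;
	return ((2 << thresh) - 1) * rto_base + (boundary - thresh) * rto_max;
}

int main(void)
{
	/* e.g. three retransmissions with a 200 ms base RTO: 200+400+800+1600 */
	printf("%u ms\n", backoff_timeout_ms(3, 200, 120000));   /* 3000 */
	/* a SYN series uses the 3 s initial RTO: 3+6+12+24+48+96 seconds */
	printf("%u ms\n", backoff_timeout_ms(5, 3000, 120000));  /* 189000 */
	return 0;
}
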
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index b612acf76183..38bc0b52d745 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -6,7 +6,7 @@
6 * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." 6 * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks."
7 * IEEE Journal on Selected Areas in Communication, 7 * IEEE Journal on Selected Areas in Communication,
8 * Feb. 2003. 8 * Feb. 2003.
9 * See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf 9 * See http://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf
10 */ 10 */
11 11
12#include <linux/mm.h> 12#include <linux/mm.h>
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 20151d6a6241..a534dda5456e 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -80,7 +80,7 @@ static void tcp_westwood_init(struct sock *sk)
80 */ 80 */
81static inline u32 westwood_do_filter(u32 a, u32 b) 81static inline u32 westwood_do_filter(u32 a, u32 b)
82{ 82{
83 return (((7 * a) + b) >> 3); 83 return ((7 * a) + b) >> 3;
84} 84}
85 85
86static void westwood_filter(struct westwood *w, u32 delta) 86static void westwood_filter(struct westwood *w, u32 delta)
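
The westwood_do_filter() line touched above is an exponentially weighted moving average in fixed point: new = (7*old + sample) / 8, i.e. each new bandwidth sample contributes with gain 1/8. Worked numbers as a standalone check:

#include <stdio.h>

static unsigned int westwood_filter_step(unsigned int old, unsigned int sample)
{
	return ((7 * old) + sample) >> 3;   /* same arithmetic as westwood_do_filter() */
}

int main(void)
{
	/* an estimate of 100 pulled toward a steady sample of 180, 1/8 per step */
	unsigned int bw = 100;
	int i;

	for (i = 0; i < 4; i++) {
		bw = westwood_filter_step(bw, 180);
		printf("step %d: %u\n", i + 1, bw);   /* 110, 118, 125, 131 */
	}
	return 0;
}
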
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index 59186ca7808a..ac3b3ee4b07c 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -14,32 +14,37 @@
14#include <net/protocol.h> 14#include <net/protocol.h>
15#include <net/xfrm.h> 15#include <net/xfrm.h>
16 16
17static struct xfrm_tunnel *tunnel4_handlers; 17static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly;
18static struct xfrm_tunnel *tunnel64_handlers; 18static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly;
19static DEFINE_MUTEX(tunnel4_mutex); 19static DEFINE_MUTEX(tunnel4_mutex);
20 20
21static inline struct xfrm_tunnel **fam_handlers(unsigned short family) 21static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family)
22{ 22{
23 return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers; 23 return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers;
24} 24}
25 25
26int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) 26int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family)
27{ 27{
28 struct xfrm_tunnel **pprev; 28 struct xfrm_tunnel __rcu **pprev;
29 struct xfrm_tunnel *t;
30
29 int ret = -EEXIST; 31 int ret = -EEXIST;
30 int priority = handler->priority; 32 int priority = handler->priority;
31 33
32 mutex_lock(&tunnel4_mutex); 34 mutex_lock(&tunnel4_mutex);
33 35
34 for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) { 36 for (pprev = fam_handlers(family);
35 if ((*pprev)->priority > priority) 37 (t = rcu_dereference_protected(*pprev,
38 lockdep_is_held(&tunnel4_mutex))) != NULL;
39 pprev = &t->next) {
40 if (t->priority > priority)
36 break; 41 break;
37 if ((*pprev)->priority == priority) 42 if (t->priority == priority)
38 goto err; 43 goto err;
39 } 44 }
40 45
41 handler->next = *pprev; 46 handler->next = *pprev;
42 *pprev = handler; 47 rcu_assign_pointer(*pprev, handler);
43 48
44 ret = 0; 49 ret = 0;
45 50
@@ -52,13 +57,17 @@ EXPORT_SYMBOL(xfrm4_tunnel_register);
52 57
53int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) 58int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
54{ 59{
55 struct xfrm_tunnel **pprev; 60 struct xfrm_tunnel __rcu **pprev;
61 struct xfrm_tunnel *t;
56 int ret = -ENOENT; 62 int ret = -ENOENT;
57 63
58 mutex_lock(&tunnel4_mutex); 64 mutex_lock(&tunnel4_mutex);
59 65
60 for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) { 66 for (pprev = fam_handlers(family);
61 if (*pprev == handler) { 67 (t = rcu_dereference_protected(*pprev,
68 lockdep_is_held(&tunnel4_mutex))) != NULL;
69 pprev = &t->next) {
70 if (t == handler) {
62 *pprev = handler->next; 71 *pprev = handler->next;
63 ret = 0; 72 ret = 0;
64 break; 73 break;
@@ -73,6 +82,11 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
73} 82}
74EXPORT_SYMBOL(xfrm4_tunnel_deregister); 83EXPORT_SYMBOL(xfrm4_tunnel_deregister);
75 84
85#define for_each_tunnel_rcu(head, handler) \
86 for (handler = rcu_dereference(head); \
87 handler != NULL; \
88 handler = rcu_dereference(handler->next)) \
89
76static int tunnel4_rcv(struct sk_buff *skb) 90static int tunnel4_rcv(struct sk_buff *skb)
77{ 91{
78 struct xfrm_tunnel *handler; 92 struct xfrm_tunnel *handler;
@@ -80,7 +94,7 @@ static int tunnel4_rcv(struct sk_buff *skb)
80 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 94 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
81 goto drop; 95 goto drop;
82 96
83 for (handler = tunnel4_handlers; handler; handler = handler->next) 97 for_each_tunnel_rcu(tunnel4_handlers, handler)
84 if (!handler->handler(skb)) 98 if (!handler->handler(skb))
85 return 0; 99 return 0;
86 100
@@ -99,7 +113,7 @@ static int tunnel64_rcv(struct sk_buff *skb)
99 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) 113 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
100 goto drop; 114 goto drop;
101 115
102 for (handler = tunnel64_handlers; handler; handler = handler->next) 116 for_each_tunnel_rcu(tunnel64_handlers, handler)
103 if (!handler->handler(skb)) 117 if (!handler->handler(skb))
104 return 0; 118 return 0;
105 119
@@ -115,7 +129,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info)
115{ 129{
116 struct xfrm_tunnel *handler; 130 struct xfrm_tunnel *handler;
117 131
118 for (handler = tunnel4_handlers; handler; handler = handler->next) 132 for_each_tunnel_rcu(tunnel4_handlers, handler)
119 if (!handler->err_handler(skb, info)) 133 if (!handler->err_handler(skb, info))
120 break; 134 break;
121} 135}
@@ -125,7 +139,7 @@ static void tunnel64_err(struct sk_buff *skb, u32 info)
125{ 139{
126 struct xfrm_tunnel *handler; 140 struct xfrm_tunnel *handler;
127 141
128 for (handler = tunnel64_handlers; handler; handler = handler->next) 142 for_each_tunnel_rcu(tunnel64_handlers, handler)
129 if (!handler->err_handler(skb, info)) 143 if (!handler->err_handler(skb, info))
130 break; 144 break;
131} 145}
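
The tunnel4.c hunks above convert the handler chains to RCU: writers still serialize on tunnel4_mutex and walk the list with rcu_dereference_protected() before publishing with rcu_assign_pointer(), while the packet-path readers traverse it locklessly with rcu_dereference() via for_each_tunnel_rcu(). A condensed kernel-style sketch of that pattern; it is not standalone-compilable, and the struct and list names are illustrative rather than the ones in tunnel4.c:

struct handler {
	struct handler __rcu *next;
	int (*fn)(struct sk_buff *skb);
};

static struct handler __rcu *chain;
static DEFINE_MUTEX(chain_mutex);

/* writer side: mutation serialized by the mutex, publication via RCU */
static void chain_add(struct handler *h)
{
	mutex_lock(&chain_mutex);
	h->next = rcu_dereference_protected(chain, lockdep_is_held(&chain_mutex));
	rcu_assign_pointer(chain, h);
	mutex_unlock(&chain_mutex);
}

/* reader side: lockless traversal, caller holds rcu_read_lock() */
static int chain_run(struct sk_buff *skb)
{
	struct handler *h;

	for (h = rcu_dereference(chain); h; h = rcu_dereference(h->next))
		if (!h->fn(skb))
			return 0;
	return -1;
}
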
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 32e0bef60d0a..28cb2d733a3c 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -797,7 +797,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
797 return -EOPNOTSUPP; 797 return -EOPNOTSUPP;
798 798
799 ipc.opt = NULL; 799 ipc.opt = NULL;
800 ipc.shtx.flags = 0; 800 ipc.tx_flags = 0;
801 801
802 if (up->pending) { 802 if (up->pending) {
803 /* 803 /*
@@ -845,7 +845,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
845 ipc.addr = inet->inet_saddr; 845 ipc.addr = inet->inet_saddr;
846 846
847 ipc.oif = sk->sk_bound_dev_if; 847 ipc.oif = sk->sk_bound_dev_if;
848 err = sock_tx_timestamp(msg, sk, &ipc.shtx); 848 err = sock_tx_timestamp(sk, &ipc.tx_flags);
849 if (err) 849 if (err)
850 return err; 850 return err;
851 if (msg->msg_controllen) { 851 if (msg->msg_controllen) {
@@ -1260,6 +1260,49 @@ void udp_lib_unhash(struct sock *sk)
1260} 1260}
1261EXPORT_SYMBOL(udp_lib_unhash); 1261EXPORT_SYMBOL(udp_lib_unhash);
1262 1262
1263/*
1264 * inet_rcv_saddr was changed, we must rehash secondary hash
1265 */
1266void udp_lib_rehash(struct sock *sk, u16 newhash)
1267{
1268 if (sk_hashed(sk)) {
1269 struct udp_table *udptable = sk->sk_prot->h.udp_table;
1270 struct udp_hslot *hslot, *hslot2, *nhslot2;
1271
1272 hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
1273 nhslot2 = udp_hashslot2(udptable, newhash);
1274 udp_sk(sk)->udp_portaddr_hash = newhash;
1275 if (hslot2 != nhslot2) {
1276 hslot = udp_hashslot(udptable, sock_net(sk),
1277 udp_sk(sk)->udp_port_hash);
1278 /* we must lock primary chain too */
1279 spin_lock_bh(&hslot->lock);
1280
1281 spin_lock(&hslot2->lock);
1282 hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
1283 hslot2->count--;
1284 spin_unlock(&hslot2->lock);
1285
1286 spin_lock(&nhslot2->lock);
1287 hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
1288 &nhslot2->head);
1289 nhslot2->count++;
1290 spin_unlock(&nhslot2->lock);
1291
1292 spin_unlock_bh(&hslot->lock);
1293 }
1294 }
1295}
1296EXPORT_SYMBOL(udp_lib_rehash);
1297
1298static void udp_v4_rehash(struct sock *sk)
1299{
1300 u16 new_hash = udp4_portaddr_hash(sock_net(sk),
1301 inet_sk(sk)->inet_rcv_saddr,
1302 inet_sk(sk)->inet_num);
1303 udp_lib_rehash(sk, new_hash);
1304}
1305
1263static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 1306static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1264{ 1307{
1265 int rc; 1308 int rc;
@@ -1370,7 +1413,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1370 } 1413 }
1371 } 1414 }
1372 1415
1373 if (sk->sk_filter) { 1416 if (rcu_dereference_raw(sk->sk_filter)) {
1374 if (udp_lib_checksum_complete(skb)) 1417 if (udp_lib_checksum_complete(skb))
1375 goto drop; 1418 goto drop;
1376 } 1419 }
@@ -1843,6 +1886,7 @@ struct proto udp_prot = {
1843 .backlog_rcv = __udp_queue_rcv_skb, 1886 .backlog_rcv = __udp_queue_rcv_skb,
1844 .hash = udp_lib_hash, 1887 .hash = udp_lib_hash,
1845 .unhash = udp_lib_unhash, 1888 .unhash = udp_lib_unhash,
1889 .rehash = udp_v4_rehash,
1846 .get_port = udp_v4_get_port, 1890 .get_port = udp_v4_get_port,
1847 .memory_allocated = &udp_memory_allocated, 1891 .memory_allocated = &udp_memory_allocated,
1848 .sysctl_mem = sysctl_udp_mem, 1892 .sysctl_mem = sysctl_udp_mem,
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 869078d4eeb9..4464f3bff6a7 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -61,7 +61,7 @@ static int xfrm4_get_saddr(struct net *net,
61 61
62static int xfrm4_get_tos(struct flowi *fl) 62static int xfrm4_get_tos(struct flowi *fl)
63{ 63{
64 return fl->fl4_tos; 64 return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */
65} 65}
66 66
67static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, 67static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
@@ -174,7 +174,7 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
174 struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops); 174 struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
175 175
176 xfrm4_policy_afinfo.garbage_collect(net); 176 xfrm4_policy_afinfo.garbage_collect(net);
177 return (atomic_read(&ops->entries) > ops->gc_thresh * 2); 177 return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
178} 178}
179 179
180static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) 180static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -232,7 +232,6 @@ static struct dst_ops xfrm4_dst_ops = {
232 .ifdown = xfrm4_dst_ifdown, 232 .ifdown = xfrm4_dst_ifdown,
233 .local_out = __ip_local_out, 233 .local_out = __ip_local_out,
234 .gc_thresh = 1024, 234 .gc_thresh = 1024,
235 .entries = ATOMIC_INIT(0),
236}; 235};
237 236
238static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { 237static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
@@ -288,6 +287,7 @@ void __init xfrm4_init(int rt_max_size)
288 * and start cleaning when were 1/2 full 287 * and start cleaning when were 1/2 full
289 */ 288 */
290 xfrm4_dst_ops.gc_thresh = rt_max_size/2; 289 xfrm4_dst_ops.gc_thresh = rt_max_size/2;
290 dst_entries_init(&xfrm4_dst_ops);
291 291
292 xfrm4_state_init(); 292 xfrm4_state_init();
293 xfrm4_policy_init(); 293 xfrm4_policy_init();
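
xfrm4_get_tos() above masks the flow TOS with IPTOS_RT_MASK so the two low-order ECN bits no longer influence bundle/route selection. Assuming the usual 0x1c definition of IPTOS_RT_MASK (an assumption, not taken from the headers here), the effect on a TOS byte carrying ECN marking is just this arithmetic:

#include <stdio.h>

#define IPTOS_RT_MASK_GUESS 0x1c   /* assumed value: keeps the TOS bits, drops ECN bits 0-1 */

int main(void)
{
	unsigned int tos = 0x13;   /* example TOS byte with both ECN bits set */

	printf("0x%02x -> 0x%02x\n", tos, tos & IPTOS_RT_MASK_GUESS);  /* 0x13 -> 0x10 */
	return 0;
}
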
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 1ef1366a0a03..47947624eccc 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -21,21 +21,25 @@ static int xfrm4_init_flags(struct xfrm_state *x)
21} 21}
22 22
23static void 23static void
24__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl, 24__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl)
25 struct xfrm_tmpl *tmpl, 25{
26 xfrm_address_t *daddr, xfrm_address_t *saddr) 26 sel->daddr.a4 = fl->fl4_dst;
27 sel->saddr.a4 = fl->fl4_src;
28 sel->dport = xfrm_flowi_dport(fl);
29 sel->dport_mask = htons(0xffff);
30 sel->sport = xfrm_flowi_sport(fl);
31 sel->sport_mask = htons(0xffff);
32 sel->family = AF_INET;
33 sel->prefixlen_d = 32;
34 sel->prefixlen_s = 32;
35 sel->proto = fl->proto;
36 sel->ifindex = fl->oif;
37}
38
39static void
40xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
41 xfrm_address_t *daddr, xfrm_address_t *saddr)
27{ 42{
28 x->sel.daddr.a4 = fl->fl4_dst;
29 x->sel.saddr.a4 = fl->fl4_src;
30 x->sel.dport = xfrm_flowi_dport(fl);
31 x->sel.dport_mask = htons(0xffff);
32 x->sel.sport = xfrm_flowi_sport(fl);
33 x->sel.sport_mask = htons(0xffff);
34 x->sel.family = AF_INET;
35 x->sel.prefixlen_d = 32;
36 x->sel.prefixlen_s = 32;
37 x->sel.proto = fl->proto;
38 x->sel.ifindex = fl->oif;
39 x->id = tmpl->id; 43 x->id = tmpl->id;
40 if (x->id.daddr.a4 == 0) 44 if (x->id.daddr.a4 == 0)
41 x->id.daddr.a4 = daddr->a4; 45 x->id.daddr.a4 = daddr->a4;
@@ -70,6 +74,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
70 .owner = THIS_MODULE, 74 .owner = THIS_MODULE,
71 .init_flags = xfrm4_init_flags, 75 .init_flags = xfrm4_init_flags,
72 .init_tempsel = __xfrm4_init_tempsel, 76 .init_tempsel = __xfrm4_init_tempsel,
77 .init_temprop = xfrm4_init_temprop,
73 .output = xfrm4_output, 78 .output = xfrm4_output,
74 .extract_input = xfrm4_extract_input, 79 .extract_input = xfrm4_extract_input,
75 .extract_output = xfrm4_extract_output, 80 .extract_output = xfrm4_extract_output,
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 41f5982d2087..82806455e859 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -58,14 +58,14 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)
58 return -ENOENT; 58 return -ENOENT;
59} 59}
60 60
61static struct xfrm_tunnel xfrm_tunnel_handler = { 61static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
62 .handler = xfrm_tunnel_rcv, 62 .handler = xfrm_tunnel_rcv,
63 .err_handler = xfrm_tunnel_err, 63 .err_handler = xfrm_tunnel_err,
64 .priority = 2, 64 .priority = 2,
65}; 65};
66 66
67#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 67#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
68static struct xfrm_tunnel xfrm64_tunnel_handler = { 68static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
69 .handler = xfrm_tunnel_rcv, 69 .handler = xfrm_tunnel_rcv,
70 .err_handler = xfrm_tunnel_err, 70 .err_handler = xfrm_tunnel_err,
71 .priority = 2, 71 .priority = 2,
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index ab70a3fbcafa..e048ec62d109 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -243,7 +243,7 @@ static inline bool addrconf_qdisc_ok(const struct net_device *dev)
243/* Check if a route is valid prefix route */ 243/* Check if a route is valid prefix route */
244static inline int addrconf_is_prefix_route(const struct rt6_info *rt) 244static inline int addrconf_is_prefix_route(const struct rt6_info *rt)
245{ 245{
246 return ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0); 246 return (rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0;
247} 247}
248 248
249static void addrconf_del_timer(struct inet6_ifaddr *ifp) 249static void addrconf_del_timer(struct inet6_ifaddr *ifp)
@@ -836,7 +836,7 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *i
836{ 836{
837 struct inet6_dev *idev = ifp->idev; 837 struct inet6_dev *idev = ifp->idev;
838 struct in6_addr addr, *tmpaddr; 838 struct in6_addr addr, *tmpaddr;
839 unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_cstamp, tmp_tstamp; 839 unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_cstamp, tmp_tstamp, age;
840 unsigned long regen_advance; 840 unsigned long regen_advance;
841 int tmp_plen; 841 int tmp_plen;
842 int ret = 0; 842 int ret = 0;
@@ -886,12 +886,13 @@ retry:
886 goto out; 886 goto out;
887 } 887 }
888 memcpy(&addr.s6_addr[8], idev->rndid, 8); 888 memcpy(&addr.s6_addr[8], idev->rndid, 8);
889 age = (jiffies - ifp->tstamp) / HZ;
889 tmp_valid_lft = min_t(__u32, 890 tmp_valid_lft = min_t(__u32,
890 ifp->valid_lft, 891 ifp->valid_lft,
891 idev->cnf.temp_valid_lft); 892 idev->cnf.temp_valid_lft + age);
892 tmp_prefered_lft = min_t(__u32, 893 tmp_prefered_lft = min_t(__u32,
893 ifp->prefered_lft, 894 ifp->prefered_lft,
894 idev->cnf.temp_prefered_lft - 895 idev->cnf.temp_prefered_lft + age -
895 idev->cnf.max_desync_factor); 896 idev->cnf.max_desync_factor);
896 tmp_plen = ifp->prefix_len; 897 tmp_plen = ifp->prefix_len;
897 max_addresses = idev->cnf.max_addresses; 898 max_addresses = idev->cnf.max_addresses;
@@ -1426,8 +1427,10 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp)
1426{ 1427{
1427 struct inet6_dev *idev = ifp->idev; 1428 struct inet6_dev *idev = ifp->idev;
1428 1429
1429 if (addrconf_dad_end(ifp)) 1430 if (addrconf_dad_end(ifp)) {
1431 in6_ifa_put(ifp);
1430 return; 1432 return;
1433 }
1431 1434
1432 if (net_ratelimit()) 1435 if (net_ratelimit())
1433 printk(KERN_INFO "%s: IPv6 duplicate address %pI6c detected!\n", 1436 printk(KERN_INFO "%s: IPv6 duplicate address %pI6c detected!\n",
@@ -1544,7 +1547,7 @@ static int addrconf_ifid_infiniband(u8 *eui, struct net_device *dev)
1544 return 0; 1547 return 0;
1545} 1548}
1546 1549
1547int __ipv6_isatap_ifid(u8 *eui, __be32 addr) 1550static int __ipv6_isatap_ifid(u8 *eui, __be32 addr)
1548{ 1551{
1549 if (addr == 0) 1552 if (addr == 0)
1550 return -1; 1553 return -1;
@@ -1560,7 +1563,6 @@ int __ipv6_isatap_ifid(u8 *eui, __be32 addr)
1560 memcpy(eui + 4, &addr, 4); 1563 memcpy(eui + 4, &addr, 4);
1561 return 0; 1564 return 0;
1562} 1565}
1563EXPORT_SYMBOL(__ipv6_isatap_ifid);
1564 1566
1565static int addrconf_ifid_sit(u8 *eui, struct net_device *dev) 1567static int addrconf_ifid_sit(u8 *eui, struct net_device *dev)
1566{ 1568{
@@ -2022,10 +2024,11 @@ ok:
2022 ipv6_ifa_notify(0, ift); 2024 ipv6_ifa_notify(0, ift);
2023 } 2025 }
2024 2026
2025 if (create && in6_dev->cnf.use_tempaddr > 0) { 2027 if ((create || list_empty(&in6_dev->tempaddr_list)) && in6_dev->cnf.use_tempaddr > 0) {
2026 /* 2028 /*
2027 * When a new public address is created as described in [ADDRCONF], 2029 * When a new public address is created as described in [ADDRCONF],
2028 * also create a new temporary address. 2030 * also create a new temporary address. Also create a temporary
2031 * address if it's enabled but no temporary address currently exists.
2029 */ 2032 */
2030 read_unlock_bh(&in6_dev->lock); 2033 read_unlock_bh(&in6_dev->lock);
2031 ipv6_create_tempaddr(ifp, NULL); 2034 ipv6_create_tempaddr(ifp, NULL);
@@ -2964,7 +2967,8 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
2964 start sending router solicitations. 2967 start sending router solicitations.
2965 */ 2968 */
2966 2969
2967 if (ifp->idev->cnf.forwarding == 0 && 2970 if ((ifp->idev->cnf.forwarding == 0 ||
2971 ifp->idev->cnf.forwarding == 2) &&
2968 ifp->idev->cnf.rtr_solicits > 0 && 2972 ifp->idev->cnf.rtr_solicits > 0 &&
2969 (dev->flags&IFF_LOOPBACK) == 0 && 2973 (dev->flags&IFF_LOOPBACK) == 0 &&
2970 (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) { 2974 (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) {
@@ -4637,10 +4641,12 @@ int __init addrconf_init(void)
4637 if (err < 0) { 4641 if (err < 0) {
4638 printk(KERN_CRIT "IPv6 Addrconf:" 4642 printk(KERN_CRIT "IPv6 Addrconf:"
4639 " cannot initialize default policy table: %d.\n", err); 4643 " cannot initialize default policy table: %d.\n", err);
4640 return err; 4644 goto out;
4641 } 4645 }
4642 4646
4643 register_pernet_subsys(&addrconf_ops); 4647 err = register_pernet_subsys(&addrconf_ops);
4648 if (err < 0)
4649 goto out_addrlabel;
4644 4650
4645 /* The addrconf netdev notifier requires that loopback_dev 4651 /* The addrconf netdev notifier requires that loopback_dev
4646 * has it's ipv6 private information allocated and setup 4652 * has it's ipv6 private information allocated and setup
@@ -4692,7 +4698,9 @@ errout:
4692 unregister_netdevice_notifier(&ipv6_dev_notf); 4698 unregister_netdevice_notifier(&ipv6_dev_notf);
4693errlo: 4699errlo:
4694 unregister_pernet_subsys(&addrconf_ops); 4700 unregister_pernet_subsys(&addrconf_ops);
4695 4701out_addrlabel:
4702 ipv6_addr_label_cleanup();
4703out:
4696 return err; 4704 return err;
4697} 4705}
4698 4706
@@ -4703,6 +4711,7 @@ void addrconf_cleanup(void)
4703 4711
4704 unregister_netdevice_notifier(&ipv6_dev_notf); 4712 unregister_netdevice_notifier(&ipv6_dev_notf);
4705 unregister_pernet_subsys(&addrconf_ops); 4713 unregister_pernet_subsys(&addrconf_ops);
4714 ipv6_addr_label_cleanup();
4706 4715
4707 rtnl_lock(); 4716 rtnl_lock();
4708 4717
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index f0e774cea386..c8993e5a337c 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -393,6 +393,11 @@ int __init ipv6_addr_label_init(void)
393 return register_pernet_subsys(&ipv6_addr_label_ops); 393 return register_pernet_subsys(&ipv6_addr_label_ops);
394} 394}
395 395
396void ipv6_addr_label_cleanup(void)
397{
398 unregister_pernet_subsys(&ipv6_addr_label_ops);
399}
400
396static const struct nla_policy ifal_policy[IFAL_MAX+1] = { 401static const struct nla_policy ifal_policy[IFAL_MAX+1] = {
397 [IFAL_ADDRESS] = { .len = sizeof(struct in6_addr), }, 402 [IFAL_ADDRESS] = { .len = sizeof(struct in6_addr), },
398 [IFAL_LABEL] = { .len = sizeof(u32), }, 403 [IFAL_LABEL] = { .len = sizeof(u32), },
@@ -513,10 +518,9 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb)
513 518
514static inline int ip6addrlbl_msgsize(void) 519static inline int ip6addrlbl_msgsize(void)
515{ 520{
516 return (NLMSG_ALIGN(sizeof(struct ifaddrlblmsg)) 521 return NLMSG_ALIGN(sizeof(struct ifaddrlblmsg))
517 + nla_total_size(16) /* IFAL_ADDRESS */ 522 + nla_total_size(16) /* IFAL_ADDRESS */
518 + nla_total_size(4) /* IFAL_LABEL */ 523 + nla_total_size(4); /* IFAL_LABEL */
519 );
520} 524}
521 525
522static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh, 526static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 56b9bf2516f4..54e8e42f7a88 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -343,7 +343,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
343 */ 343 */
344 v4addr = LOOPBACK4_IPV6; 344 v4addr = LOOPBACK4_IPV6;
345 if (!(addr_type & IPV6_ADDR_MULTICAST)) { 345 if (!(addr_type & IPV6_ADDR_MULTICAST)) {
346 if (!ipv6_chk_addr(net, &addr->sin6_addr, 346 if (!inet->transparent &&
347 !ipv6_chk_addr(net, &addr->sin6_addr,
347 dev, 0)) { 348 dev, 0)) {
348 err = -EADDRNOTAVAIL; 349 err = -EADDRNOTAVAIL;
349 goto out_unlock; 350 goto out_unlock;
@@ -467,7 +468,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
467 if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) 468 if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL)
468 sin->sin6_scope_id = sk->sk_bound_dev_if; 469 sin->sin6_scope_id = sk->sk_bound_dev_if;
469 *uaddr_len = sizeof(*sin); 470 *uaddr_len = sizeof(*sin);
470 return(0); 471 return 0;
471} 472}
472 473
473EXPORT_SYMBOL(inet6_getname); 474EXPORT_SYMBOL(inet6_getname);
@@ -488,7 +489,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
488 case SIOCADDRT: 489 case SIOCADDRT:
489 case SIOCDELRT: 490 case SIOCDELRT:
490 491
491 return(ipv6_route_ioctl(net, cmd, (void __user *)arg)); 492 return ipv6_route_ioctl(net, cmd, (void __user *)arg);
492 493
493 case SIOCSIFADDR: 494 case SIOCSIFADDR:
494 return addrconf_add_ifaddr(net, (void __user *) arg); 495 return addrconf_add_ifaddr(net, (void __user *) arg);
@@ -502,7 +503,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
502 return sk->sk_prot->ioctl(sk, cmd, arg); 503 return sk->sk_prot->ioctl(sk, cmd, arg);
503 } 504 }
504 /*NOTREACHED*/ 505 /*NOTREACHED*/
505 return(0); 506 return 0;
506} 507}
507 508
508EXPORT_SYMBOL(inet6_ioctl); 509EXPORT_SYMBOL(inet6_ioctl);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 7d929a22cbc2..320bdb877eed 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -105,9 +105,12 @@ ipv4_connected:
105 if (ipv6_addr_any(&np->saddr)) 105 if (ipv6_addr_any(&np->saddr))
106 ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr); 106 ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr);
107 107
108 if (ipv6_addr_any(&np->rcv_saddr)) 108 if (ipv6_addr_any(&np->rcv_saddr)) {
109 ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, 109 ipv6_addr_set_v4mapped(inet->inet_rcv_saddr,
110 &np->rcv_saddr); 110 &np->rcv_saddr);
111 if (sk->sk_prot->rehash)
112 sk->sk_prot->rehash(sk);
113 }
111 114
112 goto out; 115 goto out;
113 } 116 }
@@ -181,6 +184,8 @@ ipv4_connected:
181 if (ipv6_addr_any(&np->rcv_saddr)) { 184 if (ipv6_addr_any(&np->rcv_saddr)) {
182 ipv6_addr_copy(&np->rcv_saddr, &fl.fl6_src); 185 ipv6_addr_copy(&np->rcv_saddr, &fl.fl6_src);
183 inet->inet_rcv_saddr = LOOPBACK4_IPV6; 186 inet->inet_rcv_saddr = LOOPBACK4_IPV6;
187 if (sk->sk_prot->rehash)
188 sk->sk_prot->rehash(sk);
184 } 189 }
185 190
186 ip6_dst_store(sk, dst, 191 ip6_dst_store(sk, dst,
@@ -572,6 +577,25 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb)
572 u8 *ptr = nh + opt->dst1; 577 u8 *ptr = nh + opt->dst1;
573 put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); 578 put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr);
574 } 579 }
580 if (np->rxopt.bits.rxorigdstaddr) {
581 struct sockaddr_in6 sin6;
582 u16 *ports = (u16 *) skb_transport_header(skb);
583
584 if (skb_transport_offset(skb) + 4 <= skb->len) {
585 /* All current transport protocols have the port numbers in the
586 * first four bytes of the transport header and this function is
587 * written with this assumption in mind.
588 */
589
590 sin6.sin6_family = AF_INET6;
591 ipv6_addr_copy(&sin6.sin6_addr, &ipv6_hdr(skb)->daddr);
592 sin6.sin6_port = ports[1];
593 sin6.sin6_flowinfo = 0;
594 sin6.sin6_scope_id = 0;
595
596 put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6);
597 }
598 }
575 return 0; 599 return 0;
576} 600}
577 601
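
The rxorigdstaddr branch added above exports the packet's original IPv6 destination address and port as an IPV6_ORIGDSTADDR control message, which matters for transparent-proxy setups where the socket's local address is not the datagram's destination. A hedged userspace sketch of consuming it follows; the IPV6_RECVORIGDSTADDR option value and the SOL_IPV6/IPV6_ORIGDSTADDR cmsg type are the ones this series is understood to export, and the fallback define is an assumption in case libc headers predate them (error handling trimmed):

#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef IPV6_RECVORIGDSTADDR
#define IPV6_RECVORIGDSTADDR 74          /* assumed value */
#endif
#ifndef IPV6_ORIGDSTADDR
#define IPV6_ORIGDSTADDR IPV6_RECVORIGDSTADDR
#endif

/* fd is assumed bound and to have had IPV6_RECVORIGDSTADDR enabled with
 * setsockopt(fd, SOL_IPV6, IPV6_RECVORIGDSTADDR, &one, sizeof(one)). */
static void read_one(int fd)
{
	char buf[2048], cbuf[256];
	struct sockaddr_in6 orig;
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *c;

	if (recvmsg(fd, &msg, 0) < 0)
		return;

	for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c)) {
		if (c->cmsg_level == SOL_IPV6 && c->cmsg_type == IPV6_ORIGDSTADDR) {
			/* payload is a sockaddr_in6, per the put_cmsg() call above */
			memcpy(&orig, CMSG_DATA(c), sizeof(orig));
			printf("original dst port %u\n", ntohs(orig.sin6_port));
		}
	}
}
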
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index e1caa5d526c2..14ed0a955b56 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -13,12 +13,12 @@ int ipv6_ext_hdr(u8 nexthdr)
13 /* 13 /*
14 * find out if nexthdr is an extension header or a protocol 14 * find out if nexthdr is an extension header or a protocol
15 */ 15 */
16 return ( (nexthdr == NEXTHDR_HOP) || 16 return (nexthdr == NEXTHDR_HOP) ||
17 (nexthdr == NEXTHDR_ROUTING) || 17 (nexthdr == NEXTHDR_ROUTING) ||
18 (nexthdr == NEXTHDR_FRAGMENT) || 18 (nexthdr == NEXTHDR_FRAGMENT) ||
19 (nexthdr == NEXTHDR_AUTH) || 19 (nexthdr == NEXTHDR_AUTH) ||
20 (nexthdr == NEXTHDR_NONE) || 20 (nexthdr == NEXTHDR_NONE) ||
21 (nexthdr == NEXTHDR_DEST) ); 21 (nexthdr == NEXTHDR_DEST);
22} 22}
23 23
24/* 24/*
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index b1108ede18e1..d829874d8946 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -34,11 +34,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi *fl,
34{ 34{
35 struct fib_lookup_arg arg = { 35 struct fib_lookup_arg arg = {
36 .lookup_ptr = lookup, 36 .lookup_ptr = lookup,
37 .flags = FIB_LOOKUP_NOREF,
37 }; 38 };
38 39
39 fib_rules_lookup(net->ipv6.fib6_rules_ops, fl, flags, &arg); 40 fib_rules_lookup(net->ipv6.fib6_rules_ops, fl, flags, &arg);
40 if (arg.rule)
41 fib_rule_put(arg.rule);
42 41
43 if (arg.result) 42 if (arg.result)
44 return arg.result; 43 return arg.result;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index b6a585909d35..de382114609b 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1500,15 +1500,18 @@ static void fib6_gc_timer_cb(unsigned long arg)
1500 1500
1501static int __net_init fib6_net_init(struct net *net) 1501static int __net_init fib6_net_init(struct net *net)
1502{ 1502{
1503 size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
1504
1503 setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net); 1505 setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net);
1504 1506
1505 net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); 1507 net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
1506 if (!net->ipv6.rt6_stats) 1508 if (!net->ipv6.rt6_stats)
1507 goto out_timer; 1509 goto out_timer;
1508 1510
1509 net->ipv6.fib_table_hash = kcalloc(FIB6_TABLE_HASHSZ, 1511 /* Avoid false sharing : Use at least a full cache line */
1510 sizeof(*net->ipv6.fib_table_hash), 1512 size = max_t(size_t, size, L1_CACHE_BYTES);
1511 GFP_KERNEL); 1513
1514 net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
1512 if (!net->ipv6.fib_table_hash) 1515 if (!net->ipv6.fib_table_hash)
1513 goto out_rt6_stats; 1516 goto out_rt6_stats;
1514 1517
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index d40b330c0ee6..99157b4cd56e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -637,9 +637,9 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
637 } 637 }
638 mtu -= hlen + sizeof(struct frag_hdr); 638 mtu -= hlen + sizeof(struct frag_hdr);
639 639
640 if (skb_has_frags(skb)) { 640 if (skb_has_frag_list(skb)) {
641 int first_len = skb_pagelen(skb); 641 int first_len = skb_pagelen(skb);
642 int truesizes = 0; 642 struct sk_buff *frag2;
643 643
644 if (first_len - hlen > mtu || 644 if (first_len - hlen > mtu ||
645 ((first_len - hlen) & 7) || 645 ((first_len - hlen) & 7) ||
@@ -651,18 +651,18 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
651 if (frag->len > mtu || 651 if (frag->len > mtu ||
652 ((frag->len & 7) && frag->next) || 652 ((frag->len & 7) && frag->next) ||
653 skb_headroom(frag) < hlen) 653 skb_headroom(frag) < hlen)
654 goto slow_path; 654 goto slow_path_clean;
655 655
656 /* Partially cloned skb? */ 656 /* Partially cloned skb? */
657 if (skb_shared(frag)) 657 if (skb_shared(frag))
658 goto slow_path; 658 goto slow_path_clean;
659 659
660 BUG_ON(frag->sk); 660 BUG_ON(frag->sk);
661 if (skb->sk) { 661 if (skb->sk) {
662 frag->sk = skb->sk; 662 frag->sk = skb->sk;
663 frag->destructor = sock_wfree; 663 frag->destructor = sock_wfree;
664 truesizes += frag->truesize;
665 } 664 }
665 skb->truesize -= frag->truesize;
666 } 666 }
667 667
668 err = 0; 668 err = 0;
@@ -693,7 +693,6 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
693 693
694 first_len = skb_pagelen(skb); 694 first_len = skb_pagelen(skb);
695 skb->data_len = first_len - skb_headlen(skb); 695 skb->data_len = first_len - skb_headlen(skb);
696 skb->truesize -= truesizes;
697 skb->len = first_len; 696 skb->len = first_len;
698 ipv6_hdr(skb)->payload_len = htons(first_len - 697 ipv6_hdr(skb)->payload_len = htons(first_len -
699 sizeof(struct ipv6hdr)); 698 sizeof(struct ipv6hdr));
@@ -756,6 +755,15 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
756 IPSTATS_MIB_FRAGFAILS); 755 IPSTATS_MIB_FRAGFAILS);
757 dst_release(&rt->dst); 756 dst_release(&rt->dst);
758 return err; 757 return err;
758
759slow_path_clean:
760 skb_walk_frags(skb, frag2) {
761 if (frag2 == frag)
762 break;
763 frag2->sk = NULL;
764 frag2->destructor = NULL;
765 skb->truesize += frag2->truesize;
766 }
759 } 767 }
760 768
761slow_path: 769slow_path:
@@ -870,8 +878,8 @@ static inline int ip6_rt_check(struct rt6key *rt_key,
870 struct in6_addr *fl_addr, 878 struct in6_addr *fl_addr,
871 struct in6_addr *addr_cache) 879 struct in6_addr *addr_cache)
872{ 880{
873 return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) && 881 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
874 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache))); 882 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
875} 883}
876 884
877static struct dst_entry *ip6_sk_dst_check(struct sock *sk, 885static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 0fd027f3f47e..2a59610c2a58 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -75,7 +75,7 @@ MODULE_LICENSE("GPL");
75 (addr)->s6_addr32[2] ^ (addr)->s6_addr32[3]) & \ 75 (addr)->s6_addr32[2] ^ (addr)->s6_addr32[3]) & \
76 (HASH_SIZE - 1)) 76 (HASH_SIZE - 1))
77 77
78static void ip6_tnl_dev_init(struct net_device *dev); 78static int ip6_tnl_dev_init(struct net_device *dev);
79static void ip6_tnl_dev_setup(struct net_device *dev); 79static void ip6_tnl_dev_setup(struct net_device *dev);
80 80
81static int ip6_tnl_net_id __read_mostly; 81static int ip6_tnl_net_id __read_mostly;
@@ -83,15 +83,42 @@ struct ip6_tnl_net {
83 /* the IPv6 tunnel fallback device */ 83 /* the IPv6 tunnel fallback device */
84 struct net_device *fb_tnl_dev; 84 struct net_device *fb_tnl_dev;
85 /* lists for storing tunnels in use */ 85 /* lists for storing tunnels in use */
86 struct ip6_tnl *tnls_r_l[HASH_SIZE]; 86 struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE];
87 struct ip6_tnl *tnls_wc[1]; 87 struct ip6_tnl __rcu *tnls_wc[1];
88 struct ip6_tnl **tnls[2]; 88 struct ip6_tnl __rcu **tnls[2];
89}; 89};
90 90
91/* often modified stats are per cpu, other are shared (netdev->stats) */
92struct pcpu_tstats {
93 unsigned long rx_packets;
94 unsigned long rx_bytes;
95 unsigned long tx_packets;
96 unsigned long tx_bytes;
97};
98
99static struct net_device_stats *ip6_get_stats(struct net_device *dev)
100{
101 struct pcpu_tstats sum = { 0 };
102 int i;
103
104 for_each_possible_cpu(i) {
105 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
106
107 sum.rx_packets += tstats->rx_packets;
108 sum.rx_bytes += tstats->rx_bytes;
109 sum.tx_packets += tstats->tx_packets;
110 sum.tx_bytes += tstats->tx_bytes;
111 }
112 dev->stats.rx_packets = sum.rx_packets;
113 dev->stats.rx_bytes = sum.rx_bytes;
114 dev->stats.tx_packets = sum.tx_packets;
115 dev->stats.tx_bytes = sum.tx_bytes;
116 return &dev->stats;
117}
118
91/* 119/*
92 * Locking : hash tables are protected by RCU and a spinlock 120 * Locking : hash tables are protected by RCU and RTNL
93 */ 121 */
94static DEFINE_SPINLOCK(ip6_tnl_lock);
95 122
96static inline struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) 123static inline struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t)
97{ 124{
@@ -138,8 +165,8 @@ static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst)
138static struct ip6_tnl * 165static struct ip6_tnl *
139ip6_tnl_lookup(struct net *net, struct in6_addr *remote, struct in6_addr *local) 166ip6_tnl_lookup(struct net *net, struct in6_addr *remote, struct in6_addr *local)
140{ 167{
141 unsigned h0 = HASH(remote); 168 unsigned int h0 = HASH(remote);
142 unsigned h1 = HASH(local); 169 unsigned int h1 = HASH(local);
143 struct ip6_tnl *t; 170 struct ip6_tnl *t;
144 struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); 171 struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
145 172
@@ -167,7 +194,7 @@ ip6_tnl_lookup(struct net *net, struct in6_addr *remote, struct in6_addr *local)
167 * Return: head of IPv6 tunnel list 194 * Return: head of IPv6 tunnel list
168 **/ 195 **/
169 196
170static struct ip6_tnl ** 197static struct ip6_tnl __rcu **
171ip6_tnl_bucket(struct ip6_tnl_net *ip6n, struct ip6_tnl_parm *p) 198ip6_tnl_bucket(struct ip6_tnl_net *ip6n, struct ip6_tnl_parm *p)
172{ 199{
173 struct in6_addr *remote = &p->raddr; 200 struct in6_addr *remote = &p->raddr;
@@ -190,12 +217,10 @@ ip6_tnl_bucket(struct ip6_tnl_net *ip6n, struct ip6_tnl_parm *p)
190static void 217static void
191ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) 218ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
192{ 219{
193 struct ip6_tnl **tp = ip6_tnl_bucket(ip6n, &t->parms); 220 struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms);
194 221
195 spin_lock_bh(&ip6_tnl_lock); 222 rcu_assign_pointer(t->next , rtnl_dereference(*tp));
196 t->next = *tp;
197 rcu_assign_pointer(*tp, t); 223 rcu_assign_pointer(*tp, t);
198 spin_unlock_bh(&ip6_tnl_lock);
199} 224}
200 225
201/** 226/**
@@ -206,18 +231,25 @@ ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
206static void 231static void
207ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) 232ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
208{ 233{
209 struct ip6_tnl **tp; 234 struct ip6_tnl __rcu **tp;
210 235 struct ip6_tnl *iter;
211 for (tp = ip6_tnl_bucket(ip6n, &t->parms); *tp; tp = &(*tp)->next) { 236
212 if (t == *tp) { 237 for (tp = ip6_tnl_bucket(ip6n, &t->parms);
213 spin_lock_bh(&ip6_tnl_lock); 238 (iter = rtnl_dereference(*tp)) != NULL;
214 *tp = t->next; 239 tp = &iter->next) {
215 spin_unlock_bh(&ip6_tnl_lock); 240 if (t == iter) {
241 rcu_assign_pointer(*tp, t->next);
216 break; 242 break;
217 } 243 }
218 } 244 }
219} 245}
220 246
247static void ip6_dev_free(struct net_device *dev)
248{
249 free_percpu(dev->tstats);
250 free_netdev(dev);
251}
252
221/** 253/**
222 * ip6_tnl_create() - create a new tunnel 254 * ip6_tnl_create() - create a new tunnel
223 * @p: tunnel parameters 255 * @p: tunnel parameters
@@ -256,7 +288,9 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct ip6_tnl_parm *p)
256 288
257 t = netdev_priv(dev); 289 t = netdev_priv(dev);
258 t->parms = *p; 290 t->parms = *p;
259 ip6_tnl_dev_init(dev); 291 err = ip6_tnl_dev_init(dev);
292 if (err < 0)
293 goto failed_free;
260 294
261 if ((err = register_netdevice(dev)) < 0) 295 if ((err = register_netdevice(dev)) < 0)
262 goto failed_free; 296 goto failed_free;
@@ -266,7 +300,7 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct ip6_tnl_parm *p)
266 return t; 300 return t;
267 301
268failed_free: 302failed_free:
269 free_netdev(dev); 303 ip6_dev_free(dev);
270failed: 304failed:
271 return NULL; 305 return NULL;
272} 306}
@@ -290,10 +324,13 @@ static struct ip6_tnl *ip6_tnl_locate(struct net *net,
290{ 324{
291 struct in6_addr *remote = &p->raddr; 325 struct in6_addr *remote = &p->raddr;
292 struct in6_addr *local = &p->laddr; 326 struct in6_addr *local = &p->laddr;
327 struct ip6_tnl __rcu **tp;
293 struct ip6_tnl *t; 328 struct ip6_tnl *t;
294 struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); 329 struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
295 330
296 for (t = *ip6_tnl_bucket(ip6n, p); t; t = t->next) { 331 for (tp = ip6_tnl_bucket(ip6n, p);
332 (t = rtnl_dereference(*tp)) != NULL;
333 tp = &t->next) {
297 if (ipv6_addr_equal(local, &t->parms.laddr) && 334 if (ipv6_addr_equal(local, &t->parms.laddr) &&
298 ipv6_addr_equal(remote, &t->parms.raddr)) 335 ipv6_addr_equal(remote, &t->parms.raddr))
299 return t; 336 return t;
@@ -318,13 +355,10 @@ ip6_tnl_dev_uninit(struct net_device *dev)
318 struct net *net = dev_net(dev); 355 struct net *net = dev_net(dev);
319 struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); 356 struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
320 357
321 if (dev == ip6n->fb_tnl_dev) { 358 if (dev == ip6n->fb_tnl_dev)
322 spin_lock_bh(&ip6_tnl_lock); 359 rcu_assign_pointer(ip6n->tnls_wc[0], NULL);
323 ip6n->tnls_wc[0] = NULL; 360 else
324 spin_unlock_bh(&ip6_tnl_lock);
325 } else {
326 ip6_tnl_unlink(ip6n, t); 361 ip6_tnl_unlink(ip6n, t);
327 }
328 ip6_tnl_dst_reset(t); 362 ip6_tnl_dst_reset(t);
329 dev_put(dev); 363 dev_put(dev);
330} 364}
@@ -702,6 +736,8 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
702 736
703 if ((t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, 737 if ((t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr,
704 &ipv6h->daddr)) != NULL) { 738 &ipv6h->daddr)) != NULL) {
739 struct pcpu_tstats *tstats;
740
705 if (t->parms.proto != ipproto && t->parms.proto != 0) { 741 if (t->parms.proto != ipproto && t->parms.proto != 0) {
706 rcu_read_unlock(); 742 rcu_read_unlock();
707 goto discard; 743 goto discard;
@@ -724,10 +760,16 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
724 skb->pkt_type = PACKET_HOST; 760 skb->pkt_type = PACKET_HOST;
725 memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); 761 memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
726 762
727 skb_tunnel_rx(skb, t->dev); 763 tstats = this_cpu_ptr(t->dev->tstats);
764 tstats->rx_packets++;
765 tstats->rx_bytes += skb->len;
766
767 __skb_tunnel_rx(skb, t->dev);
728 768
729 dscp_ecn_decapsulate(t, ipv6h, skb); 769 dscp_ecn_decapsulate(t, ipv6h, skb);
770
730 netif_rx(skb); 771 netif_rx(skb);
772
731 rcu_read_unlock(); 773 rcu_read_unlock();
732 return 0; 774 return 0;
733 } 775 }
@@ -934,8 +976,10 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
934 err = ip6_local_out(skb); 976 err = ip6_local_out(skb);
935 977
936 if (net_xmit_eval(err) == 0) { 978 if (net_xmit_eval(err) == 0) {
937 stats->tx_bytes += pkt_len; 979 struct pcpu_tstats *tstats = this_cpu_ptr(t->dev->tstats);
938 stats->tx_packets++; 980
981 tstats->tx_bytes += pkt_len;
982 tstats->tx_packets++;
939 } else { 983 } else {
940 stats->tx_errors++; 984 stats->tx_errors++;
941 stats->tx_aborted_errors++; 985 stats->tx_aborted_errors++;
@@ -1240,6 +1284,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
1240 t = netdev_priv(dev); 1284 t = netdev_priv(dev);
1241 1285
1242 ip6_tnl_unlink(ip6n, t); 1286 ip6_tnl_unlink(ip6n, t);
1287 synchronize_net();
1243 err = ip6_tnl_change(t, &p); 1288 err = ip6_tnl_change(t, &p);
1244 ip6_tnl_link(ip6n, t); 1289 ip6_tnl_link(ip6n, t);
1245 netdev_state_change(dev); 1290 netdev_state_change(dev);
@@ -1300,12 +1345,14 @@ ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
1300 1345
1301 1346
1302static const struct net_device_ops ip6_tnl_netdev_ops = { 1347static const struct net_device_ops ip6_tnl_netdev_ops = {
1303 .ndo_uninit = ip6_tnl_dev_uninit, 1348 .ndo_uninit = ip6_tnl_dev_uninit,
1304 .ndo_start_xmit = ip6_tnl_xmit, 1349 .ndo_start_xmit = ip6_tnl_xmit,
1305 .ndo_do_ioctl = ip6_tnl_ioctl, 1350 .ndo_do_ioctl = ip6_tnl_ioctl,
1306 .ndo_change_mtu = ip6_tnl_change_mtu, 1351 .ndo_change_mtu = ip6_tnl_change_mtu,
1352 .ndo_get_stats = ip6_get_stats,
1307}; 1353};
1308 1354
1355
1309/** 1356/**
1310 * ip6_tnl_dev_setup - setup virtual tunnel device 1357 * ip6_tnl_dev_setup - setup virtual tunnel device
1311 * @dev: virtual device associated with tunnel 1358 * @dev: virtual device associated with tunnel
@@ -1317,7 +1364,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
1317static void ip6_tnl_dev_setup(struct net_device *dev) 1364static void ip6_tnl_dev_setup(struct net_device *dev)
1318{ 1365{
1319 dev->netdev_ops = &ip6_tnl_netdev_ops; 1366 dev->netdev_ops = &ip6_tnl_netdev_ops;
1320 dev->destructor = free_netdev; 1367 dev->destructor = ip6_dev_free;
1321 1368
1322 dev->type = ARPHRD_TUNNEL6; 1369 dev->type = ARPHRD_TUNNEL6;
1323 dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr); 1370 dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr);
@@ -1325,6 +1372,7 @@ static void ip6_tnl_dev_setup(struct net_device *dev)
1325 dev->flags |= IFF_NOARP; 1372 dev->flags |= IFF_NOARP;
1326 dev->addr_len = sizeof(struct in6_addr); 1373 dev->addr_len = sizeof(struct in6_addr);
1327 dev->features |= NETIF_F_NETNS_LOCAL; 1374 dev->features |= NETIF_F_NETNS_LOCAL;
1375 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
1328} 1376}
1329 1377
1330 1378
@@ -1333,12 +1381,17 @@ static void ip6_tnl_dev_setup(struct net_device *dev)
1333 * @dev: virtual device associated with tunnel 1381 * @dev: virtual device associated with tunnel
1334 **/ 1382 **/
1335 1383
1336static inline void 1384static inline int
1337ip6_tnl_dev_init_gen(struct net_device *dev) 1385ip6_tnl_dev_init_gen(struct net_device *dev)
1338{ 1386{
1339 struct ip6_tnl *t = netdev_priv(dev); 1387 struct ip6_tnl *t = netdev_priv(dev);
1388
1340 t->dev = dev; 1389 t->dev = dev;
1341 strcpy(t->parms.name, dev->name); 1390 strcpy(t->parms.name, dev->name);
1391 dev->tstats = alloc_percpu(struct pcpu_tstats);
1392 if (!dev->tstats)
1393 return -ENOMEM;
1394 return 0;
1342} 1395}
1343 1396
1344/** 1397/**
@@ -1346,11 +1399,15 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
1346 * @dev: virtual device associated with tunnel 1399 * @dev: virtual device associated with tunnel
1347 **/ 1400 **/
1348 1401
1349static void ip6_tnl_dev_init(struct net_device *dev) 1402static int ip6_tnl_dev_init(struct net_device *dev)
1350{ 1403{
1351 struct ip6_tnl *t = netdev_priv(dev); 1404 struct ip6_tnl *t = netdev_priv(dev);
1352 ip6_tnl_dev_init_gen(dev); 1405 int err = ip6_tnl_dev_init_gen(dev);
1406
1407 if (err)
1408 return err;
1353 ip6_tnl_link_config(t); 1409 ip6_tnl_link_config(t);
1410 return 0;
1354} 1411}
1355 1412
1356/** 1413/**
@@ -1360,25 +1417,29 @@ static void ip6_tnl_dev_init(struct net_device *dev)
1360 * Return: 0 1417 * Return: 0
1361 **/ 1418 **/
1362 1419
1363static void __net_init ip6_fb_tnl_dev_init(struct net_device *dev) 1420static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev)
1364{ 1421{
1365 struct ip6_tnl *t = netdev_priv(dev); 1422 struct ip6_tnl *t = netdev_priv(dev);
1366 struct net *net = dev_net(dev); 1423 struct net *net = dev_net(dev);
1367 struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); 1424 struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
1425 int err = ip6_tnl_dev_init_gen(dev);
1426
1427 if (err)
1428 return err;
1368 1429
1369 ip6_tnl_dev_init_gen(dev);
1370 t->parms.proto = IPPROTO_IPV6; 1430 t->parms.proto = IPPROTO_IPV6;
1371 dev_hold(dev); 1431 dev_hold(dev);
1372 ip6n->tnls_wc[0] = t; 1432 rcu_assign_pointer(ip6n->tnls_wc[0], t);
1433 return 0;
1373} 1434}
1374 1435
1375static struct xfrm6_tunnel ip4ip6_handler = { 1436static struct xfrm6_tunnel ip4ip6_handler __read_mostly = {
1376 .handler = ip4ip6_rcv, 1437 .handler = ip4ip6_rcv,
1377 .err_handler = ip4ip6_err, 1438 .err_handler = ip4ip6_err,
1378 .priority = 1, 1439 .priority = 1,
1379}; 1440};
1380 1441
1381static struct xfrm6_tunnel ip6ip6_handler = { 1442static struct xfrm6_tunnel ip6ip6_handler __read_mostly = {
1382 .handler = ip6ip6_rcv, 1443 .handler = ip6ip6_rcv,
1383 .err_handler = ip6ip6_err, 1444 .err_handler = ip6ip6_err,
1384 .priority = 1, 1445 .priority = 1,
@@ -1391,14 +1452,14 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct ip6_tnl_net *ip6n)
1391 LIST_HEAD(list); 1452 LIST_HEAD(list);
1392 1453
1393 for (h = 0; h < HASH_SIZE; h++) { 1454 for (h = 0; h < HASH_SIZE; h++) {
1394 t = ip6n->tnls_r_l[h]; 1455 t = rtnl_dereference(ip6n->tnls_r_l[h]);
1395 while (t != NULL) { 1456 while (t != NULL) {
1396 unregister_netdevice_queue(t->dev, &list); 1457 unregister_netdevice_queue(t->dev, &list);
1397 t = t->next; 1458 t = rtnl_dereference(t->next);
1398 } 1459 }
1399 } 1460 }
1400 1461
1401 t = ip6n->tnls_wc[0]; 1462 t = rtnl_dereference(ip6n->tnls_wc[0]);
1402 unregister_netdevice_queue(t->dev, &list); 1463 unregister_netdevice_queue(t->dev, &list);
1403 unregister_netdevice_many(&list); 1464 unregister_netdevice_many(&list);
1404} 1465}
@@ -1419,7 +1480,9 @@ static int __net_init ip6_tnl_init_net(struct net *net)
1419 goto err_alloc_dev; 1480 goto err_alloc_dev;
1420 dev_net_set(ip6n->fb_tnl_dev, net); 1481 dev_net_set(ip6n->fb_tnl_dev, net);
1421 1482
1422 ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); 1483 err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev);
1484 if (err < 0)
1485 goto err_register;
1423 1486
1424 err = register_netdev(ip6n->fb_tnl_dev); 1487 err = register_netdev(ip6n->fb_tnl_dev);
1425 if (err < 0) 1488 if (err < 0)
@@ -1427,7 +1490,7 @@ static int __net_init ip6_tnl_init_net(struct net *net)
1427 return 0; 1490 return 0;
1428 1491
1429err_register: 1492err_register:
1430 free_netdev(ip6n->fb_tnl_dev); 1493 ip6_dev_free(ip6n->fb_tnl_dev);
1431err_alloc_dev: 1494err_alloc_dev:
1432 return err; 1495 return err;
1433} 1496}
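The ip6_tunnel.c hunks above switch the driver to per-CPU pcpu_tstats counters (allocated in ip6_tnl_dev_init_gen(), summed on demand through the new .ndo_get_stats hook) and to RCU-style tunnel list handling (rtnl_dereference()/rcu_assign_pointer() plus a synchronize_net() in the ioctl path) in place of the old ip6_tnl_lock spinlock. Below is a minimal userspace sketch of the per-CPU counter idea only; it uses plain C with a fixed CPU count rather than the kernel's alloc_percpu()/this_cpu_ptr() API, and all names are illustrative.

#include <stdint.h>
#include <stdio.h>

#define NCPUS 4	/* stand-in for the kernel's dynamic per-CPU allocation */

struct tstats {
	uint64_t rx_packets, rx_bytes;
	uint64_t tx_packets, tx_bytes;
};

/* one counter block per CPU; the idea is that each CPU only ever
 * writes its own slot, so the fast path needs no shared lock */
static struct tstats percpu[NCPUS];

static void rx_account(int cpu, unsigned int len)
{
	percpu[cpu].rx_packets++;
	percpu[cpu].rx_bytes += len;
}

/* .ndo_get_stats analogue: fold all per-CPU slots into one total */
static struct tstats get_stats(void)
{
	struct tstats sum = { 0 };

	for (int i = 0; i < NCPUS; i++) {
		sum.rx_packets += percpu[i].rx_packets;
		sum.rx_bytes   += percpu[i].rx_bytes;
		sum.tx_packets += percpu[i].tx_packets;
		sum.tx_bytes   += percpu[i].tx_bytes;
	}
	return sum;
}

int main(void)
{
	rx_account(0, 1500);
	rx_account(2, 60);

	struct tstats t = get_stats();
	printf("rx: %llu packets, %llu bytes\n",
	       (unsigned long long)t.rx_packets,
	       (unsigned long long)t.rx_bytes);
	return 0;
}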
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 66078dad7fe8..6f32ffce7022 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -667,6 +667,7 @@ static int pim6_rcv(struct sk_buff *skb)
667 skb_tunnel_rx(skb, reg_dev); 667 skb_tunnel_rx(skb, reg_dev);
668 668
669 netif_rx(skb); 669 netif_rx(skb);
670
670 dev_put(reg_dev); 671 dev_put(reg_dev);
671 return 0; 672 return 0;
672 drop: 673 drop:
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index a7f66bc8f0b0..d1770e061c08 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -342,6 +342,25 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
342 retv = 0; 342 retv = 0;
343 break; 343 break;
344 344
345 case IPV6_TRANSPARENT:
346 if (!capable(CAP_NET_ADMIN)) {
347 retv = -EPERM;
348 break;
349 }
350 if (optlen < sizeof(int))
351 goto e_inval;
 352 /* we don't have a separate transparent bit for IPv6; we use the one in the IPv4 socket */
353 inet_sk(sk)->transparent = valbool;
354 retv = 0;
355 break;
356
357 case IPV6_RECVORIGDSTADDR:
358 if (optlen < sizeof(int))
359 goto e_inval;
360 np->rxopt.bits.rxorigdstaddr = valbool;
361 retv = 0;
362 break;
363
345 case IPV6_HOPOPTS: 364 case IPV6_HOPOPTS:
346 case IPV6_RTHDRDSTOPTS: 365 case IPV6_RTHDRDSTOPTS:
347 case IPV6_RTHDR: 366 case IPV6_RTHDR:
@@ -1104,6 +1123,14 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
1104 break; 1123 break;
1105 } 1124 }
1106 1125
1126 case IPV6_TRANSPARENT:
1127 val = inet_sk(sk)->transparent;
1128 break;
1129
1130 case IPV6_RECVORIGDSTADDR:
1131 val = np->rxopt.bits.rxorigdstaddr;
1132 break;
1133
1107 case IPV6_UNICAST_HOPS: 1134 case IPV6_UNICAST_HOPS:
1108 case IPV6_MULTICAST_HOPS: 1135 case IPV6_MULTICAST_HOPS:
1109 { 1136 {
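The ipv6_sockglue.c hunk adds two new socket options: IPV6_TRANSPARENT (gated on CAP_NET_ADMIN and stored in the shared inet transparent bit) and IPV6_RECVORIGDSTADDR. A minimal userspace sketch of enabling the transparent option follows, assuming a kernel with this patch and CAP_NET_ADMIN; the fallback #define mirrors the UAPI value and is only there for older headers.

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef IPV6_TRANSPARENT
#define IPV6_TRANSPARENT 75	/* value from linux/in6.h */
#endif

int main(void)
{
	int one = 1;
	int fd = socket(AF_INET6, SOCK_STREAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* needs CAP_NET_ADMIN, exactly like the capable() check in the hunk */
	if (setsockopt(fd, IPPROTO_IPV6, IPV6_TRANSPARENT, &one, sizeof(one)) < 0)
		perror("setsockopt(IPV6_TRANSPARENT)");
	else
		printf("transparent bit set\n");
	return 0;
}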
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 58841c4ae947..998d6d27e7cf 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -91,7 +91,9 @@
91#include <linux/netfilter.h> 91#include <linux/netfilter.h>
92#include <linux/netfilter_ipv6.h> 92#include <linux/netfilter_ipv6.h>
93 93
94static u32 ndisc_hash(const void *pkey, const struct net_device *dev); 94static u32 ndisc_hash(const void *pkey,
95 const struct net_device *dev,
96 __u32 rnd);
95static int ndisc_constructor(struct neighbour *neigh); 97static int ndisc_constructor(struct neighbour *neigh);
96static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); 98static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb);
97static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); 99static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -228,12 +230,12 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur,
228 do { 230 do {
229 cur = ((void *)cur) + (cur->nd_opt_len << 3); 231 cur = ((void *)cur) + (cur->nd_opt_len << 3);
230 } while(cur < end && cur->nd_opt_type != type); 232 } while(cur < end && cur->nd_opt_type != type);
231 return (cur <= end && cur->nd_opt_type == type ? cur : NULL); 233 return cur <= end && cur->nd_opt_type == type ? cur : NULL;
232} 234}
233 235
234static inline int ndisc_is_useropt(struct nd_opt_hdr *opt) 236static inline int ndisc_is_useropt(struct nd_opt_hdr *opt)
235{ 237{
236 return (opt->nd_opt_type == ND_OPT_RDNSS); 238 return opt->nd_opt_type == ND_OPT_RDNSS;
237} 239}
238 240
239static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur, 241static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur,
@@ -244,7 +246,7 @@ static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur,
244 do { 246 do {
245 cur = ((void *)cur) + (cur->nd_opt_len << 3); 247 cur = ((void *)cur) + (cur->nd_opt_len << 3);
246 } while(cur < end && !ndisc_is_useropt(cur)); 248 } while(cur < end && !ndisc_is_useropt(cur));
247 return (cur <= end && ndisc_is_useropt(cur) ? cur : NULL); 249 return cur <= end && ndisc_is_useropt(cur) ? cur : NULL;
248} 250}
249 251
250static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, 252static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
@@ -319,7 +321,7 @@ static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p,
319 int prepad = ndisc_addr_option_pad(dev->type); 321 int prepad = ndisc_addr_option_pad(dev->type);
320 if (lladdrlen != NDISC_OPT_SPACE(dev->addr_len + prepad)) 322 if (lladdrlen != NDISC_OPT_SPACE(dev->addr_len + prepad))
321 return NULL; 323 return NULL;
322 return (lladdr + prepad); 324 return lladdr + prepad;
323} 325}
324 326
325int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int dir) 327int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int dir)
@@ -350,7 +352,9 @@ int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int d
350 352
351EXPORT_SYMBOL(ndisc_mc_map); 353EXPORT_SYMBOL(ndisc_mc_map);
352 354
353static u32 ndisc_hash(const void *pkey, const struct net_device *dev) 355static u32 ndisc_hash(const void *pkey,
356 const struct net_device *dev,
357 __u32 hash_rnd)
354{ 358{
355 const u32 *p32 = pkey; 359 const u32 *p32 = pkey;
356 u32 addr_hash, i; 360 u32 addr_hash, i;
@@ -359,7 +363,7 @@ static u32 ndisc_hash(const void *pkey, const struct net_device *dev)
359 for (i = 0; i < (sizeof(struct in6_addr) / sizeof(u32)); i++) 363 for (i = 0; i < (sizeof(struct in6_addr) / sizeof(u32)); i++)
360 addr_hash ^= *p32++; 364 addr_hash ^= *p32++;
361 365
362 return jhash_2words(addr_hash, dev->ifindex, nd_tbl.hash_rnd); 366 return jhash_2words(addr_hash, dev->ifindex, hash_rnd);
363} 367}
364 368
365static int ndisc_constructor(struct neighbour *neigh) 369static int ndisc_constructor(struct neighbour *neigh)
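ndisc_hash() now receives the hash seed as an argument instead of reading nd_tbl.hash_rnd itself, which lets the neighbour core rehash the table with a fresh random seed. A self-contained sketch of the same fold-then-mix shape is below; mix2() is a simple stand-in mixer, not the kernel's jhash_2words(), and the address constants are purely illustrative.

#include <stdint.h>
#include <stdio.h>

/* stand-in for jhash_2words(); NOT the kernel's mixing function */
static uint32_t mix2(uint32_t a, uint32_t b, uint32_t seed)
{
	uint32_t h = seed ^ a;

	h = (h ^ (h >> 16)) * 0x45d9f3b;
	h ^= b;
	h = (h ^ (h >> 16)) * 0x45d9f3b;
	return h ^ (h >> 16);
}

/* fold the 128-bit address to 32 bits, then mix with the ifindex and the
 * caller-supplied seed -- because the seed is a parameter, the table can
 * be rehashed with a new random value at any time */
static uint32_t ndisc_hash_like(const uint32_t addr[4], int ifindex, uint32_t hash_rnd)
{
	uint32_t fold = 0;

	for (int i = 0; i < 4; i++)
		fold ^= addr[i];
	return mix2(fold, (uint32_t)ifindex, hash_rnd);
}

int main(void)
{
	uint32_t addr[4] = { 0x20010db8, 0, 0, 1 };	/* 2001:db8::1, illustrative */

	printf("seed 1: %08x\n", ndisc_hash_like(addr, 2, 0xdeadbeef));
	printf("seed 2: %08x\n", ndisc_hash_like(addr, 2, 0x12345678));
	return 0;
}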
@@ -1105,6 +1109,18 @@ errout:
1105 rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err); 1109 rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err);
1106} 1110}
1107 1111
1112static inline int accept_ra(struct inet6_dev *in6_dev)
1113{
1114 /*
1115 * If forwarding is enabled, RA are not accepted unless the special
1116 * hybrid mode (accept_ra=2) is enabled.
1117 */
1118 if (in6_dev->cnf.forwarding && in6_dev->cnf.accept_ra < 2)
1119 return 0;
1120
1121 return in6_dev->cnf.accept_ra;
1122}
1123
1108static void ndisc_router_discovery(struct sk_buff *skb) 1124static void ndisc_router_discovery(struct sk_buff *skb)
1109{ 1125{
1110 struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb); 1126 struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb);
@@ -1158,8 +1174,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
1158 return; 1174 return;
1159 } 1175 }
1160 1176
1161 /* skip route and link configuration on routers */ 1177 if (!accept_ra(in6_dev))
1162 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_ra)
1163 goto skip_linkparms; 1178 goto skip_linkparms;
1164 1179
1165#ifdef CONFIG_IPV6_NDISC_NODETYPE 1180#ifdef CONFIG_IPV6_NDISC_NODETYPE
@@ -1309,8 +1324,7 @@ skip_linkparms:
1309 NEIGH_UPDATE_F_ISROUTER); 1324 NEIGH_UPDATE_F_ISROUTER);
1310 } 1325 }
1311 1326
1312 /* skip route and link configuration on routers */ 1327 if (!accept_ra(in6_dev))
1313 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_ra)
1314 goto out; 1328 goto out;
1315 1329
1316#ifdef CONFIG_IPV6_ROUTE_INFO 1330#ifdef CONFIG_IPV6_ROUTE_INFO
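The two ndisc_router_discovery() hunks replace the open-coded "skip on routers" test with the accept_ra() helper added above, so the hybrid mode accept_ra=2 keeps processing RAs even when forwarding is enabled. The decision table, as a tiny standalone C check that mirrors only the sysctl semantics in the hunk (not kernel code):

#include <stdio.h>

/* mirrors the accept_ra() helper: with forwarding on, RAs are only
 * honoured in the special hybrid mode accept_ra=2 */
static int accept_ra(int forwarding, int accept_ra_sysctl)
{
	if (forwarding && accept_ra_sysctl < 2)
		return 0;
	return accept_ra_sysctl;
}

int main(void)
{
	for (int fwd = 0; fwd <= 1; fwd++)
		for (int ra = 0; ra <= 2; ra++)
			printf("forwarding=%d accept_ra=%d -> %s\n",
			       fwd, ra,
			       accept_ra(fwd, ra) ? "process RA" : "skip");
	return 0;
}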
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 29d643bcafa4..448464844a25 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -5,10 +5,15 @@
5menu "IPv6: Netfilter Configuration" 5menu "IPv6: Netfilter Configuration"
6 depends on INET && IPV6 && NETFILTER 6 depends on INET && IPV6 && NETFILTER
7 7
8config NF_DEFRAG_IPV6
9 tristate
10 default n
11
8config NF_CONNTRACK_IPV6 12config NF_CONNTRACK_IPV6
9 tristate "IPv6 connection tracking support" 13 tristate "IPv6 connection tracking support"
10 depends on INET && IPV6 && NF_CONNTRACK 14 depends on INET && IPV6 && NF_CONNTRACK
11 default m if NETFILTER_ADVANCED=n 15 default m if NETFILTER_ADVANCED=n
16 select NF_DEFRAG_IPV6
12 ---help--- 17 ---help---
13 Connection tracking keeps a record of what packets have passed 18 Connection tracking keeps a record of what packets have passed
14 through your machine, in order to figure out how they are related 19 through your machine, in order to figure out how they are related
@@ -132,10 +137,10 @@ config IP6_NF_MATCH_RT
132# The targets 137# The targets
133config IP6_NF_TARGET_HL 138config IP6_NF_TARGET_HL
134 tristate '"HL" hoplimit target support' 139 tristate '"HL" hoplimit target support'
135 depends on NETFILTER_ADVANCED 140 depends on NETFILTER_ADVANCED && IP6_NF_MANGLE
136 select NETFILTER_XT_TARGET_HL 141 select NETFILTER_XT_TARGET_HL
137 ---help--- 142 ---help---
138 This is a backwards-compat option for the user's convenience 143 This is a backwards-compatible option for the user's convenience
139 (e.g. when running oldconfig). It selects 144 (e.g. when running oldconfig). It selects
140 CONFIG_NETFILTER_XT_TARGET_HL. 145 CONFIG_NETFILTER_XT_TARGET_HL.
141 146
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index aafbba30c899..0a432c9b0795 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -11,10 +11,14 @@ obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
11obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o 11obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o
12 12
13# objects for l3 independent conntrack 13# objects for l3 independent conntrack
14nf_conntrack_ipv6-objs := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o nf_conntrack_reasm.o 14nf_conntrack_ipv6-objs := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
15 15
16# l3 independent conntrack 16# l3 independent conntrack
17obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o 17obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_defrag_ipv6.o
18
19# defrag
20nf_defrag_ipv6-objs := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o
21obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
18 22
19# matches 23# matches
20obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o 24obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 5359ef4daac5..51df035897e7 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -82,13 +82,13 @@ EXPORT_SYMBOL_GPL(ip6t_alloc_initial_table);
82int 82int
83ip6t_ext_hdr(u8 nexthdr) 83ip6t_ext_hdr(u8 nexthdr)
84{ 84{
85 return ( (nexthdr == IPPROTO_HOPOPTS) || 85 return (nexthdr == IPPROTO_HOPOPTS) ||
86 (nexthdr == IPPROTO_ROUTING) || 86 (nexthdr == IPPROTO_ROUTING) ||
87 (nexthdr == IPPROTO_FRAGMENT) || 87 (nexthdr == IPPROTO_FRAGMENT) ||
88 (nexthdr == IPPROTO_ESP) || 88 (nexthdr == IPPROTO_ESP) ||
89 (nexthdr == IPPROTO_AH) || 89 (nexthdr == IPPROTO_AH) ||
90 (nexthdr == IPPROTO_NONE) || 90 (nexthdr == IPPROTO_NONE) ||
91 (nexthdr == IPPROTO_DSTOPTS) ); 91 (nexthdr == IPPROTO_DSTOPTS);
92} 92}
93 93
94/* Returns whether matches rule or not. */ 94/* Returns whether matches rule or not. */
@@ -215,7 +215,7 @@ static inline bool unconditional(const struct ip6t_ip6 *ipv6)
215 return memcmp(ipv6, &uncond, sizeof(uncond)) == 0; 215 return memcmp(ipv6, &uncond, sizeof(uncond)) == 0;
216} 216}
217 217
218static inline const struct ip6t_entry_target * 218static inline const struct xt_entry_target *
219ip6t_get_target_c(const struct ip6t_entry *e) 219ip6t_get_target_c(const struct ip6t_entry *e)
220{ 220{
221 return ip6t_get_target((struct ip6t_entry *)e); 221 return ip6t_get_target((struct ip6t_entry *)e);
@@ -260,9 +260,9 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e,
260 const char *hookname, const char **chainname, 260 const char *hookname, const char **chainname,
261 const char **comment, unsigned int *rulenum) 261 const char **comment, unsigned int *rulenum)
262{ 262{
263 const struct ip6t_standard_target *t = (void *)ip6t_get_target_c(s); 263 const struct xt_standard_target *t = (void *)ip6t_get_target_c(s);
264 264
265 if (strcmp(t->target.u.kernel.target->name, IP6T_ERROR_TARGET) == 0) { 265 if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
266 /* Head of user chain: ERROR target with chainname */ 266 /* Head of user chain: ERROR target with chainname */
267 *chainname = t->target.data; 267 *chainname = t->target.data;
268 (*rulenum) = 0; 268 (*rulenum) = 0;
@@ -271,7 +271,7 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e,
271 271
272 if (s->target_offset == sizeof(struct ip6t_entry) && 272 if (s->target_offset == sizeof(struct ip6t_entry) &&
273 strcmp(t->target.u.kernel.target->name, 273 strcmp(t->target.u.kernel.target->name,
274 IP6T_STANDARD_TARGET) == 0 && 274 XT_STANDARD_TARGET) == 0 &&
275 t->verdict < 0 && 275 t->verdict < 0 &&
276 unconditional(&s->ipv6)) { 276 unconditional(&s->ipv6)) {
277 /* Tail of chains: STANDARD target (return/policy) */ 277 /* Tail of chains: STANDARD target (return/policy) */
@@ -369,7 +369,7 @@ ip6t_do_table(struct sk_buff *skb,
369 e = get_entry(table_base, private->hook_entry[hook]); 369 e = get_entry(table_base, private->hook_entry[hook]);
370 370
371 do { 371 do {
372 const struct ip6t_entry_target *t; 372 const struct xt_entry_target *t;
373 const struct xt_entry_match *ematch; 373 const struct xt_entry_match *ematch;
374 374
375 IP_NF_ASSERT(e); 375 IP_NF_ASSERT(e);
@@ -403,10 +403,10 @@ ip6t_do_table(struct sk_buff *skb,
403 if (!t->u.kernel.target->target) { 403 if (!t->u.kernel.target->target) {
404 int v; 404 int v;
405 405
406 v = ((struct ip6t_standard_target *)t)->verdict; 406 v = ((struct xt_standard_target *)t)->verdict;
407 if (v < 0) { 407 if (v < 0) {
408 /* Pop from stack? */ 408 /* Pop from stack? */
409 if (v != IP6T_RETURN) { 409 if (v != XT_RETURN) {
410 verdict = (unsigned)(-v) - 1; 410 verdict = (unsigned)(-v) - 1;
411 break; 411 break;
412 } 412 }
@@ -434,7 +434,7 @@ ip6t_do_table(struct sk_buff *skb,
434 acpar.targinfo = t->data; 434 acpar.targinfo = t->data;
435 435
436 verdict = t->u.kernel.target->target(skb, &acpar); 436 verdict = t->u.kernel.target->target(skb, &acpar);
437 if (verdict == IP6T_CONTINUE) 437 if (verdict == XT_CONTINUE)
438 e = ip6t_next_entry(e); 438 e = ip6t_next_entry(e);
439 else 439 else
440 /* Verdict */ 440 /* Verdict */
@@ -474,7 +474,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
474 e->counters.pcnt = pos; 474 e->counters.pcnt = pos;
475 475
476 for (;;) { 476 for (;;) {
477 const struct ip6t_standard_target *t 477 const struct xt_standard_target *t
478 = (void *)ip6t_get_target_c(e); 478 = (void *)ip6t_get_target_c(e);
479 int visited = e->comefrom & (1 << hook); 479 int visited = e->comefrom & (1 << hook);
480 480
@@ -488,13 +488,13 @@ mark_source_chains(const struct xt_table_info *newinfo,
488 /* Unconditional return/END. */ 488 /* Unconditional return/END. */
489 if ((e->target_offset == sizeof(struct ip6t_entry) && 489 if ((e->target_offset == sizeof(struct ip6t_entry) &&
490 (strcmp(t->target.u.user.name, 490 (strcmp(t->target.u.user.name,
491 IP6T_STANDARD_TARGET) == 0) && 491 XT_STANDARD_TARGET) == 0) &&
492 t->verdict < 0 && 492 t->verdict < 0 &&
493 unconditional(&e->ipv6)) || visited) { 493 unconditional(&e->ipv6)) || visited) {
494 unsigned int oldpos, size; 494 unsigned int oldpos, size;
495 495
496 if ((strcmp(t->target.u.user.name, 496 if ((strcmp(t->target.u.user.name,
497 IP6T_STANDARD_TARGET) == 0) && 497 XT_STANDARD_TARGET) == 0) &&
498 t->verdict < -NF_MAX_VERDICT - 1) { 498 t->verdict < -NF_MAX_VERDICT - 1) {
499 duprintf("mark_source_chains: bad " 499 duprintf("mark_source_chains: bad "
500 "negative verdict (%i)\n", 500 "negative verdict (%i)\n",
@@ -537,7 +537,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
537 int newpos = t->verdict; 537 int newpos = t->verdict;
538 538
539 if (strcmp(t->target.u.user.name, 539 if (strcmp(t->target.u.user.name,
540 IP6T_STANDARD_TARGET) == 0 && 540 XT_STANDARD_TARGET) == 0 &&
541 newpos >= 0) { 541 newpos >= 0) {
542 if (newpos > newinfo->size - 542 if (newpos > newinfo->size -
543 sizeof(struct ip6t_entry)) { 543 sizeof(struct ip6t_entry)) {
@@ -565,7 +565,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
565 return 1; 565 return 1;
566} 566}
567 567
568static void cleanup_match(struct ip6t_entry_match *m, struct net *net) 568static void cleanup_match(struct xt_entry_match *m, struct net *net)
569{ 569{
570 struct xt_mtdtor_param par; 570 struct xt_mtdtor_param par;
571 571
@@ -581,14 +581,14 @@ static void cleanup_match(struct ip6t_entry_match *m, struct net *net)
581static int 581static int
582check_entry(const struct ip6t_entry *e, const char *name) 582check_entry(const struct ip6t_entry *e, const char *name)
583{ 583{
584 const struct ip6t_entry_target *t; 584 const struct xt_entry_target *t;
585 585
586 if (!ip6_checkentry(&e->ipv6)) { 586 if (!ip6_checkentry(&e->ipv6)) {
587 duprintf("ip_tables: ip check failed %p %s.\n", e, name); 587 duprintf("ip_tables: ip check failed %p %s.\n", e, name);
588 return -EINVAL; 588 return -EINVAL;
589 } 589 }
590 590
591 if (e->target_offset + sizeof(struct ip6t_entry_target) > 591 if (e->target_offset + sizeof(struct xt_entry_target) >
592 e->next_offset) 592 e->next_offset)
593 return -EINVAL; 593 return -EINVAL;
594 594
@@ -599,7 +599,7 @@ check_entry(const struct ip6t_entry *e, const char *name)
599 return 0; 599 return 0;
600} 600}
601 601
602static int check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par) 602static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
603{ 603{
604 const struct ip6t_ip6 *ipv6 = par->entryinfo; 604 const struct ip6t_ip6 *ipv6 = par->entryinfo;
605 int ret; 605 int ret;
@@ -618,7 +618,7 @@ static int check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par)
618} 618}
619 619
620static int 620static int
621find_check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par) 621find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
622{ 622{
623 struct xt_match *match; 623 struct xt_match *match;
624 int ret; 624 int ret;
@@ -643,7 +643,7 @@ err:
643 643
644static int check_target(struct ip6t_entry *e, struct net *net, const char *name) 644static int check_target(struct ip6t_entry *e, struct net *net, const char *name)
645{ 645{
646 struct ip6t_entry_target *t = ip6t_get_target(e); 646 struct xt_entry_target *t = ip6t_get_target(e);
647 struct xt_tgchk_param par = { 647 struct xt_tgchk_param par = {
648 .net = net, 648 .net = net,
649 .table = name, 649 .table = name,
@@ -670,7 +670,7 @@ static int
670find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, 670find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
671 unsigned int size) 671 unsigned int size)
672{ 672{
673 struct ip6t_entry_target *t; 673 struct xt_entry_target *t;
674 struct xt_target *target; 674 struct xt_target *target;
675 int ret; 675 int ret;
676 unsigned int j; 676 unsigned int j;
@@ -721,7 +721,7 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
721 721
722static bool check_underflow(const struct ip6t_entry *e) 722static bool check_underflow(const struct ip6t_entry *e)
723{ 723{
724 const struct ip6t_entry_target *t; 724 const struct xt_entry_target *t;
725 unsigned int verdict; 725 unsigned int verdict;
726 726
727 if (!unconditional(&e->ipv6)) 727 if (!unconditional(&e->ipv6))
@@ -729,7 +729,7 @@ static bool check_underflow(const struct ip6t_entry *e)
729 t = ip6t_get_target_c(e); 729 t = ip6t_get_target_c(e);
730 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) 730 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
731 return false; 731 return false;
732 verdict = ((struct ip6t_standard_target *)t)->verdict; 732 verdict = ((struct xt_standard_target *)t)->verdict;
733 verdict = -verdict - 1; 733 verdict = -verdict - 1;
734 return verdict == NF_DROP || verdict == NF_ACCEPT; 734 return verdict == NF_DROP || verdict == NF_ACCEPT;
735} 735}
@@ -752,7 +752,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
752 } 752 }
753 753
754 if (e->next_offset 754 if (e->next_offset
755 < sizeof(struct ip6t_entry) + sizeof(struct ip6t_entry_target)) { 755 < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) {
756 duprintf("checking: element %p size %u\n", 756 duprintf("checking: element %p size %u\n",
757 e, e->next_offset); 757 e, e->next_offset);
758 return -EINVAL; 758 return -EINVAL;
@@ -784,7 +784,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
784static void cleanup_entry(struct ip6t_entry *e, struct net *net) 784static void cleanup_entry(struct ip6t_entry *e, struct net *net)
785{ 785{
786 struct xt_tgdtor_param par; 786 struct xt_tgdtor_param par;
787 struct ip6t_entry_target *t; 787 struct xt_entry_target *t;
788 struct xt_entry_match *ematch; 788 struct xt_entry_match *ematch;
789 789
790 /* Cleanup all matches */ 790 /* Cleanup all matches */
@@ -922,6 +922,7 @@ get_counters(const struct xt_table_info *t,
922 if (cpu == curcpu) 922 if (cpu == curcpu)
923 continue; 923 continue;
924 i = 0; 924 i = 0;
925 local_bh_disable();
925 xt_info_wrlock(cpu); 926 xt_info_wrlock(cpu);
926 xt_entry_foreach(iter, t->entries[cpu], t->size) { 927 xt_entry_foreach(iter, t->entries[cpu], t->size) {
927 ADD_COUNTER(counters[i], iter->counters.bcnt, 928 ADD_COUNTER(counters[i], iter->counters.bcnt,
@@ -929,6 +930,7 @@ get_counters(const struct xt_table_info *t,
929 ++i; 930 ++i;
930 } 931 }
931 xt_info_wrunlock(cpu); 932 xt_info_wrunlock(cpu);
933 local_bh_enable();
932 } 934 }
933 put_cpu(); 935 put_cpu();
934} 936}
@@ -983,8 +985,8 @@ copy_entries_to_user(unsigned int total_size,
983 /* ... then go back and fix counters and names */ 985 /* ... then go back and fix counters and names */
984 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ 986 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
985 unsigned int i; 987 unsigned int i;
986 const struct ip6t_entry_match *m; 988 const struct xt_entry_match *m;
987 const struct ip6t_entry_target *t; 989 const struct xt_entry_target *t;
988 990
989 e = (struct ip6t_entry *)(loc_cpu_entry + off); 991 e = (struct ip6t_entry *)(loc_cpu_entry + off);
990 if (copy_to_user(userptr + off 992 if (copy_to_user(userptr + off
@@ -1001,7 +1003,7 @@ copy_entries_to_user(unsigned int total_size,
1001 m = (void *)e + i; 1003 m = (void *)e + i;
1002 1004
1003 if (copy_to_user(userptr + off + i 1005 if (copy_to_user(userptr + off + i
1004 + offsetof(struct ip6t_entry_match, 1006 + offsetof(struct xt_entry_match,
1005 u.user.name), 1007 u.user.name),
1006 m->u.kernel.match->name, 1008 m->u.kernel.match->name,
1007 strlen(m->u.kernel.match->name)+1) 1009 strlen(m->u.kernel.match->name)+1)
@@ -1013,7 +1015,7 @@ copy_entries_to_user(unsigned int total_size,
1013 1015
1014 t = ip6t_get_target_c(e); 1016 t = ip6t_get_target_c(e);
1015 if (copy_to_user(userptr + off + e->target_offset 1017 if (copy_to_user(userptr + off + e->target_offset
1016 + offsetof(struct ip6t_entry_target, 1018 + offsetof(struct xt_entry_target,
1017 u.user.name), 1019 u.user.name),
1018 t->u.kernel.target->name, 1020 t->u.kernel.target->name,
1019 strlen(t->u.kernel.target->name)+1) != 0) { 1021 strlen(t->u.kernel.target->name)+1) != 0) {
@@ -1051,7 +1053,7 @@ static int compat_calc_entry(const struct ip6t_entry *e,
1051 const void *base, struct xt_table_info *newinfo) 1053 const void *base, struct xt_table_info *newinfo)
1052{ 1054{
1053 const struct xt_entry_match *ematch; 1055 const struct xt_entry_match *ematch;
1054 const struct ip6t_entry_target *t; 1056 const struct xt_entry_target *t;
1055 unsigned int entry_offset; 1057 unsigned int entry_offset;
1056 int off, i, ret; 1058 int off, i, ret;
1057 1059
@@ -1103,7 +1105,7 @@ static int compat_table_info(const struct xt_table_info *info,
1103static int get_info(struct net *net, void __user *user, 1105static int get_info(struct net *net, void __user *user,
1104 const int *len, int compat) 1106 const int *len, int compat)
1105{ 1107{
1106 char name[IP6T_TABLE_MAXNAMELEN]; 1108 char name[XT_TABLE_MAXNAMELEN];
1107 struct xt_table *t; 1109 struct xt_table *t;
1108 int ret; 1110 int ret;
1109 1111
@@ -1116,7 +1118,7 @@ static int get_info(struct net *net, void __user *user,
1116 if (copy_from_user(name, user, sizeof(name)) != 0) 1118 if (copy_from_user(name, user, sizeof(name)) != 0)
1117 return -EFAULT; 1119 return -EFAULT;
1118 1120
1119 name[IP6T_TABLE_MAXNAMELEN-1] = '\0'; 1121 name[XT_TABLE_MAXNAMELEN-1] = '\0';
1120#ifdef CONFIG_COMPAT 1122#ifdef CONFIG_COMPAT
1121 if (compat) 1123 if (compat)
1122 xt_compat_lock(AF_INET6); 1124 xt_compat_lock(AF_INET6);
@@ -1413,14 +1415,14 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
1413 1415
1414#ifdef CONFIG_COMPAT 1416#ifdef CONFIG_COMPAT
1415struct compat_ip6t_replace { 1417struct compat_ip6t_replace {
1416 char name[IP6T_TABLE_MAXNAMELEN]; 1418 char name[XT_TABLE_MAXNAMELEN];
1417 u32 valid_hooks; 1419 u32 valid_hooks;
1418 u32 num_entries; 1420 u32 num_entries;
1419 u32 size; 1421 u32 size;
1420 u32 hook_entry[NF_INET_NUMHOOKS]; 1422 u32 hook_entry[NF_INET_NUMHOOKS];
1421 u32 underflow[NF_INET_NUMHOOKS]; 1423 u32 underflow[NF_INET_NUMHOOKS];
1422 u32 num_counters; 1424 u32 num_counters;
1423 compat_uptr_t counters; /* struct ip6t_counters * */ 1425 compat_uptr_t counters; /* struct xt_counters * */
1424 struct compat_ip6t_entry entries[0]; 1426 struct compat_ip6t_entry entries[0];
1425}; 1427};
1426 1428
@@ -1429,7 +1431,7 @@ compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr,
1429 unsigned int *size, struct xt_counters *counters, 1431 unsigned int *size, struct xt_counters *counters,
1430 unsigned int i) 1432 unsigned int i)
1431{ 1433{
1432 struct ip6t_entry_target *t; 1434 struct xt_entry_target *t;
1433 struct compat_ip6t_entry __user *ce; 1435 struct compat_ip6t_entry __user *ce;
1434 u_int16_t target_offset, next_offset; 1436 u_int16_t target_offset, next_offset;
1435 compat_uint_t origsize; 1437 compat_uint_t origsize;
@@ -1464,7 +1466,7 @@ compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr,
1464} 1466}
1465 1467
1466static int 1468static int
1467compat_find_calc_match(struct ip6t_entry_match *m, 1469compat_find_calc_match(struct xt_entry_match *m,
1468 const char *name, 1470 const char *name,
1469 const struct ip6t_ip6 *ipv6, 1471 const struct ip6t_ip6 *ipv6,
1470 unsigned int hookmask, 1472 unsigned int hookmask,
@@ -1486,7 +1488,7 @@ compat_find_calc_match(struct ip6t_entry_match *m,
1486 1488
1487static void compat_release_entry(struct compat_ip6t_entry *e) 1489static void compat_release_entry(struct compat_ip6t_entry *e)
1488{ 1490{
1489 struct ip6t_entry_target *t; 1491 struct xt_entry_target *t;
1490 struct xt_entry_match *ematch; 1492 struct xt_entry_match *ematch;
1491 1493
1492 /* Cleanup all matches */ 1494 /* Cleanup all matches */
@@ -1507,7 +1509,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
1507 const char *name) 1509 const char *name)
1508{ 1510{
1509 struct xt_entry_match *ematch; 1511 struct xt_entry_match *ematch;
1510 struct ip6t_entry_target *t; 1512 struct xt_entry_target *t;
1511 struct xt_target *target; 1513 struct xt_target *target;
1512 unsigned int entry_offset; 1514 unsigned int entry_offset;
1513 unsigned int j; 1515 unsigned int j;
@@ -1589,7 +1591,7 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr,
1589 unsigned int *size, const char *name, 1591 unsigned int *size, const char *name,
1590 struct xt_table_info *newinfo, unsigned char *base) 1592 struct xt_table_info *newinfo, unsigned char *base)
1591{ 1593{
1592 struct ip6t_entry_target *t; 1594 struct xt_entry_target *t;
1593 struct xt_target *target; 1595 struct xt_target *target;
1594 struct ip6t_entry *de; 1596 struct ip6t_entry *de;
1595 unsigned int origsize; 1597 unsigned int origsize;
@@ -1764,6 +1766,9 @@ translate_compat_table(struct net *net,
1764 if (ret != 0) 1766 if (ret != 0)
1765 break; 1767 break;
1766 ++i; 1768 ++i;
1769 if (strcmp(ip6t_get_target(iter1)->u.user.name,
1770 XT_ERROR_TARGET) == 0)
1771 ++newinfo->stacksize;
1767 } 1772 }
1768 if (ret) { 1773 if (ret) {
1769 /* 1774 /*
@@ -1894,7 +1899,7 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user,
1894} 1899}
1895 1900
1896struct compat_ip6t_get_entries { 1901struct compat_ip6t_get_entries {
1897 char name[IP6T_TABLE_MAXNAMELEN]; 1902 char name[XT_TABLE_MAXNAMELEN];
1898 compat_uint_t size; 1903 compat_uint_t size;
1899 struct compat_ip6t_entry entrytable[0]; 1904 struct compat_ip6t_entry entrytable[0];
1900}; 1905};
@@ -2049,7 +2054,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2049 2054
2050 case IP6T_SO_GET_REVISION_MATCH: 2055 case IP6T_SO_GET_REVISION_MATCH:
2051 case IP6T_SO_GET_REVISION_TARGET: { 2056 case IP6T_SO_GET_REVISION_TARGET: {
2052 struct ip6t_get_revision rev; 2057 struct xt_get_revision rev;
2053 int target; 2058 int target;
2054 2059
2055 if (*len != sizeof(rev)) { 2060 if (*len != sizeof(rev)) {
@@ -2186,7 +2191,7 @@ static int icmp6_checkentry(const struct xt_mtchk_param *par)
2186/* The built-in targets: standard (NULL) and error. */ 2191/* The built-in targets: standard (NULL) and error. */
2187static struct xt_target ip6t_builtin_tg[] __read_mostly = { 2192static struct xt_target ip6t_builtin_tg[] __read_mostly = {
2188 { 2193 {
2189 .name = IP6T_STANDARD_TARGET, 2194 .name = XT_STANDARD_TARGET,
2190 .targetsize = sizeof(int), 2195 .targetsize = sizeof(int),
2191 .family = NFPROTO_IPV6, 2196 .family = NFPROTO_IPV6,
2192#ifdef CONFIG_COMPAT 2197#ifdef CONFIG_COMPAT
@@ -2196,9 +2201,9 @@ static struct xt_target ip6t_builtin_tg[] __read_mostly = {
2196#endif 2201#endif
2197 }, 2202 },
2198 { 2203 {
2199 .name = IP6T_ERROR_TARGET, 2204 .name = XT_ERROR_TARGET,
2200 .target = ip6t_error, 2205 .target = ip6t_error,
2201 .targetsize = IP6T_FUNCTION_MAXNAMELEN, 2206 .targetsize = XT_FUNCTION_MAXNAMELEN,
2202 .family = NFPROTO_IPV6, 2207 .family = NFPROTO_IPV6,
2203 }, 2208 },
2204}; 2209};
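Besides the mechanical ip6t_*-to-xt_* renames, the ip6_tables.c hunks keep the standard-target verdict encoding visible in ip6t_do_table() and check_underflow(): verdicts are stored as -(verdict + 1), so negative values mean "return a verdict" and are decoded with (unsigned)(-v) - 1, while non-negative values are jump offsets. A small standalone demo of that encoding (NF_DROP/NF_ACCEPT values as in linux/netfilter.h):

#include <stdio.h>

#define NF_DROP   0
#define NF_ACCEPT 1

/* standard-target verdicts are stored as -(verdict + 1); zero and
 * positive values are absolute jump offsets inside the table instead */
static int encode(unsigned int verdict)  { return -(int)verdict - 1; }
static unsigned int decode(int v)        { return (unsigned int)(-v) - 1; }

int main(void)
{
	int stored_drop = encode(NF_DROP);	/* -1 */
	int stored_accept = encode(NF_ACCEPT);	/* -2 */

	printf("NF_DROP   stored as %d, decoded back to %u\n",
	       stored_drop, decode(stored_drop));
	printf("NF_ACCEPT stored as %d, decoded back to %u\n",
	       stored_accept, decode(stored_accept));
	return 0;
}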
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 0a07ae7b933f..09c88891a753 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -23,6 +23,7 @@
23#include <linux/netfilter/x_tables.h> 23#include <linux/netfilter/x_tables.h>
24#include <linux/netfilter_ipv6/ip6_tables.h> 24#include <linux/netfilter_ipv6/ip6_tables.h>
25#include <net/netfilter/nf_log.h> 25#include <net/netfilter/nf_log.h>
26#include <net/netfilter/xt_log.h>
26 27
27MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>"); 28MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>");
28MODULE_DESCRIPTION("Xtables: IPv6 packet logging to syslog"); 29MODULE_DESCRIPTION("Xtables: IPv6 packet logging to syslog");
@@ -32,11 +33,9 @@ struct in_device;
32#include <net/route.h> 33#include <net/route.h>
33#include <linux/netfilter_ipv6/ip6t_LOG.h> 34#include <linux/netfilter_ipv6/ip6t_LOG.h>
34 35
35/* Use lock to serialize, so printks don't overlap */
36static DEFINE_SPINLOCK(log_lock);
37
38/* One level of recursion won't kill us */ 36/* One level of recursion won't kill us */
39static void dump_packet(const struct nf_loginfo *info, 37static void dump_packet(struct sbuff *m,
38 const struct nf_loginfo *info,
40 const struct sk_buff *skb, unsigned int ip6hoff, 39 const struct sk_buff *skb, unsigned int ip6hoff,
41 int recurse) 40 int recurse)
42{ 41{
@@ -55,15 +54,15 @@ static void dump_packet(const struct nf_loginfo *info,
55 54
56 ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); 55 ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
57 if (ih == NULL) { 56 if (ih == NULL) {
58 printk("TRUNCATED"); 57 sb_add(m, "TRUNCATED");
59 return; 58 return;
60 } 59 }
61 60
62 /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */ 61 /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */
63 printk("SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); 62 sb_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
64 63
65 /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ 64 /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
66 printk("LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", 65 sb_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
67 ntohs(ih->payload_len) + sizeof(struct ipv6hdr), 66 ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
68 (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, 67 (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
69 ih->hop_limit, 68 ih->hop_limit,
@@ -78,35 +77,35 @@ static void dump_packet(const struct nf_loginfo *info,
78 77
79 hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); 78 hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
80 if (hp == NULL) { 79 if (hp == NULL) {
81 printk("TRUNCATED"); 80 sb_add(m, "TRUNCATED");
82 return; 81 return;
83 } 82 }
84 83
85 /* Max length: 48 "OPT (...) " */ 84 /* Max length: 48 "OPT (...) " */
86 if (logflags & IP6T_LOG_IPOPT) 85 if (logflags & IP6T_LOG_IPOPT)
87 printk("OPT ( "); 86 sb_add(m, "OPT ( ");
88 87
89 switch (currenthdr) { 88 switch (currenthdr) {
90 case IPPROTO_FRAGMENT: { 89 case IPPROTO_FRAGMENT: {
91 struct frag_hdr _fhdr; 90 struct frag_hdr _fhdr;
92 const struct frag_hdr *fh; 91 const struct frag_hdr *fh;
93 92
94 printk("FRAG:"); 93 sb_add(m, "FRAG:");
95 fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), 94 fh = skb_header_pointer(skb, ptr, sizeof(_fhdr),
96 &_fhdr); 95 &_fhdr);
97 if (fh == NULL) { 96 if (fh == NULL) {
98 printk("TRUNCATED "); 97 sb_add(m, "TRUNCATED ");
99 return; 98 return;
100 } 99 }
101 100
102 /* Max length: 6 "65535 " */ 101 /* Max length: 6 "65535 " */
103 printk("%u ", ntohs(fh->frag_off) & 0xFFF8); 102 sb_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8);
104 103
105 /* Max length: 11 "INCOMPLETE " */ 104 /* Max length: 11 "INCOMPLETE " */
106 if (fh->frag_off & htons(0x0001)) 105 if (fh->frag_off & htons(0x0001))
107 printk("INCOMPLETE "); 106 sb_add(m, "INCOMPLETE ");
108 107
109 printk("ID:%08x ", ntohl(fh->identification)); 108 sb_add(m, "ID:%08x ", ntohl(fh->identification));
110 109
111 if (ntohs(fh->frag_off) & 0xFFF8) 110 if (ntohs(fh->frag_off) & 0xFFF8)
112 fragment = 1; 111 fragment = 1;
@@ -120,7 +119,7 @@ static void dump_packet(const struct nf_loginfo *info,
120 case IPPROTO_HOPOPTS: 119 case IPPROTO_HOPOPTS:
121 if (fragment) { 120 if (fragment) {
122 if (logflags & IP6T_LOG_IPOPT) 121 if (logflags & IP6T_LOG_IPOPT)
123 printk(")"); 122 sb_add(m, ")");
124 return; 123 return;
125 } 124 }
126 hdrlen = ipv6_optlen(hp); 125 hdrlen = ipv6_optlen(hp);
@@ -132,10 +131,10 @@ static void dump_packet(const struct nf_loginfo *info,
132 const struct ip_auth_hdr *ah; 131 const struct ip_auth_hdr *ah;
133 132
134 /* Max length: 3 "AH " */ 133 /* Max length: 3 "AH " */
135 printk("AH "); 134 sb_add(m, "AH ");
136 135
137 if (fragment) { 136 if (fragment) {
138 printk(")"); 137 sb_add(m, ")");
139 return; 138 return;
140 } 139 }
141 140
@@ -146,13 +145,13 @@ static void dump_packet(const struct nf_loginfo *info,
146 * Max length: 26 "INCOMPLETE [65535 145 * Max length: 26 "INCOMPLETE [65535
147 * bytes] )" 146 * bytes] )"
148 */ 147 */
149 printk("INCOMPLETE [%u bytes] )", 148 sb_add(m, "INCOMPLETE [%u bytes] )",
150 skb->len - ptr); 149 skb->len - ptr);
151 return; 150 return;
152 } 151 }
153 152
154 /* Length: 15 "SPI=0xF1234567 */ 153 /* Length: 15 "SPI=0xF1234567 */
155 printk("SPI=0x%x ", ntohl(ah->spi)); 154 sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
156 155
157 } 156 }
158 157
@@ -164,10 +163,10 @@ static void dump_packet(const struct nf_loginfo *info,
164 const struct ip_esp_hdr *eh; 163 const struct ip_esp_hdr *eh;
165 164
166 /* Max length: 4 "ESP " */ 165 /* Max length: 4 "ESP " */
167 printk("ESP "); 166 sb_add(m, "ESP ");
168 167
169 if (fragment) { 168 if (fragment) {
170 printk(")"); 169 sb_add(m, ")");
171 return; 170 return;
172 } 171 }
173 172
@@ -177,23 +176,23 @@ static void dump_packet(const struct nf_loginfo *info,
177 eh = skb_header_pointer(skb, ptr, sizeof(_esph), 176 eh = skb_header_pointer(skb, ptr, sizeof(_esph),
178 &_esph); 177 &_esph);
179 if (eh == NULL) { 178 if (eh == NULL) {
180 printk("INCOMPLETE [%u bytes] )", 179 sb_add(m, "INCOMPLETE [%u bytes] )",
181 skb->len - ptr); 180 skb->len - ptr);
182 return; 181 return;
183 } 182 }
184 183
185 /* Length: 16 "SPI=0xF1234567 )" */ 184 /* Length: 16 "SPI=0xF1234567 )" */
186 printk("SPI=0x%x )", ntohl(eh->spi) ); 185 sb_add(m, "SPI=0x%x )", ntohl(eh->spi) );
187 186
188 } 187 }
189 return; 188 return;
190 default: 189 default:
191 /* Max length: 20 "Unknown Ext Hdr 255" */ 190 /* Max length: 20 "Unknown Ext Hdr 255" */
192 printk("Unknown Ext Hdr %u", currenthdr); 191 sb_add(m, "Unknown Ext Hdr %u", currenthdr);
193 return; 192 return;
194 } 193 }
195 if (logflags & IP6T_LOG_IPOPT) 194 if (logflags & IP6T_LOG_IPOPT)
196 printk(") "); 195 sb_add(m, ") ");
197 196
198 currenthdr = hp->nexthdr; 197 currenthdr = hp->nexthdr;
199 ptr += hdrlen; 198 ptr += hdrlen;
@@ -205,7 +204,7 @@ static void dump_packet(const struct nf_loginfo *info,
205 const struct tcphdr *th; 204 const struct tcphdr *th;
206 205
207 /* Max length: 10 "PROTO=TCP " */ 206 /* Max length: 10 "PROTO=TCP " */
208 printk("PROTO=TCP "); 207 sb_add(m, "PROTO=TCP ");
209 208
210 if (fragment) 209 if (fragment)
211 break; 210 break;
@@ -213,40 +212,40 @@ static void dump_packet(const struct nf_loginfo *info,
213 /* Max length: 25 "INCOMPLETE [65535 bytes] " */ 212 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
214 th = skb_header_pointer(skb, ptr, sizeof(_tcph), &_tcph); 213 th = skb_header_pointer(skb, ptr, sizeof(_tcph), &_tcph);
215 if (th == NULL) { 214 if (th == NULL) {
216 printk("INCOMPLETE [%u bytes] ", skb->len - ptr); 215 sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
217 return; 216 return;
218 } 217 }
219 218
220 /* Max length: 20 "SPT=65535 DPT=65535 " */ 219 /* Max length: 20 "SPT=65535 DPT=65535 " */
221 printk("SPT=%u DPT=%u ", 220 sb_add(m, "SPT=%u DPT=%u ",
222 ntohs(th->source), ntohs(th->dest)); 221 ntohs(th->source), ntohs(th->dest));
223 /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ 222 /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
224 if (logflags & IP6T_LOG_TCPSEQ) 223 if (logflags & IP6T_LOG_TCPSEQ)
225 printk("SEQ=%u ACK=%u ", 224 sb_add(m, "SEQ=%u ACK=%u ",
226 ntohl(th->seq), ntohl(th->ack_seq)); 225 ntohl(th->seq), ntohl(th->ack_seq));
227 /* Max length: 13 "WINDOW=65535 " */ 226 /* Max length: 13 "WINDOW=65535 " */
228 printk("WINDOW=%u ", ntohs(th->window)); 227 sb_add(m, "WINDOW=%u ", ntohs(th->window));
229 /* Max length: 9 "RES=0x3C " */ 228 /* Max length: 9 "RES=0x3C " */
230 printk("RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); 229 sb_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
231 /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ 230 /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
232 if (th->cwr) 231 if (th->cwr)
233 printk("CWR "); 232 sb_add(m, "CWR ");
234 if (th->ece) 233 if (th->ece)
235 printk("ECE "); 234 sb_add(m, "ECE ");
236 if (th->urg) 235 if (th->urg)
237 printk("URG "); 236 sb_add(m, "URG ");
238 if (th->ack) 237 if (th->ack)
239 printk("ACK "); 238 sb_add(m, "ACK ");
240 if (th->psh) 239 if (th->psh)
241 printk("PSH "); 240 sb_add(m, "PSH ");
242 if (th->rst) 241 if (th->rst)
243 printk("RST "); 242 sb_add(m, "RST ");
244 if (th->syn) 243 if (th->syn)
245 printk("SYN "); 244 sb_add(m, "SYN ");
246 if (th->fin) 245 if (th->fin)
247 printk("FIN "); 246 sb_add(m, "FIN ");
248 /* Max length: 11 "URGP=65535 " */ 247 /* Max length: 11 "URGP=65535 " */
249 printk("URGP=%u ", ntohs(th->urg_ptr)); 248 sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
250 249
251 if ((logflags & IP6T_LOG_TCPOPT) && 250 if ((logflags & IP6T_LOG_TCPOPT) &&
252 th->doff * 4 > sizeof(struct tcphdr)) { 251 th->doff * 4 > sizeof(struct tcphdr)) {
@@ -260,15 +259,15 @@ static void dump_packet(const struct nf_loginfo *info,
260 ptr + sizeof(struct tcphdr), 259 ptr + sizeof(struct tcphdr),
261 optsize, _opt); 260 optsize, _opt);
262 if (op == NULL) { 261 if (op == NULL) {
263 printk("OPT (TRUNCATED)"); 262 sb_add(m, "OPT (TRUNCATED)");
264 return; 263 return;
265 } 264 }
266 265
267 /* Max length: 127 "OPT (" 15*4*2chars ") " */ 266 /* Max length: 127 "OPT (" 15*4*2chars ") " */
268 printk("OPT ("); 267 sb_add(m, "OPT (");
269 for (i =0; i < optsize; i++) 268 for (i =0; i < optsize; i++)
270 printk("%02X", op[i]); 269 sb_add(m, "%02X", op[i]);
271 printk(") "); 270 sb_add(m, ") ");
272 } 271 }
273 break; 272 break;
274 } 273 }
@@ -279,9 +278,9 @@ static void dump_packet(const struct nf_loginfo *info,
279 278
280 if (currenthdr == IPPROTO_UDP) 279 if (currenthdr == IPPROTO_UDP)
281 /* Max length: 10 "PROTO=UDP " */ 280 /* Max length: 10 "PROTO=UDP " */
282 printk("PROTO=UDP " ); 281 sb_add(m, "PROTO=UDP " );
283 else /* Max length: 14 "PROTO=UDPLITE " */ 282 else /* Max length: 14 "PROTO=UDPLITE " */
284 printk("PROTO=UDPLITE "); 283 sb_add(m, "PROTO=UDPLITE ");
285 284
286 if (fragment) 285 if (fragment)
287 break; 286 break;
@@ -289,12 +288,12 @@ static void dump_packet(const struct nf_loginfo *info,
289 /* Max length: 25 "INCOMPLETE [65535 bytes] " */ 288 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
290 uh = skb_header_pointer(skb, ptr, sizeof(_udph), &_udph); 289 uh = skb_header_pointer(skb, ptr, sizeof(_udph), &_udph);
291 if (uh == NULL) { 290 if (uh == NULL) {
292 printk("INCOMPLETE [%u bytes] ", skb->len - ptr); 291 sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
293 return; 292 return;
294 } 293 }
295 294
296 /* Max length: 20 "SPT=65535 DPT=65535 " */ 295 /* Max length: 20 "SPT=65535 DPT=65535 " */
297 printk("SPT=%u DPT=%u LEN=%u ", 296 sb_add(m, "SPT=%u DPT=%u LEN=%u ",
298 ntohs(uh->source), ntohs(uh->dest), 297 ntohs(uh->source), ntohs(uh->dest),
299 ntohs(uh->len)); 298 ntohs(uh->len));
300 break; 299 break;
@@ -304,7 +303,7 @@ static void dump_packet(const struct nf_loginfo *info,
304 const struct icmp6hdr *ic; 303 const struct icmp6hdr *ic;
305 304
306 /* Max length: 13 "PROTO=ICMPv6 " */ 305 /* Max length: 13 "PROTO=ICMPv6 " */
307 printk("PROTO=ICMPv6 "); 306 sb_add(m, "PROTO=ICMPv6 ");
308 307
309 if (fragment) 308 if (fragment)
310 break; 309 break;
@@ -312,18 +311,18 @@ static void dump_packet(const struct nf_loginfo *info,
312 /* Max length: 25 "INCOMPLETE [65535 bytes] " */ 311 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
313 ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); 312 ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h);
314 if (ic == NULL) { 313 if (ic == NULL) {
315 printk("INCOMPLETE [%u bytes] ", skb->len - ptr); 314 sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
316 return; 315 return;
317 } 316 }
318 317
319 /* Max length: 18 "TYPE=255 CODE=255 " */ 318 /* Max length: 18 "TYPE=255 CODE=255 " */
320 printk("TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code); 319 sb_add(m, "TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code);
321 320
322 switch (ic->icmp6_type) { 321 switch (ic->icmp6_type) {
323 case ICMPV6_ECHO_REQUEST: 322 case ICMPV6_ECHO_REQUEST:
324 case ICMPV6_ECHO_REPLY: 323 case ICMPV6_ECHO_REPLY:
325 /* Max length: 19 "ID=65535 SEQ=65535 " */ 324 /* Max length: 19 "ID=65535 SEQ=65535 " */
326 printk("ID=%u SEQ=%u ", 325 sb_add(m, "ID=%u SEQ=%u ",
327 ntohs(ic->icmp6_identifier), 326 ntohs(ic->icmp6_identifier),
328 ntohs(ic->icmp6_sequence)); 327 ntohs(ic->icmp6_sequence));
329 break; 328 break;
@@ -334,35 +333,35 @@ static void dump_packet(const struct nf_loginfo *info,
334 333
335 case ICMPV6_PARAMPROB: 334 case ICMPV6_PARAMPROB:
336 /* Max length: 17 "POINTER=ffffffff " */ 335 /* Max length: 17 "POINTER=ffffffff " */
337 printk("POINTER=%08x ", ntohl(ic->icmp6_pointer)); 336 sb_add(m, "POINTER=%08x ", ntohl(ic->icmp6_pointer));
338 /* Fall through */ 337 /* Fall through */
339 case ICMPV6_DEST_UNREACH: 338 case ICMPV6_DEST_UNREACH:
340 case ICMPV6_PKT_TOOBIG: 339 case ICMPV6_PKT_TOOBIG:
341 case ICMPV6_TIME_EXCEED: 340 case ICMPV6_TIME_EXCEED:
342 /* Max length: 3+maxlen */ 341 /* Max length: 3+maxlen */
343 if (recurse) { 342 if (recurse) {
344 printk("["); 343 sb_add(m, "[");
345 dump_packet(info, skb, ptr + sizeof(_icmp6h), 344 dump_packet(m, info, skb,
346 0); 345 ptr + sizeof(_icmp6h), 0);
347 printk("] "); 346 sb_add(m, "] ");
348 } 347 }
349 348
350 /* Max length: 10 "MTU=65535 " */ 349 /* Max length: 10 "MTU=65535 " */
351 if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) 350 if (ic->icmp6_type == ICMPV6_PKT_TOOBIG)
352 printk("MTU=%u ", ntohl(ic->icmp6_mtu)); 351 sb_add(m, "MTU=%u ", ntohl(ic->icmp6_mtu));
353 } 352 }
354 break; 353 break;
355 } 354 }
356 /* Max length: 10 "PROTO=255 " */ 355 /* Max length: 10 "PROTO=255 " */
357 default: 356 default:
358 printk("PROTO=%u ", currenthdr); 357 sb_add(m, "PROTO=%u ", currenthdr);
359 } 358 }
360 359
361 /* Max length: 15 "UID=4294967295 " */ 360 /* Max length: 15 "UID=4294967295 " */
362 if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) { 361 if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) {
363 read_lock_bh(&skb->sk->sk_callback_lock); 362 read_lock_bh(&skb->sk->sk_callback_lock);
364 if (skb->sk->sk_socket && skb->sk->sk_socket->file) 363 if (skb->sk->sk_socket && skb->sk->sk_socket->file)
365 printk("UID=%u GID=%u ", 364 sb_add(m, "UID=%u GID=%u ",
366 skb->sk->sk_socket->file->f_cred->fsuid, 365 skb->sk->sk_socket->file->f_cred->fsuid,
367 skb->sk->sk_socket->file->f_cred->fsgid); 366 skb->sk->sk_socket->file->f_cred->fsgid);
368 read_unlock_bh(&skb->sk->sk_callback_lock); 367 read_unlock_bh(&skb->sk->sk_callback_lock);
@@ -370,10 +369,11 @@ static void dump_packet(const struct nf_loginfo *info,
370 369
371 /* Max length: 16 "MARK=0xFFFFFFFF " */ 370 /* Max length: 16 "MARK=0xFFFFFFFF " */
372 if (!recurse && skb->mark) 371 if (!recurse && skb->mark)
373 printk("MARK=0x%x ", skb->mark); 372 sb_add(m, "MARK=0x%x ", skb->mark);
374} 373}
375 374
376static void dump_mac_header(const struct nf_loginfo *info, 375static void dump_mac_header(struct sbuff *m,
376 const struct nf_loginfo *info,
377 const struct sk_buff *skb) 377 const struct sk_buff *skb)
378{ 378{
379 struct net_device *dev = skb->dev; 379 struct net_device *dev = skb->dev;
@@ -387,7 +387,7 @@ static void dump_mac_header(const struct nf_loginfo *info,
387 387
388 switch (dev->type) { 388 switch (dev->type) {
389 case ARPHRD_ETHER: 389 case ARPHRD_ETHER:
390 printk("MACSRC=%pM MACDST=%pM MACPROTO=%04x ", 390 sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
391 eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, 391 eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
392 ntohs(eth_hdr(skb)->h_proto)); 392 ntohs(eth_hdr(skb)->h_proto));
393 return; 393 return;
@@ -396,7 +396,7 @@ static void dump_mac_header(const struct nf_loginfo *info,
396 } 396 }
397 397
398fallback: 398fallback:
399 printk("MAC="); 399 sb_add(m, "MAC=");
400 if (dev->hard_header_len && 400 if (dev->hard_header_len &&
401 skb->mac_header != skb->network_header) { 401 skb->mac_header != skb->network_header) {
402 const unsigned char *p = skb_mac_header(skb); 402 const unsigned char *p = skb_mac_header(skb);
@@ -408,19 +408,19 @@ fallback:
408 p = NULL; 408 p = NULL;
409 409
410 if (p != NULL) { 410 if (p != NULL) {
411 printk("%02x", *p++); 411 sb_add(m, "%02x", *p++);
412 for (i = 1; i < len; i++) 412 for (i = 1; i < len; i++)
413 printk(":%02x", p[i]); 413 sb_add(m, ":%02x", p[i]);
414 } 414 }
415 printk(" "); 415 sb_add(m, " ");
416 416
417 if (dev->type == ARPHRD_SIT) { 417 if (dev->type == ARPHRD_SIT) {
418 const struct iphdr *iph = 418 const struct iphdr *iph =
419 (struct iphdr *)skb_mac_header(skb); 419 (struct iphdr *)skb_mac_header(skb);
420 printk("TUNNEL=%pI4->%pI4 ", &iph->saddr, &iph->daddr); 420 sb_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr, &iph->daddr);
421 } 421 }
422 } else 422 } else
423 printk(" "); 423 sb_add(m, " ");
424} 424}
425 425
426static struct nf_loginfo default_loginfo = { 426static struct nf_loginfo default_loginfo = {
@@ -442,22 +442,23 @@ ip6t_log_packet(u_int8_t pf,
442 const struct nf_loginfo *loginfo, 442 const struct nf_loginfo *loginfo,
443 const char *prefix) 443 const char *prefix)
444{ 444{
445 struct sbuff *m = sb_open();
446
445 if (!loginfo) 447 if (!loginfo)
446 loginfo = &default_loginfo; 448 loginfo = &default_loginfo;
447 449
448 spin_lock_bh(&log_lock); 450 sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
449 printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, 451 prefix,
450 prefix, 452 in ? in->name : "",
451 in ? in->name : "", 453 out ? out->name : "");
452 out ? out->name : "");
453 454
454 /* MAC logging for input path only. */ 455 /* MAC logging for input path only. */
455 if (in && !out) 456 if (in && !out)
456 dump_mac_header(loginfo, skb); 457 dump_mac_header(m, loginfo, skb);
458
459 dump_packet(m, loginfo, skb, skb_network_offset(skb), 1);
457 460
458 dump_packet(loginfo, skb, skb_network_offset(skb), 1); 461 sb_close(m);
459 printk("\n");
460 spin_unlock_bh(&log_lock);
461} 462}
462 463
463static unsigned int 464static unsigned int
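The ip6t_LOG.c conversion drops the global log_lock and the per-field printk() calls in favour of an sbuff that is filled with sb_add() and flushed once by sb_close(), so one packet's record can no longer interleave with another's. A userspace sketch of that append-then-flush pattern follows; the sbuff/sb_* names here are local stand-ins modelled on the helpers in <net/netfilter/xt_log.h>, not the kernel implementations.

#include <stdarg.h>
#include <stdio.h>

struct sbuff {
	char buf[1024];
	unsigned int count;
};

/* append formatted text; truncates instead of overflowing the buffer */
static void sb_add(struct sbuff *m, const char *fmt, ...)
{
	va_list ap;
	unsigned int avail = sizeof(m->buf) - m->count;
	int n;

	if (avail <= 1)
		return;
	va_start(ap, fmt);
	n = vsnprintf(m->buf + m->count, avail, fmt, ap);
	va_end(ap);
	if (n > 0)
		m->count += ((unsigned int)n < avail) ? (unsigned int)n : avail - 1;
}

/* emit the whole record at once, so concurrent loggers cannot
 * interleave within a single line */
static void sb_close(struct sbuff *m)
{
	fputs(m->buf, stdout);
	fputc('\n', stdout);
}

int main(void)
{
	struct sbuff m = { .count = 0 };

	sb_add(&m, "IN=%s OUT=%s ", "eth0", "");
	sb_add(&m, "SRC=%s DST=%s ", "2001:db8::1", "2001:db8::2");
	sb_add(&m, "PROTO=TCP SPT=%u DPT=%u", 12345u, 80u);
	sb_close(&m);
	return 0;
}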
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index ff43461704be..c8af58b22562 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -16,7 +16,6 @@
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/skbuff.h> 17#include <linux/skbuff.h>
18#include <linux/icmp.h> 18#include <linux/icmp.h>
19#include <linux/sysctl.h>
20#include <net/ipv6.h> 19#include <net/ipv6.h>
21#include <net/inet_frag.h> 20#include <net/inet_frag.h>
22 21
@@ -29,6 +28,7 @@
29#include <net/netfilter/nf_conntrack_core.h> 28#include <net/netfilter/nf_conntrack_core.h>
30#include <net/netfilter/nf_conntrack_zones.h> 29#include <net/netfilter/nf_conntrack_zones.h>
31#include <net/netfilter/ipv6/nf_conntrack_ipv6.h> 30#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
31#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
32#include <net/netfilter/nf_log.h> 32#include <net/netfilter/nf_log.h>
33 33
34static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, 34static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
@@ -189,53 +189,6 @@ out:
189 return nf_conntrack_confirm(skb); 189 return nf_conntrack_confirm(skb);
190} 190}
191 191
192static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
193 struct sk_buff *skb)
194{
195 u16 zone = NF_CT_DEFAULT_ZONE;
196
197 if (skb->nfct)
198 zone = nf_ct_zone((struct nf_conn *)skb->nfct);
199
200#ifdef CONFIG_BRIDGE_NETFILTER
201 if (skb->nf_bridge &&
202 skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
203 return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
204#endif
205 if (hooknum == NF_INET_PRE_ROUTING)
206 return IP6_DEFRAG_CONNTRACK_IN + zone;
207 else
208 return IP6_DEFRAG_CONNTRACK_OUT + zone;
209
210}
211
212static unsigned int ipv6_defrag(unsigned int hooknum,
213 struct sk_buff *skb,
214 const struct net_device *in,
215 const struct net_device *out,
216 int (*okfn)(struct sk_buff *))
217{
218 struct sk_buff *reasm;
219
220 /* Previously seen (loopback)? */
221 if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
222 return NF_ACCEPT;
223
224 reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb));
225 /* queued */
226 if (reasm == NULL)
227 return NF_STOLEN;
228
229 /* error occured or not fragmented */
230 if (reasm == skb)
231 return NF_ACCEPT;
232
233 nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in,
234 (struct net_device *)out, okfn);
235
236 return NF_STOLEN;
237}
238
239static unsigned int __ipv6_conntrack_in(struct net *net, 192static unsigned int __ipv6_conntrack_in(struct net *net,
240 unsigned int hooknum, 193 unsigned int hooknum,
241 struct sk_buff *skb, 194 struct sk_buff *skb,
@@ -288,13 +241,6 @@ static unsigned int ipv6_conntrack_local(unsigned int hooknum,
288 241
289static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { 242static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = {
290 { 243 {
291 .hook = ipv6_defrag,
292 .owner = THIS_MODULE,
293 .pf = NFPROTO_IPV6,
294 .hooknum = NF_INET_PRE_ROUTING,
295 .priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
296 },
297 {
298 .hook = ipv6_conntrack_in, 244 .hook = ipv6_conntrack_in,
299 .owner = THIS_MODULE, 245 .owner = THIS_MODULE,
300 .pf = NFPROTO_IPV6, 246 .pf = NFPROTO_IPV6,
@@ -309,13 +255,6 @@ static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = {
309 .priority = NF_IP6_PRI_CONNTRACK, 255 .priority = NF_IP6_PRI_CONNTRACK,
310 }, 256 },
311 { 257 {
312 .hook = ipv6_defrag,
313 .owner = THIS_MODULE,
314 .pf = NFPROTO_IPV6,
315 .hooknum = NF_INET_LOCAL_OUT,
316 .priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
317 },
318 {
319 .hook = ipv6_confirm, 258 .hook = ipv6_confirm,
320 .owner = THIS_MODULE, 259 .owner = THIS_MODULE,
321 .pf = NFPROTO_IPV6, 260 .pf = NFPROTO_IPV6,
@@ -387,10 +326,6 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = {
387 .nlattr_to_tuple = ipv6_nlattr_to_tuple, 326 .nlattr_to_tuple = ipv6_nlattr_to_tuple,
388 .nla_policy = ipv6_nla_policy, 327 .nla_policy = ipv6_nla_policy,
389#endif 328#endif
390#ifdef CONFIG_SYSCTL
391 .ctl_table_path = nf_net_netfilter_sysctl_path,
392 .ctl_table = nf_ct_ipv6_sysctl_table,
393#endif
394 .me = THIS_MODULE, 329 .me = THIS_MODULE,
395}; 330};
396 331
@@ -403,16 +338,12 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
403 int ret = 0; 338 int ret = 0;
404 339
405 need_conntrack(); 340 need_conntrack();
341 nf_defrag_ipv6_enable();
406 342
407 ret = nf_ct_frag6_init();
408 if (ret < 0) {
409 pr_err("nf_conntrack_ipv6: can't initialize frag6.\n");
410 return ret;
411 }
412 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp6); 343 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp6);
413 if (ret < 0) { 344 if (ret < 0) {
414 pr_err("nf_conntrack_ipv6: can't register tcp.\n"); 345 pr_err("nf_conntrack_ipv6: can't register tcp.\n");
415 goto cleanup_frag6; 346 return ret;
416 } 347 }
417 348
418 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp6); 349 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp6);
@@ -450,8 +381,6 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
450 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6); 381 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6);
451 cleanup_tcp: 382 cleanup_tcp:
452 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6); 383 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
453 cleanup_frag6:
454 nf_ct_frag6_cleanup();
455 return ret; 384 return ret;
456} 385}
457 386
@@ -463,7 +392,6 @@ static void __exit nf_conntrack_l3proto_ipv6_fini(void)
463 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); 392 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6);
464 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6); 393 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6);
465 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6); 394 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
466 nf_ct_frag6_cleanup();
467} 395}
468 396
469module_init(nf_conntrack_l3proto_ipv6_init); 397module_init(nf_conntrack_l3proto_ipv6_init);
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 13ef5bc05cf5..3a3f129a44cb 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -73,7 +73,7 @@ static struct inet_frags nf_frags;
73static struct netns_frags nf_init_frags; 73static struct netns_frags nf_init_frags;
74 74
75#ifdef CONFIG_SYSCTL 75#ifdef CONFIG_SYSCTL
76struct ctl_table nf_ct_ipv6_sysctl_table[] = { 76struct ctl_table nf_ct_frag6_sysctl_table[] = {
77 { 77 {
78 .procname = "nf_conntrack_frag6_timeout", 78 .procname = "nf_conntrack_frag6_timeout",
79 .data = &nf_init_frags.timeout, 79 .data = &nf_init_frags.timeout,
@@ -97,6 +97,8 @@ struct ctl_table nf_ct_ipv6_sysctl_table[] = {
97 }, 97 },
98 { } 98 { }
99}; 99};
100
101static struct ctl_table_header *nf_ct_frag6_sysctl_header;
100#endif 102#endif
101 103
102static unsigned int nf_hashfn(struct inet_frag_queue *q) 104static unsigned int nf_hashfn(struct inet_frag_queue *q)
@@ -113,14 +115,6 @@ static void nf_skb_free(struct sk_buff *skb)
113 kfree_skb(NFCT_FRAG6_CB(skb)->orig); 115 kfree_skb(NFCT_FRAG6_CB(skb)->orig);
114} 116}
115 117
116/* Memory Tracking Functions. */
117static void frag_kfree_skb(struct sk_buff *skb)
118{
119 atomic_sub(skb->truesize, &nf_init_frags.mem);
120 nf_skb_free(skb);
121 kfree_skb(skb);
122}
123
124/* Destruction primitives. */ 118/* Destruction primitives. */
125 119
126static __inline__ void fq_put(struct nf_ct_frag6_queue *fq) 120static __inline__ void fq_put(struct nf_ct_frag6_queue *fq)
@@ -282,66 +276,22 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
282 } 276 }
283 277
284found: 278found:
285 /* We found where to put this one. Check for overlap with 279 /* RFC5722, Section 4:
286 * preceding fragment, and, if needed, align things so that 280 * When reassembling an IPv6 datagram, if
287 * any overlaps are eliminated. 281 * one or more its constituent fragments is determined to be an
282 * overlapping fragment, the entire datagram (and any constituent
283 * fragments, including those not yet received) MUST be silently
284 * discarded.
288 */ 285 */
289 if (prev) {
290 int i = (NFCT_FRAG6_CB(prev)->offset + prev->len) - offset;
291
292 if (i > 0) {
293 offset += i;
294 if (end <= offset) {
295 pr_debug("overlap\n");
296 goto err;
297 }
298 if (!pskb_pull(skb, i)) {
299 pr_debug("Can't pull\n");
300 goto err;
301 }
302 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
303 skb->ip_summed = CHECKSUM_NONE;
304 }
305 }
306 286
307 /* Look for overlap with succeeding segments. 287 /* Check for overlap with preceding fragment. */
308 * If we can merge fragments, do it. 288 if (prev &&
309 */ 289 (NFCT_FRAG6_CB(prev)->offset + prev->len) - offset > 0)
310 while (next && NFCT_FRAG6_CB(next)->offset < end) { 290 goto discard_fq;
311 /* overlap is 'i' bytes */
312 int i = end - NFCT_FRAG6_CB(next)->offset;
313 291
314 if (i < next->len) { 292 /* Look for overlap with succeeding segment. */
315 /* Eat head of the next overlapped fragment 293 if (next && NFCT_FRAG6_CB(next)->offset < end)
316 * and leave the loop. The next ones cannot overlap. 294 goto discard_fq;
317 */
318 pr_debug("Eat head of the overlapped parts.: %d", i);
319 if (!pskb_pull(next, i))
320 goto err;
321
322 /* next fragment */
323 NFCT_FRAG6_CB(next)->offset += i;
324 fq->q.meat -= i;
325 if (next->ip_summed != CHECKSUM_UNNECESSARY)
326 next->ip_summed = CHECKSUM_NONE;
327 break;
328 } else {
329 struct sk_buff *free_it = next;
330
331 /* Old fragmnet is completely overridden with
332 * new one drop it.
333 */
334 next = next->next;
335
336 if (prev)
337 prev->next = next;
338 else
339 fq->q.fragments = next;
340
341 fq->q.meat -= free_it->len;
342 frag_kfree_skb(free_it);
343 }
344 }
345 295
346 NFCT_FRAG6_CB(skb)->offset = offset; 296 NFCT_FRAG6_CB(skb)->offset = offset;
347 297
@@ -371,6 +321,8 @@ found:
371 write_unlock(&nf_frags.lock); 321 write_unlock(&nf_frags.lock);
372 return 0; 322 return 0;
373 323
324discard_fq:
325 fq_kill(fq);
374err: 326err:
375 return -1; 327 return -1;
376} 328}
@@ -413,7 +365,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
413 /* If the first fragment is fragmented itself, we split 365 /* If the first fragment is fragmented itself, we split
414 * it to two chunks: the first with data and paged part 366 * it to two chunks: the first with data and paged part
415 * and the second, holding only fragments. */ 367 * and the second, holding only fragments. */
416 if (skb_has_frags(head)) { 368 if (skb_has_frag_list(head)) {
417 struct sk_buff *clone; 369 struct sk_buff *clone;
418 int i, plen = 0; 370 int i, plen = 0;
419 371
@@ -673,11 +625,24 @@ int nf_ct_frag6_init(void)
673 inet_frags_init_net(&nf_init_frags); 625 inet_frags_init_net(&nf_init_frags);
674 inet_frags_init(&nf_frags); 626 inet_frags_init(&nf_frags);
675 627
628#ifdef CONFIG_SYSCTL
629 nf_ct_frag6_sysctl_header = register_sysctl_paths(nf_net_netfilter_sysctl_path,
630 nf_ct_frag6_sysctl_table);
631 if (!nf_ct_frag6_sysctl_header) {
632 inet_frags_fini(&nf_frags);
633 return -ENOMEM;
634 }
635#endif
636
676 return 0; 637 return 0;
677} 638}
678 639
679void nf_ct_frag6_cleanup(void) 640void nf_ct_frag6_cleanup(void)
680{ 641{
642#ifdef CONFIG_SYSCTL
643 unregister_sysctl_table(nf_ct_frag6_sysctl_header);
644 nf_ct_frag6_sysctl_header = NULL;
645#endif
681 inet_frags_fini(&nf_frags); 646 inet_frags_fini(&nf_frags);
682 647
683 nf_init_frags.low_thresh = 0; 648 nf_init_frags.low_thresh = 0;
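Both this queueing path and the one in net/ipv6/reassembly.c further down replace the old trim-and-merge handling of overlapping fragments with the RFC 5722 rule quoted in the new comment: any overlap kills the whole reassembly queue via the new discard_fq label and fq_kill(). A self-contained sketch of the two overlap tests used above, with plain integers standing in for the skb control-block fields:

#include <stdbool.h>

/* The new fragment covers [offset, end); prev/next are its queue neighbours. */
static bool frag6_overlaps(bool have_prev, int prev_off, int prev_len,
			   int offset, int end,
			   bool have_next, int next_off)
{
	if (have_prev && prev_off + prev_len - offset > 0)
		return true;	/* preceding fragment runs past our start */
	if (have_next && next_off < end)
		return true;	/* following fragment starts before our end */
	return false;		/* disjoint on both sides: safe to queue */
}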
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
new file mode 100644
index 000000000000..99abfb53bab9
--- /dev/null
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -0,0 +1,131 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/ipv6.h>
11#include <linux/in6.h>
12#include <linux/netfilter.h>
13#include <linux/module.h>
14#include <linux/skbuff.h>
15#include <linux/icmp.h>
16#include <linux/sysctl.h>
17#include <net/ipv6.h>
18#include <net/inet_frag.h>
19
20#include <linux/netfilter_ipv6.h>
21#include <linux/netfilter_bridge.h>
22#include <net/netfilter/nf_conntrack.h>
23#include <net/netfilter/nf_conntrack_helper.h>
24#include <net/netfilter/nf_conntrack_l4proto.h>
25#include <net/netfilter/nf_conntrack_l3proto.h>
26#include <net/netfilter/nf_conntrack_core.h>
27#include <net/netfilter/nf_conntrack_zones.h>
28#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
29#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
30
31static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
32 struct sk_buff *skb)
33{
34 u16 zone = NF_CT_DEFAULT_ZONE;
35
36 if (skb->nfct)
37 zone = nf_ct_zone((struct nf_conn *)skb->nfct);
38
39#ifdef CONFIG_BRIDGE_NETFILTER
40 if (skb->nf_bridge &&
41 skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
42 return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
43#endif
44 if (hooknum == NF_INET_PRE_ROUTING)
45 return IP6_DEFRAG_CONNTRACK_IN + zone;
46 else
47 return IP6_DEFRAG_CONNTRACK_OUT + zone;
48
49}
50
51static unsigned int ipv6_defrag(unsigned int hooknum,
52 struct sk_buff *skb,
53 const struct net_device *in,
54 const struct net_device *out,
55 int (*okfn)(struct sk_buff *))
56{
57 struct sk_buff *reasm;
58
59 /* Previously seen (loopback)? */
60 if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
61 return NF_ACCEPT;
62
63 reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb));
64 /* queued */
65 if (reasm == NULL)
66 return NF_STOLEN;
67
68 /* error occured or not fragmented */
69 if (reasm == skb)
70 return NF_ACCEPT;
71
72 nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in,
73 (struct net_device *)out, okfn);
74
75 return NF_STOLEN;
76}
77
78static struct nf_hook_ops ipv6_defrag_ops[] = {
79 {
80 .hook = ipv6_defrag,
81 .owner = THIS_MODULE,
82 .pf = NFPROTO_IPV6,
83 .hooknum = NF_INET_PRE_ROUTING,
84 .priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
85 },
86 {
87 .hook = ipv6_defrag,
88 .owner = THIS_MODULE,
89 .pf = NFPROTO_IPV6,
90 .hooknum = NF_INET_LOCAL_OUT,
91 .priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
92 },
93};
94
95static int __init nf_defrag_init(void)
96{
97 int ret = 0;
98
99 ret = nf_ct_frag6_init();
100 if (ret < 0) {
101 pr_err("nf_defrag_ipv6: can't initialize frag6.\n");
102 return ret;
103 }
104 ret = nf_register_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops));
105 if (ret < 0) {
106 pr_err("nf_defrag_ipv6: can't register hooks\n");
107 goto cleanup_frag6;
108 }
109 return ret;
110
111cleanup_frag6:
112 nf_ct_frag6_cleanup();
113 return ret;
114
115}
116
117static void __exit nf_defrag_fini(void)
118{
119 nf_unregister_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops));
120 nf_ct_frag6_cleanup();
121}
122
123void nf_defrag_ipv6_enable(void)
124{
125}
126EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable);
127
128module_init(nf_defrag_init);
129module_exit(nf_defrag_fini);
130
131MODULE_LICENSE("GPL");
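nf_defrag_ipv6_enable() above is intentionally empty: its purpose is to give consumers such as nf_conntrack_ipv6 an exported symbol to call, so that loading the consumer also pulls in this module, whose init path registers the PRE_ROUTING and LOCAL_OUT defrag hooks that used to live in the conntrack module. A hedged sketch of a hypothetical consumer expressing that dependency, mirroring the conntrack init hunk earlier:

/* Hypothetical consumer init: the call does nothing at run time, but the
 * symbol reference makes the module loader bring in nf_defrag_ipv6 first,
 * and with it the defrag hook registration above.
 */
static int __init example_consumer_init(void)
{
	need_conntrack();
	nf_defrag_ipv6_enable();

	/* ... register l4/l3 conntrack protocols as in the earlier hunk ... */
	return 0;
}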
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index d082eaeefa25..24b3558b8e67 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -126,6 +126,8 @@ static const struct snmp_mib snmp6_udp6_list[] = {
126 SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS), 126 SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS),
127 SNMP_MIB_ITEM("Udp6InErrors", UDP_MIB_INERRORS), 127 SNMP_MIB_ITEM("Udp6InErrors", UDP_MIB_INERRORS),
128 SNMP_MIB_ITEM("Udp6OutDatagrams", UDP_MIB_OUTDATAGRAMS), 128 SNMP_MIB_ITEM("Udp6OutDatagrams", UDP_MIB_OUTDATAGRAMS),
129 SNMP_MIB_ITEM("Udp6RcvbufErrors", UDP_MIB_RCVBUFERRORS),
130 SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS),
129 SNMP_MIB_SENTINEL 131 SNMP_MIB_SENTINEL
130}; 132};
131 133
@@ -134,6 +136,8 @@ static const struct snmp_mib snmp6_udplite6_list[] = {
134 SNMP_MIB_ITEM("UdpLite6NoPorts", UDP_MIB_NOPORTS), 136 SNMP_MIB_ITEM("UdpLite6NoPorts", UDP_MIB_NOPORTS),
135 SNMP_MIB_ITEM("UdpLite6InErrors", UDP_MIB_INERRORS), 137 SNMP_MIB_ITEM("UdpLite6InErrors", UDP_MIB_INERRORS),
136 SNMP_MIB_ITEM("UdpLite6OutDatagrams", UDP_MIB_OUTDATAGRAMS), 138 SNMP_MIB_ITEM("UdpLite6OutDatagrams", UDP_MIB_OUTDATAGRAMS),
139 SNMP_MIB_ITEM("UdpLite6RcvbufErrors", UDP_MIB_RCVBUFERRORS),
140 SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS),
137 SNMP_MIB_SENTINEL 141 SNMP_MIB_SENTINEL
138}; 142};
139 143
diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c
index 1fa3468f0f32..9a7978fdc02a 100644
--- a/net/ipv6/protocol.c
+++ b/net/ipv6/protocol.c
@@ -25,28 +25,15 @@
25#include <linux/spinlock.h> 25#include <linux/spinlock.h>
26#include <net/protocol.h> 26#include <net/protocol.h>
27 27
28const struct inet6_protocol *inet6_protos[MAX_INET_PROTOS]; 28const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly;
29static DEFINE_SPINLOCK(inet6_proto_lock);
30
31 29
32int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol) 30int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol)
33{ 31{
34 int ret, hash = protocol & (MAX_INET_PROTOS - 1); 32 int hash = protocol & (MAX_INET_PROTOS - 1);
35
36 spin_lock_bh(&inet6_proto_lock);
37
38 if (inet6_protos[hash]) {
39 ret = -1;
40 } else {
41 inet6_protos[hash] = prot;
42 ret = 0;
43 }
44
45 spin_unlock_bh(&inet6_proto_lock);
46 33
47 return ret; 34 return !cmpxchg((const struct inet6_protocol **)&inet6_protos[hash],
35 NULL, prot) ? 0 : -1;
48} 36}
49
50EXPORT_SYMBOL(inet6_add_protocol); 37EXPORT_SYMBOL(inet6_add_protocol);
51 38
52/* 39/*
@@ -57,20 +44,11 @@ int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol
57{ 44{
58 int ret, hash = protocol & (MAX_INET_PROTOS - 1); 45 int ret, hash = protocol & (MAX_INET_PROTOS - 1);
59 46
60 spin_lock_bh(&inet6_proto_lock); 47 ret = (cmpxchg((const struct inet6_protocol **)&inet6_protos[hash],
61 48 prot, NULL) == prot) ? 0 : -1;
62 if (inet6_protos[hash] != prot) {
63 ret = -1;
64 } else {
65 inet6_protos[hash] = NULL;
66 ret = 0;
67 }
68
69 spin_unlock_bh(&inet6_proto_lock);
70 49
71 synchronize_net(); 50 synchronize_net();
72 51
73 return ret; 52 return ret;
74} 53}
75
76EXPORT_SYMBOL(inet6_del_protocol); 54EXPORT_SYMBOL(inet6_del_protocol);
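inet6_add_protocol() and inet6_del_protocol() above drop the inet6_proto_lock entirely: a single cmpxchg() on the hash slot claims or releases the entry atomically, and deletion still calls synchronize_net() so that readers which picked up the old pointer have finished before the caller tears the handler down. A reduced sketch of the same claim/release pattern on a hypothetical handler table (example_handlers stands in for inet6_protos):

static const struct inet6_protocol *example_handlers[MAX_INET_PROTOS];

static int example_add(const struct inet6_protocol *prot, unsigned char num)
{
	int hash = num & (MAX_INET_PROTOS - 1);

	/* cmpxchg() returns the previous slot contents: succeed only if empty */
	return !cmpxchg(&example_handlers[hash], NULL, prot) ? 0 : -1;
}

static int example_del(const struct inet6_protocol *prot, unsigned char num)
{
	int hash = num & (MAX_INET_PROTOS - 1);

	/* succeed only if the slot still holds our handler */
	return (cmpxchg(&example_handlers[hash], prot, NULL) == prot) ? 0 : -1;
}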
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index e677937a07fc..86c39526ba5e 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -373,7 +373,7 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
373 373
374static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) 374static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb)
375{ 375{
376 if ((raw6_sk(sk)->checksum || sk->sk_filter) && 376 if ((raw6_sk(sk)->checksum || rcu_dereference_raw(sk->sk_filter)) &&
377 skb_checksum_complete(skb)) { 377 skb_checksum_complete(skb)) {
378 atomic_inc(&sk->sk_drops); 378 atomic_inc(&sk->sk_drops);
379 kfree_skb(skb); 379 kfree_skb(skb);
@@ -764,7 +764,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
764 return -EINVAL; 764 return -EINVAL;
765 765
766 if (sin6->sin6_family && sin6->sin6_family != AF_INET6) 766 if (sin6->sin6_family && sin6->sin6_family != AF_INET6)
767 return(-EAFNOSUPPORT); 767 return -EAFNOSUPPORT;
768 768
769 /* port is the proto value [0..255] carried in nexthdr */ 769 /* port is the proto value [0..255] carried in nexthdr */
770 proto = ntohs(sin6->sin6_port); 770 proto = ntohs(sin6->sin6_port);
@@ -772,10 +772,10 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
772 if (!proto) 772 if (!proto)
773 proto = inet->inet_num; 773 proto = inet->inet_num;
774 else if (proto != inet->inet_num) 774 else if (proto != inet->inet_num)
775 return(-EINVAL); 775 return -EINVAL;
776 776
777 if (proto > 255) 777 if (proto > 255)
778 return(-EINVAL); 778 return -EINVAL;
779 779
780 daddr = &sin6->sin6_addr; 780 daddr = &sin6->sin6_addr;
781 if (np->sndflow) { 781 if (np->sndflow) {
@@ -985,7 +985,7 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname,
985 /* You may get strange result with a positive odd offset; 985 /* You may get strange result with a positive odd offset;
986 RFC2292bis agrees with me. */ 986 RFC2292bis agrees with me. */
987 if (val > 0 && (val&1)) 987 if (val > 0 && (val&1))
988 return(-EINVAL); 988 return -EINVAL;
989 if (val < 0) { 989 if (val < 0) {
990 rp->checksum = 0; 990 rp->checksum = 0;
991 } else { 991 } else {
@@ -997,7 +997,7 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname,
997 break; 997 break;
998 998
999 default: 999 default:
1000 return(-ENOPROTOOPT); 1000 return -ENOPROTOOPT;
1001 } 1001 }
1002} 1002}
1003 1003
@@ -1190,7 +1190,7 @@ static int rawv6_init_sk(struct sock *sk)
1190 default: 1190 default:
1191 break; 1191 break;
1192 } 1192 }
1193 return(0); 1193 return 0;
1194} 1194}
1195 1195
1196struct proto rawv6_prot = { 1196struct proto rawv6_prot = {
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 545c4141b755..c7ba3149633f 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -149,13 +149,6 @@ int ip6_frag_match(struct inet_frag_queue *q, void *a)
149} 149}
150EXPORT_SYMBOL(ip6_frag_match); 150EXPORT_SYMBOL(ip6_frag_match);
151 151
152/* Memory Tracking Functions. */
153static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
154{
155 atomic_sub(skb->truesize, &nf->mem);
156 kfree_skb(skb);
157}
158
159void ip6_frag_init(struct inet_frag_queue *q, void *a) 152void ip6_frag_init(struct inet_frag_queue *q, void *a)
160{ 153{
161 struct frag_queue *fq = container_of(q, struct frag_queue, q); 154 struct frag_queue *fq = container_of(q, struct frag_queue, q);
@@ -346,58 +339,22 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
346 } 339 }
347 340
348found: 341found:
349 /* We found where to put this one. Check for overlap with 342 /* RFC5722, Section 4:
350 * preceding fragment, and, if needed, align things so that 343 * When reassembling an IPv6 datagram, if
351 * any overlaps are eliminated. 344 * one or more its constituent fragments is determined to be an
345 * overlapping fragment, the entire datagram (and any constituent
346 * fragments, including those not yet received) MUST be silently
347 * discarded.
352 */ 348 */
353 if (prev) {
354 int i = (FRAG6_CB(prev)->offset + prev->len) - offset;
355 349
356 if (i > 0) { 350 /* Check for overlap with preceding fragment. */
357 offset += i; 351 if (prev &&
358 if (end <= offset) 352 (FRAG6_CB(prev)->offset + prev->len) - offset > 0)
359 goto err; 353 goto discard_fq;
360 if (!pskb_pull(skb, i))
361 goto err;
362 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
363 skb->ip_summed = CHECKSUM_NONE;
364 }
365 }
366 354
367 /* Look for overlap with succeeding segments. 355 /* Look for overlap with succeeding segment. */
368 * If we can merge fragments, do it. 356 if (next && FRAG6_CB(next)->offset < end)
369 */ 357 goto discard_fq;
370 while (next && FRAG6_CB(next)->offset < end) {
371 int i = end - FRAG6_CB(next)->offset; /* overlap is 'i' bytes */
372
373 if (i < next->len) {
374 /* Eat head of the next overlapped fragment
375 * and leave the loop. The next ones cannot overlap.
376 */
377 if (!pskb_pull(next, i))
378 goto err;
379 FRAG6_CB(next)->offset += i; /* next fragment */
380 fq->q.meat -= i;
381 if (next->ip_summed != CHECKSUM_UNNECESSARY)
382 next->ip_summed = CHECKSUM_NONE;
383 break;
384 } else {
385 struct sk_buff *free_it = next;
386
387 /* Old fragment is completely overridden with
388 * new one drop it.
389 */
390 next = next->next;
391
392 if (prev)
393 prev->next = next;
394 else
395 fq->q.fragments = next;
396
397 fq->q.meat -= free_it->len;
398 frag_kfree_skb(fq->q.net, free_it);
399 }
400 }
401 358
402 FRAG6_CB(skb)->offset = offset; 359 FRAG6_CB(skb)->offset = offset;
403 360
@@ -436,6 +393,8 @@ found:
436 write_unlock(&ip6_frags.lock); 393 write_unlock(&ip6_frags.lock);
437 return -1; 394 return -1;
438 395
396discard_fq:
397 fq_kill(fq);
439err: 398err:
440 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 399 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
441 IPSTATS_MIB_REASMFAILS); 400 IPSTATS_MIB_REASMFAILS);
@@ -499,7 +458,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
499 /* If the first fragment is fragmented itself, we split 458 /* If the first fragment is fragmented itself, we split
500 * it to two chunks: the first with data and paged part 459 * it to two chunks: the first with data and paged part
501 * and the second, holding only fragments. */ 460 * and the second, holding only fragments. */
502 if (skb_has_frags(head)) { 461 if (skb_has_frag_list(head)) {
503 struct sk_buff *clone; 462 struct sk_buff *clone;
504 int i, plen = 0; 463 int i, plen = 0;
505 464
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8f2d0400cf8a..25661f968f3f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -109,7 +109,6 @@ static struct dst_ops ip6_dst_ops_template = {
109 .link_failure = ip6_link_failure, 109 .link_failure = ip6_link_failure,
110 .update_pmtu = ip6_rt_update_pmtu, 110 .update_pmtu = ip6_rt_update_pmtu,
111 .local_out = __ip6_local_out, 111 .local_out = __ip6_local_out,
112 .entries = ATOMIC_INIT(0),
113}; 112};
114 113
115static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) 114static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -122,7 +121,6 @@ static struct dst_ops ip6_dst_blackhole_ops = {
122 .destroy = ip6_dst_destroy, 121 .destroy = ip6_dst_destroy,
123 .check = ip6_dst_check, 122 .check = ip6_dst_check,
124 .update_pmtu = ip6_rt_blackhole_update_pmtu, 123 .update_pmtu = ip6_rt_blackhole_update_pmtu,
125 .entries = ATOMIC_INIT(0),
126}; 124};
127 125
128static struct rt6_info ip6_null_entry_template = { 126static struct rt6_info ip6_null_entry_template = {
@@ -217,14 +215,14 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
217 215
218static __inline__ int rt6_check_expired(const struct rt6_info *rt) 216static __inline__ int rt6_check_expired(const struct rt6_info *rt)
219{ 217{
220 return (rt->rt6i_flags & RTF_EXPIRES && 218 return (rt->rt6i_flags & RTF_EXPIRES) &&
221 time_after(jiffies, rt->rt6i_expires)); 219 time_after(jiffies, rt->rt6i_expires);
222} 220}
223 221
224static inline int rt6_need_strict(struct in6_addr *daddr) 222static inline int rt6_need_strict(struct in6_addr *daddr)
225{ 223{
226 return (ipv6_addr_type(daddr) & 224 return ipv6_addr_type(daddr) &
227 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK)); 225 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
228} 226}
229 227
230/* 228/*
@@ -440,7 +438,7 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
440 __func__, match); 438 __func__, match);
441 439
442 net = dev_net(rt0->rt6i_dev); 440 net = dev_net(rt0->rt6i_dev);
443 return (match ? match : net->ipv6.ip6_null_entry); 441 return match ? match : net->ipv6.ip6_null_entry;
444} 442}
445 443
446#ifdef CONFIG_IPV6_ROUTE_INFO 444#ifdef CONFIG_IPV6_ROUTE_INFO
@@ -670,7 +668,7 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *dad
670 668
671 if (net_ratelimit()) 669 if (net_ratelimit())
672 printk(KERN_WARNING 670 printk(KERN_WARNING
673 "Neighbour table overflow.\n"); 671 "ipv6: Neighbour table overflow.\n");
674 dst_free(&rt->dst); 672 dst_free(&rt->dst);
675 return NULL; 673 return NULL;
676 } 674 }
@@ -859,7 +857,7 @@ int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl
859 857
860 dst_release(*dstp); 858 dst_release(*dstp);
861 *dstp = new; 859 *dstp = new;
862 return (new ? 0 : -ENOMEM); 860 return new ? 0 : -ENOMEM;
863} 861}
864EXPORT_SYMBOL_GPL(ip6_dst_blackhole); 862EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
865 863
@@ -1058,19 +1056,22 @@ static int ip6_dst_gc(struct dst_ops *ops)
1058 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 1056 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1059 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 1057 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1060 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 1058 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1059 int entries;
1061 1060
1061 entries = dst_entries_get_fast(ops);
1062 if (time_after(rt_last_gc + rt_min_interval, now) && 1062 if (time_after(rt_last_gc + rt_min_interval, now) &&
1063 atomic_read(&ops->entries) <= rt_max_size) 1063 entries <= rt_max_size)
1064 goto out; 1064 goto out;
1065 1065
1066 net->ipv6.ip6_rt_gc_expire++; 1066 net->ipv6.ip6_rt_gc_expire++;
1067 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net); 1067 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1068 net->ipv6.ip6_rt_last_gc = now; 1068 net->ipv6.ip6_rt_last_gc = now;
1069 if (atomic_read(&ops->entries) < ops->gc_thresh) 1069 entries = dst_entries_get_slow(ops);
1070 if (entries < ops->gc_thresh)
1070 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 1071 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1071out: 1072out:
1072 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 1073 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1073 return (atomic_read(&ops->entries) > rt_max_size); 1074 return entries > rt_max_size;
1074} 1075}
1075 1076
1076/* Clean host part of a prefix. Not necessary in radix tree, 1077/* Clean host part of a prefix. Not necessary in radix tree,
@@ -1169,6 +1170,8 @@ int ip6_route_add(struct fib6_config *cfg)
1169 1170
1170 if (addr_type & IPV6_ADDR_MULTICAST) 1171 if (addr_type & IPV6_ADDR_MULTICAST)
1171 rt->dst.input = ip6_mc_input; 1172 rt->dst.input = ip6_mc_input;
1173 else if (cfg->fc_flags & RTF_LOCAL)
1174 rt->dst.input = ip6_input;
1172 else 1175 else
1173 rt->dst.input = ip6_forward; 1176 rt->dst.input = ip6_forward;
1174 1177
@@ -1190,7 +1193,8 @@ int ip6_route_add(struct fib6_config *cfg)
1190 they would result in kernel looping; promote them to reject routes 1193 they would result in kernel looping; promote them to reject routes
1191 */ 1194 */
1192 if ((cfg->fc_flags & RTF_REJECT) || 1195 if ((cfg->fc_flags & RTF_REJECT) ||
1193 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) { 1196 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1197 && !(cfg->fc_flags&RTF_LOCAL))) {
1194 /* hold loopback dev/idev if we haven't done so. */ 1198 /* hold loopback dev/idev if we haven't done so. */
1195 if (dev != net->loopback_dev) { 1199 if (dev != net->loopback_dev) {
1196 if (dev) { 1200 if (dev) {
@@ -1556,14 +1560,13 @@ out:
1556 * i.e. Path MTU discovery 1560 * i.e. Path MTU discovery
1557 */ 1561 */
1558 1562
1559void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, 1563static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1560 struct net_device *dev, u32 pmtu) 1564 struct net *net, u32 pmtu, int ifindex)
1561{ 1565{
1562 struct rt6_info *rt, *nrt; 1566 struct rt6_info *rt, *nrt;
1563 struct net *net = dev_net(dev);
1564 int allfrag = 0; 1567 int allfrag = 0;
1565 1568
1566 rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0); 1569 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1567 if (rt == NULL) 1570 if (rt == NULL)
1568 return; 1571 return;
1569 1572
@@ -1631,6 +1634,27 @@ out:
1631 dst_release(&rt->dst); 1634 dst_release(&rt->dst);
1632} 1635}
1633 1636
1637void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1638 struct net_device *dev, u32 pmtu)
1639{
1640 struct net *net = dev_net(dev);
1641
1642 /*
1643 * RFC 1981 states that a node "MUST reduce the size of the packets it
1644 * is sending along the path" that caused the Packet Too Big message.
1645 * Since it's not possible in the general case to determine which
1646 * interface was used to send the original packet, we update the MTU
1647 * on the interface that will be used to send future packets. We also
1648 * update the MTU on the interface that received the Packet Too Big in
1649 * case the original packet was forced out that interface with
1650 * SO_BINDTODEVICE or similar. This is the next best thing to the
1651 * correct behaviour, which would be to update the MTU on all
1652 * interfaces.
1653 */
1654 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1655 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1656}
1657
1634/* 1658/*
1635 * Misc support functions 1659 * Misc support functions
1636 */ 1660 */
@@ -2082,6 +2106,9 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2082 if (rtm->rtm_type == RTN_UNREACHABLE) 2106 if (rtm->rtm_type == RTN_UNREACHABLE)
2083 cfg->fc_flags |= RTF_REJECT; 2107 cfg->fc_flags |= RTF_REJECT;
2084 2108
2109 if (rtm->rtm_type == RTN_LOCAL)
2110 cfg->fc_flags |= RTF_LOCAL;
2111
2085 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid; 2112 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2086 cfg->fc_nlinfo.nlh = nlh; 2113 cfg->fc_nlinfo.nlh = nlh;
2087 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 2114 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
@@ -2202,6 +2229,8 @@ static int rt6_fill_node(struct net *net,
2202 NLA_PUT_U32(skb, RTA_TABLE, table); 2229 NLA_PUT_U32(skb, RTA_TABLE, table);
2203 if (rt->rt6i_flags&RTF_REJECT) 2230 if (rt->rt6i_flags&RTF_REJECT)
2204 rtm->rtm_type = RTN_UNREACHABLE; 2231 rtm->rtm_type = RTN_UNREACHABLE;
2232 else if (rt->rt6i_flags&RTF_LOCAL)
2233 rtm->rtm_type = RTN_LOCAL;
2205 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK)) 2234 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2206 rtm->rtm_type = RTN_LOCAL; 2235 rtm->rtm_type = RTN_LOCAL;
2207 else 2236 else
@@ -2496,7 +2525,7 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2496 net->ipv6.rt6_stats->fib_rt_alloc, 2525 net->ipv6.rt6_stats->fib_rt_alloc,
2497 net->ipv6.rt6_stats->fib_rt_entries, 2526 net->ipv6.rt6_stats->fib_rt_entries,
2498 net->ipv6.rt6_stats->fib_rt_cache, 2527 net->ipv6.rt6_stats->fib_rt_cache,
2499 atomic_read(&net->ipv6.ip6_dst_ops.entries), 2528 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2500 net->ipv6.rt6_stats->fib_discarded_routes); 2529 net->ipv6.rt6_stats->fib_discarded_routes);
2501 2530
2502 return 0; 2531 return 0;
@@ -2580,7 +2609,7 @@ ctl_table ipv6_route_table_template[] = {
2580 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 2609 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2581 .maxlen = sizeof(int), 2610 .maxlen = sizeof(int),
2582 .mode = 0644, 2611 .mode = 0644,
2583 .proc_handler = proc_dointvec_jiffies, 2612 .proc_handler = proc_dointvec,
2584 }, 2613 },
2585 { 2614 {
2586 .procname = "mtu_expires", 2615 .procname = "mtu_expires",
@@ -2594,7 +2623,7 @@ ctl_table ipv6_route_table_template[] = {
2594 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 2623 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2595 .maxlen = sizeof(int), 2624 .maxlen = sizeof(int),
2596 .mode = 0644, 2625 .mode = 0644,
2597 .proc_handler = proc_dointvec_jiffies, 2626 .proc_handler = proc_dointvec,
2598 }, 2627 },
2599 { 2628 {
2600 .procname = "gc_min_interval_ms", 2629 .procname = "gc_min_interval_ms",
@@ -2638,11 +2667,14 @@ static int __net_init ip6_route_net_init(struct net *net)
2638 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 2667 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2639 sizeof(net->ipv6.ip6_dst_ops)); 2668 sizeof(net->ipv6.ip6_dst_ops));
2640 2669
2670 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2671 goto out_ip6_dst_ops;
2672
2641 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 2673 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2642 sizeof(*net->ipv6.ip6_null_entry), 2674 sizeof(*net->ipv6.ip6_null_entry),
2643 GFP_KERNEL); 2675 GFP_KERNEL);
2644 if (!net->ipv6.ip6_null_entry) 2676 if (!net->ipv6.ip6_null_entry)
2645 goto out_ip6_dst_ops; 2677 goto out_ip6_dst_entries;
2646 net->ipv6.ip6_null_entry->dst.path = 2678 net->ipv6.ip6_null_entry->dst.path =
2647 (struct dst_entry *)net->ipv6.ip6_null_entry; 2679 (struct dst_entry *)net->ipv6.ip6_null_entry;
2648 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 2680 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
@@ -2692,6 +2724,8 @@ out_ip6_prohibit_entry:
2692out_ip6_null_entry: 2724out_ip6_null_entry:
2693 kfree(net->ipv6.ip6_null_entry); 2725 kfree(net->ipv6.ip6_null_entry);
2694#endif 2726#endif
2727out_ip6_dst_entries:
2728 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2695out_ip6_dst_ops: 2729out_ip6_dst_ops:
2696 goto out; 2730 goto out;
2697} 2731}
@@ -2730,10 +2764,14 @@ int __init ip6_route_init(void)
2730 if (!ip6_dst_ops_template.kmem_cachep) 2764 if (!ip6_dst_ops_template.kmem_cachep)
2731 goto out; 2765 goto out;
2732 2766
2733 ret = register_pernet_subsys(&ip6_route_net_ops); 2767 ret = dst_entries_init(&ip6_dst_blackhole_ops);
2734 if (ret) 2768 if (ret)
2735 goto out_kmem_cache; 2769 goto out_kmem_cache;
2736 2770
2771 ret = register_pernet_subsys(&ip6_route_net_ops);
2772 if (ret)
2773 goto out_dst_entries;
2774
2737 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 2775 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2738 2776
2739 /* Registering of the loopback is done before this portion of code, 2777 /* Registering of the loopback is done before this portion of code,
@@ -2780,6 +2818,8 @@ out_fib6_init:
2780 fib6_gc_cleanup(); 2818 fib6_gc_cleanup();
2781out_register_subsys: 2819out_register_subsys:
2782 unregister_pernet_subsys(&ip6_route_net_ops); 2820 unregister_pernet_subsys(&ip6_route_net_ops);
2821out_dst_entries:
2822 dst_entries_destroy(&ip6_dst_blackhole_ops);
2783out_kmem_cache: 2823out_kmem_cache:
2784 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 2824 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2785 goto out; 2825 goto out;
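The route.c changes above move entry accounting off the atomic_t that used to be embedded in struct dst_ops and onto the dst_entries_*() helpers: dst_entries_init()/dst_entries_destroy() set up and tear down the counter per ops, dst_entries_get_fast() gives a cheap reading for the hot-path gc check, and dst_entries_get_slow() an exact one after garbage collection has run. A hedged sketch of how the gc decision reads under that split; the helper names come from the hunks, while the assumption that the fast read may be slightly stale is mine:

/* Sketch only: the real ip6_dst_gc() also rate-limits on ip6_rt_last_gc. */
static int example_dst_gc(struct dst_ops *ops, int rt_max_size)
{
	int entries = dst_entries_get_fast(ops);	/* cheap estimate */

	if (entries <= rt_max_size)
		return 0;				/* still under budget */

	/* ... fib6_run_gc() and expiry bookkeeping elided ... */

	entries = dst_entries_get_slow(ops);		/* exact count */
	return entries > rt_max_size;			/* nonzero: pressure persists */
}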
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 4699cd3c3118..d6bfaec3bbbf 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -63,36 +63,63 @@
63#define HASH_SIZE 16 63#define HASH_SIZE 16
64#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) 64#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
65 65
66static void ipip6_tunnel_init(struct net_device *dev); 66static int ipip6_tunnel_init(struct net_device *dev);
67static void ipip6_tunnel_setup(struct net_device *dev); 67static void ipip6_tunnel_setup(struct net_device *dev);
68static void ipip6_dev_free(struct net_device *dev);
68 69
69static int sit_net_id __read_mostly; 70static int sit_net_id __read_mostly;
70struct sit_net { 71struct sit_net {
71 struct ip_tunnel *tunnels_r_l[HASH_SIZE]; 72 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
72 struct ip_tunnel *tunnels_r[HASH_SIZE]; 73 struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
73 struct ip_tunnel *tunnels_l[HASH_SIZE]; 74 struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
74 struct ip_tunnel *tunnels_wc[1]; 75 struct ip_tunnel __rcu *tunnels_wc[1];
75 struct ip_tunnel **tunnels[4]; 76 struct ip_tunnel __rcu **tunnels[4];
76 77
77 struct net_device *fb_tunnel_dev; 78 struct net_device *fb_tunnel_dev;
78}; 79};
79 80
80/* 81/*
81 * Locking : hash tables are protected by RCU and a spinlock 82 * Locking : hash tables are protected by RCU and RTNL
82 */ 83 */
83static DEFINE_SPINLOCK(ipip6_lock);
84 84
85#define for_each_ip_tunnel_rcu(start) \ 85#define for_each_ip_tunnel_rcu(start) \
86 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) 86 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
87 87
88/* often modified stats are per cpu, other are shared (netdev->stats) */
89struct pcpu_tstats {
90 unsigned long rx_packets;
91 unsigned long rx_bytes;
92 unsigned long tx_packets;
93 unsigned long tx_bytes;
94};
95
96static struct net_device_stats *ipip6_get_stats(struct net_device *dev)
97{
98 struct pcpu_tstats sum = { 0 };
99 int i;
100
101 for_each_possible_cpu(i) {
102 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
103
104 sum.rx_packets += tstats->rx_packets;
105 sum.rx_bytes += tstats->rx_bytes;
106 sum.tx_packets += tstats->tx_packets;
107 sum.tx_bytes += tstats->tx_bytes;
108 }
109 dev->stats.rx_packets = sum.rx_packets;
110 dev->stats.rx_bytes = sum.rx_bytes;
111 dev->stats.tx_packets = sum.tx_packets;
112 dev->stats.tx_bytes = sum.tx_bytes;
113 return &dev->stats;
114}
88/* 115/*
89 * Must be invoked with rcu_read_lock 116 * Must be invoked with rcu_read_lock
90 */ 117 */
91static struct ip_tunnel * ipip6_tunnel_lookup(struct net *net, 118static struct ip_tunnel * ipip6_tunnel_lookup(struct net *net,
92 struct net_device *dev, __be32 remote, __be32 local) 119 struct net_device *dev, __be32 remote, __be32 local)
93{ 120{
94 unsigned h0 = HASH(remote); 121 unsigned int h0 = HASH(remote);
95 unsigned h1 = HASH(local); 122 unsigned int h1 = HASH(local);
96 struct ip_tunnel *t; 123 struct ip_tunnel *t;
97 struct sit_net *sitn = net_generic(net, sit_net_id); 124 struct sit_net *sitn = net_generic(net, sit_net_id);
98 125
@@ -121,12 +148,12 @@ static struct ip_tunnel * ipip6_tunnel_lookup(struct net *net,
121 return NULL; 148 return NULL;
122} 149}
123 150
124static struct ip_tunnel **__ipip6_bucket(struct sit_net *sitn, 151static struct ip_tunnel __rcu **__ipip6_bucket(struct sit_net *sitn,
125 struct ip_tunnel_parm *parms) 152 struct ip_tunnel_parm *parms)
126{ 153{
127 __be32 remote = parms->iph.daddr; 154 __be32 remote = parms->iph.daddr;
128 __be32 local = parms->iph.saddr; 155 __be32 local = parms->iph.saddr;
129 unsigned h = 0; 156 unsigned int h = 0;
130 int prio = 0; 157 int prio = 0;
131 158
132 if (remote) { 159 if (remote) {
@@ -140,7 +167,7 @@ static struct ip_tunnel **__ipip6_bucket(struct sit_net *sitn,
140 return &sitn->tunnels[prio][h]; 167 return &sitn->tunnels[prio][h];
141} 168}
142 169
143static inline struct ip_tunnel **ipip6_bucket(struct sit_net *sitn, 170static inline struct ip_tunnel __rcu **ipip6_bucket(struct sit_net *sitn,
144 struct ip_tunnel *t) 171 struct ip_tunnel *t)
145{ 172{
146 return __ipip6_bucket(sitn, &t->parms); 173 return __ipip6_bucket(sitn, &t->parms);
@@ -148,13 +175,14 @@ static inline struct ip_tunnel **ipip6_bucket(struct sit_net *sitn,
148 175
149static void ipip6_tunnel_unlink(struct sit_net *sitn, struct ip_tunnel *t) 176static void ipip6_tunnel_unlink(struct sit_net *sitn, struct ip_tunnel *t)
150{ 177{
151 struct ip_tunnel **tp; 178 struct ip_tunnel __rcu **tp;
152 179 struct ip_tunnel *iter;
153 for (tp = ipip6_bucket(sitn, t); *tp; tp = &(*tp)->next) { 180
154 if (t == *tp) { 181 for (tp = ipip6_bucket(sitn, t);
155 spin_lock_bh(&ipip6_lock); 182 (iter = rtnl_dereference(*tp)) != NULL;
156 *tp = t->next; 183 tp = &iter->next) {
157 spin_unlock_bh(&ipip6_lock); 184 if (t == iter) {
185 rcu_assign_pointer(*tp, t->next);
158 break; 186 break;
159 } 187 }
160 } 188 }
@@ -162,12 +190,10 @@ static void ipip6_tunnel_unlink(struct sit_net *sitn, struct ip_tunnel *t)
162 190
163static void ipip6_tunnel_link(struct sit_net *sitn, struct ip_tunnel *t) 191static void ipip6_tunnel_link(struct sit_net *sitn, struct ip_tunnel *t)
164{ 192{
165 struct ip_tunnel **tp = ipip6_bucket(sitn, t); 193 struct ip_tunnel __rcu **tp = ipip6_bucket(sitn, t);
166 194
167 spin_lock_bh(&ipip6_lock); 195 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
168 t->next = *tp;
169 rcu_assign_pointer(*tp, t); 196 rcu_assign_pointer(*tp, t);
170 spin_unlock_bh(&ipip6_lock);
171} 197}
172 198
173static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn) 199static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn)
@@ -187,17 +213,20 @@ static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn)
187#endif 213#endif
188} 214}
189 215
190static struct ip_tunnel * ipip6_tunnel_locate(struct net *net, 216static struct ip_tunnel *ipip6_tunnel_locate(struct net *net,
191 struct ip_tunnel_parm *parms, int create) 217 struct ip_tunnel_parm *parms, int create)
192{ 218{
193 __be32 remote = parms->iph.daddr; 219 __be32 remote = parms->iph.daddr;
194 __be32 local = parms->iph.saddr; 220 __be32 local = parms->iph.saddr;
195 struct ip_tunnel *t, **tp, *nt; 221 struct ip_tunnel *t, *nt;
222 struct ip_tunnel __rcu **tp;
196 struct net_device *dev; 223 struct net_device *dev;
197 char name[IFNAMSIZ]; 224 char name[IFNAMSIZ];
198 struct sit_net *sitn = net_generic(net, sit_net_id); 225 struct sit_net *sitn = net_generic(net, sit_net_id);
199 226
200 for (tp = __ipip6_bucket(sitn, parms); (t = *tp) != NULL; tp = &t->next) { 227 for (tp = __ipip6_bucket(sitn, parms);
228 (t = rtnl_dereference(*tp)) != NULL;
229 tp = &t->next) {
201 if (local == t->parms.iph.saddr && 230 if (local == t->parms.iph.saddr &&
202 remote == t->parms.iph.daddr && 231 remote == t->parms.iph.daddr &&
203 parms->link == t->parms.link) { 232 parms->link == t->parms.link) {
@@ -213,7 +242,7 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct net *net,
213 if (parms->name[0]) 242 if (parms->name[0])
214 strlcpy(name, parms->name, IFNAMSIZ); 243 strlcpy(name, parms->name, IFNAMSIZ);
215 else 244 else
216 sprintf(name, "sit%%d"); 245 strcpy(name, "sit%d");
217 246
218 dev = alloc_netdev(sizeof(*t), name, ipip6_tunnel_setup); 247 dev = alloc_netdev(sizeof(*t), name, ipip6_tunnel_setup);
219 if (dev == NULL) 248 if (dev == NULL)
@@ -229,7 +258,8 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct net *net,
229 nt = netdev_priv(dev); 258 nt = netdev_priv(dev);
230 259
231 nt->parms = *parms; 260 nt->parms = *parms;
232 ipip6_tunnel_init(dev); 261 if (ipip6_tunnel_init(dev) < 0)
262 goto failed_free;
233 ipip6_tunnel_clone_6rd(dev, sitn); 263 ipip6_tunnel_clone_6rd(dev, sitn);
234 264
235 if (parms->i_flags & SIT_ISATAP) 265 if (parms->i_flags & SIT_ISATAP)
@@ -244,7 +274,7 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct net *net,
244 return nt; 274 return nt;
245 275
246failed_free: 276failed_free:
247 free_netdev(dev); 277 ipip6_dev_free(dev);
248failed: 278failed:
249 return NULL; 279 return NULL;
250} 280}
@@ -340,7 +370,7 @@ ipip6_tunnel_add_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a, int chg)
340 370
341 ASSERT_RTNL(); 371 ASSERT_RTNL();
342 372
343 for (p = t->prl; p; p = p->next) { 373 for (p = rtnl_dereference(t->prl); p; p = rtnl_dereference(p->next)) {
344 if (p->addr == a->addr) { 374 if (p->addr == a->addr) {
345 if (chg) { 375 if (chg) {
346 p->flags = a->flags; 376 p->flags = a->flags;
@@ -451,15 +481,12 @@ static void ipip6_tunnel_uninit(struct net_device *dev)
451 struct sit_net *sitn = net_generic(net, sit_net_id); 481 struct sit_net *sitn = net_generic(net, sit_net_id);
452 482
453 if (dev == sitn->fb_tunnel_dev) { 483 if (dev == sitn->fb_tunnel_dev) {
454 spin_lock_bh(&ipip6_lock); 484 rcu_assign_pointer(sitn->tunnels_wc[0], NULL);
455 sitn->tunnels_wc[0] = NULL;
456 spin_unlock_bh(&ipip6_lock);
457 dev_put(dev);
458 } else { 485 } else {
459 ipip6_tunnel_unlink(sitn, netdev_priv(dev)); 486 ipip6_tunnel_unlink(sitn, netdev_priv(dev));
460 ipip6_tunnel_del_prl(netdev_priv(dev), NULL); 487 ipip6_tunnel_del_prl(netdev_priv(dev), NULL);
461 dev_put(dev);
462 } 488 }
489 dev_put(dev);
463} 490}
464 491
465 492
@@ -548,6 +575,8 @@ static int ipip6_rcv(struct sk_buff *skb)
548 tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, 575 tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev,
549 iph->saddr, iph->daddr); 576 iph->saddr, iph->daddr);
550 if (tunnel != NULL) { 577 if (tunnel != NULL) {
578 struct pcpu_tstats *tstats;
579
551 secpath_reset(skb); 580 secpath_reset(skb);
552 skb->mac_header = skb->network_header; 581 skb->mac_header = skb->network_header;
553 skb_reset_network_header(skb); 582 skb_reset_network_header(skb);
@@ -563,10 +592,16 @@ static int ipip6_rcv(struct sk_buff *skb)
563 return 0; 592 return 0;
564 } 593 }
565 594
566 skb_tunnel_rx(skb, tunnel->dev); 595 tstats = this_cpu_ptr(tunnel->dev->tstats);
596 tstats->rx_packets++;
597 tstats->rx_bytes += skb->len;
598
599 __skb_tunnel_rx(skb, tunnel->dev);
567 600
568 ipip6_ecn_decapsulate(iph, skb); 601 ipip6_ecn_decapsulate(iph, skb);
602
569 netif_rx(skb); 603 netif_rx(skb);
604
570 rcu_read_unlock(); 605 rcu_read_unlock();
571 return 0; 606 return 0;
572 } 607 }
@@ -590,7 +625,7 @@ __be32 try_6rd(struct in6_addr *v6dst, struct ip_tunnel *tunnel)
590#ifdef CONFIG_IPV6_SIT_6RD 625#ifdef CONFIG_IPV6_SIT_6RD
591 if (ipv6_prefix_equal(v6dst, &tunnel->ip6rd.prefix, 626 if (ipv6_prefix_equal(v6dst, &tunnel->ip6rd.prefix,
592 tunnel->ip6rd.prefixlen)) { 627 tunnel->ip6rd.prefixlen)) {
593 unsigned pbw0, pbi0; 628 unsigned int pbw0, pbi0;
594 int pbi1; 629 int pbi1;
595 u32 d; 630 u32 d;
596 631
@@ -625,14 +660,13 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
625 struct net_device *dev) 660 struct net_device *dev)
626{ 661{
627 struct ip_tunnel *tunnel = netdev_priv(dev); 662 struct ip_tunnel *tunnel = netdev_priv(dev);
628 struct net_device_stats *stats = &dev->stats; 663 struct pcpu_tstats *tstats;
629 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
630 struct iphdr *tiph = &tunnel->parms.iph; 664 struct iphdr *tiph = &tunnel->parms.iph;
631 struct ipv6hdr *iph6 = ipv6_hdr(skb); 665 struct ipv6hdr *iph6 = ipv6_hdr(skb);
632 u8 tos = tunnel->parms.iph.tos; 666 u8 tos = tunnel->parms.iph.tos;
633 __be16 df = tiph->frag_off; 667 __be16 df = tiph->frag_off;
634 struct rtable *rt; /* Route to the other host */ 668 struct rtable *rt; /* Route to the other host */
635 struct net_device *tdev; /* Device to other host */ 669 struct net_device *tdev; /* Device to other host */
636 struct iphdr *iph; /* Our new IP header */ 670 struct iphdr *iph; /* Our new IP header */
637 unsigned int max_headroom; /* The extra header space needed */ 671 unsigned int max_headroom; /* The extra header space needed */
638 __be32 dst = tiph->daddr; 672 __be32 dst = tiph->daddr;
@@ -703,20 +737,20 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
703 .oif = tunnel->parms.link, 737 .oif = tunnel->parms.link,
704 .proto = IPPROTO_IPV6 }; 738 .proto = IPPROTO_IPV6 };
705 if (ip_route_output_key(dev_net(dev), &rt, &fl)) { 739 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
706 stats->tx_carrier_errors++; 740 dev->stats.tx_carrier_errors++;
707 goto tx_error_icmp; 741 goto tx_error_icmp;
708 } 742 }
709 } 743 }
710 if (rt->rt_type != RTN_UNICAST) { 744 if (rt->rt_type != RTN_UNICAST) {
711 ip_rt_put(rt); 745 ip_rt_put(rt);
712 stats->tx_carrier_errors++; 746 dev->stats.tx_carrier_errors++;
713 goto tx_error_icmp; 747 goto tx_error_icmp;
714 } 748 }
715 tdev = rt->dst.dev; 749 tdev = rt->dst.dev;
716 750
717 if (tdev == dev) { 751 if (tdev == dev) {
718 ip_rt_put(rt); 752 ip_rt_put(rt);
719 stats->collisions++; 753 dev->stats.collisions++;
720 goto tx_error; 754 goto tx_error;
721 } 755 }
722 756
@@ -724,7 +758,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
724 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); 758 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
725 759
726 if (mtu < 68) { 760 if (mtu < 68) {
727 stats->collisions++; 761 dev->stats.collisions++;
728 ip_rt_put(rt); 762 ip_rt_put(rt);
729 goto tx_error; 763 goto tx_error;
730 } 764 }
@@ -763,7 +797,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
763 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); 797 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
764 if (!new_skb) { 798 if (!new_skb) {
765 ip_rt_put(rt); 799 ip_rt_put(rt);
766 txq->tx_dropped++; 800 dev->stats.tx_dropped++;
767 dev_kfree_skb(skb); 801 dev_kfree_skb(skb);
768 return NETDEV_TX_OK; 802 return NETDEV_TX_OK;
769 } 803 }
@@ -799,14 +833,14 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
799 iph->ttl = iph6->hop_limit; 833 iph->ttl = iph6->hop_limit;
800 834
801 nf_reset(skb); 835 nf_reset(skb);
802 836 tstats = this_cpu_ptr(dev->tstats);
803 IPTUNNEL_XMIT(); 837 __IPTUNNEL_XMIT(tstats, &dev->stats);
804 return NETDEV_TX_OK; 838 return NETDEV_TX_OK;
805 839
806tx_error_icmp: 840tx_error_icmp:
807 dst_link_failure(skb); 841 dst_link_failure(skb);
808tx_error: 842tx_error:
809 stats->tx_errors++; 843 dev->stats.tx_errors++;
810 dev_kfree_skb(skb); 844 dev_kfree_skb(skb);
811 return NETDEV_TX_OK; 845 return NETDEV_TX_OK;
812} 846}
@@ -929,6 +963,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
929 } 963 }
930 t = netdev_priv(dev); 964 t = netdev_priv(dev);
931 ipip6_tunnel_unlink(sitn, t); 965 ipip6_tunnel_unlink(sitn, t);
966 synchronize_net();
932 t->parms.iph.saddr = p.iph.saddr; 967 t->parms.iph.saddr = p.iph.saddr;
933 t->parms.iph.daddr = p.iph.daddr; 968 t->parms.iph.daddr = p.iph.daddr;
934 memcpy(dev->dev_addr, &p.iph.saddr, 4); 969 memcpy(dev->dev_addr, &p.iph.saddr, 4);
@@ -1083,12 +1118,19 @@ static const struct net_device_ops ipip6_netdev_ops = {
1083 .ndo_start_xmit = ipip6_tunnel_xmit, 1118 .ndo_start_xmit = ipip6_tunnel_xmit,
1084 .ndo_do_ioctl = ipip6_tunnel_ioctl, 1119 .ndo_do_ioctl = ipip6_tunnel_ioctl,
1085 .ndo_change_mtu = ipip6_tunnel_change_mtu, 1120 .ndo_change_mtu = ipip6_tunnel_change_mtu,
1121 .ndo_get_stats = ipip6_get_stats,
1086}; 1122};
1087 1123
1124static void ipip6_dev_free(struct net_device *dev)
1125{
1126 free_percpu(dev->tstats);
1127 free_netdev(dev);
1128}
1129
1088static void ipip6_tunnel_setup(struct net_device *dev) 1130static void ipip6_tunnel_setup(struct net_device *dev)
1089{ 1131{
1090 dev->netdev_ops = &ipip6_netdev_ops; 1132 dev->netdev_ops = &ipip6_netdev_ops;
1091 dev->destructor = free_netdev; 1133 dev->destructor = ipip6_dev_free;
1092 1134
1093 dev->type = ARPHRD_SIT; 1135 dev->type = ARPHRD_SIT;
1094 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); 1136 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
@@ -1098,9 +1140,10 @@ static void ipip6_tunnel_setup(struct net_device *dev)
1098 dev->iflink = 0; 1140 dev->iflink = 0;
1099 dev->addr_len = 4; 1141 dev->addr_len = 4;
1100 dev->features |= NETIF_F_NETNS_LOCAL; 1142 dev->features |= NETIF_F_NETNS_LOCAL;
1143 dev->features |= NETIF_F_LLTX;
1101} 1144}
1102 1145
1103static void ipip6_tunnel_init(struct net_device *dev) 1146static int ipip6_tunnel_init(struct net_device *dev)
1104{ 1147{
1105 struct ip_tunnel *tunnel = netdev_priv(dev); 1148 struct ip_tunnel *tunnel = netdev_priv(dev);
1106 1149
@@ -1111,9 +1154,14 @@ static void ipip6_tunnel_init(struct net_device *dev)
1111 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); 1154 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1112 1155
1113 ipip6_tunnel_bind_dev(dev); 1156 ipip6_tunnel_bind_dev(dev);
1157 dev->tstats = alloc_percpu(struct pcpu_tstats);
1158 if (!dev->tstats)
1159 return -ENOMEM;
1160
1161 return 0;
1114} 1162}
1115 1163
1116static void __net_init ipip6_fb_tunnel_init(struct net_device *dev) 1164static int __net_init ipip6_fb_tunnel_init(struct net_device *dev)
1117{ 1165{
1118 struct ip_tunnel *tunnel = netdev_priv(dev); 1166 struct ip_tunnel *tunnel = netdev_priv(dev);
1119 struct iphdr *iph = &tunnel->parms.iph; 1167 struct iphdr *iph = &tunnel->parms.iph;
@@ -1128,11 +1176,15 @@ static void __net_init ipip6_fb_tunnel_init(struct net_device *dev)
1128 iph->ihl = 5; 1176 iph->ihl = 5;
1129 iph->ttl = 64; 1177 iph->ttl = 64;
1130 1178
1179 dev->tstats = alloc_percpu(struct pcpu_tstats);
1180 if (!dev->tstats)
1181 return -ENOMEM;
1131 dev_hold(dev); 1182 dev_hold(dev);
1132 sitn->tunnels_wc[0] = tunnel; 1183 sitn->tunnels_wc[0] = tunnel;
1184 return 0;
1133} 1185}
1134 1186
1135static struct xfrm_tunnel sit_handler = { 1187static struct xfrm_tunnel sit_handler __read_mostly = {
1136 .handler = ipip6_rcv, 1188 .handler = ipip6_rcv,
1137 .err_handler = ipip6_err, 1189 .err_handler = ipip6_err,
1138 .priority = 1, 1190 .priority = 1,
@@ -1173,7 +1225,10 @@ static int __net_init sit_init_net(struct net *net)
1173 } 1225 }
1174 dev_net_set(sitn->fb_tunnel_dev, net); 1226 dev_net_set(sitn->fb_tunnel_dev, net);
1175 1227
1176 ipip6_fb_tunnel_init(sitn->fb_tunnel_dev); 1228 err = ipip6_fb_tunnel_init(sitn->fb_tunnel_dev);
1229 if (err)
1230 goto err_dev_free;
1231
1177 ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn); 1232 ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn);
1178 1233
1179 if ((err = register_netdev(sitn->fb_tunnel_dev))) 1234 if ((err = register_netdev(sitn->fb_tunnel_dev)))
@@ -1183,7 +1238,8 @@ static int __net_init sit_init_net(struct net *net)
1183 1238
1184err_reg_dev: 1239err_reg_dev:
1185 dev_put(sitn->fb_tunnel_dev); 1240 dev_put(sitn->fb_tunnel_dev);
1186 free_netdev(sitn->fb_tunnel_dev); 1241err_dev_free:
1242 ipip6_dev_free(sitn->fb_tunnel_dev);
1187err_alloc_dev: 1243err_alloc_dev:
1188 return err; 1244 return err;
1189} 1245}
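sit's frequently written packet and byte counters now live in per-CPU pcpu_tstats, allocated with alloc_percpu() in both tunnel init paths and freed by the new ipip6_dev_free() destructor: the rx and tx hot paths bump only the local CPU's copy through this_cpu_ptr(), and ipip6_get_stats() folds every CPU back into dev->stats on demand, which fits the lockless-transmit NETIF_F_LLTX flag added above. A condensed sketch of the pattern, using the structure and field names from the hunks:

/* Fast path: touch only this CPU's counters, no locks or atomics. */
static void example_count_rx(struct net_device *dev, const struct sk_buff *skb)
{
	struct pcpu_tstats *tstats = this_cpu_ptr(dev->tstats);

	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
}

/* Slow path: fold every CPU's copy into the shared netdev stats. */
static struct net_device_stats *example_get_stats(struct net_device *dev)
{
	struct pcpu_tstats sum = { 0 };
	int i;

	for_each_possible_cpu(i) {
		const struct pcpu_tstats *t = per_cpu_ptr(dev->tstats, i);

		sum.rx_packets += t->rx_packets;
		sum.rx_bytes   += t->rx_bytes;
		sum.tx_packets += t->tx_packets;
		sum.tx_bytes   += t->tx_bytes;
	}
	dev->stats.rx_packets = sum.rx_packets;
	dev->stats.rx_bytes   = sum.rx_bytes;
	dev->stats.tx_packets = sum.tx_packets;
	dev->stats.tx_bytes   = sum.tx_bytes;
	return &dev->stats;
}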
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index fe6d40418c0b..7e41e2cbb85e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -139,7 +139,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
139 return -EINVAL; 139 return -EINVAL;
140 140
141 if (usin->sin6_family != AF_INET6) 141 if (usin->sin6_family != AF_INET6)
142 return(-EAFNOSUPPORT); 142 return -EAFNOSUPPORT;
143 143
144 memset(&fl, 0, sizeof(fl)); 144 memset(&fl, 0, sizeof(fl));
145 145
@@ -1409,7 +1409,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1409 1409
1410 newsk = tcp_create_openreq_child(sk, req, skb); 1410 newsk = tcp_create_openreq_child(sk, req, skb);
1411 if (newsk == NULL) 1411 if (newsk == NULL)
1412 goto out; 1412 goto out_nonewsk;
1413 1413
1414 /* 1414 /*
1415 * No need to charge this sock to the relevant IPv6 refcnt debug socks 1415 * No need to charge this sock to the relevant IPv6 refcnt debug socks
@@ -1497,18 +1497,22 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1497 } 1497 }
1498#endif 1498#endif
1499 1499
1500 if (__inet_inherit_port(sk, newsk) < 0) {
1501 sock_put(newsk);
1502 goto out;
1503 }
1500 __inet6_hash(newsk, NULL); 1504 __inet6_hash(newsk, NULL);
1501 __inet_inherit_port(sk, newsk);
1502 1505
1503 return newsk; 1506 return newsk;
1504 1507
1505out_overflow: 1508out_overflow:
1506 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1509 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1507out: 1510out_nonewsk:
1508 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1509 if (opt && opt != np->opt) 1511 if (opt && opt != np->opt)
1510 sock_kfree_s(sk, opt, opt->tot_len); 1512 sock_kfree_s(sk, opt, opt->tot_len);
1511 dst_release(dst); 1513 dst_release(dst);
1514out:
1515 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1512 return NULL; 1516 return NULL;
1513} 1517}
1514 1518
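The tcp_ipv6.c hunk moves __inet_inherit_port() ahead of __inet6_hash() and splits the error path into out_nonewsk/out so each label cleans up only the state that actually exists at that point. A small standalone sketch of that label-per-stage unwinding style (resource names invented, not the kernel's cleanup order):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Label-per-stage unwinding: each label releases exactly what was
 * acquired before that failure point, in reverse order, and the final
 * label holds the shared failure accounting. */
static char *build_record(size_t payload)
{
	char *hdr, *body, *rec = NULL;

	hdr = calloc(1, 16);
	if (!hdr)
		goto out;               /* nothing to undo yet */

	body = calloc(1, payload);
	if (!body)
		goto out_free_hdr;      /* undo stage 1 only */

	rec = calloc(1, 16 + payload);
	if (!rec)
		goto out_free_body;     /* undo stages 1 and 2 */

	memcpy(rec, hdr, 16);
	memcpy(rec + 16, body, payload);

out_free_body:
	free(body);
out_free_hdr:
	free(hdr);
out:
	if (!rec)
		fprintf(stderr, "build_record: dropped\n"); /* shared failure path */
	return rec;
}

int main(void)
{
	free(build_record(32));
	return 0;
}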
diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c
index fc3c86a47452..4f3cec12aa85 100644
--- a/net/ipv6/tunnel6.c
+++ b/net/ipv6/tunnel6.c
@@ -30,28 +30,31 @@
30#include <net/protocol.h> 30#include <net/protocol.h>
31#include <net/xfrm.h> 31#include <net/xfrm.h>
32 32
33static struct xfrm6_tunnel *tunnel6_handlers; 33static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly;
34static struct xfrm6_tunnel *tunnel46_handlers; 34static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly;
35static DEFINE_MUTEX(tunnel6_mutex); 35static DEFINE_MUTEX(tunnel6_mutex);
36 36
37int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) 37int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family)
38{ 38{
39 struct xfrm6_tunnel **pprev; 39 struct xfrm6_tunnel __rcu **pprev;
40 struct xfrm6_tunnel *t;
40 int ret = -EEXIST; 41 int ret = -EEXIST;
41 int priority = handler->priority; 42 int priority = handler->priority;
42 43
43 mutex_lock(&tunnel6_mutex); 44 mutex_lock(&tunnel6_mutex);
44 45
45 for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers; 46 for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers;
46 *pprev; pprev = &(*pprev)->next) { 47 (t = rcu_dereference_protected(*pprev,
47 if ((*pprev)->priority > priority) 48 lockdep_is_held(&tunnel6_mutex))) != NULL;
49 pprev = &t->next) {
50 if (t->priority > priority)
48 break; 51 break;
49 if ((*pprev)->priority == priority) 52 if (t->priority == priority)
50 goto err; 53 goto err;
51 } 54 }
52 55
53 handler->next = *pprev; 56 handler->next = *pprev;
54 *pprev = handler; 57 rcu_assign_pointer(*pprev, handler);
55 58
56 ret = 0; 59 ret = 0;
57 60
@@ -65,14 +68,17 @@ EXPORT_SYMBOL(xfrm6_tunnel_register);
65 68
66int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) 69int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family)
67{ 70{
68 struct xfrm6_tunnel **pprev; 71 struct xfrm6_tunnel __rcu **pprev;
72 struct xfrm6_tunnel *t;
69 int ret = -ENOENT; 73 int ret = -ENOENT;
70 74
71 mutex_lock(&tunnel6_mutex); 75 mutex_lock(&tunnel6_mutex);
72 76
73 for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers; 77 for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers;
74 *pprev; pprev = &(*pprev)->next) { 78 (t = rcu_dereference_protected(*pprev,
75 if (*pprev == handler) { 79 lockdep_is_held(&tunnel6_mutex))) != NULL;
80 pprev = &t->next) {
81 if (t == handler) {
76 *pprev = handler->next; 82 *pprev = handler->next;
77 ret = 0; 83 ret = 0;
78 break; 84 break;
@@ -88,6 +94,11 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family)
88 94
89EXPORT_SYMBOL(xfrm6_tunnel_deregister); 95EXPORT_SYMBOL(xfrm6_tunnel_deregister);
90 96
97#define for_each_tunnel_rcu(head, handler) \
98 for (handler = rcu_dereference(head); \
99 handler != NULL; \
100 handler = rcu_dereference(handler->next)) \
101
91static int tunnel6_rcv(struct sk_buff *skb) 102static int tunnel6_rcv(struct sk_buff *skb)
92{ 103{
93 struct xfrm6_tunnel *handler; 104 struct xfrm6_tunnel *handler;
@@ -95,7 +106,7 @@ static int tunnel6_rcv(struct sk_buff *skb)
95 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) 106 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
96 goto drop; 107 goto drop;
97 108
98 for (handler = tunnel6_handlers; handler; handler = handler->next) 109 for_each_tunnel_rcu(tunnel6_handlers, handler)
99 if (!handler->handler(skb)) 110 if (!handler->handler(skb))
100 return 0; 111 return 0;
101 112
@@ -113,7 +124,7 @@ static int tunnel46_rcv(struct sk_buff *skb)
113 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 124 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
114 goto drop; 125 goto drop;
115 126
116 for (handler = tunnel46_handlers; handler; handler = handler->next) 127 for_each_tunnel_rcu(tunnel46_handlers, handler)
117 if (!handler->handler(skb)) 128 if (!handler->handler(skb))
118 return 0; 129 return 0;
119 130
@@ -129,7 +140,7 @@ static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
129{ 140{
130 struct xfrm6_tunnel *handler; 141 struct xfrm6_tunnel *handler;
131 142
132 for (handler = tunnel6_handlers; handler; handler = handler->next) 143 for_each_tunnel_rcu(tunnel6_handlers, handler)
133 if (!handler->err_handler(skb, opt, type, code, offset, info)) 144 if (!handler->err_handler(skb, opt, type, code, offset, info))
134 break; 145 break;
135} 146}
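The tunnel6.c changes keep the handler-list logic as it was (insert sorted by ->priority, refuse duplicate priorities, walk the list on receive) but make it RCU-safe with __rcu annotations, rcu_dereference_protected() under the mutex and rcu_assign_pointer() on publish. The list manipulation itself, stripped of the RCU machinery and with made-up names, looks like this:

#include <stdio.h>
#include <stddef.h>

struct handler {
	struct handler *next;
	int priority;
	const char *name;
};

static struct handler *handlers;   /* head of the singly linked list */

/* Insert sorted by ascending priority; -1 if that priority is taken. */
static int handler_register(struct handler *h)
{
	struct handler **pprev;
	struct handler *t;

	for (pprev = &handlers; (t = *pprev) != NULL; pprev = &t->next) {
		if (t->priority > h->priority)
			break;
		if (t->priority == h->priority)
			return -1;
	}
	h->next = *pprev;
	*pprev = h;        /* the kernel publishes this with rcu_assign_pointer() */
	return 0;
}

static int handler_deregister(struct handler *h)
{
	struct handler **pprev;
	struct handler *t;

	for (pprev = &handlers; (t = *pprev) != NULL; pprev = &t->next) {
		if (t == h) {
			*pprev = h->next;
			return 0;
		}
	}
	return -1;
}

int main(void)
{
	struct handler a = { NULL, 2, "xfrm" }, b = { NULL, 1, "sit" };
	struct handler *t;

	handler_register(&a);
	handler_register(&b);
	for (t = handlers; t; t = t->next)     /* receive-side walk */
		printf("%d %s\n", t->priority, t->name);
	handler_deregister(&a);
	handler_deregister(&b);
	return 0;
}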
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 1dd1affdead2..91def93bec85 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -111,10 +111,19 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
111 return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, hash2_nulladdr); 111 return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, hash2_nulladdr);
112} 112}
113 113
114static void udp_v6_rehash(struct sock *sk)
115{
116 u16 new_hash = udp6_portaddr_hash(sock_net(sk),
117 &inet6_sk(sk)->rcv_saddr,
118 inet_sk(sk)->inet_num);
119
120 udp_lib_rehash(sk, new_hash);
121}
122
114static inline int compute_score(struct sock *sk, struct net *net, 123static inline int compute_score(struct sock *sk, struct net *net,
115 unsigned short hnum, 124 unsigned short hnum,
116 struct in6_addr *saddr, __be16 sport, 125 const struct in6_addr *saddr, __be16 sport,
117 struct in6_addr *daddr, __be16 dport, 126 const struct in6_addr *daddr, __be16 dport,
118 int dif) 127 int dif)
119{ 128{
120 int score = -1; 129 int score = -1;
@@ -230,8 +239,8 @@ exact_match:
230} 239}
231 240
232static struct sock *__udp6_lib_lookup(struct net *net, 241static struct sock *__udp6_lib_lookup(struct net *net,
233 struct in6_addr *saddr, __be16 sport, 242 const struct in6_addr *saddr, __be16 sport,
234 struct in6_addr *daddr, __be16 dport, 243 const struct in6_addr *daddr, __be16 dport,
235 int dif, struct udp_table *udptable) 244 int dif, struct udp_table *udptable)
236{ 245{
237 struct sock *sk, *result; 246 struct sock *sk, *result;
@@ -311,6 +320,14 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
311 udptable); 320 udptable);
312} 321}
313 322
323struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
324 const struct in6_addr *daddr, __be16 dport, int dif)
325{
326 return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
327}
328EXPORT_SYMBOL_GPL(udp6_lib_lookup);
329
330
314/* 331/*
315 * This should be easy, if there is something there we 332 * This should be easy, if there is something there we
316 * return it, otherwise we block. 333 * return it, otherwise we block.
@@ -510,7 +527,7 @@ int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
510 } 527 }
511 } 528 }
512 529
513 if (sk->sk_filter) { 530 if (rcu_dereference_raw(sk->sk_filter)) {
514 if (udp_lib_checksum_complete(skb)) 531 if (udp_lib_checksum_complete(skb))
515 goto drop; 532 goto drop;
516 } 533 }
@@ -1447,6 +1464,7 @@ struct proto udpv6_prot = {
1447 .backlog_rcv = udpv6_queue_rcv_skb, 1464 .backlog_rcv = udpv6_queue_rcv_skb,
1448 .hash = udp_lib_hash, 1465 .hash = udp_lib_hash,
1449 .unhash = udp_lib_unhash, 1466 .unhash = udp_lib_unhash,
1467 .rehash = udp_v6_rehash,
1450 .get_port = udp_v6_get_port, 1468 .get_port = udp_v6_get_port,
1451 .memory_allocated = &udp_memory_allocated, 1469 .memory_allocated = &udp_memory_allocated,
1452 .sysctl_mem = sysctl_udp_mem, 1470 .sysctl_mem = sysctl_udp_mem,
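The udp.c hunk wires a ->rehash handler (udp_v6_rehash) into udpv6_prot so a connected socket is moved to the hash bucket that matches its newly chosen source address. A toy, single-threaded sketch of rehash-on-address-change with invented structures (the real code recomputes udp6_portaddr_hash() and calls udp_lib_rehash() under the proper locks):

#include <stdio.h>
#include <stdint.h>

#define BUCKETS 8

struct usock {
	struct usock *next;
	uint32_t addr;
	uint16_t port;
};

static struct usock *table[BUCKETS];

static unsigned int hash(uint32_t addr, uint16_t port)
{
	return (addr * 2654435761u ^ port) % BUCKETS;
}

static void unlink_sock(struct usock *s)
{
	struct usock **pp;

	for (pp = &table[hash(s->addr, s->port)]; *pp; pp = &(*pp)->next) {
		if (*pp == s) {
			*pp = s->next;
			return;
		}
	}
}

static void link_sock(struct usock *s)
{
	unsigned int b = hash(s->addr, s->port);

	s->next = table[b];
	table[b] = s;
}

/* Rehash after the bound address changes, e.g. when connect() pins a
 * source address: unlink from the old bucket before updating the key. */
static void rehash_sock(struct usock *s, uint32_t new_addr)
{
	unlink_sock(s);
	s->addr = new_addr;
	link_sock(s);
}

int main(void)
{
	struct usock s = { NULL, 0, 4242 };     /* bound to the wildcard address */

	link_sock(&s);
	rehash_sock(&s, 0x0a000001);            /* connect() chose 10.0.0.1 */
	printf("bucket %u\n", hash(s.addr, s.port));
	return 0;
}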
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 6baeabbbca82..7e74023ea6e4 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -199,7 +199,7 @@ static inline int xfrm6_garbage_collect(struct dst_ops *ops)
199 struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops); 199 struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops);
200 200
201 xfrm6_policy_afinfo.garbage_collect(net); 201 xfrm6_policy_afinfo.garbage_collect(net);
202 return (atomic_read(&ops->entries) > ops->gc_thresh * 2); 202 return dst_entries_get_fast(ops) > ops->gc_thresh * 2;
203} 203}
204 204
205static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu) 205static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -255,7 +255,6 @@ static struct dst_ops xfrm6_dst_ops = {
255 .ifdown = xfrm6_dst_ifdown, 255 .ifdown = xfrm6_dst_ifdown,
256 .local_out = __ip6_local_out, 256 .local_out = __ip6_local_out,
257 .gc_thresh = 1024, 257 .gc_thresh = 1024,
258 .entries = ATOMIC_INIT(0),
259}; 258};
260 259
261static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { 260static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
@@ -312,11 +311,13 @@ int __init xfrm6_init(void)
312 */ 311 */
313 gc_thresh = FIB6_TABLE_HASHSZ * 8; 312 gc_thresh = FIB6_TABLE_HASHSZ * 8;
314 xfrm6_dst_ops.gc_thresh = (gc_thresh < 1024) ? 1024 : gc_thresh; 313 xfrm6_dst_ops.gc_thresh = (gc_thresh < 1024) ? 1024 : gc_thresh;
314 dst_entries_init(&xfrm6_dst_ops);
315 315
316 ret = xfrm6_policy_init(); 316 ret = xfrm6_policy_init();
317 if (ret) 317 if (ret) {
318 dst_entries_destroy(&xfrm6_dst_ops);
318 goto out; 319 goto out;
319 320 }
320 ret = xfrm6_state_init(); 321 ret = xfrm6_state_init();
321 if (ret) 322 if (ret)
322 goto out_policy; 323 goto out_policy;
@@ -341,4 +342,5 @@ void xfrm6_fini(void)
341 //xfrm6_input_fini(); 342 //xfrm6_input_fini();
342 xfrm6_policy_fini(); 343 xfrm6_policy_fini();
343 xfrm6_state_fini(); 344 xfrm6_state_fini();
345 dst_entries_destroy(&xfrm6_dst_ops);
344} 346}
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index f417b77fa0e1..a67575d472a3 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -20,23 +20,27 @@
20#include <net/addrconf.h> 20#include <net/addrconf.h>
21 21
22static void 22static void
23__xfrm6_init_tempsel(struct xfrm_state *x, struct flowi *fl, 23__xfrm6_init_tempsel(struct xfrm_selector *sel, struct flowi *fl)
24 struct xfrm_tmpl *tmpl,
25 xfrm_address_t *daddr, xfrm_address_t *saddr)
26{ 24{
27 /* Initialize temporary selector matching only 25 /* Initialize temporary selector matching only
28 * to current session. */ 26 * to current session. */
29 ipv6_addr_copy((struct in6_addr *)&x->sel.daddr, &fl->fl6_dst); 27 ipv6_addr_copy((struct in6_addr *)&sel->daddr, &fl->fl6_dst);
30 ipv6_addr_copy((struct in6_addr *)&x->sel.saddr, &fl->fl6_src); 28 ipv6_addr_copy((struct in6_addr *)&sel->saddr, &fl->fl6_src);
31 x->sel.dport = xfrm_flowi_dport(fl); 29 sel->dport = xfrm_flowi_dport(fl);
32 x->sel.dport_mask = htons(0xffff); 30 sel->dport_mask = htons(0xffff);
33 x->sel.sport = xfrm_flowi_sport(fl); 31 sel->sport = xfrm_flowi_sport(fl);
34 x->sel.sport_mask = htons(0xffff); 32 sel->sport_mask = htons(0xffff);
35 x->sel.family = AF_INET6; 33 sel->family = AF_INET6;
36 x->sel.prefixlen_d = 128; 34 sel->prefixlen_d = 128;
37 x->sel.prefixlen_s = 128; 35 sel->prefixlen_s = 128;
38 x->sel.proto = fl->proto; 36 sel->proto = fl->proto;
39 x->sel.ifindex = fl->oif; 37 sel->ifindex = fl->oif;
38}
39
40static void
41xfrm6_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
42 xfrm_address_t *daddr, xfrm_address_t *saddr)
43{
40 x->id = tmpl->id; 44 x->id = tmpl->id;
41 if (ipv6_addr_any((struct in6_addr*)&x->id.daddr)) 45 if (ipv6_addr_any((struct in6_addr*)&x->id.daddr))
42 memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr)); 46 memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr));
@@ -168,6 +172,7 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = {
168 .eth_proto = htons(ETH_P_IPV6), 172 .eth_proto = htons(ETH_P_IPV6),
169 .owner = THIS_MODULE, 173 .owner = THIS_MODULE,
170 .init_tempsel = __xfrm6_init_tempsel, 174 .init_tempsel = __xfrm6_init_tempsel,
175 .init_temprop = xfrm6_init_temprop,
171 .tmpl_sort = __xfrm6_tmpl_sort, 176 .tmpl_sort = __xfrm6_tmpl_sort,
172 .state_sort = __xfrm6_state_sort, 177 .state_sort = __xfrm6_state_sort,
173 .output = xfrm6_output, 178 .output = xfrm6_output,
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index 2ce3a8278f26..2969cad408de 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -199,7 +199,7 @@ static void x6spi_destroy_rcu(struct rcu_head *head)
199 container_of(head, struct xfrm6_tunnel_spi, rcu_head)); 199 container_of(head, struct xfrm6_tunnel_spi, rcu_head));
200} 200}
201 201
202void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr) 202static void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr)
203{ 203{
204 struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); 204 struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
205 struct xfrm6_tunnel_spi *x6spi; 205 struct xfrm6_tunnel_spi *x6spi;
@@ -223,8 +223,6 @@ void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr)
223 spin_unlock_bh(&xfrm6_tunnel_spi_lock); 223 spin_unlock_bh(&xfrm6_tunnel_spi_lock);
224} 224}
225 225
226EXPORT_SYMBOL(xfrm6_tunnel_free_spi);
227
228static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) 226static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
229{ 227{
230 skb_push(skb, -skb_network_offset(skb)); 228 skb_push(skb, -skb_network_offset(skb));
@@ -317,13 +315,13 @@ static const struct xfrm_type xfrm6_tunnel_type = {
317 .output = xfrm6_tunnel_output, 315 .output = xfrm6_tunnel_output,
318}; 316};
319 317
320static struct xfrm6_tunnel xfrm6_tunnel_handler = { 318static struct xfrm6_tunnel xfrm6_tunnel_handler __read_mostly = {
321 .handler = xfrm6_tunnel_rcv, 319 .handler = xfrm6_tunnel_rcv,
322 .err_handler = xfrm6_tunnel_err, 320 .err_handler = xfrm6_tunnel_err,
323 .priority = 2, 321 .priority = 2,
324}; 322};
325 323
326static struct xfrm6_tunnel xfrm46_tunnel_handler = { 324static struct xfrm6_tunnel xfrm46_tunnel_handler __read_mostly = {
327 .handler = xfrm6_tunnel_rcv, 325 .handler = xfrm6_tunnel_rcv,
328 .err_handler = xfrm6_tunnel_err, 326 .err_handler = xfrm6_tunnel_err,
329 .priority = 2, 327 .priority = 2,
diff --git a/net/ipx/Kconfig b/net/ipx/Kconfig
index e9ad0062fbb6..02549cb2c328 100644
--- a/net/ipx/Kconfig
+++ b/net/ipx/Kconfig
@@ -3,6 +3,7 @@
3# 3#
4config IPX 4config IPX
5 tristate "The IPX protocol" 5 tristate "The IPX protocol"
6 depends on BKL # should be fixable
6 select LLC 7 select LLC
7 ---help--- 8 ---help---
8 This is support for the Novell networking protocol, IPX, commonly 9 This is support for the Novell networking protocol, IPX, commonly
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 79986a674f6e..7f097989cde2 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -573,9 +573,9 @@ static int irda_find_lsap_sel(struct irda_sock *self, char *name)
573 /* Requested object/attribute doesn't exist */ 573 /* Requested object/attribute doesn't exist */
574 if((self->errno == IAS_CLASS_UNKNOWN) || 574 if((self->errno == IAS_CLASS_UNKNOWN) ||
575 (self->errno == IAS_ATTRIB_UNKNOWN)) 575 (self->errno == IAS_ATTRIB_UNKNOWN))
576 return (-EADDRNOTAVAIL); 576 return -EADDRNOTAVAIL;
577 else 577 else
578 return (-EHOSTUNREACH); 578 return -EHOSTUNREACH;
579 } 579 }
580 580
581 /* Get the remote TSAP selector */ 581 /* Get the remote TSAP selector */
@@ -663,7 +663,7 @@ static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name)
663 __func__, name); 663 __func__, name);
664 self->daddr = DEV_ADDR_ANY; 664 self->daddr = DEV_ADDR_ANY;
665 kfree(discoveries); 665 kfree(discoveries);
666 return(-ENOTUNIQ); 666 return -ENOTUNIQ;
667 } 667 }
668 /* First time we found that one, save it ! */ 668 /* First time we found that one, save it ! */
669 daddr = self->daddr; 669 daddr = self->daddr;
@@ -677,7 +677,7 @@ static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name)
677 IRDA_DEBUG(0, "%s(), unexpected IAS query failure\n", __func__); 677 IRDA_DEBUG(0, "%s(), unexpected IAS query failure\n", __func__);
678 self->daddr = DEV_ADDR_ANY; 678 self->daddr = DEV_ADDR_ANY;
679 kfree(discoveries); 679 kfree(discoveries);
680 return(-EHOSTUNREACH); 680 return -EHOSTUNREACH;
681 break; 681 break;
682 } 682 }
683 } 683 }
@@ -689,7 +689,7 @@ static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name)
689 IRDA_DEBUG(1, "%s(), cannot discover service ''%s'' in any device !!!\n", 689 IRDA_DEBUG(1, "%s(), cannot discover service ''%s'' in any device !!!\n",
690 __func__, name); 690 __func__, name);
691 self->daddr = DEV_ADDR_ANY; 691 self->daddr = DEV_ADDR_ANY;
692 return(-EADDRNOTAVAIL); 692 return -EADDRNOTAVAIL;
693 } 693 }
694 694
695 /* Revert back to discovered device & service */ 695 /* Revert back to discovered device & service */
@@ -715,14 +715,11 @@ static int irda_getname(struct socket *sock, struct sockaddr *uaddr,
715 struct sockaddr_irda saddr; 715 struct sockaddr_irda saddr;
716 struct sock *sk = sock->sk; 716 struct sock *sk = sock->sk;
717 struct irda_sock *self = irda_sk(sk); 717 struct irda_sock *self = irda_sk(sk);
718 int err;
719 718
720 lock_kernel();
721 memset(&saddr, 0, sizeof(saddr)); 719 memset(&saddr, 0, sizeof(saddr));
722 if (peer) { 720 if (peer) {
723 err = -ENOTCONN;
724 if (sk->sk_state != TCP_ESTABLISHED) 721 if (sk->sk_state != TCP_ESTABLISHED)
725 goto out; 722 return -ENOTCONN;
726 723
727 saddr.sir_family = AF_IRDA; 724 saddr.sir_family = AF_IRDA;
728 saddr.sir_lsap_sel = self->dtsap_sel; 725 saddr.sir_lsap_sel = self->dtsap_sel;
@@ -739,10 +736,8 @@ static int irda_getname(struct socket *sock, struct sockaddr *uaddr,
739 /* uaddr_len come to us uninitialised */ 736 /* uaddr_len come to us uninitialised */
740 *uaddr_len = sizeof (struct sockaddr_irda); 737 *uaddr_len = sizeof (struct sockaddr_irda);
741 memcpy(uaddr, &saddr, *uaddr_len); 738 memcpy(uaddr, &saddr, *uaddr_len);
742 err = 0; 739
743out: 740 return 0;
744 unlock_kernel();
745 return err;
746} 741}
747 742
748/* 743/*
@@ -758,7 +753,8 @@ static int irda_listen(struct socket *sock, int backlog)
758 753
759 IRDA_DEBUG(2, "%s()\n", __func__); 754 IRDA_DEBUG(2, "%s()\n", __func__);
760 755
761 lock_kernel(); 756 lock_sock(sk);
757
762 if ((sk->sk_type != SOCK_STREAM) && (sk->sk_type != SOCK_SEQPACKET) && 758 if ((sk->sk_type != SOCK_STREAM) && (sk->sk_type != SOCK_SEQPACKET) &&
763 (sk->sk_type != SOCK_DGRAM)) 759 (sk->sk_type != SOCK_DGRAM))
764 goto out; 760 goto out;
@@ -770,7 +766,7 @@ static int irda_listen(struct socket *sock, int backlog)
770 err = 0; 766 err = 0;
771 } 767 }
772out: 768out:
773 unlock_kernel(); 769 release_sock(sk);
774 770
775 return err; 771 return err;
776} 772}
@@ -793,7 +789,7 @@ static int irda_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
793 if (addr_len != sizeof(struct sockaddr_irda)) 789 if (addr_len != sizeof(struct sockaddr_irda))
794 return -EINVAL; 790 return -EINVAL;
795 791
796 lock_kernel(); 792 lock_sock(sk);
797#ifdef CONFIG_IRDA_ULTRA 793#ifdef CONFIG_IRDA_ULTRA
798 /* Special care for Ultra sockets */ 794 /* Special care for Ultra sockets */
799 if ((sk->sk_type == SOCK_DGRAM) && 795 if ((sk->sk_type == SOCK_DGRAM) &&
@@ -824,8 +820,8 @@ static int irda_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
824 820
825 err = irda_open_tsap(self, addr->sir_lsap_sel, addr->sir_name); 821 err = irda_open_tsap(self, addr->sir_lsap_sel, addr->sir_name);
826 if (err < 0) { 822 if (err < 0) {
827 kfree(self->ias_obj->name); 823 irias_delete_object(self->ias_obj);
828 kfree(self->ias_obj); 824 self->ias_obj = NULL;
829 goto out; 825 goto out;
830 } 826 }
831 827
@@ -836,7 +832,7 @@ static int irda_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
836 832
837 err = 0; 833 err = 0;
838out: 834out:
839 unlock_kernel(); 835 release_sock(sk);
840 return err; 836 return err;
841} 837}
842 838
@@ -856,12 +852,13 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
856 852
857 IRDA_DEBUG(2, "%s()\n", __func__); 853 IRDA_DEBUG(2, "%s()\n", __func__);
858 854
859 lock_kernel();
860 err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0); 855 err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0);
861 if (err) 856 if (err)
862 goto out; 857 return err;
863 858
864 err = -EINVAL; 859 err = -EINVAL;
860
861 lock_sock(sk);
865 if (sock->state != SS_UNCONNECTED) 862 if (sock->state != SS_UNCONNECTED)
866 goto out; 863 goto out;
867 864
@@ -947,7 +944,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
947 irda_connect_response(new); 944 irda_connect_response(new);
948 err = 0; 945 err = 0;
949out: 946out:
950 unlock_kernel(); 947 release_sock(sk);
951 return err; 948 return err;
952} 949}
953 950
@@ -981,7 +978,7 @@ static int irda_connect(struct socket *sock, struct sockaddr *uaddr,
981 978
982 IRDA_DEBUG(2, "%s(%p)\n", __func__, self); 979 IRDA_DEBUG(2, "%s(%p)\n", __func__, self);
983 980
984 lock_kernel(); 981 lock_sock(sk);
985 /* Don't allow connect for Ultra sockets */ 982 /* Don't allow connect for Ultra sockets */
986 err = -ESOCKTNOSUPPORT; 983 err = -ESOCKTNOSUPPORT;
987 if ((sk->sk_type == SOCK_DGRAM) && (sk->sk_protocol == IRDAPROTO_ULTRA)) 984 if ((sk->sk_type == SOCK_DGRAM) && (sk->sk_protocol == IRDAPROTO_ULTRA))
@@ -1072,6 +1069,8 @@ static int irda_connect(struct socket *sock, struct sockaddr *uaddr,
1072 1069
1073 if (sk->sk_state != TCP_ESTABLISHED) { 1070 if (sk->sk_state != TCP_ESTABLISHED) {
1074 sock->state = SS_UNCONNECTED; 1071 sock->state = SS_UNCONNECTED;
1072 if (sk->sk_prot->disconnect(sk, flags))
1073 sock->state = SS_DISCONNECTING;
1075 err = sock_error(sk); 1074 err = sock_error(sk);
1076 if (!err) 1075 if (!err)
1077 err = -ECONNRESET; 1076 err = -ECONNRESET;
@@ -1084,7 +1083,7 @@ static int irda_connect(struct socket *sock, struct sockaddr *uaddr,
1084 self->saddr = irttp_get_saddr(self->tsap); 1083 self->saddr = irttp_get_saddr(self->tsap);
1085 err = 0; 1084 err = 0;
1086out: 1085out:
1087 unlock_kernel(); 1086 release_sock(sk);
1088 return err; 1087 return err;
1089} 1088}
1090 1089
@@ -1231,7 +1230,6 @@ static int irda_release(struct socket *sock)
1231 if (sk == NULL) 1230 if (sk == NULL)
1232 return 0; 1231 return 0;
1233 1232
1234 lock_kernel();
1235 lock_sock(sk); 1233 lock_sock(sk);
1236 sk->sk_state = TCP_CLOSE; 1234 sk->sk_state = TCP_CLOSE;
1237 sk->sk_shutdown |= SEND_SHUTDOWN; 1235 sk->sk_shutdown |= SEND_SHUTDOWN;
@@ -1250,7 +1248,6 @@ static int irda_release(struct socket *sock)
1250 /* Destroy networking socket if we are the last reference on it, 1248 /* Destroy networking socket if we are the last reference on it,
1251 * i.e. if(sk->sk_refcnt == 0) -> sk_free(sk) */ 1249 * i.e. if(sk->sk_refcnt == 0) -> sk_free(sk) */
1252 sock_put(sk); 1250 sock_put(sk);
1253 unlock_kernel();
1254 1251
1255 /* Notes on socket locking and deallocation... - Jean II 1252 /* Notes on socket locking and deallocation... - Jean II
1256 * In theory we should put pairs of sock_hold() / sock_put() to 1253 * In theory we should put pairs of sock_hold() / sock_put() to
@@ -1298,7 +1295,6 @@ static int irda_sendmsg(struct kiocb *iocb, struct socket *sock,
1298 1295
1299 IRDA_DEBUG(4, "%s(), len=%zd\n", __func__, len); 1296 IRDA_DEBUG(4, "%s(), len=%zd\n", __func__, len);
1300 1297
1301 lock_kernel();
1302 /* Note : socket.c set MSG_EOR on SEQPACKET sockets */ 1298 /* Note : socket.c set MSG_EOR on SEQPACKET sockets */
1303 if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_EOR | MSG_CMSG_COMPAT | 1299 if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_EOR | MSG_CMSG_COMPAT |
1304 MSG_NOSIGNAL)) { 1300 MSG_NOSIGNAL)) {
@@ -1306,6 +1302,8 @@ static int irda_sendmsg(struct kiocb *iocb, struct socket *sock,
1306 goto out; 1302 goto out;
1307 } 1303 }
1308 1304
1305 lock_sock(sk);
1306
1309 if (sk->sk_shutdown & SEND_SHUTDOWN) 1307 if (sk->sk_shutdown & SEND_SHUTDOWN)
1310 goto out_err; 1308 goto out_err;
1311 1309
@@ -1361,14 +1359,14 @@ static int irda_sendmsg(struct kiocb *iocb, struct socket *sock,
1361 goto out_err; 1359 goto out_err;
1362 } 1360 }
1363 1361
1364 unlock_kernel(); 1362 release_sock(sk);
1365 /* Tell client how much data we actually sent */ 1363 /* Tell client how much data we actually sent */
1366 return len; 1364 return len;
1367 1365
1368out_err: 1366out_err:
1369 err = sk_stream_error(sk, msg->msg_flags, err); 1367 err = sk_stream_error(sk, msg->msg_flags, err);
1370out: 1368out:
1371 unlock_kernel(); 1369 release_sock(sk);
1372 return err; 1370 return err;
1373 1371
1374} 1372}
@@ -1390,14 +1388,10 @@ static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock,
1390 1388
1391 IRDA_DEBUG(4, "%s()\n", __func__); 1389 IRDA_DEBUG(4, "%s()\n", __func__);
1392 1390
1393 lock_kernel();
1394 if ((err = sock_error(sk)) < 0)
1395 goto out;
1396
1397 skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, 1391 skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
1398 flags & MSG_DONTWAIT, &err); 1392 flags & MSG_DONTWAIT, &err);
1399 if (!skb) 1393 if (!skb)
1400 goto out; 1394 return err;
1401 1395
1402 skb_reset_transport_header(skb); 1396 skb_reset_transport_header(skb);
1403 copied = skb->len; 1397 copied = skb->len;
@@ -1425,12 +1419,8 @@ static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock,
1425 irttp_flow_request(self->tsap, FLOW_START); 1419 irttp_flow_request(self->tsap, FLOW_START);
1426 } 1420 }
1427 } 1421 }
1428 unlock_kernel();
1429 return copied;
1430 1422
1431out: 1423 return copied;
1432 unlock_kernel();
1433 return err;
1434} 1424}
1435 1425
1436/* 1426/*
@@ -1448,17 +1438,15 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock,
1448 1438
1449 IRDA_DEBUG(3, "%s()\n", __func__); 1439 IRDA_DEBUG(3, "%s()\n", __func__);
1450 1440
1451 lock_kernel();
1452 if ((err = sock_error(sk)) < 0) 1441 if ((err = sock_error(sk)) < 0)
1453 goto out; 1442 return err;
1454 1443
1455 err = -EINVAL;
1456 if (sock->flags & __SO_ACCEPTCON) 1444 if (sock->flags & __SO_ACCEPTCON)
1457 goto out; 1445 return -EINVAL;
1458 1446
1459 err =-EOPNOTSUPP; 1447 err =-EOPNOTSUPP;
1460 if (flags & MSG_OOB) 1448 if (flags & MSG_OOB)
1461 goto out; 1449 return -EOPNOTSUPP;
1462 1450
1463 err = 0; 1451 err = 0;
1464 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); 1452 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
@@ -1500,7 +1488,7 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock,
1500 finish_wait(sk_sleep(sk), &wait); 1488 finish_wait(sk_sleep(sk), &wait);
1501 1489
1502 if (err) 1490 if (err)
1503 goto out; 1491 return err;
1504 if (sk->sk_shutdown & RCV_SHUTDOWN) 1492 if (sk->sk_shutdown & RCV_SHUTDOWN)
1505 break; 1493 break;
1506 1494
@@ -1553,9 +1541,7 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock,
1553 } 1541 }
1554 } 1542 }
1555 1543
1556out: 1544 return copied;
1557 unlock_kernel();
1558 return err ? : copied;
1559} 1545}
1560 1546
1561/* 1547/*
@@ -1573,13 +1559,12 @@ static int irda_sendmsg_dgram(struct kiocb *iocb, struct socket *sock,
1573 struct sk_buff *skb; 1559 struct sk_buff *skb;
1574 int err; 1560 int err;
1575 1561
1576 lock_kernel();
1577
1578 IRDA_DEBUG(4, "%s(), len=%zd\n", __func__, len); 1562 IRDA_DEBUG(4, "%s(), len=%zd\n", __func__, len);
1579 1563
1580 err = -EINVAL;
1581 if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT)) 1564 if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT))
1582 goto out; 1565 return -EINVAL;
1566
1567 lock_sock(sk);
1583 1568
1584 if (sk->sk_shutdown & SEND_SHUTDOWN) { 1569 if (sk->sk_shutdown & SEND_SHUTDOWN) {
1585 send_sig(SIGPIPE, current, 0); 1570 send_sig(SIGPIPE, current, 0);
@@ -1630,10 +1615,12 @@ static int irda_sendmsg_dgram(struct kiocb *iocb, struct socket *sock,
1630 IRDA_DEBUG(0, "%s(), err=%d\n", __func__, err); 1615 IRDA_DEBUG(0, "%s(), err=%d\n", __func__, err);
1631 goto out; 1616 goto out;
1632 } 1617 }
1633 unlock_kernel(); 1618
1619 release_sock(sk);
1634 return len; 1620 return len;
1621
1635out: 1622out:
1636 unlock_kernel(); 1623 release_sock(sk);
1637 return err; 1624 return err;
1638} 1625}
1639 1626
@@ -1656,10 +1643,11 @@ static int irda_sendmsg_ultra(struct kiocb *iocb, struct socket *sock,
1656 1643
1657 IRDA_DEBUG(4, "%s(), len=%zd\n", __func__, len); 1644 IRDA_DEBUG(4, "%s(), len=%zd\n", __func__, len);
1658 1645
1659 lock_kernel();
1660 err = -EINVAL; 1646 err = -EINVAL;
1661 if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT)) 1647 if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT))
1662 goto out; 1648 return -EINVAL;
1649
1650 lock_sock(sk);
1663 1651
1664 err = -EPIPE; 1652 err = -EPIPE;
1665 if (sk->sk_shutdown & SEND_SHUTDOWN) { 1653 if (sk->sk_shutdown & SEND_SHUTDOWN) {
@@ -1732,7 +1720,7 @@ static int irda_sendmsg_ultra(struct kiocb *iocb, struct socket *sock,
1732 if (err) 1720 if (err)
1733 IRDA_DEBUG(0, "%s(), err=%d\n", __func__, err); 1721 IRDA_DEBUG(0, "%s(), err=%d\n", __func__, err);
1734out: 1722out:
1735 unlock_kernel(); 1723 release_sock(sk);
1736 return err ? : len; 1724 return err ? : len;
1737} 1725}
1738#endif /* CONFIG_IRDA_ULTRA */ 1726#endif /* CONFIG_IRDA_ULTRA */
@@ -1747,7 +1735,7 @@ static int irda_shutdown(struct socket *sock, int how)
1747 1735
1748 IRDA_DEBUG(1, "%s(%p)\n", __func__, self); 1736 IRDA_DEBUG(1, "%s(%p)\n", __func__, self);
1749 1737
1750 lock_kernel(); 1738 lock_sock(sk);
1751 1739
1752 sk->sk_state = TCP_CLOSE; 1740 sk->sk_state = TCP_CLOSE;
1753 sk->sk_shutdown |= SEND_SHUTDOWN; 1741 sk->sk_shutdown |= SEND_SHUTDOWN;
@@ -1769,7 +1757,7 @@ static int irda_shutdown(struct socket *sock, int how)
1769 self->daddr = DEV_ADDR_ANY; /* Until we get re-connected */ 1757 self->daddr = DEV_ADDR_ANY; /* Until we get re-connected */
1770 self->saddr = 0x0; /* so IrLMP assign us any link */ 1758 self->saddr = 0x0; /* so IrLMP assign us any link */
1771 1759
1772 unlock_kernel(); 1760 release_sock(sk);
1773 1761
1774 return 0; 1762 return 0;
1775} 1763}
@@ -1786,7 +1774,6 @@ static unsigned int irda_poll(struct file * file, struct socket *sock,
1786 1774
1787 IRDA_DEBUG(4, "%s()\n", __func__); 1775 IRDA_DEBUG(4, "%s()\n", __func__);
1788 1776
1789 lock_kernel();
1790 poll_wait(file, sk_sleep(sk), wait); 1777 poll_wait(file, sk_sleep(sk), wait);
1791 mask = 0; 1778 mask = 0;
1792 1779
@@ -1834,20 +1821,8 @@ static unsigned int irda_poll(struct file * file, struct socket *sock,
1834 default: 1821 default:
1835 break; 1822 break;
1836 } 1823 }
1837 unlock_kernel();
1838 return mask;
1839}
1840
1841static unsigned int irda_datagram_poll(struct file *file, struct socket *sock,
1842 poll_table *wait)
1843{
1844 int err;
1845
1846 lock_kernel();
1847 err = datagram_poll(file, sock, wait);
1848 unlock_kernel();
1849 1824
1850 return err; 1825 return mask;
1851} 1826}
1852 1827
1853/* 1828/*
@@ -1860,7 +1835,6 @@ static int irda_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1860 1835
1861 IRDA_DEBUG(4, "%s(), cmd=%#x\n", __func__, cmd); 1836 IRDA_DEBUG(4, "%s(), cmd=%#x\n", __func__, cmd);
1862 1837
1863 lock_kernel();
1864 err = -EINVAL; 1838 err = -EINVAL;
1865 switch (cmd) { 1839 switch (cmd) {
1866 case TIOCOUTQ: { 1840 case TIOCOUTQ: {
@@ -1903,7 +1877,6 @@ static int irda_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1903 IRDA_DEBUG(1, "%s(), doing device ioctl!\n", __func__); 1877 IRDA_DEBUG(1, "%s(), doing device ioctl!\n", __func__);
1904 err = -ENOIOCTLCMD; 1878 err = -ENOIOCTLCMD;
1905 } 1879 }
1906 unlock_kernel();
1907 1880
1908 return err; 1881 return err;
1909} 1882}
@@ -1927,7 +1900,7 @@ static int irda_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon
1927 * Set some options for the socket 1900 * Set some options for the socket
1928 * 1901 *
1929 */ 1902 */
1930static int __irda_setsockopt(struct socket *sock, int level, int optname, 1903static int irda_setsockopt(struct socket *sock, int level, int optname,
1931 char __user *optval, unsigned int optlen) 1904 char __user *optval, unsigned int optlen)
1932{ 1905{
1933 struct sock *sk = sock->sk; 1906 struct sock *sk = sock->sk;
@@ -1935,13 +1908,15 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
1935 struct irda_ias_set *ias_opt; 1908 struct irda_ias_set *ias_opt;
1936 struct ias_object *ias_obj; 1909 struct ias_object *ias_obj;
1937 struct ias_attrib * ias_attr; /* Attribute in IAS object */ 1910 struct ias_attrib * ias_attr; /* Attribute in IAS object */
1938 int opt, free_ias = 0; 1911 int opt, free_ias = 0, err = 0;
1939 1912
1940 IRDA_DEBUG(2, "%s(%p)\n", __func__, self); 1913 IRDA_DEBUG(2, "%s(%p)\n", __func__, self);
1941 1914
1942 if (level != SOL_IRLMP) 1915 if (level != SOL_IRLMP)
1943 return -ENOPROTOOPT; 1916 return -ENOPROTOOPT;
1944 1917
1918 lock_sock(sk);
1919
1945 switch (optname) { 1920 switch (optname) {
1946 case IRLMP_IAS_SET: 1921 case IRLMP_IAS_SET:
1947 /* The user want to add an attribute to an existing IAS object 1922 /* The user want to add an attribute to an existing IAS object
@@ -1951,17 +1926,22 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
1951 * create the right attribute... 1926 * create the right attribute...
1952 */ 1927 */
1953 1928
1954 if (optlen != sizeof(struct irda_ias_set)) 1929 if (optlen != sizeof(struct irda_ias_set)) {
1955 return -EINVAL; 1930 err = -EINVAL;
1931 goto out;
1932 }
1956 1933
1957 ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC); 1934 ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC);
1958 if (ias_opt == NULL) 1935 if (ias_opt == NULL) {
1959 return -ENOMEM; 1936 err = -ENOMEM;
1937 goto out;
1938 }
1960 1939
1961 /* Copy query to the driver. */ 1940 /* Copy query to the driver. */
1962 if (copy_from_user(ias_opt, optval, optlen)) { 1941 if (copy_from_user(ias_opt, optval, optlen)) {
1963 kfree(ias_opt); 1942 kfree(ias_opt);
1964 return -EFAULT; 1943 err = -EFAULT;
1944 goto out;
1965 } 1945 }
1966 1946
1967 /* Find the object we target. 1947 /* Find the object we target.
@@ -1971,7 +1951,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
1971 if(ias_opt->irda_class_name[0] == '\0') { 1951 if(ias_opt->irda_class_name[0] == '\0') {
1972 if(self->ias_obj == NULL) { 1952 if(self->ias_obj == NULL) {
1973 kfree(ias_opt); 1953 kfree(ias_opt);
1974 return -EINVAL; 1954 err = -EINVAL;
1955 goto out;
1975 } 1956 }
1976 ias_obj = self->ias_obj; 1957 ias_obj = self->ias_obj;
1977 } else 1958 } else
@@ -1983,7 +1964,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
1983 if((!capable(CAP_NET_ADMIN)) && 1964 if((!capable(CAP_NET_ADMIN)) &&
1984 ((ias_obj == NULL) || (ias_obj != self->ias_obj))) { 1965 ((ias_obj == NULL) || (ias_obj != self->ias_obj))) {
1985 kfree(ias_opt); 1966 kfree(ias_opt);
1986 return -EPERM; 1967 err = -EPERM;
1968 goto out;
1987 } 1969 }
1988 1970
1989 /* If the object doesn't exist, create it */ 1971 /* If the object doesn't exist, create it */
@@ -1993,7 +1975,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
1993 jiffies); 1975 jiffies);
1994 if (ias_obj == NULL) { 1976 if (ias_obj == NULL) {
1995 kfree(ias_opt); 1977 kfree(ias_opt);
1996 return -ENOMEM; 1978 err = -ENOMEM;
1979 goto out;
1997 } 1980 }
1998 free_ias = 1; 1981 free_ias = 1;
1999 } 1982 }
@@ -2005,7 +1988,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
2005 kfree(ias_obj->name); 1988 kfree(ias_obj->name);
2006 kfree(ias_obj); 1989 kfree(ias_obj);
2007 } 1990 }
2008 return -EINVAL; 1991 err = -EINVAL;
1992 goto out;
2009 } 1993 }
2010 1994
2011 /* Look at the type */ 1995 /* Look at the type */
@@ -2028,7 +2012,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
2028 kfree(ias_obj); 2012 kfree(ias_obj);
2029 } 2013 }
2030 2014
2031 return -EINVAL; 2015 err = -EINVAL;
2016 goto out;
2032 } 2017 }
2033 /* Add an octet sequence attribute */ 2018 /* Add an octet sequence attribute */
2034 irias_add_octseq_attrib( 2019 irias_add_octseq_attrib(
@@ -2060,7 +2045,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
2060 kfree(ias_obj->name); 2045 kfree(ias_obj->name);
2061 kfree(ias_obj); 2046 kfree(ias_obj);
2062 } 2047 }
2063 return -EINVAL; 2048 err = -EINVAL;
2049 goto out;
2064 } 2050 }
2065 irias_insert_object(ias_obj); 2051 irias_insert_object(ias_obj);
2066 kfree(ias_opt); 2052 kfree(ias_opt);
@@ -2071,17 +2057,22 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
2071 * object is not owned by the kernel and delete it. 2057 * object is not owned by the kernel and delete it.
2072 */ 2058 */
2073 2059
2074 if (optlen != sizeof(struct irda_ias_set)) 2060 if (optlen != sizeof(struct irda_ias_set)) {
2075 return -EINVAL; 2061 err = -EINVAL;
2062 goto out;
2063 }
2076 2064
2077 ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC); 2065 ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC);
2078 if (ias_opt == NULL) 2066 if (ias_opt == NULL) {
2079 return -ENOMEM; 2067 err = -ENOMEM;
2068 goto out;
2069 }
2080 2070
2081 /* Copy query to the driver. */ 2071 /* Copy query to the driver. */
2082 if (copy_from_user(ias_opt, optval, optlen)) { 2072 if (copy_from_user(ias_opt, optval, optlen)) {
2083 kfree(ias_opt); 2073 kfree(ias_opt);
2084 return -EFAULT; 2074 err = -EFAULT;
2075 goto out;
2085 } 2076 }
2086 2077
2087 /* Find the object we target. 2078 /* Find the object we target.
@@ -2094,7 +2085,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
2094 ias_obj = irias_find_object(ias_opt->irda_class_name); 2085 ias_obj = irias_find_object(ias_opt->irda_class_name);
2095 if(ias_obj == (struct ias_object *) NULL) { 2086 if(ias_obj == (struct ias_object *) NULL) {
2096 kfree(ias_opt); 2087 kfree(ias_opt);
2097 return -EINVAL; 2088 err = -EINVAL;
2089 goto out;
2098 } 2090 }
2099 2091
2100 /* Only ROOT can mess with the global IAS database. 2092 /* Only ROOT can mess with the global IAS database.
@@ -2103,7 +2095,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
2103 if((!capable(CAP_NET_ADMIN)) && 2095 if((!capable(CAP_NET_ADMIN)) &&
2104 ((ias_obj == NULL) || (ias_obj != self->ias_obj))) { 2096 ((ias_obj == NULL) || (ias_obj != self->ias_obj))) {
2105 kfree(ias_opt); 2097 kfree(ias_opt);
2106 return -EPERM; 2098 err = -EPERM;
2099 goto out;
2107 } 2100 }
2108 2101
2109 /* Find the attribute (in the object) we target */ 2102 /* Find the attribute (in the object) we target */
@@ -2111,14 +2104,16 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
2111 ias_opt->irda_attrib_name); 2104 ias_opt->irda_attrib_name);
2112 if(ias_attr == (struct ias_attrib *) NULL) { 2105 if(ias_attr == (struct ias_attrib *) NULL) {
2113 kfree(ias_opt); 2106 kfree(ias_opt);
2114 return -EINVAL; 2107 err = -EINVAL;
2108 goto out;
2115 } 2109 }
2116 2110
2117 /* Check is the user space own the object */ 2111 /* Check is the user space own the object */
2118 if(ias_attr->value->owner != IAS_USER_ATTR) { 2112 if(ias_attr->value->owner != IAS_USER_ATTR) {
2119 IRDA_DEBUG(1, "%s(), attempting to delete a kernel attribute\n", __func__); 2113 IRDA_DEBUG(1, "%s(), attempting to delete a kernel attribute\n", __func__);
2120 kfree(ias_opt); 2114 kfree(ias_opt);
2121 return -EPERM; 2115 err = -EPERM;
2116 goto out;
2122 } 2117 }
2123 2118
2124 /* Remove the attribute (and maybe the object) */ 2119 /* Remove the attribute (and maybe the object) */
@@ -2126,11 +2121,15 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
2126 kfree(ias_opt); 2121 kfree(ias_opt);
2127 break; 2122 break;
2128 case IRLMP_MAX_SDU_SIZE: 2123 case IRLMP_MAX_SDU_SIZE:
2129 if (optlen < sizeof(int)) 2124 if (optlen < sizeof(int)) {
2130 return -EINVAL; 2125 err = -EINVAL;
2126 goto out;
2127 }
2131 2128
2132 if (get_user(opt, (int __user *)optval)) 2129 if (get_user(opt, (int __user *)optval)) {
2133 return -EFAULT; 2130 err = -EFAULT;
2131 goto out;
2132 }
2134 2133
2135 /* Only possible for a seqpacket service (TTP with SAR) */ 2134 /* Only possible for a seqpacket service (TTP with SAR) */
2136 if (sk->sk_type != SOCK_SEQPACKET) { 2135 if (sk->sk_type != SOCK_SEQPACKET) {
@@ -2140,16 +2139,21 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
2140 } else { 2139 } else {
2141 IRDA_WARNING("%s: not allowed to set MAXSDUSIZE for this socket type!\n", 2140 IRDA_WARNING("%s: not allowed to set MAXSDUSIZE for this socket type!\n",
2142 __func__); 2141 __func__);
2143 return -ENOPROTOOPT; 2142 err = -ENOPROTOOPT;
2143 goto out;
2144 } 2144 }
2145 break; 2145 break;
2146 case IRLMP_HINTS_SET: 2146 case IRLMP_HINTS_SET:
2147 if (optlen < sizeof(int)) 2147 if (optlen < sizeof(int)) {
2148 return -EINVAL; 2148 err = -EINVAL;
2149 goto out;
2150 }
2149 2151
2150 /* The input is really a (__u8 hints[2]), easier as an int */ 2152 /* The input is really a (__u8 hints[2]), easier as an int */
2151 if (get_user(opt, (int __user *)optval)) 2153 if (get_user(opt, (int __user *)optval)) {
2152 return -EFAULT; 2154 err = -EFAULT;
2155 goto out;
2156 }
2153 2157
2154 /* Unregister any old registration */ 2158 /* Unregister any old registration */
2155 if (self->skey) 2159 if (self->skey)
@@ -2163,12 +2167,16 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
2163 * making a discovery (nodes which don't match any hint 2167 * making a discovery (nodes which don't match any hint
2164 * bit in the mask are not reported). 2168 * bit in the mask are not reported).
2165 */ 2169 */
2166 if (optlen < sizeof(int)) 2170 if (optlen < sizeof(int)) {
2167 return -EINVAL; 2171 err = -EINVAL;
2172 goto out;
2173 }
2168 2174
2169 /* The input is really a (__u8 hints[2]), easier as an int */ 2175 /* The input is really a (__u8 hints[2]), easier as an int */
2170 if (get_user(opt, (int __user *)optval)) 2176 if (get_user(opt, (int __user *)optval)) {
2171 return -EFAULT; 2177 err = -EFAULT;
2178 goto out;
2179 }
2172 2180
2173 /* Set the new hint mask */ 2181 /* Set the new hint mask */
2174 self->mask.word = (__u16) opt; 2182 self->mask.word = (__u16) opt;
@@ -2180,19 +2188,12 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
2180 2188
2181 break; 2189 break;
2182 default: 2190 default:
2183 return -ENOPROTOOPT; 2191 err = -ENOPROTOOPT;
2192 break;
2184 } 2193 }
2185 return 0;
2186}
2187
2188static int irda_setsockopt(struct socket *sock, int level, int optname,
2189 char __user *optval, unsigned int optlen)
2190{
2191 int err;
2192 2194
2193 lock_kernel(); 2195out:
2194 err = __irda_setsockopt(sock, level, optname, optval, optlen); 2196 release_sock(sk);
2195 unlock_kernel();
2196 2197
2197 return err; 2198 return err;
2198} 2199}
@@ -2249,7 +2250,7 @@ static int irda_extract_ias_value(struct irda_ias_set *ias_opt,
2249/* 2250/*
2250 * Function irda_getsockopt (sock, level, optname, optval, optlen) 2251 * Function irda_getsockopt (sock, level, optname, optval, optlen)
2251 */ 2252 */
2252static int __irda_getsockopt(struct socket *sock, int level, int optname, 2253static int irda_getsockopt(struct socket *sock, int level, int optname,
2253 char __user *optval, int __user *optlen) 2254 char __user *optval, int __user *optlen)
2254{ 2255{
2255 struct sock *sk = sock->sk; 2256 struct sock *sk = sock->sk;
@@ -2262,7 +2263,7 @@ static int __irda_getsockopt(struct socket *sock, int level, int optname,
2262 int daddr = DEV_ADDR_ANY; /* Dest address for IAS queries */ 2263 int daddr = DEV_ADDR_ANY; /* Dest address for IAS queries */
2263 int val = 0; 2264 int val = 0;
2264 int len = 0; 2265 int len = 0;
2265 int err; 2266 int err = 0;
2266 int offset, total; 2267 int offset, total;
2267 2268
2268 IRDA_DEBUG(2, "%s(%p)\n", __func__, self); 2269 IRDA_DEBUG(2, "%s(%p)\n", __func__, self);
@@ -2276,15 +2277,18 @@ static int __irda_getsockopt(struct socket *sock, int level, int optname,
2276 if(len < 0) 2277 if(len < 0)
2277 return -EINVAL; 2278 return -EINVAL;
2278 2279
2280 lock_sock(sk);
2281
2279 switch (optname) { 2282 switch (optname) {
2280 case IRLMP_ENUMDEVICES: 2283 case IRLMP_ENUMDEVICES:
2281 /* Ask lmp for the current discovery log */ 2284 /* Ask lmp for the current discovery log */
2282 discoveries = irlmp_get_discoveries(&list.len, self->mask.word, 2285 discoveries = irlmp_get_discoveries(&list.len, self->mask.word,
2283 self->nslots); 2286 self->nslots);
2284 /* Check if the we got some results */ 2287 /* Check if the we got some results */
2285 if (discoveries == NULL) 2288 if (discoveries == NULL) {
2286 return -EAGAIN; /* Didn't find any devices */ 2289 err = -EAGAIN;
2287 err = 0; 2290 goto out; /* Didn't find any devices */
2291 }
2288 2292
2289 /* Write total list length back to client */ 2293 /* Write total list length back to client */
2290 if (copy_to_user(optval, &list, 2294 if (copy_to_user(optval, &list,
@@ -2297,8 +2301,7 @@ static int __irda_getsockopt(struct socket *sock, int level, int optname,
2297 sizeof(struct irda_device_info); 2301 sizeof(struct irda_device_info);
2298 2302
2299 /* Copy the list itself - watch for overflow */ 2303 /* Copy the list itself - watch for overflow */
2300 if(list.len > 2048) 2304 if (list.len > 2048) {
2301 {
2302 err = -EINVAL; 2305 err = -EINVAL;
2303 goto bed; 2306 goto bed;
2304 } 2307 }
@@ -2314,17 +2317,20 @@ static int __irda_getsockopt(struct socket *sock, int level, int optname,
2314bed: 2317bed:
2315 /* Free up our buffer */ 2318 /* Free up our buffer */
2316 kfree(discoveries); 2319 kfree(discoveries);
2317 if (err)
2318 return err;
2319 break; 2320 break;
2320 case IRLMP_MAX_SDU_SIZE: 2321 case IRLMP_MAX_SDU_SIZE:
2321 val = self->max_data_size; 2322 val = self->max_data_size;
2322 len = sizeof(int); 2323 len = sizeof(int);
2323 if (put_user(len, optlen)) 2324 if (put_user(len, optlen)) {
2324 return -EFAULT; 2325 err = -EFAULT;
2326 goto out;
2327 }
2328
2329 if (copy_to_user(optval, &val, len)) {
2330 err = -EFAULT;
2331 goto out;
2332 }
2325 2333
2326 if (copy_to_user(optval, &val, len))
2327 return -EFAULT;
2328 break; 2334 break;
2329 case IRLMP_IAS_GET: 2335 case IRLMP_IAS_GET:
2330 /* The user want an object from our local IAS database. 2336 /* The user want an object from our local IAS database.
@@ -2332,17 +2338,22 @@ bed:
2332 * that we found */ 2338 * that we found */
2333 2339
2334 /* Check that the user has allocated the right space for us */ 2340 /* Check that the user has allocated the right space for us */
2335 if (len != sizeof(struct irda_ias_set)) 2341 if (len != sizeof(struct irda_ias_set)) {
2336 return -EINVAL; 2342 err = -EINVAL;
2343 goto out;
2344 }
2337 2345
2338 ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC); 2346 ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC);
2339 if (ias_opt == NULL) 2347 if (ias_opt == NULL) {
2340 return -ENOMEM; 2348 err = -ENOMEM;
2349 goto out;
2350 }
2341 2351
2342 /* Copy query to the driver. */ 2352 /* Copy query to the driver. */
2343 if (copy_from_user(ias_opt, optval, len)) { 2353 if (copy_from_user(ias_opt, optval, len)) {
2344 kfree(ias_opt); 2354 kfree(ias_opt);
2345 return -EFAULT; 2355 err = -EFAULT;
2356 goto out;
2346 } 2357 }
2347 2358
2348 /* Find the object we target. 2359 /* Find the object we target.
@@ -2355,7 +2366,8 @@ bed:
2355 ias_obj = irias_find_object(ias_opt->irda_class_name); 2366 ias_obj = irias_find_object(ias_opt->irda_class_name);
2356 if(ias_obj == (struct ias_object *) NULL) { 2367 if(ias_obj == (struct ias_object *) NULL) {
2357 kfree(ias_opt); 2368 kfree(ias_opt);
2358 return -EINVAL; 2369 err = -EINVAL;
2370 goto out;
2359 } 2371 }
2360 2372
2361 /* Find the attribute (in the object) we target */ 2373 /* Find the attribute (in the object) we target */
@@ -2363,21 +2375,23 @@ bed:
2363 ias_opt->irda_attrib_name); 2375 ias_opt->irda_attrib_name);
2364 if(ias_attr == (struct ias_attrib *) NULL) { 2376 if(ias_attr == (struct ias_attrib *) NULL) {
2365 kfree(ias_opt); 2377 kfree(ias_opt);
2366 return -EINVAL; 2378 err = -EINVAL;
2379 goto out;
2367 } 2380 }
2368 2381
2369 /* Translate from internal to user structure */ 2382 /* Translate from internal to user structure */
2370 err = irda_extract_ias_value(ias_opt, ias_attr->value); 2383 err = irda_extract_ias_value(ias_opt, ias_attr->value);
2371 if(err) { 2384 if(err) {
2372 kfree(ias_opt); 2385 kfree(ias_opt);
2373 return err; 2386 goto out;
2374 } 2387 }
2375 2388
2376 /* Copy reply to the user */ 2389 /* Copy reply to the user */
2377 if (copy_to_user(optval, ias_opt, 2390 if (copy_to_user(optval, ias_opt,
2378 sizeof(struct irda_ias_set))) { 2391 sizeof(struct irda_ias_set))) {
2379 kfree(ias_opt); 2392 kfree(ias_opt);
2380 return -EFAULT; 2393 err = -EFAULT;
2394 goto out;
2381 } 2395 }
2382 /* Note : don't need to put optlen, we checked it */ 2396 /* Note : don't need to put optlen, we checked it */
2383 kfree(ias_opt); 2397 kfree(ias_opt);
@@ -2388,17 +2402,22 @@ bed:
2388 * then wait for the answer to come back. */ 2402 * then wait for the answer to come back. */
2389 2403
2390 /* Check that the user has allocated the right space for us */ 2404 /* Check that the user has allocated the right space for us */
2391 if (len != sizeof(struct irda_ias_set)) 2405 if (len != sizeof(struct irda_ias_set)) {
2392 return -EINVAL; 2406 err = -EINVAL;
2407 goto out;
2408 }
2393 2409
2394 ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC); 2410 ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC);
2395 if (ias_opt == NULL) 2411 if (ias_opt == NULL) {
2396 return -ENOMEM; 2412 err = -ENOMEM;
2413 goto out;
2414 }
2397 2415
2398 /* Copy query to the driver. */ 2416 /* Copy query to the driver. */
2399 if (copy_from_user(ias_opt, optval, len)) { 2417 if (copy_from_user(ias_opt, optval, len)) {
2400 kfree(ias_opt); 2418 kfree(ias_opt);
2401 return -EFAULT; 2419 err = -EFAULT;
2420 goto out;
2402 } 2421 }
2403 2422
2404 /* At this point, there are two cases... 2423 /* At this point, there are two cases...
@@ -2419,7 +2438,8 @@ bed:
2419 daddr = ias_opt->daddr; 2438 daddr = ias_opt->daddr;
2420 if((!daddr) || (daddr == DEV_ADDR_ANY)) { 2439 if((!daddr) || (daddr == DEV_ADDR_ANY)) {
2421 kfree(ias_opt); 2440 kfree(ias_opt);
2422 return -EINVAL; 2441 err = -EINVAL;
2442 goto out;
2423 } 2443 }
2424 } 2444 }
2425 2445
@@ -2428,7 +2448,8 @@ bed:
2428 IRDA_WARNING("%s: busy with a previous query\n", 2448 IRDA_WARNING("%s: busy with a previous query\n",
2429 __func__); 2449 __func__);
2430 kfree(ias_opt); 2450 kfree(ias_opt);
2431 return -EBUSY; 2451 err = -EBUSY;
2452 goto out;
2432 } 2453 }
2433 2454
2434 self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self, 2455 self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self,
@@ -2436,7 +2457,8 @@ bed:
2436 2457
2437 if (self->iriap == NULL) { 2458 if (self->iriap == NULL) {
2438 kfree(ias_opt); 2459 kfree(ias_opt);
2439 return -ENOMEM; 2460 err = -ENOMEM;
2461 goto out;
2440 } 2462 }
2441 2463
2442 /* Treat unexpected wakeup as disconnect */ 2464 /* Treat unexpected wakeup as disconnect */
@@ -2455,7 +2477,8 @@ bed:
2455 * we can free it regardless! */ 2477 * we can free it regardless! */
2456 kfree(ias_opt); 2478 kfree(ias_opt);
2457 /* Treat signals as disconnect */ 2479 /* Treat signals as disconnect */
2458 return -EHOSTUNREACH; 2480 err = -EHOSTUNREACH;
2481 goto out;
2459 } 2482 }
2460 2483
2461 /* Check what happened */ 2484 /* Check what happened */
@@ -2465,9 +2488,11 @@ bed:
2465 /* Requested object/attribute doesn't exist */ 2488 /* Requested object/attribute doesn't exist */
2466 if((self->errno == IAS_CLASS_UNKNOWN) || 2489 if((self->errno == IAS_CLASS_UNKNOWN) ||
2467 (self->errno == IAS_ATTRIB_UNKNOWN)) 2490 (self->errno == IAS_ATTRIB_UNKNOWN))
2468 return (-EADDRNOTAVAIL); 2491 err = -EADDRNOTAVAIL;
2469 else 2492 else
2470 return (-EHOSTUNREACH); 2493 err = -EHOSTUNREACH;
2494
2495 goto out;
2471 } 2496 }
2472 2497
2473 /* Translate from internal to user structure */ 2498 /* Translate from internal to user structure */
@@ -2476,14 +2501,15 @@ bed:
2476 irias_delete_value(self->ias_result); 2501 irias_delete_value(self->ias_result);
2477 if (err) { 2502 if (err) {
2478 kfree(ias_opt); 2503 kfree(ias_opt);
2479 return err; 2504 goto out;
2480 } 2505 }
2481 2506
2482 /* Copy reply to the user */ 2507 /* Copy reply to the user */
2483 if (copy_to_user(optval, ias_opt, 2508 if (copy_to_user(optval, ias_opt,
2484 sizeof(struct irda_ias_set))) { 2509 sizeof(struct irda_ias_set))) {
2485 kfree(ias_opt); 2510 kfree(ias_opt);
2486 return -EFAULT; 2511 err = -EFAULT;
2512 goto out;
2487 } 2513 }
2488 /* Note : don't need to put optlen, we checked it */ 2514 /* Note : don't need to put optlen, we checked it */
2489 kfree(ias_opt); 2515 kfree(ias_opt);
@@ -2504,11 +2530,15 @@ bed:
2504 */ 2530 */
2505 2531
2506 /* Check that the user is passing us an int */ 2532 /* Check that the user is passing us an int */
2507 if (len != sizeof(int)) 2533 if (len != sizeof(int)) {
2508 return -EINVAL; 2534 err = -EINVAL;
2535 goto out;
2536 }
2509 /* Get timeout in ms (max time we block the caller) */ 2537 /* Get timeout in ms (max time we block the caller) */
2510 if (get_user(val, (int __user *)optval)) 2538 if (get_user(val, (int __user *)optval)) {
2511 return -EFAULT; 2539 err = -EFAULT;
2540 goto out;
2541 }
2512 2542
2513 /* Tell IrLMP we want to be notified */ 2543 /* Tell IrLMP we want to be notified */
2514 irlmp_update_client(self->ckey, self->mask.word, 2544 irlmp_update_client(self->ckey, self->mask.word,
@@ -2520,8 +2550,6 @@ bed:
2520 2550
2521 /* Wait until a node is discovered */ 2551 /* Wait until a node is discovered */
2522 if (!self->cachedaddr) { 2552 if (!self->cachedaddr) {
2523 int ret = 0;
2524
2525 IRDA_DEBUG(1, "%s(), nothing discovered yet, going to sleep...\n", __func__); 2553 IRDA_DEBUG(1, "%s(), nothing discovered yet, going to sleep...\n", __func__);
2526 2554
2527 /* Set watchdog timer to expire in <val> ms. */ 2555 /* Set watchdog timer to expire in <val> ms. */
@@ -2534,7 +2562,7 @@ bed:
2534 /* Wait for IR-LMP to call us back */ 2562 /* Wait for IR-LMP to call us back */
2535 __wait_event_interruptible(self->query_wait, 2563 __wait_event_interruptible(self->query_wait,
2536 (self->cachedaddr != 0 || self->errno == -ETIME), 2564 (self->cachedaddr != 0 || self->errno == -ETIME),
2537 ret); 2565 err);
2538 2566
2539 /* If watchdog is still activated, kill it! */ 2567 /* If watchdog is still activated, kill it! */
2540 if(timer_pending(&(self->watchdog))) 2568 if(timer_pending(&(self->watchdog)))
@@ -2542,8 +2570,8 @@ bed:
2542 2570
2543 IRDA_DEBUG(1, "%s(), ...waking up !\n", __func__); 2571 IRDA_DEBUG(1, "%s(), ...waking up !\n", __func__);
2544 2572
2545 if (ret != 0) 2573 if (err != 0)
2546 return ret; 2574 goto out;
2547 } 2575 }
2548 else 2576 else
2549 IRDA_DEBUG(1, "%s(), found immediately !\n", 2577 IRDA_DEBUG(1, "%s(), found immediately !\n",
@@ -2566,25 +2594,19 @@ bed:
2566 * If the user want more details, he should query 2594 * If the user want more details, he should query
2567 * the whole discovery log and pick one device... 2595 * the whole discovery log and pick one device...
2568 */ 2596 */
2569 if (put_user(daddr, (int __user *)optval)) 2597 if (put_user(daddr, (int __user *)optval)) {
2570 return -EFAULT; 2598 err = -EFAULT;
2599 goto out;
2600 }
2571 2601
2572 break; 2602 break;
2573 default: 2603 default:
2574 return -ENOPROTOOPT; 2604 err = -ENOPROTOOPT;
2575 } 2605 }
2576 2606
2577 return 0; 2607out:
2578}
2579
2580static int irda_getsockopt(struct socket *sock, int level, int optname,
2581 char __user *optval, int __user *optlen)
2582{
2583 int err;
2584 2608
2585 lock_kernel(); 2609 release_sock(sk);
2586 err = __irda_getsockopt(sock, level, optname, optval, optlen);
2587 unlock_kernel();
2588 2610
2589 return err; 2611 return err;
2590} 2612}
@@ -2628,7 +2650,7 @@ static const struct proto_ops irda_seqpacket_ops = {
2628 .socketpair = sock_no_socketpair, 2650 .socketpair = sock_no_socketpair,
2629 .accept = irda_accept, 2651 .accept = irda_accept,
2630 .getname = irda_getname, 2652 .getname = irda_getname,
2631 .poll = irda_datagram_poll, 2653 .poll = datagram_poll,
2632 .ioctl = irda_ioctl, 2654 .ioctl = irda_ioctl,
2633#ifdef CONFIG_COMPAT 2655#ifdef CONFIG_COMPAT
2634 .compat_ioctl = irda_compat_ioctl, 2656 .compat_ioctl = irda_compat_ioctl,
@@ -2652,7 +2674,7 @@ static const struct proto_ops irda_dgram_ops = {
2652 .socketpair = sock_no_socketpair, 2674 .socketpair = sock_no_socketpair,
2653 .accept = irda_accept, 2675 .accept = irda_accept,
2654 .getname = irda_getname, 2676 .getname = irda_getname,
2655 .poll = irda_datagram_poll, 2677 .poll = datagram_poll,
2656 .ioctl = irda_ioctl, 2678 .ioctl = irda_ioctl,
2657#ifdef CONFIG_COMPAT 2679#ifdef CONFIG_COMPAT
2658 .compat_ioctl = irda_compat_ioctl, 2680 .compat_ioctl = irda_compat_ioctl,
@@ -2677,7 +2699,7 @@ static const struct proto_ops irda_ultra_ops = {
2677 .socketpair = sock_no_socketpair, 2699 .socketpair = sock_no_socketpair,
2678 .accept = sock_no_accept, 2700 .accept = sock_no_accept,
2679 .getname = irda_getname, 2701 .getname = irda_getname,
2680 .poll = irda_datagram_poll, 2702 .poll = datagram_poll,
2681 .ioctl = irda_ioctl, 2703 .ioctl = irda_ioctl,
2682#ifdef CONFIG_COMPAT 2704#ifdef CONFIG_COMPAT
2683 .compat_ioctl = irda_compat_ioctl, 2705 .compat_ioctl = irda_compat_ioctl,
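The af_irda.c hunks above convert __irda_getsockopt() from scattered early returns inside a lock_kernel()/unlock_kernel() wrapper to socket locking with a single exit label: the failure paths set err and jump to (or fall through to) one out: label where release_sock() runs exactly once, and the local ret used around __wait_event_interruptible() is folded into the same err. The proto_ops tables below the function also switch their .poll hooks to the generic datagram_poll() from net/core/datagram.c. A minimal sketch of the shape the getsockopt path converges on (foo_getsockopt() and the option it handles are illustrative, not the real af_irda code):

#include <linux/net.h>
#include <linux/uaccess.h>
#include <net/sock.h>

static int foo_getsockopt(struct socket *sock, int level, int optname,
                          char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;
        int err = 0, len, val;

        lock_sock(sk);

        if (get_user(len, optlen)) {
                err = -EFAULT;
                goto out;
        }
        if (len != sizeof(int)) {
                err = -EINVAL;
                goto out;
        }

        val = 42;                               /* placeholder option value */
        if (put_user(val, (int __user *)optval))
                err = -EFAULT;
out:
        release_sock(sk);                       /* the one place the lock drops */
        return err;
}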
diff --git a/net/irda/discovery.c b/net/irda/discovery.c
index c1c8ae939126..36c3f037f172 100644
--- a/net/irda/discovery.c
+++ b/net/irda/discovery.c
@@ -315,7 +315,7 @@ struct irda_device_info *irlmp_copy_discoveries(hashbin_t *log, int *pn,
315 315
316 /* Get the actual number of device in the buffer and return */ 316 /* Get the actual number of device in the buffer and return */
317 *pn = i; 317 *pn = i;
318 return(buffer); 318 return buffer;
319} 319}
320 320
321#ifdef CONFIG_PROC_FS 321#ifdef CONFIG_PROC_FS
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index faa82ca2dfdc..a39cca8331df 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -449,8 +449,8 @@ static int ircomm_tty_open(struct tty_struct *tty, struct file *filp)
449 } 449 }
450 450
451#ifdef SERIAL_DO_RESTART 451#ifdef SERIAL_DO_RESTART
452 return ((self->flags & ASYNC_HUP_NOTIFY) ? 452 return (self->flags & ASYNC_HUP_NOTIFY) ?
453 -EAGAIN : -ERESTARTSYS); 453 -EAGAIN : -ERESTARTSYS;
454#else 454#else
455 return -EAGAIN; 455 return -EAGAIN;
456#endif 456#endif
diff --git a/net/irda/iriap.c b/net/irda/iriap.c
index fce364c6c71a..5b743bdd89ba 100644
--- a/net/irda/iriap.c
+++ b/net/irda/iriap.c
@@ -502,7 +502,8 @@ static void iriap_getvaluebyclass_confirm(struct iriap_cb *self,
502 IRDA_DEBUG(4, "%s(), strlen=%d\n", __func__, value_len); 502 IRDA_DEBUG(4, "%s(), strlen=%d\n", __func__, value_len);
503 503
504 /* Make sure the string is null-terminated */ 504 /* Make sure the string is null-terminated */
505 fp[n+value_len] = 0x00; 505 if (n + value_len < skb->len)
506 fp[n + value_len] = 0x00;
506 IRDA_DEBUG(4, "Got string %s\n", fp+n); 507 IRDA_DEBUG(4, "Got string %s\n", fp+n);
507 508
508 /* Will truncate to IAS_MAX_STRING bytes */ 509 /* Will truncate to IAS_MAX_STRING bytes */
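The iriap.c hunk stops the GetValueByClass confirm path from writing its terminating NUL one byte past the received data when the string runs right up to the end of the frame: the terminator is only stored while n + value_len still lies inside skb->len. The same defensive idiom as a stand-alone sketch (the helper name and its parameters are made up for illustration):

#include <linux/types.h>

/*
 * NUL-terminate a string field parsed out of a received buffer, but only
 * when the terminator slot is still inside the buffer we were handed.
 */
static void terminate_in_bounds(u8 *buf, size_t buflen, size_t off, size_t len)
{
        if (off + len < buflen)
                buf[off + len] = '\0';
}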
diff --git a/net/irda/irlan/irlan_common.c b/net/irda/irlan/irlan_common.c
index a788f9e9427d..6130f9d9dbe1 100644
--- a/net/irda/irlan/irlan_common.c
+++ b/net/irda/irlan/irlan_common.c
@@ -1102,7 +1102,7 @@ int irlan_extract_param(__u8 *buf, char *name, char *value, __u16 *len)
1102 memcpy(&val_len, buf+n, 2); /* To avoid alignment problems */ 1102 memcpy(&val_len, buf+n, 2); /* To avoid alignment problems */
1103 le16_to_cpus(&val_len); n+=2; 1103 le16_to_cpus(&val_len); n+=2;
1104 1104
1105 if (val_len > 1016) { 1105 if (val_len >= 1016) {
1106 IRDA_DEBUG(2, "%s(), parameter length to long\n", __func__ ); 1106 IRDA_DEBUG(2, "%s(), parameter length to long\n", __func__ );
1107 return -RSP_INVALID_COMMAND_FORMAT; 1107 return -RSP_INVALID_COMMAND_FORMAT;
1108 } 1108 }
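The irlan_extract_param() change is an off-by-one fix: the extracted value later gets a terminating NUL appended, so a payload of exactly 1016 bytes already overruns the destination by one byte, and the sanity check has to reject val_len >= 1016 rather than > 1016. The arithmetic in isolation (VAL_BUF_LEN and copy_param_value() are illustrative; 1016 simply mirrors the constant used above):

#include <linux/string.h>
#include <linux/types.h>

#define VAL_BUF_LEN 1016        /* stand-in for the destination buffer size */

static int copy_param_value(char *value, const __u8 *buf, int n, __u16 val_len)
{
        /* val_len payload bytes plus the '\0' must both fit, so
         * val_len == VAL_BUF_LEN is already one byte too many. */
        if (val_len >= VAL_BUF_LEN)
                return -1;
        memcpy(value, buf + n, val_len);
        value[val_len] = '\0';
        return 0;
}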
diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c
index 9616c32d1076..8ee1ff6c742f 100644
--- a/net/irda/irlan/irlan_eth.c
+++ b/net/irda/irlan/irlan_eth.c
@@ -45,13 +45,11 @@ static int irlan_eth_close(struct net_device *dev);
45static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb, 45static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb,
46 struct net_device *dev); 46 struct net_device *dev);
47static void irlan_eth_set_multicast_list( struct net_device *dev); 47static void irlan_eth_set_multicast_list( struct net_device *dev);
48static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev);
49 48
50static const struct net_device_ops irlan_eth_netdev_ops = { 49static const struct net_device_ops irlan_eth_netdev_ops = {
51 .ndo_open = irlan_eth_open, 50 .ndo_open = irlan_eth_open,
52 .ndo_stop = irlan_eth_close, 51 .ndo_stop = irlan_eth_close,
53 .ndo_start_xmit = irlan_eth_xmit, 52 .ndo_start_xmit = irlan_eth_xmit,
54 .ndo_get_stats = irlan_eth_get_stats,
55 .ndo_set_multicast_list = irlan_eth_set_multicast_list, 53 .ndo_set_multicast_list = irlan_eth_set_multicast_list,
56 .ndo_change_mtu = eth_change_mtu, 54 .ndo_change_mtu = eth_change_mtu,
57 .ndo_validate_addr = eth_validate_addr, 55 .ndo_validate_addr = eth_validate_addr,
@@ -169,6 +167,7 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb,
169{ 167{
170 struct irlan_cb *self = netdev_priv(dev); 168 struct irlan_cb *self = netdev_priv(dev);
171 int ret; 169 int ret;
170 unsigned int len;
172 171
173 /* skb headroom large enough to contain all IrDA-headers? */ 172 /* skb headroom large enough to contain all IrDA-headers? */
174 if ((skb_headroom(skb) < self->max_header_size) || (skb_shared(skb))) { 173 if ((skb_headroom(skb) < self->max_header_size) || (skb_shared(skb))) {
@@ -188,6 +187,7 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb,
188 187
189 dev->trans_start = jiffies; 188 dev->trans_start = jiffies;
190 189
190 len = skb->len;
191 /* Now queue the packet in the transport layer */ 191 /* Now queue the packet in the transport layer */
192 if (self->use_udata) 192 if (self->use_udata)
193 ret = irttp_udata_request(self->tsap_data, skb); 193 ret = irttp_udata_request(self->tsap_data, skb);
@@ -206,10 +206,10 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb,
206 * tried :-) DB 206 * tried :-) DB
207 */ 207 */
208 /* irttp_data_request already free the packet */ 208 /* irttp_data_request already free the packet */
209 self->stats.tx_dropped++; 209 dev->stats.tx_dropped++;
210 } else { 210 } else {
211 self->stats.tx_packets++; 211 dev->stats.tx_packets++;
212 self->stats.tx_bytes += skb->len; 212 dev->stats.tx_bytes += len;
213 } 213 }
214 214
215 return NETDEV_TX_OK; 215 return NETDEV_TX_OK;
@@ -224,15 +224,16 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb,
224int irlan_eth_receive(void *instance, void *sap, struct sk_buff *skb) 224int irlan_eth_receive(void *instance, void *sap, struct sk_buff *skb)
225{ 225{
226 struct irlan_cb *self = instance; 226 struct irlan_cb *self = instance;
227 struct net_device *dev = self->dev;
227 228
228 if (skb == NULL) { 229 if (skb == NULL) {
229 ++self->stats.rx_dropped; 230 dev->stats.rx_dropped++;
230 return 0; 231 return 0;
231 } 232 }
232 if (skb->len < ETH_HLEN) { 233 if (skb->len < ETH_HLEN) {
233 IRDA_DEBUG(0, "%s() : IrLAN frame too short (%d)\n", 234 IRDA_DEBUG(0, "%s() : IrLAN frame too short (%d)\n",
234 __func__, skb->len); 235 __func__, skb->len);
235 ++self->stats.rx_dropped; 236 dev->stats.rx_dropped++;
236 dev_kfree_skb(skb); 237 dev_kfree_skb(skb);
237 return 0; 238 return 0;
238 } 239 }
@@ -242,10 +243,10 @@ int irlan_eth_receive(void *instance, void *sap, struct sk_buff *skb)
242 * might have been previously set by the low level IrDA network 243 * might have been previously set by the low level IrDA network
243 * device driver 244 * device driver
244 */ 245 */
245 skb->protocol = eth_type_trans(skb, self->dev); /* Remove eth header */ 246 skb->protocol = eth_type_trans(skb, dev); /* Remove eth header */
246 247
247 self->stats.rx_packets++; 248 dev->stats.rx_packets++;
248 self->stats.rx_bytes += skb->len; 249 dev->stats.rx_bytes += skb->len;
249 250
250 netif_rx(skb); /* Eat it! */ 251 netif_rx(skb); /* Eat it! */
251 252
@@ -346,16 +347,3 @@ static void irlan_eth_set_multicast_list(struct net_device *dev)
346 else 347 else
347 irlan_set_broadcast_filter(self, FALSE); 348 irlan_set_broadcast_filter(self, FALSE);
348} 349}
349
350/*
351 * Function irlan_get_stats (dev)
352 *
353 * Get the current statistics for this device
354 *
355 */
356static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev)
357{
358 struct irlan_cb *self = netdev_priv(dev);
359
360 return &self->stats;
361}
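irlan_eth.c retires its private counters in struct irlan_cb together with the .ndo_get_stats hook: the counters move into the net_device's own dev->stats, which the core reports by default when a driver supplies no get_stats method. The transmit path also samples skb->len into a local before handing the skb to the transport layer, because the skb may already have been consumed by the time the counters are bumped. A minimal sketch of that pattern (foo_xmit() and foo_queue_tx() are hypothetical):

#include <linux/netdevice.h>
#include <linux/skbuff.h>

static int foo_queue_tx(struct sk_buff *skb);   /* hands the skb to the lower layer */

static netdev_tx_t foo_xmit(struct sk_buff *skb, struct net_device *dev)
{
        unsigned int len = skb->len;            /* sample before the skb leaves us */

        if (foo_queue_tx(skb) < 0) {
                /* the queueing helper already freed the skb */
                dev->stats.tx_dropped++;
        } else {
                dev->stats.tx_packets++;
                dev->stats.tx_bytes += len;     /* not skb->len: it may be gone */
        }
        return NETDEV_TX_OK;
}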
diff --git a/net/irda/irlan/irlan_event.c b/net/irda/irlan/irlan_event.c
index cbcb4eb54037..43f16040a6fe 100644
--- a/net/irda/irlan/irlan_event.c
+++ b/net/irda/irlan/irlan_event.c
@@ -24,7 +24,7 @@
24 24
25#include <net/irda/irlan_event.h> 25#include <net/irda/irlan_event.h>
26 26
27char *irlan_state[] = { 27const char * const irlan_state[] = {
28 "IRLAN_IDLE", 28 "IRLAN_IDLE",
29 "IRLAN_QUERY", 29 "IRLAN_QUERY",
30 "IRLAN_CONN", 30 "IRLAN_CONN",
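Declaring the state-name table const char * const makes both levels read-only: the array slots cannot be reassigned and the code cannot write through them to the string literals, so the whole table can be placed in a read-only section. The general form, as a sketch with made-up names:

/* both the pointers and the text they reference are read-only */
static const char * const foo_state_names[] = {
        "FOO_IDLE",
        "FOO_QUERY",
        "FOO_CONN",
};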
diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c
index 0e7d8bde145d..6115a44c0a24 100644
--- a/net/irda/irlmp.c
+++ b/net/irda/irlmp.c
@@ -939,7 +939,7 @@ struct irda_device_info *irlmp_get_discoveries(int *pn, __u16 mask, int nslots)
939 } 939 }
940 940
941 /* Return current cached discovery log */ 941 /* Return current cached discovery log */
942 return(irlmp_copy_discoveries(irlmp->cachelog, pn, mask, TRUE)); 942 return irlmp_copy_discoveries(irlmp->cachelog, pn, mask, TRUE);
943} 943}
944EXPORT_SYMBOL(irlmp_get_discoveries); 944EXPORT_SYMBOL(irlmp_get_discoveries);
945 945
diff --git a/net/irda/irlmp_frame.c b/net/irda/irlmp_frame.c
index 3750884094da..062e63b1c5c4 100644
--- a/net/irda/irlmp_frame.c
+++ b/net/irda/irlmp_frame.c
@@ -448,7 +448,7 @@ static struct lsap_cb *irlmp_find_lsap(struct lap_cb *self, __u8 dlsap_sel,
448 (self->cache.slsap_sel == slsap_sel) && 448 (self->cache.slsap_sel == slsap_sel) &&
449 (self->cache.dlsap_sel == dlsap_sel)) 449 (self->cache.dlsap_sel == dlsap_sel))
450 { 450 {
451 return (self->cache.lsap); 451 return self->cache.lsap;
452 } 452 }
453#endif 453#endif
454 454
diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h
index 4300df35d37d..0d82ff5aeff1 100644
--- a/net/irda/irnet/irnet.h
+++ b/net/irda/irnet/irnet.h
@@ -458,6 +458,8 @@ typedef struct irnet_socket
458 int disco_index; /* Last read in the discovery log */ 458 int disco_index; /* Last read in the discovery log */
459 int disco_number; /* Size of the discovery log */ 459 int disco_number; /* Size of the discovery log */
460 460
461 struct mutex lock;
462
461} irnet_socket; 463} irnet_socket;
462 464
463/* 465/*
diff --git a/net/irda/irnet/irnet_irda.c b/net/irda/irnet/irnet_irda.c
index e98e40d76f4f..7f17a8020e8a 100644
--- a/net/irda/irnet/irnet_irda.c
+++ b/net/irda/irnet/irnet_irda.c
@@ -238,7 +238,7 @@ irnet_ias_to_tsap(irnet_socket * self,
238 DEXIT(IRDA_SR_TRACE, "\n"); 238 DEXIT(IRDA_SR_TRACE, "\n");
239 239
240 /* Return the TSAP */ 240 /* Return the TSAP */
241 return(dtsap_sel); 241 return dtsap_sel;
242} 242}
243 243
244/*------------------------------------------------------------------*/ 244/*------------------------------------------------------------------*/
@@ -301,7 +301,7 @@ irnet_connect_tsap(irnet_socket * self)
301 { 301 {
302 clear_bit(0, &self->ttp_connect); 302 clear_bit(0, &self->ttp_connect);
303 DERROR(IRDA_SR_ERROR, "connect aborted!\n"); 303 DERROR(IRDA_SR_ERROR, "connect aborted!\n");
304 return(err); 304 return err;
305 } 305 }
306 306
307 /* Connect to remote device */ 307 /* Connect to remote device */
@@ -312,7 +312,7 @@ irnet_connect_tsap(irnet_socket * self)
312 { 312 {
313 clear_bit(0, &self->ttp_connect); 313 clear_bit(0, &self->ttp_connect);
314 DERROR(IRDA_SR_ERROR, "connect aborted!\n"); 314 DERROR(IRDA_SR_ERROR, "connect aborted!\n");
315 return(err); 315 return err;
316 } 316 }
317 317
318 /* The above call is non-blocking. 318 /* The above call is non-blocking.
@@ -321,7 +321,7 @@ irnet_connect_tsap(irnet_socket * self)
321 * See you there ;-) */ 321 * See you there ;-) */
322 322
323 DEXIT(IRDA_SR_TRACE, "\n"); 323 DEXIT(IRDA_SR_TRACE, "\n");
324 return(err); 324 return err;
325} 325}
326 326
327/*------------------------------------------------------------------*/ 327/*------------------------------------------------------------------*/
@@ -362,10 +362,10 @@ irnet_discover_next_daddr(irnet_socket * self)
362 /* The above request is non-blocking. 362 /* The above request is non-blocking.
363 * After a while, IrDA will call us back in irnet_discovervalue_confirm() 363 * After a while, IrDA will call us back in irnet_discovervalue_confirm()
364 * We will then call irnet_ias_to_tsap() and come back here again... */ 364 * We will then call irnet_ias_to_tsap() and come back here again... */
365 return(0); 365 return 0;
366 } 366 }
367 else 367 else
368 return(1); 368 return 1;
369} 369}
370 370
371/*------------------------------------------------------------------*/ 371/*------------------------------------------------------------------*/
@@ -436,7 +436,7 @@ irnet_discover_daddr_and_lsap_sel(irnet_socket * self)
436 /* Follow me in irnet_discovervalue_confirm() */ 436 /* Follow me in irnet_discovervalue_confirm() */
437 437
438 DEXIT(IRDA_SR_TRACE, "\n"); 438 DEXIT(IRDA_SR_TRACE, "\n");
439 return(0); 439 return 0;
440} 440}
441 441
442/*------------------------------------------------------------------*/ 442/*------------------------------------------------------------------*/
@@ -485,7 +485,7 @@ irnet_dname_to_daddr(irnet_socket * self)
485 /* No luck ! */ 485 /* No luck ! */
486 DEBUG(IRDA_SR_INFO, "cannot discover device ``%s'' !!!\n", self->rname); 486 DEBUG(IRDA_SR_INFO, "cannot discover device ``%s'' !!!\n", self->rname);
487 kfree(discoveries); 487 kfree(discoveries);
488 return(-EADDRNOTAVAIL); 488 return -EADDRNOTAVAIL;
489} 489}
490 490
491 491
@@ -527,7 +527,7 @@ irda_irnet_create(irnet_socket * self)
527 INIT_WORK(&self->disconnect_work, irnet_ppp_disconnect); 527 INIT_WORK(&self->disconnect_work, irnet_ppp_disconnect);
528 528
529 DEXIT(IRDA_SOCK_TRACE, "\n"); 529 DEXIT(IRDA_SOCK_TRACE, "\n");
530 return(0); 530 return 0;
531} 531}
532 532
533/*------------------------------------------------------------------*/ 533/*------------------------------------------------------------------*/
@@ -601,7 +601,7 @@ irda_irnet_connect(irnet_socket * self)
601 * We will finish the connection procedure in irnet_connect_tsap(). 601 * We will finish the connection procedure in irnet_connect_tsap().
602 */ 602 */
603 DEXIT(IRDA_SOCK_TRACE, "\n"); 603 DEXIT(IRDA_SOCK_TRACE, "\n");
604 return(0); 604 return 0;
605} 605}
606 606
607/*------------------------------------------------------------------*/ 607/*------------------------------------------------------------------*/
@@ -733,7 +733,7 @@ irnet_daddr_to_dname(irnet_socket * self)
733 /* No luck ! */ 733 /* No luck ! */
734 DEXIT(IRDA_SERV_INFO, ": cannot discover device 0x%08x !!!\n", self->daddr); 734 DEXIT(IRDA_SERV_INFO, ": cannot discover device 0x%08x !!!\n", self->daddr);
735 kfree(discoveries); 735 kfree(discoveries);
736 return(-EADDRNOTAVAIL); 736 return -EADDRNOTAVAIL;
737} 737}
738 738
739/*------------------------------------------------------------------*/ 739/*------------------------------------------------------------------*/
diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c
index dfe7b38dd4af..7fa86373de41 100644
--- a/net/irda/irnet/irnet_ppp.c
+++ b/net/irda/irnet/irnet_ppp.c
@@ -166,7 +166,7 @@ irnet_ctrl_write(irnet_socket * ap,
166 } 166 }
167 167
168 /* Success : we have parsed all commands successfully */ 168 /* Success : we have parsed all commands successfully */
169 return(count); 169 return count;
170} 170}
171 171
172#ifdef INITIAL_DISCOVERY 172#ifdef INITIAL_DISCOVERY
@@ -300,7 +300,7 @@ irnet_ctrl_read(irnet_socket * ap,
300 } 300 }
301 301
302 DEXIT(CTRL_TRACE, "\n"); 302 DEXIT(CTRL_TRACE, "\n");
303 return(strlen(event)); 303 return strlen(event);
304 } 304 }
305#endif /* INITIAL_DISCOVERY */ 305#endif /* INITIAL_DISCOVERY */
306 306
@@ -409,7 +409,7 @@ irnet_ctrl_read(irnet_socket * ap,
409 } 409 }
410 410
411 DEXIT(CTRL_TRACE, "\n"); 411 DEXIT(CTRL_TRACE, "\n");
412 return(strlen(event)); 412 return strlen(event);
413} 413}
414 414
415/*------------------------------------------------------------------*/ 415/*------------------------------------------------------------------*/
@@ -480,7 +480,6 @@ dev_irnet_open(struct inode * inode,
480 ap = kzalloc(sizeof(*ap), GFP_KERNEL); 480 ap = kzalloc(sizeof(*ap), GFP_KERNEL);
481 DABORT(ap == NULL, -ENOMEM, FS_ERROR, "Can't allocate struct irnet...\n"); 481 DABORT(ap == NULL, -ENOMEM, FS_ERROR, "Can't allocate struct irnet...\n");
482 482
483 lock_kernel();
484 /* initialize the irnet structure */ 483 /* initialize the irnet structure */
485 ap->file = file; 484 ap->file = file;
486 485
@@ -502,18 +501,20 @@ dev_irnet_open(struct inode * inode,
502 { 501 {
503 DERROR(FS_ERROR, "Can't setup IrDA link...\n"); 502 DERROR(FS_ERROR, "Can't setup IrDA link...\n");
504 kfree(ap); 503 kfree(ap);
505 unlock_kernel(); 504
506 return err; 505 return err;
507 } 506 }
508 507
509 /* For the control channel */ 508 /* For the control channel */
510 ap->event_index = irnet_events.index; /* Cancel all past events */ 509 ap->event_index = irnet_events.index; /* Cancel all past events */
511 510
511 mutex_init(&ap->lock);
512
512 /* Put our stuff where we will be able to find it later */ 513 /* Put our stuff where we will be able to find it later */
513 file->private_data = ap; 514 file->private_data = ap;
514 515
515 DEXIT(FS_TRACE, " - ap=0x%p\n", ap); 516 DEXIT(FS_TRACE, " - ap=0x%p\n", ap);
516 unlock_kernel(); 517
517 return 0; 518 return 0;
518} 519}
519 520
@@ -623,7 +624,7 @@ dev_irnet_poll(struct file * file,
623 mask |= irnet_ctrl_poll(ap, file, wait); 624 mask |= irnet_ctrl_poll(ap, file, wait);
624 625
625 DEXIT(FS_TRACE, " - mask=0x%X\n", mask); 626 DEXIT(FS_TRACE, " - mask=0x%X\n", mask);
626 return(mask); 627 return mask;
627} 628}
628 629
629/*------------------------------------------------------------------*/ 630/*------------------------------------------------------------------*/
@@ -663,8 +664,10 @@ dev_irnet_ioctl(
663 if((val == N_SYNC_PPP) || (val == N_PPP)) 664 if((val == N_SYNC_PPP) || (val == N_PPP))
664 { 665 {
665 DEBUG(FS_INFO, "Entering PPP discipline.\n"); 666 DEBUG(FS_INFO, "Entering PPP discipline.\n");
666 /* PPP channel setup (ap->chan in configued in dev_irnet_open())*/ 667 /* PPP channel setup (ap->chan in configured in dev_irnet_open())*/
667 lock_kernel(); 668 if (mutex_lock_interruptible(&ap->lock))
669 return -EINTR;
670
668 err = ppp_register_channel(&ap->chan); 671 err = ppp_register_channel(&ap->chan);
669 if(err == 0) 672 if(err == 0)
670 { 673 {
@@ -677,14 +680,17 @@ dev_irnet_ioctl(
677 } 680 }
678 else 681 else
679 DERROR(FS_ERROR, "Can't setup PPP channel...\n"); 682 DERROR(FS_ERROR, "Can't setup PPP channel...\n");
680 unlock_kernel(); 683
684 mutex_unlock(&ap->lock);
681 } 685 }
682 else 686 else
683 { 687 {
684 /* In theory, should be N_TTY */ 688 /* In theory, should be N_TTY */
685 DEBUG(FS_INFO, "Exiting PPP discipline.\n"); 689 DEBUG(FS_INFO, "Exiting PPP discipline.\n");
686 /* Disconnect from the generic PPP layer */ 690 /* Disconnect from the generic PPP layer */
687 lock_kernel(); 691 if (mutex_lock_interruptible(&ap->lock))
692 return -EINTR;
693
688 if(ap->ppp_open) 694 if(ap->ppp_open)
689 { 695 {
690 ap->ppp_open = 0; 696 ap->ppp_open = 0;
@@ -693,24 +699,31 @@ dev_irnet_ioctl(
693 else 699 else
694 DERROR(FS_ERROR, "Channel not registered !\n"); 700 DERROR(FS_ERROR, "Channel not registered !\n");
695 err = 0; 701 err = 0;
696 unlock_kernel(); 702
703 mutex_unlock(&ap->lock);
697 } 704 }
698 break; 705 break;
699 706
700 /* Query PPP channel and unit number */ 707 /* Query PPP channel and unit number */
701 case PPPIOCGCHAN: 708 case PPPIOCGCHAN:
702 lock_kernel(); 709 if (mutex_lock_interruptible(&ap->lock))
710 return -EINTR;
711
703 if(ap->ppp_open && !put_user(ppp_channel_index(&ap->chan), 712 if(ap->ppp_open && !put_user(ppp_channel_index(&ap->chan),
704 (int __user *)argp)) 713 (int __user *)argp))
705 err = 0; 714 err = 0;
706 unlock_kernel(); 715
716 mutex_unlock(&ap->lock);
707 break; 717 break;
708 case PPPIOCGUNIT: 718 case PPPIOCGUNIT:
709 lock_kernel(); 719 if (mutex_lock_interruptible(&ap->lock))
720 return -EINTR;
721
710 if(ap->ppp_open && !put_user(ppp_unit_number(&ap->chan), 722 if(ap->ppp_open && !put_user(ppp_unit_number(&ap->chan),
711 (int __user *)argp)) 723 (int __user *)argp))
712 err = 0; 724 err = 0;
713 unlock_kernel(); 725
726 mutex_unlock(&ap->lock);
714 break; 727 break;
715 728
716 /* All these ioctls can be passed both directly and from ppp_generic, 729 /* All these ioctls can be passed both directly and from ppp_generic,
@@ -730,9 +743,12 @@ dev_irnet_ioctl(
730 if(!capable(CAP_NET_ADMIN)) 743 if(!capable(CAP_NET_ADMIN))
731 err = -EPERM; 744 err = -EPERM;
732 else { 745 else {
733 lock_kernel(); 746 if (mutex_lock_interruptible(&ap->lock))
747 return -EINTR;
748
734 err = ppp_irnet_ioctl(&ap->chan, cmd, arg); 749 err = ppp_irnet_ioctl(&ap->chan, cmd, arg);
735 unlock_kernel(); 750
751 mutex_unlock(&ap->lock);
736 } 752 }
737 break; 753 break;
738 754
@@ -740,7 +756,9 @@ dev_irnet_ioctl(
740 /* Get termios */ 756 /* Get termios */
741 case TCGETS: 757 case TCGETS:
742 DEBUG(FS_INFO, "Get termios.\n"); 758 DEBUG(FS_INFO, "Get termios.\n");
743 lock_kernel(); 759 if (mutex_lock_interruptible(&ap->lock))
760 return -EINTR;
761
744#ifndef TCGETS2 762#ifndef TCGETS2
745 if(!kernel_termios_to_user_termios((struct termios __user *)argp, &ap->termios)) 763 if(!kernel_termios_to_user_termios((struct termios __user *)argp, &ap->termios))
746 err = 0; 764 err = 0;
@@ -748,12 +766,15 @@ dev_irnet_ioctl(
748 if(kernel_termios_to_user_termios_1((struct termios __user *)argp, &ap->termios)) 766 if(kernel_termios_to_user_termios_1((struct termios __user *)argp, &ap->termios))
749 err = 0; 767 err = 0;
750#endif 768#endif
751 unlock_kernel(); 769
770 mutex_unlock(&ap->lock);
752 break; 771 break;
753 /* Set termios */ 772 /* Set termios */
754 case TCSETSF: 773 case TCSETSF:
755 DEBUG(FS_INFO, "Set termios.\n"); 774 DEBUG(FS_INFO, "Set termios.\n");
756 lock_kernel(); 775 if (mutex_lock_interruptible(&ap->lock))
776 return -EINTR;
777
757#ifndef TCGETS2 778#ifndef TCGETS2
758 if(!user_termios_to_kernel_termios(&ap->termios, (struct termios __user *)argp)) 779 if(!user_termios_to_kernel_termios(&ap->termios, (struct termios __user *)argp))
759 err = 0; 780 err = 0;
@@ -761,7 +782,8 @@ dev_irnet_ioctl(
761 if(!user_termios_to_kernel_termios_1(&ap->termios, (struct termios __user *)argp)) 782 if(!user_termios_to_kernel_termios_1(&ap->termios, (struct termios __user *)argp))
762 err = 0; 783 err = 0;
763#endif 784#endif
764 unlock_kernel(); 785
786 mutex_unlock(&ap->lock);
765 break; 787 break;
766 788
767 /* Set DTR/RTS */ 789 /* Set DTR/RTS */
@@ -784,9 +806,10 @@ dev_irnet_ioctl(
784 * We should also worry that we don't accept junk here and that 806 * We should also worry that we don't accept junk here and that
785 * we get rid of our own buffers */ 807 * we get rid of our own buffers */
786#ifdef FLUSH_TO_PPP 808#ifdef FLUSH_TO_PPP
787 lock_kernel(); 809 if (mutex_lock_interruptible(&ap->lock))
810 return -EINTR;
788 ppp_output_wakeup(&ap->chan); 811 ppp_output_wakeup(&ap->chan);
789 unlock_kernel(); 812 mutex_unlock(&ap->lock);
790#endif /* FLUSH_TO_PPP */ 813#endif /* FLUSH_TO_PPP */
791 err = 0; 814 err = 0;
792 break; 815 break;
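The irnet_ppp.c hunks replace every lock_kernel()/unlock_kernel() pair in the ioctl paths with the per-socket mutex that irnet.h adds to irnet_socket and dev_irnet_open() now initialises; mutex_lock_interruptible() additionally lets a pending signal abort the wait with -EINTR instead of sleeping unkillably. Stripped of the PPP specifics, the pattern is roughly this (struct foo_ap, foo_ioctl() and foo_do_cmd() are illustrative):

#include <linux/fs.h>
#include <linux/mutex.h>

struct foo_ap {
        struct mutex lock;                      /* serialises the ioctl paths */
        /* ... per-open state ... */
};

static long foo_do_cmd(struct foo_ap *ap, unsigned int cmd, unsigned long arg);

/* open path: mutex_init(&ap->lock); before ap is published in file->private_data */

static long foo_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct foo_ap *ap = file->private_data;
        long err;

        if (mutex_lock_interruptible(&ap->lock))
                return -EINTR;                  /* interrupted while waiting */

        err = foo_do_cmd(ap, cmd, arg);

        mutex_unlock(&ap->lock);
        return err;
}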
diff --git a/net/irda/irnet/irnet_ppp.h b/net/irda/irnet/irnet_ppp.h
index b5df2418f90c..940225866da0 100644
--- a/net/irda/irnet/irnet_ppp.h
+++ b/net/irda/irnet/irnet_ppp.h
@@ -103,7 +103,8 @@ static const struct file_operations irnet_device_fops =
103 .poll = dev_irnet_poll, 103 .poll = dev_irnet_poll,
104 .unlocked_ioctl = dev_irnet_ioctl, 104 .unlocked_ioctl = dev_irnet_ioctl,
105 .open = dev_irnet_open, 105 .open = dev_irnet_open,
106 .release = dev_irnet_close 106 .release = dev_irnet_close,
107 .llseek = noop_llseek,
107 /* Also : llseek, readdir, mmap, flush, fsync, fasync, lock, readv, writev */ 108 /* Also : llseek, readdir, mmap, flush, fsync, fasync, lock, readv, writev */
108}; 109};
109 110
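irnet_ppp.h gains the comma that was missing after .release and an explicit .llseek: with the BKL work, leaving llseek NULL made the VFS fall back to default_llseek(), so drivers that have no meaningful file position now spell out noop_llseek() (seeks succeed but do nothing) or no_llseek() (seeks fail with -ESPIPE). A generic example, with the foo_* handlers assumed to exist elsewhere:

#include <linux/fs.h>
#include <linux/module.h>

static const struct file_operations foo_fops = {
        .owner          = THIS_MODULE,
        .read           = foo_read,
        .write          = foo_write,
        .unlocked_ioctl = foo_ioctl,
        .open           = foo_open,
        .release        = foo_release,
        .llseek         = noop_llseek, /* explicit, not the default_llseek() fallback */
};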
diff --git a/net/irda/parameters.c b/net/irda/parameters.c
index fc1a20565e2d..71cd38c1a67f 100644
--- a/net/irda/parameters.c
+++ b/net/irda/parameters.c
@@ -298,6 +298,8 @@ static int irda_extract_string(void *self, __u8 *buf, int len, __u8 pi,
298 298
299 p.pi = pi; /* In case handler needs to know */ 299 p.pi = pi; /* In case handler needs to know */
300 p.pl = buf[1]; /* Extract length of value */ 300 p.pl = buf[1]; /* Extract length of value */
301 if (p.pl > 32)
302 p.pl = 32;
301 303
302 IRDA_DEBUG(2, "%s(), pi=%#x, pl=%d\n", __func__, 304 IRDA_DEBUG(2, "%s(), pi=%#x, pl=%d\n", __func__,
303 p.pi, p.pl); 305 p.pi, p.pl);
@@ -318,7 +320,7 @@ static int irda_extract_string(void *self, __u8 *buf, int len, __u8 pi,
318 (__u8) str[0], (__u8) str[1]); 320 (__u8) str[0], (__u8) str[1]);
319 321
320 /* Null terminate string */ 322 /* Null terminate string */
321 str[p.pl+1] = '\0'; 323 str[p.pl] = '\0';
322 324
323 p.pv.c = str; /* Handler will need to take a copy */ 325 p.pv.c = str; /* Handler will need to take a copy */
324 326
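irda_extract_string() gets two related fixes: the length octet read from the frame is now clamped to 32 before use, and the terminating NUL goes to str[p.pl] instead of str[p.pl + 1], one slot too far. Together they keep both the copy and the terminator inside the fixed-size scratch buffer (its size is inferred from the clamp here, not quoted from the source). As a stand-alone fragment with made-up names:

#include <linux/string.h>
#include <linux/types.h>

#define PV_MAX  32                      /* assumed cap, matching the clamp above */

static void extract_string(char *dst /* at least PV_MAX + 1 bytes */,
                           const __u8 *buf)
{
        size_t pl = buf[1];             /* length octet taken from the frame */

        if (pl > PV_MAX)
                pl = PV_MAX;            /* never trust the peer's length */
        memcpy(dst, buf + 2, pl);
        dst[pl] = '\0';                 /* index pl is the last slot we may touch */
}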
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index 499c045d6910..f7db676de77d 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -1798,7 +1798,8 @@ static void iucv_work_fn(struct work_struct *work)
1798 * Handles external interrupts coming in from CP. 1798 * Handles external interrupts coming in from CP.
1799 * Places the interrupt buffer on a queue and schedules iucv_tasklet_fn(). 1799 * Places the interrupt buffer on a queue and schedules iucv_tasklet_fn().
1800 */ 1800 */
1801static void iucv_external_interrupt(u16 code) 1801static void iucv_external_interrupt(unsigned int ext_int_code,
1802 unsigned int param32, unsigned long param64)
1802{ 1803{
1803 struct iucv_irq_data *p; 1804 struct iucv_irq_data *p;
1804 struct iucv_irq_list *work; 1805 struct iucv_irq_list *work;
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 43040e97c474..d87c22df6f1e 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -565,12 +565,12 @@ pfkey_proto2satype(uint16_t proto)
565 565
566static uint8_t pfkey_proto_to_xfrm(uint8_t proto) 566static uint8_t pfkey_proto_to_xfrm(uint8_t proto)
567{ 567{
568 return (proto == IPSEC_PROTO_ANY ? 0 : proto); 568 return proto == IPSEC_PROTO_ANY ? 0 : proto;
569} 569}
570 570
571static uint8_t pfkey_proto_from_xfrm(uint8_t proto) 571static uint8_t pfkey_proto_from_xfrm(uint8_t proto)
572{ 572{
573 return (proto ? proto : IPSEC_PROTO_ANY); 573 return proto ? proto : IPSEC_PROTO_ANY;
574} 574}
575 575
576static inline int pfkey_sockaddr_len(sa_family_t family) 576static inline int pfkey_sockaddr_len(sa_family_t family)
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 1712af1c7b3f..c64ce0a0bb03 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -111,6 +111,10 @@ struct l2tp_net {
111 spinlock_t l2tp_session_hlist_lock; 111 spinlock_t l2tp_session_hlist_lock;
112}; 112};
113 113
114static void l2tp_session_set_header_len(struct l2tp_session *session, int version);
115static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel);
116static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel);
117
114static inline struct l2tp_net *l2tp_pernet(struct net *net) 118static inline struct l2tp_net *l2tp_pernet(struct net *net)
115{ 119{
116 BUG_ON(!net); 120 BUG_ON(!net);
@@ -118,6 +122,34 @@ static inline struct l2tp_net *l2tp_pernet(struct net *net)
118 return net_generic(net, l2tp_net_id); 122 return net_generic(net, l2tp_net_id);
119} 123}
120 124
125
126/* Tunnel reference counts. Incremented per session that is added to
127 * the tunnel.
128 */
129static inline void l2tp_tunnel_inc_refcount_1(struct l2tp_tunnel *tunnel)
130{
131 atomic_inc(&tunnel->ref_count);
132}
133
134static inline void l2tp_tunnel_dec_refcount_1(struct l2tp_tunnel *tunnel)
135{
136 if (atomic_dec_and_test(&tunnel->ref_count))
137 l2tp_tunnel_free(tunnel);
138}
139#ifdef L2TP_REFCNT_DEBUG
140#define l2tp_tunnel_inc_refcount(_t) do { \
141 printk(KERN_DEBUG "l2tp_tunnel_inc_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_t)->name, atomic_read(&_t->ref_count)); \
142 l2tp_tunnel_inc_refcount_1(_t); \
143 } while (0)
144#define l2tp_tunnel_dec_refcount(_t) do { \
145 printk(KERN_DEBUG "l2tp_tunnel_dec_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_t)->name, atomic_read(&_t->ref_count)); \
146 l2tp_tunnel_dec_refcount_1(_t); \
147 } while (0)
148#else
149#define l2tp_tunnel_inc_refcount(t) l2tp_tunnel_inc_refcount_1(t)
150#define l2tp_tunnel_dec_refcount(t) l2tp_tunnel_dec_refcount_1(t)
151#endif
152
121/* Session hash global list for L2TPv3. 153/* Session hash global list for L2TPv3.
122 * The session_id SHOULD be random according to RFC3931, but several 154 * The session_id SHOULD be random according to RFC3931, but several
123 * L2TP implementations use incrementing session_ids. So we do a real 155 * L2TP implementations use incrementing session_ids. So we do a real
@@ -699,8 +731,8 @@ EXPORT_SYMBOL(l2tp_recv_common);
699 * Returns 1 if the packet was not a good data packet and could not be 731 * Returns 1 if the packet was not a good data packet and could not be
700 * forwarded. All such packets are passed up to userspace to deal with. 732 * forwarded. All such packets are passed up to userspace to deal with.
701 */ 733 */
702int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, 734static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
703 int (*payload_hook)(struct sk_buff *skb)) 735 int (*payload_hook)(struct sk_buff *skb))
704{ 736{
705 struct l2tp_session *session = NULL; 737 struct l2tp_session *session = NULL;
706 unsigned char *ptr, *optr; 738 unsigned char *ptr, *optr;
@@ -812,7 +844,6 @@ error:
812 844
813 return 1; 845 return 1;
814} 846}
815EXPORT_SYMBOL_GPL(l2tp_udp_recv_core);
816 847
817/* UDP encapsulation receive handler. See net/ipv4/udp.c. 848/* UDP encapsulation receive handler. See net/ipv4/udp.c.
818 * Return codes: 849 * Return codes:
@@ -922,7 +953,8 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf)
922 return bufp - optr; 953 return bufp - optr;
923} 954}
924 955
925int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, size_t data_len) 956static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
957 size_t data_len)
926{ 958{
927 struct l2tp_tunnel *tunnel = session->tunnel; 959 struct l2tp_tunnel *tunnel = session->tunnel;
928 unsigned int len = skb->len; 960 unsigned int len = skb->len;
@@ -970,7 +1002,6 @@ int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, size_t dat
970 1002
971 return 0; 1003 return 0;
972} 1004}
973EXPORT_SYMBOL_GPL(l2tp_xmit_core);
974 1005
975/* Automatically called when the skb is freed. 1006/* Automatically called when the skb is freed.
976 */ 1007 */
@@ -1089,7 +1120,7 @@ EXPORT_SYMBOL_GPL(l2tp_xmit_skb);
1089 * The tunnel context is deleted only when all session sockets have been 1120 * The tunnel context is deleted only when all session sockets have been
1090 * closed. 1121 * closed.
1091 */ 1122 */
1092void l2tp_tunnel_destruct(struct sock *sk) 1123static void l2tp_tunnel_destruct(struct sock *sk)
1093{ 1124{
1094 struct l2tp_tunnel *tunnel; 1125 struct l2tp_tunnel *tunnel;
1095 1126
@@ -1128,11 +1159,10 @@ void l2tp_tunnel_destruct(struct sock *sk)
1128end: 1159end:
1129 return; 1160 return;
1130} 1161}
1131EXPORT_SYMBOL(l2tp_tunnel_destruct);
1132 1162
1133/* When the tunnel is closed, all the attached sessions need to go too. 1163/* When the tunnel is closed, all the attached sessions need to go too.
1134 */ 1164 */
1135void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) 1165static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel)
1136{ 1166{
1137 int hash; 1167 int hash;
1138 struct hlist_node *walk; 1168 struct hlist_node *walk;
@@ -1193,12 +1223,11 @@ again:
1193 } 1223 }
1194 write_unlock_bh(&tunnel->hlist_lock); 1224 write_unlock_bh(&tunnel->hlist_lock);
1195} 1225}
1196EXPORT_SYMBOL_GPL(l2tp_tunnel_closeall);
1197 1226
1198/* Really kill the tunnel. 1227/* Really kill the tunnel.
1199 * Come here only when all sessions have been cleared from the tunnel. 1228 * Come here only when all sessions have been cleared from the tunnel.
1200 */ 1229 */
1201void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) 1230static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel)
1202{ 1231{
1203 struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); 1232 struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net);
1204 1233
@@ -1217,7 +1246,6 @@ void l2tp_tunnel_free(struct l2tp_tunnel *tunnel)
1217 atomic_dec(&l2tp_tunnel_count); 1246 atomic_dec(&l2tp_tunnel_count);
1218 kfree(tunnel); 1247 kfree(tunnel);
1219} 1248}
1220EXPORT_SYMBOL_GPL(l2tp_tunnel_free);
1221 1249
1222/* Create a socket for the tunnel, if one isn't set up by 1250/* Create a socket for the tunnel, if one isn't set up by
1223 * userspace. This is used for static tunnels where there is no 1251 * userspace. This is used for static tunnels where there is no
@@ -1512,7 +1540,7 @@ EXPORT_SYMBOL_GPL(l2tp_session_delete);
1512/* We come here whenever a session's send_seq, cookie_len or 1540/* We come here whenever a session's send_seq, cookie_len or
1513 * l2specific_len parameters are set. 1541 * l2specific_len parameters are set.
1514 */ 1542 */
1515void l2tp_session_set_header_len(struct l2tp_session *session, int version) 1543static void l2tp_session_set_header_len(struct l2tp_session *session, int version)
1516{ 1544{
1517 if (version == L2TP_HDR_VER_2) { 1545 if (version == L2TP_HDR_VER_2) {
1518 session->hdr_len = 6; 1546 session->hdr_len = 6;
@@ -1525,7 +1553,6 @@ void l2tp_session_set_header_len(struct l2tp_session *session, int version)
1525 } 1553 }
1526 1554
1527} 1555}
1528EXPORT_SYMBOL_GPL(l2tp_session_set_header_len);
1529 1556
1530struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg) 1557struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg)
1531{ 1558{
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index f0f318edd3f1..a16a48e79fab 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -231,48 +231,15 @@ extern int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_i
231extern int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); 231extern int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel);
232extern struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); 232extern struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg);
233extern int l2tp_session_delete(struct l2tp_session *session); 233extern int l2tp_session_delete(struct l2tp_session *session);
234extern void l2tp_tunnel_free(struct l2tp_tunnel *tunnel);
235extern void l2tp_session_free(struct l2tp_session *session); 234extern void l2tp_session_free(struct l2tp_session *session);
236extern void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, int length, int (*payload_hook)(struct sk_buff *skb)); 235extern void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, int length, int (*payload_hook)(struct sk_buff *skb));
237extern int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, int (*payload_hook)(struct sk_buff *skb));
238extern int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb); 236extern int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb);
239 237
240extern int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, size_t data_len);
241extern int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len); 238extern int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len);
242extern void l2tp_tunnel_destruct(struct sock *sk);
243extern void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel);
244extern void l2tp_session_set_header_len(struct l2tp_session *session, int version);
245 239
246extern int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops); 240extern int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops);
247extern void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); 241extern void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type);
248 242
249/* Tunnel reference counts. Incremented per session that is added to
250 * the tunnel.
251 */
252static inline void l2tp_tunnel_inc_refcount_1(struct l2tp_tunnel *tunnel)
253{
254 atomic_inc(&tunnel->ref_count);
255}
256
257static inline void l2tp_tunnel_dec_refcount_1(struct l2tp_tunnel *tunnel)
258{
259 if (atomic_dec_and_test(&tunnel->ref_count))
260 l2tp_tunnel_free(tunnel);
261}
262#ifdef L2TP_REFCNT_DEBUG
263#define l2tp_tunnel_inc_refcount(_t) do { \
264 printk(KERN_DEBUG "l2tp_tunnel_inc_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_t)->name, atomic_read(&_t->ref_count)); \
265 l2tp_tunnel_inc_refcount_1(_t); \
266 } while (0)
267#define l2tp_tunnel_dec_refcount(_t) do { \
268 printk(KERN_DEBUG "l2tp_tunnel_dec_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_t)->name, atomic_read(&_t->ref_count)); \
269 l2tp_tunnel_dec_refcount_1(_t); \
270 } while (0)
271#else
272#define l2tp_tunnel_inc_refcount(t) l2tp_tunnel_inc_refcount_1(t)
273#define l2tp_tunnel_dec_refcount(t) l2tp_tunnel_dec_refcount_1(t)
274#endif
275
276/* Session reference counts. Incremented when code obtains a reference 243/* Session reference counts. Incremented when code obtains a reference
277 * to a session. 244 * to a session.
278 */ 245 */
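The l2tp_core changes are a visibility cleanup: l2tp_udp_recv_core(), l2tp_xmit_core(), l2tp_tunnel_destruct(), l2tp_tunnel_closeall(), l2tp_tunnel_free() and l2tp_session_set_header_len() all become static and lose their EXPORT_SYMBOL entries, and the tunnel refcount helpers move out of l2tp_core.h into the .c file so they can keep calling the now-static l2tp_tunnel_free(); forward declarations near the top of the file resolve the ordering. The skeleton of that arrangement, reduced to a toy object (struct foo and its helpers are illustrative):

#include <linux/atomic.h>
#include <linux/slab.h>

struct foo {
        atomic_t refcnt;
        /* ... */
};

/* forward declaration: foo_put() is defined before foo_free() */
static void foo_free(struct foo *f);

static inline void foo_put(struct foo *f)
{
        /* atomic_dec_and_test() is true for exactly one caller, the one
         * that drops the count to zero and therefore owns the free */
        if (atomic_dec_and_test(&f->refcnt))
                foo_free(f);
}

static void foo_free(struct foo *f)
{
        kfree(f);
}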
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index 58c6c4cda73b..8d9ce0accc98 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -132,7 +132,7 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb,
132 printk("\n"); 132 printk("\n");
133 } 133 }
134 134
135 if (data_len < ETH_HLEN) 135 if (!pskb_may_pull(skb, sizeof(ETH_HLEN)))
136 goto error; 136 goto error;
137 137
138 secpath_reset(skb); 138 secpath_reset(skb);
@@ -144,7 +144,6 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb,
144 nf_reset(skb); 144 nf_reset(skb);
145 145
146 if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS) { 146 if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS) {
147 dev->last_rx = jiffies;
148 dev->stats.rx_packets++; 147 dev->stats.rx_packets++;
149 dev->stats.rx_bytes += data_len; 148 dev->stats.rx_bytes += data_len;
150 } else 149 } else
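In l2tp_eth_dev_recv() the plain data_len < ETH_HLEN comparison turns into a pskb_may_pull() call, which not only checks that enough bytes exist but also makes sure a full Ethernet header sits in the skb's linear area before eth_type_trans() reads it; the dev->last_rx update is dropped as unnecessary. Note that pskb_may_pull() takes a byte count. A sketch of the usual form (foo_recv() is illustrative):

#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/skbuff.h>

static int foo_recv(struct net_device *dev, struct sk_buff *skb)
{
        /* a complete Ethernet header (ETH_HLEN == 14 bytes) must be in the
         * linear area before we parse it; pskb_may_pull() pulls it in from
         * the fragments if necessary and fails if the packet is too short */
        if (!pskb_may_pull(skb, ETH_HLEN)) {
                kfree_skb(skb);
                return -EINVAL;
        }

        skb->protocol = eth_type_trans(skb, dev);
        return netif_rx(skb);
}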
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 226a0ae3bcfd..0bf6a59545ab 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -65,9 +65,7 @@ static struct sock *__l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif
65 continue; 65 continue;
66 66
67 if ((l2tp->conn_id == tunnel_id) && 67 if ((l2tp->conn_id == tunnel_id) &&
68#ifdef CONFIG_NET_NS 68 net_eq(sock_net(sk), net) &&
69 (sk->sk_net == net) &&
70#endif
71 !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && 69 !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
72 !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) 70 !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
73 goto found; 71 goto found;
@@ -578,7 +576,7 @@ out:
578 return copied; 576 return copied;
579} 577}
580 578
581struct proto l2tp_ip_prot = { 579static struct proto l2tp_ip_prot = {
582 .name = "L2TP/IP", 580 .name = "L2TP/IP",
583 .owner = THIS_MODULE, 581 .owner = THIS_MODULE,
584 .init = l2tp_ip_open, 582 .init = l2tp_ip_open,
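The bind lookup in l2tp_ip.c swaps the open-coded, CONFIG_NET_NS-guarded sk->sk_net comparison for net_eq(sock_net(sk), net): sock_net() fetches the socket's namespace and net_eq() compiles down to "always true" when CONFIG_NET_NS is off, so the #ifdef disappears from the call site. l2tp_ip_prot is also made static, since nothing outside this file refers to it. The comparison on its own (sk_in_net() is a made-up wrapper):

#include <linux/types.h>
#include <net/net_namespace.h>
#include <net/sock.h>

static bool sk_in_net(const struct sock *sk, const struct net *net)
{
        /* true when the socket belongs to the given network namespace;
         * with CONFIG_NET_NS=n the test optimises away to "true" */
        return net_eq(sock_net(sk), net);
}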
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index ff954b3e94b6..39a21d0c61c4 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1768,7 +1768,7 @@ static const struct proto_ops pppol2tp_ops = {
1768 .ioctl = pppox_ioctl, 1768 .ioctl = pppox_ioctl,
1769}; 1769};
1770 1770
1771static struct pppox_proto pppol2tp_proto = { 1771static const struct pppox_proto pppol2tp_proto = {
1772 .create = pppol2tp_create, 1772 .create = pppol2tp_create,
1773 .ioctl = pppol2tp_ioctl 1773 .ioctl = pppol2tp_ioctl
1774}; 1774};
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 023ba820236f..582612998211 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -1024,7 +1024,8 @@ static int llc_ui_setsockopt(struct socket *sock, int level, int optname,
1024{ 1024{
1025 struct sock *sk = sock->sk; 1025 struct sock *sk = sock->sk;
1026 struct llc_sock *llc = llc_sk(sk); 1026 struct llc_sock *llc = llc_sk(sk);
1027 int rc = -EINVAL, opt; 1027 unsigned int opt;
1028 int rc = -EINVAL;
1028 1029
1029 lock_sock(sk); 1030 lock_sock(sk);
1030 if (unlikely(level != SOL_LLC || optlen != sizeof(int))) 1031 if (unlikely(level != SOL_LLC || optlen != sizeof(int)))
diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c
index e4dae0244d76..cf4aea3ba30f 100644
--- a/net/llc/llc_station.c
+++ b/net/llc/llc_station.c
@@ -689,7 +689,7 @@ static void llc_station_rcv(struct sk_buff *skb)
689 689
690int __init llc_station_init(void) 690int __init llc_station_init(void)
691{ 691{
692 u16 rc = -ENOBUFS; 692 int rc = -ENOBUFS;
693 struct sk_buff *skb; 693 struct sk_buff *skb;
694 struct llc_station_state_ev *ev; 694 struct llc_station_state_ev *ev;
695 695
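Two small type fixes on the LLC side: llc_ui_setsockopt() reads the option value into an unsigned int before range-checking it, and llc_station_init() stops keeping -ENOBUFS in a u16, where the negative errno wraps to a large positive number, so a later rc < 0 test can never fire and the value handed back to the caller is not a valid negative errno. A short userspace demonstration of the wrap:

#include <errno.h>
#include <stdio.h>

int main(void)
{
        unsigned short rc16 = -ENOBUFS; /* wraps to 65536 - ENOBUFS */
        int rc = -ENOBUFS;

        /* rc16 prints as a large positive value, so "rc16 < 0" is never
         * true and the error code is effectively lost */
        printf("u16 rc = %u, int rc = %d\n", rc16, rc);
        return 0;
}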
diff --git a/net/mac80211/aes_ccm.c b/net/mac80211/aes_ccm.c
index a87cb3ba2df6..d2b03e0851ef 100644
--- a/net/mac80211/aes_ccm.c
+++ b/net/mac80211/aes_ccm.c
@@ -138,10 +138,8 @@ struct crypto_cipher *ieee80211_aes_key_setup_encrypt(const u8 key[])
138 struct crypto_cipher *tfm; 138 struct crypto_cipher *tfm;
139 139
140 tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); 140 tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
141 if (IS_ERR(tfm)) 141 if (!IS_ERR(tfm))
142 return NULL; 142 crypto_cipher_setkey(tfm, key, ALG_CCMP_KEY_LEN);
143
144 crypto_cipher_setkey(tfm, key, ALG_CCMP_KEY_LEN);
145 143
146 return tfm; 144 return tfm;
147} 145}
diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c
index 3d097b3d7b62..b4d66cca76d6 100644
--- a/net/mac80211/aes_cmac.c
+++ b/net/mac80211/aes_cmac.c
@@ -119,10 +119,8 @@ struct crypto_cipher * ieee80211_aes_cmac_key_setup(const u8 key[])
119 struct crypto_cipher *tfm; 119 struct crypto_cipher *tfm;
120 120
121 tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); 121 tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
122 if (IS_ERR(tfm)) 122 if (!IS_ERR(tfm))
123 return NULL; 123 crypto_cipher_setkey(tfm, key, AES_CMAC_KEY_LEN);
124
125 crypto_cipher_setkey(tfm, key, AES_CMAC_KEY_LEN);
126 124
127 return tfm; 125 return tfm;
128} 126}
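crypto_alloc_cipher() reports failure with ERR_PTR(), never NULL, so the old "if (IS_ERR(tfm)) return NULL;" in the CCMP and CMAC key-setup helpers threw the error code away; after the change the helpers return the ERR_PTR unchanged and only call crypto_cipher_setkey() on success, which matches the IS_ERR(sdata->local->wep_tx_tfm) test visible in the cfg.c hunk further down. Roughly (aes_setup() is a made-up name):

#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/types.h>

static struct crypto_cipher *aes_setup(const u8 *key, unsigned int keylen)
{
        struct crypto_cipher *tfm;

        tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
        if (!IS_ERR(tfm))
                crypto_cipher_setkey(tfm, key, keylen);

        /* on failure this is ERR_PTR(-err), not NULL: callers test IS_ERR() */
        return tfm;
}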
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 965b272499fd..720b7a84af59 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -56,7 +56,7 @@ static void ieee80211_free_tid_rx(struct rcu_head *h)
56} 56}
57 57
58void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, 58void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
59 u16 initiator, u16 reason) 59 u16 initiator, u16 reason, bool tx)
60{ 60{
61 struct ieee80211_local *local = sta->local; 61 struct ieee80211_local *local = sta->local;
62 struct tid_ampdu_rx *tid_rx; 62 struct tid_ampdu_rx *tid_rx;
@@ -81,20 +81,21 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
81 "aggregation for tid %d\n", tid); 81 "aggregation for tid %d\n", tid);
82 82
83 /* check if this is a self generated aggregation halt */ 83 /* check if this is a self generated aggregation halt */
84 if (initiator == WLAN_BACK_RECIPIENT) 84 if (initiator == WLAN_BACK_RECIPIENT && tx)
85 ieee80211_send_delba(sta->sdata, sta->sta.addr, 85 ieee80211_send_delba(sta->sdata, sta->sta.addr,
86 tid, 0, reason); 86 tid, 0, reason);
87 87
88 del_timer_sync(&tid_rx->session_timer); 88 del_timer_sync(&tid_rx->session_timer);
89 del_timer_sync(&tid_rx->reorder_timer);
89 90
90 call_rcu(&tid_rx->rcu_head, ieee80211_free_tid_rx); 91 call_rcu(&tid_rx->rcu_head, ieee80211_free_tid_rx);
91} 92}
92 93
93void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, 94void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
94 u16 initiator, u16 reason) 95 u16 initiator, u16 reason, bool tx)
95{ 96{
96 mutex_lock(&sta->ampdu_mlme.mtx); 97 mutex_lock(&sta->ampdu_mlme.mtx);
97 ___ieee80211_stop_rx_ba_session(sta, tid, initiator, reason); 98 ___ieee80211_stop_rx_ba_session(sta, tid, initiator, reason, tx);
98 mutex_unlock(&sta->ampdu_mlme.mtx); 99 mutex_unlock(&sta->ampdu_mlme.mtx);
99} 100}
100 101
@@ -120,6 +121,20 @@ static void sta_rx_agg_session_timer_expired(unsigned long data)
120 ieee80211_queue_work(&sta->local->hw, &sta->ampdu_mlme.work); 121 ieee80211_queue_work(&sta->local->hw, &sta->ampdu_mlme.work);
121} 122}
122 123
124static void sta_rx_agg_reorder_timer_expired(unsigned long data)
125{
126 u8 *ptid = (u8 *)data;
127 u8 *timer_to_id = ptid - *ptid;
128 struct sta_info *sta = container_of(timer_to_id, struct sta_info,
129 timer_to_tid[0]);
130
131 rcu_read_lock();
132 spin_lock(&sta->lock);
133 ieee80211_release_reorder_timeout(sta, *ptid);
134 spin_unlock(&sta->lock);
135 rcu_read_unlock();
136}
137
123static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid, 138static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid,
124 u8 dialog_token, u16 status, u16 policy, 139 u8 dialog_token, u16 status, u16 policy,
125 u16 buf_size, u16 timeout) 140 u16 buf_size, u16 timeout)
@@ -251,11 +266,18 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
251 goto end; 266 goto end;
252 } 267 }
253 268
269 spin_lock_init(&tid_agg_rx->reorder_lock);
270
254 /* rx timer */ 271 /* rx timer */
255 tid_agg_rx->session_timer.function = sta_rx_agg_session_timer_expired; 272 tid_agg_rx->session_timer.function = sta_rx_agg_session_timer_expired;
256 tid_agg_rx->session_timer.data = (unsigned long)&sta->timer_to_tid[tid]; 273 tid_agg_rx->session_timer.data = (unsigned long)&sta->timer_to_tid[tid];
257 init_timer(&tid_agg_rx->session_timer); 274 init_timer(&tid_agg_rx->session_timer);
258 275
276 /* rx reorder timer */
277 tid_agg_rx->reorder_timer.function = sta_rx_agg_reorder_timer_expired;
278 tid_agg_rx->reorder_timer.data = (unsigned long)&sta->timer_to_tid[tid];
279 init_timer(&tid_agg_rx->reorder_timer);
280
259 /* prepare reordering buffer */ 281 /* prepare reordering buffer */
260 tid_agg_rx->reorder_buf = 282 tid_agg_rx->reorder_buf =
261 kcalloc(buf_size, sizeof(struct sk_buff *), GFP_ATOMIC); 283 kcalloc(buf_size, sizeof(struct sk_buff *), GFP_ATOMIC);
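The new per-TID reorder timer reuses the same addressing trick as the existing session timer to recover both the station and the TID from the single unsigned long a timer callback receives: sta->timer_to_tid[] is an array whose slot n permanently holds the value n, the timer's data points at one slot, and the callback subtracts the stored value to land on slot 0, from which container_of() yields the sta_info. The same dance on a toy structure (struct foo and the array size are illustrative):

#include <linux/kernel.h>
#include <linux/types.h>

struct foo {
        /* ... */
        u8 timer_to_tid[16];    /* slot n permanently holds the value n */
};

static void foo_tid_timer_expired(unsigned long data)
{
        u8 *ptid = (u8 *)data;                  /* &f->timer_to_tid[tid] */
        u8 *base = ptid - *ptid;                /* *ptid == tid, so: slot 0 */
        struct foo *f = container_of(base, struct foo, timer_to_tid[0]);
        u8 tid = *ptid;

        /* both f and tid recovered from one unsigned long cookie */
        (void)f;
        (void)tid;
}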
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index c893f236acea..d4679b265ba8 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -145,7 +145,8 @@ static void kfree_tid_tx(struct rcu_head *rcu_head)
145} 145}
146 146
147int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, 147int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
148 enum ieee80211_back_parties initiator) 148 enum ieee80211_back_parties initiator,
149 bool tx)
149{ 150{
150 struct ieee80211_local *local = sta->local; 151 struct ieee80211_local *local = sta->local;
151 struct tid_ampdu_tx *tid_tx = sta->ampdu_mlme.tid_tx[tid]; 152 struct tid_ampdu_tx *tid_tx = sta->ampdu_mlme.tid_tx[tid];
@@ -175,6 +176,8 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
175 176
176 set_bit(HT_AGG_STATE_STOPPING, &tid_tx->state); 177 set_bit(HT_AGG_STATE_STOPPING, &tid_tx->state);
177 178
179 del_timer_sync(&tid_tx->addba_resp_timer);
180
178 /* 181 /*
179 * After this packets are no longer handed right through 182 * After this packets are no longer handed right through
180 * to the driver but are put onto tid_tx->pending instead, 183 * to the driver but are put onto tid_tx->pending instead,
@@ -183,6 +186,7 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
183 clear_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state); 186 clear_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state);
184 187
185 tid_tx->stop_initiator = initiator; 188 tid_tx->stop_initiator = initiator;
189 tid_tx->tx_stop = tx;
186 190
187 ret = drv_ampdu_action(local, sta->sdata, 191 ret = drv_ampdu_action(local, sta->sdata,
188 IEEE80211_AMPDU_TX_STOP, 192 IEEE80211_AMPDU_TX_STOP,
@@ -575,13 +579,14 @@ void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif,
575EXPORT_SYMBOL(ieee80211_start_tx_ba_cb_irqsafe); 579EXPORT_SYMBOL(ieee80211_start_tx_ba_cb_irqsafe);
576 580
577int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, 581int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
578 enum ieee80211_back_parties initiator) 582 enum ieee80211_back_parties initiator,
583 bool tx)
579{ 584{
580 int ret; 585 int ret;
581 586
582 mutex_lock(&sta->ampdu_mlme.mtx); 587 mutex_lock(&sta->ampdu_mlme.mtx);
583 588
584 ret = ___ieee80211_stop_tx_ba_session(sta, tid, initiator); 589 ret = ___ieee80211_stop_tx_ba_session(sta, tid, initiator, tx);
585 590
586 mutex_unlock(&sta->ampdu_mlme.mtx); 591 mutex_unlock(&sta->ampdu_mlme.mtx);
587 592
@@ -670,7 +675,7 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid)
670 goto unlock_sta; 675 goto unlock_sta;
671 } 676 }
672 677
673 if (tid_tx->stop_initiator == WLAN_BACK_INITIATOR) 678 if (tid_tx->stop_initiator == WLAN_BACK_INITIATOR && tid_tx->tx_stop)
674 ieee80211_send_delba(sta->sdata, ra, tid, 679 ieee80211_send_delba(sta->sdata, ra, tid,
675 WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE); 680 WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE);
676 681
@@ -770,7 +775,8 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
770 775
771 sta->ampdu_mlme.addba_req_num[tid] = 0; 776 sta->ampdu_mlme.addba_req_num[tid] = 0;
772 } else { 777 } else {
773 ___ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_INITIATOR); 778 ___ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_INITIATOR,
779 true);
774 } 780 }
775 781
776 out: 782 out:
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 29ac8e1a509e..18bd0e550600 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -19,33 +19,6 @@
19#include "rate.h" 19#include "rate.h"
20#include "mesh.h" 20#include "mesh.h"
21 21
22static bool nl80211_type_check(enum nl80211_iftype type)
23{
24 switch (type) {
25 case NL80211_IFTYPE_ADHOC:
26 case NL80211_IFTYPE_STATION:
27 case NL80211_IFTYPE_MONITOR:
28#ifdef CONFIG_MAC80211_MESH
29 case NL80211_IFTYPE_MESH_POINT:
30#endif
31 case NL80211_IFTYPE_AP:
32 case NL80211_IFTYPE_AP_VLAN:
33 case NL80211_IFTYPE_WDS:
34 return true;
35 default:
36 return false;
37 }
38}
39
40static bool nl80211_params_check(enum nl80211_iftype type,
41 struct vif_params *params)
42{
43 if (!nl80211_type_check(type))
44 return false;
45
46 return true;
47}
48
49static int ieee80211_add_iface(struct wiphy *wiphy, char *name, 22static int ieee80211_add_iface(struct wiphy *wiphy, char *name,
50 enum nl80211_iftype type, u32 *flags, 23 enum nl80211_iftype type, u32 *flags,
51 struct vif_params *params) 24 struct vif_params *params)
@@ -55,9 +28,6 @@ static int ieee80211_add_iface(struct wiphy *wiphy, char *name,
55 struct ieee80211_sub_if_data *sdata; 28 struct ieee80211_sub_if_data *sdata;
56 int err; 29 int err;
57 30
58 if (!nl80211_params_check(type, params))
59 return -EINVAL;
60
61 err = ieee80211_if_add(local, name, &dev, type, params); 31 err = ieee80211_if_add(local, name, &dev, type, params);
62 if (err || type != NL80211_IFTYPE_MONITOR || !flags) 32 if (err || type != NL80211_IFTYPE_MONITOR || !flags)
63 return err; 33 return err;
@@ -82,12 +52,6 @@ static int ieee80211_change_iface(struct wiphy *wiphy,
82 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); 52 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
83 int ret; 53 int ret;
84 54
85 if (ieee80211_sdata_running(sdata))
86 return -EBUSY;
87
88 if (!nl80211_params_check(type, params))
89 return -EINVAL;
90
91 ret = ieee80211_if_change_type(sdata, type); 55 ret = ieee80211_if_change_type(sdata, type);
92 if (ret) 56 if (ret)
93 return ret; 57 return ret;
@@ -104,54 +68,71 @@ static int ieee80211_change_iface(struct wiphy *wiphy,
104 params && params->use_4addr >= 0) 68 params && params->use_4addr >= 0)
105 sdata->u.mgd.use_4addr = params->use_4addr; 69 sdata->u.mgd.use_4addr = params->use_4addr;
106 70
107 if (sdata->vif.type == NL80211_IFTYPE_MONITOR && flags) 71 if (sdata->vif.type == NL80211_IFTYPE_MONITOR && flags) {
108 sdata->u.mntr_flags = *flags; 72 struct ieee80211_local *local = sdata->local;
73
74 if (ieee80211_sdata_running(sdata)) {
75 /*
76 * Prohibit MONITOR_FLAG_COOK_FRAMES to be
77 * changed while the interface is up.
78 * Else we would need to add a lot of cruft
79 * to update everything:
80 * cooked_mntrs, monitor and all fif_* counters
81 * reconfigure hardware
82 */
83 if ((*flags & MONITOR_FLAG_COOK_FRAMES) !=
84 (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES))
85 return -EBUSY;
86
87 ieee80211_adjust_monitor_flags(sdata, -1);
88 sdata->u.mntr_flags = *flags;
89 ieee80211_adjust_monitor_flags(sdata, 1);
90
91 ieee80211_configure_filter(local);
92 } else {
93 /*
94 * Because the interface is down, ieee80211_do_stop
95 * and ieee80211_do_open take care of "everything"
96 * mentioned in the comment above.
97 */
98 sdata->u.mntr_flags = *flags;
99 }
100 }
109 101
110 return 0; 102 return 0;
111} 103}
112 104
113static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, 105static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
114 u8 key_idx, const u8 *mac_addr, 106 u8 key_idx, bool pairwise, const u8 *mac_addr,
115 struct key_params *params) 107 struct key_params *params)
116{ 108{
117 struct ieee80211_sub_if_data *sdata; 109 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
118 struct sta_info *sta = NULL; 110 struct sta_info *sta = NULL;
119 enum ieee80211_key_alg alg;
120 struct ieee80211_key *key; 111 struct ieee80211_key *key;
121 int err; 112 int err;
122 113
123 if (!netif_running(dev)) 114 if (!ieee80211_sdata_running(sdata))
124 return -ENETDOWN; 115 return -ENETDOWN;
125 116
126 sdata = IEEE80211_DEV_TO_SUB_IF(dev); 117 /* reject WEP and TKIP keys if WEP failed to initialize */
127
128 switch (params->cipher) { 118 switch (params->cipher) {
129 case WLAN_CIPHER_SUITE_WEP40: 119 case WLAN_CIPHER_SUITE_WEP40:
130 case WLAN_CIPHER_SUITE_WEP104:
131 alg = ALG_WEP;
132 break;
133 case WLAN_CIPHER_SUITE_TKIP: 120 case WLAN_CIPHER_SUITE_TKIP:
134 alg = ALG_TKIP; 121 case WLAN_CIPHER_SUITE_WEP104:
135 break; 122 if (IS_ERR(sdata->local->wep_tx_tfm))
136 case WLAN_CIPHER_SUITE_CCMP: 123 return -EINVAL;
137 alg = ALG_CCMP;
138 break;
139 case WLAN_CIPHER_SUITE_AES_CMAC:
140 alg = ALG_AES_CMAC;
141 break; 124 break;
142 default: 125 default:
143 return -EINVAL; 126 break;
144 } 127 }
145 128
146 /* reject WEP and TKIP keys if WEP failed to initialize */ 129 key = ieee80211_key_alloc(params->cipher, key_idx, params->key_len,
147 if ((alg == ALG_WEP || alg == ALG_TKIP) && 130 params->key, params->seq_len, params->seq);
148 IS_ERR(sdata->local->wep_tx_tfm)) 131 if (IS_ERR(key))
149 return -EINVAL; 132 return PTR_ERR(key);
150 133
151 key = ieee80211_key_alloc(alg, key_idx, params->key_len, params->key, 134 if (pairwise)
152 params->seq_len, params->seq); 135 key->conf.flags |= IEEE80211_KEY_FLAG_PAIRWISE;
153 if (!key)
154 return -ENOMEM;
155 136
156 mutex_lock(&sdata->local->sta_mtx); 137 mutex_lock(&sdata->local->sta_mtx);
157 138
@@ -164,9 +145,10 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
164 } 145 }
165 } 146 }
166 147
167 ieee80211_key_link(key, sdata, sta); 148 err = ieee80211_key_link(key, sdata, sta);
149 if (err)
150 ieee80211_key_free(sdata->local, key);
168 151
169 err = 0;
170 out_unlock: 152 out_unlock:
171 mutex_unlock(&sdata->local->sta_mtx); 153 mutex_unlock(&sdata->local->sta_mtx);
172 154
@@ -174,7 +156,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
174} 156}
175 157
176static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev, 158static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev,
177 u8 key_idx, const u8 *mac_addr) 159 u8 key_idx, bool pairwise, const u8 *mac_addr)
178{ 160{
179 struct ieee80211_sub_if_data *sdata; 161 struct ieee80211_sub_if_data *sdata;
180 struct sta_info *sta; 162 struct sta_info *sta;
@@ -191,10 +173,17 @@ static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev,
191 if (!sta) 173 if (!sta)
192 goto out_unlock; 174 goto out_unlock;
193 175
194 if (sta->key) { 176 if (pairwise) {
195 ieee80211_key_free(sdata->local, sta->key); 177 if (sta->ptk) {
196 WARN_ON(sta->key); 178 ieee80211_key_free(sdata->local, sta->ptk);
197 ret = 0; 179 ret = 0;
180 }
181 } else {
182 if (sta->gtk[key_idx]) {
183 ieee80211_key_free(sdata->local,
184 sta->gtk[key_idx]);
185 ret = 0;
186 }
198 } 187 }
199 188
200 goto out_unlock; 189 goto out_unlock;
@@ -216,7 +205,8 @@ static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev,
216} 205}
217 206
218static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, 207static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
219 u8 key_idx, const u8 *mac_addr, void *cookie, 208 u8 key_idx, bool pairwise, const u8 *mac_addr,
209 void *cookie,
220 void (*callback)(void *cookie, 210 void (*callback)(void *cookie,
221 struct key_params *params)) 211 struct key_params *params))
222{ 212{
@@ -224,7 +214,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
224 struct sta_info *sta = NULL; 214 struct sta_info *sta = NULL;
225 u8 seq[6] = {0}; 215 u8 seq[6] = {0};
226 struct key_params params; 216 struct key_params params;
227 struct ieee80211_key *key; 217 struct ieee80211_key *key = NULL;
228 u32 iv32; 218 u32 iv32;
229 u16 iv16; 219 u16 iv16;
230 int err = -ENOENT; 220 int err = -ENOENT;
@@ -238,7 +228,10 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
238 if (!sta) 228 if (!sta)
239 goto out; 229 goto out;
240 230
241 key = sta->key; 231 if (pairwise)
232 key = sta->ptk;
233 else if (key_idx < NUM_DEFAULT_KEYS)
234 key = sta->gtk[key_idx];
242 } else 235 } else
243 key = sdata->keys[key_idx]; 236 key = sdata->keys[key_idx];
244 237
@@ -247,10 +240,10 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
247 240
248 memset(&params, 0, sizeof(params)); 241 memset(&params, 0, sizeof(params));
249 242
250 switch (key->conf.alg) { 243 params.cipher = key->conf.cipher;
251 case ALG_TKIP:
252 params.cipher = WLAN_CIPHER_SUITE_TKIP;
253 244
245 switch (key->conf.cipher) {
246 case WLAN_CIPHER_SUITE_TKIP:
254 iv32 = key->u.tkip.tx.iv32; 247 iv32 = key->u.tkip.tx.iv32;
255 iv16 = key->u.tkip.tx.iv16; 248 iv16 = key->u.tkip.tx.iv16;
256 249
@@ -268,8 +261,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
268 params.seq = seq; 261 params.seq = seq;
269 params.seq_len = 6; 262 params.seq_len = 6;
270 break; 263 break;
271 case ALG_CCMP: 264 case WLAN_CIPHER_SUITE_CCMP:
272 params.cipher = WLAN_CIPHER_SUITE_CCMP;
273 seq[0] = key->u.ccmp.tx_pn[5]; 265 seq[0] = key->u.ccmp.tx_pn[5];
274 seq[1] = key->u.ccmp.tx_pn[4]; 266 seq[1] = key->u.ccmp.tx_pn[4];
275 seq[2] = key->u.ccmp.tx_pn[3]; 267 seq[2] = key->u.ccmp.tx_pn[3];
@@ -279,14 +271,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
279 params.seq = seq; 271 params.seq = seq;
280 params.seq_len = 6; 272 params.seq_len = 6;
281 break; 273 break;
282 case ALG_WEP: 274 case WLAN_CIPHER_SUITE_AES_CMAC:
283 if (key->conf.keylen == 5)
284 params.cipher = WLAN_CIPHER_SUITE_WEP40;
285 else
286 params.cipher = WLAN_CIPHER_SUITE_WEP104;
287 break;
288 case ALG_AES_CMAC:
289 params.cipher = WLAN_CIPHER_SUITE_AES_CMAC;
290 seq[0] = key->u.aes_cmac.tx_pn[5]; 275 seq[0] = key->u.aes_cmac.tx_pn[5];
291 seq[1] = key->u.aes_cmac.tx_pn[4]; 276 seq[1] = key->u.aes_cmac.tx_pn[4];
292 seq[2] = key->u.aes_cmac.tx_pn[3]; 277 seq[2] = key->u.aes_cmac.tx_pn[3];
@@ -342,13 +327,19 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
342 STATION_INFO_TX_BYTES | 327 STATION_INFO_TX_BYTES |
343 STATION_INFO_RX_PACKETS | 328 STATION_INFO_RX_PACKETS |
344 STATION_INFO_TX_PACKETS | 329 STATION_INFO_TX_PACKETS |
345 STATION_INFO_TX_BITRATE; 330 STATION_INFO_TX_RETRIES |
331 STATION_INFO_TX_FAILED |
332 STATION_INFO_TX_BITRATE |
333 STATION_INFO_RX_DROP_MISC;
346 334
347 sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx); 335 sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx);
348 sinfo->rx_bytes = sta->rx_bytes; 336 sinfo->rx_bytes = sta->rx_bytes;
349 sinfo->tx_bytes = sta->tx_bytes; 337 sinfo->tx_bytes = sta->tx_bytes;
350 sinfo->rx_packets = sta->rx_packets; 338 sinfo->rx_packets = sta->rx_packets;
351 sinfo->tx_packets = sta->tx_packets; 339 sinfo->tx_packets = sta->tx_packets;
340 sinfo->tx_retries = sta->tx_retry_count;
341 sinfo->tx_failed = sta->tx_retry_failed;
342 sinfo->rx_dropped_misc = sta->rx_dropped;
352 343
353 if ((sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) || 344 if ((sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) ||
354 (sta->local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)) { 345 (sta->local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)) {
@@ -634,6 +625,7 @@ static void sta_apply_parameters(struct ieee80211_local *local,
634 struct sta_info *sta, 625 struct sta_info *sta,
635 struct station_parameters *params) 626 struct station_parameters *params)
636{ 627{
628 unsigned long flags;
637 u32 rates; 629 u32 rates;
638 int i, j; 630 int i, j;
639 struct ieee80211_supported_band *sband; 631 struct ieee80211_supported_band *sband;
@@ -642,7 +634,7 @@ static void sta_apply_parameters(struct ieee80211_local *local,
642 634
643 sband = local->hw.wiphy->bands[local->oper_channel->band]; 635 sband = local->hw.wiphy->bands[local->oper_channel->band];
644 636
645 spin_lock_bh(&sta->lock); 637 spin_lock_irqsave(&sta->flaglock, flags);
646 mask = params->sta_flags_mask; 638 mask = params->sta_flags_mask;
647 set = params->sta_flags_set; 639 set = params->sta_flags_set;
648 640
@@ -669,7 +661,7 @@ static void sta_apply_parameters(struct ieee80211_local *local,
669 if (set & BIT(NL80211_STA_FLAG_MFP)) 661 if (set & BIT(NL80211_STA_FLAG_MFP))
670 sta->flags |= WLAN_STA_MFP; 662 sta->flags |= WLAN_STA_MFP;
671 } 663 }
672 spin_unlock_bh(&sta->lock); 664 spin_unlock_irqrestore(&sta->flaglock, flags);
673 665
674 /* 666 /*
675 * cfg80211 validates this (1-2007) and allows setting the AID 667 * cfg80211 validates this (1-2007) and allows setting the AID
@@ -1143,9 +1135,9 @@ static int ieee80211_set_txq_params(struct wiphy *wiphy,
1143 p.uapsd = false; 1135 p.uapsd = false;
1144 1136
1145 if (drv_conf_tx(local, params->queue, &p)) { 1137 if (drv_conf_tx(local, params->queue, &p)) {
1146 printk(KERN_DEBUG "%s: failed to set TX queue " 1138 wiphy_debug(local->hw.wiphy,
1147 "parameters for queue %d\n", 1139 "failed to set TX queue parameters for queue %d\n",
1148 wiphy_name(local->hw.wiphy), params->queue); 1140 params->queue);
1149 return -EINVAL; 1141 return -EINVAL;
1150 } 1142 }
1151 1143
@@ -1207,15 +1199,26 @@ static int ieee80211_scan(struct wiphy *wiphy,
1207 struct net_device *dev, 1199 struct net_device *dev,
1208 struct cfg80211_scan_request *req) 1200 struct cfg80211_scan_request *req)
1209{ 1201{
1210 struct ieee80211_sub_if_data *sdata; 1202 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
1211
1212 sdata = IEEE80211_DEV_TO_SUB_IF(dev);
1213 1203
1214 if (sdata->vif.type != NL80211_IFTYPE_STATION && 1204 switch (ieee80211_vif_type_p2p(&sdata->vif)) {
1215 sdata->vif.type != NL80211_IFTYPE_ADHOC && 1205 case NL80211_IFTYPE_STATION:
1216 sdata->vif.type != NL80211_IFTYPE_MESH_POINT && 1206 case NL80211_IFTYPE_ADHOC:
1217 (sdata->vif.type != NL80211_IFTYPE_AP || sdata->u.ap.beacon)) 1207 case NL80211_IFTYPE_MESH_POINT:
1208 case NL80211_IFTYPE_P2P_CLIENT:
1209 break;
1210 case NL80211_IFTYPE_P2P_GO:
1211 if (sdata->local->ops->hw_scan)
1212 break;
1213 /* FIXME: implement NoA while scanning in software */
1214 return -EOPNOTSUPP;
1215 case NL80211_IFTYPE_AP:
1216 if (sdata->u.ap.beacon)
1217 return -EOPNOTSUPP;
1218 break;
1219 default:
1218 return -EOPNOTSUPP; 1220 return -EOPNOTSUPP;
1221 }
1219 1222
1220 return ieee80211_request_scan(sdata, req); 1223 return ieee80211_request_scan(sdata, req);
1221} 1224}
@@ -1362,7 +1365,7 @@ static int ieee80211_get_tx_power(struct wiphy *wiphy, int *dbm)
1362} 1365}
1363 1366
1364static int ieee80211_set_wds_peer(struct wiphy *wiphy, struct net_device *dev, 1367static int ieee80211_set_wds_peer(struct wiphy *wiphy, struct net_device *dev,
1365 u8 *addr) 1368 const u8 *addr)
1366{ 1369{
1367 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); 1370 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
1368 1371
@@ -1411,7 +1414,7 @@ int __ieee80211_request_smps(struct ieee80211_sub_if_data *sdata,
1411 if (!sdata->u.mgd.associated || 1414 if (!sdata->u.mgd.associated ||
1412 sdata->vif.bss_conf.channel_type == NL80211_CHAN_NO_HT) { 1415 sdata->vif.bss_conf.channel_type == NL80211_CHAN_NO_HT) {
1413 mutex_lock(&sdata->local->iflist_mtx); 1416 mutex_lock(&sdata->local->iflist_mtx);
1414 ieee80211_recalc_smps(sdata->local, sdata); 1417 ieee80211_recalc_smps(sdata->local);
1415 mutex_unlock(&sdata->local->iflist_mtx); 1418 mutex_unlock(&sdata->local->iflist_mtx);
1416 return 0; 1419 return 0;
1417 } 1420 }
@@ -1541,11 +1544,11 @@ static int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy,
1541 return ieee80211_wk_cancel_remain_on_channel(sdata, cookie); 1544 return ieee80211_wk_cancel_remain_on_channel(sdata, cookie);
1542} 1545}
1543 1546
1544static int ieee80211_action(struct wiphy *wiphy, struct net_device *dev, 1547static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct net_device *dev,
1545 struct ieee80211_channel *chan, 1548 struct ieee80211_channel *chan,
1546 enum nl80211_channel_type channel_type, 1549 enum nl80211_channel_type channel_type,
1547 bool channel_type_valid, 1550 bool channel_type_valid,
1548 const u8 *buf, size_t len, u64 *cookie) 1551 const u8 *buf, size_t len, u64 *cookie)
1549{ 1552{
1550 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); 1553 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
1551 struct ieee80211_local *local = sdata->local; 1554 struct ieee80211_local *local = sdata->local;
@@ -1566,7 +1569,11 @@ static int ieee80211_action(struct wiphy *wiphy, struct net_device *dev,
1566 1569
1567 switch (sdata->vif.type) { 1570 switch (sdata->vif.type) {
1568 case NL80211_IFTYPE_ADHOC: 1571 case NL80211_IFTYPE_ADHOC:
1569 if (mgmt->u.action.category == WLAN_CATEGORY_PUBLIC) 1572 case NL80211_IFTYPE_AP:
1573 case NL80211_IFTYPE_AP_VLAN:
1574 case NL80211_IFTYPE_P2P_GO:
1575 if (!ieee80211_is_action(mgmt->frame_control) ||
1576 mgmt->u.action.category == WLAN_CATEGORY_PUBLIC)
1570 break; 1577 break;
1571 rcu_read_lock(); 1578 rcu_read_lock();
1572 sta = sta_info_get(sdata, mgmt->da); 1579 sta = sta_info_get(sdata, mgmt->da);
@@ -1575,8 +1582,7 @@ static int ieee80211_action(struct wiphy *wiphy, struct net_device *dev,
1575 return -ENOLINK; 1582 return -ENOLINK;
1576 break; 1583 break;
1577 case NL80211_IFTYPE_STATION: 1584 case NL80211_IFTYPE_STATION:
1578 if (!(sdata->u.mgd.flags & IEEE80211_STA_MFP_ENABLED)) 1585 case NL80211_IFTYPE_P2P_CLIENT:
1579 flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
1580 break; 1586 break;
1581 default: 1587 default:
1582 return -EOPNOTSUPP; 1588 return -EOPNOTSUPP;
@@ -1598,6 +1604,23 @@ static int ieee80211_action(struct wiphy *wiphy, struct net_device *dev,
1598 return 0; 1604 return 0;
1599} 1605}
1600 1606
1607static void ieee80211_mgmt_frame_register(struct wiphy *wiphy,
1608 struct net_device *dev,
1609 u16 frame_type, bool reg)
1610{
1611 struct ieee80211_local *local = wiphy_priv(wiphy);
1612
1613 if (frame_type != (IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ))
1614 return;
1615
1616 if (reg)
1617 local->probe_req_reg++;
1618 else
1619 local->probe_req_reg--;
1620
1621 ieee80211_queue_work(&local->hw, &local->reconfig_filter);
1622}
1623
1601struct cfg80211_ops mac80211_config_ops = { 1624struct cfg80211_ops mac80211_config_ops = {
1602 .add_virtual_intf = ieee80211_add_iface, 1625 .add_virtual_intf = ieee80211_add_iface,
1603 .del_virtual_intf = ieee80211_del_iface, 1626 .del_virtual_intf = ieee80211_del_iface,
@@ -1647,6 +1670,7 @@ struct cfg80211_ops mac80211_config_ops = {
1647 .set_bitrate_mask = ieee80211_set_bitrate_mask, 1670 .set_bitrate_mask = ieee80211_set_bitrate_mask,
1648 .remain_on_channel = ieee80211_remain_on_channel, 1671 .remain_on_channel = ieee80211_remain_on_channel,
1649 .cancel_remain_on_channel = ieee80211_cancel_remain_on_channel, 1672 .cancel_remain_on_channel = ieee80211_cancel_remain_on_channel,
1650 .action = ieee80211_action, 1673 .mgmt_tx = ieee80211_mgmt_tx,
1651 .set_cqm_rssi_config = ieee80211_set_cqm_rssi_config, 1674 .set_cqm_rssi_config = ieee80211_set_cqm_rssi_config,
1675 .mgmt_frame_register = ieee80211_mgmt_frame_register,
1652}; 1676};
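The key callbacks above now receive an explicit pairwise flag, and mac80211 keeps the raw IEEE 802.11 cipher suite selector (WLAN_CIPHER_SUITE_*) instead of translating it into a private algorithm enum. As a rough illustration only (not part of this patch), a driver-side add_key handler written against the new prototype could look like the placeholder below; the body does nothing real.

#include <net/cfg80211.h>
#include <linux/ieee80211.h>

static int example_add_key(struct wiphy *wiphy, struct net_device *dev,
			   u8 key_idx, bool pairwise, const u8 *mac_addr,
			   struct key_params *params)
{
	/* params->cipher is the raw suite selector, e.g. WLAN_CIPHER_SUITE_CCMP */
	switch (params->cipher) {
	case WLAN_CIPHER_SUITE_WEP40:
	case WLAN_CIPHER_SUITE_WEP104:
	case WLAN_CIPHER_SUITE_TKIP:
	case WLAN_CIPHER_SUITE_CCMP:
	case WLAN_CIPHER_SUITE_AES_CMAC:
		break;
	default:
		return -EOPNOTSUPP;
	}

	/* pairwise selects a PTK; otherwise key_idx names a group key slot */
	return 0;
}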
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index 32be11e4c4d9..5b24740fc0b0 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -11,7 +11,7 @@ __ieee80211_get_channel_mode(struct ieee80211_local *local,
11{ 11{
12 struct ieee80211_sub_if_data *sdata; 12 struct ieee80211_sub_if_data *sdata;
13 13
14 WARN_ON(!mutex_is_locked(&local->iflist_mtx)); 14 lockdep_assert_held(&local->iflist_mtx);
15 15
16 list_for_each_entry(sdata, &local->interfaces, list) { 16 list_for_each_entry(sdata, &local->interfaces, list) {
17 if (sdata == ignore) 17 if (sdata == ignore)
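The chan.c change swaps an open-coded WARN_ON(!mutex_is_locked(...)) for lockdep_assert_held(), which compiles away when lockdep is disabled and, when enabled, checks that the current task actually holds the lock rather than merely that someone does. A minimal sketch of the two styles side by side (illustrative only):

#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/lockdep.h>

static DEFINE_MUTEX(example_mtx);

static void example_assert_locked(void)
{
	WARN_ON(!mutex_is_locked(&example_mtx));	/* old style: only "is it locked?" */
	lockdep_assert_held(&example_mtx);		/* new style: "does *this* task hold it?" */
}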
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index a694c593ff6a..18260aa99c56 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -36,6 +36,7 @@ static ssize_t name## _read(struct file *file, char __user *userbuf, \
36static const struct file_operations name## _ops = { \ 36static const struct file_operations name## _ops = { \
37 .read = name## _read, \ 37 .read = name## _read, \
38 .open = mac80211_open_file_generic, \ 38 .open = mac80211_open_file_generic, \
39 .llseek = generic_file_llseek, \
39}; 40};
40 41
41#define DEBUGFS_ADD(name) \ 42#define DEBUGFS_ADD(name) \
@@ -85,13 +86,15 @@ static ssize_t tsf_write(struct file *file,
85 if (strncmp(buf, "reset", 5) == 0) { 86 if (strncmp(buf, "reset", 5) == 0) {
86 if (local->ops->reset_tsf) { 87 if (local->ops->reset_tsf) {
87 drv_reset_tsf(local); 88 drv_reset_tsf(local);
88 printk(KERN_INFO "%s: debugfs reset TSF\n", wiphy_name(local->hw.wiphy)); 89 wiphy_info(local->hw.wiphy, "debugfs reset TSF\n");
89 } 90 }
90 } else { 91 } else {
91 tsf = simple_strtoul(buf, NULL, 0); 92 tsf = simple_strtoul(buf, NULL, 0);
92 if (local->ops->set_tsf) { 93 if (local->ops->set_tsf) {
93 drv_set_tsf(local, tsf); 94 drv_set_tsf(local, tsf);
94 printk(KERN_INFO "%s: debugfs set TSF to %#018llx\n", wiphy_name(local->hw.wiphy), tsf); 95 wiphy_info(local->hw.wiphy,
96 "debugfs set TSF to %#018llx\n", tsf);
97
95 } 98 }
96 } 99 }
97 100
@@ -101,7 +104,8 @@ static ssize_t tsf_write(struct file *file,
101static const struct file_operations tsf_ops = { 104static const struct file_operations tsf_ops = {
102 .read = tsf_read, 105 .read = tsf_read,
103 .write = tsf_write, 106 .write = tsf_write,
104 .open = mac80211_open_file_generic 107 .open = mac80211_open_file_generic,
108 .llseek = default_llseek,
105}; 109};
106 110
107static ssize_t reset_write(struct file *file, const char __user *user_buf, 111static ssize_t reset_write(struct file *file, const char __user *user_buf,
@@ -120,6 +124,7 @@ static ssize_t reset_write(struct file *file, const char __user *user_buf,
120static const struct file_operations reset_ops = { 124static const struct file_operations reset_ops = {
121 .write = reset_write, 125 .write = reset_write,
122 .open = mac80211_open_file_generic, 126 .open = mac80211_open_file_generic,
127 .llseek = noop_llseek,
123}; 128};
124 129
125static ssize_t noack_read(struct file *file, char __user *user_buf, 130static ssize_t noack_read(struct file *file, char __user *user_buf,
@@ -155,7 +160,8 @@ static ssize_t noack_write(struct file *file,
155static const struct file_operations noack_ops = { 160static const struct file_operations noack_ops = {
156 .read = noack_read, 161 .read = noack_read,
157 .write = noack_write, 162 .write = noack_write,
158 .open = mac80211_open_file_generic 163 .open = mac80211_open_file_generic,
164 .llseek = default_llseek,
159}; 165};
160 166
161static ssize_t uapsd_queues_read(struct file *file, char __user *user_buf, 167static ssize_t uapsd_queues_read(struct file *file, char __user *user_buf,
@@ -201,7 +207,8 @@ static ssize_t uapsd_queues_write(struct file *file,
201static const struct file_operations uapsd_queues_ops = { 207static const struct file_operations uapsd_queues_ops = {
202 .read = uapsd_queues_read, 208 .read = uapsd_queues_read,
203 .write = uapsd_queues_write, 209 .write = uapsd_queues_write,
204 .open = mac80211_open_file_generic 210 .open = mac80211_open_file_generic,
211 .llseek = default_llseek,
205}; 212};
206 213
207static ssize_t uapsd_max_sp_len_read(struct file *file, char __user *user_buf, 214static ssize_t uapsd_max_sp_len_read(struct file *file, char __user *user_buf,
@@ -247,7 +254,8 @@ static ssize_t uapsd_max_sp_len_write(struct file *file,
247static const struct file_operations uapsd_max_sp_len_ops = { 254static const struct file_operations uapsd_max_sp_len_ops = {
248 .read = uapsd_max_sp_len_read, 255 .read = uapsd_max_sp_len_read,
249 .write = uapsd_max_sp_len_write, 256 .write = uapsd_max_sp_len_write,
250 .open = mac80211_open_file_generic 257 .open = mac80211_open_file_generic,
258 .llseek = default_llseek,
251}; 259};
252 260
253static ssize_t channel_type_read(struct file *file, char __user *user_buf, 261static ssize_t channel_type_read(struct file *file, char __user *user_buf,
@@ -279,7 +287,8 @@ static ssize_t channel_type_read(struct file *file, char __user *user_buf,
279 287
280static const struct file_operations channel_type_ops = { 288static const struct file_operations channel_type_ops = {
281 .read = channel_type_read, 289 .read = channel_type_read,
282 .open = mac80211_open_file_generic 290 .open = mac80211_open_file_generic,
291 .llseek = default_llseek,
283}; 292};
284 293
285static ssize_t queues_read(struct file *file, char __user *user_buf, 294static ssize_t queues_read(struct file *file, char __user *user_buf,
@@ -302,7 +311,8 @@ static ssize_t queues_read(struct file *file, char __user *user_buf,
302 311
303static const struct file_operations queues_ops = { 312static const struct file_operations queues_ops = {
304 .read = queues_read, 313 .read = queues_read,
305 .open = mac80211_open_file_generic 314 .open = mac80211_open_file_generic,
315 .llseek = default_llseek,
306}; 316};
307 317
308/* statistics stuff */ 318/* statistics stuff */
@@ -346,6 +356,7 @@ static ssize_t stats_ ##name## _read(struct file *file, \
346static const struct file_operations stats_ ##name## _ops = { \ 356static const struct file_operations stats_ ##name## _ops = { \
347 .read = stats_ ##name## _read, \ 357 .read = stats_ ##name## _read, \
348 .open = mac80211_open_file_generic, \ 358 .open = mac80211_open_file_generic, \
359 .llseek = generic_file_llseek, \
349}; 360};
350 361
351#define DEBUGFS_STATS_ADD(name, field) \ 362#define DEBUGFS_STATS_ADD(name, field) \
@@ -366,7 +377,6 @@ void debugfs_hw_add(struct ieee80211_local *local)
366 if (!phyd) 377 if (!phyd)
367 return; 378 return;
368 379
369 local->debugfs.stations = debugfs_create_dir("stations", phyd);
370 local->debugfs.keys = debugfs_create_dir("keys", phyd); 380 local->debugfs.keys = debugfs_create_dir("keys", phyd);
371 381
372 DEBUGFS_ADD(frequency); 382 DEBUGFS_ADD(frequency);
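Every debugfs file_operations in this series gains an explicit .llseek handler (generic_file_llseek, default_llseek or noop_llseek), apparently as part of the tree-wide effort in this merge window to spell out seek behaviour so the implicit default can eventually go away. A minimal sketch of the resulting pattern, with hypothetical names, assuming the usual open helper that stashes i_private:

#include <linux/fs.h>
#include <linux/debugfs.h>

static int example_open(struct inode *inode, struct file *file)
{
	file->private_data = inode->i_private;
	return 0;
}

static ssize_t example_read(struct file *file, char __user *userbuf,
			    size_t count, loff_t *ppos)
{
	static const char msg[] = "example\n";

	return simple_read_from_buffer(userbuf, count, ppos, msg, sizeof(msg) - 1);
}

static const struct file_operations example_ops = {
	.read	= example_read,
	.open	= example_open,
	.llseek	= generic_file_llseek,	/* now spelled out explicitly */
};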
diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c
index fa5e76e658ef..1243d1db5c59 100644
--- a/net/mac80211/debugfs_key.c
+++ b/net/mac80211/debugfs_key.c
@@ -32,6 +32,7 @@ static ssize_t key_##name##_read(struct file *file, \
32static const struct file_operations key_ ##name## _ops = { \ 32static const struct file_operations key_ ##name## _ops = { \
33 .read = key_##name##_read, \ 33 .read = key_##name##_read, \
34 .open = mac80211_open_file_generic, \ 34 .open = mac80211_open_file_generic, \
35 .llseek = generic_file_llseek, \
35} 36}
36 37
37#define KEY_FILE(name, format) \ 38#define KEY_FILE(name, format) \
@@ -46,6 +47,7 @@ static const struct file_operations key_ ##name## _ops = { \
46static const struct file_operations key_ ##name## _ops = { \ 47static const struct file_operations key_ ##name## _ops = { \
47 .read = key_conf_##name##_read, \ 48 .read = key_conf_##name##_read, \
48 .open = mac80211_open_file_generic, \ 49 .open = mac80211_open_file_generic, \
50 .llseek = generic_file_llseek, \
49} 51}
50 52
51#define KEY_CONF_FILE(name, format) \ 53#define KEY_CONF_FILE(name, format) \
@@ -64,26 +66,13 @@ static ssize_t key_algorithm_read(struct file *file,
64 char __user *userbuf, 66 char __user *userbuf,
65 size_t count, loff_t *ppos) 67 size_t count, loff_t *ppos)
66{ 68{
67 char *alg; 69 char buf[15];
68 struct ieee80211_key *key = file->private_data; 70 struct ieee80211_key *key = file->private_data;
71 u32 c = key->conf.cipher;
69 72
70 switch (key->conf.alg) { 73 sprintf(buf, "%.2x-%.2x-%.2x:%d\n",
71 case ALG_WEP: 74 c >> 24, (c >> 16) & 0xff, (c >> 8) & 0xff, c & 0xff);
72 alg = "WEP\n"; 75 return simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf));
73 break;
74 case ALG_TKIP:
75 alg = "TKIP\n";
76 break;
77 case ALG_CCMP:
78 alg = "CCMP\n";
79 break;
80 case ALG_AES_CMAC:
81 alg = "AES-128-CMAC\n";
82 break;
83 default:
84 return 0;
85 }
86 return simple_read_from_buffer(userbuf, count, ppos, alg, strlen(alg));
87} 76}
88KEY_OPS(algorithm); 77KEY_OPS(algorithm);
89 78
@@ -95,21 +84,22 @@ static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf,
95 int len; 84 int len;
96 struct ieee80211_key *key = file->private_data; 85 struct ieee80211_key *key = file->private_data;
97 86
98 switch (key->conf.alg) { 87 switch (key->conf.cipher) {
99 case ALG_WEP: 88 case WLAN_CIPHER_SUITE_WEP40:
89 case WLAN_CIPHER_SUITE_WEP104:
100 len = scnprintf(buf, sizeof(buf), "\n"); 90 len = scnprintf(buf, sizeof(buf), "\n");
101 break; 91 break;
102 case ALG_TKIP: 92 case WLAN_CIPHER_SUITE_TKIP:
103 len = scnprintf(buf, sizeof(buf), "%08x %04x\n", 93 len = scnprintf(buf, sizeof(buf), "%08x %04x\n",
104 key->u.tkip.tx.iv32, 94 key->u.tkip.tx.iv32,
105 key->u.tkip.tx.iv16); 95 key->u.tkip.tx.iv16);
106 break; 96 break;
107 case ALG_CCMP: 97 case WLAN_CIPHER_SUITE_CCMP:
108 tpn = key->u.ccmp.tx_pn; 98 tpn = key->u.ccmp.tx_pn;
109 len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", 99 len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n",
110 tpn[0], tpn[1], tpn[2], tpn[3], tpn[4], tpn[5]); 100 tpn[0], tpn[1], tpn[2], tpn[3], tpn[4], tpn[5]);
111 break; 101 break;
112 case ALG_AES_CMAC: 102 case WLAN_CIPHER_SUITE_AES_CMAC:
113 tpn = key->u.aes_cmac.tx_pn; 103 tpn = key->u.aes_cmac.tx_pn;
114 len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", 104 len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n",
115 tpn[0], tpn[1], tpn[2], tpn[3], tpn[4], 105 tpn[0], tpn[1], tpn[2], tpn[3], tpn[4],
@@ -130,11 +120,12 @@ static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf,
130 int i, len; 120 int i, len;
131 const u8 *rpn; 121 const u8 *rpn;
132 122
133 switch (key->conf.alg) { 123 switch (key->conf.cipher) {
134 case ALG_WEP: 124 case WLAN_CIPHER_SUITE_WEP40:
125 case WLAN_CIPHER_SUITE_WEP104:
135 len = scnprintf(buf, sizeof(buf), "\n"); 126 len = scnprintf(buf, sizeof(buf), "\n");
136 break; 127 break;
137 case ALG_TKIP: 128 case WLAN_CIPHER_SUITE_TKIP:
138 for (i = 0; i < NUM_RX_DATA_QUEUES; i++) 129 for (i = 0; i < NUM_RX_DATA_QUEUES; i++)
139 p += scnprintf(p, sizeof(buf)+buf-p, 130 p += scnprintf(p, sizeof(buf)+buf-p,
140 "%08x %04x\n", 131 "%08x %04x\n",
@@ -142,7 +133,7 @@ static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf,
142 key->u.tkip.rx[i].iv16); 133 key->u.tkip.rx[i].iv16);
143 len = p - buf; 134 len = p - buf;
144 break; 135 break;
145 case ALG_CCMP: 136 case WLAN_CIPHER_SUITE_CCMP:
146 for (i = 0; i < NUM_RX_DATA_QUEUES + 1; i++) { 137 for (i = 0; i < NUM_RX_DATA_QUEUES + 1; i++) {
147 rpn = key->u.ccmp.rx_pn[i]; 138 rpn = key->u.ccmp.rx_pn[i];
148 p += scnprintf(p, sizeof(buf)+buf-p, 139 p += scnprintf(p, sizeof(buf)+buf-p,
@@ -152,7 +143,7 @@ static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf,
152 } 143 }
153 len = p - buf; 144 len = p - buf;
154 break; 145 break;
155 case ALG_AES_CMAC: 146 case WLAN_CIPHER_SUITE_AES_CMAC:
156 rpn = key->u.aes_cmac.rx_pn; 147 rpn = key->u.aes_cmac.rx_pn;
157 p += scnprintf(p, sizeof(buf)+buf-p, 148 p += scnprintf(p, sizeof(buf)+buf-p,
158 "%02x%02x%02x%02x%02x%02x\n", 149 "%02x%02x%02x%02x%02x%02x\n",
@@ -174,11 +165,11 @@ static ssize_t key_replays_read(struct file *file, char __user *userbuf,
174 char buf[20]; 165 char buf[20];
175 int len; 166 int len;
176 167
177 switch (key->conf.alg) { 168 switch (key->conf.cipher) {
178 case ALG_CCMP: 169 case WLAN_CIPHER_SUITE_CCMP:
179 len = scnprintf(buf, sizeof(buf), "%u\n", key->u.ccmp.replays); 170 len = scnprintf(buf, sizeof(buf), "%u\n", key->u.ccmp.replays);
180 break; 171 break;
181 case ALG_AES_CMAC: 172 case WLAN_CIPHER_SUITE_AES_CMAC:
182 len = scnprintf(buf, sizeof(buf), "%u\n", 173 len = scnprintf(buf, sizeof(buf), "%u\n",
183 key->u.aes_cmac.replays); 174 key->u.aes_cmac.replays);
184 break; 175 break;
@@ -196,8 +187,8 @@ static ssize_t key_icverrors_read(struct file *file, char __user *userbuf,
196 char buf[20]; 187 char buf[20];
197 int len; 188 int len;
198 189
199 switch (key->conf.alg) { 190 switch (key->conf.cipher) {
200 case ALG_AES_CMAC: 191 case WLAN_CIPHER_SUITE_AES_CMAC:
201 len = scnprintf(buf, sizeof(buf), "%u\n", 192 len = scnprintf(buf, sizeof(buf), "%u\n",
202 key->u.aes_cmac.icverrors); 193 key->u.aes_cmac.icverrors);
203 break; 194 break;
@@ -212,9 +203,13 @@ static ssize_t key_key_read(struct file *file, char __user *userbuf,
212 size_t count, loff_t *ppos) 203 size_t count, loff_t *ppos)
213{ 204{
214 struct ieee80211_key *key = file->private_data; 205 struct ieee80211_key *key = file->private_data;
215 int i, res, bufsize = 2 * key->conf.keylen + 2; 206 int i, bufsize = 2 * key->conf.keylen + 2;
216 char *buf = kmalloc(bufsize, GFP_KERNEL); 207 char *buf = kmalloc(bufsize, GFP_KERNEL);
217 char *p = buf; 208 char *p = buf;
209 ssize_t res;
210
211 if (!buf)
212 return -ENOMEM;
218 213
219 for (i = 0; i < key->conf.keylen; i++) 214 for (i = 0; i < key->conf.keylen; i++)
220 p += scnprintf(p, bufsize + buf - p, "%02x", key->conf.key[i]); 215 p += scnprintf(p, bufsize + buf - p, "%02x", key->conf.key[i]);
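With the algorithm enum gone, the debugfs "algorithm" file prints the raw suite selector as OUI bytes plus suite type. A stand-alone userspace sketch of the same formatting (the constant is the CCMP selector, which renders as 00-0f-ac:4):

#include <stdio.h>
#include <stdint.h>

static void print_cipher(uint32_t c)
{
	/* same layout as key_algorithm_read() above: OUI bytes, then suite type */
	unsigned int oui1 = (c >> 24) & 0xff, oui2 = (c >> 16) & 0xff;
	unsigned int oui3 = (c >> 8) & 0xff, type = c & 0xff;

	printf("%02x-%02x-%02x:%u\n", oui1, oui2, oui3, type);
}

int main(void)
{
	print_cipher(0x000fac04);	/* WLAN_CIPHER_SUITE_CCMP -> "00-0f-ac:4" */
	return 0;
}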
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index 20b2998fa0ed..cbdf36d7841c 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -121,6 +121,7 @@ static const struct file_operations name##_ops = { \
121 .read = ieee80211_if_read_##name, \ 121 .read = ieee80211_if_read_##name, \
122 .write = (_write), \ 122 .write = (_write), \
123 .open = mac80211_open_file_generic, \ 123 .open = mac80211_open_file_generic, \
124 .llseek = generic_file_llseek, \
124} 125}
125 126
126#define __IEEE80211_IF_FILE_W(name) \ 127#define __IEEE80211_IF_FILE_W(name) \
@@ -409,6 +410,9 @@ void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata)
409 sprintf(buf, "netdev:%s", sdata->name); 410 sprintf(buf, "netdev:%s", sdata->name);
410 sdata->debugfs.dir = debugfs_create_dir(buf, 411 sdata->debugfs.dir = debugfs_create_dir(buf,
411 sdata->local->hw.wiphy->debugfsdir); 412 sdata->local->hw.wiphy->debugfsdir);
413 if (sdata->debugfs.dir)
414 sdata->debugfs.subdir_stations = debugfs_create_dir("stations",
415 sdata->debugfs.dir);
412 add_files(sdata); 416 add_files(sdata);
413} 417}
414 418
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 76839d4dfaac..4601fea1784d 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -36,6 +36,7 @@ static ssize_t sta_ ##name## _read(struct file *file, \
36static const struct file_operations sta_ ##name## _ops = { \ 36static const struct file_operations sta_ ##name## _ops = { \
37 .read = sta_##name##_read, \ 37 .read = sta_##name##_read, \
38 .open = mac80211_open_file_generic, \ 38 .open = mac80211_open_file_generic, \
39 .llseek = generic_file_llseek, \
39} 40}
40 41
41#define STA_OPS_RW(name) \ 42#define STA_OPS_RW(name) \
@@ -43,6 +44,7 @@ static const struct file_operations sta_ ##name## _ops = { \
43 .read = sta_##name##_read, \ 44 .read = sta_##name##_read, \
44 .write = sta_##name##_write, \ 45 .write = sta_##name##_write, \
45 .open = mac80211_open_file_generic, \ 46 .open = mac80211_open_file_generic, \
47 .llseek = generic_file_llseek, \
46} 48}
47 49
48#define STA_FILE(name, field, format) \ 50#define STA_FILE(name, field, format) \
@@ -196,7 +198,8 @@ static ssize_t sta_agg_status_write(struct file *file, const char __user *userbu
196 else 198 else
197 ret = ieee80211_stop_tx_ba_session(&sta->sta, tid); 199 ret = ieee80211_stop_tx_ba_session(&sta->sta, tid);
198 } else { 200 } else {
199 __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_RECIPIENT, 3); 201 __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_RECIPIENT,
202 3, true);
200 ret = 0; 203 ret = 0;
201 } 204 }
202 205
@@ -300,7 +303,7 @@ STA_OPS(ht_capa);
300 303
301void ieee80211_sta_debugfs_add(struct sta_info *sta) 304void ieee80211_sta_debugfs_add(struct sta_info *sta)
302{ 305{
303 struct dentry *stations_dir = sta->local->debugfs.stations; 306 struct dentry *stations_dir = sta->sdata->debugfs.subdir_stations;
304 u8 mac[3*ETH_ALEN]; 307 u8 mac[3*ETH_ALEN];
305 308
306 sta->debugfs.add_has_run = true; 309 sta->debugfs.add_has_run = true;
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 14123dce544b..16983825f8e8 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -54,6 +54,20 @@ static inline int drv_add_interface(struct ieee80211_local *local,
54 return ret; 54 return ret;
55} 55}
56 56
57static inline int drv_change_interface(struct ieee80211_local *local,
58 struct ieee80211_sub_if_data *sdata,
59 enum nl80211_iftype type, bool p2p)
60{
61 int ret;
62
63 might_sleep();
64
65 trace_drv_change_interface(local, sdata, type, p2p);
66 ret = local->ops->change_interface(&local->hw, &sdata->vif, type, p2p);
67 trace_drv_return_int(local, ret);
68 return ret;
69}
70
57static inline void drv_remove_interface(struct ieee80211_local *local, 71static inline void drv_remove_interface(struct ieee80211_local *local,
58 struct ieee80211_vif *vif) 72 struct ieee80211_vif *vif)
59{ 73{
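The new drv_change_interface() follows the existing driver-ops wrapper convention: assert a sleepable context, trace the call, invoke the driver op, trace the return value. Shown below as a generic sketch with hypothetical names (example_op and its tracepoint are not real mac80211 symbols):

static inline int drv_example_op(struct ieee80211_local *local, int arg)
{
	int ret;

	might_sleep();

	trace_drv_example_op(local, arg);		/* hypothetical tracepoint */
	ret = local->ops->example_op(&local->hw, arg);	/* hypothetical driver op */
	trace_drv_return_int(local, ret);

	return ret;
}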
diff --git a/net/mac80211/driver-trace.h b/net/mac80211/driver-trace.h
index 5d5d2a974668..6831fb1641c8 100644
--- a/net/mac80211/driver-trace.h
+++ b/net/mac80211/driver-trace.h
@@ -25,12 +25,14 @@ static inline void trace_ ## name(proto) {}
25#define STA_PR_FMT " sta:%pM" 25#define STA_PR_FMT " sta:%pM"
26#define STA_PR_ARG __entry->sta_addr 26#define STA_PR_ARG __entry->sta_addr
27 27
28#define VIF_ENTRY __field(enum nl80211_iftype, vif_type) __field(void *, sdata) \ 28#define VIF_ENTRY __field(enum nl80211_iftype, vif_type) __field(void *, sdata) \
29 __field(bool, p2p) \
29 __string(vif_name, sdata->dev ? sdata->dev->name : "<nodev>") 30 __string(vif_name, sdata->dev ? sdata->dev->name : "<nodev>")
30#define VIF_ASSIGN __entry->vif_type = sdata->vif.type; __entry->sdata = sdata; \ 31#define VIF_ASSIGN __entry->vif_type = sdata->vif.type; __entry->sdata = sdata; \
32 __entry->p2p = sdata->vif.p2p; \
31 __assign_str(vif_name, sdata->dev ? sdata->dev->name : "<nodev>") 33 __assign_str(vif_name, sdata->dev ? sdata->dev->name : "<nodev>")
32#define VIF_PR_FMT " vif:%s(%d)" 34#define VIF_PR_FMT " vif:%s(%d%s)"
33#define VIF_PR_ARG __get_str(vif_name), __entry->vif_type 35#define VIF_PR_ARG __get_str(vif_name), __entry->vif_type, __entry->p2p ? "/p2p" : ""
34 36
35/* 37/*
36 * Tracing for driver callbacks. 38 * Tracing for driver callbacks.
@@ -136,6 +138,34 @@ TRACE_EVENT(drv_add_interface,
136 ) 138 )
137); 139);
138 140
141TRACE_EVENT(drv_change_interface,
142 TP_PROTO(struct ieee80211_local *local,
143 struct ieee80211_sub_if_data *sdata,
144 enum nl80211_iftype type, bool p2p),
145
146 TP_ARGS(local, sdata, type, p2p),
147
148 TP_STRUCT__entry(
149 LOCAL_ENTRY
150 VIF_ENTRY
151 __field(u32, new_type)
152 __field(bool, new_p2p)
153 ),
154
155 TP_fast_assign(
156 LOCAL_ASSIGN;
157 VIF_ASSIGN;
158 __entry->new_type = type;
159 __entry->new_p2p = p2p;
160 ),
161
162 TP_printk(
163 LOCAL_PR_FMT VIF_PR_FMT " new type:%d%s",
164 LOCAL_PR_ARG, VIF_PR_ARG, __entry->new_type,
165 __entry->new_p2p ? "/p2p" : ""
166 )
167);
168
139TRACE_EVENT(drv_remove_interface, 169TRACE_EVENT(drv_remove_interface,
140 TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), 170 TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata),
141 171
@@ -336,7 +366,7 @@ TRACE_EVENT(drv_set_key,
336 LOCAL_ENTRY 366 LOCAL_ENTRY
337 VIF_ENTRY 367 VIF_ENTRY
338 STA_ENTRY 368 STA_ENTRY
339 __field(enum ieee80211_key_alg, alg) 369 __field(u32, cipher)
340 __field(u8, hw_key_idx) 370 __field(u8, hw_key_idx)
341 __field(u8, flags) 371 __field(u8, flags)
342 __field(s8, keyidx) 372 __field(s8, keyidx)
@@ -346,7 +376,7 @@ TRACE_EVENT(drv_set_key,
346 LOCAL_ASSIGN; 376 LOCAL_ASSIGN;
347 VIF_ASSIGN; 377 VIF_ASSIGN;
348 STA_ASSIGN; 378 STA_ASSIGN;
349 __entry->alg = key->alg; 379 __entry->cipher = key->cipher;
350 __entry->flags = key->flags; 380 __entry->flags = key->flags;
351 __entry->keyidx = key->keyidx; 381 __entry->keyidx = key->keyidx;
352 __entry->hw_key_idx = key->hw_key_idx; 382 __entry->hw_key_idx = key->hw_key_idx;
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 9d101fb33861..75d679d75e63 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -101,16 +101,16 @@ void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_supported_band *sband,
101 ht_cap->mcs.rx_mask[32/8] |= 1; 101 ht_cap->mcs.rx_mask[32/8] |= 1;
102} 102}
103 103
104void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta) 104void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, bool tx)
105{ 105{
106 int i; 106 int i;
107 107
108 cancel_work_sync(&sta->ampdu_mlme.work); 108 cancel_work_sync(&sta->ampdu_mlme.work);
109 109
110 for (i = 0; i < STA_TID_NUM; i++) { 110 for (i = 0; i < STA_TID_NUM; i++) {
111 __ieee80211_stop_tx_ba_session(sta, i, WLAN_BACK_INITIATOR); 111 __ieee80211_stop_tx_ba_session(sta, i, WLAN_BACK_INITIATOR, tx);
112 __ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT, 112 __ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT,
113 WLAN_REASON_QSTA_LEAVE_QBSS); 113 WLAN_REASON_QSTA_LEAVE_QBSS, tx);
114 } 114 }
115} 115}
116 116
@@ -135,7 +135,7 @@ void ieee80211_ba_session_work(struct work_struct *work)
135 if (test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_timer_expired)) 135 if (test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_timer_expired))
136 ___ieee80211_stop_rx_ba_session( 136 ___ieee80211_stop_rx_ba_session(
137 sta, tid, WLAN_BACK_RECIPIENT, 137 sta, tid, WLAN_BACK_RECIPIENT,
138 WLAN_REASON_QSTA_TIMEOUT); 138 WLAN_REASON_QSTA_TIMEOUT, true);
139 139
140 tid_tx = sta->ampdu_mlme.tid_tx[tid]; 140 tid_tx = sta->ampdu_mlme.tid_tx[tid];
141 if (!tid_tx) 141 if (!tid_tx)
@@ -146,7 +146,8 @@ void ieee80211_ba_session_work(struct work_struct *work)
146 else if (test_and_clear_bit(HT_AGG_STATE_WANT_STOP, 146 else if (test_and_clear_bit(HT_AGG_STATE_WANT_STOP,
147 &tid_tx->state)) 147 &tid_tx->state))
148 ___ieee80211_stop_tx_ba_session(sta, tid, 148 ___ieee80211_stop_tx_ba_session(sta, tid,
149 WLAN_BACK_INITIATOR); 149 WLAN_BACK_INITIATOR,
150 true);
150 } 151 }
151 mutex_unlock(&sta->ampdu_mlme.mtx); 152 mutex_unlock(&sta->ampdu_mlme.mtx);
152} 153}
@@ -214,9 +215,11 @@ void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata,
214#endif /* CONFIG_MAC80211_HT_DEBUG */ 215#endif /* CONFIG_MAC80211_HT_DEBUG */
215 216
216 if (initiator == WLAN_BACK_INITIATOR) 217 if (initiator == WLAN_BACK_INITIATOR)
217 __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_INITIATOR, 0); 218 __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_INITIATOR, 0,
219 true);
218 else 220 else
219 __ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_RECIPIENT); 221 __ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_RECIPIENT,
222 true);
220} 223}
221 224
222int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, 225int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata,
@@ -265,3 +268,33 @@ int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata,
265 268
266 return 0; 269 return 0;
267} 270}
271
272void ieee80211_request_smps_work(struct work_struct *work)
273{
274 struct ieee80211_sub_if_data *sdata =
275 container_of(work, struct ieee80211_sub_if_data,
276 u.mgd.request_smps_work);
277
278 mutex_lock(&sdata->u.mgd.mtx);
279 __ieee80211_request_smps(sdata, sdata->u.mgd.driver_smps_mode);
280 mutex_unlock(&sdata->u.mgd.mtx);
281}
282
283void ieee80211_request_smps(struct ieee80211_vif *vif,
284 enum ieee80211_smps_mode smps_mode)
285{
286 struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
287
288 if (WARN_ON(vif->type != NL80211_IFTYPE_STATION))
289 return;
290
291 if (WARN_ON(smps_mode == IEEE80211_SMPS_OFF))
292 smps_mode = IEEE80211_SMPS_AUTOMATIC;
293
294 sdata->u.mgd.driver_smps_mode = smps_mode;
295
296 ieee80211_queue_work(&sdata->local->hw,
297 &sdata->u.mgd.request_smps_work);
298}
299/* this might change ... don't want non-open drivers using it */
300EXPORT_SYMBOL_GPL(ieee80211_request_smps);
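ieee80211_request_smps() is deliberately asynchronous: it only records the requested mode and queues request_smps_work, and the work item later sends the SM power save action frame under the managed-mode mutex. A hedged usage sketch from a driver's point of view (assuming the IEEE80211_SMPS_STATIC mode value):

static void example_request_static_smps(struct ieee80211_vif *vif)
{
	/* only meaningful on a managed (station) interface */
	if (vif->type == NL80211_IFTYPE_STATION)
		ieee80211_request_smps(vif, IEEE80211_SMPS_STATIC);
}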
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index c691780725a7..239c4836a946 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -173,6 +173,19 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
173 memcpy(skb_put(skb, ifibss->ie_len), 173 memcpy(skb_put(skb, ifibss->ie_len),
174 ifibss->ie, ifibss->ie_len); 174 ifibss->ie, ifibss->ie_len);
175 175
176 if (local->hw.queues >= 4) {
177 pos = skb_put(skb, 9);
178 *pos++ = WLAN_EID_VENDOR_SPECIFIC;
179 *pos++ = 7; /* len */
180 *pos++ = 0x00; /* Microsoft OUI 00:50:F2 */
181 *pos++ = 0x50;
182 *pos++ = 0xf2;
183 *pos++ = 2; /* WME */
184 *pos++ = 0; /* WME info */
185 *pos++ = 1; /* WME ver */
 186			*pos++ = 0;	/* U-APSD not in use */
187 }
188
176 rcu_assign_pointer(ifibss->presp, skb); 189 rcu_assign_pointer(ifibss->presp, skb);
177 190
178 sdata->vif.bss_conf.beacon_int = beacon_int; 191 sdata->vif.bss_conf.beacon_int = beacon_int;
@@ -266,37 +279,45 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
266 if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) 279 if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
267 return; 280 return;
268 281
269 if (sdata->vif.type == NL80211_IFTYPE_ADHOC && elems->supp_rates && 282 if (sdata->vif.type == NL80211_IFTYPE_ADHOC &&
270 memcmp(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN) == 0) { 283 memcmp(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN) == 0) {
271 supp_rates = ieee80211_sta_get_rates(local, elems, band);
272 284
273 rcu_read_lock(); 285 rcu_read_lock();
274
275 sta = sta_info_get(sdata, mgmt->sa); 286 sta = sta_info_get(sdata, mgmt->sa);
276 if (sta) {
277 u32 prev_rates;
278 287
279 prev_rates = sta->sta.supp_rates[band]; 288 if (elems->supp_rates) {
280 /* make sure mandatory rates are always added */ 289 supp_rates = ieee80211_sta_get_rates(local, elems,
281 sta->sta.supp_rates[band] = supp_rates | 290 band);
282 ieee80211_mandatory_rates(local, band); 291 if (sta) {
292 u32 prev_rates;
293
294 prev_rates = sta->sta.supp_rates[band];
295 /* make sure mandatory rates are always added */
296 sta->sta.supp_rates[band] = supp_rates |
297 ieee80211_mandatory_rates(local, band);
283 298
284 if (sta->sta.supp_rates[band] != prev_rates) { 299 if (sta->sta.supp_rates[band] != prev_rates) {
285#ifdef CONFIG_MAC80211_IBSS_DEBUG 300#ifdef CONFIG_MAC80211_IBSS_DEBUG
286 printk(KERN_DEBUG "%s: updated supp_rates set " 301 printk(KERN_DEBUG
287 "for %pM based on beacon/probe_response " 302 "%s: updated supp_rates set "
288 "(0x%x -> 0x%x)\n", 303 "for %pM based on beacon"
289 sdata->name, sta->sta.addr, 304 "/probe_resp (0x%x -> 0x%x)\n",
290 prev_rates, sta->sta.supp_rates[band]); 305 sdata->name, sta->sta.addr,
306 prev_rates,
307 sta->sta.supp_rates[band]);
291#endif 308#endif
292 rate_control_rate_init(sta); 309 rate_control_rate_init(sta);
293 } 310 }
294 rcu_read_unlock(); 311 } else
295 } else { 312 sta = ieee80211_ibss_add_sta(sdata, mgmt->bssid,
296 rcu_read_unlock(); 313 mgmt->sa, supp_rates,
297 ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, 314 GFP_ATOMIC);
298 supp_rates, GFP_KERNEL);
299 } 315 }
316
317 if (sta && elems->wmm_info)
318 set_sta_flags(sta, WLAN_STA_WME);
319
320 rcu_read_unlock();
300 } 321 }
301 322
302 bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems, 323 bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems,
@@ -427,14 +448,15 @@ struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
427 return NULL; 448 return NULL;
428 449
429#ifdef CONFIG_MAC80211_VERBOSE_DEBUG 450#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
430 printk(KERN_DEBUG "%s: Adding new IBSS station %pM (dev=%s)\n", 451 wiphy_debug(local->hw.wiphy, "Adding new IBSS station %pM (dev=%s)\n",
431 wiphy_name(local->hw.wiphy), addr, sdata->name); 452 addr, sdata->name);
432#endif 453#endif
433 454
434 sta = sta_info_alloc(sdata, addr, gfp); 455 sta = sta_info_alloc(sdata, addr, gfp);
435 if (!sta) 456 if (!sta)
436 return NULL; 457 return NULL;
437 458
459 sta->last_rx = jiffies;
438 set_sta_flags(sta, WLAN_STA_AUTHORIZED); 460 set_sta_flags(sta, WLAN_STA_AUTHORIZED);
439 461
440 /* make sure mandatory rates are always added */ 462 /* make sure mandatory rates are always added */
@@ -920,12 +942,14 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
920 memcpy(sdata->u.ibss.ssid, params->ssid, IEEE80211_MAX_SSID_LEN); 942 memcpy(sdata->u.ibss.ssid, params->ssid, IEEE80211_MAX_SSID_LEN);
921 sdata->u.ibss.ssid_len = params->ssid_len; 943 sdata->u.ibss.ssid_len = params->ssid_len;
922 944
945 mutex_unlock(&sdata->u.ibss.mtx);
946
947 mutex_lock(&sdata->local->mtx);
923 ieee80211_recalc_idle(sdata->local); 948 ieee80211_recalc_idle(sdata->local);
949 mutex_unlock(&sdata->local->mtx);
924 950
925 ieee80211_queue_work(&sdata->local->hw, &sdata->work); 951 ieee80211_queue_work(&sdata->local->hw, &sdata->work);
926 952
927 mutex_unlock(&sdata->u.ibss.mtx);
928
929 return 0; 953 return 0;
930} 954}
931 955
@@ -980,7 +1004,9 @@ int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata)
980 1004
981 mutex_unlock(&sdata->u.ibss.mtx); 1005 mutex_unlock(&sdata->u.ibss.mtx);
982 1006
1007 mutex_lock(&local->mtx);
983 ieee80211_recalc_idle(sdata->local); 1008 ieee80211_recalc_idle(sdata->local);
1009 mutex_unlock(&local->mtx);
984 1010
985 return 0; 1011 return 0;
986} 1012}
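For reference, the vendor-specific element appended to the IBSS beacon template above is the standard 9-byte WMM information element. A small sketch reconstructing just that part (illustrative, assuming a pre-sized skb):

#include <linux/ieee80211.h>
#include <linux/skbuff.h>

static void example_add_wmm_info_ie(struct sk_buff *skb)
{
	u8 *pos = skb_put(skb, 9);

	*pos++ = WLAN_EID_VENDOR_SPECIFIC;
	*pos++ = 7;	/* payload length */
	*pos++ = 0x00;	/* Microsoft OUI 00:50:F2 */
	*pos++ = 0x50;
	*pos++ = 0xf2;
	*pos++ = 2;	/* OUI type: WME */
	*pos++ = 0;	/* OUI subtype: WME information element */
	*pos++ = 1;	/* WME version 1 */
	*pos++ = 0;	/* QoS info: U-APSD not in use */
}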
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 65e0ed6c2975..b80c38689927 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -50,12 +50,6 @@ struct ieee80211_local;
50 * increased memory use (about 2 kB of RAM per entry). */ 50 * increased memory use (about 2 kB of RAM per entry). */
51#define IEEE80211_FRAGMENT_MAX 4 51#define IEEE80211_FRAGMENT_MAX 4
52 52
53/*
54 * Time after which we ignore scan results and no longer report/use
55 * them in any way.
56 */
57#define IEEE80211_SCAN_RESULT_EXPIRE (10 * HZ)
58
59#define TU_TO_EXP_TIME(x) (jiffies + usecs_to_jiffies((x) * 1024)) 53#define TU_TO_EXP_TIME(x) (jiffies + usecs_to_jiffies((x) * 1024))
60 54
61#define IEEE80211_DEFAULT_UAPSD_QUEUES \ 55#define IEEE80211_DEFAULT_UAPSD_QUEUES \
@@ -165,12 +159,37 @@ typedef unsigned __bitwise__ ieee80211_rx_result;
165#define RX_DROP_MONITOR ((__force ieee80211_rx_result) 2u) 159#define RX_DROP_MONITOR ((__force ieee80211_rx_result) 2u)
166#define RX_QUEUED ((__force ieee80211_rx_result) 3u) 160#define RX_QUEUED ((__force ieee80211_rx_result) 3u)
167 161
168#define IEEE80211_RX_IN_SCAN BIT(0) 162/**
169/* frame is destined to interface currently processed (incl. multicast frames) */ 163 * enum ieee80211_packet_rx_flags - packet RX flags
170#define IEEE80211_RX_RA_MATCH BIT(1) 164 * @IEEE80211_RX_RA_MATCH: frame is destined to interface currently processed
171#define IEEE80211_RX_AMSDU BIT(2) 165 * (incl. multicast frames)
172#define IEEE80211_RX_FRAGMENTED BIT(3) 166 * @IEEE80211_RX_IN_SCAN: received while scanning
173/* only add flags here that do not change with subframes of an aMPDU */ 167 * @IEEE80211_RX_FRAGMENTED: fragmented frame
168 * @IEEE80211_RX_AMSDU: a-MSDU packet
169 * @IEEE80211_RX_MALFORMED_ACTION_FRM: action frame is malformed
170 *
171 * These are per-frame flags that are attached to a frame in the
172 * @rx_flags field of &struct ieee80211_rx_status.
173 */
174enum ieee80211_packet_rx_flags {
175 IEEE80211_RX_IN_SCAN = BIT(0),
176 IEEE80211_RX_RA_MATCH = BIT(1),
177 IEEE80211_RX_FRAGMENTED = BIT(2),
178 IEEE80211_RX_AMSDU = BIT(3),
179 IEEE80211_RX_MALFORMED_ACTION_FRM = BIT(4),
180};
181
182/**
183 * enum ieee80211_rx_flags - RX data flags
184 *
185 * @IEEE80211_RX_CMNTR: received on cooked monitor already
186 *
187 * These flags are used across handling multiple interfaces
188 * for a single frame.
189 */
190enum ieee80211_rx_flags {
191 IEEE80211_RX_CMNTR = BIT(0),
192};
174 193
175struct ieee80211_rx_data { 194struct ieee80211_rx_data {
176 struct sk_buff *skb; 195 struct sk_buff *skb;
@@ -343,10 +362,14 @@ struct ieee80211_if_managed {
343 unsigned long timers_running; /* used for quiesce/restart */ 362 unsigned long timers_running; /* used for quiesce/restart */
344 bool powersave; /* powersave requested for this iface */ 363 bool powersave; /* powersave requested for this iface */
345 enum ieee80211_smps_mode req_smps, /* requested smps mode */ 364 enum ieee80211_smps_mode req_smps, /* requested smps mode */
346 ap_smps; /* smps mode AP thinks we're in */ 365 ap_smps, /* smps mode AP thinks we're in */
366 driver_smps_mode; /* smps mode request */
367
368 struct work_struct request_smps_work;
347 369
348 unsigned int flags; 370 unsigned int flags;
349 371
372 bool beacon_crc_valid;
350 u32 beacon_crc; 373 u32 beacon_crc;
351 374
352 enum { 375 enum {
@@ -371,6 +394,13 @@ struct ieee80211_if_managed {
371 int ave_beacon_signal; 394 int ave_beacon_signal;
372 395
373 /* 396 /*
397 * Number of Beacon frames used in ave_beacon_signal. This can be used
398 * to avoid generating less reliable cqm events that would be based
399 * only on couple of received frames.
400 */
401 unsigned int count_beacon_signal;
402
403 /*
374 * Last Beacon frame signal strength average (ave_beacon_signal / 16) 404 * Last Beacon frame signal strength average (ave_beacon_signal / 16)
375 * that triggered a cqm event. 0 indicates that no event has been 405 * that triggered a cqm event. 0 indicates that no event has been
376 * generated for the current association. 406 * generated for the current association.
@@ -474,6 +504,19 @@ enum ieee80211_sub_if_data_flags {
474 IEEE80211_SDATA_DONT_BRIDGE_PACKETS = BIT(3), 504 IEEE80211_SDATA_DONT_BRIDGE_PACKETS = BIT(3),
475}; 505};
476 506
507/**
508 * enum ieee80211_sdata_state_bits - virtual interface state bits
509 * @SDATA_STATE_RUNNING: virtual interface is up & running; this
510 * mirrors netif_running() but is separate for interface type
511 * change handling while the interface is up
512 * @SDATA_STATE_OFFCHANNEL: This interface is currently in offchannel
513 * mode, so queues are stopped
514 */
515enum ieee80211_sdata_state_bits {
516 SDATA_STATE_RUNNING,
517 SDATA_STATE_OFFCHANNEL,
518};
519
477struct ieee80211_sub_if_data { 520struct ieee80211_sub_if_data {
478 struct list_head list; 521 struct list_head list;
479 522
@@ -487,6 +530,8 @@ struct ieee80211_sub_if_data {
487 530
488 unsigned int flags; 531 unsigned int flags;
489 532
533 unsigned long state;
534
490 int drop_unencrypted; 535 int drop_unencrypted;
491 536
492 char name[IFNAMSIZ]; 537 char name[IFNAMSIZ];
@@ -497,17 +542,20 @@ struct ieee80211_sub_if_data {
497 */ 542 */
498 bool ht_opmode_valid; 543 bool ht_opmode_valid;
499 544
545 /* to detect idle changes */
546 bool old_idle;
547
500 /* Fragment table for host-based reassembly */ 548 /* Fragment table for host-based reassembly */
501 struct ieee80211_fragment_entry fragments[IEEE80211_FRAGMENT_MAX]; 549 struct ieee80211_fragment_entry fragments[IEEE80211_FRAGMENT_MAX];
502 unsigned int fragment_next; 550 unsigned int fragment_next;
503 551
504#define NUM_DEFAULT_KEYS 4
505#define NUM_DEFAULT_MGMT_KEYS 2
506 struct ieee80211_key *keys[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS]; 552 struct ieee80211_key *keys[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS];
507 struct ieee80211_key *default_key; 553 struct ieee80211_key *default_key;
508 struct ieee80211_key *default_mgmt_key; 554 struct ieee80211_key *default_mgmt_key;
509 555
510 u16 sequence_number; 556 u16 sequence_number;
557 __be16 control_port_protocol;
558 bool control_port_no_encrypt;
511 559
512 struct work_struct work; 560 struct work_struct work;
513 struct sk_buff_head skb_queue; 561 struct sk_buff_head skb_queue;
@@ -539,6 +587,7 @@ struct ieee80211_sub_if_data {
539#ifdef CONFIG_MAC80211_DEBUGFS 587#ifdef CONFIG_MAC80211_DEBUGFS
540 struct { 588 struct {
541 struct dentry *dir; 589 struct dentry *dir;
590 struct dentry *subdir_stations;
542 struct dentry *default_key; 591 struct dentry *default_key;
543 struct dentry *default_mgmt_key; 592 struct dentry *default_mgmt_key;
544 } debugfs; 593 } debugfs;
@@ -595,11 +644,17 @@ enum queue_stop_reason {
595 * determine if we are on the operating channel or not 644 * determine if we are on the operating channel or not
596 * @SCAN_OFF_CHANNEL: We're off our operating channel for scanning, 645 * @SCAN_OFF_CHANNEL: We're off our operating channel for scanning,
597 * gets only set in conjunction with SCAN_SW_SCANNING 646 * gets only set in conjunction with SCAN_SW_SCANNING
647 * @SCAN_COMPLETED: Set for our scan work function when the driver reported
648 * that the scan completed.
649 * @SCAN_ABORTED: Set for our scan work function when the driver reported
650 * a scan complete for an aborted scan.
598 */ 651 */
599enum { 652enum {
600 SCAN_SW_SCANNING, 653 SCAN_SW_SCANNING,
601 SCAN_HW_SCANNING, 654 SCAN_HW_SCANNING,
602 SCAN_OFF_CHANNEL, 655 SCAN_OFF_CHANNEL,
656 SCAN_COMPLETED,
657 SCAN_ABORTED,
603}; 658};
604 659
605/** 660/**
@@ -634,7 +689,6 @@ struct ieee80211_local {
634 /* 689 /*
635 * work stuff, potentially off-channel (in the future) 690 * work stuff, potentially off-channel (in the future)
636 */ 691 */
637 struct mutex work_mtx;
638 struct list_head work_list; 692 struct list_head work_list;
639 struct timer_list work_timer; 693 struct timer_list work_timer;
640 struct work_struct work_work; 694 struct work_struct work_work;
@@ -653,9 +707,13 @@ struct ieee80211_local {
653 int open_count; 707 int open_count;
654 int monitors, cooked_mntrs; 708 int monitors, cooked_mntrs;
655 /* number of interfaces with corresponding FIF_ flags */ 709 /* number of interfaces with corresponding FIF_ flags */
656 int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll; 710 int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll,
711 fif_probe_req;
712 int probe_req_reg;
657 unsigned int filter_flags; /* FIF_* */ 713 unsigned int filter_flags; /* FIF_* */
658 714
715 bool wiphy_ciphers_allocated;
716
659 /* protects the aggregated multicast list and filter calls */ 717 /* protects the aggregated multicast list and filter calls */
660 spinlock_t filter_lock; 718 spinlock_t filter_lock;
661 719
@@ -746,9 +804,10 @@ struct ieee80211_local {
746 */ 804 */
747 struct mutex key_mtx; 805 struct mutex key_mtx;
748 806
807 /* mutex for scan and work locking */
808 struct mutex mtx;
749 809
750 /* Scanning and BSS list */ 810 /* Scanning and BSS list */
751 struct mutex scan_mtx;
752 unsigned long scanning; 811 unsigned long scanning;
753 struct cfg80211_ssid scan_ssid; 812 struct cfg80211_ssid scan_ssid;
754 struct cfg80211_scan_request *int_scan_req; 813 struct cfg80211_scan_request *int_scan_req;
@@ -866,10 +925,14 @@ struct ieee80211_local {
866#ifdef CONFIG_MAC80211_DEBUGFS 925#ifdef CONFIG_MAC80211_DEBUGFS
867 struct local_debugfsdentries { 926 struct local_debugfsdentries {
868 struct dentry *rcdir; 927 struct dentry *rcdir;
869 struct dentry *stations;
870 struct dentry *keys; 928 struct dentry *keys;
871 } debugfs; 929 } debugfs;
872#endif 930#endif
931
932 /* dummy netdev for use w/ NAPI */
933 struct net_device napi_dev;
934
935 struct napi_struct napi;
873}; 936};
874 937
875static inline struct ieee80211_sub_if_data * 938static inline struct ieee80211_sub_if_data *
@@ -1003,6 +1066,8 @@ void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata);
1003void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata); 1066void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata);
1004void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, 1067void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
1005 struct sk_buff *skb); 1068 struct sk_buff *skb);
1069void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata);
1070void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata);
1006 1071
1007/* IBSS code */ 1072/* IBSS code */
1008void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local); 1073void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local);
@@ -1068,10 +1133,12 @@ void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata);
1068void ieee80211_remove_interfaces(struct ieee80211_local *local); 1133void ieee80211_remove_interfaces(struct ieee80211_local *local);
1069u32 __ieee80211_recalc_idle(struct ieee80211_local *local); 1134u32 __ieee80211_recalc_idle(struct ieee80211_local *local);
1070void ieee80211_recalc_idle(struct ieee80211_local *local); 1135void ieee80211_recalc_idle(struct ieee80211_local *local);
1136void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata,
1137 const int offset);
1071 1138
1072static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata) 1139static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata)
1073{ 1140{
1074 return netif_running(sdata->dev); 1141 return test_bit(SDATA_STATE_RUNNING, &sdata->state);
1075} 1142}
1076 1143
1077/* tx handling */ 1144/* tx handling */
@@ -1105,12 +1172,13 @@ void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
1105int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, 1172int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata,
1106 enum ieee80211_smps_mode smps, const u8 *da, 1173 enum ieee80211_smps_mode smps, const u8 *da,
1107 const u8 *bssid); 1174 const u8 *bssid);
1175void ieee80211_request_smps_work(struct work_struct *work);
1108 1176
1109void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, 1177void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
1110 u16 initiator, u16 reason); 1178 u16 initiator, u16 reason, bool stop);
1111void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, 1179void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
1112 u16 initiator, u16 reason); 1180 u16 initiator, u16 reason, bool stop);
1113void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta); 1181void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, bool tx);
1114void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, 1182void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata,
1115 struct sta_info *sta, 1183 struct sta_info *sta,
1116 struct ieee80211_mgmt *mgmt, size_t len); 1184 struct ieee80211_mgmt *mgmt, size_t len);
@@ -1124,13 +1192,16 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
1124 size_t len); 1192 size_t len);
1125 1193
1126int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, 1194int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
1127 enum ieee80211_back_parties initiator); 1195 enum ieee80211_back_parties initiator,
1196 bool tx);
1128int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, 1197int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
1129 enum ieee80211_back_parties initiator); 1198 enum ieee80211_back_parties initiator,
1199 bool tx);
1130void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid); 1200void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid);
1131void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid); 1201void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid);
1132void ieee80211_ba_session_work(struct work_struct *work); 1202void ieee80211_ba_session_work(struct work_struct *work);
1133void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid); 1203void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid);
1204void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid);
1134 1205
1135/* Spectrum management */ 1206/* Spectrum management */
1136void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, 1207void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
@@ -1146,6 +1217,12 @@ int __ieee80211_suspend(struct ieee80211_hw *hw);
1146 1217
1147static inline int __ieee80211_resume(struct ieee80211_hw *hw) 1218static inline int __ieee80211_resume(struct ieee80211_hw *hw)
1148{ 1219{
1220 struct ieee80211_local *local = hw_to_local(hw);
1221
1222 WARN(test_bit(SCAN_HW_SCANNING, &local->scanning),
1223 "%s: resume with hardware scan still in progress\n",
1224 wiphy_name(hw->wiphy));
1225
1149 return ieee80211_reconfig(hw_to_local(hw)); 1226 return ieee80211_reconfig(hw_to_local(hw));
1150} 1227}
1151#else 1228#else
@@ -1208,7 +1285,8 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
1208 const u8 *key, u8 key_len, u8 key_idx); 1285 const u8 *key, u8 key_len, u8 key_idx);
1209int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, 1286int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
1210 const u8 *ie, size_t ie_len, 1287 const u8 *ie, size_t ie_len,
1211 enum ieee80211_band band); 1288 enum ieee80211_band band, u32 rate_mask,
1289 u8 channel);
1212void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, 1290void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
1213 const u8 *ssid, size_t ssid_len, 1291 const u8 *ssid, size_t ssid_len,
1214 const u8 *ie, size_t ie_len); 1292 const u8 *ie, size_t ie_len);
@@ -1221,8 +1299,7 @@ u32 ieee80211_sta_get_rates(struct ieee80211_local *local,
1221 enum ieee80211_band band); 1299 enum ieee80211_band band);
1222int __ieee80211_request_smps(struct ieee80211_sub_if_data *sdata, 1300int __ieee80211_request_smps(struct ieee80211_sub_if_data *sdata,
1223 enum ieee80211_smps_mode smps_mode); 1301 enum ieee80211_smps_mode smps_mode);
1224void ieee80211_recalc_smps(struct ieee80211_local *local, 1302void ieee80211_recalc_smps(struct ieee80211_local *local);
1225 struct ieee80211_sub_if_data *forsdata);
1226 1303
1227size_t ieee80211_ie_split(const u8 *ies, size_t ielen, 1304size_t ieee80211_ie_split(const u8 *ies, size_t ielen,
1228 const u8 *ids, int n_ids, size_t offset); 1305 const u8 *ids, int n_ids, size_t offset);
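
The header hunk above switches ieee80211_sdata_running() from netif_running() to testing an SDATA_STATE_RUNNING bit, which ieee80211_do_stop() clears before any teardown work runs. A minimal user-space sketch of that pattern follows, assuming only standard C11 atomics; the iface/IFACE_RUNNING names are illustrative stand-ins, not mac80211 symbols.

/* Model of the "running" state-bit pattern: clear the bit first on stop so
 * that deferred work observes "not running" and bails out early. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum { IFACE_RUNNING = 0 };            /* bit index, like SDATA_STATE_RUNNING */

struct iface {
	atomic_ulong state;            /* bitmask of IFACE_* bits */
};

static bool iface_running(struct iface *i)
{
	/* mirrors test_bit(): a lockless read of the state bit */
	return atomic_load(&i->state) & (1UL << IFACE_RUNNING);
}

static void iface_open(struct iface *i)
{
	atomic_fetch_or(&i->state, 1UL << IFACE_RUNNING);
}

static void iface_stop(struct iface *i)
{
	/* clear the bit before tearing anything down */
	atomic_fetch_and(&i->state, ~(1UL << IFACE_RUNNING));
	/* ... then stop queues, timers, stations ... */
}

int main(void)
{
	struct iface wlan0;

	atomic_init(&wlan0.state, 0);
	iface_open(&wlan0);
	printf("running: %d\n", iface_running(&wlan0));
	iface_stop(&wlan0);
	printf("running: %d\n", iface_running(&wlan0));
	return 0;
}
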
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index ebbe264e2b0b..f9163b12c7f1 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -24,6 +24,7 @@
24#include "led.h" 24#include "led.h"
25#include "driver-ops.h" 25#include "driver-ops.h"
26#include "wme.h" 26#include "wme.h"
27#include "rate.h"
27 28
28/** 29/**
29 * DOC: Interface list locking 30 * DOC: Interface list locking
@@ -94,21 +95,14 @@ static inline int identical_mac_addr_allowed(int type1, int type2)
94 type2 == NL80211_IFTYPE_AP_VLAN)); 95 type2 == NL80211_IFTYPE_AP_VLAN));
95} 96}
96 97
97static int ieee80211_open(struct net_device *dev) 98static int ieee80211_check_concurrent_iface(struct ieee80211_sub_if_data *sdata,
99 enum nl80211_iftype iftype)
98{ 100{
99 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
100 struct ieee80211_sub_if_data *nsdata;
101 struct ieee80211_local *local = sdata->local; 101 struct ieee80211_local *local = sdata->local;
102 struct sta_info *sta; 102 struct ieee80211_sub_if_data *nsdata;
103 u32 changed = 0; 103 struct net_device *dev = sdata->dev;
104 int res;
105 u32 hw_reconf_flags = 0;
106 u8 null_addr[ETH_ALEN] = {0};
107 104
108 /* fail early if user set an invalid address */ 105 ASSERT_RTNL();
109 if (compare_ether_addr(dev->dev_addr, null_addr) &&
110 !is_valid_ether_addr(dev->dev_addr))
111 return -EADDRNOTAVAIL;
112 106
113 /* we hold the RTNL here so can safely walk the list */ 107 /* we hold the RTNL here so can safely walk the list */
114 list_for_each_entry(nsdata, &local->interfaces, list) { 108 list_for_each_entry(nsdata, &local->interfaces, list) {
@@ -125,7 +119,7 @@ static int ieee80211_open(struct net_device *dev)
125 * belonging to the same hardware. Then, however, we're 119 * belonging to the same hardware. Then, however, we're
126 * faced with having to adopt two different TSF timers... 120 * faced with having to adopt two different TSF timers...
127 */ 121 */
128 if (sdata->vif.type == NL80211_IFTYPE_ADHOC && 122 if (iftype == NL80211_IFTYPE_ADHOC &&
129 nsdata->vif.type == NL80211_IFTYPE_ADHOC) 123 nsdata->vif.type == NL80211_IFTYPE_ADHOC)
130 return -EBUSY; 124 return -EBUSY;
131 125
@@ -139,19 +133,56 @@ static int ieee80211_open(struct net_device *dev)
139 /* 133 /*
140 * check whether it may have the same address 134 * check whether it may have the same address
141 */ 135 */
142 if (!identical_mac_addr_allowed(sdata->vif.type, 136 if (!identical_mac_addr_allowed(iftype,
143 nsdata->vif.type)) 137 nsdata->vif.type))
144 return -ENOTUNIQ; 138 return -ENOTUNIQ;
145 139
146 /* 140 /*
147 * can only add VLANs to enabled APs 141 * can only add VLANs to enabled APs
148 */ 142 */
149 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && 143 if (iftype == NL80211_IFTYPE_AP_VLAN &&
150 nsdata->vif.type == NL80211_IFTYPE_AP) 144 nsdata->vif.type == NL80211_IFTYPE_AP)
151 sdata->bss = &nsdata->u.ap; 145 sdata->bss = &nsdata->u.ap;
152 } 146 }
153 } 147 }
154 148
149 return 0;
150}
151
152void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata,
153 const int offset)
154{
155 struct ieee80211_local *local = sdata->local;
156 u32 flags = sdata->u.mntr_flags;
157
158#define ADJUST(_f, _s) do { \
159 if (flags & MONITOR_FLAG_##_f) \
160 local->fif_##_s += offset; \
161 } while (0)
162
163 ADJUST(FCSFAIL, fcsfail);
164 ADJUST(PLCPFAIL, plcpfail);
165 ADJUST(CONTROL, control);
166 ADJUST(CONTROL, pspoll);
167 ADJUST(OTHER_BSS, other_bss);
168
169#undef ADJUST
170}
171
172/*
173 * NOTE: Be very careful when changing this function, it must NOT return
174 * an error on interface type changes that have been pre-checked, so most
175 * checks should be in ieee80211_check_concurrent_iface.
176 */
177static int ieee80211_do_open(struct net_device *dev, bool coming_up)
178{
179 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
180 struct ieee80211_local *local = sdata->local;
181 struct sta_info *sta;
182 u32 changed = 0;
183 int res;
184 u32 hw_reconf_flags = 0;
185
155 switch (sdata->vif.type) { 186 switch (sdata->vif.type) {
156 case NL80211_IFTYPE_WDS: 187 case NL80211_IFTYPE_WDS:
157 if (!is_valid_ether_addr(sdata->u.wds.remote_addr)) 188 if (!is_valid_ether_addr(sdata->u.wds.remote_addr))
@@ -177,7 +208,9 @@ static int ieee80211_open(struct net_device *dev)
177 /* no special treatment */ 208 /* no special treatment */
178 break; 209 break;
179 case NL80211_IFTYPE_UNSPECIFIED: 210 case NL80211_IFTYPE_UNSPECIFIED:
180 case __NL80211_IFTYPE_AFTER_LAST: 211 case NUM_NL80211_IFTYPES:
212 case NL80211_IFTYPE_P2P_CLIENT:
213 case NL80211_IFTYPE_P2P_GO:
181 /* cannot happen */ 214 /* cannot happen */
182 WARN_ON(1); 215 WARN_ON(1);
183 break; 216 break;
@@ -187,39 +220,30 @@ static int ieee80211_open(struct net_device *dev)
187 res = drv_start(local); 220 res = drv_start(local);
188 if (res) 221 if (res)
189 goto err_del_bss; 222 goto err_del_bss;
223 if (local->ops->napi_poll)
224 napi_enable(&local->napi);
190 /* we're brought up, everything changes */ 225 /* we're brought up, everything changes */
191 hw_reconf_flags = ~0; 226 hw_reconf_flags = ~0;
192 ieee80211_led_radio(local, true); 227 ieee80211_led_radio(local, true);
193 } 228 }
194 229
195 /* 230 /*
196 * Check all interfaces and copy the hopefully now-present 231 * Copy the hopefully now-present MAC address to
197 * MAC address to those that have the special null one. 232 * this interface, if it has the special null one.
198 */ 233 */
199 list_for_each_entry(nsdata, &local->interfaces, list) { 234 if (is_zero_ether_addr(dev->dev_addr)) {
200 struct net_device *ndev = nsdata->dev; 235 memcpy(dev->dev_addr,
201 236 local->hw.wiphy->perm_addr,
202 /* 237 ETH_ALEN);
203 * No need to check running since we do not allow 238 memcpy(dev->perm_addr, dev->dev_addr, ETH_ALEN);
204 * it to start up with this invalid address. 239
205 */ 240 if (!is_valid_ether_addr(dev->dev_addr)) {
206 if (compare_ether_addr(null_addr, ndev->dev_addr) == 0) { 241 if (!local->open_count)
207 memcpy(ndev->dev_addr, 242 drv_stop(local);
208 local->hw.wiphy->perm_addr, 243 return -EADDRNOTAVAIL;
209 ETH_ALEN);
210 memcpy(ndev->perm_addr, ndev->dev_addr, ETH_ALEN);
211 } 244 }
212 } 245 }
213 246
214 /*
215 * Validate the MAC address for this device.
216 */
217 if (!is_valid_ether_addr(dev->dev_addr)) {
218 if (!local->open_count)
219 drv_stop(local);
220 return -EADDRNOTAVAIL;
221 }
222
223 switch (sdata->vif.type) { 247 switch (sdata->vif.type) {
224 case NL80211_IFTYPE_AP_VLAN: 248 case NL80211_IFTYPE_AP_VLAN:
225 /* no need to tell driver */ 249 /* no need to tell driver */
@@ -237,25 +261,17 @@ static int ieee80211_open(struct net_device *dev)
237 hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR; 261 hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR;
238 } 262 }
239 263
240 if (sdata->u.mntr_flags & MONITOR_FLAG_FCSFAIL) 264 ieee80211_adjust_monitor_flags(sdata, 1);
241 local->fif_fcsfail++;
242 if (sdata->u.mntr_flags & MONITOR_FLAG_PLCPFAIL)
243 local->fif_plcpfail++;
244 if (sdata->u.mntr_flags & MONITOR_FLAG_CONTROL) {
245 local->fif_control++;
246 local->fif_pspoll++;
247 }
248 if (sdata->u.mntr_flags & MONITOR_FLAG_OTHER_BSS)
249 local->fif_other_bss++;
250
251 ieee80211_configure_filter(local); 265 ieee80211_configure_filter(local);
252 266
253 netif_carrier_on(dev); 267 netif_carrier_on(dev);
254 break; 268 break;
255 default: 269 default:
256 res = drv_add_interface(local, &sdata->vif); 270 if (coming_up) {
257 if (res) 271 res = drv_add_interface(local, &sdata->vif);
258 goto err_stop; 272 if (res)
273 goto err_stop;
274 }
259 275
260 if (ieee80211_vif_is_mesh(&sdata->vif)) { 276 if (ieee80211_vif_is_mesh(&sdata->vif)) {
261 local->fif_other_bss++; 277 local->fif_other_bss++;
@@ -264,8 +280,11 @@ static int ieee80211_open(struct net_device *dev)
264 ieee80211_start_mesh(sdata); 280 ieee80211_start_mesh(sdata);
265 } else if (sdata->vif.type == NL80211_IFTYPE_AP) { 281 } else if (sdata->vif.type == NL80211_IFTYPE_AP) {
266 local->fif_pspoll++; 282 local->fif_pspoll++;
283 local->fif_probe_req++;
267 284
268 ieee80211_configure_filter(local); 285 ieee80211_configure_filter(local);
286 } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
287 local->fif_probe_req++;
269 } 288 }
270 289
271 changed |= ieee80211_reset_erp_info(sdata); 290 changed |= ieee80211_reset_erp_info(sdata);
@@ -277,6 +296,8 @@ static int ieee80211_open(struct net_device *dev)
277 netif_carrier_on(dev); 296 netif_carrier_on(dev);
278 } 297 }
279 298
299 set_bit(SDATA_STATE_RUNNING, &sdata->state);
300
280 if (sdata->vif.type == NL80211_IFTYPE_WDS) { 301 if (sdata->vif.type == NL80211_IFTYPE_WDS) {
281 /* Create STA entry for the WDS peer */ 302 /* Create STA entry for the WDS peer */
282 sta = sta_info_alloc(sdata, sdata->u.wds.remote_addr, 303 sta = sta_info_alloc(sdata, sdata->u.wds.remote_addr,
@@ -294,6 +315,8 @@ static int ieee80211_open(struct net_device *dev)
294 /* STA has been freed */ 315 /* STA has been freed */
295 goto err_del_interface; 316 goto err_del_interface;
296 } 317 }
318
319 rate_control_rate_init(sta);
297 } 320 }
298 321
299 /* 322 /*
@@ -307,9 +330,13 @@ static int ieee80211_open(struct net_device *dev)
307 if (sdata->flags & IEEE80211_SDATA_PROMISC) 330 if (sdata->flags & IEEE80211_SDATA_PROMISC)
308 atomic_inc(&local->iff_promiscs); 331 atomic_inc(&local->iff_promiscs);
309 332
333 mutex_lock(&local->mtx);
310 hw_reconf_flags |= __ieee80211_recalc_idle(local); 334 hw_reconf_flags |= __ieee80211_recalc_idle(local);
335 mutex_unlock(&local->mtx);
336
337 if (coming_up)
338 local->open_count++;
311 339
312 local->open_count++;
313 if (hw_reconf_flags) { 340 if (hw_reconf_flags) {
314 ieee80211_hw_config(local, hw_reconf_flags); 341 ieee80211_hw_config(local, hw_reconf_flags);
315 /* 342 /*
@@ -334,22 +361,42 @@ static int ieee80211_open(struct net_device *dev)
334 sdata->bss = NULL; 361 sdata->bss = NULL;
335 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) 362 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
336 list_del(&sdata->u.vlan.list); 363 list_del(&sdata->u.vlan.list);
364 clear_bit(SDATA_STATE_RUNNING, &sdata->state);
337 return res; 365 return res;
338} 366}
339 367
340static int ieee80211_stop(struct net_device *dev) 368static int ieee80211_open(struct net_device *dev)
341{ 369{
342 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); 370 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
371 int err;
372
373 /* fail early if user set an invalid address */
374 if (!is_zero_ether_addr(dev->dev_addr) &&
375 !is_valid_ether_addr(dev->dev_addr))
376 return -EADDRNOTAVAIL;
377
378 err = ieee80211_check_concurrent_iface(sdata, sdata->vif.type);
379 if (err)
380 return err;
381
382 return ieee80211_do_open(dev, true);
383}
384
385static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
386 bool going_down)
387{
343 struct ieee80211_local *local = sdata->local; 388 struct ieee80211_local *local = sdata->local;
344 unsigned long flags; 389 unsigned long flags;
345 struct sk_buff *skb, *tmp; 390 struct sk_buff *skb, *tmp;
346 u32 hw_reconf_flags = 0; 391 u32 hw_reconf_flags = 0;
347 int i; 392 int i;
348 393
394 clear_bit(SDATA_STATE_RUNNING, &sdata->state);
395
349 /* 396 /*
350 * Stop TX on this interface first. 397 * Stop TX on this interface first.
351 */ 398 */
352 netif_tx_stop_all_queues(dev); 399 netif_tx_stop_all_queues(sdata->dev);
353 400
354 /* 401 /*
355 * Purge work for this interface. 402 * Purge work for this interface.
@@ -366,12 +413,9 @@ static int ieee80211_stop(struct net_device *dev)
366 * (because if we remove a STA after ops->remove_interface() 413 * (because if we remove a STA after ops->remove_interface()
367 * the driver will have removed the vif info already!) 414 * the driver will have removed the vif info already!)
368 * 415 *
369 * We could relax this and only unlink the stations from the 416 * This is relevant only in AP, WDS and mesh modes, since in
370 * hash table and list but keep them on a per-sdata list that 417 * all other modes we've already removed all stations when
371 * will be inserted back again when the interface is brought 418 * disconnecting etc.
372 * up again, but I don't currently see a use case for that,
373 * except with WDS which gets a STA entry created when it is
374 * brought up.
375 */ 419 */
376 sta_info_flush(local, sdata); 420 sta_info_flush(local, sdata);
377 421
@@ -387,14 +431,19 @@ static int ieee80211_stop(struct net_device *dev)
387 if (sdata->flags & IEEE80211_SDATA_PROMISC) 431 if (sdata->flags & IEEE80211_SDATA_PROMISC)
388 atomic_dec(&local->iff_promiscs); 432 atomic_dec(&local->iff_promiscs);
389 433
390 if (sdata->vif.type == NL80211_IFTYPE_AP) 434 if (sdata->vif.type == NL80211_IFTYPE_AP) {
391 local->fif_pspoll--; 435 local->fif_pspoll--;
436 local->fif_probe_req--;
437 } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
438 local->fif_probe_req--;
439 }
392 440
393 netif_addr_lock_bh(dev); 441 netif_addr_lock_bh(sdata->dev);
394 spin_lock_bh(&local->filter_lock); 442 spin_lock_bh(&local->filter_lock);
395 __hw_addr_unsync(&local->mc_list, &dev->mc, dev->addr_len); 443 __hw_addr_unsync(&local->mc_list, &sdata->dev->mc,
444 sdata->dev->addr_len);
396 spin_unlock_bh(&local->filter_lock); 445 spin_unlock_bh(&local->filter_lock);
397 netif_addr_unlock_bh(dev); 446 netif_addr_unlock_bh(sdata->dev);
398 447
399 ieee80211_configure_filter(local); 448 ieee80211_configure_filter(local);
400 449
@@ -406,11 +455,21 @@ static int ieee80211_stop(struct net_device *dev)
406 struct ieee80211_sub_if_data *vlan, *tmpsdata; 455 struct ieee80211_sub_if_data *vlan, *tmpsdata;
407 struct beacon_data *old_beacon = sdata->u.ap.beacon; 456 struct beacon_data *old_beacon = sdata->u.ap.beacon;
408 457
458 /* sdata_running will return false, so this will disable */
459 ieee80211_bss_info_change_notify(sdata,
460 BSS_CHANGED_BEACON_ENABLED);
461
409 /* remove beacon */ 462 /* remove beacon */
410 rcu_assign_pointer(sdata->u.ap.beacon, NULL); 463 rcu_assign_pointer(sdata->u.ap.beacon, NULL);
411 synchronize_rcu(); 464 synchronize_rcu();
412 kfree(old_beacon); 465 kfree(old_beacon);
413 466
467 /* free all potentially still buffered bcast frames */
468 while ((skb = skb_dequeue(&sdata->u.ap.ps_bc_buf))) {
469 local->total_ps_buffered--;
470 dev_kfree_skb(skb);
471 }
472
414 /* down all dependent devices, that is VLANs */ 473 /* down all dependent devices, that is VLANs */
415 list_for_each_entry_safe(vlan, tmpsdata, &sdata->u.ap.vlans, 474 list_for_each_entry_safe(vlan, tmpsdata, &sdata->u.ap.vlans,
416 u.vlan.list) 475 u.vlan.list)
@@ -418,7 +477,8 @@ static int ieee80211_stop(struct net_device *dev)
418 WARN_ON(!list_empty(&sdata->u.ap.vlans)); 477 WARN_ON(!list_empty(&sdata->u.ap.vlans));
419 } 478 }
420 479
421 local->open_count--; 480 if (going_down)
481 local->open_count--;
422 482
423 switch (sdata->vif.type) { 483 switch (sdata->vif.type) {
424 case NL80211_IFTYPE_AP_VLAN: 484 case NL80211_IFTYPE_AP_VLAN:
@@ -437,40 +497,9 @@ static int ieee80211_stop(struct net_device *dev)
437 hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR; 497 hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR;
438 } 498 }
439 499
440 if (sdata->u.mntr_flags & MONITOR_FLAG_FCSFAIL) 500 ieee80211_adjust_monitor_flags(sdata, -1);
441 local->fif_fcsfail--;
442 if (sdata->u.mntr_flags & MONITOR_FLAG_PLCPFAIL)
443 local->fif_plcpfail--;
444 if (sdata->u.mntr_flags & MONITOR_FLAG_CONTROL) {
445 local->fif_pspoll--;
446 local->fif_control--;
447 }
448 if (sdata->u.mntr_flags & MONITOR_FLAG_OTHER_BSS)
449 local->fif_other_bss--;
450
451 ieee80211_configure_filter(local); 501 ieee80211_configure_filter(local);
452 break; 502 break;
453 case NL80211_IFTYPE_STATION:
454 del_timer_sync(&sdata->u.mgd.chswitch_timer);
455 del_timer_sync(&sdata->u.mgd.timer);
456 del_timer_sync(&sdata->u.mgd.conn_mon_timer);
457 del_timer_sync(&sdata->u.mgd.bcn_mon_timer);
458 /*
459 * If any of the timers fired while we waited for it, it will
460 * have queued its work. Now the work will be running again
461 * but will not rearm the timer again because it checks
462 * whether the interface is running, which, at this point,
463 * it no longer is.
464 */
465 cancel_work_sync(&sdata->u.mgd.chswitch_work);
466 cancel_work_sync(&sdata->u.mgd.monitor_work);
467 cancel_work_sync(&sdata->u.mgd.beacon_connection_loss_work);
468
469 /* fall through */
470 case NL80211_IFTYPE_ADHOC:
471 if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
472 del_timer_sync(&sdata->u.ibss.timer);
473 /* fall through */
474 case NL80211_IFTYPE_MESH_POINT: 503 case NL80211_IFTYPE_MESH_POINT:
475 if (ieee80211_vif_is_mesh(&sdata->vif)) { 504 if (ieee80211_vif_is_mesh(&sdata->vif)) {
476 /* other_bss and allmulti are always set on mesh 505 /* other_bss and allmulti are always set on mesh
@@ -498,27 +527,34 @@ static int ieee80211_stop(struct net_device *dev)
498 ieee80211_scan_cancel(local); 527 ieee80211_scan_cancel(local);
499 528
500 /* 529 /*
501 * Disable beaconing for AP and mesh, IBSS can't 530 * Disable beaconing here for mesh only, AP and IBSS
502 * still be joined to a network at this point. 531 * are already taken care of.
503 */ 532 */
504 if (sdata->vif.type == NL80211_IFTYPE_AP || 533 if (sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
505 sdata->vif.type == NL80211_IFTYPE_MESH_POINT) {
506 ieee80211_bss_info_change_notify(sdata, 534 ieee80211_bss_info_change_notify(sdata,
507 BSS_CHANGED_BEACON_ENABLED); 535 BSS_CHANGED_BEACON_ENABLED);
508 }
509 536
510 /* free all remaining keys, there shouldn't be any */ 537 /*
538 * Free all remaining keys, there shouldn't be any,
 539 * except maybe group keys in AP mode or WDS?
540 */
511 ieee80211_free_keys(sdata); 541 ieee80211_free_keys(sdata);
512 drv_remove_interface(local, &sdata->vif); 542
543 if (going_down)
544 drv_remove_interface(local, &sdata->vif);
513 } 545 }
514 546
515 sdata->bss = NULL; 547 sdata->bss = NULL;
516 548
549 mutex_lock(&local->mtx);
517 hw_reconf_flags |= __ieee80211_recalc_idle(local); 550 hw_reconf_flags |= __ieee80211_recalc_idle(local);
551 mutex_unlock(&local->mtx);
518 552
519 ieee80211_recalc_ps(local, -1); 553 ieee80211_recalc_ps(local, -1);
520 554
521 if (local->open_count == 0) { 555 if (local->open_count == 0) {
556 if (local->ops->napi_poll)
557 napi_disable(&local->napi);
522 ieee80211_clear_tx_pending(local); 558 ieee80211_clear_tx_pending(local);
523 ieee80211_stop_device(local); 559 ieee80211_stop_device(local);
524 560
@@ -541,6 +577,13 @@ static int ieee80211_stop(struct net_device *dev)
541 } 577 }
542 } 578 }
543 spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); 579 spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
580}
581
582static int ieee80211_stop(struct net_device *dev)
583{
584 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
585
586 ieee80211_do_stop(sdata, true);
544 587
545 return 0; 588 return 0;
546} 589}
@@ -585,8 +628,6 @@ static void ieee80211_teardown_sdata(struct net_device *dev)
585{ 628{
586 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); 629 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
587 struct ieee80211_local *local = sdata->local; 630 struct ieee80211_local *local = sdata->local;
588 struct beacon_data *beacon;
589 struct sk_buff *skb;
590 int flushed; 631 int flushed;
591 int i; 632 int i;
592 633
@@ -599,37 +640,8 @@ static void ieee80211_teardown_sdata(struct net_device *dev)
599 __skb_queue_purge(&sdata->fragments[i].skb_list); 640 __skb_queue_purge(&sdata->fragments[i].skb_list);
600 sdata->fragment_next = 0; 641 sdata->fragment_next = 0;
601 642
602 switch (sdata->vif.type) { 643 if (ieee80211_vif_is_mesh(&sdata->vif))
603 case NL80211_IFTYPE_AP: 644 mesh_rmc_free(sdata);
604 beacon = sdata->u.ap.beacon;
605 rcu_assign_pointer(sdata->u.ap.beacon, NULL);
606 synchronize_rcu();
607 kfree(beacon);
608
609 while ((skb = skb_dequeue(&sdata->u.ap.ps_bc_buf))) {
610 local->total_ps_buffered--;
611 dev_kfree_skb(skb);
612 }
613
614 break;
615 case NL80211_IFTYPE_MESH_POINT:
616 if (ieee80211_vif_is_mesh(&sdata->vif))
617 mesh_rmc_free(sdata);
618 break;
619 case NL80211_IFTYPE_ADHOC:
620 if (WARN_ON(sdata->u.ibss.presp))
621 kfree_skb(sdata->u.ibss.presp);
622 break;
623 case NL80211_IFTYPE_STATION:
624 case NL80211_IFTYPE_WDS:
625 case NL80211_IFTYPE_AP_VLAN:
626 case NL80211_IFTYPE_MONITOR:
627 break;
628 case NL80211_IFTYPE_UNSPECIFIED:
629 case __NL80211_IFTYPE_AFTER_LAST:
630 BUG();
631 break;
632 }
633 645
634 flushed = sta_info_flush(local, sdata); 646 flushed = sta_info_flush(local, sdata);
635 WARN_ON(flushed); 647 WARN_ON(flushed);
@@ -791,7 +803,8 @@ static void ieee80211_iface_work(struct work_struct *work)
791 803
792 __ieee80211_stop_rx_ba_session( 804 __ieee80211_stop_rx_ba_session(
793 sta, tid, WLAN_BACK_RECIPIENT, 805 sta, tid, WLAN_BACK_RECIPIENT,
794 WLAN_REASON_QSTA_REQUIRE_SETUP); 806 WLAN_REASON_QSTA_REQUIRE_SETUP,
807 true);
795 } 808 }
796 mutex_unlock(&local->sta_mtx); 809 mutex_unlock(&local->sta_mtx);
797 } else switch (sdata->vif.type) { 810 } else switch (sdata->vif.type) {
@@ -844,9 +857,13 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
844 857
845 /* and set some type-dependent values */ 858 /* and set some type-dependent values */
846 sdata->vif.type = type; 859 sdata->vif.type = type;
860 sdata->vif.p2p = false;
847 sdata->dev->netdev_ops = &ieee80211_dataif_ops; 861 sdata->dev->netdev_ops = &ieee80211_dataif_ops;
848 sdata->wdev.iftype = type; 862 sdata->wdev.iftype = type;
849 863
864 sdata->control_port_protocol = cpu_to_be16(ETH_P_PAE);
865 sdata->control_port_no_encrypt = false;
866
850 /* only monitor differs */ 867 /* only monitor differs */
851 sdata->dev->type = ARPHRD_ETHER; 868 sdata->dev->type = ARPHRD_ETHER;
852 869
@@ -854,10 +871,20 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
854 INIT_WORK(&sdata->work, ieee80211_iface_work); 871 INIT_WORK(&sdata->work, ieee80211_iface_work);
855 872
856 switch (type) { 873 switch (type) {
874 case NL80211_IFTYPE_P2P_GO:
875 type = NL80211_IFTYPE_AP;
876 sdata->vif.type = type;
877 sdata->vif.p2p = true;
878 /* fall through */
857 case NL80211_IFTYPE_AP: 879 case NL80211_IFTYPE_AP:
858 skb_queue_head_init(&sdata->u.ap.ps_bc_buf); 880 skb_queue_head_init(&sdata->u.ap.ps_bc_buf);
859 INIT_LIST_HEAD(&sdata->u.ap.vlans); 881 INIT_LIST_HEAD(&sdata->u.ap.vlans);
860 break; 882 break;
883 case NL80211_IFTYPE_P2P_CLIENT:
884 type = NL80211_IFTYPE_STATION;
885 sdata->vif.type = type;
886 sdata->vif.p2p = true;
887 /* fall through */
861 case NL80211_IFTYPE_STATION: 888 case NL80211_IFTYPE_STATION:
862 ieee80211_sta_setup_sdata(sdata); 889 ieee80211_sta_setup_sdata(sdata);
863 break; 890 break;
@@ -878,7 +905,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
878 case NL80211_IFTYPE_AP_VLAN: 905 case NL80211_IFTYPE_AP_VLAN:
879 break; 906 break;
880 case NL80211_IFTYPE_UNSPECIFIED: 907 case NL80211_IFTYPE_UNSPECIFIED:
881 case __NL80211_IFTYPE_AFTER_LAST: 908 case NUM_NL80211_IFTYPES:
882 BUG(); 909 BUG();
883 break; 910 break;
884 } 911 }
@@ -886,12 +913,85 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
886 ieee80211_debugfs_add_netdev(sdata); 913 ieee80211_debugfs_add_netdev(sdata);
887} 914}
888 915
916static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata,
917 enum nl80211_iftype type)
918{
919 struct ieee80211_local *local = sdata->local;
920 int ret, err;
921 enum nl80211_iftype internal_type = type;
922 bool p2p = false;
923
924 ASSERT_RTNL();
925
926 if (!local->ops->change_interface)
927 return -EBUSY;
928
929 switch (sdata->vif.type) {
930 case NL80211_IFTYPE_AP:
931 case NL80211_IFTYPE_STATION:
932 case NL80211_IFTYPE_ADHOC:
933 /*
934 * Could maybe also all others here?
935 * Just not sure how that interacts
936 * with the RX/config path e.g. for
937 * mesh.
938 */
939 break;
940 default:
941 return -EBUSY;
942 }
943
944 switch (type) {
945 case NL80211_IFTYPE_AP:
946 case NL80211_IFTYPE_STATION:
947 case NL80211_IFTYPE_ADHOC:
948 /*
949 * Could probably support everything
950 * but WDS here (WDS do_open can fail
951 * under memory pressure, which this
952 * code isn't prepared to handle).
953 */
954 break;
955 case NL80211_IFTYPE_P2P_CLIENT:
956 p2p = true;
957 internal_type = NL80211_IFTYPE_STATION;
958 break;
959 case NL80211_IFTYPE_P2P_GO:
960 p2p = true;
961 internal_type = NL80211_IFTYPE_AP;
962 break;
963 default:
964 return -EBUSY;
965 }
966
967 ret = ieee80211_check_concurrent_iface(sdata, internal_type);
968 if (ret)
969 return ret;
970
971 ieee80211_do_stop(sdata, false);
972
973 ieee80211_teardown_sdata(sdata->dev);
974
975 ret = drv_change_interface(local, sdata, internal_type, p2p);
976 if (ret)
977 type = sdata->vif.type;
978
979 ieee80211_setup_sdata(sdata, type);
980
981 err = ieee80211_do_open(sdata->dev, false);
982 WARN(err, "type change: do_open returned %d", err);
983
984 return ret;
985}
986
889int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata, 987int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata,
890 enum nl80211_iftype type) 988 enum nl80211_iftype type)
891{ 989{
990 int ret;
991
892 ASSERT_RTNL(); 992 ASSERT_RTNL();
893 993
894 if (type == sdata->vif.type) 994 if (type == ieee80211_vif_type_p2p(&sdata->vif))
895 return 0; 995 return 0;
896 996
897 /* Setting ad-hoc mode on non-IBSS channel is not supported. */ 997 /* Setting ad-hoc mode on non-IBSS channel is not supported. */
@@ -899,18 +999,15 @@ int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata,
899 type == NL80211_IFTYPE_ADHOC) 999 type == NL80211_IFTYPE_ADHOC)
900 return -EOPNOTSUPP; 1000 return -EOPNOTSUPP;
901 1001
902 /* 1002 if (ieee80211_sdata_running(sdata)) {
903 * We could, here, on changes between IBSS/STA/MESH modes, 1003 ret = ieee80211_runtime_change_iftype(sdata, type);
904 * invoke an MLME function instead that disassociates etc. 1004 if (ret)
905 * and goes into the requested mode. 1005 return ret;
906 */ 1006 } else {
907 1007 /* Purge and reset type-dependent state. */
908 if (ieee80211_sdata_running(sdata)) 1008 ieee80211_teardown_sdata(sdata->dev);
909 return -EBUSY; 1009 ieee80211_setup_sdata(sdata, type);
910 1010 }
911 /* Purge and reset type-dependent state. */
912 ieee80211_teardown_sdata(sdata->dev);
913 ieee80211_setup_sdata(sdata, type);
914 1011
915 /* reset some values that shouldn't be kept across type changes */ 1012 /* reset some values that shouldn't be kept across type changes */
916 sdata->vif.bss_conf.basic_rates = 1013 sdata->vif.bss_conf.basic_rates =
@@ -1167,8 +1264,7 @@ static u32 ieee80211_idle_off(struct ieee80211_local *local,
1167 return 0; 1264 return 0;
1168 1265
1169#ifdef CONFIG_MAC80211_VERBOSE_DEBUG 1266#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
1170 printk(KERN_DEBUG "%s: device no longer idle - %s\n", 1267 wiphy_debug(local->hw.wiphy, "device no longer idle - %s\n", reason);
1171 wiphy_name(local->hw.wiphy), reason);
1172#endif 1268#endif
1173 1269
1174 local->hw.conf.flags &= ~IEEE80211_CONF_IDLE; 1270 local->hw.conf.flags &= ~IEEE80211_CONF_IDLE;
@@ -1181,8 +1277,7 @@ static u32 ieee80211_idle_on(struct ieee80211_local *local)
1181 return 0; 1277 return 0;
1182 1278
1183#ifdef CONFIG_MAC80211_VERBOSE_DEBUG 1279#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
1184 printk(KERN_DEBUG "%s: device now idle\n", 1280 wiphy_debug(local->hw.wiphy, "device now idle\n");
1185 wiphy_name(local->hw.wiphy));
1186#endif 1281#endif
1187 1282
1188 drv_flush(local, false); 1283 drv_flush(local, false);
@@ -1195,28 +1290,61 @@ u32 __ieee80211_recalc_idle(struct ieee80211_local *local)
1195{ 1290{
1196 struct ieee80211_sub_if_data *sdata; 1291 struct ieee80211_sub_if_data *sdata;
1197 int count = 0; 1292 int count = 0;
1293 bool working = false, scanning = false;
1294 struct ieee80211_work *wk;
1198 1295
1199 if (!list_empty(&local->work_list)) 1296#ifdef CONFIG_PROVE_LOCKING
1200 return ieee80211_idle_off(local, "working"); 1297 WARN_ON(debug_locks && !lockdep_rtnl_is_held() &&
1201 1298 !lockdep_is_held(&local->iflist_mtx));
1202 if (local->scanning) 1299#endif
1203 return ieee80211_idle_off(local, "scanning"); 1300 lockdep_assert_held(&local->mtx);
1204 1301
1205 list_for_each_entry(sdata, &local->interfaces, list) { 1302 list_for_each_entry(sdata, &local->interfaces, list) {
1206 if (!ieee80211_sdata_running(sdata)) 1303 if (!ieee80211_sdata_running(sdata)) {
1304 sdata->vif.bss_conf.idle = true;
1207 continue; 1305 continue;
1306 }
1307
1308 sdata->old_idle = sdata->vif.bss_conf.idle;
1309
1208 /* do not count disabled managed interfaces */ 1310 /* do not count disabled managed interfaces */
1209 if (sdata->vif.type == NL80211_IFTYPE_STATION && 1311 if (sdata->vif.type == NL80211_IFTYPE_STATION &&
1210 !sdata->u.mgd.associated) 1312 !sdata->u.mgd.associated) {
1313 sdata->vif.bss_conf.idle = true;
1211 continue; 1314 continue;
1315 }
1212 /* do not count unused IBSS interfaces */ 1316 /* do not count unused IBSS interfaces */
1213 if (sdata->vif.type == NL80211_IFTYPE_ADHOC && 1317 if (sdata->vif.type == NL80211_IFTYPE_ADHOC &&
1214 !sdata->u.ibss.ssid_len) 1318 !sdata->u.ibss.ssid_len) {
1319 sdata->vif.bss_conf.idle = true;
1215 continue; 1320 continue;
1321 }
1216 /* count everything else */ 1322 /* count everything else */
1217 count++; 1323 count++;
1218 } 1324 }
1219 1325
1326 list_for_each_entry(wk, &local->work_list, list) {
1327 working = true;
1328 wk->sdata->vif.bss_conf.idle = false;
1329 }
1330
1331 if (local->scan_sdata) {
1332 scanning = true;
1333 local->scan_sdata->vif.bss_conf.idle = false;
1334 }
1335
1336 list_for_each_entry(sdata, &local->interfaces, list) {
1337 if (sdata->old_idle == sdata->vif.bss_conf.idle)
1338 continue;
1339 if (!ieee80211_sdata_running(sdata))
1340 continue;
1341 ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_IDLE);
1342 }
1343
1344 if (working)
1345 return ieee80211_idle_off(local, "working");
1346 if (scanning)
1347 return ieee80211_idle_off(local, "scanning");
1220 if (!count) 1348 if (!count)
1221 return ieee80211_idle_on(local); 1349 return ieee80211_idle_on(local);
1222 else 1350 else
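
The iface.c changes above replace two hand-written copies of the monitor filter-counter updates with one helper, ieee80211_adjust_monitor_flags(), called with +1 on open and -1 on stop. Below is a self-contained sketch of that factoring, assuming nothing beyond the diff; the MON_* constants and struct names are illustrative, not the in-kernel ones.

/* One helper applies the per-flag counter delta for both open and stop. */
#include <stdio.h>

#define MON_FCSFAIL   (1U << 0)
#define MON_PLCPFAIL  (1U << 1)
#define MON_CONTROL   (1U << 2)
#define MON_OTHER_BSS (1U << 3)

struct filter_counts {
	int fcsfail, plcpfail, control, pspoll, other_bss;
};

static void adjust_monitor_flags(struct filter_counts *c, unsigned int flags,
				 int offset)
{
#define ADJUST(_f, _s) do {			\
	if (flags & MON_##_f)			\
		c->_s += offset;		\
	} while (0)

	ADJUST(FCSFAIL, fcsfail);
	ADJUST(PLCPFAIL, plcpfail);
	ADJUST(CONTROL, control);
	ADJUST(CONTROL, pspoll);	/* control frames also bump the PS-Poll filter */
	ADJUST(OTHER_BSS, other_bss);
#undef ADJUST
}

int main(void)
{
	struct filter_counts c = { 0 };

	adjust_monitor_flags(&c, MON_CONTROL | MON_OTHER_BSS, 1);  /* open */
	printf("control=%d pspoll=%d other_bss=%d\n",
	       c.control, c.pspoll, c.other_bss);
	adjust_monitor_flags(&c, MON_CONTROL | MON_OTHER_BSS, -1); /* stop */
	printf("control=%d pspoll=%d other_bss=%d\n",
	       c.control, c.pspoll, c.other_bss);
	return 0;
}
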
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 1b9d87ed143a..ccd676b2f599 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -49,7 +49,7 @@ static const u8 bcast_addr[ETH_ALEN] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
49 49
50static void assert_key_lock(struct ieee80211_local *local) 50static void assert_key_lock(struct ieee80211_local *local)
51{ 51{
52 WARN_ON(!mutex_is_locked(&local->key_mtx)); 52 lockdep_assert_held(&local->key_mtx);
53} 53}
54 54
55static struct ieee80211_sta *get_sta_for_key(struct ieee80211_key *key) 55static struct ieee80211_sta *get_sta_for_key(struct ieee80211_key *key)
@@ -60,7 +60,7 @@ static struct ieee80211_sta *get_sta_for_key(struct ieee80211_key *key)
60 return NULL; 60 return NULL;
61} 61}
62 62
63static void ieee80211_key_enable_hw_accel(struct ieee80211_key *key) 63static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
64{ 64{
65 struct ieee80211_sub_if_data *sdata; 65 struct ieee80211_sub_if_data *sdata;
66 struct ieee80211_sta *sta; 66 struct ieee80211_sta *sta;
@@ -69,12 +69,20 @@ static void ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
69 might_sleep(); 69 might_sleep();
70 70
71 if (!key->local->ops->set_key) 71 if (!key->local->ops->set_key)
72 return; 72 goto out_unsupported;
73 73
74 assert_key_lock(key->local); 74 assert_key_lock(key->local);
75 75
76 sta = get_sta_for_key(key); 76 sta = get_sta_for_key(key);
77 77
78 /*
79 * If this is a per-STA GTK, check if it
80 * is supported; if not, return.
81 */
82 if (sta && !(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE) &&
83 !(key->local->hw.flags & IEEE80211_HW_SUPPORTS_PER_STA_GTK))
84 goto out_unsupported;
85
78 sdata = key->sdata; 86 sdata = key->sdata;
79 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) 87 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
80 sdata = container_of(sdata->bss, 88 sdata = container_of(sdata->bss,
@@ -83,14 +91,28 @@ static void ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
83 91
84 ret = drv_set_key(key->local, SET_KEY, sdata, sta, &key->conf); 92 ret = drv_set_key(key->local, SET_KEY, sdata, sta, &key->conf);
85 93
86 if (!ret) 94 if (!ret) {
87 key->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE; 95 key->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE;
96 return 0;
97 }
88 98
89 if (ret && ret != -ENOSPC && ret != -EOPNOTSUPP) 99 if (ret != -ENOSPC && ret != -EOPNOTSUPP)
90 printk(KERN_ERR "mac80211-%s: failed to set key " 100 wiphy_err(key->local->hw.wiphy,
91 "(%d, %pM) to hardware (%d)\n", 101 "failed to set key (%d, %pM) to hardware (%d)\n",
92 wiphy_name(key->local->hw.wiphy), 102 key->conf.keyidx, sta ? sta->addr : bcast_addr, ret);
93 key->conf.keyidx, sta ? sta->addr : bcast_addr, ret); 103
104 out_unsupported:
105 switch (key->conf.cipher) {
106 case WLAN_CIPHER_SUITE_WEP40:
107 case WLAN_CIPHER_SUITE_WEP104:
108 case WLAN_CIPHER_SUITE_TKIP:
109 case WLAN_CIPHER_SUITE_CCMP:
110 case WLAN_CIPHER_SUITE_AES_CMAC:
111 /* all of these we can do in software */
112 return 0;
113 default:
114 return -EINVAL;
115 }
94} 116}
95 117
96static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) 118static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key)
@@ -121,14 +143,33 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key)
121 sta, &key->conf); 143 sta, &key->conf);
122 144
123 if (ret) 145 if (ret)
124 printk(KERN_ERR "mac80211-%s: failed to remove key " 146 wiphy_err(key->local->hw.wiphy,
125 "(%d, %pM) from hardware (%d)\n", 147 "failed to remove key (%d, %pM) from hardware (%d)\n",
126 wiphy_name(key->local->hw.wiphy), 148 key->conf.keyidx, sta ? sta->addr : bcast_addr, ret);
127 key->conf.keyidx, sta ? sta->addr : bcast_addr, ret);
128 149
129 key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; 150 key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE;
130} 151}
131 152
153void ieee80211_key_removed(struct ieee80211_key_conf *key_conf)
154{
155 struct ieee80211_key *key;
156
157 key = container_of(key_conf, struct ieee80211_key, conf);
158
159 might_sleep();
160 assert_key_lock(key->local);
161
162 key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE;
163
164 /*
165 * Flush TX path to avoid attempts to use this key
166 * after this function returns. Until then, drivers
167 * must be prepared to handle the key.
168 */
169 synchronize_rcu();
170}
171EXPORT_SYMBOL_GPL(ieee80211_key_removed);
172
132static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, 173static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata,
133 int idx) 174 int idx)
134{ 175{
@@ -184,6 +225,7 @@ void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata,
184 225
185static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, 226static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
186 struct sta_info *sta, 227 struct sta_info *sta,
228 bool pairwise,
187 struct ieee80211_key *old, 229 struct ieee80211_key *old,
188 struct ieee80211_key *new) 230 struct ieee80211_key *new)
189{ 231{
@@ -192,8 +234,14 @@ static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
192 if (new) 234 if (new)
193 list_add(&new->list, &sdata->key_list); 235 list_add(&new->list, &sdata->key_list);
194 236
195 if (sta) { 237 if (sta && pairwise) {
196 rcu_assign_pointer(sta->key, new); 238 rcu_assign_pointer(sta->ptk, new);
239 } else if (sta) {
240 if (old)
241 idx = old->conf.keyidx;
242 else
243 idx = new->conf.keyidx;
244 rcu_assign_pointer(sta->gtk[idx], new);
197 } else { 245 } else {
198 WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx); 246 WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx);
199 247
@@ -227,20 +275,18 @@ static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
227 } 275 }
228} 276}
229 277
230struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, 278struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len,
231 int idx,
232 size_t key_len,
233 const u8 *key_data, 279 const u8 *key_data,
234 size_t seq_len, const u8 *seq) 280 size_t seq_len, const u8 *seq)
235{ 281{
236 struct ieee80211_key *key; 282 struct ieee80211_key *key;
237 int i, j; 283 int i, j, err;
238 284
239 BUG_ON(idx < 0 || idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS); 285 BUG_ON(idx < 0 || idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS);
240 286
241 key = kzalloc(sizeof(struct ieee80211_key) + key_len, GFP_KERNEL); 287 key = kzalloc(sizeof(struct ieee80211_key) + key_len, GFP_KERNEL);
242 if (!key) 288 if (!key)
243 return NULL; 289 return ERR_PTR(-ENOMEM);
244 290
245 /* 291 /*
246 * Default to software encryption; we'll later upload the 292 * Default to software encryption; we'll later upload the
@@ -249,15 +295,16 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg,
249 key->conf.flags = 0; 295 key->conf.flags = 0;
250 key->flags = 0; 296 key->flags = 0;
251 297
252 key->conf.alg = alg; 298 key->conf.cipher = cipher;
253 key->conf.keyidx = idx; 299 key->conf.keyidx = idx;
254 key->conf.keylen = key_len; 300 key->conf.keylen = key_len;
255 switch (alg) { 301 switch (cipher) {
256 case ALG_WEP: 302 case WLAN_CIPHER_SUITE_WEP40:
303 case WLAN_CIPHER_SUITE_WEP104:
257 key->conf.iv_len = WEP_IV_LEN; 304 key->conf.iv_len = WEP_IV_LEN;
258 key->conf.icv_len = WEP_ICV_LEN; 305 key->conf.icv_len = WEP_ICV_LEN;
259 break; 306 break;
260 case ALG_TKIP: 307 case WLAN_CIPHER_SUITE_TKIP:
261 key->conf.iv_len = TKIP_IV_LEN; 308 key->conf.iv_len = TKIP_IV_LEN;
262 key->conf.icv_len = TKIP_ICV_LEN; 309 key->conf.icv_len = TKIP_ICV_LEN;
263 if (seq) { 310 if (seq) {
@@ -269,7 +316,7 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg,
269 } 316 }
270 } 317 }
271 break; 318 break;
272 case ALG_CCMP: 319 case WLAN_CIPHER_SUITE_CCMP:
273 key->conf.iv_len = CCMP_HDR_LEN; 320 key->conf.iv_len = CCMP_HDR_LEN;
274 key->conf.icv_len = CCMP_MIC_LEN; 321 key->conf.icv_len = CCMP_MIC_LEN;
275 if (seq) { 322 if (seq) {
@@ -278,42 +325,38 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg,
278 key->u.ccmp.rx_pn[i][j] = 325 key->u.ccmp.rx_pn[i][j] =
279 seq[CCMP_PN_LEN - j - 1]; 326 seq[CCMP_PN_LEN - j - 1];
280 } 327 }
281 break;
282 case ALG_AES_CMAC:
283 key->conf.iv_len = 0;
284 key->conf.icv_len = sizeof(struct ieee80211_mmie);
285 if (seq)
286 for (j = 0; j < 6; j++)
287 key->u.aes_cmac.rx_pn[j] = seq[6 - j - 1];
288 break;
289 }
290 memcpy(key->conf.key, key_data, key_len);
291 INIT_LIST_HEAD(&key->list);
292
293 if (alg == ALG_CCMP) {
294 /* 328 /*
295 * Initialize AES key state here as an optimization so that 329 * Initialize AES key state here as an optimization so that
296 * it does not need to be initialized for every packet. 330 * it does not need to be initialized for every packet.
297 */ 331 */
298 key->u.ccmp.tfm = ieee80211_aes_key_setup_encrypt(key_data); 332 key->u.ccmp.tfm = ieee80211_aes_key_setup_encrypt(key_data);
299 if (!key->u.ccmp.tfm) { 333 if (IS_ERR(key->u.ccmp.tfm)) {
334 err = PTR_ERR(key->u.ccmp.tfm);
300 kfree(key); 335 kfree(key);
301 return NULL; 336 key = ERR_PTR(err);
302 } 337 }
303 } 338 break;
304 339 case WLAN_CIPHER_SUITE_AES_CMAC:
305 if (alg == ALG_AES_CMAC) { 340 key->conf.iv_len = 0;
341 key->conf.icv_len = sizeof(struct ieee80211_mmie);
342 if (seq)
343 for (j = 0; j < 6; j++)
344 key->u.aes_cmac.rx_pn[j] = seq[6 - j - 1];
306 /* 345 /*
307 * Initialize AES key state here as an optimization so that 346 * Initialize AES key state here as an optimization so that
308 * it does not need to be initialized for every packet. 347 * it does not need to be initialized for every packet.
309 */ 348 */
310 key->u.aes_cmac.tfm = 349 key->u.aes_cmac.tfm =
311 ieee80211_aes_cmac_key_setup(key_data); 350 ieee80211_aes_cmac_key_setup(key_data);
312 if (!key->u.aes_cmac.tfm) { 351 if (IS_ERR(key->u.aes_cmac.tfm)) {
352 err = PTR_ERR(key->u.aes_cmac.tfm);
313 kfree(key); 353 kfree(key);
314 return NULL; 354 key = ERR_PTR(err);
315 } 355 }
356 break;
316 } 357 }
358 memcpy(key->conf.key, key_data, key_len);
359 INIT_LIST_HEAD(&key->list);
317 360
318 return key; 361 return key;
319} 362}
@@ -326,9 +369,9 @@ static void __ieee80211_key_destroy(struct ieee80211_key *key)
326 if (key->local) 369 if (key->local)
327 ieee80211_key_disable_hw_accel(key); 370 ieee80211_key_disable_hw_accel(key);
328 371
329 if (key->conf.alg == ALG_CCMP) 372 if (key->conf.cipher == WLAN_CIPHER_SUITE_CCMP)
330 ieee80211_aes_key_free(key->u.ccmp.tfm); 373 ieee80211_aes_key_free(key->u.ccmp.tfm);
331 if (key->conf.alg == ALG_AES_CMAC) 374 if (key->conf.cipher == WLAN_CIPHER_SUITE_AES_CMAC)
332 ieee80211_aes_cmac_key_free(key->u.aes_cmac.tfm); 375 ieee80211_aes_cmac_key_free(key->u.aes_cmac.tfm);
333 if (key->local) 376 if (key->local)
334 ieee80211_debugfs_key_remove(key); 377 ieee80211_debugfs_key_remove(key);
@@ -336,12 +379,13 @@ static void __ieee80211_key_destroy(struct ieee80211_key *key)
336 kfree(key); 379 kfree(key);
337} 380}
338 381
339void ieee80211_key_link(struct ieee80211_key *key, 382int ieee80211_key_link(struct ieee80211_key *key,
340 struct ieee80211_sub_if_data *sdata, 383 struct ieee80211_sub_if_data *sdata,
341 struct sta_info *sta) 384 struct sta_info *sta)
342{ 385{
343 struct ieee80211_key *old_key; 386 struct ieee80211_key *old_key;
344 int idx; 387 int idx, ret;
388 bool pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE;
345 389
346 BUG_ON(!sdata); 390 BUG_ON(!sdata);
347 BUG_ON(!key); 391 BUG_ON(!key);
@@ -358,13 +402,6 @@ void ieee80211_key_link(struct ieee80211_key *key,
358 */ 402 */
359 if (test_sta_flags(sta, WLAN_STA_WME)) 403 if (test_sta_flags(sta, WLAN_STA_WME))
360 key->conf.flags |= IEEE80211_KEY_FLAG_WMM_STA; 404 key->conf.flags |= IEEE80211_KEY_FLAG_WMM_STA;
361
362 /*
363 * This key is for a specific sta interface,
364 * inform the driver that it should try to store
365 * this key as pairwise key.
366 */
367 key->conf.flags |= IEEE80211_KEY_FLAG_PAIRWISE;
368 } else { 405 } else {
369 if (sdata->vif.type == NL80211_IFTYPE_STATION) { 406 if (sdata->vif.type == NL80211_IFTYPE_STATION) {
370 struct sta_info *ap; 407 struct sta_info *ap;
@@ -386,19 +423,23 @@ void ieee80211_key_link(struct ieee80211_key *key,
386 423
387 mutex_lock(&sdata->local->key_mtx); 424 mutex_lock(&sdata->local->key_mtx);
388 425
389 if (sta) 426 if (sta && pairwise)
390 old_key = sta->key; 427 old_key = sta->ptk;
428 else if (sta)
429 old_key = sta->gtk[idx];
391 else 430 else
392 old_key = sdata->keys[idx]; 431 old_key = sdata->keys[idx];
393 432
394 __ieee80211_key_replace(sdata, sta, old_key, key); 433 __ieee80211_key_replace(sdata, sta, pairwise, old_key, key);
395 __ieee80211_key_destroy(old_key); 434 __ieee80211_key_destroy(old_key);
396 435
397 ieee80211_debugfs_key_add(key); 436 ieee80211_debugfs_key_add(key);
398 437
399 ieee80211_key_enable_hw_accel(key); 438 ret = ieee80211_key_enable_hw_accel(key);
400 439
401 mutex_unlock(&sdata->local->key_mtx); 440 mutex_unlock(&sdata->local->key_mtx);
441
442 return ret;
402} 443}
403 444
404static void __ieee80211_key_free(struct ieee80211_key *key) 445static void __ieee80211_key_free(struct ieee80211_key *key)
@@ -408,7 +449,8 @@ static void __ieee80211_key_free(struct ieee80211_key *key)
408 */ 449 */
409 if (key->sdata) 450 if (key->sdata)
410 __ieee80211_key_replace(key->sdata, key->sta, 451 __ieee80211_key_replace(key->sdata, key->sta,
411 key, NULL); 452 key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE,
453 key, NULL);
412 __ieee80211_key_destroy(key); 454 __ieee80211_key_destroy(key);
413} 455}
414 456
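
The key.c hunks split per-station keys into a single pairwise slot (sta->ptk) and an array of group keys (sta->gtk[idx]) selected by the new pairwise flag. The sketch below models only that slot-selection logic; plain pointers stand in for the kernel's RCU-protected ones, and all names are illustrative.

/* Pairwise keys replace the PTK slot; group keys land in gtk[] by key index. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define NUM_DEFAULT_KEYS 4

struct key {
	int keyidx;
	bool pairwise;
};

struct station {
	struct key *ptk;                      /* pairwise key */
	struct key *gtk[NUM_DEFAULT_KEYS];    /* group keys, by index */
};

/* Returns the displaced key so the caller can free it. */
static struct key *key_replace(struct station *sta, struct key *new)
{
	struct key *old;

	if (new->pairwise) {
		old = sta->ptk;
		sta->ptk = new;
	} else {
		old = sta->gtk[new->keyidx];
		sta->gtk[new->keyidx] = new;
	}
	return old;
}

int main(void)
{
	struct station sta = { 0 };
	struct key ptk = { .keyidx = 0, .pairwise = true };
	struct key gtk1 = { .keyidx = 1, .pairwise = false };

	key_replace(&sta, &ptk);
	key_replace(&sta, &gtk1);
	printf("ptk idx=%d, gtk[1] idx=%d\n", sta.ptk->keyidx, sta.gtk[1]->keyidx);
	return 0;
}
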
diff --git a/net/mac80211/key.h b/net/mac80211/key.h
index b665bbb7a471..0db1c0f5f697 100644
--- a/net/mac80211/key.h
+++ b/net/mac80211/key.h
@@ -16,6 +16,9 @@
16#include <linux/rcupdate.h> 16#include <linux/rcupdate.h>
17#include <net/mac80211.h> 17#include <net/mac80211.h>
18 18
19#define NUM_DEFAULT_KEYS 4
20#define NUM_DEFAULT_MGMT_KEYS 2
21
19#define WEP_IV_LEN 4 22#define WEP_IV_LEN 4
20#define WEP_ICV_LEN 4 23#define WEP_ICV_LEN 4
21#define ALG_TKIP_KEY_LEN 32 24#define ALG_TKIP_KEY_LEN 32
@@ -123,18 +126,16 @@ struct ieee80211_key {
123 struct ieee80211_key_conf conf; 126 struct ieee80211_key_conf conf;
124}; 127};
125 128
126struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, 129struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len,
127 int idx,
128 size_t key_len,
129 const u8 *key_data, 130 const u8 *key_data,
130 size_t seq_len, const u8 *seq); 131 size_t seq_len, const u8 *seq);
131/* 132/*
132 * Insert a key into data structures (sdata, sta if necessary) 133 * Insert a key into data structures (sdata, sta if necessary)
133 * to make it used, free old key. 134 * to make it used, free old key.
134 */ 135 */
135void ieee80211_key_link(struct ieee80211_key *key, 136int __must_check ieee80211_key_link(struct ieee80211_key *key,
136 struct ieee80211_sub_if_data *sdata, 137 struct ieee80211_sub_if_data *sdata,
137 struct sta_info *sta); 138 struct sta_info *sta);
138void ieee80211_key_free(struct ieee80211_local *local, 139void ieee80211_key_free(struct ieee80211_local *local,
139 struct ieee80211_key *key); 140 struct ieee80211_key *key);
140void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx); 141void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx);
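
With this header change, ieee80211_key_alloc() reports failures through error pointers rather than NULL, and ieee80211_key_link() gains a __must_check int return. A small user-space model of the error-pointer convention follows; the ERR_PTR/IS_ERR/PTR_ERR helpers mimic the kernel ones for illustration, and example_key_alloc is a made-up stand-in, not mac80211 code.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct example_key { int idx; };

static struct example_key *example_key_alloc(int idx)
{
	struct example_key *key;

	if (idx < 0)
		return ERR_PTR(-EINVAL);   /* invalid request, distinct from ENOMEM */

	key = calloc(1, sizeof(*key));
	if (!key)
		return ERR_PTR(-ENOMEM);

	key->idx = idx;
	return key;
}

int main(void)
{
	struct example_key *key = example_key_alloc(-1);

	if (IS_ERR(key)) {
		fprintf(stderr, "alloc failed: %ld\n", PTR_ERR(key));
		return 1;
	}
	free(key);
	return 0;
}
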
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 798a91b100cc..107a0cbe52ac 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -54,6 +54,9 @@ void ieee80211_configure_filter(struct ieee80211_local *local)
54 if (local->monitors || local->scanning) 54 if (local->monitors || local->scanning)
55 new_flags |= FIF_BCN_PRBRESP_PROMISC; 55 new_flags |= FIF_BCN_PRBRESP_PROMISC;
56 56
57 if (local->fif_probe_req || local->probe_req_reg)
58 new_flags |= FIF_PROBE_REQ;
59
57 if (local->fif_fcsfail) 60 if (local->fif_fcsfail)
58 new_flags |= FIF_FCSFAIL; 61 new_flags |= FIF_FCSFAIL;
59 62
@@ -99,16 +102,19 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed)
99 int ret = 0; 102 int ret = 0;
100 int power; 103 int power;
101 enum nl80211_channel_type channel_type; 104 enum nl80211_channel_type channel_type;
105 u32 offchannel_flag;
102 106
103 might_sleep(); 107 might_sleep();
104 108
105 scan_chan = local->scan_channel; 109 scan_chan = local->scan_channel;
106 110
111 offchannel_flag = local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL;
107 if (scan_chan) { 112 if (scan_chan) {
108 chan = scan_chan; 113 chan = scan_chan;
109 channel_type = NL80211_CHAN_NO_HT; 114 channel_type = NL80211_CHAN_NO_HT;
110 local->hw.conf.flags |= IEEE80211_CONF_OFFCHANNEL; 115 local->hw.conf.flags |= IEEE80211_CONF_OFFCHANNEL;
111 } else if (local->tmp_channel) { 116 } else if (local->tmp_channel &&
117 local->oper_channel != local->tmp_channel) {
112 chan = scan_chan = local->tmp_channel; 118 chan = scan_chan = local->tmp_channel;
113 channel_type = local->tmp_channel_type; 119 channel_type = local->tmp_channel_type;
114 local->hw.conf.flags |= IEEE80211_CONF_OFFCHANNEL; 120 local->hw.conf.flags |= IEEE80211_CONF_OFFCHANNEL;
@@ -117,8 +123,9 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed)
117 channel_type = local->_oper_channel_type; 123 channel_type = local->_oper_channel_type;
118 local->hw.conf.flags &= ~IEEE80211_CONF_OFFCHANNEL; 124 local->hw.conf.flags &= ~IEEE80211_CONF_OFFCHANNEL;
119 } 125 }
126 offchannel_flag ^= local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL;
120 127
121 if (chan != local->hw.conf.channel || 128 if (offchannel_flag || chan != local->hw.conf.channel ||
122 channel_type != local->hw.conf.channel_type) { 129 channel_type != local->hw.conf.channel_type) {
123 local->hw.conf.channel = chan; 130 local->hw.conf.channel = chan;
124 local->hw.conf.channel_type = channel_type; 131 local->hw.conf.channel_type = channel_type;
@@ -197,6 +204,8 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
197 sdata->vif.bss_conf.bssid = sdata->u.ibss.bssid; 204 sdata->vif.bss_conf.bssid = sdata->u.ibss.bssid;
198 else if (sdata->vif.type == NL80211_IFTYPE_AP) 205 else if (sdata->vif.type == NL80211_IFTYPE_AP)
199 sdata->vif.bss_conf.bssid = sdata->vif.addr; 206 sdata->vif.bss_conf.bssid = sdata->vif.addr;
207 else if (sdata->vif.type == NL80211_IFTYPE_WDS)
208 sdata->vif.bss_conf.bssid = NULL;
200 else if (ieee80211_vif_is_mesh(&sdata->vif)) { 209 else if (ieee80211_vif_is_mesh(&sdata->vif)) {
201 sdata->vif.bss_conf.bssid = zero; 210 sdata->vif.bss_conf.bssid = zero;
202 } else { 211 } else {
@@ -207,6 +216,7 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
207 switch (sdata->vif.type) { 216 switch (sdata->vif.type) {
208 case NL80211_IFTYPE_AP: 217 case NL80211_IFTYPE_AP:
209 case NL80211_IFTYPE_ADHOC: 218 case NL80211_IFTYPE_ADHOC:
219 case NL80211_IFTYPE_WDS:
210 case NL80211_IFTYPE_MESH_POINT: 220 case NL80211_IFTYPE_MESH_POINT:
211 break; 221 break;
212 default: 222 default:
@@ -291,7 +301,16 @@ static void ieee80211_restart_work(struct work_struct *work)
291 struct ieee80211_local *local = 301 struct ieee80211_local *local =
292 container_of(work, struct ieee80211_local, restart_work); 302 container_of(work, struct ieee80211_local, restart_work);
293 303
304 /* wait for scan work complete */
305 flush_workqueue(local->workqueue);
306
307 mutex_lock(&local->mtx);
308 WARN(test_bit(SCAN_HW_SCANNING, &local->scanning),
309 "%s called with hardware scan in progress\n", __func__);
310 mutex_unlock(&local->mtx);
311
294 rtnl_lock(); 312 rtnl_lock();
313 ieee80211_scan_cancel(local);
295 ieee80211_reconfig(local); 314 ieee80211_reconfig(local);
296 rtnl_unlock(); 315 rtnl_unlock();
297} 316}
@@ -302,7 +321,7 @@ void ieee80211_restart_hw(struct ieee80211_hw *hw)
302 321
303 trace_api_restart_hw(local); 322 trace_api_restart_hw(local);
304 323
305 /* use this reason, __ieee80211_resume will unblock it */ 324 /* use this reason, ieee80211_reconfig will unblock it */
306 ieee80211_stop_queues_by_reason(hw, 325 ieee80211_stop_queues_by_reason(hw,
307 IEEE80211_QUEUE_STOP_REASON_SUSPEND); 326 IEEE80211_QUEUE_STOP_REASON_SUSPEND);
308 327
@@ -316,7 +335,7 @@ static void ieee80211_recalc_smps_work(struct work_struct *work)
316 container_of(work, struct ieee80211_local, recalc_smps); 335 container_of(work, struct ieee80211_local, recalc_smps);
317 336
318 mutex_lock(&local->iflist_mtx); 337 mutex_lock(&local->iflist_mtx);
319 ieee80211_recalc_smps(local, NULL); 338 ieee80211_recalc_smps(local);
320 mutex_unlock(&local->iflist_mtx); 339 mutex_unlock(&local->iflist_mtx);
321} 340}
322 341
@@ -336,9 +355,6 @@ static int ieee80211_ifa_changed(struct notifier_block *nb,
336 struct ieee80211_if_managed *ifmgd; 355 struct ieee80211_if_managed *ifmgd;
337 int c = 0; 356 int c = 0;
338 357
339 if (!netif_running(ndev))
340 return NOTIFY_DONE;
341
342 /* Make sure it's our interface that got changed */ 358 /* Make sure it's our interface that got changed */
343 if (!wdev) 359 if (!wdev)
344 return NOTIFY_DONE; 360 return NOTIFY_DONE;
@@ -349,11 +365,14 @@ static int ieee80211_ifa_changed(struct notifier_block *nb,
349 sdata = IEEE80211_DEV_TO_SUB_IF(ndev); 365 sdata = IEEE80211_DEV_TO_SUB_IF(ndev);
350 bss_conf = &sdata->vif.bss_conf; 366 bss_conf = &sdata->vif.bss_conf;
351 367
368 if (!ieee80211_sdata_running(sdata))
369 return NOTIFY_DONE;
370
352 /* ARP filtering is only supported in managed mode */ 371 /* ARP filtering is only supported in managed mode */
353 if (sdata->vif.type != NL80211_IFTYPE_STATION) 372 if (sdata->vif.type != NL80211_IFTYPE_STATION)
354 return NOTIFY_DONE; 373 return NOTIFY_DONE;
355 374
356 idev = sdata->dev->ip_ptr; 375 idev = __in_dev_get_rtnl(sdata->dev);
357 if (!idev) 376 if (!idev)
358 return NOTIFY_DONE; 377 return NOTIFY_DONE;
359 378
@@ -390,6 +409,80 @@ static int ieee80211_ifa_changed(struct notifier_block *nb,
390} 409}
391#endif 410#endif
392 411
412static int ieee80211_napi_poll(struct napi_struct *napi, int budget)
413{
414 struct ieee80211_local *local =
415 container_of(napi, struct ieee80211_local, napi);
416
417 return local->ops->napi_poll(&local->hw, budget);
418}
419
420void ieee80211_napi_schedule(struct ieee80211_hw *hw)
421{
422 struct ieee80211_local *local = hw_to_local(hw);
423
424 napi_schedule(&local->napi);
425}
426EXPORT_SYMBOL(ieee80211_napi_schedule);
427
428void ieee80211_napi_complete(struct ieee80211_hw *hw)
429{
430 struct ieee80211_local *local = hw_to_local(hw);
431
432 napi_complete(&local->napi);
433}
434EXPORT_SYMBOL(ieee80211_napi_complete);
435
436/* There isn't a lot of sense in it, but you can transmit anything you like */
437static const struct ieee80211_txrx_stypes
438ieee80211_default_mgmt_stypes[NUM_NL80211_IFTYPES] = {
439 [NL80211_IFTYPE_ADHOC] = {
440 .tx = 0xffff,
441 .rx = BIT(IEEE80211_STYPE_ACTION >> 4),
442 },
443 [NL80211_IFTYPE_STATION] = {
444 .tx = 0xffff,
445 .rx = BIT(IEEE80211_STYPE_ACTION >> 4) |
446 BIT(IEEE80211_STYPE_PROBE_REQ >> 4),
447 },
448 [NL80211_IFTYPE_AP] = {
449 .tx = 0xffff,
450 .rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) |
451 BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) |
452 BIT(IEEE80211_STYPE_PROBE_REQ >> 4) |
453 BIT(IEEE80211_STYPE_DISASSOC >> 4) |
454 BIT(IEEE80211_STYPE_AUTH >> 4) |
455 BIT(IEEE80211_STYPE_DEAUTH >> 4) |
456 BIT(IEEE80211_STYPE_ACTION >> 4),
457 },
458 [NL80211_IFTYPE_AP_VLAN] = {
459 /* copy AP */
460 .tx = 0xffff,
461 .rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) |
462 BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) |
463 BIT(IEEE80211_STYPE_PROBE_REQ >> 4) |
464 BIT(IEEE80211_STYPE_DISASSOC >> 4) |
465 BIT(IEEE80211_STYPE_AUTH >> 4) |
466 BIT(IEEE80211_STYPE_DEAUTH >> 4) |
467 BIT(IEEE80211_STYPE_ACTION >> 4),
468 },
469 [NL80211_IFTYPE_P2P_CLIENT] = {
470 .tx = 0xffff,
471 .rx = BIT(IEEE80211_STYPE_ACTION >> 4) |
472 BIT(IEEE80211_STYPE_PROBE_REQ >> 4),
473 },
474 [NL80211_IFTYPE_P2P_GO] = {
475 .tx = 0xffff,
476 .rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) |
477 BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) |
478 BIT(IEEE80211_STYPE_PROBE_REQ >> 4) |
479 BIT(IEEE80211_STYPE_DISASSOC >> 4) |
480 BIT(IEEE80211_STYPE_AUTH >> 4) |
481 BIT(IEEE80211_STYPE_DEAUTH >> 4) |
482 BIT(IEEE80211_STYPE_ACTION >> 4),
483 },
484};
485
393struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, 486struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
394 const struct ieee80211_ops *ops) 487 const struct ieee80211_ops *ops)
395{ 488{
@@ -419,6 +512,8 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
419 if (!wiphy) 512 if (!wiphy)
420 return NULL; 513 return NULL;
421 514
515 wiphy->mgmt_stypes = ieee80211_default_mgmt_stypes;
516
422 wiphy->flags |= WIPHY_FLAG_NETNS_OK | 517 wiphy->flags |= WIPHY_FLAG_NETNS_OK |
423 WIPHY_FLAG_4ADDR_AP | 518 WIPHY_FLAG_4ADDR_AP |
424 WIPHY_FLAG_4ADDR_STATION; 519 WIPHY_FLAG_4ADDR_STATION;
@@ -444,6 +539,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
444 /* set up some defaults */ 539 /* set up some defaults */
445 local->hw.queues = 1; 540 local->hw.queues = 1;
446 local->hw.max_rates = 1; 541 local->hw.max_rates = 1;
542 local->hw.max_report_rates = 0;
447 local->hw.conf.long_frame_max_tx_count = wiphy->retry_long; 543 local->hw.conf.long_frame_max_tx_count = wiphy->retry_long;
448 local->hw.conf.short_frame_max_tx_count = wiphy->retry_short; 544 local->hw.conf.short_frame_max_tx_count = wiphy->retry_short;
449 local->user_power_level = -1; 545 local->user_power_level = -1;
@@ -455,7 +551,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
455 __hw_addr_init(&local->mc_list); 551 __hw_addr_init(&local->mc_list);
456 552
457 mutex_init(&local->iflist_mtx); 553 mutex_init(&local->iflist_mtx);
458 mutex_init(&local->scan_mtx); 554 mutex_init(&local->mtx);
459 555
460 mutex_init(&local->key_mtx); 556 mutex_init(&local->key_mtx);
461 spin_lock_init(&local->filter_lock); 557 spin_lock_init(&local->filter_lock);
@@ -494,6 +590,9 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
494 skb_queue_head_init(&local->skb_queue); 590 skb_queue_head_init(&local->skb_queue);
495 skb_queue_head_init(&local->skb_queue_unreliable); 591 skb_queue_head_init(&local->skb_queue_unreliable);
496 592
593 /* init dummy netdev for use w/ NAPI */
594 init_dummy_netdev(&local->napi_dev);
595
497 return local_to_hw(local); 596 return local_to_hw(local);
498} 597}
499EXPORT_SYMBOL(ieee80211_alloc_hw); 598EXPORT_SYMBOL(ieee80211_alloc_hw);
@@ -506,6 +605,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
506 int channels, max_bitrates; 605 int channels, max_bitrates;
507 bool supp_ht; 606 bool supp_ht;
508 static const u32 cipher_suites[] = { 607 static const u32 cipher_suites[] = {
608 /* keep WEP first, it may be removed below */
509 WLAN_CIPHER_SUITE_WEP40, 609 WLAN_CIPHER_SUITE_WEP40,
510 WLAN_CIPHER_SUITE_WEP104, 610 WLAN_CIPHER_SUITE_WEP104,
511 WLAN_CIPHER_SUITE_TKIP, 611 WLAN_CIPHER_SUITE_TKIP,
@@ -515,6 +615,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
515 WLAN_CIPHER_SUITE_AES_CMAC 615 WLAN_CIPHER_SUITE_AES_CMAC
516 }; 616 };
517 617
618 if (hw->max_report_rates == 0)
619 hw->max_report_rates = hw->max_rates;
620
518 /* 621 /*
519 * generic code guarantees at least one band, 622 * generic code guarantees at least one band,
520 * set this very early because much code assumes 623 * set this very early because much code assumes
@@ -554,6 +657,14 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
554 /* mac80211 always supports monitor */ 657 /* mac80211 always supports monitor */
555 local->hw.wiphy->interface_modes |= BIT(NL80211_IFTYPE_MONITOR); 658 local->hw.wiphy->interface_modes |= BIT(NL80211_IFTYPE_MONITOR);
556 659
660#ifndef CONFIG_MAC80211_MESH
661 /* mesh depends on Kconfig, but drivers should set it if they want */
662 local->hw.wiphy->interface_modes &= ~BIT(NL80211_IFTYPE_MESH_POINT);
663#endif
664
665 /* mac80211 supports control port protocol changing */
666 local->hw.wiphy->flags |= WIPHY_FLAG_CONTROL_PORT_PROTOCOL;
667
557 if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) 668 if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
558 local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM; 669 local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM;
559 else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) 670 else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)
@@ -566,10 +677,11 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
566 /* 677 /*
567 * Calculate scan IE length -- we need this to alloc 678 * Calculate scan IE length -- we need this to alloc
568 * memory and to subtract from the driver limit. It 679 * memory and to subtract from the driver limit. It
569 * includes the (extended) supported rates and HT 680 * includes the DS Params, (extended) supported rates, and HT
570 * information -- SSID is the driver's responsibility. 681 * information -- SSID is the driver's responsibility.
571 */ 682 */
572 local->scan_ies_len = 4 + max_bitrates; /* (ext) supp rates */ 683 local->scan_ies_len = 4 + max_bitrates /* (ext) supp rates */ +
684 3 /* DS Params */;
573 if (supp_ht) 685 if (supp_ht)
574 local->scan_ies_len += 2 + sizeof(struct ieee80211_ht_cap); 686 local->scan_ies_len += 2 + sizeof(struct ieee80211_ht_cap);
575 687
@@ -589,10 +701,41 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
589 if (local->hw.wiphy->max_scan_ie_len) 701 if (local->hw.wiphy->max_scan_ie_len)
590 local->hw.wiphy->max_scan_ie_len -= local->scan_ies_len; 702 local->hw.wiphy->max_scan_ie_len -= local->scan_ies_len;
591 703
592 local->hw.wiphy->cipher_suites = cipher_suites; 704 /* Set up cipher suites unless driver already did */
593 local->hw.wiphy->n_cipher_suites = ARRAY_SIZE(cipher_suites); 705 if (!local->hw.wiphy->cipher_suites) {
594 if (!(local->hw.flags & IEEE80211_HW_MFP_CAPABLE)) 706 local->hw.wiphy->cipher_suites = cipher_suites;
595 local->hw.wiphy->n_cipher_suites--; 707 local->hw.wiphy->n_cipher_suites = ARRAY_SIZE(cipher_suites);
708 if (!(local->hw.flags & IEEE80211_HW_MFP_CAPABLE))
709 local->hw.wiphy->n_cipher_suites--;
710 }
711 if (IS_ERR(local->wep_tx_tfm) || IS_ERR(local->wep_rx_tfm)) {
712 if (local->hw.wiphy->cipher_suites == cipher_suites) {
713 local->hw.wiphy->cipher_suites += 2;
714 local->hw.wiphy->n_cipher_suites -= 2;
715 } else {
716 u32 *suites;
717 int r, w = 0;
718
719 /* Filter out WEP */
720
721 suites = kmemdup(
722 local->hw.wiphy->cipher_suites,
723 sizeof(u32) * local->hw.wiphy->n_cipher_suites,
724 GFP_KERNEL);
725 if (!suites)
726 return -ENOMEM;
727 for (r = 0; r < local->hw.wiphy->n_cipher_suites; r++) {
728 u32 suite = local->hw.wiphy->cipher_suites[r];
729 if (suite == WLAN_CIPHER_SUITE_WEP40 ||
730 suite == WLAN_CIPHER_SUITE_WEP104)
731 continue;
732 suites[w++] = suite;
733 }
734 local->hw.wiphy->cipher_suites = suites;
735 local->hw.wiphy->n_cipher_suites = w;
736 local->wiphy_ciphers_allocated = true;
737 }
738 }
596 739
597 result = wiphy_register(local->hw.wiphy); 740 result = wiphy_register(local->hw.wiphy);
598 if (result < 0) 741 if (result < 0)
@@ -606,7 +749,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
606 hw->queues = IEEE80211_MAX_QUEUES; 749 hw->queues = IEEE80211_MAX_QUEUES;
607 750
608 local->workqueue = 751 local->workqueue =
609 create_singlethread_workqueue(wiphy_name(local->hw.wiphy)); 752 alloc_ordered_workqueue(wiphy_name(local->hw.wiphy), 0);
610 if (!local->workqueue) { 753 if (!local->workqueue) {
611 result = -ENOMEM; 754 result = -ENOMEM;
612 goto fail_workqueue; 755 goto fail_workqueue;
@@ -641,16 +784,16 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
641 784
642 result = ieee80211_wep_init(local); 785 result = ieee80211_wep_init(local);
643 if (result < 0) 786 if (result < 0)
644 printk(KERN_DEBUG "%s: Failed to initialize wep: %d\n", 787 wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n",
645 wiphy_name(local->hw.wiphy), result); 788 result);
646 789
647 rtnl_lock(); 790 rtnl_lock();
648 791
649 result = ieee80211_init_rate_ctrl_alg(local, 792 result = ieee80211_init_rate_ctrl_alg(local,
650 hw->rate_control_algorithm); 793 hw->rate_control_algorithm);
651 if (result < 0) { 794 if (result < 0) {
652 printk(KERN_DEBUG "%s: Failed to initialize rate control " 795 wiphy_debug(local->hw.wiphy,
653 "algorithm\n", wiphy_name(local->hw.wiphy)); 796 "Failed to initialize rate control algorithm\n");
654 goto fail_rate; 797 goto fail_rate;
655 } 798 }
656 799
@@ -659,8 +802,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
659 result = ieee80211_if_add(local, "wlan%d", NULL, 802 result = ieee80211_if_add(local, "wlan%d", NULL,
660 NL80211_IFTYPE_STATION, NULL); 803 NL80211_IFTYPE_STATION, NULL);
661 if (result) 804 if (result)
662 printk(KERN_WARNING "%s: Failed to add default virtual iface\n", 805 wiphy_warn(local->hw.wiphy,
663 wiphy_name(local->hw.wiphy)); 806 "Failed to add default virtual iface\n");
664 } 807 }
665 808
666 rtnl_unlock(); 809 rtnl_unlock();
@@ -683,6 +826,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
683 goto fail_ifa; 826 goto fail_ifa;
684#endif 827#endif
685 828
829 netif_napi_add(&local->napi_dev, &local->napi, ieee80211_napi_poll,
830 local->hw.napi_weight);
831
686 return 0; 832 return 0;
687 833
688#ifdef CONFIG_INET 834#ifdef CONFIG_INET
@@ -703,6 +849,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
703 fail_workqueue: 849 fail_workqueue:
704 wiphy_unregister(local->hw.wiphy); 850 wiphy_unregister(local->hw.wiphy);
705 fail_wiphy_register: 851 fail_wiphy_register:
852 if (local->wiphy_ciphers_allocated)
853 kfree(local->hw.wiphy->cipher_suites);
706 kfree(local->int_scan_req); 854 kfree(local->int_scan_req);
707 return result; 855 return result;
708} 856}
@@ -732,6 +880,13 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw)
732 880
733 rtnl_unlock(); 881 rtnl_unlock();
734 882
883 /*
884 * Now all work items will be gone, but the
885 * timer might still be armed, so delete it
886 */
887 del_timer_sync(&local->work_timer);
888
889 cancel_work_sync(&local->restart_work);
735 cancel_work_sync(&local->reconfig_filter); 890 cancel_work_sync(&local->reconfig_filter);
736 891
737 ieee80211_clear_tx_pending(local); 892 ieee80211_clear_tx_pending(local);
@@ -740,8 +895,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw)
740 895
741 if (skb_queue_len(&local->skb_queue) || 896 if (skb_queue_len(&local->skb_queue) ||
742 skb_queue_len(&local->skb_queue_unreliable)) 897 skb_queue_len(&local->skb_queue_unreliable))
743 printk(KERN_WARNING "%s: skb_queue not empty\n", 898 wiphy_warn(local->hw.wiphy, "skb_queue not empty\n");
744 wiphy_name(local->hw.wiphy));
745 skb_queue_purge(&local->skb_queue); 899 skb_queue_purge(&local->skb_queue);
746 skb_queue_purge(&local->skb_queue_unreliable); 900 skb_queue_purge(&local->skb_queue_unreliable);
747 901
@@ -758,7 +912,10 @@ void ieee80211_free_hw(struct ieee80211_hw *hw)
758 struct ieee80211_local *local = hw_to_local(hw); 912 struct ieee80211_local *local = hw_to_local(hw);
759 913
760 mutex_destroy(&local->iflist_mtx); 914 mutex_destroy(&local->iflist_mtx);
761 mutex_destroy(&local->scan_mtx); 915 mutex_destroy(&local->mtx);
916
917 if (local->wiphy_ciphers_allocated)
918 kfree(local->hw.wiphy->cipher_suites);
762 919
763 wiphy_free(local->hw.wiphy); 920 wiphy_free(local->hw.wiphy);
764} 921}
@@ -806,12 +963,6 @@ static void __exit ieee80211_exit(void)
806 rc80211_minstrel_ht_exit(); 963 rc80211_minstrel_ht_exit();
807 rc80211_minstrel_exit(); 964 rc80211_minstrel_exit();
808 965
809 /*
810 * For key todo, it'll be empty by now but the work
811 * might still be scheduled.
812 */
813 flush_scheduled_work();
814
815 if (mesh_allocated) 966 if (mesh_allocated)
816 ieee80211s_stop(); 967 ieee80211s_stop();
817 968
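Note on the cipher-suite handling above: ieee80211_register_hw() now leaves a driver-provided cipher suite list untouched and, when the WEP crypto transforms could not be allocated, strips WEP40/WEP104 from whatever list ends up advertised (by skipping the first two entries of the default table, or by duplicating and filtering a driver list with kmemdup). The following is a stand-alone model of that filter step only, written as plain user-space C; the CIPHER_* constants mirror the standard 00-0F-AC suite selectors, and the kernel helpers are replaced by malloc/printf for illustration.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* cipher suite selectors (OUI 00-0F-AC), as used by cfg80211 */
#define CIPHER_WEP40   0x000FAC01u
#define CIPHER_TKIP    0x000FAC02u
#define CIPHER_CCMP    0x000FAC04u
#define CIPHER_WEP104  0x000FAC05u

/*
 * Return a newly allocated copy of 'suites' with the WEP entries removed,
 * mirroring the duplicate-and-filter path above; '*n' is updated to the
 * filtered count.  Returns NULL on allocation failure.
 */
static uint32_t *filter_out_wep(const uint32_t *suites, int *n)
{
        uint32_t *copy = malloc(sizeof(*copy) * *n);
        int r, w = 0;

        if (!copy)
                return NULL;

        for (r = 0; r < *n; r++) {
                if (suites[r] == CIPHER_WEP40 || suites[r] == CIPHER_WEP104)
                        continue;       /* drop WEP when it cannot be used */
                copy[w++] = suites[r];
        }

        *n = w;
        return copy;
}

int main(void)
{
        uint32_t suites[] = { CIPHER_WEP40, CIPHER_WEP104,
                              CIPHER_TKIP, CIPHER_CCMP };
        int n = 4, i;
        uint32_t *filtered = filter_out_wep(suites, &n);

        for (i = 0; filtered && i < n; i++)
                printf("suite %d: %08x\n", i, filtered[i]);
        free(filtered);
        return 0;
}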
diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
index ebd3f1d9d889..58e741128968 100644
--- a/net/mac80211/mesh.h
+++ b/net/mac80211/mesh.h
@@ -115,7 +115,7 @@ struct mesh_path {
115 * @hash_rnd: random value used for hash computations 115 * @hash_rnd: random value used for hash computations
116 * @entries: number of entries in the table 116 * @entries: number of entries in the table
117 * @free_node: function to free nodes of the table 117 * @free_node: function to free nodes of the table
118 * @copy_node: fuction to copy nodes of the table 118 * @copy_node: function to copy nodes of the table
119 * @size_order: determines size of the table, there will be 2^size_order hash 119 * @size_order: determines size of the table, there will be 2^size_order hash
120 * buckets 120 * buckets
121 * @mean_chain_len: maximum average length for the hash buckets' list, if it is 121 * @mean_chain_len: maximum average length for the hash buckets' list, if it is
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index ea13a80a476c..1c91f0f3c307 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -412,7 +412,7 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
412 enum plink_event event; 412 enum plink_event event;
413 enum plink_frame_type ftype; 413 enum plink_frame_type ftype;
414 size_t baselen; 414 size_t baselen;
415 bool deactivated; 415 bool deactivated, matches_local = true;
416 u8 ie_len; 416 u8 ie_len;
417 u8 *baseaddr; 417 u8 *baseaddr;
418 __le16 plid, llid, reason; 418 __le16 plid, llid, reason;
@@ -487,6 +487,7 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
487 /* Now we will figure out the appropriate event... */ 487 /* Now we will figure out the appropriate event... */
488 event = PLINK_UNDEFINED; 488 event = PLINK_UNDEFINED;
489 if (ftype != PLINK_CLOSE && (!mesh_matches_local(&elems, sdata))) { 489 if (ftype != PLINK_CLOSE && (!mesh_matches_local(&elems, sdata))) {
490 matches_local = false;
490 switch (ftype) { 491 switch (ftype) {
491 case PLINK_OPEN: 492 case PLINK_OPEN:
492 event = OPN_RJCT; 493 event = OPN_RJCT;
@@ -498,7 +499,15 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
498 /* avoid warning */ 499 /* avoid warning */
499 break; 500 break;
500 } 501 }
501 spin_lock_bh(&sta->lock); 502 }
503
504 if (!sta && !matches_local) {
505 rcu_read_unlock();
506 reason = cpu_to_le16(MESH_CAPABILITY_POLICY_VIOLATION);
507 llid = 0;
508 mesh_plink_frame_tx(sdata, PLINK_CLOSE, mgmt->sa, llid,
509 plid, reason);
510 return;
502 } else if (!sta) { 511 } else if (!sta) {
503 /* ftype == PLINK_OPEN */ 512 /* ftype == PLINK_OPEN */
504 u32 rates; 513 u32 rates;
@@ -522,7 +531,7 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
522 } 531 }
523 event = OPN_ACPT; 532 event = OPN_ACPT;
524 spin_lock_bh(&sta->lock); 533 spin_lock_bh(&sta->lock);
525 } else { 534 } else if (matches_local) {
526 spin_lock_bh(&sta->lock); 535 spin_lock_bh(&sta->lock);
527 switch (ftype) { 536 switch (ftype) {
528 case PLINK_OPEN: 537 case PLINK_OPEN:
@@ -564,6 +573,8 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
564 rcu_read_unlock(); 573 rcu_read_unlock();
565 return; 574 return;
566 } 575 }
576 } else {
577 spin_lock_bh(&sta->lock);
567 } 578 }
568 579
569 mpl_dbg("Mesh plink (peer, state, llid, plid, event): %pM %s %d %d %d\n", 580 mpl_dbg("Mesh plink (peer, state, llid, plid, event): %pM %s %d %d %d\n",
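The mesh_plink.c hunks above track whether an incoming peering frame matches the local mesh configuration: on a mismatch with no existing station entry, the node now answers directly with a PLINK_CLOSE carrying MESH_CAPABILITY_POLICY_VIOLATION instead of falling into the state machine. A compact stand-alone sketch of that decision follows; the enum values are illustrative only, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

enum plink_frame  { PLINK_OPEN, PLINK_CONFIRM, PLINK_CLOSE };
enum plink_action {
        RUN_STATE_MACHINE,              /* normal processing */
        REJECT_IN_STATE_MACHINE,        /* OPN_RJCT / CNF_RJCT events */
        SEND_CLOSE_POLICY_VIOLATION     /* new short-circuit in this hunk */
};

/* classify an incoming peering frame the way the hunk above does */
static enum plink_action classify(enum plink_frame ftype,
                                  bool matches_local, bool have_sta)
{
        if (ftype == PLINK_CLOSE || matches_local)
                return RUN_STATE_MACHINE;
        if (!have_sta)
                return SEND_CLOSE_POLICY_VIOLATION;
        return REJECT_IN_STATE_MACHINE;
}

int main(void)
{
        printf("unknown peer, mismatching OPEN -> action %d\n",
               classify(PLINK_OPEN, false, false));
        return 0;
}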
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index b6c163ac22da..a3a9421555af 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -54,6 +54,12 @@
54 */ 54 */
55#define IEEE80211_SIGNAL_AVE_WEIGHT 3 55#define IEEE80211_SIGNAL_AVE_WEIGHT 3
56 56
57/*
58 * How many Beacon frames need to have been used in average signal strength
59 * before starting to indicate signal change events.
60 */
61#define IEEE80211_SIGNAL_AVE_MIN_COUNT 4
62
57#define TMR_RUNNING_TIMER 0 63#define TMR_RUNNING_TIMER 0
58#define TMR_RUNNING_CHANSW 1 64#define TMR_RUNNING_CHANSW 1
59 65
@@ -86,7 +92,7 @@ enum rx_mgmt_action {
86/* utils */ 92/* utils */
87static inline void ASSERT_MGD_MTX(struct ieee80211_if_managed *ifmgd) 93static inline void ASSERT_MGD_MTX(struct ieee80211_if_managed *ifmgd)
88{ 94{
89 WARN_ON(!mutex_is_locked(&ifmgd->mtx)); 95 lockdep_assert_held(&ifmgd->mtx);
90} 96}
91 97
92/* 98/*
@@ -109,7 +115,7 @@ static void run_again(struct ieee80211_if_managed *ifmgd,
109 mod_timer(&ifmgd->timer, timeout); 115 mod_timer(&ifmgd->timer, timeout);
110} 116}
111 117
112static void mod_beacon_timer(struct ieee80211_sub_if_data *sdata) 118void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata)
113{ 119{
114 if (sdata->local->hw.flags & IEEE80211_HW_BEACON_FILTER) 120 if (sdata->local->hw.flags & IEEE80211_HW_BEACON_FILTER)
115 return; 121 return;
@@ -118,6 +124,19 @@ static void mod_beacon_timer(struct ieee80211_sub_if_data *sdata)
118 round_jiffies_up(jiffies + IEEE80211_BEACON_LOSS_TIME)); 124 round_jiffies_up(jiffies + IEEE80211_BEACON_LOSS_TIME));
119} 125}
120 126
127void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata)
128{
129 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
130
131 if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR)
132 return;
133
134 mod_timer(&sdata->u.mgd.conn_mon_timer,
135 round_jiffies_up(jiffies + IEEE80211_CONNECTION_IDLE_TIME));
136
137 ifmgd->probe_send_count = 0;
138}
139
121static int ecw2cw(int ecw) 140static int ecw2cw(int ecw)
122{ 141{
123 return (1 << ecw) - 1; 142 return (1 << ecw) - 1;
@@ -778,16 +797,17 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
778 params.uapsd = uapsd; 797 params.uapsd = uapsd;
779 798
780#ifdef CONFIG_MAC80211_VERBOSE_DEBUG 799#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
781 printk(KERN_DEBUG "%s: WMM queue=%d aci=%d acm=%d aifs=%d " 800 wiphy_debug(local->hw.wiphy,
782 "cWmin=%d cWmax=%d txop=%d uapsd=%d\n", 801 "WMM queue=%d aci=%d acm=%d aifs=%d "
783 wiphy_name(local->hw.wiphy), queue, aci, acm, 802 "cWmin=%d cWmax=%d txop=%d uapsd=%d\n",
784 params.aifs, params.cw_min, params.cw_max, params.txop, 803 queue, aci, acm,
785 params.uapsd); 804 params.aifs, params.cw_min, params.cw_max,
805 params.txop, params.uapsd);
786#endif 806#endif
787 if (drv_conf_tx(local, queue, &params)) 807 if (drv_conf_tx(local, queue, &params))
788 printk(KERN_DEBUG "%s: failed to set TX queue " 808 wiphy_debug(local->hw.wiphy,
789 "parameters for queue %d\n", 809 "failed to set TX queue parameters for queue %d\n",
790 wiphy_name(local->hw.wiphy), queue); 810 queue);
791 } 811 }
792 812
793 /* enable WMM or activate new settings */ 813 /* enable WMM or activate new settings */
@@ -860,14 +880,6 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
860 sdata->u.mgd.flags &= ~(IEEE80211_STA_CONNECTION_POLL | 880 sdata->u.mgd.flags &= ~(IEEE80211_STA_CONNECTION_POLL |
861 IEEE80211_STA_BEACON_POLL); 881 IEEE80211_STA_BEACON_POLL);
862 882
863 /*
864 * Always handle WMM once after association regardless
865 * of the first value the AP uses. Setting -1 here has
866 * that effect because the AP values is an unsigned
867 * 4-bit value.
868 */
869 sdata->u.mgd.wmm_last_param_set = -1;
870
871 ieee80211_led_assoc(local, 1); 883 ieee80211_led_assoc(local, 1);
872 884
873 if (local->hw.flags & IEEE80211_HW_NEED_DTIM_PERIOD) 885 if (local->hw.flags & IEEE80211_HW_NEED_DTIM_PERIOD)
@@ -901,7 +913,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
901 913
902 mutex_lock(&local->iflist_mtx); 914 mutex_lock(&local->iflist_mtx);
903 ieee80211_recalc_ps(local, -1); 915 ieee80211_recalc_ps(local, -1);
904 ieee80211_recalc_smps(local, sdata); 916 ieee80211_recalc_smps(local);
905 mutex_unlock(&local->iflist_mtx); 917 mutex_unlock(&local->iflist_mtx);
906 918
907 netif_tx_start_all_queues(sdata->dev); 919 netif_tx_start_all_queues(sdata->dev);
@@ -909,7 +921,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
909} 921}
910 922
911static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, 923static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
912 bool remove_sta) 924 bool remove_sta, bool tx)
913{ 925{
914 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; 926 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
915 struct ieee80211_local *local = sdata->local; 927 struct ieee80211_local *local = sdata->local;
@@ -948,7 +960,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
948 sta = sta_info_get(sdata, bssid); 960 sta = sta_info_get(sdata, bssid);
949 if (sta) { 961 if (sta) {
950 set_sta_flags(sta, WLAN_STA_BLOCK_BA); 962 set_sta_flags(sta, WLAN_STA_BLOCK_BA);
951 ieee80211_sta_tear_down_BA_sessions(sta); 963 ieee80211_sta_tear_down_BA_sessions(sta, tx);
952 } 964 }
953 mutex_unlock(&local->sta_mtx); 965 mutex_unlock(&local->sta_mtx);
954 966
@@ -990,6 +1002,11 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
990 1002
991 if (remove_sta) 1003 if (remove_sta)
992 sta_info_destroy_addr(sdata, bssid); 1004 sta_info_destroy_addr(sdata, bssid);
1005
1006 del_timer_sync(&sdata->u.mgd.conn_mon_timer);
1007 del_timer_sync(&sdata->u.mgd.bcn_mon_timer);
1008 del_timer_sync(&sdata->u.mgd.timer);
1009 del_timer_sync(&sdata->u.mgd.chswitch_timer);
993} 1010}
994 1011
995void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata, 1012void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata,
@@ -1006,21 +1023,26 @@ void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata,
1006 if (is_multicast_ether_addr(hdr->addr1)) 1023 if (is_multicast_ether_addr(hdr->addr1))
1007 return; 1024 return;
1008 1025
1009 if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) 1026 ieee80211_sta_reset_conn_monitor(sdata);
1010 return;
1011
1012 mod_timer(&sdata->u.mgd.conn_mon_timer,
1013 round_jiffies_up(jiffies + IEEE80211_CONNECTION_IDLE_TIME));
1014} 1027}
1015 1028
1016static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) 1029static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
1017{ 1030{
1018 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; 1031 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
1019 const u8 *ssid; 1032 const u8 *ssid;
1033 u8 *dst = ifmgd->associated->bssid;
1034 u8 unicast_limit = max(1, IEEE80211_MAX_PROBE_TRIES - 3);
1035
1036 /*
1037 * Try sending broadcast probe requests for the last three
1038 * probe requests after the first ones failed since some
1039 * buggy APs only support broadcast probe requests.
1040 */
1041 if (ifmgd->probe_send_count >= unicast_limit)
1042 dst = NULL;
1020 1043
1021 ssid = ieee80211_bss_get_ie(ifmgd->associated, WLAN_EID_SSID); 1044 ssid = ieee80211_bss_get_ie(ifmgd->associated, WLAN_EID_SSID);
1022 ieee80211_send_probe_req(sdata, ifmgd->associated->bssid, 1045 ieee80211_send_probe_req(sdata, dst, ssid + 2, ssid[1], NULL, 0);
1023 ssid + 2, ssid[1], NULL, 0);
1024 1046
1025 ifmgd->probe_send_count++; 1047 ifmgd->probe_send_count++;
1026 ifmgd->probe_timeout = jiffies + IEEE80211_PROBE_WAIT; 1048 ifmgd->probe_timeout = jiffies + IEEE80211_PROBE_WAIT;
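The probe hunk above makes ieee80211_mgd_probe_ap_send() fall back to broadcast probe requests for the last few retries: dst becomes NULL once probe_send_count reaches max(1, IEEE80211_MAX_PROBE_TRIES - 3), because some buggy APs answer only broadcast probes. A tiny stand-alone model of that retry policy is below; the MAX_PROBE_TRIES value is a stand-in for the kernel constant, not taken from it.

#include <stddef.h>
#include <stdio.h>

#define MAX_PROBE_TRIES 5       /* stand-in for IEEE80211_MAX_PROBE_TRIES */

/*
 * Pick the probe request destination for a given retry count: unicast to
 * the AP for the first attempts, NULL (broadcast) for the last three.
 */
static const char *probe_dst(const char *bssid, int probe_send_count)
{
        int unicast_limit = MAX_PROBE_TRIES - 3;

        if (unicast_limit < 1)
                unicast_limit = 1;      /* always allow one unicast try */

        return probe_send_count >= unicast_limit ? NULL : bssid;
}

int main(void)
{
        int attempt;

        for (attempt = 0; attempt < MAX_PROBE_TRIES; attempt++)
                printf("try %d -> %s\n", attempt,
                       probe_dst("00:11:22:33:44:55", attempt) ?
                       "unicast" : "broadcast");
        return 0;
}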
@@ -1102,9 +1124,12 @@ static void __ieee80211_connection_loss(struct ieee80211_sub_if_data *sdata)
1102 1124
1103 printk(KERN_DEBUG "Connection to AP %pM lost.\n", bssid); 1125 printk(KERN_DEBUG "Connection to AP %pM lost.\n", bssid);
1104 1126
1105 ieee80211_set_disassoc(sdata, true); 1127 ieee80211_set_disassoc(sdata, true, true);
1106 ieee80211_recalc_idle(local);
1107 mutex_unlock(&ifmgd->mtx); 1128 mutex_unlock(&ifmgd->mtx);
1129
1130 mutex_lock(&local->mtx);
1131 ieee80211_recalc_idle(local);
1132 mutex_unlock(&local->mtx);
1108 /* 1133 /*
1109 * must be outside lock due to cfg80211, 1134 * must be outside lock due to cfg80211,
1110 * but that's not a problem. 1135 * but that's not a problem.
@@ -1172,8 +1197,10 @@ ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata,
1172 printk(KERN_DEBUG "%s: deauthenticated from %pM (Reason: %u)\n", 1197 printk(KERN_DEBUG "%s: deauthenticated from %pM (Reason: %u)\n",
1173 sdata->name, bssid, reason_code); 1198 sdata->name, bssid, reason_code);
1174 1199
1175 ieee80211_set_disassoc(sdata, true); 1200 ieee80211_set_disassoc(sdata, true, false);
1201 mutex_lock(&sdata->local->mtx);
1176 ieee80211_recalc_idle(sdata->local); 1202 ieee80211_recalc_idle(sdata->local);
1203 mutex_unlock(&sdata->local->mtx);
1177 1204
1178 return RX_MGMT_CFG80211_DEAUTH; 1205 return RX_MGMT_CFG80211_DEAUTH;
1179} 1206}
@@ -1202,8 +1229,10 @@ ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata,
1202 printk(KERN_DEBUG "%s: disassociated from %pM (Reason: %u)\n", 1229 printk(KERN_DEBUG "%s: disassociated from %pM (Reason: %u)\n",
1203 sdata->name, mgmt->sa, reason_code); 1230 sdata->name, mgmt->sa, reason_code);
1204 1231
1205 ieee80211_set_disassoc(sdata, true); 1232 ieee80211_set_disassoc(sdata, true, false);
1233 mutex_lock(&sdata->local->mtx);
1206 ieee80211_recalc_idle(sdata->local); 1234 ieee80211_recalc_idle(sdata->local);
1235 mutex_unlock(&sdata->local->mtx);
1207 return RX_MGMT_CFG80211_DISASSOC; 1236 return RX_MGMT_CFG80211_DISASSOC;
1208} 1237}
1209 1238
@@ -1262,7 +1291,7 @@ static bool ieee80211_assoc_success(struct ieee80211_work *wk,
1262 1291
1263 rates = 0; 1292 rates = 0;
1264 basic_rates = 0; 1293 basic_rates = 0;
1265 sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; 1294 sband = local->hw.wiphy->bands[wk->chan->band];
1266 1295
1267 for (i = 0; i < elems.supp_rates_len; i++) { 1296 for (i = 0; i < elems.supp_rates_len; i++) {
1268 int rate = (elems.supp_rates[i] & 0x7f) * 5; 1297 int rate = (elems.supp_rates[i] & 0x7f) * 5;
@@ -1298,11 +1327,11 @@ static bool ieee80211_assoc_success(struct ieee80211_work *wk,
1298 } 1327 }
1299 } 1328 }
1300 1329
1301 sta->sta.supp_rates[local->hw.conf.channel->band] = rates; 1330 sta->sta.supp_rates[wk->chan->band] = rates;
1302 sdata->vif.bss_conf.basic_rates = basic_rates; 1331 sdata->vif.bss_conf.basic_rates = basic_rates;
1303 1332
1304 /* cf. IEEE 802.11 9.2.12 */ 1333 /* cf. IEEE 802.11 9.2.12 */
1305 if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ && 1334 if (wk->chan->band == IEEE80211_BAND_2GHZ &&
1306 have_higher_than_11mbit) 1335 have_higher_than_11mbit)
1307 sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE; 1336 sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE;
1308 else 1337 else
@@ -1330,6 +1359,14 @@ static bool ieee80211_assoc_success(struct ieee80211_work *wk,
1330 return false; 1359 return false;
1331 } 1360 }
1332 1361
1362 /*
1363 * Always handle WMM once after association regardless
1364 * of the first value the AP uses. Setting -1 here has
1365 * that effect because the AP values is an unsigned
1366 * 4-bit value.
1367 */
1368 ifmgd->wmm_last_param_set = -1;
1369
1333 if (elems.wmm_param) 1370 if (elems.wmm_param)
1334 ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, 1371 ieee80211_sta_wmm_params(local, sdata, elems.wmm_param,
1335 elems.wmm_param_len); 1372 elems.wmm_param_len);
@@ -1362,7 +1399,7 @@ static bool ieee80211_assoc_success(struct ieee80211_work *wk,
1362 * Also start the timer that will detect beacon loss. 1399 * Also start the timer that will detect beacon loss.
1363 */ 1400 */
1364 ieee80211_sta_rx_notify(sdata, (struct ieee80211_hdr *)mgmt); 1401 ieee80211_sta_rx_notify(sdata, (struct ieee80211_hdr *)mgmt);
1365 mod_beacon_timer(sdata); 1402 ieee80211_sta_reset_beacon_monitor(sdata);
1366 1403
1367 return true; 1404 return true;
1368} 1405}
@@ -1465,7 +1502,7 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
1465 * we have or will be receiving any beacons or data, so let's 1502 * we have or will be receiving any beacons or data, so let's
1466 * schedule the timers again, just in case. 1503 * schedule the timers again, just in case.
1467 */ 1504 */
1468 mod_beacon_timer(sdata); 1505 ieee80211_sta_reset_beacon_monitor(sdata);
1469 1506
1470 mod_timer(&ifmgd->conn_mon_timer, 1507 mod_timer(&ifmgd->conn_mon_timer,
1471 round_jiffies_up(jiffies + 1508 round_jiffies_up(jiffies +
@@ -1540,15 +1577,18 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
1540 ifmgd->last_beacon_signal = rx_status->signal; 1577 ifmgd->last_beacon_signal = rx_status->signal;
1541 if (ifmgd->flags & IEEE80211_STA_RESET_SIGNAL_AVE) { 1578 if (ifmgd->flags & IEEE80211_STA_RESET_SIGNAL_AVE) {
1542 ifmgd->flags &= ~IEEE80211_STA_RESET_SIGNAL_AVE; 1579 ifmgd->flags &= ~IEEE80211_STA_RESET_SIGNAL_AVE;
1543 ifmgd->ave_beacon_signal = rx_status->signal; 1580 ifmgd->ave_beacon_signal = rx_status->signal * 16;
1544 ifmgd->last_cqm_event_signal = 0; 1581 ifmgd->last_cqm_event_signal = 0;
1582 ifmgd->count_beacon_signal = 1;
1545 } else { 1583 } else {
1546 ifmgd->ave_beacon_signal = 1584 ifmgd->ave_beacon_signal =
1547 (IEEE80211_SIGNAL_AVE_WEIGHT * rx_status->signal * 16 + 1585 (IEEE80211_SIGNAL_AVE_WEIGHT * rx_status->signal * 16 +
1548 (16 - IEEE80211_SIGNAL_AVE_WEIGHT) * 1586 (16 - IEEE80211_SIGNAL_AVE_WEIGHT) *
1549 ifmgd->ave_beacon_signal) / 16; 1587 ifmgd->ave_beacon_signal) / 16;
1588 ifmgd->count_beacon_signal++;
1550 } 1589 }
1551 if (bss_conf->cqm_rssi_thold && 1590 if (bss_conf->cqm_rssi_thold &&
1591 ifmgd->count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT &&
1552 !(local->hw.flags & IEEE80211_HW_SUPPORTS_CQM_RSSI)) { 1592 !(local->hw.flags & IEEE80211_HW_SUPPORTS_CQM_RSSI)) {
1553 int sig = ifmgd->ave_beacon_signal / 16; 1593 int sig = ifmgd->ave_beacon_signal / 16;
1554 int last_event = ifmgd->last_cqm_event_signal; 1594 int last_event = ifmgd->last_cqm_event_signal;
@@ -1588,7 +1628,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
1588 * Push the beacon loss detection into the future since 1628 * Push the beacon loss detection into the future since
1589 * we are processing a beacon from the AP just now. 1629 * we are processing a beacon from the AP just now.
1590 */ 1630 */
1591 mod_beacon_timer(sdata); 1631 ieee80211_sta_reset_beacon_monitor(sdata);
1592 1632
1593 ncrc = crc32_be(0, (void *)&mgmt->u.beacon.beacon_int, 4); 1633 ncrc = crc32_be(0, (void *)&mgmt->u.beacon.beacon_int, 4);
1594 ncrc = ieee802_11_parse_elems_crc(mgmt->u.beacon.variable, 1634 ncrc = ieee802_11_parse_elems_crc(mgmt->u.beacon.variable,
@@ -1599,7 +1639,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
1599 directed_tim = ieee80211_check_tim(elems.tim, elems.tim_len, 1639 directed_tim = ieee80211_check_tim(elems.tim, elems.tim_len,
1600 ifmgd->aid); 1640 ifmgd->aid);
1601 1641
1602 if (ncrc != ifmgd->beacon_crc) { 1642 if (ncrc != ifmgd->beacon_crc || !ifmgd->beacon_crc_valid) {
1603 ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, 1643 ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems,
1604 true); 1644 true);
1605 1645
@@ -1630,9 +1670,10 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
1630 } 1670 }
1631 } 1671 }
1632 1672
1633 if (ncrc == ifmgd->beacon_crc) 1673 if (ncrc == ifmgd->beacon_crc && ifmgd->beacon_crc_valid)
1634 return; 1674 return;
1635 ifmgd->beacon_crc = ncrc; 1675 ifmgd->beacon_crc = ncrc;
1676 ifmgd->beacon_crc_valid = true;
1636 1677
1637 if (elems.erp_info && elems.erp_info_len >= 1) { 1678 if (elems.erp_info && elems.erp_info_len >= 1) {
1638 erp_valid = true; 1679 erp_valid = true;
@@ -1751,7 +1792,7 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
1751 struct ieee80211_local *local = sdata->local; 1792 struct ieee80211_local *local = sdata->local;
1752 struct ieee80211_work *wk; 1793 struct ieee80211_work *wk;
1753 1794
1754 mutex_lock(&local->work_mtx); 1795 mutex_lock(&local->mtx);
1755 list_for_each_entry(wk, &local->work_list, list) { 1796 list_for_each_entry(wk, &local->work_list, list) {
1756 if (wk->sdata != sdata) 1797 if (wk->sdata != sdata)
1757 continue; 1798 continue;
@@ -1783,7 +1824,7 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
1783 free_work(wk); 1824 free_work(wk);
1784 break; 1825 break;
1785 } 1826 }
1786 mutex_unlock(&local->work_mtx); 1827 mutex_unlock(&local->mtx);
1787 1828
1788 cfg80211_send_deauth(sdata->dev, (u8 *)mgmt, skb->len); 1829 cfg80211_send_deauth(sdata->dev, (u8 *)mgmt, skb->len);
1789 } 1830 }
@@ -1823,10 +1864,12 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata)
1823 1864
1824 else if (ifmgd->probe_send_count < IEEE80211_MAX_PROBE_TRIES) { 1865 else if (ifmgd->probe_send_count < IEEE80211_MAX_PROBE_TRIES) {
1825#ifdef CONFIG_MAC80211_VERBOSE_DEBUG 1866#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
1826 printk(KERN_DEBUG "No probe response from AP %pM" 1867 wiphy_debug(local->hw.wiphy,
1827 " after %dms, try %d\n", bssid, 1868 "%s: No probe response from AP %pM"
1828 (1000 * IEEE80211_PROBE_WAIT)/HZ, 1869 " after %dms, try %d\n",
1829 ifmgd->probe_send_count); 1870 sdata->name,
1871 bssid, (1000 * IEEE80211_PROBE_WAIT)/HZ,
1872 ifmgd->probe_send_count);
1830#endif 1873#endif
1831 ieee80211_mgd_probe_ap_send(sdata); 1874 ieee80211_mgd_probe_ap_send(sdata);
1832 } else { 1875 } else {
@@ -1836,12 +1879,16 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata)
1836 */ 1879 */
1837 ifmgd->flags &= ~(IEEE80211_STA_CONNECTION_POLL | 1880 ifmgd->flags &= ~(IEEE80211_STA_CONNECTION_POLL |
1838 IEEE80211_STA_BEACON_POLL); 1881 IEEE80211_STA_BEACON_POLL);
1839 printk(KERN_DEBUG "No probe response from AP %pM" 1882 wiphy_debug(local->hw.wiphy,
1840 " after %dms, disconnecting.\n", 1883 "%s: No probe response from AP %pM"
1841 bssid, (1000 * IEEE80211_PROBE_WAIT)/HZ); 1884 " after %dms, disconnecting.\n",
1842 ieee80211_set_disassoc(sdata, true); 1885 sdata->name,
1843 ieee80211_recalc_idle(local); 1886 bssid, (1000 * IEEE80211_PROBE_WAIT)/HZ);
1887 ieee80211_set_disassoc(sdata, true, true);
1844 mutex_unlock(&ifmgd->mtx); 1888 mutex_unlock(&ifmgd->mtx);
1889 mutex_lock(&local->mtx);
1890 ieee80211_recalc_idle(local);
1891 mutex_unlock(&local->mtx);
1845 /* 1892 /*
1846 * must be outside lock due to cfg80211, 1893 * must be outside lock due to cfg80211,
1847 * but that's not a problem. 1894 * but that's not a problem.
@@ -1917,6 +1964,8 @@ void ieee80211_sta_quiesce(struct ieee80211_sub_if_data *sdata)
1917 * time -- the code here is properly synchronised. 1964 * time -- the code here is properly synchronised.
1918 */ 1965 */
1919 1966
1967 cancel_work_sync(&ifmgd->request_smps_work);
1968
1920 cancel_work_sync(&ifmgd->beacon_connection_loss_work); 1969 cancel_work_sync(&ifmgd->beacon_connection_loss_work);
1921 if (del_timer_sync(&ifmgd->timer)) 1970 if (del_timer_sync(&ifmgd->timer))
1922 set_bit(TMR_RUNNING_TIMER, &ifmgd->timers_running); 1971 set_bit(TMR_RUNNING_TIMER, &ifmgd->timers_running);
@@ -1952,6 +2001,7 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata)
1952 INIT_WORK(&ifmgd->chswitch_work, ieee80211_chswitch_work); 2001 INIT_WORK(&ifmgd->chswitch_work, ieee80211_chswitch_work);
1953 INIT_WORK(&ifmgd->beacon_connection_loss_work, 2002 INIT_WORK(&ifmgd->beacon_connection_loss_work,
1954 ieee80211_beacon_connection_loss_work); 2003 ieee80211_beacon_connection_loss_work);
2004 INIT_WORK(&ifmgd->request_smps_work, ieee80211_request_smps_work);
1955 setup_timer(&ifmgd->timer, ieee80211_sta_timer, 2005 setup_timer(&ifmgd->timer, ieee80211_sta_timer,
1956 (unsigned long) sdata); 2006 (unsigned long) sdata);
1957 setup_timer(&ifmgd->bcn_mon_timer, ieee80211_sta_bcn_mon_timer, 2007 setup_timer(&ifmgd->bcn_mon_timer, ieee80211_sta_bcn_mon_timer,
@@ -2158,7 +2208,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
2158 } 2208 }
2159 2209
2160 /* Trying to reassociate - clear previous association state */ 2210 /* Trying to reassociate - clear previous association state */
2161 ieee80211_set_disassoc(sdata, true); 2211 ieee80211_set_disassoc(sdata, true, false);
2162 } 2212 }
2163 mutex_unlock(&ifmgd->mtx); 2213 mutex_unlock(&ifmgd->mtx);
2164 2214
@@ -2169,6 +2219,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
2169 ifmgd->flags &= ~IEEE80211_STA_DISABLE_11N; 2219 ifmgd->flags &= ~IEEE80211_STA_DISABLE_11N;
2170 ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED; 2220 ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED;
2171 2221
2222 ifmgd->beacon_crc_valid = false;
2223
2172 for (i = 0; i < req->crypto.n_ciphers_pairwise; i++) 2224 for (i = 0; i < req->crypto.n_ciphers_pairwise; i++)
2173 if (req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP40 || 2225 if (req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP40 ||
2174 req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_TKIP || 2226 req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_TKIP ||
@@ -2249,6 +2301,9 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
2249 else 2301 else
2250 ifmgd->flags &= ~IEEE80211_STA_CONTROL_PORT; 2302 ifmgd->flags &= ~IEEE80211_STA_CONTROL_PORT;
2251 2303
2304 sdata->control_port_protocol = req->crypto.control_port_ethertype;
2305 sdata->control_port_no_encrypt = req->crypto.control_port_no_encrypt;
2306
2252 ieee80211_add_work(wk); 2307 ieee80211_add_work(wk);
2253 return 0; 2308 return 0;
2254} 2309}
@@ -2267,7 +2322,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
2267 2322
2268 memcpy(bssid, req->bss->bssid, ETH_ALEN); 2323 memcpy(bssid, req->bss->bssid, ETH_ALEN);
2269 if (ifmgd->associated == req->bss) { 2324 if (ifmgd->associated == req->bss) {
2270 ieee80211_set_disassoc(sdata, false); 2325 ieee80211_set_disassoc(sdata, false, true);
2271 mutex_unlock(&ifmgd->mtx); 2326 mutex_unlock(&ifmgd->mtx);
2272 assoc_bss = true; 2327 assoc_bss = true;
2273 } else { 2328 } else {
@@ -2275,7 +2330,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
2275 2330
2276 mutex_unlock(&ifmgd->mtx); 2331 mutex_unlock(&ifmgd->mtx);
2277 2332
2278 mutex_lock(&local->work_mtx); 2333 mutex_lock(&local->mtx);
2279 list_for_each_entry(wk, &local->work_list, list) { 2334 list_for_each_entry(wk, &local->work_list, list) {
2280 if (wk->sdata != sdata) 2335 if (wk->sdata != sdata)
2281 continue; 2336 continue;
@@ -2294,7 +2349,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
2294 free_work(wk); 2349 free_work(wk);
2295 break; 2350 break;
2296 } 2351 }
2297 mutex_unlock(&local->work_mtx); 2352 mutex_unlock(&local->mtx);
2298 2353
2299 /* 2354 /*
2300 * If somebody requests authentication and we haven't 2355 * If somebody requests authentication and we haven't
@@ -2319,7 +2374,9 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
2319 if (assoc_bss) 2374 if (assoc_bss)
2320 sta_info_destroy_addr(sdata, bssid); 2375 sta_info_destroy_addr(sdata, bssid);
2321 2376
2377 mutex_lock(&sdata->local->mtx);
2322 ieee80211_recalc_idle(sdata->local); 2378 ieee80211_recalc_idle(sdata->local);
2379 mutex_unlock(&sdata->local->mtx);
2323 2380
2324 return 0; 2381 return 0;
2325} 2382}
@@ -2348,7 +2405,7 @@ int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata,
2348 sdata->name, req->bss->bssid, req->reason_code); 2405 sdata->name, req->bss->bssid, req->reason_code);
2349 2406
2350 memcpy(bssid, req->bss->bssid, ETH_ALEN); 2407 memcpy(bssid, req->bss->bssid, ETH_ALEN);
2351 ieee80211_set_disassoc(sdata, false); 2408 ieee80211_set_disassoc(sdata, false, true);
2352 2409
2353 mutex_unlock(&ifmgd->mtx); 2410 mutex_unlock(&ifmgd->mtx);
2354 2411
@@ -2357,7 +2414,9 @@ int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata,
2357 cookie, !req->local_state_change); 2414 cookie, !req->local_state_change);
2358 sta_info_destroy_addr(sdata, bssid); 2415 sta_info_destroy_addr(sdata, bssid);
2359 2416
2417 mutex_lock(&sdata->local->mtx);
2360 ieee80211_recalc_idle(sdata->local); 2418 ieee80211_recalc_idle(sdata->local);
2419 mutex_unlock(&sdata->local->mtx);
2361 2420
2362 return 0; 2421 return 0;
2363} 2422}
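The beacon-handling changes in mlme.c keep ave_beacon_signal as a fixed-point value scaled by 16 and add IEEE80211_SIGNAL_AVE_MIN_COUNT, so CQM RSSI events are only generated once at least four beacons have contributed to the average. A minimal stand-alone version of that weighted average and gating, using the same 3-out-of-16 weighting, could read as follows (struct and function names are illustrative).

#include <stdbool.h>
#include <stdio.h>

#define SIGNAL_AVE_WEIGHT       3   /* weight of the newest beacon, out of 16 */
#define SIGNAL_AVE_MIN_COUNT    4   /* beacons required before reporting events */

struct signal_avg {
        int ave;        /* running average, scaled by 16 for fixed-point math */
        int count;      /* beacons folded into the average so far */
};

static void signal_avg_reset(struct signal_avg *s, int signal)
{
        s->ave = signal * 16;
        s->count = 1;
}

static void signal_avg_update(struct signal_avg *s, int signal)
{
        s->ave = (SIGNAL_AVE_WEIGHT * signal * 16 +
                  (16 - SIGNAL_AVE_WEIGHT) * s->ave) / 16;
        s->count++;
}

/* mirror the minimum-count gate applied before any CQM-style event */
static bool signal_avg_ready(const struct signal_avg *s)
{
        return s->count >= SIGNAL_AVE_MIN_COUNT;
}

int main(void)
{
        struct signal_avg s;
        int samples[] = { -60, -62, -65, -70, -72 };
        int i;

        signal_avg_reset(&s, samples[0]);
        for (i = 1; i < 5; i++) {
                signal_avg_update(&s, samples[i]);
                printf("beacon %d: ave=%d dBm, report=%d\n",
                       i + 1, s.ave / 16, signal_avg_ready(&s));
        }
        return 0;
}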
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index c36b1911987a..4b564091e51d 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -22,12 +22,16 @@
22static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata) 22static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata)
23{ 23{
24 struct ieee80211_local *local = sdata->local; 24 struct ieee80211_local *local = sdata->local;
25 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
25 26
26 local->offchannel_ps_enabled = false; 27 local->offchannel_ps_enabled = false;
27 28
28 /* FIXME: what to do when local->pspolling is true? */ 29 /* FIXME: what to do when local->pspolling is true? */
29 30
30 del_timer_sync(&local->dynamic_ps_timer); 31 del_timer_sync(&local->dynamic_ps_timer);
32 del_timer_sync(&ifmgd->bcn_mon_timer);
33 del_timer_sync(&ifmgd->conn_mon_timer);
34
31 cancel_work_sync(&local->dynamic_ps_enable_work); 35 cancel_work_sync(&local->dynamic_ps_enable_work);
32 36
33 if (local->hw.conf.flags & IEEE80211_CONF_PS) { 37 if (local->hw.conf.flags & IEEE80211_CONF_PS) {
@@ -85,6 +89,9 @@ static void ieee80211_offchannel_ps_disable(struct ieee80211_sub_if_data *sdata)
85 mod_timer(&local->dynamic_ps_timer, jiffies + 89 mod_timer(&local->dynamic_ps_timer, jiffies +
86 msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout)); 90 msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout));
87 } 91 }
92
93 ieee80211_sta_reset_beacon_monitor(sdata);
94 ieee80211_sta_reset_conn_monitor(sdata);
88} 95}
89 96
90void ieee80211_offchannel_stop_beaconing(struct ieee80211_local *local) 97void ieee80211_offchannel_stop_beaconing(struct ieee80211_local *local)
@@ -112,8 +119,10 @@ void ieee80211_offchannel_stop_beaconing(struct ieee80211_local *local)
112 * used from user space controlled off-channel operations. 119 * used from user space controlled off-channel operations.
113 */ 120 */
114 if (sdata->vif.type != NL80211_IFTYPE_STATION && 121 if (sdata->vif.type != NL80211_IFTYPE_STATION &&
115 sdata->vif.type != NL80211_IFTYPE_MONITOR) 122 sdata->vif.type != NL80211_IFTYPE_MONITOR) {
123 set_bit(SDATA_STATE_OFFCHANNEL, &sdata->state);
116 netif_tx_stop_all_queues(sdata->dev); 124 netif_tx_stop_all_queues(sdata->dev);
125 }
117 } 126 }
118 mutex_unlock(&local->iflist_mtx); 127 mutex_unlock(&local->iflist_mtx);
119} 128}
@@ -131,6 +140,7 @@ void ieee80211_offchannel_stop_station(struct ieee80211_local *local)
131 continue; 140 continue;
132 141
133 if (sdata->vif.type == NL80211_IFTYPE_STATION) { 142 if (sdata->vif.type == NL80211_IFTYPE_STATION) {
143 set_bit(SDATA_STATE_OFFCHANNEL, &sdata->state);
134 netif_tx_stop_all_queues(sdata->dev); 144 netif_tx_stop_all_queues(sdata->dev);
135 if (sdata->u.mgd.associated) 145 if (sdata->u.mgd.associated)
136 ieee80211_offchannel_ps_enable(sdata); 146 ieee80211_offchannel_ps_enable(sdata);
@@ -155,8 +165,20 @@ void ieee80211_offchannel_return(struct ieee80211_local *local,
155 ieee80211_offchannel_ps_disable(sdata); 165 ieee80211_offchannel_ps_disable(sdata);
156 } 166 }
157 167
158 if (sdata->vif.type != NL80211_IFTYPE_MONITOR) 168 if (sdata->vif.type != NL80211_IFTYPE_MONITOR) {
169 clear_bit(SDATA_STATE_OFFCHANNEL, &sdata->state);
170 /*
171 * This may wake up queues even though the driver
172 * currently has them stopped. This is not very
173 * likely, since the driver won't have gotten any
174 * (or hardly any) new packets while we weren't
175 * on the right channel, and even if it happens
176 * it will at most lead to queueing up one more
177 * packet per queue in mac80211 rather than on
178 * the interface qdisc.
179 */
159 netif_tx_wake_all_queues(sdata->dev); 180 netif_tx_wake_all_queues(sdata->dev);
181 }
160 182
161 /* re-enable beaconing */ 183 /* re-enable beaconing */
162 if (enable_beaconing && 184 if (enable_beaconing &&
diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c
index d287fde0431d..e37355193ed1 100644
--- a/net/mac80211/pm.c
+++ b/net/mac80211/pm.c
@@ -45,7 +45,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw)
45 list_for_each_entry(sta, &local->sta_list, list) { 45 list_for_each_entry(sta, &local->sta_list, list) {
46 if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) { 46 if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) {
47 set_sta_flags(sta, WLAN_STA_BLOCK_BA); 47 set_sta_flags(sta, WLAN_STA_BLOCK_BA);
48 ieee80211_sta_tear_down_BA_sessions(sta); 48 ieee80211_sta_tear_down_BA_sessions(sta, true);
49 } 49 }
50 50
51 if (sta->uploaded) { 51 if (sta->uploaded) {
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index be04d46110fe..33f76993da08 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -145,6 +145,7 @@ static ssize_t rcname_read(struct file *file, char __user *userbuf,
145static const struct file_operations rcname_ops = { 145static const struct file_operations rcname_ops = {
146 .read = rcname_read, 146 .read = rcname_read,
147 .open = mac80211_open_file_generic, 147 .open = mac80211_open_file_generic,
148 .llseek = default_llseek,
148}; 149};
149#endif 150#endif
150 151
@@ -207,7 +208,7 @@ static bool rc_no_data_or_no_ack(struct ieee80211_tx_rate_control *txrc)
207 208
208 fc = hdr->frame_control; 209 fc = hdr->frame_control;
209 210
210 return ((info->flags & IEEE80211_TX_CTL_NO_ACK) || !ieee80211_is_data(fc)); 211 return (info->flags & IEEE80211_TX_CTL_NO_ACK) || !ieee80211_is_data(fc);
211} 212}
212 213
213static void rc_send_low_broadcast(s8 *idx, u32 basic_rates, u8 max_rate_idx) 214static void rc_send_low_broadcast(s8 *idx, u32 basic_rates, u8 max_rate_idx)
@@ -328,6 +329,9 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata,
328 * if needed. 329 * if needed.
329 */ 330 */
330 for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { 331 for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
332 /* Skip invalid rates */
333 if (info->control.rates[i].idx < 0)
334 break;
331 /* Rate masking supports only legacy rates for now */ 335 /* Rate masking supports only legacy rates for now */
332 if (info->control.rates[i].flags & IEEE80211_TX_RC_MCS) 336 if (info->control.rates[i].flags & IEEE80211_TX_RC_MCS)
333 continue; 337 continue;
@@ -368,8 +372,8 @@ int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local,
368 372
369 ref = rate_control_alloc(name, local); 373 ref = rate_control_alloc(name, local);
370 if (!ref) { 374 if (!ref) {
371 printk(KERN_WARNING "%s: Failed to select rate control " 375 wiphy_warn(local->hw.wiphy,
372 "algorithm\n", wiphy_name(local->hw.wiphy)); 376 "Failed to select rate control algorithm\n");
373 return -ENOENT; 377 return -ENOENT;
374 } 378 }
375 379
@@ -380,9 +384,8 @@ int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local,
380 sta_info_flush(local, NULL); 384 sta_info_flush(local, NULL);
381 } 385 }
382 386
383 printk(KERN_DEBUG "%s: Selected rate control " 387 wiphy_debug(local->hw.wiphy, "Selected rate control algorithm '%s'\n",
384 "algorithm '%s'\n", wiphy_name(local->hw.wiphy), 388 ref->ops->name);
385 ref->ops->name);
386 389
387 return 0; 390 return 0;
388} 391}
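The rate-mask hunk above makes the loop bail out at the first entry with a negative idx, since a negative index marks the end of the valid rate list in info->control.rates[] and later slots carry no data; MCS entries are still skipped because masking only handles legacy rates for now. A toy scan showing that skip/stop pattern, with made-up types, might be:

#include <stdio.h>

#define TX_MAX_RATES    4
#define RC_MCS          0x01    /* stand-in for IEEE80211_TX_RC_MCS */

struct tx_rate {
        signed char idx;        /* negative index terminates the table */
        unsigned char flags;
};

/* walk a rate table as the masking loop does: stop at the first invalid
 * entry and leave MCS (HT) rates alone */
static int legacy_rates_subject_to_mask(const struct tx_rate *rates)
{
        int i, n = 0;

        for (i = 0; i < TX_MAX_RATES; i++) {
                if (rates[i].idx < 0)
                        break;          /* no more valid rates */
                if (rates[i].flags & RC_MCS)
                        continue;       /* masking is legacy-only for now */
                n++;
        }
        return n;
}

int main(void)
{
        struct tx_rate rates[TX_MAX_RATES] = {
                { 3, 0 }, { 1, 0 }, { -1, 0 }, { 0, 0 },
        };

        printf("%d legacy rate(s) would be checked against the mask\n",
               legacy_rates_subject_to_mask(rates));
        return 0;
}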
diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c
index 241e76f3fdf2..a290ad231d77 100644
--- a/net/mac80211/rc80211_minstrel_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_debugfs.c
@@ -122,6 +122,7 @@ static const struct file_operations minstrel_stat_fops = {
122 .open = minstrel_stats_open, 122 .open = minstrel_stats_open,
123 .read = minstrel_stats_read, 123 .read = minstrel_stats_read,
124 .release = minstrel_stats_release, 124 .release = minstrel_stats_release,
125 .llseek = default_llseek,
125}; 126};
126 127
127void 128void
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index c5b465904e3b..2a18d6602d4a 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -397,8 +397,9 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband,
397 !(info->flags & IEEE80211_TX_STAT_AMPDU)) 397 !(info->flags & IEEE80211_TX_STAT_AMPDU))
398 return; 398 return;
399 399
400 if (!info->status.ampdu_len) { 400 if (!(info->flags & IEEE80211_TX_STAT_AMPDU)) {
401 info->status.ampdu_ack_len = 1; 401 info->status.ampdu_ack_len =
402 (info->flags & IEEE80211_TX_STAT_ACK ? 1 : 0);
402 info->status.ampdu_len = 1; 403 info->status.ampdu_len = 1;
403 } 404 }
404 405
@@ -426,7 +427,7 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband,
426 group = minstrel_ht_get_group_idx(&ar[i]); 427 group = minstrel_ht_get_group_idx(&ar[i]);
427 rate = &mi->groups[group].rates[ar[i].idx % 8]; 428 rate = &mi->groups[group].rates[ar[i].idx % 8];
428 429
429 if (last && (info->flags & IEEE80211_TX_STAT_ACK)) 430 if (last)
430 rate->success += info->status.ampdu_ack_len; 431 rate->success += info->status.ampdu_ack_len;
431 432
432 rate->attempts += ar[i].count * info->status.ampdu_len; 433 rate->attempts += ar[i].count * info->status.ampdu_len;
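The minstrel_ht fix above normalizes non-aggregated tx status into the A-MPDU bookkeeping: a plain frame is treated as an aggregate of length 1 whose ampdu_ack_len is 1 or 0 depending on the ACK flag, so the per-rate success and attempt counters can be updated the same way in both cases, and an unacked single frame no longer inflates the success count. A condensed stand-alone version of that accounting is sketched below; the structure names are made up.

#include <stdbool.h>
#include <stdio.h>

struct tx_status {
        bool is_ampdu;          /* status carries real A-MPDU counts */
        bool acked;             /* single-frame ACK result */
        int ampdu_len;          /* frames in the aggregate */
        int ampdu_ack_len;      /* frames covered by the block ack */
};

struct rate_stats {
        unsigned int attempts;
        unsigned int success;
};

static void account_last_rate(struct rate_stats *rs,
                              struct tx_status *st, int tries)
{
        if (!st->is_ampdu) {
                /* fold a plain frame into the aggregate model */
                st->ampdu_len = 1;
                st->ampdu_ack_len = st->acked ? 1 : 0;
        }

        rs->success += st->ampdu_ack_len;       /* credited even when zero */
        rs->attempts += tries * st->ampdu_len;
}

int main(void)
{
        struct rate_stats rs = { 0, 0 };
        struct tx_status single = { false, false, 0, 0 };  /* lost plain frame */

        account_last_rate(&rs, &single, 2);
        printf("attempts=%u success=%u\n", rs.attempts, rs.success);
        return 0;
}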
diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c
index 4a5a4b3e7799..cefcb5d2dae6 100644
--- a/net/mac80211/rc80211_minstrel_ht_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c
@@ -90,7 +90,7 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file)
90 MINSTREL_TRUNC(mi->avg_ampdu_len * 10) % 10); 90 MINSTREL_TRUNC(mi->avg_ampdu_len * 10) % 10);
91 ms->len = p - ms->buf; 91 ms->len = p - ms->buf;
92 92
93 return 0; 93 return nonseekable_open(inode, file);
94} 94}
95 95
96static const struct file_operations minstrel_ht_stat_fops = { 96static const struct file_operations minstrel_ht_stat_fops = {
@@ -98,6 +98,7 @@ static const struct file_operations minstrel_ht_stat_fops = {
98 .open = minstrel_ht_stats_open, 98 .open = minstrel_ht_stats_open,
99 .read = minstrel_stats_read, 99 .read = minstrel_stats_read,
100 .release = minstrel_stats_release, 100 .release = minstrel_stats_release,
101 .llseek = no_llseek,
101}; 102};
102 103
103void 104void
diff --git a/net/mac80211/rc80211_pid_debugfs.c b/net/mac80211/rc80211_pid_debugfs.c
index 47438b4a9af5..4851e9e2daed 100644
--- a/net/mac80211/rc80211_pid_debugfs.c
+++ b/net/mac80211/rc80211_pid_debugfs.c
@@ -162,7 +162,7 @@ static ssize_t rate_control_pid_events_read(struct file *file, char __user *buf,
162 file_info->next_entry = (file_info->next_entry + 1) % 162 file_info->next_entry = (file_info->next_entry + 1) %
163 RC_PID_EVENT_RING_SIZE; 163 RC_PID_EVENT_RING_SIZE;
164 164
165 /* Print information about the event. Note that userpace needs to 165 /* Print information about the event. Note that userspace needs to
166 * provide large enough buffers. */ 166 * provide large enough buffers. */
167 length = length < RC_PID_PRINT_BUF_SIZE ? 167 length = length < RC_PID_PRINT_BUF_SIZE ?
168 length : RC_PID_PRINT_BUF_SIZE; 168 length : RC_PID_PRINT_BUF_SIZE;
@@ -206,6 +206,7 @@ static const struct file_operations rc_pid_fop_events = {
206 .poll = rate_control_pid_events_poll, 206 .poll = rate_control_pid_events_poll,
207 .open = rate_control_pid_events_open, 207 .open = rate_control_pid_events_open,
208 .release = rate_control_pid_events_release, 208 .release = rate_control_pid_events_release,
209 .llseek = noop_llseek,
209}; 210};
210 211
211void rate_control_pid_add_sta_debugfs(void *priv, void *priv_sta, 212void rate_control_pid_add_sta_debugfs(void *priv, void *priv_sta,
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index fa0f37e4afe4..902b03ee8f60 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -315,6 +315,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
315static void ieee80211_parse_qos(struct ieee80211_rx_data *rx) 315static void ieee80211_parse_qos(struct ieee80211_rx_data *rx)
316{ 316{
317 struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; 317 struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
318 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
318 int tid; 319 int tid;
319 320
320 /* does the frame have a qos control field? */ 321 /* does the frame have a qos control field? */
@@ -323,9 +324,7 @@ static void ieee80211_parse_qos(struct ieee80211_rx_data *rx)
323 /* frame has qos control */ 324 /* frame has qos control */
324 tid = *qc & IEEE80211_QOS_CTL_TID_MASK; 325 tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
325 if (*qc & IEEE80211_QOS_CONTROL_A_MSDU_PRESENT) 326 if (*qc & IEEE80211_QOS_CONTROL_A_MSDU_PRESENT)
326 rx->flags |= IEEE80211_RX_AMSDU; 327 status->rx_flags |= IEEE80211_RX_AMSDU;
327 else
328 rx->flags &= ~IEEE80211_RX_AMSDU;
329 } else { 328 } else {
330 /* 329 /*
331 * IEEE 802.11-2007, 7.1.3.4.1 ("Sequence Number field"): 330 * IEEE 802.11-2007, 7.1.3.4.1 ("Sequence Number field"):
@@ -387,26 +386,25 @@ static ieee80211_rx_result debug_noinline
387ieee80211_rx_h_passive_scan(struct ieee80211_rx_data *rx) 386ieee80211_rx_h_passive_scan(struct ieee80211_rx_data *rx)
388{ 387{
389 struct ieee80211_local *local = rx->local; 388 struct ieee80211_local *local = rx->local;
389 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
390 struct sk_buff *skb = rx->skb; 390 struct sk_buff *skb = rx->skb;
391 391
392 if (unlikely(test_bit(SCAN_HW_SCANNING, &local->scanning))) 392 if (likely(!(status->rx_flags & IEEE80211_RX_IN_SCAN)))
393 return RX_CONTINUE;
394
395 if (test_bit(SCAN_HW_SCANNING, &local->scanning))
393 return ieee80211_scan_rx(rx->sdata, skb); 396 return ieee80211_scan_rx(rx->sdata, skb);
394 397
395 if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning) && 398 if (test_bit(SCAN_SW_SCANNING, &local->scanning)) {
396 (rx->flags & IEEE80211_RX_IN_SCAN))) {
397 /* drop all the other packets during a software scan anyway */ 399 /* drop all the other packets during a software scan anyway */
398 if (ieee80211_scan_rx(rx->sdata, skb) != RX_QUEUED) 400 if (ieee80211_scan_rx(rx->sdata, skb) != RX_QUEUED)
399 dev_kfree_skb(skb); 401 dev_kfree_skb(skb);
400 return RX_QUEUED; 402 return RX_QUEUED;
401 } 403 }
402 404
403 if (unlikely(rx->flags & IEEE80211_RX_IN_SCAN)) { 405 /* scanning finished during invoking of handlers */
404 /* scanning finished during invoking of handlers */ 406 I802_DEBUG_INC(local->rx_handlers_drop_passive_scan);
405 I802_DEBUG_INC(local->rx_handlers_drop_passive_scan); 407 return RX_DROP_UNUSABLE;
406 return RX_DROP_UNUSABLE;
407 }
408
409 return RX_CONTINUE;
410} 408}
411 409
412 410
@@ -538,20 +536,12 @@ static void ieee80211_release_reorder_frame(struct ieee80211_hw *hw,
538 int index, 536 int index,
539 struct sk_buff_head *frames) 537 struct sk_buff_head *frames)
540{ 538{
541 struct ieee80211_supported_band *sband;
542 struct ieee80211_rate *rate = NULL;
543 struct sk_buff *skb = tid_agg_rx->reorder_buf[index]; 539 struct sk_buff *skb = tid_agg_rx->reorder_buf[index];
544 struct ieee80211_rx_status *status;
545 540
546 if (!skb) 541 if (!skb)
547 goto no_frame; 542 goto no_frame;
548 543
549 status = IEEE80211_SKB_RXCB(skb); 544 /* release the frame from the reorder ring buffer */
550
551 /* release the reordered frames to stack */
552 sband = hw->wiphy->bands[status->band];
553 if (!(status->flag & RX_FLAG_HT))
554 rate = &sband->bitrates[status->rate_idx];
555 tid_agg_rx->stored_mpdu_num--; 545 tid_agg_rx->stored_mpdu_num--;
556 tid_agg_rx->reorder_buf[index] = NULL; 546 tid_agg_rx->reorder_buf[index] = NULL;
557 __skb_queue_tail(frames, skb); 547 __skb_queue_tail(frames, skb);
@@ -580,9 +570,102 @@ static void ieee80211_release_reorder_frames(struct ieee80211_hw *hw,
580 * frames that have not yet been received are assumed to be lost and the skb 570 * frames that have not yet been received are assumed to be lost and the skb
581 * can be released for processing. This may also release other skb's from the 571 * can be released for processing. This may also release other skb's from the
582 * reorder buffer if there are no additional gaps between the frames. 572 * reorder buffer if there are no additional gaps between the frames.
573 *
574 * Callers must hold tid_agg_rx->reorder_lock.
583 */ 575 */
584#define HT_RX_REORDER_BUF_TIMEOUT (HZ / 10) 576#define HT_RX_REORDER_BUF_TIMEOUT (HZ / 10)
585 577
578static void ieee80211_sta_reorder_release(struct ieee80211_hw *hw,
579 struct tid_ampdu_rx *tid_agg_rx,
580 struct sk_buff_head *frames)
581{
582 int index, j;
583
584 /* release the buffer until next missing frame */
585 index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) %
586 tid_agg_rx->buf_size;
587 if (!tid_agg_rx->reorder_buf[index] &&
588 tid_agg_rx->stored_mpdu_num > 1) {
589 /*
590 * No buffers ready to be released, but check whether any
591 * frames in the reorder buffer have timed out.
592 */
593 int skipped = 1;
594 for (j = (index + 1) % tid_agg_rx->buf_size; j != index;
595 j = (j + 1) % tid_agg_rx->buf_size) {
596 if (!tid_agg_rx->reorder_buf[j]) {
597 skipped++;
598 continue;
599 }
600 if (!time_after(jiffies, tid_agg_rx->reorder_time[j] +
601 HT_RX_REORDER_BUF_TIMEOUT))
602 goto set_release_timer;
603
604#ifdef CONFIG_MAC80211_HT_DEBUG
605 if (net_ratelimit())
606 wiphy_debug(hw->wiphy,
607 "release an RX reorder frame due to timeout on earlier frames\n");
608#endif
609 ieee80211_release_reorder_frame(hw, tid_agg_rx,
610 j, frames);
611
612 /*
613 * Increment the head seq# also for the skipped slots.
614 */
615 tid_agg_rx->head_seq_num =
616 (tid_agg_rx->head_seq_num + skipped) & SEQ_MASK;
617 skipped = 0;
618 }
619 } else while (tid_agg_rx->reorder_buf[index]) {
620 ieee80211_release_reorder_frame(hw, tid_agg_rx, index, frames);
621 index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) %
622 tid_agg_rx->buf_size;
623 }
624
625 /*
626 * Disable the reorder release timer for now.
627 *
628 * The current implementation lacks a proper locking scheme
629 * which would protect vital statistic and debug counters
630 * from being updated by two different but concurrent BHs.
631 *
632 * More information about the topic is available from:
633 * - thread: http://marc.info/?t=128635927000001
634 *
635 * What was wrong:
636 * => http://marc.info/?l=linux-wireless&m=128636170811964
637 * "Basically the thing is that until your patch, the data
638 * in the struct didn't actually need locking because it
639 * was accessed by the RX path only which is not concurrent."
640 *
641 * List of what needs to be fixed:
642 * => http://marc.info/?l=linux-wireless&m=128656352920957
643 *
644
645 if (tid_agg_rx->stored_mpdu_num) {
646 j = index = seq_sub(tid_agg_rx->head_seq_num,
647 tid_agg_rx->ssn) % tid_agg_rx->buf_size;
648
649 for (; j != (index - 1) % tid_agg_rx->buf_size;
650 j = (j + 1) % tid_agg_rx->buf_size) {
651 if (tid_agg_rx->reorder_buf[j])
652 break;
653 }
654
655 set_release_timer:
656
657 mod_timer(&tid_agg_rx->reorder_timer,
658 tid_agg_rx->reorder_time[j] +
659 HT_RX_REORDER_BUF_TIMEOUT);
660 } else {
661 del_timer(&tid_agg_rx->reorder_timer);
662 }
663 */
664
665set_release_timer:
666 return;
667}
668
586/* 669/*
587 * As this function belongs to the RX path it must be under 670 * As this function belongs to the RX path it must be under
588 * rcu_read_lock protection. It returns false if the frame 671 * rcu_read_lock protection. It returns false if the frame
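The release walk added in the hunk above only advances head_seq_num while the slot at (head_seq_num - ssn) % buf_size is occupied, so it stops at the first gap in the 12-bit sequence space. Below is a minimal userspace sketch of just that arithmetic; "reorder_model", its fields and the window size are invented for the example, only the modulo-4096 sequence handling and the ring indexing mirror ieee80211_sta_reorder_release().

/*
 * Standalone model of the reorder-release walk (not kernel code).
 */
#include <stdbool.h>
#include <stdio.h>

#define SEQ_MODULUS 0x1000        /* 802.11 sequence numbers are 12 bits */
#define BUF_SIZE    8             /* hypothetical small reorder window   */

struct reorder_model {
        unsigned head_seq_num;    /* next sequence number expected       */
        unsigned ssn;             /* starting sequence number of the BA  */
        bool     slot_full[BUF_SIZE];
};

static unsigned seq_sub(unsigned a, unsigned b)
{
        return (a - b) & (SEQ_MODULUS - 1);
}

/* Release everything up to the first missing frame, advancing the head. */
static void reorder_release(struct reorder_model *r)
{
        unsigned index = seq_sub(r->head_seq_num, r->ssn) % BUF_SIZE;

        while (r->slot_full[index]) {
                printf("release seq %#x (slot %u)\n", r->head_seq_num, index);
                r->slot_full[index] = false;
                r->head_seq_num = (r->head_seq_num + 1) & (SEQ_MODULUS - 1);
                index = seq_sub(r->head_seq_num, r->ssn) % BUF_SIZE;
        }
}

int main(void)
{
        struct reorder_model r = { .head_seq_num = 0xffe, .ssn = 0xffe };

        /* frames 0xffe, 0xfff and 0x001 arrived; 0x000 is still missing */
        r.slot_full[seq_sub(0xffe, r.ssn) % BUF_SIZE] = true;
        r.slot_full[seq_sub(0xfff, r.ssn) % BUF_SIZE] = true;
        r.slot_full[seq_sub(0x001, r.ssn) % BUF_SIZE] = true;

        reorder_release(&r);      /* stops after 0xfff, 0x000 is the gap */
        return 0;
}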
@@ -598,14 +681,16 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_hw *hw,
598 u16 mpdu_seq_num = (sc & IEEE80211_SCTL_SEQ) >> 4; 681 u16 mpdu_seq_num = (sc & IEEE80211_SCTL_SEQ) >> 4;
599 u16 head_seq_num, buf_size; 682 u16 head_seq_num, buf_size;
600 int index; 683 int index;
684 bool ret = true;
601 685
602 buf_size = tid_agg_rx->buf_size; 686 buf_size = tid_agg_rx->buf_size;
603 head_seq_num = tid_agg_rx->head_seq_num; 687 head_seq_num = tid_agg_rx->head_seq_num;
604 688
689 spin_lock(&tid_agg_rx->reorder_lock);
605 /* frame with out of date sequence number */ 690 /* frame with out of date sequence number */
606 if (seq_less(mpdu_seq_num, head_seq_num)) { 691 if (seq_less(mpdu_seq_num, head_seq_num)) {
607 dev_kfree_skb(skb); 692 dev_kfree_skb(skb);
608 return true; 693 goto out;
609 } 694 }
610 695
611 /* 696 /*
@@ -626,7 +711,7 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_hw *hw,
626 /* check if we already stored this frame */ 711 /* check if we already stored this frame */
627 if (tid_agg_rx->reorder_buf[index]) { 712 if (tid_agg_rx->reorder_buf[index]) {
628 dev_kfree_skb(skb); 713 dev_kfree_skb(skb);
629 return true; 714 goto out;
630 } 715 }
631 716
632 /* 717 /*
@@ -636,58 +721,19 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_hw *hw,
636 if (mpdu_seq_num == tid_agg_rx->head_seq_num && 721 if (mpdu_seq_num == tid_agg_rx->head_seq_num &&
637 tid_agg_rx->stored_mpdu_num == 0) { 722 tid_agg_rx->stored_mpdu_num == 0) {
638 tid_agg_rx->head_seq_num = seq_inc(tid_agg_rx->head_seq_num); 723 tid_agg_rx->head_seq_num = seq_inc(tid_agg_rx->head_seq_num);
639 return false; 724 ret = false;
725 goto out;
640 } 726 }
641 727
642 /* put the frame in the reordering buffer */ 728 /* put the frame in the reordering buffer */
643 tid_agg_rx->reorder_buf[index] = skb; 729 tid_agg_rx->reorder_buf[index] = skb;
644 tid_agg_rx->reorder_time[index] = jiffies; 730 tid_agg_rx->reorder_time[index] = jiffies;
645 tid_agg_rx->stored_mpdu_num++; 731 tid_agg_rx->stored_mpdu_num++;
646 /* release the buffer until next missing frame */ 732 ieee80211_sta_reorder_release(hw, tid_agg_rx, frames);
647 index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) %
648 tid_agg_rx->buf_size;
649 if (!tid_agg_rx->reorder_buf[index] &&
650 tid_agg_rx->stored_mpdu_num > 1) {
651 /*
652 * No buffers ready to be released, but check whether any
653 * frames in the reorder buffer have timed out.
654 */
655 int j;
656 int skipped = 1;
657 for (j = (index + 1) % tid_agg_rx->buf_size; j != index;
658 j = (j + 1) % tid_agg_rx->buf_size) {
659 if (!tid_agg_rx->reorder_buf[j]) {
660 skipped++;
661 continue;
662 }
663 if (!time_after(jiffies, tid_agg_rx->reorder_time[j] +
664 HT_RX_REORDER_BUF_TIMEOUT))
665 break;
666 733
667#ifdef CONFIG_MAC80211_HT_DEBUG 734 out:
668 if (net_ratelimit()) 735 spin_unlock(&tid_agg_rx->reorder_lock);
669 printk(KERN_DEBUG "%s: release an RX reorder " 736 return ret;
670 "frame due to timeout on earlier "
671 "frames\n",
672 wiphy_name(hw->wiphy));
673#endif
674 ieee80211_release_reorder_frame(hw, tid_agg_rx,
675 j, frames);
676
677 /*
678 * Increment the head seq# also for the skipped slots.
679 */
680 tid_agg_rx->head_seq_num =
681 (tid_agg_rx->head_seq_num + skipped) & SEQ_MASK;
682 skipped = 0;
683 }
684 } else while (tid_agg_rx->reorder_buf[index]) {
685 ieee80211_release_reorder_frame(hw, tid_agg_rx, index, frames);
686 index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) %
687 tid_agg_rx->buf_size;
688 }
689
690 return true;
691} 737}
692 738
693/* 739/*
@@ -761,13 +807,14 @@ static ieee80211_rx_result debug_noinline
761ieee80211_rx_h_check(struct ieee80211_rx_data *rx) 807ieee80211_rx_h_check(struct ieee80211_rx_data *rx)
762{ 808{
763 struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; 809 struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
810 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
764 811
765 /* Drop duplicate 802.11 retransmissions (IEEE 802.11 Chap. 9.2.9) */ 812 /* Drop duplicate 802.11 retransmissions (IEEE 802.11 Chap. 9.2.9) */
766 if (rx->sta && !is_multicast_ether_addr(hdr->addr1)) { 813 if (rx->sta && !is_multicast_ether_addr(hdr->addr1)) {
767 if (unlikely(ieee80211_has_retry(hdr->frame_control) && 814 if (unlikely(ieee80211_has_retry(hdr->frame_control) &&
768 rx->sta->last_seq_ctrl[rx->queue] == 815 rx->sta->last_seq_ctrl[rx->queue] ==
769 hdr->seq_ctrl)) { 816 hdr->seq_ctrl)) {
770 if (rx->flags & IEEE80211_RX_RA_MATCH) { 817 if (status->rx_flags & IEEE80211_RX_RA_MATCH) {
771 rx->local->dot11FrameDuplicateCount++; 818 rx->local->dot11FrameDuplicateCount++;
772 rx->sta->num_duplicates++; 819 rx->sta->num_duplicates++;
773 } 820 }
@@ -796,11 +843,12 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx)
796 if (unlikely((ieee80211_is_data(hdr->frame_control) || 843 if (unlikely((ieee80211_is_data(hdr->frame_control) ||
797 ieee80211_is_pspoll(hdr->frame_control)) && 844 ieee80211_is_pspoll(hdr->frame_control)) &&
798 rx->sdata->vif.type != NL80211_IFTYPE_ADHOC && 845 rx->sdata->vif.type != NL80211_IFTYPE_ADHOC &&
846 rx->sdata->vif.type != NL80211_IFTYPE_WDS &&
799 (!rx->sta || !test_sta_flags(rx->sta, WLAN_STA_ASSOC)))) { 847 (!rx->sta || !test_sta_flags(rx->sta, WLAN_STA_ASSOC)))) {
800 if ((!ieee80211_has_fromds(hdr->frame_control) && 848 if ((!ieee80211_has_fromds(hdr->frame_control) &&
801 !ieee80211_has_tods(hdr->frame_control) && 849 !ieee80211_has_tods(hdr->frame_control) &&
802 ieee80211_is_data(hdr->frame_control)) || 850 ieee80211_is_data(hdr->frame_control)) ||
803 !(rx->flags & IEEE80211_RX_RA_MATCH)) { 851 !(status->rx_flags & IEEE80211_RX_RA_MATCH)) {
804 /* Drop IBSS frames and frames for other hosts 852 /* Drop IBSS frames and frames for other hosts
805 * silently. */ 853 * silently. */
806 return RX_DROP_MONITOR; 854 return RX_DROP_MONITOR;
@@ -822,7 +870,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
822 int keyidx; 870 int keyidx;
823 int hdrlen; 871 int hdrlen;
824 ieee80211_rx_result result = RX_DROP_UNUSABLE; 872 ieee80211_rx_result result = RX_DROP_UNUSABLE;
825 struct ieee80211_key *stakey = NULL; 873 struct ieee80211_key *sta_ptk = NULL;
826 int mmie_keyidx = -1; 874 int mmie_keyidx = -1;
827 __le16 fc; 875 __le16 fc;
828 876
@@ -857,22 +905,25 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
857 * No point in finding a key and decrypting if the frame is neither 905 * No point in finding a key and decrypting if the frame is neither
858 * addressed to us nor a multicast frame. 906 * addressed to us nor a multicast frame.
859 */ 907 */
860 if (!(rx->flags & IEEE80211_RX_RA_MATCH)) 908 if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
861 return RX_CONTINUE; 909 return RX_CONTINUE;
862 910
863 /* start without a key */ 911 /* start without a key */
864 rx->key = NULL; 912 rx->key = NULL;
865 913
866 if (rx->sta) 914 if (rx->sta)
867 stakey = rcu_dereference(rx->sta->key); 915 sta_ptk = rcu_dereference(rx->sta->ptk);
868 916
869 fc = hdr->frame_control; 917 fc = hdr->frame_control;
870 918
871 if (!ieee80211_has_protected(fc)) 919 if (!ieee80211_has_protected(fc))
872 mmie_keyidx = ieee80211_get_mmie_keyidx(rx->skb); 920 mmie_keyidx = ieee80211_get_mmie_keyidx(rx->skb);
873 921
874 if (!is_multicast_ether_addr(hdr->addr1) && stakey) { 922 if (!is_multicast_ether_addr(hdr->addr1) && sta_ptk) {
875 rx->key = stakey; 923 rx->key = sta_ptk;
924 if ((status->flag & RX_FLAG_DECRYPTED) &&
925 (status->flag & RX_FLAG_IV_STRIPPED))
926 return RX_CONTINUE;
876 /* Skip decryption if the frame is not protected. */ 927 /* Skip decryption if the frame is not protected. */
877 if (!ieee80211_has_protected(fc)) 928 if (!ieee80211_has_protected(fc))
878 return RX_CONTINUE; 929 return RX_CONTINUE;
@@ -885,7 +936,10 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
885 if (mmie_keyidx < NUM_DEFAULT_KEYS || 936 if (mmie_keyidx < NUM_DEFAULT_KEYS ||
886 mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS) 937 mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)
887 return RX_DROP_MONITOR; /* unexpected BIP keyidx */ 938 return RX_DROP_MONITOR; /* unexpected BIP keyidx */
888 rx->key = rcu_dereference(rx->sdata->keys[mmie_keyidx]); 939 if (rx->sta)
940 rx->key = rcu_dereference(rx->sta->gtk[mmie_keyidx]);
941 if (!rx->key)
942 rx->key = rcu_dereference(rx->sdata->keys[mmie_keyidx]);
889 } else if (!ieee80211_has_protected(fc)) { 943 } else if (!ieee80211_has_protected(fc)) {
890 /* 944 /*
891 * The frame was not protected, so skip decryption. However, we 945 * The frame was not protected, so skip decryption. However, we
@@ -928,16 +982,25 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
928 skb_copy_bits(rx->skb, hdrlen + 3, &keyid, 1); 982 skb_copy_bits(rx->skb, hdrlen + 3, &keyid, 1);
929 keyidx = keyid >> 6; 983 keyidx = keyid >> 6;
930 984
931 rx->key = rcu_dereference(rx->sdata->keys[keyidx]); 985 /* check per-station GTK first, if multicast packet */
986 if (is_multicast_ether_addr(hdr->addr1) && rx->sta)
987 rx->key = rcu_dereference(rx->sta->gtk[keyidx]);
932 988
933 /* 989 /* if not found, try default key */
934 * RSNA-protected unicast frames should always be sent with 990 if (!rx->key) {
935 * pairwise or station-to-station keys, but for WEP we allow 991 rx->key = rcu_dereference(rx->sdata->keys[keyidx]);
936 * using a key index as well. 992
937 */ 993 /*
938 if (rx->key && rx->key->conf.alg != ALG_WEP && 994 * RSNA-protected unicast frames should always be
939 !is_multicast_ether_addr(hdr->addr1)) 995 * sent with pairwise or station-to-station keys,
940 rx->key = NULL; 996 * but for WEP we allow using a key index as well.
997 */
998 if (rx->key &&
999 rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP40 &&
1000 rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP104 &&
1001 !is_multicast_ether_addr(hdr->addr1))
1002 rx->key = NULL;
1003 }
941 } 1004 }
942 1005
943 if (rx->key) { 1006 if (rx->key) {
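The hunk above introduces a two-step key lookup for protected multicast frames: the receiving station's own group key for that key index is tried first, and the interface-wide default key is only a fallback. A small standalone sketch of that ordering, with invented struct and function names (select_rx_key, struct station, struct iface), could look like this; the same index is used in both tables, so a per-station key transparently shadows the default one.

#include <stddef.h>
#include <stdio.h>

#define NUM_KEYS 4

struct key     { const char *label; };
struct station { struct key *gtk[NUM_KEYS]; };
struct iface   { struct key *keys[NUM_KEYS]; };

static struct key *select_rx_key(const struct station *sta,
                                 const struct iface *sdata,
                                 int keyidx, int multicast)
{
        struct key *key = NULL;

        /* per-station group key first, but only for multicast frames */
        if (multicast && sta)
                key = sta->gtk[keyidx];

        /* fall back to the interface's default key of the same index */
        if (!key)
                key = sdata->keys[keyidx];

        return key;
}

int main(void)
{
        struct key sta_gtk  = { "per-station GTK" };
        struct key dflt     = { "interface default key" };
        struct station sta  = { .gtk  = { [1] = &sta_gtk } };
        struct iface  sdata = { .keys = { [1] = &dflt } };

        printf("multicast: %s\n", select_rx_key(&sta, &sdata, 1, 1)->label);
        printf("unicast:   %s\n", select_rx_key(&sta, &sdata, 1, 0)->label);
        return 0;
}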
@@ -951,8 +1014,9 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
951 return RX_DROP_UNUSABLE; 1014 return RX_DROP_UNUSABLE;
952 /* the hdr variable is invalid now! */ 1015 /* the hdr variable is invalid now! */
953 1016
954 switch (rx->key->conf.alg) { 1017 switch (rx->key->conf.cipher) {
955 case ALG_WEP: 1018 case WLAN_CIPHER_SUITE_WEP40:
1019 case WLAN_CIPHER_SUITE_WEP104:
956 /* Check for weak IVs if possible */ 1020 /* Check for weak IVs if possible */
957 if (rx->sta && ieee80211_is_data(fc) && 1021 if (rx->sta && ieee80211_is_data(fc) &&
958 (!(status->flag & RX_FLAG_IV_STRIPPED) || 1022 (!(status->flag & RX_FLAG_IV_STRIPPED) ||
@@ -962,15 +1026,21 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
962 1026
963 result = ieee80211_crypto_wep_decrypt(rx); 1027 result = ieee80211_crypto_wep_decrypt(rx);
964 break; 1028 break;
965 case ALG_TKIP: 1029 case WLAN_CIPHER_SUITE_TKIP:
966 result = ieee80211_crypto_tkip_decrypt(rx); 1030 result = ieee80211_crypto_tkip_decrypt(rx);
967 break; 1031 break;
968 case ALG_CCMP: 1032 case WLAN_CIPHER_SUITE_CCMP:
969 result = ieee80211_crypto_ccmp_decrypt(rx); 1033 result = ieee80211_crypto_ccmp_decrypt(rx);
970 break; 1034 break;
971 case ALG_AES_CMAC: 1035 case WLAN_CIPHER_SUITE_AES_CMAC:
972 result = ieee80211_crypto_aes_cmac_decrypt(rx); 1036 result = ieee80211_crypto_aes_cmac_decrypt(rx);
973 break; 1037 break;
1038 default:
1039 /*
1040 * We can reach here only with HW-only algorithms
1041 * but why didn't it decrypt the frame?!
1042 */
1043 return RX_DROP_UNUSABLE;
974 } 1044 }
975 1045
976 /* either the frame has been decrypted or will be dropped */ 1046 /* either the frame has been decrypted or will be dropped */
@@ -1079,7 +1149,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
1079 sta->last_rx = jiffies; 1149 sta->last_rx = jiffies;
1080 } 1150 }
1081 1151
1082 if (!(rx->flags & IEEE80211_RX_RA_MATCH)) 1152 if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
1083 return RX_CONTINUE; 1153 return RX_CONTINUE;
1084 1154
1085 if (rx->sdata->vif.type == NL80211_IFTYPE_STATION) 1155 if (rx->sdata->vif.type == NL80211_IFTYPE_STATION)
@@ -1236,6 +1306,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
1236 unsigned int frag, seq; 1306 unsigned int frag, seq;
1237 struct ieee80211_fragment_entry *entry; 1307 struct ieee80211_fragment_entry *entry;
1238 struct sk_buff *skb; 1308 struct sk_buff *skb;
1309 struct ieee80211_rx_status *status;
1239 1310
1240 hdr = (struct ieee80211_hdr *)rx->skb->data; 1311 hdr = (struct ieee80211_hdr *)rx->skb->data;
1241 fc = hdr->frame_control; 1312 fc = hdr->frame_control;
@@ -1265,7 +1336,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
1265 /* This is the first fragment of a new frame. */ 1336 /* This is the first fragment of a new frame. */
1266 entry = ieee80211_reassemble_add(rx->sdata, frag, seq, 1337 entry = ieee80211_reassemble_add(rx->sdata, frag, seq,
1267 rx->queue, &(rx->skb)); 1338 rx->queue, &(rx->skb));
1268 if (rx->key && rx->key->conf.alg == ALG_CCMP && 1339 if (rx->key && rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP &&
1269 ieee80211_has_protected(fc)) { 1340 ieee80211_has_protected(fc)) {
1270 int queue = ieee80211_is_mgmt(fc) ? 1341 int queue = ieee80211_is_mgmt(fc) ?
1271 NUM_RX_DATA_QUEUES : rx->queue; 1342 NUM_RX_DATA_QUEUES : rx->queue;
@@ -1294,7 +1365,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
1294 int i; 1365 int i;
1295 u8 pn[CCMP_PN_LEN], *rpn; 1366 u8 pn[CCMP_PN_LEN], *rpn;
1296 int queue; 1367 int queue;
1297 if (!rx->key || rx->key->conf.alg != ALG_CCMP) 1368 if (!rx->key || rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP)
1298 return RX_DROP_UNUSABLE; 1369 return RX_DROP_UNUSABLE;
1299 memcpy(pn, entry->last_pn, CCMP_PN_LEN); 1370 memcpy(pn, entry->last_pn, CCMP_PN_LEN);
1300 for (i = CCMP_PN_LEN - 1; i >= 0; i--) { 1371 for (i = CCMP_PN_LEN - 1; i >= 0; i--) {
@@ -1335,7 +1406,8 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
1335 } 1406 }
1336 1407
1337 /* Complete frame has been reassembled - process it now */ 1408 /* Complete frame has been reassembled - process it now */
1338 rx->flags |= IEEE80211_RX_FRAGMENTED; 1409 status = IEEE80211_SKB_RXCB(rx->skb);
1410 status->rx_flags |= IEEE80211_RX_FRAGMENTED;
1339 1411
1340 out: 1412 out:
1341 if (rx->sta) 1413 if (rx->sta)
@@ -1352,9 +1424,10 @@ ieee80211_rx_h_ps_poll(struct ieee80211_rx_data *rx)
1352{ 1424{
1353 struct ieee80211_sub_if_data *sdata = rx->sdata; 1425 struct ieee80211_sub_if_data *sdata = rx->sdata;
1354 __le16 fc = ((struct ieee80211_hdr *)rx->skb->data)->frame_control; 1426 __le16 fc = ((struct ieee80211_hdr *)rx->skb->data)->frame_control;
1427 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
1355 1428
1356 if (likely(!rx->sta || !ieee80211_is_pspoll(fc) || 1429 if (likely(!rx->sta || !ieee80211_is_pspoll(fc) ||
1357 !(rx->flags & IEEE80211_RX_RA_MATCH))) 1430 !(status->rx_flags & IEEE80211_RX_RA_MATCH)))
1358 return RX_CONTINUE; 1431 return RX_CONTINUE;
1359 1432
1360 if ((sdata->vif.type != NL80211_IFTYPE_AP) && 1433 if ((sdata->vif.type != NL80211_IFTYPE_AP) &&
@@ -1492,7 +1565,7 @@ static bool ieee80211_frame_allowed(struct ieee80211_rx_data *rx, __le16 fc)
1492 * Allow EAPOL frames to us/the PAE group address regardless 1565 * Allow EAPOL frames to us/the PAE group address regardless
1493 * of whether the frame was encrypted or not. 1566 * of whether the frame was encrypted or not.
1494 */ 1567 */
1495 if (ehdr->h_proto == htons(ETH_P_PAE) && 1568 if (ehdr->h_proto == rx->sdata->control_port_protocol &&
1496 (compare_ether_addr(ehdr->h_dest, rx->sdata->vif.addr) == 0 || 1569 (compare_ether_addr(ehdr->h_dest, rx->sdata->vif.addr) == 0 ||
1497 compare_ether_addr(ehdr->h_dest, pae_group_addr) == 0)) 1570 compare_ether_addr(ehdr->h_dest, pae_group_addr) == 0))
1498 return true; 1571 return true;
@@ -1515,6 +1588,7 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
1515 struct sk_buff *skb, *xmit_skb; 1588 struct sk_buff *skb, *xmit_skb;
1516 struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data; 1589 struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data;
1517 struct sta_info *dsta; 1590 struct sta_info *dsta;
1591 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
1518 1592
1519 skb = rx->skb; 1593 skb = rx->skb;
1520 xmit_skb = NULL; 1594 xmit_skb = NULL;
@@ -1522,7 +1596,7 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
1522 if ((sdata->vif.type == NL80211_IFTYPE_AP || 1596 if ((sdata->vif.type == NL80211_IFTYPE_AP ||
1523 sdata->vif.type == NL80211_IFTYPE_AP_VLAN) && 1597 sdata->vif.type == NL80211_IFTYPE_AP_VLAN) &&
1524 !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) && 1598 !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) &&
1525 (rx->flags & IEEE80211_RX_RA_MATCH) && 1599 (status->rx_flags & IEEE80211_RX_RA_MATCH) &&
1526 (sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->u.vlan.sta)) { 1600 (sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->u.vlan.sta)) {
1527 if (is_multicast_ether_addr(ehdr->h_dest)) { 1601 if (is_multicast_ether_addr(ehdr->h_dest)) {
1528 /* 1602 /*
@@ -1599,6 +1673,7 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
1599 struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; 1673 struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
1600 __le16 fc = hdr->frame_control; 1674 __le16 fc = hdr->frame_control;
1601 struct sk_buff_head frame_list; 1675 struct sk_buff_head frame_list;
1676 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
1602 1677
1603 if (unlikely(!ieee80211_is_data(fc))) 1678 if (unlikely(!ieee80211_is_data(fc)))
1604 return RX_CONTINUE; 1679 return RX_CONTINUE;
@@ -1606,7 +1681,7 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
1606 if (unlikely(!ieee80211_is_data_present(fc))) 1681 if (unlikely(!ieee80211_is_data_present(fc)))
1607 return RX_DROP_MONITOR; 1682 return RX_DROP_MONITOR;
1608 1683
1609 if (!(rx->flags & IEEE80211_RX_AMSDU)) 1684 if (!(status->rx_flags & IEEE80211_RX_AMSDU))
1610 return RX_CONTINUE; 1685 return RX_CONTINUE;
1611 1686
1612 if (ieee80211_has_a4(hdr->frame_control) && 1687 if (ieee80211_has_a4(hdr->frame_control) &&
@@ -1657,6 +1732,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
1657 struct sk_buff *skb = rx->skb, *fwd_skb; 1732 struct sk_buff *skb = rx->skb, *fwd_skb;
1658 struct ieee80211_local *local = rx->local; 1733 struct ieee80211_local *local = rx->local;
1659 struct ieee80211_sub_if_data *sdata = rx->sdata; 1734 struct ieee80211_sub_if_data *sdata = rx->sdata;
1735 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
1660 1736
1661 hdr = (struct ieee80211_hdr *) skb->data; 1737 hdr = (struct ieee80211_hdr *) skb->data;
1662 hdrlen = ieee80211_hdrlen(hdr->frame_control); 1738 hdrlen = ieee80211_hdrlen(hdr->frame_control);
@@ -1702,7 +1778,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
1702 1778
1703 mesh_hdr->ttl--; 1779 mesh_hdr->ttl--;
1704 1780
1705 if (rx->flags & IEEE80211_RX_RA_MATCH) { 1781 if (status->rx_flags & IEEE80211_RX_RA_MATCH) {
1706 if (!mesh_hdr->ttl) 1782 if (!mesh_hdr->ttl)
1707 IEEE80211_IFSTA_MESH_CTR_INC(&rx->sdata->u.mesh, 1783 IEEE80211_IFSTA_MESH_CTR_INC(&rx->sdata->u.mesh,
1708 dropped_frames_ttl); 1784 dropped_frames_ttl);
@@ -1909,13 +1985,38 @@ static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata,
1909} 1985}
1910 1986
1911static ieee80211_rx_result debug_noinline 1987static ieee80211_rx_result debug_noinline
1988ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx)
1989{
1990 struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
1991 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
1992
1993 /*
1994 * From here on, look only at management frames.
1995 * Data and control frames are already handled,
1996 * and unknown (reserved) frames are useless.
1997 */
1998 if (rx->skb->len < 24)
1999 return RX_DROP_MONITOR;
2000
2001 if (!ieee80211_is_mgmt(mgmt->frame_control))
2002 return RX_DROP_MONITOR;
2003
2004 if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
2005 return RX_DROP_MONITOR;
2006
2007 if (ieee80211_drop_unencrypted_mgmt(rx))
2008 return RX_DROP_UNUSABLE;
2009
2010 return RX_CONTINUE;
2011}
2012
2013static ieee80211_rx_result debug_noinline
1912ieee80211_rx_h_action(struct ieee80211_rx_data *rx) 2014ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
1913{ 2015{
1914 struct ieee80211_local *local = rx->local; 2016 struct ieee80211_local *local = rx->local;
1915 struct ieee80211_sub_if_data *sdata = rx->sdata; 2017 struct ieee80211_sub_if_data *sdata = rx->sdata;
1916 struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data; 2018 struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
1917 struct sk_buff *nskb; 2019 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
1918 struct ieee80211_rx_status *status;
1919 int len = rx->skb->len; 2020 int len = rx->skb->len;
1920 2021
1921 if (!ieee80211_is_action(mgmt->frame_control)) 2022 if (!ieee80211_is_action(mgmt->frame_control))
@@ -1928,10 +2029,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
1928 if (!rx->sta && mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) 2029 if (!rx->sta && mgmt->u.action.category != WLAN_CATEGORY_PUBLIC)
1929 return RX_DROP_UNUSABLE; 2030 return RX_DROP_UNUSABLE;
1930 2031
1931 if (!(rx->flags & IEEE80211_RX_RA_MATCH)) 2032 if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
1932 return RX_DROP_UNUSABLE;
1933
1934 if (ieee80211_drop_unencrypted_mgmt(rx))
1935 return RX_DROP_UNUSABLE; 2033 return RX_DROP_UNUSABLE;
1936 2034
1937 switch (mgmt->u.action.category) { 2035 switch (mgmt->u.action.category) {
@@ -2024,17 +2122,36 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
2024 goto queue; 2122 goto queue;
2025 } 2123 }
2026 2124
2125 return RX_CONTINUE;
2126
2027 invalid: 2127 invalid:
2028 /* 2128 status->rx_flags |= IEEE80211_RX_MALFORMED_ACTION_FRM;
2029 * For AP mode, hostapd is responsible for handling any action 2129 /* will return in the next handlers */
2030 * frames that we didn't handle, including returning unknown 2130 return RX_CONTINUE;
2031 * ones. For all other modes we will return them to the sender, 2131
2032 * setting the 0x80 bit in the action category, as required by 2132 handled:
2033 * 802.11-2007 7.3.1.11. 2133 if (rx->sta)
2034 */ 2134 rx->sta->rx_packets++;
2035 if (sdata->vif.type == NL80211_IFTYPE_AP || 2135 dev_kfree_skb(rx->skb);
2036 sdata->vif.type == NL80211_IFTYPE_AP_VLAN) 2136 return RX_QUEUED;
2037 return RX_DROP_MONITOR; 2137
2138 queue:
2139 rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
2140 skb_queue_tail(&sdata->skb_queue, rx->skb);
2141 ieee80211_queue_work(&local->hw, &sdata->work);
2142 if (rx->sta)
2143 rx->sta->rx_packets++;
2144 return RX_QUEUED;
2145}
2146
2147static ieee80211_rx_result debug_noinline
2148ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx)
2149{
2150 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
2151
2152 /* skip known-bad action frames and return them in the next handler */
2153 if (status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM)
2154 return RX_CONTINUE;
2038 2155
2039 /* 2156 /*
2040 * Getting here means the kernel doesn't know how to handle 2157 * Getting here means the kernel doesn't know how to handle
@@ -2042,12 +2159,46 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
2042 * so userspace can register for those to know whether ones 2159 * so userspace can register for those to know whether ones
2043 * it transmitted were processed or returned. 2160 * it transmitted were processed or returned.
2044 */ 2161 */
2045 status = IEEE80211_SKB_RXCB(rx->skb);
2046 2162
2047 if (cfg80211_rx_action(rx->sdata->dev, status->freq, 2163 if (cfg80211_rx_mgmt(rx->sdata->dev, status->freq,
2048 rx->skb->data, rx->skb->len, 2164 rx->skb->data, rx->skb->len,
2049 GFP_ATOMIC)) 2165 GFP_ATOMIC)) {
2050 goto handled; 2166 if (rx->sta)
2167 rx->sta->rx_packets++;
2168 dev_kfree_skb(rx->skb);
2169 return RX_QUEUED;
2170 }
2171
2172
2173 return RX_CONTINUE;
2174}
2175
2176static ieee80211_rx_result debug_noinline
2177ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx)
2178{
2179 struct ieee80211_local *local = rx->local;
2180 struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
2181 struct sk_buff *nskb;
2182 struct ieee80211_sub_if_data *sdata = rx->sdata;
2183 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
2184
2185 if (!ieee80211_is_action(mgmt->frame_control))
2186 return RX_CONTINUE;
2187
2188 /*
2189 * For AP mode, hostapd is responsible for handling any action
2190 * frames that we didn't handle, including returning unknown
2191 * ones. For all other modes we will return them to the sender,
2192 * setting the 0x80 bit in the action category, as required by
2193 * 802.11-2007 7.3.1.11.
2194 * Newer versions of hostapd shall also use the management frame
2195 * registration mechanisms, but older ones still use cooked
2196 * monitor interfaces so push all frames there.
2197 */
2198 if (!(status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM) &&
2199 (sdata->vif.type == NL80211_IFTYPE_AP ||
2200 sdata->vif.type == NL80211_IFTYPE_AP_VLAN))
2201 return RX_DROP_MONITOR;
2051 2202
2052 /* do not return rejected action frames */ 2203 /* do not return rejected action frames */
2053 if (mgmt->u.action.category & 0x80) 2204 if (mgmt->u.action.category & 0x80)
@@ -2066,20 +2217,8 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
2066 2217
2067 ieee80211_tx_skb(rx->sdata, nskb); 2218 ieee80211_tx_skb(rx->sdata, nskb);
2068 } 2219 }
2069
2070 handled:
2071 if (rx->sta)
2072 rx->sta->rx_packets++;
2073 dev_kfree_skb(rx->skb); 2220 dev_kfree_skb(rx->skb);
2074 return RX_QUEUED; 2221 return RX_QUEUED;
2075
2076 queue:
2077 rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
2078 skb_queue_tail(&sdata->skb_queue, rx->skb);
2079 ieee80211_queue_work(&local->hw, &sdata->work);
2080 if (rx->sta)
2081 rx->sta->rx_packets++;
2082 return RX_QUEUED;
2083} 2222}
2084 2223
2085static ieee80211_rx_result debug_noinline 2224static ieee80211_rx_result debug_noinline
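The comment in ieee80211_rx_h_action_return above refers to the 802.11-2007 7.3.1.11 rule: an unhandled action frame is echoed back to its sender with the 0x80 bit ORed into the category, and a frame whose category already has that bit set was itself a rejection and must not be bounced again. A toy illustration of just that rule, using a made-up stand-in for the real ieee80211_mgmt layout:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct action_frame {
        uint8_t da[6], sa[6];           /* destination / source addresses */
        uint8_t category;
};

/* returns 1 if a rejection frame was produced in *reply */
static int build_action_reject(const struct action_frame *rx,
                               struct action_frame *reply)
{
        if (rx->category & 0x80)        /* do not return rejected frames */
                return 0;

        *reply = *rx;
        memcpy(reply->da, rx->sa, 6);   /* send it back to the sender */
        memcpy(reply->sa, rx->da, 6);
        reply->category |= 0x80;        /* mark it as unknown/rejected */
        return 1;
}

int main(void)
{
        struct action_frame rx = {
                .da = { 0x02, 0, 0, 0, 0, 1 },
                .sa = { 0x02, 0, 0, 0, 0, 2 },
                .category = 0x7f,       /* assume an unhandled category */
        };
        struct action_frame reply;

        if (build_action_reject(&rx, &reply))
                printf("bounce category %#x back\n", reply.category);
        return 0;
}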
@@ -2090,15 +2229,6 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
2090 struct ieee80211_mgmt *mgmt = (void *)rx->skb->data; 2229 struct ieee80211_mgmt *mgmt = (void *)rx->skb->data;
2091 __le16 stype; 2230 __le16 stype;
2092 2231
2093 if (!(rx->flags & IEEE80211_RX_RA_MATCH))
2094 return RX_DROP_MONITOR;
2095
2096 if (rx->skb->len < 24)
2097 return RX_DROP_MONITOR;
2098
2099 if (ieee80211_drop_unencrypted_mgmt(rx))
2100 return RX_DROP_UNUSABLE;
2101
2102 rxs = ieee80211_work_rx_mgmt(rx->sdata, rx->skb); 2232 rxs = ieee80211_work_rx_mgmt(rx->sdata, rx->skb);
2103 if (rxs != RX_CONTINUE) 2233 if (rxs != RX_CONTINUE)
2104 return rxs; 2234 return rxs;
@@ -2199,8 +2329,13 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
2199 struct net_device *prev_dev = NULL; 2329 struct net_device *prev_dev = NULL;
2200 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); 2330 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
2201 2331
2202 if (status->flag & RX_FLAG_INTERNAL_CMTR) 2332 /*
2333 * If cooked monitor has been processed already, then
2334 * don't do it again. If not, set the flag.
2335 */
2336 if (rx->flags & IEEE80211_RX_CMNTR)
2203 goto out_free_skb; 2337 goto out_free_skb;
2338 rx->flags |= IEEE80211_RX_CMNTR;
2204 2339
2205 if (skb_headroom(skb) < sizeof(*rthdr) && 2340 if (skb_headroom(skb) < sizeof(*rthdr) &&
2206 pskb_expand_head(skb, sizeof(*rthdr), 0, GFP_ATOMIC)) 2341 pskb_expand_head(skb, sizeof(*rthdr), 0, GFP_ATOMIC))
@@ -2256,30 +2391,53 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
2256 if (prev_dev) { 2391 if (prev_dev) {
2257 skb->dev = prev_dev; 2392 skb->dev = prev_dev;
2258 netif_receive_skb(skb); 2393 netif_receive_skb(skb);
2259 skb = NULL; 2394 return;
2260 } else 2395 }
2261 goto out_free_skb;
2262
2263 status->flag |= RX_FLAG_INTERNAL_CMTR;
2264 return;
2265 2396
2266 out_free_skb: 2397 out_free_skb:
2267 dev_kfree_skb(skb); 2398 dev_kfree_skb(skb);
2268} 2399}
2269 2400
2401static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx,
2402 ieee80211_rx_result res)
2403{
2404 switch (res) {
2405 case RX_DROP_MONITOR:
2406 I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop);
2407 if (rx->sta)
2408 rx->sta->rx_dropped++;
2409 /* fall through */
2410 case RX_CONTINUE: {
2411 struct ieee80211_rate *rate = NULL;
2412 struct ieee80211_supported_band *sband;
2413 struct ieee80211_rx_status *status;
2414
2415 status = IEEE80211_SKB_RXCB((rx->skb));
2416
2417 sband = rx->local->hw.wiphy->bands[status->band];
2418 if (!(status->flag & RX_FLAG_HT))
2419 rate = &sband->bitrates[status->rate_idx];
2420
2421 ieee80211_rx_cooked_monitor(rx, rate);
2422 break;
2423 }
2424 case RX_DROP_UNUSABLE:
2425 I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop);
2426 if (rx->sta)
2427 rx->sta->rx_dropped++;
2428 dev_kfree_skb(rx->skb);
2429 break;
2430 case RX_QUEUED:
2431 I802_DEBUG_INC(rx->sdata->local->rx_handlers_queued);
2432 break;
2433 }
2434}
2270 2435
2271static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata, 2436static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx,
2272 struct ieee80211_rx_data *rx, 2437 struct sk_buff_head *frames)
2273 struct sk_buff *skb,
2274 struct ieee80211_rate *rate)
2275{ 2438{
2276 struct sk_buff_head reorder_release;
2277 ieee80211_rx_result res = RX_DROP_MONITOR; 2439 ieee80211_rx_result res = RX_DROP_MONITOR;
2278 2440 struct sk_buff *skb;
2279 __skb_queue_head_init(&reorder_release);
2280
2281 rx->skb = skb;
2282 rx->sdata = sdata;
2283 2441
2284#define CALL_RXH(rxh) \ 2442#define CALL_RXH(rxh) \
2285 do { \ 2443 do { \
@@ -2288,23 +2446,14 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata,
2288 goto rxh_next; \ 2446 goto rxh_next; \
2289 } while (0); 2447 } while (0);
2290 2448
2291 /* 2449 while ((skb = __skb_dequeue(frames))) {
2292 * NB: the rxh_next label works even if we jump
2293 * to it from here because then the list will
2294 * be empty, which is a trivial check
2295 */
2296 CALL_RXH(ieee80211_rx_h_passive_scan)
2297 CALL_RXH(ieee80211_rx_h_check)
2298
2299 ieee80211_rx_reorder_ampdu(rx, &reorder_release);
2300
2301 while ((skb = __skb_dequeue(&reorder_release))) {
2302 /* 2450 /*
2303 * all the other fields are valid across frames 2451 * all the other fields are valid across frames
2304 * that belong to an aMPDU since they are on the 2452 * that belong to an aMPDU since they are on the
2305 * same TID from the same station 2453 * same TID from the same station
2306 */ 2454 */
2307 rx->skb = skb; 2455 rx->skb = skb;
2456 rx->flags = 0;
2308 2457
2309 CALL_RXH(ieee80211_rx_h_decrypt) 2458 CALL_RXH(ieee80211_rx_h_decrypt)
2310 CALL_RXH(ieee80211_rx_h_check_more_data) 2459 CALL_RXH(ieee80211_rx_h_check_more_data)
@@ -2316,50 +2465,92 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata,
2316 CALL_RXH(ieee80211_rx_h_remove_qos_control) 2465 CALL_RXH(ieee80211_rx_h_remove_qos_control)
2317 CALL_RXH(ieee80211_rx_h_amsdu) 2466 CALL_RXH(ieee80211_rx_h_amsdu)
2318#ifdef CONFIG_MAC80211_MESH 2467#ifdef CONFIG_MAC80211_MESH
2319 if (ieee80211_vif_is_mesh(&sdata->vif)) 2468 if (ieee80211_vif_is_mesh(&rx->sdata->vif))
2320 CALL_RXH(ieee80211_rx_h_mesh_fwding); 2469 CALL_RXH(ieee80211_rx_h_mesh_fwding);
2321#endif 2470#endif
2322 CALL_RXH(ieee80211_rx_h_data) 2471 CALL_RXH(ieee80211_rx_h_data)
2323 2472
2324 /* special treatment -- needs the queue */ 2473 /* special treatment -- needs the queue */
2325 res = ieee80211_rx_h_ctrl(rx, &reorder_release); 2474 res = ieee80211_rx_h_ctrl(rx, frames);
2326 if (res != RX_CONTINUE) 2475 if (res != RX_CONTINUE)
2327 goto rxh_next; 2476 goto rxh_next;
2328 2477
2478 CALL_RXH(ieee80211_rx_h_mgmt_check)
2329 CALL_RXH(ieee80211_rx_h_action) 2479 CALL_RXH(ieee80211_rx_h_action)
2480 CALL_RXH(ieee80211_rx_h_userspace_mgmt)
2481 CALL_RXH(ieee80211_rx_h_action_return)
2330 CALL_RXH(ieee80211_rx_h_mgmt) 2482 CALL_RXH(ieee80211_rx_h_mgmt)
2331 2483
2484 rxh_next:
2485 ieee80211_rx_handlers_result(rx, res);
2486
2332#undef CALL_RXH 2487#undef CALL_RXH
2488 }
2489}
2490
2491static void ieee80211_invoke_rx_handlers(struct ieee80211_rx_data *rx)
2492{
2493 struct sk_buff_head reorder_release;
2494 ieee80211_rx_result res = RX_DROP_MONITOR;
2495
2496 __skb_queue_head_init(&reorder_release);
2497
2498#define CALL_RXH(rxh) \
2499 do { \
2500 res = rxh(rx); \
2501 if (res != RX_CONTINUE) \
2502 goto rxh_next; \
2503 } while (0);
2504
2505 CALL_RXH(ieee80211_rx_h_passive_scan)
2506 CALL_RXH(ieee80211_rx_h_check)
2507
2508 ieee80211_rx_reorder_ampdu(rx, &reorder_release);
2509
2510 ieee80211_rx_handlers(rx, &reorder_release);
2511 return;
2333 2512
2334 rxh_next: 2513 rxh_next:
2335 switch (res) { 2514 ieee80211_rx_handlers_result(rx, res);
2336 case RX_DROP_MONITOR: 2515
2337 I802_DEBUG_INC(sdata->local->rx_handlers_drop); 2516#undef CALL_RXH
2338 if (rx->sta) 2517}
2339 rx->sta->rx_dropped++; 2518
2340 /* fall through */ 2519/*
2341 case RX_CONTINUE: 2520 * This function makes calls into the RX path. Therefore the
2342 ieee80211_rx_cooked_monitor(rx, rate); 2521 * caller must hold the sta_info->lock and everything has to
2343 break; 2522 * be under rcu_read_lock protection as well.
2344 case RX_DROP_UNUSABLE: 2523 */
2345 I802_DEBUG_INC(sdata->local->rx_handlers_drop); 2524void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid)
2346 if (rx->sta) 2525{
2347 rx->sta->rx_dropped++; 2526 struct sk_buff_head frames;
2348 dev_kfree_skb(rx->skb); 2527 struct ieee80211_rx_data rx = {
2349 break; 2528 .sta = sta,
2350 case RX_QUEUED: 2529 .sdata = sta->sdata,
2351 I802_DEBUG_INC(sdata->local->rx_handlers_queued); 2530 .local = sta->local,
2352 break; 2531 .queue = tid,
2353 } 2532 };
2354 } 2533 struct tid_ampdu_rx *tid_agg_rx;
2534
2535 tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
2536 if (!tid_agg_rx)
2537 return;
2538
2539 __skb_queue_head_init(&frames);
2540
2541 spin_lock(&tid_agg_rx->reorder_lock);
2542 ieee80211_sta_reorder_release(&sta->local->hw, tid_agg_rx, &frames);
2543 spin_unlock(&tid_agg_rx->reorder_lock);
2544
2545 ieee80211_rx_handlers(&rx, &frames);
2355} 2546}
2356 2547
2357/* main receive path */ 2548/* main receive path */
2358 2549
2359static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata, 2550static int prepare_for_handlers(struct ieee80211_rx_data *rx,
2360 struct ieee80211_rx_data *rx,
2361 struct ieee80211_hdr *hdr) 2551 struct ieee80211_hdr *hdr)
2362{ 2552{
2553 struct ieee80211_sub_if_data *sdata = rx->sdata;
2363 struct sk_buff *skb = rx->skb; 2554 struct sk_buff *skb = rx->skb;
2364 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); 2555 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
2365 u8 *bssid = ieee80211_get_bssid(hdr, skb->len, sdata->vif.type); 2556 u8 *bssid = ieee80211_get_bssid(hdr, skb->len, sdata->vif.type);
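The reworked handler chain above keeps the existing CALL_RXH() idiom: every handler returns a result code, RX_CONTINUE means "run the next handler", and anything else jumps to a single result-handling step (now ieee80211_rx_handlers_result()). A self-contained model of that control flow, with placeholder handler names and a trimmed-down rx_data struct rather than the real mac80211 types:

#include <stdio.h>

enum rx_result { RX_CONTINUE, RX_QUEUED, RX_DROP_UNUSABLE, RX_DROP_MONITOR };

struct rx_data { int decrypted; int is_mgmt; };

static enum rx_result rxh_decrypt(struct rx_data *rx)
{
        rx->decrypted = 1;
        return RX_CONTINUE;             /* keep going down the chain */
}

static enum rx_result rxh_mgmt(struct rx_data *rx)
{
        return rx->is_mgmt ? RX_QUEUED : RX_DROP_MONITOR;
}

/* one place that accounts for drops/queued frames, like the patch adds */
static void handle_result(struct rx_data *rx, enum rx_result res)
{
        (void)rx;
        printf("chain finished with result %d\n", res);
}

#define CALL_RXH(rxh)                   \
        do {                            \
                res = rxh(rx);          \
                if (res != RX_CONTINUE) \
                        goto rxh_next;  \
        } while (0)

static void invoke_rx_handlers(struct rx_data *rx)
{
        enum rx_result res = RX_DROP_MONITOR;

        CALL_RXH(rxh_decrypt);
        CALL_RXH(rxh_mgmt);

rxh_next:
        handle_result(rx, res);
}

int main(void)
{
        struct rx_data frame = { .is_mgmt = 1 };

        invoke_rx_handlers(&frame);
        return 0;
}

Splitting the result handling into its own function is what lets the patch run the same chain both from the normal RX path and from the new reorder-timeout path.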
@@ -2373,7 +2564,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
2373 compare_ether_addr(sdata->vif.addr, hdr->addr1) != 0) { 2564 compare_ether_addr(sdata->vif.addr, hdr->addr1) != 0) {
2374 if (!(sdata->dev->flags & IFF_PROMISC)) 2565 if (!(sdata->dev->flags & IFF_PROMISC))
2375 return 0; 2566 return 0;
2376 rx->flags &= ~IEEE80211_RX_RA_MATCH; 2567 status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
2377 } 2568 }
2378 break; 2569 break;
2379 case NL80211_IFTYPE_ADHOC: 2570 case NL80211_IFTYPE_ADHOC:
@@ -2383,15 +2574,15 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
2383 return 1; 2574 return 1;
2384 } 2575 }
2385 else if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) { 2576 else if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) {
2386 if (!(rx->flags & IEEE80211_RX_IN_SCAN)) 2577 if (!(status->rx_flags & IEEE80211_RX_IN_SCAN))
2387 return 0; 2578 return 0;
2388 rx->flags &= ~IEEE80211_RX_RA_MATCH; 2579 status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
2389 } else if (!multicast && 2580 } else if (!multicast &&
2390 compare_ether_addr(sdata->vif.addr, 2581 compare_ether_addr(sdata->vif.addr,
2391 hdr->addr1) != 0) { 2582 hdr->addr1) != 0) {
2392 if (!(sdata->dev->flags & IFF_PROMISC)) 2583 if (!(sdata->dev->flags & IFF_PROMISC))
2393 return 0; 2584 return 0;
2394 rx->flags &= ~IEEE80211_RX_RA_MATCH; 2585 status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
2395 } else if (!rx->sta) { 2586 } else if (!rx->sta) {
2396 int rate_idx; 2587 int rate_idx;
2397 if (status->flag & RX_FLAG_HT) 2588 if (status->flag & RX_FLAG_HT)
@@ -2409,7 +2600,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
2409 if (!(sdata->dev->flags & IFF_PROMISC)) 2600 if (!(sdata->dev->flags & IFF_PROMISC))
2410 return 0; 2601 return 0;
2411 2602
2412 rx->flags &= ~IEEE80211_RX_RA_MATCH; 2603 status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
2413 } 2604 }
2414 break; 2605 break;
2415 case NL80211_IFTYPE_AP_VLAN: 2606 case NL80211_IFTYPE_AP_VLAN:
@@ -2420,9 +2611,9 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
2420 return 0; 2611 return 0;
2421 } else if (!ieee80211_bssid_match(bssid, 2612 } else if (!ieee80211_bssid_match(bssid,
2422 sdata->vif.addr)) { 2613 sdata->vif.addr)) {
2423 if (!(rx->flags & IEEE80211_RX_IN_SCAN)) 2614 if (!(status->rx_flags & IEEE80211_RX_IN_SCAN))
2424 return 0; 2615 return 0;
2425 rx->flags &= ~IEEE80211_RX_RA_MATCH; 2616 status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
2426 } 2617 }
2427 break; 2618 break;
2428 case NL80211_IFTYPE_WDS: 2619 case NL80211_IFTYPE_WDS:
@@ -2431,9 +2622,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
2431 if (compare_ether_addr(sdata->u.wds.remote_addr, hdr->addr2)) 2622 if (compare_ether_addr(sdata->u.wds.remote_addr, hdr->addr2))
2432 return 0; 2623 return 0;
2433 break; 2624 break;
2434 case NL80211_IFTYPE_MONITOR: 2625 default:
2435 case NL80211_IFTYPE_UNSPECIFIED:
2436 case __NL80211_IFTYPE_AFTER_LAST:
2437 /* should never get here */ 2626 /* should never get here */
2438 WARN_ON(1); 2627 WARN_ON(1);
2439 break; 2628 break;
@@ -2443,12 +2632,56 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
2443} 2632}
2444 2633
2445/* 2634/*
2635 * This function returns whether or not the SKB
2636 * was destined for RX processing or not, which,
2637 * if consume is true, is equivalent to whether
2638 * or not the skb was consumed.
2639 */
2640static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx,
2641 struct sk_buff *skb, bool consume)
2642{
2643 struct ieee80211_local *local = rx->local;
2644 struct ieee80211_sub_if_data *sdata = rx->sdata;
2645 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
2646 struct ieee80211_hdr *hdr = (void *)skb->data;
2647 int prepares;
2648
2649 rx->skb = skb;
2650 status->rx_flags |= IEEE80211_RX_RA_MATCH;
2651 prepares = prepare_for_handlers(rx, hdr);
2652
2653 if (!prepares)
2654 return false;
2655
2656 if (status->flag & RX_FLAG_MMIC_ERROR) {
2657 if (status->rx_flags & IEEE80211_RX_RA_MATCH)
2658 ieee80211_rx_michael_mic_report(hdr, rx);
2659 return false;
2660 }
2661
2662 if (!consume) {
2663 skb = skb_copy(skb, GFP_ATOMIC);
2664 if (!skb) {
2665 if (net_ratelimit())
2666 wiphy_debug(local->hw.wiphy,
2667 "failed to copy multicast frame for %s\n",
2668 sdata->name);
2669 return true;
2670 }
2671
2672 rx->skb = skb;
2673 }
2674
2675 ieee80211_invoke_rx_handlers(rx);
2676 return true;
2677}
2678
2679/*
2446 * This is the actual Rx frames handler. as it blongs to Rx path it must 2680 * This is the actual Rx frames handler. as it blongs to Rx path it must
2447 * be called with rcu_read_lock protection. 2681 * be called with rcu_read_lock protection.
2448 */ 2682 */
2449static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, 2683static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
2450 struct sk_buff *skb, 2684 struct sk_buff *skb)
2451 struct ieee80211_rate *rate)
2452{ 2685{
2453 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); 2686 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
2454 struct ieee80211_local *local = hw_to_local(hw); 2687 struct ieee80211_local *local = hw_to_local(hw);
@@ -2456,11 +2689,8 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
2456 struct ieee80211_hdr *hdr; 2689 struct ieee80211_hdr *hdr;
2457 __le16 fc; 2690 __le16 fc;
2458 struct ieee80211_rx_data rx; 2691 struct ieee80211_rx_data rx;
2459 int prepares; 2692 struct ieee80211_sub_if_data *prev;
2460 struct ieee80211_sub_if_data *prev = NULL; 2693 struct sta_info *sta, *tmp, *prev_sta;
2461 struct sk_buff *skb_new;
2462 struct sta_info *sta, *tmp;
2463 bool found_sta = false;
2464 int err = 0; 2694 int err = 0;
2465 2695
2466 fc = ((struct ieee80211_hdr *)skb->data)->frame_control; 2696 fc = ((struct ieee80211_hdr *)skb->data)->frame_control;
@@ -2473,7 +2703,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
2473 2703
2474 if (unlikely(test_bit(SCAN_HW_SCANNING, &local->scanning) || 2704 if (unlikely(test_bit(SCAN_HW_SCANNING, &local->scanning) ||
2475 test_bit(SCAN_OFF_CHANNEL, &local->scanning))) 2705 test_bit(SCAN_OFF_CHANNEL, &local->scanning)))
2476 rx.flags |= IEEE80211_RX_IN_SCAN; 2706 status->rx_flags |= IEEE80211_RX_IN_SCAN;
2477 2707
2478 if (ieee80211_is_mgmt(fc)) 2708 if (ieee80211_is_mgmt(fc))
2479 err = skb_linearize(skb); 2709 err = skb_linearize(skb);
@@ -2490,91 +2720,67 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
2490 ieee80211_verify_alignment(&rx); 2720 ieee80211_verify_alignment(&rx);
2491 2721
2492 if (ieee80211_is_data(fc)) { 2722 if (ieee80211_is_data(fc)) {
2723 prev_sta = NULL;
2724
2493 for_each_sta_info(local, hdr->addr2, sta, tmp) { 2725 for_each_sta_info(local, hdr->addr2, sta, tmp) {
2494 rx.sta = sta; 2726 if (!prev_sta) {
2495 found_sta = true; 2727 prev_sta = sta;
2496 rx.sdata = sta->sdata;
2497
2498 rx.flags |= IEEE80211_RX_RA_MATCH;
2499 prepares = prepare_for_handlers(rx.sdata, &rx, hdr);
2500 if (prepares) {
2501 if (status->flag & RX_FLAG_MMIC_ERROR) {
2502 if (rx.flags & IEEE80211_RX_RA_MATCH)
2503 ieee80211_rx_michael_mic_report(hdr, &rx);
2504 } else
2505 prev = rx.sdata;
2506 }
2507 }
2508 }
2509 if (!found_sta) {
2510 list_for_each_entry_rcu(sdata, &local->interfaces, list) {
2511 if (!ieee80211_sdata_running(sdata))
2512 continue; 2728 continue;
2729 }
2513 2730
2514 if (sdata->vif.type == NL80211_IFTYPE_MONITOR || 2731 rx.sta = prev_sta;
2515 sdata->vif.type == NL80211_IFTYPE_AP_VLAN) 2732 rx.sdata = prev_sta->sdata;
2516 continue; 2733 ieee80211_prepare_and_rx_handle(&rx, skb, false);
2517 2734
2518 /* 2735 prev_sta = sta;
2519 * frame is destined for this interface, but if it's 2736 }
2520 * not also for the previous one we handle that after
2521 * the loop to avoid copying the SKB once too much
2522 */
2523 2737
2524 if (!prev) { 2738 if (prev_sta) {
2525 prev = sdata; 2739 rx.sta = prev_sta;
2526 continue; 2740 rx.sdata = prev_sta->sdata;
2527 }
2528 2741
2529 rx.sta = sta_info_get_bss(prev, hdr->addr2); 2742 if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
2743 return;
2744 }
2745 }
2530 2746
2531 rx.flags |= IEEE80211_RX_RA_MATCH; 2747 prev = NULL;
2532 prepares = prepare_for_handlers(prev, &rx, hdr);
2533 2748
2534 if (!prepares) 2749 list_for_each_entry_rcu(sdata, &local->interfaces, list) {
2535 goto next; 2750 if (!ieee80211_sdata_running(sdata))
2751 continue;
2536 2752
2537 if (status->flag & RX_FLAG_MMIC_ERROR) { 2753 if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
2538 rx.sdata = prev; 2754 sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
2539 if (rx.flags & IEEE80211_RX_RA_MATCH) 2755 continue;
2540 ieee80211_rx_michael_mic_report(hdr,
2541 &rx);
2542 goto next;
2543 }
2544 2756
2545 /* 2757 /*
2546 * frame was destined for the previous interface 2758 * frame is destined for this interface, but if it's
2547 * so invoke RX handlers for it 2759 * not also for the previous one we handle that after
2548 */ 2760 * the loop to avoid copying the SKB once too much
2761 */
2549 2762
2550 skb_new = skb_copy(skb, GFP_ATOMIC); 2763 if (!prev) {
2551 if (!skb_new) {
2552 if (net_ratelimit())
2553 printk(KERN_DEBUG "%s: failed to copy "
2554 "multicast frame for %s\n",
2555 wiphy_name(local->hw.wiphy),
2556 prev->name);
2557 goto next;
2558 }
2559 ieee80211_invoke_rx_handlers(prev, &rx, skb_new, rate);
2560next:
2561 prev = sdata; 2764 prev = sdata;
2765 continue;
2562 } 2766 }
2563 2767
2564 if (prev) { 2768 rx.sta = sta_info_get_bss(prev, hdr->addr2);
2565 rx.sta = sta_info_get_bss(prev, hdr->addr2); 2769 rx.sdata = prev;
2770 ieee80211_prepare_and_rx_handle(&rx, skb, false);
2566 2771
2567 rx.flags |= IEEE80211_RX_RA_MATCH; 2772 prev = sdata;
2568 prepares = prepare_for_handlers(prev, &rx, hdr); 2773 }
2569 2774
2570 if (!prepares) 2775 if (prev) {
2571 prev = NULL; 2776 rx.sta = sta_info_get_bss(prev, hdr->addr2);
2572 } 2777 rx.sdata = prev;
2778
2779 if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
2780 return;
2573 } 2781 }
2574 if (prev) 2782
2575 ieee80211_invoke_rx_handlers(prev, &rx, skb, rate); 2783 dev_kfree_skb(skb);
2576 else
2577 dev_kfree_skb(skb);
2578} 2784}
2579 2785
2580/* 2786/*
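The rewritten __ieee80211_rx_handle_packet() above keeps the older copy-versus-consume pattern but expresses it through ieee80211_prepare_and_rx_handle(): when one received frame matches several stations or interfaces, every receiver but the last gets its own copy and the last match consumes the original, which is freed only if nobody accepted it. A standalone sketch of that ownership rule, with an invented "frame" type and deliver() callback standing in for the skb machinery:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct frame { char payload[32]; };

static struct frame *frame_copy(const struct frame *f)
{
        struct frame *c = malloc(sizeof(*c));

        if (c)
                memcpy(c, f, sizeof(*c));
        return c;
}

/* a receiver owns (and frees) whatever frame it is handed */
static void deliver(int receiver, struct frame *f)
{
        printf("receiver %d got \"%s\"\n", receiver, f->payload);
        free(f);
}

static void rx_handle_packet(struct frame *f, const int *match, int n)
{
        int prev = -1;

        for (int i = 0; i < n; i++) {
                if (!match[i])
                        continue;
                if (prev >= 0) {
                        struct frame *c = frame_copy(f);

                        if (c)                  /* copy for the earlier match */
                                deliver(prev, c);
                }
                prev = i;
        }

        if (prev >= 0)
                deliver(prev, f);               /* last match consumes it */
        else
                free(f);                        /* nobody matched: drop it */
}

int main(void)
{
        struct frame *f = frame_copy(&(struct frame){ .payload = "beacon" });
        int match[] = { 1, 0, 1 };              /* receivers 0 and 2 match */

        if (!f)
                return 1;
        rx_handle_packet(f, match, 3);
        return 0;
}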
@@ -2615,30 +2821,41 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb)
2615 if (WARN_ON(!local->started)) 2821 if (WARN_ON(!local->started))
2616 goto drop; 2822 goto drop;
2617 2823
2618 if (status->flag & RX_FLAG_HT) { 2824 if (likely(!(status->flag & RX_FLAG_FAILED_PLCP_CRC))) {
2619 /* 2825 /*
2620 * rate_idx is MCS index, which can be [0-76] as documented on: 2826 * Validate the rate, unless a PLCP error means that
2621 * 2827 * we probably can't have a valid rate here anyway.
2622 * http://wireless.kernel.org/en/developers/Documentation/ieee80211/802.11n
2623 *
2624 * Anything else would be some sort of driver or hardware error.
2625 * The driver should catch hardware errors.
2626 */ 2828 */
2627 if (WARN((status->rate_idx < 0 || 2829
2628 status->rate_idx > 76), 2830 if (status->flag & RX_FLAG_HT) {
2629 "Rate marked as an HT rate but passed " 2831 /*
2630 "status->rate_idx is not " 2832 * rate_idx is MCS index, which can be [0-76]
2631 "an MCS index [0-76]: %d (0x%02x)\n", 2833 * as documented on:
2632 status->rate_idx, 2834 *
2633 status->rate_idx)) 2835 * http://wireless.kernel.org/en/developers/Documentation/ieee80211/802.11n
2634 goto drop; 2836 *
2635 } else { 2837 * Anything else would be some sort of driver or
2636 if (WARN_ON(status->rate_idx < 0 || 2838 * hardware error. The driver should catch hardware
2637 status->rate_idx >= sband->n_bitrates)) 2839 * errors.
2638 goto drop; 2840 */
2639 rate = &sband->bitrates[status->rate_idx]; 2841 if (WARN((status->rate_idx < 0 ||
2842 status->rate_idx > 76),
2843 "Rate marked as an HT rate but passed "
2844 "status->rate_idx is not "
2845 "an MCS index [0-76]: %d (0x%02x)\n",
2846 status->rate_idx,
2847 status->rate_idx))
2848 goto drop;
2849 } else {
2850 if (WARN_ON(status->rate_idx < 0 ||
2851 status->rate_idx >= sband->n_bitrates))
2852 goto drop;
2853 rate = &sband->bitrates[status->rate_idx];
2854 }
2640 } 2855 }
2641 2856
2857 status->rx_flags = 0;
2858
2642 /* 2859 /*
2643 * key references and virtual interfaces are protected using RCU 2860 * key references and virtual interfaces are protected using RCU
2644 * and this requires that we are in a read-side RCU section during 2861 * and this requires that we are in a read-side RCU section during
@@ -2658,7 +2875,7 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb)
2658 return; 2875 return;
2659 } 2876 }
2660 2877
2661 __ieee80211_rx_handle_packet(hw, skb, rate); 2878 __ieee80211_rx_handle_packet(hw, skb);
2662 2879
2663 rcu_read_unlock(); 2880 rcu_read_unlock();
2664 2881
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 872d7b6ef6b3..fb274db77e3c 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -242,20 +242,19 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local)
242 local->hw_scan_req->n_channels = n_chans; 242 local->hw_scan_req->n_channels = n_chans;
243 243
244 ielen = ieee80211_build_preq_ies(local, (u8 *)local->hw_scan_req->ie, 244 ielen = ieee80211_build_preq_ies(local, (u8 *)local->hw_scan_req->ie,
245 req->ie, req->ie_len, band); 245 req->ie, req->ie_len, band, (u32) -1,
246 0);
246 local->hw_scan_req->ie_len = ielen; 247 local->hw_scan_req->ie_len = ielen;
247 248
248 return true; 249 return true;
249} 250}
250 251
251void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) 252static bool __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted,
253 bool was_hw_scan)
252{ 254{
253 struct ieee80211_local *local = hw_to_local(hw); 255 struct ieee80211_local *local = hw_to_local(hw);
254 bool was_hw_scan;
255
256 trace_api_scan_completed(local, aborted);
257 256
258 mutex_lock(&local->scan_mtx); 257 lockdep_assert_held(&local->mtx);
259 258
260 /* 259 /*
261 * It's ok to abort a not-yet-running scan (that 260 * It's ok to abort a not-yet-running scan (that
@@ -266,17 +265,13 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
266 if (WARN_ON(!local->scanning && !aborted)) 265 if (WARN_ON(!local->scanning && !aborted))
267 aborted = true; 266 aborted = true;
268 267
269 if (WARN_ON(!local->scan_req)) { 268 if (WARN_ON(!local->scan_req))
270 mutex_unlock(&local->scan_mtx); 269 return false;
271 return;
272 }
273 270
274 was_hw_scan = test_bit(SCAN_HW_SCANNING, &local->scanning);
275 if (was_hw_scan && !aborted && ieee80211_prep_hw_scan(local)) { 271 if (was_hw_scan && !aborted && ieee80211_prep_hw_scan(local)) {
276 ieee80211_queue_delayed_work(&local->hw, 272 int rc = drv_hw_scan(local, local->scan_sdata, local->hw_scan_req);
277 &local->scan_work, 0); 273 if (rc == 0)
278 mutex_unlock(&local->scan_mtx); 274 return false;
279 return;
280 } 275 }
281 276
282 kfree(local->hw_scan_req); 277 kfree(local->hw_scan_req);
@@ -290,26 +285,42 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
290 local->scanning = 0; 285 local->scanning = 0;
291 local->scan_channel = NULL; 286 local->scan_channel = NULL;
292 287
293 /* we only have to protect scan_req and hw/sw scan */ 288 return true;
294 mutex_unlock(&local->scan_mtx); 289}
295
296 ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
297 if (was_hw_scan)
298 goto done;
299
300 ieee80211_configure_filter(local);
301 290
302 drv_sw_scan_complete(local); 291static void __ieee80211_scan_completed_finish(struct ieee80211_hw *hw,
292 bool was_hw_scan)
293{
294 struct ieee80211_local *local = hw_to_local(hw);
303 295
304 ieee80211_offchannel_return(local, true); 296 ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
297 if (!was_hw_scan) {
298 ieee80211_configure_filter(local);
299 drv_sw_scan_complete(local);
300 ieee80211_offchannel_return(local, true);
301 }
305 302
306 done: 303 mutex_lock(&local->mtx);
307 ieee80211_recalc_idle(local); 304 ieee80211_recalc_idle(local);
305 mutex_unlock(&local->mtx);
306
308 ieee80211_mlme_notify_scan_completed(local); 307 ieee80211_mlme_notify_scan_completed(local);
309 ieee80211_ibss_notify_scan_completed(local); 308 ieee80211_ibss_notify_scan_completed(local);
310 ieee80211_mesh_notify_scan_completed(local); 309 ieee80211_mesh_notify_scan_completed(local);
311 ieee80211_queue_work(&local->hw, &local->work_work); 310 ieee80211_queue_work(&local->hw, &local->work_work);
312} 311}
312
313void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
314{
315 struct ieee80211_local *local = hw_to_local(hw);
316
317 trace_api_scan_completed(local, aborted);
318
319 set_bit(SCAN_COMPLETED, &local->scanning);
320 if (aborted)
321 set_bit(SCAN_ABORTED, &local->scanning);
322 ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0);
323}
313EXPORT_SYMBOL(ieee80211_scan_completed); 324EXPORT_SYMBOL(ieee80211_scan_completed);
314 325
315static int ieee80211_start_sw_scan(struct ieee80211_local *local) 326static int ieee80211_start_sw_scan(struct ieee80211_local *local)
@@ -353,6 +364,8 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
353 struct ieee80211_local *local = sdata->local; 364 struct ieee80211_local *local = sdata->local;
354 int rc; 365 int rc;
355 366
367 lockdep_assert_held(&local->mtx);
368
356 if (local->scan_req) 369 if (local->scan_req)
357 return -EBUSY; 370 return -EBUSY;
358 371
@@ -434,8 +447,8 @@ ieee80211_scan_get_channel_time(struct ieee80211_channel *chan)
434 return IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME; 447 return IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME;
435} 448}
436 449
437static int ieee80211_scan_state_decision(struct ieee80211_local *local, 450static void ieee80211_scan_state_decision(struct ieee80211_local *local,
438 unsigned long *next_delay) 451 unsigned long *next_delay)
439{ 452{
440 bool associated = false; 453 bool associated = false;
441 bool tx_empty = true; 454 bool tx_empty = true;
@@ -445,12 +458,6 @@ static int ieee80211_scan_state_decision(struct ieee80211_local *local,
445 struct ieee80211_sub_if_data *sdata; 458 struct ieee80211_sub_if_data *sdata;
446 struct ieee80211_channel *next_chan; 459 struct ieee80211_channel *next_chan;
447 460
448 /* if no more bands/channels left, complete scan and advance to the idle state */
449 if (local->scan_channel_idx >= local->scan_req->n_channels) {
450 ieee80211_scan_completed(&local->hw, false);
451 return 1;
452 }
453
454 /* 461 /*
455 * check if at least one STA interface is associated, 462 * check if at least one STA interface is associated,
456 * check if at least one STA interface has pending tx frames 463 * check if at least one STA interface has pending tx frames
@@ -522,7 +529,6 @@ static int ieee80211_scan_state_decision(struct ieee80211_local *local,
522 } 529 }
523 530
524 *next_delay = 0; 531 *next_delay = 0;
525 return 0;
526} 532}
527 533
528static void ieee80211_scan_state_leave_oper_channel(struct ieee80211_local *local, 534static void ieee80211_scan_state_leave_oper_channel(struct ieee80211_local *local,
@@ -638,21 +644,18 @@ void ieee80211_scan_work(struct work_struct *work)
638 container_of(work, struct ieee80211_local, scan_work.work); 644 container_of(work, struct ieee80211_local, scan_work.work);
639 struct ieee80211_sub_if_data *sdata = local->scan_sdata; 645 struct ieee80211_sub_if_data *sdata = local->scan_sdata;
640 unsigned long next_delay = 0; 646 unsigned long next_delay = 0;
647 bool aborted, hw_scan, finish;
641 648
642 mutex_lock(&local->scan_mtx); 649 mutex_lock(&local->mtx);
643 if (!sdata || !local->scan_req) {
644 mutex_unlock(&local->scan_mtx);
645 return;
646 }
647 650
648 if (local->hw_scan_req) { 651 if (test_and_clear_bit(SCAN_COMPLETED, &local->scanning)) {
649 int rc = drv_hw_scan(local, sdata, local->hw_scan_req); 652 aborted = test_and_clear_bit(SCAN_ABORTED, &local->scanning);
650 mutex_unlock(&local->scan_mtx); 653 goto out_complete;
651 if (rc)
652 ieee80211_scan_completed(&local->hw, true);
653 return;
654 } 654 }
655 655
656 if (!sdata || !local->scan_req)
657 goto out;
658
656 if (local->scan_req && !local->scanning) { 659 if (local->scan_req && !local->scanning) {
657 struct cfg80211_scan_request *req = local->scan_req; 660 struct cfg80211_scan_request *req = local->scan_req;
658 int rc; 661 int rc;
@@ -661,21 +664,21 @@ void ieee80211_scan_work(struct work_struct *work)
661 local->scan_sdata = NULL; 664 local->scan_sdata = NULL;
662 665
663 rc = __ieee80211_start_scan(sdata, req); 666 rc = __ieee80211_start_scan(sdata, req);
664 mutex_unlock(&local->scan_mtx); 667 if (rc) {
665 668 /* need to complete scan in cfg80211 */
666 if (rc) 669 local->scan_req = req;
667 ieee80211_scan_completed(&local->hw, true); 670 aborted = true;
668 return; 671 goto out_complete;
672 } else
673 goto out;
669 } 674 }
670 675
671 mutex_unlock(&local->scan_mtx);
672
673 /* 676 /*
674 * Avoid re-scheduling when the sdata is going away. 677 * Avoid re-scheduling when the sdata is going away.
675 */ 678 */
676 if (!ieee80211_sdata_running(sdata)) { 679 if (!ieee80211_sdata_running(sdata)) {
677 ieee80211_scan_completed(&local->hw, true); 680 aborted = true;
678 return; 681 goto out_complete;
679 } 682 }
680 683
681 /* 684 /*
@@ -685,8 +688,12 @@ void ieee80211_scan_work(struct work_struct *work)
685 do { 688 do {
686 switch (local->next_scan_state) { 689 switch (local->next_scan_state) {
687 case SCAN_DECISION: 690 case SCAN_DECISION:
688 if (ieee80211_scan_state_decision(local, &next_delay)) 691 /* if no more bands/channels left, complete scan */
689 return; 692 if (local->scan_channel_idx >= local->scan_req->n_channels) {
693 aborted = false;
694 goto out_complete;
695 }
696 ieee80211_scan_state_decision(local, &next_delay);
690 break; 697 break;
691 case SCAN_SET_CHANNEL: 698 case SCAN_SET_CHANNEL:
692 ieee80211_scan_state_set_channel(local, &next_delay); 699 ieee80211_scan_state_set_channel(local, &next_delay);
@@ -704,6 +711,19 @@ void ieee80211_scan_work(struct work_struct *work)
704 } while (next_delay == 0); 711 } while (next_delay == 0);
705 712
706 ieee80211_queue_delayed_work(&local->hw, &local->scan_work, next_delay); 713 ieee80211_queue_delayed_work(&local->hw, &local->scan_work, next_delay);
714 mutex_unlock(&local->mtx);
715 return;
716
717out_complete:
718 hw_scan = test_bit(SCAN_HW_SCANNING, &local->scanning);
719 finish = __ieee80211_scan_completed(&local->hw, aborted, hw_scan);
720 mutex_unlock(&local->mtx);
721 if (finish)
722 __ieee80211_scan_completed_finish(&local->hw, hw_scan);
723 return;
724
725out:
726 mutex_unlock(&local->mtx);
707} 727}
708 728
709int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, 729int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
@@ -711,9 +731,9 @@ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
711{ 731{
712 int res; 732 int res;
713 733
714 mutex_lock(&sdata->local->scan_mtx); 734 mutex_lock(&sdata->local->mtx);
715 res = __ieee80211_start_scan(sdata, req); 735 res = __ieee80211_start_scan(sdata, req);
716 mutex_unlock(&sdata->local->scan_mtx); 736 mutex_unlock(&sdata->local->mtx);
717 737
718 return res; 738 return res;
719} 739}
@@ -726,7 +746,7 @@ int ieee80211_request_internal_scan(struct ieee80211_sub_if_data *sdata,
726 int ret = -EBUSY; 746 int ret = -EBUSY;
727 enum ieee80211_band band; 747 enum ieee80211_band band;
728 748
729 mutex_lock(&local->scan_mtx); 749 mutex_lock(&local->mtx);
730 750
731 /* busy scanning */ 751 /* busy scanning */
732 if (local->scan_req) 752 if (local->scan_req)
@@ -761,25 +781,44 @@ int ieee80211_request_internal_scan(struct ieee80211_sub_if_data *sdata,
761 781
762 ret = __ieee80211_start_scan(sdata, sdata->local->int_scan_req); 782 ret = __ieee80211_start_scan(sdata, sdata->local->int_scan_req);
763 unlock: 783 unlock:
764 mutex_unlock(&local->scan_mtx); 784 mutex_unlock(&local->mtx);
765 return ret; 785 return ret;
766} 786}
767 787
788/*
789 * Only call this function when a scan can't be queued -- under RTNL.
790 */
768void ieee80211_scan_cancel(struct ieee80211_local *local) 791void ieee80211_scan_cancel(struct ieee80211_local *local)
769{ 792{
770 bool abortscan; 793 bool abortscan;
771 794 bool finish = false;
772 cancel_delayed_work_sync(&local->scan_work);
773 795
774 /* 796 /*
775 * Only call this function when a scan can't be 797 * We only cancel a software scan, or a deferred scan that has not
776 * queued -- mostly at suspend under RTNL. 798 * yet really started (see __ieee80211_start_scan()).
 799 *
 800 * Regarding hardware scan:
 801 * - we can not call __ieee80211_scan_completed() because when the
 802 * SCAN_HW_SCANNING bit is set that function changes
 803 * local->hw_scan_req to operate on the 5 GHz band, which races
 804 * with the driver, which may still be using local->hw_scan_req
 805 *
 806 * - we can not cancel scan_work since the driver can schedule it
 807 * via ieee80211_scan_completed(..., true) to finish the scan
 808 *
 809 * Hence the low-level driver is responsible for canceling the HW scan.
777 */ 810 */
778 mutex_lock(&local->scan_mtx);
779 abortscan = test_bit(SCAN_SW_SCANNING, &local->scanning) ||
780 (!local->scanning && local->scan_req);
781 mutex_unlock(&local->scan_mtx);
782 811
812 mutex_lock(&local->mtx);
813 abortscan = local->scan_req && !test_bit(SCAN_HW_SCANNING, &local->scanning);
783 if (abortscan) 814 if (abortscan)
784 ieee80211_scan_completed(&local->hw, true); 815 finish = __ieee80211_scan_completed(&local->hw, true, false);
816 mutex_unlock(&local->mtx);
817
818 if (abortscan) {
 819 /* The scan is canceled, but stop the work from remaining pending */
820 cancel_delayed_work_sync(&local->scan_work);
821 }
822 if (finish)
823 __ieee80211_scan_completed_finish(&local->hw, false);
785} 824}
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 6d86f0c1ad04..6d8f897d8763 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -125,7 +125,7 @@ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
125 lockdep_is_held(&local->sta_mtx)); 125 lockdep_is_held(&local->sta_mtx));
126 while (sta) { 126 while (sta) {
127 if ((sta->sdata == sdata || 127 if ((sta->sdata == sdata ||
128 sta->sdata->bss == sdata->bss) && 128 (sta->sdata->bss && sta->sdata->bss == sdata->bss)) &&
129 memcmp(sta->sta.addr, addr, ETH_ALEN) == 0) 129 memcmp(sta->sta.addr, addr, ETH_ALEN) == 0)
130 break; 130 break;
131 sta = rcu_dereference_check(sta->hnext, 131 sta = rcu_dereference_check(sta->hnext,
@@ -174,8 +174,7 @@ static void __sta_info_free(struct ieee80211_local *local,
174 } 174 }
175 175
176#ifdef CONFIG_MAC80211_VERBOSE_DEBUG 176#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
177 printk(KERN_DEBUG "%s: Destroyed STA %pM\n", 177 wiphy_debug(local->hw.wiphy, "Destroyed STA %pM\n", sta->sta.addr);
178 wiphy_name(local->hw.wiphy), sta->sta.addr);
179#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ 178#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
180 179
181 kfree(sta); 180 kfree(sta);
@@ -262,8 +261,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
262 sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX); 261 sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX);
263 262
264#ifdef CONFIG_MAC80211_VERBOSE_DEBUG 263#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
265 printk(KERN_DEBUG "%s: Allocated STA %pM\n", 264 wiphy_debug(local->hw.wiphy, "Allocated STA %pM\n", sta->sta.addr);
266 wiphy_name(local->hw.wiphy), sta->sta.addr);
267#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ 265#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
268 266
269#ifdef CONFIG_MAC80211_MESH 267#ifdef CONFIG_MAC80211_MESH
@@ -282,7 +280,7 @@ static int sta_info_finish_insert(struct sta_info *sta, bool async)
282 unsigned long flags; 280 unsigned long flags;
283 int err = 0; 281 int err = 0;
284 282
285 WARN_ON(!mutex_is_locked(&local->sta_mtx)); 283 lockdep_assert_held(&local->sta_mtx);
286 284
287 /* notify driver */ 285 /* notify driver */
288 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) 286 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
@@ -300,8 +298,9 @@ static int sta_info_finish_insert(struct sta_info *sta, bool async)
300 sta->uploaded = true; 298 sta->uploaded = true;
301#ifdef CONFIG_MAC80211_VERBOSE_DEBUG 299#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
302 if (async) 300 if (async)
303 printk(KERN_DEBUG "%s: Finished adding IBSS STA %pM\n", 301 wiphy_debug(local->hw.wiphy,
304 wiphy_name(local->hw.wiphy), sta->sta.addr); 302 "Finished adding IBSS STA %pM\n",
303 sta->sta.addr);
305#endif 304#endif
306 } 305 }
307 306
@@ -411,8 +410,8 @@ int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU)
411 spin_unlock_irqrestore(&local->sta_lock, flags); 410 spin_unlock_irqrestore(&local->sta_lock, flags);
412 411
413#ifdef CONFIG_MAC80211_VERBOSE_DEBUG 412#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
414 printk(KERN_DEBUG "%s: Added IBSS STA %pM\n", 413 wiphy_debug(local->hw.wiphy, "Added IBSS STA %pM\n",
415 wiphy_name(local->hw.wiphy), sta->sta.addr); 414 sta->sta.addr);
416#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ 415#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
417 416
418 ieee80211_queue_work(&local->hw, &local->sta_finish_work); 417 ieee80211_queue_work(&local->hw, &local->sta_finish_work);
@@ -459,8 +458,7 @@ int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU)
459 } 458 }
460 459
461#ifdef CONFIG_MAC80211_VERBOSE_DEBUG 460#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
462 printk(KERN_DEBUG "%s: Inserted STA %pM\n", 461 wiphy_debug(local->hw.wiphy, "Inserted STA %pM\n", sta->sta.addr);
463 wiphy_name(local->hw.wiphy), sta->sta.addr);
464#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ 462#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
465 463
466 /* move reference to rcu-protected */ 464 /* move reference to rcu-protected */
@@ -618,7 +616,7 @@ static int __must_check __sta_info_destroy(struct sta_info *sta)
618 struct ieee80211_sub_if_data *sdata; 616 struct ieee80211_sub_if_data *sdata;
619 struct sk_buff *skb; 617 struct sk_buff *skb;
620 unsigned long flags; 618 unsigned long flags;
621 int ret; 619 int ret, i;
622 620
623 might_sleep(); 621 might_sleep();
624 622
@@ -635,7 +633,7 @@ static int __must_check __sta_info_destroy(struct sta_info *sta)
635 * will be sufficient. 633 * will be sufficient.
636 */ 634 */
637 set_sta_flags(sta, WLAN_STA_BLOCK_BA); 635 set_sta_flags(sta, WLAN_STA_BLOCK_BA);
638 ieee80211_sta_tear_down_BA_sessions(sta); 636 ieee80211_sta_tear_down_BA_sessions(sta, true);
639 637
640 spin_lock_irqsave(&local->sta_lock, flags); 638 spin_lock_irqsave(&local->sta_lock, flags);
641 ret = sta_info_hash_del(local, sta); 639 ret = sta_info_hash_del(local, sta);
@@ -646,10 +644,10 @@ static int __must_check __sta_info_destroy(struct sta_info *sta)
646 if (ret) 644 if (ret)
647 return ret; 645 return ret;
648 646
649 if (sta->key) { 647 for (i = 0; i < NUM_DEFAULT_KEYS; i++)
650 ieee80211_key_free(local, sta->key); 648 ieee80211_key_free(local, sta->gtk[i]);
651 WARN_ON(sta->key); 649 if (sta->ptk)
652 } 650 ieee80211_key_free(local, sta->ptk);
653 651
654 sta->dead = true; 652 sta->dead = true;
655 653
@@ -690,8 +688,7 @@ static int __must_check __sta_info_destroy(struct sta_info *sta)
690#endif 688#endif
691 689
692#ifdef CONFIG_MAC80211_VERBOSE_DEBUG 690#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
693 printk(KERN_DEBUG "%s: Removed STA %pM\n", 691 wiphy_debug(local->hw.wiphy, "Removed STA %pM\n", sta->sta.addr);
694 wiphy_name(local->hw.wiphy), sta->sta.addr);
695#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ 692#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
696 cancel_work_sync(&sta->drv_unblock_wk); 693 cancel_work_sync(&sta->drv_unblock_wk);
697 694
@@ -841,13 +838,20 @@ void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
841 mutex_unlock(&local->sta_mtx); 838 mutex_unlock(&local->sta_mtx);
842} 839}
843 840
844struct ieee80211_sta *ieee80211_find_sta_by_hw(struct ieee80211_hw *hw, 841struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw,
845 const u8 *addr) 842 const u8 *addr,
843 const u8 *localaddr)
846{ 844{
847 struct sta_info *sta, *nxt; 845 struct sta_info *sta, *nxt;
848 846
849 /* Just return a random station ... first in list ... */ 847 /*
848 * Just return a random station if localaddr is NULL
849 * ... first in list.
850 */
850 for_each_sta_info(hw_to_local(hw), addr, sta, nxt) { 851 for_each_sta_info(hw_to_local(hw), addr, sta, nxt) {
852 if (localaddr &&
853 compare_ether_addr(sta->sdata->vif.addr, localaddr) != 0)
854 continue;
851 if (!sta->uploaded) 855 if (!sta->uploaded)
852 return NULL; 856 return NULL;
853 return &sta->sta; 857 return &sta->sta;
@@ -855,7 +859,7 @@ struct ieee80211_sta *ieee80211_find_sta_by_hw(struct ieee80211_hw *hw,
855 859
856 return NULL; 860 return NULL;
857} 861}
858EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_hw); 862EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_ifaddr);
859 863
860struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif, 864struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif,
861 const u8 *addr) 865 const u8 *addr)
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 54262e72376d..9265acadef32 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -79,6 +79,7 @@ enum ieee80211_sta_info_flags {
79 * @dialog_token: dialog token for aggregation session 79 * @dialog_token: dialog token for aggregation session
80 * @state: session state (see above) 80 * @state: session state (see above)
81 * @stop_initiator: initiator of a session stop 81 * @stop_initiator: initiator of a session stop
82 * @tx_stop: TX DelBA frame when stopping
82 * 83 *
83 * This structure is protected by RCU and the per-station 84 * This structure is protected by RCU and the per-station
84 * spinlock. Assignments to the array holding it must hold 85 * spinlock. Assignments to the array holding it must hold
@@ -95,6 +96,7 @@ struct tid_ampdu_tx {
95 unsigned long state; 96 unsigned long state;
96 u8 dialog_token; 97 u8 dialog_token;
97 u8 stop_initiator; 98 u8 stop_initiator;
99 bool tx_stop;
98}; 100};
99 101
100/** 102/**
@@ -103,6 +105,7 @@ struct tid_ampdu_tx {
103 * @reorder_buf: buffer to reorder incoming aggregated MPDUs 105 * @reorder_buf: buffer to reorder incoming aggregated MPDUs
104 * @reorder_time: jiffies when skb was added 106 * @reorder_time: jiffies when skb was added
105 * @session_timer: check if peer keeps Tx-ing on the TID (by timeout value) 107 * @session_timer: check if peer keeps Tx-ing on the TID (by timeout value)
108 * @reorder_timer: releases expired frames from the reorder buffer.
106 * @head_seq_num: head sequence number in reordering buffer. 109 * @head_seq_num: head sequence number in reordering buffer.
107 * @stored_mpdu_num: number of MPDUs in reordering buffer 110 * @stored_mpdu_num: number of MPDUs in reordering buffer
108 * @ssn: Starting Sequence Number expected to be aggregated. 111 * @ssn: Starting Sequence Number expected to be aggregated.
@@ -110,20 +113,25 @@ struct tid_ampdu_tx {
110 * @timeout: reset timer value (in TUs). 113 * @timeout: reset timer value (in TUs).
111 * @dialog_token: dialog token for aggregation session 114 * @dialog_token: dialog token for aggregation session
112 * @rcu_head: RCU head used for freeing this struct 115 * @rcu_head: RCU head used for freeing this struct
116 * @reorder_lock: serializes access to reorder buffer, see below.
113 * 117 *
114 * This structure is protected by RCU and the per-station 118 * This structure is protected by RCU and the per-station
115 * spinlock. Assignments to the array holding it must hold 119 * spinlock. Assignments to the array holding it must hold
116 * the spinlock, only the RX path can access it under RCU 120 * the spinlock.
117 * lock-free. The RX path, since it is single-threaded, 121 *
118 * can even modify the structure without locking since the 122 * The @reorder_lock is used to protect the variables and
119 * only other modifications to it are done when the struct 123 * arrays such as @reorder_buf, @reorder_time, @head_seq_num
120 * can not yet or no longer be found by the RX path. 124 * and @stored_mpdu_num from being corrupted by
125 * concurrent access by the RX path and the expired frame
126 * release timer.
121 */ 127 */
122struct tid_ampdu_rx { 128struct tid_ampdu_rx {
123 struct rcu_head rcu_head; 129 struct rcu_head rcu_head;
130 spinlock_t reorder_lock;
124 struct sk_buff **reorder_buf; 131 struct sk_buff **reorder_buf;
125 unsigned long *reorder_time; 132 unsigned long *reorder_time;
126 struct timer_list session_timer; 133 struct timer_list session_timer;
134 struct timer_list reorder_timer;
127 u16 head_seq_num; 135 u16 head_seq_num;
128 u16 stored_mpdu_num; 136 u16 stored_mpdu_num;
129 u16 ssn; 137 u16 ssn;
@@ -191,7 +199,8 @@ enum plink_state {
191 * @hnext: hash table linked list pointer 199 * @hnext: hash table linked list pointer
192 * @local: pointer to the global information 200 * @local: pointer to the global information
193 * @sdata: virtual interface this station belongs to 201 * @sdata: virtual interface this station belongs to
194 * @key: peer key negotiated with this station, if any 202 * @ptk: peer key negotiated with this station, if any
203 * @gtk: group keys negotiated with this station, if any
195 * @rate_ctrl: rate control algorithm reference 204 * @rate_ctrl: rate control algorithm reference
196 * @rate_ctrl_priv: rate control private per-STA pointer 205 * @rate_ctrl_priv: rate control private per-STA pointer
197 * @last_tx_rate: rate used for last transmit, to report to userspace as 206 * @last_tx_rate: rate used for last transmit, to report to userspace as
@@ -246,7 +255,8 @@ struct sta_info {
246 struct sta_info *hnext; 255 struct sta_info *hnext;
247 struct ieee80211_local *local; 256 struct ieee80211_local *local;
248 struct ieee80211_sub_if_data *sdata; 257 struct ieee80211_sub_if_data *sdata;
249 struct ieee80211_key *key; 258 struct ieee80211_key *gtk[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS];
259 struct ieee80211_key *ptk;
250 struct rate_control_ref *rate_ctrl; 260 struct rate_control_ref *rate_ctrl;
251 void *rate_ctrl_priv; 261 void *rate_ctrl_priv;
252 spinlock_t lock; 262 spinlock_t lock;
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 10caec5ea8fa..3153c19893b8 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -58,6 +58,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
58 info->control.vif = &sta->sdata->vif; 58 info->control.vif = &sta->sdata->vif;
59 info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING | 59 info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING |
60 IEEE80211_TX_INTFL_RETRANSMISSION; 60 IEEE80211_TX_INTFL_RETRANSMISSION;
61 info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS;
61 62
62 sta->tx_filtered_count++; 63 sta->tx_filtered_count++;
63 64
@@ -114,11 +115,10 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
114 115
115#ifdef CONFIG_MAC80211_VERBOSE_DEBUG 116#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
116 if (net_ratelimit()) 117 if (net_ratelimit())
117 printk(KERN_DEBUG "%s: dropped TX filtered frame, " 118 wiphy_debug(local->hw.wiphy,
118 "queue_len=%d PS=%d @%lu\n", 119 "dropped TX filtered frame, queue_len=%d PS=%d @%lu\n",
119 wiphy_name(local->hw.wiphy), 120 skb_queue_len(&sta->tx_filtered),
120 skb_queue_len(&sta->tx_filtered), 121 !!test_sta_flags(sta, WLAN_STA_PS_STA), jiffies);
121 !!test_sta_flags(sta, WLAN_STA_PS_STA), jiffies);
122#endif 122#endif
123 dev_kfree_skb(skb); 123 dev_kfree_skb(skb);
124} 124}
@@ -176,7 +176,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
176 176
177 for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { 177 for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
178 /* the HW cannot have attempted that rate */ 178 /* the HW cannot have attempted that rate */
179 if (i >= hw->max_rates) { 179 if (i >= hw->max_report_rates) {
180 info->status.rates[i].idx = -1; 180 info->status.rates[i].idx = -1;
181 info->status.rates[i].count = 0; 181 info->status.rates[i].count = 0;
182 } else if (info->status.rates[i].idx >= 0) { 182 } else if (info->status.rates[i].idx >= 0) {
@@ -296,7 +296,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
296 } 296 }
297 297
298 if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX) 298 if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX)
299 cfg80211_action_tx_status( 299 cfg80211_mgmt_tx_status(
300 skb->dev, (unsigned long) skb, skb->data, skb->len, 300 skb->dev, (unsigned long) skb, skb->data, skb->len,
301 !!(info->flags & IEEE80211_TX_STAT_ACK), GFP_ATOMIC); 301 !!(info->flags & IEEE80211_TX_STAT_ACK), GFP_ATOMIC);
302 302
@@ -377,7 +377,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
377 skb2 = skb_clone(skb, GFP_ATOMIC); 377 skb2 = skb_clone(skb, GFP_ATOMIC);
378 if (skb2) { 378 if (skb2) {
379 skb2->dev = prev_dev; 379 skb2->dev = prev_dev;
380 netif_receive_skb(skb2); 380 netif_rx(skb2);
381 } 381 }
382 } 382 }
383 383
@@ -386,7 +386,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
386 } 386 }
387 if (prev_dev) { 387 if (prev_dev) {
388 skb->dev = prev_dev; 388 skb->dev = prev_dev;
389 netif_receive_skb(skb); 389 netif_rx(skb);
390 skb = NULL; 390 skb = NULL;
391 } 391 }
392 rcu_read_unlock(); 392 rcu_read_unlock();
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index c54db966926b..96c594309506 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -273,6 +273,9 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx)
273 */ 273 */
274 return TX_DROP; 274 return TX_DROP;
275 275
276 if (tx->sdata->vif.type == NL80211_IFTYPE_WDS)
277 return TX_CONTINUE;
278
276 if (tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT) 279 if (tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
277 return TX_CONTINUE; 280 return TX_CONTINUE;
278 281
@@ -351,8 +354,8 @@ static void purge_old_ps_buffers(struct ieee80211_local *local)
351 354
352 local->total_ps_buffered = total; 355 local->total_ps_buffered = total;
353#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG 356#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
354 printk(KERN_DEBUG "%s: PS buffers full - purged %d frames\n", 357 wiphy_debug(local->hw.wiphy, "PS buffers full - purged %d frames\n",
355 wiphy_name(local->hw.wiphy), purged); 358 purged);
356#endif 359#endif
357} 360}
358 361
@@ -509,6 +512,18 @@ ieee80211_tx_h_ps_buf(struct ieee80211_tx_data *tx)
509} 512}
510 513
511static ieee80211_tx_result debug_noinline 514static ieee80211_tx_result debug_noinline
515ieee80211_tx_h_check_control_port_protocol(struct ieee80211_tx_data *tx)
516{
517 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
518
519 if (unlikely(tx->sdata->control_port_protocol == tx->skb->protocol &&
520 tx->sdata->control_port_no_encrypt))
521 info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
522
523 return TX_CONTINUE;
524}
525
526static ieee80211_tx_result debug_noinline
512ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) 527ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
513{ 528{
514 struct ieee80211_key *key = NULL; 529 struct ieee80211_key *key = NULL;
@@ -517,7 +532,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
517 532
518 if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT)) 533 if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT))
519 tx->key = NULL; 534 tx->key = NULL;
520 else if (tx->sta && (key = rcu_dereference(tx->sta->key))) 535 else if (tx->sta && (key = rcu_dereference(tx->sta->ptk)))
521 tx->key = key; 536 tx->key = key;
522 else if (ieee80211_is_mgmt(hdr->frame_control) && 537 else if (ieee80211_is_mgmt(hdr->frame_control) &&
523 is_multicast_ether_addr(hdr->addr1) && 538 is_multicast_ether_addr(hdr->addr1) &&
@@ -527,7 +542,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
527 else if ((key = rcu_dereference(tx->sdata->default_key))) 542 else if ((key = rcu_dereference(tx->sdata->default_key)))
528 tx->key = key; 543 tx->key = key;
529 else if (tx->sdata->drop_unencrypted && 544 else if (tx->sdata->drop_unencrypted &&
530 (tx->skb->protocol != cpu_to_be16(ETH_P_PAE)) && 545 (tx->skb->protocol != tx->sdata->control_port_protocol) &&
531 !(info->flags & IEEE80211_TX_CTL_INJECTED) && 546 !(info->flags & IEEE80211_TX_CTL_INJECTED) &&
532 (!ieee80211_is_robust_mgmt_frame(hdr) || 547 (!ieee80211_is_robust_mgmt_frame(hdr) ||
533 (ieee80211_is_action(hdr->frame_control) && 548 (ieee80211_is_action(hdr->frame_control) &&
@@ -543,15 +558,16 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
543 tx->key->tx_rx_count++; 558 tx->key->tx_rx_count++;
544 /* TODO: add threshold stuff again */ 559 /* TODO: add threshold stuff again */
545 560
546 switch (tx->key->conf.alg) { 561 switch (tx->key->conf.cipher) {
547 case ALG_WEP: 562 case WLAN_CIPHER_SUITE_WEP40:
563 case WLAN_CIPHER_SUITE_WEP104:
548 if (ieee80211_is_auth(hdr->frame_control)) 564 if (ieee80211_is_auth(hdr->frame_control))
549 break; 565 break;
550 case ALG_TKIP: 566 case WLAN_CIPHER_SUITE_TKIP:
551 if (!ieee80211_is_data_present(hdr->frame_control)) 567 if (!ieee80211_is_data_present(hdr->frame_control))
552 tx->key = NULL; 568 tx->key = NULL;
553 break; 569 break;
554 case ALG_CCMP: 570 case WLAN_CIPHER_SUITE_CCMP:
555 if (!ieee80211_is_data_present(hdr->frame_control) && 571 if (!ieee80211_is_data_present(hdr->frame_control) &&
556 !ieee80211_use_mfp(hdr->frame_control, tx->sta, 572 !ieee80211_use_mfp(hdr->frame_control, tx->sta,
557 tx->skb)) 573 tx->skb))
@@ -561,7 +577,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
561 IEEE80211_KEY_FLAG_SW_MGMT) && 577 IEEE80211_KEY_FLAG_SW_MGMT) &&
562 ieee80211_is_mgmt(hdr->frame_control); 578 ieee80211_is_mgmt(hdr->frame_control);
563 break; 579 break;
564 case ALG_AES_CMAC: 580 case WLAN_CIPHER_SUITE_AES_CMAC:
565 if (!ieee80211_is_mgmt(hdr->frame_control)) 581 if (!ieee80211_is_mgmt(hdr->frame_control))
566 tx->key = NULL; 582 tx->key = NULL;
567 break; 583 break;
@@ -946,22 +962,31 @@ ieee80211_tx_h_stats(struct ieee80211_tx_data *tx)
946static ieee80211_tx_result debug_noinline 962static ieee80211_tx_result debug_noinline
947ieee80211_tx_h_encrypt(struct ieee80211_tx_data *tx) 963ieee80211_tx_h_encrypt(struct ieee80211_tx_data *tx)
948{ 964{
965 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
966
949 if (!tx->key) 967 if (!tx->key)
950 return TX_CONTINUE; 968 return TX_CONTINUE;
951 969
952 switch (tx->key->conf.alg) { 970 switch (tx->key->conf.cipher) {
953 case ALG_WEP: 971 case WLAN_CIPHER_SUITE_WEP40:
972 case WLAN_CIPHER_SUITE_WEP104:
954 return ieee80211_crypto_wep_encrypt(tx); 973 return ieee80211_crypto_wep_encrypt(tx);
955 case ALG_TKIP: 974 case WLAN_CIPHER_SUITE_TKIP:
956 return ieee80211_crypto_tkip_encrypt(tx); 975 return ieee80211_crypto_tkip_encrypt(tx);
957 case ALG_CCMP: 976 case WLAN_CIPHER_SUITE_CCMP:
958 return ieee80211_crypto_ccmp_encrypt(tx); 977 return ieee80211_crypto_ccmp_encrypt(tx);
959 case ALG_AES_CMAC: 978 case WLAN_CIPHER_SUITE_AES_CMAC:
960 return ieee80211_crypto_aes_cmac_encrypt(tx); 979 return ieee80211_crypto_aes_cmac_encrypt(tx);
980 default:
981 /* handle hw-only algorithm */
982 if (info->control.hw_key) {
983 ieee80211_tx_set_protected(tx);
984 return TX_CONTINUE;
985 }
986 break;
987
961 } 988 }
962 989
963 /* not reached */
964 WARN_ON(1);
965 return TX_DROP; 990 return TX_DROP;
966} 991}
967 992
@@ -1339,6 +1364,7 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
1339 CALL_TXH(ieee80211_tx_h_dynamic_ps); 1364 CALL_TXH(ieee80211_tx_h_dynamic_ps);
1340 CALL_TXH(ieee80211_tx_h_check_assoc); 1365 CALL_TXH(ieee80211_tx_h_check_assoc);
1341 CALL_TXH(ieee80211_tx_h_ps_buf); 1366 CALL_TXH(ieee80211_tx_h_ps_buf);
1367 CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
1342 CALL_TXH(ieee80211_tx_h_select_key); 1368 CALL_TXH(ieee80211_tx_h_select_key);
1343 if (!(tx->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)) 1369 if (!(tx->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL))
1344 CALL_TXH(ieee80211_tx_h_rate_ctrl); 1370 CALL_TXH(ieee80211_tx_h_rate_ctrl);
@@ -1511,8 +1537,8 @@ static int ieee80211_skb_resize(struct ieee80211_local *local,
1511 I802_DEBUG_INC(local->tx_expand_skb_head); 1537 I802_DEBUG_INC(local->tx_expand_skb_head);
1512 1538
1513 if (pskb_expand_head(skb, head_need, tail_need, GFP_ATOMIC)) { 1539 if (pskb_expand_head(skb, head_need, tail_need, GFP_ATOMIC)) {
1514 printk(KERN_DEBUG "%s: failed to reallocate TX buffer\n", 1540 wiphy_debug(local->hw.wiphy,
1515 wiphy_name(local->hw.wiphy)); 1541 "failed to reallocate TX buffer\n");
1516 return -ENOMEM; 1542 return -ENOMEM;
1517 } 1543 }
1518 1544
@@ -1586,6 +1612,7 @@ static void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
1586 return; 1612 return;
1587 } 1613 }
1588 1614
1615 hdr = (struct ieee80211_hdr *) skb->data;
1589 info->control.vif = &sdata->vif; 1616 info->control.vif = &sdata->vif;
1590 1617
1591 if (ieee80211_vif_is_mesh(&sdata->vif) && 1618 if (ieee80211_vif_is_mesh(&sdata->vif) &&
@@ -1699,7 +1726,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
1699 u16 ethertype, hdrlen, meshhdrlen = 0; 1726 u16 ethertype, hdrlen, meshhdrlen = 0;
1700 __le16 fc; 1727 __le16 fc;
1701 struct ieee80211_hdr hdr; 1728 struct ieee80211_hdr hdr;
1702 struct ieee80211s_hdr mesh_hdr; 1729 struct ieee80211s_hdr mesh_hdr __maybe_unused;
1703 const u8 *encaps_data; 1730 const u8 *encaps_data;
1704 int encaps_len, skip_header_bytes; 1731 int encaps_len, skip_header_bytes;
1705 int nh_pos, h_pos; 1732 int nh_pos, h_pos;
@@ -1816,7 +1843,8 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
1816#endif 1843#endif
1817 case NL80211_IFTYPE_STATION: 1844 case NL80211_IFTYPE_STATION:
1818 memcpy(hdr.addr1, sdata->u.mgd.bssid, ETH_ALEN); 1845 memcpy(hdr.addr1, sdata->u.mgd.bssid, ETH_ALEN);
1819 if (sdata->u.mgd.use_4addr && ethertype != ETH_P_PAE) { 1846 if (sdata->u.mgd.use_4addr &&
1847 cpu_to_be16(ethertype) != sdata->control_port_protocol) {
1820 fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); 1848 fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS);
1821 /* RA TA DA SA */ 1849 /* RA TA DA SA */
1822 memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN); 1850 memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN);
@@ -1869,7 +1897,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
1869 if (!ieee80211_vif_is_mesh(&sdata->vif) && 1897 if (!ieee80211_vif_is_mesh(&sdata->vif) &&
1870 unlikely(!is_multicast_ether_addr(hdr.addr1) && 1898 unlikely(!is_multicast_ether_addr(hdr.addr1) &&
1871 !(sta_flags & WLAN_STA_AUTHORIZED) && 1899 !(sta_flags & WLAN_STA_AUTHORIZED) &&
1872 !(ethertype == ETH_P_PAE && 1900 !(cpu_to_be16(ethertype) == sdata->control_port_protocol &&
1873 compare_ether_addr(sdata->vif.addr, 1901 compare_ether_addr(sdata->vif.addr,
1874 skb->data + ETH_ALEN) == 0))) { 1902 skb->data + ETH_ALEN) == 0))) {
1875#ifdef CONFIG_MAC80211_VERBOSE_DEBUG 1903#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
@@ -2068,8 +2096,7 @@ void ieee80211_tx_pending(unsigned long data)
2068 2096
2069 if (skb_queue_empty(&local->pending[i])) 2097 if (skb_queue_empty(&local->pending[i]))
2070 list_for_each_entry_rcu(sdata, &local->interfaces, list) 2098 list_for_each_entry_rcu(sdata, &local->interfaces, list)
2071 netif_tx_wake_queue( 2099 netif_wake_subqueue(sdata->dev, i);
2072 netdev_get_tx_queue(sdata->dev, i));
2073 } 2100 }
2074 spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); 2101 spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
2075 2102
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 748387d45bc0..0b6fc92bc0d7 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -283,8 +283,11 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
283 283
284 if (skb_queue_empty(&local->pending[queue])) { 284 if (skb_queue_empty(&local->pending[queue])) {
285 rcu_read_lock(); 285 rcu_read_lock();
286 list_for_each_entry_rcu(sdata, &local->interfaces, list) 286 list_for_each_entry_rcu(sdata, &local->interfaces, list) {
287 netif_tx_wake_queue(netdev_get_tx_queue(sdata->dev, queue)); 287 if (test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state))
288 continue;
289 netif_wake_subqueue(sdata->dev, queue);
290 }
288 rcu_read_unlock(); 291 rcu_read_unlock();
289 } else 292 } else
290 tasklet_schedule(&local->tx_pending_tasklet); 293 tasklet_schedule(&local->tx_pending_tasklet);
@@ -323,7 +326,7 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue,
323 326
324 rcu_read_lock(); 327 rcu_read_lock();
325 list_for_each_entry_rcu(sdata, &local->interfaces, list) 328 list_for_each_entry_rcu(sdata, &local->interfaces, list)
326 netif_tx_stop_queue(netdev_get_tx_queue(sdata->dev, queue)); 329 netif_stop_subqueue(sdata->dev, queue);
327 rcu_read_unlock(); 330 rcu_read_unlock();
328} 331}
329 332
@@ -471,16 +474,10 @@ void ieee80211_iterate_active_interfaces(
471 474
472 list_for_each_entry(sdata, &local->interfaces, list) { 475 list_for_each_entry(sdata, &local->interfaces, list) {
473 switch (sdata->vif.type) { 476 switch (sdata->vif.type) {
474 case __NL80211_IFTYPE_AFTER_LAST:
475 case NL80211_IFTYPE_UNSPECIFIED:
476 case NL80211_IFTYPE_MONITOR: 477 case NL80211_IFTYPE_MONITOR:
477 case NL80211_IFTYPE_AP_VLAN: 478 case NL80211_IFTYPE_AP_VLAN:
478 continue; 479 continue;
479 case NL80211_IFTYPE_AP: 480 default:
480 case NL80211_IFTYPE_STATION:
481 case NL80211_IFTYPE_ADHOC:
482 case NL80211_IFTYPE_WDS:
483 case NL80211_IFTYPE_MESH_POINT:
484 break; 481 break;
485 } 482 }
486 if (ieee80211_sdata_running(sdata)) 483 if (ieee80211_sdata_running(sdata))
@@ -505,16 +502,10 @@ void ieee80211_iterate_active_interfaces_atomic(
505 502
506 list_for_each_entry_rcu(sdata, &local->interfaces, list) { 503 list_for_each_entry_rcu(sdata, &local->interfaces, list) {
507 switch (sdata->vif.type) { 504 switch (sdata->vif.type) {
508 case __NL80211_IFTYPE_AFTER_LAST:
509 case NL80211_IFTYPE_UNSPECIFIED:
510 case NL80211_IFTYPE_MONITOR: 505 case NL80211_IFTYPE_MONITOR:
511 case NL80211_IFTYPE_AP_VLAN: 506 case NL80211_IFTYPE_AP_VLAN:
512 continue; 507 continue;
513 case NL80211_IFTYPE_AP: 508 default:
514 case NL80211_IFTYPE_STATION:
515 case NL80211_IFTYPE_ADHOC:
516 case NL80211_IFTYPE_WDS:
517 case NL80211_IFTYPE_MESH_POINT:
518 break; 509 break;
519 } 510 }
520 if (ieee80211_sdata_running(sdata)) 511 if (ieee80211_sdata_running(sdata))
@@ -904,26 +895,34 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
904 895
905int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, 896int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
906 const u8 *ie, size_t ie_len, 897 const u8 *ie, size_t ie_len,
907 enum ieee80211_band band) 898 enum ieee80211_band band, u32 rate_mask,
899 u8 channel)
908{ 900{
909 struct ieee80211_supported_band *sband; 901 struct ieee80211_supported_band *sband;
910 u8 *pos; 902 u8 *pos;
911 size_t offset = 0, noffset; 903 size_t offset = 0, noffset;
912 int supp_rates_len, i; 904 int supp_rates_len, i;
905 u8 rates[32];
906 int num_rates;
907 int ext_rates_len;
913 908
914 sband = local->hw.wiphy->bands[band]; 909 sband = local->hw.wiphy->bands[band];
915 910
916 pos = buffer; 911 pos = buffer;
917 912
918 supp_rates_len = min_t(int, sband->n_bitrates, 8); 913 num_rates = 0;
914 for (i = 0; i < sband->n_bitrates; i++) {
915 if ((BIT(i) & rate_mask) == 0)
916 continue; /* skip rate */
917 rates[num_rates++] = (u8) (sband->bitrates[i].bitrate / 5);
918 }
919
920 supp_rates_len = min_t(int, num_rates, 8);
919 921
920 *pos++ = WLAN_EID_SUPP_RATES; 922 *pos++ = WLAN_EID_SUPP_RATES;
921 *pos++ = supp_rates_len; 923 *pos++ = supp_rates_len;
922 924 memcpy(pos, rates, supp_rates_len);
923 for (i = 0; i < supp_rates_len; i++) { 925 pos += supp_rates_len;
924 int rate = sband->bitrates[i].bitrate;
925 *pos++ = (u8) (rate / 5);
926 }
927 926
928 /* insert "request information" if in custom IEs */ 927 /* insert "request information" if in custom IEs */
929 if (ie && ie_len) { 928 if (ie && ie_len) {
@@ -941,14 +940,18 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
941 offset = noffset; 940 offset = noffset;
942 } 941 }
943 942
944 if (sband->n_bitrates > i) { 943 ext_rates_len = num_rates - supp_rates_len;
944 if (ext_rates_len > 0) {
945 *pos++ = WLAN_EID_EXT_SUPP_RATES; 945 *pos++ = WLAN_EID_EXT_SUPP_RATES;
946 *pos++ = sband->n_bitrates - i; 946 *pos++ = ext_rates_len;
947 memcpy(pos, rates + supp_rates_len, ext_rates_len);
948 pos += ext_rates_len;
949 }
947 950
948 for (; i < sband->n_bitrates; i++) { 951 if (channel && sband->band == IEEE80211_BAND_2GHZ) {
949 int rate = sband->bitrates[i].bitrate; 952 *pos++ = WLAN_EID_DS_PARAMS;
950 *pos++ = (u8) (rate / 5); 953 *pos++ = 1;
951 } 954 *pos++ = channel;
952 } 955 }
953 956
954 /* insert custom IEs that go before HT */ 957 /* insert custom IEs that go before HT */
@@ -1017,6 +1020,7 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
1017 struct ieee80211_mgmt *mgmt; 1020 struct ieee80211_mgmt *mgmt;
1018 size_t buf_len; 1021 size_t buf_len;
1019 u8 *buf; 1022 u8 *buf;
1023 u8 chan;
1020 1024
1021 /* FIXME: come up with a proper value */ 1025 /* FIXME: come up with a proper value */
1022 buf = kmalloc(200 + ie_len, GFP_KERNEL); 1026 buf = kmalloc(200 + ie_len, GFP_KERNEL);
@@ -1026,8 +1030,14 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
1026 return; 1030 return;
1027 } 1031 }
1028 1032
1033 chan = ieee80211_frequency_to_channel(
1034 local->hw.conf.channel->center_freq);
1035
1029 buf_len = ieee80211_build_preq_ies(local, buf, ie, ie_len, 1036 buf_len = ieee80211_build_preq_ies(local, buf, ie, ie_len,
1030 local->hw.conf.channel->band); 1037 local->hw.conf.channel->band,
1038 sdata->rc_rateidx_mask
1039 [local->hw.conf.channel->band],
1040 chan);
1031 1041
1032 skb = ieee80211_probereq_get(&local->hw, &sdata->vif, 1042 skb = ieee80211_probereq_get(&local->hw, &sdata->vif,
1033 ssid, ssid_len, 1043 ssid, ssid_len,
@@ -1189,7 +1199,9 @@ int ieee80211_reconfig(struct ieee80211_local *local)
1189 /* ignore virtual */ 1199 /* ignore virtual */
1190 break; 1200 break;
1191 case NL80211_IFTYPE_UNSPECIFIED: 1201 case NL80211_IFTYPE_UNSPECIFIED:
1192 case __NL80211_IFTYPE_AFTER_LAST: 1202 case NUM_NL80211_IFTYPES:
1203 case NL80211_IFTYPE_P2P_CLIENT:
1204 case NL80211_IFTYPE_P2P_GO:
1193 WARN_ON(1); 1205 WARN_ON(1);
1194 break; 1206 break;
1195 } 1207 }
@@ -1209,7 +1221,7 @@ int ieee80211_reconfig(struct ieee80211_local *local)
1209 mutex_lock(&local->sta_mtx); 1221 mutex_lock(&local->sta_mtx);
1210 1222
1211 list_for_each_entry(sta, &local->sta_list, list) { 1223 list_for_each_entry(sta, &local->sta_list, list) {
1212 ieee80211_sta_tear_down_BA_sessions(sta); 1224 ieee80211_sta_tear_down_BA_sessions(sta, true);
1213 clear_sta_flags(sta, WLAN_STA_BLOCK_BA); 1225 clear_sta_flags(sta, WLAN_STA_BLOCK_BA);
1214 } 1226 }
1215 1227
@@ -1285,17 +1297,13 @@ static int check_mgd_smps(struct ieee80211_if_managed *ifmgd,
1285} 1297}
1286 1298
1287/* must hold iflist_mtx */ 1299/* must hold iflist_mtx */
1288void ieee80211_recalc_smps(struct ieee80211_local *local, 1300void ieee80211_recalc_smps(struct ieee80211_local *local)
1289 struct ieee80211_sub_if_data *forsdata)
1290{ 1301{
1291 struct ieee80211_sub_if_data *sdata; 1302 struct ieee80211_sub_if_data *sdata;
1292 enum ieee80211_smps_mode smps_mode = IEEE80211_SMPS_OFF; 1303 enum ieee80211_smps_mode smps_mode = IEEE80211_SMPS_OFF;
1293 int count = 0; 1304 int count = 0;
1294 1305
1295 if (forsdata) 1306 lockdep_assert_held(&local->iflist_mtx);
1296 WARN_ON(!mutex_is_locked(&forsdata->u.mgd.mtx));
1297
1298 WARN_ON(!mutex_is_locked(&local->iflist_mtx));
1299 1307
1300 /* 1308 /*
1301 * This function could be improved to handle multiple 1309 * This function could be improved to handle multiple
@@ -1308,22 +1316,12 @@ void ieee80211_recalc_smps(struct ieee80211_local *local,
1308 */ 1316 */
1309 1317
1310 list_for_each_entry(sdata, &local->interfaces, list) { 1318 list_for_each_entry(sdata, &local->interfaces, list) {
1311 if (!netif_running(sdata->dev)) 1319 if (!ieee80211_sdata_running(sdata))
1312 continue; 1320 continue;
1313 if (sdata->vif.type != NL80211_IFTYPE_STATION) 1321 if (sdata->vif.type != NL80211_IFTYPE_STATION)
1314 goto set; 1322 goto set;
1315 if (sdata != forsdata) { 1323
1316 /* 1324 count += check_mgd_smps(&sdata->u.mgd, &smps_mode);
1317 * This nested is ok -- we are holding the iflist_mtx
1318 * so can't get here twice or so. But it's required
1319 * since normally we acquire it first and then the
1320 * iflist_mtx.
1321 */
1322 mutex_lock_nested(&sdata->u.mgd.mtx, SINGLE_DEPTH_NESTING);
1323 count += check_mgd_smps(&sdata->u.mgd, &smps_mode);
1324 mutex_unlock(&sdata->u.mgd.mtx);
1325 } else
1326 count += check_mgd_smps(&sdata->u.mgd, &smps_mode);
1327 1325
1328 if (count > 1) { 1326 if (count > 1) {
1329 smps_mode = IEEE80211_SMPS_OFF; 1327 smps_mode = IEEE80211_SMPS_OFF;
diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c
index 9ebc8d8a1f5b..2ff6d1e3ed21 100644
--- a/net/mac80211/wep.c
+++ b/net/mac80211/wep.c
@@ -222,7 +222,7 @@ static int ieee80211_wep_decrypt(struct ieee80211_local *local,
222 struct ieee80211_key *key) 222 struct ieee80211_key *key)
223{ 223{
224 u32 klen; 224 u32 klen;
225 u8 *rc4key; 225 u8 rc4key[3 + WLAN_KEY_LEN_WEP104];
226 u8 keyidx; 226 u8 keyidx;
227 struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; 227 struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
228 unsigned int hdrlen; 228 unsigned int hdrlen;
@@ -240,15 +240,11 @@ static int ieee80211_wep_decrypt(struct ieee80211_local *local,
240 240
241 keyidx = skb->data[hdrlen + 3] >> 6; 241 keyidx = skb->data[hdrlen + 3] >> 6;
242 242
243 if (!key || keyidx != key->conf.keyidx || key->conf.alg != ALG_WEP) 243 if (!key || keyidx != key->conf.keyidx)
244 return -1; 244 return -1;
245 245
246 klen = 3 + key->conf.keylen; 246 klen = 3 + key->conf.keylen;
247 247
248 rc4key = kmalloc(klen, GFP_ATOMIC);
249 if (!rc4key)
250 return -1;
251
252 /* Prepend 24-bit IV to RC4 key */ 248 /* Prepend 24-bit IV to RC4 key */
253 memcpy(rc4key, skb->data + hdrlen, 3); 249 memcpy(rc4key, skb->data + hdrlen, 3);
254 250
@@ -260,8 +256,6 @@ static int ieee80211_wep_decrypt(struct ieee80211_local *local,
260 len)) 256 len))
261 ret = -1; 257 ret = -1;
262 258
263 kfree(rc4key);
264
265 /* Trim ICV */ 259 /* Trim ICV */
266 skb_trim(skb, skb->len - WEP_ICV_LEN); 260 skb_trim(skb, skb->len - WEP_ICV_LEN);
267 261
diff --git a/net/mac80211/work.c b/net/mac80211/work.c
index 81d4ad64184a..ae344d1ba056 100644
--- a/net/mac80211/work.c
+++ b/net/mac80211/work.c
@@ -43,7 +43,7 @@ enum work_action {
43/* utils */ 43/* utils */
44static inline void ASSERT_WORK_MTX(struct ieee80211_local *local) 44static inline void ASSERT_WORK_MTX(struct ieee80211_local *local)
45{ 45{
46 WARN_ON(!mutex_is_locked(&local->work_mtx)); 46 lockdep_assert_held(&local->mtx);
47} 47}
48 48
49/* 49/*
@@ -757,7 +757,7 @@ static void ieee80211_work_rx_queued_mgmt(struct ieee80211_local *local,
757 mgmt = (struct ieee80211_mgmt *) skb->data; 757 mgmt = (struct ieee80211_mgmt *) skb->data;
758 fc = le16_to_cpu(mgmt->frame_control); 758 fc = le16_to_cpu(mgmt->frame_control);
759 759
760 mutex_lock(&local->work_mtx); 760 mutex_lock(&local->mtx);
761 761
762 list_for_each_entry(wk, &local->work_list, list) { 762 list_for_each_entry(wk, &local->work_list, list) {
763 const u8 *bssid = NULL; 763 const u8 *bssid = NULL;
@@ -833,7 +833,7 @@ static void ieee80211_work_rx_queued_mgmt(struct ieee80211_local *local,
833 WARN(1, "unexpected: %d", rma); 833 WARN(1, "unexpected: %d", rma);
834 } 834 }
835 835
836 mutex_unlock(&local->work_mtx); 836 mutex_unlock(&local->mtx);
837 837
838 if (rma != WORK_ACT_DONE) 838 if (rma != WORK_ACT_DONE)
839 goto out; 839 goto out;
@@ -845,9 +845,9 @@ static void ieee80211_work_rx_queued_mgmt(struct ieee80211_local *local,
845 case WORK_DONE_REQUEUE: 845 case WORK_DONE_REQUEUE:
846 synchronize_rcu(); 846 synchronize_rcu();
847 wk->started = false; /* restart */ 847 wk->started = false; /* restart */
848 mutex_lock(&local->work_mtx); 848 mutex_lock(&local->mtx);
849 list_add_tail(&wk->list, &local->work_list); 849 list_add_tail(&wk->list, &local->work_list);
850 mutex_unlock(&local->work_mtx); 850 mutex_unlock(&local->mtx);
851 } 851 }
852 852
853 out: 853 out:
@@ -888,9 +888,9 @@ static void ieee80211_work_work(struct work_struct *work)
888 while ((skb = skb_dequeue(&local->work_skb_queue))) 888 while ((skb = skb_dequeue(&local->work_skb_queue)))
889 ieee80211_work_rx_queued_mgmt(local, skb); 889 ieee80211_work_rx_queued_mgmt(local, skb);
890 890
891 ieee80211_recalc_idle(local); 891 mutex_lock(&local->mtx);
892 892
893 mutex_lock(&local->work_mtx); 893 ieee80211_recalc_idle(local);
894 894
895 list_for_each_entry_safe(wk, tmp, &local->work_list, list) { 895 list_for_each_entry_safe(wk, tmp, &local->work_list, list) {
896 bool started = wk->started; 896 bool started = wk->started;
@@ -995,20 +995,16 @@ static void ieee80211_work_work(struct work_struct *work)
995 run_again(local, jiffies + HZ/2); 995 run_again(local, jiffies + HZ/2);
996 } 996 }
997 997
998 mutex_lock(&local->scan_mtx);
999
1000 if (list_empty(&local->work_list) && local->scan_req && 998 if (list_empty(&local->work_list) && local->scan_req &&
1001 !local->scanning) 999 !local->scanning)
1002 ieee80211_queue_delayed_work(&local->hw, 1000 ieee80211_queue_delayed_work(&local->hw,
1003 &local->scan_work, 1001 &local->scan_work,
1004 round_jiffies_relative(0)); 1002 round_jiffies_relative(0));
1005 1003
1006 mutex_unlock(&local->scan_mtx);
1007
1008 mutex_unlock(&local->work_mtx);
1009
1010 ieee80211_recalc_idle(local); 1004 ieee80211_recalc_idle(local);
1011 1005
1006 mutex_unlock(&local->mtx);
1007
1012 list_for_each_entry_safe(wk, tmp, &free_work, list) { 1008 list_for_each_entry_safe(wk, tmp, &free_work, list) {
1013 wk->done(wk, NULL); 1009 wk->done(wk, NULL);
1014 list_del(&wk->list); 1010 list_del(&wk->list);
@@ -1035,16 +1031,15 @@ void ieee80211_add_work(struct ieee80211_work *wk)
1035 wk->started = false; 1031 wk->started = false;
1036 1032
1037 local = wk->sdata->local; 1033 local = wk->sdata->local;
1038 mutex_lock(&local->work_mtx); 1034 mutex_lock(&local->mtx);
1039 list_add_tail(&wk->list, &local->work_list); 1035 list_add_tail(&wk->list, &local->work_list);
1040 mutex_unlock(&local->work_mtx); 1036 mutex_unlock(&local->mtx);
1041 1037
1042 ieee80211_queue_work(&local->hw, &local->work_work); 1038 ieee80211_queue_work(&local->hw, &local->work_work);
1043} 1039}
1044 1040
1045void ieee80211_work_init(struct ieee80211_local *local) 1041void ieee80211_work_init(struct ieee80211_local *local)
1046{ 1042{
1047 mutex_init(&local->work_mtx);
1048 INIT_LIST_HEAD(&local->work_list); 1043 INIT_LIST_HEAD(&local->work_list);
1049 setup_timer(&local->work_timer, ieee80211_work_timer, 1044 setup_timer(&local->work_timer, ieee80211_work_timer,
1050 (unsigned long)local); 1045 (unsigned long)local);
@@ -1057,7 +1052,7 @@ void ieee80211_work_purge(struct ieee80211_sub_if_data *sdata)
1057 struct ieee80211_local *local = sdata->local; 1052 struct ieee80211_local *local = sdata->local;
1058 struct ieee80211_work *wk; 1053 struct ieee80211_work *wk;
1059 1054
1060 mutex_lock(&local->work_mtx); 1055 mutex_lock(&local->mtx);
1061 list_for_each_entry(wk, &local->work_list, list) { 1056 list_for_each_entry(wk, &local->work_list, list) {
1062 if (wk->sdata != sdata) 1057 if (wk->sdata != sdata)
1063 continue; 1058 continue;
@@ -1065,19 +1060,19 @@ void ieee80211_work_purge(struct ieee80211_sub_if_data *sdata)
1065 wk->started = true; 1060 wk->started = true;
1066 wk->timeout = jiffies; 1061 wk->timeout = jiffies;
1067 } 1062 }
1068 mutex_unlock(&local->work_mtx); 1063 mutex_unlock(&local->mtx);
1069 1064
1070 /* run cleanups etc. */ 1065 /* run cleanups etc. */
1071 ieee80211_work_work(&local->work_work); 1066 ieee80211_work_work(&local->work_work);
1072 1067
1073 mutex_lock(&local->work_mtx); 1068 mutex_lock(&local->mtx);
1074 list_for_each_entry(wk, &local->work_list, list) { 1069 list_for_each_entry(wk, &local->work_list, list) {
1075 if (wk->sdata != sdata) 1070 if (wk->sdata != sdata)
1076 continue; 1071 continue;
1077 WARN_ON(1); 1072 WARN_ON(1);
1078 break; 1073 break;
1079 } 1074 }
1080 mutex_unlock(&local->work_mtx); 1075 mutex_unlock(&local->mtx);
1081} 1076}
1082 1077
1083ieee80211_rx_result ieee80211_work_rx_mgmt(struct ieee80211_sub_if_data *sdata, 1078ieee80211_rx_result ieee80211_work_rx_mgmt(struct ieee80211_sub_if_data *sdata,
@@ -1163,7 +1158,7 @@ int ieee80211_wk_cancel_remain_on_channel(struct ieee80211_sub_if_data *sdata,
1163 struct ieee80211_work *wk, *tmp; 1158 struct ieee80211_work *wk, *tmp;
1164 bool found = false; 1159 bool found = false;
1165 1160
1166 mutex_lock(&local->work_mtx); 1161 mutex_lock(&local->mtx);
1167 list_for_each_entry_safe(wk, tmp, &local->work_list, list) { 1162 list_for_each_entry_safe(wk, tmp, &local->work_list, list) {
1168 if ((unsigned long) wk == cookie) { 1163 if ((unsigned long) wk == cookie) {
1169 wk->timeout = jiffies; 1164 wk->timeout = jiffies;
@@ -1171,7 +1166,7 @@ int ieee80211_wk_cancel_remain_on_channel(struct ieee80211_sub_if_data *sdata,
1171 break; 1166 break;
1172 } 1167 }
1173 } 1168 }
1174 mutex_unlock(&local->work_mtx); 1169 mutex_unlock(&local->mtx);
1175 1170
1176 if (!found) 1171 if (!found)
1177 return -ENOENT; 1172 return -ENOENT;
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 8d59d27d887e..bee230d8fd11 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -36,8 +36,8 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
36 int tail; 36 int tail;
37 37
38 hdr = (struct ieee80211_hdr *)skb->data; 38 hdr = (struct ieee80211_hdr *)skb->data;
39 if (!tx->key || tx->key->conf.alg != ALG_TKIP || skb->len < 24 || 39 if (!tx->key || tx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP ||
40 !ieee80211_is_data_present(hdr->frame_control)) 40 skb->len < 24 || !ieee80211_is_data_present(hdr->frame_control))
41 return TX_CONTINUE; 41 return TX_CONTINUE;
42 42
43 hdrlen = ieee80211_hdrlen(hdr->frame_control); 43 hdrlen = ieee80211_hdrlen(hdr->frame_control);
@@ -94,7 +94,7 @@ ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx)
94 if (status->flag & RX_FLAG_MMIC_STRIPPED) 94 if (status->flag & RX_FLAG_MMIC_STRIPPED)
95 return RX_CONTINUE; 95 return RX_CONTINUE;
96 96
97 if (!rx->key || rx->key->conf.alg != ALG_TKIP || 97 if (!rx->key || rx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP ||
98 !ieee80211_has_protected(hdr->frame_control) || 98 !ieee80211_has_protected(hdr->frame_control) ||
99 !ieee80211_is_data_present(hdr->frame_control)) 99 !ieee80211_is_data_present(hdr->frame_control))
100 return RX_CONTINUE; 100 return RX_CONTINUE;
@@ -117,7 +117,7 @@ ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx)
117 key = &rx->key->conf.key[key_offset]; 117 key = &rx->key->conf.key[key_offset];
118 michael_mic(key, hdr, data, data_len, mic); 118 michael_mic(key, hdr, data, data_len, mic);
119 if (memcmp(mic, data + data_len, MICHAEL_MIC_LEN) != 0 || wpa_test) { 119 if (memcmp(mic, data + data_len, MICHAEL_MIC_LEN) != 0 || wpa_test) {
120 if (!(rx->flags & IEEE80211_RX_RA_MATCH)) 120 if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
121 return RX_DROP_UNUSABLE; 121 return RX_DROP_UNUSABLE;
122 122
123 mac80211_ev_michael_mic_failure(rx->sdata, rx->key->conf.keyidx, 123 mac80211_ev_michael_mic_failure(rx->sdata, rx->key->conf.keyidx,
@@ -221,19 +221,13 @@ ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx)
221 if (!rx->sta || skb->len - hdrlen < 12) 221 if (!rx->sta || skb->len - hdrlen < 12)
222 return RX_DROP_UNUSABLE; 222 return RX_DROP_UNUSABLE;
223 223
224 if (status->flag & RX_FLAG_DECRYPTED) { 224 /*
225 if (status->flag & RX_FLAG_IV_STRIPPED) { 225 * Let TKIP code verify IV, but skip decryption.
226 /* 226 * In the case where hardware checks the IV as well,
227 * Hardware took care of all processing, including 227 * we don't even get here, see ieee80211_rx_h_decrypt()
228 * replay protection, and stripped the ICV/IV so 228 */
229 * we cannot do any checks here. 229 if (status->flag & RX_FLAG_DECRYPTED)
230 */
231 return RX_CONTINUE;
232 }
233
234 /* let TKIP code verify IV, but skip decryption */
235 hwaccel = 1; 230 hwaccel = 1;
236 }
237 231
238 res = ieee80211_tkip_decrypt_data(rx->local->wep_rx_tfm, 232 res = ieee80211_tkip_decrypt_data(rx->local->wep_rx_tfm,
239 key, skb->data + hdrlen, 233 key, skb->data + hdrlen,
@@ -447,10 +441,6 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx)
447 if (!rx->sta || data_len < 0) 441 if (!rx->sta || data_len < 0)
448 return RX_DROP_UNUSABLE; 442 return RX_DROP_UNUSABLE;
449 443
450 if ((status->flag & RX_FLAG_DECRYPTED) &&
451 (status->flag & RX_FLAG_IV_STRIPPED))
452 return RX_CONTINUE;
453
454 ccmp_hdr2pn(pn, skb->data + hdrlen); 444 ccmp_hdr2pn(pn, skb->data + hdrlen);
455 445
456 queue = ieee80211_is_mgmt(hdr->frame_control) ? 446 queue = ieee80211_is_mgmt(hdr->frame_control) ?
@@ -564,10 +554,6 @@ ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx)
564 if (!ieee80211_is_mgmt(hdr->frame_control)) 554 if (!ieee80211_is_mgmt(hdr->frame_control))
565 return RX_CONTINUE; 555 return RX_CONTINUE;
566 556
567 if ((status->flag & RX_FLAG_DECRYPTED) &&
568 (status->flag & RX_FLAG_IV_STRIPPED))
569 return RX_CONTINUE;
570
571 if (skb->len < 24 + sizeof(*mmie)) 557 if (skb->len < 24 + sizeof(*mmie))
572 return RX_DROP_UNUSABLE; 558 return RX_DROP_UNUSABLE;
573 559
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 43288259f4a1..1534f2b44caf 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -525,6 +525,7 @@ config NETFILTER_XT_TARGET_TPROXY
525 depends on NETFILTER_XTABLES 525 depends on NETFILTER_XTABLES
526 depends on NETFILTER_ADVANCED 526 depends on NETFILTER_ADVANCED
527 select NF_DEFRAG_IPV4 527 select NF_DEFRAG_IPV4
528 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
528 help 529 help
529 This option adds a `TPROXY' target, which is somewhat similar to 530 This option adds a `TPROXY' target, which is somewhat similar to
530 REDIRECT. It can only be used in the mangle table and is useful 531 REDIRECT. It can only be used in the mangle table and is useful
@@ -927,6 +928,7 @@ config NETFILTER_XT_MATCH_SOCKET
927 depends on NETFILTER_ADVANCED 928 depends on NETFILTER_ADVANCED
928 depends on !NF_CONNTRACK || NF_CONNTRACK 929 depends on !NF_CONNTRACK || NF_CONNTRACK
929 select NF_DEFRAG_IPV4 930 select NF_DEFRAG_IPV4
931 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
930 help 932 help
931 This option adds a `socket' match, which can be used to match 933 This option adds a `socket' match, which can be used to match
932 packets for which a TCP or UDP socket lookup finds a valid socket. 934 packets for which a TCP or UDP socket lookup finds a valid socket.
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 78b505d33bfb..85dabb86be6f 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -27,7 +27,7 @@
27 27
28static DEFINE_MUTEX(afinfo_mutex); 28static DEFINE_MUTEX(afinfo_mutex);
29 29
30const struct nf_afinfo *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly; 30const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
31EXPORT_SYMBOL(nf_afinfo); 31EXPORT_SYMBOL(nf_afinfo);
32 32
33int nf_register_afinfo(const struct nf_afinfo *afinfo) 33int nf_register_afinfo(const struct nf_afinfo *afinfo)
@@ -105,10 +105,8 @@ EXPORT_SYMBOL(nf_register_hooks);
105 105
106void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) 106void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
107{ 107{
108 unsigned int i; 108 while (n-- > 0)
109 109 nf_unregister_hook(&reg[n]);
110 for (i = 0; i < n; i++)
111 nf_unregister_hook(&reg[i]);
112} 110}
113EXPORT_SYMBOL(nf_unregister_hooks); 111EXPORT_SYMBOL(nf_unregister_hooks);
114 112
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index 46a77d5c3887..a22dac227055 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -3,7 +3,7 @@
3# 3#
4menuconfig IP_VS 4menuconfig IP_VS
5 tristate "IP virtual server support" 5 tristate "IP virtual server support"
6 depends on NET && INET && NETFILTER && NF_CONNTRACK 6 depends on NET && INET && NETFILTER
7 ---help--- 7 ---help---
8 IP Virtual Server support will let you build a high-performance 8 IP Virtual Server support will let you build a high-performance
9 virtual server based on cluster of two or more real servers. This 9 virtual server based on cluster of two or more real servers. This
@@ -235,7 +235,8 @@ comment 'IPVS application helper'
235 235
236config IP_VS_FTP 236config IP_VS_FTP
237 tristate "FTP protocol helper" 237 tristate "FTP protocol helper"
238 depends on IP_VS_PROTO_TCP && NF_NAT 238 depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT
239 select IP_VS_NFCT
239 ---help--- 240 ---help---
240 FTP is a protocol that transfers IP address and/or port number in 241 FTP is a protocol that transfers IP address and/or port number in
241 the payload. In the virtual server via Network Address Translation, 242 the payload. In the virtual server via Network Address Translation,
@@ -247,4 +248,19 @@ config IP_VS_FTP
247 If you want to compile it in kernel, say Y. To compile it as a 248 If you want to compile it in kernel, say Y. To compile it as a
248 module, choose M here. If unsure, say N. 249 module, choose M here. If unsure, say N.
249 250
251config IP_VS_NFCT
252 bool "Netfilter connection tracking"
253 depends on NF_CONNTRACK
254 ---help---
255 The Netfilter connection tracking support allows the IPVS
256 connection state to be exported to the Netfilter framework
257 for filtering purposes.
258
259config IP_VS_PE_SIP
260 tristate "SIP persistence engine"
261 depends on IP_VS_PROTO_UDP
262 depends on NF_CONNTRACK_SIP
263 ---help---
264 Allow persistence based on the SIP Call-ID
265
250endif # IP_VS 266endif # IP_VS
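
The new IP_VS_NFCT bool above only decides whether ip_vs_nfct.o gets built into the ip_vs module (see the Makefile hunk that follows). A rough sketch of how such a bool option typically gates code at compile time; the function name and the printed message are placeholders, not the kernel API:

#include <stdio.h>

/* CONFIG_IP_VS_NFCT would normally come from the generated config headers */
/* #define CONFIG_IP_VS_NFCT 1 */

#ifdef CONFIG_IP_VS_NFCT
static void conntrack_export(void)
{
        printf("exporting IPVS connection state to conntrack\n");
}
#else
static void conntrack_export(void)
{
        /* support compiled out: the call collapses to nothing */
}
#endif

int main(void)
{
        conntrack_export();
        return 0;
}
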
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
index e3baefd7066e..34ee602ddb66 100644
--- a/net/netfilter/ipvs/Makefile
+++ b/net/netfilter/ipvs/Makefile
@@ -9,10 +9,13 @@ ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
9ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o 9ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
10ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o 10ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o
11 11
12ip_vs-extra_objs-y :=
13ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o
14
12ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ 15ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
13 ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ 16 ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
14 ip_vs_est.o ip_vs_proto.o \ 17 ip_vs_est.o ip_vs_proto.o ip_vs_pe.o \
15 $(ip_vs_proto-objs-y) 18 $(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y)
16 19
17 20
18# IPVS core 21# IPVS core
@@ -32,3 +35,6 @@ obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
32 35
33# IPVS application helpers 36# IPVS application helpers
34obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o 37obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
38
39# IPVS connection template retrievers
40obj-$(CONFIG_IP_VS_PE_SIP) += ip_vs_pe_sip.o
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index e76f87f4aca8..a475edee0912 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -103,8 +103,8 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
103 goto out; 103 goto out;
104 104
105 list_add(&inc->a_list, &app->incs_list); 105 list_add(&inc->a_list, &app->incs_list);
106 IP_VS_DBG(9, "%s application %s:%u registered\n", 106 IP_VS_DBG(9, "%s App %s:%u registered\n",
107 pp->name, inc->name, inc->port); 107 pp->name, inc->name, ntohs(inc->port));
108 108
109 return 0; 109 return 0;
110 110
@@ -130,7 +130,7 @@ ip_vs_app_inc_release(struct ip_vs_app *inc)
130 pp->unregister_app(inc); 130 pp->unregister_app(inc);
131 131
132 IP_VS_DBG(9, "%s App %s:%u unregistered\n", 132 IP_VS_DBG(9, "%s App %s:%u unregistered\n",
133 pp->name, inc->name, inc->port); 133 pp->name, inc->name, ntohs(inc->port));
134 134
135 list_del(&inc->a_list); 135 list_del(&inc->a_list);
136 136
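
Both ip_vs_app.c debug messages now pass the port through ntohs() before printing; the port fields are stored in network byte order, so printing the raw value yields a byte-swapped number on little-endian hosts. A small stand-alone illustration:

#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
        unsigned short port = htons(21);        /* stored as on the wire */

        printf("raw value : %u\n", port);        /* 5376 on little-endian */
        printf("ntohs()   : %u\n", ntohs(port)); /* always 21 */
        return 0;
}
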
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index b71c69a2db13..e9adecdc8ca4 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -148,6 +148,42 @@ static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
148 & ip_vs_conn_tab_mask; 148 & ip_vs_conn_tab_mask;
149} 149}
150 150
151static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
152 bool inverse)
153{
154 const union nf_inet_addr *addr;
155 __be16 port;
156
157 if (p->pe_data && p->pe->hashkey_raw)
158 return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) &
159 ip_vs_conn_tab_mask;
160
161 if (likely(!inverse)) {
162 addr = p->caddr;
163 port = p->cport;
164 } else {
165 addr = p->vaddr;
166 port = p->vport;
167 }
168
169 return ip_vs_conn_hashkey(p->af, p->protocol, addr, port);
170}
171
172static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
173{
174 struct ip_vs_conn_param p;
175
176 ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport,
177 NULL, 0, &p);
178
179 if (cp->dest && cp->dest->svc->pe) {
180 p.pe = cp->dest->svc->pe;
181 p.pe_data = cp->pe_data;
182 p.pe_data_len = cp->pe_data_len;
183 }
184
185 return ip_vs_conn_hashkey_param(&p, false);
186}
151 187
152/* 188/*
153 * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. 189 * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
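
ip_vs_conn_hashkey_param() above selects which end of the connection feeds the hash: the client tuple for normal lookups, the virtual-service tuple when inverse is set, and a raw persistence-engine hash when pe_data is attached. A simplified, IPv4-only sketch of that selection; the hash itself is a toy stand-in, not the kernel's jhash-based key:

#include <stdint.h>
#include <stdio.h>

struct conn_param {
        uint8_t  protocol;
        uint32_t caddr, vaddr;  /* client / virtual address */
        uint16_t cport, vport;  /* client / virtual port */
};

static unsigned int tuple_hash(uint8_t proto, uint32_t addr, uint16_t port)
{
        /* toy mixing function standing in for the real hash */
        return (proto * 2654435761u) ^ addr ^ ((unsigned int)port << 16);
}

static unsigned int hashkey_param(const struct conn_param *p, int inverse)
{
        uint32_t addr = inverse ? p->vaddr : p->caddr;
        uint16_t port = inverse ? p->vport : p->cport;

        return tuple_hash(p->protocol, addr, port);
}

int main(void)
{
        struct conn_param p = { 6, 0x0a000001, 0x0a0000fe, 40000, 80 };

        printf("in-lookup bucket : %u\n", hashkey_param(&p, 0));
        printf("out-lookup bucket: %u\n", hashkey_param(&p, 1));
        return 0;
}
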
@@ -162,7 +198,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
162 return 0; 198 return 0;
163 199
164 /* Hash by protocol, client address and port */ 200 /* Hash by protocol, client address and port */
165 hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); 201 hash = ip_vs_conn_hashkey_conn(cp);
166 202
167 ct_write_lock(hash); 203 ct_write_lock(hash);
168 spin_lock(&cp->lock); 204 spin_lock(&cp->lock);
@@ -195,7 +231,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
195 int ret; 231 int ret;
196 232
197 /* unhash it and decrease its reference counter */ 233 /* unhash it and decrease its reference counter */
198 hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); 234 hash = ip_vs_conn_hashkey_conn(cp);
199 235
200 ct_write_lock(hash); 236 ct_write_lock(hash);
201 spin_lock(&cp->lock); 237 spin_lock(&cp->lock);
@@ -218,27 +254,26 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
218/* 254/*
219 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. 255 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
220 * Called for pkts coming from OUTside-to-INside. 256 * Called for pkts coming from OUTside-to-INside.
221 * s_addr, s_port: pkt source address (foreign host) 257 * p->caddr, p->cport: pkt source address (foreign host)
222 * d_addr, d_port: pkt dest address (load balancer) 258 * p->vaddr, p->vport: pkt dest address (load balancer)
223 */ 259 */
224static inline struct ip_vs_conn *__ip_vs_conn_in_get 260static inline struct ip_vs_conn *
225(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, 261__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
226 const union nf_inet_addr *d_addr, __be16 d_port)
227{ 262{
228 unsigned hash; 263 unsigned hash;
229 struct ip_vs_conn *cp; 264 struct ip_vs_conn *cp;
230 265
231 hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); 266 hash = ip_vs_conn_hashkey_param(p, false);
232 267
233 ct_read_lock(hash); 268 ct_read_lock(hash);
234 269
235 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 270 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
236 if (cp->af == af && 271 if (cp->af == p->af &&
237 ip_vs_addr_equal(af, s_addr, &cp->caddr) && 272 ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
238 ip_vs_addr_equal(af, d_addr, &cp->vaddr) && 273 ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
239 s_port == cp->cport && d_port == cp->vport && 274 p->cport == cp->cport && p->vport == cp->vport &&
240 ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && 275 ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
241 protocol == cp->protocol) { 276 p->protocol == cp->protocol) {
242 /* HIT */ 277 /* HIT */
243 atomic_inc(&cp->refcnt); 278 atomic_inc(&cp->refcnt);
244 ct_read_unlock(hash); 279 ct_read_unlock(hash);
@@ -251,99 +286,111 @@ static inline struct ip_vs_conn *__ip_vs_conn_in_get
251 return NULL; 286 return NULL;
252} 287}
253 288
254struct ip_vs_conn *ip_vs_conn_in_get 289struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
255(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
256 const union nf_inet_addr *d_addr, __be16 d_port)
257{ 290{
258 struct ip_vs_conn *cp; 291 struct ip_vs_conn *cp;
259 292
260 cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port); 293 cp = __ip_vs_conn_in_get(p);
261 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) 294 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) {
262 cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr, 295 struct ip_vs_conn_param cport_zero_p = *p;
263 d_port); 296 cport_zero_p.cport = 0;
297 cp = __ip_vs_conn_in_get(&cport_zero_p);
298 }
264 299
265 IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", 300 IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
266 ip_vs_proto_name(protocol), 301 ip_vs_proto_name(p->protocol),
267 IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), 302 IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
268 IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), 303 IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
269 cp ? "hit" : "not hit"); 304 cp ? "hit" : "not hit");
270 305
271 return cp; 306 return cp;
272} 307}
273 308
309static int
310ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
311 const struct ip_vs_iphdr *iph,
312 unsigned int proto_off, int inverse,
313 struct ip_vs_conn_param *p)
314{
315 __be16 _ports[2], *pptr;
316
317 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
318 if (pptr == NULL)
319 return 1;
320
321 if (likely(!inverse))
322 ip_vs_conn_fill_param(af, iph->protocol, &iph->saddr, pptr[0],
323 &iph->daddr, pptr[1], p);
324 else
325 ip_vs_conn_fill_param(af, iph->protocol, &iph->daddr, pptr[1],
326 &iph->saddr, pptr[0], p);
327 return 0;
328}
329
274struct ip_vs_conn * 330struct ip_vs_conn *
275ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, 331ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
276 struct ip_vs_protocol *pp, 332 struct ip_vs_protocol *pp,
277 const struct ip_vs_iphdr *iph, 333 const struct ip_vs_iphdr *iph,
278 unsigned int proto_off, int inverse) 334 unsigned int proto_off, int inverse)
279{ 335{
280 __be16 _ports[2], *pptr; 336 struct ip_vs_conn_param p;
281 337
282 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); 338 if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
283 if (pptr == NULL)
284 return NULL; 339 return NULL;
285 340
286 if (likely(!inverse)) 341 return ip_vs_conn_in_get(&p);
287 return ip_vs_conn_in_get(af, iph->protocol,
288 &iph->saddr, pptr[0],
289 &iph->daddr, pptr[1]);
290 else
291 return ip_vs_conn_in_get(af, iph->protocol,
292 &iph->daddr, pptr[1],
293 &iph->saddr, pptr[0]);
294} 342}
295EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); 343EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
296 344
297/* Get reference to connection template */ 345/* Get reference to connection template */
298struct ip_vs_conn *ip_vs_ct_in_get 346struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
299(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
300 const union nf_inet_addr *d_addr, __be16 d_port)
301{ 347{
302 unsigned hash; 348 unsigned hash;
303 struct ip_vs_conn *cp; 349 struct ip_vs_conn *cp;
304 350
305 hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); 351 hash = ip_vs_conn_hashkey_param(p, false);
306 352
307 ct_read_lock(hash); 353 ct_read_lock(hash);
308 354
309 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 355 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
310 if (cp->af == af && 356 if (p->pe_data && p->pe->ct_match) {
311 ip_vs_addr_equal(af, s_addr, &cp->caddr) && 357 if (p->pe->ct_match(p, cp))
358 goto out;
359 continue;
360 }
361
362 if (cp->af == p->af &&
363 ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
312 /* protocol should only be IPPROTO_IP if 364 /* protocol should only be IPPROTO_IP if
313 * d_addr is a fwmark */ 365 * p->vaddr is a fwmark */
314 ip_vs_addr_equal(protocol == IPPROTO_IP ? AF_UNSPEC : af, 366 ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
315 d_addr, &cp->vaddr) && 367 p->af, p->vaddr, &cp->vaddr) &&
316 s_port == cp->cport && d_port == cp->vport && 368 p->cport == cp->cport && p->vport == cp->vport &&
317 cp->flags & IP_VS_CONN_F_TEMPLATE && 369 cp->flags & IP_VS_CONN_F_TEMPLATE &&
318 protocol == cp->protocol) { 370 p->protocol == cp->protocol)
319 /* HIT */
320 atomic_inc(&cp->refcnt);
321 goto out; 371 goto out;
322 }
323 } 372 }
324 cp = NULL; 373 cp = NULL;
325 374
326 out: 375 out:
376 if (cp)
377 atomic_inc(&cp->refcnt);
327 ct_read_unlock(hash); 378 ct_read_unlock(hash);
328 379
329 IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", 380 IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
330 ip_vs_proto_name(protocol), 381 ip_vs_proto_name(p->protocol),
331 IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), 382 IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
332 IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), 383 IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
333 cp ? "hit" : "not hit"); 384 cp ? "hit" : "not hit");
334 385
335 return cp; 386 return cp;
336} 387}
337 388
338/* 389/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
339 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. 390 * Called for pkts coming from inside-to-OUTside.
340 * Called for pkts coming from inside-to-OUTside. 391 * p->caddr, p->cport: pkt source address (inside host)
341 * s_addr, s_port: pkt source address (inside host) 392 * p->vaddr, p->vport: pkt dest address (foreign host) */
342 * d_addr, d_port: pkt dest address (foreign host) 393struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
343 */
344struct ip_vs_conn *ip_vs_conn_out_get
345(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
346 const union nf_inet_addr *d_addr, __be16 d_port)
347{ 394{
348 unsigned hash; 395 unsigned hash;
349 struct ip_vs_conn *cp, *ret=NULL; 396 struct ip_vs_conn *cp, *ret=NULL;
@@ -351,16 +398,16 @@ struct ip_vs_conn *ip_vs_conn_out_get
351 /* 398 /*
352 * Check for "full" addressed entries 399 * Check for "full" addressed entries
353 */ 400 */
354 hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port); 401 hash = ip_vs_conn_hashkey_param(p, true);
355 402
356 ct_read_lock(hash); 403 ct_read_lock(hash);
357 404
358 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 405 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
359 if (cp->af == af && 406 if (cp->af == p->af &&
360 ip_vs_addr_equal(af, d_addr, &cp->caddr) && 407 ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
361 ip_vs_addr_equal(af, s_addr, &cp->daddr) && 408 ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
362 d_port == cp->cport && s_port == cp->dport && 409 p->vport == cp->cport && p->cport == cp->dport &&
363 protocol == cp->protocol) { 410 p->protocol == cp->protocol) {
364 /* HIT */ 411 /* HIT */
365 atomic_inc(&cp->refcnt); 412 atomic_inc(&cp->refcnt);
366 ret = cp; 413 ret = cp;
@@ -371,9 +418,9 @@ struct ip_vs_conn *ip_vs_conn_out_get
371 ct_read_unlock(hash); 418 ct_read_unlock(hash);
372 419
373 IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", 420 IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
374 ip_vs_proto_name(protocol), 421 ip_vs_proto_name(p->protocol),
375 IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), 422 IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
376 IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), 423 IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
377 ret ? "hit" : "not hit"); 424 ret ? "hit" : "not hit");
378 425
379 return ret; 426 return ret;
@@ -385,20 +432,12 @@ ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
385 const struct ip_vs_iphdr *iph, 432 const struct ip_vs_iphdr *iph,
386 unsigned int proto_off, int inverse) 433 unsigned int proto_off, int inverse)
387{ 434{
388 __be16 _ports[2], *pptr; 435 struct ip_vs_conn_param p;
389 436
390 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); 437 if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
391 if (pptr == NULL)
392 return NULL; 438 return NULL;
393 439
394 if (likely(!inverse)) 440 return ip_vs_conn_out_get(&p);
395 return ip_vs_conn_out_get(af, iph->protocol,
396 &iph->saddr, pptr[0],
397 &iph->daddr, pptr[1]);
398 else
399 return ip_vs_conn_out_get(af, iph->protocol,
400 &iph->daddr, pptr[1],
401 &iph->saddr, pptr[0]);
402} 441}
403EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); 442EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
404 443
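
With this refactor, ip_vs_conn_in_get_proto() and ip_vs_conn_out_get_proto() build an ip_vs_conn_param once through the shared ip_vs_conn_fill_param_proto() helper instead of repeating six address/port arguments, and when inverse is set the two ends are swapped before the lookup. A compact sketch of that parameter-object pattern with made-up types and names:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct lookup_param {
        uint8_t  protocol;
        uint32_t caddr, vaddr;
        uint16_t cport, vport;
};

static void fill_param(uint8_t proto, uint32_t src, uint16_t sport,
                       uint32_t dst, uint16_t dport, int inverse,
                       struct lookup_param *p)
{
        memset(p, 0, sizeof(*p));
        p->protocol = proto;
        if (!inverse) {                 /* packet seen client -> virtual service */
                p->caddr = src;  p->cport = sport;
                p->vaddr = dst;  p->vport = dport;
        } else {                        /* reply direction: swap the two ends */
                p->caddr = dst;  p->cport = dport;
                p->vaddr = src;  p->vport = sport;
        }
}

static int lookup(const struct lookup_param *p)
{
        /* stand-in for the hash table walk over ip_vs_conn_tab */
        return p->protocol == 6 && p->vport == 80;
}

int main(void)
{
        struct lookup_param p;

        fill_param(6, 0x0a000001, 40000, 0x0a0000fe, 80, 0, &p);
        printf("hit: %d\n", lookup(&p));
        return 0;
}
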
@@ -505,6 +544,8 @@ static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
505static inline void 544static inline void
506ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) 545ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
507{ 546{
547 unsigned int conn_flags;
548
508 /* if dest is NULL, then return directly */ 549 /* if dest is NULL, then return directly */
509 if (!dest) 550 if (!dest)
510 return; 551 return;
@@ -512,16 +553,20 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
512 /* Increase the refcnt counter of the dest */ 553 /* Increase the refcnt counter of the dest */
513 atomic_inc(&dest->refcnt); 554 atomic_inc(&dest->refcnt);
514 555
556 conn_flags = atomic_read(&dest->conn_flags);
557 if (cp->protocol != IPPROTO_UDP)
558 conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
515 /* Bind with the destination and its corresponding transmitter */ 559 /* Bind with the destination and its corresponding transmitter */
516 if ((cp->flags & IP_VS_CONN_F_SYNC) && 560 if (cp->flags & IP_VS_CONN_F_SYNC) {
517 (!(cp->flags & IP_VS_CONN_F_TEMPLATE)))
518 /* if the connection is not template and is created 561 /* if the connection is not template and is created
519 * by sync, preserve the activity flag. 562 * by sync, preserve the activity flag.
520 */ 563 */
521 cp->flags |= atomic_read(&dest->conn_flags) & 564 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE))
522 (~IP_VS_CONN_F_INACTIVE); 565 conn_flags &= ~IP_VS_CONN_F_INACTIVE;
523 else 566 /* connections inherit forwarding method from dest */
524 cp->flags |= atomic_read(&dest->conn_flags); 567 cp->flags &= ~IP_VS_CONN_F_FWD_MASK;
568 }
569 cp->flags |= conn_flags;
525 cp->dest = dest; 570 cp->dest = dest;
526 571
527 IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " 572 IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
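
The ip_vs_bind_dest() hunk above reads the destination's conn_flags once, strips IP_VS_CONN_F_ONE_PACKET for anything that is not UDP, and for connections created by the sync daemon keeps the activity bit while forcing them to inherit the destination's forwarding method. A rough bit-masking sketch with invented flag values, not the kernel constants:

#include <stdio.h>

#define F_FWD_MASK      0x0007
#define F_INACTIVE      0x0100
#define F_ONE_PACKET    0x2000
#define F_SYNC          0x4000

#define PROTO_UDP       17

static unsigned int bind_flags(unsigned int cp_flags, unsigned int dest_flags,
                               int protocol, int is_template)
{
        unsigned int conn_flags = dest_flags;

        if (protocol != PROTO_UDP)
                conn_flags &= ~F_ONE_PACKET;    /* one-packet mode is UDP only */

        if (cp_flags & F_SYNC) {
                if (!is_template)
                        conn_flags &= ~F_INACTIVE;  /* preserve activity state */
                cp_flags &= ~F_FWD_MASK;        /* inherit fwd method from dest */
        }
        return cp_flags | conn_flags;
}

int main(void)
{
        /* synced TCP connection against a dest configured for one-packet UDP */
        printf("%#x\n", bind_flags(F_SYNC, 0x0002 | F_INACTIVE | F_ONE_PACKET,
                                   6, 0));
        return 0;
}
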
@@ -717,6 +762,10 @@ static void ip_vs_conn_expire(unsigned long data)
717 if (cp->control) 762 if (cp->control)
718 ip_vs_control_del(cp); 763 ip_vs_control_del(cp);
719 764
765 if (cp->flags & IP_VS_CONN_F_NFCT)
766 ip_vs_conn_drop_conntrack(cp);
767
768 kfree(cp->pe_data);
720 if (unlikely(cp->app != NULL)) 769 if (unlikely(cp->app != NULL))
721 ip_vs_unbind_app(cp); 770 ip_vs_unbind_app(cp);
722 ip_vs_unbind_dest(cp); 771 ip_vs_unbind_dest(cp);
@@ -751,13 +800,12 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
751 * Create a new connection entry and hash it into the ip_vs_conn_tab 800 * Create a new connection entry and hash it into the ip_vs_conn_tab
752 */ 801 */
753struct ip_vs_conn * 802struct ip_vs_conn *
754ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, 803ip_vs_conn_new(const struct ip_vs_conn_param *p,
755 const union nf_inet_addr *vaddr, __be16 vport,
756 const union nf_inet_addr *daddr, __be16 dport, unsigned flags, 804 const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
757 struct ip_vs_dest *dest) 805 struct ip_vs_dest *dest)
758{ 806{
759 struct ip_vs_conn *cp; 807 struct ip_vs_conn *cp;
760 struct ip_vs_protocol *pp = ip_vs_proto_get(proto); 808 struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol);
761 809
762 cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); 810 cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
763 if (cp == NULL) { 811 if (cp == NULL) {
@@ -767,17 +815,21 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
767 815
768 INIT_LIST_HEAD(&cp->c_list); 816 INIT_LIST_HEAD(&cp->c_list);
769 setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); 817 setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
770 cp->af = af; 818 cp->af = p->af;
771 cp->protocol = proto; 819 cp->protocol = p->protocol;
772 ip_vs_addr_copy(af, &cp->caddr, caddr); 820 ip_vs_addr_copy(p->af, &cp->caddr, p->caddr);
773 cp->cport = cport; 821 cp->cport = p->cport;
774 ip_vs_addr_copy(af, &cp->vaddr, vaddr); 822 ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr);
775 cp->vport = vport; 823 cp->vport = p->vport;
776 /* proto should only be IPPROTO_IP if d_addr is a fwmark */ 824 /* proto should only be IPPROTO_IP if d_addr is a fwmark */
777 ip_vs_addr_copy(proto == IPPROTO_IP ? AF_UNSPEC : af, 825 ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
778 &cp->daddr, daddr); 826 &cp->daddr, daddr);
779 cp->dport = dport; 827 cp->dport = dport;
780 cp->flags = flags; 828 cp->flags = flags;
829 if (flags & IP_VS_CONN_F_TEMPLATE && p->pe_data) {
830 cp->pe_data = p->pe_data;
831 cp->pe_data_len = p->pe_data_len;
832 }
781 spin_lock_init(&cp->lock); 833 spin_lock_init(&cp->lock);
782 834
783 /* 835 /*
@@ -803,7 +855,7 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
803 855
804 /* Bind its packet transmitter */ 856 /* Bind its packet transmitter */
805#ifdef CONFIG_IP_VS_IPV6 857#ifdef CONFIG_IP_VS_IPV6
806 if (af == AF_INET6) 858 if (p->af == AF_INET6)
807 ip_vs_bind_xmit_v6(cp); 859 ip_vs_bind_xmit_v6(cp);
808 else 860 else
809#endif 861#endif
@@ -812,13 +864,22 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
812 if (unlikely(pp && atomic_read(&pp->appcnt))) 864 if (unlikely(pp && atomic_read(&pp->appcnt)))
813 ip_vs_bind_app(cp, pp); 865 ip_vs_bind_app(cp, pp);
814 866
867 /*
868 * Allow conntrack to be preserved. By default, conntrack
869 * is created and destroyed for every packet.
870 * Sometimes keeping conntrack can be useful for
871 * IP_VS_CONN_F_ONE_PACKET too.
872 */
873
874 if (ip_vs_conntrack_enabled())
875 cp->flags |= IP_VS_CONN_F_NFCT;
876
815 /* Hash it in the ip_vs_conn_tab finally */ 877 /* Hash it in the ip_vs_conn_tab finally */
816 ip_vs_conn_hash(cp); 878 ip_vs_conn_hash(cp);
817 879
818 return cp; 880 return cp;
819} 881}
820 882
821
822/* 883/*
823 * /proc/net/ip_vs_conn entries 884 * /proc/net/ip_vs_conn entries
824 */ 885 */
@@ -834,7 +895,7 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
834 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 895 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
835 if (pos-- == 0) { 896 if (pos-- == 0) {
836 seq->private = &ip_vs_conn_tab[idx]; 897 seq->private = &ip_vs_conn_tab[idx];
837 return cp; 898 return cp;
838 } 899 }
839 } 900 }
840 ct_read_unlock_bh(idx); 901 ct_read_unlock_bh(idx);
@@ -891,30 +952,45 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
891 952
892 if (v == SEQ_START_TOKEN) 953 if (v == SEQ_START_TOKEN)
893 seq_puts(seq, 954 seq_puts(seq,
894 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n"); 955 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n");
895 else { 956 else {
896 const struct ip_vs_conn *cp = v; 957 const struct ip_vs_conn *cp = v;
958 char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
959 size_t len = 0;
960
961 if (cp->dest && cp->pe_data &&
962 cp->dest->svc->pe->show_pe_data) {
963 pe_data[0] = ' ';
964 len = strlen(cp->dest->svc->pe->name);
965 memcpy(pe_data + 1, cp->dest->svc->pe->name, len);
966 pe_data[len + 1] = ' ';
967 len += 2;
968 len += cp->dest->svc->pe->show_pe_data(cp,
969 pe_data + len);
970 }
971 pe_data[len] = '\0';
897 972
898#ifdef CONFIG_IP_VS_IPV6 973#ifdef CONFIG_IP_VS_IPV6
899 if (cp->af == AF_INET6) 974 if (cp->af == AF_INET6)
900 seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %7lu\n", 975 seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
976 "%pI6 %04X %-11s %7lu%s\n",
901 ip_vs_proto_name(cp->protocol), 977 ip_vs_proto_name(cp->protocol),
902 &cp->caddr.in6, ntohs(cp->cport), 978 &cp->caddr.in6, ntohs(cp->cport),
903 &cp->vaddr.in6, ntohs(cp->vport), 979 &cp->vaddr.in6, ntohs(cp->vport),
904 &cp->daddr.in6, ntohs(cp->dport), 980 &cp->daddr.in6, ntohs(cp->dport),
905 ip_vs_state_name(cp->protocol, cp->state), 981 ip_vs_state_name(cp->protocol, cp->state),
906 (cp->timer.expires-jiffies)/HZ); 982 (cp->timer.expires-jiffies)/HZ, pe_data);
907 else 983 else
908#endif 984#endif
909 seq_printf(seq, 985 seq_printf(seq,
910 "%-3s %08X %04X %08X %04X" 986 "%-3s %08X %04X %08X %04X"
911 " %08X %04X %-11s %7lu\n", 987 " %08X %04X %-11s %7lu%s\n",
912 ip_vs_proto_name(cp->protocol), 988 ip_vs_proto_name(cp->protocol),
913 ntohl(cp->caddr.ip), ntohs(cp->cport), 989 ntohl(cp->caddr.ip), ntohs(cp->cport),
914 ntohl(cp->vaddr.ip), ntohs(cp->vport), 990 ntohl(cp->vaddr.ip), ntohs(cp->vport),
915 ntohl(cp->daddr.ip), ntohs(cp->dport), 991 ntohl(cp->daddr.ip), ntohs(cp->dport),
916 ip_vs_state_name(cp->protocol, cp->state), 992 ip_vs_state_name(cp->protocol, cp->state),
917 (cp->timer.expires-jiffies)/HZ); 993 (cp->timer.expires-jiffies)/HZ, pe_data);
918 } 994 }
919 return 0; 995 return 0;
920} 996}
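
The seq_show changes above append two optional columns, the persistence-engine name and its data, to every /proc/net/ip_vs_conn line, assembling them in a small stack buffer first. A user-space sketch of building that suffix; the buffer sizes and the SIP Call-ID value are illustrative:

#include <stdio.h>
#include <string.h>

#define PENAME_MAX      16
#define PEDATA_MAX      32

static size_t show_pe_data(char *buf)
{
        /* stands in for pe->show_pe_data(): append the persisted Call-ID */
        const char *call_id = "a84b4c76e66710";
        size_t n = strlen(call_id);

        memcpy(buf, call_id, n);
        return n;
}

int main(void)
{
        char pe_data[PENAME_MAX + PEDATA_MAX + 3];
        const char *pe_name = "sip";
        size_t len;

        pe_data[0] = ' ';
        len = strlen(pe_name);
        memcpy(pe_data + 1, pe_name, len);
        pe_data[len + 1] = ' ';
        len += 2;
        len += show_pe_data(pe_data + len);
        pe_data[len] = '\0';

        printf("TCP 0A000001 9C40 0A0000FE 0050 0A000102 1F90 ESTABLISHED %7lu%s\n",
               57UL, pe_data);
        return 0;
}
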
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 4f8ddba48011..b4e51e9c5a04 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -40,6 +40,7 @@
40#include <net/udp.h> 40#include <net/udp.h>
41#include <net/icmp.h> /* for icmp_send */ 41#include <net/icmp.h> /* for icmp_send */
42#include <net/route.h> 42#include <net/route.h>
43#include <net/ip6_checksum.h>
43 44
44#include <linux/netfilter.h> 45#include <linux/netfilter.h>
45#include <linux/netfilter_ipv4.h> 46#include <linux/netfilter_ipv4.h>
@@ -47,6 +48,7 @@
47#ifdef CONFIG_IP_VS_IPV6 48#ifdef CONFIG_IP_VS_IPV6
48#include <net/ipv6.h> 49#include <net/ipv6.h>
49#include <linux/netfilter_ipv6.h> 50#include <linux/netfilter_ipv6.h>
51#include <net/ip6_route.h>
50#endif 52#endif
51 53
52#include <net/ip_vs.h> 54#include <net/ip_vs.h>
@@ -175,6 +177,18 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
175 return pp->state_transition(cp, direction, skb, pp); 177 return pp->state_transition(cp, direction, skb, pp);
176} 178}
177 179
180static inline void
181ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
182 struct sk_buff *skb, int protocol,
183 const union nf_inet_addr *caddr, __be16 cport,
184 const union nf_inet_addr *vaddr, __be16 vport,
185 struct ip_vs_conn_param *p)
186{
187 ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
188 p->pe = svc->pe;
189 if (p->pe && p->pe->fill_param)
190 p->pe->fill_param(p, skb);
191}
178 192
179/* 193/*
180 * IPVS persistent scheduling function 194 * IPVS persistent scheduling function
@@ -185,15 +199,16 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
185 */ 199 */
186static struct ip_vs_conn * 200static struct ip_vs_conn *
187ip_vs_sched_persist(struct ip_vs_service *svc, 201ip_vs_sched_persist(struct ip_vs_service *svc,
188 const struct sk_buff *skb, 202 struct sk_buff *skb,
189 __be16 ports[2]) 203 __be16 ports[2])
190{ 204{
191 struct ip_vs_conn *cp = NULL; 205 struct ip_vs_conn *cp = NULL;
192 struct ip_vs_iphdr iph; 206 struct ip_vs_iphdr iph;
193 struct ip_vs_dest *dest; 207 struct ip_vs_dest *dest;
194 struct ip_vs_conn *ct; 208 struct ip_vs_conn *ct;
195 __be16 dport; /* destination port to forward */ 209 __be16 dport = 0; /* destination port to forward */
196 __be16 flags; 210 unsigned int flags;
211 struct ip_vs_conn_param param;
197 union nf_inet_addr snet; /* source network of the client, 212 union nf_inet_addr snet; /* source network of the client,
198 after masking */ 213 after masking */
199 214
@@ -226,120 +241,75 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
226 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport> 241 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
227 * is created for other persistent services. 242 * is created for other persistent services.
228 */ 243 */
229 if (ports[1] == svc->port) { 244 {
230 /* Check if a template already exists */ 245 int protocol = iph.protocol;
231 if (svc->port != FTPPORT) 246 const union nf_inet_addr *vaddr = &iph.daddr;
232 ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, 247 const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
233 &iph.daddr, ports[1]); 248 __be16 vport = 0;
234 else 249
235 ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, 250 if (ports[1] == svc->port) {
236 &iph.daddr, 0); 251 /* non-FTP template:
237 252 * <protocol, caddr, 0, vaddr, vport, daddr, dport>
238 if (!ct || !ip_vs_check_template(ct)) { 253 * FTP template:
239 /* 254 * <protocol, caddr, 0, vaddr, 0, daddr, 0>
240 * No template found or the dest of the connection
241 * template is not available.
242 */
243 dest = svc->scheduler->schedule(svc, skb);
244 if (dest == NULL) {
245 IP_VS_DBG(1, "p-schedule: no dest found.\n");
246 return NULL;
247 }
248
249 /*
250 * Create a template like <protocol,caddr,0,
251 * vaddr,vport,daddr,dport> for non-ftp service,
252 * and <protocol,caddr,0,vaddr,0,daddr,0>
253 * for ftp service.
254 */ 255 */
255 if (svc->port != FTPPORT) 256 if (svc->port != FTPPORT)
256 ct = ip_vs_conn_new(svc->af, iph.protocol, 257 vport = ports[1];
257 &snet, 0,
258 &iph.daddr,
259 ports[1],
260 &dest->addr, dest->port,
261 IP_VS_CONN_F_TEMPLATE,
262 dest);
263 else
264 ct = ip_vs_conn_new(svc->af, iph.protocol,
265 &snet, 0,
266 &iph.daddr, 0,
267 &dest->addr, 0,
268 IP_VS_CONN_F_TEMPLATE,
269 dest);
270 if (ct == NULL)
271 return NULL;
272
273 ct->timeout = svc->timeout;
274 } else { 258 } else {
275 /* set destination with the found template */ 259 /* Note: persistent fwmark-based services and
276 dest = ct->dest; 260 * persistent port zero service are handled here.
277 } 261 * fwmark template:
278 dport = dest->port; 262 * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
279 } else { 263 * port zero template:
280 /* 264 * <protocol,caddr,0,vaddr,0,daddr,0>
281 * Note: persistent fwmark-based services and persistent
282 * port zero service are handled here.
283 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
284 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
285 */
286 if (svc->fwmark) {
287 union nf_inet_addr fwmark = {
288 .ip = htonl(svc->fwmark)
289 };
290
291 ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
292 &fwmark, 0);
293 } else
294 ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
295 &iph.daddr, 0);
296
297 if (!ct || !ip_vs_check_template(ct)) {
298 /*
299 * If it is not persistent port zero, return NULL,
300 * otherwise create a connection template.
301 */ 265 */
302 if (svc->port) 266 if (svc->fwmark) {
303 return NULL; 267 protocol = IPPROTO_IP;
304 268 vaddr = &fwmark;
305 dest = svc->scheduler->schedule(svc, skb);
306 if (dest == NULL) {
307 IP_VS_DBG(1, "p-schedule: no dest found.\n");
308 return NULL;
309 } 269 }
270 }
271 ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
272 vaddr, vport, &param);
273 }
310 274
311 /* 275 /* Check if a template already exists */
312 * Create a template according to the service 276 ct = ip_vs_ct_in_get(&param);
313 */ 277 if (!ct || !ip_vs_check_template(ct)) {
314 if (svc->fwmark) { 278 /* No template found or the dest of the connection
315 union nf_inet_addr fwmark = { 279 * template is not available.
316 .ip = htonl(svc->fwmark) 280 */
317 }; 281 dest = svc->scheduler->schedule(svc, skb);
318 282 if (!dest) {
319 ct = ip_vs_conn_new(svc->af, IPPROTO_IP, 283 IP_VS_DBG(1, "p-schedule: no dest found.\n");
320 &snet, 0, 284 kfree(param.pe_data);
321 &fwmark, 0, 285 return NULL;
322 &dest->addr, 0,
323 IP_VS_CONN_F_TEMPLATE,
324 dest);
325 } else
326 ct = ip_vs_conn_new(svc->af, iph.protocol,
327 &snet, 0,
328 &iph.daddr, 0,
329 &dest->addr, 0,
330 IP_VS_CONN_F_TEMPLATE,
331 dest);
332 if (ct == NULL)
333 return NULL;
334
335 ct->timeout = svc->timeout;
336 } else {
337 /* set destination with the found template */
338 dest = ct->dest;
339 } 286 }
340 dport = ports[1]; 287
288 if (ports[1] == svc->port && svc->port != FTPPORT)
289 dport = dest->port;
290
291 /* Create a template
292 * This adds param.pe_data to the template,
293 * and thus param.pe_data will be destroyed
294 * when the template expires */
295 ct = ip_vs_conn_new(&param, &dest->addr, dport,
296 IP_VS_CONN_F_TEMPLATE, dest);
297 if (ct == NULL) {
298 kfree(param.pe_data);
299 return NULL;
300 }
301
302 ct->timeout = svc->timeout;
303 } else {
304 /* set destination with the found template */
305 dest = ct->dest;
306 kfree(param.pe_data);
341 } 307 }
342 308
309 dport = ports[1];
310 if (dport == svc->port && dest->port)
311 dport = dest->port;
312
343 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET 313 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
344 && iph.protocol == IPPROTO_UDP)? 314 && iph.protocol == IPPROTO_UDP)?
345 IP_VS_CONN_F_ONE_PACKET : 0; 315 IP_VS_CONN_F_ONE_PACKET : 0;
@@ -347,12 +317,9 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
347 /* 317 /*
348 * Create a new connection according to the template 318 * Create a new connection according to the template
349 */ 319 */
350 cp = ip_vs_conn_new(svc->af, iph.protocol, 320 ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0],
351 &iph.saddr, ports[0], 321 &iph.daddr, ports[1], &param);
352 &iph.daddr, ports[1], 322 cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest);
353 &dest->addr, dport,
354 flags,
355 dest);
356 if (cp == NULL) { 323 if (cp == NULL) {
357 ip_vs_conn_put(ct); 324 ip_vs_conn_put(ct);
358 return NULL; 325 return NULL;
@@ -376,23 +343,53 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
376 * Protocols supported: TCP, UDP 343 * Protocols supported: TCP, UDP
377 */ 344 */
378struct ip_vs_conn * 345struct ip_vs_conn *
379ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) 346ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
347 struct ip_vs_protocol *pp, int *ignored)
380{ 348{
381 struct ip_vs_conn *cp = NULL; 349 struct ip_vs_conn *cp = NULL;
382 struct ip_vs_iphdr iph; 350 struct ip_vs_iphdr iph;
383 struct ip_vs_dest *dest; 351 struct ip_vs_dest *dest;
384 __be16 _ports[2], *pptr, flags; 352 __be16 _ports[2], *pptr;
353 unsigned int flags;
385 354
355 *ignored = 1;
386 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); 356 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
387 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); 357 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
388 if (pptr == NULL) 358 if (pptr == NULL)
389 return NULL; 359 return NULL;
390 360
391 /* 361 /*
362 * FTPDATA needs this check when using local real server.
363 * Never schedule Active FTPDATA connections from real server.
364 * For LVS-NAT they must be already created. For other methods
365 * with persistence the connection is created on SYN+ACK.
366 */
367 if (pptr[0] == FTPDATA) {
368 IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
369 "Not scheduling FTPDATA");
370 return NULL;
371 }
372
373 /*
374 * Do not schedule replies from local real server. It is risky
375 * for fwmark services but mostly for persistent services.
376 */
377 if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
378 (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) &&
379 (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
380 IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
381 "Not scheduling reply for existing connection");
382 __ip_vs_conn_put(cp);
383 return NULL;
384 }
385
386 /*
392 * Persistent service 387 * Persistent service
393 */ 388 */
394 if (svc->flags & IP_VS_SVC_F_PERSISTENT) 389 if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
390 *ignored = 0;
395 return ip_vs_sched_persist(svc, skb, pptr); 391 return ip_vs_sched_persist(svc, skb, pptr);
392 }
396 393
397 /* 394 /*
398 * Non-persistent service 395 * Non-persistent service
@@ -405,6 +402,8 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
405 return NULL; 402 return NULL;
406 } 403 }
407 404
405 *ignored = 0;
406
408 dest = svc->scheduler->schedule(svc, skb); 407 dest = svc->scheduler->schedule(svc, skb);
409 if (dest == NULL) { 408 if (dest == NULL) {
410 IP_VS_DBG(1, "Schedule: no dest found.\n"); 409 IP_VS_DBG(1, "Schedule: no dest found.\n");
@@ -418,14 +417,16 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
418 /* 417 /*
419 * Create a connection entry. 418 * Create a connection entry.
420 */ 419 */
421 cp = ip_vs_conn_new(svc->af, iph.protocol, 420 {
422 &iph.saddr, pptr[0], 421 struct ip_vs_conn_param p;
423 &iph.daddr, pptr[1], 422 ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr,
424 &dest->addr, dest->port ? dest->port : pptr[1], 423 pptr[0], &iph.daddr, pptr[1], &p);
425 flags, 424 cp = ip_vs_conn_new(&p, &dest->addr,
426 dest); 425 dest->port ? dest->port : pptr[1],
427 if (cp == NULL) 426 flags, dest);
428 return NULL; 427 if (!cp)
428 return NULL;
429 }
429 430
430 IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " 431 IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
431 "d:%s:%u conn->flags:%X conn->refcnt:%d\n", 432 "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
@@ -472,23 +473,26 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
472 if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) { 473 if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
473 int ret, cs; 474 int ret, cs;
474 struct ip_vs_conn *cp; 475 struct ip_vs_conn *cp;
475 __u16 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && 476 unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
476 iph.protocol == IPPROTO_UDP)? 477 iph.protocol == IPPROTO_UDP)?
477 IP_VS_CONN_F_ONE_PACKET : 0; 478 IP_VS_CONN_F_ONE_PACKET : 0;
478 union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; 479 union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
479 480
480 ip_vs_service_put(svc); 481 ip_vs_service_put(svc);
481 482
482 /* create a new connection entry */ 483 /* create a new connection entry */
483 IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); 484 IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
484 cp = ip_vs_conn_new(svc->af, iph.protocol, 485 {
485 &iph.saddr, pptr[0], 486 struct ip_vs_conn_param p;
486 &iph.daddr, pptr[1], 487 ip_vs_conn_fill_param(svc->af, iph.protocol,
487 &daddr, 0, 488 &iph.saddr, pptr[0],
488 IP_VS_CONN_F_BYPASS | flags, 489 &iph.daddr, pptr[1], &p);
489 NULL); 490 cp = ip_vs_conn_new(&p, &daddr, 0,
490 if (cp == NULL) 491 IP_VS_CONN_F_BYPASS | flags,
491 return NF_DROP; 492 NULL);
493 if (!cp)
494 return NF_DROP;
495 }
492 496
493 /* statistics */ 497 /* statistics */
494 ip_vs_in_stats(cp, skb); 498 ip_vs_in_stats(cp, skb);
@@ -526,9 +530,14 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
526 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ 530 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
527 */ 531 */
528#ifdef CONFIG_IP_VS_IPV6 532#ifdef CONFIG_IP_VS_IPV6
529 if (svc->af == AF_INET6) 533 if (svc->af == AF_INET6) {
534 if (!skb->dev) {
535 struct net *net = dev_net(skb_dst(skb)->dev);
536
537 skb->dev = net->loopback_dev;
538 }
530 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); 539 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
531 else 540 } else
532#endif 541#endif
533 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 542 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
534 543
@@ -540,6 +549,15 @@ __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
540 return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); 549 return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
541} 550}
542 551
552static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
553{
554 if (NF_INET_LOCAL_IN == hooknum)
555 return IP_DEFRAG_VS_IN;
556 if (NF_INET_FORWARD == hooknum)
557 return IP_DEFRAG_VS_FWD;
558 return IP_DEFRAG_VS_OUT;
559}
560
543static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) 561static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
544{ 562{
545 int err = ip_defrag(skb, user); 563 int err = ip_defrag(skb, user);
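
ip_vs_defrag_user() above centralizes the mapping from the netfilter hook a packet was seen on to the defragmentation "user" identity, replacing the open-coded ternaries in the ICMP handlers further down. A stand-alone sketch with placeholder enum values:

#include <stdio.h>

enum hook { HOOK_LOCAL_IN, HOOK_FORWARD, HOOK_LOCAL_OUT };
enum defrag_user { DEFRAG_VS_IN, DEFRAG_VS_FWD, DEFRAG_VS_OUT };

static enum defrag_user defrag_user(enum hook hooknum)
{
        if (hooknum == HOOK_LOCAL_IN)
                return DEFRAG_VS_IN;
        if (hooknum == HOOK_FORWARD)
                return DEFRAG_VS_FWD;
        return DEFRAG_VS_OUT;           /* LOCAL_OUT and anything else */
}

int main(void)
{
        printf("FORWARD maps to defrag user %d\n", defrag_user(HOOK_FORWARD));
        return 0;
}
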
@@ -600,10 +618,10 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
600 skb->ip_summed = CHECKSUM_UNNECESSARY; 618 skb->ip_summed = CHECKSUM_UNNECESSARY;
601 619
602 if (inout) 620 if (inout)
603 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, 621 IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
604 "Forwarding altered outgoing ICMP"); 622 "Forwarding altered outgoing ICMP");
605 else 623 else
606 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, 624 IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
607 "Forwarding altered incoming ICMP"); 625 "Forwarding altered incoming ICMP");
608} 626}
609 627
@@ -637,17 +655,21 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
637 } 655 }
638 656
639 /* And finally the ICMP checksum */ 657 /* And finally the ICMP checksum */
640 icmph->icmp6_cksum = 0; 658 icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr,
641 /* TODO IPv6: is this correct for ICMPv6? */ 659 skb->len - icmp_offset,
642 ip_vs_checksum_complete(skb, icmp_offset); 660 IPPROTO_ICMPV6, 0);
643 skb->ip_summed = CHECKSUM_UNNECESSARY; 661 skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
662 skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
663 skb->ip_summed = CHECKSUM_PARTIAL;
644 664
645 if (inout) 665 if (inout)
646 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, 666 IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
647 "Forwarding altered outgoing ICMPv6"); 667 (void *)ciph - (void *)iph,
668 "Forwarding altered outgoing ICMPv6");
648 else 669 else
649 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, 670 IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
650 "Forwarding altered incoming ICMPv6"); 671 (void *)ciph - (void *)iph,
672 "Forwarding altered incoming ICMPv6");
651} 673}
652#endif 674#endif
653 675
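
The ip_vs_nat_icmp_v6() hunk replaces the plain payload checksum with csum_ipv6_magic(), because ICMPv6, unlike ICMPv4, covers an IPv6 pseudo-header (source, destination, upper-layer length and next-header 58) in its checksum. A toy one's-complement calculation showing the pseudo-header being folded in first; this is user-space C with made-up addresses, not the kernel helpers:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t sum16(const uint8_t *p, size_t len, uint32_t sum)
{
        size_t i;

        for (i = 0; i + 1 < len; i += 2)
                sum += (p[i] << 8) | p[i + 1];
        if (len & 1)
                sum += p[len - 1] << 8;
        return sum;
}

int main(void)
{
        uint8_t saddr[16] = { 0xfe, 0x80, [15] = 1 };
        uint8_t daddr[16] = { 0xfe, 0x80, [15] = 2 };
        uint8_t icmp[8]   = { 128, 0, 0, 0, 0, 1, 0, 1 };  /* echo req, cksum 0 */
        uint8_t pseudo[40];
        uint32_t s;

        /* pseudo-header: src, dst, 32-bit payload length, 3 zero bytes, NH=58 */
        memcpy(pseudo, saddr, 16);
        memcpy(pseudo + 16, daddr, 16);
        pseudo[32] = 0; pseudo[33] = 0; pseudo[34] = 0; pseudo[35] = sizeof(icmp);
        pseudo[36] = 0; pseudo[37] = 0; pseudo[38] = 0; pseudo[39] = 58;

        s = sum16(pseudo, sizeof(pseudo), 0);   /* fold pseudo-header first */
        s = sum16(icmp, sizeof(icmp), s);       /* then the ICMPv6 message */
        while (s >> 16)
                s = (s & 0xffff) + (s >> 16);
        printf("ICMPv6 checksum: 0x%04x\n", (uint16_t)~s);
        return 0;
}
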
@@ -688,10 +710,25 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
688#endif 710#endif
689 ip_vs_nat_icmp(skb, pp, cp, 1); 711 ip_vs_nat_icmp(skb, pp, cp, 1);
690 712
713#ifdef CONFIG_IP_VS_IPV6
714 if (af == AF_INET6) {
715 if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
716 goto out;
717 } else
718#endif
719 if ((sysctl_ip_vs_snat_reroute ||
720 skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
721 ip_route_me_harder(skb, RTN_LOCAL) != 0)
722 goto out;
723
691 /* do the statistics and put it back */ 724 /* do the statistics and put it back */
692 ip_vs_out_stats(cp, skb); 725 ip_vs_out_stats(cp, skb);
693 726
694 skb->ipvs_property = 1; 727 skb->ipvs_property = 1;
728 if (!(cp->flags & IP_VS_CONN_F_NFCT))
729 ip_vs_notrack(skb);
730 else
731 ip_vs_update_conntrack(skb, cp, 0);
695 verdict = NF_ACCEPT; 732 verdict = NF_ACCEPT;
696 733
697out: 734out:
@@ -705,7 +742,8 @@ out:
705 * Find any that might be relevant, check against existing connections. 742 * Find any that might be relevant, check against existing connections.
706 * Currently handles error types - unreachable, quench, ttl exceeded. 743 * Currently handles error types - unreachable, quench, ttl exceeded.
707 */ 744 */
708static int ip_vs_out_icmp(struct sk_buff *skb, int *related) 745static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
746 unsigned int hooknum)
709{ 747{
710 struct iphdr *iph; 748 struct iphdr *iph;
711 struct icmphdr _icmph, *ic; 749 struct icmphdr _icmph, *ic;
@@ -720,7 +758,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
720 758
721 /* reassemble IP fragments */ 759 /* reassemble IP fragments */
722 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { 760 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
723 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) 761 if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
724 return NF_STOLEN; 762 return NF_STOLEN;
725 } 763 }
726 764
@@ -763,7 +801,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
763 pp->dont_defrag)) 801 pp->dont_defrag))
764 return NF_ACCEPT; 802 return NF_ACCEPT;
765 803
766 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for"); 804 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
805 "Checking outgoing ICMP for");
767 806
768 offset += cih->ihl * 4; 807 offset += cih->ihl * 4;
769 808
@@ -779,7 +818,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
779} 818}
780 819
781#ifdef CONFIG_IP_VS_IPV6 820#ifdef CONFIG_IP_VS_IPV6
782static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) 821static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
822 unsigned int hooknum)
783{ 823{
784 struct ipv6hdr *iph; 824 struct ipv6hdr *iph;
785 struct icmp6hdr _icmph, *ic; 825 struct icmp6hdr _icmph, *ic;
@@ -795,7 +835,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
795 835
796 /* reassemble IP fragments */ 836 /* reassemble IP fragments */
797 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { 837 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
798 if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT)) 838 if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
799 return NF_STOLEN; 839 return NF_STOLEN;
800 } 840 }
801 841
@@ -838,7 +878,8 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
838 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) 878 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
839 return NF_ACCEPT; 879 return NF_ACCEPT;
840 880
841 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for"); 881 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
882 "Checking outgoing ICMPv6 for");
842 883
843 offset += sizeof(struct ipv6hdr); 884 offset += sizeof(struct ipv6hdr);
844 885
@@ -886,7 +927,7 @@ static unsigned int
886handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, 927handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
887 struct ip_vs_conn *cp, int ihl) 928 struct ip_vs_conn *cp, int ihl)
888{ 929{
889 IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); 930 IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
890 931
891 if (!skb_make_writable(skb, ihl)) 932 if (!skb_make_writable(skb, ihl))
892 goto drop; 933 goto drop;
@@ -905,6 +946,15 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
905 ip_send_check(ip_hdr(skb)); 946 ip_send_check(ip_hdr(skb));
906 } 947 }
907 948
949 /*
950 * nf_iterate does not expect change in the skb->dst->dev.
951 * It looks like it is not fatal to enable this code for hooks
952 * where our handlers are at the end of the chain list and
953 * when all next handlers use skb->dst->dev and not outdev.
954 * It will definitely route properly the inout NAT traffic
955 * when multiple paths are used.
956 */
957
908 /* For policy routing, packets originating from this 958 /* For policy routing, packets originating from this
909 * machine itself may be routed differently to packets 959 * machine itself may be routed differently to packets
910 * passing through. We want this packet to be routed as 960 * passing through. We want this packet to be routed as
@@ -913,20 +963,25 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
913 */ 963 */
914#ifdef CONFIG_IP_VS_IPV6 964#ifdef CONFIG_IP_VS_IPV6
915 if (af == AF_INET6) { 965 if (af == AF_INET6) {
916 if (ip6_route_me_harder(skb) != 0) 966 if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
917 goto drop; 967 goto drop;
918 } else 968 } else
919#endif 969#endif
920 if (ip_route_me_harder(skb, RTN_LOCAL) != 0) 970 if ((sysctl_ip_vs_snat_reroute ||
971 skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
972 ip_route_me_harder(skb, RTN_LOCAL) != 0)
921 goto drop; 973 goto drop;
922 974
923 IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); 975 IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
924 976
925 ip_vs_out_stats(cp, skb); 977 ip_vs_out_stats(cp, skb);
926 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); 978 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
927 ip_vs_conn_put(cp);
928
929 skb->ipvs_property = 1; 979 skb->ipvs_property = 1;
980 if (!(cp->flags & IP_VS_CONN_F_NFCT))
981 ip_vs_notrack(skb);
982 else
983 ip_vs_update_conntrack(skb, cp, 0);
984 ip_vs_conn_put(cp);
930 985
931 LeaveFunction(11); 986 LeaveFunction(11);
932 return NF_ACCEPT; 987 return NF_ACCEPT;
@@ -934,35 +989,46 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
934drop: 989drop:
935 ip_vs_conn_put(cp); 990 ip_vs_conn_put(cp);
936 kfree_skb(skb); 991 kfree_skb(skb);
992 LeaveFunction(11);
937 return NF_STOLEN; 993 return NF_STOLEN;
938} 994}
939 995
940/* 996/*
941 * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
942 * Check if outgoing packet belongs to the established ip_vs_conn. 997 * Check if outgoing packet belongs to the established ip_vs_conn.
943 */ 998 */
944static unsigned int 999static unsigned int
945ip_vs_out(unsigned int hooknum, struct sk_buff *skb, 1000ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
946 const struct net_device *in, const struct net_device *out,
947 int (*okfn)(struct sk_buff *))
948{ 1001{
949 struct ip_vs_iphdr iph; 1002 struct ip_vs_iphdr iph;
950 struct ip_vs_protocol *pp; 1003 struct ip_vs_protocol *pp;
951 struct ip_vs_conn *cp; 1004 struct ip_vs_conn *cp;
952 int af;
953 1005
954 EnterFunction(11); 1006 EnterFunction(11);
955 1007
956 af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; 1008 /* Already marked as IPVS request or reply? */
957
958 if (skb->ipvs_property) 1009 if (skb->ipvs_property)
959 return NF_ACCEPT; 1010 return NF_ACCEPT;
960 1011
1012 /* Bad... Do not break raw sockets */
1013 if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
1014 af == AF_INET)) {
1015 struct sock *sk = skb->sk;
1016 struct inet_sock *inet = inet_sk(skb->sk);
1017
1018 if (inet && sk->sk_family == PF_INET && inet->nodefrag)
1019 return NF_ACCEPT;
1020 }
1021
1022 if (unlikely(!skb_dst(skb)))
1023 return NF_ACCEPT;
1024
961 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); 1025 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
962#ifdef CONFIG_IP_VS_IPV6 1026#ifdef CONFIG_IP_VS_IPV6
963 if (af == AF_INET6) { 1027 if (af == AF_INET6) {
964 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { 1028 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
965 int related, verdict = ip_vs_out_icmp_v6(skb, &related); 1029 int related;
1030 int verdict = ip_vs_out_icmp_v6(skb, &related,
1031 hooknum);
966 1032
967 if (related) 1033 if (related)
968 return verdict; 1034 return verdict;
@@ -971,7 +1037,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
971 } else 1037 } else
972#endif 1038#endif
973 if (unlikely(iph.protocol == IPPROTO_ICMP)) { 1039 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
974 int related, verdict = ip_vs_out_icmp(skb, &related); 1040 int related;
1041 int verdict = ip_vs_out_icmp(skb, &related, hooknum);
975 1042
976 if (related) 1043 if (related)
977 return verdict; 1044 return verdict;
@@ -985,19 +1052,19 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
985 /* reassemble IP fragments */ 1052 /* reassemble IP fragments */
986#ifdef CONFIG_IP_VS_IPV6 1053#ifdef CONFIG_IP_VS_IPV6
987 if (af == AF_INET6) { 1054 if (af == AF_INET6) {
988 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { 1055 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
989 int related, verdict = ip_vs_out_icmp_v6(skb, &related); 1056 if (ip_vs_gather_frags_v6(skb,
990 1057 ip_vs_defrag_user(hooknum)))
991 if (related) 1058 return NF_STOLEN;
992 return verdict;
993
994 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
995 } 1059 }
1060
1061 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
996 } else 1062 } else
997#endif 1063#endif
998 if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && 1064 if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
999 !pp->dont_defrag)) { 1065 !pp->dont_defrag)) {
1000 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) 1066 if (ip_vs_gather_frags(skb,
1067 ip_vs_defrag_user(hooknum)))
1001 return NF_STOLEN; 1068 return NF_STOLEN;
1002 1069
1003 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); 1070 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
@@ -1008,55 +1075,123 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
1008 */ 1075 */
1009 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); 1076 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1010 1077
1011 if (unlikely(!cp)) { 1078 if (likely(cp))
1012 if (sysctl_ip_vs_nat_icmp_send && 1079 return handle_response(af, skb, pp, cp, iph.len);
1013 (pp->protocol == IPPROTO_TCP || 1080 if (sysctl_ip_vs_nat_icmp_send &&
1014 pp->protocol == IPPROTO_UDP || 1081 (pp->protocol == IPPROTO_TCP ||
1015 pp->protocol == IPPROTO_SCTP)) { 1082 pp->protocol == IPPROTO_UDP ||
1016 __be16 _ports[2], *pptr; 1083 pp->protocol == IPPROTO_SCTP)) {
1017 1084 __be16 _ports[2], *pptr;
1018 pptr = skb_header_pointer(skb, iph.len, 1085
1019 sizeof(_ports), _ports); 1086 pptr = skb_header_pointer(skb, iph.len,
1020 if (pptr == NULL) 1087 sizeof(_ports), _ports);
1021 return NF_ACCEPT; /* Not for me */ 1088 if (pptr == NULL)
1022 if (ip_vs_lookup_real_service(af, iph.protocol, 1089 return NF_ACCEPT; /* Not for me */
1023 &iph.saddr, 1090 if (ip_vs_lookup_real_service(af, iph.protocol,
1024 pptr[0])) { 1091 &iph.saddr,
1025 /* 1092 pptr[0])) {
1026 * Notify the real server: there is no 1093 /*
1027 * existing entry if it is not RST 1094 * Notify the real server: there is no
1028 * packet or not TCP packet. 1095 * existing entry if it is not RST
1029 */ 1096 * packet or not TCP packet.
1030 if ((iph.protocol != IPPROTO_TCP && 1097 */
1031 iph.protocol != IPPROTO_SCTP) 1098 if ((iph.protocol != IPPROTO_TCP &&
1032 || ((iph.protocol == IPPROTO_TCP 1099 iph.protocol != IPPROTO_SCTP)
1033 && !is_tcp_reset(skb, iph.len)) 1100 || ((iph.protocol == IPPROTO_TCP
1034 || (iph.protocol == IPPROTO_SCTP 1101 && !is_tcp_reset(skb, iph.len))
1035 && !is_sctp_abort(skb, 1102 || (iph.protocol == IPPROTO_SCTP
1036 iph.len)))) { 1103 && !is_sctp_abort(skb,
1104 iph.len)))) {
1037#ifdef CONFIG_IP_VS_IPV6 1105#ifdef CONFIG_IP_VS_IPV6
1038 if (af == AF_INET6) 1106 if (af == AF_INET6) {
1039 icmpv6_send(skb, 1107 struct net *net =
1040 ICMPV6_DEST_UNREACH, 1108 dev_net(skb_dst(skb)->dev);
1041 ICMPV6_PORT_UNREACH, 1109
1042 0); 1110 if (!skb->dev)
1043 else 1111 skb->dev = net->loopback_dev;
1112 icmpv6_send(skb,
1113 ICMPV6_DEST_UNREACH,
1114 ICMPV6_PORT_UNREACH,
1115 0);
1116 } else
1044#endif 1117#endif
1045 icmp_send(skb, 1118 icmp_send(skb,
1046 ICMP_DEST_UNREACH, 1119 ICMP_DEST_UNREACH,
1047 ICMP_PORT_UNREACH, 0); 1120 ICMP_PORT_UNREACH, 0);
1048 return NF_DROP; 1121 return NF_DROP;
1049 }
1050 } 1122 }
1051 } 1123 }
1052 IP_VS_DBG_PKT(12, pp, skb, 0,
1053 "packet continues traversal as normal");
1054 return NF_ACCEPT;
1055 } 1124 }
1125 IP_VS_DBG_PKT(12, af, pp, skb, 0,
1126 "ip_vs_out: packet continues traversal as normal");
1127 return NF_ACCEPT;
1128}
1129
1130/*
1131 * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1132 * used only for VS/NAT.
1133 * Check if packet is reply for established ip_vs_conn.
1134 */
1135static unsigned int
1136ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
1137 const struct net_device *in, const struct net_device *out,
1138 int (*okfn)(struct sk_buff *))
1139{
1140 return ip_vs_out(hooknum, skb, AF_INET);
1141}
1142
1143/*
1144 * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1145 * Check if packet is reply for established ip_vs_conn.
1146 */
1147static unsigned int
1148ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
1149 const struct net_device *in, const struct net_device *out,
1150 int (*okfn)(struct sk_buff *))
1151{
1152 unsigned int verdict;
1153
1154 /* Disable BH in LOCAL_OUT until all places are fixed */
1155 local_bh_disable();
1156 verdict = ip_vs_out(hooknum, skb, AF_INET);
1157 local_bh_enable();
1158 return verdict;
1159}
1160
1161#ifdef CONFIG_IP_VS_IPV6
1162
1163/*
1164 * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1165 * used only for VS/NAT.
1166 * Check if packet is reply for established ip_vs_conn.
1167 */
1168static unsigned int
1169ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
1170 const struct net_device *in, const struct net_device *out,
1171 int (*okfn)(struct sk_buff *))
1172{
1173 return ip_vs_out(hooknum, skb, AF_INET6);
1174}
1056 1175
1057 return handle_response(af, skb, pp, cp, iph.len); 1176/*
1177 * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1178 * Check if packet is reply for established ip_vs_conn.
1179 */
1180static unsigned int
1181ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
1182 const struct net_device *in, const struct net_device *out,
1183 int (*okfn)(struct sk_buff *))
1184{
1185 unsigned int verdict;
1186
1187 /* Disable BH in LOCAL_OUT until all places are fixed */
1188 local_bh_disable();
1189 verdict = ip_vs_out(hooknum, skb, AF_INET6);
1190 local_bh_enable();
1191 return verdict;
1058} 1192}
1059 1193
1194#endif
1060 1195
1061/* 1196/*
1062 * Handle ICMP messages in the outside-to-inside direction (incoming). 1197 * Handle ICMP messages in the outside-to-inside direction (incoming).
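
The large hunk above splits ip_vs_out() into a family-agnostic core plus thin ip_vs_reply4/6 and ip_vs_local_reply4/6 hook entry points; the LOCAL_OUT variants additionally wrap the call in local_bh_disable()/local_bh_enable(). A minimal sketch of that thin-wrapper layout, with simplified prototypes and placeholder values:

#include <stdio.h>

enum { AF4 = 4, AF6 = 6 };
enum { VERDICT_ACCEPT = 1 };

static unsigned int common_out(unsigned int hooknum, int af)
{
        printf("hook %u handled for IPv%d\n", hooknum, af);
        return VERDICT_ACCEPT;
}

/* per-family entry points just forward to the shared handler */
static unsigned int reply4(unsigned int hooknum) { return common_out(hooknum, AF4); }
static unsigned int reply6(unsigned int hooknum) { return common_out(hooknum, AF6); }

int main(void)
{
        reply4(2);      /* e.g. the FORWARD hook */
        reply6(1);      /* e.g. LOCAL_IN */
        return 0;
}
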
@@ -1080,8 +1215,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1080 1215
1081 /* reassemble IP fragments */ 1216 /* reassemble IP fragments */
1082 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { 1217 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
1083 if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ? 1218 if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
1084 IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
1085 return NF_STOLEN; 1219 return NF_STOLEN;
1086 } 1220 }
1087 1221
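Both defragmentation call sites in this hunk now funnel the invoking hook through ip_vs_defrag_user() instead of open-coding the LOCAL_IN/FORWARD ternary. The helper's body is not part of this hunk; a plausible sketch, assuming it simply maps the netfilter hook to the matching IP_DEFRAG_VS_* user so fragments gathered in different hooks do not mix, is:

	/* Sketch only: map the invoking netfilter hook to a defrag "user" id. */
	static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
	{
		if (hooknum == NF_INET_LOCAL_IN)
			return IP_DEFRAG_VS_IN;
		if (hooknum == NF_INET_FORWARD)
			return IP_DEFRAG_VS_FWD;
		return IP_DEFRAG_VS_OUT;	/* LOCAL_OUT, new with these hooks */
	}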
@@ -1124,7 +1258,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1124 pp->dont_defrag)) 1258 pp->dont_defrag))
1125 return NF_ACCEPT; 1259 return NF_ACCEPT;
1126 1260
1127 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for"); 1261 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
1262 "Checking incoming ICMP for");
1128 1263
1129 offset += cih->ihl * 4; 1264 offset += cih->ihl * 4;
1130 1265
@@ -1158,7 +1293,14 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1158 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) 1293 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
1159 offset += 2 * sizeof(__u16); 1294 offset += 2 * sizeof(__u16);
1160 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset); 1295 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
1161 /* do not touch skb anymore */ 1296 /* LOCALNODE from FORWARD hook is not supported */
1297 if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
1298 skb_rtable(skb)->rt_flags & RTCF_LOCAL) {
1299 IP_VS_DBG(1, "%s(): "
1300 "local delivery to %pI4 but in FORWARD\n",
1301 __func__, &skb_rtable(skb)->rt_dst);
1302 verdict = NF_DROP;
1303 }
1162 1304
1163 out: 1305 out:
1164 __ip_vs_conn_put(cp); 1306 __ip_vs_conn_put(cp);
@@ -1179,14 +1321,13 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1179 struct ip_vs_protocol *pp; 1321 struct ip_vs_protocol *pp;
1180 unsigned int offset, verdict; 1322 unsigned int offset, verdict;
1181 union nf_inet_addr snet; 1323 union nf_inet_addr snet;
1324 struct rt6_info *rt;
1182 1325
1183 *related = 1; 1326 *related = 1;
1184 1327
1185 /* reassemble IP fragments */ 1328 /* reassemble IP fragments */
1186 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { 1329 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1187 if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ? 1330 if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
1188 IP_DEFRAG_VS_IN :
1189 IP_DEFRAG_VS_FWD))
1190 return NF_STOLEN; 1331 return NF_STOLEN;
1191 } 1332 }
1192 1333
@@ -1229,7 +1370,8 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1229 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) 1370 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
1230 return NF_ACCEPT; 1371 return NF_ACCEPT;
1231 1372
1232 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for"); 1373 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
1374 "Checking incoming ICMPv6 for");
1233 1375
1234 offset += sizeof(struct ipv6hdr); 1376 offset += sizeof(struct ipv6hdr);
1235 1377
@@ -1257,7 +1399,15 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1257 IPPROTO_SCTP == cih->nexthdr) 1399 IPPROTO_SCTP == cih->nexthdr)
1258 offset += 2 * sizeof(__u16); 1400 offset += 2 * sizeof(__u16);
1259 verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset); 1401 verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
1260 /* do not touch skb anymore */ 1402 /* LOCALNODE from FORWARD hook is not supported */
1403 if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
1404 (rt = (struct rt6_info *) skb_dst(skb)) &&
1405 rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK) {
1406 IP_VS_DBG(1, "%s(): "
1407 "local delivery to %pI6 but in FORWARD\n",
1408 __func__, &rt->rt6i_dst);
1409 verdict = NF_DROP;
1410 }
1261 1411
1262 __ip_vs_conn_put(cp); 1412 __ip_vs_conn_put(cp);
1263 1413
@@ -1271,35 +1421,49 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1271 * and send it on its way... 1421 * and send it on its way...
1272 */ 1422 */
1273static unsigned int 1423static unsigned int
1274ip_vs_in(unsigned int hooknum, struct sk_buff *skb, 1424ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
1275 const struct net_device *in, const struct net_device *out,
1276 int (*okfn)(struct sk_buff *))
1277{ 1425{
1278 struct ip_vs_iphdr iph; 1426 struct ip_vs_iphdr iph;
1279 struct ip_vs_protocol *pp; 1427 struct ip_vs_protocol *pp;
1280 struct ip_vs_conn *cp; 1428 struct ip_vs_conn *cp;
1281 int ret, restart, af, pkts; 1429 int ret, restart, pkts;
1282 1430
1283	af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
1284
1285	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1286
1287	/*
1288	 *	Big tappo: only PACKET_HOST, including loopback for local client
1289	 *	Don't handle local packets on IPv6 for now
1290	 */
1291	if (unlikely(skb->pkt_type != PACKET_HOST)) {
1292		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
1293			      skb->pkt_type,
1294			      iph.protocol,
1295			      IP_VS_DBG_ADDR(af, &iph.daddr));
1296		return NF_ACCEPT;
1297	}

1431	/* Already marked as IPVS request or reply? */
1432	if (skb->ipvs_property)
1433		return NF_ACCEPT;
1434
1435	/*
1436	 *	Big tappo:
1437	 *	- remote client: only PACKET_HOST
1438	 *	- route: used for struct net when skb->dev is unset
1439	 */
1440	if (unlikely((skb->pkt_type != PACKET_HOST &&
1441		      hooknum != NF_INET_LOCAL_OUT) ||
1442		      !skb_dst(skb))) {
1443		ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1444		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
1445			      " ignored in hook %u\n",
1446			      skb->pkt_type, iph.protocol,
1447			      IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
1448		return NF_ACCEPT;
1449	}
1450	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1451
1452	/* Bad... Do not break raw sockets */
1453	if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
1454		     af == AF_INET)) {
1455		struct sock *sk = skb->sk;
1456		struct inet_sock *inet = inet_sk(skb->sk);
1457
1458		if (inet && sk->sk_family == PF_INET && inet->nodefrag)
1459			return NF_ACCEPT;
1460	}
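(The interleaved old/new columns of this region are shown as separate blocks below for readability.)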
1298 1461
1299#ifdef CONFIG_IP_VS_IPV6 1462#ifdef CONFIG_IP_VS_IPV6
1300 if (af == AF_INET6) { 1463 if (af == AF_INET6) {
1301 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { 1464 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1302 int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum); 1465 int related;
1466 int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
1303 1467
1304 if (related) 1468 if (related)
1305 return verdict; 1469 return verdict;
@@ -1308,7 +1472,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
1308 } else 1472 } else
1309#endif 1473#endif
1310 if (unlikely(iph.protocol == IPPROTO_ICMP)) { 1474 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1311 int related, verdict = ip_vs_in_icmp(skb, &related, hooknum); 1475 int related;
1476 int verdict = ip_vs_in_icmp(skb, &related, hooknum);
1312 1477
1313 if (related) 1478 if (related)
1314 return verdict; 1479 return verdict;
@@ -1328,23 +1493,18 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
1328 if (unlikely(!cp)) { 1493 if (unlikely(!cp)) {
1329 int v; 1494 int v;
1330 1495
1331 /* For local client packets, it could be a response */
1332 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1333 if (cp)
1334 return handle_response(af, skb, pp, cp, iph.len);
1335
1336 if (!pp->conn_schedule(af, skb, pp, &v, &cp)) 1496 if (!pp->conn_schedule(af, skb, pp, &v, &cp))
1337 return v; 1497 return v;
1338 } 1498 }
1339 1499
1340 if (unlikely(!cp)) { 1500 if (unlikely(!cp)) {
1341 /* sorry, all this trouble for a no-hit :) */ 1501 /* sorry, all this trouble for a no-hit :) */
1342 IP_VS_DBG_PKT(12, pp, skb, 0, 1502 IP_VS_DBG_PKT(12, af, pp, skb, 0,
1343 "packet continues traversal as normal"); 1503 "ip_vs_in: packet continues traversal as normal");
1344 return NF_ACCEPT; 1504 return NF_ACCEPT;
1345 } 1505 }
1346 1506
1347 IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet"); 1507 IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
1348 1508
1349 /* Check the server status */ 1509 /* Check the server status */
1350 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { 1510 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
@@ -1380,8 +1540,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
1380 if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) && 1540 if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1381 cp->protocol == IPPROTO_SCTP) { 1541 cp->protocol == IPPROTO_SCTP) {
1382 if ((cp->state == IP_VS_SCTP_S_ESTABLISHED && 1542 if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
1383 (atomic_read(&cp->in_pkts) % 1543 (pkts % sysctl_ip_vs_sync_threshold[1]
1384 sysctl_ip_vs_sync_threshold[1]
1385 == sysctl_ip_vs_sync_threshold[0])) || 1544 == sysctl_ip_vs_sync_threshold[0])) ||
1386 (cp->old_state != cp->state && 1545 (cp->old_state != cp->state &&
1387 ((cp->state == IP_VS_SCTP_S_CLOSED) || 1546 ((cp->state == IP_VS_SCTP_S_CLOSED) ||
@@ -1392,7 +1551,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
1392 } 1551 }
1393 } 1552 }
1394 1553
1395 if (af == AF_INET && 1554 /* Keep this block last: TCP and others with pp->num_states <= 1 */
1555 else if (af == AF_INET &&
1396 (ip_vs_sync_state & IP_VS_STATE_MASTER) && 1556 (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1397 (((cp->protocol != IPPROTO_TCP || 1557 (((cp->protocol != IPPROTO_TCP ||
1398 cp->state == IP_VS_TCP_S_ESTABLISHED) && 1558 cp->state == IP_VS_TCP_S_ESTABLISHED) &&
@@ -1411,6 +1571,72 @@ out:
1411 return ret; 1571 return ret;
1412} 1572}
1413 1573
1574/*
1575 * AF_INET handler in NF_INET_LOCAL_IN chain
1576 * Schedule and forward packets from remote clients
1577 */
1578static unsigned int
1579ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
1580 const struct net_device *in,
1581 const struct net_device *out,
1582 int (*okfn)(struct sk_buff *))
1583{
1584 return ip_vs_in(hooknum, skb, AF_INET);
1585}
1586
1587/*
1588 * AF_INET handler in NF_INET_LOCAL_OUT chain
1589 * Schedule and forward packets from local clients
1590 */
1591static unsigned int
1592ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
1593 const struct net_device *in, const struct net_device *out,
1594 int (*okfn)(struct sk_buff *))
1595{
1596 unsigned int verdict;
1597
1598 /* Disable BH in LOCAL_OUT until all places are fixed */
1599 local_bh_disable();
1600 verdict = ip_vs_in(hooknum, skb, AF_INET);
1601 local_bh_enable();
1602 return verdict;
1603}
1604
1605#ifdef CONFIG_IP_VS_IPV6
1606
1607/*
1608 * AF_INET6 handler in NF_INET_LOCAL_IN chain
1609 * Schedule and forward packets from remote clients
1610 */
1611static unsigned int
1612ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
1613 const struct net_device *in,
1614 const struct net_device *out,
1615 int (*okfn)(struct sk_buff *))
1616{
1617 return ip_vs_in(hooknum, skb, AF_INET6);
1618}
1619
1620/*
1621 * AF_INET6 handler in NF_INET_LOCAL_OUT chain
1622 * Schedule and forward packets from local clients
1623 */
1624static unsigned int
1625ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
1626 const struct net_device *in, const struct net_device *out,
1627 int (*okfn)(struct sk_buff *))
1628{
1629 unsigned int verdict;
1630
1631 /* Disable BH in LOCAL_OUT until all places are fixed */
1632 local_bh_disable();
1633 verdict = ip_vs_in(hooknum, skb, AF_INET6);
1634 local_bh_enable();
1635 return verdict;
1636}
1637
1638#endif
1639
1414 1640
1415/* 1641/*
1416 * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP 1642 * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
@@ -1451,23 +1677,39 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
1451 1677
1452 1678
1453static struct nf_hook_ops ip_vs_ops[] __read_mostly = { 1679static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1680 /* After packet filtering, change source only for VS/NAT */
1681 {
1682 .hook = ip_vs_reply4,
1683 .owner = THIS_MODULE,
1684 .pf = PF_INET,
1685 .hooknum = NF_INET_LOCAL_IN,
1686 .priority = 99,
1687 },
1454 /* After packet filtering, forward packet through VS/DR, VS/TUN, 1688 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1455 * or VS/NAT(change destination), so that filtering rules can be 1689 * or VS/NAT(change destination), so that filtering rules can be
1456 * applied to IPVS. */ 1690 * applied to IPVS. */
1457 { 1691 {
1458 .hook = ip_vs_in, 1692 .hook = ip_vs_remote_request4,
1459 .owner = THIS_MODULE, 1693 .owner = THIS_MODULE,
1460 .pf = PF_INET, 1694 .pf = PF_INET,
1461 .hooknum = NF_INET_LOCAL_IN, 1695 .hooknum = NF_INET_LOCAL_IN,
1462 .priority = 100, 1696 .priority = 101,
1463 }, 1697 },
1464 /* After packet filtering, change source only for VS/NAT */ 1698 /* Before ip_vs_in, change source only for VS/NAT */
1465 { 1699 {
1466 .hook = ip_vs_out, 1700 .hook = ip_vs_local_reply4,
1467 .owner = THIS_MODULE, 1701 .owner = THIS_MODULE,
1468 .pf = PF_INET, 1702 .pf = PF_INET,
1469 .hooknum = NF_INET_FORWARD, 1703 .hooknum = NF_INET_LOCAL_OUT,
1470 .priority = 100, 1704 .priority = -99,
1705 },
1706 /* After mangle, schedule and forward local requests */
1707 {
1708 .hook = ip_vs_local_request4,
1709 .owner = THIS_MODULE,
1710 .pf = PF_INET,
1711 .hooknum = NF_INET_LOCAL_OUT,
1712 .priority = -98,
1471 }, 1713 },
1472 /* After packet filtering (but before ip_vs_out_icmp), catch icmp 1714 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1473 * destined for 0.0.0.0/0, which is for incoming IPVS connections */ 1715 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
@@ -1475,27 +1717,51 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1475 .hook = ip_vs_forward_icmp, 1717 .hook = ip_vs_forward_icmp,
1476 .owner = THIS_MODULE, 1718 .owner = THIS_MODULE,
1477 .pf = PF_INET, 1719 .pf = PF_INET,
1478 .hooknum = NF_INET_FORWARD, 1720 .hooknum = NF_INET_FORWARD,
1479 .priority = 99, 1721 .priority = 99,
1722 },
1723 /* After packet filtering, change source only for VS/NAT */
1724 {
1725 .hook = ip_vs_reply4,
1726 .owner = THIS_MODULE,
1727 .pf = PF_INET,
1728 .hooknum = NF_INET_FORWARD,
1729 .priority = 100,
1480 }, 1730 },
1481#ifdef CONFIG_IP_VS_IPV6 1731#ifdef CONFIG_IP_VS_IPV6
1732 /* After packet filtering, change source only for VS/NAT */
1733 {
1734 .hook = ip_vs_reply6,
1735 .owner = THIS_MODULE,
1736 .pf = PF_INET6,
1737 .hooknum = NF_INET_LOCAL_IN,
1738 .priority = 99,
1739 },
1482 /* After packet filtering, forward packet through VS/DR, VS/TUN, 1740 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1483 * or VS/NAT(change destination), so that filtering rules can be 1741 * or VS/NAT(change destination), so that filtering rules can be
1484 * applied to IPVS. */ 1742 * applied to IPVS. */
1485 { 1743 {
1486 .hook = ip_vs_in, 1744 .hook = ip_vs_remote_request6,
1487 .owner = THIS_MODULE, 1745 .owner = THIS_MODULE,
1488 .pf = PF_INET6, 1746 .pf = PF_INET6,
1489 .hooknum = NF_INET_LOCAL_IN, 1747 .hooknum = NF_INET_LOCAL_IN,
1490 .priority = 100, 1748 .priority = 101,
1491 }, 1749 },
1492 /* After packet filtering, change source only for VS/NAT */ 1750 /* Before ip_vs_in, change source only for VS/NAT */
1751 {
1752 .hook = ip_vs_local_reply6,
1753 .owner = THIS_MODULE,
1754 .pf = PF_INET,
1755 .hooknum = NF_INET_LOCAL_OUT,
1756 .priority = -99,
1757 },
1758 /* After mangle, schedule and forward local requests */
1493 { 1759 {
1494 .hook = ip_vs_out, 1760 .hook = ip_vs_local_request6,
1495 .owner = THIS_MODULE, 1761 .owner = THIS_MODULE,
1496 .pf = PF_INET6, 1762 .pf = PF_INET6,
1497 .hooknum = NF_INET_FORWARD, 1763 .hooknum = NF_INET_LOCAL_OUT,
1498 .priority = 100, 1764 .priority = -98,
1499 }, 1765 },
1500 /* After packet filtering (but before ip_vs_out_icmp), catch icmp 1766 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1501 * destined for 0.0.0.0/0, which is for incoming IPVS connections */ 1767 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
@@ -1503,8 +1769,16 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1503 .hook = ip_vs_forward_icmp_v6, 1769 .hook = ip_vs_forward_icmp_v6,
1504 .owner = THIS_MODULE, 1770 .owner = THIS_MODULE,
1505 .pf = PF_INET6, 1771 .pf = PF_INET6,
1506 .hooknum = NF_INET_FORWARD, 1772 .hooknum = NF_INET_FORWARD,
1507 .priority = 99, 1773 .priority = 99,
1774 },
1775 /* After packet filtering, change source only for VS/NAT */
1776 {
1777 .hook = ip_vs_reply6,
1778 .owner = THIS_MODULE,
1779 .pf = PF_INET6,
1780 .hooknum = NF_INET_FORWARD,
1781 .priority = 100,
1508 }, 1782 },
1509#endif 1783#endif
1510}; 1784};
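The enlarged ip_vs_ops[] table is still registered in one shot; the registration itself is outside this hunk, but the usual pattern for an array of struct nf_hook_ops in this kernel generation looks like the following sketch (function names are illustrative only):

	/* Illustrative only: register/unregister the whole hook array at once. */
	static int __init ip_vs_hooks_demo_init(void)
	{
		int ret;

		ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
		if (ret < 0)
			pr_err("can't register hooks.\n");
		return ret;
	}

	static void __exit ip_vs_hooks_demo_exit(void)
	{
		nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
	}

Priorities decide the ordering within one hook point, which is why the reply handlers (99, -99) deliberately run before the request handlers (101, -98) in LOCAL_IN and LOCAL_OUT.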
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 0f0c079c422a..5f5daa30b0af 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -61,7 +61,7 @@ static DEFINE_RWLOCK(__ip_vs_svc_lock);
61static DEFINE_RWLOCK(__ip_vs_rs_lock); 61static DEFINE_RWLOCK(__ip_vs_rs_lock);
62 62
63/* lock for state and timeout tables */ 63/* lock for state and timeout tables */
64static DEFINE_RWLOCK(__ip_vs_securetcp_lock); 64static DEFINE_SPINLOCK(ip_vs_securetcp_lock);
65 65
66/* lock for drop entry handling */ 66/* lock for drop entry handling */
67static DEFINE_SPINLOCK(__ip_vs_dropentry_lock); 67static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
@@ -88,6 +88,10 @@ int sysctl_ip_vs_expire_nodest_conn = 0;
88int sysctl_ip_vs_expire_quiescent_template = 0; 88int sysctl_ip_vs_expire_quiescent_template = 0;
89int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; 89int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
90int sysctl_ip_vs_nat_icmp_send = 0; 90int sysctl_ip_vs_nat_icmp_send = 0;
91#ifdef CONFIG_IP_VS_NFCT
92int sysctl_ip_vs_conntrack;
93#endif
94int sysctl_ip_vs_snat_reroute = 1;
91 95
92 96
93#ifdef CONFIG_IP_VS_DEBUG 97#ifdef CONFIG_IP_VS_DEBUG
@@ -204,7 +208,7 @@ static void update_defense_level(void)
204 spin_unlock(&__ip_vs_droppacket_lock); 208 spin_unlock(&__ip_vs_droppacket_lock);
205 209
206 /* secure_tcp */ 210 /* secure_tcp */
207 write_lock(&__ip_vs_securetcp_lock); 211 spin_lock(&ip_vs_securetcp_lock);
208 switch (sysctl_ip_vs_secure_tcp) { 212 switch (sysctl_ip_vs_secure_tcp) {
209 case 0: 213 case 0:
210 if (old_secure_tcp >= 2) 214 if (old_secure_tcp >= 2)
@@ -238,7 +242,7 @@ static void update_defense_level(void)
238 old_secure_tcp = sysctl_ip_vs_secure_tcp; 242 old_secure_tcp = sysctl_ip_vs_secure_tcp;
239 if (to_change >= 0) 243 if (to_change >= 0)
240 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); 244 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
241 write_unlock(&__ip_vs_securetcp_lock); 245 spin_unlock(&ip_vs_securetcp_lock);
242 246
243 local_bh_enable(); 247 local_bh_enable();
244} 248}
@@ -401,7 +405,7 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
401 * Get service by {proto,addr,port} in the service table. 405 * Get service by {proto,addr,port} in the service table.
402 */ 406 */
403static inline struct ip_vs_service * 407static inline struct ip_vs_service *
404__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr, 408__ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr,
405 __be16 vport) 409 __be16 vport)
406{ 410{
407 unsigned hash; 411 unsigned hash;
@@ -416,7 +420,6 @@ __ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
416 && (svc->port == vport) 420 && (svc->port == vport)
417 && (svc->protocol == protocol)) { 421 && (svc->protocol == protocol)) {
418 /* HIT */ 422 /* HIT */
419 atomic_inc(&svc->usecnt);
420 return svc; 423 return svc;
421 } 424 }
422 } 425 }
@@ -429,7 +432,7 @@ __ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
429 * Get service by {fwmark} in the service table. 432 * Get service by {fwmark} in the service table.
430 */ 433 */
431static inline struct ip_vs_service * 434static inline struct ip_vs_service *
432__ip_vs_svc_fwm_get(int af, __u32 fwmark) 435__ip_vs_svc_fwm_find(int af, __u32 fwmark)
433{ 436{
434 unsigned hash; 437 unsigned hash;
435 struct ip_vs_service *svc; 438 struct ip_vs_service *svc;
@@ -440,7 +443,6 @@ __ip_vs_svc_fwm_get(int af, __u32 fwmark)
440 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { 443 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
441 if (svc->fwmark == fwmark && svc->af == af) { 444 if (svc->fwmark == fwmark && svc->af == af) {
442 /* HIT */ 445 /* HIT */
443 atomic_inc(&svc->usecnt);
444 return svc; 446 return svc;
445 } 447 }
446 } 448 }
@@ -459,14 +461,14 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
459 /* 461 /*
460 * Check the table hashed by fwmark first 462 * Check the table hashed by fwmark first
461 */ 463 */
462 if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark))) 464 if (fwmark && (svc = __ip_vs_svc_fwm_find(af, fwmark)))
463 goto out; 465 goto out;
464 466
465 /* 467 /*
466 * Check the table hashed by <protocol,addr,port> 468 * Check the table hashed by <protocol,addr,port>
467 * for "full" addressed entries 469 * for "full" addressed entries
468 */ 470 */
469 svc = __ip_vs_service_get(af, protocol, vaddr, vport); 471 svc = __ip_vs_service_find(af, protocol, vaddr, vport);
470 472
471 if (svc == NULL 473 if (svc == NULL
472 && protocol == IPPROTO_TCP 474 && protocol == IPPROTO_TCP
@@ -476,7 +478,7 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
476 * Check if ftp service entry exists, the packet 478 * Check if ftp service entry exists, the packet
477 * might belong to FTP data connections. 479 * might belong to FTP data connections.
478 */ 480 */
479 svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT); 481 svc = __ip_vs_service_find(af, protocol, vaddr, FTPPORT);
480 } 482 }
481 483
482 if (svc == NULL 484 if (svc == NULL
@@ -484,10 +486,12 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
484 /* 486 /*
485 * Check if the catch-all port (port zero) exists 487 * Check if the catch-all port (port zero) exists
486 */ 488 */
487 svc = __ip_vs_service_get(af, protocol, vaddr, 0); 489 svc = __ip_vs_service_find(af, protocol, vaddr, 0);
488 } 490 }
489 491
490 out: 492 out:
493 if (svc)
494 atomic_inc(&svc->usecnt);
491 read_unlock(&__ip_vs_svc_lock); 495 read_unlock(&__ip_vs_svc_lock);
492 496
493 IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", 497 IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
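With the rename from *_get to *_find, the bare lookups no longer touch svc->usecnt; only the public ip_vs_service_get() takes the reference, and it now does so once at the common out: label. Callers keep pairing it with ip_vs_service_put() as before; a minimal sketch of the contract (demo function name is hypothetical):

	/* Sketch: __ip_vs_service_find()/__ip_vs_svc_fwm_find() are lock-protected
	 * lookups with no refcounting; ip_vs_service_get() is "find + usecnt++". */
	static bool demo_service_exists(int af, __u32 fwmark, __u16 protocol,
					const union nf_inet_addr *vaddr, __be16 vport)
	{
		struct ip_vs_service *svc;

		svc = ip_vs_service_get(af, fwmark, protocol, vaddr, vport);
		if (!svc)
			return false;
		/* ... use svc under its usecnt reference ... */
		ip_vs_service_put(svc);
		return true;
	}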
@@ -506,14 +510,19 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
506 dest->svc = svc; 510 dest->svc = svc;
507} 511}
508 512
509static inline void 513static void
510__ip_vs_unbind_svc(struct ip_vs_dest *dest) 514__ip_vs_unbind_svc(struct ip_vs_dest *dest)
511{ 515{
512 struct ip_vs_service *svc = dest->svc; 516 struct ip_vs_service *svc = dest->svc;
513 517
514 dest->svc = NULL; 518 dest->svc = NULL;
515 if (atomic_dec_and_test(&svc->refcnt)) 519 if (atomic_dec_and_test(&svc->refcnt)) {
520 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
521 svc->fwmark,
522 IP_VS_DBG_ADDR(svc->af, &svc->addr),
523 ntohs(svc->port), atomic_read(&svc->usecnt));
516 kfree(svc); 524 kfree(svc);
525 }
517} 526}
518 527
519 528
@@ -758,31 +767,18 @@ ip_vs_zero_stats(struct ip_vs_stats *stats)
758 * Update a destination in the given service 767 * Update a destination in the given service
759 */ 768 */
760static void 769static void
761__ip_vs_update_dest(struct ip_vs_service *svc, 770__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
762 struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest) 771 struct ip_vs_dest_user_kern *udest, int add)
763{ 772{
764 int conn_flags; 773 int conn_flags;
765 774
766 /* set the weight and the flags */ 775 /* set the weight and the flags */
767 atomic_set(&dest->weight, udest->weight); 776 atomic_set(&dest->weight, udest->weight);
768 conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; 777 conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
769 778 conn_flags |= IP_VS_CONN_F_INACTIVE;
770 /* check if local node and update the flags */
771#ifdef CONFIG_IP_VS_IPV6
772 if (svc->af == AF_INET6) {
773 if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) {
774 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
775 | IP_VS_CONN_F_LOCALNODE;
776 }
777 } else
778#endif
779 if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) {
780 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
781 | IP_VS_CONN_F_LOCALNODE;
782 }
783 779
784 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ 780 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
785 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) { 781 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
786 conn_flags |= IP_VS_CONN_F_NOOUTPUT; 782 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
787 } else { 783 } else {
788 /* 784 /*
@@ -813,6 +809,29 @@ __ip_vs_update_dest(struct ip_vs_service *svc,
813 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 809 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
814 dest->u_threshold = udest->u_threshold; 810 dest->u_threshold = udest->u_threshold;
815 dest->l_threshold = udest->l_threshold; 811 dest->l_threshold = udest->l_threshold;
812
813 spin_lock(&dest->dst_lock);
814 ip_vs_dst_reset(dest);
815 spin_unlock(&dest->dst_lock);
816
817 if (add)
818 ip_vs_new_estimator(&dest->stats);
819
820 write_lock_bh(&__ip_vs_svc_lock);
821
822 /* Wait until all other svc users go away */
823 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
824
825 if (add) {
826 list_add(&dest->n_list, &svc->destinations);
827 svc->num_dests++;
828 }
829
830 /* call the update_service, because server weight may be changed */
831 if (svc->scheduler->update_service)
832 svc->scheduler->update_service(svc);
833
834 write_unlock_bh(&__ip_vs_svc_lock);
816} 835}
817 836
818 837
@@ -843,7 +862,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
843 return -EINVAL; 862 return -EINVAL;
844 } 863 }
845 864
846 dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); 865 dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
847 if (dest == NULL) { 866 if (dest == NULL) {
848 pr_err("%s(): no memory.\n", __func__); 867 pr_err("%s(): no memory.\n", __func__);
849 return -ENOMEM; 868 return -ENOMEM;
@@ -860,13 +879,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
860 atomic_set(&dest->activeconns, 0); 879 atomic_set(&dest->activeconns, 0);
861 atomic_set(&dest->inactconns, 0); 880 atomic_set(&dest->inactconns, 0);
862 atomic_set(&dest->persistconns, 0); 881 atomic_set(&dest->persistconns, 0);
863 atomic_set(&dest->refcnt, 0); 882 atomic_set(&dest->refcnt, 1);
864 883
865 INIT_LIST_HEAD(&dest->d_list); 884 INIT_LIST_HEAD(&dest->d_list);
866 spin_lock_init(&dest->dst_lock); 885 spin_lock_init(&dest->dst_lock);
867 spin_lock_init(&dest->stats.lock); 886 spin_lock_init(&dest->stats.lock);
868 __ip_vs_update_dest(svc, dest, udest); 887 __ip_vs_update_dest(svc, dest, udest, 1);
869 ip_vs_new_estimator(&dest->stats);
870 888
871 *dest_p = dest; 889 *dest_p = dest;
872 890
@@ -926,65 +944,22 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
926 IP_VS_DBG_ADDR(svc->af, &dest->vaddr), 944 IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
927 ntohs(dest->vport)); 945 ntohs(dest->vport));
928 946
929 __ip_vs_update_dest(svc, dest, udest);
930
931 /* 947 /*
932 * Get the destination from the trash 948 * Get the destination from the trash
933 */ 949 */
934 list_del(&dest->n_list); 950 list_del(&dest->n_list);
935 951
936 ip_vs_new_estimator(&dest->stats); 952 __ip_vs_update_dest(svc, dest, udest, 1);
937 953 ret = 0;
938 write_lock_bh(&__ip_vs_svc_lock); 954 } else {
939
940 /* 955 /*
941 * Wait until all other svc users go away. 956 * Allocate and initialize the dest structure
942 */ 957 */
943 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); 958 ret = ip_vs_new_dest(svc, udest, &dest);
944
945 list_add(&dest->n_list, &svc->destinations);
946 svc->num_dests++;
947
948 /* call the update_service function of its scheduler */
949 if (svc->scheduler->update_service)
950 svc->scheduler->update_service(svc);
951
952 write_unlock_bh(&__ip_vs_svc_lock);
953 return 0;
954 }
955
956 /*
957 * Allocate and initialize the dest structure
958 */
959 ret = ip_vs_new_dest(svc, udest, &dest);
960 if (ret) {
961 return ret;
962 } 959 }
963
964 /*
965 * Add the dest entry into the list
966 */
967 atomic_inc(&dest->refcnt);
968
969 write_lock_bh(&__ip_vs_svc_lock);
970
971 /*
972 * Wait until all other svc users go away.
973 */
974 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
975
976 list_add(&dest->n_list, &svc->destinations);
977 svc->num_dests++;
978
979 /* call the update_service function of its scheduler */
980 if (svc->scheduler->update_service)
981 svc->scheduler->update_service(svc);
982
983 write_unlock_bh(&__ip_vs_svc_lock);
984
985 LeaveFunction(2); 960 LeaveFunction(2);
986 961
987 return 0; 962 return ret;
988} 963}
989 964
990 965
@@ -1023,19 +998,7 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1023 return -ENOENT; 998 return -ENOENT;
1024 } 999 }
1025 1000
1026 __ip_vs_update_dest(svc, dest, udest); 1001 __ip_vs_update_dest(svc, dest, udest, 0);
1027
1028 write_lock_bh(&__ip_vs_svc_lock);
1029
1030 /* Wait until all other svc users go away */
1031 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1032
1033 /* call the update_service, because server weight may be changed */
1034 if (svc->scheduler->update_service)
1035 svc->scheduler->update_service(svc);
1036
1037 write_unlock_bh(&__ip_vs_svc_lock);
1038
1039 LeaveFunction(2); 1002 LeaveFunction(2);
1040 1003
1041 return 0; 1004 return 0;
@@ -1062,6 +1025,10 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
1062 * the destination into the trash. 1025 * the destination into the trash.
1063 */ 1026 */
1064 if (atomic_dec_and_test(&dest->refcnt)) { 1027 if (atomic_dec_and_test(&dest->refcnt)) {
1028 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
1029 dest->vfwmark,
1030 IP_VS_DBG_ADDR(dest->af, &dest->addr),
1031 ntohs(dest->port));
1065 ip_vs_dst_reset(dest); 1032 ip_vs_dst_reset(dest);
1066 /* simply decrease svc->refcnt here, let the caller check 1033 /* simply decrease svc->refcnt here, let the caller check
1067 and release the service if nobody refers to it. 1034 and release the service if nobody refers to it.
@@ -1128,7 +1095,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1128 /* 1095 /*
1129 * Wait until all other svc users go away. 1096 * Wait until all other svc users go away.
1130 */ 1097 */
1131 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); 1098 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1132 1099
1133 /* 1100 /*
1134 * Unlink dest from the service 1101 * Unlink dest from the service
@@ -1157,6 +1124,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
1157{ 1124{
1158 int ret = 0; 1125 int ret = 0;
1159 struct ip_vs_scheduler *sched = NULL; 1126 struct ip_vs_scheduler *sched = NULL;
1127 struct ip_vs_pe *pe = NULL;
1160 struct ip_vs_service *svc = NULL; 1128 struct ip_vs_service *svc = NULL;
1161 1129
1162 /* increase the module use count */ 1130 /* increase the module use count */
@@ -1167,7 +1135,17 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
1167 if (sched == NULL) { 1135 if (sched == NULL) {
1168 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); 1136 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1169 ret = -ENOENT; 1137 ret = -ENOENT;
1170 goto out_mod_dec; 1138 goto out_err;
1139 }
1140
1141 if (u->pe_name && *u->pe_name) {
1142 pe = ip_vs_pe_get(u->pe_name);
1143 if (pe == NULL) {
1144 pr_info("persistence engine module ip_vs_pe_%s "
1145 "not found\n", u->pe_name);
1146 ret = -ENOENT;
1147 goto out_err;
1148 }
1171 } 1149 }
1172 1150
1173#ifdef CONFIG_IP_VS_IPV6 1151#ifdef CONFIG_IP_VS_IPV6
@@ -1177,7 +1155,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
1177 } 1155 }
1178#endif 1156#endif
1179 1157
1180 svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); 1158 svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
1181 if (svc == NULL) { 1159 if (svc == NULL) {
1182 IP_VS_DBG(1, "%s(): no memory\n", __func__); 1160 IP_VS_DBG(1, "%s(): no memory\n", __func__);
1183 ret = -ENOMEM; 1161 ret = -ENOMEM;
@@ -1185,7 +1163,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
1185 } 1163 }
1186 1164
1187 /* I'm the first user of the service */ 1165 /* I'm the first user of the service */
1188 atomic_set(&svc->usecnt, 1); 1166 atomic_set(&svc->usecnt, 0);
1189 atomic_set(&svc->refcnt, 0); 1167 atomic_set(&svc->refcnt, 0);
1190 1168
1191 svc->af = u->af; 1169 svc->af = u->af;
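Starting usecnt at 0 instead of 1 is what lets every IP_VS_WAIT_WHILE() in this file compare against 0 rather than 1: the configuration path no longer counts itself as a user, so the waits only have to drain packet-path lookups that bumped the counter. The macro is defined earlier in ip_vs_ctl.c; a sketch of the idea, if not the exact definition:

	/* Sketch: spin until the remaining service users have dropped their
	 * usecnt references taken via ip_vs_service_get(). */
	#define IP_VS_WAIT_WHILE(expr)	while (expr) { cpu_relax(); }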
@@ -1207,6 +1185,10 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
1207 goto out_err; 1185 goto out_err;
1208 sched = NULL; 1186 sched = NULL;
1209 1187
1188 /* Bind the ct retriever */
1189 ip_vs_bind_pe(svc, pe);
1190 pe = NULL;
1191
1210 /* Update the virtual service counters */ 1192 /* Update the virtual service counters */
1211 if (svc->port == FTPPORT) 1193 if (svc->port == FTPPORT)
1212 atomic_inc(&ip_vs_ftpsvc_counter); 1194 atomic_inc(&ip_vs_ftpsvc_counter);
@@ -1227,10 +1209,9 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
1227 *svc_p = svc; 1209 *svc_p = svc;
1228 return 0; 1210 return 0;
1229 1211
1230 out_err: 1212 out_err:
1231 if (svc != NULL) { 1213 if (svc != NULL) {
1232 if (svc->scheduler) 1214 ip_vs_unbind_scheduler(svc);
1233 ip_vs_unbind_scheduler(svc);
1234 if (svc->inc) { 1215 if (svc->inc) {
1235 local_bh_disable(); 1216 local_bh_disable();
1236 ip_vs_app_inc_put(svc->inc); 1217 ip_vs_app_inc_put(svc->inc);
@@ -1239,8 +1220,8 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
1239 kfree(svc); 1220 kfree(svc);
1240 } 1221 }
1241 ip_vs_scheduler_put(sched); 1222 ip_vs_scheduler_put(sched);
1223 ip_vs_pe_put(pe);
1242 1224
1243 out_mod_dec:
1244 /* decrease the module use count */ 1225 /* decrease the module use count */
1245 ip_vs_use_count_dec(); 1226 ip_vs_use_count_dec();
1246 1227
@@ -1255,6 +1236,7 @@ static int
1255ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) 1236ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1256{ 1237{
1257 struct ip_vs_scheduler *sched, *old_sched; 1238 struct ip_vs_scheduler *sched, *old_sched;
1239 struct ip_vs_pe *pe = NULL, *old_pe = NULL;
1258 int ret = 0; 1240 int ret = 0;
1259 1241
1260 /* 1242 /*
@@ -1267,6 +1249,17 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1267 } 1249 }
1268 old_sched = sched; 1250 old_sched = sched;
1269 1251
1252 if (u->pe_name && *u->pe_name) {
1253 pe = ip_vs_pe_get(u->pe_name);
1254 if (pe == NULL) {
1255 pr_info("persistence engine module ip_vs_pe_%s "
1256 "not found\n", u->pe_name);
1257 ret = -ENOENT;
1258 goto out;
1259 }
1260 old_pe = pe;
1261 }
1262
1270#ifdef CONFIG_IP_VS_IPV6 1263#ifdef CONFIG_IP_VS_IPV6
1271 if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { 1264 if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1272 ret = -EINVAL; 1265 ret = -EINVAL;
@@ -1279,7 +1272,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1279 /* 1272 /*
1280 * Wait until all other svc users go away. 1273 * Wait until all other svc users go away.
1281 */ 1274 */
1282 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); 1275 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1283 1276
1284 /* 1277 /*
1285 * Set the flags and timeout value 1278 * Set the flags and timeout value
@@ -1318,15 +1311,17 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1318 } 1311 }
1319 } 1312 }
1320 1313
1314 old_pe = svc->pe;
1315 if (pe != old_pe) {
1316 ip_vs_unbind_pe(svc);
1317 ip_vs_bind_pe(svc, pe);
1318 }
1319
1321 out_unlock: 1320 out_unlock:
1322 write_unlock_bh(&__ip_vs_svc_lock); 1321 write_unlock_bh(&__ip_vs_svc_lock);
1323#ifdef CONFIG_IP_VS_IPV6
1324 out: 1322 out:
1325#endif 1323 ip_vs_scheduler_put(old_sched);
1326 1324 ip_vs_pe_put(old_pe);
1327 if (old_sched)
1328 ip_vs_scheduler_put(old_sched);
1329
1330 return ret; 1325 return ret;
1331} 1326}
1332 1327
@@ -1340,6 +1335,9 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
1340{ 1335{
1341 struct ip_vs_dest *dest, *nxt; 1336 struct ip_vs_dest *dest, *nxt;
1342 struct ip_vs_scheduler *old_sched; 1337 struct ip_vs_scheduler *old_sched;
1338 struct ip_vs_pe *old_pe;
1339
1340 pr_info("%s: enter\n", __func__);
1343 1341
1344 /* Count only IPv4 services for old get/setsockopt interface */ 1342 /* Count only IPv4 services for old get/setsockopt interface */
1345 if (svc->af == AF_INET) 1343 if (svc->af == AF_INET)
@@ -1350,8 +1348,12 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
1350 /* Unbind scheduler */ 1348 /* Unbind scheduler */
1351 old_sched = svc->scheduler; 1349 old_sched = svc->scheduler;
1352 ip_vs_unbind_scheduler(svc); 1350 ip_vs_unbind_scheduler(svc);
1353 if (old_sched) 1351 ip_vs_scheduler_put(old_sched);
1354 ip_vs_scheduler_put(old_sched); 1352
1353 /* Unbind persistence engine */
1354 old_pe = svc->pe;
1355 ip_vs_unbind_pe(svc);
1356 ip_vs_pe_put(old_pe);
1355 1357
1356 /* Unbind app inc */ 1358 /* Unbind app inc */
1357 if (svc->inc) { 1359 if (svc->inc) {
@@ -1378,21 +1380,23 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
1378 /* 1380 /*
1379 * Free the service if nobody refers to it 1381 * Free the service if nobody refers to it
1380 */ 1382 */
1381 if (atomic_read(&svc->refcnt) == 0) 1383 if (atomic_read(&svc->refcnt) == 0) {
1384 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
1385 svc->fwmark,
1386 IP_VS_DBG_ADDR(svc->af, &svc->addr),
1387 ntohs(svc->port), atomic_read(&svc->usecnt));
1382 kfree(svc); 1388 kfree(svc);
1389 }
1383 1390
1384 /* decrease the module use count */ 1391 /* decrease the module use count */
1385 ip_vs_use_count_dec(); 1392 ip_vs_use_count_dec();
1386} 1393}
1387 1394
1388/* 1395/*
1389 * Delete a service from the service list 1396 * Unlink a service from list and try to delete it if its refcnt reached 0
1390 */ 1397 */
1391static int ip_vs_del_service(struct ip_vs_service *svc) 1398static void ip_vs_unlink_service(struct ip_vs_service *svc)
1392{ 1399{
1393 if (svc == NULL)
1394 return -EEXIST;
1395
1396 /* 1400 /*
1397 * Unhash it from the service table 1401 * Unhash it from the service table
1398 */ 1402 */
@@ -1403,11 +1407,21 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
1403 /* 1407 /*
1404 * Wait until all the svc users go away. 1408 * Wait until all the svc users go away.
1405 */ 1409 */
1406 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); 1410 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1407 1411
1408 __ip_vs_del_service(svc); 1412 __ip_vs_del_service(svc);
1409 1413
1410 write_unlock_bh(&__ip_vs_svc_lock); 1414 write_unlock_bh(&__ip_vs_svc_lock);
1415}
1416
1417/*
1418 * Delete a service from the service list
1419 */
1420static int ip_vs_del_service(struct ip_vs_service *svc)
1421{
1422 if (svc == NULL)
1423 return -EEXIST;
1424 ip_vs_unlink_service(svc);
1411 1425
1412 return 0; 1426 return 0;
1413} 1427}
@@ -1426,14 +1440,7 @@ static int ip_vs_flush(void)
1426 */ 1440 */
1427 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 1441 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1428 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) { 1442 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1429 write_lock_bh(&__ip_vs_svc_lock); 1443 ip_vs_unlink_service(svc);
1430 ip_vs_svc_unhash(svc);
1431 /*
1432 * Wait until all the svc users go away.
1433 */
1434 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1435 __ip_vs_del_service(svc);
1436 write_unlock_bh(&__ip_vs_svc_lock);
1437 } 1444 }
1438 } 1445 }
1439 1446
@@ -1443,14 +1450,7 @@ static int ip_vs_flush(void)
1443 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 1450 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1444 list_for_each_entry_safe(svc, nxt, 1451 list_for_each_entry_safe(svc, nxt,
1445 &ip_vs_svc_fwm_table[idx], f_list) { 1452 &ip_vs_svc_fwm_table[idx], f_list) {
1446 write_lock_bh(&__ip_vs_svc_lock); 1453 ip_vs_unlink_service(svc);
1447 ip_vs_svc_unhash(svc);
1448 /*
1449 * Wait until all the svc users go away.
1450 */
1451 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1452 __ip_vs_del_service(svc);
1453 write_unlock_bh(&__ip_vs_svc_lock);
1454 } 1454 }
1455 } 1455 }
1456 1456
@@ -1579,6 +1579,15 @@ static struct ctl_table vs_vars[] = {
1579 .mode = 0644, 1579 .mode = 0644,
1580 .proc_handler = proc_do_defense_mode, 1580 .proc_handler = proc_do_defense_mode,
1581 }, 1581 },
1582#ifdef CONFIG_IP_VS_NFCT
1583 {
1584 .procname = "conntrack",
1585 .data = &sysctl_ip_vs_conntrack,
1586 .maxlen = sizeof(int),
1587 .mode = 0644,
1588 .proc_handler = &proc_dointvec,
1589 },
1590#endif
1582 { 1591 {
1583 .procname = "secure_tcp", 1592 .procname = "secure_tcp",
1584 .data = &sysctl_ip_vs_secure_tcp, 1593 .data = &sysctl_ip_vs_secure_tcp,
@@ -1586,6 +1595,13 @@ static struct ctl_table vs_vars[] = {
1586 .mode = 0644, 1595 .mode = 0644,
1587 .proc_handler = proc_do_defense_mode, 1596 .proc_handler = proc_do_defense_mode,
1588 }, 1597 },
1598 {
1599 .procname = "snat_reroute",
1600 .data = &sysctl_ip_vs_snat_reroute,
1601 .maxlen = sizeof(int),
1602 .mode = 0644,
1603 .proc_handler = &proc_dointvec,
1604 },
1589#if 0 1605#if 0
1590 { 1606 {
1591 .procname = "timeout_established", 1607 .procname = "timeout_established",
@@ -2041,6 +2057,8 @@ static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
2041static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, 2057static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
2042 struct ip_vs_service_user *usvc_compat) 2058 struct ip_vs_service_user *usvc_compat)
2043{ 2059{
2060 memset(usvc, 0, sizeof(*usvc));
2061
2044 usvc->af = AF_INET; 2062 usvc->af = AF_INET;
2045 usvc->protocol = usvc_compat->protocol; 2063 usvc->protocol = usvc_compat->protocol;
2046 usvc->addr.ip = usvc_compat->addr; 2064 usvc->addr.ip = usvc_compat->addr;
@@ -2058,6 +2076,8 @@ static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
2058static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, 2076static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
2059 struct ip_vs_dest_user *udest_compat) 2077 struct ip_vs_dest_user *udest_compat)
2060{ 2078{
2079 memset(udest, 0, sizeof(*udest));
2080
2061 udest->addr.ip = udest_compat->addr; 2081 udest->addr.ip = udest_compat->addr;
2062 udest->port = udest_compat->port; 2082 udest->port = udest_compat->port;
2063 udest->conn_flags = udest_compat->conn_flags; 2083 udest->conn_flags = udest_compat->conn_flags;
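Zeroing the kernel-side structures before copying the compat fields matters because the *_user_kern structures have grown members (pe_name, for one) that the old sockopt interface cannot express; without the memset they would carry stack garbage into ip_vs_add_service()/ip_vs_add_dest(). A partial sketch of the pattern, with a hypothetical function name:

	/* Sketch: convert an old-ABI request into the kernel structure, leaving
	 * every field the old ABI cannot express safely zeroed. */
	static void demo_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
					  const struct ip_vs_service_user *u)
	{
		memset(usvc, 0, sizeof(*usvc));	/* newer fields default to 0/NULL */
		usvc->af       = AF_INET;
		usvc->protocol = u->protocol;
		usvc->addr.ip  = u->addr;
		usvc->port     = u->port;
		usvc->fwmark   = u->fwmark;
	}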
@@ -2147,10 +2167,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2147 2167
2148 /* Lookup the exact service by <protocol, addr, port> or fwmark */ 2168 /* Lookup the exact service by <protocol, addr, port> or fwmark */
2149 if (usvc.fwmark == 0) 2169 if (usvc.fwmark == 0)
2150 svc = __ip_vs_service_get(usvc.af, usvc.protocol, 2170 svc = __ip_vs_service_find(usvc.af, usvc.protocol,
2151 &usvc.addr, usvc.port); 2171 &usvc.addr, usvc.port);
2152 else 2172 else
2153 svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); 2173 svc = __ip_vs_svc_fwm_find(usvc.af, usvc.fwmark);
2154 2174
2155 if (cmd != IP_VS_SO_SET_ADD 2175 if (cmd != IP_VS_SO_SET_ADD
2156 && (svc == NULL || svc->protocol != usvc.protocol)) { 2176 && (svc == NULL || svc->protocol != usvc.protocol)) {
@@ -2189,9 +2209,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2189 ret = -EINVAL; 2209 ret = -EINVAL;
2190 } 2210 }
2191 2211
2192 if (svc)
2193 ip_vs_service_put(svc);
2194
2195 out_unlock: 2212 out_unlock:
2196 mutex_unlock(&__ip_vs_mutex); 2213 mutex_unlock(&__ip_vs_mutex);
2197 out_dec: 2214 out_dec:
@@ -2284,10 +2301,10 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2284 int ret = 0; 2301 int ret = 0;
2285 2302
2286 if (get->fwmark) 2303 if (get->fwmark)
2287 svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark); 2304 svc = __ip_vs_svc_fwm_find(AF_INET, get->fwmark);
2288 else 2305 else
2289 svc = __ip_vs_service_get(AF_INET, get->protocol, &addr, 2306 svc = __ip_vs_service_find(AF_INET, get->protocol, &addr,
2290 get->port); 2307 get->port);
2291 2308
2292 if (svc) { 2309 if (svc) {
2293 int count = 0; 2310 int count = 0;
@@ -2315,7 +2332,6 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2315 } 2332 }
2316 count++; 2333 count++;
2317 } 2334 }
2318 ip_vs_service_put(svc);
2319 } else 2335 } else
2320 ret = -ESRCH; 2336 ret = -ESRCH;
2321 return ret; 2337 return ret;
@@ -2436,15 +2452,14 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2436 entry = (struct ip_vs_service_entry *)arg; 2452 entry = (struct ip_vs_service_entry *)arg;
2437 addr.ip = entry->addr; 2453 addr.ip = entry->addr;
2438 if (entry->fwmark) 2454 if (entry->fwmark)
2439 svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark); 2455 svc = __ip_vs_svc_fwm_find(AF_INET, entry->fwmark);
2440 else 2456 else
2441 svc = __ip_vs_service_get(AF_INET, entry->protocol, 2457 svc = __ip_vs_service_find(AF_INET, entry->protocol,
2442 &addr, entry->port); 2458 &addr, entry->port);
2443 if (svc) { 2459 if (svc) {
2444 ip_vs_copy_service(entry, svc); 2460 ip_vs_copy_service(entry, svc);
2445 if (copy_to_user(user, entry, sizeof(*entry)) != 0) 2461 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2446 ret = -EFAULT; 2462 ret = -EFAULT;
2447 ip_vs_service_put(svc);
2448 } else 2463 } else
2449 ret = -ESRCH; 2464 ret = -ESRCH;
2450 } 2465 }
@@ -2559,6 +2574,8 @@ static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
2559 [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, 2574 [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 },
2560 [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, 2575 [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING,
2561 .len = IP_VS_SCHEDNAME_MAXLEN }, 2576 .len = IP_VS_SCHEDNAME_MAXLEN },
2577 [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING,
2578 .len = IP_VS_PENAME_MAXLEN },
2562 [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, 2579 [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY,
2563 .len = sizeof(struct ip_vs_flags) }, 2580 .len = sizeof(struct ip_vs_flags) },
2564 [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, 2581 [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 },
@@ -2635,6 +2652,8 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
2635 } 2652 }
2636 2653
2637 NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name); 2654 NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
2655 if (svc->pe)
2656 NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name);
2638 NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags); 2657 NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
2639 NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ); 2658 NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
2640 NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask); 2659 NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
@@ -2711,10 +2730,12 @@ nla_put_failure:
2711} 2730}
2712 2731
2713static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, 2732static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
2714 struct nlattr *nla, int full_entry) 2733 struct nlattr *nla, int full_entry,
2734 struct ip_vs_service **ret_svc)
2715{ 2735{
2716 struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; 2736 struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
2717 struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; 2737 struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
2738 struct ip_vs_service *svc;
2718 2739
2719 /* Parse mandatory identifying service fields first */ 2740 /* Parse mandatory identifying service fields first */
2720 if (nla == NULL || 2741 if (nla == NULL ||
@@ -2750,14 +2771,21 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
2750 usvc->fwmark = 0; 2771 usvc->fwmark = 0;
2751 } 2772 }
2752 2773
2774 if (usvc->fwmark)
2775 svc = __ip_vs_svc_fwm_find(usvc->af, usvc->fwmark);
2776 else
2777 svc = __ip_vs_service_find(usvc->af, usvc->protocol,
2778 &usvc->addr, usvc->port);
2779 *ret_svc = svc;
2780
2753 /* If a full entry was requested, check for the additional fields */ 2781 /* If a full entry was requested, check for the additional fields */
2754 if (full_entry) { 2782 if (full_entry) {
2755 struct nlattr *nla_sched, *nla_flags, *nla_timeout, 2783 struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
2756 *nla_netmask; 2784 *nla_netmask;
2757 struct ip_vs_flags flags; 2785 struct ip_vs_flags flags;
2758 struct ip_vs_service *svc;
2759 2786
2760 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; 2787 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
2788 nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
2761 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; 2789 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
2762 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; 2790 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
2763 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; 2791 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
@@ -2768,21 +2796,14 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
2768 nla_memcpy(&flags, nla_flags, sizeof(flags)); 2796 nla_memcpy(&flags, nla_flags, sizeof(flags));
2769 2797
2770 /* prefill flags from service if it already exists */ 2798 /* prefill flags from service if it already exists */
2771 if (usvc->fwmark) 2799 if (svc)
2772 svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark);
2773 else
2774 svc = __ip_vs_service_get(usvc->af, usvc->protocol,
2775 &usvc->addr, usvc->port);
2776 if (svc) {
2777 usvc->flags = svc->flags; 2800 usvc->flags = svc->flags;
2778 ip_vs_service_put(svc);
2779 } else
2780 usvc->flags = 0;
2781 2801
2782 /* set new flags from userland */ 2802 /* set new flags from userland */
2783 usvc->flags = (usvc->flags & ~flags.mask) | 2803 usvc->flags = (usvc->flags & ~flags.mask) |
2784 (flags.flags & flags.mask); 2804 (flags.flags & flags.mask);
2785 usvc->sched_name = nla_data(nla_sched); 2805 usvc->sched_name = nla_data(nla_sched);
2806 usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
2786 usvc->timeout = nla_get_u32(nla_timeout); 2807 usvc->timeout = nla_get_u32(nla_timeout);
2787 usvc->netmask = nla_get_u32(nla_netmask); 2808 usvc->netmask = nla_get_u32(nla_netmask);
2788 } 2809 }
@@ -2793,17 +2814,11 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
2793static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) 2814static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
2794{ 2815{
2795 struct ip_vs_service_user_kern usvc; 2816 struct ip_vs_service_user_kern usvc;
2817 struct ip_vs_service *svc;
2796 int ret; 2818 int ret;
2797 2819
2798 ret = ip_vs_genl_parse_service(&usvc, nla, 0); 2820 ret = ip_vs_genl_parse_service(&usvc, nla, 0, &svc);
2799 if (ret) 2821 return ret ? ERR_PTR(ret) : svc;
2800 return ERR_PTR(ret);
2801
2802 if (usvc.fwmark)
2803 return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
2804 else
2805 return __ip_vs_service_get(usvc.af, usvc.protocol,
2806 &usvc.addr, usvc.port);
2807} 2822}
2808 2823
2809static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) 2824static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
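ip_vs_genl_find_service() above now encodes a parse failure as ERR_PTR(ret) and a clean miss as NULL, so callers distinguish the three outcomes with the usual kernel idiom. A sketch of the expected caller side (demo_lookup is illustrative, not from the patch):

	/* Sketch: consuming the new return convention (ERR_PTR = parse error,
	 * NULL = no such service, otherwise a valid pointer). */
	static int demo_lookup(struct nlattr *nla, struct ip_vs_service **svcp)
	{
		struct ip_vs_service *svc = ip_vs_genl_find_service(nla);

		if (IS_ERR(svc))
			return PTR_ERR(svc);	/* malformed service attributes */
		*svcp = svc;			/* may be NULL: service not found */
		return 0;
	}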
@@ -2894,7 +2909,6 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
2894 2909
2895nla_put_failure: 2910nla_put_failure:
2896 cb->args[0] = idx; 2911 cb->args[0] = idx;
2897 ip_vs_service_put(svc);
2898 2912
2899out_err: 2913out_err:
2900 mutex_unlock(&__ip_vs_mutex); 2914 mutex_unlock(&__ip_vs_mutex);
@@ -3107,17 +3121,10 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3107 3121
3108 ret = ip_vs_genl_parse_service(&usvc, 3122 ret = ip_vs_genl_parse_service(&usvc,
3109 info->attrs[IPVS_CMD_ATTR_SERVICE], 3123 info->attrs[IPVS_CMD_ATTR_SERVICE],
3110 need_full_svc); 3124 need_full_svc, &svc);
3111 if (ret) 3125 if (ret)
3112 goto out; 3126 goto out;
3113 3127
3114 /* Lookup the exact service by <protocol, addr, port> or fwmark */
3115 if (usvc.fwmark == 0)
3116 svc = __ip_vs_service_get(usvc.af, usvc.protocol,
3117 &usvc.addr, usvc.port);
3118 else
3119 svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
3120
3121 /* Unless we're adding a new service, the service must already exist */ 3128 /* Unless we're adding a new service, the service must already exist */
3122 if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { 3129 if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
3123 ret = -ESRCH; 3130 ret = -ESRCH;
@@ -3151,6 +3158,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3151 break; 3158 break;
3152 case IPVS_CMD_DEL_SERVICE: 3159 case IPVS_CMD_DEL_SERVICE:
3153 ret = ip_vs_del_service(svc); 3160 ret = ip_vs_del_service(svc);
3161 /* do not use svc, it can be freed */
3154 break; 3162 break;
3155 case IPVS_CMD_NEW_DEST: 3163 case IPVS_CMD_NEW_DEST:
3156 ret = ip_vs_add_dest(svc, &udest); 3164 ret = ip_vs_add_dest(svc, &udest);
@@ -3169,8 +3177,6 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3169 } 3177 }
3170 3178
3171out: 3179out:
3172 if (svc)
3173 ip_vs_service_put(svc);
3174 mutex_unlock(&__ip_vs_mutex); 3180 mutex_unlock(&__ip_vs_mutex);
3175 3181
3176 return ret; 3182 return ret;
@@ -3216,7 +3222,6 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
3216 goto out_err; 3222 goto out_err;
3217 } else if (svc) { 3223 } else if (svc) {
3218 ret = ip_vs_genl_fill_service(msg, svc); 3224 ret = ip_vs_genl_fill_service(msg, svc);
3219 ip_vs_service_put(svc);
3220 if (ret) 3225 if (ret)
3221 goto nla_put_failure; 3226 goto nla_put_failure;
3222 } else { 3227 } else {
@@ -3385,6 +3390,16 @@ int __init ip_vs_control_init(void)
3385 3390
3386 EnterFunction(2); 3391 EnterFunction(2);
3387 3392
3393 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
3394 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
3395 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
3396 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
3397 }
3398 for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
3399 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
3400 }
3401 smp_wmb();
3402
3388 ret = nf_register_sockopt(&ip_vs_sockopts); 3403 ret = nf_register_sockopt(&ip_vs_sockopts);
3389 if (ret) { 3404 if (ret) {
3390 pr_err("cannot register sockopt.\n"); 3405 pr_err("cannot register sockopt.\n");
@@ -3403,15 +3418,6 @@ int __init ip_vs_control_init(void)
3403 3418
3404 sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars); 3419 sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
3405 3420
3406 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
3407 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
3408 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
3409 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
3410 }
3411 for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
3412 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
3413 }
3414
3415 ip_vs_new_estimator(&ip_vs_stats); 3421 ip_vs_new_estimator(&ip_vs_stats);
3416 3422
3417 /* Hook the defense timer */ 3423 /* Hook the defense timer */
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index f228a17ec649..75455000ad1c 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -20,17 +20,6 @@
20 * 20 *
21 * Author: Wouter Gadeyne 21 * Author: Wouter Gadeyne
22 * 22 *
23 *
24 * Code for ip_vs_expect_related and ip_vs_expect_callback is taken from
25 * http://www.ssi.bg/~ja/nfct/:
26 *
27 * ip_vs_nfct.c: Netfilter connection tracking support for IPVS
28 *
29 * Portions Copyright (C) 2001-2002
30 * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
31 *
32 * Portions Copyright (C) 2003-2008
33 * Julian Anastasov
34 */ 23 */
35 24
36#define KMSG_COMPONENT "IPVS" 25#define KMSG_COMPONENT "IPVS"
@@ -45,6 +34,7 @@
45#include <linux/netfilter.h> 34#include <linux/netfilter.h>
46#include <net/netfilter/nf_conntrack.h> 35#include <net/netfilter/nf_conntrack.h>
47#include <net/netfilter/nf_conntrack_expect.h> 36#include <net/netfilter/nf_conntrack_expect.h>
37#include <net/netfilter/nf_nat.h>
48#include <net/netfilter/nf_nat_helper.h> 38#include <net/netfilter/nf_nat_helper.h>
49#include <linux/gfp.h> 39#include <linux/gfp.h>
50#include <net/protocol.h> 40#include <net/protocol.h>
@@ -57,16 +47,6 @@
57#define SERVER_STRING "227 Entering Passive Mode (" 47#define SERVER_STRING "227 Entering Passive Mode ("
58#define CLIENT_STRING "PORT " 48#define CLIENT_STRING "PORT "
59 49
60#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u"
61#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \
62 &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
63 (T)->dst.protonum
64
65#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
66#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \
67 &((C)->vaddr.ip), ntohs((C)->vport), \
68 &((C)->daddr.ip), ntohs((C)->dport), \
69 (C)->protocol, (C)->state
70 50
71/* 51/*
72 * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper 52 * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
@@ -84,6 +64,8 @@ static int ip_vs_ftp_pasv;
84static int 64static int
85ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) 65ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
86{ 66{
67 /* We use connection tracking for the command connection */
68 cp->flags |= IP_VS_CONN_F_NFCT;
87 return 0; 69 return 0;
88} 70}
89 71
@@ -148,120 +130,6 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
148} 130}
149 131
150/* 132/*
151 * Called from init_conntrack() as expectfn handler.
152 */
153static void
154ip_vs_expect_callback(struct nf_conn *ct,
155 struct nf_conntrack_expect *exp)
156{
157 struct nf_conntrack_tuple *orig, new_reply;
158 struct ip_vs_conn *cp;
159
160 if (exp->tuple.src.l3num != PF_INET)
161 return;
162
163 /*
164 * We assume that no NF locks are held before this callback.
165 * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
166 * expectations even if they use wildcard values, now we provide the
167 * actual values from the newly created original conntrack direction.
168 * The conntrack is confirmed when packet reaches IPVS hooks.
169 */
170
171 /* RS->CLIENT */
172 orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
173 cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum,
174 &orig->src.u3, orig->src.u.tcp.port,
175 &orig->dst.u3, orig->dst.u.tcp.port);
176 if (cp) {
177 /* Change reply CLIENT->RS to CLIENT->VS */
178 new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
179 IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
180 FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
181 __func__, ct, ct->status,
182 ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
183 ARG_CONN(cp));
184 new_reply.dst.u3 = cp->vaddr;
185 new_reply.dst.u.tcp.port = cp->vport;
186 IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
187 ", inout cp=" FMT_CONN "\n",
188 __func__, ct,
189 ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
190 ARG_CONN(cp));
191 goto alter;
192 }
193
194 /* CLIENT->VS */
195 cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum,
196 &orig->src.u3, orig->src.u.tcp.port,
197 &orig->dst.u3, orig->dst.u.tcp.port);
198 if (cp) {
199 /* Change reply VS->CLIENT to RS->CLIENT */
200 new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
201 IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
202 FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
203 __func__, ct, ct->status,
204 ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
205 ARG_CONN(cp));
206 new_reply.src.u3 = cp->daddr;
207 new_reply.src.u.tcp.port = cp->dport;
208 IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", "
209 FMT_TUPLE ", outin cp=" FMT_CONN "\n",
210 __func__, ct,
211 ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
212 ARG_CONN(cp));
213 goto alter;
214 }
215
216 IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuple=" FMT_TUPLE
217 " - unknown expect\n",
218 __func__, ct, ct->status, ARG_TUPLE(orig));
219 return;
220
221alter:
222 /* Never alter conntrack for non-NAT conns */
223 if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
224 nf_conntrack_alter_reply(ct, &new_reply);
225 ip_vs_conn_put(cp);
226 return;
227}
228
229/*
230 * Create NF conntrack expectation with wildcard (optional) source port.
231 * Then the default callback function will alter the reply and will confirm
232 * the conntrack entry when the first packet comes.
233 */
234static void
235ip_vs_expect_related(struct sk_buff *skb, struct nf_conn *ct,
236 struct ip_vs_conn *cp, u_int8_t proto,
237 const __be16 *port, int from_rs)
238{
239 struct nf_conntrack_expect *exp;
240
241 BUG_ON(!ct || ct == &nf_conntrack_untracked);
242
243 exp = nf_ct_expect_alloc(ct);
244 if (!exp)
245 return;
246
247 if (from_rs)
248 nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
249 nf_ct_l3num(ct), &cp->daddr, &cp->caddr,
250 proto, port, &cp->cport);
251 else
252 nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
253 nf_ct_l3num(ct), &cp->caddr, &cp->vaddr,
254 proto, port, &cp->vport);
255
256 exp->expectfn = ip_vs_expect_callback;
257
258 IP_VS_DBG(7, "%s(): ct=%p, expect tuple=" FMT_TUPLE "\n",
259 __func__, ct, ARG_TUPLE(&exp->tuple));
260 nf_ct_expect_related(exp);
261 nf_ct_expect_put(exp);
262}
263
264/*
265 * Look at outgoing ftp packets to catch the response to a PASV command 133 * Look at outgoing ftp packets to catch the response to a PASV command
266 * from the server (inside-to-outside). 134 * from the server (inside-to-outside).
267 * When we see one, we build a connection entry with the client address, 135 * When we see one, we build a connection entry with the client address,
@@ -327,14 +195,19 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
327 /* 195 /*
328 * Now update or create a connection entry for it 196
329 */ 197 */
330 n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port, 198 {
331 &cp->caddr, 0); 199 struct ip_vs_conn_param p;
200 ip_vs_conn_fill_param(AF_INET, iph->protocol,
201 &from, port, &cp->caddr, 0, &p);
202 n_cp = ip_vs_conn_out_get(&p);
203 }
332 if (!n_cp) { 204 if (!n_cp) {
333 n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, 205 struct ip_vs_conn_param p;
334 &cp->caddr, 0, 206 ip_vs_conn_fill_param(AF_INET, IPPROTO_TCP, &cp->caddr,
335 &cp->vaddr, port, 207 0, &cp->vaddr, port, &p);
336 &from, port, 208 n_cp = ip_vs_conn_new(&p, &from, port,
337 IP_VS_CONN_F_NO_CPORT, 209 IP_VS_CONN_F_NO_CPORT |
210 IP_VS_CONN_F_NFCT,
338 cp->dest); 211 cp->dest);
339 if (!n_cp) 212 if (!n_cp)
340 return 0; 213 return 0;
@@ -359,7 +232,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
359 buf_len = strlen(buf); 232 buf_len = strlen(buf);
360 233
361 ct = nf_ct_get(skb, &ctinfo); 234 ct = nf_ct_get(skb, &ctinfo);
362 if (ct && !nf_ct_is_untracked(ct)) { 235 if (ct && !nf_ct_is_untracked(ct) && nfct_nat(ct)) {
363 /* If mangling fails this function will return 0 236 /* If mangling fails this function will return 0
364 * which will cause the packet to be dropped. 237 * which will cause the packet to be dropped.
365 * Mangling can only fail under memory pressure, 238 * Mangling can only fail under memory pressure,
@@ -369,9 +242,14 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
369 ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, 242 ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
370 start-data, end-start, 243 start-data, end-start,
371 buf, buf_len); 244 buf, buf_len);
372 if (ret) 245 if (ret) {
373 ip_vs_expect_related(skb, ct, n_cp, 246 ip_vs_nfct_expect_related(skb, ct, n_cp,
374 IPPROTO_TCP, NULL, 0); 247 IPPROTO_TCP, 0, 0);
248 if (skb->ip_summed == CHECKSUM_COMPLETE)
249 skb->ip_summed = CHECKSUM_UNNECESSARY;
250 /* csum is updated */
251 ret = 1;
252 }
375 } 253 }
376 254
377 /* 255 /*
@@ -409,7 +287,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
409 union nf_inet_addr to; 287 union nf_inet_addr to;
410 __be16 port; 288 __be16 port;
411 struct ip_vs_conn *n_cp; 289 struct ip_vs_conn *n_cp;
412 struct nf_conn *ct;
413 290
414#ifdef CONFIG_IP_VS_IPV6 291#ifdef CONFIG_IP_VS_IPV6
415 /* This application helper doesn't work with IPv6 yet, 292 /* This application helper doesn't work with IPv6 yet,
@@ -479,28 +356,24 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
479 ip_vs_proto_name(iph->protocol), 356 ip_vs_proto_name(iph->protocol),
480 &to.ip, ntohs(port), &cp->vaddr.ip, 0); 357 &to.ip, ntohs(port), &cp->vaddr.ip, 0);
481 358
482 n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol, 359 {
483 &to, port, 360 struct ip_vs_conn_param p;
484 &cp->vaddr, htons(ntohs(cp->vport)-1)); 361 ip_vs_conn_fill_param(AF_INET, iph->protocol, &to, port,
485 if (!n_cp) {
486 n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
487 &to, port,
488 &cp->vaddr, htons(ntohs(cp->vport)-1), 362 &cp->vaddr, htons(ntohs(cp->vport)-1),
489 &cp->daddr, htons(ntohs(cp->dport)-1), 363 &p);
490 0, 364 n_cp = ip_vs_conn_in_get(&p);
491 cp->dest); 365 if (!n_cp) {
492 if (!n_cp) 366 n_cp = ip_vs_conn_new(&p, &cp->daddr,
493 return 0; 367 htons(ntohs(cp->dport)-1),
368 IP_VS_CONN_F_NFCT, cp->dest);
369 if (!n_cp)
370 return 0;
494 371
495 /* add its controller */ 372 /* add its controller */
496 ip_vs_control_add(n_cp, cp); 373 ip_vs_control_add(n_cp, cp);
374 }
497 } 375 }
498 376
499 ct = (struct nf_conn *)skb->nfct;
500 if (ct && ct != &nf_conntrack_untracked)
501 ip_vs_expect_related(skb, ct, n_cp,
502 IPPROTO_TCP, &n_cp->dport, 1);
503
504 /* 377 /*
505 * Move tunnel to listen state 378 * Move tunnel to listen state
506 */ 379 */
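
With ip_vs_conn_fill_param() the helper now builds one struct ip_vs_conn_param and hands it to the lookup/creation calls instead of repeating the address/port arguments. A minimal sketch of that pattern, assuming only <net/ip_vs.h>; the helper name ftp_data_conn_lookup is illustrative, not from the patch:

	#include <net/ip_vs.h>

	/* Look up the expected FTP data connection the way ip_vs_ftp_out()
	 * does after this change; returns NULL when no entry exists yet. */
	static struct ip_vs_conn *
	ftp_data_conn_lookup(struct ip_vs_conn *cp, const union nf_inet_addr *from,
			     __be16 port, int protocol)
	{
		struct ip_vs_conn_param p;

		ip_vs_conn_fill_param(AF_INET, protocol, from, port,
				      &cp->caddr, 0, &p);
		return ip_vs_conn_out_get(&p);
	}
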
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
new file mode 100644
index 000000000000..4680647cd450
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -0,0 +1,292 @@
1/*
2 * ip_vs_nfct.c: Netfilter connection tracking support for IPVS
3 *
4 * Portions Copyright (C) 2001-2002
5 * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
6 *
7 * Portions Copyright (C) 2003-2010
8 * Julian Anastasov
9 *
10 *
11 * This code is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 *
25 *
26 * Authors:
27 * Ben North <ben@redfrontdoor.org>
28 * Julian Anastasov <ja@ssi.bg> Reorganize and sync with latest kernels
29 * Hannes Eder <heder@google.com> Extend NFCT support for FTP, ipvs match
30 *
31 *
32 * Current status:
33 *
34 * - provide conntrack confirmation for new and related connections, so
35 * that we can see their proper conntrack state in all hooks
36 * - support for all forwarding methods, not only NAT
37 * - FTP support (NAT), ability to support other NAT apps with expectations
38 * - to correctly create expectations for related NAT connections the proper
39 * NF conntrack support must already be installed, e.g. ip_vs_ftp requires
40 * nf_conntrack_ftp ... iptables_nat for the same ports (but no iptables
41 * NAT rules are needed)
42 * - alter reply for NAT when forwarding packet in original direction:
43 * conntrack from client in NEW or RELATED (Passive FTP DATA) state or
44 * when RELATED conntrack is created from real server (Active FTP DATA)
45 * - if iptables_nat is not loaded the Passive FTP will not work (the
46 * PASV response cannot be NAT-ed) but Active FTP should work
47 *
48 */
49
50#define KMSG_COMPONENT "IPVS"
51#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
52
53#include <linux/module.h>
54#include <linux/types.h>
55#include <linux/kernel.h>
56#include <linux/errno.h>
57#include <linux/compiler.h>
58#include <linux/vmalloc.h>
59#include <linux/skbuff.h>
60#include <net/ip.h>
61#include <linux/netfilter.h>
62#include <linux/netfilter_ipv4.h>
63#include <net/ip_vs.h>
64#include <net/netfilter/nf_conntrack_core.h>
65#include <net/netfilter/nf_conntrack_expect.h>
66#include <net/netfilter/nf_conntrack_helper.h>
67#include <net/netfilter/nf_conntrack_zones.h>
68
69
70#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u"
71#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \
72 &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
73 (T)->dst.protonum
74
75#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
76#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \
77 &((C)->vaddr.ip), ntohs((C)->vport), \
78 &((C)->daddr.ip), ntohs((C)->dport), \
79 (C)->protocol, (C)->state
80
81void
82ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
83{
84 enum ip_conntrack_info ctinfo;
85	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
86 struct nf_conntrack_tuple new_tuple;
87
88 if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) ||
89 nf_ct_is_dying(ct))
90 return;
91
92 /* Never alter conntrack for non-NAT conns */
93 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
94 return;
95
96 /* Alter reply only in original direction */
97 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
98 return;
99
100 /*
101 * The connection is not yet in the hashtable, so we update it.
102 * CIP->VIP will remain the same, so leave the tuple in
103 * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the
104 * real-server we will see RIP->DIP.
105 */
106 new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
107 /*
108 * This will also take care of UDP and other protocols.
109 */
110 if (outin) {
111 new_tuple.src.u3 = cp->daddr;
112 if (new_tuple.dst.protonum != IPPROTO_ICMP &&
113 new_tuple.dst.protonum != IPPROTO_ICMPV6)
114 new_tuple.src.u.tcp.port = cp->dport;
115 } else {
116 new_tuple.dst.u3 = cp->vaddr;
117 if (new_tuple.dst.protonum != IPPROTO_ICMP &&
118 new_tuple.dst.protonum != IPPROTO_ICMPV6)
119 new_tuple.dst.u.tcp.port = cp->vport;
120 }
121 IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
122 "ctinfo=%d, old reply=" FMT_TUPLE
123 ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n",
124 __func__, ct, ct->status, ctinfo,
125 ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple),
126 ARG_TUPLE(&new_tuple), ARG_CONN(cp));
127 nf_conntrack_alter_reply(ct, &new_tuple);
128}
129
130int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp)
131{
132 return nf_conntrack_confirm(skb);
133}
134
135/*
136 * Called from init_conntrack() as expectfn handler.
137 */
138static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
139 struct nf_conntrack_expect *exp)
140{
141 struct nf_conntrack_tuple *orig, new_reply;
142 struct ip_vs_conn *cp;
143 struct ip_vs_conn_param p;
144
145 if (exp->tuple.src.l3num != PF_INET)
146 return;
147
148 /*
149 * We assume that no NF locks are held before this callback.
150 * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
151 * expectations even if they use wildcard values, now we provide the
152 * actual values from the newly created original conntrack direction.
153 * The conntrack is confirmed when packet reaches IPVS hooks.
154 */
155
156 /* RS->CLIENT */
157 orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
158 ip_vs_conn_fill_param(exp->tuple.src.l3num, orig->dst.protonum,
159 &orig->src.u3, orig->src.u.tcp.port,
160 &orig->dst.u3, orig->dst.u.tcp.port, &p);
161 cp = ip_vs_conn_out_get(&p);
162 if (cp) {
163 /* Change reply CLIENT->RS to CLIENT->VS */
164 new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
165 IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
166 FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
167 __func__, ct, ct->status,
168 ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
169 ARG_CONN(cp));
170 new_reply.dst.u3 = cp->vaddr;
171 new_reply.dst.u.tcp.port = cp->vport;
172 IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
173 ", inout cp=" FMT_CONN "\n",
174 __func__, ct,
175 ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
176 ARG_CONN(cp));
177 goto alter;
178 }
179
180 /* CLIENT->VS */
181 cp = ip_vs_conn_in_get(&p);
182 if (cp) {
183 /* Change reply VS->CLIENT to RS->CLIENT */
184 new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
185 IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
186 FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
187 __func__, ct, ct->status,
188 ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
189 ARG_CONN(cp));
190 new_reply.src.u3 = cp->daddr;
191 new_reply.src.u.tcp.port = cp->dport;
192 IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", "
193 FMT_TUPLE ", outin cp=" FMT_CONN "\n",
194 __func__, ct,
195 ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
196 ARG_CONN(cp));
197 goto alter;
198 }
199
200 IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
201 " - unknown expect\n",
202 __func__, ct, ct->status, ARG_TUPLE(orig));
203 return;
204
205alter:
206 /* Never alter conntrack for non-NAT conns */
207 if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
208 nf_conntrack_alter_reply(ct, &new_reply);
209 ip_vs_conn_put(cp);
210 return;
211}
212
213/*
214 * Create NF conntrack expectation with wildcard (optional) source port.
215 * Then the default callback function will alter the reply and will confirm
216 * the conntrack entry when the first packet comes.
217 * Use port 0 to expect connection from any port.
218 */
219void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
220 struct ip_vs_conn *cp, u_int8_t proto,
221 const __be16 port, int from_rs)
222{
223 struct nf_conntrack_expect *exp;
224
225 if (ct == NULL || nf_ct_is_untracked(ct))
226 return;
227
228 exp = nf_ct_expect_alloc(ct);
229 if (!exp)
230 return;
231
232 nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
233 from_rs ? &cp->daddr : &cp->caddr,
234 from_rs ? &cp->caddr : &cp->vaddr,
235 proto, port ? &port : NULL,
236 from_rs ? &cp->cport : &cp->vport);
237
238 exp->expectfn = ip_vs_nfct_expect_callback;
239
240 IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
241 __func__, ct, ARG_TUPLE(&exp->tuple));
242 nf_ct_expect_related(exp);
243 nf_ct_expect_put(exp);
244}
245EXPORT_SYMBOL(ip_vs_nfct_expect_related);
246
247/*
248 * Our connection was terminated, try to drop the conntrack immediately
249 */
250void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
251{
252 struct nf_conntrack_tuple_hash *h;
253 struct nf_conn *ct;
254 struct nf_conntrack_tuple tuple;
255
256 if (!cp->cport)
257 return;
258
259 tuple = (struct nf_conntrack_tuple) {
260 .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } };
261 tuple.src.u3 = cp->caddr;
262 tuple.src.u.all = cp->cport;
263 tuple.src.l3num = cp->af;
264 tuple.dst.u3 = cp->vaddr;
265 tuple.dst.u.all = cp->vport;
266
267 IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE
268 " for conn " FMT_CONN "\n",
269 __func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
270
271 h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple);
272 if (h) {
273 ct = nf_ct_tuplehash_to_ctrack(h);
274 /* Show what happens instead of calling nf_ct_kill() */
275 if (del_timer(&ct->timeout)) {
276 IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple="
277 FMT_TUPLE "\n",
278 __func__, ct, ARG_TUPLE(&tuple));
279 if (ct->timeout.function)
280 ct->timeout.function(ct->timeout.data);
281 } else {
282 IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
283 FMT_TUPLE "\n",
284 __func__, ct, ARG_TUPLE(&tuple));
285 }
286 nf_ct_put(ct);
287 } else {
288 IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
289 __func__, ARG_TUPLE(&tuple));
290 }
291}
292
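
For context, a sketch of how an IPVS application helper is expected to use these hooks: mark its connections with IP_VS_CONN_F_NFCT and register an expectation after a successful mangle, as ip_vs_ftp does above. The my_app_* names are invented for illustration:

	#include <net/ip_vs.h>
	#include <net/netfilter/nf_conntrack.h>

	/* Ask IPVS to confirm/track the control connection via nf_conntrack. */
	static int my_app_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
	{
		cp->flags |= IP_VS_CONN_F_NFCT;
		return 0;
	}

	/* After mangling the payload, expect the related data connection
	 * from any source port (port argument 0 = wildcard). */
	static void my_app_expect_data_conn(struct sk_buff *skb,
					    struct ip_vs_conn *n_cp)
	{
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct && !nf_ct_is_untracked(ct))
			ip_vs_nfct_expect_related(skb, ct, n_cp, IPPROTO_TCP, 0, 0);
	}
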
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
new file mode 100644
index 000000000000..3414af70ee12
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -0,0 +1,147 @@
1#define KMSG_COMPONENT "IPVS"
2#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
3
4#include <linux/module.h>
5#include <linux/spinlock.h>
6#include <linux/interrupt.h>
7#include <asm/string.h>
8#include <linux/kmod.h>
9#include <linux/sysctl.h>
10
11#include <net/ip_vs.h>
12
13/* IPVS pe list */
14static LIST_HEAD(ip_vs_pe);
15
16/* lock for service table */
17static DEFINE_SPINLOCK(ip_vs_pe_lock);
18
19/* Bind a service with a pe */
20void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe)
21{
22 svc->pe = pe;
23}
24
25/* Unbind a service from its pe */
26void ip_vs_unbind_pe(struct ip_vs_service *svc)
27{
28 svc->pe = NULL;
29}
30
31/* Get pe in the pe list by name */
32static struct ip_vs_pe *
33ip_vs_pe_getbyname(const char *pe_name)
34{
35 struct ip_vs_pe *pe;
36
37 IP_VS_DBG(2, "%s(): pe_name \"%s\"\n", __func__,
38 pe_name);
39
40 spin_lock_bh(&ip_vs_pe_lock);
41
42 list_for_each_entry(pe, &ip_vs_pe, n_list) {
43 /* Test and get the modules atomically */
44 if (pe->module &&
45 !try_module_get(pe->module)) {
46 /* This pe is just deleted */
47 continue;
48 }
49 if (strcmp(pe_name, pe->name)==0) {
50 /* HIT */
51 spin_unlock_bh(&ip_vs_pe_lock);
52 return pe;
53 }
54 if (pe->module)
55 module_put(pe->module);
56 }
57
58 spin_unlock_bh(&ip_vs_pe_lock);
59 return NULL;
60}
61
62/* Lookup pe and try to load it if it doesn't exist */
63struct ip_vs_pe *ip_vs_pe_get(const char *name)
64{
65 struct ip_vs_pe *pe;
66
67 /* Search for the pe by name */
68 pe = ip_vs_pe_getbyname(name);
69
70 /* If pe not found, load the module and search again */
71 if (!pe) {
72 request_module("ip_vs_pe_%s", name);
73 pe = ip_vs_pe_getbyname(name);
74 }
75
76 return pe;
77}
78
79void ip_vs_pe_put(struct ip_vs_pe *pe)
80{
81 if (pe && pe->module)
82 module_put(pe->module);
83}
84
85/* Register a pe in the pe list */
86int register_ip_vs_pe(struct ip_vs_pe *pe)
87{
88 struct ip_vs_pe *tmp;
89
90 /* increase the module use count */
91 ip_vs_use_count_inc();
92
93 spin_lock_bh(&ip_vs_pe_lock);
94
95 if (!list_empty(&pe->n_list)) {
96 spin_unlock_bh(&ip_vs_pe_lock);
97 ip_vs_use_count_dec();
98 pr_err("%s(): [%s] pe already linked\n",
99 __func__, pe->name);
100 return -EINVAL;
101 }
102
103 /* Make sure that the pe with this name doesn't exist
104 * in the pe list.
105 */
106 list_for_each_entry(tmp, &ip_vs_pe, n_list) {
107 if (strcmp(tmp->name, pe->name) == 0) {
108 spin_unlock_bh(&ip_vs_pe_lock);
109 ip_vs_use_count_dec();
110 pr_err("%s(): [%s] pe already existed "
111 "in the system\n", __func__, pe->name);
112 return -EINVAL;
113 }
114 }
115 /* Add it into the d-linked pe list */
116 list_add(&pe->n_list, &ip_vs_pe);
117 spin_unlock_bh(&ip_vs_pe_lock);
118
119 pr_info("[%s] pe registered.\n", pe->name);
120
121 return 0;
122}
123EXPORT_SYMBOL_GPL(register_ip_vs_pe);
124
125/* Unregister a pe from the pe list */
126int unregister_ip_vs_pe(struct ip_vs_pe *pe)
127{
128 spin_lock_bh(&ip_vs_pe_lock);
129 if (list_empty(&pe->n_list)) {
130 spin_unlock_bh(&ip_vs_pe_lock);
131 pr_err("%s(): [%s] pe is not in the list. failed\n",
132 __func__, pe->name);
133 return -EINVAL;
134 }
135
136 /* Remove it from the d-linked pe list */
137 list_del(&pe->n_list);
138 spin_unlock_bh(&ip_vs_pe_lock);
139
140 /* decrease the module use count */
141 ip_vs_use_count_dec();
142
143 pr_info("[%s] pe unregistered.\n", pe->name);
144
145 return 0;
146}
147EXPORT_SYMBOL_GPL(unregister_ip_vs_pe);
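
To make the exported API concrete, a minimal persistence-engine sketch; every demo_* name is hypothetical, the callback signatures follow ip_vs_pe_sip.c below, and the toy policy (keying purely on the client address) is chosen only to keep the example short:

	#include <linux/module.h>
	#include <linux/slab.h>
	#include <linux/string.h>
	#include <linux/jhash.h>
	#include <net/ip_vs.h>

	/* Store the client address as the persistence key. */
	static int demo_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
	{
		p->pe_data = kmemdup(p->caddr, sizeof(*p->caddr), GFP_ATOMIC);
		if (!p->pe_data)
			return -ENOMEM;
		p->pe_data_len = sizeof(*p->caddr);
		return 0;
	}

	/* Match a template whose stored key equals the new connection's key. */
	static bool demo_ct_match(const struct ip_vs_conn_param *p,
				  struct ip_vs_conn *ct)
	{
		return ct->af == p->af && ct->protocol == p->protocol &&
		       ct->flags & IP_VS_CONN_F_TEMPLATE &&
		       ct->pe_data && ct->pe_data_len == p->pe_data_len &&
		       !memcmp(ct->pe_data, p->pe_data, p->pe_data_len);
	}

	static u32 demo_hashkey_raw(const struct ip_vs_conn_param *p, u32 initval,
				    bool inverse)
	{
		return jhash(p->pe_data, p->pe_data_len, initval);
	}

	static struct ip_vs_pe demo_pe = {
		.name		= "demo",
		.refcnt		= ATOMIC_INIT(0),
		.module		= THIS_MODULE,
		.n_list		= LIST_HEAD_INIT(demo_pe.n_list),
		.fill_param	= demo_fill_param,
		.ct_match	= demo_ct_match,
		.hashkey_raw	= demo_hashkey_raw,
	};

	static int __init demo_pe_init(void)
	{
		return register_ip_vs_pe(&demo_pe);
	}

	static void __exit demo_pe_exit(void)
	{
		unregister_ip_vs_pe(&demo_pe);
	}

	module_init(demo_pe_init);
	module_exit(demo_pe_exit);
	MODULE_LICENSE("GPL");
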
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
new file mode 100644
index 000000000000..b8b4e9620f3e
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -0,0 +1,169 @@
1#define KMSG_COMPONENT "IPVS"
2#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
3
4#include <linux/module.h>
5#include <linux/kernel.h>
6
7#include <net/ip_vs.h>
8#include <net/netfilter/nf_conntrack.h>
9#include <linux/netfilter/nf_conntrack_sip.h>
10
11#ifdef CONFIG_IP_VS_DEBUG
12static const char *ip_vs_dbg_callid(char *buf, size_t buf_len,
13 const char *callid, size_t callid_len,
14 int *idx)
15{
16 size_t len = min(min(callid_len, (size_t)64), buf_len - *idx - 1);
17 memcpy(buf + *idx, callid, len);
18 buf[*idx+len] = '\0';
19 *idx += len + 1;
20 return buf + *idx - len;
21}
22
23#define IP_VS_DEBUG_CALLID(callid, len) \
24 ip_vs_dbg_callid(ip_vs_dbg_buf, sizeof(ip_vs_dbg_buf), \
25 callid, len, &ip_vs_dbg_idx)
26#endif
27
28static int get_callid(const char *dptr, unsigned int dataoff,
29 unsigned int datalen,
30 unsigned int *matchoff, unsigned int *matchlen)
31{
32 /* Find callid */
33 while (1) {
34 int ret = ct_sip_get_header(NULL, dptr, dataoff, datalen,
35 SIP_HDR_CALL_ID, matchoff,
36 matchlen);
37 if (ret > 0)
38 break;
39 if (!ret)
40 return 0;
41 dataoff += *matchoff;
42 }
43
44 /* Empty callid is useless */
45 if (!*matchlen)
46 return -EINVAL;
47
48 /* Too large is useless */
49 if (*matchlen > IP_VS_PEDATA_MAXLEN)
50 return -EINVAL;
51
52 /* SIP headers are always followed by a line terminator */
53 if (*matchoff + *matchlen == datalen)
54 return -EINVAL;
55
56 /* RFC 2543 allows lines to be terminated with CR, LF or CRLF,
57 * RFC 3261 allows only CRLF, we support both. */
58 if (*(dptr + *matchoff + *matchlen) != '\r' &&
59 *(dptr + *matchoff + *matchlen) != '\n')
60 return -EINVAL;
61
62 IP_VS_DBG_BUF(9, "SIP callid %s (%d bytes)\n",
63 IP_VS_DEBUG_CALLID(dptr + *matchoff, *matchlen),
64 *matchlen);
65 return 0;
66}
67
68static int
69ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
70{
71 struct ip_vs_iphdr iph;
72 unsigned int dataoff, datalen, matchoff, matchlen;
73 const char *dptr;
74
75 ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph);
76
77 /* Only useful with UDP */
78 if (iph.protocol != IPPROTO_UDP)
79 return -EINVAL;
80
81 /* No Data ? */
82 dataoff = iph.len + sizeof(struct udphdr);
83 if (dataoff >= skb->len)
84 return -EINVAL;
85
86 dptr = skb->data + dataoff;
87 datalen = skb->len - dataoff;
88
89 if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen))
90 return -EINVAL;
91
92 p->pe_data = kmalloc(matchlen, GFP_ATOMIC);
93 if (!p->pe_data)
94 return -ENOMEM;
95
96 /* N.B: pe_data is only set on success,
97 * this allows fallback to the default persistence logic on failure
98 */
99 memcpy(p->pe_data, dptr + matchoff, matchlen);
100 p->pe_data_len = matchlen;
101
102 return 0;
103}
104
105static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p,
106 struct ip_vs_conn *ct)
107
108{
109	bool ret = false;
110
111 if (ct->af == p->af &&
112 ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) &&
113 /* protocol should only be IPPROTO_IP if
114 * d_addr is a fwmark */
115 ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
116 p->vaddr, &ct->vaddr) &&
117 ct->vport == p->vport &&
118 ct->flags & IP_VS_CONN_F_TEMPLATE &&
119 ct->protocol == p->protocol &&
120 ct->pe_data && ct->pe_data_len == p->pe_data_len &&
121 !memcmp(ct->pe_data, p->pe_data, p->pe_data_len))
122		ret = true;
123
124 IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n",
125 ip_vs_proto_name(p->protocol),
126 IP_VS_DEBUG_CALLID(p->pe_data, p->pe_data_len),
127 IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
128 ret ? "hit" : "not hit");
129
130 return ret;
131}
132
133static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p,
134 u32 initval, bool inverse)
135{
136 return jhash(p->pe_data, p->pe_data_len, initval);
137}
138
139static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf)
140{
141 memcpy(buf, cp->pe_data, cp->pe_data_len);
142 return cp->pe_data_len;
143}
144
145static struct ip_vs_pe ip_vs_sip_pe =
146{
147 .name = "sip",
148 .refcnt = ATOMIC_INIT(0),
149 .module = THIS_MODULE,
150 .n_list = LIST_HEAD_INIT(ip_vs_sip_pe.n_list),
151 .fill_param = ip_vs_sip_fill_param,
152 .ct_match = ip_vs_sip_ct_match,
153 .hashkey_raw = ip_vs_sip_hashkey_raw,
154 .show_pe_data = ip_vs_sip_show_pe_data,
155};
156
157static int __init ip_vs_sip_init(void)
158{
159 return register_ip_vs_pe(&ip_vs_sip_pe);
160}
161
162static void __exit ip_vs_sip_cleanup(void)
163{
164 unregister_ip_vs_pe(&ip_vs_sip_pe);
165}
166
167module_init(ip_vs_sip_init);
168module_exit(ip_vs_sip_cleanup);
169MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index 027f654799fe..c53998390877 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -172,8 +172,8 @@ ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
172 else if (ih->frag_off & htons(IP_OFFSET)) 172 else if (ih->frag_off & htons(IP_OFFSET))
173 sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr); 173 sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr);
174 else { 174 else {
175 __be16 _ports[2], *pptr 175 __be16 _ports[2], *pptr;
176; 176
177 pptr = skb_header_pointer(skb, offset + ih->ihl*4, 177 pptr = skb_header_pointer(skb, offset + ih->ihl*4,
178 sizeof(_ports), _ports); 178 sizeof(_ports), _ports);
179 if (pptr == NULL) 179 if (pptr == NULL)
@@ -223,13 +223,13 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
223 223
224 224
225void 225void
226ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, 226ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
227 const struct sk_buff *skb, 227 const struct sk_buff *skb,
228 int offset, 228 int offset,
229 const char *msg) 229 const char *msg)
230{ 230{
231#ifdef CONFIG_IP_VS_IPV6 231#ifdef CONFIG_IP_VS_IPV6
232 if (skb->protocol == htons(ETH_P_IPV6)) 232 if (af == AF_INET6)
233 ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); 233 ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
234 else 234 else
235#endif 235#endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 1892dfc12fdd..3a0461117d3f 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -40,6 +40,19 @@ struct isakmp_hdr {
40 40
41#define PORT_ISAKMP 500 41#define PORT_ISAKMP 500
42 42
43static void
44ah_esp_conn_fill_param_proto(int af, const struct ip_vs_iphdr *iph,
45 int inverse, struct ip_vs_conn_param *p)
46{
47 if (likely(!inverse))
48 ip_vs_conn_fill_param(af, IPPROTO_UDP,
49 &iph->saddr, htons(PORT_ISAKMP),
50 &iph->daddr, htons(PORT_ISAKMP), p);
51 else
52 ip_vs_conn_fill_param(af, IPPROTO_UDP,
53 &iph->daddr, htons(PORT_ISAKMP),
54 &iph->saddr, htons(PORT_ISAKMP), p);
55}
43 56
44static struct ip_vs_conn * 57static struct ip_vs_conn *
45ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, 58ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
@@ -47,21 +60,10 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
47 int inverse) 60 int inverse)
48{ 61{
49 struct ip_vs_conn *cp; 62 struct ip_vs_conn *cp;
63 struct ip_vs_conn_param p;
50 64
51 if (likely(!inverse)) { 65 ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
52 cp = ip_vs_conn_in_get(af, IPPROTO_UDP, 66 cp = ip_vs_conn_in_get(&p);
53 &iph->saddr,
54 htons(PORT_ISAKMP),
55 &iph->daddr,
56 htons(PORT_ISAKMP));
57 } else {
58 cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
59 &iph->daddr,
60 htons(PORT_ISAKMP),
61 &iph->saddr,
62 htons(PORT_ISAKMP));
63 }
64
65 if (!cp) { 67 if (!cp) {
66 /* 68 /*
67 * We are not sure if the packet is from our 69 * We are not sure if the packet is from our
@@ -87,21 +89,10 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,
87 int inverse) 89 int inverse)
88{ 90{
89 struct ip_vs_conn *cp; 91 struct ip_vs_conn *cp;
92 struct ip_vs_conn_param p;
90 93
91 if (likely(!inverse)) { 94 ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
92 cp = ip_vs_conn_out_get(af, IPPROTO_UDP, 95 cp = ip_vs_conn_out_get(&p);
93 &iph->saddr,
94 htons(PORT_ISAKMP),
95 &iph->daddr,
96 htons(PORT_ISAKMP));
97 } else {
98 cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
99 &iph->daddr,
100 htons(PORT_ISAKMP),
101 &iph->saddr,
102 htons(PORT_ISAKMP));
103 }
104
105 if (!cp) { 96 if (!cp) {
106 IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " 97 IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
107 "%s%s %s->%s\n", 98 "%s%s %s->%s\n",
@@ -126,54 +117,6 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
126 return 0; 117 return 0;
127} 118}
128 119
129
130static void
131ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb,
132 int offset, const char *msg)
133{
134 char buf[256];
135 struct iphdr _iph, *ih;
136
137 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
138 if (ih == NULL)
139 sprintf(buf, "TRUNCATED");
140 else
141 sprintf(buf, "%pI4->%pI4", &ih->saddr, &ih->daddr);
142
143 pr_debug("%s: %s %s\n", msg, pp->name, buf);
144}
145
146#ifdef CONFIG_IP_VS_IPV6
147static void
148ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
149 int offset, const char *msg)
150{
151 char buf[256];
152 struct ipv6hdr _iph, *ih;
153
154 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
155 if (ih == NULL)
156 sprintf(buf, "TRUNCATED");
157 else
158 sprintf(buf, "%pI6->%pI6", &ih->saddr, &ih->daddr);
159
160 pr_debug("%s: %s %s\n", msg, pp->name, buf);
161}
162#endif
163
164static void
165ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
166 int offset, const char *msg)
167{
168#ifdef CONFIG_IP_VS_IPV6
169 if (skb->protocol == htons(ETH_P_IPV6))
170 ah_esp_debug_packet_v6(pp, skb, offset, msg);
171 else
172#endif
173 ah_esp_debug_packet_v4(pp, skb, offset, msg);
174}
175
176
177static void ah_esp_init(struct ip_vs_protocol *pp) 120static void ah_esp_init(struct ip_vs_protocol *pp)
178{ 121{
179 /* nothing to do now */ 122 /* nothing to do now */
@@ -204,7 +147,7 @@ struct ip_vs_protocol ip_vs_protocol_ah = {
204 .register_app = NULL, 147 .register_app = NULL,
205 .unregister_app = NULL, 148 .unregister_app = NULL,
206 .app_conn_bind = NULL, 149 .app_conn_bind = NULL,
207 .debug_packet = ah_esp_debug_packet, 150 .debug_packet = ip_vs_tcpudp_debug_packet,
208 .timeout_change = NULL, /* ISAKMP */ 151 .timeout_change = NULL, /* ISAKMP */
209 .set_state_timeout = NULL, 152 .set_state_timeout = NULL,
210}; 153};
@@ -228,7 +171,7 @@ struct ip_vs_protocol ip_vs_protocol_esp = {
228 .register_app = NULL, 171 .register_app = NULL,
229 .unregister_app = NULL, 172 .unregister_app = NULL,
230 .app_conn_bind = NULL, 173 .app_conn_bind = NULL,
231 .debug_packet = ah_esp_debug_packet, 174 .debug_packet = ip_vs_tcpudp_debug_packet,
232 .timeout_change = NULL, /* ISAKMP */ 175 .timeout_change = NULL, /* ISAKMP */
233}; 176};
234#endif 177#endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 4c0855cb006e..1ea96bcd342b 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -31,6 +31,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
31 if ((sch->type == SCTP_CID_INIT) && 31 if ((sch->type == SCTP_CID_INIT) &&
32 (svc = ip_vs_service_get(af, skb->mark, iph.protocol, 32 (svc = ip_vs_service_get(af, skb->mark, iph.protocol,
33 &iph.daddr, sh->dest))) { 33 &iph.daddr, sh->dest))) {
34 int ignored;
35
34 if (ip_vs_todrop()) { 36 if (ip_vs_todrop()) {
35 /* 37 /*
36 * It seems that we are very loaded. 38 * It seems that we are very loaded.
@@ -44,8 +46,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
44 * Let the virtual server select a real server for the 46 * Let the virtual server select a real server for the
45 * incoming connection, and create a connection entry. 47 * incoming connection, and create a connection entry.
46 */ 48 */
47 *cpp = ip_vs_schedule(svc, skb); 49 *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
48 if (!*cpp) { 50 if (!*cpp && !ignored) {
49 *verdict = ip_vs_leave(svc, skb, pp); 51 *verdict = ip_vs_leave(svc, skb, pp);
50 return 0; 52 return 0;
51 } 53 }
@@ -61,6 +63,7 @@ sctp_snat_handler(struct sk_buff *skb,
61{ 63{
62 sctp_sctphdr_t *sctph; 64 sctp_sctphdr_t *sctph;
63 unsigned int sctphoff; 65 unsigned int sctphoff;
66 struct sk_buff *iter;
64 __be32 crc32; 67 __be32 crc32;
65 68
66#ifdef CONFIG_IP_VS_IPV6 69#ifdef CONFIG_IP_VS_IPV6
@@ -89,8 +92,8 @@ sctp_snat_handler(struct sk_buff *skb,
89 92
90 /* Calculate the checksum */ 93 /* Calculate the checksum */
91 crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff); 94 crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff);
92 for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next) 95 skb_walk_frags(skb, iter)
93 crc32 = sctp_update_cksum((u8 *) skb->data, skb_headlen(skb), 96 crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter),
94 crc32); 97 crc32);
95 crc32 = sctp_end_cksum(crc32); 98 crc32 = sctp_end_cksum(crc32);
96 sctph->checksum = crc32; 99 sctph->checksum = crc32;
@@ -102,9 +105,9 @@ static int
102sctp_dnat_handler(struct sk_buff *skb, 105sctp_dnat_handler(struct sk_buff *skb,
103 struct ip_vs_protocol *pp, struct ip_vs_conn *cp) 106 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
104{ 107{
105
106 sctp_sctphdr_t *sctph; 108 sctp_sctphdr_t *sctph;
107 unsigned int sctphoff; 109 unsigned int sctphoff;
110 struct sk_buff *iter;
108 __be32 crc32; 111 __be32 crc32;
109 112
110#ifdef CONFIG_IP_VS_IPV6 113#ifdef CONFIG_IP_VS_IPV6
@@ -133,8 +136,8 @@ sctp_dnat_handler(struct sk_buff *skb,
133 136
134 /* Calculate the checksum */ 137 /* Calculate the checksum */
135 crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff); 138 crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff);
136 for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next) 139 skb_walk_frags(skb, iter)
137 crc32 = sctp_update_cksum((u8 *) skb->data, skb_headlen(skb), 140 crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter),
138 crc32); 141 crc32);
139 crc32 = sctp_end_cksum(crc32); 142 crc32 = sctp_end_cksum(crc32);
140 sctph->checksum = crc32; 143 sctph->checksum = crc32;
@@ -145,9 +148,9 @@ sctp_dnat_handler(struct sk_buff *skb,
145static int 148static int
146sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) 149sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
147{ 150{
148 struct sk_buff *list = skb_shinfo(skb)->frag_list;
149 unsigned int sctphoff; 151 unsigned int sctphoff;
150 struct sctphdr *sh, _sctph; 152 struct sctphdr *sh, _sctph;
153 struct sk_buff *iter;
151 __le32 cmp; 154 __le32 cmp;
152 __le32 val; 155 __le32 val;
153 __u32 tmp; 156 __u32 tmp;
@@ -166,15 +169,15 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
166 cmp = sh->checksum; 169 cmp = sh->checksum;
167 170
168 tmp = sctp_start_cksum((__u8 *) sh, skb_headlen(skb)); 171 tmp = sctp_start_cksum((__u8 *) sh, skb_headlen(skb));
169 for (; list; list = list->next) 172 skb_walk_frags(skb, iter)
170 tmp = sctp_update_cksum((__u8 *) list->data, 173 tmp = sctp_update_cksum((__u8 *) iter->data,
171 skb_headlen(list), tmp); 174 skb_headlen(iter), tmp);
172 175
173 val = sctp_end_cksum(tmp); 176 val = sctp_end_cksum(tmp);
174 177
175 if (val != cmp) { 178 if (val != cmp) {
176 /* CRC failure, dump it. */ 179 /* CRC failure, dump it. */
177 IP_VS_DBG_RL_PKT(0, pp, skb, 0, 180 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
178 "Failed checksum for"); 181 "Failed checksum for");
179 return 0; 182 return 0;
180 } 183 }
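
The three handlers above now share the same frag-list-aware CRC walk; condensed into one helper it looks roughly like this (helper name illustrative, and it assumes the layout the handlers already rely on: SCTP header in the linear head, payload possibly continued in frag_list skbs):

	#include <linux/skbuff.h>
	#include <linux/sctp.h>
	#include <net/sctp/checksum.h>

	static __le32 sctp_skb_crc(struct sk_buff *skb, unsigned int sctphoff)
	{
		struct sctphdr *sctph = (void *)skb_network_header(skb) + sctphoff;
		struct sk_buff *iter;
		__u32 crc;

		crc = sctp_start_cksum((__u8 *)sctph, skb_headlen(skb) - sctphoff);
		skb_walk_frags(skb, iter)
			crc = sctp_update_cksum((__u8 *)iter->data,
						skb_headlen(iter), crc);
		return sctp_end_cksum(crc);
	}
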
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 282d24de8592..f6c5200e2146 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -43,9 +43,12 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
43 return 0; 43 return 0;
44 } 44 }
45 45
46 /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
46 if (th->syn && 47 if (th->syn &&
47 (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, 48 (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
48 th->dest))) { 49 th->dest))) {
50 int ignored;
51
49 if (ip_vs_todrop()) { 52 if (ip_vs_todrop()) {
50 /* 53 /*
51 * It seems that we are very loaded. 54 * It seems that we are very loaded.
@@ -60,8 +63,8 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
60 * Let the virtual server select a real server for the 63 * Let the virtual server select a real server for the
61 * incoming connection, and create a connection entry. 64 * incoming connection, and create a connection entry.
62 */ 65 */
63 *cpp = ip_vs_schedule(svc, skb); 66 *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
64 if (!*cpp) { 67 if (!*cpp && !ignored) {
65 *verdict = ip_vs_leave(svc, skb, pp); 68 *verdict = ip_vs_leave(svc, skb, pp);
66 return 0; 69 return 0;
67 } 70 }
@@ -101,15 +104,15 @@ tcp_partial_csum_update(int af, struct tcphdr *tcph,
101#ifdef CONFIG_IP_VS_IPV6 104#ifdef CONFIG_IP_VS_IPV6
102 if (af == AF_INET6) 105 if (af == AF_INET6)
103 tcph->check = 106 tcph->check =
104 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, 107 ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
105 ip_vs_check_diff2(oldlen, newlen, 108 ip_vs_check_diff2(oldlen, newlen,
106 ~csum_unfold(tcph->check)))); 109 csum_unfold(tcph->check))));
107 else 110 else
108#endif 111#endif
109 tcph->check = 112 tcph->check =
110 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, 113 ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
111 ip_vs_check_diff2(oldlen, newlen, 114 ip_vs_check_diff2(oldlen, newlen,
112 ~csum_unfold(tcph->check)))); 115 csum_unfold(tcph->check))));
113} 116}
114 117
115 118
@@ -120,6 +123,7 @@ tcp_snat_handler(struct sk_buff *skb,
120 struct tcphdr *tcph; 123 struct tcphdr *tcph;
121 unsigned int tcphoff; 124 unsigned int tcphoff;
122 int oldlen; 125 int oldlen;
126 int payload_csum = 0;
123 127
124#ifdef CONFIG_IP_VS_IPV6 128#ifdef CONFIG_IP_VS_IPV6
125 if (cp->af == AF_INET6) 129 if (cp->af == AF_INET6)
@@ -134,13 +138,20 @@ tcp_snat_handler(struct sk_buff *skb,
134 return 0; 138 return 0;
135 139
136 if (unlikely(cp->app != NULL)) { 140 if (unlikely(cp->app != NULL)) {
141 int ret;
142
137 /* Some checks before mangling */ 143 /* Some checks before mangling */
138 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) 144 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
139 return 0; 145 return 0;
140 146
141 /* Call application helper if needed */ 147 /* Call application helper if needed */
142 if (!ip_vs_app_pkt_out(cp, skb)) 148 if (!(ret = ip_vs_app_pkt_out(cp, skb)))
143 return 0; 149 return 0;
150 /* ret=2: csum update is needed after payload mangling */
151 if (ret == 1)
152 oldlen = skb->len - tcphoff;
153 else
154 payload_csum = 1;
144 } 155 }
145 156
146 tcph = (void *)skb_network_header(skb) + tcphoff; 157 tcph = (void *)skb_network_header(skb) + tcphoff;
@@ -151,12 +162,13 @@ tcp_snat_handler(struct sk_buff *skb,
151 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, 162 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
152 htons(oldlen), 163 htons(oldlen),
153 htons(skb->len - tcphoff)); 164 htons(skb->len - tcphoff));
154 } else if (!cp->app) { 165 } else if (!payload_csum) {
155 /* Only port and addr are changed, do fast csum update */ 166 /* Only port and addr are changed, do fast csum update */
156 tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, 167 tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
157 cp->dport, cp->vport); 168 cp->dport, cp->vport);
158 if (skb->ip_summed == CHECKSUM_COMPLETE) 169 if (skb->ip_summed == CHECKSUM_COMPLETE)
159 skb->ip_summed = CHECKSUM_NONE; 170 skb->ip_summed = (cp->app && pp->csum_check) ?
171 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
160 } else { 172 } else {
161 /* full checksum calculation */ 173 /* full checksum calculation */
162 tcph->check = 0; 174 tcph->check = 0;
@@ -174,6 +186,7 @@ tcp_snat_handler(struct sk_buff *skb,
174 skb->len - tcphoff, 186 skb->len - tcphoff,
175 cp->protocol, 187 cp->protocol,
176 skb->csum); 188 skb->csum);
189 skb->ip_summed = CHECKSUM_UNNECESSARY;
177 190
178 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", 191 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
179 pp->name, tcph->check, 192 pp->name, tcph->check,
@@ -190,6 +203,7 @@ tcp_dnat_handler(struct sk_buff *skb,
190 struct tcphdr *tcph; 203 struct tcphdr *tcph;
191 unsigned int tcphoff; 204 unsigned int tcphoff;
192 int oldlen; 205 int oldlen;
206 int payload_csum = 0;
193 207
194#ifdef CONFIG_IP_VS_IPV6 208#ifdef CONFIG_IP_VS_IPV6
195 if (cp->af == AF_INET6) 209 if (cp->af == AF_INET6)
@@ -204,6 +218,8 @@ tcp_dnat_handler(struct sk_buff *skb,
204 return 0; 218 return 0;
205 219
206 if (unlikely(cp->app != NULL)) { 220 if (unlikely(cp->app != NULL)) {
221 int ret;
222
207 /* Some checks before mangling */ 223 /* Some checks before mangling */
208 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) 224 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
209 return 0; 225 return 0;
@@ -212,8 +228,13 @@ tcp_dnat_handler(struct sk_buff *skb,
212 * Attempt ip_vs_app call. 228 * Attempt ip_vs_app call.
213 * It will fix ip_vs_conn and iph ack_seq stuff 229 * It will fix ip_vs_conn and iph ack_seq stuff
214 */ 230 */
215 if (!ip_vs_app_pkt_in(cp, skb)) 231 if (!(ret = ip_vs_app_pkt_in(cp, skb)))
216 return 0; 232 return 0;
233 /* ret=2: csum update is needed after payload mangling */
234 if (ret == 1)
235 oldlen = skb->len - tcphoff;
236 else
237 payload_csum = 1;
217 } 238 }
218 239
219 tcph = (void *)skb_network_header(skb) + tcphoff; 240 tcph = (void *)skb_network_header(skb) + tcphoff;
@@ -223,15 +244,16 @@ tcp_dnat_handler(struct sk_buff *skb,
223 * Adjust TCP checksums 244 * Adjust TCP checksums
224 */ 245 */
225 if (skb->ip_summed == CHECKSUM_PARTIAL) { 246 if (skb->ip_summed == CHECKSUM_PARTIAL) {
226 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, 247 tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
227 htons(oldlen), 248 htons(oldlen),
228 htons(skb->len - tcphoff)); 249 htons(skb->len - tcphoff));
229 } else if (!cp->app) { 250 } else if (!payload_csum) {
230 /* Only port and addr are changed, do fast csum update */ 251 /* Only port and addr are changed, do fast csum update */
231 tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, 252 tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
232 cp->vport, cp->dport); 253 cp->vport, cp->dport);
233 if (skb->ip_summed == CHECKSUM_COMPLETE) 254 if (skb->ip_summed == CHECKSUM_COMPLETE)
234 skb->ip_summed = CHECKSUM_NONE; 255 skb->ip_summed = (cp->app && pp->csum_check) ?
256 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
235 } else { 257 } else {
236 /* full checksum calculation */ 258 /* full checksum calculation */
237 tcph->check = 0; 259 tcph->check = 0;
@@ -278,7 +300,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
278 skb->len - tcphoff, 300 skb->len - tcphoff,
279 ipv6_hdr(skb)->nexthdr, 301 ipv6_hdr(skb)->nexthdr,
280 skb->csum)) { 302 skb->csum)) {
281 IP_VS_DBG_RL_PKT(0, pp, skb, 0, 303 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
282 "Failed checksum for"); 304 "Failed checksum for");
283 return 0; 305 return 0;
284 } 306 }
@@ -289,7 +311,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
289 skb->len - tcphoff, 311 skb->len - tcphoff,
290 ip_hdr(skb)->protocol, 312 ip_hdr(skb)->protocol,
291 skb->csum)) { 313 skb->csum)) {
292 IP_VS_DBG_RL_PKT(0, pp, skb, 0, 314 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
293 "Failed checksum for"); 315 "Failed checksum for");
294 return 0; 316 return 0;
295 } 317 }
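
The sign fix in tcp_partial_csum_update() (and the identical one in the UDP handler below) reflects how CHECKSUM_PARTIAL stores its seed: tcph->check holds the folded pseudo-header sum without the final complement, so the update has to unfold it as-is, apply the address/length deltas, then fold and strip csum_fold()'s complement again. Written out for the IPv4 branch (sketch only, mirrors the hunk above):

	#include <linux/tcp.h>
	#include <net/checksum.h>
	#include <net/ip_vs.h>

	static void partial_csum_update_v4(struct tcphdr *tcph,
					   __be32 oldip, __be32 newip,
					   __be16 oldlen, __be16 newlen)
	{
		tcph->check =
			~csum_fold(ip_vs_check_diff4(oldip, newip,
				   ip_vs_check_diff2(oldlen, newlen,
						     csum_unfold(tcph->check))));
	}
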
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 8553231b5d41..9d106a06bb0a 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -46,6 +46,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
46 svc = ip_vs_service_get(af, skb->mark, iph.protocol, 46 svc = ip_vs_service_get(af, skb->mark, iph.protocol,
47 &iph.daddr, uh->dest); 47 &iph.daddr, uh->dest);
48 if (svc) { 48 if (svc) {
49 int ignored;
50
49 if (ip_vs_todrop()) { 51 if (ip_vs_todrop()) {
50 /* 52 /*
51 * It seems that we are very loaded. 53 * It seems that we are very loaded.
@@ -60,8 +62,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
60 * Let the virtual server select a real server for the 62 * Let the virtual server select a real server for the
61 * incoming connection, and create a connection entry. 63 * incoming connection, and create a connection entry.
62 */ 64 */
63 *cpp = ip_vs_schedule(svc, skb); 65 *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
64 if (!*cpp) { 66 if (!*cpp && !ignored) {
65 *verdict = ip_vs_leave(svc, skb, pp); 67 *verdict = ip_vs_leave(svc, skb, pp);
66 return 0; 68 return 0;
67 } 69 }
@@ -102,15 +104,15 @@ udp_partial_csum_update(int af, struct udphdr *uhdr,
102#ifdef CONFIG_IP_VS_IPV6 104#ifdef CONFIG_IP_VS_IPV6
103 if (af == AF_INET6) 105 if (af == AF_INET6)
104 uhdr->check = 106 uhdr->check =
105 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, 107 ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
106 ip_vs_check_diff2(oldlen, newlen, 108 ip_vs_check_diff2(oldlen, newlen,
107 ~csum_unfold(uhdr->check)))); 109 csum_unfold(uhdr->check))));
108 else 110 else
109#endif 111#endif
110 uhdr->check = 112 uhdr->check =
111 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, 113 ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
112 ip_vs_check_diff2(oldlen, newlen, 114 ip_vs_check_diff2(oldlen, newlen,
113 ~csum_unfold(uhdr->check)))); 115 csum_unfold(uhdr->check))));
114} 116}
115 117
116 118
@@ -121,6 +123,7 @@ udp_snat_handler(struct sk_buff *skb,
121 struct udphdr *udph; 123 struct udphdr *udph;
122 unsigned int udphoff; 124 unsigned int udphoff;
123 int oldlen; 125 int oldlen;
126 int payload_csum = 0;
124 127
125#ifdef CONFIG_IP_VS_IPV6 128#ifdef CONFIG_IP_VS_IPV6
126 if (cp->af == AF_INET6) 129 if (cp->af == AF_INET6)
@@ -135,6 +138,8 @@ udp_snat_handler(struct sk_buff *skb,
135 return 0; 138 return 0;
136 139
137 if (unlikely(cp->app != NULL)) { 140 if (unlikely(cp->app != NULL)) {
141 int ret;
142
138 /* Some checks before mangling */ 143 /* Some checks before mangling */
139 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) 144 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
140 return 0; 145 return 0;
@@ -142,8 +147,13 @@ udp_snat_handler(struct sk_buff *skb,
142 /* 147 /*
143 * Call application helper if needed 148 * Call application helper if needed
144 */ 149 */
145 if (!ip_vs_app_pkt_out(cp, skb)) 150 if (!(ret = ip_vs_app_pkt_out(cp, skb)))
146 return 0; 151 return 0;
152 /* ret=2: csum update is needed after payload mangling */
153 if (ret == 1)
154 oldlen = skb->len - udphoff;
155 else
156 payload_csum = 1;
147 } 157 }
148 158
149 udph = (void *)skb_network_header(skb) + udphoff; 159 udph = (void *)skb_network_header(skb) + udphoff;
@@ -156,12 +166,13 @@ udp_snat_handler(struct sk_buff *skb,
156 udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, 166 udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
157 htons(oldlen), 167 htons(oldlen),
158 htons(skb->len - udphoff)); 168 htons(skb->len - udphoff));
159 } else if (!cp->app && (udph->check != 0)) { 169 } else if (!payload_csum && (udph->check != 0)) {
160 /* Only port and addr are changed, do fast csum update */ 170 /* Only port and addr are changed, do fast csum update */
161 udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, 171 udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
162 cp->dport, cp->vport); 172 cp->dport, cp->vport);
163 if (skb->ip_summed == CHECKSUM_COMPLETE) 173 if (skb->ip_summed == CHECKSUM_COMPLETE)
164 skb->ip_summed = CHECKSUM_NONE; 174 skb->ip_summed = (cp->app && pp->csum_check) ?
175 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
165 } else { 176 } else {
166 /* full checksum calculation */ 177 /* full checksum calculation */
167 udph->check = 0; 178 udph->check = 0;
@@ -181,6 +192,7 @@ udp_snat_handler(struct sk_buff *skb,
181 skb->csum); 192 skb->csum);
182 if (udph->check == 0) 193 if (udph->check == 0)
183 udph->check = CSUM_MANGLED_0; 194 udph->check = CSUM_MANGLED_0;
195 skb->ip_summed = CHECKSUM_UNNECESSARY;
184 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", 196 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
185 pp->name, udph->check, 197 pp->name, udph->check,
186 (char*)&(udph->check) - (char*)udph); 198 (char*)&(udph->check) - (char*)udph);
@@ -196,6 +208,7 @@ udp_dnat_handler(struct sk_buff *skb,
196 struct udphdr *udph; 208 struct udphdr *udph;
197 unsigned int udphoff; 209 unsigned int udphoff;
198 int oldlen; 210 int oldlen;
211 int payload_csum = 0;
199 212
200#ifdef CONFIG_IP_VS_IPV6 213#ifdef CONFIG_IP_VS_IPV6
201 if (cp->af == AF_INET6) 214 if (cp->af == AF_INET6)
@@ -210,6 +223,8 @@ udp_dnat_handler(struct sk_buff *skb,
210 return 0; 223 return 0;
211 224
212 if (unlikely(cp->app != NULL)) { 225 if (unlikely(cp->app != NULL)) {
226 int ret;
227
213 /* Some checks before mangling */ 228 /* Some checks before mangling */
214 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) 229 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
215 return 0; 230 return 0;
@@ -218,8 +233,13 @@ udp_dnat_handler(struct sk_buff *skb,
218 * Attempt ip_vs_app call. 233 * Attempt ip_vs_app call.
219 * It will fix ip_vs_conn 234 * It will fix ip_vs_conn
220 */ 235 */
221 if (!ip_vs_app_pkt_in(cp, skb)) 236 if (!(ret = ip_vs_app_pkt_in(cp, skb)))
222 return 0; 237 return 0;
238 /* ret=2: csum update is needed after payload mangling */
239 if (ret == 1)
240 oldlen = skb->len - udphoff;
241 else
242 payload_csum = 1;
223 } 243 }
224 244
225 udph = (void *)skb_network_header(skb) + udphoff; 245 udph = (void *)skb_network_header(skb) + udphoff;
@@ -229,15 +249,16 @@ udp_dnat_handler(struct sk_buff *skb,
229 * Adjust UDP checksums 249 * Adjust UDP checksums
230 */ 250 */
231 if (skb->ip_summed == CHECKSUM_PARTIAL) { 251 if (skb->ip_summed == CHECKSUM_PARTIAL) {
232 udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, 252 udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
233 htons(oldlen), 253 htons(oldlen),
234 htons(skb->len - udphoff)); 254 htons(skb->len - udphoff));
235 } else if (!cp->app && (udph->check != 0)) { 255 } else if (!payload_csum && (udph->check != 0)) {
236 /* Only port and addr are changed, do fast csum update */ 256 /* Only port and addr are changed, do fast csum update */
237 udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, 257 udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
238 cp->vport, cp->dport); 258 cp->vport, cp->dport);
239 if (skb->ip_summed == CHECKSUM_COMPLETE) 259 if (skb->ip_summed == CHECKSUM_COMPLETE)
240 skb->ip_summed = CHECKSUM_NONE; 260 skb->ip_summed = (cp->app && pp->csum_check) ?
261 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
241 } else { 262 } else {
242 /* full checksum calculation */ 263 /* full checksum calculation */
243 udph->check = 0; 264 udph->check = 0;
@@ -293,7 +314,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
293 skb->len - udphoff, 314 skb->len - udphoff,
294 ipv6_hdr(skb)->nexthdr, 315 ipv6_hdr(skb)->nexthdr,
295 skb->csum)) { 316 skb->csum)) {
296 IP_VS_DBG_RL_PKT(0, pp, skb, 0, 317 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
297 "Failed checksum for"); 318 "Failed checksum for");
298 return 0; 319 return 0;
299 } 320 }
@@ -304,7 +325,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
304 skb->len - udphoff, 325 skb->len - udphoff,
305 ip_hdr(skb)->protocol, 326 ip_hdr(skb)->protocol,
306 skb->csum)) { 327 skb->csum)) {
307 IP_VS_DBG_RL_PKT(0, pp, skb, 0, 328 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
308 "Failed checksum for"); 329 "Failed checksum for");
309 return 0; 330 return 0;
310 } 331 }
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index bbc1ac795952..076ebe00435d 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -35,7 +35,7 @@
35static LIST_HEAD(ip_vs_schedulers); 35static LIST_HEAD(ip_vs_schedulers);
36 36
37/* lock for service table */ 37/* lock for service table */
38static DEFINE_RWLOCK(__ip_vs_sched_lock); 38static DEFINE_SPINLOCK(ip_vs_sched_lock);
39 39
40 40
41/* 41/*
@@ -46,15 +46,6 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
46{ 46{
47 int ret; 47 int ret;
48 48
49 if (svc == NULL) {
50 pr_err("%s(): svc arg NULL\n", __func__);
51 return -EINVAL;
52 }
53 if (scheduler == NULL) {
54 pr_err("%s(): scheduler arg NULL\n", __func__);
55 return -EINVAL;
56 }
57
58 svc->scheduler = scheduler; 49 svc->scheduler = scheduler;
59 50
60 if (scheduler->init_service) { 51 if (scheduler->init_service) {
@@ -74,18 +65,10 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
74 */ 65 */
75int ip_vs_unbind_scheduler(struct ip_vs_service *svc) 66int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
76{ 67{
77 struct ip_vs_scheduler *sched; 68 struct ip_vs_scheduler *sched = svc->scheduler;
78 69
79 if (svc == NULL) { 70 if (!sched)
80 pr_err("%s(): svc arg NULL\n", __func__); 71 return 0;
81 return -EINVAL;
82 }
83
84 sched = svc->scheduler;
85 if (sched == NULL) {
86 pr_err("%s(): svc isn't bound\n", __func__);
87 return -EINVAL;
88 }
89 72
90 if (sched->done_service) { 73 if (sched->done_service) {
91 if (sched->done_service(svc) != 0) { 74 if (sched->done_service(svc) != 0) {
@@ -108,7 +91,7 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
108 91
109 IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name); 92 IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name);
110 93
111 read_lock_bh(&__ip_vs_sched_lock); 94 spin_lock_bh(&ip_vs_sched_lock);
112 95
113 list_for_each_entry(sched, &ip_vs_schedulers, n_list) { 96 list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
114 /* 97 /*
@@ -122,14 +105,14 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
122 } 105 }
123 if (strcmp(sched_name, sched->name)==0) { 106 if (strcmp(sched_name, sched->name)==0) {
124 /* HIT */ 107 /* HIT */
125 read_unlock_bh(&__ip_vs_sched_lock); 108 spin_unlock_bh(&ip_vs_sched_lock);
126 return sched; 109 return sched;
127 } 110 }
128 if (sched->module) 111 if (sched->module)
129 module_put(sched->module); 112 module_put(sched->module);
130 } 113 }
131 114
132 read_unlock_bh(&__ip_vs_sched_lock); 115 spin_unlock_bh(&ip_vs_sched_lock);
133 return NULL; 116 return NULL;
134} 117}
135 118
@@ -159,7 +142,7 @@ struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
159 142
160void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) 143void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
161{ 144{
162 if (scheduler->module) 145 if (scheduler && scheduler->module)
163 module_put(scheduler->module); 146 module_put(scheduler->module);
164} 147}
165 148
@@ -184,10 +167,10 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
184 /* increase the module use count */ 167 /* increase the module use count */
185 ip_vs_use_count_inc(); 168 ip_vs_use_count_inc();
186 169
187 write_lock_bh(&__ip_vs_sched_lock); 170 spin_lock_bh(&ip_vs_sched_lock);
188 171
189 if (!list_empty(&scheduler->n_list)) { 172 if (!list_empty(&scheduler->n_list)) {
190 write_unlock_bh(&__ip_vs_sched_lock); 173 spin_unlock_bh(&ip_vs_sched_lock);
191 ip_vs_use_count_dec(); 174 ip_vs_use_count_dec();
192 pr_err("%s(): [%s] scheduler already linked\n", 175 pr_err("%s(): [%s] scheduler already linked\n",
193 __func__, scheduler->name); 176 __func__, scheduler->name);
@@ -200,7 +183,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
200 */ 183 */
201 list_for_each_entry(sched, &ip_vs_schedulers, n_list) { 184 list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
202 if (strcmp(scheduler->name, sched->name) == 0) { 185 if (strcmp(scheduler->name, sched->name) == 0) {
203 write_unlock_bh(&__ip_vs_sched_lock); 186 spin_unlock_bh(&ip_vs_sched_lock);
204 ip_vs_use_count_dec(); 187 ip_vs_use_count_dec();
205 pr_err("%s(): [%s] scheduler already existed " 188 pr_err("%s(): [%s] scheduler already existed "
206 "in the system\n", __func__, scheduler->name); 189 "in the system\n", __func__, scheduler->name);
@@ -211,7 +194,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
211 * Add it into the d-linked scheduler list 194 * Add it into the d-linked scheduler list
212 */ 195 */
213 list_add(&scheduler->n_list, &ip_vs_schedulers); 196 list_add(&scheduler->n_list, &ip_vs_schedulers);
214 write_unlock_bh(&__ip_vs_sched_lock); 197 spin_unlock_bh(&ip_vs_sched_lock);
215 198
216 pr_info("[%s] scheduler registered.\n", scheduler->name); 199 pr_info("[%s] scheduler registered.\n", scheduler->name);
217 200
@@ -229,9 +212,9 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
229 return -EINVAL; 212 return -EINVAL;
230 } 213 }
231 214
232 write_lock_bh(&__ip_vs_sched_lock); 215 spin_lock_bh(&ip_vs_sched_lock);
233 if (list_empty(&scheduler->n_list)) { 216 if (list_empty(&scheduler->n_list)) {
234 write_unlock_bh(&__ip_vs_sched_lock); 217 spin_unlock_bh(&ip_vs_sched_lock);
235 pr_err("%s(): [%s] scheduler is not in the list. failed\n", 218 pr_err("%s(): [%s] scheduler is not in the list. failed\n",
236 __func__, scheduler->name); 219 __func__, scheduler->name);
237 return -EINVAL; 220 return -EINVAL;
@@ -241,7 +224,7 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
241 * Remove it from the d-linked scheduler list 224 * Remove it from the d-linked scheduler list
242 */ 225 */
243 list_del(&scheduler->n_list); 226 list_del(&scheduler->n_list);
244 write_unlock_bh(&__ip_vs_sched_lock); 227 spin_unlock_bh(&ip_vs_sched_lock);
245 228
246 /* decrease the module use count */ 229 /* decrease the module use count */
247 ip_vs_use_count_dec(); 230 ip_vs_use_count_dec();
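
For context, a scheduler module only touches the list protected by the new ip_vs_sched_lock through register_ip_vs_scheduler()/unregister_ip_vs_scheduler(). A hypothetical minimal caller might look like the sketch below; only the struct ip_vs_scheduler members visible in the hunks above (name, module, n_list) are filled in, and real schedulers also supply init_service/done_service and a schedule callback that are omitted here.

static struct ip_vs_scheduler my_scheduler = {
	.name	= "mysched",	/* looked up later via ip_vs_scheduler_get("mysched") */
	.module	= THIS_MODULE,
	.n_list	= LIST_HEAD_INIT(my_scheduler.n_list),
};

static int __init my_sched_init(void)
{
	/* links n_list into ip_vs_schedulers under spin_lock_bh() */
	return register_ip_vs_scheduler(&my_scheduler);
}

static void __exit my_sched_cleanup(void)
{
	unregister_ip_vs_scheduler(&my_scheduler);
}

module_init(my_sched_init);
module_exit(my_sched_cleanup);
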
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 7ba06939829f..ab85aedea17e 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -288,6 +288,16 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
288 ip_vs_sync_conn(cp->control); 288 ip_vs_sync_conn(cp->control);
289} 289}
290 290
291static inline int
292ip_vs_conn_fill_param_sync(int af, int protocol,
293 const union nf_inet_addr *caddr, __be16 cport,
294 const union nf_inet_addr *vaddr, __be16 vport,
295 struct ip_vs_conn_param *p)
296{
297 /* XXX: Need to take into account persistence engine */
298 ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p);
299 return 0;
300}
291 301
292/* 302/*
293 * Process received multicast message and create the corresponding 303 * Process received multicast message and create the corresponding
@@ -301,6 +311,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
301 struct ip_vs_conn *cp; 311 struct ip_vs_conn *cp;
302 struct ip_vs_protocol *pp; 312 struct ip_vs_protocol *pp;
303 struct ip_vs_dest *dest; 313 struct ip_vs_dest *dest;
314 struct ip_vs_conn_param param;
304 char *p; 315 char *p;
305 int i; 316 int i;
306 317
@@ -370,18 +381,20 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
370 } 381 }
371 } 382 }
372 383
373 if (!(flags & IP_VS_CONN_F_TEMPLATE)) 384 {
374 cp = ip_vs_conn_in_get(AF_INET, s->protocol, 385 if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol,
375 (union nf_inet_addr *)&s->caddr, 386 (union nf_inet_addr *)&s->caddr,
376 s->cport, 387 s->cport,
377 (union nf_inet_addr *)&s->vaddr, 388 (union nf_inet_addr *)&s->vaddr,
378 s->vport); 389 s->vport, &param)) {
 379 else 390 pr_err("ip_vs_conn_fill_param_sync failed\n");
380 cp = ip_vs_ct_in_get(AF_INET, s->protocol, 391 return;
381 (union nf_inet_addr *)&s->caddr, 392 }
382 s->cport, 393 if (!(flags & IP_VS_CONN_F_TEMPLATE))
383 (union nf_inet_addr *)&s->vaddr, 394 cp = ip_vs_conn_in_get(&param);
384 s->vport); 395 else
396 cp = ip_vs_ct_in_get(&param);
397 }
385 if (!cp) { 398 if (!cp) {
386 /* 399 /*
387 * Find the appropriate destination for the connection. 400 * Find the appropriate destination for the connection.
@@ -406,14 +419,9 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
406 else 419 else
407 flags &= ~IP_VS_CONN_F_INACTIVE; 420 flags &= ~IP_VS_CONN_F_INACTIVE;
408 } 421 }
409 cp = ip_vs_conn_new(AF_INET, s->protocol, 422 cp = ip_vs_conn_new(&param,
410 (union nf_inet_addr *)&s->caddr,
411 s->cport,
412 (union nf_inet_addr *)&s->vaddr,
413 s->vport,
414 (union nf_inet_addr *)&s->daddr, 423 (union nf_inet_addr *)&s->daddr,
415 s->dport, 424 s->dport, flags, dest);
416 flags, dest);
417 if (dest) 425 if (dest)
418 atomic_dec(&dest->refcnt); 426 atomic_dec(&dest->refcnt);
419 if (!cp) { 427 if (!cp) {
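
The change above replaces the six-argument lookup calls with a single struct ip_vs_conn_param that is filled once by ip_vs_conn_fill_param() and then handed to ip_vs_conn_in_get(), ip_vs_ct_in_get() and ip_vs_conn_new(). A rough kernel-style illustration of the pattern follows; the field and type names are guesses based on the call sites, not the real definition, which the XXX comment says will also grow persistence-engine data.

struct conn_param_sketch {
	int				af;
	__u16				protocol;
	const union nf_inet_addr	*caddr, *vaddr;
	__be16				cport, vport;
};

/* fill the parameter block once, then pass &p to every lookup */
static void fill_param_sketch(int af, __u16 protocol,
			      const union nf_inet_addr *caddr, __be16 cport,
			      const union nf_inet_addr *vaddr, __be16 vport,
			      struct conn_param_sketch *p)
{
	p->af		= af;
	p->protocol	= protocol;
	p->caddr	= caddr;
	p->cport	= cport;
	p->vaddr	= vaddr;
	p->vport	= vport;
}
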
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 21e1a5e9b9d3..de04ea39cde8 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -11,6 +11,16 @@
11 * 11 *
12 * Changes: 12 * Changes:
13 * 13 *
14 * Description of forwarding methods:
 15 * - all transmitters are called from LOCAL_IN (remote clients) and
 16 *   LOCAL_OUT (local clients), but for ICMP they can also be called from FORWARD
 17 * - not all connections have a destination server; for example,
 18 *   connections in the backup server when fwmark is used
 19 * - bypass connections use the daddr from the packet
20 * LOCAL_OUT rules:
21 * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
22 * - skb->pkt_type is not set yet
23 * - the only place where we can see skb->sk != NULL
14 */ 24 */
15 25
16#define KMSG_COMPONENT "IPVS" 26#define KMSG_COMPONENT "IPVS"
@@ -26,9 +36,9 @@
26#include <net/route.h> /* for ip_route_output */ 36#include <net/route.h> /* for ip_route_output */
27#include <net/ipv6.h> 37#include <net/ipv6.h>
28#include <net/ip6_route.h> 38#include <net/ip6_route.h>
39#include <net/addrconf.h>
29#include <linux/icmpv6.h> 40#include <linux/icmpv6.h>
30#include <linux/netfilter.h> 41#include <linux/netfilter.h>
31#include <net/netfilter/nf_conntrack.h>
32#include <linux/netfilter_ipv4.h> 42#include <linux/netfilter_ipv4.h>
33 43
34#include <net/ip_vs.h> 44#include <net/ip_vs.h>
@@ -38,26 +48,27 @@
38 * Destination cache to speed up outgoing route lookup 48 * Destination cache to speed up outgoing route lookup
39 */ 49 */
40static inline void 50static inline void
41__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) 51__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
52 u32 dst_cookie)
42{ 53{
43 struct dst_entry *old_dst; 54 struct dst_entry *old_dst;
44 55
45 old_dst = dest->dst_cache; 56 old_dst = dest->dst_cache;
46 dest->dst_cache = dst; 57 dest->dst_cache = dst;
47 dest->dst_rtos = rtos; 58 dest->dst_rtos = rtos;
59 dest->dst_cookie = dst_cookie;
48 dst_release(old_dst); 60 dst_release(old_dst);
49} 61}
50 62
51static inline struct dst_entry * 63static inline struct dst_entry *
52__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) 64__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
53{ 65{
54 struct dst_entry *dst = dest->dst_cache; 66 struct dst_entry *dst = dest->dst_cache;
55 67
56 if (!dst) 68 if (!dst)
57 return NULL; 69 return NULL;
58 if ((dst->obsolete 70 if ((dst->obsolete || rtos != dest->dst_rtos) &&
59 || (dest->af == AF_INET && rtos != dest->dst_rtos)) && 71 dst->ops->check(dst, dest->dst_cookie) == NULL) {
60 dst->ops->check(dst, cookie) == NULL) {
61 dest->dst_cache = NULL; 72 dest->dst_cache = NULL;
62 dst_release(dst); 73 dst_release(dst);
63 return NULL; 74 return NULL;
@@ -66,16 +77,24 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
66 return dst; 77 return dst;
67} 78}
68 79
80/*
81 * Get route to destination or remote server
82 * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
83 * &4=Allow redirect from remote daddr to local
84 */
69static struct rtable * 85static struct rtable *
70__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) 86__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
87 __be32 daddr, u32 rtos, int rt_mode)
71{ 88{
89 struct net *net = dev_net(skb_dst(skb)->dev);
72 struct rtable *rt; /* Route to the other host */ 90 struct rtable *rt; /* Route to the other host */
73 struct ip_vs_dest *dest = cp->dest; 91 struct rtable *ort; /* Original route */
92 int local;
74 93
75 if (dest) { 94 if (dest) {
76 spin_lock(&dest->dst_lock); 95 spin_lock(&dest->dst_lock);
77 if (!(rt = (struct rtable *) 96 if (!(rt = (struct rtable *)
78 __ip_vs_dst_check(dest, rtos, 0))) { 97 __ip_vs_dst_check(dest, rtos))) {
79 struct flowi fl = { 98 struct flowi fl = {
80 .oif = 0, 99 .oif = 0,
81 .nl_u = { 100 .nl_u = {
@@ -85,13 +104,13 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
85 .tos = rtos, } }, 104 .tos = rtos, } },
86 }; 105 };
87 106
88 if (ip_route_output_key(&init_net, &rt, &fl)) { 107 if (ip_route_output_key(net, &rt, &fl)) {
89 spin_unlock(&dest->dst_lock); 108 spin_unlock(&dest->dst_lock);
90 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", 109 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
91 &dest->addr.ip); 110 &dest->addr.ip);
92 return NULL; 111 return NULL;
93 } 112 }
94 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst)); 113 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
95 IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n", 114 IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n",
96 &dest->addr.ip, 115 &dest->addr.ip,
97 atomic_read(&rt->dst.__refcnt), rtos); 116 atomic_read(&rt->dst.__refcnt), rtos);
@@ -102,78 +121,199 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
102 .oif = 0, 121 .oif = 0,
103 .nl_u = { 122 .nl_u = {
104 .ip4_u = { 123 .ip4_u = {
105 .daddr = cp->daddr.ip, 124 .daddr = daddr,
106 .saddr = 0, 125 .saddr = 0,
107 .tos = rtos, } }, 126 .tos = rtos, } },
108 }; 127 };
109 128
110 if (ip_route_output_key(&init_net, &rt, &fl)) { 129 if (ip_route_output_key(net, &rt, &fl)) {
111 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", 130 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
112 &cp->daddr.ip); 131 &daddr);
113 return NULL; 132 return NULL;
114 } 133 }
115 } 134 }
116 135
136 local = rt->rt_flags & RTCF_LOCAL;
137 if (!((local ? 1 : 2) & rt_mode)) {
138 IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
139 (rt->rt_flags & RTCF_LOCAL) ?
140 "local":"non-local", &rt->rt_dst);
141 ip_rt_put(rt);
142 return NULL;
143 }
144 if (local && !(rt_mode & 4) && !((ort = skb_rtable(skb)) &&
145 ort->rt_flags & RTCF_LOCAL)) {
146 IP_VS_DBG_RL("Redirect from non-local address %pI4 to local "
147 "requires NAT method, dest: %pI4\n",
148 &ip_hdr(skb)->daddr, &rt->rt_dst);
149 ip_rt_put(rt);
150 return NULL;
151 }
152 if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) {
153 IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 "
154 "to non-local address, dest: %pI4\n",
155 &ip_hdr(skb)->saddr, &rt->rt_dst);
156 ip_rt_put(rt);
157 return NULL;
158 }
159
117 return rt; 160 return rt;
118} 161}
119 162
163/* Reroute packet to local IPv4 stack after DNAT */
164static int
165__ip_vs_reroute_locally(struct sk_buff *skb)
166{
167 struct rtable *rt = skb_rtable(skb);
168 struct net_device *dev = rt->dst.dev;
169 struct net *net = dev_net(dev);
170 struct iphdr *iph = ip_hdr(skb);
171
172 if (rt->fl.iif) {
173 unsigned long orefdst = skb->_skb_refdst;
174
175 if (ip_route_input(skb, iph->daddr, iph->saddr,
176 iph->tos, skb->dev))
177 return 0;
178 refdst_drop(orefdst);
179 } else {
180 struct flowi fl = {
181 .oif = 0,
182 .nl_u = {
183 .ip4_u = {
184 .daddr = iph->daddr,
185 .saddr = iph->saddr,
186 .tos = RT_TOS(iph->tos),
187 }
188 },
189 .mark = skb->mark,
190 };
191 struct rtable *rt;
192
193 if (ip_route_output_key(net, &rt, &fl))
194 return 0;
195 if (!(rt->rt_flags & RTCF_LOCAL)) {
196 ip_rt_put(rt);
197 return 0;
198 }
199 /* Drop old route. */
200 skb_dst_drop(skb);
201 skb_dst_set(skb, &rt->dst);
202 }
203 return 1;
204}
205
120#ifdef CONFIG_IP_VS_IPV6 206#ifdef CONFIG_IP_VS_IPV6
207
208static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
209{
210 return rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK;
211}
212
213static struct dst_entry *
214__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
215 struct in6_addr *ret_saddr, int do_xfrm)
216{
217 struct dst_entry *dst;
218 struct flowi fl = {
219 .oif = 0,
220 .nl_u = {
221 .ip6_u = {
222 .daddr = *daddr,
223 },
224 },
225 };
226
227 dst = ip6_route_output(net, NULL, &fl);
228 if (dst->error)
229 goto out_err;
230 if (!ret_saddr)
231 return dst;
232 if (ipv6_addr_any(&fl.fl6_src) &&
233 ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
234 &fl.fl6_dst, 0, &fl.fl6_src) < 0)
235 goto out_err;
236 if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0)
237 goto out_err;
238 ipv6_addr_copy(ret_saddr, &fl.fl6_src);
239 return dst;
240
241out_err:
242 dst_release(dst);
243 IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
244 return NULL;
245}
246
247/*
248 * Get route to destination or remote server
249 * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
250 * &4=Allow redirect from remote daddr to local
251 */
121static struct rt6_info * 252static struct rt6_info *
122__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp) 253__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
254 struct in6_addr *daddr, struct in6_addr *ret_saddr,
255 int do_xfrm, int rt_mode)
123{ 256{
257 struct net *net = dev_net(skb_dst(skb)->dev);
124 struct rt6_info *rt; /* Route to the other host */ 258 struct rt6_info *rt; /* Route to the other host */
125 struct ip_vs_dest *dest = cp->dest; 259 struct rt6_info *ort; /* Original route */
260 struct dst_entry *dst;
261 int local;
126 262
127 if (dest) { 263 if (dest) {
128 spin_lock(&dest->dst_lock); 264 spin_lock(&dest->dst_lock);
129 rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0); 265 rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
130 if (!rt) { 266 if (!rt) {
131 struct flowi fl = { 267 u32 cookie;
132 .oif = 0,
133 .nl_u = {
134 .ip6_u = {
135 .daddr = dest->addr.in6,
136 .saddr = {
137 .s6_addr32 =
138 { 0, 0, 0, 0 },
139 },
140 },
141 },
142 };
143 268
144 rt = (struct rt6_info *)ip6_route_output(&init_net, 269 dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
145 NULL, &fl); 270 &dest->dst_saddr,
146 if (!rt) { 271 do_xfrm);
272 if (!dst) {
147 spin_unlock(&dest->dst_lock); 273 spin_unlock(&dest->dst_lock);
148 IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
149 &dest->addr.in6);
150 return NULL; 274 return NULL;
151 } 275 }
152 __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst)); 276 rt = (struct rt6_info *) dst;
153 IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n", 277 cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
154 &dest->addr.in6, 278 __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
279 IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
280 &dest->addr.in6, &dest->dst_saddr,
155 atomic_read(&rt->dst.__refcnt)); 281 atomic_read(&rt->dst.__refcnt));
156 } 282 }
283 if (ret_saddr)
284 ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
157 spin_unlock(&dest->dst_lock); 285 spin_unlock(&dest->dst_lock);
158 } else { 286 } else {
159 struct flowi fl = { 287 dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
160 .oif = 0, 288 if (!dst)
161 .nl_u = {
162 .ip6_u = {
163 .daddr = cp->daddr.in6,
164 .saddr = {
165 .s6_addr32 = { 0, 0, 0, 0 },
166 },
167 },
168 },
169 };
170
171 rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
172 if (!rt) {
173 IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
174 &cp->daddr.in6);
175 return NULL; 289 return NULL;
176 } 290 rt = (struct rt6_info *) dst;
291 }
292
293 local = __ip_vs_is_local_route6(rt);
294 if (!((local ? 1 : 2) & rt_mode)) {
295 IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6\n",
296 local ? "local":"non-local", daddr);
297 dst_release(&rt->dst);
298 return NULL;
299 }
300 if (local && !(rt_mode & 4) &&
301 !((ort = (struct rt6_info *) skb_dst(skb)) &&
302 __ip_vs_is_local_route6(ort))) {
303 IP_VS_DBG_RL("Redirect from non-local address %pI6 to local "
304 "requires NAT method, dest: %pI6\n",
305 &ipv6_hdr(skb)->daddr, daddr);
306 dst_release(&rt->dst);
307 return NULL;
308 }
309 if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
310 ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
311 IPV6_ADDR_LOOPBACK)) {
312 IP_VS_DBG_RL("Stopping traffic from loopback address %pI6 "
313 "to non-local address, dest: %pI6\n",
314 &ipv6_hdr(skb)->saddr, daddr);
315 dst_release(&rt->dst);
316 return NULL;
177 } 317 }
178 318
179 return rt; 319 return rt;
@@ -194,12 +334,44 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
194 dst_release(old_dst); 334 dst_release(old_dst);
195} 335}
196 336
197#define IP_VS_XMIT(pf, skb, rt) \ 337#define IP_VS_XMIT_TUNNEL(skb, cp) \
338({ \
339 int __ret = NF_ACCEPT; \
340 \
341 (skb)->ipvs_property = 1; \
342 if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \
343 __ret = ip_vs_confirm_conntrack(skb, cp); \
344 if (__ret == NF_ACCEPT) { \
345 nf_reset(skb); \
346 skb_forward_csum(skb); \
347 } \
348 __ret; \
349})
350
351#define IP_VS_XMIT_NAT(pf, skb, cp, local) \
352do { \
353 (skb)->ipvs_property = 1; \
354 if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
355 ip_vs_notrack(skb); \
356 else \
357 ip_vs_update_conntrack(skb, cp, 1); \
358 if (local) \
359 return NF_ACCEPT; \
360 skb_forward_csum(skb); \
361 NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
362 skb_dst(skb)->dev, dst_output); \
363} while (0)
364
365#define IP_VS_XMIT(pf, skb, cp, local) \
198do { \ 366do { \
199 (skb)->ipvs_property = 1; \ 367 (skb)->ipvs_property = 1; \
368 if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
369 ip_vs_notrack(skb); \
370 if (local) \
371 return NF_ACCEPT; \
200 skb_forward_csum(skb); \ 372 skb_forward_csum(skb); \
201 NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ 373 NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
202 (rt)->dst.dev, dst_output); \ 374 skb_dst(skb)->dev, dst_output); \
203} while (0) 375} while (0)
204 376
205 377
@@ -211,7 +383,7 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
211 struct ip_vs_protocol *pp) 383 struct ip_vs_protocol *pp)
212{ 384{
213 /* we do not touch skb and do not need pskb ptr */ 385 /* we do not touch skb and do not need pskb ptr */
214 return NF_ACCEPT; 386 IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
215} 387}
216 388
217 389
@@ -226,24 +398,13 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
226{ 398{
227 struct rtable *rt; /* Route to the other host */ 399 struct rtable *rt; /* Route to the other host */
228 struct iphdr *iph = ip_hdr(skb); 400 struct iphdr *iph = ip_hdr(skb);
229 u8 tos = iph->tos;
230 int mtu; 401 int mtu;
231 struct flowi fl = {
232 .oif = 0,
233 .nl_u = {
234 .ip4_u = {
235 .daddr = iph->daddr,
236 .saddr = 0,
237 .tos = RT_TOS(tos), } },
238 };
239 402
240 EnterFunction(10); 403 EnterFunction(10);
241 404
242 if (ip_route_output_key(&init_net, &rt, &fl)) { 405 if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr,
243 IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n", 406 RT_TOS(iph->tos), 2)))
244 __func__, &iph->daddr);
245 goto tx_error_icmp; 407 goto tx_error_icmp;
246 }
247 408
248 /* MTU checking */ 409 /* MTU checking */
249 mtu = dst_mtu(&rt->dst); 410 mtu = dst_mtu(&rt->dst);
@@ -271,7 +432,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
271 /* Another hack: avoid icmp_send in ip_fragment */ 432 /* Another hack: avoid icmp_send in ip_fragment */
272 skb->local_df = 1; 433 skb->local_df = 1;
273 434
274 IP_VS_XMIT(NFPROTO_IPV4, skb, rt); 435 IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
275 436
276 LeaveFunction(10); 437 LeaveFunction(10);
277 return NF_STOLEN; 438 return NF_STOLEN;
@@ -292,28 +453,22 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
292 struct rt6_info *rt; /* Route to the other host */ 453 struct rt6_info *rt; /* Route to the other host */
293 struct ipv6hdr *iph = ipv6_hdr(skb); 454 struct ipv6hdr *iph = ipv6_hdr(skb);
294 int mtu; 455 int mtu;
295 struct flowi fl = {
296 .oif = 0,
297 .nl_u = {
298 .ip6_u = {
299 .daddr = iph->daddr,
300 .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
301 };
302 456
303 EnterFunction(10); 457 EnterFunction(10);
304 458
305 rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); 459 if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr, NULL, 0, 2)))
306 if (!rt) {
307 IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n",
308 __func__, &iph->daddr);
309 goto tx_error_icmp; 460 goto tx_error_icmp;
310 }
311 461
312 /* MTU checking */ 462 /* MTU checking */
313 mtu = dst_mtu(&rt->dst); 463 mtu = dst_mtu(&rt->dst);
314 if (skb->len > mtu) { 464 if (skb->len > mtu) {
315 dst_release(&rt->dst); 465 if (!skb->dev) {
466 struct net *net = dev_net(skb_dst(skb)->dev);
467
468 skb->dev = net->loopback_dev;
469 }
316 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 470 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
471 dst_release(&rt->dst);
317 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 472 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
318 goto tx_error; 473 goto tx_error;
319 } 474 }
@@ -335,7 +490,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
335 /* Another hack: avoid icmp_send in ip_fragment */ 490 /* Another hack: avoid icmp_send in ip_fragment */
336 skb->local_df = 1; 491 skb->local_df = 1;
337 492
338 IP_VS_XMIT(NFPROTO_IPV6, skb, rt); 493 IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
339 494
340 LeaveFunction(10); 495 LeaveFunction(10);
341 return NF_STOLEN; 496 return NF_STOLEN;
@@ -349,30 +504,6 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
349} 504}
350#endif 505#endif
351 506
352static void
353ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp)
354{
355 struct nf_conn *ct = (struct nf_conn *)skb->nfct;
356 struct nf_conntrack_tuple new_tuple;
357
358 if (ct == NULL || nf_ct_is_untracked(ct) || nf_ct_is_confirmed(ct))
359 return;
360
361 /*
362 * The connection is not yet in the hashtable, so we update it.
363 * CIP->VIP will remain the same, so leave the tuple in
364 * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the
365 * real-server we will see RIP->DIP.
366 */
367 new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
368 new_tuple.src.u3 = cp->daddr;
369 /*
370 * This will also take care of UDP and other protocols.
371 */
372 new_tuple.src.u.tcp.port = cp->dport;
373 nf_conntrack_alter_reply(ct, &new_tuple);
374}
375
376/* 507/*
377 * NAT transmitter (only for outside-to-inside nat forwarding) 508 * NAT transmitter (only for outside-to-inside nat forwarding)
378 * Not used for related ICMP 509 * Not used for related ICMP
@@ -384,6 +515,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
384 struct rtable *rt; /* Route to the other host */ 515 struct rtable *rt; /* Route to the other host */
385 int mtu; 516 int mtu;
386 struct iphdr *iph = ip_hdr(skb); 517 struct iphdr *iph = ip_hdr(skb);
518 int local;
387 519
388 EnterFunction(10); 520 EnterFunction(10);
389 521
@@ -397,16 +529,42 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
397 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); 529 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
398 } 530 }
399 531
400 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) 532 if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
533 RT_TOS(iph->tos), 1|2|4)))
401 goto tx_error_icmp; 534 goto tx_error_icmp;
535 local = rt->rt_flags & RTCF_LOCAL;
536 /*
537 * Avoid duplicate tuple in reply direction for NAT traffic
538 * to local address when connection is sync-ed
539 */
540#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
541 if (cp->flags & IP_VS_CONN_F_SYNC && local) {
542 enum ip_conntrack_info ctinfo;
 543 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
544
545 if (ct && !nf_ct_is_untracked(ct)) {
546 IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
547 "ip_vs_nat_xmit(): "
548 "stopping DNAT to local address");
549 goto tx_error_put;
550 }
551 }
552#endif
553
554 /* From world but DNAT to loopback address? */
555 if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) {
556 IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
557 "stopping DNAT to loopback address");
558 goto tx_error_put;
559 }
402 560
403 /* MTU checking */ 561 /* MTU checking */
404 mtu = dst_mtu(&rt->dst); 562 mtu = dst_mtu(&rt->dst);
405 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { 563 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
406 ip_rt_put(rt);
407 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 564 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
408 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); 565 IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
409 goto tx_error; 566 "ip_vs_nat_xmit(): frag needed for");
567 goto tx_error_put;
410 } 568 }
411 569
412 /* copy-on-write the packet before mangling it */ 570 /* copy-on-write the packet before mangling it */
@@ -416,19 +574,28 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
416 if (skb_cow(skb, rt->dst.dev->hard_header_len)) 574 if (skb_cow(skb, rt->dst.dev->hard_header_len))
417 goto tx_error_put; 575 goto tx_error_put;
418 576
419 /* drop old route */
420 skb_dst_drop(skb);
421 skb_dst_set(skb, &rt->dst);
422
423 /* mangle the packet */ 577 /* mangle the packet */
424 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) 578 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
425 goto tx_error; 579 goto tx_error_put;
426 ip_hdr(skb)->daddr = cp->daddr.ip; 580 ip_hdr(skb)->daddr = cp->daddr.ip;
427 ip_send_check(ip_hdr(skb)); 581 ip_send_check(ip_hdr(skb));
428 582
429 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); 583 if (!local) {
584 /* drop old route */
585 skb_dst_drop(skb);
586 skb_dst_set(skb, &rt->dst);
587 } else {
588 ip_rt_put(rt);
589 /*
590 * Some IPv4 replies get local address from routes,
591 * not from iph, so while we DNAT after routing
592 * we need this second input/output route.
593 */
594 if (!__ip_vs_reroute_locally(skb))
595 goto tx_error;
596 }
430 597
431 ip_vs_update_conntrack(skb, cp); 598 IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
432 599
433 /* FIXME: when application helper enlarges the packet and the length 600 /* FIXME: when application helper enlarges the packet and the length
434 is larger than the MTU of outgoing device, there will be still 601 is larger than the MTU of outgoing device, there will be still
@@ -437,7 +604,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
437 /* Another hack: avoid icmp_send in ip_fragment */ 604 /* Another hack: avoid icmp_send in ip_fragment */
438 skb->local_df = 1; 605 skb->local_df = 1;
439 606
440 IP_VS_XMIT(NFPROTO_IPV4, skb, rt); 607 IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
441 608
442 LeaveFunction(10); 609 LeaveFunction(10);
443 return NF_STOLEN; 610 return NF_STOLEN;
@@ -445,8 +612,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
445 tx_error_icmp: 612 tx_error_icmp:
446 dst_link_failure(skb); 613 dst_link_failure(skb);
447 tx_error: 614 tx_error:
448 LeaveFunction(10);
449 kfree_skb(skb); 615 kfree_skb(skb);
616 LeaveFunction(10);
450 return NF_STOLEN; 617 return NF_STOLEN;
451 tx_error_put: 618 tx_error_put:
452 ip_rt_put(rt); 619 ip_rt_put(rt);
@@ -460,6 +627,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
460{ 627{
461 struct rt6_info *rt; /* Route to the other host */ 628 struct rt6_info *rt; /* Route to the other host */
462 int mtu; 629 int mtu;
630 int local;
463 631
464 EnterFunction(10); 632 EnterFunction(10);
465 633
@@ -474,18 +642,49 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
474 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); 642 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
475 } 643 }
476 644
477 rt = __ip_vs_get_out_rt_v6(cp); 645 if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
478 if (!rt) 646 0, 1|2|4)))
479 goto tx_error_icmp; 647 goto tx_error_icmp;
648 local = __ip_vs_is_local_route6(rt);
649 /*
650 * Avoid duplicate tuple in reply direction for NAT traffic
651 * to local address when connection is sync-ed
652 */
653#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
654 if (cp->flags & IP_VS_CONN_F_SYNC && local) {
655 enum ip_conntrack_info ctinfo;
 656 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
657
658 if (ct && !nf_ct_is_untracked(ct)) {
659 IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,
660 "ip_vs_nat_xmit_v6(): "
661 "stopping DNAT to local address");
662 goto tx_error_put;
663 }
664 }
665#endif
666
667 /* From world but DNAT to loopback address? */
668 if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
669 ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
670 IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
671 "ip_vs_nat_xmit_v6(): "
672 "stopping DNAT to loopback address");
673 goto tx_error_put;
674 }
480 675
481 /* MTU checking */ 676 /* MTU checking */
482 mtu = dst_mtu(&rt->dst); 677 mtu = dst_mtu(&rt->dst);
483 if (skb->len > mtu) { 678 if (skb->len > mtu) {
484 dst_release(&rt->dst); 679 if (!skb->dev) {
680 struct net *net = dev_net(skb_dst(skb)->dev);
681
682 skb->dev = net->loopback_dev;
683 }
485 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 684 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
486 IP_VS_DBG_RL_PKT(0, pp, skb, 0, 685 IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0,
487 "ip_vs_nat_xmit_v6(): frag needed for"); 686 "ip_vs_nat_xmit_v6(): frag needed for");
488 goto tx_error; 687 goto tx_error_put;
489 } 688 }
490 689
491 /* copy-on-write the packet before mangling it */ 690 /* copy-on-write the packet before mangling it */
@@ -495,18 +694,21 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
495 if (skb_cow(skb, rt->dst.dev->hard_header_len)) 694 if (skb_cow(skb, rt->dst.dev->hard_header_len))
496 goto tx_error_put; 695 goto tx_error_put;
497 696
498 /* drop old route */
499 skb_dst_drop(skb);
500 skb_dst_set(skb, &rt->dst);
501
502 /* mangle the packet */ 697 /* mangle the packet */
503 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) 698 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
504 goto tx_error; 699 goto tx_error;
505 ipv6_hdr(skb)->daddr = cp->daddr.in6; 700 ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &cp->daddr.in6);
506 701
507 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); 702 if (!local || !skb->dev) {
703 /* drop the old route when skb is not shared */
704 skb_dst_drop(skb);
705 skb_dst_set(skb, &rt->dst);
706 } else {
707 /* destined to loopback, do we need to change route? */
708 dst_release(&rt->dst);
709 }
508 710
509 ip_vs_update_conntrack(skb, cp); 711 IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
510 712
511 /* FIXME: when application helper enlarges the packet and the length 713 /* FIXME: when application helper enlarges the packet and the length
512 is larger than the MTU of outgoing device, there will be still 714 is larger than the MTU of outgoing device, there will be still
@@ -515,7 +717,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
515 /* Another hack: avoid icmp_send in ip_fragment */ 717 /* Another hack: avoid icmp_send in ip_fragment */
516 skb->local_df = 1; 718 skb->local_df = 1;
517 719
518 IP_VS_XMIT(NFPROTO_IPV6, skb, rt); 720 IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
519 721
520 LeaveFunction(10); 722 LeaveFunction(10);
521 return NF_STOLEN; 723 return NF_STOLEN;
@@ -561,30 +763,27 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
561 struct iphdr *old_iph = ip_hdr(skb); 763 struct iphdr *old_iph = ip_hdr(skb);
562 u8 tos = old_iph->tos; 764 u8 tos = old_iph->tos;
563 __be16 df = old_iph->frag_off; 765 __be16 df = old_iph->frag_off;
564 sk_buff_data_t old_transport_header = skb->transport_header;
565 struct iphdr *iph; /* Our new IP header */ 766 struct iphdr *iph; /* Our new IP header */
566 unsigned int max_headroom; /* The extra header space needed */ 767 unsigned int max_headroom; /* The extra header space needed */
567 int mtu; 768 int mtu;
769 int ret;
568 770
569 EnterFunction(10); 771 EnterFunction(10);
570 772
571 if (skb->protocol != htons(ETH_P_IP)) { 773 if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
572 IP_VS_DBG_RL("%s(): protocol error, " 774 RT_TOS(tos), 1|2)))
573 "ETH_P_IP: %d, skb protocol: %d\n",
574 __func__, htons(ETH_P_IP), skb->protocol);
575 goto tx_error;
576 }
577
578 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
579 goto tx_error_icmp; 775 goto tx_error_icmp;
776 if (rt->rt_flags & RTCF_LOCAL) {
777 ip_rt_put(rt);
778 IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
779 }
580 780
581 tdev = rt->dst.dev; 781 tdev = rt->dst.dev;
582 782
583 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); 783 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
584 if (mtu < 68) { 784 if (mtu < 68) {
585 ip_rt_put(rt);
586 IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); 785 IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
587 goto tx_error; 786 goto tx_error_put;
588 } 787 }
589 if (skb_dst(skb)) 788 if (skb_dst(skb))
590 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); 789 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
@@ -594,9 +793,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
594 if ((old_iph->frag_off & htons(IP_DF)) 793 if ((old_iph->frag_off & htons(IP_DF))
595 && mtu < ntohs(old_iph->tot_len)) { 794 && mtu < ntohs(old_iph->tot_len)) {
596 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 795 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
597 ip_rt_put(rt);
598 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 796 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
599 goto tx_error; 797 goto tx_error_put;
600 } 798 }
601 799
602 /* 800 /*
@@ -619,7 +817,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
619 old_iph = ip_hdr(skb); 817 old_iph = ip_hdr(skb);
620 } 818 }
621 819
622 skb->transport_header = old_transport_header; 820 skb->transport_header = skb->network_header;
623 821
624 /* fix old IP header checksum */ 822 /* fix old IP header checksum */
625 ip_send_check(old_iph); 823 ip_send_check(old_iph);
@@ -649,7 +847,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
649 /* Another hack: avoid icmp_send in ip_fragment */ 847 /* Another hack: avoid icmp_send in ip_fragment */
650 skb->local_df = 1; 848 skb->local_df = 1;
651 849
652 ip_local_out(skb); 850 ret = IP_VS_XMIT_TUNNEL(skb, cp);
851 if (ret == NF_ACCEPT)
852 ip_local_out(skb);
853 else if (ret == NF_DROP)
854 kfree_skb(skb);
653 855
654 LeaveFunction(10); 856 LeaveFunction(10);
655 857
@@ -661,6 +863,9 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
661 kfree_skb(skb); 863 kfree_skb(skb);
662 LeaveFunction(10); 864 LeaveFunction(10);
663 return NF_STOLEN; 865 return NF_STOLEN;
866tx_error_put:
867 ip_rt_put(rt);
868 goto tx_error;
664} 869}
665 870
666#ifdef CONFIG_IP_VS_IPV6 871#ifdef CONFIG_IP_VS_IPV6
@@ -669,43 +874,44 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
669 struct ip_vs_protocol *pp) 874 struct ip_vs_protocol *pp)
670{ 875{
671 struct rt6_info *rt; /* Route to the other host */ 876 struct rt6_info *rt; /* Route to the other host */
877 struct in6_addr saddr; /* Source for tunnel */
672 struct net_device *tdev; /* Device to other host */ 878 struct net_device *tdev; /* Device to other host */
673 struct ipv6hdr *old_iph = ipv6_hdr(skb); 879 struct ipv6hdr *old_iph = ipv6_hdr(skb);
674 sk_buff_data_t old_transport_header = skb->transport_header;
675 struct ipv6hdr *iph; /* Our new IP header */ 880 struct ipv6hdr *iph; /* Our new IP header */
676 unsigned int max_headroom; /* The extra header space needed */ 881 unsigned int max_headroom; /* The extra header space needed */
677 int mtu; 882 int mtu;
883 int ret;
678 884
679 EnterFunction(10); 885 EnterFunction(10);
680 886
681 if (skb->protocol != htons(ETH_P_IPV6)) { 887 if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
682 IP_VS_DBG_RL("%s(): protocol error, " 888 &saddr, 1, 1|2)))
683 "ETH_P_IPV6: %d, skb protocol: %d\n",
684 __func__, htons(ETH_P_IPV6), skb->protocol);
685 goto tx_error;
686 }
687
688 rt = __ip_vs_get_out_rt_v6(cp);
689 if (!rt)
690 goto tx_error_icmp; 889 goto tx_error_icmp;
890 if (__ip_vs_is_local_route6(rt)) {
891 dst_release(&rt->dst);
892 IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
893 }
691 894
692 tdev = rt->dst.dev; 895 tdev = rt->dst.dev;
693 896
694 mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); 897 mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
695 /* TODO IPv6: do we need this check in IPv6? */ 898 if (mtu < IPV6_MIN_MTU) {
696 if (mtu < 1280) { 899 IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
697 dst_release(&rt->dst); 900 IPV6_MIN_MTU);
698 IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__); 901 goto tx_error_put;
699 goto tx_error;
700 } 902 }
701 if (skb_dst(skb)) 903 if (skb_dst(skb))
702 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); 904 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
703 905
704 if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { 906 if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
907 if (!skb->dev) {
908 struct net *net = dev_net(skb_dst(skb)->dev);
909
910 skb->dev = net->loopback_dev;
911 }
705 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 912 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
706 dst_release(&rt->dst);
707 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 913 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
708 goto tx_error; 914 goto tx_error_put;
709 } 915 }
710 916
711 /* 917 /*
@@ -728,7 +934,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
728 old_iph = ipv6_hdr(skb); 934 old_iph = ipv6_hdr(skb);
729 } 935 }
730 936
731 skb->transport_header = old_transport_header; 937 skb->transport_header = skb->network_header;
732 938
733 skb_push(skb, sizeof(struct ipv6hdr)); 939 skb_push(skb, sizeof(struct ipv6hdr));
734 skb_reset_network_header(skb); 940 skb_reset_network_header(skb);
@@ -748,14 +954,18 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
748 be16_add_cpu(&iph->payload_len, sizeof(*old_iph)); 954 be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
749 iph->priority = old_iph->priority; 955 iph->priority = old_iph->priority;
750 memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); 956 memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
751 iph->daddr = rt->rt6i_dst.addr; 957 ipv6_addr_copy(&iph->daddr, &cp->daddr.in6);
752 iph->saddr = cp->vaddr.in6; /* rt->rt6i_src.addr; */ 958 ipv6_addr_copy(&iph->saddr, &saddr);
753 iph->hop_limit = old_iph->hop_limit; 959 iph->hop_limit = old_iph->hop_limit;
754 960
755 /* Another hack: avoid icmp_send in ip_fragment */ 961 /* Another hack: avoid icmp_send in ip_fragment */
756 skb->local_df = 1; 962 skb->local_df = 1;
757 963
758 ip6_local_out(skb); 964 ret = IP_VS_XMIT_TUNNEL(skb, cp);
965 if (ret == NF_ACCEPT)
966 ip6_local_out(skb);
967 else if (ret == NF_DROP)
968 kfree_skb(skb);
759 969
760 LeaveFunction(10); 970 LeaveFunction(10);
761 971
@@ -767,6 +977,9 @@ tx_error:
767 kfree_skb(skb); 977 kfree_skb(skb);
768 LeaveFunction(10); 978 LeaveFunction(10);
769 return NF_STOLEN; 979 return NF_STOLEN;
980tx_error_put:
981 dst_release(&rt->dst);
982 goto tx_error;
770} 983}
771#endif 984#endif
772 985
@@ -785,8 +998,13 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
785 998
786 EnterFunction(10); 999 EnterFunction(10);
787 1000
788 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) 1001 if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
1002 RT_TOS(iph->tos), 1|2)))
789 goto tx_error_icmp; 1003 goto tx_error_icmp;
1004 if (rt->rt_flags & RTCF_LOCAL) {
1005 ip_rt_put(rt);
1006 IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
1007 }
790 1008
791 /* MTU checking */ 1009 /* MTU checking */
792 mtu = dst_mtu(&rt->dst); 1010 mtu = dst_mtu(&rt->dst);
@@ -814,7 +1032,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
814 /* Another hack: avoid icmp_send in ip_fragment */ 1032 /* Another hack: avoid icmp_send in ip_fragment */
815 skb->local_df = 1; 1033 skb->local_df = 1;
816 1034
817 IP_VS_XMIT(NFPROTO_IPV4, skb, rt); 1035 IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
818 1036
819 LeaveFunction(10); 1037 LeaveFunction(10);
820 return NF_STOLEN; 1038 return NF_STOLEN;
@@ -837,13 +1055,22 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
837 1055
838 EnterFunction(10); 1056 EnterFunction(10);
839 1057
840 rt = __ip_vs_get_out_rt_v6(cp); 1058 if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
841 if (!rt) 1059 0, 1|2)))
842 goto tx_error_icmp; 1060 goto tx_error_icmp;
1061 if (__ip_vs_is_local_route6(rt)) {
1062 dst_release(&rt->dst);
1063 IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
1064 }
843 1065
844 /* MTU checking */ 1066 /* MTU checking */
845 mtu = dst_mtu(&rt->dst); 1067 mtu = dst_mtu(&rt->dst);
846 if (skb->len > mtu) { 1068 if (skb->len > mtu) {
1069 if (!skb->dev) {
1070 struct net *net = dev_net(skb_dst(skb)->dev);
1071
1072 skb->dev = net->loopback_dev;
1073 }
847 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 1074 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
848 dst_release(&rt->dst); 1075 dst_release(&rt->dst);
849 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 1076 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
@@ -867,7 +1094,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
867 /* Another hack: avoid icmp_send in ip_fragment */ 1094 /* Another hack: avoid icmp_send in ip_fragment */
868 skb->local_df = 1; 1095 skb->local_df = 1;
869 1096
870 IP_VS_XMIT(NFPROTO_IPV6, skb, rt); 1097 IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
871 1098
872 LeaveFunction(10); 1099 LeaveFunction(10);
873 return NF_STOLEN; 1100 return NF_STOLEN;
@@ -893,6 +1120,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
893 struct rtable *rt; /* Route to the other host */ 1120 struct rtable *rt; /* Route to the other host */
894 int mtu; 1121 int mtu;
895 int rc; 1122 int rc;
1123 int local;
896 1124
897 EnterFunction(10); 1125 EnterFunction(10);
898 1126
@@ -913,16 +1141,43 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
913 * mangle and send the packet here (only for VS/NAT) 1141 * mangle and send the packet here (only for VS/NAT)
914 */ 1142 */
915 1143
916 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos)))) 1144 if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
1145 RT_TOS(ip_hdr(skb)->tos), 1|2|4)))
917 goto tx_error_icmp; 1146 goto tx_error_icmp;
1147 local = rt->rt_flags & RTCF_LOCAL;
1148
1149 /*
1150 * Avoid duplicate tuple in reply direction for NAT traffic
1151 * to local address when connection is sync-ed
1152 */
1153#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
1154 if (cp->flags & IP_VS_CONN_F_SYNC && local) {
1155 enum ip_conntrack_info ctinfo;
 1156 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
1157
1158 if (ct && !nf_ct_is_untracked(ct)) {
1159 IP_VS_DBG(10, "%s(): "
1160 "stopping DNAT to local address %pI4\n",
1161 __func__, &cp->daddr.ip);
1162 goto tx_error_put;
1163 }
1164 }
1165#endif
1166
1167 /* From world but DNAT to loopback address? */
1168 if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) {
1169 IP_VS_DBG(1, "%s(): "
1170 "stopping DNAT to loopback %pI4\n",
1171 __func__, &cp->daddr.ip);
1172 goto tx_error_put;
1173 }
918 1174
919 /* MTU checking */ 1175 /* MTU checking */
920 mtu = dst_mtu(&rt->dst); 1176 mtu = dst_mtu(&rt->dst);
921 if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { 1177 if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
922 ip_rt_put(rt);
923 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); 1178 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
924 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 1179 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
925 goto tx_error; 1180 goto tx_error_put;
926 } 1181 }
927 1182
928 /* copy-on-write the packet before mangling it */ 1183 /* copy-on-write the packet before mangling it */
@@ -932,16 +1187,27 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
932 if (skb_cow(skb, rt->dst.dev->hard_header_len)) 1187 if (skb_cow(skb, rt->dst.dev->hard_header_len))
933 goto tx_error_put; 1188 goto tx_error_put;
934 1189
935 /* drop the old route when skb is not shared */
936 skb_dst_drop(skb);
937 skb_dst_set(skb, &rt->dst);
938
939 ip_vs_nat_icmp(skb, pp, cp, 0); 1190 ip_vs_nat_icmp(skb, pp, cp, 0);
940 1191
1192 if (!local) {
1193 /* drop the old route when skb is not shared */
1194 skb_dst_drop(skb);
1195 skb_dst_set(skb, &rt->dst);
1196 } else {
1197 ip_rt_put(rt);
1198 /*
1199 * Some IPv4 replies get local address from routes,
1200 * not from iph, so while we DNAT after routing
1201 * we need this second input/output route.
1202 */
1203 if (!__ip_vs_reroute_locally(skb))
1204 goto tx_error;
1205 }
1206
941 /* Another hack: avoid icmp_send in ip_fragment */ 1207 /* Another hack: avoid icmp_send in ip_fragment */
942 skb->local_df = 1; 1208 skb->local_df = 1;
943 1209
944 IP_VS_XMIT(NFPROTO_IPV4, skb, rt); 1210 IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
945 1211
946 rc = NF_STOLEN; 1212 rc = NF_STOLEN;
947 goto out; 1213 goto out;
@@ -967,6 +1233,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
967 struct rt6_info *rt; /* Route to the other host */ 1233 struct rt6_info *rt; /* Route to the other host */
968 int mtu; 1234 int mtu;
969 int rc; 1235 int rc;
1236 int local;
970 1237
971 EnterFunction(10); 1238 EnterFunction(10);
972 1239
@@ -987,17 +1254,49 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
987 * mangle and send the packet here (only for VS/NAT) 1254 * mangle and send the packet here (only for VS/NAT)
988 */ 1255 */
989 1256
990 rt = __ip_vs_get_out_rt_v6(cp); 1257 if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
991 if (!rt) 1258 0, 1|2|4)))
992 goto tx_error_icmp; 1259 goto tx_error_icmp;
993 1260
1261 local = __ip_vs_is_local_route6(rt);
1262 /*
1263 * Avoid duplicate tuple in reply direction for NAT traffic
1264 * to local address when connection is sync-ed
1265 */
1266#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
1267 if (cp->flags & IP_VS_CONN_F_SYNC && local) {
1268 enum ip_conntrack_info ctinfo;
 1269 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
1270
1271 if (ct && !nf_ct_is_untracked(ct)) {
1272 IP_VS_DBG(10, "%s(): "
1273 "stopping DNAT to local address %pI6\n",
1274 __func__, &cp->daddr.in6);
1275 goto tx_error_put;
1276 }
1277 }
1278#endif
1279
1280 /* From world but DNAT to loopback address? */
1281 if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
1282 ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
1283 IP_VS_DBG(1, "%s(): "
1284 "stopping DNAT to loopback %pI6\n",
1285 __func__, &cp->daddr.in6);
1286 goto tx_error_put;
1287 }
1288
994 /* MTU checking */ 1289 /* MTU checking */
995 mtu = dst_mtu(&rt->dst); 1290 mtu = dst_mtu(&rt->dst);
996 if (skb->len > mtu) { 1291 if (skb->len > mtu) {
997 dst_release(&rt->dst); 1292 if (!skb->dev) {
1293 struct net *net = dev_net(skb_dst(skb)->dev);
1294
1295 skb->dev = net->loopback_dev;
1296 }
998 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 1297 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
999 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 1298 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
1000 goto tx_error; 1299 goto tx_error_put;
1001 } 1300 }
1002 1301
1003 /* copy-on-write the packet before mangling it */ 1302 /* copy-on-write the packet before mangling it */
@@ -1007,16 +1306,21 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
1007 if (skb_cow(skb, rt->dst.dev->hard_header_len)) 1306 if (skb_cow(skb, rt->dst.dev->hard_header_len))
1008 goto tx_error_put; 1307 goto tx_error_put;
1009 1308
1010 /* drop the old route when skb is not shared */
1011 skb_dst_drop(skb);
1012 skb_dst_set(skb, &rt->dst);
1013
1014 ip_vs_nat_icmp_v6(skb, pp, cp, 0); 1309 ip_vs_nat_icmp_v6(skb, pp, cp, 0);
1015 1310
1311 if (!local || !skb->dev) {
1312 /* drop the old route when skb is not shared */
1313 skb_dst_drop(skb);
1314 skb_dst_set(skb, &rt->dst);
1315 } else {
1316 /* destined to loopback, do we need to change route? */
1317 dst_release(&rt->dst);
1318 }
1319
1016 /* Another hack: avoid icmp_send in ip_fragment */ 1320 /* Another hack: avoid icmp_send in ip_fragment */
1017 skb->local_df = 1; 1321 skb->local_df = 1;
1018 1322
1019 IP_VS_XMIT(NFPROTO_IPV6, skb, rt); 1323 IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
1020 1324
1021 rc = NF_STOLEN; 1325 rc = NF_STOLEN;
1022 goto out; 1326 goto out;
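
All of the transmitters above now go through __ip_vs_get_out_rt()/__ip_vs_get_out_rt_v6() with an rt_mode bitmask: bit 1 allows a local destination, bit 2 a non-local one, bit 4 a redirect from a remote daddr to a local address; bypass passes 2, DR and TUN pass 1|2, NAT and the ICMP paths pass 1|2|4. A small standalone sketch of the "(local ? 1 : 2) & rt_mode" test used in both route lookups, with illustrative macro names that are not from the kernel:

#include <stdbool.h>

#define RT_MODE_ALLOW_LOCAL	1	/* NAT/ICMP: 1|2|4 */
#define RT_MODE_ALLOW_NONLOCAL	2	/* bypass: 2; DR/TUN: 1|2 */
#define RT_MODE_ALLOW_REDIRECT	4	/* remote daddr may be DNATed to a local one */

/* Mirrors "(local ? 1 : 2) & rt_mode": a local destination needs bit 1,
 * a non-local destination needs bit 2, otherwise the route is refused. */
static bool dest_type_allowed(bool dest_is_local, int rt_mode)
{
	return ((dest_is_local ? RT_MODE_ALLOW_LOCAL
			       : RT_MODE_ALLOW_NONLOCAL) & rt_mode) != 0;
}
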
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index df3eedb142ff..1eacf8d9966a 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -65,32 +65,42 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max);
65DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); 65DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
66EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); 66EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
67 67
68static int nf_conntrack_hash_rnd_initted; 68static unsigned int nf_conntrack_hash_rnd __read_mostly;
69static unsigned int nf_conntrack_hash_rnd;
70 69
71static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, 70static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone)
72 u16 zone, unsigned int size, unsigned int rnd)
73{ 71{
74 unsigned int n; 72 unsigned int n;
75 u_int32_t h;
76 73
77 /* The direction must be ignored, so we hash everything up to the 74 /* The direction must be ignored, so we hash everything up to the
78 * destination ports (which is a multiple of 4) and treat the last 75 * destination ports (which is a multiple of 4) and treat the last
79 * three bytes manually. 76 * three bytes manually.
80 */ 77 */
81 n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); 78 n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
82 h = jhash2((u32 *)tuple, n, 79 return jhash2((u32 *)tuple, n, zone ^ nf_conntrack_hash_rnd ^
83 zone ^ rnd ^ (((__force __u16)tuple->dst.u.all << 16) | 80 (((__force __u16)tuple->dst.u.all << 16) |
84 tuple->dst.protonum)); 81 tuple->dst.protonum));
82}
83
84static u32 __hash_bucket(u32 hash, unsigned int size)
85{
86 return ((u64)hash * size) >> 32;
87}
88
89static u32 hash_bucket(u32 hash, const struct net *net)
90{
91 return __hash_bucket(hash, net->ct.htable_size);
92}
85 93
86 return ((u64)h * size) >> 32; 94static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
95 u16 zone, unsigned int size)
96{
97 return __hash_bucket(hash_conntrack_raw(tuple, zone), size);
87} 98}
88 99
89static inline u_int32_t hash_conntrack(const struct net *net, u16 zone, 100static inline u_int32_t hash_conntrack(const struct net *net, u16 zone,
90 const struct nf_conntrack_tuple *tuple) 101 const struct nf_conntrack_tuple *tuple)
91{ 102{
92 return __hash_conntrack(tuple, zone, net->ct.htable_size, 103 return __hash_conntrack(tuple, zone, net->ct.htable_size);
93 nf_conntrack_hash_rnd);
94} 104}
95 105
96bool 106bool
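
hash_conntrack_raw() above now produces a full 32-bit hash that is kept around, and __hash_bucket() maps it into the table with a multiply-and-shift instead of a modulo. A standalone sketch of that mapping (hash_bucket_sketch is a made-up name):

#include <stdint.h>
#include <stdio.h>

/* ((u64)hash * size) >> 32 scales a 32-bit hash into [0, size) without a
 * division and without requiring size to be a power of two. */
static uint32_t hash_bucket_sketch(uint32_t hash, uint32_t size)
{
	return (uint32_t)(((uint64_t)hash * size) >> 32);
}

int main(void)
{
	/* a hash of 0x80000000 lands in the middle of a 16384-bucket table */
	printf("%u\n", hash_bucket_sketch(0x80000000u, 16384));	/* prints 8192 */
	return 0;
}
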
@@ -292,20 +302,20 @@ static void death_by_timeout(unsigned long ul_conntrack)
292 * OR 302 * OR
293 * - Caller must lock nf_conntrack_lock before calling this function 303 * - Caller must lock nf_conntrack_lock before calling this function
294 */ 304 */
295struct nf_conntrack_tuple_hash * 305static struct nf_conntrack_tuple_hash *
296__nf_conntrack_find(struct net *net, u16 zone, 306____nf_conntrack_find(struct net *net, u16 zone,
297 const struct nf_conntrack_tuple *tuple) 307 const struct nf_conntrack_tuple *tuple, u32 hash)
298{ 308{
299 struct nf_conntrack_tuple_hash *h; 309 struct nf_conntrack_tuple_hash *h;
300 struct hlist_nulls_node *n; 310 struct hlist_nulls_node *n;
301 unsigned int hash = hash_conntrack(net, zone, tuple); 311 unsigned int bucket = hash_bucket(hash, net);
302 312
303 /* Disable BHs the entire time since we normally need to disable them 313 /* Disable BHs the entire time since we normally need to disable them
304 * at least once for the stats anyway. 314 * at least once for the stats anyway.
305 */ 315 */
306 local_bh_disable(); 316 local_bh_disable();
307begin: 317begin:
308 hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) { 318 hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) {
309 if (nf_ct_tuple_equal(tuple, &h->tuple) && 319 if (nf_ct_tuple_equal(tuple, &h->tuple) &&
310 nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) { 320 nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) {
311 NF_CT_STAT_INC(net, found); 321 NF_CT_STAT_INC(net, found);
@@ -319,7 +329,7 @@ begin:
319 * not the expected one, we must restart lookup. 329 * not the expected one, we must restart lookup.
320 * We probably met an item that was moved to another chain. 330 * We probably met an item that was moved to another chain.
321 */ 331 */
322 if (get_nulls_value(n) != hash) { 332 if (get_nulls_value(n) != bucket) {
323 NF_CT_STAT_INC(net, search_restart); 333 NF_CT_STAT_INC(net, search_restart);
324 goto begin; 334 goto begin;
325 } 335 }
@@ -327,19 +337,27 @@ begin:
327 337
328 return NULL; 338 return NULL;
329} 339}
340
341struct nf_conntrack_tuple_hash *
342__nf_conntrack_find(struct net *net, u16 zone,
343 const struct nf_conntrack_tuple *tuple)
344{
345 return ____nf_conntrack_find(net, zone, tuple,
346 hash_conntrack_raw(tuple, zone));
347}
330EXPORT_SYMBOL_GPL(__nf_conntrack_find); 348EXPORT_SYMBOL_GPL(__nf_conntrack_find);
331 349
332/* Find a connection corresponding to a tuple. */ 350/* Find a connection corresponding to a tuple. */
333struct nf_conntrack_tuple_hash * 351static struct nf_conntrack_tuple_hash *
334nf_conntrack_find_get(struct net *net, u16 zone, 352__nf_conntrack_find_get(struct net *net, u16 zone,
335 const struct nf_conntrack_tuple *tuple) 353 const struct nf_conntrack_tuple *tuple, u32 hash)
336{ 354{
337 struct nf_conntrack_tuple_hash *h; 355 struct nf_conntrack_tuple_hash *h;
338 struct nf_conn *ct; 356 struct nf_conn *ct;
339 357
340 rcu_read_lock(); 358 rcu_read_lock();
341begin: 359begin:
342 h = __nf_conntrack_find(net, zone, tuple); 360 h = ____nf_conntrack_find(net, zone, tuple, hash);
343 if (h) { 361 if (h) {
344 ct = nf_ct_tuplehash_to_ctrack(h); 362 ct = nf_ct_tuplehash_to_ctrack(h);
345 if (unlikely(nf_ct_is_dying(ct) || 363 if (unlikely(nf_ct_is_dying(ct) ||
@@ -357,6 +375,14 @@ begin:
357 375
358 return h; 376 return h;
359} 377}
378
379struct nf_conntrack_tuple_hash *
380nf_conntrack_find_get(struct net *net, u16 zone,
381 const struct nf_conntrack_tuple *tuple)
382{
383 return __nf_conntrack_find_get(net, zone, tuple,
384 hash_conntrack_raw(tuple, zone));
385}
360EXPORT_SYMBOL_GPL(nf_conntrack_find_get); 386EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
361 387
362static void __nf_conntrack_hash_insert(struct nf_conn *ct, 388static void __nf_conntrack_hash_insert(struct nf_conn *ct,
@@ -409,8 +435,11 @@ __nf_conntrack_confirm(struct sk_buff *skb)
409 return NF_ACCEPT; 435 return NF_ACCEPT;
410 436
411 zone = nf_ct_zone(ct); 437 zone = nf_ct_zone(ct);
412 hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 438 /* reuse the hash saved before */
413 repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 439 hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
440 hash = hash_bucket(hash, net);
441 repl_hash = hash_conntrack(net, zone,
442 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
414 443
415 /* We're not in hash table, and we refuse to set up related 444 /* We're not in hash table, and we refuse to set up related
416 connections for unconfirmed conns. But packet copies and 445 connections for unconfirmed conns. But packet copies and
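
The confirm path above recovers the pre-computed hash from ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev, where a later hunk of this patch stores it at allocation time; while the conntrack is unconfirmed that pointer slot is otherwise unused, so it doubles as scratch space and saves one tuple hash per packet. A rough userspace sketch of the stash/recover trick, with hypothetical struct names:

	#include <stdio.h>

	struct list_node {
		struct list_node *next;
		struct list_node **pprev;
	};

	struct entry {
		struct list_node node;	/* hypothetical stand-in for the tuplehash node */
	};

	static void stash_hash(struct entry *e, unsigned long hash)
	{
		/* the entry is not linked into any list yet, so pprev is free */
		*(unsigned long *)&e->node.pprev = hash;
	}

	static unsigned long recover_hash(struct entry *e)
	{
		return *(unsigned long *)&e->node.pprev;
	}

	int main(void)
	{
		struct entry e = { { 0, 0 } };

		stash_hash(&e, 0x5eedUL);
		printf("recovered hash: %#lx\n", recover_hash(&e));
		return 0;
	}
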
@@ -567,17 +596,29 @@ static noinline int early_drop(struct net *net, unsigned int hash)
567 return dropped; 596 return dropped;
568} 597}
569 598
570struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone, 599static struct nf_conn *
571 const struct nf_conntrack_tuple *orig, 600__nf_conntrack_alloc(struct net *net, u16 zone,
572 const struct nf_conntrack_tuple *repl, 601 const struct nf_conntrack_tuple *orig,
573 gfp_t gfp) 602 const struct nf_conntrack_tuple *repl,
603 gfp_t gfp, u32 hash)
574{ 604{
575 struct nf_conn *ct; 605 struct nf_conn *ct;
576 606
577 if (unlikely(!nf_conntrack_hash_rnd_initted)) { 607 if (unlikely(!nf_conntrack_hash_rnd)) {
578 get_random_bytes(&nf_conntrack_hash_rnd, 608 unsigned int rand;
579 sizeof(nf_conntrack_hash_rnd)); 609
580 nf_conntrack_hash_rnd_initted = 1; 610 /*
611 * Why not initialize nf_conntrack_hash_rnd in an "init()" function?
612 * Because there isn't enough entropy when the system is initializing,
613 * and we initialize it as late as possible.
614 */
615 do {
616 get_random_bytes(&rand, sizeof(rand));
617 } while (!rand);
618 cmpxchg(&nf_conntrack_hash_rnd, 0, rand);
619
620 /* recompute the hash as nf_conntrack_hash_rnd is initialized */
621 hash = hash_conntrack_raw(orig, zone);
581 } 622 }
582 623
583 /* We don't want any race condition at early drop stage */ 624 /* We don't want any race condition at early drop stage */
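
The comment in this hunk explains the design choice: seeding nf_conntrack_hash_rnd is deferred to the first allocation because the entropy pool is weak during early boot, cmpxchg() makes concurrent first callers agree on a single seed, zero is reserved as the "not seeded yet" sentinel, and the raw hash is then recomputed since the earlier value was derived from the zero seed. A hedged userspace sketch of the same lazy one-time initialization pattern, using C11 atomics in place of cmpxchg() and rand() as a stand-in for get_random_bytes():

	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <time.h>

	static _Atomic unsigned int hash_rnd;	/* 0 means "not seeded yet" */

	static unsigned int get_seed(void)
	{
		unsigned int r;

		do {
			r = (unsigned int)rand();	/* placeholder entropy source */
		} while (!r);				/* 0 is reserved as the sentinel */
		return r;
	}

	static unsigned int hash_rnd_get(void)
	{
		unsigned int cur = atomic_load(&hash_rnd);

		if (!cur) {
			unsigned int expected = 0, fresh = get_seed();

			/* only the first caller installs a seed; losers reread it */
			atomic_compare_exchange_strong(&hash_rnd, &expected, fresh);
			cur = atomic_load(&hash_rnd);
		}
		return cur;
	}

	int main(void)
	{
		unsigned int first, second;

		srand((unsigned int)time(NULL));
		first = hash_rnd_get();
		second = hash_rnd_get();
		printf("seed = %#x, reread = %#x\n", first, second);	/* identical */
		return 0;
	}
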
@@ -585,8 +626,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
585 626
586 if (nf_conntrack_max && 627 if (nf_conntrack_max &&
587 unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { 628 unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
588 unsigned int hash = hash_conntrack(net, zone, orig); 629 if (!early_drop(net, hash_bucket(hash, net))) {
589 if (!early_drop(net, hash)) {
590 atomic_dec(&net->ct.count); 630 atomic_dec(&net->ct.count);
591 if (net_ratelimit()) 631 if (net_ratelimit())
592 printk(KERN_WARNING 632 printk(KERN_WARNING
@@ -616,7 +656,8 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
616 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; 656 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
617 ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; 657 ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
618 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; 658 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
619 ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL; 659 /* save hash for reusing when confirming */
660 *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
620 /* Don't set timer yet: wait for confirmation */ 661 /* Don't set timer yet: wait for confirmation */
621 setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct); 662 setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
622 write_pnet(&ct->ct_net, net); 663 write_pnet(&ct->ct_net, net);
@@ -643,6 +684,14 @@ out_free:
643 return ERR_PTR(-ENOMEM); 684 return ERR_PTR(-ENOMEM);
644#endif 685#endif
645} 686}
687
688struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
689 const struct nf_conntrack_tuple *orig,
690 const struct nf_conntrack_tuple *repl,
691 gfp_t gfp)
692{
693 return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
694}
646EXPORT_SYMBOL_GPL(nf_conntrack_alloc); 695EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
647 696
648void nf_conntrack_free(struct nf_conn *ct) 697void nf_conntrack_free(struct nf_conn *ct)
@@ -664,7 +713,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
664 struct nf_conntrack_l3proto *l3proto, 713 struct nf_conntrack_l3proto *l3proto,
665 struct nf_conntrack_l4proto *l4proto, 714 struct nf_conntrack_l4proto *l4proto,
666 struct sk_buff *skb, 715 struct sk_buff *skb,
667 unsigned int dataoff) 716 unsigned int dataoff, u32 hash)
668{ 717{
669 struct nf_conn *ct; 718 struct nf_conn *ct;
670 struct nf_conn_help *help; 719 struct nf_conn_help *help;
@@ -678,7 +727,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
678 return NULL; 727 return NULL;
679 } 728 }
680 729
681 ct = nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC); 730 ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
731 hash);
682 if (IS_ERR(ct)) { 732 if (IS_ERR(ct)) {
683 pr_debug("Can't allocate conntrack.\n"); 733 pr_debug("Can't allocate conntrack.\n");
684 return (struct nf_conntrack_tuple_hash *)ct; 734 return (struct nf_conntrack_tuple_hash *)ct;
@@ -755,6 +805,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
755 struct nf_conntrack_tuple_hash *h; 805 struct nf_conntrack_tuple_hash *h;
756 struct nf_conn *ct; 806 struct nf_conn *ct;
757 u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; 807 u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
808 u32 hash;
758 809
759 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), 810 if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
760 dataoff, l3num, protonum, &tuple, l3proto, 811 dataoff, l3num, protonum, &tuple, l3proto,
@@ -764,10 +815,11 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
764 } 815 }
765 816
766 /* look for tuple match */ 817 /* look for tuple match */
767 h = nf_conntrack_find_get(net, zone, &tuple); 818 hash = hash_conntrack_raw(&tuple, zone);
819 h = __nf_conntrack_find_get(net, zone, &tuple, hash);
768 if (!h) { 820 if (!h) {
769 h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, 821 h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
770 skb, dataoff); 822 skb, dataoff, hash);
771 if (!h) 823 if (!h)
772 return NULL; 824 return NULL;
773 if (IS_ERR(h)) 825 if (IS_ERR(h))
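
With this hunk the per-packet path computes hash_conntrack_raw() once in resolve_normal_ct() and threads the value through __nf_conntrack_find_get(), init_conntrack() and __nf_conntrack_alloc(); each consumer only reduces it to a bucket with hash_bucket(). A small standalone sketch of that raw-hash/bucket split (the hash function here is an arbitrary stand-in, not the kernel's jhash-based one):

	#include <stdio.h>

	/* arbitrary stand-in hash; the kernel hashes the tuple words instead */
	static unsigned int hash32(const char *key, unsigned int seed)
	{
		unsigned int h = seed ^ 2166136261u;	/* FNV-1a style mixing */

		while (*key)
			h = (h ^ (unsigned char)*key++) * 16777619u;
		return h;
	}

	/* reduce a raw 32-bit hash to a bucket index, as hash_bucket() does */
	static unsigned int bucket_of(unsigned int hash, unsigned int table_size)
	{
		return (unsigned int)(((unsigned long long)hash * table_size) >> 32);
	}

	int main(void)
	{
		const char *tuple = "192.0.2.1:1234->198.51.100.7:80/tcp";
		unsigned int raw = hash32(tuple, 0x5eed);

		/* one raw hash, reused against differently sized tables */
		printf("raw=%#x bucket(1024)=%u bucket(4096)=%u\n",
		       raw, bucket_of(raw, 1024), bucket_of(raw, 4096));
		return 0;
	}
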
@@ -1307,8 +1359,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
1307 ct = nf_ct_tuplehash_to_ctrack(h); 1359 ct = nf_ct_tuplehash_to_ctrack(h);
1308 hlist_nulls_del_rcu(&h->hnnode); 1360 hlist_nulls_del_rcu(&h->hnnode);
1309 bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct), 1361 bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct),
1310 hashsize, 1362 hashsize);
1311 nf_conntrack_hash_rnd);
1312 hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); 1363 hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
1313 } 1364 }
1314 } 1365 }
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index cdcc7649476b..5702de35e2bb 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -26,10 +26,10 @@
26 26
27static DEFINE_MUTEX(nf_ct_ecache_mutex); 27static DEFINE_MUTEX(nf_ct_ecache_mutex);
28 28
29struct nf_ct_event_notifier *nf_conntrack_event_cb __read_mostly; 29struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb __read_mostly;
30EXPORT_SYMBOL_GPL(nf_conntrack_event_cb); 30EXPORT_SYMBOL_GPL(nf_conntrack_event_cb);
31 31
32struct nf_exp_event_notifier *nf_expect_event_cb __read_mostly; 32struct nf_exp_event_notifier __rcu *nf_expect_event_cb __read_mostly;
33EXPORT_SYMBOL_GPL(nf_expect_event_cb); 33EXPORT_SYMBOL_GPL(nf_expect_event_cb);
34 34
35/* deliver cached events and clear cache entry - must be called with locally 35/* deliver cached events and clear cache entry - must be called with locally
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index acb29ccaa41f..46e8966912b1 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -38,25 +38,30 @@ static int nf_ct_expect_hash_rnd_initted __read_mostly;
38 38
39static struct kmem_cache *nf_ct_expect_cachep __read_mostly; 39static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
40 40
41static HLIST_HEAD(nf_ct_userspace_expect_list);
42
41/* nf_conntrack_expect helper functions */ 43/* nf_conntrack_expect helper functions */
42void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) 44void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
45 u32 pid, int report)
43{ 46{
44 struct nf_conn_help *master_help = nfct_help(exp->master); 47 struct nf_conn_help *master_help = nfct_help(exp->master);
45 struct net *net = nf_ct_exp_net(exp); 48 struct net *net = nf_ct_exp_net(exp);
46 49
47 NF_CT_ASSERT(master_help);
48 NF_CT_ASSERT(!timer_pending(&exp->timeout)); 50 NF_CT_ASSERT(!timer_pending(&exp->timeout));
49 51
50 hlist_del_rcu(&exp->hnode); 52 hlist_del_rcu(&exp->hnode);
51 net->ct.expect_count--; 53 net->ct.expect_count--;
52 54
53 hlist_del(&exp->lnode); 55 hlist_del(&exp->lnode);
54 master_help->expecting[exp->class]--; 56 if (!(exp->flags & NF_CT_EXPECT_USERSPACE))
57 master_help->expecting[exp->class]--;
58
59 nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report);
55 nf_ct_expect_put(exp); 60 nf_ct_expect_put(exp);
56 61
57 NF_CT_STAT_INC(net, expect_delete); 62 NF_CT_STAT_INC(net, expect_delete);
58} 63}
59EXPORT_SYMBOL_GPL(nf_ct_unlink_expect); 64EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report);
60 65
61static void nf_ct_expectation_timed_out(unsigned long ul_expect) 66static void nf_ct_expectation_timed_out(unsigned long ul_expect)
62{ 67{
@@ -320,16 +325,21 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
320 325
321 atomic_inc(&exp->use); 326 atomic_inc(&exp->use);
322 327
323 hlist_add_head(&exp->lnode, &master_help->expectations); 328 if (master_help) {
324 master_help->expecting[exp->class]++; 329 hlist_add_head(&exp->lnode, &master_help->expectations);
330 master_help->expecting[exp->class]++;
331 } else if (exp->flags & NF_CT_EXPECT_USERSPACE)
332 hlist_add_head(&exp->lnode, &nf_ct_userspace_expect_list);
325 333
326 hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]); 334 hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
327 net->ct.expect_count++; 335 net->ct.expect_count++;
328 336
329 setup_timer(&exp->timeout, nf_ct_expectation_timed_out, 337 setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
330 (unsigned long)exp); 338 (unsigned long)exp);
331 p = &master_help->helper->expect_policy[exp->class]; 339 if (master_help) {
332 exp->timeout.expires = jiffies + p->timeout * HZ; 340 p = &master_help->helper->expect_policy[exp->class];
341 exp->timeout.expires = jiffies + p->timeout * HZ;
342 }
333 add_timer(&exp->timeout); 343 add_timer(&exp->timeout);
334 344
335 atomic_inc(&exp->use); 345 atomic_inc(&exp->use);
@@ -380,7 +390,9 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
380 unsigned int h; 390 unsigned int h;
381 int ret = 1; 391 int ret = 1;
382 392
383 if (!master_help->helper) { 393 /* Don't allow expectations created from kernel-space with no helper */
394 if (!(expect->flags & NF_CT_EXPECT_USERSPACE) &&
395 (!master_help || (master_help && !master_help->helper))) {
384 ret = -ESHUTDOWN; 396 ret = -ESHUTDOWN;
385 goto out; 397 goto out;
386 } 398 }
@@ -398,13 +410,16 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
398 } 410 }
399 } 411 }
400 /* Will be over limit? */ 412 /* Will be over limit? */
401 p = &master_help->helper->expect_policy[expect->class]; 413 if (master_help) {
402 if (p->max_expected && 414 p = &master_help->helper->expect_policy[expect->class];
403 master_help->expecting[expect->class] >= p->max_expected) { 415 if (p->max_expected &&
404 evict_oldest_expect(master, expect); 416 master_help->expecting[expect->class] >= p->max_expected) {
405 if (master_help->expecting[expect->class] >= p->max_expected) { 417 evict_oldest_expect(master, expect);
406 ret = -EMFILE; 418 if (master_help->expecting[expect->class]
407 goto out; 419 >= p->max_expected) {
420 ret = -EMFILE;
421 goto out;
422 }
408 } 423 }
409 } 424 }
410 425
@@ -439,6 +454,21 @@ out:
439} 454}
440EXPORT_SYMBOL_GPL(nf_ct_expect_related_report); 455EXPORT_SYMBOL_GPL(nf_ct_expect_related_report);
441 456
457void nf_ct_remove_userspace_expectations(void)
458{
459 struct nf_conntrack_expect *exp;
460 struct hlist_node *n, *next;
461
462 hlist_for_each_entry_safe(exp, n, next,
463 &nf_ct_userspace_expect_list, lnode) {
464 if (del_timer(&exp->timeout)) {
465 nf_ct_unlink_expect(exp);
466 nf_ct_expect_put(exp);
467 }
468 }
469}
470EXPORT_SYMBOL_GPL(nf_ct_remove_userspace_expectations);
471
442#ifdef CONFIG_PROC_FS 472#ifdef CONFIG_PROC_FS
443struct ct_expect_iter_state { 473struct ct_expect_iter_state {
444 struct seq_net_private p; 474 struct seq_net_private p;
@@ -529,8 +559,12 @@ static int exp_seq_show(struct seq_file *s, void *v)
529 seq_printf(s, "PERMANENT"); 559 seq_printf(s, "PERMANENT");
530 delim = ","; 560 delim = ",";
531 } 561 }
532 if (expect->flags & NF_CT_EXPECT_INACTIVE) 562 if (expect->flags & NF_CT_EXPECT_INACTIVE) {
533 seq_printf(s, "%sINACTIVE", delim); 563 seq_printf(s, "%sINACTIVE", delim);
564 delim = ",";
565 }
566 if (expect->flags & NF_CT_EXPECT_USERSPACE)
567 seq_printf(s, "%sUSERSPACE", delim);
534 568
535 helper = rcu_dereference(nfct_help(expect->master)->helper); 569 helper = rcu_dereference(nfct_help(expect->master)->helper);
536 if (helper) { 570 if (helper) {
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index 7dcf7a404190..bd82450c193f 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -16,7 +16,7 @@
16#include <linux/skbuff.h> 16#include <linux/skbuff.h>
17#include <net/netfilter/nf_conntrack_extend.h> 17#include <net/netfilter/nf_conntrack_extend.h>
18 18
19static struct nf_ct_ext_type *nf_ct_ext_types[NF_CT_EXT_NUM]; 19static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM];
20static DEFINE_MUTEX(nf_ct_ext_type_mutex); 20static DEFINE_MUTEX(nf_ct_ext_type_mutex);
21 21
22void __nf_ct_ext_destroy(struct nf_conn *ct) 22void __nf_ct_ext_destroy(struct nf_conn *ct)
@@ -48,15 +48,17 @@ nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, gfp_t gfp)
48{ 48{
49 unsigned int off, len; 49 unsigned int off, len;
50 struct nf_ct_ext_type *t; 50 struct nf_ct_ext_type *t;
51 size_t alloc_size;
51 52
52 rcu_read_lock(); 53 rcu_read_lock();
53 t = rcu_dereference(nf_ct_ext_types[id]); 54 t = rcu_dereference(nf_ct_ext_types[id]);
54 BUG_ON(t == NULL); 55 BUG_ON(t == NULL);
55 off = ALIGN(sizeof(struct nf_ct_ext), t->align); 56 off = ALIGN(sizeof(struct nf_ct_ext), t->align);
56 len = off + t->len; 57 len = off + t->len;
58 alloc_size = t->alloc_size;
57 rcu_read_unlock(); 59 rcu_read_unlock();
58 60
59 *ext = kzalloc(t->alloc_size, gfp); 61 *ext = kzalloc(alloc_size, gfp);
60 if (!*ext) 62 if (!*ext)
61 return NULL; 63 return NULL;
62 64
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 5bae1cd15eea..b729ace1dcc1 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -22,6 +22,7 @@
22#include <linux/rculist_nulls.h> 22#include <linux/rculist_nulls.h>
23#include <linux/types.h> 23#include <linux/types.h>
24#include <linux/timer.h> 24#include <linux/timer.h>
25#include <linux/security.h>
25#include <linux/skbuff.h> 26#include <linux/skbuff.h>
26#include <linux/errno.h> 27#include <linux/errno.h>
27#include <linux/netlink.h> 28#include <linux/netlink.h>
@@ -245,16 +246,31 @@ nla_put_failure:
245 246
246#ifdef CONFIG_NF_CONNTRACK_SECMARK 247#ifdef CONFIG_NF_CONNTRACK_SECMARK
247static inline int 248static inline int
248ctnetlink_dump_secmark(struct sk_buff *skb, const struct nf_conn *ct) 249ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
249{ 250{
250 NLA_PUT_BE32(skb, CTA_SECMARK, htonl(ct->secmark)); 251 struct nlattr *nest_secctx;
251 return 0; 252 int len, ret;
253 char *secctx;
254
255 ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
256 if (ret)
257 return ret;
258
259 ret = -1;
260 nest_secctx = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED);
261 if (!nest_secctx)
262 goto nla_put_failure;
252 263
264 NLA_PUT_STRING(skb, CTA_SECCTX_NAME, secctx);
265 nla_nest_end(skb, nest_secctx);
266
267 ret = 0;
253nla_put_failure: 268nla_put_failure:
254 return -1; 269 security_release_secctx(secctx, len);
270 return ret;
255} 271}
256#else 272#else
257#define ctnetlink_dump_secmark(a, b) (0) 273#define ctnetlink_dump_secctx(a, b) (0)
258#endif 274#endif
259 275
260#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple) 276#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
@@ -391,7 +407,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
391 ctnetlink_dump_protoinfo(skb, ct) < 0 || 407 ctnetlink_dump_protoinfo(skb, ct) < 0 ||
392 ctnetlink_dump_helpinfo(skb, ct) < 0 || 408 ctnetlink_dump_helpinfo(skb, ct) < 0 ||
393 ctnetlink_dump_mark(skb, ct) < 0 || 409 ctnetlink_dump_mark(skb, ct) < 0 ||
394 ctnetlink_dump_secmark(skb, ct) < 0 || 410 ctnetlink_dump_secctx(skb, ct) < 0 ||
395 ctnetlink_dump_id(skb, ct) < 0 || 411 ctnetlink_dump_id(skb, ct) < 0 ||
396 ctnetlink_dump_use(skb, ct) < 0 || 412 ctnetlink_dump_use(skb, ct) < 0 ||
397 ctnetlink_dump_master(skb, ct) < 0 || 413 ctnetlink_dump_master(skb, ct) < 0 ||
@@ -437,6 +453,17 @@ ctnetlink_counters_size(const struct nf_conn *ct)
437 ; 453 ;
438} 454}
439 455
456#ifdef CONFIG_NF_CONNTRACK_SECMARK
457static int ctnetlink_nlmsg_secctx_size(const struct nf_conn *ct)
458{
459 int len;
460
461 security_secid_to_secctx(ct->secmark, NULL, &len);
462
463 return sizeof(char) * len;
464}
465#endif
466
440static inline size_t 467static inline size_t
441ctnetlink_nlmsg_size(const struct nf_conn *ct) 468ctnetlink_nlmsg_size(const struct nf_conn *ct)
442{ 469{
@@ -453,7 +480,8 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
453 + nla_total_size(0) /* CTA_HELP */ 480 + nla_total_size(0) /* CTA_HELP */
454 + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ 481 + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
455#ifdef CONFIG_NF_CONNTRACK_SECMARK 482#ifdef CONFIG_NF_CONNTRACK_SECMARK
456 + nla_total_size(sizeof(u_int32_t)) /* CTA_SECMARK */ 483 + nla_total_size(0) /* CTA_SECCTX */
484 + nla_total_size(ctnetlink_nlmsg_secctx_size(ct)) /* CTA_SECCTX_NAME */
457#endif 485#endif
458#ifdef CONFIG_NF_NAT_NEEDED 486#ifdef CONFIG_NF_NAT_NEEDED
459 + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ 487 + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
@@ -556,7 +584,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
556 584
557#ifdef CONFIG_NF_CONNTRACK_SECMARK 585#ifdef CONFIG_NF_CONNTRACK_SECMARK
558 if ((events & (1 << IPCT_SECMARK) || ct->secmark) 586 if ((events & (1 << IPCT_SECMARK) || ct->secmark)
559 && ctnetlink_dump_secmark(skb, ct) < 0) 587 && ctnetlink_dump_secctx(skb, ct) < 0)
560 goto nla_put_failure; 588 goto nla_put_failure;
561#endif 589#endif
562 590
@@ -1560,8 +1588,8 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
1560 const struct nf_conntrack_expect *exp) 1588 const struct nf_conntrack_expect *exp)
1561{ 1589{
1562 struct nf_conn *master = exp->master; 1590 struct nf_conn *master = exp->master;
1563 struct nf_conntrack_helper *helper;
1564 long timeout = (exp->timeout.expires - jiffies) / HZ; 1591 long timeout = (exp->timeout.expires - jiffies) / HZ;
1592 struct nf_conn_help *help;
1565 1593
1566 if (timeout < 0) 1594 if (timeout < 0)
1567 timeout = 0; 1595 timeout = 0;
@@ -1577,9 +1605,15 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
1577 1605
1578 NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)); 1606 NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout));
1579 NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)); 1607 NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp));
1580 helper = rcu_dereference(nfct_help(master)->helper); 1608 NLA_PUT_BE32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags));
1581 if (helper) 1609 help = nfct_help(master);
1582 NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name); 1610 if (help) {
1611 struct nf_conntrack_helper *helper;
1612
1613 helper = rcu_dereference(help->helper);
1614 if (helper)
1615 NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name);
1616 }
1583 1617
1584 return 0; 1618 return 0;
1585 1619
@@ -1626,17 +1660,20 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
1626 struct nlmsghdr *nlh; 1660 struct nlmsghdr *nlh;
1627 struct nfgenmsg *nfmsg; 1661 struct nfgenmsg *nfmsg;
1628 struct sk_buff *skb; 1662 struct sk_buff *skb;
1629 unsigned int type; 1663 unsigned int type, group;
1630 int flags = 0; 1664 int flags = 0;
1631 1665
1632 if (events & (1 << IPEXP_NEW)) { 1666 if (events & (1 << IPEXP_DESTROY)) {
1667 type = IPCTNL_MSG_EXP_DELETE;
1668 group = NFNLGRP_CONNTRACK_EXP_DESTROY;
1669 } else if (events & (1 << IPEXP_NEW)) {
1633 type = IPCTNL_MSG_EXP_NEW; 1670 type = IPCTNL_MSG_EXP_NEW;
1634 flags = NLM_F_CREATE|NLM_F_EXCL; 1671 flags = NLM_F_CREATE|NLM_F_EXCL;
1672 group = NFNLGRP_CONNTRACK_EXP_NEW;
1635 } else 1673 } else
1636 return 0; 1674 return 0;
1637 1675
1638 if (!item->report && 1676 if (!item->report && !nfnetlink_has_listeners(net, group))
1639 !nfnetlink_has_listeners(net, NFNLGRP_CONNTRACK_EXP_NEW))
1640 return 0; 1677 return 0;
1641 1678
1642 skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); 1679 skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
@@ -1659,8 +1696,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
1659 rcu_read_unlock(); 1696 rcu_read_unlock();
1660 1697
1661 nlmsg_end(skb, nlh); 1698 nlmsg_end(skb, nlh);
1662 nfnetlink_send(skb, net, item->pid, NFNLGRP_CONNTRACK_EXP_NEW, 1699 nfnetlink_send(skb, net, item->pid, group, item->report, GFP_ATOMIC);
1663 item->report, GFP_ATOMIC);
1664 return 0; 1700 return 0;
1665 1701
1666nla_put_failure: 1702nla_put_failure:
@@ -1733,6 +1769,8 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
1733 [CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 }, 1769 [CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 },
1734 [CTA_EXPECT_ID] = { .type = NLA_U32 }, 1770 [CTA_EXPECT_ID] = { .type = NLA_U32 },
1735 [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING }, 1771 [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING },
1772 [CTA_EXPECT_ZONE] = { .type = NLA_U16 },
1773 [CTA_EXPECT_FLAGS] = { .type = NLA_U32 },
1736}; 1774};
1737 1775
1738static int 1776static int
@@ -1841,7 +1879,13 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
1841 } 1879 }
1842 1880
1843 /* after list removal, usage count == 1 */ 1881 /* after list removal, usage count == 1 */
1844 nf_ct_unexpect_related(exp); 1882 spin_lock_bh(&nf_conntrack_lock);
1883 if (del_timer(&exp->timeout)) {
1884 nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).pid,
1885 nlmsg_report(nlh));
1886 nf_ct_expect_put(exp);
1887 }
1888 spin_unlock_bh(&nf_conntrack_lock);
1845 /* have to put what we 'get' above. 1889 /* have to put what we 'get' above.
1846 * after this line usage count == 0 */ 1890 * after this line usage count == 0 */
1847 nf_ct_expect_put(exp); 1891 nf_ct_expect_put(exp);
@@ -1858,7 +1902,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
1858 m_help = nfct_help(exp->master); 1902 m_help = nfct_help(exp->master);
1859 if (!strcmp(m_help->helper->name, name) && 1903 if (!strcmp(m_help->helper->name, name) &&
1860 del_timer(&exp->timeout)) { 1904 del_timer(&exp->timeout)) {
1861 nf_ct_unlink_expect(exp); 1905 nf_ct_unlink_expect_report(exp,
1906 NETLINK_CB(skb).pid,
1907 nlmsg_report(nlh));
1862 nf_ct_expect_put(exp); 1908 nf_ct_expect_put(exp);
1863 } 1909 }
1864 } 1910 }
@@ -1872,7 +1918,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
1872 &net->ct.expect_hash[i], 1918 &net->ct.expect_hash[i],
1873 hnode) { 1919 hnode) {
1874 if (del_timer(&exp->timeout)) { 1920 if (del_timer(&exp->timeout)) {
1875 nf_ct_unlink_expect(exp); 1921 nf_ct_unlink_expect_report(exp,
1922 NETLINK_CB(skb).pid,
1923 nlmsg_report(nlh));
1876 nf_ct_expect_put(exp); 1924 nf_ct_expect_put(exp);
1877 } 1925 }
1878 } 1926 }
@@ -1918,23 +1966,35 @@ ctnetlink_create_expect(struct net *net, u16 zone,
1918 if (!h) 1966 if (!h)
1919 return -ENOENT; 1967 return -ENOENT;
1920 ct = nf_ct_tuplehash_to_ctrack(h); 1968 ct = nf_ct_tuplehash_to_ctrack(h);
1921 help = nfct_help(ct);
1922
1923 if (!help || !help->helper) {
1924 /* such conntrack hasn't got any helper, abort */
1925 err = -EOPNOTSUPP;
1926 goto out;
1927 }
1928
1929 exp = nf_ct_expect_alloc(ct); 1969 exp = nf_ct_expect_alloc(ct);
1930 if (!exp) { 1970 if (!exp) {
1931 err = -ENOMEM; 1971 err = -ENOMEM;
1932 goto out; 1972 goto out;
1933 } 1973 }
1974 help = nfct_help(ct);
1975 if (!help) {
1976 if (!cda[CTA_EXPECT_TIMEOUT]) {
1977 err = -EINVAL;
1978 goto out;
1979 }
1980 exp->timeout.expires =
1981 jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ;
1982
1983 exp->flags = NF_CT_EXPECT_USERSPACE;
1984 if (cda[CTA_EXPECT_FLAGS]) {
1985 exp->flags |=
1986 ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
1987 }
1988 } else {
1989 if (cda[CTA_EXPECT_FLAGS]) {
1990 exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
1991 exp->flags &= ~NF_CT_EXPECT_USERSPACE;
1992 } else
1993 exp->flags = 0;
1994 }
1934 1995
1935 exp->class = 0; 1996 exp->class = 0;
1936 exp->expectfn = NULL; 1997 exp->expectfn = NULL;
1937 exp->flags = 0;
1938 exp->master = ct; 1998 exp->master = ct;
1939 exp->helper = NULL; 1999 exp->helper = NULL;
1940 memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple)); 2000 memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple));
@@ -2102,6 +2162,7 @@ static void __exit ctnetlink_exit(void)
2102{ 2162{
2103 pr_info("ctnetlink: unregistering from nfnetlink.\n"); 2163 pr_info("ctnetlink: unregistering from nfnetlink.\n");
2104 2164
2165 nf_ct_remove_userspace_expectations();
2105#ifdef CONFIG_NF_CONNTRACK_EVENTS 2166#ifdef CONFIG_NF_CONNTRACK_EVENTS
2106 nf_ct_expect_unregister_notifier(&ctnl_notifier_exp); 2167 nf_ct_expect_unregister_notifier(&ctnl_notifier_exp);
2107 nf_conntrack_unregister_notifier(&ctnl_notifier); 2168 nf_conntrack_unregister_notifier(&ctnl_notifier);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 5886ba1d52a0..ed6d92958023 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -28,8 +28,8 @@
28#include <net/netfilter/nf_conntrack_l4proto.h> 28#include <net/netfilter/nf_conntrack_l4proto.h>
29#include <net/netfilter/nf_conntrack_core.h> 29#include <net/netfilter/nf_conntrack_core.h>
30 30
31static struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly; 31static struct nf_conntrack_l4proto __rcu **nf_ct_protos[PF_MAX] __read_mostly;
32struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX] __read_mostly; 32struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX] __read_mostly;
33EXPORT_SYMBOL_GPL(nf_ct_l3protos); 33EXPORT_SYMBOL_GPL(nf_ct_l3protos);
34 34
35static DEFINE_MUTEX(nf_ct_proto_mutex); 35static DEFINE_MUTEX(nf_ct_proto_mutex);
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index c4c885dca3bd..3fb2b73b24dc 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -329,8 +329,8 @@ static unsigned int get_conntrack_index(const struct tcphdr *tcph)
329/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering 329/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
330 in IP Filter' by Guido van Rooij. 330 in IP Filter' by Guido van Rooij.
331 331
332 http://www.nluug.nl/events/sane2000/papers.html 332 http://www.sane.nl/events/sane2000/papers.html
333 http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz 333 http://www.darkart.com/mirrors/www.obfuscation.org/ipf/
334 334
335 The boundaries and the conditions are changed according to RFC793: 335 The boundaries and the conditions are changed according to RFC793:
336 the packet must intersect the window (i.e. segments may be 336 the packet must intersect the window (i.e. segments may be
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 53d892210a04..bcf47eb518ef 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -130,6 +130,44 @@ static int digits_len(const struct nf_conn *ct, const char *dptr,
130 return len; 130 return len;
131} 131}
132 132
133static int iswordc(const char c)
134{
135 if (isalnum(c) || c == '!' || c == '"' || c == '%' ||
136 (c >= '(' && c <= '/') || c == ':' || c == '<' || c == '>' ||
137 c == '?' || (c >= '[' && c <= ']') || c == '_' || c == '`' ||
138 c == '{' || c == '}' || c == '~')
139 return 1;
140 return 0;
141}
142
143static int word_len(const char *dptr, const char *limit)
144{
145 int len = 0;
146 while (dptr < limit && iswordc(*dptr)) {
147 dptr++;
148 len++;
149 }
150 return len;
151}
152
153static int callid_len(const struct nf_conn *ct, const char *dptr,
154 const char *limit, int *shift)
155{
156 int len, domain_len;
157
158 len = word_len(dptr, limit);
159 dptr += len;
160 if (!len || dptr == limit || *dptr != '@')
161 return len;
162 dptr++;
163 len++;
164
165 domain_len = word_len(dptr, limit);
166 if (!domain_len)
167 return 0;
168 return len + domain_len;
169}
170
133/* get media type + port length */ 171/* get media type + port length */
134static int media_len(const struct nf_conn *ct, const char *dptr, 172static int media_len(const struct nf_conn *ct, const char *dptr,
135 const char *limit, int *shift) 173 const char *limit, int *shift)
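
The new Call-Id parser in the hunk above accepts either a bare RFC 3261 "word" or word@word and reports length 0 when the domain part after '@' is missing. A standalone illustration follows; it copies the parsing logic but drops the unused conntrack and shift parameters of the kernel helpers and feeds a few sample header values through it:

	#include <ctype.h>
	#include <stdio.h>
	#include <string.h>

	static int iswordc(const char c)
	{
		if (isalnum((unsigned char)c) || c == '!' || c == '"' || c == '%' ||
		    (c >= '(' && c <= '/') || c == ':' || c == '<' || c == '>' ||
		    c == '?' || (c >= '[' && c <= ']') || c == '_' || c == '`' ||
		    c == '{' || c == '}' || c == '~')
			return 1;
		return 0;
	}

	static int word_len(const char *dptr, const char *limit)
	{
		int len = 0;

		while (dptr < limit && iswordc(*dptr)) {
			dptr++;
			len++;
		}
		return len;
	}

	static int callid_len(const char *dptr, const char *limit)
	{
		int len, domain_len;

		len = word_len(dptr, limit);
		dptr += len;
		if (!len || dptr == limit || *dptr != '@')
			return len;
		dptr++;
		len++;

		domain_len = word_len(dptr, limit);
		if (!domain_len)
			return 0;
		return len + domain_len;
	}

	int main(void)
	{
		const char *samples[] = {
			"a84b4c76e66710",
			"f81d4fae@bigbox.example.com",
			"broken@",
		};
		unsigned int i;

		for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
			printf("%-28s -> %d\n", samples[i],
			       callid_len(samples[i], samples[i] + strlen(samples[i])));
		return 0;
	}
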
@@ -152,6 +190,9 @@ static int parse_addr(const struct nf_conn *ct, const char *cp,
152 const char *end; 190 const char *end;
153 int ret = 0; 191 int ret = 0;
154 192
193 if (!ct)
194 return 0;
195
155 memset(addr, 0, sizeof(*addr)); 196 memset(addr, 0, sizeof(*addr));
156 switch (nf_ct_l3num(ct)) { 197 switch (nf_ct_l3num(ct)) {
157 case AF_INET: 198 case AF_INET:
@@ -296,6 +337,7 @@ static const struct sip_header ct_sip_hdrs[] = {
296 [SIP_HDR_VIA_TCP] = SIP_HDR("Via", "v", "TCP ", epaddr_len), 337 [SIP_HDR_VIA_TCP] = SIP_HDR("Via", "v", "TCP ", epaddr_len),
297 [SIP_HDR_EXPIRES] = SIP_HDR("Expires", NULL, NULL, digits_len), 338 [SIP_HDR_EXPIRES] = SIP_HDR("Expires", NULL, NULL, digits_len),
298 [SIP_HDR_CONTENT_LENGTH] = SIP_HDR("Content-Length", "l", NULL, digits_len), 339 [SIP_HDR_CONTENT_LENGTH] = SIP_HDR("Content-Length", "l", NULL, digits_len),
340 [SIP_HDR_CALL_ID] = SIP_HDR("Call-Id", "i", NULL, callid_len),
299}; 341};
300 342
301static const char *sip_follow_continuation(const char *dptr, const char *limit) 343static const char *sip_follow_continuation(const char *dptr, const char *limit)
@@ -1376,7 +1418,7 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,
1376 unsigned int msglen, origlen; 1418 unsigned int msglen, origlen;
1377 const char *dptr, *end; 1419 const char *dptr, *end;
1378 s16 diff, tdiff = 0; 1420 s16 diff, tdiff = 0;
1379 int ret; 1421 int ret = NF_ACCEPT;
1380 typeof(nf_nat_sip_seq_adjust_hook) nf_nat_sip_seq_adjust; 1422 typeof(nf_nat_sip_seq_adjust_hook) nf_nat_sip_seq_adjust;
1381 1423
1382 if (ctinfo != IP_CT_ESTABLISHED && 1424 if (ctinfo != IP_CT_ESTABLISHED &&
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index eb973fcd67ab..0fb65705b44b 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -15,6 +15,7 @@
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/percpu.h> 16#include <linux/percpu.h>
17#include <linux/netdevice.h> 17#include <linux/netdevice.h>
18#include <linux/security.h>
18#include <net/net_namespace.h> 19#include <net/net_namespace.h>
19#ifdef CONFIG_SYSCTL 20#ifdef CONFIG_SYSCTL
20#include <linux/sysctl.h> 21#include <linux/sysctl.h>
@@ -108,6 +109,29 @@ static void ct_seq_stop(struct seq_file *s, void *v)
108 rcu_read_unlock(); 109 rcu_read_unlock();
109} 110}
110 111
112#ifdef CONFIG_NF_CONNTRACK_SECMARK
113static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
114{
115 int ret;
116 u32 len;
117 char *secctx;
118
119 ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
120 if (ret)
121 return ret;
122
123 ret = seq_printf(s, "secctx=%s ", secctx);
124
125 security_release_secctx(secctx, len);
126 return ret;
127}
128#else
129static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
130{
131 return 0;
132}
133#endif
134
111/* return 0 on success, 1 in case of error */ 135/* return 0 on success, 1 in case of error */
112static int ct_seq_show(struct seq_file *s, void *v) 136static int ct_seq_show(struct seq_file *s, void *v)
113{ 137{
@@ -168,10 +192,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
168 goto release; 192 goto release;
169#endif 193#endif
170 194
171#ifdef CONFIG_NF_CONNTRACK_SECMARK 195 if (ct_show_secctx(s, ct))
172 if (seq_printf(s, "secmark=%u ", ct->secmark))
173 goto release; 196 goto release;
174#endif
175 197
176#ifdef CONFIG_NF_CONNTRACK_ZONES 198#ifdef CONFIG_NF_CONNTRACK_ZONES
177 if (seq_printf(s, "zone=%u ", nf_ct_zone(ct))) 199 if (seq_printf(s, "zone=%u ", nf_ct_zone(ct)))
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 7df37fd786bc..b07393eab88e 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -16,7 +16,7 @@
16#define NF_LOG_PREFIXLEN 128 16#define NF_LOG_PREFIXLEN 128
17#define NFLOGGER_NAME_LEN 64 17#define NFLOGGER_NAME_LEN 64
18 18
19static const struct nf_logger *nf_loggers[NFPROTO_NUMPROTO] __read_mostly; 19static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
20static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly; 20static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly;
21static DEFINE_MUTEX(nf_log_mutex); 21static DEFINE_MUTEX(nf_log_mutex);
22 22
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 78b3cf9c519c..74aebed5bd28 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -18,7 +18,7 @@
18 * long term mutex. The handler must provide an outfn() to accept packets 18 * long term mutex. The handler must provide an outfn() to accept packets
19 * for queueing and must reinject all packets it receives, no matter what. 19 * for queueing and must reinject all packets it receives, no matter what.
20 */ 20 */
21static const struct nf_queue_handler *queue_handler[NFPROTO_NUMPROTO] __read_mostly; 21static const struct nf_queue_handler __rcu *queue_handler[NFPROTO_NUMPROTO] __read_mostly;
22 22
23static DEFINE_MUTEX(queue_handler_mutex); 23static DEFINE_MUTEX(queue_handler_mutex);
24 24
diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c
index 5490fc37c92d..4d87befb04c0 100644
--- a/net/netfilter/nf_tproxy_core.c
+++ b/net/netfilter/nf_tproxy_core.c
@@ -18,41 +18,6 @@
18#include <net/udp.h> 18#include <net/udp.h>
19#include <net/netfilter/nf_tproxy_core.h> 19#include <net/netfilter/nf_tproxy_core.h>
20 20
21struct sock *
22nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
23 const __be32 saddr, const __be32 daddr,
24 const __be16 sport, const __be16 dport,
25 const struct net_device *in, bool listening_only)
26{
27 struct sock *sk;
28
29 /* look up socket */
30 switch (protocol) {
31 case IPPROTO_TCP:
32 if (listening_only)
33 sk = __inet_lookup_listener(net, &tcp_hashinfo,
34 daddr, ntohs(dport),
35 in->ifindex);
36 else
37 sk = __inet_lookup(net, &tcp_hashinfo,
38 saddr, sport, daddr, dport,
39 in->ifindex);
40 break;
41 case IPPROTO_UDP:
42 sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
43 in->ifindex);
44 break;
45 default:
46 WARN_ON(1);
47 sk = NULL;
48 }
49
50 pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, listener only: %d, sock %p\n",
51 protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), listening_only, sk);
52
53 return sk;
54}
55EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v4);
56 21
57static void 22static void
58nf_tproxy_destructor(struct sk_buff *skb) 23nf_tproxy_destructor(struct sk_buff *skb)
@@ -70,7 +35,11 @@ nf_tproxy_destructor(struct sk_buff *skb)
70int 35int
71nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) 36nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
72{ 37{
73 if (inet_sk(sk)->transparent) { 38 bool transparent = (sk->sk_state == TCP_TIME_WAIT) ?
39 inet_twsk(sk)->tw_transparent :
40 inet_sk(sk)->transparent;
41
42 if (transparent) {
74 skb_orphan(skb); 43 skb_orphan(skb);
75 skb->sk = sk; 44 skb->sk = sk;
76 skb->destructor = nf_tproxy_destructor; 45 skb->destructor = nf_tproxy_destructor;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index e34622fa0003..80463507420e 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -116,10 +116,8 @@ EXPORT_SYMBOL(xt_register_targets);
116void 116void
117xt_unregister_targets(struct xt_target *target, unsigned int n) 117xt_unregister_targets(struct xt_target *target, unsigned int n)
118{ 118{
119 unsigned int i; 119 while (n-- > 0)
120 120 xt_unregister_target(&target[n]);
121 for (i = 0; i < n; i++)
122 xt_unregister_target(&target[i]);
123} 121}
124EXPORT_SYMBOL(xt_unregister_targets); 122EXPORT_SYMBOL(xt_unregister_targets);
125 123
@@ -174,10 +172,8 @@ EXPORT_SYMBOL(xt_register_matches);
174void 172void
175xt_unregister_matches(struct xt_match *match, unsigned int n) 173xt_unregister_matches(struct xt_match *match, unsigned int n)
176{ 174{
177 unsigned int i; 175 while (n-- > 0)
178 176 xt_unregister_match(&match[n]);
179 for (i = 0; i < n; i++)
180 xt_unregister_match(&match[i]);
181} 177}
182EXPORT_SYMBOL(xt_unregister_matches); 178EXPORT_SYMBOL(xt_unregister_matches);
183 179
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 0cb6053f02fd..782e51986a6f 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -9,7 +9,6 @@
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/gfp.h> 10#include <linux/gfp.h>
11#include <linux/skbuff.h> 11#include <linux/skbuff.h>
12#include <linux/selinux.h>
13#include <linux/netfilter_ipv4/ip_tables.h> 12#include <linux/netfilter_ipv4/ip_tables.h>
14#include <linux/netfilter_ipv6/ip6_tables.h> 13#include <linux/netfilter_ipv6/ip6_tables.h>
15#include <linux/netfilter/x_tables.h> 14#include <linux/netfilter/x_tables.h>
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index 23b2d6c486b5..9faf5e050b79 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -14,8 +14,8 @@
14 */ 14 */
15#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 15#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/security.h>
17#include <linux/skbuff.h> 18#include <linux/skbuff.h>
18#include <linux/selinux.h>
19#include <linux/netfilter/x_tables.h> 19#include <linux/netfilter/x_tables.h>
20#include <linux/netfilter/xt_SECMARK.h> 20#include <linux/netfilter/xt_SECMARK.h>
21 21
@@ -39,9 +39,8 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
39 39
40 switch (mode) { 40 switch (mode) {
41 case SECMARK_MODE_SEL: 41 case SECMARK_MODE_SEL:
42 secmark = info->u.sel.selsid; 42 secmark = info->secid;
43 break; 43 break;
44
45 default: 44 default:
46 BUG(); 45 BUG();
47 } 46 }
@@ -50,33 +49,33 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
50 return XT_CONTINUE; 49 return XT_CONTINUE;
51} 50}
52 51
53static int checkentry_selinux(struct xt_secmark_target_info *info) 52static int checkentry_lsm(struct xt_secmark_target_info *info)
54{ 53{
55 int err; 54 int err;
56 struct xt_secmark_target_selinux_info *sel = &info->u.sel;
57 55
58 sel->selctx[SECMARK_SELCTX_MAX - 1] = '\0'; 56 info->secctx[SECMARK_SECCTX_MAX - 1] = '\0';
57 info->secid = 0;
59 58
60 err = selinux_string_to_sid(sel->selctx, &sel->selsid); 59 err = security_secctx_to_secid(info->secctx, strlen(info->secctx),
60 &info->secid);
61 if (err) { 61 if (err) {
62 if (err == -EINVAL) 62 if (err == -EINVAL)
63 pr_info("invalid SELinux context \'%s\'\n", 63 pr_info("invalid security context \'%s\'\n", info->secctx);
64 sel->selctx);
65 return err; 64 return err;
66 } 65 }
67 66
68 if (!sel->selsid) { 67 if (!info->secid) {
69 pr_info("unable to map SELinux context \'%s\'\n", sel->selctx); 68 pr_info("unable to map security context \'%s\'\n", info->secctx);
70 return -ENOENT; 69 return -ENOENT;
71 } 70 }
72 71
73 err = selinux_secmark_relabel_packet_permission(sel->selsid); 72 err = security_secmark_relabel_packet(info->secid);
74 if (err) { 73 if (err) {
75 pr_info("unable to obtain relabeling permission\n"); 74 pr_info("unable to obtain relabeling permission\n");
76 return err; 75 return err;
77 } 76 }
78 77
79 selinux_secmark_refcount_inc(); 78 security_secmark_refcount_inc();
80 return 0; 79 return 0;
81} 80}
82 81
@@ -100,16 +99,16 @@ static int secmark_tg_check(const struct xt_tgchk_param *par)
100 99
101 switch (info->mode) { 100 switch (info->mode) {
102 case SECMARK_MODE_SEL: 101 case SECMARK_MODE_SEL:
103 err = checkentry_selinux(info);
104 if (err <= 0)
105 return err;
106 break; 102 break;
107
108 default: 103 default:
109 pr_info("invalid mode: %hu\n", info->mode); 104 pr_info("invalid mode: %hu\n", info->mode);
110 return -EINVAL; 105 return -EINVAL;
111 } 106 }
112 107
108 err = checkentry_lsm(info);
109 if (err)
110 return err;
111
113 if (!mode) 112 if (!mode)
114 mode = info->mode; 113 mode = info->mode;
115 return 0; 114 return 0;
@@ -119,7 +118,7 @@ static void secmark_tg_destroy(const struct xt_tgdtor_param *par)
119{ 118{
120 switch (mode) { 119 switch (mode) {
121 case SECMARK_MODE_SEL: 120 case SECMARK_MODE_SEL:
122 selinux_secmark_refcount_dec(); 121 security_secmark_refcount_dec();
123 } 122 }
124} 123}
125 124
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index c61294d85fda..640678f47a2a 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Transparent proxy support for Linux/iptables 2 * Transparent proxy support for Linux/iptables
3 * 3 *
4 * Copyright (c) 2006-2007 BalaBit IT Ltd. 4 * Copyright (c) 2006-2010 BalaBit IT Ltd.
5 * Author: Balazs Scheidler, Krisztian Kovacs 5 * Author: Balazs Scheidler, Krisztian Kovacs
6 * 6 *
7 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
@@ -16,19 +16,98 @@
16#include <net/checksum.h> 16#include <net/checksum.h>
17#include <net/udp.h> 17#include <net/udp.h>
18#include <net/inet_sock.h> 18#include <net/inet_sock.h>
19 19#include <linux/inetdevice.h>
20#include <linux/netfilter/x_tables.h> 20#include <linux/netfilter/x_tables.h>
21#include <linux/netfilter_ipv4/ip_tables.h> 21#include <linux/netfilter_ipv4/ip_tables.h>
22#include <linux/netfilter/xt_TPROXY.h>
23 22
24#include <net/netfilter/ipv4/nf_defrag_ipv4.h> 23#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
24
25#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
26#define XT_TPROXY_HAVE_IPV6 1
27#include <net/if_inet6.h>
28#include <net/addrconf.h>
29#include <linux/netfilter_ipv6/ip6_tables.h>
30#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
31#endif
32
25#include <net/netfilter/nf_tproxy_core.h> 33#include <net/netfilter/nf_tproxy_core.h>
34#include <linux/netfilter/xt_TPROXY.h>
35
36static inline __be32
37tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
38{
39 struct in_device *indev;
40 __be32 laddr;
41
42 if (user_laddr)
43 return user_laddr;
44
45 laddr = 0;
46 rcu_read_lock();
47 indev = __in_dev_get_rcu(skb->dev);
48 for_primary_ifa(indev) {
49 laddr = ifa->ifa_local;
50 break;
51 } endfor_ifa(indev);
52 rcu_read_unlock();
53
54 return laddr ? laddr : daddr;
55}
56
57/**
58 * tproxy_handle_time_wait4() - handle IPv4 TCP TIME_WAIT reopen redirections
59 * @skb: The skb being processed.
60 * @laddr: IPv4 address to redirect to or zero.
61 * @lport: TCP port to redirect to or zero.
62 * @sk: The TIME_WAIT TCP socket found by the lookup.
63 *
64 * We have to handle SYN packets arriving to TIME_WAIT sockets
65 * differently: instead of reopening the connection we should rather
66 * redirect the new connection to the proxy if there's a listener
67 * socket present.
68 *
69 * tproxy_handle_time_wait4() consumes the socket reference passed in.
70 *
71 * Returns the listener socket if there's one, the TIME_WAIT socket if
72 * no such listener is found, or NULL if the TCP header is incomplete.
73 */
74static struct sock *
75tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
76 struct sock *sk)
77{
78 const struct iphdr *iph = ip_hdr(skb);
79 struct tcphdr _hdr, *hp;
80
81 hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
82 if (hp == NULL) {
83 inet_twsk_put(inet_twsk(sk));
84 return NULL;
85 }
86
87 if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
88 /* SYN to a TIME_WAIT socket, we'd rather redirect it
89 * to a listener socket if there's one */
90 struct sock *sk2;
91
92 sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
93 iph->saddr, laddr ? laddr : iph->daddr,
94 hp->source, lport ? lport : hp->dest,
95 skb->dev, NFT_LOOKUP_LISTENER);
96 if (sk2) {
97 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
98 inet_twsk_put(inet_twsk(sk));
99 sk = sk2;
100 }
101 }
102
103 return sk;
104}
26 105
27static unsigned int 106static unsigned int
28tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par) 107tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
108 u_int32_t mark_mask, u_int32_t mark_value)
29{ 109{
30 const struct iphdr *iph = ip_hdr(skb); 110 const struct iphdr *iph = ip_hdr(skb);
31 const struct xt_tproxy_target_info *tgi = par->targinfo;
32 struct udphdr _hdr, *hp; 111 struct udphdr _hdr, *hp;
33 struct sock *sk; 112 struct sock *sk;
34 113
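
tproxy_handle_time_wait4() only re-steers a segment that is a pure connection-opening SYN (SYN set, RST/ACK/FIN clear); anything else keeps its TIME_WAIT socket. A trivial sketch of that flag test, using a made-up flags struct rather than a real TCP header:

	#include <stdbool.h>
	#include <stdio.h>

	struct tcp_flags { bool syn, rst, ack, fin; };

	static bool is_new_connection_attempt(struct tcp_flags f)
	{
		return f.syn && !f.rst && !f.ack && !f.fin;
	}

	int main(void)
	{
		struct tcp_flags syn = { .syn = true };
		struct tcp_flags synack = { .syn = true, .ack = true };

		printf("SYN: %d, SYN+ACK: %d\n",
		       is_new_connection_attempt(syn),
		       is_new_connection_attempt(synack));
		return 0;
	}
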
@@ -36,12 +115,195 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
36 if (hp == NULL) 115 if (hp == NULL)
37 return NF_DROP; 116 return NF_DROP;
38 117
118 /* check if there's an ongoing connection on the packet
119 * addresses, this happens if the redirect already happened
120 * and the current packet belongs to an already established
121 * connection */
39 sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, 122 sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
40 iph->saddr, 123 iph->saddr, iph->daddr,
41 tgi->laddr ? tgi->laddr : iph->daddr, 124 hp->source, hp->dest,
42 hp->source, 125 skb->dev, NFT_LOOKUP_ESTABLISHED);
43 tgi->lport ? tgi->lport : hp->dest, 126
44 par->in, true); 127 laddr = tproxy_laddr4(skb, laddr, iph->daddr);
128 if (!lport)
129 lport = hp->dest;
130
131 /* UDP has no TCP_TIME_WAIT state, so we never enter here */
132 if (sk && sk->sk_state == TCP_TIME_WAIT)
133 /* reopening a TIME_WAIT connection needs special handling */
134 sk = tproxy_handle_time_wait4(skb, laddr, lport, sk);
135 else if (!sk)
136 /* no, there's no established connection, check if
137 * there's a listener on the redirected addr/port */
138 sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
139 iph->saddr, laddr,
140 hp->source, lport,
141 skb->dev, NFT_LOOKUP_LISTENER);
142
143 /* NOTE: assign_sock consumes our sk reference */
144 if (sk && nf_tproxy_assign_sock(skb, sk)) {
145 /* This should be in a separate target, but we don't do multiple
146 targets on the same rule yet */
147 skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
148
149 pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
150 iph->protocol, &iph->daddr, ntohs(hp->dest),
151 &laddr, ntohs(lport), skb->mark);
152 return NF_ACCEPT;
153 }
154
155 pr_debug("no socket, dropping: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
156 iph->protocol, &iph->saddr, ntohs(hp->source),
157 &iph->daddr, ntohs(hp->dest), skb->mark);
158 return NF_DROP;
159}
160
161static unsigned int
162tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par)
163{
164 const struct xt_tproxy_target_info *tgi = par->targinfo;
165
166 return tproxy_tg4(skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value);
167}
168
169static unsigned int
170tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
171{
172 const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
173
174 return tproxy_tg4(skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value);
175}
176
177#ifdef XT_TPROXY_HAVE_IPV6
178
179static inline const struct in6_addr *
180tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
181 const struct in6_addr *daddr)
182{
183 struct inet6_dev *indev;
184 struct inet6_ifaddr *ifa;
185 struct in6_addr *laddr;
186
187 if (!ipv6_addr_any(user_laddr))
188 return user_laddr;
189 laddr = NULL;
190
191 rcu_read_lock();
192 indev = __in6_dev_get(skb->dev);
193 if (indev)
194 list_for_each_entry(ifa, &indev->addr_list, if_list) {
195 if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
196 continue;
197
198 laddr = &ifa->addr;
199 break;
200 }
201 rcu_read_unlock();
202
203 return laddr ? laddr : daddr;
204}
205
206/**
207 * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections
208 * @skb: The skb being processed.
209 * @tproto: Transport protocol.
210 * @thoff: Transport protocol header offset.
211 * @par: Iptables target parameters.
212 * @sk: The TIME_WAIT TCP socket found by the lookup.
213 *
214 * We have to handle SYN packets arriving to TIME_WAIT sockets
215 * differently: instead of reopening the connection we should rather
216 * redirect the new connection to the proxy if there's a listener
217 * socket present.
218 *
219 * tproxy_handle_time_wait6() consumes the socket reference passed in.
220 *
221 * Returns the listener socket if there's one, the TIME_WAIT socket if
222 * no such listener is found, or NULL if the TCP header is incomplete.
223 */
224static struct sock *
225tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
226 const struct xt_action_param *par,
227 struct sock *sk)
228{
229 const struct ipv6hdr *iph = ipv6_hdr(skb);
230 struct tcphdr _hdr, *hp;
231 const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
232
233 hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
234 if (hp == NULL) {
235 inet_twsk_put(inet_twsk(sk));
236 return NULL;
237 }
238
239 if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
240 /* SYN to a TIME_WAIT socket, we'd rather redirect it
241 * to a listener socket if there's one */
242 struct sock *sk2;
243
244 sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
245 &iph->saddr,
246 tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
247 hp->source,
248 tgi->lport ? tgi->lport : hp->dest,
249 skb->dev, NFT_LOOKUP_LISTENER);
250 if (sk2) {
251 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
252 inet_twsk_put(inet_twsk(sk));
253 sk = sk2;
254 }
255 }
256
257 return sk;
258}
259
260static unsigned int
261tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
262{
263 const struct ipv6hdr *iph = ipv6_hdr(skb);
264 const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
265 struct udphdr _hdr, *hp;
266 struct sock *sk;
267 const struct in6_addr *laddr;
268 __be16 lport;
269 int thoff;
270 int tproto;
271
272 tproto = ipv6_find_hdr(skb, &thoff, -1, NULL);
273 if (tproto < 0) {
274 pr_debug("unable to find transport header in IPv6 packet, dropping\n");
275 return NF_DROP;
276 }
277
278 hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
279 if (hp == NULL) {
280 pr_debug("unable to grab transport header contents in IPv6 packet, dropping\n");
281 return NF_DROP;
282 }
283
284 /* check if there's an ongoing connection on the packet
285 * addresses, this happens if the redirect already happened
286 * and the current packet belongs to an already established
287 * connection */
288 sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
289 &iph->saddr, &iph->daddr,
290 hp->source, hp->dest,
291 par->in, NFT_LOOKUP_ESTABLISHED);
292
293 laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
294 lport = tgi->lport ? tgi->lport : hp->dest;
295
296 /* UDP has no TCP_TIME_WAIT state, so we never enter here */
297 if (sk && sk->sk_state == TCP_TIME_WAIT)
298 /* reopening a TIME_WAIT connection needs special handling */
299 sk = tproxy_handle_time_wait6(skb, tproto, thoff, par, sk);
300 else if (!sk)
301 /* no, there's no established connection, check if
302 * there's a listener on the redirected addr/port */
303 sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
304 &iph->saddr, laddr,
305 hp->source, lport,
306 par->in, NFT_LOOKUP_LISTENER);
45 307
46 /* NOTE: assign_sock consumes our sk reference */ 308 /* NOTE: assign_sock consumes our sk reference */
47 if (sk && nf_tproxy_assign_sock(skb, sk)) { 309 if (sk && nf_tproxy_assign_sock(skb, sk)) {
@@ -49,19 +311,34 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
49 targets on the same rule yet */ 311 targets on the same rule yet */
50 skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value; 312 skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
51 313
52 pr_debug("redirecting: proto %u %08x:%u -> %08x:%u, mark: %x\n", 314 pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
53 iph->protocol, ntohl(iph->daddr), ntohs(hp->dest), 315 tproto, &iph->saddr, ntohs(hp->source),
54 ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark); 316 laddr, ntohs(lport), skb->mark);
55 return NF_ACCEPT; 317 return NF_ACCEPT;
56 } 318 }
57 319
58 pr_debug("no socket, dropping: proto %u %08x:%u -> %08x:%u, mark: %x\n", 320 pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
59 iph->protocol, ntohl(iph->daddr), ntohs(hp->dest), 321 tproto, &iph->saddr, ntohs(hp->source),
60 ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark); 322 &iph->daddr, ntohs(hp->dest), skb->mark);
323
61 return NF_DROP; 324 return NF_DROP;
62} 325}
63 326
64static int tproxy_tg_check(const struct xt_tgchk_param *par) 327static int tproxy_tg6_check(const struct xt_tgchk_param *par)
328{
329 const struct ip6t_ip6 *i = par->entryinfo;
330
331 if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP)
332 && !(i->flags & IP6T_INV_PROTO))
333 return 0;
334
335 pr_info("Can be used only in combination with "
336 "either -p tcp or -p udp\n");
337 return -EINVAL;
338}
339#endif
340
341static int tproxy_tg4_check(const struct xt_tgchk_param *par)
65{ 342{
66 const struct ipt_ip *i = par->entryinfo; 343 const struct ipt_ip *i = par->entryinfo;
67 344
@@ -74,31 +351,64 @@ static int tproxy_tg_check(const struct xt_tgchk_param *par)
74 return -EINVAL; 351 return -EINVAL;
75} 352}
76 353
77static struct xt_target tproxy_tg_reg __read_mostly = { 354static struct xt_target tproxy_tg_reg[] __read_mostly = {
78 .name = "TPROXY", 355 {
79 .family = AF_INET, 356 .name = "TPROXY",
80 .table = "mangle", 357 .family = NFPROTO_IPV4,
81 .target = tproxy_tg, 358 .table = "mangle",
82 .targetsize = sizeof(struct xt_tproxy_target_info), 359 .target = tproxy_tg4_v0,
83 .checkentry = tproxy_tg_check, 360 .revision = 0,
84 .hooks = 1 << NF_INET_PRE_ROUTING, 361 .targetsize = sizeof(struct xt_tproxy_target_info),
85 .me = THIS_MODULE, 362 .checkentry = tproxy_tg4_check,
363 .hooks = 1 << NF_INET_PRE_ROUTING,
364 .me = THIS_MODULE,
365 },
366 {
367 .name = "TPROXY",
368 .family = NFPROTO_IPV4,
369 .table = "mangle",
370 .target = tproxy_tg4_v1,
371 .revision = 1,
372 .targetsize = sizeof(struct xt_tproxy_target_info_v1),
373 .checkentry = tproxy_tg4_check,
374 .hooks = 1 << NF_INET_PRE_ROUTING,
375 .me = THIS_MODULE,
376 },
377#ifdef XT_TPROXY_HAVE_IPV6
378 {
379 .name = "TPROXY",
380 .family = NFPROTO_IPV6,
381 .table = "mangle",
382 .target = tproxy_tg6_v1,
383 .revision = 1,
384 .targetsize = sizeof(struct xt_tproxy_target_info_v1),
385 .checkentry = tproxy_tg6_check,
386 .hooks = 1 << NF_INET_PRE_ROUTING,
387 .me = THIS_MODULE,
388 },
389#endif
390
86}; 391};
87 392
88static int __init tproxy_tg_init(void) 393static int __init tproxy_tg_init(void)
89{ 394{
90 nf_defrag_ipv4_enable(); 395 nf_defrag_ipv4_enable();
91 return xt_register_target(&tproxy_tg_reg); 396#ifdef XT_TPROXY_HAVE_IPV6
397 nf_defrag_ipv6_enable();
398#endif
399
400 return xt_register_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
92} 401}
93 402
94static void __exit tproxy_tg_exit(void) 403static void __exit tproxy_tg_exit(void)
95{ 404{
96 xt_unregister_target(&tproxy_tg_reg); 405 xt_unregister_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
97} 406}
98 407
99module_init(tproxy_tg_init); 408module_init(tproxy_tg_init);
100module_exit(tproxy_tg_exit); 409module_exit(tproxy_tg_exit);
101MODULE_LICENSE("GPL"); 410MODULE_LICENSE("GPL");
102MODULE_AUTHOR("Krisztian Kovacs"); 411MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
103MODULE_DESCRIPTION("Netfilter transparent proxy (TPROXY) target module."); 412MODULE_DESCRIPTION("Netfilter transparent proxy (TPROXY) target module.");
104MODULE_ALIAS("ipt_TPROXY"); 413MODULE_ALIAS("ipt_TPROXY");
414MODULE_ALIAS("ip6t_TPROXY");
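The TPROXY conversion above replaces the single xt_register_target() call with an array, so one module can expose several revisions and both address families at once. A minimal sketch of the same registration pattern follows; the "EXAMPLE" target, its no-op body and every value in it are illustrative only, not part of the patch:

    #include <linux/module.h>
    #include <linux/netfilter.h>
    #include <linux/netfilter/x_tables.h>

    /* Sketch only: hypothetical "EXAMPLE" target using the array-based
     * registration that the TPROXY change above introduces. */
    static unsigned int
    example_tg(struct sk_buff *skb, const struct xt_action_param *par)
    {
            return XT_CONTINUE;          /* no-op target: keep traversing rules */
    }

    static struct xt_target example_tg_reg[] __read_mostly = {
            {
                    .name     = "EXAMPLE",
                    .family   = NFPROTO_IPV4,
                    .revision = 0,
                    .target   = example_tg,
                    .hooks    = 1 << NF_INET_PRE_ROUTING,
                    .me       = THIS_MODULE,
            },
            {
                    .name     = "EXAMPLE",
                    .family   = NFPROTO_IPV6,
                    .revision = 0,
                    .target   = example_tg,
                    .hooks    = 1 << NF_INET_PRE_ROUTING,
                    .me       = THIS_MODULE,
            },
    };

    static int __init example_tg_init(void)
    {
            /* one call registers every entry; the exit path mirrors it */
            return xt_register_targets(example_tg_reg, ARRAY_SIZE(example_tg_reg));
    }

    static void __exit example_tg_exit(void)
    {
            xt_unregister_targets(example_tg_reg, ARRAY_SIZE(example_tg_reg));
    }

    module_init(example_tg_init);
    module_exit(example_tg_exit);
    MODULE_LICENSE("GPL");
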
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index b46a8390896d..9228ee0dc11a 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -448,6 +448,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
448{ 448{
449 __be16 _ports[2], *ports; 449 __be16 _ports[2], *ports;
450 u8 nexthdr; 450 u8 nexthdr;
451 int poff;
451 452
452 memset(dst, 0, sizeof(*dst)); 453 memset(dst, 0, sizeof(*dst));
453 454
@@ -492,19 +493,13 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
492 return 0; 493 return 0;
493 } 494 }
494 495
495 switch (nexthdr) { 496 poff = proto_ports_offset(nexthdr);
496 case IPPROTO_TCP: 497 if (poff >= 0) {
497 case IPPROTO_UDP: 498 ports = skb_header_pointer(skb, protoff + poff, sizeof(_ports),
498 case IPPROTO_UDPLITE:
499 case IPPROTO_SCTP:
500 case IPPROTO_DCCP:
501 ports = skb_header_pointer(skb, protoff, sizeof(_ports),
502 &_ports); 499 &_ports);
503 break; 500 } else {
504 default:
505 _ports[0] = _ports[1] = 0; 501 _ports[0] = _ports[1] = 0;
506 ports = _ports; 502 ports = _ports;
507 break;
508 } 503 }
509 if (!ports) 504 if (!ports)
510 return -1; 505 return -1;
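The hashlimit hunk above folds the per-protocol switch into proto_ports_offset(): a non-negative return is the offset of the 16-bit port pair inside the transport header, a negative one means the protocol carries no ports to hash. A standalone C illustration of the mapping the removed switch encoded; port_pair_offset() is a hypothetical stand-in, not the kernel helper itself, and it assumes the usual <netinet/in.h> protocol constants:

    #include <stdint.h>
    #include <netinet/in.h>      /* IPPROTO_*; UDPLITE/SCTP/DCCP assumed available */

    /* Illustration only: TCP, UDP, UDP-Lite, SCTP and DCCP all keep the
     * source/destination port pair in the first four bytes of their headers,
     * which is why the old switch could treat them identically. */
    static int port_pair_offset(uint8_t proto)
    {
            switch (proto) {
            case IPPROTO_TCP:
            case IPPROTO_UDP:
            case IPPROTO_UDPLITE:
            case IPPROTO_SCTP:
            case IPPROTO_DCCP:
                    return 0;    /* ports start right at the transport header */
            default:
                    return -1;   /* nothing port-like to hash on */
            }
    }
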
diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c
index 7a4d66db95ae..9127a3d8aa35 100644
--- a/net/netfilter/xt_ipvs.c
+++ b/net/netfilter/xt_ipvs.c
@@ -16,7 +16,6 @@
16#include <linux/ip_vs.h> 16#include <linux/ip_vs.h>
17#include <linux/types.h> 17#include <linux/types.h>
18#include <linux/netfilter/x_tables.h> 18#include <linux/netfilter/x_tables.h>
19#include <linux/netfilter/x_tables.h>
20#include <linux/netfilter/xt_ipvs.h> 19#include <linux/netfilter/xt_ipvs.h>
21#include <net/netfilter/nf_conntrack.h> 20#include <net/netfilter/nf_conntrack.h>
22 21
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 76aec6a44762..d2ff15a2412b 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -567,6 +567,7 @@ static const struct file_operations recent_mt_fops = {
567 .write = recent_mt_proc_write, 567 .write = recent_mt_proc_write,
568 .release = seq_release_private, 568 .release = seq_release_private,
569 .owner = THIS_MODULE, 569 .owner = THIS_MODULE,
570 .llseek = seq_lseek,
570}; 571};
571 572
572static int __net_init recent_proc_net_init(struct net *net) 573static int __net_init recent_proc_net_init(struct net *net)
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 1ca89908cbad..00d6ae838303 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -22,6 +22,12 @@
22#include <net/netfilter/nf_tproxy_core.h> 22#include <net/netfilter/nf_tproxy_core.h>
23#include <net/netfilter/ipv4/nf_defrag_ipv4.h> 23#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
24 24
25#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
26#define XT_SOCKET_HAVE_IPV6 1
27#include <linux/netfilter_ipv6/ip6_tables.h>
28#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
29#endif
30
25#include <linux/netfilter/xt_socket.h> 31#include <linux/netfilter/xt_socket.h>
26 32
27#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 33#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
@@ -30,7 +36,7 @@
30#endif 36#endif
31 37
32static int 38static int
33extract_icmp_fields(const struct sk_buff *skb, 39extract_icmp4_fields(const struct sk_buff *skb,
34 u8 *protocol, 40 u8 *protocol,
35 __be32 *raddr, 41 __be32 *raddr,
36 __be32 *laddr, 42 __be32 *laddr,
@@ -86,7 +92,6 @@ extract_icmp_fields(const struct sk_buff *skb,
86 return 0; 92 return 0;
87} 93}
88 94
89
90static bool 95static bool
91socket_match(const struct sk_buff *skb, struct xt_action_param *par, 96socket_match(const struct sk_buff *skb, struct xt_action_param *par,
92 const struct xt_socket_mtinfo1 *info) 97 const struct xt_socket_mtinfo1 *info)
@@ -115,7 +120,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
115 dport = hp->dest; 120 dport = hp->dest;
116 121
117 } else if (iph->protocol == IPPROTO_ICMP) { 122 } else if (iph->protocol == IPPROTO_ICMP) {
118 if (extract_icmp_fields(skb, &protocol, &saddr, &daddr, 123 if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
119 &sport, &dport)) 124 &sport, &dport))
120 return false; 125 return false;
121 } else { 126 } else {
@@ -142,7 +147,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
142#endif 147#endif
143 148
144 sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol, 149 sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol,
145 saddr, daddr, sport, dport, par->in, false); 150 saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY);
146 if (sk != NULL) { 151 if (sk != NULL) {
147 bool wildcard; 152 bool wildcard;
148 bool transparent = true; 153 bool transparent = true;
@@ -165,32 +170,156 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
165 sk = NULL; 170 sk = NULL;
166 } 171 }
167 172
168 pr_debug("proto %u %08x:%u -> %08x:%u (orig %08x:%u) sock %p\n", 173 pr_debug("proto %hhu %pI4:%hu -> %pI4:%hu (orig %pI4:%hu) sock %p\n",
169 protocol, ntohl(saddr), ntohs(sport), 174 protocol, &saddr, ntohs(sport),
170 ntohl(daddr), ntohs(dport), 175 &daddr, ntohs(dport),
171 ntohl(iph->daddr), hp ? ntohs(hp->dest) : 0, sk); 176 &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
172 177
173 return (sk != NULL); 178 return (sk != NULL);
174} 179}
175 180
176static bool 181static bool
177socket_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) 182socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par)
178{ 183{
179 return socket_match(skb, par, NULL); 184 return socket_match(skb, par, NULL);
180} 185}
181 186
182static bool 187static bool
183socket_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) 188socket_mt4_v1(const struct sk_buff *skb, struct xt_action_param *par)
184{ 189{
185 return socket_match(skb, par, par->matchinfo); 190 return socket_match(skb, par, par->matchinfo);
186} 191}
187 192
193#ifdef XT_SOCKET_HAVE_IPV6
194
195static int
196extract_icmp6_fields(const struct sk_buff *skb,
197 unsigned int outside_hdrlen,
198 int *protocol,
199 struct in6_addr **raddr,
200 struct in6_addr **laddr,
201 __be16 *rport,
202 __be16 *lport)
203{
204 struct ipv6hdr *inside_iph, _inside_iph;
205 struct icmp6hdr *icmph, _icmph;
206 __be16 *ports, _ports[2];
207 u8 inside_nexthdr;
208 int inside_hdrlen;
209
210 icmph = skb_header_pointer(skb, outside_hdrlen,
211 sizeof(_icmph), &_icmph);
212 if (icmph == NULL)
213 return 1;
214
215 if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK)
216 return 1;
217
218 inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph), sizeof(_inside_iph), &_inside_iph);
219 if (inside_iph == NULL)
220 return 1;
221 inside_nexthdr = inside_iph->nexthdr;
222
223 inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph), &inside_nexthdr);
224 if (inside_hdrlen < 0)
225 return 1; /* hjm: Packet has no/incomplete transport layer headers. */
226
227 if (inside_nexthdr != IPPROTO_TCP &&
228 inside_nexthdr != IPPROTO_UDP)
229 return 1;
230
231 ports = skb_header_pointer(skb, inside_hdrlen,
232 sizeof(_ports), &_ports);
233 if (ports == NULL)
234 return 1;
235
236 /* the inside IP packet is the one quoted from our side, thus
237 * its saddr is the local address */
238 *protocol = inside_nexthdr;
239 *laddr = &inside_iph->saddr;
240 *lport = ports[0];
241 *raddr = &inside_iph->daddr;
242 *rport = ports[1];
243
244 return 0;
245}
246
247static bool
248socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)
249{
250 struct ipv6hdr *iph = ipv6_hdr(skb);
251 struct udphdr _hdr, *hp = NULL;
252 struct sock *sk;
253 struct in6_addr *daddr, *saddr;
254 __be16 dport, sport;
255 int thoff, tproto;
256 const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
257
258 tproto = ipv6_find_hdr(skb, &thoff, -1, NULL);
259 if (tproto < 0) {
260 pr_debug("unable to find transport header in IPv6 packet, dropping\n");
261 return NF_DROP;
262 }
263
264 if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) {
265 hp = skb_header_pointer(skb, thoff,
266 sizeof(_hdr), &_hdr);
267 if (hp == NULL)
268 return false;
269
270 saddr = &iph->saddr;
271 sport = hp->source;
272 daddr = &iph->daddr;
273 dport = hp->dest;
274
275 } else if (tproto == IPPROTO_ICMPV6) {
276 if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
277 &sport, &dport))
278 return false;
279 } else {
280 return false;
281 }
282
283 sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
284 saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY);
285 if (sk != NULL) {
286 bool wildcard;
287 bool transparent = true;
288
289 /* Ignore sockets listening on INADDR_ANY */
290 wildcard = (sk->sk_state != TCP_TIME_WAIT &&
291 ipv6_addr_any(&inet6_sk(sk)->rcv_saddr));
292
293 /* Ignore non-transparent sockets,
294 if XT_SOCKET_TRANSPARENT is used */
295 if (info && info->flags & XT_SOCKET_TRANSPARENT)
296 transparent = ((sk->sk_state != TCP_TIME_WAIT &&
297 inet_sk(sk)->transparent) ||
298 (sk->sk_state == TCP_TIME_WAIT &&
299 inet_twsk(sk)->tw_transparent));
300
301 nf_tproxy_put_sock(sk);
302
303 if (wildcard || !transparent)
304 sk = NULL;
305 }
306
307 pr_debug("proto %hhd %pI6:%hu -> %pI6:%hu "
308 "(orig %pI6:%hu) sock %p\n",
309 tproto, saddr, ntohs(sport),
310 daddr, ntohs(dport),
311 &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
312
313 return (sk != NULL);
314}
315#endif
316
188static struct xt_match socket_mt_reg[] __read_mostly = { 317static struct xt_match socket_mt_reg[] __read_mostly = {
189 { 318 {
190 .name = "socket", 319 .name = "socket",
191 .revision = 0, 320 .revision = 0,
192 .family = NFPROTO_IPV4, 321 .family = NFPROTO_IPV4,
193 .match = socket_mt_v0, 322 .match = socket_mt4_v0,
194 .hooks = (1 << NF_INET_PRE_ROUTING) | 323 .hooks = (1 << NF_INET_PRE_ROUTING) |
195 (1 << NF_INET_LOCAL_IN), 324 (1 << NF_INET_LOCAL_IN),
196 .me = THIS_MODULE, 325 .me = THIS_MODULE,
@@ -199,17 +328,33 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
199 .name = "socket", 328 .name = "socket",
200 .revision = 1, 329 .revision = 1,
201 .family = NFPROTO_IPV4, 330 .family = NFPROTO_IPV4,
202 .match = socket_mt_v1, 331 .match = socket_mt4_v1,
203 .matchsize = sizeof(struct xt_socket_mtinfo1), 332 .matchsize = sizeof(struct xt_socket_mtinfo1),
204 .hooks = (1 << NF_INET_PRE_ROUTING) | 333 .hooks = (1 << NF_INET_PRE_ROUTING) |
205 (1 << NF_INET_LOCAL_IN), 334 (1 << NF_INET_LOCAL_IN),
206 .me = THIS_MODULE, 335 .me = THIS_MODULE,
207 }, 336 },
337#ifdef XT_SOCKET_HAVE_IPV6
338 {
339 .name = "socket",
340 .revision = 1,
341 .family = NFPROTO_IPV6,
342 .match = socket_mt6_v1,
343 .matchsize = sizeof(struct xt_socket_mtinfo1),
344 .hooks = (1 << NF_INET_PRE_ROUTING) |
345 (1 << NF_INET_LOCAL_IN),
346 .me = THIS_MODULE,
347 },
348#endif
208}; 349};
209 350
210static int __init socket_mt_init(void) 351static int __init socket_mt_init(void)
211{ 352{
212 nf_defrag_ipv4_enable(); 353 nf_defrag_ipv4_enable();
354#ifdef XT_SOCKET_HAVE_IPV6
355 nf_defrag_ipv6_enable();
356#endif
357
213 return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg)); 358 return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg));
214} 359}
215 360
@@ -225,3 +370,4 @@ MODULE_LICENSE("GPL");
225MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler"); 370MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler");
226MODULE_DESCRIPTION("x_tables socket match module"); 371MODULE_DESCRIPTION("x_tables socket match module");
227MODULE_ALIAS("ipt_socket"); 372MODULE_ALIAS("ipt_socket");
373MODULE_ALIAS("ip6t_socket");
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 2cbf380377d5..478181d53c55 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -83,9 +83,9 @@ struct netlink_sock {
83 struct module *module; 83 struct module *module;
84}; 84};
85 85
86struct listeners_rcu_head { 86struct listeners {
87 struct rcu_head rcu_head; 87 struct rcu_head rcu;
88 void *ptr; 88 unsigned long masks[0];
89}; 89};
90 90
91#define NETLINK_KERNEL_SOCKET 0x1 91#define NETLINK_KERNEL_SOCKET 0x1
@@ -119,7 +119,7 @@ struct nl_pid_hash {
119struct netlink_table { 119struct netlink_table {
120 struct nl_pid_hash hash; 120 struct nl_pid_hash hash;
121 struct hlist_head mc_list; 121 struct hlist_head mc_list;
122 unsigned long *listeners; 122 struct listeners __rcu *listeners;
123 unsigned int nl_nonroot; 123 unsigned int nl_nonroot;
124 unsigned int groups; 124 unsigned int groups;
125 struct mutex *cb_mutex; 125 struct mutex *cb_mutex;
@@ -338,7 +338,7 @@ netlink_update_listeners(struct sock *sk)
338 if (i < NLGRPLONGS(nlk_sk(sk)->ngroups)) 338 if (i < NLGRPLONGS(nlk_sk(sk)->ngroups))
339 mask |= nlk_sk(sk)->groups[i]; 339 mask |= nlk_sk(sk)->groups[i];
340 } 340 }
341 tbl->listeners[i] = mask; 341 tbl->listeners->masks[i] = mask;
342 } 342 }
343 /* this function is only called with the netlink table "grabbed", which 343 /* this function is only called with the netlink table "grabbed", which
344 * makes sure updates are visible before bind or setsockopt return. */ 344 * makes sure updates are visible before bind or setsockopt return. */
@@ -936,7 +936,7 @@ EXPORT_SYMBOL(netlink_unicast);
936int netlink_has_listeners(struct sock *sk, unsigned int group) 936int netlink_has_listeners(struct sock *sk, unsigned int group)
937{ 937{
938 int res = 0; 938 int res = 0;
939 unsigned long *listeners; 939 struct listeners *listeners;
940 940
941 BUG_ON(!netlink_is_kernel(sk)); 941 BUG_ON(!netlink_is_kernel(sk));
942 942
@@ -944,7 +944,7 @@ int netlink_has_listeners(struct sock *sk, unsigned int group)
944 listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners); 944 listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners);
945 945
946 if (group - 1 < nl_table[sk->sk_protocol].groups) 946 if (group - 1 < nl_table[sk->sk_protocol].groups)
947 res = test_bit(group - 1, listeners); 947 res = test_bit(group - 1, listeners->masks);
948 948
949 rcu_read_unlock(); 949 rcu_read_unlock();
950 950
@@ -1406,7 +1406,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
1406 struct netlink_sock *nlk = nlk_sk(sk); 1406 struct netlink_sock *nlk = nlk_sk(sk);
1407 int noblock = flags&MSG_DONTWAIT; 1407 int noblock = flags&MSG_DONTWAIT;
1408 size_t copied; 1408 size_t copied;
1409 struct sk_buff *skb; 1409 struct sk_buff *skb, *data_skb;
1410 int err; 1410 int err;
1411 1411
1412 if (flags&MSG_OOB) 1412 if (flags&MSG_OOB)
@@ -1418,59 +1418,35 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
1418 if (skb == NULL) 1418 if (skb == NULL)
1419 goto out; 1419 goto out;
1420 1420
1421 data_skb = skb;
1422
1421#ifdef CONFIG_COMPAT_NETLINK_MESSAGES 1423#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
1422 if (unlikely(skb_shinfo(skb)->frag_list)) { 1424 if (unlikely(skb_shinfo(skb)->frag_list)) {
1423 bool need_compat = !!(flags & MSG_CMSG_COMPAT);
1424
1425 /* 1425 /*
1426 * If this skb has a frag_list, then here that means that 1426 * If this skb has a frag_list, then here that means that we
1427 * we will have to use the frag_list skb for compat tasks 1427 * will have to use the frag_list skb's data for compat tasks
1428 * and the regular skb for non-compat tasks. 1428 * and the regular skb's data for normal (non-compat) tasks.
1429 * 1429 *
1430 * The skb might (and likely will) be cloned, so we can't 1430 * If we need to send the compat skb, assign it to the
1431 * just reset frag_list and go on with things -- we need to 1431 * 'data_skb' variable so that it will be used below for data
1432 * keep that. For the compat case that's easy -- simply get 1432 * copying. We keep 'skb' for everything else, including
1433 * a reference to the compat skb and free the regular one 1433 * freeing both later.
1434 * including the frag. For the non-compat case, we need to
1435 * avoid sending the frag to the user -- so assign NULL but
1436 * restore it below before freeing the skb.
1437 */ 1434 */
1438 if (need_compat) { 1435 if (flags & MSG_CMSG_COMPAT)
1439 struct sk_buff *compskb = skb_shinfo(skb)->frag_list; 1436 data_skb = skb_shinfo(skb)->frag_list;
1440 skb_get(compskb);
1441 kfree_skb(skb);
1442 skb = compskb;
1443 } else {
1444 /*
1445 * Before setting frag_list to NULL, we must get a
1446 * private copy of skb if shared (because of MSG_PEEK)
1447 */
1448 if (skb_shared(skb)) {
1449 struct sk_buff *nskb;
1450
1451 nskb = pskb_copy(skb, GFP_KERNEL);
1452 kfree_skb(skb);
1453 skb = nskb;
1454 err = -ENOMEM;
1455 if (!skb)
1456 goto out;
1457 }
1458 kfree_skb(skb_shinfo(skb)->frag_list);
1459 skb_shinfo(skb)->frag_list = NULL;
1460 }
1461 } 1437 }
1462#endif 1438#endif
1463 1439
1464 msg->msg_namelen = 0; 1440 msg->msg_namelen = 0;
1465 1441
1466 copied = skb->len; 1442 copied = data_skb->len;
1467 if (len < copied) { 1443 if (len < copied) {
1468 msg->msg_flags |= MSG_TRUNC; 1444 msg->msg_flags |= MSG_TRUNC;
1469 copied = len; 1445 copied = len;
1470 } 1446 }
1471 1447
1472 skb_reset_transport_header(skb); 1448 skb_reset_transport_header(data_skb);
1473 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); 1449 err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied);
1474 1450
1475 if (msg->msg_name) { 1451 if (msg->msg_name) {
1476 struct sockaddr_nl *addr = (struct sockaddr_nl *)msg->msg_name; 1452 struct sockaddr_nl *addr = (struct sockaddr_nl *)msg->msg_name;
@@ -1490,7 +1466,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
1490 } 1466 }
1491 siocb->scm->creds = *NETLINK_CREDS(skb); 1467 siocb->scm->creds = *NETLINK_CREDS(skb);
1492 if (flags & MSG_TRUNC) 1468 if (flags & MSG_TRUNC)
1493 copied = skb->len; 1469 copied = data_skb->len;
1494 1470
1495 skb_free_datagram(sk, skb); 1471 skb_free_datagram(sk, skb);
1496 1472
@@ -1522,7 +1498,7 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups,
1522 struct socket *sock; 1498 struct socket *sock;
1523 struct sock *sk; 1499 struct sock *sk;
1524 struct netlink_sock *nlk; 1500 struct netlink_sock *nlk;
1525 unsigned long *listeners = NULL; 1501 struct listeners *listeners = NULL;
1526 1502
1527 BUG_ON(!nl_table); 1503 BUG_ON(!nl_table);
1528 1504
@@ -1547,8 +1523,7 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups,
1547 if (groups < 32) 1523 if (groups < 32)
1548 groups = 32; 1524 groups = 32;
1549 1525
1550 listeners = kzalloc(NLGRPSZ(groups) + sizeof(struct listeners_rcu_head), 1526 listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
1551 GFP_KERNEL);
1552 if (!listeners) 1527 if (!listeners)
1553 goto out_sock_release; 1528 goto out_sock_release;
1554 1529
@@ -1565,7 +1540,7 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups,
1565 netlink_table_grab(); 1540 netlink_table_grab();
1566 if (!nl_table[unit].registered) { 1541 if (!nl_table[unit].registered) {
1567 nl_table[unit].groups = groups; 1542 nl_table[unit].groups = groups;
1568 nl_table[unit].listeners = listeners; 1543 rcu_assign_pointer(nl_table[unit].listeners, listeners);
1569 nl_table[unit].cb_mutex = cb_mutex; 1544 nl_table[unit].cb_mutex = cb_mutex;
1570 nl_table[unit].module = module; 1545 nl_table[unit].module = module;
1571 nl_table[unit].registered = 1; 1546 nl_table[unit].registered = 1;
@@ -1596,43 +1571,28 @@ netlink_kernel_release(struct sock *sk)
1596EXPORT_SYMBOL(netlink_kernel_release); 1571EXPORT_SYMBOL(netlink_kernel_release);
1597 1572
1598 1573
1599static void netlink_free_old_listeners(struct rcu_head *rcu_head) 1574static void listeners_free_rcu(struct rcu_head *head)
1600{ 1575{
1601 struct listeners_rcu_head *lrh; 1576 kfree(container_of(head, struct listeners, rcu));
1602
1603 lrh = container_of(rcu_head, struct listeners_rcu_head, rcu_head);
1604 kfree(lrh->ptr);
1605} 1577}
1606 1578
1607int __netlink_change_ngroups(struct sock *sk, unsigned int groups) 1579int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
1608{ 1580{
1609 unsigned long *listeners, *old = NULL; 1581 struct listeners *new, *old;
1610 struct listeners_rcu_head *old_rcu_head;
1611 struct netlink_table *tbl = &nl_table[sk->sk_protocol]; 1582 struct netlink_table *tbl = &nl_table[sk->sk_protocol];
1612 1583
1613 if (groups < 32) 1584 if (groups < 32)
1614 groups = 32; 1585 groups = 32;
1615 1586
1616 if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) { 1587 if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) {
1617 listeners = kzalloc(NLGRPSZ(groups) + 1588 new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC);
1618 sizeof(struct listeners_rcu_head), 1589 if (!new)
1619 GFP_ATOMIC);
1620 if (!listeners)
1621 return -ENOMEM; 1590 return -ENOMEM;
1622 old = tbl->listeners; 1591 old = rcu_dereference_raw(tbl->listeners);
1623 memcpy(listeners, old, NLGRPSZ(tbl->groups)); 1592 memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups));
1624 rcu_assign_pointer(tbl->listeners, listeners); 1593 rcu_assign_pointer(tbl->listeners, new);
1625 /* 1594
1626 * Free the old memory after an RCU grace period so we 1595 call_rcu(&old->rcu, listeners_free_rcu);
1627 * don't leak it. We use call_rcu() here in order to be
1628 * able to call this function from atomic contexts. The
1629 * allocation of this memory will have reserved enough
1630 * space for struct listeners_rcu_head at the end.
1631 */
1632 old_rcu_head = (void *)(tbl->listeners +
1633 NLGRPLONGS(tbl->groups));
1634 old_rcu_head->ptr = old;
1635 call_rcu(&old_rcu_head->rcu_head, netlink_free_old_listeners);
1636 } 1596 }
1637 tbl->groups = groups; 1597 tbl->groups = groups;
1638 1598
@@ -2126,6 +2086,25 @@ static void __net_exit netlink_net_exit(struct net *net)
2126#endif 2086#endif
2127} 2087}
2128 2088
2089static void __init netlink_add_usersock_entry(void)
2090{
2091 struct listeners *listeners;
2092 int groups = 32;
2093
2094 listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
2095 if (!listeners)
2096 panic("netlink_add_usersock_entry: Cannot allocate listeners\n");
2097
2098 netlink_table_grab();
2099
2100 nl_table[NETLINK_USERSOCK].groups = groups;
2101 rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners);
2102 nl_table[NETLINK_USERSOCK].module = THIS_MODULE;
2103 nl_table[NETLINK_USERSOCK].registered = 1;
2104
2105 netlink_table_ungrab();
2106}
2107
2129static struct pernet_operations __net_initdata netlink_net_ops = { 2108static struct pernet_operations __net_initdata netlink_net_ops = {
2130 .init = netlink_net_init, 2109 .init = netlink_net_init,
2131 .exit = netlink_net_exit, 2110 .exit = netlink_net_exit,
@@ -2174,6 +2153,8 @@ static int __init netlink_proto_init(void)
2174 hash->rehash_time = jiffies; 2153 hash->rehash_time = jiffies;
2175 } 2154 }
2176 2155
2156 netlink_add_usersock_entry();
2157
2177 sock_register(&netlink_family_ops); 2158 sock_register(&netlink_family_ops);
2178 register_pernet_subsys(&netlink_net_ops); 2159 register_pernet_subsys(&netlink_net_ops);
2179 /* The netlink device handler may be needed early. */ 2160 /* The netlink device handler may be needed early. */
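The af_netlink rework above stops carving a listeners_rcu_head out of the tail of the bitmap allocation; the rcu_head now lives inside struct listeners itself, so a replaced table can be handed straight to call_rcu(). A compact sketch of that copy-and-publish resize pattern follows; it mirrors the hunk but uses hypothetical names and is not the netlink code itself:

    #include <linux/rcupdate.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    /* Sketch only: grow an RCU-read flex-array by allocating a new copy,
     * publishing it, and freeing the old one after a grace period. */
    struct ex_table {
            struct rcu_head rcu;
            unsigned long   masks[];        /* flexible array, like struct listeners */
    };

    static struct ex_table *ex_current;     /* written under the caller's update lock */

    static void ex_table_free_rcu(struct rcu_head *head)
    {
            kfree(container_of(head, struct ex_table, rcu));
    }

    static int ex_table_grow(size_t old_words, size_t new_words)
    {
            struct ex_table *new, *old;

            new = kzalloc(sizeof(*new) + new_words * sizeof(unsigned long), GFP_ATOMIC);
            if (!new)
                    return -ENOMEM;

            old = rcu_dereference_raw(ex_current);  /* update side, lock held */
            memcpy(new->masks, old->masks, old_words * sizeof(unsigned long));
            rcu_assign_pointer(ex_current, new);    /* readers now see the new table */
            call_rcu(&old->rcu, ex_table_free_rcu); /* old copy freed once readers drain */
            return 0;
    }
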
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 26ed3e8587c2..1781d99145e2 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -547,8 +547,20 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
547 info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; 547 info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN;
548 info.attrs = family->attrbuf; 548 info.attrs = family->attrbuf;
549 genl_info_net_set(&info, net); 549 genl_info_net_set(&info, net);
550 memset(&info.user_ptr, 0, sizeof(info.user_ptr));
550 551
551 return ops->doit(skb, &info); 552 if (family->pre_doit) {
553 err = family->pre_doit(ops, skb, &info);
554 if (err)
555 return err;
556 }
557
558 err = ops->doit(skb, &info);
559
560 if (family->post_doit)
561 family->post_doit(ops, skb, &info);
562
563 return err;
552} 564}
553 565
554static void genl_rcv(struct sk_buff *skb) 566static void genl_rcv(struct sk_buff *skb)
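The genetlink change above clears info.user_ptr and brackets every doit handler with optional family-wide pre_doit/post_doit hooks. A hedged sketch of how a family might use them; the hook signatures follow the calls shown in genl_rcv_msg() above, while the family values and the ex_lookup_dev()/ex_put_dev() helpers are made up for illustration:

    #include <net/genetlink.h>

    struct ex_dev;                                          /* hypothetical */
    struct ex_dev *ex_lookup_dev(struct genl_info *info);   /* hypothetical helper */
    void ex_put_dev(struct ex_dev *dev);                    /* hypothetical helper */

    /* Sketch only: per-family setup/teardown around each command handler. */
    static int ex_pre_doit(struct genl_ops *ops, struct sk_buff *skb,
                           struct genl_info *info)
    {
            struct ex_dev *dev = ex_lookup_dev(info);   /* e.g. resolve a device attr */

            if (!dev)
                    return -ENODEV;
            info->user_ptr[0] = dev;                    /* handed on to ops->doit() */
            return 0;
    }

    static void ex_post_doit(struct genl_ops *ops, struct sk_buff *skb,
                             struct genl_info *info)
    {
            ex_put_dev(info->user_ptr[0]);              /* undo whatever pre_doit took */
    }

    static struct genl_family ex_genl_family = {
            .id        = GENL_ID_GENERATE,
            .name      = "example",
            .version   = 1,
            .maxattr   = 0,
            .pre_doit  = ex_pre_doit,
            .post_doit = ex_post_doit,
    };
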
diff --git a/net/nonet.c b/net/nonet.c
index 92e76640c7cd..b1a73fda9c12 100644
--- a/net/nonet.c
+++ b/net/nonet.c
@@ -22,4 +22,5 @@ static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
22const struct file_operations bad_sock_fops = { 22const struct file_operations bad_sock_fops = {
23 .owner = THIS_MODULE, 23 .owner = THIS_MODULE,
24 .open = sock_no_open, 24 .open = sock_no_open,
25 .llseek = noop_llseek,
25}; 26};
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 9a17f28b1253..3616f27b9d46 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -488,7 +488,7 @@ retry:
488 skb->dev = dev; 488 skb->dev = dev;
489 skb->priority = sk->sk_priority; 489 skb->priority = sk->sk_priority;
490 skb->mark = sk->sk_mark; 490 skb->mark = sk->sk_mark;
491 err = sock_tx_timestamp(msg, sk, skb_tx(skb)); 491 err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
492 if (err < 0) 492 if (err < 0)
493 goto out_unlock; 493 goto out_unlock;
494 494
@@ -1209,7 +1209,7 @@ static int packet_snd(struct socket *sock,
1209 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len); 1209 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1210 if (err) 1210 if (err)
1211 goto out_free; 1211 goto out_free;
1212 err = sock_tx_timestamp(msg, sk, skb_tx(skb)); 1212 err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1213 if (err < 0) 1213 if (err < 0)
1214 goto out_free; 1214 goto out_free;
1215 1215
diff --git a/net/phonet/Kconfig b/net/phonet/Kconfig
index 6ec7d55b1769..0d9b8a220a78 100644
--- a/net/phonet/Kconfig
+++ b/net/phonet/Kconfig
@@ -14,3 +14,15 @@ config PHONET
14 14
15 To compile this driver as a module, choose M here: the module 15 To compile this driver as a module, choose M here: the module
16 will be called phonet. If unsure, say N. 16 will be called phonet. If unsure, say N.
17
18config PHONET_PIPECTRLR
19 bool "Phonet Pipe Controller (EXPERIMENTAL)"
20 depends on PHONET && EXPERIMENTAL
21 default N
22 help
23 The Pipe Controller implementation in Phonet stack to support Pipe
24 data with Nokia Slim modems like WG2.5 used on ST-Ericsson U8500
25 platform.
26
27 This option is incompatible with older Nokia modems.
28 Say N here unless you really know what you are doing.
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index 73aee7f2fcdc..fd95beb72f5d 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -251,6 +251,16 @@ int pn_skb_send(struct sock *sk, struct sk_buff *skb,
251 else if (phonet_address_lookup(net, daddr) == 0) { 251 else if (phonet_address_lookup(net, daddr) == 0) {
252 dev = phonet_device_get(net); 252 dev = phonet_device_get(net);
253 skb->pkt_type = PACKET_LOOPBACK; 253 skb->pkt_type = PACKET_LOOPBACK;
254 } else if (pn_sockaddr_get_object(target) == 0) {
255 /* Resource routing (small race until phonet_rcv()) */
256 struct sock *sk = pn_find_sock_by_res(net,
257 target->spn_resource);
258 if (sk) {
259 sock_put(sk);
260 dev = phonet_device_get(net);
261 skb->pkt_type = PACKET_LOOPBACK;
262 } else
263 dev = phonet_route_output(net, daddr);
254 } else 264 } else
255 dev = phonet_route_output(net, daddr); 265 dev = phonet_route_output(net, daddr);
256 266
@@ -383,6 +393,13 @@ static int phonet_rcv(struct sk_buff *skb, struct net_device *dev,
383 goto out; 393 goto out;
384 } 394 }
385 395
396 /* resource routing */
397 if (pn_sockaddr_get_object(&sa) == 0) {
398 struct sock *sk = pn_find_sock_by_res(net, sa.spn_resource);
399 if (sk)
400 return sk_receive_skb(sk, skb, 0);
401 }
402
386 /* check if we are the destination */ 403 /* check if we are the destination */
387 if (phonet_address_lookup(net, pn_sockaddr_get_addr(&sa)) == 0) { 404 if (phonet_address_lookup(net, pn_sockaddr_get_addr(&sa)) == 0) {
388 /* Phonet packet input */ 405 /* Phonet packet input */
diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c
index 1bd38db4fe1e..2f032381bd45 100644
--- a/net/phonet/datagram.c
+++ b/net/phonet/datagram.c
@@ -52,6 +52,19 @@ static int pn_ioctl(struct sock *sk, int cmd, unsigned long arg)
52 answ = skb ? skb->len : 0; 52 answ = skb ? skb->len : 0;
53 release_sock(sk); 53 release_sock(sk);
54 return put_user(answ, (int __user *)arg); 54 return put_user(answ, (int __user *)arg);
55
56 case SIOCPNADDRESOURCE:
57 case SIOCPNDELRESOURCE: {
58 u32 res;
59 if (get_user(res, (u32 __user *)arg))
60 return -EFAULT;
61 if (res >= 256)
62 return -EINVAL;
63 if (cmd == SIOCPNADDRESOURCE)
64 return pn_sock_bind_res(sk, res);
65 else
66 return pn_sock_unbind_res(sk, res);
67 }
55 } 68 }
56 69
57 return -ENOIOCTLCMD; 70 return -ENOIOCTLCMD;
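The new SIOCPNADDRESOURCE/SIOCPNDELRESOURCE ioctls above take a 32-bit resource number below 256 and bind or unbind the datagram socket to it (the actual binding logic is in net/phonet/socket.c further down in this patch). A rough userspace sketch, assuming the constants are exported through linux/phonet.h and that the caller has CAP_SYS_ADMIN; the resource value is arbitrary:

    /* Sketch only: claim Phonet resource 0x23 on a datagram socket. */
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <linux/phonet.h>

    int main(void)
    {
            uint32_t res = 0x23;                    /* arbitrary example resource */
            int fd = socket(AF_PHONET, SOCK_DGRAM, 0);

            if (fd < 0 || ioctl(fd, SIOCPNADDRESOURCE, &res) < 0) {
                    perror("phonet resource bind");
                    return 1;
            }
            /* ... recvmsg() now also delivers packets addressed to the resource ... */
            ioctl(fd, SIOCPNDELRESOURCE, &res);     /* release it again */
            return 0;
    }
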
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index b2a3ae6cad78..3e60f2e4e6c2 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -109,6 +109,210 @@ static int pep_reply(struct sock *sk, struct sk_buff *oskb,
109} 109}
110 110
111#define PAD 0x00 111#define PAD 0x00
112
113#ifdef CONFIG_PHONET_PIPECTRLR
114static u8 pipe_negotiate_fc(u8 *host_fc, u8 *remote_fc, int len)
115{
116 int i, j;
117 u8 base_fc, final_fc;
118
119 for (i = 0; i < len; i++) {
120 base_fc = host_fc[i];
121 for (j = 0; j < len; j++) {
122 if (remote_fc[j] == base_fc) {
123 final_fc = base_fc;
124 goto done;
125 }
126 }
127 }
128 return -EINVAL;
129
130done:
131 return final_fc;
132
133}
134
135static int pipe_get_flow_info(struct sock *sk, struct sk_buff *skb,
136 u8 *pref_rx_fc, u8 *req_tx_fc)
137{
138 struct pnpipehdr *hdr;
139 u8 n_sb;
140
141 if (!pskb_may_pull(skb, sizeof(*hdr) + 4))
142 return -EINVAL;
143
144 hdr = pnp_hdr(skb);
145 n_sb = hdr->data[4];
146
147 __skb_pull(skb, sizeof(*hdr) + 4);
148 while (n_sb > 0) {
149 u8 type, buf[3], len = sizeof(buf);
150 u8 *data = pep_get_sb(skb, &type, &len, buf);
151
152 if (data == NULL)
153 return -EINVAL;
154
155 switch (type) {
156 case PN_PIPE_SB_REQUIRED_FC_TX:
157 if (len < 3 || (data[2] | data[3] | data[4]) > 3)
158 break;
159 req_tx_fc[0] = data[2];
160 req_tx_fc[1] = data[3];
161 req_tx_fc[2] = data[4];
162 break;
163
164 case PN_PIPE_SB_PREFERRED_FC_RX:
165 if (len < 3 || (data[2] | data[3] | data[4]) > 3)
166 break;
167 pref_rx_fc[0] = data[2];
168 pref_rx_fc[1] = data[3];
169 pref_rx_fc[2] = data[4];
170 break;
171
172 }
173 n_sb--;
174 }
175 return 0;
176}
177
178static int pipe_handler_send_req(struct sock *sk, u8 utid,
179 u8 msg_id, gfp_t priority)
180{
181 int len;
182 struct pnpipehdr *ph;
183 struct sk_buff *skb;
184 struct pep_sock *pn = pep_sk(sk);
185
186 static const u8 data[4] = {
187 PAD, PAD, PAD, PAD,
188 };
189
190 switch (msg_id) {
191 case PNS_PEP_CONNECT_REQ:
192 len = sizeof(data);
193 break;
194
195 case PNS_PEP_DISCONNECT_REQ:
196 case PNS_PEP_ENABLE_REQ:
197 case PNS_PEP_DISABLE_REQ:
198 len = 0;
199 break;
200
201 default:
202 return -EINVAL;
203 }
204
205 skb = alloc_skb(MAX_PNPIPE_HEADER + len, priority);
206 if (!skb)
207 return -ENOMEM;
208 skb_set_owner_w(skb, sk);
209
210 skb_reserve(skb, MAX_PNPIPE_HEADER);
211 if (len) {
212 __skb_put(skb, len);
213 skb_copy_to_linear_data(skb, data, len);
214 }
215 __skb_push(skb, sizeof(*ph));
216 skb_reset_transport_header(skb);
217 ph = pnp_hdr(skb);
218 ph->utid = utid;
219 ph->message_id = msg_id;
220 ph->pipe_handle = pn->pipe_handle;
221 ph->error_code = PN_PIPE_NO_ERROR;
222
223 return pn_skb_send(sk, skb, &pn->remote_pep);
224}
225
226static int pipe_handler_send_created_ind(struct sock *sk,
227 u8 utid, u8 msg_id)
228{
229 int err_code;
230 struct pnpipehdr *ph;
231 struct sk_buff *skb;
232
233 struct pep_sock *pn = pep_sk(sk);
234 static u8 data[4] = {
235 0x03, 0x04,
236 };
237 data[2] = pn->tx_fc;
238 data[3] = pn->rx_fc;
239
240 /*
241 * actually, below is number of sub-blocks and not error code.
242 * Pipe_created_ind message format does not have any
243 * error code field. However, the Phonet stack will always send
244 * an error code as part of pnpipehdr. So, use that err_code to
245 * specify the number of sub-blocks.
246 */
247 err_code = 0x01;
248
249 skb = alloc_skb(MAX_PNPIPE_HEADER + sizeof(data), GFP_ATOMIC);
250 if (!skb)
251 return -ENOMEM;
252 skb_set_owner_w(skb, sk);
253
254 skb_reserve(skb, MAX_PNPIPE_HEADER);
255 __skb_put(skb, sizeof(data));
256 skb_copy_to_linear_data(skb, data, sizeof(data));
257 __skb_push(skb, sizeof(*ph));
258 skb_reset_transport_header(skb);
259 ph = pnp_hdr(skb);
260 ph->utid = utid;
261 ph->message_id = msg_id;
262 ph->pipe_handle = pn->pipe_handle;
263 ph->error_code = err_code;
264
265 return pn_skb_send(sk, skb, &pn->remote_pep);
266}
267
268static int pipe_handler_send_ind(struct sock *sk, u8 utid, u8 msg_id)
269{
270 int err_code;
271 struct pnpipehdr *ph;
272 struct sk_buff *skb;
273 struct pep_sock *pn = pep_sk(sk);
274
275 /*
276 * actually, below is a filler.
277 * Pipe_enabled/disabled_ind message format does not have any
278 * error code field. However, the Phonet stack will always send
279 * an error code as part of pnpipehdr. So, use that err_code to
280 * specify the filler value.
281 */
282 err_code = 0x0;
283
284 skb = alloc_skb(MAX_PNPIPE_HEADER, GFP_ATOMIC);
285 if (!skb)
286 return -ENOMEM;
287 skb_set_owner_w(skb, sk);
288
289 skb_reserve(skb, MAX_PNPIPE_HEADER);
290 __skb_push(skb, sizeof(*ph));
291 skb_reset_transport_header(skb);
292 ph = pnp_hdr(skb);
293 ph->utid = utid;
294 ph->message_id = msg_id;
295 ph->pipe_handle = pn->pipe_handle;
296 ph->error_code = err_code;
297
298 return pn_skb_send(sk, skb, &pn->remote_pep);
299}
300
301static int pipe_handler_enable_pipe(struct sock *sk, int enable)
302{
303 int utid, req;
304
305 if (enable) {
306 utid = PNS_PIPE_ENABLE_UTID;
307 req = PNS_PEP_ENABLE_REQ;
308 } else {
309 utid = PNS_PIPE_DISABLE_UTID;
310 req = PNS_PEP_DISABLE_REQ;
311 }
312 return pipe_handler_send_req(sk, utid, req, GFP_ATOMIC);
313}
314#endif
315
112static int pep_accept_conn(struct sock *sk, struct sk_buff *skb) 316static int pep_accept_conn(struct sock *sk, struct sk_buff *skb)
113{ 317{
114 static const u8 data[20] = { 318 static const u8 data[20] = {
@@ -192,7 +396,11 @@ static int pipe_snd_status(struct sock *sk, u8 type, u8 status, gfp_t priority)
192 ph->data[3] = PAD; 396 ph->data[3] = PAD;
193 ph->data[4] = status; 397 ph->data[4] = status;
194 398
399#ifdef CONFIG_PHONET_PIPECTRLR
400 return pn_skb_send(sk, skb, &pn->remote_pep);
401#else
195 return pn_skb_send(sk, skb, &pipe_srv); 402 return pn_skb_send(sk, skb, &pipe_srv);
403#endif
196} 404}
197 405
198/* Send our RX flow control information to the sender. 406/* Send our RX flow control information to the sender.
@@ -225,12 +433,13 @@ static void pipe_grant_credits(struct sock *sk)
225static int pipe_rcv_status(struct sock *sk, struct sk_buff *skb) 433static int pipe_rcv_status(struct sock *sk, struct sk_buff *skb)
226{ 434{
227 struct pep_sock *pn = pep_sk(sk); 435 struct pep_sock *pn = pep_sk(sk);
228 struct pnpipehdr *hdr = pnp_hdr(skb); 436 struct pnpipehdr *hdr;
229 int wake = 0; 437 int wake = 0;
230 438
231 if (!pskb_may_pull(skb, sizeof(*hdr) + 4)) 439 if (!pskb_may_pull(skb, sizeof(*hdr) + 4))
232 return -EINVAL; 440 return -EINVAL;
233 441
442 hdr = pnp_hdr(skb);
234 if (hdr->data[0] != PN_PEP_TYPE_COMMON) { 443 if (hdr->data[0] != PN_PEP_TYPE_COMMON) {
235 LIMIT_NETDEBUG(KERN_DEBUG"Phonet unknown PEP type: %u\n", 444 LIMIT_NETDEBUG(KERN_DEBUG"Phonet unknown PEP type: %u\n",
236 (unsigned)hdr->data[0]); 445 (unsigned)hdr->data[0]);
@@ -323,11 +532,35 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
323 sk->sk_state_change(sk); 532 sk->sk_state_change(sk);
324 break; 533 break;
325 534
535#ifdef CONFIG_PHONET_PIPECTRLR
536 case PNS_PEP_DISCONNECT_RESP:
537 pn->pipe_state = PIPE_IDLE;
538 sk->sk_state = TCP_CLOSE;
539 break;
540#endif
541
326 case PNS_PEP_ENABLE_REQ: 542 case PNS_PEP_ENABLE_REQ:
327 /* Wait for PNS_PIPE_(ENABLED|REDIRECTED)_IND */ 543 /* Wait for PNS_PIPE_(ENABLED|REDIRECTED)_IND */
328 pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC); 544 pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
329 break; 545 break;
330 546
547#ifdef CONFIG_PHONET_PIPECTRLR
548 case PNS_PEP_ENABLE_RESP:
549 pn->pipe_state = PIPE_ENABLED;
550 pipe_handler_send_ind(sk, PNS_PIPE_ENABLED_IND_UTID,
551 PNS_PIPE_ENABLED_IND);
552
553 if (!pn_flow_safe(pn->tx_fc)) {
554 atomic_set(&pn->tx_credits, 1);
555 sk->sk_write_space(sk);
556 }
557 if (sk->sk_state == TCP_ESTABLISHED)
558 break; /* Nothing to do */
559 sk->sk_state = TCP_ESTABLISHED;
560 pipe_grant_credits(sk);
561 break;
562#endif
563
331 case PNS_PEP_RESET_REQ: 564 case PNS_PEP_RESET_REQ:
332 switch (hdr->state_after_reset) { 565 switch (hdr->state_after_reset) {
333 case PN_PIPE_DISABLE: 566 case PN_PIPE_DISABLE:
@@ -346,6 +579,17 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
346 pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC); 579 pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
347 break; 580 break;
348 581
582#ifdef CONFIG_PHONET_PIPECTRLR
583 case PNS_PEP_DISABLE_RESP:
584 pn->pipe_state = PIPE_DISABLED;
585 atomic_set(&pn->tx_credits, 0);
586 pipe_handler_send_ind(sk, PNS_PIPE_DISABLED_IND_UTID,
587 PNS_PIPE_DISABLED_IND);
588 sk->sk_state = TCP_SYN_RECV;
589 pn->rx_credits = 0;
590 break;
591#endif
592
349 case PNS_PEP_CTRL_REQ: 593 case PNS_PEP_CTRL_REQ:
350 if (skb_queue_len(&pn->ctrlreq_queue) >= PNPIPE_CTRLREQ_MAX) { 594 if (skb_queue_len(&pn->ctrlreq_queue) >= PNPIPE_CTRLREQ_MAX) {
351 atomic_inc(&sk->sk_drops); 595 atomic_inc(&sk->sk_drops);
@@ -437,6 +681,42 @@ static void pipe_destruct(struct sock *sk)
437 skb_queue_purge(&pn->ctrlreq_queue); 681 skb_queue_purge(&pn->ctrlreq_queue);
438} 682}
439 683
684#ifdef CONFIG_PHONET_PIPECTRLR
685static int pep_connresp_rcv(struct sock *sk, struct sk_buff *skb)
686{
687 struct pep_sock *pn = pep_sk(sk);
688 u8 host_pref_rx_fc[3] = {3, 2, 1}, host_req_tx_fc[3] = {3, 2, 1};
689 u8 remote_pref_rx_fc[3], remote_req_tx_fc[3];
690 u8 negotiated_rx_fc, negotiated_tx_fc;
691 int ret;
692
693 pipe_get_flow_info(sk, skb, remote_pref_rx_fc,
694 remote_req_tx_fc);
695 negotiated_tx_fc = pipe_negotiate_fc(remote_req_tx_fc,
696 host_pref_rx_fc,
697 sizeof(host_pref_rx_fc));
698 negotiated_rx_fc = pipe_negotiate_fc(host_req_tx_fc,
699 remote_pref_rx_fc,
700 sizeof(host_pref_rx_fc));
701
702 pn->pipe_state = PIPE_DISABLED;
703 sk->sk_state = TCP_SYN_RECV;
704 sk->sk_backlog_rcv = pipe_do_rcv;
705 sk->sk_destruct = pipe_destruct;
706 pn->rx_credits = 0;
707 pn->rx_fc = negotiated_rx_fc;
708 pn->tx_fc = negotiated_tx_fc;
709 sk->sk_state_change(sk);
710
711 ret = pipe_handler_send_created_ind(sk,
712 PNS_PIPE_CREATED_IND_UTID,
713 PNS_PIPE_CREATED_IND
714 );
715
716 return ret;
717}
718#endif
719
440static int pep_connreq_rcv(struct sock *sk, struct sk_buff *skb) 720static int pep_connreq_rcv(struct sock *sk, struct sk_buff *skb)
441{ 721{
442 struct sock *newsk; 722 struct sock *newsk;
@@ -600,6 +880,12 @@ static int pep_do_rcv(struct sock *sk, struct sk_buff *skb)
600 err = pep_connreq_rcv(sk, skb); 880 err = pep_connreq_rcv(sk, skb);
601 break; 881 break;
602 882
883#ifdef CONFIG_PHONET_PIPECTRLR
884 case PNS_PEP_CONNECT_RESP:
885 err = pep_connresp_rcv(sk, skb);
886 break;
887#endif
888
603 case PNS_PEP_DISCONNECT_REQ: 889 case PNS_PEP_DISCONNECT_REQ:
604 pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC); 890 pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
605 break; 891 break;
@@ -620,6 +906,28 @@ drop:
620 return err; 906 return err;
621} 907}
622 908
909static int pipe_do_remove(struct sock *sk)
910{
911 struct pep_sock *pn = pep_sk(sk);
912 struct pnpipehdr *ph;
913 struct sk_buff *skb;
914
915 skb = alloc_skb(MAX_PNPIPE_HEADER, GFP_KERNEL);
916 if (!skb)
917 return -ENOMEM;
918
919 skb_reserve(skb, MAX_PNPIPE_HEADER);
920 __skb_push(skb, sizeof(*ph));
921 skb_reset_transport_header(skb);
922 ph = pnp_hdr(skb);
923 ph->utid = 0;
924 ph->message_id = PNS_PIPE_REMOVE_REQ;
925 ph->pipe_handle = pn->pipe_handle;
926 ph->data[0] = PAD;
927
928 return pn_skb_send(sk, skb, &pipe_srv);
929}
930
623/* associated socket ceases to exist */ 931/* associated socket ceases to exist */
624static void pep_sock_close(struct sock *sk, long timeout) 932static void pep_sock_close(struct sock *sk, long timeout)
625{ 933{
@@ -638,7 +946,22 @@ static void pep_sock_close(struct sock *sk, long timeout)
638 sk_for_each_safe(sknode, p, n, &pn->ackq) 946 sk_for_each_safe(sknode, p, n, &pn->ackq)
639 sk_del_node_init(sknode); 947 sk_del_node_init(sknode);
640 sk->sk_state = TCP_CLOSE; 948 sk->sk_state = TCP_CLOSE;
949 } else if ((1 << sk->sk_state) & (TCPF_SYN_RECV|TCPF_ESTABLISHED))
950 /* Forcefully remove dangling Phonet pipe */
951 pipe_do_remove(sk);
952
953#ifdef CONFIG_PHONET_PIPECTRLR
954 if (pn->pipe_state != PIPE_IDLE) {
955 /* send pep disconnect request */
956 pipe_handler_send_req(sk,
957 PNS_PEP_DISCONNECT_UTID, PNS_PEP_DISCONNECT_REQ,
958 GFP_KERNEL);
959
960 pn->pipe_state = PIPE_IDLE;
961 sk->sk_state = TCP_CLOSE;
641 } 962 }
963#endif
964
642 ifindex = pn->ifindex; 965 ifindex = pn->ifindex;
643 pn->ifindex = 0; 966 pn->ifindex = 0;
644 release_sock(sk); 967 release_sock(sk);
@@ -715,6 +1038,20 @@ out:
715 return newsk; 1038 return newsk;
716} 1039}
717 1040
1041#ifdef CONFIG_PHONET_PIPECTRLR
1042static int pep_sock_connect(struct sock *sk, struct sockaddr *addr, int len)
1043{
1044 struct pep_sock *pn = pep_sk(sk);
1045 struct sockaddr_pn *spn = (struct sockaddr_pn *)addr;
1046
1047 memcpy(&pn->remote_pep, spn, sizeof(struct sockaddr_pn));
1048
1049 return pipe_handler_send_req(sk,
1050 PNS_PEP_CONNECT_UTID, PNS_PEP_CONNECT_REQ,
1051 GFP_ATOMIC);
1052}
1053#endif
1054
718static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg) 1055static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg)
719{ 1056{
720 struct pep_sock *pn = pep_sk(sk); 1057 struct pep_sock *pn = pep_sk(sk);
@@ -766,6 +1103,18 @@ static int pep_setsockopt(struct sock *sk, int level, int optname,
766 1103
767 lock_sock(sk); 1104 lock_sock(sk);
768 switch (optname) { 1105 switch (optname) {
1106#ifdef CONFIG_PHONET_PIPECTRLR
1107 case PNPIPE_PIPE_HANDLE:
1108 if (val) {
1109 if (pn->pipe_state > PIPE_IDLE) {
1110 err = -EFAULT;
1111 break;
1112 }
1113 pn->pipe_handle = val;
1114 break;
1115 }
1116#endif
1117
769 case PNPIPE_ENCAP: 1118 case PNPIPE_ENCAP:
770 if (val && val != PNPIPE_ENCAP_IP) { 1119 if (val && val != PNPIPE_ENCAP_IP) {
771 err = -EINVAL; 1120 err = -EINVAL;
@@ -791,6 +1140,17 @@ static int pep_setsockopt(struct sock *sk, int level, int optname,
791 err = 0; 1140 err = 0;
792 } 1141 }
793 goto out_norel; 1142 goto out_norel;
1143
1144#ifdef CONFIG_PHONET_PIPECTRLR
1145 case PNPIPE_ENABLE:
1146 if (pn->pipe_state <= PIPE_IDLE) {
1147 err = -ENOTCONN;
1148 break;
1149 }
1150 err = pipe_handler_enable_pipe(sk, val);
1151 break;
1152#endif
1153
794 default: 1154 default:
795 err = -ENOPROTOOPT; 1155 err = -ENOPROTOOPT;
796 } 1156 }
@@ -815,9 +1175,19 @@ static int pep_getsockopt(struct sock *sk, int level, int optname,
815 case PNPIPE_ENCAP: 1175 case PNPIPE_ENCAP:
816 val = pn->ifindex ? PNPIPE_ENCAP_IP : PNPIPE_ENCAP_NONE; 1176 val = pn->ifindex ? PNPIPE_ENCAP_IP : PNPIPE_ENCAP_NONE;
817 break; 1177 break;
1178
818 case PNPIPE_IFINDEX: 1179 case PNPIPE_IFINDEX:
819 val = pn->ifindex; 1180 val = pn->ifindex;
820 break; 1181 break;
1182
1183#ifdef CONFIG_PHONET_PIPECTRLR
1184 case PNPIPE_ENABLE:
1185 if (pn->pipe_state <= PIPE_IDLE)
1186 return -ENOTCONN;
1187 val = pn->pipe_state != PIPE_DISABLED;
1188 break;
1189#endif
1190
821 default: 1191 default:
822 return -ENOPROTOOPT; 1192 return -ENOPROTOOPT;
823 } 1193 }
@@ -834,6 +1204,7 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb)
834{ 1204{
835 struct pep_sock *pn = pep_sk(sk); 1205 struct pep_sock *pn = pep_sk(sk);
836 struct pnpipehdr *ph; 1206 struct pnpipehdr *ph;
1207 int err;
837 1208
838 if (pn_flow_safe(pn->tx_fc) && 1209 if (pn_flow_safe(pn->tx_fc) &&
839 !atomic_add_unless(&pn->tx_credits, -1, 0)) { 1210 !atomic_add_unless(&pn->tx_credits, -1, 0)) {
@@ -851,8 +1222,16 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb)
851 } else 1222 } else
852 ph->message_id = PNS_PIPE_DATA; 1223 ph->message_id = PNS_PIPE_DATA;
853 ph->pipe_handle = pn->pipe_handle; 1224 ph->pipe_handle = pn->pipe_handle;
1225#ifdef CONFIG_PHONET_PIPECTRLR
1226 err = pn_skb_send(sk, skb, &pn->remote_pep);
1227#else
1228 err = pn_skb_send(sk, skb, &pipe_srv);
1229#endif
1230
1231 if (err && pn_flow_safe(pn->tx_fc))
1232 atomic_inc(&pn->tx_credits);
1233 return err;
854 1234
855 return pn_skb_send(sk, skb, &pipe_srv);
856} 1235}
857 1236
858static int pep_sendmsg(struct kiocb *iocb, struct sock *sk, 1237static int pep_sendmsg(struct kiocb *iocb, struct sock *sk,
@@ -872,7 +1251,7 @@ static int pep_sendmsg(struct kiocb *iocb, struct sock *sk,
872 skb = sock_alloc_send_skb(sk, MAX_PNPIPE_HEADER + len, 1251 skb = sock_alloc_send_skb(sk, MAX_PNPIPE_HEADER + len,
873 flags & MSG_DONTWAIT, &err); 1252 flags & MSG_DONTWAIT, &err);
874 if (!skb) 1253 if (!skb)
875 return -ENOBUFS; 1254 return err;
876 1255
877 skb_reserve(skb, MAX_PHONET_HEADER + 3); 1256 skb_reserve(skb, MAX_PHONET_HEADER + 3);
878 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); 1257 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
@@ -1044,6 +1423,8 @@ static void pep_sock_unhash(struct sock *sk)
1044 struct sock *skparent = NULL; 1423 struct sock *skparent = NULL;
1045 1424
1046 lock_sock(sk); 1425 lock_sock(sk);
1426
1427#ifndef CONFIG_PHONET_PIPECTRLR
1047 if ((1 << sk->sk_state) & ~(TCPF_CLOSE|TCPF_LISTEN)) { 1428 if ((1 << sk->sk_state) & ~(TCPF_CLOSE|TCPF_LISTEN)) {
1048 skparent = pn->listener; 1429 skparent = pn->listener;
1049 release_sock(sk); 1430 release_sock(sk);
@@ -1053,6 +1434,7 @@ static void pep_sock_unhash(struct sock *sk)
1053 sk_del_node_init(sk); 1434 sk_del_node_init(sk);
1054 sk = skparent; 1435 sk = skparent;
1055 } 1436 }
1437#endif
1056 /* Unhash a listening sock only when it is closed 1438 /* Unhash a listening sock only when it is closed
1057 * and all of its active connected pipes are closed. */ 1439 * and all of its active connected pipes are closed. */
1058 if (hlist_empty(&pn->hlist)) 1440 if (hlist_empty(&pn->hlist))
@@ -1066,6 +1448,9 @@ static void pep_sock_unhash(struct sock *sk)
1066static struct proto pep_proto = { 1448static struct proto pep_proto = {
1067 .close = pep_sock_close, 1449 .close = pep_sock_close,
1068 .accept = pep_sock_accept, 1450 .accept = pep_sock_accept,
1451#ifdef CONFIG_PHONET_PIPECTRLR
1452 .connect = pep_sock_connect,
1453#endif
1069 .ioctl = pep_ioctl, 1454 .ioctl = pep_ioctl,
1070 .init = pep_init, 1455 .init = pep_init,
1071 .setsockopt = pep_setsockopt, 1456 .setsockopt = pep_setsockopt,
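With CONFIG_PHONET_PIPECTRLR the PEP socket gains a connect() path plus the PNPIPE_PIPE_HANDLE and PNPIPE_ENABLE options handled above. A rough userspace sketch of the intended sequence, pieced together from the hunks: the option names and the handle-before-connect ordering come from the code, while the handle value, the remote address and the minimal error handling are assumptions:

    /* Sketch only: open and enable a pipe towards a remote PEP. */
    #include <stdint.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <linux/phonet.h>

    int ex_open_pipe(uint8_t remote_dev, uint8_t remote_obj)
    {
            struct sockaddr_pn spn;
            int handle = 1, on = 1;         /* made-up pipe handle */
            int fd = socket(AF_PHONET, SOCK_SEQPACKET, PN_PROTO_PIPE);

            if (fd < 0)
                    return -1;

            /* pick a handle while the pipe is idle (PNPIPE_PIPE_HANDLE case above) */
            setsockopt(fd, SOL_PNPIPE, PNPIPE_PIPE_HANDLE, &handle, sizeof(handle));

            memset(&spn, 0, sizeof(spn));
            spn.spn_family = AF_PHONET;
            spn.spn_dev = remote_dev;       /* remote PEP address, assumed values */
            spn.spn_obj = remote_obj;
            if (connect(fd, (struct sockaddr *)&spn, sizeof(spn)) < 0)
                    return -1;

            /* start data transfer once the pipe exists (PNPIPE_ENABLE case above) */
            setsockopt(fd, SOL_PNPIPE, PNPIPE_ENABLE, &on, sizeof(on));
            return fd;
    }
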
diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c
index b18e48fae975..947038ddd04c 100644
--- a/net/phonet/pn_dev.c
+++ b/net/phonet/pn_dev.c
@@ -292,8 +292,7 @@ static void phonet_route_autodel(struct net_device *dev)
292 if (bitmap_empty(deleted, 64)) 292 if (bitmap_empty(deleted, 64))
293 return; /* short-circuit RCU */ 293 return; /* short-circuit RCU */
294 synchronize_rcu(); 294 synchronize_rcu();
295 for (i = find_first_bit(deleted, 64); i < 64; 295 for_each_set_bit(i, deleted, 64) {
296 i = find_next_bit(deleted, 64, i + 1)) {
297 rtm_phonet_notify(RTM_DELROUTE, dev, i); 296 rtm_phonet_notify(RTM_DELROUTE, dev, i);
298 dev_put(dev); 297 dev_put(dev);
299 } 298 }
@@ -374,6 +373,7 @@ int __init phonet_device_init(void)
374 if (err) 373 if (err)
375 return err; 374 return err;
376 375
376 proc_net_fops_create(&init_net, "pnresource", 0, &pn_res_seq_fops);
377 register_netdevice_notifier(&phonet_device_notifier); 377 register_netdevice_notifier(&phonet_device_notifier);
378 err = phonet_netlink_register(); 378 err = phonet_netlink_register();
379 if (err) 379 if (err)
@@ -386,6 +386,7 @@ void phonet_device_exit(void)
386 rtnl_unregister_all(PF_PHONET); 386 rtnl_unregister_all(PF_PHONET);
387 unregister_netdevice_notifier(&phonet_device_notifier); 387 unregister_netdevice_notifier(&phonet_device_notifier);
388 unregister_pernet_device(&phonet_net_ops); 388 unregister_pernet_device(&phonet_net_ops);
389 proc_net_remove(&init_net, "pnresource");
389} 390}
390 391
391int phonet_route_add(struct net_device *dev, u8 daddr) 392int phonet_route_add(struct net_device *dev, u8 daddr)
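The pn_dev.c hunk above swaps the open-coded find_first_bit()/find_next_bit() loop for the equivalent for_each_set_bit() iterator. A tiny self-contained sketch of the idiom on a hypothetical bitmap:

    #include <linux/bitmap.h>
    #include <linux/bitops.h>
    #include <linux/kernel.h>

    /* Sketch only: visit just the set bits of a 64-bit bitmap. */
    static void ex_walk_deleted(void)
    {
            DECLARE_BITMAP(deleted, 64);
            unsigned int i;

            bitmap_zero(deleted, 64);
            set_bit(3, deleted);
            set_bit(42, deleted);

            for_each_set_bit(i, deleted, 64)
                    pr_info("route %u was deleted\n", i);   /* prints 3, then 42 */
    }
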
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index 6e9848bf0370..25f746d20c1f 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -158,6 +158,7 @@ void pn_sock_unhash(struct sock *sk)
158 spin_lock_bh(&pnsocks.lock); 158 spin_lock_bh(&pnsocks.lock);
159 sk_del_node_init(sk); 159 sk_del_node_init(sk);
160 spin_unlock_bh(&pnsocks.lock); 160 spin_unlock_bh(&pnsocks.lock);
161 pn_sock_unbind_all_res(sk);
161} 162}
162EXPORT_SYMBOL(pn_sock_unhash); 163EXPORT_SYMBOL(pn_sock_unhash);
163 164
@@ -224,6 +225,101 @@ static int pn_socket_autobind(struct socket *sock)
224 return 0; /* socket was already bound */ 225 return 0; /* socket was already bound */
225} 226}
226 227
228#ifdef CONFIG_PHONET_PIPECTRLR
229static int pn_socket_connect(struct socket *sock, struct sockaddr *addr,
230 int len, int flags)
231{
232 struct sock *sk = sock->sk;
233 struct sockaddr_pn *spn = (struct sockaddr_pn *)addr;
234 long timeo;
235 int err;
236
237 if (len < sizeof(struct sockaddr_pn))
238 return -EINVAL;
239 if (spn->spn_family != AF_PHONET)
240 return -EAFNOSUPPORT;
241
242 lock_sock(sk);
243
244 switch (sock->state) {
245 case SS_UNCONNECTED:
246 sk->sk_state = TCP_CLOSE;
247 break;
248 case SS_CONNECTING:
249 switch (sk->sk_state) {
250 case TCP_SYN_RECV:
251 sock->state = SS_CONNECTED;
252 err = -EISCONN;
253 goto out;
254 case TCP_CLOSE:
255 err = -EALREADY;
256 if (flags & O_NONBLOCK)
257 goto out;
258 goto wait_connect;
259 }
260 break;
261 case SS_CONNECTED:
262 switch (sk->sk_state) {
263 case TCP_SYN_RECV:
264 err = -EISCONN;
265 goto out;
266 case TCP_CLOSE:
267 sock->state = SS_UNCONNECTED;
268 break;
269 }
270 break;
271 case SS_DISCONNECTING:
272 case SS_FREE:
273 break;
274 }
275 sk->sk_state = TCP_CLOSE;
276 sk_stream_kill_queues(sk);
277
278 sock->state = SS_CONNECTING;
279 err = sk->sk_prot->connect(sk, addr, len);
280 if (err < 0) {
281 sock->state = SS_UNCONNECTED;
282 sk->sk_state = TCP_CLOSE;
283 goto out;
284 }
285
286 err = -EINPROGRESS;
287wait_connect:
288 if (sk->sk_state != TCP_SYN_RECV && (flags & O_NONBLOCK))
289 goto out;
290
291 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
292 release_sock(sk);
293
294 err = -ERESTARTSYS;
295 timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
296 sk->sk_state != TCP_CLOSE,
297 timeo);
298
299 lock_sock(sk);
300 if (timeo < 0)
301 goto out; /* -ERESTARTSYS */
302
303 err = -ETIMEDOUT;
304 if (timeo == 0 && sk->sk_state != TCP_SYN_RECV)
305 goto out;
306
307 if (sk->sk_state != TCP_SYN_RECV) {
308 sock->state = SS_UNCONNECTED;
309 err = sock_error(sk);
310 if (!err)
311 err = -ECONNREFUSED;
312 goto out;
313 }
314 sock->state = SS_CONNECTED;
315 err = 0;
316
317out:
318 release_sock(sk);
319 return err;
320}
321#endif
322
227static int pn_socket_accept(struct socket *sock, struct socket *newsock, 323static int pn_socket_accept(struct socket *sock, struct socket *newsock,
228 int flags) 324 int flags)
229{ 325{
@@ -281,7 +377,9 @@ static unsigned int pn_socket_poll(struct file *file, struct socket *sock,
281 if (!mask && sk->sk_state == TCP_CLOSE_WAIT) 377 if (!mask && sk->sk_state == TCP_CLOSE_WAIT)
282 return POLLHUP; 378 return POLLHUP;
283 379
284 if (sk->sk_state == TCP_ESTABLISHED && atomic_read(&pn->tx_credits)) 380 if (sk->sk_state == TCP_ESTABLISHED &&
381 atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf &&
382 atomic_read(&pn->tx_credits))
285 mask |= POLLOUT | POLLWRNORM | POLLWRBAND; 383 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
286 384
287 return mask; 385 return mask;
@@ -390,7 +488,11 @@ const struct proto_ops phonet_stream_ops = {
390 .owner = THIS_MODULE, 488 .owner = THIS_MODULE,
391 .release = pn_socket_release, 489 .release = pn_socket_release,
392 .bind = pn_socket_bind, 490 .bind = pn_socket_bind,
491#ifdef CONFIG_PHONET_PIPECTRLR
492 .connect = pn_socket_connect,
493#else
393 .connect = sock_no_connect, 494 .connect = sock_no_connect,
495#endif
394 .socketpair = sock_no_socketpair, 496 .socketpair = sock_no_socketpair,
395 .accept = pn_socket_accept, 497 .accept = pn_socket_accept,
396 .getname = pn_socket_getname, 498 .getname = pn_socket_getname,
@@ -563,3 +665,188 @@ const struct file_operations pn_sock_seq_fops = {
563 .release = seq_release_net, 665 .release = seq_release_net,
564}; 666};
565#endif 667#endif
668
669static struct {
670 struct sock *sk[256];
671} pnres;
672
673/*
674 * Find and hold socket based on resource.
675 */
676struct sock *pn_find_sock_by_res(struct net *net, u8 res)
677{
678 struct sock *sk;
679
680 if (!net_eq(net, &init_net))
681 return NULL;
682
683 rcu_read_lock();
684 sk = rcu_dereference(pnres.sk[res]);
685 if (sk)
686 sock_hold(sk);
687 rcu_read_unlock();
688 return sk;
689}
690
691static DEFINE_MUTEX(resource_mutex);
692
693int pn_sock_bind_res(struct sock *sk, u8 res)
694{
695 int ret = -EADDRINUSE;
696
697 if (!net_eq(sock_net(sk), &init_net))
698 return -ENOIOCTLCMD;
699 if (!capable(CAP_SYS_ADMIN))
700 return -EPERM;
701 if (pn_socket_autobind(sk->sk_socket))
702 return -EAGAIN;
703
704 mutex_lock(&resource_mutex);
705 if (pnres.sk[res] == NULL) {
706 sock_hold(sk);
707 rcu_assign_pointer(pnres.sk[res], sk);
708 ret = 0;
709 }
710 mutex_unlock(&resource_mutex);
711 return ret;
712}
713
714int pn_sock_unbind_res(struct sock *sk, u8 res)
715{
716 int ret = -ENOENT;
717
718 if (!capable(CAP_SYS_ADMIN))
719 return -EPERM;
720
721 mutex_lock(&resource_mutex);
722 if (pnres.sk[res] == sk) {
723 rcu_assign_pointer(pnres.sk[res], NULL);
724 ret = 0;
725 }
726 mutex_unlock(&resource_mutex);
727
728 if (ret == 0) {
729 synchronize_rcu();
730 sock_put(sk);
731 }
732 return ret;
733}
734
735void pn_sock_unbind_all_res(struct sock *sk)
736{
737 unsigned res, match = 0;
738
739 mutex_lock(&resource_mutex);
740 for (res = 0; res < 256; res++) {
741 if (pnres.sk[res] == sk) {
742 rcu_assign_pointer(pnres.sk[res], NULL);
743 match++;
744 }
745 }
746 mutex_unlock(&resource_mutex);
747
748 if (match == 0)
749 return;
750 synchronize_rcu();
751 while (match > 0) {
752 sock_put(sk);
753 match--;
754 }
755}
756
757#ifdef CONFIG_PROC_FS
758static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos)
759{
760 struct net *net = seq_file_net(seq);
761 unsigned i;
762
763 if (!net_eq(net, &init_net))
764 return NULL;
765
766 for (i = 0; i < 256; i++) {
767 if (pnres.sk[i] == NULL)
768 continue;
769 if (!pos)
770 return pnres.sk + i;
771 pos--;
772 }
773 return NULL;
774}
775
776static struct sock **pn_res_get_next(struct seq_file *seq, struct sock **sk)
777{
778 struct net *net = seq_file_net(seq);
779 unsigned i;
780
781 BUG_ON(!net_eq(net, &init_net));
782
783 for (i = (sk - pnres.sk) + 1; i < 256; i++)
784 if (pnres.sk[i])
785 return pnres.sk + i;
786 return NULL;
787}
788
789static void *pn_res_seq_start(struct seq_file *seq, loff_t *pos)
790 __acquires(resource_mutex)
791{
792 mutex_lock(&resource_mutex);
793 return *pos ? pn_res_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
794}
795
796static void *pn_res_seq_next(struct seq_file *seq, void *v, loff_t *pos)
797{
798 struct sock **sk;
799
800 if (v == SEQ_START_TOKEN)
801 sk = pn_res_get_idx(seq, 0);
802 else
803 sk = pn_res_get_next(seq, v);
804 (*pos)++;
805 return sk;
806}
807
808static void pn_res_seq_stop(struct seq_file *seq, void *v)
809 __releases(resource_mutex)
810{
811 mutex_unlock(&resource_mutex);
812}
813
814static int pn_res_seq_show(struct seq_file *seq, void *v)
815{
816 int len;
817
818 if (v == SEQ_START_TOKEN)
819 seq_printf(seq, "%s%n", "rs uid inode", &len);
820 else {
821 struct sock **psk = v;
822 struct sock *sk = *psk;
823
824 seq_printf(seq, "%02X %5d %lu%n",
825 (int) (psk - pnres.sk), sock_i_uid(sk),
826 sock_i_ino(sk), &len);
827 }
828 seq_printf(seq, "%*s\n", 63 - len, "");
829 return 0;
830}
831
832static const struct seq_operations pn_res_seq_ops = {
833 .start = pn_res_seq_start,
834 .next = pn_res_seq_next,
835 .stop = pn_res_seq_stop,
836 .show = pn_res_seq_show,
837};
838
839static int pn_res_open(struct inode *inode, struct file *file)
840{
841 return seq_open_net(inode, file, &pn_res_seq_ops,
842 sizeof(struct seq_net_private));
843}
844
845const struct file_operations pn_res_seq_fops = {
846 .owner = THIS_MODULE,
847 .open = pn_res_open,
848 .read = seq_read,
849 .llseek = seq_lseek,
850 .release = seq_release_net,
851};
852#endif
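
The resource table above is a textbook RCU reader/writer split: lookups run under rcu_read_lock() and pin the socket with sock_hold(), while bind/unbind serialize on resource_mutex and wait out a grace period before dropping the table's reference. The sketch below is illustrative only (it is not part of the kernel sources shown here); it reduces the pattern to a single slot, assumes a kernel build context, and all example_* names are hypothetical.

#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <net/sock.h>

static struct sock __rcu *example_slot;
static DEFINE_MUTEX(example_mutex);

/* reader: RCU only, take a reference before leaving the read side */
static struct sock *example_lookup(void)
{
	struct sock *sk;

	rcu_read_lock();
	sk = rcu_dereference(example_slot);
	if (sk)
		sock_hold(sk);
	rcu_read_unlock();
	return sk;
}

/* writer: serialized on the mutex, publish with rcu_assign_pointer() */
static int example_bind(struct sock *sk)
{
	int ret = -EADDRINUSE;

	mutex_lock(&example_mutex);
	if (!rcu_access_pointer(example_slot)) {
		sock_hold(sk);
		rcu_assign_pointer(example_slot, sk);
		ret = 0;
	}
	mutex_unlock(&example_mutex);
	return ret;
}

/* writer: unpublish, wait for readers, then drop the table's reference */
static void example_unbind(struct sock *sk)
{
	bool matched = false;

	mutex_lock(&example_mutex);
	if (rcu_access_pointer(example_slot) == sk) {
		rcu_assign_pointer(example_slot, NULL);
		matched = true;
	}
	mutex_unlock(&example_mutex);

	if (matched) {
		synchronize_rcu();
		sock_put(sk);
	}
}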
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index aebfecbdb841..bb6ad81b671d 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -39,7 +39,15 @@
39#include <net/sock.h> 39#include <net/sock.h>
40 40
41#include "rds.h" 41#include "rds.h"
42#include "rdma.h" 42
43char *rds_str_array(char **array, size_t elements, size_t index)
44{
45 if ((index < elements) && array[index])
46 return array[index];
47 else
48 return "unknown";
49}
50EXPORT_SYMBOL(rds_str_array);
43 51
44/* this is just used for stats gathering :/ */ 52/* this is just used for stats gathering :/ */
45static DEFINE_SPINLOCK(rds_sock_lock); 53static DEFINE_SPINLOCK(rds_sock_lock);
@@ -62,7 +70,7 @@ static int rds_release(struct socket *sock)
62 struct rds_sock *rs; 70 struct rds_sock *rs;
63 unsigned long flags; 71 unsigned long flags;
64 72
65 if (sk == NULL) 73 if (!sk)
66 goto out; 74 goto out;
67 75
68 rs = rds_sk_to_rs(sk); 76 rs = rds_sk_to_rs(sk);
@@ -73,7 +81,15 @@ static int rds_release(struct socket *sock)
73 * with the socket. */ 81 * with the socket. */
74 rds_clear_recv_queue(rs); 82 rds_clear_recv_queue(rs);
75 rds_cong_remove_socket(rs); 83 rds_cong_remove_socket(rs);
84
85 /*
86 * the binding lookup hash uses rcu, we need to
 87 * make sure we synchronize_rcu before we free our
88 * entry
89 */
76 rds_remove_bound(rs); 90 rds_remove_bound(rs);
91 synchronize_rcu();
92
77 rds_send_drop_to(rs, NULL); 93 rds_send_drop_to(rs, NULL);
78 rds_rdma_drop_keys(rs); 94 rds_rdma_drop_keys(rs);
79 rds_notify_queue_get(rs, NULL); 95 rds_notify_queue_get(rs, NULL);
@@ -83,6 +99,8 @@ static int rds_release(struct socket *sock)
83 rds_sock_count--; 99 rds_sock_count--;
84 spin_unlock_irqrestore(&rds_sock_lock, flags); 100 spin_unlock_irqrestore(&rds_sock_lock, flags);
85 101
102 rds_trans_put(rs->rs_transport);
103
86 sock->sk = NULL; 104 sock->sk = NULL;
87 sock_put(sk); 105 sock_put(sk);
88out: 106out:
@@ -514,7 +532,7 @@ out:
514 spin_unlock_irqrestore(&rds_sock_lock, flags); 532 spin_unlock_irqrestore(&rds_sock_lock, flags);
515} 533}
516 534
517static void __exit rds_exit(void) 535static void rds_exit(void)
518{ 536{
519 sock_unregister(rds_family_ops.family); 537 sock_unregister(rds_family_ops.family);
520 proto_unregister(&rds_proto); 538 proto_unregister(&rds_proto);
@@ -529,7 +547,7 @@ static void __exit rds_exit(void)
529} 547}
530module_exit(rds_exit); 548module_exit(rds_exit);
531 549
532static int __init rds_init(void) 550static int rds_init(void)
533{ 551{
534 int ret; 552 int ret;
535 553
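
The new synchronize_rcu() in rds_release() exists because the bind table (reworked in bind.c below) is now walked under RCU: the socket's bound entry must not be reused until every lookup that might still see it has finished. A minimal sketch of that ordering follows; it is not the patch itself, the ex_* names are hypothetical, and it assumes a kernel build context.

#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct ex_bound {
	struct hlist_node node;
	/* bound address, port, owning socket, ... */
};

static void ex_unbind_and_free(struct ex_bound *b, spinlock_t *lock)
{
	spin_lock(lock);
	hlist_del_init_rcu(&b->node);	/* new lookups can no longer find it */
	spin_unlock(lock);

	synchronize_rcu();		/* lookups already in flight have drained */
	kfree(b);			/* only now is the memory safe to reuse */
}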
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 5d95fc007f1a..2f6b3fcc79f8 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -34,45 +34,52 @@
34#include <net/sock.h> 34#include <net/sock.h>
35#include <linux/in.h> 35#include <linux/in.h>
36#include <linux/if_arp.h> 36#include <linux/if_arp.h>
37#include <linux/jhash.h>
37#include "rds.h" 38#include "rds.h"
38 39
39/* 40#define BIND_HASH_SIZE 1024
40 * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't 41static struct hlist_head bind_hash_table[BIND_HASH_SIZE];
41 * particularly zippy.
42 *
43 * This is now called for every incoming frame so we arguably care much more
44 * about it than we used to.
45 */
46static DEFINE_SPINLOCK(rds_bind_lock); 42static DEFINE_SPINLOCK(rds_bind_lock);
47static struct rb_root rds_bind_tree = RB_ROOT;
48 43
49static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, 44static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port)
50 struct rds_sock *insert) 45{
46 return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) &
47 (BIND_HASH_SIZE - 1));
48}
49
50static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port,
51 struct rds_sock *insert)
51{ 52{
52 struct rb_node **p = &rds_bind_tree.rb_node;
53 struct rb_node *parent = NULL;
54 struct rds_sock *rs; 53 struct rds_sock *rs;
54 struct hlist_node *node;
55 struct hlist_head *head = hash_to_bucket(addr, port);
55 u64 cmp; 56 u64 cmp;
56 u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); 57 u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
57 58
58 while (*p) { 59 rcu_read_lock();
59 parent = *p; 60 hlist_for_each_entry_rcu(rs, node, head, rs_bound_node) {
60 rs = rb_entry(parent, struct rds_sock, rs_bound_node);
61
62 cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | 61 cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
63 be16_to_cpu(rs->rs_bound_port); 62 be16_to_cpu(rs->rs_bound_port);
64 63
65 if (needle < cmp) 64 if (cmp == needle) {
66 p = &(*p)->rb_left; 65 rcu_read_unlock();
67 else if (needle > cmp)
68 p = &(*p)->rb_right;
69 else
70 return rs; 66 return rs;
67 }
71 } 68 }
69 rcu_read_unlock();
72 70
73 if (insert) { 71 if (insert) {
74 rb_link_node(&insert->rs_bound_node, parent, p); 72 /*
75 rb_insert_color(&insert->rs_bound_node, &rds_bind_tree); 73 * make sure our addr and port are set before
74 * we are added to the list, other people
75 * in rcu will find us as soon as the
76 * hlist_add_head_rcu is done
77 */
78 insert->rs_bound_addr = addr;
79 insert->rs_bound_port = port;
80 rds_sock_addref(insert);
81
82 hlist_add_head_rcu(&insert->rs_bound_node, head);
76 } 83 }
77 return NULL; 84 return NULL;
78} 85}
@@ -86,15 +93,13 @@ static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
86struct rds_sock *rds_find_bound(__be32 addr, __be16 port) 93struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
87{ 94{
88 struct rds_sock *rs; 95 struct rds_sock *rs;
89 unsigned long flags;
90 96
91 spin_lock_irqsave(&rds_bind_lock, flags); 97 rs = rds_bind_lookup(addr, port, NULL);
92 rs = rds_bind_tree_walk(addr, port, NULL); 98
93 if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) 99 if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
94 rds_sock_addref(rs); 100 rds_sock_addref(rs);
95 else 101 else
96 rs = NULL; 102 rs = NULL;
97 spin_unlock_irqrestore(&rds_bind_lock, flags);
98 103
99 rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, 104 rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
100 ntohs(port)); 105 ntohs(port));
@@ -121,22 +126,15 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
121 do { 126 do {
122 if (rover == 0) 127 if (rover == 0)
123 rover++; 128 rover++;
124 if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) { 129 if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) {
125 *port = cpu_to_be16(rover); 130 *port = rs->rs_bound_port;
126 ret = 0; 131 ret = 0;
132 rdsdebug("rs %p binding to %pI4:%d\n",
133 rs, &addr, (int)ntohs(*port));
127 break; 134 break;
128 } 135 }
129 } while (rover++ != last); 136 } while (rover++ != last);
130 137
131 if (ret == 0) {
132 rs->rs_bound_addr = addr;
133 rs->rs_bound_port = *port;
134 rds_sock_addref(rs);
135
136 rdsdebug("rs %p binding to %pI4:%d\n",
137 rs, &addr, (int)ntohs(*port));
138 }
139
140 spin_unlock_irqrestore(&rds_bind_lock, flags); 138 spin_unlock_irqrestore(&rds_bind_lock, flags);
141 139
142 return ret; 140 return ret;
@@ -153,7 +151,7 @@ void rds_remove_bound(struct rds_sock *rs)
153 rs, &rs->rs_bound_addr, 151 rs, &rs->rs_bound_addr,
154 ntohs(rs->rs_bound_port)); 152 ntohs(rs->rs_bound_port));
155 153
156 rb_erase(&rs->rs_bound_node, &rds_bind_tree); 154 hlist_del_init_rcu(&rs->rs_bound_node);
157 rds_sock_put(rs); 155 rds_sock_put(rs);
158 rs->rs_bound_addr = 0; 156 rs->rs_bound_addr = 0;
159 } 157 }
@@ -184,7 +182,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
184 goto out; 182 goto out;
185 183
186 trans = rds_trans_get_preferred(sin->sin_addr.s_addr); 184 trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
187 if (trans == NULL) { 185 if (!trans) {
188 ret = -EADDRNOTAVAIL; 186 ret = -EADDRNOTAVAIL;
189 rds_remove_bound(rs); 187 rds_remove_bound(rs);
190 if (printk_ratelimit()) 188 if (printk_ratelimit())
@@ -198,5 +196,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
198 196
199out: 197out:
200 release_sock(sk); 198 release_sock(sk);
199
200 /* we might have called rds_remove_bound on error */
201 if (ret)
202 synchronize_rcu();
201 return ret; 203 return ret;
202} 204}
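
bind.c replaces the rbtree (walked under a spinlock for every incoming frame) with a jhash-indexed hlist whose readers only take rcu_read_lock(), so the per-packet lookup no longer contends with binds. Below is an illustrative reduction of that shape, not the patch itself; the ex_* names are hypothetical and the hlist_for_each_entry_rcu() form with an explicit hlist_node cursor matches the 2.6.37-era API used above.

#include <linux/jhash.h>
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/types.h>

#define EX_HASH_SIZE 1024

struct ex_entry {
	struct hlist_node node;
	__be32 addr;
	__be16 port;
};

static struct hlist_head ex_table[EX_HASH_SIZE];
static DEFINE_SPINLOCK(ex_lock);

static struct hlist_head *ex_bucket(__be32 addr, __be16 port)
{
	return ex_table + (jhash_2words((u32)addr, (u32)port, 0) &
			   (EX_HASH_SIZE - 1));
}

/* reader: lockless bucket walk under RCU */
static struct ex_entry *ex_lookup(__be32 addr, __be16 port)
{
	struct hlist_head *head = ex_bucket(addr, port);
	struct hlist_node *pos;
	struct ex_entry *e;

	rcu_read_lock();
	hlist_for_each_entry_rcu(e, pos, head, node) {
		if (e->addr == addr && e->port == port) {
			/* a real user takes a reference here, inside the
			 * read-side section, before returning the entry */
			rcu_read_unlock();
			return e;
		}
	}
	rcu_read_unlock();
	return NULL;
}

/* writer: fields must be settled before the entry becomes visible */
static void ex_insert(struct ex_entry *e)
{
	spin_lock(&ex_lock);
	hlist_add_head_rcu(&e->node, ex_bucket(e->addr, e->port));
	spin_unlock(&ex_lock);
}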
diff --git a/net/rds/cong.c b/net/rds/cong.c
index 0871a29f0780..75ea686f27d5 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -141,7 +141,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
141 unsigned long flags; 141 unsigned long flags;
142 142
143 map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); 143 map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
144 if (map == NULL) 144 if (!map)
145 return NULL; 145 return NULL;
146 146
147 map->m_addr = addr; 147 map->m_addr = addr;
@@ -159,7 +159,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
159 ret = rds_cong_tree_walk(addr, map); 159 ret = rds_cong_tree_walk(addr, map);
160 spin_unlock_irqrestore(&rds_cong_lock, flags); 160 spin_unlock_irqrestore(&rds_cong_lock, flags);
161 161
162 if (ret == NULL) { 162 if (!ret) {
163 ret = map; 163 ret = map;
164 map = NULL; 164 map = NULL;
165 } 165 }
@@ -205,7 +205,7 @@ int rds_cong_get_maps(struct rds_connection *conn)
205 conn->c_lcong = rds_cong_from_addr(conn->c_laddr); 205 conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
206 conn->c_fcong = rds_cong_from_addr(conn->c_faddr); 206 conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
207 207
208 if (conn->c_lcong == NULL || conn->c_fcong == NULL) 208 if (!(conn->c_lcong && conn->c_fcong))
209 return -ENOMEM; 209 return -ENOMEM;
210 210
211 return 0; 211 return 0;
@@ -221,7 +221,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
221 list_for_each_entry(conn, &map->m_conn_list, c_map_item) { 221 list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
222 if (!test_and_set_bit(0, &conn->c_map_queued)) { 222 if (!test_and_set_bit(0, &conn->c_map_queued)) {
223 rds_stats_inc(s_cong_update_queued); 223 rds_stats_inc(s_cong_update_queued);
224 queue_delayed_work(rds_wq, &conn->c_send_w, 0); 224 rds_send_xmit(conn);
225 } 225 }
226 } 226 }
227 227
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 7619b671ca28..9334d892366e 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -37,7 +37,6 @@
37 37
38#include "rds.h" 38#include "rds.h"
39#include "loop.h" 39#include "loop.h"
40#include "rdma.h"
41 40
42#define RDS_CONNECTION_HASH_BITS 12 41#define RDS_CONNECTION_HASH_BITS 12
43#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) 42#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
@@ -63,18 +62,7 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
63 var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ 62 var |= RDS_INFO_CONNECTION_FLAG_##suffix; \
64} while (0) 63} while (0)
65 64
66static inline int rds_conn_is_sending(struct rds_connection *conn) 65/* rcu read lock must be held or the connection spinlock */
67{
68 int ret = 0;
69
70 if (!mutex_trylock(&conn->c_send_lock))
71 ret = 1;
72 else
73 mutex_unlock(&conn->c_send_lock);
74
75 return ret;
76}
77
78static struct rds_connection *rds_conn_lookup(struct hlist_head *head, 66static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
79 __be32 laddr, __be32 faddr, 67 __be32 laddr, __be32 faddr,
80 struct rds_transport *trans) 68 struct rds_transport *trans)
@@ -82,7 +70,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
82 struct rds_connection *conn, *ret = NULL; 70 struct rds_connection *conn, *ret = NULL;
83 struct hlist_node *pos; 71 struct hlist_node *pos;
84 72
85 hlist_for_each_entry(conn, pos, head, c_hash_node) { 73 hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
86 if (conn->c_faddr == faddr && conn->c_laddr == laddr && 74 if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
87 conn->c_trans == trans) { 75 conn->c_trans == trans) {
88 ret = conn; 76 ret = conn;
@@ -100,7 +88,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
100 * and receiving over this connection again in the future. It is up to 88 * and receiving over this connection again in the future. It is up to
101 * the transport to have serialized this call with its send and recv. 89 * the transport to have serialized this call with its send and recv.
102 */ 90 */
103void rds_conn_reset(struct rds_connection *conn) 91static void rds_conn_reset(struct rds_connection *conn)
104{ 92{
105 rdsdebug("connection %pI4 to %pI4 reset\n", 93 rdsdebug("connection %pI4 to %pI4 reset\n",
106 &conn->c_laddr, &conn->c_faddr); 94 &conn->c_laddr, &conn->c_faddr);
@@ -129,10 +117,11 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
129{ 117{
130 struct rds_connection *conn, *parent = NULL; 118 struct rds_connection *conn, *parent = NULL;
131 struct hlist_head *head = rds_conn_bucket(laddr, faddr); 119 struct hlist_head *head = rds_conn_bucket(laddr, faddr);
120 struct rds_transport *loop_trans;
132 unsigned long flags; 121 unsigned long flags;
133 int ret; 122 int ret;
134 123
135 spin_lock_irqsave(&rds_conn_lock, flags); 124 rcu_read_lock();
136 conn = rds_conn_lookup(head, laddr, faddr, trans); 125 conn = rds_conn_lookup(head, laddr, faddr, trans);
137 if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && 126 if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
138 !is_outgoing) { 127 !is_outgoing) {
@@ -143,12 +132,12 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
143 parent = conn; 132 parent = conn;
144 conn = parent->c_passive; 133 conn = parent->c_passive;
145 } 134 }
146 spin_unlock_irqrestore(&rds_conn_lock, flags); 135 rcu_read_unlock();
147 if (conn) 136 if (conn)
148 goto out; 137 goto out;
149 138
150 conn = kmem_cache_zalloc(rds_conn_slab, gfp); 139 conn = kmem_cache_zalloc(rds_conn_slab, gfp);
151 if (conn == NULL) { 140 if (!conn) {
152 conn = ERR_PTR(-ENOMEM); 141 conn = ERR_PTR(-ENOMEM);
153 goto out; 142 goto out;
154 } 143 }
@@ -159,7 +148,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
159 spin_lock_init(&conn->c_lock); 148 spin_lock_init(&conn->c_lock);
160 conn->c_next_tx_seq = 1; 149 conn->c_next_tx_seq = 1;
161 150
162 mutex_init(&conn->c_send_lock); 151 init_waitqueue_head(&conn->c_waitq);
163 INIT_LIST_HEAD(&conn->c_send_queue); 152 INIT_LIST_HEAD(&conn->c_send_queue);
164 INIT_LIST_HEAD(&conn->c_retrans); 153 INIT_LIST_HEAD(&conn->c_retrans);
165 154
@@ -175,7 +164,9 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
175 * can bind to the destination address then we'd rather the messages 164 * can bind to the destination address then we'd rather the messages
176 * flow through loopback rather than either transport. 165 * flow through loopback rather than either transport.
177 */ 166 */
178 if (rds_trans_get_preferred(faddr)) { 167 loop_trans = rds_trans_get_preferred(faddr);
168 if (loop_trans) {
169 rds_trans_put(loop_trans);
179 conn->c_loopback = 1; 170 conn->c_loopback = 1;
180 if (is_outgoing && trans->t_prefer_loopback) { 171 if (is_outgoing && trans->t_prefer_loopback) {
181 /* "outgoing" connection - and the transport 172 /* "outgoing" connection - and the transport
@@ -238,7 +229,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
238 kmem_cache_free(rds_conn_slab, conn); 229 kmem_cache_free(rds_conn_slab, conn);
239 conn = found; 230 conn = found;
240 } else { 231 } else {
241 hlist_add_head(&conn->c_hash_node, head); 232 hlist_add_head_rcu(&conn->c_hash_node, head);
242 rds_cong_add_conn(conn); 233 rds_cong_add_conn(conn);
243 rds_conn_count++; 234 rds_conn_count++;
244 } 235 }
@@ -263,21 +254,91 @@ struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
263} 254}
264EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); 255EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
265 256
257void rds_conn_shutdown(struct rds_connection *conn)
258{
259 /* shut it down unless it's down already */
260 if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
261 /*
262 * Quiesce the connection mgmt handlers before we start tearing
263 * things down. We don't hold the mutex for the entire
264 * duration of the shutdown operation, else we may be
265 * deadlocking with the CM handler. Instead, the CM event
266 * handler is supposed to check for state DISCONNECTING
267 */
268 mutex_lock(&conn->c_cm_lock);
269 if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
270 && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
271 rds_conn_error(conn, "shutdown called in state %d\n",
272 atomic_read(&conn->c_state));
273 mutex_unlock(&conn->c_cm_lock);
274 return;
275 }
276 mutex_unlock(&conn->c_cm_lock);
277
278 wait_event(conn->c_waitq,
279 !test_bit(RDS_IN_XMIT, &conn->c_flags));
280
281 conn->c_trans->conn_shutdown(conn);
282 rds_conn_reset(conn);
283
284 if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
285 /* This can happen - eg when we're in the middle of tearing
286 * down the connection, and someone unloads the rds module.
 287 * Quite reproducible with loopback connections.
288 * Mostly harmless.
289 */
290 rds_conn_error(conn,
291 "%s: failed to transition to state DOWN, "
292 "current state is %d\n",
293 __func__,
294 atomic_read(&conn->c_state));
295 return;
296 }
297 }
298
299 /* Then reconnect if it's still live.
300 * The passive side of an IB loopback connection is never added
301 * to the conn hash, so we never trigger a reconnect on this
302 * conn - the reconnect is always triggered by the active peer. */
303 cancel_delayed_work_sync(&conn->c_conn_w);
304 rcu_read_lock();
305 if (!hlist_unhashed(&conn->c_hash_node)) {
306 rcu_read_unlock();
307 rds_queue_reconnect(conn);
308 } else {
309 rcu_read_unlock();
310 }
311}
312
313/*
314 * Stop and free a connection.
315 *
316 * This can only be used in very limited circumstances. It assumes that once
317 * the conn has been shutdown that no one else is referencing the connection.
318 * We can only ensure this in the rmmod path in the current code.
319 */
266void rds_conn_destroy(struct rds_connection *conn) 320void rds_conn_destroy(struct rds_connection *conn)
267{ 321{
268 struct rds_message *rm, *rtmp; 322 struct rds_message *rm, *rtmp;
323 unsigned long flags;
269 324
270 rdsdebug("freeing conn %p for %pI4 -> " 325 rdsdebug("freeing conn %p for %pI4 -> "
271 "%pI4\n", conn, &conn->c_laddr, 326 "%pI4\n", conn, &conn->c_laddr,
272 &conn->c_faddr); 327 &conn->c_faddr);
273 328
274 hlist_del_init(&conn->c_hash_node); 329 /* Ensure conn will not be scheduled for reconnect */
330 spin_lock_irq(&rds_conn_lock);
331 hlist_del_init_rcu(&conn->c_hash_node);
332 spin_unlock_irq(&rds_conn_lock);
333 synchronize_rcu();
275 334
276 /* wait for the rds thread to shut it down */ 335 /* shut the connection down */
277 atomic_set(&conn->c_state, RDS_CONN_ERROR); 336 rds_conn_drop(conn);
278 cancel_delayed_work(&conn->c_conn_w); 337 flush_work(&conn->c_down_w);
279 queue_work(rds_wq, &conn->c_down_w); 338
280 flush_workqueue(rds_wq); 339 /* make sure lingering queued work won't try to ref the conn */
340 cancel_delayed_work_sync(&conn->c_send_w);
341 cancel_delayed_work_sync(&conn->c_recv_w);
281 342
282 /* tear down queued messages */ 343 /* tear down queued messages */
283 list_for_each_entry_safe(rm, rtmp, 344 list_for_each_entry_safe(rm, rtmp,
@@ -302,7 +363,9 @@ void rds_conn_destroy(struct rds_connection *conn)
302 BUG_ON(!list_empty(&conn->c_retrans)); 363 BUG_ON(!list_empty(&conn->c_retrans));
303 kmem_cache_free(rds_conn_slab, conn); 364 kmem_cache_free(rds_conn_slab, conn);
304 365
366 spin_lock_irqsave(&rds_conn_lock, flags);
305 rds_conn_count--; 367 rds_conn_count--;
368 spin_unlock_irqrestore(&rds_conn_lock, flags);
306} 369}
307EXPORT_SYMBOL_GPL(rds_conn_destroy); 370EXPORT_SYMBOL_GPL(rds_conn_destroy);
308 371
@@ -316,23 +379,23 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
316 struct list_head *list; 379 struct list_head *list;
317 struct rds_connection *conn; 380 struct rds_connection *conn;
318 struct rds_message *rm; 381 struct rds_message *rm;
319 unsigned long flags;
320 unsigned int total = 0; 382 unsigned int total = 0;
383 unsigned long flags;
321 size_t i; 384 size_t i;
322 385
323 len /= sizeof(struct rds_info_message); 386 len /= sizeof(struct rds_info_message);
324 387
325 spin_lock_irqsave(&rds_conn_lock, flags); 388 rcu_read_lock();
326 389
327 for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); 390 for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
328 i++, head++) { 391 i++, head++) {
329 hlist_for_each_entry(conn, pos, head, c_hash_node) { 392 hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
330 if (want_send) 393 if (want_send)
331 list = &conn->c_send_queue; 394 list = &conn->c_send_queue;
332 else 395 else
333 list = &conn->c_retrans; 396 list = &conn->c_retrans;
334 397
335 spin_lock(&conn->c_lock); 398 spin_lock_irqsave(&conn->c_lock, flags);
336 399
337 /* XXX too lazy to maintain counts.. */ 400 /* XXX too lazy to maintain counts.. */
338 list_for_each_entry(rm, list, m_conn_item) { 401 list_for_each_entry(rm, list, m_conn_item) {
@@ -343,11 +406,10 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
343 conn->c_faddr, 0); 406 conn->c_faddr, 0);
344 } 407 }
345 408
346 spin_unlock(&conn->c_lock); 409 spin_unlock_irqrestore(&conn->c_lock, flags);
347 } 410 }
348 } 411 }
349 412 rcu_read_unlock();
350 spin_unlock_irqrestore(&rds_conn_lock, flags);
351 413
352 lens->nr = total; 414 lens->nr = total;
353 lens->each = sizeof(struct rds_info_message); 415 lens->each = sizeof(struct rds_info_message);
@@ -377,19 +439,17 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
377 uint64_t buffer[(item_len + 7) / 8]; 439 uint64_t buffer[(item_len + 7) / 8];
378 struct hlist_head *head; 440 struct hlist_head *head;
379 struct hlist_node *pos; 441 struct hlist_node *pos;
380 struct hlist_node *tmp;
381 struct rds_connection *conn; 442 struct rds_connection *conn;
382 unsigned long flags;
383 size_t i; 443 size_t i;
384 444
385 spin_lock_irqsave(&rds_conn_lock, flags); 445 rcu_read_lock();
386 446
387 lens->nr = 0; 447 lens->nr = 0;
388 lens->each = item_len; 448 lens->each = item_len;
389 449
390 for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); 450 for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
391 i++, head++) { 451 i++, head++) {
392 hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) { 452 hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
393 453
394 /* XXX no c_lock usage.. */ 454 /* XXX no c_lock usage.. */
395 if (!visitor(conn, buffer)) 455 if (!visitor(conn, buffer))
@@ -405,8 +465,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
405 lens->nr++; 465 lens->nr++;
406 } 466 }
407 } 467 }
408 468 rcu_read_unlock();
409 spin_unlock_irqrestore(&rds_conn_lock, flags);
410} 469}
411EXPORT_SYMBOL_GPL(rds_for_each_conn_info); 470EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
412 471
@@ -423,8 +482,8 @@ static int rds_conn_info_visitor(struct rds_connection *conn,
423 sizeof(cinfo->transport)); 482 sizeof(cinfo->transport));
424 cinfo->flags = 0; 483 cinfo->flags = 0;
425 484
426 rds_conn_info_set(cinfo->flags, 485 rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags),
427 rds_conn_is_sending(conn), SENDING); 486 SENDING);
428 /* XXX Future: return the state rather than these funky bits */ 487 /* XXX Future: return the state rather than these funky bits */
429 rds_conn_info_set(cinfo->flags, 488 rds_conn_info_set(cinfo->flags,
430 atomic_read(&conn->c_state) == RDS_CONN_CONNECTING, 489 atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
@@ -444,12 +503,12 @@ static void rds_conn_info(struct socket *sock, unsigned int len,
444 sizeof(struct rds_info_connection)); 503 sizeof(struct rds_info_connection));
445} 504}
446 505
447int __init rds_conn_init(void) 506int rds_conn_init(void)
448{ 507{
449 rds_conn_slab = kmem_cache_create("rds_connection", 508 rds_conn_slab = kmem_cache_create("rds_connection",
450 sizeof(struct rds_connection), 509 sizeof(struct rds_connection),
451 0, 0, NULL); 510 0, 0, NULL);
452 if (rds_conn_slab == NULL) 511 if (!rds_conn_slab)
453 return -ENOMEM; 512 return -ENOMEM;
454 513
455 rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); 514 rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
@@ -487,6 +546,18 @@ void rds_conn_drop(struct rds_connection *conn)
487EXPORT_SYMBOL_GPL(rds_conn_drop); 546EXPORT_SYMBOL_GPL(rds_conn_drop);
488 547
489/* 548/*
549 * If the connection is down, trigger a connect. We may have scheduled a
550 * delayed reconnect however - in this case we should not interfere.
551 */
552void rds_conn_connect_if_down(struct rds_connection *conn)
553{
554 if (rds_conn_state(conn) == RDS_CONN_DOWN &&
555 !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
556 queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
557}
558EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
559
560/*
490 * An error occurred on the connection 561 * An error occurred on the connection
491 */ 562 */
492void 563void
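
rds_conn_connect_if_down() above leans on test_and_set_bit(RDS_RECONNECT_PENDING, ...) so that many callers can race to kick a down connection yet only one reconnect work item is ever queued. A minimal sketch of that idiom follows; it is not the patch, the ex_* names are hypothetical, and it assumes a kernel build context.

#include <linux/bitops.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

#define EX_RECONNECT_PENDING 0

struct ex_conn {
	unsigned long flags;
	struct delayed_work conn_w;
};

/* may be called from many paths; only the first caller queues the work */
static void ex_connect_if_down(struct ex_conn *conn, struct workqueue_struct *wq)
{
	if (!test_and_set_bit(EX_RECONNECT_PENDING, &conn->flags))
		queue_delayed_work(wq, &conn->conn_w, 0);
}

static void ex_connect_worker(struct work_struct *work)
{
	struct ex_conn *conn = container_of(work, struct ex_conn, conn_w.work);

	/* ... attempt the (re)connect ... */
	clear_bit_unlock(EX_RECONNECT_PENDING, &conn->flags);
}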
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 8f2d6dd7700a..4123967d4d65 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -42,7 +42,7 @@
42#include "rds.h" 42#include "rds.h"
43#include "ib.h" 43#include "ib.h"
44 44
45unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; 45static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
46unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */ 46unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
47unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; 47unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
48 48
@@ -53,13 +53,72 @@ MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
53module_param(rds_ib_retry_count, int, 0444); 53module_param(rds_ib_retry_count, int, 0444);
54MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); 54MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
55 55
56/*
57 * we have a clumsy combination of RCU and a rwsem protecting this list
58 * because it is used both in the get_mr fast path and while blocking in
59 * the FMR flushing path.
60 */
61DECLARE_RWSEM(rds_ib_devices_lock);
56struct list_head rds_ib_devices; 62struct list_head rds_ib_devices;
57 63
58/* NOTE: if also grabbing ibdev lock, grab this first */ 64/* NOTE: if also grabbing ibdev lock, grab this first */
59DEFINE_SPINLOCK(ib_nodev_conns_lock); 65DEFINE_SPINLOCK(ib_nodev_conns_lock);
60LIST_HEAD(ib_nodev_conns); 66LIST_HEAD(ib_nodev_conns);
61 67
62void rds_ib_add_one(struct ib_device *device) 68static void rds_ib_nodev_connect(void)
69{
70 struct rds_ib_connection *ic;
71
72 spin_lock(&ib_nodev_conns_lock);
73 list_for_each_entry(ic, &ib_nodev_conns, ib_node)
74 rds_conn_connect_if_down(ic->conn);
75 spin_unlock(&ib_nodev_conns_lock);
76}
77
78static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
79{
80 struct rds_ib_connection *ic;
81 unsigned long flags;
82
83 spin_lock_irqsave(&rds_ibdev->spinlock, flags);
84 list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
85 rds_conn_drop(ic->conn);
86 spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
87}
88
89/*
90 * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
 91 * from interrupt context so we push freeing off into a work struct in krdsd.
92 */
93static void rds_ib_dev_free(struct work_struct *work)
94{
95 struct rds_ib_ipaddr *i_ipaddr, *i_next;
96 struct rds_ib_device *rds_ibdev = container_of(work,
97 struct rds_ib_device, free_work);
98
99 if (rds_ibdev->mr_pool)
100 rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
101 if (rds_ibdev->mr)
102 ib_dereg_mr(rds_ibdev->mr);
103 if (rds_ibdev->pd)
104 ib_dealloc_pd(rds_ibdev->pd);
105
106 list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
107 list_del(&i_ipaddr->list);
108 kfree(i_ipaddr);
109 }
110
111 kfree(rds_ibdev);
112}
113
114void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
115{
116 BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
117 if (atomic_dec_and_test(&rds_ibdev->refcount))
118 queue_work(rds_wq, &rds_ibdev->free_work);
119}
120
121static void rds_ib_add_one(struct ib_device *device)
63{ 122{
64 struct rds_ib_device *rds_ibdev; 123 struct rds_ib_device *rds_ibdev;
65 struct ib_device_attr *dev_attr; 124 struct ib_device_attr *dev_attr;
@@ -77,11 +136,14 @@ void rds_ib_add_one(struct ib_device *device)
77 goto free_attr; 136 goto free_attr;
78 } 137 }
79 138
80 rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL); 139 rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
140 ibdev_to_node(device));
81 if (!rds_ibdev) 141 if (!rds_ibdev)
82 goto free_attr; 142 goto free_attr;
83 143
84 spin_lock_init(&rds_ibdev->spinlock); 144 spin_lock_init(&rds_ibdev->spinlock);
145 atomic_set(&rds_ibdev->refcount, 1);
146 INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
85 147
86 rds_ibdev->max_wrs = dev_attr->max_qp_wr; 148 rds_ibdev->max_wrs = dev_attr->max_qp_wr;
87 rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); 149 rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
@@ -91,68 +153,107 @@ void rds_ib_add_one(struct ib_device *device)
91 min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : 153 min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
92 fmr_pool_size; 154 fmr_pool_size;
93 155
156 rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
157 rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
158
94 rds_ibdev->dev = device; 159 rds_ibdev->dev = device;
95 rds_ibdev->pd = ib_alloc_pd(device); 160 rds_ibdev->pd = ib_alloc_pd(device);
96 if (IS_ERR(rds_ibdev->pd)) 161 if (IS_ERR(rds_ibdev->pd)) {
97 goto free_dev; 162 rds_ibdev->pd = NULL;
163 goto put_dev;
164 }
98 165
99 rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, 166 rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
100 IB_ACCESS_LOCAL_WRITE); 167 if (IS_ERR(rds_ibdev->mr)) {
101 if (IS_ERR(rds_ibdev->mr)) 168 rds_ibdev->mr = NULL;
102 goto err_pd; 169 goto put_dev;
170 }
103 171
104 rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); 172 rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
105 if (IS_ERR(rds_ibdev->mr_pool)) { 173 if (IS_ERR(rds_ibdev->mr_pool)) {
106 rds_ibdev->mr_pool = NULL; 174 rds_ibdev->mr_pool = NULL;
107 goto err_mr; 175 goto put_dev;
108 } 176 }
109 177
110 INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); 178 INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
111 INIT_LIST_HEAD(&rds_ibdev->conn_list); 179 INIT_LIST_HEAD(&rds_ibdev->conn_list);
112 list_add_tail(&rds_ibdev->list, &rds_ib_devices); 180
181 down_write(&rds_ib_devices_lock);
182 list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
183 up_write(&rds_ib_devices_lock);
184 atomic_inc(&rds_ibdev->refcount);
113 185
114 ib_set_client_data(device, &rds_ib_client, rds_ibdev); 186 ib_set_client_data(device, &rds_ib_client, rds_ibdev);
187 atomic_inc(&rds_ibdev->refcount);
115 188
116 goto free_attr; 189 rds_ib_nodev_connect();
117 190
118err_mr: 191put_dev:
119 ib_dereg_mr(rds_ibdev->mr); 192 rds_ib_dev_put(rds_ibdev);
120err_pd:
121 ib_dealloc_pd(rds_ibdev->pd);
122free_dev:
123 kfree(rds_ibdev);
124free_attr: 193free_attr:
125 kfree(dev_attr); 194 kfree(dev_attr);
126} 195}
127 196
128void rds_ib_remove_one(struct ib_device *device) 197/*
198 * New connections use this to find the device to associate with the
199 * connection. It's not in the fast path so we're not concerned about the
200 * performance of the IB call. (As of this writing, it uses an interrupt
201 * blocking spinlock to serialize walking a per-device list of all registered
202 * clients.)
203 *
204 * RCU is used to handle incoming connections racing with device teardown.
205 * Rather than use a lock to serialize removal from the client_data and
206 * getting a new reference, we use an RCU grace period. The destruction
207 * path removes the device from client_data and then waits for all RCU
208 * readers to finish.
209 *
 210 * A new connection can get NULL from this if it's arriving on a
211 * device that is in the process of being removed.
212 */
213struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
129{ 214{
130 struct rds_ib_device *rds_ibdev; 215 struct rds_ib_device *rds_ibdev;
131 struct rds_ib_ipaddr *i_ipaddr, *i_next;
132 216
217 rcu_read_lock();
133 rds_ibdev = ib_get_client_data(device, &rds_ib_client); 218 rds_ibdev = ib_get_client_data(device, &rds_ib_client);
134 if (!rds_ibdev) 219 if (rds_ibdev)
135 return; 220 atomic_inc(&rds_ibdev->refcount);
221 rcu_read_unlock();
222 return rds_ibdev;
223}
136 224
137 list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { 225/*
138 list_del(&i_ipaddr->list); 226 * The IB stack is letting us know that a device is going away. This can
139 kfree(i_ipaddr); 227 * happen if the underlying HCA driver is removed or if PCI hotplug is removing
140 } 228 * the pci function, for example.
229 *
230 * This can be called at any time and can be racing with any other RDS path.
231 */
232static void rds_ib_remove_one(struct ib_device *device)
233{
234 struct rds_ib_device *rds_ibdev;
141 235
142 rds_ib_destroy_conns(rds_ibdev); 236 rds_ibdev = ib_get_client_data(device, &rds_ib_client);
237 if (!rds_ibdev)
238 return;
143 239
144 if (rds_ibdev->mr_pool) 240 rds_ib_dev_shutdown(rds_ibdev);
145 rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
146 241
147 ib_dereg_mr(rds_ibdev->mr); 242 /* stop connection attempts from getting a reference to this device. */
243 ib_set_client_data(device, &rds_ib_client, NULL);
148 244
149 while (ib_dealloc_pd(rds_ibdev->pd)) { 245 down_write(&rds_ib_devices_lock);
150 rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd); 246 list_del_rcu(&rds_ibdev->list);
151 msleep(1); 247 up_write(&rds_ib_devices_lock);
152 }
153 248
154 list_del(&rds_ibdev->list); 249 /*
155 kfree(rds_ibdev); 250 * This synchronize rcu is waiting for readers of both the ib
251 * client data and the devices list to finish before we drop
252 * both of those references.
253 */
254 synchronize_rcu();
255 rds_ib_dev_put(rds_ibdev);
256 rds_ib_dev_put(rds_ibdev);
156} 257}
157 258
158struct ib_client rds_ib_client = { 259struct ib_client rds_ib_client = {
@@ -186,7 +287,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
186 rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); 287 rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
187 rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); 288 rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
188 289
189 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); 290 rds_ibdev = ic->rds_ibdev;
190 iinfo->max_send_wr = ic->i_send_ring.w_nr; 291 iinfo->max_send_wr = ic->i_send_ring.w_nr;
191 iinfo->max_recv_wr = ic->i_recv_ring.w_nr; 292 iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
192 iinfo->max_send_sge = rds_ibdev->max_sge; 293 iinfo->max_send_sge = rds_ibdev->max_sge;
@@ -248,29 +349,36 @@ static int rds_ib_laddr_check(__be32 addr)
248 return ret; 349 return ret;
249} 350}
250 351
352static void rds_ib_unregister_client(void)
353{
354 ib_unregister_client(&rds_ib_client);
355 /* wait for rds_ib_dev_free() to complete */
356 flush_workqueue(rds_wq);
357}
358
251void rds_ib_exit(void) 359void rds_ib_exit(void)
252{ 360{
253 rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); 361 rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
362 rds_ib_unregister_client();
254 rds_ib_destroy_nodev_conns(); 363 rds_ib_destroy_nodev_conns();
255 ib_unregister_client(&rds_ib_client);
256 rds_ib_sysctl_exit(); 364 rds_ib_sysctl_exit();
257 rds_ib_recv_exit(); 365 rds_ib_recv_exit();
258 rds_trans_unregister(&rds_ib_transport); 366 rds_trans_unregister(&rds_ib_transport);
367 rds_ib_fmr_exit();
259} 368}
260 369
261struct rds_transport rds_ib_transport = { 370struct rds_transport rds_ib_transport = {
262 .laddr_check = rds_ib_laddr_check, 371 .laddr_check = rds_ib_laddr_check,
263 .xmit_complete = rds_ib_xmit_complete, 372 .xmit_complete = rds_ib_xmit_complete,
264 .xmit = rds_ib_xmit, 373 .xmit = rds_ib_xmit,
265 .xmit_cong_map = NULL,
266 .xmit_rdma = rds_ib_xmit_rdma, 374 .xmit_rdma = rds_ib_xmit_rdma,
375 .xmit_atomic = rds_ib_xmit_atomic,
267 .recv = rds_ib_recv, 376 .recv = rds_ib_recv,
268 .conn_alloc = rds_ib_conn_alloc, 377 .conn_alloc = rds_ib_conn_alloc,
269 .conn_free = rds_ib_conn_free, 378 .conn_free = rds_ib_conn_free,
270 .conn_connect = rds_ib_conn_connect, 379 .conn_connect = rds_ib_conn_connect,
271 .conn_shutdown = rds_ib_conn_shutdown, 380 .conn_shutdown = rds_ib_conn_shutdown,
272 .inc_copy_to_user = rds_ib_inc_copy_to_user, 381 .inc_copy_to_user = rds_ib_inc_copy_to_user,
273 .inc_purge = rds_ib_inc_purge,
274 .inc_free = rds_ib_inc_free, 382 .inc_free = rds_ib_inc_free,
275 .cm_initiate_connect = rds_ib_cm_initiate_connect, 383 .cm_initiate_connect = rds_ib_cm_initiate_connect,
276 .cm_handle_connect = rds_ib_cm_handle_connect, 384 .cm_handle_connect = rds_ib_cm_handle_connect,
@@ -286,16 +394,20 @@ struct rds_transport rds_ib_transport = {
286 .t_type = RDS_TRANS_IB 394 .t_type = RDS_TRANS_IB
287}; 395};
288 396
289int __init rds_ib_init(void) 397int rds_ib_init(void)
290{ 398{
291 int ret; 399 int ret;
292 400
293 INIT_LIST_HEAD(&rds_ib_devices); 401 INIT_LIST_HEAD(&rds_ib_devices);
294 402
295 ret = ib_register_client(&rds_ib_client); 403 ret = rds_ib_fmr_init();
296 if (ret) 404 if (ret)
297 goto out; 405 goto out;
298 406
407 ret = ib_register_client(&rds_ib_client);
408 if (ret)
409 goto out_fmr_exit;
410
299 ret = rds_ib_sysctl_init(); 411 ret = rds_ib_sysctl_init();
300 if (ret) 412 if (ret)
301 goto out_ibreg; 413 goto out_ibreg;
@@ -317,7 +429,9 @@ out_recv:
317out_sysctl: 429out_sysctl:
318 rds_ib_sysctl_exit(); 430 rds_ib_sysctl_exit();
319out_ibreg: 431out_ibreg:
320 ib_unregister_client(&rds_ib_client); 432 rds_ib_unregister_client();
433out_fmr_exit:
434 rds_ib_fmr_exit();
321out: 435out:
322 return ret; 436 return ret;
323} 437}
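
rds_ib_add_one()/rds_ib_remove_one() now manage the device with a plain refcount whose final put only queues free_work, because rds_ib_dev_free() blocks (destroying the FMR pool, deregistering the MR, deallocating the PD) and the last reference may be dropped from a context that must not sleep. The sketch below shows just that shape; it is not the patch, the ex_* names are hypothetical, and the includes assume a reasonably recent kernel tree.

#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct ex_dev {
	atomic_t refcount;
	struct work_struct free_work;
};

/* runs from a workqueue, so it is allowed to block */
static void ex_dev_free(struct work_struct *work)
{
	struct ex_dev *dev = container_of(work, struct ex_dev, free_work);

	/* tear down blocking resources here (MR pool, MR, PD, ...) */
	kfree(dev);
}

static struct ex_dev *ex_dev_alloc(void)
{
	struct ex_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);

	if (!dev)
		return NULL;
	atomic_set(&dev->refcount, 1);
	INIT_WORK(&dev->free_work, ex_dev_free);
	return dev;
}

/* the last put may come from atomic context, so defer the real free */
static void ex_dev_put(struct ex_dev *dev, struct workqueue_struct *wq)
{
	if (atomic_dec_and_test(&dev->refcount))
		queue_work(wq, &dev->free_work);
}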
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 64df4e79b29f..e34ad032b66d 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -3,11 +3,13 @@
3 3
4#include <rdma/ib_verbs.h> 4#include <rdma/ib_verbs.h>
5#include <rdma/rdma_cm.h> 5#include <rdma/rdma_cm.h>
6#include <linux/pci.h>
7#include <linux/slab.h>
6#include "rds.h" 8#include "rds.h"
7#include "rdma_transport.h" 9#include "rdma_transport.h"
8 10
9#define RDS_FMR_SIZE 256 11#define RDS_FMR_SIZE 256
10#define RDS_FMR_POOL_SIZE 4096 12#define RDS_FMR_POOL_SIZE 8192
11 13
12#define RDS_IB_MAX_SGE 8 14#define RDS_IB_MAX_SGE 8
13#define RDS_IB_RECV_SGE 2 15#define RDS_IB_RECV_SGE 2
@@ -19,6 +21,9 @@
19 21
20#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ 22#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
21 23
24#define RDS_IB_RECYCLE_BATCH_COUNT 32
25
26extern struct rw_semaphore rds_ib_devices_lock;
22extern struct list_head rds_ib_devices; 27extern struct list_head rds_ib_devices;
23 28
24/* 29/*
@@ -26,20 +31,29 @@ extern struct list_head rds_ib_devices;
26 * try and minimize the amount of memory tied up both the device and 31 * try and minimize the amount of memory tied up both the device and
27 * socket receive queues. 32 * socket receive queues.
28 */ 33 */
29/* page offset of the final full frag that fits in the page */
30#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
31struct rds_page_frag { 34struct rds_page_frag {
32 struct list_head f_item; 35 struct list_head f_item;
33 struct page *f_page; 36 struct list_head f_cache_entry;
34 unsigned long f_offset; 37 struct scatterlist f_sg;
35 dma_addr_t f_mapped;
36}; 38};
37 39
38struct rds_ib_incoming { 40struct rds_ib_incoming {
39 struct list_head ii_frags; 41 struct list_head ii_frags;
42 struct list_head ii_cache_entry;
40 struct rds_incoming ii_inc; 43 struct rds_incoming ii_inc;
41}; 44};
42 45
46struct rds_ib_cache_head {
47 struct list_head *first;
48 unsigned long count;
49};
50
51struct rds_ib_refill_cache {
52 struct rds_ib_cache_head *percpu;
53 struct list_head *xfer;
54 struct list_head *ready;
55};
56
43struct rds_ib_connect_private { 57struct rds_ib_connect_private {
44 /* Add new fields at the end, and don't permute existing fields. */ 58 /* Add new fields at the end, and don't permute existing fields. */
45 __be32 dp_saddr; 59 __be32 dp_saddr;
@@ -53,8 +67,7 @@ struct rds_ib_connect_private {
53}; 67};
54 68
55struct rds_ib_send_work { 69struct rds_ib_send_work {
56 struct rds_message *s_rm; 70 void *s_op;
57 struct rds_rdma_op *s_op;
58 struct ib_send_wr s_wr; 71 struct ib_send_wr s_wr;
59 struct ib_sge s_sge[RDS_IB_MAX_SGE]; 72 struct ib_sge s_sge[RDS_IB_MAX_SGE];
60 unsigned long s_queued; 73 unsigned long s_queued;
@@ -92,10 +105,11 @@ struct rds_ib_connection {
92 105
93 /* tx */ 106 /* tx */
94 struct rds_ib_work_ring i_send_ring; 107 struct rds_ib_work_ring i_send_ring;
95 struct rds_message *i_rm; 108 struct rm_data_op *i_data_op;
96 struct rds_header *i_send_hdrs; 109 struct rds_header *i_send_hdrs;
97 u64 i_send_hdrs_dma; 110 u64 i_send_hdrs_dma;
98 struct rds_ib_send_work *i_sends; 111 struct rds_ib_send_work *i_sends;
112 atomic_t i_signaled_sends;
99 113
100 /* rx */ 114 /* rx */
101 struct tasklet_struct i_recv_tasklet; 115 struct tasklet_struct i_recv_tasklet;
@@ -106,8 +120,9 @@ struct rds_ib_connection {
106 struct rds_header *i_recv_hdrs; 120 struct rds_header *i_recv_hdrs;
107 u64 i_recv_hdrs_dma; 121 u64 i_recv_hdrs_dma;
108 struct rds_ib_recv_work *i_recvs; 122 struct rds_ib_recv_work *i_recvs;
109 struct rds_page_frag i_frag;
110 u64 i_ack_recv; /* last ACK received */ 123 u64 i_ack_recv; /* last ACK received */
124 struct rds_ib_refill_cache i_cache_incs;
125 struct rds_ib_refill_cache i_cache_frags;
111 126
112 /* sending acks */ 127 /* sending acks */
113 unsigned long i_ack_flags; 128 unsigned long i_ack_flags;
@@ -138,7 +153,6 @@ struct rds_ib_connection {
138 153
139 /* Batched completions */ 154 /* Batched completions */
140 unsigned int i_unsignaled_wrs; 155 unsigned int i_unsignaled_wrs;
141 long i_unsignaled_bytes;
142}; 156};
143 157
144/* This assumes that atomic_t is at least 32 bits */ 158/* This assumes that atomic_t is at least 32 bits */
@@ -164,9 +178,17 @@ struct rds_ib_device {
164 unsigned int max_fmrs; 178 unsigned int max_fmrs;
165 int max_sge; 179 int max_sge;
166 unsigned int max_wrs; 180 unsigned int max_wrs;
181 unsigned int max_initiator_depth;
182 unsigned int max_responder_resources;
167 spinlock_t spinlock; /* protect the above */ 183 spinlock_t spinlock; /* protect the above */
184 atomic_t refcount;
185 struct work_struct free_work;
168}; 186};
169 187
188#define pcidev_to_node(pcidev) pcibus_to_node(pcidev->bus)
189#define ibdev_to_node(ibdev) pcidev_to_node(to_pci_dev(ibdev->dma_device))
190#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
191
170/* bits for i_ack_flags */ 192/* bits for i_ack_flags */
171#define IB_ACK_IN_FLIGHT 0 193#define IB_ACK_IN_FLIGHT 0
172#define IB_ACK_REQUESTED 1 194#define IB_ACK_REQUESTED 1
@@ -202,6 +224,8 @@ struct rds_ib_statistics {
202 uint64_t s_ib_rdma_mr_pool_flush; 224 uint64_t s_ib_rdma_mr_pool_flush;
203 uint64_t s_ib_rdma_mr_pool_wait; 225 uint64_t s_ib_rdma_mr_pool_wait;
204 uint64_t s_ib_rdma_mr_pool_depleted; 226 uint64_t s_ib_rdma_mr_pool_depleted;
227 uint64_t s_ib_atomic_cswp;
228 uint64_t s_ib_atomic_fadd;
205}; 229};
206 230
207extern struct workqueue_struct *rds_ib_wq; 231extern struct workqueue_struct *rds_ib_wq;
@@ -241,11 +265,10 @@ static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
241 265
242/* ib.c */ 266/* ib.c */
243extern struct rds_transport rds_ib_transport; 267extern struct rds_transport rds_ib_transport;
244extern void rds_ib_add_one(struct ib_device *device); 268struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
245extern void rds_ib_remove_one(struct ib_device *device); 269void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
246extern struct ib_client rds_ib_client; 270extern struct ib_client rds_ib_client;
247 271
248extern unsigned int fmr_pool_size;
249extern unsigned int fmr_message_size; 272extern unsigned int fmr_message_size;
250extern unsigned int rds_ib_retry_count; 273extern unsigned int rds_ib_retry_count;
251 274
@@ -258,7 +281,7 @@ void rds_ib_conn_free(void *arg);
258int rds_ib_conn_connect(struct rds_connection *conn); 281int rds_ib_conn_connect(struct rds_connection *conn);
259void rds_ib_conn_shutdown(struct rds_connection *conn); 282void rds_ib_conn_shutdown(struct rds_connection *conn);
260void rds_ib_state_change(struct sock *sk); 283void rds_ib_state_change(struct sock *sk);
261int __init rds_ib_listen_init(void); 284int rds_ib_listen_init(void);
262void rds_ib_listen_stop(void); 285void rds_ib_listen_stop(void);
263void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); 286void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
264int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, 287int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
@@ -275,15 +298,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn,
275int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); 298int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
276void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); 299void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
277void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); 300void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
278void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock); 301void rds_ib_destroy_nodev_conns(void);
279static inline void rds_ib_destroy_nodev_conns(void)
280{
281 __rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock);
282}
283static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev)
284{
285 __rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock);
286}
287struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); 302struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
288void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); 303void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
289void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); 304void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
@@ -292,14 +307,16 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
292void rds_ib_sync_mr(void *trans_private, int dir); 307void rds_ib_sync_mr(void *trans_private, int dir);
293void rds_ib_free_mr(void *trans_private, int invalidate); 308void rds_ib_free_mr(void *trans_private, int invalidate);
294void rds_ib_flush_mrs(void); 309void rds_ib_flush_mrs(void);
310int rds_ib_fmr_init(void);
311void rds_ib_fmr_exit(void);
295 312
296/* ib_recv.c */ 313/* ib_recv.c */
297int __init rds_ib_recv_init(void); 314int rds_ib_recv_init(void);
298void rds_ib_recv_exit(void); 315void rds_ib_recv_exit(void);
299int rds_ib_recv(struct rds_connection *conn); 316int rds_ib_recv(struct rds_connection *conn);
300int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, 317int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
301 gfp_t page_gfp, int prefill); 318void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
302void rds_ib_inc_purge(struct rds_incoming *inc); 319void rds_ib_recv_refill(struct rds_connection *conn, int prefill);
303void rds_ib_inc_free(struct rds_incoming *inc); 320void rds_ib_inc_free(struct rds_incoming *inc);
304int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, 321int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
305 size_t size); 322 size_t size);
@@ -325,17 +342,19 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
325extern wait_queue_head_t rds_ib_ring_empty_wait; 342extern wait_queue_head_t rds_ib_ring_empty_wait;
326 343
327/* ib_send.c */ 344/* ib_send.c */
345char *rds_ib_wc_status_str(enum ib_wc_status status);
328void rds_ib_xmit_complete(struct rds_connection *conn); 346void rds_ib_xmit_complete(struct rds_connection *conn);
329int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, 347int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
330 unsigned int hdr_off, unsigned int sg, unsigned int off); 348 unsigned int hdr_off, unsigned int sg, unsigned int off);
331void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); 349void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
332void rds_ib_send_init_ring(struct rds_ib_connection *ic); 350void rds_ib_send_init_ring(struct rds_ib_connection *ic);
333void rds_ib_send_clear_ring(struct rds_ib_connection *ic); 351void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
334int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); 352int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
335void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); 353void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
336void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); 354void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
337int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, 355int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
338 u32 *adv_credits, int need_posted, int max_posted); 356 u32 *adv_credits, int need_posted, int max_posted);
357int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
339 358
340/* ib_stats.c */ 359/* ib_stats.c */
341DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); 360DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
@@ -344,7 +363,7 @@ unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
344 unsigned int avail); 363 unsigned int avail);
345 364
346/* ib_sysctl.c */ 365/* ib_sysctl.c */
347int __init rds_ib_sysctl_init(void); 366int rds_ib_sysctl_init(void);
348void rds_ib_sysctl_exit(void); 367void rds_ib_sysctl_exit(void);
349extern unsigned long rds_ib_sysctl_max_send_wr; 368extern unsigned long rds_ib_sysctl_max_send_wr;
350extern unsigned long rds_ib_sysctl_max_recv_wr; 369extern unsigned long rds_ib_sysctl_max_recv_wr;
@@ -352,30 +371,5 @@ extern unsigned long rds_ib_sysctl_max_unsig_wrs;
352extern unsigned long rds_ib_sysctl_max_unsig_bytes; 371extern unsigned long rds_ib_sysctl_max_unsig_bytes;
353extern unsigned long rds_ib_sysctl_max_recv_allocation; 372extern unsigned long rds_ib_sysctl_max_recv_allocation;
354extern unsigned int rds_ib_sysctl_flow_control; 373extern unsigned int rds_ib_sysctl_flow_control;
355extern ctl_table rds_ib_sysctl_table[];
356
357/*
358 * Helper functions for getting/setting the header and data SGEs in
359 * RDS packets (not RDMA)
360 *
361 * From version 3.1 onwards, header is in front of data in the sge.
362 */
363static inline struct ib_sge *
364rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
365{
366 if (ic->conn->c_version > RDS_PROTOCOL_3_0)
367 return &sge[0];
368 else
369 return &sge[1];
370}
371
372static inline struct ib_sge *
373rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
374{
375 if (ic->conn->c_version > RDS_PROTOCOL_3_0)
376 return &sge[1];
377 else
378 return &sge[0];
379}
380 374
381#endif 375#endif
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index f68832798db2..ee369d201a65 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -38,6 +38,36 @@
38#include "rds.h" 38#include "rds.h"
39#include "ib.h" 39#include "ib.h"
40 40
41static char *rds_ib_event_type_strings[] = {
42#define RDS_IB_EVENT_STRING(foo) \
43 [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo)
44 RDS_IB_EVENT_STRING(CQ_ERR),
45 RDS_IB_EVENT_STRING(QP_FATAL),
46 RDS_IB_EVENT_STRING(QP_REQ_ERR),
47 RDS_IB_EVENT_STRING(QP_ACCESS_ERR),
48 RDS_IB_EVENT_STRING(COMM_EST),
49 RDS_IB_EVENT_STRING(SQ_DRAINED),
50 RDS_IB_EVENT_STRING(PATH_MIG),
51 RDS_IB_EVENT_STRING(PATH_MIG_ERR),
52 RDS_IB_EVENT_STRING(DEVICE_FATAL),
53 RDS_IB_EVENT_STRING(PORT_ACTIVE),
54 RDS_IB_EVENT_STRING(PORT_ERR),
55 RDS_IB_EVENT_STRING(LID_CHANGE),
56 RDS_IB_EVENT_STRING(PKEY_CHANGE),
57 RDS_IB_EVENT_STRING(SM_CHANGE),
58 RDS_IB_EVENT_STRING(SRQ_ERR),
59 RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED),
60 RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED),
61 RDS_IB_EVENT_STRING(CLIENT_REREGISTER),
62#undef RDS_IB_EVENT_STRING
63};
64
65static char *rds_ib_event_str(enum ib_event_type type)
66{
67 return rds_str_array(rds_ib_event_type_strings,
68 ARRAY_SIZE(rds_ib_event_type_strings), type);
69};
70
41/* 71/*
42 * Set the selected protocol version 72 * Set the selected protocol version
43 */ 73 */
@@ -95,7 +125,6 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
95{ 125{
96 const struct rds_ib_connect_private *dp = NULL; 126 const struct rds_ib_connect_private *dp = NULL;
97 struct rds_ib_connection *ic = conn->c_transport_data; 127 struct rds_ib_connection *ic = conn->c_transport_data;
98 struct rds_ib_device *rds_ibdev;
99 struct ib_qp_attr qp_attr; 128 struct ib_qp_attr qp_attr;
100 int err; 129 int err;
101 130
@@ -111,11 +140,21 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
111 } 140 }
112 } 141 }
113 142
114 printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", 143 if (conn->c_version < RDS_PROTOCOL(3,1)) {
115 &conn->c_faddr, 144 printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
116 RDS_PROTOCOL_MAJOR(conn->c_version), 145 " no longer supported\n",
117 RDS_PROTOCOL_MINOR(conn->c_version), 146 &conn->c_faddr,
118 ic->i_flowctl ? ", flow control" : ""); 147 RDS_PROTOCOL_MAJOR(conn->c_version),
148 RDS_PROTOCOL_MINOR(conn->c_version));
149 rds_conn_destroy(conn);
150 return;
151 } else {
152 printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
153 &conn->c_faddr,
154 RDS_PROTOCOL_MAJOR(conn->c_version),
155 RDS_PROTOCOL_MINOR(conn->c_version),
156 ic->i_flowctl ? ", flow control" : "");
157 }
119 158
120 /* 159 /*
121 * Init rings and fill recv. this needs to wait until protocol negotiation 160 * Init rings and fill recv. this needs to wait until protocol negotiation
@@ -125,7 +164,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
125 rds_ib_recv_init_ring(ic); 164 rds_ib_recv_init_ring(ic);
126 /* Post receive buffers - as a side effect, this will update 165 /* Post receive buffers - as a side effect, this will update
127 * the posted credit count. */ 166 * the posted credit count. */
128 rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); 167 rds_ib_recv_refill(conn, 1);
129 168
130 /* Tune RNR behavior */ 169 /* Tune RNR behavior */
131 rds_ib_tune_rnr(ic, &qp_attr); 170 rds_ib_tune_rnr(ic, &qp_attr);
@@ -135,12 +174,11 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
135 if (err) 174 if (err)
136 printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); 175 printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
137 176
138 /* update ib_device with this local ipaddr & conn */ 177 /* update ib_device with this local ipaddr */
139 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); 178 err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
140 err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
141 if (err) 179 if (err)
142 printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err); 180 printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
143 rds_ib_add_conn(rds_ibdev, conn); 181 err);
144 182
145 /* If the peer gave us the last packet it saw, process this as if 183 /* If the peer gave us the last packet it saw, process this as if
146 * we had received a regular ACK. */ 184 * we had received a regular ACK. */
@@ -153,18 +191,23 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
153static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, 191static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
154 struct rdma_conn_param *conn_param, 192 struct rdma_conn_param *conn_param,
155 struct rds_ib_connect_private *dp, 193 struct rds_ib_connect_private *dp,
156 u32 protocol_version) 194 u32 protocol_version,
195 u32 max_responder_resources,
196 u32 max_initiator_depth)
157{ 197{
198 struct rds_ib_connection *ic = conn->c_transport_data;
199 struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
200
158 memset(conn_param, 0, sizeof(struct rdma_conn_param)); 201 memset(conn_param, 0, sizeof(struct rdma_conn_param));
159 /* XXX tune these? */ 202
160 conn_param->responder_resources = 1; 203 conn_param->responder_resources =
161 conn_param->initiator_depth = 1; 204 min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
205 conn_param->initiator_depth =
206 min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
162 conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7); 207 conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
163 conn_param->rnr_retry_count = 7; 208 conn_param->rnr_retry_count = 7;
164 209
165 if (dp) { 210 if (dp) {
166 struct rds_ib_connection *ic = conn->c_transport_data;
167
168 memset(dp, 0, sizeof(*dp)); 211 memset(dp, 0, sizeof(*dp));
169 dp->dp_saddr = conn->c_laddr; 212 dp->dp_saddr = conn->c_laddr;
170 dp->dp_daddr = conn->c_faddr; 213 dp->dp_daddr = conn->c_faddr;
@@ -189,7 +232,8 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
189 232
190static void rds_ib_cq_event_handler(struct ib_event *event, void *data) 233static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
191{ 234{
192 rdsdebug("event %u data %p\n", event->event, data); 235 rdsdebug("event %u (%s) data %p\n",
236 event->event, rds_ib_event_str(event->event), data);
193} 237}
194 238
195static void rds_ib_qp_event_handler(struct ib_event *event, void *data) 239static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
@@ -197,16 +241,18 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
197 struct rds_connection *conn = data; 241 struct rds_connection *conn = data;
198 struct rds_ib_connection *ic = conn->c_transport_data; 242 struct rds_ib_connection *ic = conn->c_transport_data;
199 243
200 rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); 244 rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
245 rds_ib_event_str(event->event));
201 246
202 switch (event->event) { 247 switch (event->event) {
203 case IB_EVENT_COMM_EST: 248 case IB_EVENT_COMM_EST:
204 rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); 249 rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
205 break; 250 break;
206 default: 251 default:
207 rdsdebug("Fatal QP Event %u " 252 rdsdebug("Fatal QP Event %u (%s) "
208 "- connection %pI4->%pI4, reconnecting\n", 253 "- connection %pI4->%pI4, reconnecting\n",
209 event->event, &conn->c_laddr, &conn->c_faddr); 254 event->event, rds_ib_event_str(event->event),
255 &conn->c_laddr, &conn->c_faddr);
210 rds_conn_drop(conn); 256 rds_conn_drop(conn);
211 break; 257 break;
212 } 258 }
@@ -224,18 +270,16 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
224 struct rds_ib_device *rds_ibdev; 270 struct rds_ib_device *rds_ibdev;
225 int ret; 271 int ret;
226 272
227 /* rds_ib_add_one creates a rds_ib_device object per IB device, 273 /*
228 * and allocates a protection domain, memory range and FMR pool 274 * It's normal to see a null device if an incoming connection races
229 * for each. If that fails for any reason, it will not register 275 * with device removal, so we don't print a warning.
230 * the rds_ibdev at all.
231 */ 276 */
232 rds_ibdev = ib_get_client_data(dev, &rds_ib_client); 277 rds_ibdev = rds_ib_get_client_data(dev);
233 if (rds_ibdev == NULL) { 278 if (!rds_ibdev)
234 if (printk_ratelimit())
235 printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n",
236 dev->name);
237 return -EOPNOTSUPP; 279 return -EOPNOTSUPP;
238 } 280
281 /* add the conn now so that connection establishment has the dev */
282 rds_ib_add_conn(rds_ibdev, conn);
239 283
240 if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) 284 if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
241 rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); 285 rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
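
rds_ib_get_client_data() and the matching rds_ib_dev_put() calls are introduced in ib.c in the same series and are not visible in this file. The expected shape is a refcount-taking wrapper around ib_get_client_data(); the sketch below is an assumption about that helper pair, and the free_work member is hypothetical:

/* Sketch only; the real helpers live in net/rds/ib.c. */
struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
{
	struct rds_ib_device *rds_ibdev;

	rds_ibdev = ib_get_client_data(device, &rds_ib_client);
	if (rds_ibdev)
		atomic_inc(&rds_ibdev->refcount);  /* dropped by rds_ib_dev_put() */
	return rds_ibdev;
}

void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
{
	/* Last put frees the device; deferring to a workqueue (assumed
	 * detail) lets the release path sleep. */
	if (atomic_dec_and_test(&rds_ibdev->refcount))
		queue_work(rds_wq, &rds_ibdev->free_work);
}
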
@@ -306,7 +350,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
306 ic->i_send_ring.w_nr * 350 ic->i_send_ring.w_nr *
307 sizeof(struct rds_header), 351 sizeof(struct rds_header),
308 &ic->i_send_hdrs_dma, GFP_KERNEL); 352 &ic->i_send_hdrs_dma, GFP_KERNEL);
309 if (ic->i_send_hdrs == NULL) { 353 if (!ic->i_send_hdrs) {
310 ret = -ENOMEM; 354 ret = -ENOMEM;
311 rdsdebug("ib_dma_alloc_coherent send failed\n"); 355 rdsdebug("ib_dma_alloc_coherent send failed\n");
312 goto out; 356 goto out;
@@ -316,7 +360,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
316 ic->i_recv_ring.w_nr * 360 ic->i_recv_ring.w_nr *
317 sizeof(struct rds_header), 361 sizeof(struct rds_header),
318 &ic->i_recv_hdrs_dma, GFP_KERNEL); 362 &ic->i_recv_hdrs_dma, GFP_KERNEL);
319 if (ic->i_recv_hdrs == NULL) { 363 if (!ic->i_recv_hdrs) {
320 ret = -ENOMEM; 364 ret = -ENOMEM;
321 rdsdebug("ib_dma_alloc_coherent recv failed\n"); 365 rdsdebug("ib_dma_alloc_coherent recv failed\n");
322 goto out; 366 goto out;
@@ -324,22 +368,24 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
324 368
325 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), 369 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
326 &ic->i_ack_dma, GFP_KERNEL); 370 &ic->i_ack_dma, GFP_KERNEL);
327 if (ic->i_ack == NULL) { 371 if (!ic->i_ack) {
328 ret = -ENOMEM; 372 ret = -ENOMEM;
329 rdsdebug("ib_dma_alloc_coherent ack failed\n"); 373 rdsdebug("ib_dma_alloc_coherent ack failed\n");
330 goto out; 374 goto out;
331 } 375 }
332 376
333 ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); 377 ic->i_sends = vmalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work),
334 if (ic->i_sends == NULL) { 378 ibdev_to_node(dev));
379 if (!ic->i_sends) {
335 ret = -ENOMEM; 380 ret = -ENOMEM;
336 rdsdebug("send allocation failed\n"); 381 rdsdebug("send allocation failed\n");
337 goto out; 382 goto out;
338 } 383 }
339 memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); 384 memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
340 385
341 ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); 386 ic->i_recvs = vmalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work),
342 if (ic->i_recvs == NULL) { 387 ibdev_to_node(dev));
388 if (!ic->i_recvs) {
343 ret = -ENOMEM; 389 ret = -ENOMEM;
344 rdsdebug("recv allocation failed\n"); 390 rdsdebug("recv allocation failed\n");
345 goto out; 391 goto out;
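
ibdev_to_node(), used by the vmalloc_node() calls above, is not defined in this hunk. From its use it is assumed to map an IB device to the NUMA node of its DMA device, so the send and recv work rings are allocated close to the HCA:

/* Assumed definitions (net/rds/ib.h in this series). */
#define ibdev_to_node(ibdev)		dev_to_node((ibdev)->dma_device)
#define rdsibdev_to_node(rds_ibdev)	ibdev_to_node((rds_ibdev)->dev)
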
@@ -352,6 +398,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
352 ic->i_send_cq, ic->i_recv_cq); 398 ic->i_send_cq, ic->i_recv_cq);
353 399
354out: 400out:
401 rds_ib_dev_put(rds_ibdev);
355 return ret; 402 return ret;
356} 403}
357 404
@@ -409,7 +456,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
409 struct rds_ib_connection *ic = NULL; 456 struct rds_ib_connection *ic = NULL;
410 struct rdma_conn_param conn_param; 457 struct rdma_conn_param conn_param;
411 u32 version; 458 u32 version;
412 int err, destroy = 1; 459 int err = 1, destroy = 1;
413 460
414 /* Check whether the remote protocol version matches ours. */ 461 /* Check whether the remote protocol version matches ours. */
415 version = rds_ib_protocol_compatible(event); 462 version = rds_ib_protocol_compatible(event);
@@ -448,7 +495,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
448 /* Wait and see - our connect may still be succeeding */ 495 /* Wait and see - our connect may still be succeeding */
449 rds_ib_stats_inc(s_ib_connect_raced); 496 rds_ib_stats_inc(s_ib_connect_raced);
450 } 497 }
451 mutex_unlock(&conn->c_cm_lock);
452 goto out; 498 goto out;
453 } 499 }
454 500
@@ -475,24 +521,23 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
475 err = rds_ib_setup_qp(conn); 521 err = rds_ib_setup_qp(conn);
476 if (err) { 522 if (err) {
477 rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err); 523 rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
478 mutex_unlock(&conn->c_cm_lock);
479 goto out; 524 goto out;
480 } 525 }
481 526
482 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); 527 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
528 event->param.conn.responder_resources,
529 event->param.conn.initiator_depth);
483 530
484 /* rdma_accept() calls rdma_reject() internally if it fails */ 531 /* rdma_accept() calls rdma_reject() internally if it fails */
485 err = rdma_accept(cm_id, &conn_param); 532 err = rdma_accept(cm_id, &conn_param);
486 mutex_unlock(&conn->c_cm_lock); 533 if (err)
487 if (err) {
488 rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err); 534 rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
489 goto out;
490 }
491
492 return 0;
493 535
494out: 536out:
495 rdma_reject(cm_id, NULL, 0); 537 if (conn)
538 mutex_unlock(&conn->c_cm_lock);
539 if (err)
540 rdma_reject(cm_id, NULL, 0);
496 return destroy; 541 return destroy;
497} 542}
498 543
@@ -516,8 +561,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
516 goto out; 561 goto out;
517 } 562 }
518 563
519 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); 564 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
520 565 UINT_MAX, UINT_MAX);
521 ret = rdma_connect(cm_id, &conn_param); 566 ret = rdma_connect(cm_id, &conn_param);
522 if (ret) 567 if (ret)
523 rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); 568 rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
@@ -601,9 +646,19 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
601 ic->i_cm_id, err); 646 ic->i_cm_id, err);
602 } 647 }
603 648
649 /*
650 * We want to wait for tx and rx completion to finish
651 * before we tear down the connection, but we have to be
652 * careful not to get stuck waiting on a send ring that
653 * only has unsignaled sends in it. We've shutdown new
654 * sends before getting here so by waiting for signaled
655 * sends to complete we're ensured that there will be no
656 * more tx processing.
657 */
604 wait_event(rds_ib_ring_empty_wait, 658 wait_event(rds_ib_ring_empty_wait,
605 rds_ib_ring_empty(&ic->i_send_ring) && 659 rds_ib_ring_empty(&ic->i_recv_ring) &&
606 rds_ib_ring_empty(&ic->i_recv_ring)); 660 (atomic_read(&ic->i_signaled_sends) == 0));
661 tasklet_kill(&ic->i_recv_tasklet);
607 662
608 if (ic->i_send_hdrs) 663 if (ic->i_send_hdrs)
609 ib_dma_free_coherent(dev, 664 ib_dma_free_coherent(dev,
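
The i_signaled_sends counter that the shutdown path now waits on is maintained by the send side in ib_send.c, outside this file. The intended accounting is roughly the following; the helper names here are illustrative, not the actual send-path diff:

/* Illustrative sketch: signaled work requests bump the counter when
 * posted and drop it from the send completion handler, waking the
 * shutdown waiter once nothing signaled remains in flight. */
static void rds_ib_add_signaled(struct rds_ib_connection *ic, int nr_sig)
{
	atomic_add(nr_sig, &ic->i_signaled_sends);
}

static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr_sig)
{
	if (atomic_sub_return(nr_sig, &ic->i_signaled_sends) == 0)
		wake_up(&rds_ib_ring_empty_wait);
}
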
@@ -654,9 +709,12 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
654 BUG_ON(ic->rds_ibdev); 709 BUG_ON(ic->rds_ibdev);
655 710
656 /* Clear pending transmit */ 711 /* Clear pending transmit */
657 if (ic->i_rm) { 712 if (ic->i_data_op) {
658 rds_message_put(ic->i_rm); 713 struct rds_message *rm;
659 ic->i_rm = NULL; 714
715 rm = container_of(ic->i_data_op, struct rds_message, data);
716 rds_message_put(rm);
717 ic->i_data_op = NULL;
660 } 718 }
661 719
662 /* Clear the ACK state */ 720 /* Clear the ACK state */
@@ -690,12 +748,19 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
690{ 748{
691 struct rds_ib_connection *ic; 749 struct rds_ib_connection *ic;
692 unsigned long flags; 750 unsigned long flags;
751 int ret;
693 752
694 /* XXX too lazy? */ 753 /* XXX too lazy? */
695 ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL); 754 ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL);
696 if (ic == NULL) 755 if (!ic)
697 return -ENOMEM; 756 return -ENOMEM;
698 757
758 ret = rds_ib_recv_alloc_caches(ic);
759 if (ret) {
760 kfree(ic);
761 return ret;
762 }
763
699 INIT_LIST_HEAD(&ic->ib_node); 764 INIT_LIST_HEAD(&ic->ib_node);
700 tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn, 765 tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn,
701 (unsigned long) ic); 766 (unsigned long) ic);
@@ -703,6 +768,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
703#ifndef KERNEL_HAS_ATOMIC64 768#ifndef KERNEL_HAS_ATOMIC64
704 spin_lock_init(&ic->i_ack_lock); 769 spin_lock_init(&ic->i_ack_lock);
705#endif 770#endif
771 atomic_set(&ic->i_signaled_sends, 0);
706 772
707 /* 773 /*
708 * rds_ib_conn_shutdown() waits for these to be emptied so they 774 * rds_ib_conn_shutdown() waits for these to be emptied so they
@@ -744,6 +810,8 @@ void rds_ib_conn_free(void *arg)
744 list_del(&ic->ib_node); 810 list_del(&ic->ib_node);
745 spin_unlock_irq(lock_ptr); 811 spin_unlock_irq(lock_ptr);
746 812
813 rds_ib_recv_free_caches(ic);
814
747 kfree(ic); 815 kfree(ic);
748} 816}
749 817
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index a54cd63f9e35..18a833c450c8 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -32,11 +32,16 @@
32 */ 32 */
33#include <linux/kernel.h> 33#include <linux/kernel.h>
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <linux/rculist.h>
35 36
36#include "rds.h" 37#include "rds.h"
37#include "rdma.h"
38#include "ib.h" 38#include "ib.h"
39#include "xlist.h"
39 40
41static struct workqueue_struct *rds_ib_fmr_wq;
42
43static DEFINE_PER_CPU(unsigned long, clean_list_grace);
44#define CLEAN_LIST_BUSY_BIT 0
40 45
41/* 46/*
42 * This is stored as mr->r_trans_private. 47 * This is stored as mr->r_trans_private.
@@ -45,7 +50,11 @@ struct rds_ib_mr {
45 struct rds_ib_device *device; 50 struct rds_ib_device *device;
46 struct rds_ib_mr_pool *pool; 51 struct rds_ib_mr_pool *pool;
47 struct ib_fmr *fmr; 52 struct ib_fmr *fmr;
48 struct list_head list; 53
54 struct xlist_head xlist;
55
56 /* unmap_list is for freeing */
57 struct list_head unmap_list;
49 unsigned int remap_count; 58 unsigned int remap_count;
50 59
51 struct scatterlist *sg; 60 struct scatterlist *sg;
@@ -59,14 +68,16 @@ struct rds_ib_mr {
59 */ 68 */
60struct rds_ib_mr_pool { 69struct rds_ib_mr_pool {
61 struct mutex flush_lock; /* serialize fmr invalidate */ 70 struct mutex flush_lock; /* serialize fmr invalidate */
62 struct work_struct flush_worker; /* flush worker */ 71 struct delayed_work flush_worker; /* flush worker */
63 72
64 spinlock_t list_lock; /* protect variables below */
65 atomic_t item_count; /* total # of MRs */ 73 atomic_t item_count; /* total # of MRs */
66 atomic_t dirty_count; /* # dirty of MRs */ 74 atomic_t dirty_count; /* # dirty of MRs */
67 struct list_head drop_list; /* MRs that have reached their max_maps limit */ 75
68 struct list_head free_list; /* unused MRs */ 76 struct xlist_head drop_list; /* MRs that have reached their max_maps limit */
69 struct list_head clean_list; /* unused & unmapped MRs */ 77 struct xlist_head free_list; /* unused MRs */
 78 struct xlist_head clean_list; /* global unused & unmapped MRs */
79 wait_queue_head_t flush_wait;
80
70 atomic_t free_pinned; /* memory pinned by free MRs */ 81 atomic_t free_pinned; /* memory pinned by free MRs */
71 unsigned long max_items; 82 unsigned long max_items;
72 unsigned long max_items_soft; 83 unsigned long max_items_soft;
@@ -74,7 +85,7 @@ struct rds_ib_mr_pool {
74 struct ib_fmr_attr fmr_attr; 85 struct ib_fmr_attr fmr_attr;
75}; 86};
76 87
77static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all); 88static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
78static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); 89static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
79static void rds_ib_mr_pool_flush_worker(struct work_struct *work); 90static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
80 91
@@ -83,16 +94,17 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
83 struct rds_ib_device *rds_ibdev; 94 struct rds_ib_device *rds_ibdev;
84 struct rds_ib_ipaddr *i_ipaddr; 95 struct rds_ib_ipaddr *i_ipaddr;
85 96
86 list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { 97 rcu_read_lock();
87 spin_lock_irq(&rds_ibdev->spinlock); 98 list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
88 list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) { 99 list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
89 if (i_ipaddr->ipaddr == ipaddr) { 100 if (i_ipaddr->ipaddr == ipaddr) {
90 spin_unlock_irq(&rds_ibdev->spinlock); 101 atomic_inc(&rds_ibdev->refcount);
102 rcu_read_unlock();
91 return rds_ibdev; 103 return rds_ibdev;
92 } 104 }
93 } 105 }
94 spin_unlock_irq(&rds_ibdev->spinlock);
95 } 106 }
107 rcu_read_unlock();
96 108
97 return NULL; 109 return NULL;
98} 110}
@@ -108,7 +120,7 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
108 i_ipaddr->ipaddr = ipaddr; 120 i_ipaddr->ipaddr = ipaddr;
109 121
110 spin_lock_irq(&rds_ibdev->spinlock); 122 spin_lock_irq(&rds_ibdev->spinlock);
111 list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list); 123 list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
112 spin_unlock_irq(&rds_ibdev->spinlock); 124 spin_unlock_irq(&rds_ibdev->spinlock);
113 125
114 return 0; 126 return 0;
@@ -116,17 +128,24 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
116 128
117static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) 129static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
118{ 130{
119 struct rds_ib_ipaddr *i_ipaddr, *next; 131 struct rds_ib_ipaddr *i_ipaddr;
132 struct rds_ib_ipaddr *to_free = NULL;
133
120 134
121 spin_lock_irq(&rds_ibdev->spinlock); 135 spin_lock_irq(&rds_ibdev->spinlock);
122 list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) { 136 list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
123 if (i_ipaddr->ipaddr == ipaddr) { 137 if (i_ipaddr->ipaddr == ipaddr) {
124 list_del(&i_ipaddr->list); 138 list_del_rcu(&i_ipaddr->list);
125 kfree(i_ipaddr); 139 to_free = i_ipaddr;
126 break; 140 break;
127 } 141 }
128 } 142 }
129 spin_unlock_irq(&rds_ibdev->spinlock); 143 spin_unlock_irq(&rds_ibdev->spinlock);
144
145 if (to_free) {
146 synchronize_rcu();
147 kfree(to_free);
148 }
130} 149}
131 150
132int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) 151int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
@@ -134,8 +153,10 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
134 struct rds_ib_device *rds_ibdev_old; 153 struct rds_ib_device *rds_ibdev_old;
135 154
136 rds_ibdev_old = rds_ib_get_device(ipaddr); 155 rds_ibdev_old = rds_ib_get_device(ipaddr);
137 if (rds_ibdev_old) 156 if (rds_ibdev_old) {
138 rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); 157 rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
158 rds_ib_dev_put(rds_ibdev_old);
159 }
139 160
140 return rds_ib_add_ipaddr(rds_ibdev, ipaddr); 161 return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
141} 162}
@@ -150,12 +171,13 @@ void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *con
150 BUG_ON(list_empty(&ic->ib_node)); 171 BUG_ON(list_empty(&ic->ib_node));
151 list_del(&ic->ib_node); 172 list_del(&ic->ib_node);
152 173
153 spin_lock_irq(&rds_ibdev->spinlock); 174 spin_lock(&rds_ibdev->spinlock);
154 list_add_tail(&ic->ib_node, &rds_ibdev->conn_list); 175 list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
155 spin_unlock_irq(&rds_ibdev->spinlock); 176 spin_unlock(&rds_ibdev->spinlock);
156 spin_unlock_irq(&ib_nodev_conns_lock); 177 spin_unlock_irq(&ib_nodev_conns_lock);
157 178
158 ic->rds_ibdev = rds_ibdev; 179 ic->rds_ibdev = rds_ibdev;
180 atomic_inc(&rds_ibdev->refcount);
159} 181}
160 182
161void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) 183void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
@@ -175,18 +197,18 @@ void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *
175 spin_unlock(&ib_nodev_conns_lock); 197 spin_unlock(&ib_nodev_conns_lock);
176 198
177 ic->rds_ibdev = NULL; 199 ic->rds_ibdev = NULL;
200 rds_ib_dev_put(rds_ibdev);
178} 201}
179 202
180void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock) 203void rds_ib_destroy_nodev_conns(void)
181{ 204{
182 struct rds_ib_connection *ic, *_ic; 205 struct rds_ib_connection *ic, *_ic;
183 LIST_HEAD(tmp_list); 206 LIST_HEAD(tmp_list);
184 207
185 /* avoid calling conn_destroy with irqs off */ 208 /* avoid calling conn_destroy with irqs off */
186 spin_lock_irq(list_lock); 209 spin_lock_irq(&ib_nodev_conns_lock);
187 list_splice(list, &tmp_list); 210 list_splice(&ib_nodev_conns, &tmp_list);
188 INIT_LIST_HEAD(list); 211 spin_unlock_irq(&ib_nodev_conns_lock);
189 spin_unlock_irq(list_lock);
190 212
191 list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) 213 list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node)
192 rds_conn_destroy(ic->conn); 214 rds_conn_destroy(ic->conn);
@@ -200,12 +222,12 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
200 if (!pool) 222 if (!pool)
201 return ERR_PTR(-ENOMEM); 223 return ERR_PTR(-ENOMEM);
202 224
203 INIT_LIST_HEAD(&pool->free_list); 225 INIT_XLIST_HEAD(&pool->free_list);
204 INIT_LIST_HEAD(&pool->drop_list); 226 INIT_XLIST_HEAD(&pool->drop_list);
205 INIT_LIST_HEAD(&pool->clean_list); 227 INIT_XLIST_HEAD(&pool->clean_list);
206 mutex_init(&pool->flush_lock); 228 mutex_init(&pool->flush_lock);
207 spin_lock_init(&pool->list_lock); 229 init_waitqueue_head(&pool->flush_wait);
208 INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); 230 INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
209 231
210 pool->fmr_attr.max_pages = fmr_message_size; 232 pool->fmr_attr.max_pages = fmr_message_size;
211 pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; 233 pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
@@ -233,34 +255,60 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
233 255
234void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) 256void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
235{ 257{
236 flush_workqueue(rds_wq); 258 cancel_delayed_work_sync(&pool->flush_worker);
237 rds_ib_flush_mr_pool(pool, 1); 259 rds_ib_flush_mr_pool(pool, 1, NULL);
238 WARN_ON(atomic_read(&pool->item_count)); 260 WARN_ON(atomic_read(&pool->item_count));
239 WARN_ON(atomic_read(&pool->free_pinned)); 261 WARN_ON(atomic_read(&pool->free_pinned));
240 kfree(pool); 262 kfree(pool);
241} 263}
242 264
265static void refill_local(struct rds_ib_mr_pool *pool, struct xlist_head *xl,
266 struct rds_ib_mr **ibmr_ret)
267{
268 struct xlist_head *ibmr_xl;
269 ibmr_xl = xlist_del_head_fast(xl);
270 *ibmr_ret = list_entry(ibmr_xl, struct rds_ib_mr, xlist);
271}
272
243static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) 273static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
244{ 274{
245 struct rds_ib_mr *ibmr = NULL; 275 struct rds_ib_mr *ibmr = NULL;
246 unsigned long flags; 276 struct xlist_head *ret;
277 unsigned long *flag;
247 278
248 spin_lock_irqsave(&pool->list_lock, flags); 279 preempt_disable();
249 if (!list_empty(&pool->clean_list)) { 280 flag = &__get_cpu_var(clean_list_grace);
250 ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list); 281 set_bit(CLEAN_LIST_BUSY_BIT, flag);
251 list_del_init(&ibmr->list); 282 ret = xlist_del_head(&pool->clean_list);
252 } 283 if (ret)
253 spin_unlock_irqrestore(&pool->list_lock, flags); 284 ibmr = list_entry(ret, struct rds_ib_mr, xlist);
254 285
286 clear_bit(CLEAN_LIST_BUSY_BIT, flag);
287 preempt_enable();
255 return ibmr; 288 return ibmr;
256} 289}
257 290
291static inline void wait_clean_list_grace(void)
292{
293 int cpu;
294 unsigned long *flag;
295
296 for_each_online_cpu(cpu) {
297 flag = &per_cpu(clean_list_grace, cpu);
298 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
299 cpu_relax();
300 }
301}
302
258static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) 303static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
259{ 304{
260 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; 305 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
261 struct rds_ib_mr *ibmr = NULL; 306 struct rds_ib_mr *ibmr = NULL;
262 int err = 0, iter = 0; 307 int err = 0, iter = 0;
263 308
309 if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
310 queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
311
264 while (1) { 312 while (1) {
265 ibmr = rds_ib_reuse_fmr(pool); 313 ibmr = rds_ib_reuse_fmr(pool);
266 if (ibmr) 314 if (ibmr)
@@ -287,19 +335,24 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
287 335
288 /* We do have some empty MRs. Flush them out. */ 336 /* We do have some empty MRs. Flush them out. */
289 rds_ib_stats_inc(s_ib_rdma_mr_pool_wait); 337 rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
290 rds_ib_flush_mr_pool(pool, 0); 338 rds_ib_flush_mr_pool(pool, 0, &ibmr);
339 if (ibmr)
340 return ibmr;
291 } 341 }
292 342
293 ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); 343 ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
294 if (!ibmr) { 344 if (!ibmr) {
295 err = -ENOMEM; 345 err = -ENOMEM;
296 goto out_no_cigar; 346 goto out_no_cigar;
297 } 347 }
298 348
349 memset(ibmr, 0, sizeof(*ibmr));
350
299 ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, 351 ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
300 (IB_ACCESS_LOCAL_WRITE | 352 (IB_ACCESS_LOCAL_WRITE |
301 IB_ACCESS_REMOTE_READ | 353 IB_ACCESS_REMOTE_READ |
302 IB_ACCESS_REMOTE_WRITE), 354 IB_ACCESS_REMOTE_WRITE|
355 IB_ACCESS_REMOTE_ATOMIC),
303 &pool->fmr_attr); 356 &pool->fmr_attr);
304 if (IS_ERR(ibmr->fmr)) { 357 if (IS_ERR(ibmr->fmr)) {
305 err = PTR_ERR(ibmr->fmr); 358 err = PTR_ERR(ibmr->fmr);
@@ -367,7 +420,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
367 if (page_cnt > fmr_message_size) 420 if (page_cnt > fmr_message_size)
368 return -EINVAL; 421 return -EINVAL;
369 422
370 dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC); 423 dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
424 rdsibdev_to_node(rds_ibdev));
371 if (!dma_pages) 425 if (!dma_pages)
372 return -ENOMEM; 426 return -ENOMEM;
373 427
@@ -441,7 +495,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
441 495
442 /* FIXME we need a way to tell a r/w MR 496 /* FIXME we need a way to tell a r/w MR
443 * from a r/o MR */ 497 * from a r/o MR */
444 BUG_ON(in_interrupt()); 498 BUG_ON(irqs_disabled());
445 set_page_dirty(page); 499 set_page_dirty(page);
446 put_page(page); 500 put_page(page);
447 } 501 }
@@ -477,33 +531,109 @@ static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int fr
477} 531}
478 532
479/* 533/*
534 * given an xlist of mrs, put them all into the list_head for more processing
535 */
536static void xlist_append_to_list(struct xlist_head *xlist, struct list_head *list)
537{
538 struct rds_ib_mr *ibmr;
539 struct xlist_head splice;
540 struct xlist_head *cur;
541 struct xlist_head *next;
542
543 splice.next = NULL;
544 xlist_splice(xlist, &splice);
545 cur = splice.next;
546 while (cur) {
547 next = cur->next;
548 ibmr = list_entry(cur, struct rds_ib_mr, xlist);
549 list_add_tail(&ibmr->unmap_list, list);
550 cur = next;
551 }
552}
553
554/*
555 * this takes a list head of mrs and turns it into an xlist of clusters.
556 * each cluster has an xlist of MR_CLUSTER_SIZE mrs that are ready for
557 * reuse.
558 */
559static void list_append_to_xlist(struct rds_ib_mr_pool *pool,
560 struct list_head *list, struct xlist_head *xlist,
561 struct xlist_head **tail_ret)
562{
563 struct rds_ib_mr *ibmr;
564 struct xlist_head *cur_mr = xlist;
565 struct xlist_head *tail_mr = NULL;
566
567 list_for_each_entry(ibmr, list, unmap_list) {
568 tail_mr = &ibmr->xlist;
569 tail_mr->next = NULL;
570 cur_mr->next = tail_mr;
571 cur_mr = tail_mr;
572 }
573 *tail_ret = tail_mr;
574}
575
576/*
480 * Flush our pool of MRs. 577 * Flush our pool of MRs.
481 * At a minimum, all currently unused MRs are unmapped. 578 * At a minimum, all currently unused MRs are unmapped.
482 * If the number of MRs allocated exceeds the limit, we also try 579 * If the number of MRs allocated exceeds the limit, we also try
483 * to free as many MRs as needed to get back to this limit. 580 * to free as many MRs as needed to get back to this limit.
484 */ 581 */
485static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) 582static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
583 int free_all, struct rds_ib_mr **ibmr_ret)
486{ 584{
487 struct rds_ib_mr *ibmr, *next; 585 struct rds_ib_mr *ibmr, *next;
586 struct xlist_head clean_xlist;
587 struct xlist_head *clean_tail;
488 LIST_HEAD(unmap_list); 588 LIST_HEAD(unmap_list);
489 LIST_HEAD(fmr_list); 589 LIST_HEAD(fmr_list);
490 unsigned long unpinned = 0; 590 unsigned long unpinned = 0;
491 unsigned long flags;
492 unsigned int nfreed = 0, ncleaned = 0, free_goal; 591 unsigned int nfreed = 0, ncleaned = 0, free_goal;
493 int ret = 0; 592 int ret = 0;
494 593
495 rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); 594 rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
496 595
497 mutex_lock(&pool->flush_lock); 596 if (ibmr_ret) {
597 DEFINE_WAIT(wait);
598 while (!mutex_trylock(&pool->flush_lock)) {
599 ibmr = rds_ib_reuse_fmr(pool);
600 if (ibmr) {
601 *ibmr_ret = ibmr;
602 finish_wait(&pool->flush_wait, &wait);
603 goto out_nolock;
604 }
605
606 prepare_to_wait(&pool->flush_wait, &wait,
607 TASK_UNINTERRUPTIBLE);
608 if (xlist_empty(&pool->clean_list))
609 schedule();
610
611 ibmr = rds_ib_reuse_fmr(pool);
612 if (ibmr) {
613 *ibmr_ret = ibmr;
614 finish_wait(&pool->flush_wait, &wait);
615 goto out_nolock;
616 }
617 }
618 finish_wait(&pool->flush_wait, &wait);
619 } else
620 mutex_lock(&pool->flush_lock);
621
622 if (ibmr_ret) {
623 ibmr = rds_ib_reuse_fmr(pool);
624 if (ibmr) {
625 *ibmr_ret = ibmr;
626 goto out;
627 }
628 }
498 629
499 spin_lock_irqsave(&pool->list_lock, flags);
500 /* Get the list of all MRs to be dropped. Ordering matters - 630 /* Get the list of all MRs to be dropped. Ordering matters -
501 * we want to put drop_list ahead of free_list. */ 631 * we want to put drop_list ahead of free_list.
502 list_splice_init(&pool->free_list, &unmap_list); 632 */
503 list_splice_init(&pool->drop_list, &unmap_list); 633 xlist_append_to_list(&pool->drop_list, &unmap_list);
634 xlist_append_to_list(&pool->free_list, &unmap_list);
504 if (free_all) 635 if (free_all)
505 list_splice_init(&pool->clean_list, &unmap_list); 636 xlist_append_to_list(&pool->clean_list, &unmap_list);
506 spin_unlock_irqrestore(&pool->list_lock, flags);
507 637
508 free_goal = rds_ib_flush_goal(pool, free_all); 638 free_goal = rds_ib_flush_goal(pool, free_all);
509 639
@@ -511,19 +641,20 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
511 goto out; 641 goto out;
512 642
513 /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ 643 /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
514 list_for_each_entry(ibmr, &unmap_list, list) 644 list_for_each_entry(ibmr, &unmap_list, unmap_list)
515 list_add(&ibmr->fmr->list, &fmr_list); 645 list_add(&ibmr->fmr->list, &fmr_list);
646
516 ret = ib_unmap_fmr(&fmr_list); 647 ret = ib_unmap_fmr(&fmr_list);
517 if (ret) 648 if (ret)
518 printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret); 649 printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
519 650
520 /* Now we can destroy the DMA mapping and unpin any pages */ 651 /* Now we can destroy the DMA mapping and unpin any pages */
521 list_for_each_entry_safe(ibmr, next, &unmap_list, list) { 652 list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
522 unpinned += ibmr->sg_len; 653 unpinned += ibmr->sg_len;
523 __rds_ib_teardown_mr(ibmr); 654 __rds_ib_teardown_mr(ibmr);
524 if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) { 655 if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
525 rds_ib_stats_inc(s_ib_rdma_mr_free); 656 rds_ib_stats_inc(s_ib_rdma_mr_free);
526 list_del(&ibmr->list); 657 list_del(&ibmr->unmap_list);
527 ib_dealloc_fmr(ibmr->fmr); 658 ib_dealloc_fmr(ibmr->fmr);
528 kfree(ibmr); 659 kfree(ibmr);
529 nfreed++; 660 nfreed++;
@@ -531,9 +662,27 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
531 ncleaned++; 662 ncleaned++;
532 } 663 }
533 664
534 spin_lock_irqsave(&pool->list_lock, flags); 665 if (!list_empty(&unmap_list)) {
535 list_splice(&unmap_list, &pool->clean_list); 666 /* we have to make sure that none of the things we're about
536 spin_unlock_irqrestore(&pool->list_lock, flags); 667 * to put on the clean list would race with other cpus trying
668 * to pull items off. The xlist would explode if we managed to
669 * remove something from the clean list and then add it back again
670 * while another CPU was spinning on that same item in xlist_del_head.
671 *
672 * This is pretty unlikely, but just in case wait for an xlist grace period
673 * here before adding anything back into the clean list.
674 */
675 wait_clean_list_grace();
676
677 list_append_to_xlist(pool, &unmap_list, &clean_xlist, &clean_tail);
678 if (ibmr_ret)
679 refill_local(pool, &clean_xlist, ibmr_ret);
680
681 /* refill_local may have emptied our list */
682 if (!xlist_empty(&clean_xlist))
683 xlist_add(clean_xlist.next, clean_tail, &pool->clean_list);
684
685 }
537 686
538 atomic_sub(unpinned, &pool->free_pinned); 687 atomic_sub(unpinned, &pool->free_pinned);
539 atomic_sub(ncleaned, &pool->dirty_count); 688 atomic_sub(ncleaned, &pool->dirty_count);
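
The CLEAN_LIST_BUSY_BIT handling is in effect a tiny hand-rolled grace period: a reader flags its CPU while it pops from clean_list, and the flusher waits for every flag to clear before recycling entries back onto the list. Condensed into a pair of helpers (hypothetical names, same logic as the code above):

/* Reader side: pop one entry while advertising that this CPU may be
 * spinning inside xlist_del_head(). */
static struct xlist_head *clean_list_pop(struct rds_ib_mr_pool *pool)
{
	struct xlist_head *entry;
	unsigned long *flag;

	preempt_disable();
	flag = &__get_cpu_var(clean_list_grace);
	set_bit(CLEAN_LIST_BUSY_BIT, flag);
	entry = xlist_del_head(&pool->clean_list);
	clear_bit(CLEAN_LIST_BUSY_BIT, flag);
	preempt_enable();
	return entry;
}

/* Writer side: only reinsert recycled MRs once no CPU is inside the
 * reader above, so a spinning pop can never chase a reused node. */
static void clean_list_push(struct rds_ib_mr_pool *pool,
			    struct xlist_head *first, struct xlist_head *last)
{
	wait_clean_list_grace();
	xlist_add(first, last, &pool->clean_list);
}
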
@@ -541,14 +690,35 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
541 690
542out: 691out:
543 mutex_unlock(&pool->flush_lock); 692 mutex_unlock(&pool->flush_lock);
693 if (waitqueue_active(&pool->flush_wait))
694 wake_up(&pool->flush_wait);
695out_nolock:
544 return ret; 696 return ret;
545} 697}
546 698
699int rds_ib_fmr_init(void)
700{
701 rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd");
702 if (!rds_ib_fmr_wq)
703 return -ENOMEM;
704 return 0;
705}
706
707/*
708 * By the time this is called all the IB devices should have been torn down and
709 * had their pools freed. As each pool is freed its work struct is waited on,
710 * so the pool flushing work queue should be idle by the time we get here.
711 */
712void rds_ib_fmr_exit(void)
713{
714 destroy_workqueue(rds_ib_fmr_wq);
715}
716
547static void rds_ib_mr_pool_flush_worker(struct work_struct *work) 717static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
548{ 718{
549 struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker); 719 struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);
550 720
551 rds_ib_flush_mr_pool(pool, 0); 721 rds_ib_flush_mr_pool(pool, 0, NULL);
552} 722}
553 723
554void rds_ib_free_mr(void *trans_private, int invalidate) 724void rds_ib_free_mr(void *trans_private, int invalidate)
@@ -556,47 +726,49 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
556 struct rds_ib_mr *ibmr = trans_private; 726 struct rds_ib_mr *ibmr = trans_private;
557 struct rds_ib_device *rds_ibdev = ibmr->device; 727 struct rds_ib_device *rds_ibdev = ibmr->device;
558 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; 728 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
559 unsigned long flags;
560 729
561 rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); 730 rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
562 731
563 /* Return it to the pool's free list */ 732 /* Return it to the pool's free list */
564 spin_lock_irqsave(&pool->list_lock, flags);
565 if (ibmr->remap_count >= pool->fmr_attr.max_maps) 733 if (ibmr->remap_count >= pool->fmr_attr.max_maps)
566 list_add(&ibmr->list, &pool->drop_list); 734 xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list);
567 else 735 else
568 list_add(&ibmr->list, &pool->free_list); 736 xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->free_list);
569 737
570 atomic_add(ibmr->sg_len, &pool->free_pinned); 738 atomic_add(ibmr->sg_len, &pool->free_pinned);
571 atomic_inc(&pool->dirty_count); 739 atomic_inc(&pool->dirty_count);
572 spin_unlock_irqrestore(&pool->list_lock, flags);
573 740
574 /* If we've pinned too many pages, request a flush */ 741 /* If we've pinned too many pages, request a flush */
575 if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || 742 if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
576 atomic_read(&pool->dirty_count) >= pool->max_items / 10) 743 atomic_read(&pool->dirty_count) >= pool->max_items / 10)
577 queue_work(rds_wq, &pool->flush_worker); 744 queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
578 745
579 if (invalidate) { 746 if (invalidate) {
580 if (likely(!in_interrupt())) { 747 if (likely(!in_interrupt())) {
581 rds_ib_flush_mr_pool(pool, 0); 748 rds_ib_flush_mr_pool(pool, 0, NULL);
582 } else { 749 } else {
583 /* We get here if the user created a MR marked 750 /* We get here if the user created a MR marked
584 * as use_once and invalidate at the same time. */ 751 * as use_once and invalidate at the same time. */
585 queue_work(rds_wq, &pool->flush_worker); 752 queue_delayed_work(rds_ib_fmr_wq,
753 &pool->flush_worker, 10);
586 } 754 }
587 } 755 }
756
757 rds_ib_dev_put(rds_ibdev);
588} 758}
589 759
590void rds_ib_flush_mrs(void) 760void rds_ib_flush_mrs(void)
591{ 761{
592 struct rds_ib_device *rds_ibdev; 762 struct rds_ib_device *rds_ibdev;
593 763
764 down_read(&rds_ib_devices_lock);
594 list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { 765 list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
595 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; 766 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
596 767
597 if (pool) 768 if (pool)
598 rds_ib_flush_mr_pool(pool, 0); 769 rds_ib_flush_mr_pool(pool, 0, NULL);
599 } 770 }
771 up_read(&rds_ib_devices_lock);
600} 772}
601 773
602void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, 774void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
@@ -628,6 +800,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
628 printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret); 800 printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
629 801
630 ibmr->device = rds_ibdev; 802 ibmr->device = rds_ibdev;
803 rds_ibdev = NULL;
631 804
632 out: 805 out:
633 if (ret) { 806 if (ret) {
@@ -635,5 +808,8 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
635 rds_ib_free_mr(ibmr, 0); 808 rds_ib_free_mr(ibmr, 0);
636 ibmr = ERR_PTR(ret); 809 ibmr = ERR_PTR(ret);
637 } 810 }
811 if (rds_ibdev)
812 rds_ib_dev_put(rds_ibdev);
638 return ibmr; 813 return ibmr;
639} 814}
815
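
net/rds/xlist.h is new in this series and is not part of this diff. From its use in ib_rdma.c (xlist_add() with an explicit tail, xlist_del_head(), xlist_empty(), plus the splice and fast-pop variants above), it is assumed to be a minimal cmpxchg-based singly linked push/pop list along these lines; the grace-period trick earlier exists precisely because the pop loop can briefly spin on a node:

/* Assumed shape of the xlist primitives (sketch of net/rds/xlist.h). */
struct xlist_head {
	struct xlist_head *next;
};

static inline void INIT_XLIST_HEAD(struct xlist_head *list)
{
	list->next = NULL;
}

static inline int xlist_empty(struct xlist_head *head)
{
	return head->next == NULL;
}

/* Push the chain [new .. tail] onto the front of the list. */
static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail,
			     struct xlist_head *head)
{
	struct xlist_head *cur, *check;

	while (1) {
		cur = head->next;
		tail->next = cur;
		check = cmpxchg(&head->next, cur, new);
		if (check == cur)
			break;
	}
}

/* Pop the first entry, or return NULL if the list is empty. */
static inline struct xlist_head *xlist_del_head(struct xlist_head *head)
{
	struct xlist_head *cur, *check, *next;

	while (1) {
		cur = head->next;
		if (!cur)
			return NULL;
		next = cur->next;
		check = cmpxchg(&head->next, cur, next);
		if (check == cur)
			return cur;
	}
}
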
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index c74e9904a6b2..e29e0ca32f74 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -43,42 +43,6 @@ static struct kmem_cache *rds_ib_incoming_slab;
43static struct kmem_cache *rds_ib_frag_slab; 43static struct kmem_cache *rds_ib_frag_slab;
44static atomic_t rds_ib_allocation = ATOMIC_INIT(0); 44static atomic_t rds_ib_allocation = ATOMIC_INIT(0);
45 45
46static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
47{
48 rdsdebug("frag %p page %p\n", frag, frag->f_page);
49 __free_page(frag->f_page);
50 frag->f_page = NULL;
51}
52
53static void rds_ib_frag_free(struct rds_page_frag *frag)
54{
55 rdsdebug("frag %p page %p\n", frag, frag->f_page);
56 BUG_ON(frag->f_page != NULL);
57 kmem_cache_free(rds_ib_frag_slab, frag);
58}
59
60/*
61 * We map a page at a time. Its fragments are posted in order. This
62 * is called in fragment order as the fragments get send completion events.
63 * Only the last frag in the page performs the unmapping.
64 *
65 * It's OK for ring cleanup to call this in whatever order it likes because
66 * DMA is not in flight and so we can unmap while other ring entries still
67 * hold page references in their frags.
68 */
69static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
70 struct rds_ib_recv_work *recv)
71{
72 struct rds_page_frag *frag = recv->r_frag;
73
74 rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
75 if (frag->f_mapped)
76 ib_dma_unmap_page(ic->i_cm_id->device,
77 frag->f_mapped,
78 RDS_FRAG_SIZE, DMA_FROM_DEVICE);
79 frag->f_mapped = 0;
80}
81
82void rds_ib_recv_init_ring(struct rds_ib_connection *ic) 46void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
83{ 47{
84 struct rds_ib_recv_work *recv; 48 struct rds_ib_recv_work *recv;
@@ -95,16 +59,161 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
95 recv->r_wr.sg_list = recv->r_sge; 59 recv->r_wr.sg_list = recv->r_sge;
96 recv->r_wr.num_sge = RDS_IB_RECV_SGE; 60 recv->r_wr.num_sge = RDS_IB_RECV_SGE;
97 61
98 sge = rds_ib_data_sge(ic, recv->r_sge); 62 sge = &recv->r_sge[0];
63 sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
64 sge->length = sizeof(struct rds_header);
65 sge->lkey = ic->i_mr->lkey;
66
67 sge = &recv->r_sge[1];
99 sge->addr = 0; 68 sge->addr = 0;
100 sge->length = RDS_FRAG_SIZE; 69 sge->length = RDS_FRAG_SIZE;
101 sge->lkey = ic->i_mr->lkey; 70 sge->lkey = ic->i_mr->lkey;
71 }
72}
102 73
103 sge = rds_ib_header_sge(ic, recv->r_sge); 74/*
104 sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); 75 * The entire 'from' list, including the from element itself, is put on
105 sge->length = sizeof(struct rds_header); 76 * to the tail of the 'to' list.
106 sge->lkey = ic->i_mr->lkey; 77 */
78static void list_splice_entire_tail(struct list_head *from,
79 struct list_head *to)
80{
81 struct list_head *from_last = from->prev;
82
83 list_splice_tail(from_last, to);
84 list_add_tail(from_last, to);
85}
86
87static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
88{
89 struct list_head *tmp;
90
91 tmp = xchg(&cache->xfer, NULL);
92 if (tmp) {
93 if (cache->ready)
94 list_splice_entire_tail(tmp, cache->ready);
95 else
96 cache->ready = tmp;
97 }
98}
99
100static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
101{
102 struct rds_ib_cache_head *head;
103 int cpu;
104
105 cache->percpu = alloc_percpu(struct rds_ib_cache_head);
106 if (!cache->percpu)
107 return -ENOMEM;
108
109 for_each_possible_cpu(cpu) {
110 head = per_cpu_ptr(cache->percpu, cpu);
111 head->first = NULL;
112 head->count = 0;
113 }
114 cache->xfer = NULL;
115 cache->ready = NULL;
116
117 return 0;
118}
119
120int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic)
121{
122 int ret;
123
124 ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs);
125 if (!ret) {
126 ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags);
127 if (ret)
128 free_percpu(ic->i_cache_incs.percpu);
107 } 129 }
130
131 return ret;
132}
133
134static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
135 struct list_head *caller_list)
136{
137 struct rds_ib_cache_head *head;
138 int cpu;
139
140 for_each_possible_cpu(cpu) {
141 head = per_cpu_ptr(cache->percpu, cpu);
142 if (head->first) {
143 list_splice_entire_tail(head->first, caller_list);
144 head->first = NULL;
145 }
146 }
147
148 if (cache->ready) {
149 list_splice_entire_tail(cache->ready, caller_list);
150 cache->ready = NULL;
151 }
152}
153
154void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
155{
156 struct rds_ib_incoming *inc;
157 struct rds_ib_incoming *inc_tmp;
158 struct rds_page_frag *frag;
159 struct rds_page_frag *frag_tmp;
160 LIST_HEAD(list);
161
162 rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
163 rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
164 free_percpu(ic->i_cache_incs.percpu);
165
166 list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
167 list_del(&inc->ii_cache_entry);
168 WARN_ON(!list_empty(&inc->ii_frags));
169 kmem_cache_free(rds_ib_incoming_slab, inc);
170 }
171
172 rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
173 rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
174 free_percpu(ic->i_cache_frags.percpu);
175
176 list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
177 list_del(&frag->f_cache_entry);
178 WARN_ON(!list_empty(&frag->f_item));
179 kmem_cache_free(rds_ib_frag_slab, frag);
180 }
181}
182
183/* fwd decl */
184static void rds_ib_recv_cache_put(struct list_head *new_item,
185 struct rds_ib_refill_cache *cache);
186static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);
187
188
189/* Recycle frag and attached recv buffer f_sg */
190static void rds_ib_frag_free(struct rds_ib_connection *ic,
191 struct rds_page_frag *frag)
192{
193 rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
194
195 rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
196}
197
198/* Recycle inc after freeing attached frags */
199void rds_ib_inc_free(struct rds_incoming *inc)
200{
201 struct rds_ib_incoming *ibinc;
202 struct rds_page_frag *frag;
203 struct rds_page_frag *pos;
204 struct rds_ib_connection *ic = inc->i_conn->c_transport_data;
205
206 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
207
208 /* Free attached frags */
209 list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
210 list_del_init(&frag->f_item);
211 rds_ib_frag_free(ic, frag);
212 }
213 BUG_ON(!list_empty(&ibinc->ii_frags));
214
215 rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
216 rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
108} 217}
109 218
110static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, 219static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
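
The i_cache_incs and i_cache_frags caches manipulated above are declared in ib.h, outside this diff. From the accessors, their layout is assumed to be a per-cpu head plus two shared single-pointer anchors, roughly as sketched here; this is also why list_splice_entire_tail() has to move the anchor element itself:

/* Assumed declarations (net/rds/ib.h in this series).  The shared lists
 * are bare 'struct list_head *' anchors rather than embedded heads:
 * NULL means empty, and a single cached element forms a one-entry
 * circular list for which list_empty() is true. */
struct rds_ib_cache_head {
	struct list_head *first;	/* this CPU's private chain */
	unsigned long count;		/* entries on 'first' */
};

struct rds_ib_refill_cache {
	struct rds_ib_cache_head __percpu *percpu;
	struct list_head	*xfer;	/* staged for transfer, lockless */
	struct list_head	*ready;	/* drained by the refill path */
};
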
@@ -115,10 +224,8 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
115 recv->r_ibinc = NULL; 224 recv->r_ibinc = NULL;
116 } 225 }
117 if (recv->r_frag) { 226 if (recv->r_frag) {
118 rds_ib_recv_unmap_page(ic, recv); 227 ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
119 if (recv->r_frag->f_page) 228 rds_ib_frag_free(ic, recv->r_frag);
120 rds_ib_frag_drop_page(recv->r_frag);
121 rds_ib_frag_free(recv->r_frag);
122 recv->r_frag = NULL; 229 recv->r_frag = NULL;
123 } 230 }
124} 231}
@@ -129,84 +236,111 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
129 236
130 for (i = 0; i < ic->i_recv_ring.w_nr; i++) 237 for (i = 0; i < ic->i_recv_ring.w_nr; i++)
131 rds_ib_recv_clear_one(ic, &ic->i_recvs[i]); 238 rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
132
133 if (ic->i_frag.f_page)
134 rds_ib_frag_drop_page(&ic->i_frag);
135} 239}
136 240
137static int rds_ib_recv_refill_one(struct rds_connection *conn, 241static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
138 struct rds_ib_recv_work *recv, 242 gfp_t slab_mask)
139 gfp_t kptr_gfp, gfp_t page_gfp)
140{ 243{
141 struct rds_ib_connection *ic = conn->c_transport_data; 244 struct rds_ib_incoming *ibinc;
142 dma_addr_t dma_addr; 245 struct list_head *cache_item;
143 struct ib_sge *sge; 246 int avail_allocs;
144 int ret = -ENOMEM;
145 247
146 if (recv->r_ibinc == NULL) { 248 cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
147 if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) { 249 if (cache_item) {
250 ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
251 } else {
252 avail_allocs = atomic_add_unless(&rds_ib_allocation,
253 1, rds_ib_sysctl_max_recv_allocation);
254 if (!avail_allocs) {
148 rds_ib_stats_inc(s_ib_rx_alloc_limit); 255 rds_ib_stats_inc(s_ib_rx_alloc_limit);
149 goto out; 256 return NULL;
150 } 257 }
151 recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab, 258 ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
152 kptr_gfp); 259 if (!ibinc) {
153 if (recv->r_ibinc == NULL) {
154 atomic_dec(&rds_ib_allocation); 260 atomic_dec(&rds_ib_allocation);
155 goto out; 261 return NULL;
156 } 262 }
157 INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
158 rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
159 } 263 }
264 INIT_LIST_HEAD(&ibinc->ii_frags);
265 rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
160 266
161 if (recv->r_frag == NULL) { 267 return ibinc;
162 recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp); 268}
163 if (recv->r_frag == NULL) 269
164 goto out; 270static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
165 INIT_LIST_HEAD(&recv->r_frag->f_item); 271 gfp_t slab_mask, gfp_t page_mask)
166 recv->r_frag->f_page = NULL; 272{
273 struct rds_page_frag *frag;
274 struct list_head *cache_item;
275 int ret;
276
277 cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
278 if (cache_item) {
279 frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
280 } else {
281 frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
282 if (!frag)
283 return NULL;
284
285 sg_init_table(&frag->f_sg, 1);
286 ret = rds_page_remainder_alloc(&frag->f_sg,
287 RDS_FRAG_SIZE, page_mask);
288 if (ret) {
289 kmem_cache_free(rds_ib_frag_slab, frag);
290 return NULL;
291 }
167 } 292 }
168 293
169 if (ic->i_frag.f_page == NULL) { 294 INIT_LIST_HEAD(&frag->f_item);
170 ic->i_frag.f_page = alloc_page(page_gfp); 295
171 if (ic->i_frag.f_page == NULL) 296 return frag;
172 goto out; 297}
173 ic->i_frag.f_offset = 0; 298
299static int rds_ib_recv_refill_one(struct rds_connection *conn,
300 struct rds_ib_recv_work *recv, int prefill)
301{
302 struct rds_ib_connection *ic = conn->c_transport_data;
303 struct ib_sge *sge;
304 int ret = -ENOMEM;
305 gfp_t slab_mask = GFP_NOWAIT;
306 gfp_t page_mask = GFP_NOWAIT;
307
308 if (prefill) {
309 slab_mask = GFP_KERNEL;
310 page_mask = GFP_HIGHUSER;
174 } 311 }
175 312
176 dma_addr = ib_dma_map_page(ic->i_cm_id->device, 313 if (!ic->i_cache_incs.ready)
177 ic->i_frag.f_page, 314 rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
178 ic->i_frag.f_offset, 315 if (!ic->i_cache_frags.ready)
179 RDS_FRAG_SIZE, 316 rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
180 DMA_FROM_DEVICE);
181 if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
182 goto out;
183 317
184 /* 318 /*
185 * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap() 319 * ibinc was taken from recv if recv contained the start of a message.
186 * must be called on this recv. This happens as completions hit 320 * recvs that were continuations will still have this allocated.
187 * in order or on connection shutdown.
188 */ 321 */
189 recv->r_frag->f_page = ic->i_frag.f_page; 322 if (!recv->r_ibinc) {
190 recv->r_frag->f_offset = ic->i_frag.f_offset; 323 recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
191 recv->r_frag->f_mapped = dma_addr; 324 if (!recv->r_ibinc)
325 goto out;
326 }
192 327
193 sge = rds_ib_data_sge(ic, recv->r_sge); 328 WARN_ON(recv->r_frag); /* leak! */
194 sge->addr = dma_addr; 329 recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
195 sge->length = RDS_FRAG_SIZE; 330 if (!recv->r_frag)
331 goto out;
332
333 ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
334 1, DMA_FROM_DEVICE);
335 WARN_ON(ret != 1);
196 336
197 sge = rds_ib_header_sge(ic, recv->r_sge); 337 sge = &recv->r_sge[0];
198 sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); 338 sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
199 sge->length = sizeof(struct rds_header); 339 sge->length = sizeof(struct rds_header);
200 340
201 get_page(recv->r_frag->f_page); 341 sge = &recv->r_sge[1];
202 342 sge->addr = sg_dma_address(&recv->r_frag->f_sg);
203 if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) { 343 sge->length = sg_dma_len(&recv->r_frag->f_sg);
204 ic->i_frag.f_offset += RDS_FRAG_SIZE;
205 } else {
206 put_page(ic->i_frag.f_page);
207 ic->i_frag.f_page = NULL;
208 ic->i_frag.f_offset = 0;
209 }
210 344
211 ret = 0; 345 ret = 0;
212out: 346out:
@@ -216,13 +350,11 @@ out:
216/* 350/*
217 * This tries to allocate and post unused work requests after making sure that 351 * This tries to allocate and post unused work requests after making sure that
218 * they have all the allocations they need to queue received fragments into 352 * they have all the allocations they need to queue received fragments into
219 * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc 353 * sockets.
220 * pairs don't go unmatched.
221 * 354 *
222 * -1 is returned if posting fails due to temporary resource exhaustion. 355 * -1 is returned if posting fails due to temporary resource exhaustion.
223 */ 356 */
224int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, 357void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
225 gfp_t page_gfp, int prefill)
226{ 358{
227 struct rds_ib_connection *ic = conn->c_transport_data; 359 struct rds_ib_connection *ic = conn->c_transport_data;
228 struct rds_ib_recv_work *recv; 360 struct rds_ib_recv_work *recv;
@@ -236,28 +368,25 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
236 if (pos >= ic->i_recv_ring.w_nr) { 368 if (pos >= ic->i_recv_ring.w_nr) {
237 printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", 369 printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
238 pos); 370 pos);
239 ret = -EINVAL;
240 break; 371 break;
241 } 372 }
242 373
243 recv = &ic->i_recvs[pos]; 374 recv = &ic->i_recvs[pos];
244 ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp); 375 ret = rds_ib_recv_refill_one(conn, recv, prefill);
245 if (ret) { 376 if (ret) {
246 ret = -1;
247 break; 377 break;
248 } 378 }
249 379
250 /* XXX when can this fail? */ 380 /* XXX when can this fail? */
251 ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); 381 ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
252 rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv, 382 rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
253 recv->r_ibinc, recv->r_frag->f_page, 383 recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
254 (long) recv->r_frag->f_mapped, ret); 384 (long) sg_dma_address(&recv->r_frag->f_sg), ret);
255 if (ret) { 385 if (ret) {
256 rds_ib_conn_error(conn, "recv post on " 386 rds_ib_conn_error(conn, "recv post on "
257 "%pI4 returned %d, disconnecting and " 387 "%pI4 returned %d, disconnecting and "
258 "reconnecting\n", &conn->c_faddr, 388 "reconnecting\n", &conn->c_faddr,
259 ret); 389 ret);
260 ret = -1;
261 break; 390 break;
262 } 391 }
263 392
@@ -270,37 +399,73 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
270 399
271 if (ret) 400 if (ret)
272 rds_ib_ring_unalloc(&ic->i_recv_ring, 1); 401 rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
273 return ret;
274} 402}
275 403
276void rds_ib_inc_purge(struct rds_incoming *inc) 404/*
405 * We want to recycle several types of recv allocations, like incs and frags.
406 * To use this, the *_free() function passes in the ptr to a list_head within
407 * the recyclee, as well as the cache to put it on.
408 *
409 * First, we put the memory on a percpu list. When this reaches a certain size,
410 * We move it to an intermediate non-percpu list in a lockless manner, with some
411 * xchg/compxchg wizardry.
412 *
413 * N.B. Instead of a list_head as the anchor, we use a single pointer, which can
414 * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and
415 * list_empty() will return true even when one element is actually present.
416 */
417static void rds_ib_recv_cache_put(struct list_head *new_item,
418 struct rds_ib_refill_cache *cache)
277{ 419{
278 struct rds_ib_incoming *ibinc; 420 unsigned long flags;
279 struct rds_page_frag *frag; 421 struct rds_ib_cache_head *chp;
280 struct rds_page_frag *pos; 422 struct list_head *old;
281 423
282 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); 424 local_irq_save(flags);
283 rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);
284 425
285 list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) { 426 chp = per_cpu_ptr(cache->percpu, smp_processor_id());
286 list_del_init(&frag->f_item); 427 if (!chp->first)
287 rds_ib_frag_drop_page(frag); 428 INIT_LIST_HEAD(new_item);
288 rds_ib_frag_free(frag); 429 else /* put on front */
289 } 430 list_add_tail(new_item, chp->first);
431 chp->first = new_item;
432 chp->count++;
433
434 if (chp->count < RDS_IB_RECYCLE_BATCH_COUNT)
435 goto end;
436
437 /*
438 * Return our per-cpu first list to the cache's xfer by atomically
439 * grabbing the current xfer list, appending it to our per-cpu list,
440 * and then atomically returning that entire list back to the
441 * cache's xfer list as long as it's still empty.
442 */
443 do {
444 old = xchg(&cache->xfer, NULL);
445 if (old)
446 list_splice_entire_tail(old, chp->first);
447 old = cmpxchg(&cache->xfer, NULL, chp->first);
448 } while (old);
449
450 chp->first = NULL;
451 chp->count = 0;
452end:
453 local_irq_restore(flags);
290} 454}
291 455
292void rds_ib_inc_free(struct rds_incoming *inc) 456static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
293{ 457{
294 struct rds_ib_incoming *ibinc; 458 struct list_head *head = cache->ready;
295 459
296 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); 460 if (head) {
461 if (!list_empty(head)) {
462 cache->ready = head->next;
463 list_del_init(head);
464 } else
465 cache->ready = NULL;
466 }
297 467
298 rds_ib_inc_purge(inc); 468 return head;
299 rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
300 BUG_ON(!list_empty(&ibinc->ii_frags));
301 kmem_cache_free(rds_ib_incoming_slab, ibinc);
302 atomic_dec(&rds_ib_allocation);
303 BUG_ON(atomic_read(&rds_ib_allocation) < 0);
304} 469}
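The xchg/cmpxchg hand-off above is easier to follow outside the kernel. Below is a minimal userspace sketch of the same idea, assuming C11 atomics and a plain singly-linked node in place of list_head, local_irq_save() and the percpu machinery: a producer drains whatever is already on the shared anchor, appends it behind its own batch, and re-installs the combined list only while the anchor is still NULL; a consumer takes the whole pending list in a single exchange (in the driver that drain happens when the ready list is refilled, not in the rds_ib_recv_cache_get() shown here).

#include <stdatomic.h>
#include <stddef.h>

struct node {
        struct node *next;
};

/* Shared transfer anchor; NULL means "nothing queued". */
static _Atomic(struct node *) xfer;

/* Producer: publish a privately built batch without taking a lock. */
static void batch_publish(struct node *head, struct node *tail)
{
        struct node *old, *expected;

        do {
                /* Grab whatever another producer already queued (may be NULL)... */
                old = atomic_exchange(&xfer, NULL);
                /* ...append it behind our batch so nothing is lost... */
                tail->next = old;
                while (tail->next)              /* ...and find the new tail. */
                        tail = tail->next;
                /* Re-install the combined list only if the anchor is still empty. */
                expected = NULL;
        } while (!atomic_compare_exchange_strong(&xfer, &expected, head));
}

/* Consumer: take everything queued so far in one shot. */
static struct node *batch_take_all(void)
{
        return atomic_exchange(&xfer, NULL);
}

The walk to the new tail is what list_splice_entire_tail() avoids in the real code by splicing circular lists, and the single-pointer anchor is why list_empty() can report an empty list even when one element is present, as the comment above notes.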
305 470
306int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, 471int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
@@ -336,13 +501,13 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
336 to_copy = min_t(unsigned long, to_copy, len - copied); 501 to_copy = min_t(unsigned long, to_copy, len - copied);
337 502
338 rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " 503 rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
339 "[%p, %lu] + %lu\n", 504 "[%p, %u] + %lu\n",
340 to_copy, iov->iov_base, iov->iov_len, iov_off, 505 to_copy, iov->iov_base, iov->iov_len, iov_off,
341 frag->f_page, frag->f_offset, frag_off); 506 sg_page(&frag->f_sg), frag->f_sg.offset, frag_off);
342 507
343 /* XXX needs + offset for multiple recvs per page */ 508 /* XXX needs + offset for multiple recvs per page */
344 ret = rds_page_copy_to_user(frag->f_page, 509 ret = rds_page_copy_to_user(sg_page(&frag->f_sg),
345 frag->f_offset + frag_off, 510 frag->f_sg.offset + frag_off,
346 iov->iov_base + iov_off, 511 iov->iov_base + iov_off,
347 to_copy); 512 to_copy);
348 if (ret) { 513 if (ret) {
@@ -557,47 +722,6 @@ u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
557 return rds_ib_get_ack(ic); 722 return rds_ib_get_ack(ic);
558} 723}
559 724
560static struct rds_header *rds_ib_get_header(struct rds_connection *conn,
561 struct rds_ib_recv_work *recv,
562 u32 data_len)
563{
564 struct rds_ib_connection *ic = conn->c_transport_data;
565 void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs];
566 void *addr;
567 u32 misplaced_hdr_bytes;
568
569 /*
570 * Support header at the front (RDS 3.1+) as well as header-at-end.
571 *
572 * Cases:
573 * 1) header all in header buff (great!)
574 * 2) header all in data page (copy all to header buff)
575 * 3) header split across hdr buf + data page
576 * (move bit in hdr buff to end before copying other bit from data page)
577 */
578 if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE)
579 return hdr_buff;
580
581 if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) {
582 addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
583 memcpy(hdr_buff,
584 addr + recv->r_frag->f_offset + data_len,
585 sizeof(struct rds_header));
586 kunmap_atomic(addr, KM_SOFTIRQ0);
587 return hdr_buff;
588 }
589
590 misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len));
591
592 memmove(hdr_buff + misplaced_hdr_bytes, hdr_buff, misplaced_hdr_bytes);
593
594 addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
595 memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len,
596 sizeof(struct rds_header) - misplaced_hdr_bytes);
597 kunmap_atomic(addr, KM_SOFTIRQ0);
598 return hdr_buff;
599}
600
601/* 725/*
602 * It's kind of lame that we're copying from the posted receive pages into 726 * It's kind of lame that we're copying from the posted receive pages into
603 * long-lived bitmaps. We could have posted the bitmaps and rdma written into 727 * long-lived bitmaps. We could have posted the bitmaps and rdma written into
@@ -639,7 +763,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
639 to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); 763 to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
640 BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ 764 BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
641 765
642 addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0); 766 addr = kmap_atomic(sg_page(&frag->f_sg), KM_SOFTIRQ0);
643 767
644 src = addr + frag_off; 768 src = addr + frag_off;
645 dst = (void *)map->m_page_addrs[map_page] + map_off; 769 dst = (void *)map->m_page_addrs[map_page] + map_off;
@@ -710,7 +834,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
710 } 834 }
711 data_len -= sizeof(struct rds_header); 835 data_len -= sizeof(struct rds_header);
712 836
713 ihdr = rds_ib_get_header(conn, recv, data_len); 837 ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
714 838
715 /* Validate the checksum. */ 839 /* Validate the checksum. */
716 if (!rds_message_verify_checksum(ihdr)) { 840 if (!rds_message_verify_checksum(ihdr)) {
@@ -742,12 +866,12 @@ static void rds_ib_process_recv(struct rds_connection *conn,
742 * the inc is freed. We don't go that route, so we have to drop the 866 * the inc is freed. We don't go that route, so we have to drop the
743 * page ref ourselves. We can't just leave the page on the recv 867 * page ref ourselves. We can't just leave the page on the recv
744 * because that confuses the dma mapping of pages and each recv's use 868 * because that confuses the dma mapping of pages and each recv's use
745 * of a partial page. We can leave the frag, though, it will be 869 * of a partial page.
746 * reused.
747 * 870 *
748 * FIXME: Fold this into the code path below. 871 * FIXME: Fold this into the code path below.
749 */ 872 */
750 rds_ib_frag_drop_page(recv->r_frag); 873 rds_ib_frag_free(ic, recv->r_frag);
874 recv->r_frag = NULL;
751 return; 875 return;
752 } 876 }
753 877
@@ -757,7 +881,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
757 * into the inc and save the inc so we can hang upcoming fragments 881 * into the inc and save the inc so we can hang upcoming fragments
758 * off its list. 882 * off its list.
759 */ 883 */
760 if (ibinc == NULL) { 884 if (!ibinc) {
761 ibinc = recv->r_ibinc; 885 ibinc = recv->r_ibinc;
762 recv->r_ibinc = NULL; 886 recv->r_ibinc = NULL;
763 ic->i_ibinc = ibinc; 887 ic->i_ibinc = ibinc;
@@ -842,32 +966,38 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic,
842 struct rds_ib_recv_work *recv; 966 struct rds_ib_recv_work *recv;
843 967
844 while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { 968 while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
845 rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", 969 rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
846 (unsigned long long)wc.wr_id, wc.status, wc.byte_len, 970 (unsigned long long)wc.wr_id, wc.status,
971 rds_ib_wc_status_str(wc.status), wc.byte_len,
847 be32_to_cpu(wc.ex.imm_data)); 972 be32_to_cpu(wc.ex.imm_data));
848 rds_ib_stats_inc(s_ib_rx_cq_event); 973 rds_ib_stats_inc(s_ib_rx_cq_event);
849 974
850 recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; 975 recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
851 976
852 rds_ib_recv_unmap_page(ic, recv); 977 ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
853 978
854 /* 979 /*
855 * Also process recvs in connecting state because it is possible 980 * Also process recvs in connecting state because it is possible
856 * to get a recv completion _before_ the rdmacm ESTABLISHED 981 * to get a recv completion _before_ the rdmacm ESTABLISHED
857 * event is processed. 982 * event is processed.
858 */ 983 */
859 if (rds_conn_up(conn) || rds_conn_connecting(conn)) { 984 if (wc.status == IB_WC_SUCCESS) {
985 rds_ib_process_recv(conn, recv, wc.byte_len, state);
986 } else {
860 /* We expect errors as the qp is drained during shutdown */ 987 /* We expect errors as the qp is drained during shutdown */
861 if (wc.status == IB_WC_SUCCESS) { 988 if (rds_conn_up(conn) || rds_conn_connecting(conn))
862 rds_ib_process_recv(conn, recv, wc.byte_len, state); 989 rds_ib_conn_error(conn, "recv completion on %pI4 had "
863 } else { 990 "status %u (%s), disconnecting and "
864 rds_ib_conn_error(conn, "recv completion on " 991 "reconnecting\n", &conn->c_faddr,
865 "%pI4 had status %u, disconnecting and " 992 wc.status,
866 "reconnecting\n", &conn->c_faddr, 993 rds_ib_wc_status_str(wc.status));
867 wc.status);
868 }
869 } 994 }
870 995
996 /*
997 * It's very important that we only free this ring entry if we've truly
998 * freed the resources allocated to the entry. The refilling path can
999 * leak if we don't.
1000 */
871 rds_ib_ring_free(&ic->i_recv_ring, 1); 1001 rds_ib_ring_free(&ic->i_recv_ring, 1);
872 } 1002 }
873} 1003}
@@ -897,11 +1027,8 @@ void rds_ib_recv_tasklet_fn(unsigned long data)
897 if (rds_ib_ring_empty(&ic->i_recv_ring)) 1027 if (rds_ib_ring_empty(&ic->i_recv_ring))
898 rds_ib_stats_inc(s_ib_rx_ring_empty); 1028 rds_ib_stats_inc(s_ib_rx_ring_empty);
899 1029
900 /*
901 * If the ring is running low, then schedule the thread to refill.
902 */
903 if (rds_ib_ring_low(&ic->i_recv_ring)) 1030 if (rds_ib_ring_low(&ic->i_recv_ring))
904 queue_delayed_work(rds_wq, &conn->c_recv_w, 0); 1031 rds_ib_recv_refill(conn, 0);
905} 1032}
906 1033
907int rds_ib_recv(struct rds_connection *conn) 1034int rds_ib_recv(struct rds_connection *conn)
@@ -910,25 +1037,13 @@ int rds_ib_recv(struct rds_connection *conn)
910 int ret = 0; 1037 int ret = 0;
911 1038
912 rdsdebug("conn %p\n", conn); 1039 rdsdebug("conn %p\n", conn);
913
914 /*
915 * If we get a temporary posting failure in this context then
916 * we're really low and we want the caller to back off for a bit.
917 */
918 mutex_lock(&ic->i_recv_mutex);
919 if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
920 ret = -ENOMEM;
921 else
922 rds_ib_stats_inc(s_ib_rx_refill_from_thread);
923 mutex_unlock(&ic->i_recv_mutex);
924
925 if (rds_conn_up(conn)) 1040 if (rds_conn_up(conn))
926 rds_ib_attempt_ack(ic); 1041 rds_ib_attempt_ack(ic);
927 1042
928 return ret; 1043 return ret;
929} 1044}
930 1045
931int __init rds_ib_recv_init(void) 1046int rds_ib_recv_init(void)
932{ 1047{
933 struct sysinfo si; 1048 struct sysinfo si;
934 int ret = -ENOMEM; 1049 int ret = -ENOMEM;
@@ -939,14 +1054,14 @@ int __init rds_ib_recv_init(void)
939 1054
940 rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming", 1055 rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
941 sizeof(struct rds_ib_incoming), 1056 sizeof(struct rds_ib_incoming),
942 0, 0, NULL); 1057 0, SLAB_HWCACHE_ALIGN, NULL);
943 if (rds_ib_incoming_slab == NULL) 1058 if (!rds_ib_incoming_slab)
944 goto out; 1059 goto out;
945 1060
946 rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", 1061 rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
947 sizeof(struct rds_page_frag), 1062 sizeof(struct rds_page_frag),
948 0, 0, NULL); 1063 0, SLAB_HWCACHE_ALIGN, NULL);
949 if (rds_ib_frag_slab == NULL) 1064 if (!rds_ib_frag_slab)
950 kmem_cache_destroy(rds_ib_incoming_slab); 1065 kmem_cache_destroy(rds_ib_incoming_slab);
951 else 1066 else
952 ret = 0; 1067 ret = 0;
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 17fa80803ab0..71f373c421bc 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -36,11 +36,49 @@
36#include <linux/dmapool.h> 36#include <linux/dmapool.h>
37 37
38#include "rds.h" 38#include "rds.h"
39#include "rdma.h"
40#include "ib.h" 39#include "ib.h"
41 40
42static void rds_ib_send_rdma_complete(struct rds_message *rm, 41static char *rds_ib_wc_status_strings[] = {
43 int wc_status) 42#define RDS_IB_WC_STATUS_STR(foo) \
43 [IB_WC_##foo] = __stringify(IB_WC_##foo)
44 RDS_IB_WC_STATUS_STR(SUCCESS),
45 RDS_IB_WC_STATUS_STR(LOC_LEN_ERR),
46 RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR),
47 RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR),
48 RDS_IB_WC_STATUS_STR(LOC_PROT_ERR),
49 RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR),
50 RDS_IB_WC_STATUS_STR(MW_BIND_ERR),
51 RDS_IB_WC_STATUS_STR(BAD_RESP_ERR),
52 RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR),
53 RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR),
54 RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR),
55 RDS_IB_WC_STATUS_STR(REM_OP_ERR),
56 RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR),
57 RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR),
58 RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR),
59 RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR),
60 RDS_IB_WC_STATUS_STR(REM_ABORT_ERR),
61 RDS_IB_WC_STATUS_STR(INV_EECN_ERR),
62 RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR),
63 RDS_IB_WC_STATUS_STR(FATAL_ERR),
64 RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR),
65 RDS_IB_WC_STATUS_STR(GENERAL_ERR),
66#undef RDS_IB_WC_STATUS_STR
67};
68
69char *rds_ib_wc_status_str(enum ib_wc_status status)
70{
71 return rds_str_array(rds_ib_wc_status_strings,
72 ARRAY_SIZE(rds_ib_wc_status_strings), status);
73}
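The table above uses designated initializers keyed by the IB_WC_* status values, so any status that was never named is left as a NULL hole. A minimal sketch of the bounds-and-hole check that rds_str_array() is presumed to perform (its definition is not part of this hunk, so the fallback string is an assumption):

#include <stddef.h>

static char *str_array(char **strings, size_t len, size_t idx)
{
        /* Reject out-of-range indices and NULL holes in the sparse table. */
        if (idx < len && strings[idx])
                return strings[idx];
        return "unknown";
}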
74
75/*
76 * Convert IB-specific error message to RDS error message and call core
77 * completion handler.
78 */
79static void rds_ib_send_complete(struct rds_message *rm,
80 int wc_status,
81 void (*complete)(struct rds_message *rm, int status))
44{ 82{
45 int notify_status; 83 int notify_status;
46 84
@@ -60,69 +98,125 @@ static void rds_ib_send_rdma_complete(struct rds_message *rm,
60 notify_status = RDS_RDMA_OTHER_ERROR; 98 notify_status = RDS_RDMA_OTHER_ERROR;
61 break; 99 break;
62 } 100 }
63 rds_rdma_send_complete(rm, notify_status); 101 complete(rm, notify_status);
102}
103
104static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
105 struct rm_data_op *op,
106 int wc_status)
107{
108 if (op->op_nents)
109 ib_dma_unmap_sg(ic->i_cm_id->device,
110 op->op_sg, op->op_nents,
111 DMA_TO_DEVICE);
64} 112}
65 113
66static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, 114static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
67 struct rds_rdma_op *op) 115 struct rm_rdma_op *op,
116 int wc_status)
68{ 117{
69 if (op->r_mapped) { 118 if (op->op_mapped) {
70 ib_dma_unmap_sg(ic->i_cm_id->device, 119 ib_dma_unmap_sg(ic->i_cm_id->device,
71 op->r_sg, op->r_nents, 120 op->op_sg, op->op_nents,
72 op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); 121 op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
73 op->r_mapped = 0; 122 op->op_mapped = 0;
74 } 123 }
124
125 /* If the user asked for a completion notification on this
126 * message, we can implement three different semantics:
127 * 1. Notify when we received the ACK on the RDS message
128 * that was queued with the RDMA. This provides reliable
129 * notification of RDMA status at the expense of a one-way
130 * packet delay.
131 * 2. Notify when the IB stack gives us the completion event for
132 * the RDMA operation.
133 * 3. Notify when the IB stack gives us the completion event for
134 * the accompanying RDS messages.
135 * Here, we implement approach #3. To implement approach #2,
136 * we would need to take an event for the rdma WR. To implement #1,
137 * don't call rds_rdma_send_complete at all, and fall back to the notify
138 * handling in the ACK processing code.
139 *
140 * Note: There's no need to explicitly sync any RDMA buffers using
141 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
142 * operation itself unmapped the RDMA buffers, which takes care
143 * of synching.
144 */
145 rds_ib_send_complete(container_of(op, struct rds_message, rdma),
146 wc_status, rds_rdma_send_complete);
147
148 if (op->op_write)
149 rds_stats_add(s_send_rdma_bytes, op->op_bytes);
150 else
151 rds_stats_add(s_recv_rdma_bytes, op->op_bytes);
75} 152}
76 153
77static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, 154static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
78 struct rds_ib_send_work *send, 155 struct rm_atomic_op *op,
79 int wc_status) 156 int wc_status)
80{ 157{
81 struct rds_message *rm = send->s_rm; 158 /* unmap atomic recvbuf */
82 159 if (op->op_mapped) {
83 rdsdebug("ic %p send %p rm %p\n", ic, send, rm); 160 ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1,
84 161 DMA_FROM_DEVICE);
85 ib_dma_unmap_sg(ic->i_cm_id->device, 162 op->op_mapped = 0;
86 rm->m_sg, rm->m_nents, 163 }
87 DMA_TO_DEVICE);
88
89 if (rm->m_rdma_op != NULL) {
90 rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
91
92 /* If the user asked for a completion notification on this
93 * message, we can implement three different semantics:
94 * 1. Notify when we received the ACK on the RDS message
95 * that was queued with the RDMA. This provides reliable
96 * notification of RDMA status at the expense of a one-way
97 * packet delay.
98 * 2. Notify when the IB stack gives us the completion event for
99 * the RDMA operation.
100 * 3. Notify when the IB stack gives us the completion event for
101 * the accompanying RDS messages.
102 * Here, we implement approach #3. To implement approach #2,
103 * call rds_rdma_send_complete from the cq_handler. To implement #1,
104 * don't call rds_rdma_send_complete at all, and fall back to the notify
105 * handling in the ACK processing code.
106 *
107 * Note: There's no need to explicitly sync any RDMA buffers using
108 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
109 * operation itself unmapped the RDMA buffers, which takes care
110 * of synching.
111 */
112 rds_ib_send_rdma_complete(rm, wc_status);
113 164
114 if (rm->m_rdma_op->r_write) 165 rds_ib_send_complete(container_of(op, struct rds_message, atomic),
115 rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); 166 wc_status, rds_atomic_send_complete);
116 else 167
117 rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); 168 if (op->op_type == RDS_ATOMIC_TYPE_CSWP)
169 rds_ib_stats_inc(s_ib_atomic_cswp);
170 else
171 rds_ib_stats_inc(s_ib_atomic_fadd);
172}
173
174/*
175 * Unmap the resources associated with a struct send_work.
176 *
177 * Returns the rm because the caller, the event handler, needs it,
178 * and currently switching on wr.opcode here is the only way to
179 * obtain it.
180 */
181static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic,
182 struct rds_ib_send_work *send,
183 int wc_status)
184{
185 struct rds_message *rm = NULL;
186
187 /* In the error case, wc.opcode sometimes contains garbage */
188 switch (send->s_wr.opcode) {
189 case IB_WR_SEND:
190 if (send->s_op) {
191 rm = container_of(send->s_op, struct rds_message, data);
192 rds_ib_send_unmap_data(ic, send->s_op, wc_status);
193 }
194 break;
195 case IB_WR_RDMA_WRITE:
196 case IB_WR_RDMA_READ:
197 if (send->s_op) {
198 rm = container_of(send->s_op, struct rds_message, rdma);
199 rds_ib_send_unmap_rdma(ic, send->s_op, wc_status);
200 }
201 break;
202 case IB_WR_ATOMIC_FETCH_AND_ADD:
203 case IB_WR_ATOMIC_CMP_AND_SWP:
204 if (send->s_op) {
205 rm = container_of(send->s_op, struct rds_message, atomic);
206 rds_ib_send_unmap_atomic(ic, send->s_op, wc_status);
207 }
208 break;
209 default:
210 if (printk_ratelimit())
211 printk(KERN_NOTICE
212 "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
213 __func__, send->s_wr.opcode);
214 break;
118 } 215 }
119 216
120 /* If anyone waited for this message to get flushed out, wake 217 send->s_wr.opcode = 0xdead;
121 * them up now */
122 rds_message_unmapped(rm);
123 218
124 rds_message_put(rm); 219 return rm;
125 send->s_rm = NULL;
126} 220}
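rds_ib_send_unmap_op() recovers the owning rds_message by applying container_of() to the embedded data, rdma, or atomic op. For readers who do not have the macro in their head, a self-contained userspace rendering of the idiom, with hypothetical structures standing in for rds_message and rm_data_op:

#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct data_op { int nents; };                  /* stand-in for rm_data_op  */
struct message {                                /* stand-in for rds_message */
        int id;
        struct data_op data;
};

static struct message *msg_from_data_op(struct data_op *op)
{
        /* Step back from the member's address to the enclosing structure. */
        return container_of(op, struct message, data);
}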
127 221
128void rds_ib_send_init_ring(struct rds_ib_connection *ic) 222void rds_ib_send_init_ring(struct rds_ib_connection *ic)
@@ -133,23 +227,18 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
133 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { 227 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
134 struct ib_sge *sge; 228 struct ib_sge *sge;
135 229
136 send->s_rm = NULL;
137 send->s_op = NULL; 230 send->s_op = NULL;
138 231
139 send->s_wr.wr_id = i; 232 send->s_wr.wr_id = i;
140 send->s_wr.sg_list = send->s_sge; 233 send->s_wr.sg_list = send->s_sge;
141 send->s_wr.num_sge = 1;
142 send->s_wr.opcode = IB_WR_SEND;
143 send->s_wr.send_flags = 0;
144 send->s_wr.ex.imm_data = 0; 234 send->s_wr.ex.imm_data = 0;
145 235
146 sge = rds_ib_data_sge(ic, send->s_sge); 236 sge = &send->s_sge[0];
147 sge->lkey = ic->i_mr->lkey;
148
149 sge = rds_ib_header_sge(ic, send->s_sge);
150 sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); 237 sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
151 sge->length = sizeof(struct rds_header); 238 sge->length = sizeof(struct rds_header);
152 sge->lkey = ic->i_mr->lkey; 239 sge->lkey = ic->i_mr->lkey;
240
241 send->s_sge[1].lkey = ic->i_mr->lkey;
153 } 242 }
154} 243}
155 244
@@ -159,16 +248,24 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
159 u32 i; 248 u32 i;
160 249
161 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { 250 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
162 if (send->s_wr.opcode == 0xdead) 251 if (send->s_op && send->s_wr.opcode != 0xdead)
163 continue; 252 rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR);
164 if (send->s_rm)
165 rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
166 if (send->s_op)
167 rds_ib_send_unmap_rdma(ic, send->s_op);
168 } 253 }
169} 254}
170 255
171/* 256/*
257 * The only fast path caller always has a non-zero nr, so we don't
258 * bother testing nr before performing the atomic sub.
259 */
260static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr)
261{
262 if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) &&
263 waitqueue_active(&rds_ib_ring_empty_wait))
264 wake_up(&rds_ib_ring_empty_wait);
265 BUG_ON(atomic_read(&ic->i_signaled_sends) < 0);
266}
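The counter pairs with the xmit paths below, which atomic_add() the number of signaled work requests they post; rds_ib_sub_signaled() subtracts them as completions arrive and wakes the drain waiter when the count reaches zero. A hedged userspace sketch of that pairing, with made-up names and a plain atomic int in place of i_signaled_sends:

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int signaled_in_flight;

/* Called after successfully posting nr signaled work requests. */
static void signaled_posted(int nr)
{
        atomic_fetch_add(&signaled_in_flight, nr);
}

/* Called from the completion handler; returns true when fully drained. */
static bool signaled_completed(int nr)
{
        int remaining = atomic_fetch_sub(&signaled_in_flight, nr) - nr;

        assert(remaining >= 0);         /* mirrors the BUG_ON() above */
        return remaining == 0;          /* caller wakes anyone waiting to drain */
}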
267
268/*
172 * The _oldest/_free ring operations here race cleanly with the alloc/unalloc 269 * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
173 * operations performed in the send path. As the sender allocs and potentially 270 * operations performed in the send path. As the sender allocs and potentially
174 * unallocs the next free entry in the ring it doesn't alter which is 271 * unallocs the next free entry in the ring it doesn't alter which is
@@ -178,12 +275,14 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
178{ 275{
179 struct rds_connection *conn = context; 276 struct rds_connection *conn = context;
180 struct rds_ib_connection *ic = conn->c_transport_data; 277 struct rds_ib_connection *ic = conn->c_transport_data;
278 struct rds_message *rm = NULL;
181 struct ib_wc wc; 279 struct ib_wc wc;
182 struct rds_ib_send_work *send; 280 struct rds_ib_send_work *send;
183 u32 completed; 281 u32 completed;
184 u32 oldest; 282 u32 oldest;
185 u32 i = 0; 283 u32 i = 0;
186 int ret; 284 int ret;
285 int nr_sig = 0;
187 286
188 rdsdebug("cq %p conn %p\n", cq, conn); 287 rdsdebug("cq %p conn %p\n", cq, conn);
189 rds_ib_stats_inc(s_ib_tx_cq_call); 288 rds_ib_stats_inc(s_ib_tx_cq_call);
@@ -192,8 +291,9 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
192 rdsdebug("ib_req_notify_cq send failed: %d\n", ret); 291 rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
193 292
194 while (ib_poll_cq(cq, 1, &wc) > 0) { 293 while (ib_poll_cq(cq, 1, &wc) > 0) {
195 rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", 294 rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
196 (unsigned long long)wc.wr_id, wc.status, wc.byte_len, 295 (unsigned long long)wc.wr_id, wc.status,
296 rds_ib_wc_status_str(wc.status), wc.byte_len,
197 be32_to_cpu(wc.ex.imm_data)); 297 be32_to_cpu(wc.ex.imm_data));
198 rds_ib_stats_inc(s_ib_tx_cq_event); 298 rds_ib_stats_inc(s_ib_tx_cq_event);
199 299
@@ -210,51 +310,30 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
210 310
211 for (i = 0; i < completed; i++) { 311 for (i = 0; i < completed; i++) {
212 send = &ic->i_sends[oldest]; 312 send = &ic->i_sends[oldest];
313 if (send->s_wr.send_flags & IB_SEND_SIGNALED)
314 nr_sig++;
213 315
214 /* In the error case, wc.opcode sometimes contains garbage */ 316 rm = rds_ib_send_unmap_op(ic, send, wc.status);
215 switch (send->s_wr.opcode) {
216 case IB_WR_SEND:
217 if (send->s_rm)
218 rds_ib_send_unmap_rm(ic, send, wc.status);
219 break;
220 case IB_WR_RDMA_WRITE:
221 case IB_WR_RDMA_READ:
222 /* Nothing to be done - the SG list will be unmapped
223 * when the SEND completes. */
224 break;
225 default:
226 if (printk_ratelimit())
227 printk(KERN_NOTICE
228 "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
229 __func__, send->s_wr.opcode);
230 break;
231 }
232 317
233 send->s_wr.opcode = 0xdead;
234 send->s_wr.num_sge = 1;
235 if (send->s_queued + HZ/2 < jiffies) 318 if (send->s_queued + HZ/2 < jiffies)
236 rds_ib_stats_inc(s_ib_tx_stalled); 319 rds_ib_stats_inc(s_ib_tx_stalled);
237 320
238 /* If a RDMA operation produced an error, signal this right 321 if (send->s_op) {
239 * away. If we don't, the subsequent SEND that goes with this 322 if (send->s_op == rm->m_final_op) {
240 * RDMA will be canceled with ERR_WFLUSH, and the application 323 /* If anyone waited for this message to get flushed out, wake
241 * never learn that the RDMA failed. */ 324 * them up now */
242 if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { 325 rds_message_unmapped(rm);
243 struct rds_message *rm;
244
245 rm = rds_send_get_message(conn, send->s_op);
246 if (rm) {
247 if (rm->m_rdma_op)
248 rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
249 rds_ib_send_rdma_complete(rm, wc.status);
250 rds_message_put(rm);
251 } 326 }
327 rds_message_put(rm);
328 send->s_op = NULL;
252 } 329 }
253 330
254 oldest = (oldest + 1) % ic->i_send_ring.w_nr; 331 oldest = (oldest + 1) % ic->i_send_ring.w_nr;
255 } 332 }
256 333
257 rds_ib_ring_free(&ic->i_send_ring, completed); 334 rds_ib_ring_free(&ic->i_send_ring, completed);
335 rds_ib_sub_signaled(ic, nr_sig);
336 nr_sig = 0;
258 337
259 if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || 338 if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
260 test_bit(0, &conn->c_map_queued)) 339 test_bit(0, &conn->c_map_queued))
@@ -262,10 +341,10 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
262 341
263 /* We expect errors as the qp is drained during shutdown */ 342 /* We expect errors as the qp is drained during shutdown */
264 if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { 343 if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
265 rds_ib_conn_error(conn, 344 rds_ib_conn_error(conn, "send completion on %pI4 had status "
266 "send completion on %pI4 " 345 "%u (%s), disconnecting and reconnecting\n",
267 "had status %u, disconnecting and reconnecting\n", 346 &conn->c_faddr, wc.status,
268 &conn->c_faddr, wc.status); 347 rds_ib_wc_status_str(wc.status));
269 } 348 }
270 } 349 }
271} 350}
@@ -294,7 +373,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
294 * credits (see rds_ib_send_add_credits below). 373 * credits (see rds_ib_send_add_credits below).
295 * 374 *
296 * The RDS send code is essentially single-threaded; rds_send_xmit 375 * The RDS send code is essentially single-threaded; rds_send_xmit
297 * grabs c_send_lock to ensure exclusive access to the send ring. 376 * sets RDS_IN_XMIT to ensure exclusive access to the send ring.
298 * However, the ACK sending code is independent and can race with 377 * However, the ACK sending code is independent and can race with
299 * message SENDs. 378 * message SENDs.
300 * 379 *
@@ -413,40 +492,21 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
413 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); 492 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
414} 493}
415 494
416static inline void 495static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic,
417rds_ib_xmit_populate_wr(struct rds_ib_connection *ic, 496 struct rds_ib_send_work *send,
418 struct rds_ib_send_work *send, unsigned int pos, 497 bool notify)
419 unsigned long buffer, unsigned int length,
420 int send_flags)
421{ 498{
422 struct ib_sge *sge; 499 /*
423 500 * We want to delay signaling completions just enough to get
424 WARN_ON(pos != send - ic->i_sends); 501 * the batching benefits but not so much that we create dead time
425 502 * on the wire.
426 send->s_wr.send_flags = send_flags; 503 */
427 send->s_wr.opcode = IB_WR_SEND; 504 if (ic->i_unsignaled_wrs-- == 0 || notify) {
428 send->s_wr.num_sge = 2; 505 ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
429 send->s_wr.next = NULL; 506 send->s_wr.send_flags |= IB_SEND_SIGNALED;
430 send->s_queued = jiffies; 507 return 1;
431 send->s_op = NULL;
432
433 if (length != 0) {
434 sge = rds_ib_data_sge(ic, send->s_sge);
435 sge->addr = buffer;
436 sge->length = length;
437 sge->lkey = ic->i_mr->lkey;
438
439 sge = rds_ib_header_sge(ic, send->s_sge);
440 } else {
441 /* We're sending a packet with no payload. There is only
442 * one SGE */
443 send->s_wr.num_sge = 1;
444 sge = &send->s_sge[0];
445 } 508 }
446 509 return 0;
447 sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
448 sge->length = sizeof(struct rds_header);
449 sge->lkey = ic->i_mr->lkey;
450} 510}
451 511
452/* 512/*
@@ -475,13 +535,14 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
475 u32 pos; 535 u32 pos;
476 u32 i; 536 u32 i;
477 u32 work_alloc; 537 u32 work_alloc;
478 u32 credit_alloc; 538 u32 credit_alloc = 0;
479 u32 posted; 539 u32 posted;
480 u32 adv_credits = 0; 540 u32 adv_credits = 0;
481 int send_flags = 0; 541 int send_flags = 0;
482 int sent; 542 int bytes_sent = 0;
483 int ret; 543 int ret;
484 int flow_controlled = 0; 544 int flow_controlled = 0;
545 int nr_sig = 0;
485 546
486 BUG_ON(off % RDS_FRAG_SIZE); 547 BUG_ON(off % RDS_FRAG_SIZE);
487 BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); 548 BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
@@ -507,14 +568,13 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
507 goto out; 568 goto out;
508 } 569 }
509 570
510 credit_alloc = work_alloc;
511 if (ic->i_flowctl) { 571 if (ic->i_flowctl) {
512 credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); 572 credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
513 adv_credits += posted; 573 adv_credits += posted;
514 if (credit_alloc < work_alloc) { 574 if (credit_alloc < work_alloc) {
515 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); 575 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
516 work_alloc = credit_alloc; 576 work_alloc = credit_alloc;
517 flow_controlled++; 577 flow_controlled = 1;
518 } 578 }
519 if (work_alloc == 0) { 579 if (work_alloc == 0) {
520 set_bit(RDS_LL_SEND_FULL, &conn->c_flags); 580 set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
@@ -525,31 +585,25 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
525 } 585 }
526 586
527 /* map the message the first time we see it */ 587 /* map the message the first time we see it */
528 if (ic->i_rm == NULL) { 588 if (!ic->i_data_op) {
529 /* 589 if (rm->data.op_nents) {
530 printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n", 590 rm->data.op_count = ib_dma_map_sg(dev,
531 be16_to_cpu(rm->m_inc.i_hdr.h_dport), 591 rm->data.op_sg,
532 rm->m_inc.i_hdr.h_flags, 592 rm->data.op_nents,
533 be32_to_cpu(rm->m_inc.i_hdr.h_len)); 593 DMA_TO_DEVICE);
534 */ 594 rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
535 if (rm->m_nents) { 595 if (rm->data.op_count == 0) {
536 rm->m_count = ib_dma_map_sg(dev,
537 rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
538 rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
539 if (rm->m_count == 0) {
540 rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); 596 rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
541 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 597 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
542 ret = -ENOMEM; /* XXX ? */ 598 ret = -ENOMEM; /* XXX ? */
543 goto out; 599 goto out;
544 } 600 }
545 } else { 601 } else {
546 rm->m_count = 0; 602 rm->data.op_count = 0;
547 } 603 }
548 604
549 ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
550 ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
551 rds_message_addref(rm); 605 rds_message_addref(rm);
552 ic->i_rm = rm; 606 ic->i_data_op = &rm->data;
553 607
554 /* Finalize the header */ 608 /* Finalize the header */
555 if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) 609 if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
@@ -559,10 +613,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
559 613
560 /* If it has a RDMA op, tell the peer we did it. This is 614 /* If it has a RDMA op, tell the peer we did it. This is
561 * used by the peer to release use-once RDMA MRs. */ 615 * used by the peer to release use-once RDMA MRs. */
562 if (rm->m_rdma_op) { 616 if (rm->rdma.op_active) {
563 struct rds_ext_header_rdma ext_hdr; 617 struct rds_ext_header_rdma ext_hdr;
564 618
565 ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); 619 ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
566 rds_message_add_extension(&rm->m_inc.i_hdr, 620 rds_message_add_extension(&rm->m_inc.i_hdr,
567 RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); 621 RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
568 } 622 }
@@ -582,99 +636,77 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
582 /* 636 /*
583 * Update adv_credits since we reset the ACK_REQUIRED bit. 637 * Update adv_credits since we reset the ACK_REQUIRED bit.
584 */ 638 */
585 rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); 639 if (ic->i_flowctl) {
586 adv_credits += posted; 640 rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
587 BUG_ON(adv_credits > 255); 641 adv_credits += posted;
642 BUG_ON(adv_credits > 255);
643 }
588 } 644 }
589 645
590 send = &ic->i_sends[pos];
591 first = send;
592 prev = NULL;
593 scat = &rm->m_sg[sg];
594 sent = 0;
595 i = 0;
596
597 /* Sometimes you want to put a fence between an RDMA 646 /* Sometimes you want to put a fence between an RDMA
598 * READ and the following SEND. 647 * READ and the following SEND.
599 * We could either do this all the time 648 * We could either do this all the time
600 * or when requested by the user. Right now, we let 649 * or when requested by the user. Right now, we let
601 * the application choose. 650 * the application choose.
602 */ 651 */
603 if (rm->m_rdma_op && rm->m_rdma_op->r_fence) 652 if (rm->rdma.op_active && rm->rdma.op_fence)
604 send_flags = IB_SEND_FENCE; 653 send_flags = IB_SEND_FENCE;
605 654
606 /* 655 /* Each frag gets a header. Msgs may be 0 bytes */
607 * We could be copying the header into the unused tail of the page. 656 send = &ic->i_sends[pos];
608 * That would need to be changed in the future when those pages might 657 first = send;
609 * be mapped userspace pages or page cache pages. So instead we always 658 prev = NULL;
610 * use a second sge and our long-lived ring of mapped headers. We send 659 scat = &ic->i_data_op->op_sg[sg];
611 * the header after the data so that the data payload can be aligned on 660 i = 0;
612 * the receiver. 661 do {
613 */ 662 unsigned int len = 0;
614 663
615 /* handle a 0-len message */ 664 /* Set up the header */
616 if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { 665 send->s_wr.send_flags = send_flags;
617 rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); 666 send->s_wr.opcode = IB_WR_SEND;
618 goto add_header; 667 send->s_wr.num_sge = 1;
619 } 668 send->s_wr.next = NULL;
669 send->s_queued = jiffies;
670 send->s_op = NULL;
620 671
621 /* if there's data reference it with a chain of work reqs */ 672 send->s_sge[0].addr = ic->i_send_hdrs_dma
622 for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { 673 + (pos * sizeof(struct rds_header));
623 unsigned int len; 674 send->s_sge[0].length = sizeof(struct rds_header);
624 675
625 send = &ic->i_sends[pos]; 676 memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
626 677
627 len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); 678 /* Set up the data, if present */
628 rds_ib_xmit_populate_wr(ic, send, pos, 679 if (i < work_alloc
629 ib_sg_dma_address(dev, scat) + off, len, 680 && scat != &rm->data.op_sg[rm->data.op_count]) {
630 send_flags); 681 len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
682 send->s_wr.num_sge = 2;
631 683
632 /* 684 send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off;
633 * We want to delay signaling completions just enough to get 685 send->s_sge[1].length = len;
634 * the batching benefits but not so much that we create dead time
635 * on the wire.
636 */
637 if (ic->i_unsignaled_wrs-- == 0) {
638 ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
639 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
640 }
641 686
642 ic->i_unsignaled_bytes -= len; 687 bytes_sent += len;
643 if (ic->i_unsignaled_bytes <= 0) { 688 off += len;
644 ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; 689 if (off == ib_sg_dma_len(dev, scat)) {
645 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; 690 scat++;
691 off = 0;
692 }
646 } 693 }
647 694
695 rds_ib_set_wr_signal_state(ic, send, 0);
696
648 /* 697 /*
649 * Always signal the last one if we're stopping due to flow control. 698 * Always signal the last one if we're stopping due to flow control.
650 */ 699 */
651 if (flow_controlled && i == (work_alloc-1)) 700 if (ic->i_flowctl && flow_controlled && i == (work_alloc-1))
652 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; 701 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
653 702
703 if (send->s_wr.send_flags & IB_SEND_SIGNALED)
704 nr_sig++;
705
654 rdsdebug("send %p wr %p num_sge %u next %p\n", send, 706 rdsdebug("send %p wr %p num_sge %u next %p\n", send,
655 &send->s_wr, send->s_wr.num_sge, send->s_wr.next); 707 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
656 708
657 sent += len; 709 if (ic->i_flowctl && adv_credits) {
658 off += len;
659 if (off == ib_sg_dma_len(dev, scat)) {
660 scat++;
661 off = 0;
662 }
663
664add_header:
665 /* Tack on the header after the data. The header SGE should already
666 * have been set up to point to the right header buffer. */
667 memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
668
669 if (0) {
670 struct rds_header *hdr = &ic->i_send_hdrs[pos];
671
672 printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
673 be16_to_cpu(hdr->h_dport),
674 hdr->h_flags,
675 be32_to_cpu(hdr->h_len));
676 }
677 if (adv_credits) {
678 struct rds_header *hdr = &ic->i_send_hdrs[pos]; 710 struct rds_header *hdr = &ic->i_send_hdrs[pos];
679 711
680 /* add credit and redo the header checksum */ 712 /* add credit and redo the header checksum */
@@ -689,20 +721,25 @@ add_header:
689 prev = send; 721 prev = send;
690 722
691 pos = (pos + 1) % ic->i_send_ring.w_nr; 723 pos = (pos + 1) % ic->i_send_ring.w_nr;
692 } 724 send = &ic->i_sends[pos];
725 i++;
726
727 } while (i < work_alloc
728 && scat != &rm->data.op_sg[rm->data.op_count]);
693 729
694 /* Account the RDS header in the number of bytes we sent, but just once. 730 /* Account the RDS header in the number of bytes we sent, but just once.
695 * The caller has no concept of fragmentation. */ 731 * The caller has no concept of fragmentation. */
696 if (hdr_off == 0) 732 if (hdr_off == 0)
697 sent += sizeof(struct rds_header); 733 bytes_sent += sizeof(struct rds_header);
698 734
699 /* if we finished the message then send completion owns it */ 735 /* if we finished the message then send completion owns it */
700 if (scat == &rm->m_sg[rm->m_count]) { 736 if (scat == &rm->data.op_sg[rm->data.op_count]) {
701 prev->s_rm = ic->i_rm; 737 prev->s_op = ic->i_data_op;
702 prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; 738 prev->s_wr.send_flags |= IB_SEND_SOLICITED;
703 ic->i_rm = NULL; 739 ic->i_data_op = NULL;
704 } 740 }
705 741
742 /* Put back wrs & credits we didn't use */
706 if (i < work_alloc) { 743 if (i < work_alloc) {
707 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); 744 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
708 work_alloc = i; 745 work_alloc = i;
@@ -710,6 +747,9 @@ add_header:
710 if (ic->i_flowctl && i < credit_alloc) 747 if (ic->i_flowctl && i < credit_alloc)
711 rds_ib_send_add_credits(conn, credit_alloc - i); 748 rds_ib_send_add_credits(conn, credit_alloc - i);
712 749
750 if (nr_sig)
751 atomic_add(nr_sig, &ic->i_signaled_sends);
752
713 /* XXX need to worry about failed_wr and partial sends. */ 753 /* XXX need to worry about failed_wr and partial sends. */
714 failed_wr = &first->s_wr; 754 failed_wr = &first->s_wr;
715 ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); 755 ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
@@ -720,32 +760,127 @@ add_header:
720 printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " 760 printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
721 "returned %d\n", &conn->c_faddr, ret); 761 "returned %d\n", &conn->c_faddr, ret);
722 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 762 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
723 if (prev->s_rm) { 763 rds_ib_sub_signaled(ic, nr_sig);
724 ic->i_rm = prev->s_rm; 764 if (prev->s_op) {
725 prev->s_rm = NULL; 765 ic->i_data_op = prev->s_op;
766 prev->s_op = NULL;
726 } 767 }
727 768
728 rds_ib_conn_error(ic->conn, "ib_post_send failed\n"); 769 rds_ib_conn_error(ic->conn, "ib_post_send failed\n");
729 goto out; 770 goto out;
730 } 771 }
731 772
732 ret = sent; 773 ret = bytes_sent;
733out: 774out:
734 BUG_ON(adv_credits); 775 BUG_ON(adv_credits);
735 return ret; 776 return ret;
736} 777}
737 778
738int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) 779/*
780 * Issue atomic operation.
781 * A simplified version of the rdma case, we always map 1 SG, and
782 * only 8 bytes, for the return value from the atomic operation.
783 */
784int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
785{
786 struct rds_ib_connection *ic = conn->c_transport_data;
787 struct rds_ib_send_work *send = NULL;
788 struct ib_send_wr *failed_wr;
789 struct rds_ib_device *rds_ibdev;
790 u32 pos;
791 u32 work_alloc;
792 int ret;
793 int nr_sig = 0;
794
795 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
796
797 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
798 if (work_alloc != 1) {
799 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
800 rds_ib_stats_inc(s_ib_tx_ring_full);
801 ret = -ENOMEM;
802 goto out;
803 }
804
805 /* address of send request in ring */
806 send = &ic->i_sends[pos];
807 send->s_queued = jiffies;
808
809 if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
810 send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
811 send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare;
812 send->s_wr.wr.atomic.swap = op->op_m_cswp.swap;
813 send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask;
814 send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask;
815 } else { /* FADD */
816 send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
817 send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add;
818 send->s_wr.wr.atomic.swap = 0;
819 send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask;
820 send->s_wr.wr.atomic.swap_mask = 0;
821 }
822 nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
823 send->s_wr.num_sge = 1;
824 send->s_wr.next = NULL;
825 send->s_wr.wr.atomic.remote_addr = op->op_remote_addr;
826 send->s_wr.wr.atomic.rkey = op->op_rkey;
827 send->s_op = op;
828 rds_message_addref(container_of(send->s_op, struct rds_message, atomic));
829
830 /* map 8 byte retval buffer to the device */
831 ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
832 rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret);
833 if (ret != 1) {
834 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
835 rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
836 ret = -ENOMEM; /* XXX ? */
837 goto out;
838 }
839
840 /* Convert our struct scatterlist to struct ib_sge */
841 send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg);
842 send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg);
843 send->s_sge[0].lkey = ic->i_mr->lkey;
844
845 rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
846 send->s_sge[0].addr, send->s_sge[0].length);
847
848 if (nr_sig)
849 atomic_add(nr_sig, &ic->i_signaled_sends);
850
851 failed_wr = &send->s_wr;
852 ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr);
853 rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
854 send, &send->s_wr, ret, failed_wr);
855 BUG_ON(failed_wr != &send->s_wr);
856 if (ret) {
857 printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
858 "returned %d\n", &conn->c_faddr, ret);
859 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
860 rds_ib_sub_signaled(ic, nr_sig);
861 goto out;
862 }
863
864 if (unlikely(failed_wr != &send->s_wr)) {
865 printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
866 BUG_ON(failed_wr != &send->s_wr);
867 }
868
869out:
870 return ret;
871}
872
873int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
739{ 874{
740 struct rds_ib_connection *ic = conn->c_transport_data; 875 struct rds_ib_connection *ic = conn->c_transport_data;
741 struct rds_ib_send_work *send = NULL; 876 struct rds_ib_send_work *send = NULL;
742 struct rds_ib_send_work *first; 877 struct rds_ib_send_work *first;
743 struct rds_ib_send_work *prev; 878 struct rds_ib_send_work *prev;
744 struct ib_send_wr *failed_wr; 879 struct ib_send_wr *failed_wr;
745 struct rds_ib_device *rds_ibdev;
746 struct scatterlist *scat; 880 struct scatterlist *scat;
747 unsigned long len; 881 unsigned long len;
748 u64 remote_addr = op->r_remote_addr; 882 u64 remote_addr = op->op_remote_addr;
883 u32 max_sge = ic->rds_ibdev->max_sge;
749 u32 pos; 884 u32 pos;
750 u32 work_alloc; 885 u32 work_alloc;
751 u32 i; 886 u32 i;
@@ -753,29 +888,28 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
753 int sent; 888 int sent;
754 int ret; 889 int ret;
755 int num_sge; 890 int num_sge;
756 891 int nr_sig = 0;
757 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); 892
758 893 /* map the op the first time we see it */
759 /* map the message the first time we see it */ 894 if (!op->op_mapped) {
760 if (!op->r_mapped) { 895 op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
761 op->r_count = ib_dma_map_sg(ic->i_cm_id->device, 896 op->op_sg, op->op_nents, (op->op_write) ?
762 op->r_sg, op->r_nents, (op->r_write) ? 897 DMA_TO_DEVICE : DMA_FROM_DEVICE);
763 DMA_TO_DEVICE : DMA_FROM_DEVICE); 898 rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
764 rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); 899 if (op->op_count == 0) {
765 if (op->r_count == 0) {
766 rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); 900 rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
767 ret = -ENOMEM; /* XXX ? */ 901 ret = -ENOMEM; /* XXX ? */
768 goto out; 902 goto out;
769 } 903 }
770 904
771 op->r_mapped = 1; 905 op->op_mapped = 1;
772 } 906 }
773 907
774 /* 908 /*
775 * Instead of knowing how to return a partial rdma read/write we insist that there 909 * Instead of knowing how to return a partial rdma read/write we insist that there
776 * be enough work requests to send the entire message. 910 * be enough work requests to send the entire message.
777 */ 911 */
778 i = ceil(op->r_count, rds_ibdev->max_sge); 912 i = ceil(op->op_count, max_sge);
779 913
780 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); 914 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
781 if (work_alloc != i) { 915 if (work_alloc != i) {
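The ceil() above only rounds op_count up to a whole number of work requests, each carrying at most max_sge scatter entries, so that a partial RDMA never has to be returned. Assuming the obvious definition, the arithmetic looks like this; for example, 10 mapped entries with max_sge = 4 need 3 work requests:

/* Assumed equivalent of the ceil() helper used above. */
static unsigned int wrs_needed(unsigned int sg_count, unsigned int max_sge)
{
        return (sg_count + max_sge - 1) / max_sge;
}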
@@ -788,30 +922,24 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
788 send = &ic->i_sends[pos]; 922 send = &ic->i_sends[pos];
789 first = send; 923 first = send;
790 prev = NULL; 924 prev = NULL;
791 scat = &op->r_sg[0]; 925 scat = &op->op_sg[0];
792 sent = 0; 926 sent = 0;
793 num_sge = op->r_count; 927 num_sge = op->op_count;
794 928
795 for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { 929 for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
796 send->s_wr.send_flags = 0; 930 send->s_wr.send_flags = 0;
797 send->s_queued = jiffies; 931 send->s_queued = jiffies;
798 /* 932 send->s_op = NULL;
799 * We want to delay signaling completions just enough to get 933
800 * the batching benefits but not so much that we create dead time on the wire. 934 nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify);
801 */
802 if (ic->i_unsignaled_wrs-- == 0) {
803 ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
804 send->s_wr.send_flags = IB_SEND_SIGNALED;
805 }
806 935
807 send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; 936 send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
808 send->s_wr.wr.rdma.remote_addr = remote_addr; 937 send->s_wr.wr.rdma.remote_addr = remote_addr;
809 send->s_wr.wr.rdma.rkey = op->r_key; 938 send->s_wr.wr.rdma.rkey = op->op_rkey;
810 send->s_op = op;
811 939
812 if (num_sge > rds_ibdev->max_sge) { 940 if (num_sge > max_sge) {
813 send->s_wr.num_sge = rds_ibdev->max_sge; 941 send->s_wr.num_sge = max_sge;
814 num_sge -= rds_ibdev->max_sge; 942 num_sge -= max_sge;
815 } else { 943 } else {
816 send->s_wr.num_sge = num_sge; 944 send->s_wr.num_sge = num_sge;
817 } 945 }
@@ -821,7 +949,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
821 if (prev) 949 if (prev)
822 prev->s_wr.next = &send->s_wr; 950 prev->s_wr.next = &send->s_wr;
823 951
824 for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { 952 for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
825 len = ib_sg_dma_len(ic->i_cm_id->device, scat); 953 len = ib_sg_dma_len(ic->i_cm_id->device, scat);
826 send->s_sge[j].addr = 954 send->s_sge[j].addr =
827 ib_sg_dma_address(ic->i_cm_id->device, scat); 955 ib_sg_dma_address(ic->i_cm_id->device, scat);
@@ -843,15 +971,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
843 send = ic->i_sends; 971 send = ic->i_sends;
844 } 972 }
845 973
846 /* if we finished the message then send completion owns it */ 974 /* give a reference to the last op */
847 if (scat == &op->r_sg[op->r_count]) 975 if (scat == &op->op_sg[op->op_count]) {
848 prev->s_wr.send_flags = IB_SEND_SIGNALED; 976 prev->s_op = op;
977 rds_message_addref(container_of(op, struct rds_message, rdma));
978 }
849 979
850 if (i < work_alloc) { 980 if (i < work_alloc) {
851 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); 981 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
852 work_alloc = i; 982 work_alloc = i;
853 } 983 }
854 984
985 if (nr_sig)
986 atomic_add(nr_sig, &ic->i_signaled_sends);
987
855 failed_wr = &first->s_wr; 988 failed_wr = &first->s_wr;
856 ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); 989 ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
857 rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, 990 rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
@@ -861,6 +994,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
861 printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " 994 printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
862 "returned %d\n", &conn->c_faddr, ret); 995 "returned %d\n", &conn->c_faddr, ret);
863 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 996 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
997 rds_ib_sub_signaled(ic, nr_sig);
864 goto out; 998 goto out;
865 } 999 }
866 1000
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
index d2c904dd6fbc..2d5965d6e97c 100644
--- a/net/rds/ib_stats.c
+++ b/net/rds/ib_stats.c
@@ -67,6 +67,8 @@ static const char *const rds_ib_stat_names[] = {
67 "ib_rdma_mr_pool_flush", 67 "ib_rdma_mr_pool_flush",
68 "ib_rdma_mr_pool_wait", 68 "ib_rdma_mr_pool_wait",
69 "ib_rdma_mr_pool_depleted", 69 "ib_rdma_mr_pool_depleted",
70 "ib_atomic_cswp",
71 "ib_atomic_fadd",
70}; 72};
71 73
72unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, 74unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
index 03f01cb4e0fe..1253b006efdb 100644
--- a/net/rds/ib_sysctl.c
+++ b/net/rds/ib_sysctl.c
@@ -49,10 +49,6 @@ unsigned long rds_ib_sysctl_max_unsig_wrs = 16;
49static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1; 49static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1;
50static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64; 50static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
51 51
52unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20);
53static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1;
54static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
55
56/* 52/*
57 * This sysctl does nothing. 53 * This sysctl does nothing.
58 * 54 *
@@ -65,7 +61,7 @@ static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
65 */ 61 */
66unsigned int rds_ib_sysctl_flow_control = 0; 62unsigned int rds_ib_sysctl_flow_control = 0;
67 63
68ctl_table rds_ib_sysctl_table[] = { 64static ctl_table rds_ib_sysctl_table[] = {
69 { 65 {
70 .procname = "max_send_wr", 66 .procname = "max_send_wr",
71 .data = &rds_ib_sysctl_max_send_wr, 67 .data = &rds_ib_sysctl_max_send_wr,
@@ -94,15 +90,6 @@ ctl_table rds_ib_sysctl_table[] = {
94 .extra2 = &rds_ib_sysctl_max_unsig_wr_max, 90 .extra2 = &rds_ib_sysctl_max_unsig_wr_max,
95 }, 91 },
96 { 92 {
97 .procname = "max_unsignaled_bytes",
98 .data = &rds_ib_sysctl_max_unsig_bytes,
99 .maxlen = sizeof(unsigned long),
100 .mode = 0644,
101 .proc_handler = proc_doulongvec_minmax,
102 .extra1 = &rds_ib_sysctl_max_unsig_bytes_min,
103 .extra2 = &rds_ib_sysctl_max_unsig_bytes_max,
104 },
105 {
106 .procname = "max_recv_allocation", 93 .procname = "max_recv_allocation",
107 .data = &rds_ib_sysctl_max_recv_allocation, 94 .data = &rds_ib_sysctl_max_recv_allocation,
108 .maxlen = sizeof(unsigned long), 95 .maxlen = sizeof(unsigned long),
@@ -132,10 +119,10 @@ void rds_ib_sysctl_exit(void)
132 unregister_sysctl_table(rds_ib_sysctl_hdr); 119 unregister_sysctl_table(rds_ib_sysctl_hdr);
133} 120}
134 121
135int __init rds_ib_sysctl_init(void) 122int rds_ib_sysctl_init(void)
136{ 123{
137 rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table); 124 rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table);
138 if (rds_ib_sysctl_hdr == NULL) 125 if (!rds_ib_sysctl_hdr)
139 return -ENOMEM; 126 return -ENOMEM;
140 return 0; 127 return 0;
141} 128}
diff --git a/net/rds/info.c b/net/rds/info.c
index c45c4173a44d..4fdf1b6e84ff 100644
--- a/net/rds/info.c
+++ b/net/rds/info.c
@@ -76,7 +76,7 @@ void rds_info_register_func(int optname, rds_info_func func)
76 BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); 76 BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
77 77
78 spin_lock(&rds_info_lock); 78 spin_lock(&rds_info_lock);
79 BUG_ON(rds_info_funcs[offset] != NULL); 79 BUG_ON(rds_info_funcs[offset]);
80 rds_info_funcs[offset] = func; 80 rds_info_funcs[offset] = func;
81 spin_unlock(&rds_info_lock); 81 spin_unlock(&rds_info_lock);
82} 82}
@@ -102,7 +102,7 @@ EXPORT_SYMBOL_GPL(rds_info_deregister_func);
102 */ 102 */
103void rds_info_iter_unmap(struct rds_info_iterator *iter) 103void rds_info_iter_unmap(struct rds_info_iterator *iter)
104{ 104{
105 if (iter->addr != NULL) { 105 if (iter->addr) {
106 kunmap_atomic(iter->addr, KM_USER0); 106 kunmap_atomic(iter->addr, KM_USER0);
107 iter->addr = NULL; 107 iter->addr = NULL;
108 } 108 }
@@ -117,7 +117,7 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data,
117 unsigned long this; 117 unsigned long this;
118 118
119 while (bytes) { 119 while (bytes) {
120 if (iter->addr == NULL) 120 if (!iter->addr)
121 iter->addr = kmap_atomic(*iter->pages, KM_USER0); 121 iter->addr = kmap_atomic(*iter->pages, KM_USER0);
122 122
123 this = min(bytes, PAGE_SIZE - iter->offset); 123 this = min(bytes, PAGE_SIZE - iter->offset);
@@ -188,7 +188,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
188 >> PAGE_SHIFT; 188 >> PAGE_SHIFT;
189 189
190 pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); 190 pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
191 if (pages == NULL) { 191 if (!pages) {
192 ret = -ENOMEM; 192 ret = -ENOMEM;
193 goto out; 193 goto out;
194 } 194 }
@@ -206,7 +206,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
206 206
207call_func: 207call_func:
208 func = rds_info_funcs[optname - RDS_INFO_FIRST]; 208 func = rds_info_funcs[optname - RDS_INFO_FIRST];
209 if (func == NULL) { 209 if (!func) {
210 ret = -ENOPROTOOPT; 210 ret = -ENOPROTOOPT;
211 goto out; 211 goto out;
212 } 212 }
@@ -234,7 +234,7 @@ call_func:
234 ret = -EFAULT; 234 ret = -EFAULT;
235 235
236out: 236out:
237 for (i = 0; pages != NULL && i < nr_pages; i++) 237 for (i = 0; pages && i < nr_pages; i++)
238 put_page(pages[i]); 238 put_page(pages[i]);
239 kfree(pages); 239 kfree(pages);
240 240
diff --git a/net/rds/iw.c b/net/rds/iw.c
index c8f3d3525cb9..5a9676fe594f 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -56,7 +56,7 @@ struct list_head rds_iw_devices;
56DEFINE_SPINLOCK(iw_nodev_conns_lock); 56DEFINE_SPINLOCK(iw_nodev_conns_lock);
57LIST_HEAD(iw_nodev_conns); 57LIST_HEAD(iw_nodev_conns);
58 58
59void rds_iw_add_one(struct ib_device *device) 59static void rds_iw_add_one(struct ib_device *device)
60{ 60{
61 struct rds_iw_device *rds_iwdev; 61 struct rds_iw_device *rds_iwdev;
62 struct ib_device_attr *dev_attr; 62 struct ib_device_attr *dev_attr;
@@ -124,7 +124,7 @@ free_attr:
124 kfree(dev_attr); 124 kfree(dev_attr);
125} 125}
126 126
127void rds_iw_remove_one(struct ib_device *device) 127static void rds_iw_remove_one(struct ib_device *device)
128{ 128{
129 struct rds_iw_device *rds_iwdev; 129 struct rds_iw_device *rds_iwdev;
130 struct rds_iw_cm_id *i_cm_id, *next; 130 struct rds_iw_cm_id *i_cm_id, *next;
@@ -264,7 +264,6 @@ struct rds_transport rds_iw_transport = {
264 .laddr_check = rds_iw_laddr_check, 264 .laddr_check = rds_iw_laddr_check,
265 .xmit_complete = rds_iw_xmit_complete, 265 .xmit_complete = rds_iw_xmit_complete,
266 .xmit = rds_iw_xmit, 266 .xmit = rds_iw_xmit,
267 .xmit_cong_map = NULL,
268 .xmit_rdma = rds_iw_xmit_rdma, 267 .xmit_rdma = rds_iw_xmit_rdma,
269 .recv = rds_iw_recv, 268 .recv = rds_iw_recv,
270 .conn_alloc = rds_iw_conn_alloc, 269 .conn_alloc = rds_iw_conn_alloc,
@@ -272,7 +271,6 @@ struct rds_transport rds_iw_transport = {
272 .conn_connect = rds_iw_conn_connect, 271 .conn_connect = rds_iw_conn_connect,
273 .conn_shutdown = rds_iw_conn_shutdown, 272 .conn_shutdown = rds_iw_conn_shutdown,
274 .inc_copy_to_user = rds_iw_inc_copy_to_user, 273 .inc_copy_to_user = rds_iw_inc_copy_to_user,
275 .inc_purge = rds_iw_inc_purge,
276 .inc_free = rds_iw_inc_free, 274 .inc_free = rds_iw_inc_free,
277 .cm_initiate_connect = rds_iw_cm_initiate_connect, 275 .cm_initiate_connect = rds_iw_cm_initiate_connect,
278 .cm_handle_connect = rds_iw_cm_handle_connect, 276 .cm_handle_connect = rds_iw_cm_handle_connect,
@@ -289,7 +287,7 @@ struct rds_transport rds_iw_transport = {
289 .t_prefer_loopback = 1, 287 .t_prefer_loopback = 1,
290}; 288};
291 289
292int __init rds_iw_init(void) 290int rds_iw_init(void)
293{ 291{
294 int ret; 292 int ret;
295 293
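rds_iw_add_one() and rds_iw_remove_one() can become static because the IB core only reaches them through the ib_client callbacks registered at init time. The registration shape, reduced to a sketch (not the actual RDS code):

#include <rdma/ib_verbs.h>

static void example_add_one(struct ib_device *device)
{
	/* allocate per-device state, publish it with ib_set_client_data() */
}

static void example_remove_one(struct ib_device *device)
{
	/* tear down whatever example_add_one() created for this device */
}

static struct ib_client example_client = {
	.name	= "example",
	.add	= example_add_one,
	.remove	= example_remove_one,
};

static int example_init(void)
{
	return ib_register_client(&example_client);
}

static void example_exit(void)
{
	ib_unregister_client(&example_client);
}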
diff --git a/net/rds/iw.h b/net/rds/iw.h
index eef2f0c28476..90151922178c 100644
--- a/net/rds/iw.h
+++ b/net/rds/iw.h
@@ -70,7 +70,7 @@ struct rds_iw_send_work {
70 struct rds_message *s_rm; 70 struct rds_message *s_rm;
71 71
72 /* We should really put these into a union: */ 72 /* We should really put these into a union: */
73 struct rds_rdma_op *s_op; 73 struct rm_rdma_op *s_op;
74 struct rds_iw_mapping *s_mapping; 74 struct rds_iw_mapping *s_mapping;
75 struct ib_mr *s_mr; 75 struct ib_mr *s_mr;
76 struct ib_fast_reg_page_list *s_page_list; 76 struct ib_fast_reg_page_list *s_page_list;
@@ -268,8 +268,6 @@ static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic)
268 268
269/* ib.c */ 269/* ib.c */
270extern struct rds_transport rds_iw_transport; 270extern struct rds_transport rds_iw_transport;
271extern void rds_iw_add_one(struct ib_device *device);
272extern void rds_iw_remove_one(struct ib_device *device);
273extern struct ib_client rds_iw_client; 271extern struct ib_client rds_iw_client;
274 272
275extern unsigned int fastreg_pool_size; 273extern unsigned int fastreg_pool_size;
@@ -284,7 +282,7 @@ void rds_iw_conn_free(void *arg);
284int rds_iw_conn_connect(struct rds_connection *conn); 282int rds_iw_conn_connect(struct rds_connection *conn);
285void rds_iw_conn_shutdown(struct rds_connection *conn); 283void rds_iw_conn_shutdown(struct rds_connection *conn);
286void rds_iw_state_change(struct sock *sk); 284void rds_iw_state_change(struct sock *sk);
287int __init rds_iw_listen_init(void); 285int rds_iw_listen_init(void);
288void rds_iw_listen_stop(void); 286void rds_iw_listen_stop(void);
289void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...); 287void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
290int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, 288int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
@@ -318,15 +316,13 @@ void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
318void rds_iw_sync_mr(void *trans_private, int dir); 316void rds_iw_sync_mr(void *trans_private, int dir);
319void rds_iw_free_mr(void *trans_private, int invalidate); 317void rds_iw_free_mr(void *trans_private, int invalidate);
320void rds_iw_flush_mrs(void); 318void rds_iw_flush_mrs(void);
321void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
322 319
323/* ib_recv.c */ 320/* ib_recv.c */
324int __init rds_iw_recv_init(void); 321int rds_iw_recv_init(void);
325void rds_iw_recv_exit(void); 322void rds_iw_recv_exit(void);
326int rds_iw_recv(struct rds_connection *conn); 323int rds_iw_recv(struct rds_connection *conn);
327int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, 324int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
328 gfp_t page_gfp, int prefill); 325 gfp_t page_gfp, int prefill);
329void rds_iw_inc_purge(struct rds_incoming *inc);
330void rds_iw_inc_free(struct rds_incoming *inc); 326void rds_iw_inc_free(struct rds_incoming *inc);
331int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, 327int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
332 size_t size); 328 size_t size);
@@ -358,7 +354,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
358void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context); 354void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
359void rds_iw_send_init_ring(struct rds_iw_connection *ic); 355void rds_iw_send_init_ring(struct rds_iw_connection *ic);
360void rds_iw_send_clear_ring(struct rds_iw_connection *ic); 356void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
361int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); 357int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
362void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits); 358void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
363void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted); 359void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
364int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted, 360int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
@@ -371,7 +367,7 @@ unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
371 unsigned int avail); 367 unsigned int avail);
372 368
373/* ib_sysctl.c */ 369/* ib_sysctl.c */
374int __init rds_iw_sysctl_init(void); 370int rds_iw_sysctl_init(void);
375void rds_iw_sysctl_exit(void); 371void rds_iw_sysctl_exit(void);
376extern unsigned long rds_iw_sysctl_max_send_wr; 372extern unsigned long rds_iw_sysctl_max_send_wr;
377extern unsigned long rds_iw_sysctl_max_recv_wr; 373extern unsigned long rds_iw_sysctl_max_recv_wr;
@@ -379,7 +375,6 @@ extern unsigned long rds_iw_sysctl_max_unsig_wrs;
379extern unsigned long rds_iw_sysctl_max_unsig_bytes; 375extern unsigned long rds_iw_sysctl_max_unsig_bytes;
380extern unsigned long rds_iw_sysctl_max_recv_allocation; 376extern unsigned long rds_iw_sysctl_max_recv_allocation;
381extern unsigned int rds_iw_sysctl_flow_control; 377extern unsigned int rds_iw_sysctl_flow_control;
382extern ctl_table rds_iw_sysctl_table[];
383 378
384/* 379/*
385 * Helper functions for getting/setting the header and data SGEs in 380 * Helper functions for getting/setting the header and data SGEs in
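The __init markers are dropped from these prototypes because .init.text is discarded once initialization finishes; any later caller would jump into freed memory, and modpost flags such references as section mismatches. A tiny illustration of the hazard with hypothetical functions:

#include <linux/init.h>

/* Placed in .init.text: freed after initialization completes. */
static int __init example_setup(void)
{
	return 0;
}

/* Lives in regular .text and can run at any time. */
int example_reconfigure(void)
{
	/*
	 * Calling example_setup() here references .init.text from normal
	 * code: modpost reports a section mismatch, and at runtime the
	 * call could land in memory that has already been released.
	 */
	return example_setup();	/* don't do this */
}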
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
index b5dd6ac39be8..712cf2d1f28e 100644
--- a/net/rds/iw_cm.c
+++ b/net/rds/iw_cm.c
@@ -257,7 +257,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
257 * the rds_iwdev at all. 257 * the rds_iwdev at all.
258 */ 258 */
259 rds_iwdev = ib_get_client_data(dev, &rds_iw_client); 259 rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
260 if (rds_iwdev == NULL) { 260 if (!rds_iwdev) {
261 if (printk_ratelimit()) 261 if (printk_ratelimit())
262 printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n", 262 printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
263 dev->name); 263 dev->name);
@@ -292,7 +292,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
292 ic->i_send_ring.w_nr * 292 ic->i_send_ring.w_nr *
293 sizeof(struct rds_header), 293 sizeof(struct rds_header),
294 &ic->i_send_hdrs_dma, GFP_KERNEL); 294 &ic->i_send_hdrs_dma, GFP_KERNEL);
295 if (ic->i_send_hdrs == NULL) { 295 if (!ic->i_send_hdrs) {
296 ret = -ENOMEM; 296 ret = -ENOMEM;
297 rdsdebug("ib_dma_alloc_coherent send failed\n"); 297 rdsdebug("ib_dma_alloc_coherent send failed\n");
298 goto out; 298 goto out;
@@ -302,7 +302,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
302 ic->i_recv_ring.w_nr * 302 ic->i_recv_ring.w_nr *
303 sizeof(struct rds_header), 303 sizeof(struct rds_header),
304 &ic->i_recv_hdrs_dma, GFP_KERNEL); 304 &ic->i_recv_hdrs_dma, GFP_KERNEL);
305 if (ic->i_recv_hdrs == NULL) { 305 if (!ic->i_recv_hdrs) {
306 ret = -ENOMEM; 306 ret = -ENOMEM;
307 rdsdebug("ib_dma_alloc_coherent recv failed\n"); 307 rdsdebug("ib_dma_alloc_coherent recv failed\n");
308 goto out; 308 goto out;
@@ -310,14 +310,14 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
310 310
311 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), 311 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
312 &ic->i_ack_dma, GFP_KERNEL); 312 &ic->i_ack_dma, GFP_KERNEL);
313 if (ic->i_ack == NULL) { 313 if (!ic->i_ack) {
314 ret = -ENOMEM; 314 ret = -ENOMEM;
315 rdsdebug("ib_dma_alloc_coherent ack failed\n"); 315 rdsdebug("ib_dma_alloc_coherent ack failed\n");
316 goto out; 316 goto out;
317 } 317 }
318 318
319 ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work)); 319 ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
320 if (ic->i_sends == NULL) { 320 if (!ic->i_sends) {
321 ret = -ENOMEM; 321 ret = -ENOMEM;
322 rdsdebug("send allocation failed\n"); 322 rdsdebug("send allocation failed\n");
323 goto out; 323 goto out;
@@ -325,7 +325,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
325 rds_iw_send_init_ring(ic); 325 rds_iw_send_init_ring(ic);
326 326
327 ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work)); 327 ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
328 if (ic->i_recvs == NULL) { 328 if (!ic->i_recvs) {
329 ret = -ENOMEM; 329 ret = -ENOMEM;
330 rdsdebug("recv allocation failed\n"); 330 rdsdebug("recv allocation failed\n");
331 goto out; 331 goto out;
@@ -696,7 +696,7 @@ int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
696 696
697 /* XXX too lazy? */ 697 /* XXX too lazy? */
698 ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL); 698 ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL);
699 if (ic == NULL) 699 if (!ic)
700 return -ENOMEM; 700 return -ENOMEM;
701 701
702 INIT_LIST_HEAD(&ic->iw_node); 702 INIT_LIST_HEAD(&ic->iw_node);
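rds_iw_setup_qp() above strings several coherent allocations together and jumps to a common label on the first NULL, leaving teardown to the connection shutdown path. Roughly, with made-up sizes and a stand-in structure:

#include <rdma/ib_verbs.h>
#include <linux/vmalloc.h>

struct example_rings {
	void	*send_hdrs;
	void	*recv_hdrs;
	void	*sends;		/* stands in for the send work array */
	u64	send_dma;
	u64	recv_dma;
};

static int example_setup_rings(struct ib_device *dev, struct example_rings *r,
			       unsigned int nr, size_t hdr_size)
{
	int ret = 0;

	r->send_hdrs = ib_dma_alloc_coherent(dev, nr * hdr_size,
					     &r->send_dma, GFP_KERNEL);
	if (!r->send_hdrs) {
		ret = -ENOMEM;
		goto out;
	}

	r->recv_hdrs = ib_dma_alloc_coherent(dev, nr * hdr_size,
					     &r->recv_dma, GFP_KERNEL);
	if (!r->recv_hdrs) {
		ret = -ENOMEM;
		goto out;
	}

	r->sends = vmalloc(nr * sizeof(void *));
	if (!r->sends)
		ret = -ENOMEM;
out:
	/* Like the real code, cleanup is left to the connection teardown path. */
	return ret;
}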
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
index 13dc1862d862..59509e9a9e72 100644
--- a/net/rds/iw_rdma.c
+++ b/net/rds/iw_rdma.c
@@ -34,7 +34,6 @@
34#include <linux/slab.h> 34#include <linux/slab.h>
35 35
36#include "rds.h" 36#include "rds.h"
37#include "rdma.h"
38#include "iw.h" 37#include "iw.h"
39 38
40 39
@@ -158,7 +157,8 @@ static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *
158 return 0; 157 return 0;
159} 158}
160 159
161void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) 160static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev,
161 struct rdma_cm_id *cm_id)
162{ 162{
163 struct rds_iw_cm_id *i_cm_id; 163 struct rds_iw_cm_id *i_cm_id;
164 164
@@ -207,9 +207,9 @@ void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *con
207 BUG_ON(list_empty(&ic->iw_node)); 207 BUG_ON(list_empty(&ic->iw_node));
208 list_del(&ic->iw_node); 208 list_del(&ic->iw_node);
209 209
210 spin_lock_irq(&rds_iwdev->spinlock); 210 spin_lock(&rds_iwdev->spinlock);
211 list_add_tail(&ic->iw_node, &rds_iwdev->conn_list); 211 list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
212 spin_unlock_irq(&rds_iwdev->spinlock); 212 spin_unlock(&rds_iwdev->spinlock);
213 spin_unlock_irq(&iw_nodev_conns_lock); 213 spin_unlock_irq(&iw_nodev_conns_lock);
214 214
215 ic->rds_iwdev = rds_iwdev; 215 ic->rds_iwdev = rds_iwdev;
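The iw_rdma.c hunk swaps spin_lock_irq() for spin_lock() on the per-device lock because interrupts are already disabled under iw_nodev_conns_lock; an inner spin_unlock_irq() would have re-enabled them while the outer lock was still held. The intended nesting, as a self-contained sketch:

#include <linux/spinlock.h>
#include <linux/list.h>

static DEFINE_SPINLOCK(outer_lock);	/* e.g. iw_nodev_conns_lock */
static DEFINE_SPINLOCK(inner_lock);	/* e.g. rds_iwdev->spinlock */
static LIST_HEAD(dev_list);

static void example_move_node(struct list_head *node)
{
	spin_lock_irq(&outer_lock);	/* disables local interrupts */
	list_del(node);

	spin_lock(&inner_lock);		/* IRQs already off: plain lock */
	list_add_tail(node, &dev_list);
	spin_unlock(&inner_lock);	/* leaves the IRQ state alone */

	spin_unlock_irq(&outer_lock);	/* interrupts come back on here */
}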
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
index 3d479067d54d..5e57347f49ff 100644
--- a/net/rds/iw_recv.c
+++ b/net/rds/iw_recv.c
@@ -53,7 +53,7 @@ static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
53static void rds_iw_frag_free(struct rds_page_frag *frag) 53static void rds_iw_frag_free(struct rds_page_frag *frag)
54{ 54{
55 rdsdebug("frag %p page %p\n", frag, frag->f_page); 55 rdsdebug("frag %p page %p\n", frag, frag->f_page);
56 BUG_ON(frag->f_page != NULL); 56 BUG_ON(frag->f_page);
57 kmem_cache_free(rds_iw_frag_slab, frag); 57 kmem_cache_free(rds_iw_frag_slab, frag);
58} 58}
59 59
@@ -143,14 +143,14 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn,
143 struct ib_sge *sge; 143 struct ib_sge *sge;
144 int ret = -ENOMEM; 144 int ret = -ENOMEM;
145 145
146 if (recv->r_iwinc == NULL) { 146 if (!recv->r_iwinc) {
147 if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) { 147 if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) {
148 rds_iw_stats_inc(s_iw_rx_alloc_limit); 148 rds_iw_stats_inc(s_iw_rx_alloc_limit);
149 goto out; 149 goto out;
150 } 150 }
151 recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab, 151 recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
152 kptr_gfp); 152 kptr_gfp);
153 if (recv->r_iwinc == NULL) { 153 if (!recv->r_iwinc) {
154 atomic_dec(&rds_iw_allocation); 154 atomic_dec(&rds_iw_allocation);
155 goto out; 155 goto out;
156 } 156 }
@@ -158,17 +158,17 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn,
158 rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr); 158 rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
159 } 159 }
160 160
161 if (recv->r_frag == NULL) { 161 if (!recv->r_frag) {
162 recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp); 162 recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
163 if (recv->r_frag == NULL) 163 if (!recv->r_frag)
164 goto out; 164 goto out;
165 INIT_LIST_HEAD(&recv->r_frag->f_item); 165 INIT_LIST_HEAD(&recv->r_frag->f_item);
166 recv->r_frag->f_page = NULL; 166 recv->r_frag->f_page = NULL;
167 } 167 }
168 168
169 if (ic->i_frag.f_page == NULL) { 169 if (!ic->i_frag.f_page) {
170 ic->i_frag.f_page = alloc_page(page_gfp); 170 ic->i_frag.f_page = alloc_page(page_gfp);
171 if (ic->i_frag.f_page == NULL) 171 if (!ic->i_frag.f_page)
172 goto out; 172 goto out;
173 ic->i_frag.f_offset = 0; 173 ic->i_frag.f_offset = 0;
174 } 174 }
@@ -273,7 +273,7 @@ int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
273 return ret; 273 return ret;
274} 274}
275 275
276void rds_iw_inc_purge(struct rds_incoming *inc) 276static void rds_iw_inc_purge(struct rds_incoming *inc)
277{ 277{
278 struct rds_iw_incoming *iwinc; 278 struct rds_iw_incoming *iwinc;
279 struct rds_page_frag *frag; 279 struct rds_page_frag *frag;
@@ -716,7 +716,7 @@ static void rds_iw_process_recv(struct rds_connection *conn,
716 * into the inc and save the inc so we can hang upcoming fragments 716 * into the inc and save the inc so we can hang upcoming fragments
717 * off its list. 717 * off its list.
718 */ 718 */
719 if (iwinc == NULL) { 719 if (!iwinc) {
720 iwinc = recv->r_iwinc; 720 iwinc = recv->r_iwinc;
721 recv->r_iwinc = NULL; 721 recv->r_iwinc = NULL;
722 ic->i_iwinc = iwinc; 722 ic->i_iwinc = iwinc;
@@ -887,7 +887,7 @@ int rds_iw_recv(struct rds_connection *conn)
887 return ret; 887 return ret;
888} 888}
889 889
890int __init rds_iw_recv_init(void) 890int rds_iw_recv_init(void)
891{ 891{
892 struct sysinfo si; 892 struct sysinfo si;
893 int ret = -ENOMEM; 893 int ret = -ENOMEM;
@@ -899,13 +899,13 @@ int __init rds_iw_recv_init(void)
899 rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming", 899 rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
900 sizeof(struct rds_iw_incoming), 900 sizeof(struct rds_iw_incoming),
901 0, 0, NULL); 901 0, 0, NULL);
902 if (rds_iw_incoming_slab == NULL) 902 if (!rds_iw_incoming_slab)
903 goto out; 903 goto out;
904 904
905 rds_iw_frag_slab = kmem_cache_create("rds_iw_frag", 905 rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
906 sizeof(struct rds_page_frag), 906 sizeof(struct rds_page_frag),
907 0, 0, NULL); 907 0, 0, NULL);
908 if (rds_iw_frag_slab == NULL) 908 if (!rds_iw_frag_slab)
909 kmem_cache_destroy(rds_iw_incoming_slab); 909 kmem_cache_destroy(rds_iw_incoming_slab);
910 else 910 else
911 ret = 0; 911 ret = 0;
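The refill path caps total incoming allocations with atomic_add_unless(): the counter advances only while it is below the sysctl limit, and it is decremented again if the slab allocation then fails. Condensed into a sketch with hypothetical names:

#include <asm/atomic.h>
#include <linux/slab.h>

static atomic_t example_allocation = ATOMIC_INIT(0);
static unsigned long example_max_allocation = 1024;

static void *example_alloc_inc(struct kmem_cache *slab, gfp_t gfp)
{
	void *inc;

	/* Reserve a slot; returns 0 once the limit has been reached. */
	if (!atomic_add_unless(&example_allocation, 1, example_max_allocation))
		return NULL;

	inc = kmem_cache_alloc(slab, gfp);
	if (!inc)
		atomic_dec(&example_allocation);	/* give the slot back */

	return inc;
}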
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
index 52182ff7519e..6280ea020d4e 100644
--- a/net/rds/iw_send.c
+++ b/net/rds/iw_send.c
@@ -36,7 +36,6 @@
36#include <linux/dmapool.h> 36#include <linux/dmapool.h>
37 37
38#include "rds.h" 38#include "rds.h"
39#include "rdma.h"
40#include "iw.h" 39#include "iw.h"
41 40
42static void rds_iw_send_rdma_complete(struct rds_message *rm, 41static void rds_iw_send_rdma_complete(struct rds_message *rm,
@@ -64,13 +63,13 @@ static void rds_iw_send_rdma_complete(struct rds_message *rm,
64} 63}
65 64
66static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic, 65static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
67 struct rds_rdma_op *op) 66 struct rm_rdma_op *op)
68{ 67{
69 if (op->r_mapped) { 68 if (op->op_mapped) {
70 ib_dma_unmap_sg(ic->i_cm_id->device, 69 ib_dma_unmap_sg(ic->i_cm_id->device,
71 op->r_sg, op->r_nents, 70 op->op_sg, op->op_nents,
72 op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); 71 op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
73 op->r_mapped = 0; 72 op->op_mapped = 0;
74 } 73 }
75} 74}
76 75
@@ -83,11 +82,11 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
83 rdsdebug("ic %p send %p rm %p\n", ic, send, rm); 82 rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
84 83
85 ib_dma_unmap_sg(ic->i_cm_id->device, 84 ib_dma_unmap_sg(ic->i_cm_id->device,
86 rm->m_sg, rm->m_nents, 85 rm->data.op_sg, rm->data.op_nents,
87 DMA_TO_DEVICE); 86 DMA_TO_DEVICE);
88 87
89 if (rm->m_rdma_op != NULL) { 88 if (rm->rdma.op_active) {
90 rds_iw_send_unmap_rdma(ic, rm->m_rdma_op); 89 rds_iw_send_unmap_rdma(ic, &rm->rdma);
91 90
92 /* If the user asked for a completion notification on this 91 /* If the user asked for a completion notification on this
93 * message, we can implement three different semantics: 92 * message, we can implement three different semantics:
@@ -111,10 +110,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
111 */ 110 */
112 rds_iw_send_rdma_complete(rm, wc_status); 111 rds_iw_send_rdma_complete(rm, wc_status);
113 112
114 if (rm->m_rdma_op->r_write) 113 if (rm->rdma.op_write)
115 rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); 114 rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes);
116 else 115 else
117 rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); 116 rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes);
118 } 117 }
119 118
120 /* If anyone waited for this message to get flushed out, wake 119 /* If anyone waited for this message to get flushed out, wake
@@ -556,25 +555,27 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
556 } 555 }
557 556
558 /* map the message the first time we see it */ 557 /* map the message the first time we see it */
559 if (ic->i_rm == NULL) { 558 if (!ic->i_rm) {
560 /* 559 /*
561 printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n", 560 printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
562 be16_to_cpu(rm->m_inc.i_hdr.h_dport), 561 be16_to_cpu(rm->m_inc.i_hdr.h_dport),
563 rm->m_inc.i_hdr.h_flags, 562 rm->m_inc.i_hdr.h_flags,
564 be32_to_cpu(rm->m_inc.i_hdr.h_len)); 563 be32_to_cpu(rm->m_inc.i_hdr.h_len));
565 */ 564 */
566 if (rm->m_nents) { 565 if (rm->data.op_nents) {
567 rm->m_count = ib_dma_map_sg(dev, 566 rm->data.op_count = ib_dma_map_sg(dev,
568 rm->m_sg, rm->m_nents, DMA_TO_DEVICE); 567 rm->data.op_sg,
569 rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); 568 rm->data.op_nents,
570 if (rm->m_count == 0) { 569 DMA_TO_DEVICE);
570 rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
571 if (rm->data.op_count == 0) {
571 rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); 572 rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
572 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); 573 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
573 ret = -ENOMEM; /* XXX ? */ 574 ret = -ENOMEM; /* XXX ? */
574 goto out; 575 goto out;
575 } 576 }
576 } else { 577 } else {
577 rm->m_count = 0; 578 rm->data.op_count = 0;
578 } 579 }
579 580
580 ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; 581 ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
@@ -590,10 +591,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
590 591
591 /* If it has a RDMA op, tell the peer we did it. This is 592 /* If it has a RDMA op, tell the peer we did it. This is
592 * used by the peer to release use-once RDMA MRs. */ 593 * used by the peer to release use-once RDMA MRs. */
593 if (rm->m_rdma_op) { 594 if (rm->rdma.op_active) {
594 struct rds_ext_header_rdma ext_hdr; 595 struct rds_ext_header_rdma ext_hdr;
595 596
596 ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); 597 ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
597 rds_message_add_extension(&rm->m_inc.i_hdr, 598 rds_message_add_extension(&rm->m_inc.i_hdr,
598 RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); 599 RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
599 } 600 }
@@ -621,7 +622,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
621 send = &ic->i_sends[pos]; 622 send = &ic->i_sends[pos];
622 first = send; 623 first = send;
623 prev = NULL; 624 prev = NULL;
624 scat = &rm->m_sg[sg]; 625 scat = &rm->data.op_sg[sg];
625 sent = 0; 626 sent = 0;
626 i = 0; 627 i = 0;
627 628
@@ -631,7 +632,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
631 * or when requested by the user. Right now, we let 632 * or when requested by the user. Right now, we let
632 * the application choose. 633 * the application choose.
633 */ 634 */
634 if (rm->m_rdma_op && rm->m_rdma_op->r_fence) 635 if (rm->rdma.op_active && rm->rdma.op_fence)
635 send_flags = IB_SEND_FENCE; 636 send_flags = IB_SEND_FENCE;
636 637
637 /* 638 /*
@@ -650,7 +651,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
650 } 651 }
651 652
652 /* if there's data reference it with a chain of work reqs */ 653 /* if there's data reference it with a chain of work reqs */
653 for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { 654 for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) {
654 unsigned int len; 655 unsigned int len;
655 656
656 send = &ic->i_sends[pos]; 657 send = &ic->i_sends[pos];
@@ -728,7 +729,7 @@ add_header:
728 sent += sizeof(struct rds_header); 729 sent += sizeof(struct rds_header);
729 730
730 /* if we finished the message then send completion owns it */ 731 /* if we finished the message then send completion owns it */
731 if (scat == &rm->m_sg[rm->m_count]) { 732 if (scat == &rm->data.op_sg[rm->data.op_count]) {
732 prev->s_rm = ic->i_rm; 733 prev->s_rm = ic->i_rm;
733 prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; 734 prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
734 ic->i_rm = NULL; 735 ic->i_rm = NULL;
@@ -784,7 +785,7 @@ static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rd
784 ib_update_fast_reg_key(send->s_mr, send->s_remap_count++); 785 ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
785} 786}
786 787
787int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) 788int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
788{ 789{
789 struct rds_iw_connection *ic = conn->c_transport_data; 790 struct rds_iw_connection *ic = conn->c_transport_data;
790 struct rds_iw_send_work *send = NULL; 791 struct rds_iw_send_work *send = NULL;
@@ -794,7 +795,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
794 struct rds_iw_device *rds_iwdev; 795 struct rds_iw_device *rds_iwdev;
795 struct scatterlist *scat; 796 struct scatterlist *scat;
796 unsigned long len; 797 unsigned long len;
797 u64 remote_addr = op->r_remote_addr; 798 u64 remote_addr = op->op_remote_addr;
798 u32 pos, fr_pos; 799 u32 pos, fr_pos;
799 u32 work_alloc; 800 u32 work_alloc;
800 u32 i; 801 u32 i;
@@ -806,21 +807,21 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
806 rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); 807 rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
807 808
808 /* map the message the first time we see it */ 809 /* map the message the first time we see it */
809 if (!op->r_mapped) { 810 if (!op->op_mapped) {
810 op->r_count = ib_dma_map_sg(ic->i_cm_id->device, 811 op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
811 op->r_sg, op->r_nents, (op->r_write) ? 812 op->op_sg, op->op_nents, (op->op_write) ?
812 DMA_TO_DEVICE : DMA_FROM_DEVICE); 813 DMA_TO_DEVICE : DMA_FROM_DEVICE);
813 rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); 814 rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
814 if (op->r_count == 0) { 815 if (op->op_count == 0) {
815 rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); 816 rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
816 ret = -ENOMEM; /* XXX ? */ 817 ret = -ENOMEM; /* XXX ? */
817 goto out; 818 goto out;
818 } 819 }
819 820
820 op->r_mapped = 1; 821 op->op_mapped = 1;
821 } 822 }
822 823
823 if (!op->r_write) { 824 if (!op->op_write) {
824 /* Alloc space on the send queue for the fastreg */ 825 /* Alloc space on the send queue for the fastreg */
825 work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos); 826 work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
826 if (work_alloc != 1) { 827 if (work_alloc != 1) {
@@ -835,7 +836,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
835 * Instead of knowing how to return a partial rdma read/write we insist that there 836 * Instead of knowing how to return a partial rdma read/write we insist that there
836 * be enough work requests to send the entire message. 837 * be enough work requests to send the entire message.
837 */ 838 */
838 i = ceil(op->r_count, rds_iwdev->max_sge); 839 i = ceil(op->op_count, rds_iwdev->max_sge);
839 840
840 work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); 841 work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
841 if (work_alloc != i) { 842 if (work_alloc != i) {
@@ -846,17 +847,17 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
846 } 847 }
847 848
848 send = &ic->i_sends[pos]; 849 send = &ic->i_sends[pos];
849 if (!op->r_write) { 850 if (!op->op_write) {
850 first = prev = &ic->i_sends[fr_pos]; 851 first = prev = &ic->i_sends[fr_pos];
851 } else { 852 } else {
852 first = send; 853 first = send;
853 prev = NULL; 854 prev = NULL;
854 } 855 }
855 scat = &op->r_sg[0]; 856 scat = &op->op_sg[0];
856 sent = 0; 857 sent = 0;
857 num_sge = op->r_count; 858 num_sge = op->op_count;
858 859
859 for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { 860 for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
860 send->s_wr.send_flags = 0; 861 send->s_wr.send_flags = 0;
861 send->s_queued = jiffies; 862 send->s_queued = jiffies;
862 863
@@ -873,13 +874,13 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
873 * for local access after RDS is finished with it, using 874 * for local access after RDS is finished with it, using
874 * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed. 875 * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed.
875 */ 876 */
876 if (op->r_write) 877 if (op->op_write)
877 send->s_wr.opcode = IB_WR_RDMA_WRITE; 878 send->s_wr.opcode = IB_WR_RDMA_WRITE;
878 else 879 else
879 send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV; 880 send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
880 881
881 send->s_wr.wr.rdma.remote_addr = remote_addr; 882 send->s_wr.wr.rdma.remote_addr = remote_addr;
882 send->s_wr.wr.rdma.rkey = op->r_key; 883 send->s_wr.wr.rdma.rkey = op->op_rkey;
883 send->s_op = op; 884 send->s_op = op;
884 885
885 if (num_sge > rds_iwdev->max_sge) { 886 if (num_sge > rds_iwdev->max_sge) {
@@ -893,7 +894,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
893 if (prev) 894 if (prev)
894 prev->s_wr.next = &send->s_wr; 895 prev->s_wr.next = &send->s_wr;
895 896
896 for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { 897 for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
897 len = ib_sg_dma_len(ic->i_cm_id->device, scat); 898 len = ib_sg_dma_len(ic->i_cm_id->device, scat);
898 899
899 if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) 900 if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)
@@ -927,7 +928,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
927 } 928 }
928 929
929 /* if we finished the message then send completion owns it */ 930 /* if we finished the message then send completion owns it */
930 if (scat == &op->r_sg[op->r_count]) 931 if (scat == &op->op_sg[op->op_count])
931 first->s_wr.send_flags = IB_SEND_SIGNALED; 932 first->s_wr.send_flags = IB_SEND_SIGNALED;
932 933
933 if (i < work_alloc) { 934 if (i < work_alloc) {
@@ -941,9 +942,9 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
941 * adapters do not allow using the lkey for this at all. To bypass this use a 942 * adapters do not allow using the lkey for this at all. To bypass this use a
942 * fastreg_mr (or possibly a dma_mr) 943 * fastreg_mr (or possibly a dma_mr)
943 */ 944 */
944 if (!op->r_write) { 945 if (!op->op_write) {
945 rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos], 946 rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
946 op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr); 947 op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
947 work_alloc++; 948 work_alloc++;
948 } 949 }
949 950
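Throughout iw_send.c the rename from r_* to op_* fields keeps the same map-once, unmap-on-completion discipline: map the op's scatterlist in the transfer direction, remember that in op_mapped, and undo it with the matching direction when the work completes. As a standalone pair over a simplified op structure (not the real rm_rdma_op):

#include <rdma/ib_verbs.h>
#include <linux/scatterlist.h>

struct example_rdma_op {
	struct scatterlist	*op_sg;
	unsigned int		op_nents;	/* entries handed to us */
	unsigned int		op_count;	/* entries after DMA mapping */
	unsigned int		op_write:1;
	unsigned int		op_mapped:1;
};

static int example_map_op(struct ib_device *dev, struct example_rdma_op *op)
{
	if (op->op_mapped)
		return 0;

	op->op_count = ib_dma_map_sg(dev, op->op_sg, op->op_nents,
				     op->op_write ? DMA_TO_DEVICE
						  : DMA_FROM_DEVICE);
	if (op->op_count == 0)
		return -ENOMEM;

	op->op_mapped = 1;
	return 0;
}

static void example_unmap_op(struct ib_device *dev, struct example_rdma_op *op)
{
	if (!op->op_mapped)
		return;

	ib_dma_unmap_sg(dev, op->op_sg, op->op_nents,
			op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
	op->op_mapped = 0;
}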
diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c
index 1c4428a61a02..e2e47176e729 100644
--- a/net/rds/iw_sysctl.c
+++ b/net/rds/iw_sysctl.c
@@ -55,7 +55,7 @@ static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL;
55 55
56unsigned int rds_iw_sysctl_flow_control = 1; 56unsigned int rds_iw_sysctl_flow_control = 1;
57 57
58ctl_table rds_iw_sysctl_table[] = { 58static ctl_table rds_iw_sysctl_table[] = {
59 { 59 {
60 .procname = "max_send_wr", 60 .procname = "max_send_wr",
61 .data = &rds_iw_sysctl_max_send_wr, 61 .data = &rds_iw_sysctl_max_send_wr,
@@ -122,10 +122,10 @@ void rds_iw_sysctl_exit(void)
122 unregister_sysctl_table(rds_iw_sysctl_hdr); 122 unregister_sysctl_table(rds_iw_sysctl_hdr);
123} 123}
124 124
125int __init rds_iw_sysctl_init(void) 125int rds_iw_sysctl_init(void)
126{ 126{
127 rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table); 127 rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table);
128 if (rds_iw_sysctl_hdr == NULL) 128 if (!rds_iw_sysctl_hdr)
129 return -ENOMEM; 129 return -ENOMEM;
130 return 0; 130 return 0;
131} 131}
diff --git a/net/rds/loop.c b/net/rds/loop.c
index dd9879379457..c390156b426f 100644
--- a/net/rds/loop.c
+++ b/net/rds/loop.c
@@ -61,10 +61,17 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
61 unsigned int hdr_off, unsigned int sg, 61 unsigned int hdr_off, unsigned int sg,
62 unsigned int off) 62 unsigned int off)
63{ 63{
64 /* Do not send cong updates to loopback */
65 if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
66 rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
67 return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
68 }
69
64 BUG_ON(hdr_off || sg || off); 70 BUG_ON(hdr_off || sg || off);
65 71
66 rds_inc_init(&rm->m_inc, conn, conn->c_laddr); 72 rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
67 rds_message_addref(rm); /* for the inc */ 73 /* For the embedded inc. Matching put is in loop_inc_free() */
74 rds_message_addref(rm);
68 75
69 rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, 76 rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
70 GFP_KERNEL, KM_USER0); 77 GFP_KERNEL, KM_USER0);
@@ -77,16 +84,14 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
77 return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len); 84 return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len);
78} 85}
79 86
80static int rds_loop_xmit_cong_map(struct rds_connection *conn, 87/*
81 struct rds_cong_map *map, 88 * See rds_loop_xmit(). Since our inc is embedded in the rm, we
82 unsigned long offset) 89 * make sure the rm lives at least until the inc is done.
90 */
91static void rds_loop_inc_free(struct rds_incoming *inc)
83{ 92{
84 BUG_ON(offset); 93 struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
85 BUG_ON(map != conn->c_lcong); 94 rds_message_put(rm);
86
87 rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
88
89 return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
90} 95}
91 96
92/* we need to at least give the thread something to succeed */ 97/* we need to at least give the thread something to succeed */
@@ -112,7 +117,7 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp)
112 unsigned long flags; 117 unsigned long flags;
113 118
114 lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL); 119 lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL);
115 if (lc == NULL) 120 if (!lc)
116 return -ENOMEM; 121 return -ENOMEM;
117 122
118 INIT_LIST_HEAD(&lc->loop_node); 123 INIT_LIST_HEAD(&lc->loop_node);
@@ -169,14 +174,12 @@ void rds_loop_exit(void)
169 */ 174 */
170struct rds_transport rds_loop_transport = { 175struct rds_transport rds_loop_transport = {
171 .xmit = rds_loop_xmit, 176 .xmit = rds_loop_xmit,
172 .xmit_cong_map = rds_loop_xmit_cong_map,
173 .recv = rds_loop_recv, 177 .recv = rds_loop_recv,
174 .conn_alloc = rds_loop_conn_alloc, 178 .conn_alloc = rds_loop_conn_alloc,
175 .conn_free = rds_loop_conn_free, 179 .conn_free = rds_loop_conn_free,
176 .conn_connect = rds_loop_conn_connect, 180 .conn_connect = rds_loop_conn_connect,
177 .conn_shutdown = rds_loop_conn_shutdown, 181 .conn_shutdown = rds_loop_conn_shutdown,
178 .inc_copy_to_user = rds_message_inc_copy_to_user, 182 .inc_copy_to_user = rds_message_inc_copy_to_user,
179 .inc_purge = rds_message_inc_purge, 183 .inc_free = rds_loop_inc_free,
180 .inc_free = rds_message_inc_free,
181 .t_name = "loopback", 184 .t_name = "loopback",
182}; 185};
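rds_loop_inc_free() relies on the incoming structure being embedded in the message, so container_of() recovers the rds_message whose reference rds_loop_xmit() took. The general embed-and-recover pattern, shown generically (not RDS code):

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct example_incoming {
	int len;
};

struct example_message {
	struct kref		refcount;
	struct example_incoming	inc;	/* embedded, not allocated separately */
};

static void example_message_release(struct kref *ref)
{
	kfree(container_of(ref, struct example_message, refcount));
}

/* The "inc_free" callback only ever sees the embedded member... */
static void example_inc_free(struct example_incoming *inc)
{
	/* ...so recover the containing message and drop its reference. */
	struct example_message *msg =
		container_of(inc, struct example_message, inc);

	kref_put(&msg->refcount, example_message_release);
}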
diff --git a/net/rds/message.c b/net/rds/message.c
index 9a1d67e001ba..848cff45183b 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -34,9 +34,6 @@
34#include <linux/slab.h> 34#include <linux/slab.h>
35 35
36#include "rds.h" 36#include "rds.h"
37#include "rdma.h"
38
39static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq);
40 37
41static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { 38static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
42[RDS_EXTHDR_NONE] = 0, 39[RDS_EXTHDR_NONE] = 0,
@@ -63,29 +60,31 @@ static void rds_message_purge(struct rds_message *rm)
63 if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) 60 if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
64 return; 61 return;
65 62
66 for (i = 0; i < rm->m_nents; i++) { 63 for (i = 0; i < rm->data.op_nents; i++) {
67 rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i])); 64 rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i]));
68 /* XXX will have to put_page for page refs */ 65 /* XXX will have to put_page for page refs */
69 __free_page(sg_page(&rm->m_sg[i])); 66 __free_page(sg_page(&rm->data.op_sg[i]));
70 } 67 }
71 rm->m_nents = 0; 68 rm->data.op_nents = 0;
72 69
73 if (rm->m_rdma_op) 70 if (rm->rdma.op_active)
74 rds_rdma_free_op(rm->m_rdma_op); 71 rds_rdma_free_op(&rm->rdma);
75 if (rm->m_rdma_mr) 72 if (rm->rdma.op_rdma_mr)
76 rds_mr_put(rm->m_rdma_mr); 73 rds_mr_put(rm->rdma.op_rdma_mr);
77}
78 74
79void rds_message_inc_purge(struct rds_incoming *inc) 75 if (rm->atomic.op_active)
80{ 76 rds_atomic_free_op(&rm->atomic);
81 struct rds_message *rm = container_of(inc, struct rds_message, m_inc); 77 if (rm->atomic.op_rdma_mr)
82 rds_message_purge(rm); 78 rds_mr_put(rm->atomic.op_rdma_mr);
83} 79}
84 80
85void rds_message_put(struct rds_message *rm) 81void rds_message_put(struct rds_message *rm)
86{ 82{
87 rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); 83 rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
88 84 if (atomic_read(&rm->m_refcount) == 0) {
85printk(KERN_CRIT "danger refcount zero on %p\n", rm);
86WARN_ON(1);
87 }
89 if (atomic_dec_and_test(&rm->m_refcount)) { 88 if (atomic_dec_and_test(&rm->m_refcount)) {
90 BUG_ON(!list_empty(&rm->m_sock_item)); 89 BUG_ON(!list_empty(&rm->m_sock_item));
91 BUG_ON(!list_empty(&rm->m_conn_item)); 90 BUG_ON(!list_empty(&rm->m_conn_item));
@@ -96,12 +95,6 @@ void rds_message_put(struct rds_message *rm)
96} 95}
97EXPORT_SYMBOL_GPL(rds_message_put); 96EXPORT_SYMBOL_GPL(rds_message_put);
98 97
99void rds_message_inc_free(struct rds_incoming *inc)
100{
101 struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
102 rds_message_put(rm);
103}
104
105void rds_message_populate_header(struct rds_header *hdr, __be16 sport, 98void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
106 __be16 dport, u64 seq) 99 __be16 dport, u64 seq)
107{ 100{
@@ -113,8 +106,8 @@ void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
113} 106}
114EXPORT_SYMBOL_GPL(rds_message_populate_header); 107EXPORT_SYMBOL_GPL(rds_message_populate_header);
115 108
116int rds_message_add_extension(struct rds_header *hdr, 109int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
117 unsigned int type, const void *data, unsigned int len) 110 const void *data, unsigned int len)
118{ 111{
119 unsigned int ext_len = sizeof(u8) + len; 112 unsigned int ext_len = sizeof(u8) + len;
120 unsigned char *dst; 113 unsigned char *dst;
@@ -184,26 +177,6 @@ none:
184 return RDS_EXTHDR_NONE; 177 return RDS_EXTHDR_NONE;
185} 178}
186 179
187int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version)
188{
189 struct rds_ext_header_version ext_hdr;
190
191 ext_hdr.h_version = cpu_to_be32(version);
192 return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr));
193}
194
195int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version)
196{
197 struct rds_ext_header_version ext_hdr;
198 unsigned int pos = 0, len = sizeof(ext_hdr);
199
200 /* We assume the version extension is the only one present */
201 if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION)
202 return 0;
203 *version = be32_to_cpu(ext_hdr.h_version);
204 return 1;
205}
206
207int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset) 180int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset)
208{ 181{
209 struct rds_ext_header_rdma_dest ext_hdr; 182 struct rds_ext_header_rdma_dest ext_hdr;
@@ -214,41 +187,73 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o
214} 187}
215EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension); 188EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);
216 189
217struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp) 190/*
191 * Each rds_message is allocated with extra space for the scatterlist entries
192 * rds ops will need. This is to minimize memory allocation count. Then, each rds op
193 * can grab SGs when initializing its part of the rds_message.
194 */
195struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp)
218{ 196{
219 struct rds_message *rm; 197 struct rds_message *rm;
220 198
221 rm = kzalloc(sizeof(struct rds_message) + 199 rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp);
222 (nents * sizeof(struct scatterlist)), gfp);
223 if (!rm) 200 if (!rm)
224 goto out; 201 goto out;
225 202
226 if (nents) 203 rm->m_used_sgs = 0;
227 sg_init_table(rm->m_sg, nents); 204 rm->m_total_sgs = extra_len / sizeof(struct scatterlist);
205
228 atomic_set(&rm->m_refcount, 1); 206 atomic_set(&rm->m_refcount, 1);
229 INIT_LIST_HEAD(&rm->m_sock_item); 207 INIT_LIST_HEAD(&rm->m_sock_item);
230 INIT_LIST_HEAD(&rm->m_conn_item); 208 INIT_LIST_HEAD(&rm->m_conn_item);
231 spin_lock_init(&rm->m_rs_lock); 209 spin_lock_init(&rm->m_rs_lock);
210 init_waitqueue_head(&rm->m_flush_wait);
232 211
233out: 212out:
234 return rm; 213 return rm;
235} 214}
236 215
216/*
217 * RDS ops use this to grab SG entries from the rm's sg pool.
218 */
219struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)
220{
221 struct scatterlist *sg_first = (struct scatterlist *) &rm[1];
222 struct scatterlist *sg_ret;
223
224 WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs);
225 WARN_ON(!nents);
226
227 if (rm->m_used_sgs + nents > rm->m_total_sgs)
228 return NULL;
229
230 sg_ret = &sg_first[rm->m_used_sgs];
231 sg_init_table(sg_ret, nents);
232 rm->m_used_sgs += nents;
233
234 return sg_ret;
235}
236
237struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len) 237struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
238{ 238{
239 struct rds_message *rm; 239 struct rds_message *rm;
240 unsigned int i; 240 unsigned int i;
241 int num_sgs = ceil(total_len, PAGE_SIZE);
242 int extra_bytes = num_sgs * sizeof(struct scatterlist);
241 243
242 rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); 244 rm = rds_message_alloc(extra_bytes, GFP_NOWAIT);
243 if (rm == NULL) 245 if (!rm)
244 return ERR_PTR(-ENOMEM); 246 return ERR_PTR(-ENOMEM);
245 247
246 set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); 248 set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
247 rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); 249 rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
248 rm->m_nents = ceil(total_len, PAGE_SIZE); 250 rm->data.op_nents = ceil(total_len, PAGE_SIZE);
251 rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
252 if (!rm->data.op_sg)
253 return ERR_PTR(-ENOMEM);
249 254
250 for (i = 0; i < rm->m_nents; ++i) { 255 for (i = 0; i < rm->data.op_nents; ++i) {
251 sg_set_page(&rm->m_sg[i], 256 sg_set_page(&rm->data.op_sg[i],
252 virt_to_page(page_addrs[i]), 257 virt_to_page(page_addrs[i]),
253 PAGE_SIZE, 0); 258 PAGE_SIZE, 0);
254 } 259 }
@@ -256,40 +261,33 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
256 return rm; 261 return rm;
257} 262}
258 263
259struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, 264int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
260 size_t total_len) 265 size_t total_len)
261{ 266{
262 unsigned long to_copy; 267 unsigned long to_copy;
263 unsigned long iov_off; 268 unsigned long iov_off;
264 unsigned long sg_off; 269 unsigned long sg_off;
265 struct rds_message *rm;
266 struct iovec *iov; 270 struct iovec *iov;
267 struct scatterlist *sg; 271 struct scatterlist *sg;
268 int ret; 272 int ret = 0;
269
270 rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
271 if (rm == NULL) {
272 ret = -ENOMEM;
273 goto out;
274 }
275 273
276 rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); 274 rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
277 275
278 /* 276 /*
279 * now allocate and copy in the data payload. 277 * now allocate and copy in the data payload.
280 */ 278 */
281 sg = rm->m_sg; 279 sg = rm->data.op_sg;
282 iov = first_iov; 280 iov = first_iov;
283 iov_off = 0; 281 iov_off = 0;
284 sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ 282 sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
285 283
286 while (total_len) { 284 while (total_len) {
287 if (sg_page(sg) == NULL) { 285 if (!sg_page(sg)) {
288 ret = rds_page_remainder_alloc(sg, total_len, 286 ret = rds_page_remainder_alloc(sg, total_len,
289 GFP_HIGHUSER); 287 GFP_HIGHUSER);
290 if (ret) 288 if (ret)
291 goto out; 289 goto out;
292 rm->m_nents++; 290 rm->data.op_nents++;
293 sg_off = 0; 291 sg_off = 0;
294 } 292 }
295 293
@@ -320,14 +318,8 @@ struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
320 sg++; 318 sg++;
321 } 319 }
322 320
323 ret = 0;
324out: 321out:
325 if (ret) { 322 return ret;
326 if (rm)
327 rds_message_put(rm);
328 rm = ERR_PTR(ret);
329 }
330 return rm;
331} 323}
332 324
333int rds_message_inc_copy_to_user(struct rds_incoming *inc, 325int rds_message_inc_copy_to_user(struct rds_incoming *inc,
@@ -348,7 +340,7 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc,
348 340
349 iov = first_iov; 341 iov = first_iov;
350 iov_off = 0; 342 iov_off = 0;
351 sg = rm->m_sg; 343 sg = rm->data.op_sg;
352 vec_off = 0; 344 vec_off = 0;
353 copied = 0; 345 copied = 0;
354 346
@@ -394,15 +386,14 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc,
394 */ 386 */
395void rds_message_wait(struct rds_message *rm) 387void rds_message_wait(struct rds_message *rm)
396{ 388{
397 wait_event(rds_message_flush_waitq, 389 wait_event_interruptible(rm->m_flush_wait,
398 !test_bit(RDS_MSG_MAPPED, &rm->m_flags)); 390 !test_bit(RDS_MSG_MAPPED, &rm->m_flags));
399} 391}
400 392
401void rds_message_unmapped(struct rds_message *rm) 393void rds_message_unmapped(struct rds_message *rm)
402{ 394{
403 clear_bit(RDS_MSG_MAPPED, &rm->m_flags); 395 clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
404 if (waitqueue_active(&rds_message_flush_waitq)) 396 wake_up_interruptible(&rm->m_flush_wait);
405 wake_up(&rds_message_flush_waitq);
406} 397}
407EXPORT_SYMBOL_GPL(rds_message_unmapped); 398EXPORT_SYMBOL_GPL(rds_message_unmapped);
408 399
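With the new allocator, a caller sizes the message for every scatterlist entry its ops will need, and each op then carves its share with rds_message_alloc_sgs(). A usage sketch modeled on rds_message_map_pages() above, assuming the declarations in net/rds/rds.h and with error handling kept minimal:

#include <linux/err.h>
#include "rds.h"

static struct rds_message *example_build_msg(unsigned int total_len, gfp_t gfp)
{
	int num_sgs = ceil(total_len, PAGE_SIZE);
	struct rds_message *rm;

	/* One allocation covers the message plus every sg entry it needs. */
	rm = rds_message_alloc(num_sgs * sizeof(struct scatterlist), gfp);
	if (!rm)
		return ERR_PTR(-ENOMEM);

	/* The data op takes its entries out of that pre-sized pool. */
	rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
	if (!rm->data.op_sg) {
		rds_message_put(rm);
		return ERR_PTR(-ENOMEM);
	}
	rm->data.op_nents = num_sgs;

	return rm;
}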
diff --git a/net/rds/page.c b/net/rds/page.c
index 595a952d4b17..d8acdebe3c7c 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -40,7 +40,8 @@ struct rds_page_remainder {
40 unsigned long r_offset; 40 unsigned long r_offset;
41}; 41};
42 42
43DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders); 43static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder,
44 rds_page_remainders);
44 45
45/* 46/*
46 * returns 0 on success or -errno on failure. 47 * returns 0 on success or -errno on failure.
@@ -57,30 +58,17 @@ int rds_page_copy_user(struct page *page, unsigned long offset,
57 unsigned long ret; 58 unsigned long ret;
58 void *addr; 59 void *addr;
59 60
60 if (to_user) 61 addr = kmap(page);
62 if (to_user) {
61 rds_stats_add(s_copy_to_user, bytes); 63 rds_stats_add(s_copy_to_user, bytes);
62 else 64 ret = copy_to_user(ptr, addr + offset, bytes);
65 } else {
63 rds_stats_add(s_copy_from_user, bytes); 66 rds_stats_add(s_copy_from_user, bytes);
64 67 ret = copy_from_user(addr + offset, ptr, bytes);
65 addr = kmap_atomic(page, KM_USER0);
66 if (to_user)
67 ret = __copy_to_user_inatomic(ptr, addr + offset, bytes);
68 else
69 ret = __copy_from_user_inatomic(addr + offset, ptr, bytes);
70 kunmap_atomic(addr, KM_USER0);
71
72 if (ret) {
73 addr = kmap(page);
74 if (to_user)
75 ret = copy_to_user(ptr, addr + offset, bytes);
76 else
77 ret = copy_from_user(addr + offset, ptr, bytes);
78 kunmap(page);
79 if (ret)
80 return -EFAULT;
81 } 68 }
69 kunmap(page);
82 70
83 return 0; 71 return ret ? -EFAULT : 0;
84} 72}
85EXPORT_SYMBOL_GPL(rds_page_copy_user); 73EXPORT_SYMBOL_GPL(rds_page_copy_user);
86 74
@@ -116,7 +104,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
116 /* jump straight to allocation if we're trying for a huge page */ 104 /* jump straight to allocation if we're trying for a huge page */
117 if (bytes >= PAGE_SIZE) { 105 if (bytes >= PAGE_SIZE) {
118 page = alloc_page(gfp); 106 page = alloc_page(gfp);
119 if (page == NULL) { 107 if (!page) {
120 ret = -ENOMEM; 108 ret = -ENOMEM;
121 } else { 109 } else {
122 sg_set_page(scat, page, PAGE_SIZE, 0); 110 sg_set_page(scat, page, PAGE_SIZE, 0);
@@ -162,7 +150,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
162 rem = &per_cpu(rds_page_remainders, get_cpu()); 150 rem = &per_cpu(rds_page_remainders, get_cpu());
163 local_irq_save(flags); 151 local_irq_save(flags);
164 152
165 if (page == NULL) { 153 if (!page) {
166 ret = -ENOMEM; 154 ret = -ENOMEM;
167 break; 155 break;
168 } 156 }
@@ -186,6 +174,7 @@ out:
186 ret ? 0 : scat->length); 174 ret ? 0 : scat->length);
187 return ret; 175 return ret;
188} 176}
177EXPORT_SYMBOL_GPL(rds_page_remainder_alloc);
189 178
190static int rds_page_remainder_cpu_notify(struct notifier_block *self, 179static int rds_page_remainder_cpu_notify(struct notifier_block *self,
191 unsigned long action, void *hcpu) 180 unsigned long action, void *hcpu)
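The rds_page_copy_user() rewrite works because copy_to_user()/copy_from_user() may fault and sleep, which is not allowed under kmap_atomic(); mapping with plain kmap() lets a single copy handle both the fast and the faulting case. The same shape as a tiny helper (hypothetical name):

#include <linux/highmem.h>
#include <linux/uaccess.h>

/* Copy `bytes` from a (possibly highmem) page into a user buffer. */
static int example_page_to_user(struct page *page, unsigned long offset,
				void __user *ptr, unsigned long bytes)
{
	void *addr;
	unsigned long left;

	addr = kmap(page);		/* may sleep, unlike kmap_atomic() */
	left = copy_to_user(ptr, addr + offset, bytes);	/* may fault and sleep */
	kunmap(page);

	return left ? -EFAULT : 0;
}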
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 75fd13bb631b..8920f2a83327 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -35,7 +35,7 @@
35#include <linux/rbtree.h> 35#include <linux/rbtree.h>
36#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */ 36#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
37 37
38#include "rdma.h" 38#include "rds.h"
39 39
40/* 40/*
41 * XXX 41 * XXX
@@ -130,14 +130,22 @@ void rds_rdma_drop_keys(struct rds_sock *rs)
130{ 130{
131 struct rds_mr *mr; 131 struct rds_mr *mr;
132 struct rb_node *node; 132 struct rb_node *node;
133 unsigned long flags;
133 134
134 /* Release any MRs associated with this socket */ 135 /* Release any MRs associated with this socket */
136 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
135 while ((node = rb_first(&rs->rs_rdma_keys))) { 137 while ((node = rb_first(&rs->rs_rdma_keys))) {
136 mr = container_of(node, struct rds_mr, r_rb_node); 138 mr = container_of(node, struct rds_mr, r_rb_node);
137 if (mr->r_trans == rs->rs_transport) 139 if (mr->r_trans == rs->rs_transport)
138 mr->r_invalidate = 0; 140 mr->r_invalidate = 0;
141 rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
142 RB_CLEAR_NODE(&mr->r_rb_node);
143 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
144 rds_destroy_mr(mr);
139 rds_mr_put(mr); 145 rds_mr_put(mr);
146 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
140 } 147 }
148 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
141 149
142 if (rs->rs_transport && rs->rs_transport->flush_mrs) 150 if (rs->rs_transport && rs->rs_transport->flush_mrs)
143 rs->rs_transport->flush_mrs(); 151 rs->rs_transport->flush_mrs();
@@ -181,7 +189,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
181 goto out; 189 goto out;
182 } 190 }
183 191
184 if (rs->rs_transport->get_mr == NULL) { 192 if (!rs->rs_transport->get_mr) {
185 ret = -EOPNOTSUPP; 193 ret = -EOPNOTSUPP;
186 goto out; 194 goto out;
187 } 195 }
@@ -197,13 +205,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
197 205
198 /* XXX clamp nr_pages to limit the size of this alloc? */ 206 /* XXX clamp nr_pages to limit the size of this alloc? */
199 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); 207 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
200 if (pages == NULL) { 208 if (!pages) {
201 ret = -ENOMEM; 209 ret = -ENOMEM;
202 goto out; 210 goto out;
203 } 211 }
204 212
205 mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); 213 mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
206 if (mr == NULL) { 214 if (!mr) {
207 ret = -ENOMEM; 215 ret = -ENOMEM;
208 goto out; 216 goto out;
209 } 217 }
@@ -230,13 +238,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
230 * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to 238 * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
231 * the zero page. 239 * the zero page.
232 */ 240 */
233 ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1); 241 ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
234 if (ret < 0) 242 if (ret < 0)
235 goto out; 243 goto out;
236 244
237 nents = ret; 245 nents = ret;
238 sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); 246 sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
239 if (sg == NULL) { 247 if (!sg) {
240 ret = -ENOMEM; 248 ret = -ENOMEM;
241 goto out; 249 goto out;
242 } 250 }
@@ -406,68 +414,153 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
406 414
407 spin_lock_irqsave(&rs->rs_rdma_lock, flags); 415 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
408 mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); 416 mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
409 if (mr && (mr->r_use_once || force)) { 417 if (!mr) {
418 printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key);
419 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
420 return;
421 }
422
423 if (mr->r_use_once || force) {
410 rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); 424 rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
411 RB_CLEAR_NODE(&mr->r_rb_node); 425 RB_CLEAR_NODE(&mr->r_rb_node);
412 zot_me = 1; 426 zot_me = 1;
413 } else if (mr) 427 }
414 atomic_inc(&mr->r_refcount);
415 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); 428 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
416 429
417 /* May have to issue a dma_sync on this memory region. 430 /* May have to issue a dma_sync on this memory region.
418 * Note we could avoid this if the operation was a RDMA READ, 431 * Note we could avoid this if the operation was a RDMA READ,
419 * but at this point we can't tell. */ 432 * but at this point we can't tell. */
420 if (mr != NULL) { 433 if (mr->r_trans->sync_mr)
421 if (mr->r_trans->sync_mr) 434 mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
422 mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); 435
423 436 /* If the MR was marked as invalidate, this will
424 /* If the MR was marked as invalidate, this will 437 * trigger an async flush. */
425 * trigger an async flush. */ 438 if (zot_me)
426 if (zot_me) 439 rds_destroy_mr(mr);
427 rds_destroy_mr(mr); 440 rds_mr_put(mr);
428 rds_mr_put(mr);
429 }
430} 441}
431 442
432void rds_rdma_free_op(struct rds_rdma_op *ro) 443void rds_rdma_free_op(struct rm_rdma_op *ro)
433{ 444{
434 unsigned int i; 445 unsigned int i;
435 446
436 for (i = 0; i < ro->r_nents; i++) { 447 for (i = 0; i < ro->op_nents; i++) {
437 struct page *page = sg_page(&ro->r_sg[i]); 448 struct page *page = sg_page(&ro->op_sg[i]);
438 449
439 /* Mark page dirty if it was possibly modified, which 450 /* Mark page dirty if it was possibly modified, which
440 * is the case for a RDMA_READ which copies from remote 451 * is the case for a RDMA_READ which copies from remote
441 * to local memory */ 452 * to local memory */
442 if (!ro->r_write) { 453 if (!ro->op_write) {
443 BUG_ON(in_interrupt()); 454 BUG_ON(irqs_disabled());
444 set_page_dirty(page); 455 set_page_dirty(page);
445 } 456 }
446 put_page(page); 457 put_page(page);
447 } 458 }
448 459
449 kfree(ro->r_notifier); 460 kfree(ro->op_notifier);
450 kfree(ro); 461 ro->op_notifier = NULL;
462 ro->op_active = 0;
463}
464
465void rds_atomic_free_op(struct rm_atomic_op *ao)
466{
467 struct page *page = sg_page(ao->op_sg);
468
469 /* Mark page dirty if it was possibly modified, which
470 * is the case for a RDMA_READ which copies from remote
471 * to local memory */
472 set_page_dirty(page);
473 put_page(page);
474
475 kfree(ao->op_notifier);
476 ao->op_notifier = NULL;
477 ao->op_active = 0;
451} 478}
452 479
480
453/* 481/*
454 * args is a pointer to an in-kernel copy in the sendmsg cmsg. 482 * Count the number of pages needed to describe an incoming iovec array.
455 */ 483 */
456static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, 484static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs)
457 struct rds_rdma_args *args) 485{
486 int tot_pages = 0;
487 unsigned int nr_pages;
488 unsigned int i;
489
490 /* figure out the number of pages in the vector */
491 for (i = 0; i < nr_iovecs; i++) {
492 nr_pages = rds_pages_in_vec(&iov[i]);
493 if (nr_pages == 0)
494 return -EINVAL;
495
496 tot_pages += nr_pages;
497
498 /*
499 * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
500 * so tot_pages cannot overflow without first going negative.
501 */
502 if (tot_pages < 0)
503 return -EINVAL;
504 }
505
506 return tot_pages;
507}
508
509int rds_rdma_extra_size(struct rds_rdma_args *args)
458{ 510{
459 struct rds_iovec vec; 511 struct rds_iovec vec;
460 struct rds_rdma_op *op = NULL; 512 struct rds_iovec __user *local_vec;
513 int tot_pages = 0;
461 unsigned int nr_pages; 514 unsigned int nr_pages;
462 unsigned int max_pages; 515 unsigned int i;
516
517 local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
518
519 /* figure out the number of pages in the vector */
520 for (i = 0; i < args->nr_local; i++) {
521 if (copy_from_user(&vec, &local_vec[i],
522 sizeof(struct rds_iovec)))
523 return -EFAULT;
524
525 nr_pages = rds_pages_in_vec(&vec);
526 if (nr_pages == 0)
527 return -EINVAL;
528
529 tot_pages += nr_pages;
530
531 /*
532 * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
533 * so tot_pages cannot overflow without first going negative.
534 */
535 if (tot_pages < 0)
536 return -EINVAL;
537 }
538
539 return tot_pages * sizeof(struct scatterlist);
540}
541
542/*
543 * The application asks for a RDMA transfer.
544 * Extract all arguments and set up the rdma_op
545 */
546int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
547 struct cmsghdr *cmsg)
548{
549 struct rds_rdma_args *args;
550 struct rm_rdma_op *op = &rm->rdma;
551 int nr_pages;
463 unsigned int nr_bytes; 552 unsigned int nr_bytes;
464 struct page **pages = NULL; 553 struct page **pages = NULL;
465 struct rds_iovec __user *local_vec; 554 struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack;
466 struct scatterlist *sg; 555 int iov_size;
467 unsigned int nr;
468 unsigned int i, j; 556 unsigned int i, j;
469 int ret; 557 int ret = 0;
558
559 if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
560 || rm->rdma.op_active)
561 return -EINVAL;
470 562
563 args = CMSG_DATA(cmsg);
471 564
472 if (rs->rs_bound_addr == 0) { 565 if (rs->rs_bound_addr == 0) {
473 ret = -ENOTCONN; /* XXX not a great errno */ 566 ret = -ENOTCONN; /* XXX not a great errno */
@@ -479,61 +572,59 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
479 goto out; 572 goto out;
480 } 573 }
481 574
482 nr_pages = 0; 575 /* Check whether to allocate the iovec area */
483 max_pages = 0; 576 iov_size = args->nr_local * sizeof(struct rds_iovec);
484 577 if (args->nr_local > UIO_FASTIOV) {
485 local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; 578 iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL);
486 579 if (!iovs) {
487 /* figure out the number of pages in the vector */ 580 ret = -ENOMEM;
488 for (i = 0; i < args->nr_local; i++) {
489 if (copy_from_user(&vec, &local_vec[i],
490 sizeof(struct rds_iovec))) {
491 ret = -EFAULT;
492 goto out;
493 }
494
495 nr = rds_pages_in_vec(&vec);
496 if (nr == 0) {
497 ret = -EINVAL;
498 goto out; 581 goto out;
499 } 582 }
583 }
500 584
501 max_pages = max(nr, max_pages); 585 if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) {
502 nr_pages += nr; 586 ret = -EFAULT;
587 goto out;
503 } 588 }
504 589
505 pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL); 590 nr_pages = rds_rdma_pages(iovs, args->nr_local);
506 if (pages == NULL) { 591 if (nr_pages < 0) {
507 ret = -ENOMEM; 592 ret = -EINVAL;
508 goto out; 593 goto out;
509 } 594 }
510 595
511 op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL); 596 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
512 if (op == NULL) { 597 if (!pages) {
513 ret = -ENOMEM; 598 ret = -ENOMEM;
514 goto out; 599 goto out;
515 } 600 }
516 601
517 op->r_write = !!(args->flags & RDS_RDMA_READWRITE); 602 op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
518 op->r_fence = !!(args->flags & RDS_RDMA_FENCE); 603 op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
519 op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); 604 op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
520 op->r_recverr = rs->rs_recverr; 605 op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
606 op->op_active = 1;
607 op->op_recverr = rs->rs_recverr;
521 WARN_ON(!nr_pages); 608 WARN_ON(!nr_pages);
522 sg_init_table(op->r_sg, nr_pages); 609 op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
610 if (!op->op_sg) {
611 ret = -ENOMEM;
612 goto out;
613 }
523 614
524 if (op->r_notify || op->r_recverr) { 615 if (op->op_notify || op->op_recverr) {
525 /* We allocate an uninitialized notifier here, because 616 /* We allocate an uninitialized notifier here, because
526 * we don't want to do that in the completion handler. We 617 * we don't want to do that in the completion handler. We
527 * would have to use GFP_ATOMIC there, and don't want to deal 618 * would have to use GFP_ATOMIC there, and don't want to deal
528 * with failed allocations. 619 * with failed allocations.
529 */ 620 */
530 op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); 621 op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
531 if (!op->r_notifier) { 622 if (!op->op_notifier) {
532 ret = -ENOMEM; 623 ret = -ENOMEM;
533 goto out; 624 goto out;
534 } 625 }
535 op->r_notifier->n_user_token = args->user_token; 626 op->op_notifier->n_user_token = args->user_token;
536 op->r_notifier->n_status = RDS_RDMA_SUCCESS; 627 op->op_notifier->n_status = RDS_RDMA_SUCCESS;
537 } 628 }
538 629
539 /* The cookie contains the R_Key of the remote memory region, and 630 /* The cookie contains the R_Key of the remote memory region, and
@@ -543,68 +634,55 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
543 * destination address (which is really an offset into the MR) 634 * destination address (which is really an offset into the MR)
544 * FIXME: We may want to move this into ib_rdma.c 635 * FIXME: We may want to move this into ib_rdma.c
545 */ 636 */
546 op->r_key = rds_rdma_cookie_key(args->cookie); 637 op->op_rkey = rds_rdma_cookie_key(args->cookie);
547 op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); 638 op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
548 639
549 nr_bytes = 0; 640 nr_bytes = 0;
550 641
551 rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", 642 rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
552 (unsigned long long)args->nr_local, 643 (unsigned long long)args->nr_local,
553 (unsigned long long)args->remote_vec.addr, 644 (unsigned long long)args->remote_vec.addr,
554 op->r_key); 645 op->op_rkey);
555 646
556 for (i = 0; i < args->nr_local; i++) { 647 for (i = 0; i < args->nr_local; i++) {
557 if (copy_from_user(&vec, &local_vec[i], 648 struct rds_iovec *iov = &iovs[i];
558 sizeof(struct rds_iovec))) { 649 /* don't need to check, rds_rdma_pages() verified nr will be +nonzero */
559 ret = -EFAULT; 650 unsigned int nr = rds_pages_in_vec(iov);
560 goto out;
561 }
562 651
563 nr = rds_pages_in_vec(&vec); 652 rs->rs_user_addr = iov->addr;
564 if (nr == 0) { 653 rs->rs_user_bytes = iov->bytes;
565 ret = -EINVAL;
566 goto out;
567 }
568 654
569 rs->rs_user_addr = vec.addr;
570 rs->rs_user_bytes = vec.bytes;
571
572 /* did the user change the vec under us? */
573 if (nr > max_pages || op->r_nents + nr > nr_pages) {
574 ret = -EINVAL;
575 goto out;
576 }
577 /* If it's a WRITE operation, we want to pin the pages for reading. 655 /* If it's a WRITE operation, we want to pin the pages for reading.
578 * If it's a READ operation, we need to pin the pages for writing. 656 * If it's a READ operation, we need to pin the pages for writing.
579 */ 657 */
580 ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write); 658 ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
581 if (ret < 0) 659 if (ret < 0)
582 goto out; 660 goto out;
583 661
584 rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n", 662 rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
585 nr_bytes, nr, vec.bytes, vec.addr); 663 nr_bytes, nr, iov->bytes, iov->addr);
586 664
587 nr_bytes += vec.bytes; 665 nr_bytes += iov->bytes;
588 666
589 for (j = 0; j < nr; j++) { 667 for (j = 0; j < nr; j++) {
590 unsigned int offset = vec.addr & ~PAGE_MASK; 668 unsigned int offset = iov->addr & ~PAGE_MASK;
669 struct scatterlist *sg;
591 670
592 sg = &op->r_sg[op->r_nents + j]; 671 sg = &op->op_sg[op->op_nents + j];
593 sg_set_page(sg, pages[j], 672 sg_set_page(sg, pages[j],
594 min_t(unsigned int, vec.bytes, PAGE_SIZE - offset), 673 min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
595 offset); 674 offset);
596 675
597 rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n", 676 rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n",
598 sg->offset, sg->length, vec.addr, vec.bytes); 677 sg->offset, sg->length, iov->addr, iov->bytes);
599 678
600 vec.addr += sg->length; 679 iov->addr += sg->length;
601 vec.bytes -= sg->length; 680 iov->bytes -= sg->length;
602 } 681 }
603 682
604 op->r_nents += nr; 683 op->op_nents += nr;
605 } 684 }
606 685
607
608 if (nr_bytes > args->remote_vec.bytes) { 686 if (nr_bytes > args->remote_vec.bytes) {
609 rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", 687 rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
610 nr_bytes, 688 nr_bytes,
@@ -612,38 +690,18 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
612 ret = -EINVAL; 690 ret = -EINVAL;
613 goto out; 691 goto out;
614 } 692 }
615 op->r_bytes = nr_bytes; 693 op->op_bytes = nr_bytes;
616 694
617 ret = 0;
618out: 695out:
696 if (iovs != iovstack)
697 sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size);
619 kfree(pages); 698 kfree(pages);
620 if (ret) { 699 if (ret)
621 if (op) 700 rds_rdma_free_op(op);
622 rds_rdma_free_op(op); 701 else
623 op = ERR_PTR(ret); 702 rds_stats_inc(s_send_rdma);
624 }
625 return op;
626}
627
628/*
629 * The application asks for a RDMA transfer.
630 * Extract all arguments and set up the rdma_op
631 */
632int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
633 struct cmsghdr *cmsg)
634{
635 struct rds_rdma_op *op;
636
637 if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) ||
638 rm->m_rdma_op != NULL)
639 return -EINVAL;
640 703
641 op = rds_rdma_prepare(rs, CMSG_DATA(cmsg)); 704 return ret;
642 if (IS_ERR(op))
643 return PTR_ERR(op);
644 rds_stats_inc(s_send_rdma);
645 rm->m_rdma_op = op;
646 return 0;
647} 705}
648 706
649/* 707/*
@@ -673,7 +731,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
673 731
674 spin_lock_irqsave(&rs->rs_rdma_lock, flags); 732 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
675 mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); 733 mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
676 if (mr == NULL) 734 if (!mr)
677 err = -EINVAL; /* invalid r_key */ 735 err = -EINVAL; /* invalid r_key */
678 else 736 else
679 atomic_inc(&mr->r_refcount); 737 atomic_inc(&mr->r_refcount);
@@ -681,7 +739,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
681 739
682 if (mr) { 740 if (mr) {
683 mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); 741 mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
684 rm->m_rdma_mr = mr; 742 rm->rdma.op_rdma_mr = mr;
685 } 743 }
686 return err; 744 return err;
687} 745}
@@ -699,5 +757,102 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
699 rm->m_rdma_cookie != 0) 757 rm->m_rdma_cookie != 0)
700 return -EINVAL; 758 return -EINVAL;
701 759
702 return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr); 760 return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr);
761}
762
763/*
764 * Fill in rds_message for an atomic request.
765 */
766int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
767 struct cmsghdr *cmsg)
768{
769 struct page *page = NULL;
770 struct rds_atomic_args *args;
771 int ret = 0;
772
773 if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args))
774 || rm->atomic.op_active)
775 return -EINVAL;
776
777 args = CMSG_DATA(cmsg);
778
779 /* Nonmasked & masked cmsg ops converted to masked hw ops */
780 switch (cmsg->cmsg_type) {
781 case RDS_CMSG_ATOMIC_FADD:
782 rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
783 rm->atomic.op_m_fadd.add = args->fadd.add;
784 rm->atomic.op_m_fadd.nocarry_mask = 0;
785 break;
786 case RDS_CMSG_MASKED_ATOMIC_FADD:
787 rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
788 rm->atomic.op_m_fadd.add = args->m_fadd.add;
789 rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask;
790 break;
791 case RDS_CMSG_ATOMIC_CSWP:
792 rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
793 rm->atomic.op_m_cswp.compare = args->cswp.compare;
794 rm->atomic.op_m_cswp.swap = args->cswp.swap;
795 rm->atomic.op_m_cswp.compare_mask = ~0;
796 rm->atomic.op_m_cswp.swap_mask = ~0;
797 break;
798 case RDS_CMSG_MASKED_ATOMIC_CSWP:
799 rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
800 rm->atomic.op_m_cswp.compare = args->m_cswp.compare;
801 rm->atomic.op_m_cswp.swap = args->m_cswp.swap;
802 rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask;
803 rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask;
804 break;
805 default:
806 BUG(); /* should never happen */
807 }
808
809 rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
810 rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
811 rm->atomic.op_active = 1;
812 rm->atomic.op_recverr = rs->rs_recverr;
813 rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
814 if (!rm->atomic.op_sg) {
815 ret = -ENOMEM;
816 goto err;
817 }
818
819 /* verify 8 byte-aligned */
820 if (args->local_addr & 0x7) {
821 ret = -EFAULT;
822 goto err;
823 }
824
825 ret = rds_pin_pages(args->local_addr, 1, &page, 1);
826 if (ret != 1)
827 goto err;
828 ret = 0;
829
830 sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr));
831
832 if (rm->atomic.op_notify || rm->atomic.op_recverr) {
833 /* We allocate an uninitialized notifier here, because
834 * we don't want to do that in the completion handler. We
835 * would have to use GFP_ATOMIC there, and don't want to deal
836 * with failed allocations.
837 */
838 rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL);
839 if (!rm->atomic.op_notifier) {
840 ret = -ENOMEM;
841 goto err;
842 }
843
844 rm->atomic.op_notifier->n_user_token = args->user_token;
845 rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS;
846 }
847
848 rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie);
849 rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);
850
851 return ret;
852err:
853 if (page)
854 put_page(page);
855 kfree(rm->atomic.op_notifier);
856
857 return ret;
703} 858}
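Editorial note: the switch in rds_cmsg_atomic() folds the plain FADD/CSWP cmsg types into their masked counterparts (an all-ones mask, or a zero nocarry_mask, reproduces the unmasked behaviour). As a rough illustration of the masked compare-and-swap semantics being requested, here is a hypothetical software model; the real operation is executed by the remote RDMA hardware, not by this code.

	/* Hypothetical model of a masked compare-and-swap on one 64-bit word. */
	static u64 masked_cswp(u64 old, const struct rm_atomic_op *ao)
	{
		if ((old & ao->op_m_cswp.compare_mask) ==
		    (ao->op_m_cswp.compare & ao->op_m_cswp.compare_mask))
			return (old & ~ao->op_m_cswp.swap_mask) |
			       (ao->op_m_cswp.swap & ao->op_m_cswp.swap_mask);
		return old;	/* masked compare failed: no swap */
	}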
diff --git a/net/rds/rdma.h b/net/rds/rdma.h
deleted file mode 100644
index 909c39835a5d..000000000000
--- a/net/rds/rdma.h
+++ /dev/null
@@ -1,85 +0,0 @@
1#ifndef _RDS_RDMA_H
2#define _RDS_RDMA_H
3
4#include <linux/rbtree.h>
5#include <linux/spinlock.h>
6#include <linux/scatterlist.h>
7
8#include "rds.h"
9
10struct rds_mr {
11 struct rb_node r_rb_node;
12 atomic_t r_refcount;
13 u32 r_key;
14
15 /* A copy of the creation flags */
16 unsigned int r_use_once:1;
17 unsigned int r_invalidate:1;
18 unsigned int r_write:1;
19
20 /* This is for RDS_MR_DEAD.
21 * It would be nice & consistent to make this part of the above
22 * bit field here, but we need to use test_and_set_bit.
23 */
24 unsigned long r_state;
25 struct rds_sock *r_sock; /* back pointer to the socket that owns us */
26 struct rds_transport *r_trans;
27 void *r_trans_private;
28};
29
30/* Flags for mr->r_state */
31#define RDS_MR_DEAD 0
32
33struct rds_rdma_op {
34 u32 r_key;
35 u64 r_remote_addr;
36 unsigned int r_write:1;
37 unsigned int r_fence:1;
38 unsigned int r_notify:1;
39 unsigned int r_recverr:1;
40 unsigned int r_mapped:1;
41 struct rds_notifier *r_notifier;
42 unsigned int r_bytes;
43 unsigned int r_nents;
44 unsigned int r_count;
45 struct scatterlist r_sg[0];
46};
47
48static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
49{
50 return r_key | (((u64) offset) << 32);
51}
52
53static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
54{
55 return cookie;
56}
57
58static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
59{
60 return cookie >> 32;
61}
62
63int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
64int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
65int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
66void rds_rdma_drop_keys(struct rds_sock *rs);
67int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
68 struct cmsghdr *cmsg);
69int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
70 struct cmsghdr *cmsg);
71int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
72 struct cmsghdr *cmsg);
73int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
74 struct cmsghdr *cmsg);
75void rds_rdma_free_op(struct rds_rdma_op *ro);
76void rds_rdma_send_complete(struct rds_message *rm, int);
77
78extern void __rds_put_mr_final(struct rds_mr *mr);
79static inline void rds_mr_put(struct rds_mr *mr)
80{
81 if (atomic_dec_and_test(&mr->r_refcount))
82 __rds_put_mr_final(mr);
83}
84
85#endif
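Editorial note: the cookie helpers deleted here (and re-added to rds.h below) pack an MR's R_Key and a byte offset into one 64-bit value that travels in the RDS header. A small worked round trip, with illustrative values:

	/* Inside some caller, illustrative values only: */
	u32 r_key  = 0x1234;
	u32 offset = 0x10;	/* byte offset into the MR */
	rds_rdma_cookie_t cookie = rds_rdma_make_cookie(r_key, offset);

	/*
	 * cookie == 0x0000001000001234
	 *   rds_rdma_cookie_key(cookie)    == 0x1234  (low 32 bits)
	 *   rds_rdma_cookie_offset(cookie) == 0x10    (high 32 bits)
	 *
	 * rds_cmsg_rdma_args() above then resolves the target address as
	 *   op->op_remote_addr = args->remote_vec.addr
	 *                        + rds_rdma_cookie_offset(args->cookie);
	 */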
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index e599ba2f950d..4195a0539829 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -36,6 +36,34 @@
36 36
37static struct rdma_cm_id *rds_rdma_listen_id; 37static struct rdma_cm_id *rds_rdma_listen_id;
38 38
39static char *rds_cm_event_strings[] = {
40#define RDS_CM_EVENT_STRING(foo) \
41 [RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo)
42 RDS_CM_EVENT_STRING(ADDR_RESOLVED),
43 RDS_CM_EVENT_STRING(ADDR_ERROR),
44 RDS_CM_EVENT_STRING(ROUTE_RESOLVED),
45 RDS_CM_EVENT_STRING(ROUTE_ERROR),
46 RDS_CM_EVENT_STRING(CONNECT_REQUEST),
47 RDS_CM_EVENT_STRING(CONNECT_RESPONSE),
48 RDS_CM_EVENT_STRING(CONNECT_ERROR),
49 RDS_CM_EVENT_STRING(UNREACHABLE),
50 RDS_CM_EVENT_STRING(REJECTED),
51 RDS_CM_EVENT_STRING(ESTABLISHED),
52 RDS_CM_EVENT_STRING(DISCONNECTED),
53 RDS_CM_EVENT_STRING(DEVICE_REMOVAL),
54 RDS_CM_EVENT_STRING(MULTICAST_JOIN),
55 RDS_CM_EVENT_STRING(MULTICAST_ERROR),
56 RDS_CM_EVENT_STRING(ADDR_CHANGE),
57 RDS_CM_EVENT_STRING(TIMEWAIT_EXIT),
58#undef RDS_CM_EVENT_STRING
59};
60
61static char *rds_cm_event_str(enum rdma_cm_event_type type)
62{
63 return rds_str_array(rds_cm_event_strings,
64 ARRAY_SIZE(rds_cm_event_strings), type);
65};
66
39int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, 67int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
40 struct rdma_cm_event *event) 68 struct rdma_cm_event *event)
41{ 69{
@@ -44,8 +72,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
44 struct rds_transport *trans; 72 struct rds_transport *trans;
45 int ret = 0; 73 int ret = 0;
46 74
47 rdsdebug("conn %p id %p handling event %u\n", conn, cm_id, 75 rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
48 event->event); 76 event->event, rds_cm_event_str(event->event));
49 77
50 if (cm_id->device->node_type == RDMA_NODE_RNIC) 78 if (cm_id->device->node_type == RDMA_NODE_RNIC)
51 trans = &rds_iw_transport; 79 trans = &rds_iw_transport;
@@ -109,7 +137,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
109 137
110 default: 138 default:
111 /* things like device disconnect? */ 139 /* things like device disconnect? */
112 printk(KERN_ERR "RDS: unknown event %u!\n", event->event); 140 printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
141 event->event, rds_cm_event_str(event->event));
113 break; 142 break;
114 } 143 }
115 144
@@ -117,12 +146,13 @@ out:
117 if (conn) 146 if (conn)
118 mutex_unlock(&conn->c_cm_lock); 147 mutex_unlock(&conn->c_cm_lock);
119 148
120 rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret); 149 rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event,
150 rds_cm_event_str(event->event), ret);
121 151
122 return ret; 152 return ret;
123} 153}
124 154
125static int __init rds_rdma_listen_init(void) 155static int rds_rdma_listen_init(void)
126{ 156{
127 struct sockaddr_in sin; 157 struct sockaddr_in sin;
128 struct rdma_cm_id *cm_id; 158 struct rdma_cm_id *cm_id;
@@ -177,7 +207,7 @@ static void rds_rdma_listen_stop(void)
177 } 207 }
178} 208}
179 209
180int __init rds_rdma_init(void) 210static int rds_rdma_init(void)
181{ 211{
182 int ret; 212 int ret;
183 213
@@ -204,7 +234,7 @@ out:
204} 234}
205module_init(rds_rdma_init); 235module_init(rds_rdma_init);
206 236
207void rds_rdma_exit(void) 237static void rds_rdma_exit(void)
208{ 238{
209 /* stop listening first to ensure no new connections are attempted */ 239 /* stop listening first to ensure no new connections are attempted */
210 rds_rdma_listen_stop(); 240 rds_rdma_listen_stop();
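Editorial note: the event-string table added above combines __stringify() with designated initializers, and rds_str_array() (declared in rds.h below) is assumed to do a bounds-checked lookup with a fallback for unknown indices. The same pattern, sketched for a hypothetical two-entry enum:

	/* Sketch of the pattern; MY_EV_* and my_ev_str() are hypothetical. */
	enum my_ev { MY_EV_UP, MY_EV_DOWN };

	static char *my_ev_strings[] = {
	#define MY_EV_STRING(foo) [MY_EV_##foo] = __stringify(MY_EV_##foo)
		MY_EV_STRING(UP),
		MY_EV_STRING(DOWN),
	#undef MY_EV_STRING
	};

	static char *my_ev_str(enum my_ev type)
	{
		return rds_str_array(my_ev_strings, ARRAY_SIZE(my_ev_strings), type);
	}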
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
index 2f2c7d976c21..faba4e382695 100644
--- a/net/rds/rdma_transport.h
+++ b/net/rds/rdma_transport.h
@@ -11,10 +11,6 @@ int rds_rdma_conn_connect(struct rds_connection *conn);
11int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, 11int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
12 struct rdma_cm_event *event); 12 struct rdma_cm_event *event);
13 13
14/* from rdma_transport.c */
15int rds_rdma_init(void);
16void rds_rdma_exit(void);
17
18/* from ib.c */ 14/* from ib.c */
19extern struct rds_transport rds_ib_transport; 15extern struct rds_transport rds_ib_transport;
20int rds_ib_init(void); 16int rds_ib_init(void);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index c224b5bb3ba9..9542449c0720 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -80,6 +80,7 @@ enum {
80/* Bits for c_flags */ 80/* Bits for c_flags */
81#define RDS_LL_SEND_FULL 0 81#define RDS_LL_SEND_FULL 0
82#define RDS_RECONNECT_PENDING 1 82#define RDS_RECONNECT_PENDING 1
83#define RDS_IN_XMIT 2
83 84
84struct rds_connection { 85struct rds_connection {
85 struct hlist_node c_hash_node; 86 struct hlist_node c_hash_node;
@@ -91,12 +92,13 @@ struct rds_connection {
91 struct rds_cong_map *c_lcong; 92 struct rds_cong_map *c_lcong;
92 struct rds_cong_map *c_fcong; 93 struct rds_cong_map *c_fcong;
93 94
94 struct mutex c_send_lock; /* protect send ring */
95 struct rds_message *c_xmit_rm; 95 struct rds_message *c_xmit_rm;
96 unsigned long c_xmit_sg; 96 unsigned long c_xmit_sg;
97 unsigned int c_xmit_hdr_off; 97 unsigned int c_xmit_hdr_off;
98 unsigned int c_xmit_data_off; 98 unsigned int c_xmit_data_off;
99 unsigned int c_xmit_atomic_sent;
99 unsigned int c_xmit_rdma_sent; 100 unsigned int c_xmit_rdma_sent;
101 unsigned int c_xmit_data_sent;
100 102
101 spinlock_t c_lock; /* protect msg queues */ 103 spinlock_t c_lock; /* protect msg queues */
102 u64 c_next_tx_seq; 104 u64 c_next_tx_seq;
@@ -116,11 +118,10 @@ struct rds_connection {
116 struct delayed_work c_conn_w; 118 struct delayed_work c_conn_w;
117 struct work_struct c_down_w; 119 struct work_struct c_down_w;
118 struct mutex c_cm_lock; /* protect conn state & cm */ 120 struct mutex c_cm_lock; /* protect conn state & cm */
121 wait_queue_head_t c_waitq;
119 122
120 struct list_head c_map_item; 123 struct list_head c_map_item;
121 unsigned long c_map_queued; 124 unsigned long c_map_queued;
122 unsigned long c_map_offset;
123 unsigned long c_map_bytes;
124 125
125 unsigned int c_unacked_packets; 126 unsigned int c_unacked_packets;
126 unsigned int c_unacked_bytes; 127 unsigned int c_unacked_bytes;
@@ -206,6 +207,48 @@ struct rds_incoming {
206 rds_rdma_cookie_t i_rdma_cookie; 207 rds_rdma_cookie_t i_rdma_cookie;
207}; 208};
208 209
210struct rds_mr {
211 struct rb_node r_rb_node;
212 atomic_t r_refcount;
213 u32 r_key;
214
215 /* A copy of the creation flags */
216 unsigned int r_use_once:1;
217 unsigned int r_invalidate:1;
218 unsigned int r_write:1;
219
220 /* This is for RDS_MR_DEAD.
221 * It would be nice & consistent to make this part of the above
222 * bit field here, but we need to use test_and_set_bit.
223 */
224 unsigned long r_state;
225 struct rds_sock *r_sock; /* back pointer to the socket that owns us */
226 struct rds_transport *r_trans;
227 void *r_trans_private;
228};
229
230/* Flags for mr->r_state */
231#define RDS_MR_DEAD 0
232
233static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
234{
235 return r_key | (((u64) offset) << 32);
236}
237
238static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
239{
240 return cookie;
241}
242
243static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
244{
245 return cookie >> 32;
246}
247
248/* atomic operation types */
249#define RDS_ATOMIC_TYPE_CSWP 0
250#define RDS_ATOMIC_TYPE_FADD 1
251
209/* 252/*
210 * m_sock_item and m_conn_item are on lists that are serialized under 253 * m_sock_item and m_conn_item are on lists that are serialized under
211 * conn->c_lock. m_sock_item has additional meaning in that once it is empty 254 * conn->c_lock. m_sock_item has additional meaning in that once it is empty
@@ -258,13 +301,71 @@ struct rds_message {
258 * -> rs->rs_lock 301 * -> rs->rs_lock
259 */ 302 */
260 spinlock_t m_rs_lock; 303 spinlock_t m_rs_lock;
304 wait_queue_head_t m_flush_wait;
305
261 struct rds_sock *m_rs; 306 struct rds_sock *m_rs;
262 struct rds_rdma_op *m_rdma_op; 307
308 /* cookie to send to remote, in rds header */
263 rds_rdma_cookie_t m_rdma_cookie; 309 rds_rdma_cookie_t m_rdma_cookie;
264 struct rds_mr *m_rdma_mr; 310
265 unsigned int m_nents; 311 unsigned int m_used_sgs;
266 unsigned int m_count; 312 unsigned int m_total_sgs;
267 struct scatterlist m_sg[0]; 313
314 void *m_final_op;
315
316 struct {
317 struct rm_atomic_op {
318 int op_type;
319 union {
320 struct {
321 uint64_t compare;
322 uint64_t swap;
323 uint64_t compare_mask;
324 uint64_t swap_mask;
325 } op_m_cswp;
326 struct {
327 uint64_t add;
328 uint64_t nocarry_mask;
329 } op_m_fadd;
330 };
331
332 u32 op_rkey;
333 u64 op_remote_addr;
334 unsigned int op_notify:1;
335 unsigned int op_recverr:1;
336 unsigned int op_mapped:1;
337 unsigned int op_silent:1;
338 unsigned int op_active:1;
339 struct scatterlist *op_sg;
340 struct rds_notifier *op_notifier;
341
342 struct rds_mr *op_rdma_mr;
343 } atomic;
344 struct rm_rdma_op {
345 u32 op_rkey;
346 u64 op_remote_addr;
347 unsigned int op_write:1;
348 unsigned int op_fence:1;
349 unsigned int op_notify:1;
350 unsigned int op_recverr:1;
351 unsigned int op_mapped:1;
352 unsigned int op_silent:1;
353 unsigned int op_active:1;
354 unsigned int op_bytes;
355 unsigned int op_nents;
356 unsigned int op_count;
357 struct scatterlist *op_sg;
358 struct rds_notifier *op_notifier;
359
360 struct rds_mr *op_rdma_mr;
361 } rdma;
362 struct rm_data_op {
363 unsigned int op_active:1;
364 unsigned int op_nents;
365 unsigned int op_count;
366 struct scatterlist *op_sg;
367 } data;
368 };
268}; 369};
269 370
270/* 371/*
@@ -305,10 +406,6 @@ struct rds_notifier {
305 * transport is responsible for other serialization, including 406 * transport is responsible for other serialization, including
306 * rds_recv_incoming(). This is called in process context but 407 * rds_recv_incoming(). This is called in process context but
307 * should try hard not to block. 408 * should try hard not to block.
308 *
309 * @xmit_cong_map: This asks the transport to send the local bitmap down the
310 * given connection. XXX get a better story about the bitmap
311 * flag and header.
312 */ 409 */
313 410
314#define RDS_TRANS_IB 0 411#define RDS_TRANS_IB 0
@@ -332,13 +429,11 @@ struct rds_transport {
332 void (*xmit_complete)(struct rds_connection *conn); 429 void (*xmit_complete)(struct rds_connection *conn);
333 int (*xmit)(struct rds_connection *conn, struct rds_message *rm, 430 int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
334 unsigned int hdr_off, unsigned int sg, unsigned int off); 431 unsigned int hdr_off, unsigned int sg, unsigned int off);
335 int (*xmit_cong_map)(struct rds_connection *conn, 432 int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
336 struct rds_cong_map *map, unsigned long offset); 433 int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
337 int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op);
338 int (*recv)(struct rds_connection *conn); 434 int (*recv)(struct rds_connection *conn);
339 int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov, 435 int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
340 size_t size); 436 size_t size);
341 void (*inc_purge)(struct rds_incoming *inc);
342 void (*inc_free)(struct rds_incoming *inc); 437 void (*inc_free)(struct rds_incoming *inc);
343 438
344 int (*cm_handle_connect)(struct rdma_cm_id *cm_id, 439 int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
@@ -367,17 +462,11 @@ struct rds_sock {
367 * bound_addr used for both incoming and outgoing, no INADDR_ANY 462 * bound_addr used for both incoming and outgoing, no INADDR_ANY
368 * support. 463 * support.
369 */ 464 */
370 struct rb_node rs_bound_node; 465 struct hlist_node rs_bound_node;
371 __be32 rs_bound_addr; 466 __be32 rs_bound_addr;
372 __be32 rs_conn_addr; 467 __be32 rs_conn_addr;
373 __be16 rs_bound_port; 468 __be16 rs_bound_port;
374 __be16 rs_conn_port; 469 __be16 rs_conn_port;
375
376 /*
377 * This is only used to communicate the transport between bind and
378 * initiating connections. All other trans use is referenced through
379 * the connection.
380 */
381 struct rds_transport *rs_transport; 470 struct rds_transport *rs_transport;
382 471
383 /* 472 /*
@@ -466,8 +555,8 @@ struct rds_statistics {
466 uint64_t s_recv_ping; 555 uint64_t s_recv_ping;
467 uint64_t s_send_queue_empty; 556 uint64_t s_send_queue_empty;
468 uint64_t s_send_queue_full; 557 uint64_t s_send_queue_full;
469 uint64_t s_send_sem_contention; 558 uint64_t s_send_lock_contention;
470 uint64_t s_send_sem_queue_raced; 559 uint64_t s_send_lock_queue_raced;
471 uint64_t s_send_immediate_retry; 560 uint64_t s_send_immediate_retry;
472 uint64_t s_send_delayed_retry; 561 uint64_t s_send_delayed_retry;
473 uint64_t s_send_drop_acked; 562 uint64_t s_send_drop_acked;
@@ -487,6 +576,7 @@ struct rds_statistics {
487}; 576};
488 577
489/* af_rds.c */ 578/* af_rds.c */
579char *rds_str_array(char **array, size_t elements, size_t index);
490void rds_sock_addref(struct rds_sock *rs); 580void rds_sock_addref(struct rds_sock *rs);
491void rds_sock_put(struct rds_sock *rs); 581void rds_sock_put(struct rds_sock *rs);
492void rds_wake_sk_sleep(struct rds_sock *rs); 582void rds_wake_sk_sleep(struct rds_sock *rs);
@@ -521,15 +611,16 @@ void rds_cong_exit(void);
521struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); 611struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
522 612
523/* conn.c */ 613/* conn.c */
524int __init rds_conn_init(void); 614int rds_conn_init(void);
525void rds_conn_exit(void); 615void rds_conn_exit(void);
526struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, 616struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
527 struct rds_transport *trans, gfp_t gfp); 617 struct rds_transport *trans, gfp_t gfp);
528struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, 618struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
529 struct rds_transport *trans, gfp_t gfp); 619 struct rds_transport *trans, gfp_t gfp);
620void rds_conn_shutdown(struct rds_connection *conn);
530void rds_conn_destroy(struct rds_connection *conn); 621void rds_conn_destroy(struct rds_connection *conn);
531void rds_conn_reset(struct rds_connection *conn);
532void rds_conn_drop(struct rds_connection *conn); 622void rds_conn_drop(struct rds_connection *conn);
623void rds_conn_connect_if_down(struct rds_connection *conn);
533void rds_for_each_conn_info(struct socket *sock, unsigned int len, 624void rds_for_each_conn_info(struct socket *sock, unsigned int len,
534 struct rds_info_iterator *iter, 625 struct rds_info_iterator *iter,
535 struct rds_info_lengths *lens, 626 struct rds_info_lengths *lens,
@@ -566,7 +657,8 @@ rds_conn_connecting(struct rds_connection *conn)
566 657
567/* message.c */ 658/* message.c */
568struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); 659struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
569struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, 660struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
661int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
570 size_t total_len); 662 size_t total_len);
571struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); 663struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
572void rds_message_populate_header(struct rds_header *hdr, __be16 sport, 664void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
@@ -575,12 +667,9 @@ int rds_message_add_extension(struct rds_header *hdr,
575 unsigned int type, const void *data, unsigned int len); 667 unsigned int type, const void *data, unsigned int len);
576int rds_message_next_extension(struct rds_header *hdr, 668int rds_message_next_extension(struct rds_header *hdr,
577 unsigned int *pos, void *buf, unsigned int *buflen); 669 unsigned int *pos, void *buf, unsigned int *buflen);
578int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version);
579int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version);
580int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); 670int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
581int rds_message_inc_copy_to_user(struct rds_incoming *inc, 671int rds_message_inc_copy_to_user(struct rds_incoming *inc,
582 struct iovec *first_iov, size_t size); 672 struct iovec *first_iov, size_t size);
583void rds_message_inc_purge(struct rds_incoming *inc);
584void rds_message_inc_free(struct rds_incoming *inc); 673void rds_message_inc_free(struct rds_incoming *inc);
585void rds_message_addref(struct rds_message *rm); 674void rds_message_addref(struct rds_message *rm);
586void rds_message_put(struct rds_message *rm); 675void rds_message_put(struct rds_message *rm);
@@ -614,7 +703,6 @@ void rds_page_exit(void);
614/* recv.c */ 703/* recv.c */
615void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, 704void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
616 __be32 saddr); 705 __be32 saddr);
617void rds_inc_addref(struct rds_incoming *inc);
618void rds_inc_put(struct rds_incoming *inc); 706void rds_inc_put(struct rds_incoming *inc);
619void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, 707void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
620 struct rds_incoming *inc, gfp_t gfp, enum km_type km); 708 struct rds_incoming *inc, gfp_t gfp, enum km_type km);
@@ -636,14 +724,38 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
636typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); 724typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
637void rds_send_drop_acked(struct rds_connection *conn, u64 ack, 725void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
638 is_acked_func is_acked); 726 is_acked_func is_acked);
639int rds_send_acked_before(struct rds_connection *conn, u64 seq);
640void rds_send_remove_from_sock(struct list_head *messages, int status);
641int rds_send_pong(struct rds_connection *conn, __be16 dport); 727int rds_send_pong(struct rds_connection *conn, __be16 dport);
642struct rds_message *rds_send_get_message(struct rds_connection *, 728struct rds_message *rds_send_get_message(struct rds_connection *,
643 struct rds_rdma_op *); 729 struct rm_rdma_op *);
644 730
645/* rdma.c */ 731/* rdma.c */
646void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); 732void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
733int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
734int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
735int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
736void rds_rdma_drop_keys(struct rds_sock *rs);
737int rds_rdma_extra_size(struct rds_rdma_args *args);
738int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
739 struct cmsghdr *cmsg);
740int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
741 struct cmsghdr *cmsg);
742int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
743 struct cmsghdr *cmsg);
744int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
745 struct cmsghdr *cmsg);
746void rds_rdma_free_op(struct rm_rdma_op *ro);
747void rds_atomic_free_op(struct rm_atomic_op *ao);
748void rds_rdma_send_complete(struct rds_message *rm, int wc_status);
749void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
750int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
751 struct cmsghdr *cmsg);
752
753extern void __rds_put_mr_final(struct rds_mr *mr);
754static inline void rds_mr_put(struct rds_mr *mr)
755{
756 if (atomic_dec_and_test(&mr->r_refcount))
757 __rds_put_mr_final(mr);
758}
647 759
648/* stats.c */ 760/* stats.c */
649DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); 761DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
@@ -657,14 +769,14 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
657 put_cpu(); \ 769 put_cpu(); \
658} while (0) 770} while (0)
659#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count) 771#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
660int __init rds_stats_init(void); 772int rds_stats_init(void);
661void rds_stats_exit(void); 773void rds_stats_exit(void);
662void rds_stats_info_copy(struct rds_info_iterator *iter, 774void rds_stats_info_copy(struct rds_info_iterator *iter,
663 uint64_t *values, const char *const *names, 775 uint64_t *values, const char *const *names,
664 size_t nr); 776 size_t nr);
665 777
666/* sysctl.c */ 778/* sysctl.c */
667int __init rds_sysctl_init(void); 779int rds_sysctl_init(void);
668void rds_sysctl_exit(void); 780void rds_sysctl_exit(void);
669extern unsigned long rds_sysctl_sndbuf_min; 781extern unsigned long rds_sysctl_sndbuf_min;
670extern unsigned long rds_sysctl_sndbuf_default; 782extern unsigned long rds_sysctl_sndbuf_default;
@@ -678,9 +790,10 @@ extern unsigned long rds_sysctl_trace_flags;
678extern unsigned int rds_sysctl_trace_level; 790extern unsigned int rds_sysctl_trace_level;
679 791
680/* threads.c */ 792/* threads.c */
681int __init rds_threads_init(void); 793int rds_threads_init(void);
682void rds_threads_exit(void); 794void rds_threads_exit(void);
683extern struct workqueue_struct *rds_wq; 795extern struct workqueue_struct *rds_wq;
796void rds_queue_reconnect(struct rds_connection *conn);
684void rds_connect_worker(struct work_struct *); 797void rds_connect_worker(struct work_struct *);
685void rds_shutdown_worker(struct work_struct *); 798void rds_shutdown_worker(struct work_struct *);
686void rds_send_worker(struct work_struct *); 799void rds_send_worker(struct work_struct *);
@@ -691,9 +804,10 @@ void rds_connect_complete(struct rds_connection *conn);
691int rds_trans_register(struct rds_transport *trans); 804int rds_trans_register(struct rds_transport *trans);
692void rds_trans_unregister(struct rds_transport *trans); 805void rds_trans_unregister(struct rds_transport *trans);
693struct rds_transport *rds_trans_get_preferred(__be32 addr); 806struct rds_transport *rds_trans_get_preferred(__be32 addr);
807void rds_trans_put(struct rds_transport *trans);
694unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, 808unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
695 unsigned int avail); 809 unsigned int avail);
696int __init rds_trans_init(void); 810int rds_trans_init(void);
697void rds_trans_exit(void); 811void rds_trans_exit(void);
698 812
699#endif 813#endif
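Editorial note: with struct rds_rdma_op gone, a single rds_message now embeds up to three sub-operations (atomic, rdma, data), each gated by an op_active bit, and the scatterlists are carved out of the message by rds_message_alloc_sgs() instead of a trailing m_sg[0] array. A minimal sketch of checking what a message carries; the helper name is illustrative and not part of the patch.

	/* Illustrative helper only. */
	static inline int rds_message_has_payload(const struct rds_message *rm)
	{
		return rm->atomic.op_active || rm->rdma.op_active ||
		       rm->data.op_active;
	}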
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 795a00b7f2cb..596689e59272 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -36,7 +36,6 @@
36#include <linux/in.h> 36#include <linux/in.h>
37 37
38#include "rds.h" 38#include "rds.h"
39#include "rdma.h"
40 39
41void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, 40void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
42 __be32 saddr) 41 __be32 saddr)
@@ -49,12 +48,11 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
49} 48}
50EXPORT_SYMBOL_GPL(rds_inc_init); 49EXPORT_SYMBOL_GPL(rds_inc_init);
51 50
52void rds_inc_addref(struct rds_incoming *inc) 51static void rds_inc_addref(struct rds_incoming *inc)
53{ 52{
54 rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount)); 53 rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
55 atomic_inc(&inc->i_refcount); 54 atomic_inc(&inc->i_refcount);
56} 55}
57EXPORT_SYMBOL_GPL(rds_inc_addref);
58 56
59void rds_inc_put(struct rds_incoming *inc) 57void rds_inc_put(struct rds_incoming *inc)
60{ 58{
@@ -210,7 +208,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
210 } 208 }
211 209
212 rs = rds_find_bound(daddr, inc->i_hdr.h_dport); 210 rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
213 if (rs == NULL) { 211 if (!rs) {
214 rds_stats_inc(s_recv_drop_no_sock); 212 rds_stats_inc(s_recv_drop_no_sock);
215 goto out; 213 goto out;
216 } 214 }
@@ -251,7 +249,7 @@ static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
251{ 249{
252 unsigned long flags; 250 unsigned long flags;
253 251
254 if (*inc == NULL) { 252 if (!*inc) {
255 read_lock_irqsave(&rs->rs_recv_lock, flags); 253 read_lock_irqsave(&rs->rs_recv_lock, flags);
256 if (!list_empty(&rs->rs_recv_queue)) { 254 if (!list_empty(&rs->rs_recv_queue)) {
257 *inc = list_entry(rs->rs_recv_queue.next, 255 *inc = list_entry(rs->rs_recv_queue.next,
@@ -297,7 +295,7 @@ static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
297int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) 295int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
298{ 296{
299 struct rds_notifier *notifier; 297 struct rds_notifier *notifier;
300 struct rds_rdma_notify cmsg; 298 struct rds_rdma_notify cmsg = { 0 }; /* fill holes with zero */
301 unsigned int count = 0, max_messages = ~0U; 299 unsigned int count = 0, max_messages = ~0U;
302 unsigned long flags; 300 unsigned long flags;
303 LIST_HEAD(copy); 301 LIST_HEAD(copy);
@@ -334,10 +332,10 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
334 332
335 if (msghdr) { 333 if (msghdr) {
336 cmsg.user_token = notifier->n_user_token; 334 cmsg.user_token = notifier->n_user_token;
337 cmsg.status = notifier->n_status; 335 cmsg.status = notifier->n_status;
338 336
339 err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS, 337 err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
340 sizeof(cmsg), &cmsg); 338 sizeof(cmsg), &cmsg);
341 if (err) 339 if (err)
342 break; 340 break;
343 } 341 }
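Editorial note: the "= { 0 }" initializer added to rds_notify_queue_get() zeroes struct rds_rdma_notify before it is copied to userspace via put_cmsg(); without it, padding holes in the on-stack struct could leak kernel stack bytes. A sketch of the failure mode, with an assumed layout (the real struct lives in the RDS uapi headers):

	/* Layout assumed for illustration only. */
	struct example_notify {
		u64 user_token;
		s32 status;	/* tail padding likely follows */
	};

	/* Inside some function, i.e. on the stack: */
	struct example_notify bad;		/* fields and padding undefined */
	struct example_notify good = { 0 };	/* members zeroed; per the patch
						 * comment, the holes are filled
						 * with zero as well */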
diff --git a/net/rds/send.c b/net/rds/send.c
index 9c1c6bcaa6c9..35b9c2e9caf1 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -37,7 +37,6 @@
37#include <linux/list.h> 37#include <linux/list.h>
38 38
39#include "rds.h" 39#include "rds.h"
40#include "rdma.h"
41 40
42/* When transmitting messages in rds_send_xmit, we need to emerge from 41/* When transmitting messages in rds_send_xmit, we need to emerge from
43 * time to time and briefly release the CPU. Otherwise the softlock watchdog 42 * time to time and briefly release the CPU. Otherwise the softlock watchdog
@@ -53,8 +52,11 @@ static int send_batch_count = 64;
53module_param(send_batch_count, int, 0444); 52module_param(send_batch_count, int, 0444);
54MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); 53MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
55 54
55static void rds_send_remove_from_sock(struct list_head *messages, int status);
56
56/* 57/*
57 * Reset the send state. Caller must hold c_send_lock when calling here. 58 * Reset the send state. Callers must ensure that this doesn't race with
59 * rds_send_xmit().
58 */ 60 */
59void rds_send_reset(struct rds_connection *conn) 61void rds_send_reset(struct rds_connection *conn)
60{ 62{
@@ -62,18 +64,22 @@ void rds_send_reset(struct rds_connection *conn)
62 unsigned long flags; 64 unsigned long flags;
63 65
64 if (conn->c_xmit_rm) { 66 if (conn->c_xmit_rm) {
67 rm = conn->c_xmit_rm;
68 conn->c_xmit_rm = NULL;
65 /* Tell the user the RDMA op is no longer mapped by the 69 /* Tell the user the RDMA op is no longer mapped by the
66 * transport. This isn't entirely true (it's flushed out 70 * transport. This isn't entirely true (it's flushed out
67 * independently) but as the connection is down, there's 71 * independently) but as the connection is down, there's
68 * no ongoing RDMA to/from that memory */ 72 * no ongoing RDMA to/from that memory */
69 rds_message_unmapped(conn->c_xmit_rm); 73 rds_message_unmapped(rm);
70 rds_message_put(conn->c_xmit_rm); 74 rds_message_put(rm);
71 conn->c_xmit_rm = NULL;
72 } 75 }
76
73 conn->c_xmit_sg = 0; 77 conn->c_xmit_sg = 0;
74 conn->c_xmit_hdr_off = 0; 78 conn->c_xmit_hdr_off = 0;
75 conn->c_xmit_data_off = 0; 79 conn->c_xmit_data_off = 0;
80 conn->c_xmit_atomic_sent = 0;
76 conn->c_xmit_rdma_sent = 0; 81 conn->c_xmit_rdma_sent = 0;
82 conn->c_xmit_data_sent = 0;
77 83
78 conn->c_map_queued = 0; 84 conn->c_map_queued = 0;
79 85
@@ -90,6 +96,25 @@ void rds_send_reset(struct rds_connection *conn)
90 spin_unlock_irqrestore(&conn->c_lock, flags); 96 spin_unlock_irqrestore(&conn->c_lock, flags);
91} 97}
92 98
99static int acquire_in_xmit(struct rds_connection *conn)
100{
101 return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0;
102}
103
104static void release_in_xmit(struct rds_connection *conn)
105{
106 clear_bit(RDS_IN_XMIT, &conn->c_flags);
107 smp_mb__after_clear_bit();
108 /*
109 * We don't use wait_on_bit()/wake_up_bit() because our waking is in a
110 * hot path and finding waiters is very rare. We don't want to walk
111 * the system-wide hashed waitqueue buckets in the fast path only to
112 * almost never find waiters.
113 */
114 if (waitqueue_active(&conn->c_waitq))
115 wake_up_all(&conn->c_waitq);
116}
117
93/* 118/*
94 * We're making the conscious trade-off here to only send one message 119
95 * down the connection at a time. 120 * down the connection at a time.
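Editorial note: the c_send_lock mutex is replaced by the RDS_IN_XMIT flag bit; rds_send_xmit() serializes on test_and_set_bit() and wakes c_waitq when it clears the bit. A waiter such as rds_conn_shutdown() (not shown in this hunk) would presumably block along these lines -- a sketch, not the actual shutdown code:

	/* Sketch of a waiter on the xmit bit. */
	wait_event(conn->c_waitq,
		   !test_bit(RDS_IN_XMIT, &conn->c_flags));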
@@ -109,102 +134,69 @@ int rds_send_xmit(struct rds_connection *conn)
109 struct rds_message *rm; 134 struct rds_message *rm;
110 unsigned long flags; 135 unsigned long flags;
111 unsigned int tmp; 136 unsigned int tmp;
112 unsigned int send_quota = send_batch_count;
113 struct scatterlist *sg; 137 struct scatterlist *sg;
114 int ret = 0; 138 int ret = 0;
115 int was_empty = 0;
116 LIST_HEAD(to_be_dropped); 139 LIST_HEAD(to_be_dropped);
117 140
141restart:
142
118 /* 143 /*
119 * sendmsg calls here after having queued its message on the send 144 * sendmsg calls here after having queued its message on the send
120 * queue. We only have one task feeding the connection at a time. If 145 * queue. We only have one task feeding the connection at a time. If
121 * another thread is already feeding the queue then we back off. This 146 * another thread is already feeding the queue then we back off. This
122 * avoids blocking the caller and trading per-connection data between 147 * avoids blocking the caller and trading per-connection data between
123 * caches per message. 148 * caches per message.
124 *
125 * The sem holder will issue a retry if they notice that someone queued
126 * a message after they stopped walking the send queue but before they
127 * dropped the sem.
128 */ 149 */
129 if (!mutex_trylock(&conn->c_send_lock)) { 150 if (!acquire_in_xmit(conn)) {
130 rds_stats_inc(s_send_sem_contention); 151 rds_stats_inc(s_send_lock_contention);
131 ret = -ENOMEM; 152 ret = -ENOMEM;
132 goto out; 153 goto out;
133 } 154 }
134 155
156 /*
157 * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT,
158 * we do the opposite to avoid races.
159 */
160 if (!rds_conn_up(conn)) {
161 release_in_xmit(conn);
162 ret = 0;
163 goto out;
164 }
165
135 if (conn->c_trans->xmit_prepare) 166 if (conn->c_trans->xmit_prepare)
136 conn->c_trans->xmit_prepare(conn); 167 conn->c_trans->xmit_prepare(conn);
137 168
138 /* 169 /*
139 * spin trying to push headers and data down the connection until 170 * spin trying to push headers and data down the connection until
140 * the connection doens't make forward progress. 171 * the connection doesn't make forward progress.
141 */ 172 */
142 while (--send_quota) { 173 while (1) {
143 /*
144 * See if need to send a congestion map update if we're
145 * between sending messages. The send_sem protects our sole
146 * use of c_map_offset and _bytes.
147 * Note this is used only by transports that define a special
148 * xmit_cong_map function. For all others, we create allocate
149 * a cong_map message and treat it just like any other send.
150 */
151 if (conn->c_map_bytes) {
152 ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
153 conn->c_map_offset);
154 if (ret <= 0)
155 break;
156 174
157 conn->c_map_offset += ret;
158 conn->c_map_bytes -= ret;
159 if (conn->c_map_bytes)
160 continue;
161 }
162
163 /* If we're done sending the current message, clear the
164 * offset and S/G temporaries.
165 */
166 rm = conn->c_xmit_rm; 175 rm = conn->c_xmit_rm;
167 if (rm != NULL &&
168 conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
169 conn->c_xmit_sg == rm->m_nents) {
170 conn->c_xmit_rm = NULL;
171 conn->c_xmit_sg = 0;
172 conn->c_xmit_hdr_off = 0;
173 conn->c_xmit_data_off = 0;
174 conn->c_xmit_rdma_sent = 0;
175
176 /* Release the reference to the previous message. */
177 rds_message_put(rm);
178 rm = NULL;
179 }
180 176
181 /* If we're asked to send a cong map update, do so. 177 /*
178 * If between sending messages, we can send a pending congestion
179 * map update.
182 */ 180 */
183 if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) { 181 if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
184 if (conn->c_trans->xmit_cong_map != NULL) {
185 conn->c_map_offset = 0;
186 conn->c_map_bytes = sizeof(struct rds_header) +
187 RDS_CONG_MAP_BYTES;
188 continue;
189 }
190
191 rm = rds_cong_update_alloc(conn); 182 rm = rds_cong_update_alloc(conn);
192 if (IS_ERR(rm)) { 183 if (IS_ERR(rm)) {
193 ret = PTR_ERR(rm); 184 ret = PTR_ERR(rm);
194 break; 185 break;
195 } 186 }
187 rm->data.op_active = 1;
196 188
197 conn->c_xmit_rm = rm; 189 conn->c_xmit_rm = rm;
198 } 190 }
199 191
200 /* 192 /*
201 * Grab the next message from the send queue, if there is one. 193 * If not already working on one, grab the next message.
202 * 194 *
203 * c_xmit_rm holds a ref while we're sending this message down 195 * c_xmit_rm holds a ref while we're sending this message down
204 * the connection. We can use this ref while holding the 196
205 * send_sem.. rds_send_reset() is serialized with it. 197 * send_sem.. rds_send_reset() is serialized with it.
206 */ 198 */
207 if (rm == NULL) { 199 if (!rm) {
208 unsigned int len; 200 unsigned int len;
209 201
210 spin_lock_irqsave(&conn->c_lock, flags); 202 spin_lock_irqsave(&conn->c_lock, flags);
@@ -224,10 +216,8 @@ int rds_send_xmit(struct rds_connection *conn)
224 216
225 spin_unlock_irqrestore(&conn->c_lock, flags); 217 spin_unlock_irqrestore(&conn->c_lock, flags);
226 218
227 if (rm == NULL) { 219 if (!rm)
228 was_empty = 1;
229 break; 220 break;
230 }
231 221
232 /* Unfortunately, the way Infiniband deals with 222 /* Unfortunately, the way Infiniband deals with
233 * RDMA to a bad MR key is by moving the entire 223 * RDMA to a bad MR key is by moving the entire
@@ -236,13 +226,12 @@ int rds_send_xmit(struct rds_connection *conn)
236 * connection. 226 * connection.
237 * Therefore, we never retransmit messages with RDMA ops. 227 * Therefore, we never retransmit messages with RDMA ops.
238 */ 228 */
239 if (rm->m_rdma_op && 229 if (rm->rdma.op_active &&
240 test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { 230 test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
241 spin_lock_irqsave(&conn->c_lock, flags); 231 spin_lock_irqsave(&conn->c_lock, flags);
242 if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) 232 if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
243 list_move(&rm->m_conn_item, &to_be_dropped); 233 list_move(&rm->m_conn_item, &to_be_dropped);
244 spin_unlock_irqrestore(&conn->c_lock, flags); 234 spin_unlock_irqrestore(&conn->c_lock, flags);
245 rds_message_put(rm);
246 continue; 235 continue;
247 } 236 }
248 237
@@ -263,23 +252,55 @@ int rds_send_xmit(struct rds_connection *conn)
263 conn->c_xmit_rm = rm; 252 conn->c_xmit_rm = rm;
264 } 253 }
265 254
266 /* 255 /* The transport either sends the whole rdma or none of it */
267 * Try and send an rdma message. Let's see if we can 256 if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
268 * keep this simple and require that the transport either 257 rm->m_final_op = &rm->rdma;
269 * send the whole rdma or none of it. 258 ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
270 */
271 if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
272 ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
273 if (ret) 259 if (ret)
274 break; 260 break;
275 conn->c_xmit_rdma_sent = 1; 261 conn->c_xmit_rdma_sent = 1;
262
263 /* The transport owns the mapped memory for now.
264 * You can't unmap it while it's on the send queue */
265 set_bit(RDS_MSG_MAPPED, &rm->m_flags);
266 }
267
268 if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
269 rm->m_final_op = &rm->atomic;
270 ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
271 if (ret)
272 break;
273 conn->c_xmit_atomic_sent = 1;
274
276 /* The transport owns the mapped memory for now. 275 /* The transport owns the mapped memory for now.
277 * You can't unmap it while it's on the send queue */ 276 * You can't unmap it while it's on the send queue */
278 set_bit(RDS_MSG_MAPPED, &rm->m_flags); 277 set_bit(RDS_MSG_MAPPED, &rm->m_flags);
279 } 278 }
280 279
281 if (conn->c_xmit_hdr_off < sizeof(struct rds_header) || 280 /*
282 conn->c_xmit_sg < rm->m_nents) { 281 * A number of cases require an RDS header to be sent
282 * even if there is no data.
283 * We permit 0-byte sends; rds-ping depends on this.
284 * However, if there are exclusively attached silent ops,
285 * we skip the hdr/data send, to enable silent operation.
286 */
287 if (rm->data.op_nents == 0) {
288 int ops_present;
289 int all_ops_are_silent = 1;
290
291 ops_present = (rm->atomic.op_active || rm->rdma.op_active);
292 if (rm->atomic.op_active && !rm->atomic.op_silent)
293 all_ops_are_silent = 0;
294 if (rm->rdma.op_active && !rm->rdma.op_silent)
295 all_ops_are_silent = 0;
296
297 if (ops_present && all_ops_are_silent
298 && !rm->m_rdma_cookie)
299 rm->data.op_active = 0;
300 }
301
302 if (rm->data.op_active && !conn->c_xmit_data_sent) {
303 rm->m_final_op = &rm->data;
283 ret = conn->c_trans->xmit(conn, rm, 304 ret = conn->c_trans->xmit(conn, rm,
284 conn->c_xmit_hdr_off, 305 conn->c_xmit_hdr_off,
285 conn->c_xmit_sg, 306 conn->c_xmit_sg,
@@ -295,7 +316,7 @@ int rds_send_xmit(struct rds_connection *conn)
295 ret -= tmp; 316 ret -= tmp;
296 } 317 }
297 318
298 sg = &rm->m_sg[conn->c_xmit_sg]; 319 sg = &rm->data.op_sg[conn->c_xmit_sg];
299 while (ret) { 320 while (ret) {
300 tmp = min_t(int, ret, sg->length - 321 tmp = min_t(int, ret, sg->length -
301 conn->c_xmit_data_off); 322 conn->c_xmit_data_off);
@@ -306,49 +327,63 @@ int rds_send_xmit(struct rds_connection *conn)
306 sg++; 327 sg++;
307 conn->c_xmit_sg++; 328 conn->c_xmit_sg++;
308 BUG_ON(ret != 0 && 329 BUG_ON(ret != 0 &&
309 conn->c_xmit_sg == rm->m_nents); 330 conn->c_xmit_sg == rm->data.op_nents);
310 } 331 }
311 } 332 }
333
334 if (conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
335 (conn->c_xmit_sg == rm->data.op_nents))
336 conn->c_xmit_data_sent = 1;
312 } 337 }
313 }
314 338
315 /* Nuke any messages we decided not to retransmit. */ 339 /*
316 if (!list_empty(&to_be_dropped)) 340 * A rm will only take multiple times through this loop
317 rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); 341 * if there is a data op. Thus, if the data is sent (or there was
342 * none), then we're done with the rm.
343 */
344 if (!rm->data.op_active || conn->c_xmit_data_sent) {
345 conn->c_xmit_rm = NULL;
346 conn->c_xmit_sg = 0;
347 conn->c_xmit_hdr_off = 0;
348 conn->c_xmit_data_off = 0;
349 conn->c_xmit_rdma_sent = 0;
350 conn->c_xmit_atomic_sent = 0;
351 conn->c_xmit_data_sent = 0;
352
353 rds_message_put(rm);
354 }
355 }
318 356
319 if (conn->c_trans->xmit_complete) 357 if (conn->c_trans->xmit_complete)
320 conn->c_trans->xmit_complete(conn); 358 conn->c_trans->xmit_complete(conn);
321 359
322 /* 360 release_in_xmit(conn);
323 * We might be racing with another sender who queued a message but
324 * backed off on noticing that we held the c_send_lock. If we check
325 * for queued messages after dropping the sem then either we'll
326 * see the queued message or the queuer will get the sem. If we
327 * notice the queued message then we trigger an immediate retry.
328 *
329 * We need to be careful only to do this when we stopped processing
330 * the send queue because it was empty. It's the only way we
331 * stop processing the loop when the transport hasn't taken
332 * responsibility for forward progress.
333 */
334 mutex_unlock(&conn->c_send_lock);
335 361
336 if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) { 362 /* Nuke any messages we decided not to retransmit. */
337 /* We exhausted the send quota, but there's work left to 363 if (!list_empty(&to_be_dropped)) {
338 * do. Return and (re-)schedule the send worker. 364 /* irqs on here, so we can put(), unlike above */
339 */ 365 list_for_each_entry(rm, &to_be_dropped, m_conn_item)
340 ret = -EAGAIN; 366 rds_message_put(rm);
367 rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
341 } 368 }
342 369
343 if (ret == 0 && was_empty) { 370 /*
344 /* A simple bit test would be way faster than taking the 371 * Other senders can queue a message after we last test the send queue
345 * spin lock */ 372 * but before we clear RDS_IN_XMIT. In that case they'd back off and
346 spin_lock_irqsave(&conn->c_lock, flags); 373 * not try and send their newly queued message. We need to check the
374 * send queue after having cleared RDS_IN_XMIT so that their message
375 * doesn't get stuck on the send queue.
376 *
377 * If the transport cannot continue (i.e ret != 0), then it must
378 * call us when more room is available, such as from the tx
379 * completion handler.
380 */
381 if (ret == 0) {
382 smp_mb();
347 if (!list_empty(&conn->c_send_queue)) { 383 if (!list_empty(&conn->c_send_queue)) {
348 rds_stats_inc(s_send_sem_queue_raced); 384 rds_stats_inc(s_send_lock_queue_raced);
349 ret = -EAGAIN; 385 goto restart;
350 } 386 }
351 spin_unlock_irqrestore(&conn->c_lock, flags);
352 } 387 }
353out: 388out:
354 return ret; 389 return ret;
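The hunk above replaces the old c_send_lock/c_lock dance with a clear-then-recheck scheme: release_in_xmit() drops the RDS_IN_XMIT bit, smp_mb() orders that clear against the test of c_send_queue, and a non-empty queue jumps back to restart so a message queued by a racing sender is never stranded. A minimal user-space sketch of the same pattern in plain C11, using hypothetical names (fake_conn, in_xmit, queue_len) rather than the RDS symbols:

/* Clear the "I am transmitting" flag, then re-check for work that a racing
 * sender may have queued after our last look at the queue. */
#include <stdatomic.h>
#include <stdbool.h>

struct fake_conn {
	atomic_bool in_xmit;	/* stands in for the RDS_IN_XMIT bit  */
	_Atomic int queue_len;	/* stands in for conn->c_send_queue   */
};

/* Returns true when the caller must loop back and drain the queue again. */
static bool release_and_recheck(struct fake_conn *c)
{
	atomic_store(&c->in_xmit, false);		/* release_in_xmit()   */
	atomic_thread_fence(memory_order_seq_cst);	/* the smp_mb() above  */
	return atomic_load(&c->queue_len) != 0;		/* raced sender queued */
}

When the transport itself returned non-zero the re-check is skipped, because, as the comment says, the transport has taken responsibility for calling back (for instance from its tx completion handler) once there is room again.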
@@ -376,52 +411,60 @@ static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
376} 411}
377 412
378/* 413/*
379 * Returns true if there are no messages on the send and retransmit queues 414 * This is pretty similar to what happens below in the ACK
380 * which have a sequence number greater than or equal to the given sequence 415 * handling code - except that we call here as soon as we get
381 * number. 416 * the IB send completion on the RDMA op and the accompanying
417 * message.
382 */ 418 */
383int rds_send_acked_before(struct rds_connection *conn, u64 seq) 419void rds_rdma_send_complete(struct rds_message *rm, int status)
384{ 420{
385 struct rds_message *rm, *tmp; 421 struct rds_sock *rs = NULL;
386 int ret = 1; 422 struct rm_rdma_op *ro;
423 struct rds_notifier *notifier;
424 unsigned long flags;
387 425
388 spin_lock(&conn->c_lock); 426 spin_lock_irqsave(&rm->m_rs_lock, flags);
389 427
390 list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { 428 ro = &rm->rdma;
391 if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) 429 if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
392 ret = 0; 430 ro->op_active && ro->op_notify && ro->op_notifier) {
393 break; 431 notifier = ro->op_notifier;
394 } 432 rs = rm->m_rs;
433 sock_hold(rds_rs_to_sk(rs));
395 434
396 list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { 435 notifier->n_status = status;
397 if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) 436 spin_lock(&rs->rs_lock);
398 ret = 0; 437 list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
399 break; 438 spin_unlock(&rs->rs_lock);
439
440 ro->op_notifier = NULL;
400 } 441 }
401 442
402 spin_unlock(&conn->c_lock); 443 spin_unlock_irqrestore(&rm->m_rs_lock, flags);
403 444
404 return ret; 445 if (rs) {
446 rds_wake_sk_sleep(rs);
447 sock_put(rds_rs_to_sk(rs));
448 }
405} 449}
450EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
406 451
407/* 452/*
408 * This is pretty similar to what happens below in the ACK 453 * Just like above, except looks at atomic op
409 * handling code - except that we call here as soon as we get
410 * the IB send completion on the RDMA op and the accompanying
411 * message.
412 */ 454 */
413void rds_rdma_send_complete(struct rds_message *rm, int status) 455void rds_atomic_send_complete(struct rds_message *rm, int status)
414{ 456{
415 struct rds_sock *rs = NULL; 457 struct rds_sock *rs = NULL;
416 struct rds_rdma_op *ro; 458 struct rm_atomic_op *ao;
417 struct rds_notifier *notifier; 459 struct rds_notifier *notifier;
460 unsigned long flags;
418 461
419 spin_lock(&rm->m_rs_lock); 462 spin_lock_irqsave(&rm->m_rs_lock, flags);
420 463
421 ro = rm->m_rdma_op; 464 ao = &rm->atomic;
422 if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && 465 if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
423 ro && ro->r_notify && ro->r_notifier) { 466 && ao->op_active && ao->op_notify && ao->op_notifier) {
424 notifier = ro->r_notifier; 467 notifier = ao->op_notifier;
425 rs = rm->m_rs; 468 rs = rm->m_rs;
426 sock_hold(rds_rs_to_sk(rs)); 469 sock_hold(rds_rs_to_sk(rs));
427 470
@@ -430,17 +473,17 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
430 list_add_tail(&notifier->n_list, &rs->rs_notify_queue); 473 list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
431 spin_unlock(&rs->rs_lock); 474 spin_unlock(&rs->rs_lock);
432 475
433 ro->r_notifier = NULL; 476 ao->op_notifier = NULL;
434 } 477 }
435 478
436 spin_unlock(&rm->m_rs_lock); 479 spin_unlock_irqrestore(&rm->m_rs_lock, flags);
437 480
438 if (rs) { 481 if (rs) {
439 rds_wake_sk_sleep(rs); 482 rds_wake_sk_sleep(rs);
440 sock_put(rds_rs_to_sk(rs)); 483 sock_put(rds_rs_to_sk(rs));
441 } 484 }
442} 485}
443EXPORT_SYMBOL_GPL(rds_rdma_send_complete); 486EXPORT_SYMBOL_GPL(rds_atomic_send_complete);
444 487
445/* 488/*
446 * This is the same as rds_rdma_send_complete except we 489 * This is the same as rds_rdma_send_complete except we
@@ -448,15 +491,23 @@ EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
448 * socket, socket lock) and can just move the notifier. 491 * socket, socket lock) and can just move the notifier.
449 */ 492 */
450static inline void 493static inline void
451__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) 494__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
452{ 495{
453 struct rds_rdma_op *ro; 496 struct rm_rdma_op *ro;
497 struct rm_atomic_op *ao;
498
499 ro = &rm->rdma;
500 if (ro->op_active && ro->op_notify && ro->op_notifier) {
501 ro->op_notifier->n_status = status;
502 list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue);
503 ro->op_notifier = NULL;
504 }
454 505
455 ro = rm->m_rdma_op; 506 ao = &rm->atomic;
456 if (ro && ro->r_notify && ro->r_notifier) { 507 if (ao->op_active && ao->op_notify && ao->op_notifier) {
457 ro->r_notifier->n_status = status; 508 ao->op_notifier->n_status = status;
458 list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue); 509 list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue);
459 ro->r_notifier = NULL; 510 ao->op_notifier = NULL;
460 } 511 }
461 512
462 /* No need to wake the app - caller does this */ 513 /* No need to wake the app - caller does this */
@@ -468,7 +519,7 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status
468 * So speed is not an issue here. 519 * So speed is not an issue here.
469 */ 520 */
470struct rds_message *rds_send_get_message(struct rds_connection *conn, 521struct rds_message *rds_send_get_message(struct rds_connection *conn,
471 struct rds_rdma_op *op) 522 struct rm_rdma_op *op)
472{ 523{
473 struct rds_message *rm, *tmp, *found = NULL; 524 struct rds_message *rm, *tmp, *found = NULL;
474 unsigned long flags; 525 unsigned long flags;
@@ -476,7 +527,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
476 spin_lock_irqsave(&conn->c_lock, flags); 527 spin_lock_irqsave(&conn->c_lock, flags);
477 528
478 list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { 529 list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
479 if (rm->m_rdma_op == op) { 530 if (&rm->rdma == op) {
480 atomic_inc(&rm->m_refcount); 531 atomic_inc(&rm->m_refcount);
481 found = rm; 532 found = rm;
482 goto out; 533 goto out;
@@ -484,7 +535,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
484 } 535 }
485 536
486 list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { 537 list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
487 if (rm->m_rdma_op == op) { 538 if (&rm->rdma == op) {
488 atomic_inc(&rm->m_refcount); 539 atomic_inc(&rm->m_refcount);
489 found = rm; 540 found = rm;
490 break; 541 break;
@@ -506,7 +557,7 @@ EXPORT_SYMBOL_GPL(rds_send_get_message);
506 * removing the messages from the 'messages' list regardless of if it found 557 * removing the messages from the 'messages' list regardless of if it found
507 * the messages on the socket list or not. 558 * the messages on the socket list or not.
508 */ 559 */
509void rds_send_remove_from_sock(struct list_head *messages, int status) 560static void rds_send_remove_from_sock(struct list_head *messages, int status)
510{ 561{
511 unsigned long flags; 562 unsigned long flags;
512 struct rds_sock *rs = NULL; 563 struct rds_sock *rs = NULL;
@@ -544,19 +595,20 @@ void rds_send_remove_from_sock(struct list_head *messages, int status)
544 spin_lock(&rs->rs_lock); 595 spin_lock(&rs->rs_lock);
545 596
546 if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { 597 if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
547 struct rds_rdma_op *ro = rm->m_rdma_op; 598 struct rm_rdma_op *ro = &rm->rdma;
548 struct rds_notifier *notifier; 599 struct rds_notifier *notifier;
549 600
550 list_del_init(&rm->m_sock_item); 601 list_del_init(&rm->m_sock_item);
551 rds_send_sndbuf_remove(rs, rm); 602 rds_send_sndbuf_remove(rs, rm);
552 603
553 if (ro && ro->r_notifier && (status || ro->r_notify)) { 604 if (ro->op_active && ro->op_notifier &&
554 notifier = ro->r_notifier; 605 (ro->op_notify || (ro->op_recverr && status))) {
606 notifier = ro->op_notifier;
555 list_add_tail(&notifier->n_list, 607 list_add_tail(&notifier->n_list,
556 &rs->rs_notify_queue); 608 &rs->rs_notify_queue);
557 if (!notifier->n_status) 609 if (!notifier->n_status)
558 notifier->n_status = status; 610 notifier->n_status = status;
559 rm->m_rdma_op->r_notifier = NULL; 611 rm->rdma.op_notifier = NULL;
560 } 612 }
561 was_on_sock = 1; 613 was_on_sock = 1;
562 rm->m_rs = NULL; 614 rm->m_rs = NULL;
@@ -619,9 +671,8 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
619{ 671{
620 struct rds_message *rm, *tmp; 672 struct rds_message *rm, *tmp;
621 struct rds_connection *conn; 673 struct rds_connection *conn;
622 unsigned long flags, flags2; 674 unsigned long flags;
623 LIST_HEAD(list); 675 LIST_HEAD(list);
624 int wake = 0;
625 676
626 /* get all the messages we're dropping under the rs lock */ 677 /* get all the messages we're dropping under the rs lock */
627 spin_lock_irqsave(&rs->rs_lock, flags); 678 spin_lock_irqsave(&rs->rs_lock, flags);
@@ -631,59 +682,54 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
631 dest->sin_port != rm->m_inc.i_hdr.h_dport)) 682 dest->sin_port != rm->m_inc.i_hdr.h_dport))
632 continue; 683 continue;
633 684
634 wake = 1;
635 list_move(&rm->m_sock_item, &list); 685 list_move(&rm->m_sock_item, &list);
636 rds_send_sndbuf_remove(rs, rm); 686 rds_send_sndbuf_remove(rs, rm);
637 clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags); 687 clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
638 } 688 }
639 689
640 /* order flag updates with the rs lock */ 690 /* order flag updates with the rs lock */
641 if (wake) 691 smp_mb__after_clear_bit();
642 smp_mb__after_clear_bit();
643 692
644 spin_unlock_irqrestore(&rs->rs_lock, flags); 693 spin_unlock_irqrestore(&rs->rs_lock, flags);
645 694
646 conn = NULL; 695 if (list_empty(&list))
696 return;
647 697
648 /* now remove the messages from the conn list as needed */ 698 /* Remove the messages from the conn */
649 list_for_each_entry(rm, &list, m_sock_item) { 699 list_for_each_entry(rm, &list, m_sock_item) {
650 /* We do this here rather than in the loop above, so that
651 * we don't have to nest m_rs_lock under rs->rs_lock */
652 spin_lock_irqsave(&rm->m_rs_lock, flags2);
653 /* If this is a RDMA operation, notify the app. */
654 spin_lock(&rs->rs_lock);
655 __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
656 spin_unlock(&rs->rs_lock);
657 rm->m_rs = NULL;
658 spin_unlock_irqrestore(&rm->m_rs_lock, flags2);
659 700
701 conn = rm->m_inc.i_conn;
702
703 spin_lock_irqsave(&conn->c_lock, flags);
660 /* 704 /*
661 * If we see this flag cleared then we're *sure* that someone 705 * Maybe someone else beat us to removing rm from the conn.
662 * else beat us to removing it from the conn. If we race 706 * If we race with their flag update we'll get the lock and
663 * with their flag update we'll get the lock and then really 707 * then really see that the flag has been cleared.
664 * see that the flag has been cleared.
665 */ 708 */
666 if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags)) 709 if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
710 spin_unlock_irqrestore(&conn->c_lock, flags);
667 continue; 711 continue;
668
669 if (conn != rm->m_inc.i_conn) {
670 if (conn)
671 spin_unlock_irqrestore(&conn->c_lock, flags);
672 conn = rm->m_inc.i_conn;
673 spin_lock_irqsave(&conn->c_lock, flags);
674 } 712 }
713 list_del_init(&rm->m_conn_item);
714 spin_unlock_irqrestore(&conn->c_lock, flags);
675 715
676 if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { 716 /*
677 list_del_init(&rm->m_conn_item); 717 * Couldn't grab m_rs_lock in top loop (lock ordering),
678 rds_message_put(rm); 718 * but we can now.
679 } 719 */
680 } 720 spin_lock_irqsave(&rm->m_rs_lock, flags);
681 721
682 if (conn) 722 spin_lock(&rs->rs_lock);
683 spin_unlock_irqrestore(&conn->c_lock, flags); 723 __rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
724 spin_unlock(&rs->rs_lock);
684 725
685 if (wake) 726 rm->m_rs = NULL;
686 rds_wake_sk_sleep(rs); 727 spin_unlock_irqrestore(&rm->m_rs_lock, flags);
728
729 rds_message_put(rm);
730 }
731
732 rds_wake_sk_sleep(rs);
687 733
688 while (!list_empty(&list)) { 734 while (!list_empty(&list)) {
689 rm = list_entry(list.next, struct rds_message, m_sock_item); 735 rm = list_entry(list.next, struct rds_message, m_sock_item);
@@ -763,6 +809,63 @@ out:
763 return *queued; 809 return *queued;
764} 810}
765 811
812/*
813 * rds_message is getting to be quite complicated, and we'd like to allocate
814 * it all in one go. This figures out how big it needs to be up front.
815 */
816static int rds_rm_size(struct msghdr *msg, int data_len)
817{
818 struct cmsghdr *cmsg;
819 int size = 0;
820 int cmsg_groups = 0;
821 int retval;
822
823 for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
824 if (!CMSG_OK(msg, cmsg))
825 return -EINVAL;
826
827 if (cmsg->cmsg_level != SOL_RDS)
828 continue;
829
830 switch (cmsg->cmsg_type) {
831 case RDS_CMSG_RDMA_ARGS:
832 cmsg_groups |= 1;
833 retval = rds_rdma_extra_size(CMSG_DATA(cmsg));
834 if (retval < 0)
835 return retval;
836 size += retval;
837
838 break;
839
840 case RDS_CMSG_RDMA_DEST:
841 case RDS_CMSG_RDMA_MAP:
842 cmsg_groups |= 2;
843 /* these are valid but do not add any size */
844 break;
845
846 case RDS_CMSG_ATOMIC_CSWP:
847 case RDS_CMSG_ATOMIC_FADD:
848 case RDS_CMSG_MASKED_ATOMIC_CSWP:
849 case RDS_CMSG_MASKED_ATOMIC_FADD:
850 cmsg_groups |= 1;
851 size += sizeof(struct scatterlist);
852 break;
853
854 default:
855 return -EINVAL;
856 }
857
858 }
859
860 size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
861
862 /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
863 if (cmsg_groups == 3)
864 return -EINVAL;
865
866 return size;
867}
868
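rds_rm_size() lets the whole message be allocated in one go: RDMA args contribute whatever rds_rdma_extra_size() reports, each atomic cmsg reserves one scatterlist entry, and the payload reserves one entry per page. As a worked example (assuming 4 KiB pages), a 9000-byte payload sent together with one RDS_CMSG_ATOMIC_FADD needs ceil(9000 / 4096) = 3 data entries plus 1 atomic entry, i.e. 4 * sizeof(struct scatterlist) on top of the base rds_message:

/* Hypothetical check mirroring rds_rm_size()'s arithmetic (4 KiB pages assumed). */
#include <assert.h>
#define SG_CEIL(len, page)	(((len) + (page) - 1) / (page))

static void rm_size_example(void)
{
	assert(SG_CEIL(9000, 4096) == 3);	/* payload spans three pages */
	/* + 1 scatterlist entry for the atomic op => 4 entries in total */
}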
766static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, 869static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
767 struct msghdr *msg, int *allocated_mr) 870 struct msghdr *msg, int *allocated_mr)
768{ 871{
@@ -777,7 +880,7 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
777 continue; 880 continue;
778 881
779 /* As a side effect, RDMA_DEST and RDMA_MAP will set 882 /* As a side effect, RDMA_DEST and RDMA_MAP will set
780 * rm->m_rdma_cookie and rm->m_rdma_mr. 883 * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr.
781 */ 884 */
782 switch (cmsg->cmsg_type) { 885 switch (cmsg->cmsg_type) {
783 case RDS_CMSG_RDMA_ARGS: 886 case RDS_CMSG_RDMA_ARGS:
@@ -793,6 +896,12 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
793 if (!ret) 896 if (!ret)
794 *allocated_mr = 1; 897 *allocated_mr = 1;
795 break; 898 break;
899 case RDS_CMSG_ATOMIC_CSWP:
900 case RDS_CMSG_ATOMIC_FADD:
901 case RDS_CMSG_MASKED_ATOMIC_CSWP:
902 case RDS_CMSG_MASKED_ATOMIC_FADD:
903 ret = rds_cmsg_atomic(rs, rm, cmsg);
904 break;
796 905
797 default: 906 default:
798 return -EINVAL; 907 return -EINVAL;
@@ -850,13 +959,30 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
850 goto out; 959 goto out;
851 } 960 }
852 961
853 rm = rds_message_copy_from_user(msg->msg_iov, payload_len); 962 /* size of rm including all sgs */
854 if (IS_ERR(rm)) { 963 ret = rds_rm_size(msg, payload_len);
855 ret = PTR_ERR(rm); 964 if (ret < 0)
856 rm = NULL; 965 goto out;
966
967 rm = rds_message_alloc(ret, GFP_KERNEL);
968 if (!rm) {
969 ret = -ENOMEM;
857 goto out; 970 goto out;
858 } 971 }
859 972
973 /* Attach data to the rm */
974 if (payload_len) {
975 rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
976 if (!rm->data.op_sg) {
977 ret = -ENOMEM;
978 goto out;
979 }
980 ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len);
981 if (ret)
982 goto out;
983 }
984 rm->data.op_active = 1;
985
860 rm->m_daddr = daddr; 986 rm->m_daddr = daddr;
861 987
862 /* rds_conn_create has a spinlock that runs with IRQ off. 988 /* rds_conn_create has a spinlock that runs with IRQ off.
@@ -879,22 +1005,23 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
879 if (ret) 1005 if (ret)
880 goto out; 1006 goto out;
881 1007
882 if ((rm->m_rdma_cookie || rm->m_rdma_op) && 1008 if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
883 conn->c_trans->xmit_rdma == NULL) {
884 if (printk_ratelimit()) 1009 if (printk_ratelimit())
885 printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", 1010 printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
886 rm->m_rdma_op, conn->c_trans->xmit_rdma); 1011 &rm->rdma, conn->c_trans->xmit_rdma);
887 ret = -EOPNOTSUPP; 1012 ret = -EOPNOTSUPP;
888 goto out; 1013 goto out;
889 } 1014 }
890 1015
891 /* If the connection is down, trigger a connect. We may 1016 if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) {
892 * have scheduled a delayed reconnect however - in this case 1017 if (printk_ratelimit())
893 * we should not interfere. 1018 printk(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n",
894 */ 1019 &rm->atomic, conn->c_trans->xmit_atomic);
895 if (rds_conn_state(conn) == RDS_CONN_DOWN && 1020 ret = -EOPNOTSUPP;
896 !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) 1021 goto out;
897 queue_delayed_work(rds_wq, &conn->c_conn_w, 0); 1022 }
1023
1024 rds_conn_connect_if_down(conn);
898 1025
899 ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); 1026 ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
900 if (ret) { 1027 if (ret) {
@@ -938,7 +1065,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
938 rds_stats_inc(s_send_queued); 1065 rds_stats_inc(s_send_queued);
939 1066
940 if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) 1067 if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
941 rds_send_worker(&conn->c_send_w.work); 1068 rds_send_xmit(conn);
942 1069
943 rds_message_put(rm); 1070 rds_message_put(rm);
944 return payload_len; 1071 return payload_len;
@@ -966,20 +1093,15 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
966 int ret = 0; 1093 int ret = 0;
967 1094
968 rm = rds_message_alloc(0, GFP_ATOMIC); 1095 rm = rds_message_alloc(0, GFP_ATOMIC);
969 if (rm == NULL) { 1096 if (!rm) {
970 ret = -ENOMEM; 1097 ret = -ENOMEM;
971 goto out; 1098 goto out;
972 } 1099 }
973 1100
974 rm->m_daddr = conn->c_faddr; 1101 rm->m_daddr = conn->c_faddr;
1102 rm->data.op_active = 1;
975 1103
976 /* If the connection is down, trigger a connect. We may 1104 rds_conn_connect_if_down(conn);
977 * have scheduled a delayed reconnect however - in this case
978 * we should not interfere.
979 */
980 if (rds_conn_state(conn) == RDS_CONN_DOWN &&
981 !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
982 queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
983 1105
984 ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL); 1106 ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
985 if (ret) 1107 if (ret)
@@ -999,7 +1121,9 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
999 rds_stats_inc(s_send_queued); 1121 rds_stats_inc(s_send_queued);
1000 rds_stats_inc(s_send_pong); 1122 rds_stats_inc(s_send_pong);
1001 1123
1002 queue_delayed_work(rds_wq, &conn->c_send_w, 0); 1124 if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
1125 rds_send_xmit(conn);
1126
1003 rds_message_put(rm); 1127 rds_message_put(rm);
1004 return 0; 1128 return 0;
1005 1129
diff --git a/net/rds/stats.c b/net/rds/stats.c
index 7598eb07cfb1..10c759ccac0c 100644
--- a/net/rds/stats.c
+++ b/net/rds/stats.c
@@ -57,8 +57,8 @@ static const char *const rds_stat_names[] = {
57 "recv_ping", 57 "recv_ping",
58 "send_queue_empty", 58 "send_queue_empty",
59 "send_queue_full", 59 "send_queue_full",
60 "send_sem_contention", 60 "send_lock_contention",
61 "send_sem_queue_raced", 61 "send_lock_queue_raced",
62 "send_immediate_retry", 62 "send_immediate_retry",
63 "send_delayed_retry", 63 "send_delayed_retry",
64 "send_drop_acked", 64 "send_drop_acked",
@@ -143,7 +143,7 @@ void rds_stats_exit(void)
143 rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info); 143 rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
144} 144}
145 145
146int __init rds_stats_init(void) 146int rds_stats_init(void)
147{ 147{
148 rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info); 148 rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
149 return 0; 149 return 0;
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
index 7829a20325d3..25ad0c77a26c 100644
--- a/net/rds/sysctl.c
+++ b/net/rds/sysctl.c
@@ -105,13 +105,13 @@ void rds_sysctl_exit(void)
105 unregister_sysctl_table(rds_sysctl_reg_table); 105 unregister_sysctl_table(rds_sysctl_reg_table);
106} 106}
107 107
108int __init rds_sysctl_init(void) 108int rds_sysctl_init(void)
109{ 109{
110 rds_sysctl_reconnect_min = msecs_to_jiffies(1); 110 rds_sysctl_reconnect_min = msecs_to_jiffies(1);
111 rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min; 111 rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
112 112
113 rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table); 113 rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table);
114 if (rds_sysctl_reg_table == NULL) 114 if (!rds_sysctl_reg_table)
115 return -ENOMEM; 115 return -ENOMEM;
116 return 0; 116 return 0;
117} 117}
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index babf4577ff7d..08a8c6cf2d10 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -41,7 +41,7 @@
41/* only for info exporting */ 41/* only for info exporting */
42static DEFINE_SPINLOCK(rds_tcp_tc_list_lock); 42static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
43static LIST_HEAD(rds_tcp_tc_list); 43static LIST_HEAD(rds_tcp_tc_list);
44unsigned int rds_tcp_tc_count; 44static unsigned int rds_tcp_tc_count;
45 45
46/* Track rds_tcp_connection structs so they can be cleaned up */ 46/* Track rds_tcp_connection structs so they can be cleaned up */
47static DEFINE_SPINLOCK(rds_tcp_conn_lock); 47static DEFINE_SPINLOCK(rds_tcp_conn_lock);
@@ -200,7 +200,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
200 struct rds_tcp_connection *tc; 200 struct rds_tcp_connection *tc;
201 201
202 tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); 202 tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
203 if (tc == NULL) 203 if (!tc)
204 return -ENOMEM; 204 return -ENOMEM;
205 205
206 tc->t_sock = NULL; 206 tc->t_sock = NULL;
@@ -243,7 +243,7 @@ static void rds_tcp_destroy_conns(void)
243 } 243 }
244} 244}
245 245
246void rds_tcp_exit(void) 246static void rds_tcp_exit(void)
247{ 247{
248 rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); 248 rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
249 rds_tcp_listen_stop(); 249 rds_tcp_listen_stop();
@@ -258,7 +258,6 @@ struct rds_transport rds_tcp_transport = {
258 .laddr_check = rds_tcp_laddr_check, 258 .laddr_check = rds_tcp_laddr_check,
259 .xmit_prepare = rds_tcp_xmit_prepare, 259 .xmit_prepare = rds_tcp_xmit_prepare,
260 .xmit_complete = rds_tcp_xmit_complete, 260 .xmit_complete = rds_tcp_xmit_complete,
261 .xmit_cong_map = rds_tcp_xmit_cong_map,
262 .xmit = rds_tcp_xmit, 261 .xmit = rds_tcp_xmit,
263 .recv = rds_tcp_recv, 262 .recv = rds_tcp_recv,
264 .conn_alloc = rds_tcp_conn_alloc, 263 .conn_alloc = rds_tcp_conn_alloc,
@@ -266,7 +265,6 @@ struct rds_transport rds_tcp_transport = {
266 .conn_connect = rds_tcp_conn_connect, 265 .conn_connect = rds_tcp_conn_connect,
267 .conn_shutdown = rds_tcp_conn_shutdown, 266 .conn_shutdown = rds_tcp_conn_shutdown,
268 .inc_copy_to_user = rds_tcp_inc_copy_to_user, 267 .inc_copy_to_user = rds_tcp_inc_copy_to_user,
269 .inc_purge = rds_tcp_inc_purge,
270 .inc_free = rds_tcp_inc_free, 268 .inc_free = rds_tcp_inc_free,
271 .stats_info_copy = rds_tcp_stats_info_copy, 269 .stats_info_copy = rds_tcp_stats_info_copy,
272 .exit = rds_tcp_exit, 270 .exit = rds_tcp_exit,
@@ -276,14 +274,14 @@ struct rds_transport rds_tcp_transport = {
276 .t_prefer_loopback = 1, 274 .t_prefer_loopback = 1,
277}; 275};
278 276
279int __init rds_tcp_init(void) 277static int rds_tcp_init(void)
280{ 278{
281 int ret; 279 int ret;
282 280
283 rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection", 281 rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
284 sizeof(struct rds_tcp_connection), 282 sizeof(struct rds_tcp_connection),
285 0, 0, NULL); 283 0, 0, NULL);
286 if (rds_tcp_conn_slab == NULL) { 284 if (!rds_tcp_conn_slab) {
287 ret = -ENOMEM; 285 ret = -ENOMEM;
288 goto out; 286 goto out;
289 } 287 }
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 844fa6b9cf5a..9cf2927d0021 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -43,8 +43,6 @@ struct rds_tcp_statistics {
43}; 43};
44 44
45/* tcp.c */ 45/* tcp.c */
46int __init rds_tcp_init(void);
47void rds_tcp_exit(void);
48void rds_tcp_tune(struct socket *sock); 46void rds_tcp_tune(struct socket *sock);
49void rds_tcp_nonagle(struct socket *sock); 47void rds_tcp_nonagle(struct socket *sock);
50void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn); 48void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn);
@@ -61,16 +59,15 @@ void rds_tcp_conn_shutdown(struct rds_connection *conn);
61void rds_tcp_state_change(struct sock *sk); 59void rds_tcp_state_change(struct sock *sk);
62 60
63/* tcp_listen.c */ 61/* tcp_listen.c */
64int __init rds_tcp_listen_init(void); 62int rds_tcp_listen_init(void);
65void rds_tcp_listen_stop(void); 63void rds_tcp_listen_stop(void);
66void rds_tcp_listen_data_ready(struct sock *sk, int bytes); 64void rds_tcp_listen_data_ready(struct sock *sk, int bytes);
67 65
68/* tcp_recv.c */ 66/* tcp_recv.c */
69int __init rds_tcp_recv_init(void); 67int rds_tcp_recv_init(void);
70void rds_tcp_recv_exit(void); 68void rds_tcp_recv_exit(void);
71void rds_tcp_data_ready(struct sock *sk, int bytes); 69void rds_tcp_data_ready(struct sock *sk, int bytes);
72int rds_tcp_recv(struct rds_connection *conn); 70int rds_tcp_recv(struct rds_connection *conn);
73void rds_tcp_inc_purge(struct rds_incoming *inc);
74void rds_tcp_inc_free(struct rds_incoming *inc); 71void rds_tcp_inc_free(struct rds_incoming *inc);
75int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, 72int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
76 size_t size); 73 size_t size);
@@ -81,8 +78,6 @@ void rds_tcp_xmit_complete(struct rds_connection *conn);
81int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, 78int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
82 unsigned int hdr_off, unsigned int sg, unsigned int off); 79 unsigned int hdr_off, unsigned int sg, unsigned int off);
83void rds_tcp_write_space(struct sock *sk); 80void rds_tcp_write_space(struct sock *sk);
84int rds_tcp_xmit_cong_map(struct rds_connection *conn,
85 struct rds_cong_map *map, unsigned long offset);
86 81
87/* tcp_stats.c */ 82/* tcp_stats.c */
88DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats); 83DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index c397524c039c..af95c8e058fc 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -43,9 +43,9 @@ void rds_tcp_state_change(struct sock *sk)
43 struct rds_connection *conn; 43 struct rds_connection *conn;
44 struct rds_tcp_connection *tc; 44 struct rds_tcp_connection *tc;
45 45
46 read_lock(&sk->sk_callback_lock); 46 read_lock_bh(&sk->sk_callback_lock);
47 conn = sk->sk_user_data; 47 conn = sk->sk_user_data;
48 if (conn == NULL) { 48 if (!conn) {
49 state_change = sk->sk_state_change; 49 state_change = sk->sk_state_change;
50 goto out; 50 goto out;
51 } 51 }
@@ -68,7 +68,7 @@ void rds_tcp_state_change(struct sock *sk)
68 break; 68 break;
69 } 69 }
70out: 70out:
71 read_unlock(&sk->sk_callback_lock); 71 read_unlock_bh(&sk->sk_callback_lock);
72 state_change(sk); 72 state_change(sk);
73} 73}
74 74
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 975183fe6950..8b5cc4aa8868 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -114,9 +114,9 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes)
114 114
115 rdsdebug("listen data ready sk %p\n", sk); 115 rdsdebug("listen data ready sk %p\n", sk);
116 116
117 read_lock(&sk->sk_callback_lock); 117 read_lock_bh(&sk->sk_callback_lock);
118 ready = sk->sk_user_data; 118 ready = sk->sk_user_data;
119 if (ready == NULL) { /* check for teardown race */ 119 if (!ready) { /* check for teardown race */
120 ready = sk->sk_data_ready; 120 ready = sk->sk_data_ready;
121 goto out; 121 goto out;
122 } 122 }
@@ -131,11 +131,11 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes)
131 queue_work(rds_wq, &rds_tcp_listen_work); 131 queue_work(rds_wq, &rds_tcp_listen_work);
132 132
133out: 133out:
134 read_unlock(&sk->sk_callback_lock); 134 read_unlock_bh(&sk->sk_callback_lock);
135 ready(sk, bytes); 135 ready(sk, bytes);
136} 136}
137 137
138int __init rds_tcp_listen_init(void) 138int rds_tcp_listen_init(void)
139{ 139{
140 struct sockaddr_in sin; 140 struct sockaddr_in sin;
141 struct socket *sock = NULL; 141 struct socket *sock = NULL;
@@ -178,7 +178,7 @@ void rds_tcp_listen_stop(void)
178 struct socket *sock = rds_tcp_listen_sock; 178 struct socket *sock = rds_tcp_listen_sock;
179 struct sock *sk; 179 struct sock *sk;
180 180
181 if (sock == NULL) 181 if (!sock)
182 return; 182 return;
183 183
184 sk = sock->sk; 184 sk = sock->sk;
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index 1aba6878fa5d..78205e25500a 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -39,7 +39,7 @@
39 39
40static struct kmem_cache *rds_tcp_incoming_slab; 40static struct kmem_cache *rds_tcp_incoming_slab;
41 41
42void rds_tcp_inc_purge(struct rds_incoming *inc) 42static void rds_tcp_inc_purge(struct rds_incoming *inc)
43{ 43{
44 struct rds_tcp_incoming *tinc; 44 struct rds_tcp_incoming *tinc;
45 tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); 45 tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
@@ -190,10 +190,10 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
190 * processing. 190 * processing.
191 */ 191 */
192 while (left) { 192 while (left) {
193 if (tinc == NULL) { 193 if (!tinc) {
194 tinc = kmem_cache_alloc(rds_tcp_incoming_slab, 194 tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
195 arg->gfp); 195 arg->gfp);
196 if (tinc == NULL) { 196 if (!tinc) {
197 desc->error = -ENOMEM; 197 desc->error = -ENOMEM;
198 goto out; 198 goto out;
199 } 199 }
@@ -229,7 +229,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
229 229
230 if (left && tc->t_tinc_data_rem) { 230 if (left && tc->t_tinc_data_rem) {
231 clone = skb_clone(skb, arg->gfp); 231 clone = skb_clone(skb, arg->gfp);
232 if (clone == NULL) { 232 if (!clone) {
233 desc->error = -ENOMEM; 233 desc->error = -ENOMEM;
234 goto out; 234 goto out;
235 } 235 }
@@ -272,7 +272,8 @@ out:
272} 272}
273 273
274/* the caller has to hold the sock lock */ 274/* the caller has to hold the sock lock */
275int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp, enum km_type km) 275static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp,
276 enum km_type km)
276{ 277{
277 struct rds_tcp_connection *tc = conn->c_transport_data; 278 struct rds_tcp_connection *tc = conn->c_transport_data;
278 struct socket *sock = tc->t_sock; 279 struct socket *sock = tc->t_sock;
@@ -324,9 +325,9 @@ void rds_tcp_data_ready(struct sock *sk, int bytes)
324 325
325 rdsdebug("data ready sk %p bytes %d\n", sk, bytes); 326 rdsdebug("data ready sk %p bytes %d\n", sk, bytes);
326 327
327 read_lock(&sk->sk_callback_lock); 328 read_lock_bh(&sk->sk_callback_lock);
328 conn = sk->sk_user_data; 329 conn = sk->sk_user_data;
329 if (conn == NULL) { /* check for teardown race */ 330 if (!conn) { /* check for teardown race */
330 ready = sk->sk_data_ready; 331 ready = sk->sk_data_ready;
331 goto out; 332 goto out;
332 } 333 }
@@ -338,16 +339,16 @@ void rds_tcp_data_ready(struct sock *sk, int bytes)
338 if (rds_tcp_read_sock(conn, GFP_ATOMIC, KM_SOFTIRQ0) == -ENOMEM) 339 if (rds_tcp_read_sock(conn, GFP_ATOMIC, KM_SOFTIRQ0) == -ENOMEM)
339 queue_delayed_work(rds_wq, &conn->c_recv_w, 0); 340 queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
340out: 341out:
341 read_unlock(&sk->sk_callback_lock); 342 read_unlock_bh(&sk->sk_callback_lock);
342 ready(sk, bytes); 343 ready(sk, bytes);
343} 344}
344 345
345int __init rds_tcp_recv_init(void) 346int rds_tcp_recv_init(void)
346{ 347{
347 rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", 348 rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
348 sizeof(struct rds_tcp_incoming), 349 sizeof(struct rds_tcp_incoming),
349 0, 0, NULL); 350 0, 0, NULL);
350 if (rds_tcp_incoming_slab == NULL) 351 if (!rds_tcp_incoming_slab)
351 return -ENOMEM; 352 return -ENOMEM;
352 return 0; 353 return 0;
353} 354}
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index a28b895ff0d1..1b4fd68f0c7c 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -63,7 +63,7 @@ void rds_tcp_xmit_complete(struct rds_connection *conn)
63} 63}
64 64
65/* the core send_sem serializes this with other xmit and shutdown */ 65/* the core send_sem serializes this with other xmit and shutdown */
66int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) 66static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
67{ 67{
68 struct kvec vec = { 68 struct kvec vec = {
69 .iov_base = data, 69 .iov_base = data,
@@ -77,56 +77,6 @@ int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
77} 77}
78 78
79/* the core send_sem serializes this with other xmit and shutdown */ 79/* the core send_sem serializes this with other xmit and shutdown */
80int rds_tcp_xmit_cong_map(struct rds_connection *conn,
81 struct rds_cong_map *map, unsigned long offset)
82{
83 static struct rds_header rds_tcp_map_header = {
84 .h_flags = RDS_FLAG_CONG_BITMAP,
85 };
86 struct rds_tcp_connection *tc = conn->c_transport_data;
87 unsigned long i;
88 int ret;
89 int copied = 0;
90
91 /* Some problem claims cpu_to_be32(constant) isn't a constant. */
92 rds_tcp_map_header.h_len = cpu_to_be32(RDS_CONG_MAP_BYTES);
93
94 if (offset < sizeof(struct rds_header)) {
95 ret = rds_tcp_sendmsg(tc->t_sock,
96 (void *)&rds_tcp_map_header + offset,
97 sizeof(struct rds_header) - offset);
98 if (ret <= 0)
99 return ret;
100 offset += ret;
101 copied = ret;
102 if (offset < sizeof(struct rds_header))
103 return ret;
104 }
105
106 offset -= sizeof(struct rds_header);
107 i = offset / PAGE_SIZE;
108 offset = offset % PAGE_SIZE;
109 BUG_ON(i >= RDS_CONG_MAP_PAGES);
110
111 do {
112 ret = tc->t_sock->ops->sendpage(tc->t_sock,
113 virt_to_page(map->m_page_addrs[i]),
114 offset, PAGE_SIZE - offset,
115 MSG_DONTWAIT);
116 if (ret <= 0)
117 break;
118 copied += ret;
119 offset += ret;
120 if (offset == PAGE_SIZE) {
121 offset = 0;
122 i++;
123 }
124 } while (i < RDS_CONG_MAP_PAGES);
125
126 return copied ? copied : ret;
127}
128
129/* the core send_sem serializes this with other xmit and shutdown */
130int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, 80int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
131 unsigned int hdr_off, unsigned int sg, unsigned int off) 81 unsigned int hdr_off, unsigned int sg, unsigned int off)
132{ 82{
@@ -166,21 +116,21 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
166 goto out; 116 goto out;
167 } 117 }
168 118
169 while (sg < rm->m_nents) { 119 while (sg < rm->data.op_nents) {
170 ret = tc->t_sock->ops->sendpage(tc->t_sock, 120 ret = tc->t_sock->ops->sendpage(tc->t_sock,
171 sg_page(&rm->m_sg[sg]), 121 sg_page(&rm->data.op_sg[sg]),
172 rm->m_sg[sg].offset + off, 122 rm->data.op_sg[sg].offset + off,
173 rm->m_sg[sg].length - off, 123 rm->data.op_sg[sg].length - off,
174 MSG_DONTWAIT|MSG_NOSIGNAL); 124 MSG_DONTWAIT|MSG_NOSIGNAL);
175 rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->m_sg[sg]), 125 rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]),
176 rm->m_sg[sg].offset + off, rm->m_sg[sg].length - off, 126 rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off,
177 ret); 127 ret);
178 if (ret <= 0) 128 if (ret <= 0)
179 break; 129 break;
180 130
181 off += ret; 131 off += ret;
182 done += ret; 132 done += ret;
183 if (off == rm->m_sg[sg].length) { 133 if (off == rm->data.op_sg[sg].length) {
184 off = 0; 134 off = 0;
185 sg++; 135 sg++;
186 } 136 }
@@ -224,9 +174,9 @@ void rds_tcp_write_space(struct sock *sk)
224 struct rds_connection *conn; 174 struct rds_connection *conn;
225 struct rds_tcp_connection *tc; 175 struct rds_tcp_connection *tc;
226 176
227 read_lock(&sk->sk_callback_lock); 177 read_lock_bh(&sk->sk_callback_lock);
228 conn = sk->sk_user_data; 178 conn = sk->sk_user_data;
229 if (conn == NULL) { 179 if (!conn) {
230 write_space = sk->sk_write_space; 180 write_space = sk->sk_write_space;
231 goto out; 181 goto out;
232 } 182 }
@@ -244,7 +194,7 @@ void rds_tcp_write_space(struct sock *sk)
244 queue_delayed_work(rds_wq, &conn->c_send_w, 0); 194 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
245 195
246out: 196out:
247 read_unlock(&sk->sk_callback_lock); 197 read_unlock_bh(&sk->sk_callback_lock);
248 198
249 /* 199 /*
250 * write_space is only called when data leaves tcp's send queue if 200 * write_space is only called when data leaves tcp's send queue if
diff --git a/net/rds/threads.c b/net/rds/threads.c
index 786c20eaaf5e..0fd90f8c5f59 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -61,7 +61,7 @@
61 * 61 *
62 * Transition to state DISCONNECTING/DOWN: 62 * Transition to state DISCONNECTING/DOWN:
63 * - Inside the shutdown worker; synchronizes with xmit path 63 * - Inside the shutdown worker; synchronizes with xmit path
64 * through c_send_lock, and with connection management callbacks 64 * through RDS_IN_XMIT, and with connection management callbacks
65 * via c_cm_lock. 65 * via c_cm_lock.
66 * 66 *
67 * For receive callbacks, we rely on the underlying transport 67 * For receive callbacks, we rely on the underlying transport
@@ -110,7 +110,7 @@ EXPORT_SYMBOL_GPL(rds_connect_complete);
110 * We should *always* start with a random backoff; otherwise a broken connection 110 * We should *always* start with a random backoff; otherwise a broken connection
111 * will always take several iterations to be re-established. 111 * will always take several iterations to be re-established.
112 */ 112 */
113static void rds_queue_reconnect(struct rds_connection *conn) 113void rds_queue_reconnect(struct rds_connection *conn)
114{ 114{
115 unsigned long rand; 115 unsigned long rand;
116 116
@@ -156,58 +156,6 @@ void rds_connect_worker(struct work_struct *work)
156 } 156 }
157} 157}
158 158
159void rds_shutdown_worker(struct work_struct *work)
160{
161 struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
162
163 /* shut it down unless it's down already */
164 if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
165 /*
166 * Quiesce the connection mgmt handlers before we start tearing
167 * things down. We don't hold the mutex for the entire
168 * duration of the shutdown operation, else we may be
169 * deadlocking with the CM handler. Instead, the CM event
170 * handler is supposed to check for state DISCONNECTING
171 */
172 mutex_lock(&conn->c_cm_lock);
173 if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) &&
174 !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
175 rds_conn_error(conn, "shutdown called in state %d\n",
176 atomic_read(&conn->c_state));
177 mutex_unlock(&conn->c_cm_lock);
178 return;
179 }
180 mutex_unlock(&conn->c_cm_lock);
181
182 mutex_lock(&conn->c_send_lock);
183 conn->c_trans->conn_shutdown(conn);
184 rds_conn_reset(conn);
185 mutex_unlock(&conn->c_send_lock);
186
187 if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
188 /* This can happen - eg when we're in the middle of tearing
189 * down the connection, and someone unloads the rds module.
190 * Quite reproducible with loopback connections.
191 * Mostly harmless.
192 */
193 rds_conn_error(conn,
194 "%s: failed to transition to state DOWN, "
195 "current state is %d\n",
196 __func__,
197 atomic_read(&conn->c_state));
198 return;
199 }
200 }
201
202 /* Then reconnect if it's still live.
203 * The passive side of an IB loopback connection is never added
204 * to the conn hash, so we never trigger a reconnect on this
205 * conn - the reconnect is always triggered by the active peer. */
206 cancel_delayed_work(&conn->c_conn_w);
207 if (!hlist_unhashed(&conn->c_hash_node))
208 rds_queue_reconnect(conn);
209}
210
211void rds_send_worker(struct work_struct *work) 159void rds_send_worker(struct work_struct *work)
212{ 160{
213 struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work); 161 struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
@@ -252,15 +200,22 @@ void rds_recv_worker(struct work_struct *work)
252 } 200 }
253} 201}
254 202
203void rds_shutdown_worker(struct work_struct *work)
204{
205 struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
206
207 rds_conn_shutdown(conn);
208}
209
255void rds_threads_exit(void) 210void rds_threads_exit(void)
256{ 211{
257 destroy_workqueue(rds_wq); 212 destroy_workqueue(rds_wq);
258} 213}
259 214
260int __init rds_threads_init(void) 215int rds_threads_init(void)
261{ 216{
262 rds_wq = create_workqueue("krdsd"); 217 rds_wq = create_singlethread_workqueue("krdsd");
263 if (rds_wq == NULL) 218 if (!rds_wq)
264 return -ENOMEM; 219 return -ENOMEM;
265 220
266 return 0; 221 return 0;
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 7e1067901353..7f2ac4fec367 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -71,19 +71,28 @@ void rds_trans_unregister(struct rds_transport *trans)
71} 71}
72EXPORT_SYMBOL_GPL(rds_trans_unregister); 72EXPORT_SYMBOL_GPL(rds_trans_unregister);
73 73
74void rds_trans_put(struct rds_transport *trans)
75{
76 if (trans && trans->t_owner)
77 module_put(trans->t_owner);
78}
79
74struct rds_transport *rds_trans_get_preferred(__be32 addr) 80struct rds_transport *rds_trans_get_preferred(__be32 addr)
75{ 81{
76 struct rds_transport *ret = NULL; 82 struct rds_transport *ret = NULL;
77 int i; 83 struct rds_transport *trans;
84 unsigned int i;
78 85
79 if (IN_LOOPBACK(ntohl(addr))) 86 if (IN_LOOPBACK(ntohl(addr)))
80 return &rds_loop_transport; 87 return &rds_loop_transport;
81 88
82 down_read(&rds_trans_sem); 89 down_read(&rds_trans_sem);
83 for (i = 0; i < RDS_TRANS_COUNT; i++) 90 for (i = 0; i < RDS_TRANS_COUNT; i++) {
84 { 91 trans = transports[i];
85 if (transports[i] && (transports[i]->laddr_check(addr) == 0)) { 92
86 ret = transports[i]; 93 if (trans && (trans->laddr_check(addr) == 0) &&
94 (!trans->t_owner || try_module_get(trans->t_owner))) {
95 ret = trans;
87 break; 96 break;
88 } 97 }
89 } 98 }
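rds_trans_get_preferred() now also pins the transport module with try_module_get(), and the new rds_trans_put() is its counterpart: whoever obtained the transport through the lookup is expected to drop the reference when finished with it. A hypothetical release path showing the pairing (conn->c_trans as declared in rds.h):

/* Hypothetical teardown helper pairing the try_module_get() above. */
static void example_conn_drop_transport(struct rds_connection *conn)
{
	rds_trans_put(conn->c_trans);	/* releases the module ref from lookup */
	conn->c_trans = NULL;
}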
diff --git a/net/rds/xlist.h b/net/rds/xlist.h
new file mode 100644
index 000000000000..e6b5190daddd
--- /dev/null
+++ b/net/rds/xlist.h
@@ -0,0 +1,80 @@
1#ifndef _LINUX_XLIST_H
2#define _LINUX_XLIST_H
3
4#include <linux/stddef.h>
5#include <linux/poison.h>
6#include <linux/prefetch.h>
7#include <asm/system.h>
8
9struct xlist_head {
10 struct xlist_head *next;
11};
12
13static inline void INIT_XLIST_HEAD(struct xlist_head *list)
14{
15 list->next = NULL;
16}
17
18static inline int xlist_empty(struct xlist_head *head)
19{
20 return head->next == NULL;
21}
22
23static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail,
24 struct xlist_head *head)
25{
26 struct xlist_head *cur;
27 struct xlist_head *check;
28
29 while (1) {
30 cur = head->next;
31 tail->next = cur;
32 check = cmpxchg(&head->next, cur, new);
33 if (check == cur)
34 break;
35 }
36}
37
38static inline struct xlist_head *xlist_del_head(struct xlist_head *head)
39{
40 struct xlist_head *cur;
41 struct xlist_head *check;
42 struct xlist_head *next;
43
44 while (1) {
45 cur = head->next;
46 if (!cur)
47 goto out;
48
49 next = cur->next;
50 check = cmpxchg(&head->next, cur, next);
51 if (check == cur)
52 goto out;
53 }
54out:
55 return cur;
56}
57
58static inline struct xlist_head *xlist_del_head_fast(struct xlist_head *head)
59{
60 struct xlist_head *cur;
61
62 cur = head->next;
63 if (!cur)
64 return NULL;
65
66 head->next = cur->next;
67 return cur;
68}
69
70static inline void xlist_splice(struct xlist_head *list,
71 struct xlist_head *head)
72{
73 struct xlist_head *cur;
74
75 WARN_ON(head->next);
76 cur = xchg(&list->next, NULL);
77 head->next = cur;
78}
79
80#endif
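xlist is a minimal lock-free LIFO: producers push with the cmpxchg loop in xlist_add(), consumers pop with xlist_del_head(), and xlist_splice() steals an entire chain in one xchg. A sketch of the intended usage with a hypothetical struct frag embedding the node (the struct is illustrative, not part of this patch):

/* Hypothetical lock-free free-list of frag objects built on xlist. */
#include <linux/kernel.h>	/* container_of() */

struct frag {
	int			id;
	struct xlist_head	node;
};

static struct xlist_head frag_pool;	/* INIT_XLIST_HEAD(&frag_pool) at init */

static void frag_free(struct frag *f)
{
	/* Single-node push: the node is both the head and the tail we add. */
	xlist_add(&f->node, &f->node, &frag_pool);
}

static struct frag *frag_alloc(void)
{
	struct xlist_head *h = xlist_del_head(&frag_pool);

	return h ? container_of(h, struct frag, node) : NULL;
}

xlist_del_head_fast() and xlist_splice() skip the cmpxchg, so they are only safe once the caller has exclusive ownership of the list, for example after splicing the shared list onto a private head.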
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 51875a0c5d48..04f599089e6d 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -1241,6 +1241,7 @@ static const struct file_operations rfkill_fops = {
1241 .unlocked_ioctl = rfkill_fop_ioctl, 1241 .unlocked_ioctl = rfkill_fop_ioctl,
1242 .compat_ioctl = rfkill_fop_ioctl, 1242 .compat_ioctl = rfkill_fop_ioctl,
1243#endif 1243#endif
1244 .llseek = no_llseek,
1244}; 1245};
1245 1246
1246static struct miscdevice rfkill_miscdev = { 1247static struct miscdevice rfkill_miscdev = {
diff --git a/net/rfkill/input.c b/net/rfkill/input.c
index 3713d7ecab96..1bca6d49ec96 100644
--- a/net/rfkill/input.c
+++ b/net/rfkill/input.c
@@ -142,7 +142,7 @@ static unsigned long rfkill_last_scheduled;
142static unsigned long rfkill_ratelimit(const unsigned long last) 142static unsigned long rfkill_ratelimit(const unsigned long last)
143{ 143{
144 const unsigned long delay = msecs_to_jiffies(RFKILL_OPS_DELAY); 144 const unsigned long delay = msecs_to_jiffies(RFKILL_OPS_DELAY);
145 return (time_after(jiffies, last + delay)) ? 0 : delay; 145 return time_after(jiffies, last + delay) ? 0 : delay;
146} 146}
147 147
148static void rfkill_schedule_ratelimited(void) 148static void rfkill_schedule_ratelimited(void)
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 8e45e76a95f5..d952e7eac188 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -679,7 +679,7 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
679 if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1) 679 if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1)
680 return -EINVAL; 680 return -EINVAL;
681 681
682 if (addr->srose_ndigis > ROSE_MAX_DIGIS) 682 if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS)
683 return -EINVAL; 683 return -EINVAL;
684 684
685 if ((dev = rose_dev_get(&addr->srose_addr)) == NULL) { 685 if ((dev = rose_dev_get(&addr->srose_addr)) == NULL) {
@@ -739,7 +739,7 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le
739 if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1) 739 if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1)
740 return -EINVAL; 740 return -EINVAL;
741 741
742 if (addr->srose_ndigis > ROSE_MAX_DIGIS) 742 if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS)
743 return -EINVAL; 743 return -EINVAL;
744 744
745 /* Source + Destination digis should not exceed ROSE_MAX_DIGIS */ 745 /* Source + Destination digis should not exceed ROSE_MAX_DIGIS */
diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c
index a750a28e0221..fa5f5641a2c2 100644
--- a/net/rose/rose_link.c
+++ b/net/rose/rose_link.c
@@ -114,7 +114,7 @@ static int rose_send_frame(struct sk_buff *skb, struct rose_neigh *neigh)
114 if (ax25s) 114 if (ax25s)
115 ax25_cb_put(ax25s); 115 ax25_cb_put(ax25s);
116 116
117 return (neigh->ax25 != NULL); 117 return neigh->ax25 != NULL;
118} 118}
119 119
120/* 120/*
@@ -137,7 +137,7 @@ static int rose_link_up(struct rose_neigh *neigh)
137 if (ax25s) 137 if (ax25s)
138 ax25_cb_put(ax25s); 138 ax25_cb_put(ax25s);
139 139
140 return (neigh->ax25 != NULL); 140 return neigh->ax25 != NULL;
141} 141}
142 142
143/* 143/*
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 2f691fb180d1..a36270a994d7 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -518,6 +518,16 @@ config NET_ACT_SKBEDIT
518 To compile this code as a module, choose M here: the 518 To compile this code as a module, choose M here: the
519 module will be called act_skbedit. 519 module will be called act_skbedit.
520 520
521config NET_ACT_CSUM
522 tristate "Checksum Updating"
523 depends on NET_CLS_ACT && INET
524 ---help---
525 Say Y here to update common checksums after direct
526 packet alterations.
527
528 To compile this code as a module, choose M here: the
529 module will be called act_csum.
530
521config NET_CLS_IND 531config NET_CLS_IND
522 bool "Incoming device classification" 532 bool "Incoming device classification"
523 depends on NET_CLS_U32 || NET_CLS_FW 533 depends on NET_CLS_U32 || NET_CLS_FW
diff --git a/net/sched/Makefile b/net/sched/Makefile
index f14e71bfa58f..960f5dba6304 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_NET_ACT_NAT) += act_nat.o
15obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o 15obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o
16obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o 16obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o
17obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o 17obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o
18obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o
18obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o 19obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
19obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o 20obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
20obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o 21obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
new file mode 100644
index 000000000000..67dc7ce9b63a
--- /dev/null
+++ b/net/sched/act_csum.c
@@ -0,0 +1,595 @@
1/*
2 * Checksum updating actions
3 *
4 * Copyright (c) 2010 Gregoire Baron <baronchon@n7mm.org>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the Free
8 * Software Foundation; either version 2 of the License, or (at your option)
9 * any later version.
10 *
11 */
12
13#include <linux/types.h>
14#include <linux/init.h>
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/spinlock.h>
18
19#include <linux/netlink.h>
20#include <net/netlink.h>
21#include <linux/rtnetlink.h>
22
23#include <linux/skbuff.h>
24
25#include <net/ip.h>
26#include <net/ipv6.h>
27#include <net/icmp.h>
28#include <linux/icmpv6.h>
29#include <linux/igmp.h>
30#include <net/tcp.h>
31#include <net/udp.h>
32#include <net/ip6_checksum.h>
33
34#include <net/act_api.h>
35
36#include <linux/tc_act/tc_csum.h>
37#include <net/tc_act/tc_csum.h>
38
39#define CSUM_TAB_MASK 15
40static struct tcf_common *tcf_csum_ht[CSUM_TAB_MASK + 1];
41static u32 csum_idx_gen;
42static DEFINE_RWLOCK(csum_lock);
43
44static struct tcf_hashinfo csum_hash_info = {
45 .htab = tcf_csum_ht,
46 .hmask = CSUM_TAB_MASK,
47 .lock = &csum_lock,
48};
49
50static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {
51 [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), },
52};
53
54static int tcf_csum_init(struct nlattr *nla, struct nlattr *est,
55 struct tc_action *a, int ovr, int bind)
56{
57 struct nlattr *tb[TCA_CSUM_MAX + 1];
58 struct tc_csum *parm;
59 struct tcf_common *pc;
60 struct tcf_csum *p;
61 int ret = 0, err;
62
63 if (nla == NULL)
64 return -EINVAL;
65
66 err = nla_parse_nested(tb, TCA_CSUM_MAX, nla, csum_policy);
67 if (err < 0)
68 return err;
69
70 if (tb[TCA_CSUM_PARMS] == NULL)
71 return -EINVAL;
72 parm = nla_data(tb[TCA_CSUM_PARMS]);
73
74 pc = tcf_hash_check(parm->index, a, bind, &csum_hash_info);
75 if (!pc) {
76 pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
77 &csum_idx_gen, &csum_hash_info);
78 if (IS_ERR(pc))
79 return PTR_ERR(pc);
80 p = to_tcf_csum(pc);
81 ret = ACT_P_CREATED;
82 } else {
83 p = to_tcf_csum(pc);
84 if (!ovr) {
85 tcf_hash_release(pc, bind, &csum_hash_info);
86 return -EEXIST;
87 }
88 }
89
90 spin_lock_bh(&p->tcf_lock);
91 p->tcf_action = parm->action;
92 p->update_flags = parm->update_flags;
93 spin_unlock_bh(&p->tcf_lock);
94
95 if (ret == ACT_P_CREATED)
96 tcf_hash_insert(pc, &csum_hash_info);
97
98 return ret;
99}
100
101static int tcf_csum_cleanup(struct tc_action *a, int bind)
102{
103 struct tcf_csum *p = a->priv;
104 return tcf_hash_release(&p->common, bind, &csum_hash_info);
105}
106
107/**
108 * tcf_csum_skb_nextlayer - Get next layer pointer
109 * @skb: sk_buff to use
110 * @ihl: previous summed headers length
111 * @ipl: complete packet length
112 * @jhl: next header length
113 *
114 * Check the expected next layer availability in the specified sk_buff.
115 * Return the next layer pointer if pass, NULL otherwise.
116 */
117static void *tcf_csum_skb_nextlayer(struct sk_buff *skb,
118 unsigned int ihl, unsigned int ipl,
119 unsigned int jhl)
120{
121 int ntkoff = skb_network_offset(skb);
122 int hl = ihl + jhl;
123
124 if (!pskb_may_pull(skb, ipl + ntkoff) || (ipl < hl) ||
125 (skb_cloned(skb) &&
126 !skb_clone_writable(skb, hl + ntkoff) &&
127 pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
128 return NULL;
129 else
130 return (void *)(skb_network_header(skb) + ihl);
131}
132
133static int tcf_csum_ipv4_icmp(struct sk_buff *skb,
134 unsigned int ihl, unsigned int ipl)
135{
136 struct icmphdr *icmph;
137
138 icmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmph));
139 if (icmph == NULL)
140 return 0;
141
142 icmph->checksum = 0;
143 skb->csum = csum_partial(icmph, ipl - ihl, 0);
144 icmph->checksum = csum_fold(skb->csum);
145
146 skb->ip_summed = CHECKSUM_NONE;
147
148 return 1;
149}
150
151static int tcf_csum_ipv4_igmp(struct sk_buff *skb,
152 unsigned int ihl, unsigned int ipl)
153{
154 struct igmphdr *igmph;
155
156 igmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*igmph));
157 if (igmph == NULL)
158 return 0;
159
160 igmph->csum = 0;
161 skb->csum = csum_partial(igmph, ipl - ihl, 0);
162 igmph->csum = csum_fold(skb->csum);
163
164 skb->ip_summed = CHECKSUM_NONE;
165
166 return 1;
167}
168
169static int tcf_csum_ipv6_icmp(struct sk_buff *skb, struct ipv6hdr *ip6h,
170 unsigned int ihl, unsigned int ipl)
171{
172 struct icmp6hdr *icmp6h;
173
174 icmp6h = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmp6h));
175 if (icmp6h == NULL)
176 return 0;
177
178 icmp6h->icmp6_cksum = 0;
179 skb->csum = csum_partial(icmp6h, ipl - ihl, 0);
180 icmp6h->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
181 ipl - ihl, IPPROTO_ICMPV6,
182 skb->csum);
183
184 skb->ip_summed = CHECKSUM_NONE;
185
186 return 1;
187}
188
189static int tcf_csum_ipv4_tcp(struct sk_buff *skb, struct iphdr *iph,
190 unsigned int ihl, unsigned int ipl)
191{
192 struct tcphdr *tcph;
193
194 tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));
195 if (tcph == NULL)
196 return 0;
197
198 tcph->check = 0;
199 skb->csum = csum_partial(tcph, ipl - ihl, 0);
200 tcph->check = tcp_v4_check(ipl - ihl,
201 iph->saddr, iph->daddr, skb->csum);
202
203 skb->ip_summed = CHECKSUM_NONE;
204
205 return 1;
206}
207
208static int tcf_csum_ipv6_tcp(struct sk_buff *skb, struct ipv6hdr *ip6h,
209 unsigned int ihl, unsigned int ipl)
210{
211 struct tcphdr *tcph;
212
213 tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));
214 if (tcph == NULL)
215 return 0;
216
217 tcph->check = 0;
218 skb->csum = csum_partial(tcph, ipl - ihl, 0);
219 tcph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
220 ipl - ihl, IPPROTO_TCP,
221 skb->csum);
222
223 skb->ip_summed = CHECKSUM_NONE;
224
225 return 1;
226}
227
228static int tcf_csum_ipv4_udp(struct sk_buff *skb, struct iphdr *iph,
229 unsigned int ihl, unsigned int ipl, int udplite)
230{
231 struct udphdr *udph;
232 u16 ul;
233
234 /*
235 * Support both the UDP and UDPLITE checksum algorithms.  Don't use
236 * udph->len to get the real payload length without a protocol check:
237 * UDPLITE uses udph->len for the checksum coverage instead.
238 * Use iph->tot_len, or just ipl.
239 */
240
241 udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph));
242 if (udph == NULL)
243 return 0;
244
245 ul = ntohs(udph->len);
246
247 if (udplite || udph->check) {
248
249 udph->check = 0;
250
251 if (udplite) {
252 if (ul == 0)
253 skb->csum = csum_partial(udph, ipl - ihl, 0);
254 else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl))
255 skb->csum = csum_partial(udph, ul, 0);
256 else
257 goto ignore_obscure_skb;
258 } else {
259 if (ul != ipl - ihl)
260 goto ignore_obscure_skb;
261
262 skb->csum = csum_partial(udph, ul, 0);
263 }
264
265 udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
266 ul, iph->protocol,
267 skb->csum);
268
269 if (!udph->check)
270 udph->check = CSUM_MANGLED_0;
271 }
272
273 skb->ip_summed = CHECKSUM_NONE;
274
275ignore_obscure_skb:
276 return 1;
277}
278
279static int tcf_csum_ipv6_udp(struct sk_buff *skb, struct ipv6hdr *ip6h,
280 unsigned int ihl, unsigned int ipl, int udplite)
281{
282 struct udphdr *udph;
283 u16 ul;
284
285 /*
286 * Support both the UDP and UDPLITE checksum algorithms.  Don't use
287 * udph->len to get the real payload length without a protocol check:
288 * UDPLITE uses udph->len for the checksum coverage instead.
289 * Use ip6h->payload_len + sizeof(*ip6h) ..., or just ipl.
290 */
291
292 udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph));
293 if (udph == NULL)
294 return 0;
295
296 ul = ntohs(udph->len);
297
298 udph->check = 0;
299
300 if (udplite) {
301 if (ul == 0)
302 skb->csum = csum_partial(udph, ipl - ihl, 0);
303
304 else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl))
305 skb->csum = csum_partial(udph, ul, 0);
306
307 else
308 goto ignore_obscure_skb;
309 } else {
310 if (ul != ipl - ihl)
311 goto ignore_obscure_skb;
312
313 skb->csum = csum_partial(udph, ul, 0);
314 }
315
316 udph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, ul,
317 udplite ? IPPROTO_UDPLITE : IPPROTO_UDP,
318 skb->csum);
319
320 if (!udph->check)
321 udph->check = CSUM_MANGLED_0;
322
323 skb->ip_summed = CHECKSUM_NONE;
324
325ignore_obscure_skb:
326 return 1;
327}
328
329static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
330{
331 struct iphdr *iph;
332 int ntkoff;
333
334 ntkoff = skb_network_offset(skb);
335
336 if (!pskb_may_pull(skb, sizeof(*iph) + ntkoff))
337 goto fail;
338
339 iph = ip_hdr(skb);
340
341 switch (iph->frag_off & htons(IP_OFFSET) ? 0 : iph->protocol) {
342 case IPPROTO_ICMP:
343 if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP)
344 if (!tcf_csum_ipv4_icmp(skb, iph->ihl * 4,
345 ntohs(iph->tot_len)))
346 goto fail;
347 break;
348 case IPPROTO_IGMP:
349 if (update_flags & TCA_CSUM_UPDATE_FLAG_IGMP)
350 if (!tcf_csum_ipv4_igmp(skb, iph->ihl * 4,
351 ntohs(iph->tot_len)))
352 goto fail;
353 break;
354 case IPPROTO_TCP:
355 if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP)
356 if (!tcf_csum_ipv4_tcp(skb, iph, iph->ihl * 4,
357 ntohs(iph->tot_len)))
358 goto fail;
359 break;
360 case IPPROTO_UDP:
361 if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP)
362 if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4,
363 ntohs(iph->tot_len), 0))
364 goto fail;
365 break;
366 case IPPROTO_UDPLITE:
367 if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE)
368 if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4,
369 ntohs(iph->tot_len), 1))
370 goto fail;
371 break;
372 }
373
374 if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) {
375 if (skb_cloned(skb) &&
376 !skb_clone_writable(skb, sizeof(*iph) + ntkoff) &&
377 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
378 goto fail;
379
380 ip_send_check(iph);
381 }
382
383 return 1;
384
385fail:
386 return 0;
387}
388
389static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh,
390 unsigned int ixhl, unsigned int *pl)
391{
392 int off, len, optlen;
393 unsigned char *xh = (void *)ip6xh;
394
395 off = sizeof(*ip6xh);
396 len = ixhl - off;
397
398 while (len > 1) {
399 switch (xh[off]) {
400 case IPV6_TLV_PAD0:
401 optlen = 1;
402 break;
403 case IPV6_TLV_JUMBO:
404 optlen = xh[off + 1] + 2;
405 if (optlen != 6 || len < 6 || (off & 3) != 2)
406 /* wrong jumbo option length/alignment */
407 return 0;
408 *pl = ntohl(*(__be32 *)(xh + off + 2));
409 goto done;
410 default:
411 optlen = xh[off + 1] + 2;
412 if (optlen > len)
413 /* ignore obscure options */
414 goto done;
415 break;
416 }
417 off += optlen;
418 len -= optlen;
419 }
420
421done:
422 return 1;
423}
424
425static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags)
426{
427 struct ipv6hdr *ip6h;
428 struct ipv6_opt_hdr *ip6xh;
429 unsigned int hl, ixhl;
430 unsigned int pl;
431 int ntkoff;
432 u8 nexthdr;
433
434 ntkoff = skb_network_offset(skb);
435
436 hl = sizeof(*ip6h);
437
438 if (!pskb_may_pull(skb, hl + ntkoff))
439 goto fail;
440
441 ip6h = ipv6_hdr(skb);
442
443 pl = ntohs(ip6h->payload_len);
444 nexthdr = ip6h->nexthdr;
445
446 do {
447 switch (nexthdr) {
448 case NEXTHDR_FRAGMENT:
449 goto ignore_skb;
450 case NEXTHDR_ROUTING:
451 case NEXTHDR_HOP:
452 case NEXTHDR_DEST:
453 if (!pskb_may_pull(skb, hl + sizeof(*ip6xh) + ntkoff))
454 goto fail;
455 ip6xh = (void *)(skb_network_header(skb) + hl);
456 ixhl = ipv6_optlen(ip6xh);
457 if (!pskb_may_pull(skb, hl + ixhl + ntkoff))
458 goto fail;
459 if ((nexthdr == NEXTHDR_HOP) &&
460 !(tcf_csum_ipv6_hopopts(ip6xh, ixhl, &pl)))
461 goto fail;
462 nexthdr = ip6xh->nexthdr;
463 hl += ixhl;
464 break;
465 case IPPROTO_ICMPV6:
466 if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP)
467 if (!tcf_csum_ipv6_icmp(skb, ip6h,
468 hl, pl + sizeof(*ip6h)))
469 goto fail;
470 goto done;
471 case IPPROTO_TCP:
472 if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP)
473 if (!tcf_csum_ipv6_tcp(skb, ip6h,
474 hl, pl + sizeof(*ip6h)))
475 goto fail;
476 goto done;
477 case IPPROTO_UDP:
478 if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP)
479 if (!tcf_csum_ipv6_udp(skb, ip6h, hl,
480 pl + sizeof(*ip6h), 0))
481 goto fail;
482 goto done;
483 case IPPROTO_UDPLITE:
484 if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE)
485 if (!tcf_csum_ipv6_udp(skb, ip6h, hl,
486 pl + sizeof(*ip6h), 1))
487 goto fail;
488 goto done;
489 default:
490 goto ignore_skb;
491 }
492 } while (pskb_may_pull(skb, hl + 1 + ntkoff));
493
494done:
495ignore_skb:
496 return 1;
497
498fail:
499 return 0;
500}
501
502static int tcf_csum(struct sk_buff *skb,
503 struct tc_action *a, struct tcf_result *res)
504{
505 struct tcf_csum *p = a->priv;
506 int action;
507 u32 update_flags;
508
509 spin_lock(&p->tcf_lock);
510 p->tcf_tm.lastuse = jiffies;
511 p->tcf_bstats.bytes += qdisc_pkt_len(skb);
512 p->tcf_bstats.packets++;
513 action = p->tcf_action;
514 update_flags = p->update_flags;
515 spin_unlock(&p->tcf_lock);
516
517 if (unlikely(action == TC_ACT_SHOT))
518 goto drop;
519
520 switch (skb->protocol) {
521 case cpu_to_be16(ETH_P_IP):
522 if (!tcf_csum_ipv4(skb, update_flags))
523 goto drop;
524 break;
525 case cpu_to_be16(ETH_P_IPV6):
526 if (!tcf_csum_ipv6(skb, update_flags))
527 goto drop;
528 break;
529 }
530
531 return action;
532
533drop:
534 spin_lock(&p->tcf_lock);
535 p->tcf_qstats.drops++;
536 spin_unlock(&p->tcf_lock);
537 return TC_ACT_SHOT;
538}
539
540static int tcf_csum_dump(struct sk_buff *skb,
541 struct tc_action *a, int bind, int ref)
542{
543 unsigned char *b = skb_tail_pointer(skb);
544 struct tcf_csum *p = a->priv;
545 struct tc_csum opt = {
546 .update_flags = p->update_flags,
547 .index = p->tcf_index,
548 .action = p->tcf_action,
549 .refcnt = p->tcf_refcnt - ref,
550 .bindcnt = p->tcf_bindcnt - bind,
551 };
552 struct tcf_t t;
553
554 NLA_PUT(skb, TCA_CSUM_PARMS, sizeof(opt), &opt);
555 t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
556 t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
557 t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
558 NLA_PUT(skb, TCA_CSUM_TM, sizeof(t), &t);
559
560 return skb->len;
561
562nla_put_failure:
563 nlmsg_trim(skb, b);
564 return -1;
565}
566
567static struct tc_action_ops act_csum_ops = {
568 .kind = "csum",
569 .hinfo = &csum_hash_info,
570 .type = TCA_ACT_CSUM,
571 .capab = TCA_CAP_NONE,
572 .owner = THIS_MODULE,
573 .act = tcf_csum,
574 .dump = tcf_csum_dump,
575 .cleanup = tcf_csum_cleanup,
576 .lookup = tcf_hash_search,
577 .init = tcf_csum_init,
578 .walk = tcf_generic_walker
579};
580
581MODULE_DESCRIPTION("Checksum updating actions");
582MODULE_LICENSE("GPL");
583
584static int __init csum_init_module(void)
585{
586 return tcf_register_action(&act_csum_ops);
587}
588
589static void __exit csum_cleanup_module(void)
590{
591 tcf_unregister_action(&act_csum_ops);
592}
593
594module_init(csum_init_module);
595module_exit(csum_cleanup_module);
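
The checksum helpers above all follow the same pattern: zero the checksum field, sum the covered bytes with csum_partial(), then fold the 32-bit partial sum into the final 16-bit value, either directly with csum_fold() (ICMP, IGMP) or via csum_tcpudp_magic()/csum_ipv6_magic() when a pseudo-header has to be mixed in (TCP, UDP). As a minimal sketch of the arithmetic involved, here is a plain RFC 1071 ones'-complement checksum written as standalone userspace C; the function name is hypothetical, and the kernel helpers additionally handle scatter/gather data, odd alignment and byte order far more carefully.

#include <stddef.h>
#include <stdint.h>

/* Illustrative only: 16-bit ones'-complement checksum over a flat buffer. */
static uint16_t inet_checksum_sketch(const void *data, size_t len)
{
	const uint8_t *p = data;
	uint32_t sum = 0;

	while (len > 1) {		/* add up 16-bit words */
		sum += ((uint32_t)p[0] << 8) | p[1];
		p += 2;
		len -= 2;
	}
	if (len)			/* pad a trailing odd byte with zero */
		sum += (uint32_t)p[0] << 8;

	while (sum >> 16)		/* fold carries back into the low 16 bits */
		sum = (sum & 0xffff) + (sum >> 16);

	return (uint16_t)~sum;		/* ones' complement of the folded sum */
}
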
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 8406c6654990..c2ed90a4c0b4 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -152,21 +152,24 @@ static int tcf_gact(struct sk_buff *skb, struct tc_action *a, struct tcf_result
152static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) 152static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
153{ 153{
154 unsigned char *b = skb_tail_pointer(skb); 154 unsigned char *b = skb_tail_pointer(skb);
155 struct tc_gact opt;
156 struct tcf_gact *gact = a->priv; 155 struct tcf_gact *gact = a->priv;
156 struct tc_gact opt = {
157 .index = gact->tcf_index,
158 .refcnt = gact->tcf_refcnt - ref,
159 .bindcnt = gact->tcf_bindcnt - bind,
160 .action = gact->tcf_action,
161 };
157 struct tcf_t t; 162 struct tcf_t t;
158 163
159 opt.index = gact->tcf_index;
160 opt.refcnt = gact->tcf_refcnt - ref;
161 opt.bindcnt = gact->tcf_bindcnt - bind;
162 opt.action = gact->tcf_action;
163 NLA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt); 164 NLA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt);
164#ifdef CONFIG_GACT_PROB 165#ifdef CONFIG_GACT_PROB
165 if (gact->tcfg_ptype) { 166 if (gact->tcfg_ptype) {
166 struct tc_gact_p p_opt; 167 struct tc_gact_p p_opt = {
167 p_opt.paction = gact->tcfg_paction; 168 .paction = gact->tcfg_paction,
168 p_opt.pval = gact->tcfg_pval; 169 .pval = gact->tcfg_pval,
169 p_opt.ptype = gact->tcfg_ptype; 170 .ptype = gact->tcfg_ptype,
171 };
172
170 NLA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt); 173 NLA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt);
171 } 174 }
172#endif 175#endif
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index c7e59e6ec349..8daef9632255 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -39,7 +39,7 @@ static struct tcf_hashinfo ipt_hash_info = {
39 .lock = &ipt_lock, 39 .lock = &ipt_lock,
40}; 40};
41 41
42static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook) 42static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook)
43{ 43{
44 struct xt_tgchk_param par; 44 struct xt_tgchk_param par;
45 struct xt_target *target; 45 struct xt_target *target;
@@ -66,7 +66,7 @@ static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int
66 return 0; 66 return 0;
67} 67}
68 68
69static void ipt_destroy_target(struct ipt_entry_target *t) 69static void ipt_destroy_target(struct xt_entry_target *t)
70{ 70{
71 struct xt_tgdtor_param par = { 71 struct xt_tgdtor_param par = {
72 .target = t->u.kernel.target, 72 .target = t->u.kernel.target,
@@ -99,7 +99,7 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
99 [TCA_IPT_TABLE] = { .type = NLA_STRING, .len = IFNAMSIZ }, 99 [TCA_IPT_TABLE] = { .type = NLA_STRING, .len = IFNAMSIZ },
100 [TCA_IPT_HOOK] = { .type = NLA_U32 }, 100 [TCA_IPT_HOOK] = { .type = NLA_U32 },
101 [TCA_IPT_INDEX] = { .type = NLA_U32 }, 101 [TCA_IPT_INDEX] = { .type = NLA_U32 },
102 [TCA_IPT_TARG] = { .len = sizeof(struct ipt_entry_target) }, 102 [TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) },
103}; 103};
104 104
105static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est, 105static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
@@ -108,7 +108,7 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
108 struct nlattr *tb[TCA_IPT_MAX + 1]; 108 struct nlattr *tb[TCA_IPT_MAX + 1];
109 struct tcf_ipt *ipt; 109 struct tcf_ipt *ipt;
110 struct tcf_common *pc; 110 struct tcf_common *pc;
111 struct ipt_entry_target *td, *t; 111 struct xt_entry_target *td, *t;
112 char *tname; 112 char *tname;
113 int ret = 0, err; 113 int ret = 0, err;
114 u32 hook = 0; 114 u32 hook = 0;
@@ -126,7 +126,7 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
126 if (tb[TCA_IPT_TARG] == NULL) 126 if (tb[TCA_IPT_TARG] == NULL)
127 return -EINVAL; 127 return -EINVAL;
128 128
129 td = (struct ipt_entry_target *)nla_data(tb[TCA_IPT_TARG]); 129 td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
130 if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) 130 if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size)
131 return -EINVAL; 131 return -EINVAL;
132 132
@@ -230,7 +230,7 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a,
230 result = TC_ACT_SHOT; 230 result = TC_ACT_SHOT;
231 ipt->tcf_qstats.drops++; 231 ipt->tcf_qstats.drops++;
232 break; 232 break;
233 case IPT_CONTINUE: 233 case XT_CONTINUE:
234 result = TC_ACT_PIPE; 234 result = TC_ACT_PIPE;
235 break; 235 break;
236 default: 236 default:
@@ -249,7 +249,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
249{ 249{
250 unsigned char *b = skb_tail_pointer(skb); 250 unsigned char *b = skb_tail_pointer(skb);
251 struct tcf_ipt *ipt = a->priv; 251 struct tcf_ipt *ipt = a->priv;
252 struct ipt_entry_target *t; 252 struct xt_entry_target *t;
253 struct tcf_t tm; 253 struct tcf_t tm;
254 struct tc_cnt c; 254 struct tc_cnt c;
255 255
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 11f195af2da0..0c311be92827 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -219,15 +219,16 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i
219{ 219{
220 unsigned char *b = skb_tail_pointer(skb); 220 unsigned char *b = skb_tail_pointer(skb);
221 struct tcf_mirred *m = a->priv; 221 struct tcf_mirred *m = a->priv;
222 struct tc_mirred opt; 222 struct tc_mirred opt = {
223 .index = m->tcf_index,
224 .action = m->tcf_action,
225 .refcnt = m->tcf_refcnt - ref,
226 .bindcnt = m->tcf_bindcnt - bind,
227 .eaction = m->tcfm_eaction,
228 .ifindex = m->tcfm_ifindex,
229 };
223 struct tcf_t t; 230 struct tcf_t t;
224 231
225 opt.index = m->tcf_index;
226 opt.action = m->tcf_action;
227 opt.refcnt = m->tcf_refcnt - ref;
228 opt.bindcnt = m->tcf_bindcnt - bind;
229 opt.eaction = m->tcfm_eaction;
230 opt.ifindex = m->tcfm_ifindex;
231 NLA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt); 232 NLA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt);
232 t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install); 233 t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install);
233 t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse); 234 t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse);
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 509a2d53a99d..186eb837e600 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -272,19 +272,19 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
272{ 272{
273 unsigned char *b = skb_tail_pointer(skb); 273 unsigned char *b = skb_tail_pointer(skb);
274 struct tcf_nat *p = a->priv; 274 struct tcf_nat *p = a->priv;
275 struct tc_nat opt; 275 struct tc_nat opt = {
276 .old_addr = p->old_addr,
277 .new_addr = p->new_addr,
278 .mask = p->mask,
279 .flags = p->flags,
280
281 .index = p->tcf_index,
282 .action = p->tcf_action,
283 .refcnt = p->tcf_refcnt - ref,
284 .bindcnt = p->tcf_bindcnt - bind,
285 };
276 struct tcf_t t; 286 struct tcf_t t;
277 287
278 opt.old_addr = p->old_addr;
279 opt.new_addr = p->new_addr;
280 opt.mask = p->mask;
281 opt.flags = p->flags;
282
283 opt.index = p->tcf_index;
284 opt.action = p->tcf_action;
285 opt.refcnt = p->tcf_refcnt - ref;
286 opt.bindcnt = p->tcf_bindcnt - bind;
287
288 NLA_PUT(skb, TCA_NAT_PARMS, sizeof(opt), &opt); 288 NLA_PUT(skb, TCA_NAT_PARMS, sizeof(opt), &opt);
289 t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); 289 t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
290 t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); 290 t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 537a48732e9e..7ebf7439b478 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -350,22 +350,19 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
350{ 350{
351 unsigned char *b = skb_tail_pointer(skb); 351 unsigned char *b = skb_tail_pointer(skb);
352 struct tcf_police *police = a->priv; 352 struct tcf_police *police = a->priv;
353 struct tc_police opt; 353 struct tc_police opt = {
354 354 .index = police->tcf_index,
355 opt.index = police->tcf_index; 355 .action = police->tcf_action,
356 opt.action = police->tcf_action; 356 .mtu = police->tcfp_mtu,
357 opt.mtu = police->tcfp_mtu; 357 .burst = police->tcfp_burst,
358 opt.burst = police->tcfp_burst; 358 .refcnt = police->tcf_refcnt - ref,
359 opt.refcnt = police->tcf_refcnt - ref; 359 .bindcnt = police->tcf_bindcnt - bind,
360 opt.bindcnt = police->tcf_bindcnt - bind; 360 };
361
361 if (police->tcfp_R_tab) 362 if (police->tcfp_R_tab)
362 opt.rate = police->tcfp_R_tab->rate; 363 opt.rate = police->tcfp_R_tab->rate;
363 else
364 memset(&opt.rate, 0, sizeof(opt.rate));
365 if (police->tcfp_P_tab) 364 if (police->tcfp_P_tab)
366 opt.peakrate = police->tcfp_P_tab->rate; 365 opt.peakrate = police->tcfp_P_tab->rate;
367 else
368 memset(&opt.peakrate, 0, sizeof(opt.peakrate));
369 NLA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); 366 NLA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt);
370 if (police->tcfp_result) 367 if (police->tcfp_result)
371 NLA_PUT_U32(skb, TCA_POLICE_RESULT, police->tcfp_result); 368 NLA_PUT_U32(skb, TCA_POLICE_RESULT, police->tcfp_result);
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 4a1d640b0cf1..97e84f3ee775 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -164,13 +164,14 @@ static inline int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
164{ 164{
165 unsigned char *b = skb_tail_pointer(skb); 165 unsigned char *b = skb_tail_pointer(skb);
166 struct tcf_defact *d = a->priv; 166 struct tcf_defact *d = a->priv;
167 struct tc_defact opt; 167 struct tc_defact opt = {
168 .index = d->tcf_index,
169 .refcnt = d->tcf_refcnt - ref,
170 .bindcnt = d->tcf_bindcnt - bind,
171 .action = d->tcf_action,
172 };
168 struct tcf_t t; 173 struct tcf_t t;
169 174
170 opt.index = d->tcf_index;
171 opt.refcnt = d->tcf_refcnt - ref;
172 opt.bindcnt = d->tcf_bindcnt - bind;
173 opt.action = d->tcf_action;
174 NLA_PUT(skb, TCA_DEF_PARMS, sizeof(opt), &opt); 175 NLA_PUT(skb, TCA_DEF_PARMS, sizeof(opt), &opt);
175 NLA_PUT_STRING(skb, TCA_DEF_DATA, d->tcfd_defdata); 176 NLA_PUT_STRING(skb, TCA_DEF_DATA, d->tcfd_defdata);
176 t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); 177 t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index e9607fe55b58..66cbf4eb8855 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -159,13 +159,14 @@ static inline int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
159{ 159{
160 unsigned char *b = skb_tail_pointer(skb); 160 unsigned char *b = skb_tail_pointer(skb);
161 struct tcf_skbedit *d = a->priv; 161 struct tcf_skbedit *d = a->priv;
162 struct tc_skbedit opt; 162 struct tc_skbedit opt = {
163 .index = d->tcf_index,
164 .refcnt = d->tcf_refcnt - ref,
165 .bindcnt = d->tcf_bindcnt - bind,
166 .action = d->tcf_action,
167 };
163 struct tcf_t t; 168 struct tcf_t t;
164 169
165 opt.index = d->tcf_index;
166 opt.refcnt = d->tcf_refcnt - ref;
167 opt.bindcnt = d->tcf_bindcnt - bind;
168 opt.action = d->tcf_action;
169 NLA_PUT(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt); 170 NLA_PUT(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt);
170 if (d->flags & SKBEDIT_F_PRIORITY) 171 if (d->flags & SKBEDIT_F_PRIORITY)
171 NLA_PUT(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority), 172 NLA_PUT(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority),
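
The dump() conversions in act_gact, act_mirred, act_nat, act_police, act_simple and act_skbedit above all share one shape: a block of field-by-field assignments into a stack tc_* options struct is replaced by a C99 designated-initializer declaration. Besides being shorter, the initializer form zero-fills every member that is not named, which is why act_police can also drop its explicit memset() of opt.rate and opt.peakrate. A minimal sketch of the same before/after shape, using a hypothetical struct rather than anything from the patch:

#include <stdint.h>

struct example_opt {
	uint32_t index;
	uint32_t refcnt;
	uint32_t bindcnt;
	uint32_t action;
};

/* Before: declare, then assign; anything never assigned keeps stale stack data. */
static void fill_before(struct example_opt *out, uint32_t index, uint32_t action)
{
	struct example_opt opt;

	opt.index = index;
	opt.action = action;
	opt.refcnt = 0;		/* has to be zeroed (or memset) by hand */
	opt.bindcnt = 0;
	*out = opt;
}

/* After: designated initializers; unnamed members are implicitly zeroed. */
static void fill_after(struct example_opt *out, uint32_t index, uint32_t action)
{
	struct example_opt opt = {
		.index  = index,
		.action = action,
	};

	*out = opt;
}
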
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 78ef2c5e130b..37dff78e9cb1 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -123,7 +123,7 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
123 * calls by looking at the number of nested bh disable calls because 123 * calls by looking at the number of nested bh disable calls because
124 * softirqs always disable bh. 124 * softirqs always disable bh.
125 */ 125 */
126 if (softirq_count() != SOFTIRQ_OFFSET) { 126 if (in_serving_softirq()) {
127 /* If there is an sk_classid we'll use that. */ 127 /* If there is an sk_classid we'll use that. */
128 if (!skb->sk) 128 if (!skb->sk)
129 return -1; 129 return -1;
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index e17096e3913c..5b271a18bc3a 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -111,44 +111,41 @@ static u32 flow_get_proto(struct sk_buff *skb)
111 } 111 }
112} 112}
113 113
114static int has_ports(u8 protocol)
115{
116 switch (protocol) {
117 case IPPROTO_TCP:
118 case IPPROTO_UDP:
119 case IPPROTO_UDPLITE:
120 case IPPROTO_SCTP:
121 case IPPROTO_DCCP:
122 case IPPROTO_ESP:
123 return 1;
124 default:
125 return 0;
126 }
127}
128
129static u32 flow_get_proto_src(struct sk_buff *skb) 114static u32 flow_get_proto_src(struct sk_buff *skb)
130{ 115{
131 switch (skb->protocol) { 116 switch (skb->protocol) {
132 case htons(ETH_P_IP): { 117 case htons(ETH_P_IP): {
133 struct iphdr *iph; 118 struct iphdr *iph;
119 int poff;
134 120
135 if (!pskb_network_may_pull(skb, sizeof(*iph))) 121 if (!pskb_network_may_pull(skb, sizeof(*iph)))
136 break; 122 break;
137 iph = ip_hdr(skb); 123 iph = ip_hdr(skb);
138 if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && 124 if (iph->frag_off & htons(IP_MF|IP_OFFSET))
139 has_ports(iph->protocol) && 125 break;
140 pskb_network_may_pull(skb, iph->ihl * 4 + 2)) 126 poff = proto_ports_offset(iph->protocol);
141 return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4)); 127 if (poff >= 0 &&
128 pskb_network_may_pull(skb, iph->ihl * 4 + 2 + poff)) {
129 iph = ip_hdr(skb);
130 return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 +
131 poff));
132 }
142 break; 133 break;
143 } 134 }
144 case htons(ETH_P_IPV6): { 135 case htons(ETH_P_IPV6): {
145 struct ipv6hdr *iph; 136 struct ipv6hdr *iph;
137 int poff;
146 138
147 if (!pskb_network_may_pull(skb, sizeof(*iph) + 2)) 139 if (!pskb_network_may_pull(skb, sizeof(*iph)))
148 break; 140 break;
149 iph = ipv6_hdr(skb); 141 iph = ipv6_hdr(skb);
150 if (has_ports(iph->nexthdr)) 142 poff = proto_ports_offset(iph->nexthdr);
151 return ntohs(*(__be16 *)&iph[1]); 143 if (poff >= 0 &&
144 pskb_network_may_pull(skb, sizeof(*iph) + poff + 2)) {
145 iph = ipv6_hdr(skb);
146 return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) +
147 poff));
148 }
152 break; 149 break;
153 } 150 }
154 } 151 }
@@ -161,24 +158,36 @@ static u32 flow_get_proto_dst(struct sk_buff *skb)
161 switch (skb->protocol) { 158 switch (skb->protocol) {
162 case htons(ETH_P_IP): { 159 case htons(ETH_P_IP): {
163 struct iphdr *iph; 160 struct iphdr *iph;
161 int poff;
164 162
165 if (!pskb_network_may_pull(skb, sizeof(*iph))) 163 if (!pskb_network_may_pull(skb, sizeof(*iph)))
166 break; 164 break;
167 iph = ip_hdr(skb); 165 iph = ip_hdr(skb);
168 if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && 166 if (iph->frag_off & htons(IP_MF|IP_OFFSET))
169 has_ports(iph->protocol) && 167 break;
170 pskb_network_may_pull(skb, iph->ihl * 4 + 4)) 168 poff = proto_ports_offset(iph->protocol);
171 return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + 2)); 169 if (poff >= 0 &&
170 pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
171 iph = ip_hdr(skb);
172 return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 +
173 2 + poff));
174 }
172 break; 175 break;
173 } 176 }
174 case htons(ETH_P_IPV6): { 177 case htons(ETH_P_IPV6): {
175 struct ipv6hdr *iph; 178 struct ipv6hdr *iph;
179 int poff;
176 180
177 if (!pskb_network_may_pull(skb, sizeof(*iph) + 4)) 181 if (!pskb_network_may_pull(skb, sizeof(*iph)))
178 break; 182 break;
179 iph = ipv6_hdr(skb); 183 iph = ipv6_hdr(skb);
180 if (has_ports(iph->nexthdr)) 184 poff = proto_ports_offset(iph->nexthdr);
181 return ntohs(*(__be16 *)((void *)&iph[1] + 2)); 185 if (poff >= 0 &&
186 pskb_network_may_pull(skb, sizeof(*iph) + poff + 4)) {
187 iph = ipv6_hdr(skb);
188 return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) +
189 poff + 2));
190 }
182 break; 191 break;
183 } 192 }
184 } 193 }
@@ -297,6 +306,11 @@ static u32 flow_get_vlan_tag(const struct sk_buff *skb)
297 return tag & VLAN_VID_MASK; 306 return tag & VLAN_VID_MASK;
298} 307}
299 308
309static u32 flow_get_rxhash(struct sk_buff *skb)
310{
311 return skb_get_rxhash(skb);
312}
313
300static u32 flow_key_get(struct sk_buff *skb, int key) 314static u32 flow_key_get(struct sk_buff *skb, int key)
301{ 315{
302 switch (key) { 316 switch (key) {
@@ -334,6 +348,8 @@ static u32 flow_key_get(struct sk_buff *skb, int key)
334 return flow_get_skgid(skb); 348 return flow_get_skgid(skb);
335 case FLOW_KEY_VLAN_TAG: 349 case FLOW_KEY_VLAN_TAG:
336 return flow_get_vlan_tag(skb); 350 return flow_get_vlan_tag(skb);
351 case FLOW_KEY_RXHASH:
352 return flow_get_rxhash(skb);
337 default: 353 default:
338 WARN_ON(1); 354 WARN_ON(1);
339 return 0; 355 return 0;
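
cls_flow above (and sch_sfq further down) drop their open-coded protocol switches in favour of proto_ports_offset(), adding the returned offset when pulling the 16-bit port words that follow the IPv4/IPv6 header. A hedged sketch of what such a helper amounts to for exactly the protocols the removed has_ports() listed; the name and return values here are illustrative assumptions, not the kernel's definition:

#include <stdint.h>

/* Offset of the port pair (or SPI for ESP) inside the transport header,
 * or -1 for protocols that carry no such field. */
static int ports_offset_sketch(uint8_t protocol)
{
	switch (protocol) {
	case 6:			/* IPPROTO_TCP */
	case 17:		/* IPPROTO_UDP */
	case 136:		/* IPPROTO_UDPLITE */
	case 132:		/* IPPROTO_SCTP */
	case 33:		/* IPPROTO_DCCP */
	case 50:		/* IPPROTO_ESP */
		return 0;
	default:
		return -1;
	}
}
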
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 7416a5c73b2a..b0c2a82178af 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -137,7 +137,7 @@ next_knode:
137 int toff = off + key->off + (off2 & key->offmask); 137 int toff = off + key->off + (off2 & key->offmask);
138 __be32 *data, _data; 138 __be32 *data, _data;
139 139
140 if (skb_headroom(skb) + toff < 0) 140 if (skb_headroom(skb) + toff > INT_MAX)
141 goto out; 141 goto out;
142 142
143 data = skb_header_pointer(skb, toff, 4, &_data); 143 data = skb_header_pointer(skb, toff, 4, &_data);
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 3bcac8aa333c..34da5e29ea1a 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -223,6 +223,11 @@ META_COLLECTOR(int_maclen)
223 dst->value = skb->mac_len; 223 dst->value = skb->mac_len;
224} 224}
225 225
226META_COLLECTOR(int_rxhash)
227{
228 dst->value = skb_get_rxhash(skb);
229}
230
226/************************************************************************** 231/**************************************************************************
227 * Netfilter 232 * Netfilter
228 **************************************************************************/ 233 **************************************************************************/
@@ -541,6 +546,7 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
541 [META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off), 546 [META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off),
542 [META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend), 547 [META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend),
543 [META_ID(VLAN_TAG)] = META_FUNC(int_vlan_tag), 548 [META_ID(VLAN_TAG)] = META_FUNC(int_vlan_tag),
549 [META_ID(RXHASH)] = META_FUNC(int_rxhash),
544 } 550 }
545}; 551};
546 552
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 408eea7086aa..b22ca2d1cebc 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -240,7 +240,10 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
240 if (q) 240 if (q)
241 goto out; 241 goto out;
242 242
243 q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle); 243 if (dev_ingress_queue(dev))
244 q = qdisc_match_from_root(
245 dev_ingress_queue(dev)->qdisc_sleeping,
246 handle);
244out: 247out:
245 return q; 248 return q;
246} 249}
@@ -360,7 +363,7 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
360 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16); 363 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
361 } 364 }
362 365
363 if (!s || tsize != s->tsize || (!tab && tsize > 0)) 366 if (tsize != s->tsize || (!tab && tsize > 0))
364 return ERR_PTR(-EINVAL); 367 return ERR_PTR(-EINVAL);
365 368
366 spin_lock(&qdisc_stab_lock); 369 spin_lock(&qdisc_stab_lock);
@@ -690,6 +693,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
690 (new && new->flags & TCQ_F_INGRESS)) { 693 (new && new->flags & TCQ_F_INGRESS)) {
691 num_q = 1; 694 num_q = 1;
692 ingress = 1; 695 ingress = 1;
696 if (!dev_ingress_queue(dev))
697 return -ENOENT;
693 } 698 }
694 699
695 if (dev->flags & IFF_UP) 700 if (dev->flags & IFF_UP)
@@ -701,7 +706,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
701 } 706 }
702 707
703 for (i = 0; i < num_q; i++) { 708 for (i = 0; i < num_q; i++) {
704 struct netdev_queue *dev_queue = &dev->rx_queue; 709 struct netdev_queue *dev_queue = dev_ingress_queue(dev);
705 710
706 if (!ingress) 711 if (!ingress)
707 dev_queue = netdev_get_tx_queue(dev, i); 712 dev_queue = netdev_get_tx_queue(dev, i);
@@ -979,7 +984,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
979 return -ENOENT; 984 return -ENOENT;
980 q = qdisc_leaf(p, clid); 985 q = qdisc_leaf(p, clid);
981 } else { /* ingress */ 986 } else { /* ingress */
982 q = dev->rx_queue.qdisc_sleeping; 987 if (dev_ingress_queue(dev))
988 q = dev_ingress_queue(dev)->qdisc_sleeping;
983 } 989 }
984 } else { 990 } else {
985 q = dev->qdisc; 991 q = dev->qdisc;
@@ -1043,8 +1049,9 @@ replay:
1043 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) 1049 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
1044 return -ENOENT; 1050 return -ENOENT;
1045 q = qdisc_leaf(p, clid); 1051 q = qdisc_leaf(p, clid);
1046 } else { /*ingress */ 1052 } else { /* ingress */
1047 q = dev->rx_queue.qdisc_sleeping; 1053 if (dev_ingress_queue_create(dev))
1054 q = dev_ingress_queue(dev)->qdisc_sleeping;
1048 } 1055 }
1049 } else { 1056 } else {
1050 q = dev->qdisc; 1057 q = dev->qdisc;
@@ -1123,11 +1130,14 @@ replay:
1123create_n_graft: 1130create_n_graft:
1124 if (!(n->nlmsg_flags&NLM_F_CREATE)) 1131 if (!(n->nlmsg_flags&NLM_F_CREATE))
1125 return -ENOENT; 1132 return -ENOENT;
1126 if (clid == TC_H_INGRESS) 1133 if (clid == TC_H_INGRESS) {
1127 q = qdisc_create(dev, &dev->rx_queue, p, 1134 if (dev_ingress_queue(dev))
1128 tcm->tcm_parent, tcm->tcm_parent, 1135 q = qdisc_create(dev, dev_ingress_queue(dev), p,
1129 tca, &err); 1136 tcm->tcm_parent, tcm->tcm_parent,
1130 else { 1137 tca, &err);
1138 else
1139 err = -ENOENT;
1140 } else {
1131 struct netdev_queue *dev_queue; 1141 struct netdev_queue *dev_queue;
1132 1142
1133 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue) 1143 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
@@ -1304,8 +1314,10 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1304 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0) 1314 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1305 goto done; 1315 goto done;
1306 1316
1307 dev_queue = &dev->rx_queue; 1317 dev_queue = dev_ingress_queue(dev);
1308 if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0) 1318 if (dev_queue &&
1319 tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1320 &q_idx, s_q_idx) < 0)
1309 goto done; 1321 goto done;
1310 1322
1311cont: 1323cont:
@@ -1595,8 +1607,10 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1595 if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0) 1607 if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1596 goto done; 1608 goto done;
1597 1609
1598 dev_queue = &dev->rx_queue; 1610 dev_queue = dev_ingress_queue(dev);
1599 if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0) 1611 if (dev_queue &&
1612 tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1613 &t, s_t) < 0)
1600 goto done; 1614 goto done;
1601 1615
1602done: 1616done:
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 340662789529..282540778aa8 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -255,10 +255,6 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
255 error = -EINVAL; 255 error = -EINVAL;
256 goto err_out; 256 goto err_out;
257 } 257 }
258 if (!list_empty(&flow->list)) {
259 error = -EEXIST;
260 goto err_out;
261 }
262 } else { 258 } else {
263 int i; 259 int i;
264 unsigned long cl; 260 unsigned long cl;
@@ -279,8 +275,7 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
279 goto err_out; 275 goto err_out;
280 } 276 }
281 flow->filter_list = NULL; 277 flow->filter_list = NULL;
282 flow->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 278 flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
283 &pfifo_qdisc_ops, classid);
284 if (!flow->q) 279 if (!flow->q)
285 flow->q = &noop_qdisc; 280 flow->q = &noop_qdisc;
286 pr_debug("atm_tc_change: qdisc %p\n", flow->q); 281 pr_debug("atm_tc_change: qdisc %p\n", flow->q);
@@ -547,7 +542,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt)
547 INIT_LIST_HEAD(&p->flows); 542 INIT_LIST_HEAD(&p->flows);
548 INIT_LIST_HEAD(&p->link.list); 543 INIT_LIST_HEAD(&p->link.list);
549 list_add(&p->link.list, &p->flows); 544 list_add(&p->link.list, &p->flows);
550 p->link.q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 545 p->link.q = qdisc_create_dflt(sch->dev_queue,
551 &pfifo_qdisc_ops, sch->handle); 546 &pfifo_qdisc_ops, sch->handle);
552 if (!p->link.q) 547 if (!p->link.q)
553 p->link.q = &noop_qdisc; 548 p->link.q = &noop_qdisc;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 28c01ef5abc8..eb7631590865 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1379,9 +1379,9 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1379 q->link.sibling = &q->link; 1379 q->link.sibling = &q->link;
1380 q->link.common.classid = sch->handle; 1380 q->link.common.classid = sch->handle;
1381 q->link.qdisc = sch; 1381 q->link.qdisc = sch;
1382 if (!(q->link.q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 1382 q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1383 &pfifo_qdisc_ops, 1383 sch->handle);
1384 sch->handle))) 1384 if (!q->link.q)
1385 q->link.q = &noop_qdisc; 1385 q->link.q = &noop_qdisc;
1386 1386
1387 q->link.priority = TC_CBQ_MAXPRIO-1; 1387 q->link.priority = TC_CBQ_MAXPRIO-1;
@@ -1623,7 +1623,7 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1623 struct cbq_class *cl = (struct cbq_class*)arg; 1623 struct cbq_class *cl = (struct cbq_class*)arg;
1624 1624
1625 if (new == NULL) { 1625 if (new == NULL) {
1626 new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 1626 new = qdisc_create_dflt(sch->dev_queue,
1627 &pfifo_qdisc_ops, cl->common.classid); 1627 &pfifo_qdisc_ops, cl->common.classid);
1628 if (new == NULL) 1628 if (new == NULL)
1629 return -ENOBUFS; 1629 return -ENOBUFS;
@@ -1874,8 +1874,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
1874 cl->R_tab = rtab; 1874 cl->R_tab = rtab;
1875 rtab = NULL; 1875 rtab = NULL;
1876 cl->refcnt = 1; 1876 cl->refcnt = 1;
1877 if (!(cl->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 1877 cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
1878 &pfifo_qdisc_ops, classid))) 1878 if (!cl->q)
1879 cl->q = &noop_qdisc; 1879 cl->q = &noop_qdisc;
1880 cl->common.classid = classid; 1880 cl->common.classid = classid;
1881 cl->tparent = parent; 1881 cl->tparent = parent;
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index b74046a95397..aa8b5313f8cf 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -110,7 +110,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
110 cl->refcnt = 1; 110 cl->refcnt = 1;
111 cl->common.classid = classid; 111 cl->common.classid = classid;
112 cl->quantum = quantum; 112 cl->quantum = quantum;
113 cl->qdisc = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 113 cl->qdisc = qdisc_create_dflt(sch->dev_queue,
114 &pfifo_qdisc_ops, classid); 114 &pfifo_qdisc_ops, classid);
115 if (cl->qdisc == NULL) 115 if (cl->qdisc == NULL)
116 cl->qdisc = &noop_qdisc; 116 cl->qdisc = &noop_qdisc;
@@ -218,7 +218,7 @@ static int drr_graft_class(struct Qdisc *sch, unsigned long arg,
218 struct drr_class *cl = (struct drr_class *)arg; 218 struct drr_class *cl = (struct drr_class *)arg;
219 219
220 if (new == NULL) { 220 if (new == NULL) {
221 new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 221 new = qdisc_create_dflt(sch->dev_queue,
222 &pfifo_qdisc_ops, cl->common.classid); 222 &pfifo_qdisc_ops, cl->common.classid);
223 if (new == NULL) 223 if (new == NULL)
224 new = &noop_qdisc; 224 new = &noop_qdisc;
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 63d41f86679c..1d295d62bb5c 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -61,8 +61,7 @@ static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
61 sch, p, new, old); 61 sch, p, new, old);
62 62
63 if (new == NULL) { 63 if (new == NULL) {
64 new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 64 new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
65 &pfifo_qdisc_ops,
66 sch->handle); 65 sch->handle);
67 if (new == NULL) 66 if (new == NULL)
68 new = &noop_qdisc; 67 new = &noop_qdisc;
@@ -384,8 +383,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)
384 p->default_index = default_index; 383 p->default_index = default_index;
385 p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]); 384 p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]);
386 385
387 p->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 386 p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle);
388 &pfifo_qdisc_ops, sch->handle);
389 if (p->q == NULL) 387 if (p->q == NULL)
390 p->q = &noop_qdisc; 388 p->q = &noop_qdisc;
391 389
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 5948bafa8ce2..4dfecb0cba37 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -172,8 +172,7 @@ struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops,
172 struct Qdisc *q; 172 struct Qdisc *q;
173 int err = -ENOMEM; 173 int err = -ENOMEM;
174 174
175 q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 175 q = qdisc_create_dflt(sch->dev_queue, ops, TC_H_MAKE(sch->handle, 1));
176 ops, TC_H_MAKE(sch->handle, 1));
177 if (q) { 176 if (q) {
178 err = fifo_set_limit(q, limit); 177 err = fifo_set_limit(q, limit);
179 if (err < 0) { 178 if (err < 0) {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 2aeb3a4386a1..5dbb3cd96e59 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -383,6 +383,7 @@ struct Qdisc noop_qdisc = {
383 .list = LIST_HEAD_INIT(noop_qdisc.list), 383 .list = LIST_HEAD_INIT(noop_qdisc.list),
384 .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), 384 .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
385 .dev_queue = &noop_netdev_queue, 385 .dev_queue = &noop_netdev_queue,
386 .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
386}; 387};
387EXPORT_SYMBOL(noop_qdisc); 388EXPORT_SYMBOL(noop_qdisc);
388 389
@@ -409,6 +410,7 @@ static struct Qdisc noqueue_qdisc = {
409 .list = LIST_HEAD_INIT(noqueue_qdisc.list), 410 .list = LIST_HEAD_INIT(noqueue_qdisc.list),
410 .q.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock), 411 .q.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
411 .dev_queue = &noqueue_netdev_queue, 412 .dev_queue = &noqueue_netdev_queue,
413 .busylock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.busylock),
412}; 414};
413 415
414 416
@@ -574,10 +576,8 @@ errout:
574 return ERR_PTR(err); 576 return ERR_PTR(err);
575} 577}
576 578
577struct Qdisc * qdisc_create_dflt(struct net_device *dev, 579struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
578 struct netdev_queue *dev_queue, 580 struct Qdisc_ops *ops, unsigned int parentid)
579 struct Qdisc_ops *ops,
580 unsigned int parentid)
581{ 581{
582 struct Qdisc *sch; 582 struct Qdisc *sch;
583 583
@@ -682,7 +682,7 @@ static void attach_one_default_qdisc(struct net_device *dev,
682 struct Qdisc *qdisc; 682 struct Qdisc *qdisc;
683 683
684 if (dev->tx_queue_len) { 684 if (dev->tx_queue_len) {
685 qdisc = qdisc_create_dflt(dev, dev_queue, 685 qdisc = qdisc_create_dflt(dev_queue,
686 &pfifo_fast_ops, TC_H_ROOT); 686 &pfifo_fast_ops, TC_H_ROOT);
687 if (!qdisc) { 687 if (!qdisc) {
688 printk(KERN_INFO "%s: activation failed\n", dev->name); 688 printk(KERN_INFO "%s: activation failed\n", dev->name);
@@ -709,7 +709,7 @@ static void attach_default_qdiscs(struct net_device *dev)
709 dev->qdisc = txq->qdisc_sleeping; 709 dev->qdisc = txq->qdisc_sleeping;
710 atomic_inc(&dev->qdisc->refcnt); 710 atomic_inc(&dev->qdisc->refcnt);
711 } else { 711 } else {
712 qdisc = qdisc_create_dflt(dev, txq, &mq_qdisc_ops, TC_H_ROOT); 712 qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
713 if (qdisc) { 713 if (qdisc) {
714 qdisc->ops->attach(qdisc); 714 qdisc->ops->attach(qdisc);
715 dev->qdisc = qdisc; 715 dev->qdisc = qdisc;
@@ -753,7 +753,8 @@ void dev_activate(struct net_device *dev)
753 753
754 need_watchdog = 0; 754 need_watchdog = 0;
755 netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog); 755 netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
756 transition_one_qdisc(dev, &dev->rx_queue, NULL); 756 if (dev_ingress_queue(dev))
757 transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
757 758
758 if (need_watchdog) { 759 if (need_watchdog) {
759 dev->trans_start = jiffies; 760 dev->trans_start = jiffies;
@@ -812,7 +813,8 @@ static bool some_qdisc_is_busy(struct net_device *dev)
812void dev_deactivate(struct net_device *dev) 813void dev_deactivate(struct net_device *dev)
813{ 814{
814 netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc); 815 netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc);
815 dev_deactivate_queue(dev, &dev->rx_queue, &noop_qdisc); 816 if (dev_ingress_queue(dev))
817 dev_deactivate_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
816 818
817 dev_watchdog_down(dev); 819 dev_watchdog_down(dev);
818 820
@@ -838,7 +840,8 @@ void dev_init_scheduler(struct net_device *dev)
838{ 840{
839 dev->qdisc = &noop_qdisc; 841 dev->qdisc = &noop_qdisc;
840 netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc); 842 netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
841 dev_init_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc); 843 if (dev_ingress_queue(dev))
844 dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
842 845
843 setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev); 846 setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
844} 847}
@@ -861,7 +864,8 @@ static void shutdown_scheduler_queue(struct net_device *dev,
861void dev_shutdown(struct net_device *dev) 864void dev_shutdown(struct net_device *dev)
862{ 865{
863 netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); 866 netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
864 shutdown_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc); 867 if (dev_ingress_queue(dev))
868 shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
865 qdisc_destroy(dev->qdisc); 869 qdisc_destroy(dev->qdisc);
866 dev->qdisc = &noop_qdisc; 870 dev->qdisc = &noop_qdisc;
867 871
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index abd904be4287..069c62b7bb36 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -761,8 +761,8 @@ init_vf(struct hfsc_class *cl, unsigned int len)
761 if (f != cl->cl_f) { 761 if (f != cl->cl_f) {
762 cl->cl_f = f; 762 cl->cl_f = f;
763 cftree_update(cl); 763 cftree_update(cl);
764 update_cfmin(cl->cl_parent);
765 } 764 }
765 update_cfmin(cl->cl_parent);
766 } 766 }
767} 767}
768 768
@@ -1088,7 +1088,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
1088 cl->refcnt = 1; 1088 cl->refcnt = 1;
1089 cl->sched = q; 1089 cl->sched = q;
1090 cl->cl_parent = parent; 1090 cl->cl_parent = parent;
1091 cl->qdisc = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 1091 cl->qdisc = qdisc_create_dflt(sch->dev_queue,
1092 &pfifo_qdisc_ops, classid); 1092 &pfifo_qdisc_ops, classid);
1093 if (cl->qdisc == NULL) 1093 if (cl->qdisc == NULL)
1094 cl->qdisc = &noop_qdisc; 1094 cl->qdisc = &noop_qdisc;
@@ -1209,8 +1209,7 @@ hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1209 if (cl->level > 0) 1209 if (cl->level > 0)
1210 return -EINVAL; 1210 return -EINVAL;
1211 if (new == NULL) { 1211 if (new == NULL) {
1212 new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 1212 new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1213 &pfifo_qdisc_ops,
1214 cl->cl_common.classid); 1213 cl->cl_common.classid);
1215 if (new == NULL) 1214 if (new == NULL)
1216 new = &noop_qdisc; 1215 new = &noop_qdisc;
@@ -1452,8 +1451,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
1452 q->root.cl_common.classid = sch->handle; 1451 q->root.cl_common.classid = sch->handle;
1453 q->root.refcnt = 1; 1452 q->root.refcnt = 1;
1454 q->root.sched = q; 1453 q->root.sched = q;
1455 q->root.qdisc = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 1454 q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1456 &pfifo_qdisc_ops,
1457 sch->handle); 1455 sch->handle);
1458 if (q->root.qdisc == NULL) 1456 if (q->root.qdisc == NULL)
1459 q->root.qdisc = &noop_qdisc; 1457 q->root.qdisc = &noop_qdisc;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 4be8d04b262d..01b519d6c52d 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1121,8 +1121,7 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1121 if (cl->level) 1121 if (cl->level)
1122 return -EINVAL; 1122 return -EINVAL;
1123 if (new == NULL && 1123 if (new == NULL &&
1124 (new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 1124 (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1125 &pfifo_qdisc_ops,
1126 cl->common.classid)) == NULL) 1125 cl->common.classid)) == NULL)
1127 return -ENOBUFS; 1126 return -ENOBUFS;
1128 1127
@@ -1247,8 +1246,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
1247 return -EBUSY; 1246 return -EBUSY;
1248 1247
1249 if (!cl->level && htb_parent_last_child(cl)) { 1248 if (!cl->level && htb_parent_last_child(cl)) {
1250 new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 1249 new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1251 &pfifo_qdisc_ops,
1252 cl->parent->common.classid); 1250 cl->parent->common.classid);
1253 last_child = 1; 1251 last_child = 1;
1254 } 1252 }
@@ -1302,14 +1300,14 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
1302 struct htb_class *cl = (struct htb_class *)*arg, *parent; 1300 struct htb_class *cl = (struct htb_class *)*arg, *parent;
1303 struct nlattr *opt = tca[TCA_OPTIONS]; 1301 struct nlattr *opt = tca[TCA_OPTIONS];
1304 struct qdisc_rate_table *rtab = NULL, *ctab = NULL; 1302 struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
1305 struct nlattr *tb[TCA_HTB_RTAB + 1]; 1303 struct nlattr *tb[__TCA_HTB_MAX];
1306 struct tc_htb_opt *hopt; 1304 struct tc_htb_opt *hopt;
1307 1305
1308 /* extract all subattrs from opt attr */ 1306 /* extract all subattrs from opt attr */
1309 if (!opt) 1307 if (!opt)
1310 goto failure; 1308 goto failure;
1311 1309
1312 err = nla_parse_nested(tb, TCA_HTB_RTAB, opt, htb_policy); 1310 err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy);
1313 if (err < 0) 1311 if (err < 0)
1314 goto failure; 1312 goto failure;
1315 1313
@@ -1377,7 +1375,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
1377 /* create leaf qdisc early because it uses kmalloc(GFP_KERNEL) 1375 /* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1378 so that can't be used inside of sch_tree_lock 1376 so that can't be used inside of sch_tree_lock
1379 -- thanks to Karlis Peisenieks */ 1377 -- thanks to Karlis Peisenieks */
1380 new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 1378 new_q = qdisc_create_dflt(sch->dev_queue,
1381 &pfifo_qdisc_ops, classid); 1379 &pfifo_qdisc_ops, classid);
1382 sch_tree_lock(sch); 1380 sch_tree_lock(sch);
1383 if (parent && !parent->level) { 1381 if (parent && !parent->level) {
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index fe91e50f9d98..ecc302f4d2a1 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -56,7 +56,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
56 56
57 for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { 57 for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
58 dev_queue = netdev_get_tx_queue(dev, ntx); 58 dev_queue = netdev_get_tx_queue(dev, ntx);
59 qdisc = qdisc_create_dflt(dev, dev_queue, &pfifo_fast_ops, 59 qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops,
60 TC_H_MAKE(TC_H_MAJ(sch->handle), 60 TC_H_MAKE(TC_H_MAJ(sch->handle),
61 TC_H_MIN(ntx + 1))); 61 TC_H_MIN(ntx + 1)));
62 if (qdisc == NULL) 62 if (qdisc == NULL)
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 6ae251279fc2..32690deab5d0 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -227,8 +227,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
227 for (i = 0; i < q->bands; i++) { 227 for (i = 0; i < q->bands; i++) {
228 if (q->queues[i] == &noop_qdisc) { 228 if (q->queues[i] == &noop_qdisc) {
229 struct Qdisc *child, *old; 229 struct Qdisc *child, *old;
230 child = qdisc_create_dflt(qdisc_dev(sch), 230 child = qdisc_create_dflt(sch->dev_queue,
231 sch->dev_queue,
232 &pfifo_qdisc_ops, 231 &pfifo_qdisc_ops,
233 TC_H_MAKE(sch->handle, 232 TC_H_MAKE(sch->handle,
234 i + 1)); 233 i + 1));
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 4714ff162bbd..e5593c083a78 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -538,8 +538,7 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt)
538 538
539 qdisc_watchdog_init(&q->watchdog, sch); 539 qdisc_watchdog_init(&q->watchdog, sch);
540 540
541 q->qdisc = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 541 q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops,
542 &tfifo_qdisc_ops,
543 TC_H_MAKE(sch->handle, 1)); 542 TC_H_MAKE(sch->handle, 1));
544 if (!q->qdisc) { 543 if (!q->qdisc) {
545 pr_debug("netem: qdisc create failed\n"); 544 pr_debug("netem: qdisc create failed\n");
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 0748fb1e3a49..b1c95bce33ce 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -200,7 +200,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
200 for (i=0; i<q->bands; i++) { 200 for (i=0; i<q->bands; i++) {
201 if (q->queues[i] == &noop_qdisc) { 201 if (q->queues[i] == &noop_qdisc) {
202 struct Qdisc *child, *old; 202 struct Qdisc *child, *old;
203 child = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 203 child = qdisc_create_dflt(sch->dev_queue,
204 &pfifo_qdisc_ops, 204 &pfifo_qdisc_ops,
205 TC_H_MAKE(sch->handle, i + 1)); 205 TC_H_MAKE(sch->handle, i + 1));
206 if (child) { 206 if (child) {
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 201cbac2b32c..3cf478d012dd 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -123,40 +123,39 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
123 case htons(ETH_P_IP): 123 case htons(ETH_P_IP):
124 { 124 {
125 const struct iphdr *iph; 125 const struct iphdr *iph;
126 int poff;
126 127
127 if (!pskb_network_may_pull(skb, sizeof(*iph))) 128 if (!pskb_network_may_pull(skb, sizeof(*iph)))
128 goto err; 129 goto err;
129 iph = ip_hdr(skb); 130 iph = ip_hdr(skb);
130 h = (__force u32)iph->daddr; 131 h = (__force u32)iph->daddr;
131 h2 = (__force u32)iph->saddr ^ iph->protocol; 132 h2 = (__force u32)iph->saddr ^ iph->protocol;
132 if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && 133 if (iph->frag_off & htons(IP_MF|IP_OFFSET))
133 (iph->protocol == IPPROTO_TCP || 134 break;
134 iph->protocol == IPPROTO_UDP || 135 poff = proto_ports_offset(iph->protocol);
135 iph->protocol == IPPROTO_UDPLITE || 136 if (poff >= 0 &&
136 iph->protocol == IPPROTO_SCTP || 137 pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
137 iph->protocol == IPPROTO_DCCP || 138 iph = ip_hdr(skb);
138 iph->protocol == IPPROTO_ESP) && 139 h2 ^= *(u32*)((void *)iph + iph->ihl * 4 + poff);
139 pskb_network_may_pull(skb, iph->ihl * 4 + 4)) 140 }
140 h2 ^= *(((u32*)iph) + iph->ihl);
141 break; 141 break;
142 } 142 }
143 case htons(ETH_P_IPV6): 143 case htons(ETH_P_IPV6):
144 { 144 {
145 struct ipv6hdr *iph; 145 struct ipv6hdr *iph;
146 int poff;
146 147
147 if (!pskb_network_may_pull(skb, sizeof(*iph))) 148 if (!pskb_network_may_pull(skb, sizeof(*iph)))
148 goto err; 149 goto err;
149 iph = ipv6_hdr(skb); 150 iph = ipv6_hdr(skb);
150 h = (__force u32)iph->daddr.s6_addr32[3]; 151 h = (__force u32)iph->daddr.s6_addr32[3];
151 h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr; 152 h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr;
152 if ((iph->nexthdr == IPPROTO_TCP || 153 poff = proto_ports_offset(iph->nexthdr);
153 iph->nexthdr == IPPROTO_UDP || 154 if (poff >= 0 &&
154 iph->nexthdr == IPPROTO_UDPLITE || 155 pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) {
155 iph->nexthdr == IPPROTO_SCTP || 156 iph = ipv6_hdr(skb);
156 iph->nexthdr == IPPROTO_DCCP || 157 h2 ^= *(u32*)((void *)iph + sizeof(*iph) + poff);
157 iph->nexthdr == IPPROTO_ESP) && 158 }
158 pskb_network_may_pull(skb, sizeof(*iph) + 4))
159 h2 ^= *(u32*)&iph[1];
160 break; 159 break;
161 } 160 }
162 default: 161 default:
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index feaabc103ce6..401af9596709 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -241,11 +241,11 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *
241 } 241 }
242 if (neigh_event_send(n, skb_res) == 0) { 242 if (neigh_event_send(n, skb_res) == 0) {
243 int err; 243 int err;
244 char haddr[MAX_ADDR_LEN];
244 245
245 read_lock(&n->lock); 246 neigh_ha_snapshot(haddr, n, dev);
246 err = dev_hard_header(skb, dev, ntohs(skb->protocol), 247 err = dev_hard_header(skb, dev, ntohs(skb->protocol), haddr,
247 n->ha, NULL, skb->len); 248 NULL, skb->len);
248 read_unlock(&n->lock);
249 249
250 if (err < 0) { 250 if (err < 0) {
251 neigh_release(n); 251 neigh_release(n);
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 0b85e5256434..5f1fb8bd862d 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -48,6 +48,8 @@
48 * be incorporated into the next SCTP release. 48 * be incorporated into the next SCTP release.
49 */ 49 */
50 50
51#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
52
51#include <linux/types.h> 53#include <linux/types.h>
52#include <linux/fcntl.h> 54#include <linux/fcntl.h>
53#include <linux/poll.h> 55#include <linux/poll.h>
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 86366390038a..ddbbf7c81fa1 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -543,16 +543,20 @@ struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc)
543 id = ntohs(hmacs->hmac_ids[i]); 543 id = ntohs(hmacs->hmac_ids[i]);
544 544
545 /* Check the id is in the supported range */ 545 /* Check the id is in the supported range */
546 if (id > SCTP_AUTH_HMAC_ID_MAX) 546 if (id > SCTP_AUTH_HMAC_ID_MAX) {
547 id = 0;
547 continue; 548 continue;
549 }
548 550
549 /* See is we support the id. Supported IDs have name and 551 /* See is we support the id. Supported IDs have name and
550 * length fields set, so that we can allocated and use 552 * length fields set, so that we can allocated and use
551 * them. We can safely just check for name, for without the 553 * them. We can safely just check for name, for without the
552 * name, we can't allocate the TFM. 554 * name, we can't allocate the TFM.
553 */ 555 */
554 if (!sctp_hmac_list[id].hmac_name) 556 if (!sctp_hmac_list[id].hmac_name) {
557 id = 0;
555 continue; 558 continue;
559 }
556 560
557 break; 561 break;
558 } 562 }
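
The auth.c change only makes sense together with the code that follows the loop, which is not part of the hunk; the sketch below is therefore an assumption about the surrounding function. Whatever value is left in id when the loop finishes is used to index sctp_hmac_list[], so the two skip paths must reset it to 0, otherwise falling off the end of the list could leave an unvalidated, possibly out-of-range index behind:

        /* Assumed shape of the code immediately after the loop above: */
        if (id == 0)
                return NULL;                    /* no usable HMAC was negotiated */
        return &sctp_hmac_list[id];             /* must only ever be a validated id */
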
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 476caaf100ed..6c8556459a75 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -37,6 +37,8 @@
37 * be incorporated into the next SCTP release. 37 * be incorporated into the next SCTP release.
38 */ 38 */
39 39
40#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
41
40#include <linux/types.h> 42#include <linux/types.h>
41#include <linux/kernel.h> 43#include <linux/kernel.h>
42#include <linux/net.h> 44#include <linux/net.h>
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index ccb6dc48d15b..397296fb156f 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -43,6 +43,8 @@
43 * be incorporated into the next SCTP release. 43 * be incorporated into the next SCTP release.
44 */ 44 */
45 45
46#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
47
46#include <net/sctp/sctp.h> 48#include <net/sctp/sctp.h>
47#include <net/sctp/sm.h> 49#include <net/sctp/sm.h>
48#include <linux/interrupt.h> 50#include <linux/interrupt.h>
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 732689140fb8..95e0c8eda1a0 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -47,6 +47,8 @@
47 * be incorporated into the next SCTP release. 47 * be incorporated into the next SCTP release.
48 */ 48 */
49 49
50#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
51
50#include <linux/module.h> 52#include <linux/module.h>
51#include <linux/errno.h> 53#include <linux/errno.h>
52#include <linux/types.h> 54#include <linux/types.h>
@@ -336,7 +338,7 @@ static void sctp_v6_get_saddr(struct sctp_sock *sk,
336 memcpy(saddr, baddr, sizeof(union sctp_addr)); 338 memcpy(saddr, baddr, sizeof(union sctp_addr));
337 SCTP_DEBUG_PRINTK("saddr: %pI6\n", &saddr->v6.sin6_addr); 339 SCTP_DEBUG_PRINTK("saddr: %pI6\n", &saddr->v6.sin6_addr);
338 } else { 340 } else {
339 printk(KERN_ERR "%s: asoc:%p Could not find a valid source " 341 pr_err("%s: asoc:%p Could not find a valid source "
340 "address for the dest:%pI6\n", 342 "address for the dest:%pI6\n",
341 __func__, asoc, &daddr->v6.sin6_addr); 343 __func__, asoc, &daddr->v6.sin6_addr);
342 } 344 }
diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
index f73ec0ea93ba..8ef8e7d9eb61 100644
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -38,6 +38,8 @@
38 * be incorporated into the next SCTP release. 38 * be incorporated into the next SCTP release.
39 */ 39 */
40 40
41#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
42
41#include <linux/kernel.h> 43#include <linux/kernel.h>
42#include <net/sctp/sctp.h> 44#include <net/sctp/sctp.h>
43 45
@@ -134,8 +136,7 @@ void sctp_dbg_objcnt_init(void)
134 ent = proc_create("sctp_dbg_objcnt", 0, 136 ent = proc_create("sctp_dbg_objcnt", 0,
135 proc_net_sctp, &sctp_objcnt_ops); 137 proc_net_sctp, &sctp_objcnt_ops);
136 if (!ent) 138 if (!ent)
137 printk(KERN_WARNING 139 pr_warn("sctp_dbg_objcnt: Unable to create /proc entry.\n");
138 "sctp_dbg_objcnt: Unable to create /proc entry.\n");
139} 140}
140 141
141/* Cleanup the objcount entry in the proc filesystem. */ 142/* Cleanup the objcount entry in the proc filesystem. */
diff --git a/net/sctp/output.c b/net/sctp/output.c
index a646681f5acd..60600d337a3a 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -41,6 +41,8 @@
41 * be incorporated into the next SCTP release. 41 * be incorporated into the next SCTP release.
42 */ 42 */
43 43
44#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
45
44#include <linux/types.h> 46#include <linux/types.h>
45#include <linux/kernel.h> 47#include <linux/kernel.h>
46#include <linux/wait.h> 48#include <linux/wait.h>
@@ -92,7 +94,6 @@ struct sctp_packet *sctp_packet_config(struct sctp_packet *packet,
92 SCTP_DEBUG_PRINTK("%s: packet:%p vtag:0x%x\n", __func__, 94 SCTP_DEBUG_PRINTK("%s: packet:%p vtag:0x%x\n", __func__,
93 packet, vtag); 95 packet, vtag);
94 96
95 sctp_packet_reset(packet);
96 packet->vtag = vtag; 97 packet->vtag = vtag;
97 98
98 if (ecn_capable && sctp_packet_empty(packet)) { 99 if (ecn_capable && sctp_packet_empty(packet)) {
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index c04b2eb59186..8c6d379b4bb6 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -46,6 +46,8 @@
46 * be incorporated into the next SCTP release. 46 * be incorporated into the next SCTP release.
47 */ 47 */
48 48
49#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
50
49#include <linux/types.h> 51#include <linux/types.h>
50#include <linux/list.h> /* For struct list_head */ 52#include <linux/list.h> /* For struct list_head */
51#include <linux/socket.h> 53#include <linux/socket.h>
@@ -1463,23 +1465,23 @@ static void sctp_check_transmitted(struct sctp_outq *q,
1463 /* Display the end of the 1465 /* Display the end of the
1464 * current range. 1466 * current range.
1465 */ 1467 */
1466 SCTP_DEBUG_PRINTK("-%08x", 1468 SCTP_DEBUG_PRINTK_CONT("-%08x",
1467 dbg_last_ack_tsn); 1469 dbg_last_ack_tsn);
1468 } 1470 }
1469 1471
1470 /* Start a new range. */ 1472 /* Start a new range. */
1471 SCTP_DEBUG_PRINTK(",%08x", tsn); 1473 SCTP_DEBUG_PRINTK_CONT(",%08x", tsn);
1472 dbg_ack_tsn = tsn; 1474 dbg_ack_tsn = tsn;
1473 break; 1475 break;
1474 1476
1475 case 1: /* The last TSN was NOT ACKed. */ 1477 case 1: /* The last TSN was NOT ACKed. */
1476 if (dbg_last_kept_tsn != dbg_kept_tsn) { 1478 if (dbg_last_kept_tsn != dbg_kept_tsn) {
1477 /* Display the end of current range. */ 1479 /* Display the end of current range. */
1478 SCTP_DEBUG_PRINTK("-%08x", 1480 SCTP_DEBUG_PRINTK_CONT("-%08x",
1479 dbg_last_kept_tsn); 1481 dbg_last_kept_tsn);
1480 } 1482 }
1481 1483
1482 SCTP_DEBUG_PRINTK("\n"); 1484 SCTP_DEBUG_PRINTK_CONT("\n");
1483 1485
1484 /* FALL THROUGH... */ 1486 /* FALL THROUGH... */
1485 default: 1487 default:
@@ -1526,18 +1528,18 @@ static void sctp_check_transmitted(struct sctp_outq *q,
1526 break; 1528 break;
1527 1529
1528 if (dbg_last_kept_tsn != dbg_kept_tsn) 1530 if (dbg_last_kept_tsn != dbg_kept_tsn)
1529 SCTP_DEBUG_PRINTK("-%08x", 1531 SCTP_DEBUG_PRINTK_CONT("-%08x",
1530 dbg_last_kept_tsn); 1532 dbg_last_kept_tsn);
1531 1533
1532 SCTP_DEBUG_PRINTK(",%08x", tsn); 1534 SCTP_DEBUG_PRINTK_CONT(",%08x", tsn);
1533 dbg_kept_tsn = tsn; 1535 dbg_kept_tsn = tsn;
1534 break; 1536 break;
1535 1537
1536 case 0: 1538 case 0:
1537 if (dbg_last_ack_tsn != dbg_ack_tsn) 1539 if (dbg_last_ack_tsn != dbg_ack_tsn)
1538 SCTP_DEBUG_PRINTK("-%08x", 1540 SCTP_DEBUG_PRINTK_CONT("-%08x",
1539 dbg_last_ack_tsn); 1541 dbg_last_ack_tsn);
1540 SCTP_DEBUG_PRINTK("\n"); 1542 SCTP_DEBUG_PRINTK_CONT("\n");
1541 1543
1542 /* FALL THROUGH... */ 1544 /* FALL THROUGH... */
1543 default: 1545 default:
@@ -1556,17 +1558,17 @@ static void sctp_check_transmitted(struct sctp_outq *q,
1556 switch (dbg_prt_state) { 1558 switch (dbg_prt_state) {
1557 case 0: 1559 case 0:
1558 if (dbg_last_ack_tsn != dbg_ack_tsn) { 1560 if (dbg_last_ack_tsn != dbg_ack_tsn) {
1559 SCTP_DEBUG_PRINTK("-%08x\n", dbg_last_ack_tsn); 1561 SCTP_DEBUG_PRINTK_CONT("-%08x\n", dbg_last_ack_tsn);
1560 } else { 1562 } else {
1561 SCTP_DEBUG_PRINTK("\n"); 1563 SCTP_DEBUG_PRINTK_CONT("\n");
1562 } 1564 }
1563 break; 1565 break;
1564 1566
1565 case 1: 1567 case 1:
1566 if (dbg_last_kept_tsn != dbg_kept_tsn) { 1568 if (dbg_last_kept_tsn != dbg_kept_tsn) {
1567 SCTP_DEBUG_PRINTK("-%08x\n", dbg_last_kept_tsn); 1569 SCTP_DEBUG_PRINTK_CONT("-%08x\n", dbg_last_kept_tsn);
1568 } else { 1570 } else {
1569 SCTP_DEBUG_PRINTK("\n"); 1571 SCTP_DEBUG_PRINTK_CONT("\n");
1570 } 1572 }
1571 } 1573 }
1572#endif /* SCTP_DEBUG */ 1574#endif /* SCTP_DEBUG */
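
The _CONT conversions in these outqueue.c hunks are a side effect of the new pr_fmt() prefix: the TSN dump is assembled from many small printks on one logical line, and once each call gains a "sctp: " prefix and its own log level, every fragment after the first has to be flagged as a continuation. SCTP_DEBUG_PRINTK_CONT is presumably defined in terms of KERN_CONT; with plain printk the pattern looks like this (TSN values invented for illustration):

        u32 tsns[] = { 0x10000001, 0x10000002, 0x10000003 };
        int i;

        printk(KERN_DEBUG "sctp: SACKed TSNs: %08x", tsns[0]);
        for (i = 1; i < 3; i++)
                printk(KERN_CONT "-%08x", tsns[i]);     /* appended to the same line */
        printk(KERN_CONT "\n");
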
diff --git a/net/sctp/probe.c b/net/sctp/probe.c
index db3a42b8b349..bc6cd75cc1dc 100644
--- a/net/sctp/probe.c
+++ b/net/sctp/probe.c
@@ -22,6 +22,8 @@
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */ 23 */
24 24
25#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
26
25#include <linux/kernel.h> 27#include <linux/kernel.h>
26#include <linux/kprobes.h> 28#include <linux/kprobes.h>
27#include <linux/socket.h> 29#include <linux/socket.h>
@@ -117,6 +119,7 @@ static const struct file_operations sctpprobe_fops = {
117 .owner = THIS_MODULE, 119 .owner = THIS_MODULE,
118 .open = sctpprobe_open, 120 .open = sctpprobe_open,
119 .read = sctpprobe_read, 121 .read = sctpprobe_read,
122 .llseek = noop_llseek,
120}; 123};
121 124
122sctp_disposition_t jsctp_sf_eat_sack(const struct sctp_endpoint *ep, 125sctp_disposition_t jsctp_sf_eat_sack(const struct sctp_endpoint *ep,
@@ -192,7 +195,7 @@ static __init int sctpprobe_init(void)
192 if (ret) 195 if (ret)
193 goto remove_proc; 196 goto remove_proc;
194 197
195 pr_info("SCTP probe registered (port=%d)\n", port); 198 pr_info("probe registered (port=%d)\n", port);
196 199
197 return 0; 200 return 0;
198 201
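
The .llseek = noop_llseek line added to sctpprobe_fops is unrelated to the logging work; it appears to be part of the wider sweep in this merge window that annotates non-seekable files explicitly instead of letting them inherit a default. noop_llseek accepts the seek but changes nothing, which suits a read-only text stream like the probe output; its implementation is roughly:

        /* Rough sketch of the generic helper: */
        loff_t noop_llseek(struct file *file, loff_t offset, int origin)
        {
                return file->f_pos;             /* report success, move nothing */
        }
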
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 5027b83f1cc0..1ef29c74d85e 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -46,6 +46,8 @@
46 * be incorporated into the next SCTP release. 46 * be incorporated into the next SCTP release.
47 */ 47 */
48 48
49#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
50
49#include <linux/module.h> 51#include <linux/module.h>
50#include <linux/init.h> 52#include <linux/init.h>
51#include <linux/netdevice.h> 53#include <linux/netdevice.h>
@@ -707,8 +709,7 @@ static int sctp_ctl_sock_init(void)
707 &init_net); 709 &init_net);
708 710
709 if (err < 0) { 711 if (err < 0) {
710 printk(KERN_ERR 712 pr_err("Failed to create the SCTP control socket\n");
711 "SCTP: Failed to create the SCTP control socket.\n");
712 return err; 713 return err;
713 } 714 }
714 return 0; 715 return 0;
@@ -798,7 +799,7 @@ static void sctp_inet_skb_msgname(struct sk_buff *skb, char *msgname, int *len)
798static int sctp_inet_af_supported(sa_family_t family, struct sctp_sock *sp) 799static int sctp_inet_af_supported(sa_family_t family, struct sctp_sock *sp)
799{ 800{
800 /* PF_INET only supports AF_INET addresses. */ 801 /* PF_INET only supports AF_INET addresses. */
801 return (AF_INET == family); 802 return AF_INET == family;
802} 803}
803 804
804/* Address matching with wildcards allowed. */ 805/* Address matching with wildcards allowed. */
@@ -1206,7 +1207,7 @@ SCTP_STATIC __init int sctp_init(void)
1206 __get_free_pages(GFP_ATOMIC, order); 1207 __get_free_pages(GFP_ATOMIC, order);
1207 } while (!sctp_assoc_hashtable && --order > 0); 1208 } while (!sctp_assoc_hashtable && --order > 0);
1208 if (!sctp_assoc_hashtable) { 1209 if (!sctp_assoc_hashtable) {
1209 printk(KERN_ERR "SCTP: Failed association hash alloc.\n"); 1210 pr_err("Failed association hash alloc\n");
1210 status = -ENOMEM; 1211 status = -ENOMEM;
1211 goto err_ahash_alloc; 1212 goto err_ahash_alloc;
1212 } 1213 }
@@ -1220,7 +1221,7 @@ SCTP_STATIC __init int sctp_init(void)
1220 sctp_ep_hashtable = (struct sctp_hashbucket *) 1221 sctp_ep_hashtable = (struct sctp_hashbucket *)
1221 kmalloc(64 * sizeof(struct sctp_hashbucket), GFP_KERNEL); 1222 kmalloc(64 * sizeof(struct sctp_hashbucket), GFP_KERNEL);
1222 if (!sctp_ep_hashtable) { 1223 if (!sctp_ep_hashtable) {
1223 printk(KERN_ERR "SCTP: Failed endpoint_hash alloc.\n"); 1224 pr_err("Failed endpoint_hash alloc\n");
1224 status = -ENOMEM; 1225 status = -ENOMEM;
1225 goto err_ehash_alloc; 1226 goto err_ehash_alloc;
1226 } 1227 }
@@ -1239,7 +1240,7 @@ SCTP_STATIC __init int sctp_init(void)
1239 __get_free_pages(GFP_ATOMIC, order); 1240 __get_free_pages(GFP_ATOMIC, order);
1240 } while (!sctp_port_hashtable && --order > 0); 1241 } while (!sctp_port_hashtable && --order > 0);
1241 if (!sctp_port_hashtable) { 1242 if (!sctp_port_hashtable) {
1242 printk(KERN_ERR "SCTP: Failed bind hash alloc."); 1243 pr_err("Failed bind hash alloc\n");
1243 status = -ENOMEM; 1244 status = -ENOMEM;
1244 goto err_bhash_alloc; 1245 goto err_bhash_alloc;
1245 } 1246 }
@@ -1248,8 +1249,7 @@ SCTP_STATIC __init int sctp_init(void)
1248 INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain); 1249 INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain);
1249 } 1250 }
1250 1251
1251 printk(KERN_INFO "SCTP: Hash tables configured " 1252 pr_info("Hash tables configured (established %d bind %d)\n",
1252 "(established %d bind %d)\n",
1253 sctp_assoc_hashsize, sctp_port_hashsize); 1253 sctp_assoc_hashsize, sctp_port_hashsize);
1254 1254
1255 /* Disable ADDIP by default. */ 1255 /* Disable ADDIP by default. */
@@ -1290,8 +1290,7 @@ SCTP_STATIC __init int sctp_init(void)
1290 1290
1291 /* Initialize the control inode/socket for handling OOTB packets. */ 1291 /* Initialize the control inode/socket for handling OOTB packets. */
1292 if ((status = sctp_ctl_sock_init())) { 1292 if ((status = sctp_ctl_sock_init())) {
1293 printk (KERN_ERR 1293 pr_err("Failed to initialize the SCTP control sock\n");
1294 "SCTP: Failed to initialize the SCTP control sock.\n");
1295 goto err_ctl_sock_init; 1294 goto err_ctl_sock_init;
1296 } 1295 }
1297 1296
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 246f92924658..2cc46f0962ca 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -50,6 +50,8 @@
50 * be incorporated into the next SCTP release. 50 * be incorporated into the next SCTP release.
51 */ 51 */
52 52
53#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
54
53#include <linux/types.h> 55#include <linux/types.h>
54#include <linux/kernel.h> 56#include <linux/kernel.h>
55#include <linux/ip.h> 57#include <linux/ip.h>
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index f5e5e27cac5e..b21b218d564f 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -47,6 +47,8 @@
47 * be incorporated into the next SCTP release. 47 * be incorporated into the next SCTP release.
48 */ 48 */
49 49
50#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
51
50#include <linux/skbuff.h> 52#include <linux/skbuff.h>
51#include <linux/types.h> 53#include <linux/types.h>
52#include <linux/socket.h> 54#include <linux/socket.h>
@@ -1146,26 +1148,23 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
1146 1148
1147 case SCTP_DISPOSITION_VIOLATION: 1149 case SCTP_DISPOSITION_VIOLATION:
1148 if (net_ratelimit()) 1150 if (net_ratelimit())
1149 printk(KERN_ERR "sctp protocol violation state %d " 1151 pr_err("protocol violation state %d chunkid %d\n",
1150 "chunkid %d\n", state, subtype.chunk); 1152 state, subtype.chunk);
1151 break; 1153 break;
1152 1154
1153 case SCTP_DISPOSITION_NOT_IMPL: 1155 case SCTP_DISPOSITION_NOT_IMPL:
1154 printk(KERN_WARNING "sctp unimplemented feature in state %d, " 1156 pr_warn("unimplemented feature in state %d, event_type %d, event_id %d\n",
1155 "event_type %d, event_id %d\n", 1157 state, event_type, subtype.chunk);
1156 state, event_type, subtype.chunk);
1157 break; 1158 break;
1158 1159
1159 case SCTP_DISPOSITION_BUG: 1160 case SCTP_DISPOSITION_BUG:
1160 printk(KERN_ERR "sctp bug in state %d, " 1161 pr_err("bug in state %d, event_type %d, event_id %d\n",
1161 "event_type %d, event_id %d\n",
1162 state, event_type, subtype.chunk); 1162 state, event_type, subtype.chunk);
1163 BUG(); 1163 BUG();
1164 break; 1164 break;
1165 1165
1166 default: 1166 default:
1167 printk(KERN_ERR "sctp impossible disposition %d " 1167 pr_err("impossible disposition %d in state %d, event_type %d, event_id %d\n",
1168 "in state %d, event_type %d, event_id %d\n",
1169 status, state, event_type, subtype.chunk); 1168 status, state, event_type, subtype.chunk);
1170 BUG(); 1169 BUG();
1171 break; 1170 break;
@@ -1679,8 +1678,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
1679 sctp_cmd_send_asconf(asoc); 1678 sctp_cmd_send_asconf(asoc);
1680 break; 1679 break;
1681 default: 1680 default:
1682 printk(KERN_WARNING "Impossible command: %u, %p\n", 1681 pr_warn("Impossible command: %u, %p\n",
1683 cmd->verb, cmd->obj.ptr); 1682 cmd->verb, cmd->obj.ptr);
1684 break; 1683 break;
1685 } 1684 }
1686 1685
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 24b2cd555637..4b4eb7c96bbd 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -50,6 +50,8 @@
50 * be incorporated into the next SCTP release. 50 * be incorporated into the next SCTP release.
51 */ 51 */
52 52
53#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
54
53#include <linux/types.h> 55#include <linux/types.h>
54#include <linux/kernel.h> 56#include <linux/kernel.h>
55#include <linux/ip.h> 57#include <linux/ip.h>
@@ -1138,18 +1140,16 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep,
1138 if (unlikely(!link)) { 1140 if (unlikely(!link)) {
1139 if (from_addr.sa.sa_family == AF_INET6) { 1141 if (from_addr.sa.sa_family == AF_INET6) {
1140 if (net_ratelimit()) 1142 if (net_ratelimit())
1141 printk(KERN_WARNING 1143 pr_warn("%s association %p could not find address %pI6\n",
1142 "%s association %p could not find address %pI6\n", 1144 __func__,
1143 __func__, 1145 asoc,
1144 asoc, 1146 &from_addr.v6.sin6_addr);
1145 &from_addr.v6.sin6_addr);
1146 } else { 1147 } else {
1147 if (net_ratelimit()) 1148 if (net_ratelimit())
1148 printk(KERN_WARNING 1149 pr_warn("%s association %p could not find address %pI4\n",
1149 "%s association %p could not find address %pI4\n", 1150 __func__,
1150 __func__, 1151 asoc,
1151 asoc, 1152 &from_addr.v4.sin_addr.s_addr);
1152 &from_addr.v4.sin_addr.s_addr);
1153 } 1153 }
1154 return SCTP_DISPOSITION_DISCARD; 1154 return SCTP_DISPOSITION_DISCARD;
1155 } 1155 }
@@ -1232,6 +1232,18 @@ out:
1232 return 0; 1232 return 0;
1233} 1233}
1234 1234
1235static bool list_has_sctp_addr(const struct list_head *list,
1236 union sctp_addr *ipaddr)
1237{
1238 struct sctp_transport *addr;
1239
1240 list_for_each_entry(addr, list, transports) {
1241 if (sctp_cmp_addr_exact(ipaddr, &addr->ipaddr))
1242 return true;
1243 }
1244
1245 return false;
1246}
1235/* A restart is occurring, check to make sure no new addresses 1247/* A restart is occurring, check to make sure no new addresses
1236 * are being added as we may be under a takeover attack. 1248 * are being added as we may be under a takeover attack.
1237 */ 1249 */
@@ -1240,10 +1252,10 @@ static int sctp_sf_check_restart_addrs(const struct sctp_association *new_asoc,
1240 struct sctp_chunk *init, 1252 struct sctp_chunk *init,
1241 sctp_cmd_seq_t *commands) 1253 sctp_cmd_seq_t *commands)
1242{ 1254{
1243 struct sctp_transport *new_addr, *addr; 1255 struct sctp_transport *new_addr;
1244 int found; 1256 int ret = 1;
1245 1257
1246 /* Implementor's Guide - Sectin 5.2.2 1258 /* Implementor's Guide - Section 5.2.2
1247 * ... 1259 * ...
1248 * Before responding the endpoint MUST check to see if the 1260 * Before responding the endpoint MUST check to see if the
1249 * unexpected INIT adds new addresses to the association. If new 1261 * unexpected INIT adds new addresses to the association. If new
@@ -1254,31 +1266,19 @@ static int sctp_sf_check_restart_addrs(const struct sctp_association *new_asoc,
1254 /* Search through all current addresses and make sure 1266 /* Search through all current addresses and make sure
1255 * we aren't adding any new ones. 1267 * we aren't adding any new ones.
1256 */ 1268 */
1257 new_addr = NULL;
1258 found = 0;
1259
1260 list_for_each_entry(new_addr, &new_asoc->peer.transport_addr_list, 1269 list_for_each_entry(new_addr, &new_asoc->peer.transport_addr_list,
1261 transports) { 1270 transports) {
1262 found = 0; 1271 if (!list_has_sctp_addr(&asoc->peer.transport_addr_list,
1263 list_for_each_entry(addr, &asoc->peer.transport_addr_list, 1272 &new_addr->ipaddr)) {
1264 transports) { 1273 sctp_sf_send_restart_abort(&new_addr->ipaddr, init,
1265 if (sctp_cmp_addr_exact(&new_addr->ipaddr, 1274 commands);
1266 &addr->ipaddr)) { 1275 ret = 0;
1267 found = 1;
1268 break;
1269 }
1270 }
1271 if (!found)
1272 break; 1276 break;
1273 } 1277 }
1274
1275 /* If a new address was added, ABORT the sender. */
1276 if (!found && new_addr) {
1277 sctp_sf_send_restart_abort(&new_addr->ipaddr, init, commands);
1278 } 1278 }
1279 1279
1280 /* Return success if all addresses were found. */ 1280 /* Return success if all addresses were found. */
1281 return found; 1281 return ret;
1282} 1282}
1283 1283
1284/* Populate the verification/tie tags based on overlapping INIT 1284/* Populate the verification/tie tags based on overlapping INIT
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index 6d9b3aafcc5d..546d4387fb3c 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -46,6 +46,8 @@
46 * be incorporated into the next SCTP release. 46 * be incorporated into the next SCTP release.
47 */ 47 */
48 48
49#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
50
49#include <linux/skbuff.h> 51#include <linux/skbuff.h>
50#include <net/sctp/sctp.h> 52#include <net/sctp/sctp.h>
51#include <net/sctp/sm.h> 53#include <net/sctp/sm.h>
@@ -66,15 +68,19 @@ static const sctp_sm_table_entry_t bug = {
66 .name = "sctp_sf_bug" 68 .name = "sctp_sf_bug"
67}; 69};
68 70
69#define DO_LOOKUP(_max, _type, _table) \ 71#define DO_LOOKUP(_max, _type, _table) \
70 if ((event_subtype._type > (_max))) { \ 72({ \
71 printk(KERN_WARNING \ 73 const sctp_sm_table_entry_t *rtn; \
72 "sctp table %p possible attack:" \ 74 \
73 " event %d exceeds max %d\n", \ 75 if ((event_subtype._type > (_max))) { \
74 _table, event_subtype._type, _max); \ 76 pr_warn("table %p possible attack: event %d exceeds max %d\n", \
75 return &bug; \ 77 _table, event_subtype._type, _max); \
76 } \ 78 rtn = &bug; \
77 return &_table[event_subtype._type][(int)state]; 79 } else \
80 rtn = &_table[event_subtype._type][(int)state]; \
81 \
82 rtn; \
83})
78 84
79const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t event_type, 85const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t event_type,
80 sctp_state_t state, 86 sctp_state_t state,
@@ -83,21 +89,15 @@ const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t event_type,
83 switch (event_type) { 89 switch (event_type) {
84 case SCTP_EVENT_T_CHUNK: 90 case SCTP_EVENT_T_CHUNK:
85 return sctp_chunk_event_lookup(event_subtype.chunk, state); 91 return sctp_chunk_event_lookup(event_subtype.chunk, state);
86 break;
87 case SCTP_EVENT_T_TIMEOUT: 92 case SCTP_EVENT_T_TIMEOUT:
88 DO_LOOKUP(SCTP_EVENT_TIMEOUT_MAX, timeout, 93 return DO_LOOKUP(SCTP_EVENT_TIMEOUT_MAX, timeout,
89 timeout_event_table); 94 timeout_event_table);
90 break;
91
92 case SCTP_EVENT_T_OTHER: 95 case SCTP_EVENT_T_OTHER:
93 DO_LOOKUP(SCTP_EVENT_OTHER_MAX, other, other_event_table); 96 return DO_LOOKUP(SCTP_EVENT_OTHER_MAX, other,
94 break; 97 other_event_table);
95
96 case SCTP_EVENT_T_PRIMITIVE: 98 case SCTP_EVENT_T_PRIMITIVE:
97 DO_LOOKUP(SCTP_EVENT_PRIMITIVE_MAX, primitive, 99 return DO_LOOKUP(SCTP_EVENT_PRIMITIVE_MAX, primitive,
98 primitive_event_table); 100 primitive_event_table);
99 break;
100
101 default: 101 default:
102 /* Yikes! We got an illegal event type. */ 102 /* Yikes! We got an illegal event type. */
103 return &bug; 103 return &bug;
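
The DO_LOOKUP rewrite in sm_statetable.c turns a macro that performed a hidden return into a GNU C statement expression: the ({ ... }) block evaluates to its final expression, so the macro now produces a value, each case can write return DO_LOOKUP(...) explicitly, and the break statements that could never be reached go away. A small self-contained illustration of the construct (CLAMP_OR_WARN and its uses are hypothetical):

        #include <stdio.h>

        /* A statement-expression macro is usable wherever an expression is: */
        #define CLAMP_OR_WARN(val, max) ({                              \
                int __v = (val);                                        \
                if (__v > (max)) {                                      \
                        fprintf(stderr, "clamped %d to %d\n", __v, (max)); \
                        __v = (max);                                    \
                }                                                       \
                __v;                                                    \
        })

        int main(void)
        {
                printf("%d\n", CLAMP_OR_WARN(7, 5));    /* prints 5 after the warning */
                return CLAMP_OR_WARN(0, 5);             /* legal on the right of return */
        }
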
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index ca44917872d2..e34ca9cc1167 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -57,6 +57,8 @@
57 * be incorporated into the next SCTP release. 57 * be incorporated into the next SCTP release.
58 */ 58 */
59 59
60#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
61
60#include <linux/types.h> 62#include <linux/types.h>
61#include <linux/kernel.h> 63#include <linux/kernel.h>
62#include <linux/wait.h> 64#include <linux/wait.h>
@@ -916,6 +918,11 @@ SCTP_STATIC int sctp_setsockopt_bindx(struct sock* sk,
916 /* Walk through the addrs buffer and count the number of addresses. */ 918 /* Walk through the addrs buffer and count the number of addresses. */
917 addr_buf = kaddrs; 919 addr_buf = kaddrs;
918 while (walk_size < addrs_size) { 920 while (walk_size < addrs_size) {
921 if (walk_size + sizeof(sa_family_t) > addrs_size) {
922 kfree(kaddrs);
923 return -EINVAL;
924 }
925
919 sa_addr = (struct sockaddr *)addr_buf; 926 sa_addr = (struct sockaddr *)addr_buf;
920 af = sctp_get_af_specific(sa_addr->sa_family); 927 af = sctp_get_af_specific(sa_addr->sa_family);
921 928
@@ -1002,9 +1009,13 @@ static int __sctp_connect(struct sock* sk,
1002 /* Walk through the addrs buffer and count the number of addresses. */ 1009 /* Walk through the addrs buffer and count the number of addresses. */
1003 addr_buf = kaddrs; 1010 addr_buf = kaddrs;
1004 while (walk_size < addrs_size) { 1011 while (walk_size < addrs_size) {
1012 if (walk_size + sizeof(sa_family_t) > addrs_size) {
1013 err = -EINVAL;
1014 goto out_free;
1015 }
1016
1005 sa_addr = (union sctp_addr *)addr_buf; 1017 sa_addr = (union sctp_addr *)addr_buf;
1006 af = sctp_get_af_specific(sa_addr->sa.sa_family); 1018 af = sctp_get_af_specific(sa_addr->sa.sa_family);
1007 port = ntohs(sa_addr->v4.sin_port);
1008 1019
1009 /* If the address family is not supported or if this address 1020 /* If the address family is not supported or if this address
1010 * causes the address buffer to overflow return EINVAL. 1021 * causes the address buffer to overflow return EINVAL.
@@ -1014,6 +1025,8 @@ static int __sctp_connect(struct sock* sk,
1014 goto out_free; 1025 goto out_free;
1015 } 1026 }
1016 1027
1028 port = ntohs(sa_addr->v4.sin_port);
1029
1017 /* Save current address so we can work with it */ 1030 /* Save current address so we can work with it */
1018 memcpy(&to, sa_addr, af->sockaddr_len); 1031 memcpy(&to, sa_addr, af->sockaddr_len);
1019 1032
@@ -2458,9 +2471,8 @@ static int sctp_setsockopt_delayed_ack(struct sock *sk,
2458 if (params.sack_delay == 0 && params.sack_freq == 0) 2471 if (params.sack_delay == 0 && params.sack_freq == 0)
2459 return 0; 2472 return 0;
2460 } else if (optlen == sizeof(struct sctp_assoc_value)) { 2473 } else if (optlen == sizeof(struct sctp_assoc_value)) {
2461 printk(KERN_WARNING "SCTP: Use of struct sctp_assoc_value " 2474 pr_warn("Use of struct sctp_assoc_value in delayed_ack socket option deprecated\n");
2462 "in delayed_ack socket option deprecated\n"); 2475 pr_warn("Use struct sctp_sack_info instead\n");
2463 printk(KERN_WARNING "SCTP: Use struct sctp_sack_info instead\n");
2464 if (copy_from_user(&params, optval, optlen)) 2476 if (copy_from_user(&params, optval, optlen))
2465 return -EFAULT; 2477 return -EFAULT;
2466 2478
@@ -2868,10 +2880,8 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
2868 int val; 2880 int val;
2869 2881
2870 if (optlen == sizeof(int)) { 2882 if (optlen == sizeof(int)) {
2871 printk(KERN_WARNING 2883 pr_warn("Use of int in maxseg socket option deprecated\n");
2872 "SCTP: Use of int in maxseg socket option deprecated\n"); 2884 pr_warn("Use struct sctp_assoc_value instead\n");
2873 printk(KERN_WARNING
2874 "SCTP: Use struct sctp_assoc_value instead\n");
2875 if (copy_from_user(&val, optval, optlen)) 2885 if (copy_from_user(&val, optval, optlen))
2876 return -EFAULT; 2886 return -EFAULT;
2877 params.assoc_id = 0; 2887 params.assoc_id = 0;
@@ -3121,10 +3131,8 @@ static int sctp_setsockopt_maxburst(struct sock *sk,
3121 int assoc_id = 0; 3131 int assoc_id = 0;
3122 3132
3123 if (optlen == sizeof(int)) { 3133 if (optlen == sizeof(int)) {
3124 printk(KERN_WARNING 3134 pr_warn("Use of int in max_burst socket option deprecated\n");
3125 "SCTP: Use of int in max_burst socket option deprecated\n"); 3135 pr_warn("Use struct sctp_assoc_value instead\n");
3126 printk(KERN_WARNING
3127 "SCTP: Use struct sctp_assoc_value instead\n");
3128 if (copy_from_user(&val, optval, optlen)) 3136 if (copy_from_user(&val, optval, optlen))
3129 return -EFAULT; 3137 return -EFAULT;
3130 } else if (optlen == sizeof(struct sctp_assoc_value)) { 3138 } else if (optlen == sizeof(struct sctp_assoc_value)) {
@@ -3595,7 +3603,40 @@ out:
3595/* The SCTP ioctl handler. */ 3603/* The SCTP ioctl handler. */
3596SCTP_STATIC int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg) 3604SCTP_STATIC int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg)
3597{ 3605{
3598 return -ENOIOCTLCMD; 3606 int rc = -ENOTCONN;
3607
3608 sctp_lock_sock(sk);
3609
3610 /*
3611 * SEQPACKET-style sockets in LISTENING state are valid, for
3612 * SCTP, so only discard TCP-style sockets in LISTENING state.
3613 */
3614 if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))
3615 goto out;
3616
3617 switch (cmd) {
3618 case SIOCINQ: {
3619 struct sk_buff *skb;
3620 unsigned int amount = 0;
3621
3622 skb = skb_peek(&sk->sk_receive_queue);
3623 if (skb != NULL) {
3624 /*
3625 * We will only return the amount of this packet since
3626 * that is all that will be read.
3627 */
3628 amount = skb->len;
3629 }
3630 rc = put_user(amount, (int __user *)arg);
3631 break;
3632 }
3633 default:
3634 rc = -ENOIOCTLCMD;
3635 break;
3636 }
3637out:
3638 sctp_release_sock(sk);
3639 return rc;
3599} 3640}
3600 3641
3601/* This is the function which gets called during socket creation to 3642/* This is the function which gets called during socket creation to
@@ -3854,7 +3895,7 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len,
3854 } 3895 }
3855 3896
3856out: 3897out:
3857 return (retval); 3898 return retval;
3858} 3899}
3859 3900
3860 3901
@@ -3910,7 +3951,7 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len,
3910 } 3951 }
3911 3952
3912out: 3953out:
3913 return (retval); 3954 return retval;
3914} 3955}
3915 3956
3916/* 7.1.12 Enable/Disable message fragmentation (SCTP_DISABLE_FRAGMENTS) 3957/* 7.1.12 Enable/Disable message fragmentation (SCTP_DISABLE_FRAGMENTS)
@@ -4281,9 +4322,8 @@ static int sctp_getsockopt_delayed_ack(struct sock *sk, int len,
4281 if (copy_from_user(&params, optval, len)) 4322 if (copy_from_user(&params, optval, len))
4282 return -EFAULT; 4323 return -EFAULT;
4283 } else if (len == sizeof(struct sctp_assoc_value)) { 4324 } else if (len == sizeof(struct sctp_assoc_value)) {
4284 printk(KERN_WARNING "SCTP: Use of struct sctp_assoc_value " 4325 pr_warn("Use of struct sctp_assoc_value in delayed_ack socket option deprecated\n");
4285 "in delayed_ack socket option deprecated\n"); 4326 pr_warn("Use struct sctp_sack_info instead\n");
4286 printk(KERN_WARNING "SCTP: Use struct sctp_sack_info instead\n");
4287 if (copy_from_user(&params, optval, len)) 4327 if (copy_from_user(&params, optval, len))
4288 return -EFAULT; 4328 return -EFAULT;
4289 } else 4329 } else
@@ -4929,10 +4969,8 @@ static int sctp_getsockopt_maxseg(struct sock *sk, int len,
4929 struct sctp_association *asoc; 4969 struct sctp_association *asoc;
4930 4970
4931 if (len == sizeof(int)) { 4971 if (len == sizeof(int)) {
4932 printk(KERN_WARNING 4972 pr_warn("Use of int in maxseg socket option deprecated\n");
4933 "SCTP: Use of int in maxseg socket option deprecated\n"); 4973 pr_warn("Use struct sctp_assoc_value instead\n");
4934 printk(KERN_WARNING
4935 "SCTP: Use struct sctp_assoc_value instead\n");
4936 params.assoc_id = 0; 4974 params.assoc_id = 0;
4937 } else if (len >= sizeof(struct sctp_assoc_value)) { 4975 } else if (len >= sizeof(struct sctp_assoc_value)) {
4938 len = sizeof(struct sctp_assoc_value); 4976 len = sizeof(struct sctp_assoc_value);
@@ -5023,10 +5061,8 @@ static int sctp_getsockopt_maxburst(struct sock *sk, int len,
5023 struct sctp_association *asoc; 5061 struct sctp_association *asoc;
5024 5062
5025 if (len == sizeof(int)) { 5063 if (len == sizeof(int)) {
5026 printk(KERN_WARNING 5064 pr_warn("Use of int in max_burst socket option deprecated\n");
5027 "SCTP: Use of int in max_burst socket option deprecated\n"); 5065 pr_warn("Use struct sctp_assoc_value instead\n");
5028 printk(KERN_WARNING
5029 "SCTP: Use struct sctp_assoc_value instead\n");
5030 params.assoc_id = 0; 5066 params.assoc_id = 0;
5031 } else if (len >= sizeof(struct sctp_assoc_value)) { 5067 } else if (len >= sizeof(struct sctp_assoc_value)) {
5032 len = sizeof(struct sctp_assoc_value); 5068 len = sizeof(struct sctp_assoc_value);
@@ -5569,7 +5605,7 @@ static int sctp_get_port(struct sock *sk, unsigned short snum)
5569 /* Note: sk->sk_num gets filled in if ephemeral port request. */ 5605 /* Note: sk->sk_num gets filled in if ephemeral port request. */
5570 ret = sctp_get_port_local(sk, &addr); 5606 ret = sctp_get_port_local(sk, &addr);
5571 5607
5572 return (ret ? 1 : 0); 5608 return ret ? 1 : 0;
5573} 5609}
5574 5610
5575/* 5611/*
@@ -5586,8 +5622,7 @@ SCTP_STATIC int sctp_listen_start(struct sock *sk, int backlog)
5586 tfm = crypto_alloc_hash(sctp_hmac_alg, 0, CRYPTO_ALG_ASYNC); 5622 tfm = crypto_alloc_hash(sctp_hmac_alg, 0, CRYPTO_ALG_ASYNC);
5587 if (IS_ERR(tfm)) { 5623 if (IS_ERR(tfm)) {
5588 if (net_ratelimit()) { 5624 if (net_ratelimit()) {
5589 printk(KERN_INFO 5625 pr_info("failed to load transform for %s: %ld\n",
5590 "SCTP: failed to load transform for %s: %ld\n",
5591 sctp_hmac_alg, PTR_ERR(tfm)); 5626 sctp_hmac_alg, PTR_ERR(tfm));
5592 } 5627 }
5593 return -ENOSYS; 5628 return -ENOSYS;
@@ -5716,13 +5751,12 @@ unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
5716 if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) 5751 if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
5717 mask |= POLLERR; 5752 mask |= POLLERR;
5718 if (sk->sk_shutdown & RCV_SHUTDOWN) 5753 if (sk->sk_shutdown & RCV_SHUTDOWN)
5719 mask |= POLLRDHUP; 5754 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
5720 if (sk->sk_shutdown == SHUTDOWN_MASK) 5755 if (sk->sk_shutdown == SHUTDOWN_MASK)
5721 mask |= POLLHUP; 5756 mask |= POLLHUP;
5722 5757
5723 /* Is it readable? Reconsider this code with TCP-style support. */ 5758 /* Is it readable? Reconsider this code with TCP-style support. */
5724 if (!skb_queue_empty(&sk->sk_receive_queue) || 5759 if (!skb_queue_empty(&sk->sk_receive_queue))
5725 (sk->sk_shutdown & RCV_SHUTDOWN))
5726 mask |= POLLIN | POLLRDNORM; 5760 mask |= POLLIN | POLLRDNORM;
5727 5761
5728 /* The association is either gone or not ready. */ 5762 /* The association is either gone or not ready. */
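
With the sctp_ioctl() addition above, SCTP sockets answer SIOCINQ, so an application can size its buffer before reading; as the in-code comment notes, only the length of the skb at the head of the receive queue is reported, i.e. the next message rather than everything queued. A minimal usage sketch (fd is assumed to be an established SCTP socket):

        #include <stdio.h>
        #include <sys/ioctl.h>
        #include <linux/sockios.h>      /* SIOCINQ, an alias for FIONREAD */

        static void print_pending(int fd)
        {
                int pending = 0;

                if (ioctl(fd, SIOCINQ, &pending) == 0)
                        printf("next read returns up to %d bytes\n", pending);
                else
                        perror("SIOCINQ");
        }
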
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 132046cb82fc..d3ae493d234a 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -48,6 +48,8 @@
48 * be incorporated into the next SCTP release. 48 * be incorporated into the next SCTP release.
49 */ 49 */
50 50
51#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
52
51#include <linux/slab.h> 53#include <linux/slab.h>
52#include <linux/types.h> 54#include <linux/types.h>
53#include <linux/random.h> 55#include <linux/random.h>
@@ -244,10 +246,9 @@ void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
244 struct dst_entry *dst; 246 struct dst_entry *dst;
245 247
246 if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { 248 if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
247 printk(KERN_WARNING "%s: Reported pmtu %d too low, " 249 pr_warn("%s: Reported pmtu %d too low, using default minimum of %d\n",
248 "using default minimum of %d\n", 250 __func__, pmtu,
249 __func__, pmtu, 251 SCTP_DEFAULT_MINSEGMENT);
250 SCTP_DEFAULT_MINSEGMENT);
251 /* Use default minimum segment size and disable 252 /* Use default minimum segment size and disable
252 * pmtu discovery on this transport. 253 * pmtu discovery on this transport.
253 */ 254 */
diff --git a/net/socket.c b/net/socket.c
index 2270b941bcc7..3ca2fd9e3720 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -209,8 +209,8 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr)
209 * specified. Zero is returned for a success. 209 * specified. Zero is returned for a success.
210 */ 210 */
211 211
212int move_addr_to_user(struct sockaddr *kaddr, int klen, void __user *uaddr, 212static int move_addr_to_user(struct sockaddr *kaddr, int klen,
213 int __user *ulen) 213 void __user *uaddr, int __user *ulen)
214{ 214{
215 int err; 215 int err;
216 int len; 216 int len;
@@ -305,19 +305,17 @@ static const struct super_operations sockfs_ops = {
305 .statfs = simple_statfs, 305 .statfs = simple_statfs,
306}; 306};
307 307
308static int sockfs_get_sb(struct file_system_type *fs_type, 308static struct dentry *sockfs_mount(struct file_system_type *fs_type,
309 int flags, const char *dev_name, void *data, 309 int flags, const char *dev_name, void *data)
310 struct vfsmount *mnt)
311{ 310{
312 return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC, 311 return mount_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC);
313 mnt);
314} 312}
315 313
316static struct vfsmount *sock_mnt __read_mostly; 314static struct vfsmount *sock_mnt __read_mostly;
317 315
318static struct file_system_type sock_fs_type = { 316static struct file_system_type sock_fs_type = {
319 .name = "sockfs", 317 .name = "sockfs",
320 .get_sb = sockfs_get_sb, 318 .mount = sockfs_mount,
321 .kill_sb = kill_anon_super, 319 .kill_sb = kill_anon_super,
322}; 320};
323 321
@@ -377,7 +375,7 @@ static int sock_alloc_file(struct socket *sock, struct file **f, int flags)
377 &socket_file_ops); 375 &socket_file_ops);
378 if (unlikely(!file)) { 376 if (unlikely(!file)) {
379 /* drop dentry, keep inode */ 377 /* drop dentry, keep inode */
380 atomic_inc(&path.dentry->d_inode->i_count); 378 ihold(path.dentry->d_inode);
381 path_put(&path); 379 path_put(&path);
382 put_unused_fd(fd); 380 put_unused_fd(fd);
383 return -ENFILE; 381 return -ENFILE;
@@ -480,6 +478,7 @@ static struct socket *sock_alloc(void)
480 sock = SOCKET_I(inode); 478 sock = SOCKET_I(inode);
481 479
482 kmemcheck_annotate_bitfield(sock, type); 480 kmemcheck_annotate_bitfield(sock, type);
481 inode->i_ino = get_next_ino();
483 inode->i_mode = S_IFSOCK | S_IRWXUGO; 482 inode->i_mode = S_IFSOCK | S_IRWXUGO;
484 inode->i_uid = current_fsuid(); 483 inode->i_uid = current_fsuid();
485 inode->i_gid = current_fsgid(); 484 inode->i_gid = current_fsgid();
@@ -502,6 +501,7 @@ static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
502const struct file_operations bad_sock_fops = { 501const struct file_operations bad_sock_fops = {
503 .owner = THIS_MODULE, 502 .owner = THIS_MODULE,
504 .open = sock_no_open, 503 .open = sock_no_open,
504 .llseek = noop_llseek,
505}; 505};
506 506
507/** 507/**
@@ -535,14 +535,13 @@ void sock_release(struct socket *sock)
535} 535}
536EXPORT_SYMBOL(sock_release); 536EXPORT_SYMBOL(sock_release);
537 537
538int sock_tx_timestamp(struct msghdr *msg, struct sock *sk, 538int sock_tx_timestamp(struct sock *sk, __u8 *tx_flags)
539 union skb_shared_tx *shtx)
540{ 539{
541 shtx->flags = 0; 540 *tx_flags = 0;
542 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE)) 541 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
543 shtx->hardware = 1; 542 *tx_flags |= SKBTX_HW_TSTAMP;
544 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE)) 543 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
545 shtx->software = 1; 544 *tx_flags |= SKBTX_SW_TSTAMP;
546 return 0; 545 return 0;
547} 546}
548EXPORT_SYMBOL(sock_tx_timestamp); 547EXPORT_SYMBOL(sock_tx_timestamp);
@@ -662,7 +661,8 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
662} 661}
663EXPORT_SYMBOL_GPL(__sock_recv_timestamp); 662EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
664 663
665inline void sock_recv_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) 664static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk,
665 struct sk_buff *skb)
666{ 666{
667 if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && skb->dropcount) 667 if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && skb->dropcount)
668 put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL, 668 put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL,
@@ -1144,7 +1144,7 @@ call_kill:
1144} 1144}
1145EXPORT_SYMBOL(sock_wake_async); 1145EXPORT_SYMBOL(sock_wake_async);
1146 1146
1147static int __sock_create(struct net *net, int family, int type, int protocol, 1147int __sock_create(struct net *net, int family, int type, int protocol,
1148 struct socket **res, int kern) 1148 struct socket **res, int kern)
1149{ 1149{
1150 int err; 1150 int err;
@@ -1256,6 +1256,7 @@ out_release:
1256 rcu_read_unlock(); 1256 rcu_read_unlock();
1257 goto out_sock_release; 1257 goto out_sock_release;
1258} 1258}
1259EXPORT_SYMBOL(__sock_create);
1259 1260
1260int sock_create(int family, int type, int protocol, struct socket **res) 1261int sock_create(int family, int type, int protocol, struct socket **res)
1261{ 1262{
@@ -1651,6 +1652,8 @@ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
1651 struct iovec iov; 1652 struct iovec iov;
1652 int fput_needed; 1653 int fput_needed;
1653 1654
1655 if (len > INT_MAX)
1656 len = INT_MAX;
1654 sock = sockfd_lookup_light(fd, &err, &fput_needed); 1657 sock = sockfd_lookup_light(fd, &err, &fput_needed);
1655 if (!sock) 1658 if (!sock)
1656 goto out; 1659 goto out;
@@ -1708,6 +1711,8 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
1708 int err, err2; 1711 int err, err2;
1709 int fput_needed; 1712 int fput_needed;
1710 1713
1714 if (size > INT_MAX)
1715 size = INT_MAX;
1711 sock = sockfd_lookup_light(fd, &err, &fput_needed); 1716 sock = sockfd_lookup_light(fd, &err, &fput_needed);
1712 if (!sock) 1717 if (!sock)
1713 goto out; 1718 goto out;
@@ -1919,7 +1924,8 @@ SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned, flags)
1919 * Afterwards, it will be a kernel pointer. Thus the compiler-assisted 1924 * Afterwards, it will be a kernel pointer. Thus the compiler-assisted
1920 * checking falls down on this. 1925 * checking falls down on this.
1921 */ 1926 */
1922 if (copy_from_user(ctl_buf, (void __user *)msg_sys.msg_control, 1927 if (copy_from_user(ctl_buf,
1928 (void __user __force *)msg_sys.msg_control,
1923 ctl_len)) 1929 ctl_len))
1924 goto out_freectl; 1930 goto out_freectl;
1925 msg_sys.msg_control = ctl_buf; 1931 msg_sys.msg_control = ctl_buf;
@@ -3054,14 +3060,19 @@ int kernel_getsockopt(struct socket *sock, int level, int optname,
3054 char *optval, int *optlen) 3060 char *optval, int *optlen)
3055{ 3061{
3056 mm_segment_t oldfs = get_fs(); 3062 mm_segment_t oldfs = get_fs();
3063 char __user *uoptval;
3064 int __user *uoptlen;
3057 int err; 3065 int err;
3058 3066
3067 uoptval = (char __user __force *) optval;
3068 uoptlen = (int __user __force *) optlen;
3069
3059 set_fs(KERNEL_DS); 3070 set_fs(KERNEL_DS);
3060 if (level == SOL_SOCKET) 3071 if (level == SOL_SOCKET)
3061 err = sock_getsockopt(sock, level, optname, optval, optlen); 3072 err = sock_getsockopt(sock, level, optname, uoptval, uoptlen);
3062 else 3073 else
3063 err = sock->ops->getsockopt(sock, level, optname, optval, 3074 err = sock->ops->getsockopt(sock, level, optname, uoptval,
3064 optlen); 3075 uoptlen);
3065 set_fs(oldfs); 3076 set_fs(oldfs);
3066 return err; 3077 return err;
3067} 3078}
@@ -3071,13 +3082,16 @@ int kernel_setsockopt(struct socket *sock, int level, int optname,
3071 char *optval, unsigned int optlen) 3082 char *optval, unsigned int optlen)
3072{ 3083{
3073 mm_segment_t oldfs = get_fs(); 3084 mm_segment_t oldfs = get_fs();
3085 char __user *uoptval;
3074 int err; 3086 int err;
3075 3087
3088 uoptval = (char __user __force *) optval;
3089
3076 set_fs(KERNEL_DS); 3090 set_fs(KERNEL_DS);
3077 if (level == SOL_SOCKET) 3091 if (level == SOL_SOCKET)
3078 err = sock_setsockopt(sock, level, optname, optval, optlen); 3092 err = sock_setsockopt(sock, level, optname, uoptval, optlen);
3079 else 3093 else
3080 err = sock->ops->setsockopt(sock, level, optname, optval, 3094 err = sock->ops->setsockopt(sock, level, optname, uoptval,
3081 optlen); 3095 optlen);
3082 set_fs(oldfs); 3096 set_fs(oldfs);
3083 return err; 3097 return err;
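
The casts added to kernel_getsockopt()/kernel_setsockopt() are aimed at sparse rather than gcc: the socket option handlers take __user pointers, and these helpers knowingly feed them kernel buffers while set_fs(KERNEL_DS) is in effect, so __force records that the address-space mismatch is deliberate. Simplified from the compiler annotation headers, the checker-only definitions look roughly like this:

        /* Visible only when __CHECKER__ (sparse) is defined; gcc sees empty macros: */
        #define __user          __attribute__((noderef, address_space(1)))
        #define __force         __attribute__((force))

        /* So "(char __user __force *)optval" asserts: this really is a kernel
         * pointer, and we really do mean to hand it to a __user-annotated API,
         * because set_fs(KERNEL_DS) makes the usercopy helpers accept it.
         */
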
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 443c161eb8bd..8873fd8ddacd 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -18,10 +18,11 @@ config SUNRPC_XPRT_RDMA
18 If unsure, say N. 18 If unsure, say N.
19 19
20config RPCSEC_GSS_KRB5 20config RPCSEC_GSS_KRB5
21 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" 21 tristate
22 depends on SUNRPC && EXPERIMENTAL 22 depends on SUNRPC && CRYPTO
23 prompt "Secure RPC: Kerberos V mechanism" if !(NFS_V4 || NFSD_V4)
24 default y
23 select SUNRPC_GSS 25 select SUNRPC_GSS
24 select CRYPTO
25 select CRYPTO_MD5 26 select CRYPTO_MD5
26 select CRYPTO_DES 27 select CRYPTO_DES
27 select CRYPTO_CBC 28 select CRYPTO_CBC
@@ -34,23 +35,4 @@ config RPCSEC_GSS_KRB5
34 available from http://linux-nfs.org/. In addition, user-space 35 available from http://linux-nfs.org/. In addition, user-space
35 Kerberos support should be installed. 36 Kerberos support should be installed.
36 37
37 If unsure, say N. 38 If unsure, say Y.
38
39config RPCSEC_GSS_SPKM3
40 tristate "Secure RPC: SPKM3 mechanism (EXPERIMENTAL)"
41 depends on SUNRPC && EXPERIMENTAL
42 select SUNRPC_GSS
43 select CRYPTO
44 select CRYPTO_MD5
45 select CRYPTO_DES
46 select CRYPTO_CAST5
47 select CRYPTO_CBC
48 help
49 Choose Y here to enable Secure RPC using the SPKM3 public key
50 GSS-API mechanism (RFC 2025).
51
52 Secure RPC calls with SPKM3 require an auxiliary userspace
53 daemon which may be found in the Linux nfs-utils package
54 available from http://linux-nfs.org/.
55
56 If unsure, say N.
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 36cb66022a27..afe67849269f 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -38,7 +38,7 @@ static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = {
38static LIST_HEAD(cred_unused); 38static LIST_HEAD(cred_unused);
39static unsigned long number_cred_unused; 39static unsigned long number_cred_unused;
40 40
41#define MAX_HASHTABLE_BITS (10) 41#define MAX_HASHTABLE_BITS (14)
42static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp) 42static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
43{ 43{
44 unsigned long num; 44 unsigned long num;
@@ -595,7 +595,7 @@ rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp,
595int 595int
596rpcauth_refreshcred(struct rpc_task *task) 596rpcauth_refreshcred(struct rpc_task *task)
597{ 597{
598 struct rpc_cred *cred = task->tk_rqstp->rq_cred; 598 struct rpc_cred *cred;
599 int err; 599 int err;
600 600
601 cred = task->tk_rqstp->rq_cred; 601 cred = task->tk_rqstp->rq_cred;
@@ -658,7 +658,7 @@ out1:
658 return err; 658 return err;
659} 659}
660 660
661void __exit rpcauth_remove_module(void) 661void rpcauth_remove_module(void)
662{ 662{
663 rpc_destroy_authunix(); 663 rpc_destroy_authunix();
664 rpc_destroy_generic_auth(); 664 rpc_destroy_generic_auth();
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 43162bb3b78f..e010a015d996 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -158,7 +158,7 @@ int __init rpc_init_generic_auth(void)
158 return rpcauth_init_credcache(&generic_auth); 158 return rpcauth_init_credcache(&generic_auth);
159} 159}
160 160
161void __exit rpc_destroy_generic_auth(void) 161void rpc_destroy_generic_auth(void)
162{ 162{
163 rpcauth_destroy_credcache(&generic_auth); 163 rpcauth_destroy_credcache(&generic_auth);
164} 164}
diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile
index 74a231735f67..7350d86a32ee 100644
--- a/net/sunrpc/auth_gss/Makefile
+++ b/net/sunrpc/auth_gss/Makefile
@@ -11,8 +11,3 @@ obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o
11 11
12rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \ 12rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \
13 gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o gss_krb5_keys.o 13 gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o gss_krb5_keys.o
14
15obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o
16
17rpcsec_gss_spkm3-objs := gss_spkm3_mech.o gss_spkm3_seal.o gss_spkm3_unseal.o \
18 gss_spkm3_token.o
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index dcfc66bab2bb..3835ce35e224 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -745,17 +745,18 @@ gss_pipe_release(struct inode *inode)
745 struct rpc_inode *rpci = RPC_I(inode); 745 struct rpc_inode *rpci = RPC_I(inode);
746 struct gss_upcall_msg *gss_msg; 746 struct gss_upcall_msg *gss_msg;
747 747
748restart:
748 spin_lock(&inode->i_lock); 749 spin_lock(&inode->i_lock);
749 while (!list_empty(&rpci->in_downcall)) { 750 list_for_each_entry(gss_msg, &rpci->in_downcall, list) {
750 751
751 gss_msg = list_entry(rpci->in_downcall.next, 752 if (!list_empty(&gss_msg->msg.list))
752 struct gss_upcall_msg, list); 753 continue;
753 gss_msg->msg.errno = -EPIPE; 754 gss_msg->msg.errno = -EPIPE;
754 atomic_inc(&gss_msg->count); 755 atomic_inc(&gss_msg->count);
755 __gss_unhash_msg(gss_msg); 756 __gss_unhash_msg(gss_msg);
756 spin_unlock(&inode->i_lock); 757 spin_unlock(&inode->i_lock);
757 gss_release_msg(gss_msg); 758 gss_release_msg(gss_msg);
758 spin_lock(&inode->i_lock); 759 goto restart;
759 } 760 }
760 spin_unlock(&inode->i_lock); 761 spin_unlock(&inode->i_lock);
761 762
@@ -1049,7 +1050,7 @@ gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags)
1049out: 1050out:
1050 if (acred->machine_cred != gss_cred->gc_machine_cred) 1051 if (acred->machine_cred != gss_cred->gc_machine_cred)
1051 return 0; 1052 return 0;
1052 return (rc->cr_uid == acred->uid); 1053 return rc->cr_uid == acred->uid;
1053} 1054}
1054 1055
1055/* 1056/*
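
The gss_pipe_release() rewrite above is an instance of a common teardown pattern: once the spinlock is dropped to call gss_release_msg(), the list cursor can no longer be trusted, so the scan restarts from the head; progress is still guaranteed because each productive pass unhashes its entry (removing it from in_downcall) before restarting, while the list_empty() test lets later passes skip messages that another cleanup path is responsible for. A generic sketch of the shape, every name hypothetical:

        restart:
                spin_lock(&q->lock);
                list_for_each_entry(item, &q->items, link) {
                        if (item->handled)              /* left for another cleanup path */
                                continue;
                        item->handled = true;
                        get_item(item);                 /* hold a reference across the unlock */
                        spin_unlock(&q->lock);
                        dispose(item);                  /* may sleep or retake q->lock */
                        put_item(item);
                        goto restart;                   /* the cursor is stale: rescan */
                }
                spin_unlock(&q->lock);
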
diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c
index 310b78e99456..c586e92bcf76 100644
--- a/net/sunrpc/auth_gss/gss_generic_token.c
+++ b/net/sunrpc/auth_gss/gss_generic_token.c
@@ -76,19 +76,19 @@ static int
76der_length_size( int length) 76der_length_size( int length)
77{ 77{
78 if (length < (1<<7)) 78 if (length < (1<<7))
79 return(1); 79 return 1;
80 else if (length < (1<<8)) 80 else if (length < (1<<8))
81 return(2); 81 return 2;
82#if (SIZEOF_INT == 2) 82#if (SIZEOF_INT == 2)
83 else 83 else
84 return(3); 84 return 3;
85#else 85#else
86 else if (length < (1<<16)) 86 else if (length < (1<<16))
87 return(3); 87 return 3;
88 else if (length < (1<<24)) 88 else if (length < (1<<24))
89 return(4); 89 return 4;
90 else 90 else
91 return(5); 91 return 5;
92#endif 92#endif
93} 93}
94 94
@@ -121,14 +121,14 @@ der_read_length(unsigned char **buf, int *bufsize)
121 int ret; 121 int ret;
122 122
123 if (*bufsize < 1) 123 if (*bufsize < 1)
124 return(-1); 124 return -1;
125 sf = *(*buf)++; 125 sf = *(*buf)++;
126 (*bufsize)--; 126 (*bufsize)--;
127 if (sf & 0x80) { 127 if (sf & 0x80) {
128 if ((sf &= 0x7f) > ((*bufsize)-1)) 128 if ((sf &= 0x7f) > ((*bufsize)-1))
129 return(-1); 129 return -1;
130 if (sf > SIZEOF_INT) 130 if (sf > SIZEOF_INT)
131 return (-1); 131 return -1;
132 ret = 0; 132 ret = 0;
133 for (; sf; sf--) { 133 for (; sf; sf--) {
134 ret = (ret<<8) + (*(*buf)++); 134 ret = (ret<<8) + (*(*buf)++);
@@ -138,7 +138,7 @@ der_read_length(unsigned char **buf, int *bufsize)
138 ret = sf; 138 ret = sf;
139 } 139 }
140 140
141 return(ret); 141 return ret;
142} 142}
143 143
144/* returns the length of a token, given the mech oid and the body size */ 144/* returns the length of a token, given the mech oid and the body size */
@@ -148,7 +148,7 @@ g_token_size(struct xdr_netobj *mech, unsigned int body_size)
148{ 148{
149 /* set body_size to sequence contents size */ 149 /* set body_size to sequence contents size */
150 body_size += 2 + (int) mech->len; /* NEED overflow check */ 150 body_size += 2 + (int) mech->len; /* NEED overflow check */
151 return(1 + der_length_size(body_size) + body_size); 151 return 1 + der_length_size(body_size) + body_size;
152} 152}
153 153
154EXPORT_SYMBOL_GPL(g_token_size); 154EXPORT_SYMBOL_GPL(g_token_size);
@@ -186,27 +186,27 @@ g_verify_token_header(struct xdr_netobj *mech, int *body_size,
186 int ret = 0; 186 int ret = 0;
187 187
188 if ((toksize-=1) < 0) 188 if ((toksize-=1) < 0)
189 return(G_BAD_TOK_HEADER); 189 return G_BAD_TOK_HEADER;
190 if (*buf++ != 0x60) 190 if (*buf++ != 0x60)
191 return(G_BAD_TOK_HEADER); 191 return G_BAD_TOK_HEADER;
192 192
193 if ((seqsize = der_read_length(&buf, &toksize)) < 0) 193 if ((seqsize = der_read_length(&buf, &toksize)) < 0)
194 return(G_BAD_TOK_HEADER); 194 return G_BAD_TOK_HEADER;
195 195
196 if (seqsize != toksize) 196 if (seqsize != toksize)
197 return(G_BAD_TOK_HEADER); 197 return G_BAD_TOK_HEADER;
198 198
199 if ((toksize-=1) < 0) 199 if ((toksize-=1) < 0)
200 return(G_BAD_TOK_HEADER); 200 return G_BAD_TOK_HEADER;
201 if (*buf++ != 0x06) 201 if (*buf++ != 0x06)
202 return(G_BAD_TOK_HEADER); 202 return G_BAD_TOK_HEADER;
203 203
204 if ((toksize-=1) < 0) 204 if ((toksize-=1) < 0)
205 return(G_BAD_TOK_HEADER); 205 return G_BAD_TOK_HEADER;
206 toid.len = *buf++; 206 toid.len = *buf++;
207 207
208 if ((toksize-=toid.len) < 0) 208 if ((toksize-=toid.len) < 0)
209 return(G_BAD_TOK_HEADER); 209 return G_BAD_TOK_HEADER;
210 toid.data = buf; 210 toid.data = buf;
211 buf+=toid.len; 211 buf+=toid.len;
212 212
@@ -217,17 +217,17 @@ g_verify_token_header(struct xdr_netobj *mech, int *body_size,
217 to return G_BAD_TOK_HEADER if the token header is in fact bad */ 217 to return G_BAD_TOK_HEADER if the token header is in fact bad */
218 218
219 if ((toksize-=2) < 0) 219 if ((toksize-=2) < 0)
220 return(G_BAD_TOK_HEADER); 220 return G_BAD_TOK_HEADER;
221 221
222 if (ret) 222 if (ret)
223 return(ret); 223 return ret;
224 224
225 if (!ret) { 225 if (!ret) {
226 *buf_in = buf; 226 *buf_in = buf;
227 *body_size = toksize; 227 *body_size = toksize;
228 } 228 }
229 229
230 return(ret); 230 return ret;
231} 231}
232 232
233EXPORT_SYMBOL_GPL(g_verify_token_header); 233EXPORT_SYMBOL_GPL(g_verify_token_header);
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 032644610524..f375decc024b 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -237,6 +237,7 @@ get_key(const void *p, const void *end,
237 if (!supported_gss_krb5_enctype(alg)) { 237 if (!supported_gss_krb5_enctype(alg)) {
238 printk(KERN_WARNING "gss_kerberos_mech: unsupported " 238 printk(KERN_WARNING "gss_kerberos_mech: unsupported "
239 "encryption key algorithm %d\n", alg); 239 "encryption key algorithm %d\n", alg);
240 p = ERR_PTR(-EINVAL);
240 goto out_err; 241 goto out_err;
241 } 242 }
242 p = simple_get_netobj(p, end, &key); 243 p = simple_get_netobj(p, end, &key);
@@ -282,15 +283,19 @@ gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx)
282 ctx->enctype = ENCTYPE_DES_CBC_RAW; 283 ctx->enctype = ENCTYPE_DES_CBC_RAW;
283 284
284 ctx->gk5e = get_gss_krb5_enctype(ctx->enctype); 285 ctx->gk5e = get_gss_krb5_enctype(ctx->enctype);
285 if (ctx->gk5e == NULL) 286 if (ctx->gk5e == NULL) {
287 p = ERR_PTR(-EINVAL);
286 goto out_err; 288 goto out_err;
289 }
287 290
288 /* The downcall format was designed before we completely understood 291 /* The downcall format was designed before we completely understood
289 * the uses of the context fields; so it includes some stuff we 292 * the uses of the context fields; so it includes some stuff we
290 * just give some minimal sanity-checking, and some we ignore 293 * just give some minimal sanity-checking, and some we ignore
291 * completely (like the next twenty bytes): */ 294 * completely (like the next twenty bytes): */
292 if (unlikely(p + 20 > end || p + 20 < p)) 295 if (unlikely(p + 20 > end || p + 20 < p)) {
296 p = ERR_PTR(-EFAULT);
293 goto out_err; 297 goto out_err;
298 }
294 p += 20; 299 p += 20;
295 p = simple_get_bytes(p, end, &tmp, sizeof(tmp)); 300 p = simple_get_bytes(p, end, &tmp, sizeof(tmp));
296 if (IS_ERR(p)) 301 if (IS_ERR(p))
@@ -422,7 +427,7 @@ static int
422context_derive_keys_rc4(struct krb5_ctx *ctx) 427context_derive_keys_rc4(struct krb5_ctx *ctx)
423{ 428{
424 struct crypto_hash *hmac; 429 struct crypto_hash *hmac;
425 char sigkeyconstant[] = "signaturekey"; 430 static const char sigkeyconstant[] = "signaturekey";
426 int slen = strlen(sigkeyconstant) + 1; /* include null terminator */ 431 int slen = strlen(sigkeyconstant) + 1; /* include null terminator */
427 struct hash_desc desc; 432 struct hash_desc desc;
428 struct scatterlist sg[1]; 433 struct scatterlist sg[1];
@@ -619,6 +624,7 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx,
619 if (ctx->seq_send64 != ctx->seq_send) { 624 if (ctx->seq_send64 != ctx->seq_send) {
620 dprintk("%s: seq_send64 %lx, seq_send %x overflow?\n", __func__, 625 dprintk("%s: seq_send64 %lx, seq_send %x overflow?\n", __func__,
621 (long unsigned)ctx->seq_send64, ctx->seq_send); 626 (long unsigned)ctx->seq_send64, ctx->seq_send);
627 p = ERR_PTR(-EINVAL);
622 goto out_err; 628 goto out_err;
623 } 629 }
624 p = simple_get_bytes(p, end, &ctx->enctype, sizeof(ctx->enctype)); 630 p = simple_get_bytes(p, end, &ctx->enctype, sizeof(ctx->enctype));
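
The three small additions in this gss_krb5_mech.c diff all plug the same hole: the import helpers report failure through an error pointer, but on the touched early-exit paths the cursor p still held a valid address when jumping to out_err, so PTR_ERR() decoded a healthy pointer into a meaningless errno and the failure was effectively lost. A minimal sketch of the idiom with hypothetical names (simple_get_bytes() is the existing parsing helper used above):

        #include <linux/err.h>

        static int import_blob(const void *p, const void *end, u32 *out)
        {
                p = simple_get_bytes(p, end, out, sizeof(*out));
                if (IS_ERR(p))
                        goto out_err;           /* p already encodes a negative errno */
                if (p != end) {
                        p = ERR_PTR(-EINVAL);   /* the step these hunks add elsewhere */
                        goto out_err;
                }
                return 0;
        out_err:
                return PTR_ERR(p);              /* only meaningful if p is an ERR_PTR */
        }
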
diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
index 415c013ba382..62ac90c62cb1 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
@@ -162,5 +162,5 @@ krb5_get_seq_num(struct krb5_ctx *kctx,
162 *seqnum = ((plain[0]) | 162 *seqnum = ((plain[0]) |
163 (plain[1] << 8) | (plain[2] << 16) | (plain[3] << 24)); 163 (plain[1] << 8) | (plain[2] << 16) | (plain[3] << 24));
164 164
165 return (0); 165 return 0;
166} 166}
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 2689de39dc78..8b4061049d76 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -331,7 +331,7 @@ gss_delete_sec_context(struct gss_ctx **context_handle)
331 *context_handle); 331 *context_handle);
332 332
333 if (!*context_handle) 333 if (!*context_handle)
334 return(GSS_S_NO_CONTEXT); 334 return GSS_S_NO_CONTEXT;
335 if ((*context_handle)->internal_ctx_id) 335 if ((*context_handle)->internal_ctx_id)
336 (*context_handle)->mech_type->gm_ops 336 (*context_handle)->mech_type->gm_ops
337 ->gss_delete_sec_context((*context_handle) 337 ->gss_delete_sec_context((*context_handle)
diff --git a/net/sunrpc/auth_gss/gss_spkm3_mech.c b/net/sunrpc/auth_gss/gss_spkm3_mech.c
deleted file mode 100644
index dc3f1f5ed865..000000000000
--- a/net/sunrpc/auth_gss/gss_spkm3_mech.c
+++ /dev/null
@@ -1,244 +0,0 @@
1/*
2 * linux/net/sunrpc/gss_spkm3_mech.c
3 *
4 * Copyright (c) 2003 The Regents of the University of Michigan.
5 * All rights reserved.
6 *
7 * Andy Adamson <andros@umich.edu>
8 * J. Bruce Fields <bfields@umich.edu>
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. Neither the name of the University nor the names of its
20 * contributors may be used to endorse or promote products derived
21 * from this software without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
24 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
25 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
26 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
28 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
29 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
30 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 *
35 */
36
37#include <linux/err.h>
38#include <linux/module.h>
39#include <linux/init.h>
40#include <linux/types.h>
41#include <linux/slab.h>
42#include <linux/sunrpc/auth.h>
43#include <linux/in.h>
44#include <linux/sunrpc/svcauth_gss.h>
45#include <linux/sunrpc/gss_spkm3.h>
46#include <linux/sunrpc/xdr.h>
47#include <linux/crypto.h>
48
49#ifdef RPC_DEBUG
50# define RPCDBG_FACILITY RPCDBG_AUTH
51#endif
52
53static const void *
54simple_get_bytes(const void *p, const void *end, void *res, int len)
55{
56 const void *q = (const void *)((const char *)p + len);
57 if (unlikely(q > end || q < p))
58 return ERR_PTR(-EFAULT);
59 memcpy(res, p, len);
60 return q;
61}
62
63static const void *
64simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res)
65{
66 const void *q;
67 unsigned int len;
68 p = simple_get_bytes(p, end, &len, sizeof(len));
69 if (IS_ERR(p))
70 return p;
71 res->len = len;
72 if (len == 0) {
73 res->data = NULL;
74 return p;
75 }
76 q = (const void *)((const char *)p + len);
77 if (unlikely(q > end || q < p))
78 return ERR_PTR(-EFAULT);
79 res->data = kmemdup(p, len, GFP_NOFS);
80 if (unlikely(res->data == NULL))
81 return ERR_PTR(-ENOMEM);
82 return q;
83}
84
85static int
86gss_import_sec_context_spkm3(const void *p, size_t len,
87 struct gss_ctx *ctx_id,
88 gfp_t gfp_mask)
89{
90 const void *end = (const void *)((const char *)p + len);
91 struct spkm3_ctx *ctx;
92 int version;
93
94 if (!(ctx = kzalloc(sizeof(*ctx), gfp_mask)))
95 goto out_err;
96
97 p = simple_get_bytes(p, end, &version, sizeof(version));
98 if (IS_ERR(p))
99 goto out_err_free_ctx;
100 if (version != 1) {
101 dprintk("RPC: unknown spkm3 token format: "
102 "obsolete nfs-utils?\n");
103 goto out_err_free_ctx;
104 }
105
106 p = simple_get_netobj(p, end, &ctx->ctx_id);
107 if (IS_ERR(p))
108 goto out_err_free_ctx;
109
110 p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime));
111 if (IS_ERR(p))
112 goto out_err_free_ctx_id;
113
114 p = simple_get_netobj(p, end, &ctx->mech_used);
115 if (IS_ERR(p))
116 goto out_err_free_ctx_id;
117
118 p = simple_get_bytes(p, end, &ctx->ret_flags, sizeof(ctx->ret_flags));
119 if (IS_ERR(p))
120 goto out_err_free_mech;
121
122 p = simple_get_netobj(p, end, &ctx->conf_alg);
123 if (IS_ERR(p))
124 goto out_err_free_mech;
125
126 p = simple_get_netobj(p, end, &ctx->derived_conf_key);
127 if (IS_ERR(p))
128 goto out_err_free_conf_alg;
129
130 p = simple_get_netobj(p, end, &ctx->intg_alg);
131 if (IS_ERR(p))
132 goto out_err_free_conf_key;
133
134 p = simple_get_netobj(p, end, &ctx->derived_integ_key);
135 if (IS_ERR(p))
136 goto out_err_free_intg_alg;
137
138 if (p != end)
139 goto out_err_free_intg_key;
140
141 ctx_id->internal_ctx_id = ctx;
142
143 dprintk("RPC: Successfully imported new spkm context.\n");
144 return 0;
145
146out_err_free_intg_key:
147 kfree(ctx->derived_integ_key.data);
148out_err_free_intg_alg:
149 kfree(ctx->intg_alg.data);
150out_err_free_conf_key:
151 kfree(ctx->derived_conf_key.data);
152out_err_free_conf_alg:
153 kfree(ctx->conf_alg.data);
154out_err_free_mech:
155 kfree(ctx->mech_used.data);
156out_err_free_ctx_id:
157 kfree(ctx->ctx_id.data);
158out_err_free_ctx:
159 kfree(ctx);
160out_err:
161 return PTR_ERR(p);
162}
163
164static void
165gss_delete_sec_context_spkm3(void *internal_ctx)
166{
167 struct spkm3_ctx *sctx = internal_ctx;
168
169 kfree(sctx->derived_integ_key.data);
170 kfree(sctx->intg_alg.data);
171 kfree(sctx->derived_conf_key.data);
172 kfree(sctx->conf_alg.data);
173 kfree(sctx->mech_used.data);
174 kfree(sctx->ctx_id.data);
175 kfree(sctx);
176}
177
178static u32
179gss_verify_mic_spkm3(struct gss_ctx *ctx,
180 struct xdr_buf *signbuf,
181 struct xdr_netobj *checksum)
182{
183 u32 maj_stat = 0;
184 struct spkm3_ctx *sctx = ctx->internal_ctx_id;
185
186 maj_stat = spkm3_read_token(sctx, checksum, signbuf, SPKM_MIC_TOK);
187
188 dprintk("RPC: gss_verify_mic_spkm3 returning %d\n", maj_stat);
189 return maj_stat;
190}
191
192static u32
193gss_get_mic_spkm3(struct gss_ctx *ctx,
194 struct xdr_buf *message_buffer,
195 struct xdr_netobj *message_token)
196{
197 u32 err = 0;
198 struct spkm3_ctx *sctx = ctx->internal_ctx_id;
199
200 err = spkm3_make_token(sctx, message_buffer,
201 message_token, SPKM_MIC_TOK);
202 dprintk("RPC: gss_get_mic_spkm3 returning %d\n", err);
203 return err;
204}
205
206static const struct gss_api_ops gss_spkm3_ops = {
207 .gss_import_sec_context = gss_import_sec_context_spkm3,
208 .gss_get_mic = gss_get_mic_spkm3,
209 .gss_verify_mic = gss_verify_mic_spkm3,
210 .gss_delete_sec_context = gss_delete_sec_context_spkm3,
211};
212
213static struct pf_desc gss_spkm3_pfs[] = {
214 {RPC_AUTH_GSS_SPKM, RPC_GSS_SVC_NONE, "spkm3"},
215 {RPC_AUTH_GSS_SPKMI, RPC_GSS_SVC_INTEGRITY, "spkm3i"},
216};
217
218static struct gss_api_mech gss_spkm3_mech = {
219 .gm_name = "spkm3",
220 .gm_owner = THIS_MODULE,
221 .gm_oid = {7, "\053\006\001\005\005\001\003"},
222 .gm_ops = &gss_spkm3_ops,
223 .gm_pf_num = ARRAY_SIZE(gss_spkm3_pfs),
224 .gm_pfs = gss_spkm3_pfs,
225};
226
227static int __init init_spkm3_module(void)
228{
229 int status;
230
231 status = gss_mech_register(&gss_spkm3_mech);
232 if (status)
233 printk("Failed to register spkm3 gss mechanism!\n");
234 return status;
235}
236
237static void __exit cleanup_spkm3_module(void)
238{
239 gss_mech_unregister(&gss_spkm3_mech);
240}
241
242MODULE_LICENSE("GPL");
243module_init(init_spkm3_module);
244module_exit(cleanup_spkm3_module);
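The deleted gss_spkm3_mech.c walked the flat downcall buffer with simple_get_bytes()/simple_get_netobj(), whose "q > end || q < p" test rejects both a length that runs past the buffer and one large enough to wrap the pointer. The same guard in isolation, as a sketch rather than the removed function verbatim:

#include <linux/err.h>
#include <linux/string.h>

static const void *get_bytes(const void *p, const void *end, void *res, int len)
{
	const void *q = (const char *)p + len;

	if (q > end || q < p)		/* overruns the buffer or wraps around */
		return ERR_PTR(-EFAULT);
	memcpy(res, p, len);
	return q;			/* caller resumes parsing from here */
}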
diff --git a/net/sunrpc/auth_gss/gss_spkm3_seal.c b/net/sunrpc/auth_gss/gss_spkm3_seal.c
deleted file mode 100644
index 5a3a65a0e2b4..000000000000
--- a/net/sunrpc/auth_gss/gss_spkm3_seal.c
+++ /dev/null
@@ -1,186 +0,0 @@
1/*
2 * linux/net/sunrpc/gss_spkm3_seal.c
3 *
4 * Copyright (c) 2003 The Regents of the University of Michigan.
5 * All rights reserved.
6 *
7 * Andy Adamson <andros@umich.edu>
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
23 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
24 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
29 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 */
35
36#include <linux/types.h>
37#include <linux/jiffies.h>
38#include <linux/sunrpc/gss_spkm3.h>
39#include <linux/random.h>
40#include <linux/crypto.h>
41#include <linux/pagemap.h>
42#include <linux/scatterlist.h>
43#include <linux/sunrpc/xdr.h>
44
45#ifdef RPC_DEBUG
46# define RPCDBG_FACILITY RPCDBG_AUTH
47#endif
48
49const struct xdr_netobj hmac_md5_oid = { 8, "\x2B\x06\x01\x05\x05\x08\x01\x01"};
50const struct xdr_netobj cast5_cbc_oid = {9, "\x2A\x86\x48\x86\xF6\x7D\x07\x42\x0A"};
51
52/*
53 * spkm3_make_token()
54 *
55 * Only SPKM_MIC_TOK with md5 intg-alg is supported
56 */
57
58u32
59spkm3_make_token(struct spkm3_ctx *ctx,
60 struct xdr_buf * text, struct xdr_netobj * token,
61 int toktype)
62{
63 s32 checksum_type;
64 char tokhdrbuf[25];
65 char cksumdata[16];
66 struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata};
67 struct xdr_netobj mic_hdr = {.len = 0, .data = tokhdrbuf};
68 int tokenlen = 0;
69 unsigned char *ptr;
70 s32 now;
71 int ctxelen = 0, ctxzbit = 0;
72 int md5elen = 0, md5zbit = 0;
73
74 now = jiffies;
75
76 if (ctx->ctx_id.len != 16) {
77 dprintk("RPC: spkm3_make_token BAD ctx_id.len %d\n",
78 ctx->ctx_id.len);
79 goto out_err;
80 }
81
82 if (!g_OID_equal(&ctx->intg_alg, &hmac_md5_oid)) {
83 dprintk("RPC: gss_spkm3_seal: unsupported I-ALG "
84 "algorithm. only support hmac-md5 I-ALG.\n");
85 goto out_err;
86 } else
87 checksum_type = CKSUMTYPE_HMAC_MD5;
88
89 if (!g_OID_equal(&ctx->conf_alg, &cast5_cbc_oid)) {
90 dprintk("RPC: gss_spkm3_seal: unsupported C-ALG "
91 "algorithm\n");
92 goto out_err;
93 }
94
95 if (toktype == SPKM_MIC_TOK) {
96 /* Calculate checksum over the mic-header */
97 asn1_bitstring_len(&ctx->ctx_id, &ctxelen, &ctxzbit);
98 spkm3_mic_header(&mic_hdr.data, &mic_hdr.len, ctx->ctx_id.data,
99 ctxelen, ctxzbit);
100 if (make_spkm3_checksum(checksum_type, &ctx->derived_integ_key,
101 (char *)mic_hdr.data, mic_hdr.len,
102 text, 0, &md5cksum))
103 goto out_err;
104
105 asn1_bitstring_len(&md5cksum, &md5elen, &md5zbit);
106 tokenlen = 10 + ctxelen + 1 + md5elen + 1;
107
108 /* Create token header using generic routines */
109 token->len = g_token_size(&ctx->mech_used, tokenlen + 2);
110
111 ptr = token->data;
112 g_make_token_header(&ctx->mech_used, tokenlen + 2, &ptr);
113
114 spkm3_make_mic_token(&ptr, tokenlen, &mic_hdr, &md5cksum, md5elen, md5zbit);
115 } else if (toktype == SPKM_WRAP_TOK) { /* Not Supported */
116 dprintk("RPC: gss_spkm3_seal: SPKM_WRAP_TOK "
117 "not supported\n");
118 goto out_err;
119 }
120
121 /* XXX need to implement sequence numbers, and ctx->expired */
122
123 return GSS_S_COMPLETE;
124out_err:
125 token->data = NULL;
126 token->len = 0;
127 return GSS_S_FAILURE;
128}
129
130static int
131spkm3_checksummer(struct scatterlist *sg, void *data)
132{
133 struct hash_desc *desc = data;
134
135 return crypto_hash_update(desc, sg, sg->length);
136}
137
138/* checksum the plaintext data and hdrlen bytes of the token header */
139s32
140make_spkm3_checksum(s32 cksumtype, struct xdr_netobj *key, char *header,
141 unsigned int hdrlen, struct xdr_buf *body,
142 unsigned int body_offset, struct xdr_netobj *cksum)
143{
144 char *cksumname;
145 struct hash_desc desc; /* XXX add to ctx? */
146 struct scatterlist sg[1];
147 int err;
148
149 switch (cksumtype) {
150 case CKSUMTYPE_HMAC_MD5:
151 cksumname = "hmac(md5)";
152 break;
153 default:
154 dprintk("RPC: spkm3_make_checksum:"
155 " unsupported checksum %d", cksumtype);
156 return GSS_S_FAILURE;
157 }
158
159 if (key->data == NULL || key->len <= 0) return GSS_S_FAILURE;
160
161 desc.tfm = crypto_alloc_hash(cksumname, 0, CRYPTO_ALG_ASYNC);
162 if (IS_ERR(desc.tfm))
163 return GSS_S_FAILURE;
164 cksum->len = crypto_hash_digestsize(desc.tfm);
165 desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
166
167 err = crypto_hash_setkey(desc.tfm, key->data, key->len);
168 if (err)
169 goto out;
170
171 err = crypto_hash_init(&desc);
172 if (err)
173 goto out;
174
175 sg_init_one(sg, header, hdrlen);
176 crypto_hash_update(&desc, sg, sg->length);
177
178 xdr_process_buf(body, body_offset, body->len - body_offset,
179 spkm3_checksummer, &desc);
180 crypto_hash_final(&desc, cksum->data);
181
182out:
183 crypto_free_hash(desc.tfm);
184
185 return err ? GSS_S_FAILURE : 0;
186}
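make_spkm3_checksum() above is written against the legacy synchronous crypto_hash API of this kernel generation: allocate an "hmac(md5)" transform, set the key, feed scatterlists, read the digest, free the transform. Condensed into one helper as a sketch (error handling abbreviated, buffers assumed to be supplied by the caller):

#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/types.h>
#include <linux/err.h>

static int hmac_md5_digest(const u8 *key, unsigned int keylen,
			   const u8 *data, unsigned int len, u8 *out)
{
	struct hash_desc desc;
	struct scatterlist sg[1];
	int err;

	desc.tfm = crypto_alloc_hash("hmac(md5)", 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(desc.tfm))
		return PTR_ERR(desc.tfm);
	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;

	err = crypto_hash_setkey(desc.tfm, key, keylen);
	if (!err)
		err = crypto_hash_init(&desc);
	if (!err) {
		sg_init_one(sg, data, len);
		err = crypto_hash_update(&desc, sg, len);
	}
	if (!err)
		err = crypto_hash_final(&desc, out);	/* 16-byte HMAC-MD5 */

	crypto_free_hash(desc.tfm);
	return err;
}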
diff --git a/net/sunrpc/auth_gss/gss_spkm3_token.c b/net/sunrpc/auth_gss/gss_spkm3_token.c
deleted file mode 100644
index a99825d7caa0..000000000000
--- a/net/sunrpc/auth_gss/gss_spkm3_token.c
+++ /dev/null
@@ -1,267 +0,0 @@
1/*
2 * linux/net/sunrpc/gss_spkm3_token.c
3 *
4 * Copyright (c) 2003 The Regents of the University of Michigan.
5 * All rights reserved.
6 *
7 * Andy Adamson <andros@umich.edu>
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
23 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
24 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
29 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 */
35
36#include <linux/types.h>
37#include <linux/slab.h>
38#include <linux/jiffies.h>
39#include <linux/sunrpc/gss_spkm3.h>
40#include <linux/random.h>
41#include <linux/crypto.h>
42
43#ifdef RPC_DEBUG
44# define RPCDBG_FACILITY RPCDBG_AUTH
45#endif
46
47/*
48 * asn1_bitstring_len()
49 *
50 * calculate the asn1 bitstring length of the xdr_netobject
51 */
52void
53asn1_bitstring_len(struct xdr_netobj *in, int *enclen, int *zerobits)
54{
55 int i, zbit = 0,elen = in->len;
56 char *ptr;
57
58 ptr = &in->data[in->len -1];
59
60 /* count trailing 0's */
61 for(i = in->len; i > 0; i--) {
62 if (*ptr == 0) {
63 ptr--;
64 elen--;
65 } else
66 break;
67 }
68
69 /* count number of 0 bits in final octet */
70 ptr = &in->data[elen - 1];
71 for(i = 0; i < 8; i++) {
72 short mask = 0x01;
73
74 if (!((mask << i) & *ptr))
75 zbit++;
76 else
77 break;
78 }
79 *enclen = elen;
80 *zerobits = zbit;
81}
82
83/*
84 * decode_asn1_bitstring()
85 *
86 * decode a bitstring into a buffer of the expected length.
87 * enclen = bit string length
88 * explen = expected length (define in rfc)
89 */
90int
91decode_asn1_bitstring(struct xdr_netobj *out, char *in, int enclen, int explen)
92{
93 if (!(out->data = kzalloc(explen,GFP_NOFS)))
94 return 0;
95 out->len = explen;
96 memcpy(out->data, in, enclen);
97 return 1;
98}
99
100/*
101 * SPKMInnerContextToken choice SPKM_MIC asn1 token layout
102 *
103 * contextid is always 16 bytes plain data. max asn1 bitstring len = 17.
104 *
105 * tokenlen = pos[0] to end of token (max pos[45] with MD5 cksum)
106 *
107 * pos value
108 * ----------
109 * [0] a4 SPKM-MIC tag
110 * [1] ?? innertoken length (max 44)
111 *
112 *
113 * tok_hdr piece of checksum data starts here
114 *
115 * the maximum mic-header len = 9 + 17 = 26
116 * mic-header
117 * ----------
118 * [2] 30 SEQUENCE tag
119 * [3] ?? mic-header length: (max 23) = TokenID + ContextID
120 *
121 * TokenID - all fields constant and can be hardcoded
122 * -------
123 * [4] 02 Type 2
124 * [5] 02 Length 2
125 * [6][7] 01 01 TokenID (SPKM_MIC_TOK)
126 *
127 * ContextID - encoded length not constant, calculated
128 * ---------
129 * [8] 03 Type 3
130 * [9] ?? encoded length
131 * [10] ?? ctxzbit
132 * [11] contextid
133 *
134 * mic_header piece of checksum data ends here.
135 *
136 * int-cksum - encoded length not constant, calculated
137 * ---------
138 * [??] 03 Type 3
139 * [??] ?? encoded length
140 * [??] ?? md5zbit
141 * [??] int-cksum (NID_md5 = 16)
142 *
143 * maximum SPKM-MIC innercontext token length =
144 * 10 + encoded contextid_size(17 max) + 2 + encoded
 145 * cksum_size (17 max for NID_md5) = 46
146 */
147
148/*
149 * spkm3_mic_header()
150 *
151 * Prepare the SPKM_MIC_TOK mic-header for check-sum calculation
152 * elen: 16 byte context id asn1 bitstring encoded length
153 */
154void
155spkm3_mic_header(unsigned char **hdrbuf, unsigned int *hdrlen, unsigned char *ctxdata, int elen, int zbit)
156{
157 char *hptr = *hdrbuf;
158 char *top = *hdrbuf;
159
160 *(u8 *)hptr++ = 0x30;
161 *(u8 *)hptr++ = elen + 7; /* on the wire header length */
162
163 /* tokenid */
164 *(u8 *)hptr++ = 0x02;
165 *(u8 *)hptr++ = 0x02;
166 *(u8 *)hptr++ = 0x01;
167 *(u8 *)hptr++ = 0x01;
168
 169 /* contextid */
170 *(u8 *)hptr++ = 0x03;
171 *(u8 *)hptr++ = elen + 1; /* add 1 to include zbit */
172 *(u8 *)hptr++ = zbit;
173 memcpy(hptr, ctxdata, elen);
174 hptr += elen;
175 *hdrlen = hptr - top;
176}
177
178/*
179 * spkm3_mic_innercontext_token()
180 *
181 * *tokp points to the beginning of the SPKM_MIC token described
182 * in rfc 2025, section 3.2.1:
183 *
184 * toklen is the inner token length
185 */
186void
187spkm3_make_mic_token(unsigned char **tokp, int toklen, struct xdr_netobj *mic_hdr, struct xdr_netobj *md5cksum, int md5elen, int md5zbit)
188{
189 unsigned char *ict = *tokp;
190
191 *(u8 *)ict++ = 0xa4;
192 *(u8 *)ict++ = toklen;
193 memcpy(ict, mic_hdr->data, mic_hdr->len);
194 ict += mic_hdr->len;
195
196 *(u8 *)ict++ = 0x03;
197 *(u8 *)ict++ = md5elen + 1; /* add 1 to include zbit */
198 *(u8 *)ict++ = md5zbit;
199 memcpy(ict, md5cksum->data, md5elen);
200}
201
202u32
203spkm3_verify_mic_token(unsigned char **tokp, int *mic_hdrlen, unsigned char **cksum)
204{
205 struct xdr_netobj spkm3_ctx_id = {.len =0, .data = NULL};
206 unsigned char *ptr = *tokp;
207 int ctxelen;
208 u32 ret = GSS_S_DEFECTIVE_TOKEN;
209
210 /* spkm3 innercontext token preamble */
211 if ((ptr[0] != 0xa4) || (ptr[2] != 0x30)) {
212 dprintk("RPC: BAD SPKM ictoken preamble\n");
213 goto out;
214 }
215
216 *mic_hdrlen = ptr[3];
217
218 /* token type */
219 if ((ptr[4] != 0x02) || (ptr[5] != 0x02)) {
220 dprintk("RPC: BAD asn1 SPKM3 token type\n");
221 goto out;
222 }
223
224 /* only support SPKM_MIC_TOK */
225 if((ptr[6] != 0x01) || (ptr[7] != 0x01)) {
226 dprintk("RPC: ERROR unsupported SPKM3 token\n");
227 goto out;
228 }
229
230 /* contextid */
231 if (ptr[8] != 0x03) {
232 dprintk("RPC: BAD SPKM3 asn1 context-id type\n");
233 goto out;
234 }
235
236 ctxelen = ptr[9];
237 if (ctxelen > 17) { /* length includes asn1 zbit octet */
238 dprintk("RPC: BAD SPKM3 contextid len %d\n", ctxelen);
239 goto out;
240 }
241
242 /* ignore ptr[10] */
243
244 if(!decode_asn1_bitstring(&spkm3_ctx_id, &ptr[11], ctxelen - 1, 16))
245 goto out;
246
247 /*
248 * in the current implementation: the optional int-alg is not present
249 * so the default int-alg (md5) is used the optional snd-seq field is
250 * also not present
251 */
252
253 if (*mic_hdrlen != 6 + ctxelen) {
254 dprintk("RPC: BAD SPKM_ MIC_TOK header len %d: we only "
255 "support default int-alg (should be absent) "
256 "and do not support snd-seq\n", *mic_hdrlen);
257 goto out;
258 }
259 /* checksum */
260 *cksum = (&ptr[10] + ctxelen); /* ctxelen includes ptr[10] */
261
262 ret = GSS_S_COMPLETE;
263out:
264 kfree(spkm3_ctx_id.data);
265 return ret;
266}
267
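Worked through for the only case the code supports (16-byte context id, HMAC-MD5 checksum), the sizes in the layout comment above check out: each value is ASN.1 bitstring-encoded with one extra zbit octet, so the contextid encodes to at most 17 octets and the MD5 checksum likewise to at most 17; the whole SPKM-MIC innercontext token is then 10 fixed octets + 17 + 2 + 17 = 46, and the inner-token length byte at position [1], which excludes the two preamble octets, is at most 44. That matches spkm3_make_token() in gss_spkm3_seal.c, where tokenlen = 10 + ctxelen + 1 + md5elen + 1 = 44 when ctxelen = md5elen = 16.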
diff --git a/net/sunrpc/auth_gss/gss_spkm3_unseal.c b/net/sunrpc/auth_gss/gss_spkm3_unseal.c
deleted file mode 100644
index cc21ee860bb6..000000000000
--- a/net/sunrpc/auth_gss/gss_spkm3_unseal.c
+++ /dev/null
@@ -1,127 +0,0 @@
1/*
2 * linux/net/sunrpc/gss_spkm3_unseal.c
3 *
4 * Copyright (c) 2003 The Regents of the University of Michigan.
5 * All rights reserved.
6 *
7 * Andy Adamson <andros@umich.edu>
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
23 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
24 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
29 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 */
35
36#include <linux/types.h>
37#include <linux/slab.h>
38#include <linux/jiffies.h>
39#include <linux/sunrpc/gss_spkm3.h>
40#include <linux/crypto.h>
41
42#ifdef RPC_DEBUG
43# define RPCDBG_FACILITY RPCDBG_AUTH
44#endif
45
46/*
47 * spkm3_read_token()
48 *
49 * only SPKM_MIC_TOK with md5 intg-alg is supported
50 */
51u32
52spkm3_read_token(struct spkm3_ctx *ctx,
53 struct xdr_netobj *read_token, /* checksum */
54 struct xdr_buf *message_buffer, /* signbuf */
55 int toktype)
56{
57 s32 checksum_type;
58 s32 code;
59 struct xdr_netobj wire_cksum = {.len =0, .data = NULL};
60 char cksumdata[16];
61 struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata};
62 unsigned char *ptr = (unsigned char *)read_token->data;
63 unsigned char *cksum;
64 int bodysize, md5elen;
65 int mic_hdrlen;
66 u32 ret = GSS_S_DEFECTIVE_TOKEN;
67
68 if (g_verify_token_header((struct xdr_netobj *) &ctx->mech_used,
69 &bodysize, &ptr, read_token->len))
70 goto out;
71
72 /* decode the token */
73
74 if (toktype != SPKM_MIC_TOK) {
75 dprintk("RPC: BAD SPKM3 token type: %d\n", toktype);
76 goto out;
77 }
78
79 if ((ret = spkm3_verify_mic_token(&ptr, &mic_hdrlen, &cksum)))
80 goto out;
81
82 if (*cksum++ != 0x03) {
83 dprintk("RPC: spkm3_read_token BAD checksum type\n");
84 goto out;
85 }
86 md5elen = *cksum++;
87 cksum++; /* move past the zbit */
88
89 if (!decode_asn1_bitstring(&wire_cksum, cksum, md5elen - 1, 16))
90 goto out;
91
92 /* HARD CODED FOR MD5 */
93
94 /* compute the checksum of the message.
95 * ptr + 2 = start of header piece of checksum
96 * mic_hdrlen + 2 = length of header piece of checksum
97 */
98 ret = GSS_S_DEFECTIVE_TOKEN;
99 if (!g_OID_equal(&ctx->intg_alg, &hmac_md5_oid)) {
100 dprintk("RPC: gss_spkm3_seal: unsupported I-ALG "
101 "algorithm\n");
102 goto out;
103 }
104
105 checksum_type = CKSUMTYPE_HMAC_MD5;
106
107 code = make_spkm3_checksum(checksum_type,
108 &ctx->derived_integ_key, ptr + 2, mic_hdrlen + 2,
109 message_buffer, 0, &md5cksum);
110
111 if (code)
112 goto out;
113
114 ret = GSS_S_BAD_SIG;
115 code = memcmp(md5cksum.data, wire_cksum.data, wire_cksum.len);
116 if (code) {
117 dprintk("RPC: bad MIC checksum\n");
118 goto out;
119 }
120
121
122 /* XXX: need to add expiration and sequencing */
123 ret = GSS_S_COMPLETE;
124out:
125 kfree(wire_cksum.data);
126 return ret;
127}
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index cc385b3a59c2..dec2a6fc7c12 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -964,7 +964,7 @@ svcauth_gss_set_client(struct svc_rqst *rqstp)
964 if (rqstp->rq_gssclient == NULL) 964 if (rqstp->rq_gssclient == NULL)
965 return SVC_DENIED; 965 return SVC_DENIED;
966 stat = svcauth_unix_set_client(rqstp); 966 stat = svcauth_unix_set_client(rqstp);
967 if (stat == SVC_DROP) 967 if (stat == SVC_DROP || stat == SVC_CLOSE)
968 return stat; 968 return stat;
969 return SVC_OK; 969 return SVC_OK;
970} 970}
@@ -1018,7 +1018,7 @@ static int svcauth_gss_handle_init(struct svc_rqst *rqstp,
1018 return SVC_DENIED; 1018 return SVC_DENIED;
1019 memset(&rsikey, 0, sizeof(rsikey)); 1019 memset(&rsikey, 0, sizeof(rsikey));
1020 if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx)) 1020 if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx))
1021 return SVC_DROP; 1021 return SVC_CLOSE;
1022 *authp = rpc_autherr_badverf; 1022 *authp = rpc_autherr_badverf;
1023 if (svc_safe_getnetobj(argv, &tmpobj)) { 1023 if (svc_safe_getnetobj(argv, &tmpobj)) {
1024 kfree(rsikey.in_handle.data); 1024 kfree(rsikey.in_handle.data);
@@ -1026,38 +1026,35 @@ static int svcauth_gss_handle_init(struct svc_rqst *rqstp,
1026 } 1026 }
1027 if (dup_netobj(&rsikey.in_token, &tmpobj)) { 1027 if (dup_netobj(&rsikey.in_token, &tmpobj)) {
1028 kfree(rsikey.in_handle.data); 1028 kfree(rsikey.in_handle.data);
1029 return SVC_DROP; 1029 return SVC_CLOSE;
1030 } 1030 }
1031 1031
1032 /* Perform upcall, or find upcall result: */ 1032 /* Perform upcall, or find upcall result: */
1033 rsip = rsi_lookup(&rsikey); 1033 rsip = rsi_lookup(&rsikey);
1034 rsi_free(&rsikey); 1034 rsi_free(&rsikey);
1035 if (!rsip) 1035 if (!rsip)
1036 return SVC_DROP; 1036 return SVC_CLOSE;
1037 switch (cache_check(&rsi_cache, &rsip->h, &rqstp->rq_chandle)) { 1037 if (cache_check(&rsi_cache, &rsip->h, &rqstp->rq_chandle) < 0)
1038 case -EAGAIN:
1039 case -ETIMEDOUT:
1040 case -ENOENT:
1041 /* No upcall result: */ 1038 /* No upcall result: */
1042 return SVC_DROP; 1039 return SVC_CLOSE;
1043 case 0: 1040
1044 ret = SVC_DROP; 1041 ret = SVC_CLOSE;
1045 /* Got an answer to the upcall; use it: */ 1042 /* Got an answer to the upcall; use it: */
1046 if (gss_write_init_verf(rqstp, rsip)) 1043 if (gss_write_init_verf(rqstp, rsip))
1047 goto out; 1044 goto out;
1048 if (resv->iov_len + 4 > PAGE_SIZE) 1045 if (resv->iov_len + 4 > PAGE_SIZE)
1049 goto out; 1046 goto out;
1050 svc_putnl(resv, RPC_SUCCESS); 1047 svc_putnl(resv, RPC_SUCCESS);
1051 if (svc_safe_putnetobj(resv, &rsip->out_handle)) 1048 if (svc_safe_putnetobj(resv, &rsip->out_handle))
1052 goto out; 1049 goto out;
1053 if (resv->iov_len + 3 * 4 > PAGE_SIZE) 1050 if (resv->iov_len + 3 * 4 > PAGE_SIZE)
1054 goto out; 1051 goto out;
1055 svc_putnl(resv, rsip->major_status); 1052 svc_putnl(resv, rsip->major_status);
1056 svc_putnl(resv, rsip->minor_status); 1053 svc_putnl(resv, rsip->minor_status);
1057 svc_putnl(resv, GSS_SEQ_WIN); 1054 svc_putnl(resv, GSS_SEQ_WIN);
1058 if (svc_safe_putnetobj(resv, &rsip->out_token)) 1055 if (svc_safe_putnetobj(resv, &rsip->out_token))
1059 goto out; 1056 goto out;
1060 } 1057
1061 ret = SVC_COMPLETE; 1058 ret = SVC_COMPLETE;
1062out: 1059out:
1063 cache_put(&rsip->h, &rsi_cache); 1060 cache_put(&rsip->h, &rsi_cache);
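The svcauth_gss.c hunks above replace SVC_DROP with SVC_CLOSE on every path where the GSS init request has been irretrievably lost (a failed dup_netobj(), a missing or failed upcall result), and the multi-case cache_check() switch collapses to a single "< 0" test because all of its negative outcomes were handled identically. In this kernel series SVC_CLOSE behaves like SVC_DROP but additionally asks the transport to close a connection-oriented socket, so a TCP client notices the loss and retransmits instead of waiting indefinitely; svcauth_gss_set_client() is updated to pass either verdict through unchanged.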
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 2b06410e584e..e433e7580e27 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -28,21 +28,21 @@
28#include <linux/workqueue.h> 28#include <linux/workqueue.h>
29#include <linux/mutex.h> 29#include <linux/mutex.h>
30#include <linux/pagemap.h> 30#include <linux/pagemap.h>
31#include <linux/smp_lock.h>
32#include <asm/ioctls.h> 31#include <asm/ioctls.h>
33#include <linux/sunrpc/types.h> 32#include <linux/sunrpc/types.h>
34#include <linux/sunrpc/cache.h> 33#include <linux/sunrpc/cache.h>
35#include <linux/sunrpc/stats.h> 34#include <linux/sunrpc/stats.h>
36#include <linux/sunrpc/rpc_pipe_fs.h> 35#include <linux/sunrpc/rpc_pipe_fs.h>
36#include "netns.h"
37 37
38#define RPCDBG_FACILITY RPCDBG_CACHE 38#define RPCDBG_FACILITY RPCDBG_CACHE
39 39
40static int cache_defer_req(struct cache_req *req, struct cache_head *item); 40static void cache_defer_req(struct cache_req *req, struct cache_head *item);
41static void cache_revisit_request(struct cache_head *item); 41static void cache_revisit_request(struct cache_head *item);
42 42
43static void cache_init(struct cache_head *h) 43static void cache_init(struct cache_head *h)
44{ 44{
45 time_t now = get_seconds(); 45 time_t now = seconds_since_boot();
46 h->next = NULL; 46 h->next = NULL;
47 h->flags = 0; 47 h->flags = 0;
48 kref_init(&h->ref); 48 kref_init(&h->ref);
@@ -52,7 +52,7 @@ static void cache_init(struct cache_head *h)
52 52
53static inline int cache_is_expired(struct cache_detail *detail, struct cache_head *h) 53static inline int cache_is_expired(struct cache_detail *detail, struct cache_head *h)
54{ 54{
55 return (h->expiry_time < get_seconds()) || 55 return (h->expiry_time < seconds_since_boot()) ||
56 (detail->flush_time > h->last_refresh); 56 (detail->flush_time > h->last_refresh);
57} 57}
58 58
@@ -127,7 +127,7 @@ static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch);
127static void cache_fresh_locked(struct cache_head *head, time_t expiry) 127static void cache_fresh_locked(struct cache_head *head, time_t expiry)
128{ 128{
129 head->expiry_time = expiry; 129 head->expiry_time = expiry;
130 head->last_refresh = get_seconds(); 130 head->last_refresh = seconds_since_boot();
131 set_bit(CACHE_VALID, &head->flags); 131 set_bit(CACHE_VALID, &head->flags);
132} 132}
133 133
@@ -238,7 +238,7 @@ int cache_check(struct cache_detail *detail,
238 238
239 /* now see if we want to start an upcall */ 239 /* now see if we want to start an upcall */
240 refresh_age = (h->expiry_time - h->last_refresh); 240 refresh_age = (h->expiry_time - h->last_refresh);
241 age = get_seconds() - h->last_refresh; 241 age = seconds_since_boot() - h->last_refresh;
242 242
243 if (rqstp == NULL) { 243 if (rqstp == NULL) {
244 if (rv == -EAGAIN) 244 if (rv == -EAGAIN)
@@ -253,7 +253,7 @@ int cache_check(struct cache_detail *detail,
253 cache_revisit_request(h); 253 cache_revisit_request(h);
254 if (rv == -EAGAIN) { 254 if (rv == -EAGAIN) {
255 set_bit(CACHE_NEGATIVE, &h->flags); 255 set_bit(CACHE_NEGATIVE, &h->flags);
256 cache_fresh_locked(h, get_seconds()+CACHE_NEW_EXPIRY); 256 cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY);
257 cache_fresh_unlocked(h, detail); 257 cache_fresh_unlocked(h, detail);
258 rv = -ENOENT; 258 rv = -ENOENT;
259 } 259 }
@@ -268,7 +268,8 @@ int cache_check(struct cache_detail *detail,
268 } 268 }
269 269
270 if (rv == -EAGAIN) { 270 if (rv == -EAGAIN) {
271 if (cache_defer_req(rqstp, h) < 0) { 271 cache_defer_req(rqstp, h);
272 if (!test_bit(CACHE_PENDING, &h->flags)) {
272 /* Request is not deferred */ 273 /* Request is not deferred */
273 rv = cache_is_valid(detail, h); 274 rv = cache_is_valid(detail, h);
274 if (rv == -EAGAIN) 275 if (rv == -EAGAIN)
@@ -388,11 +389,11 @@ static int cache_clean(void)
388 return -1; 389 return -1;
389 } 390 }
390 current_detail = list_entry(next, struct cache_detail, others); 391 current_detail = list_entry(next, struct cache_detail, others);
391 if (current_detail->nextcheck > get_seconds()) 392 if (current_detail->nextcheck > seconds_since_boot())
392 current_index = current_detail->hash_size; 393 current_index = current_detail->hash_size;
393 else { 394 else {
394 current_index = 0; 395 current_index = 0;
395 current_detail->nextcheck = get_seconds()+30*60; 396 current_detail->nextcheck = seconds_since_boot()+30*60;
396 } 397 }
397 } 398 }
398 399
@@ -477,7 +478,7 @@ EXPORT_SYMBOL_GPL(cache_flush);
477void cache_purge(struct cache_detail *detail) 478void cache_purge(struct cache_detail *detail)
478{ 479{
479 detail->flush_time = LONG_MAX; 480 detail->flush_time = LONG_MAX;
480 detail->nextcheck = get_seconds(); 481 detail->nextcheck = seconds_since_boot();
481 cache_flush(); 482 cache_flush();
482 detail->flush_time = 1; 483 detail->flush_time = 1;
483} 484}
@@ -506,81 +507,155 @@ EXPORT_SYMBOL_GPL(cache_purge);
506 507
507static DEFINE_SPINLOCK(cache_defer_lock); 508static DEFINE_SPINLOCK(cache_defer_lock);
508static LIST_HEAD(cache_defer_list); 509static LIST_HEAD(cache_defer_list);
509static struct list_head cache_defer_hash[DFR_HASHSIZE]; 510static struct hlist_head cache_defer_hash[DFR_HASHSIZE];
510static int cache_defer_cnt; 511static int cache_defer_cnt;
511 512
512static int cache_defer_req(struct cache_req *req, struct cache_head *item) 513static void __unhash_deferred_req(struct cache_deferred_req *dreq)
514{
515 hlist_del_init(&dreq->hash);
516 if (!list_empty(&dreq->recent)) {
517 list_del_init(&dreq->recent);
518 cache_defer_cnt--;
519 }
520}
521
522static void __hash_deferred_req(struct cache_deferred_req *dreq, struct cache_head *item)
513{ 523{
514 struct cache_deferred_req *dreq, *discard;
515 int hash = DFR_HASH(item); 524 int hash = DFR_HASH(item);
516 525
517 if (cache_defer_cnt >= DFR_MAX) { 526 INIT_LIST_HEAD(&dreq->recent);
518 /* too much in the cache, randomly drop this one, 527 hlist_add_head(&dreq->hash, &cache_defer_hash[hash]);
519 * or continue and drop the oldest below 528}
520 */ 529
521 if (net_random()&1) 530static void setup_deferral(struct cache_deferred_req *dreq,
522 return -ENOMEM; 531 struct cache_head *item,
523 } 532 int count_me)
524 dreq = req->defer(req); 533{
525 if (dreq == NULL)
526 return -ENOMEM;
527 534
528 dreq->item = item; 535 dreq->item = item;
529 536
530 spin_lock(&cache_defer_lock); 537 spin_lock(&cache_defer_lock);
531 538
532 list_add(&dreq->recent, &cache_defer_list); 539 __hash_deferred_req(dreq, item);
533 540
534 if (cache_defer_hash[hash].next == NULL) 541 if (count_me) {
535 INIT_LIST_HEAD(&cache_defer_hash[hash]); 542 cache_defer_cnt++;
536 list_add(&dreq->hash, &cache_defer_hash[hash]); 543 list_add(&dreq->recent, &cache_defer_list);
537
538 /* it is in, now maybe clean up */
539 discard = NULL;
540 if (++cache_defer_cnt > DFR_MAX) {
541 discard = list_entry(cache_defer_list.prev,
542 struct cache_deferred_req, recent);
543 list_del_init(&discard->recent);
544 list_del_init(&discard->hash);
545 cache_defer_cnt--;
546 } 544 }
545
547 spin_unlock(&cache_defer_lock); 546 spin_unlock(&cache_defer_lock);
548 547
548}
549
550struct thread_deferred_req {
551 struct cache_deferred_req handle;
552 struct completion completion;
553};
554
555static void cache_restart_thread(struct cache_deferred_req *dreq, int too_many)
556{
557 struct thread_deferred_req *dr =
558 container_of(dreq, struct thread_deferred_req, handle);
559 complete(&dr->completion);
560}
561
562static void cache_wait_req(struct cache_req *req, struct cache_head *item)
563{
564 struct thread_deferred_req sleeper;
565 struct cache_deferred_req *dreq = &sleeper.handle;
566
567 sleeper.completion = COMPLETION_INITIALIZER_ONSTACK(sleeper.completion);
568 dreq->revisit = cache_restart_thread;
569
570 setup_deferral(dreq, item, 0);
571
572 if (!test_bit(CACHE_PENDING, &item->flags) ||
573 wait_for_completion_interruptible_timeout(
574 &sleeper.completion, req->thread_wait) <= 0) {
575 /* The completion wasn't completed, so we need
576 * to clean up
577 */
578 spin_lock(&cache_defer_lock);
579 if (!hlist_unhashed(&sleeper.handle.hash)) {
580 __unhash_deferred_req(&sleeper.handle);
581 spin_unlock(&cache_defer_lock);
582 } else {
583 /* cache_revisit_request already removed
584 * this from the hash table, but hasn't
585 * called ->revisit yet. It will very soon
586 * and we need to wait for it.
587 */
588 spin_unlock(&cache_defer_lock);
589 wait_for_completion(&sleeper.completion);
590 }
591 }
592}
593
594static void cache_limit_defers(void)
595{
596 /* Make sure we haven't exceed the limit of allowed deferred
597 * requests.
598 */
599 struct cache_deferred_req *discard = NULL;
600
601 if (cache_defer_cnt <= DFR_MAX)
602 return;
603
604 spin_lock(&cache_defer_lock);
605
606 /* Consider removing either the first or the last */
607 if (cache_defer_cnt > DFR_MAX) {
608 if (net_random() & 1)
609 discard = list_entry(cache_defer_list.next,
610 struct cache_deferred_req, recent);
611 else
612 discard = list_entry(cache_defer_list.prev,
613 struct cache_deferred_req, recent);
614 __unhash_deferred_req(discard);
615 }
616 spin_unlock(&cache_defer_lock);
549 if (discard) 617 if (discard)
550 /* there was one too many */
551 discard->revisit(discard, 1); 618 discard->revisit(discard, 1);
619}
552 620
553 if (!test_bit(CACHE_PENDING, &item->flags)) { 621static void cache_defer_req(struct cache_req *req, struct cache_head *item)
554 /* must have just been validated... */ 622{
555 cache_revisit_request(item); 623 struct cache_deferred_req *dreq;
556 return -EAGAIN; 624
625 if (req->thread_wait) {
626 cache_wait_req(req, item);
627 if (!test_bit(CACHE_PENDING, &item->flags))
628 return;
557 } 629 }
558 return 0; 630 dreq = req->defer(req);
631 if (dreq == NULL)
632 return;
633 setup_deferral(dreq, item, 1);
634 if (!test_bit(CACHE_PENDING, &item->flags))
635 /* Bit could have been cleared before we managed to
636 * set up the deferral, so need to revisit just in case
637 */
638 cache_revisit_request(item);
639
640 cache_limit_defers();
559} 641}
560 642
561static void cache_revisit_request(struct cache_head *item) 643static void cache_revisit_request(struct cache_head *item)
562{ 644{
563 struct cache_deferred_req *dreq; 645 struct cache_deferred_req *dreq;
564 struct list_head pending; 646 struct list_head pending;
565 647 struct hlist_node *lp, *tmp;
566 struct list_head *lp;
567 int hash = DFR_HASH(item); 648 int hash = DFR_HASH(item);
568 649
569 INIT_LIST_HEAD(&pending); 650 INIT_LIST_HEAD(&pending);
570 spin_lock(&cache_defer_lock); 651 spin_lock(&cache_defer_lock);
571 652
572 lp = cache_defer_hash[hash].next; 653 hlist_for_each_entry_safe(dreq, lp, tmp, &cache_defer_hash[hash], hash)
573 if (lp) { 654 if (dreq->item == item) {
574 while (lp != &cache_defer_hash[hash]) { 655 __unhash_deferred_req(dreq);
575 dreq = list_entry(lp, struct cache_deferred_req, hash); 656 list_add(&dreq->recent, &pending);
576 lp = lp->next;
577 if (dreq->item == item) {
578 list_del_init(&dreq->hash);
579 list_move(&dreq->recent, &pending);
580 cache_defer_cnt--;
581 }
582 } 657 }
583 } 658
584 spin_unlock(&cache_defer_lock); 659 spin_unlock(&cache_defer_lock);
585 660
586 while (!list_empty(&pending)) { 661 while (!list_empty(&pending)) {
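cache_wait_req() in the hunk above lets a thread that is allowed to sleep wait for the upcall answer on an on-stack completion instead of deferring and dropping the request. The delicate part is the timeout path: if the waiter is no longer hashed, cache_revisit_request() has already claimed it and will call complete() shortly, so the waiter must still block in wait_for_completion() or the stack frame would vanish underneath the waker. A stripped-down sketch of that pairing; publish()/unpublish() are hypothetical stand-ins for the cache_defer_hash handling:

#include <linux/completion.h>

struct sleeper {
	struct completion done;
};

/* Hypothetical waker side, e.g. called from a revisit path. */
static void sleeper_wake(struct sleeper *s)
{
	complete(&s->done);
}

static void sleeper_wait(unsigned long timeout)
{
	struct sleeper s;

	s.done = COMPLETION_INITIALIZER_ONSTACK(s.done);
	publish(&s);				/* hypothetical: make &s findable */
	if (wait_for_completion_interruptible_timeout(&s.done, timeout) > 0)
		return;				/* waker fired complete() in time */
	if (unpublish(&s))			/* hypothetical: true if still listed */
		return;				/* nobody else can reach &s now */
	wait_for_completion(&s.done);		/* waker holds &s; wait it out */
}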
@@ -601,9 +676,8 @@ void cache_clean_deferred(void *owner)
601 676
602 list_for_each_entry_safe(dreq, tmp, &cache_defer_list, recent) { 677 list_for_each_entry_safe(dreq, tmp, &cache_defer_list, recent) {
603 if (dreq->owner == owner) { 678 if (dreq->owner == owner) {
604 list_del_init(&dreq->hash); 679 __unhash_deferred_req(dreq);
605 list_move(&dreq->recent, &pending); 680 list_add(&dreq->recent, &pending);
606 cache_defer_cnt--;
607 } 681 }
608 } 682 }
609 spin_unlock(&cache_defer_lock); 683 spin_unlock(&cache_defer_lock);
@@ -902,7 +976,7 @@ static int cache_release(struct inode *inode, struct file *filp,
902 filp->private_data = NULL; 976 filp->private_data = NULL;
903 kfree(rp); 977 kfree(rp);
904 978
905 cd->last_close = get_seconds(); 979 cd->last_close = seconds_since_boot();
906 atomic_dec(&cd->readers); 980 atomic_dec(&cd->readers);
907 } 981 }
908 module_put(cd->owner); 982 module_put(cd->owner);
@@ -1015,6 +1089,23 @@ static void warn_no_listener(struct cache_detail *detail)
1015 } 1089 }
1016} 1090}
1017 1091
1092static bool cache_listeners_exist(struct cache_detail *detail)
1093{
1094 if (atomic_read(&detail->readers))
1095 return true;
1096 if (detail->last_close == 0)
1097 /* This cache was never opened */
1098 return false;
1099 if (detail->last_close < seconds_since_boot() - 30)
1100 /*
1101 * We allow for the possibility that someone might
1102 * restart a userspace daemon without restarting the
1103 * server; but after 30 seconds, we give up.
1104 */
1105 return false;
1106 return true;
1107}
1108
1018/* 1109/*
1019 * register an upcall request to user-space and queue it up for read() by the 1110 * register an upcall request to user-space and queue it up for read() by the
1020 * upcall daemon. 1111 * upcall daemon.
@@ -1033,10 +1124,9 @@ int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h,
1033 char *bp; 1124 char *bp;
1034 int len; 1125 int len;
1035 1126
1036 if (atomic_read(&detail->readers) == 0 && 1127 if (!cache_listeners_exist(detail)) {
1037 detail->last_close < get_seconds() - 30) { 1128 warn_no_listener(detail);
1038 warn_no_listener(detail); 1129 return -EINVAL;
1039 return -EINVAL;
1040 } 1130 }
1041 1131
1042 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 1132 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
@@ -1095,13 +1185,19 @@ int qword_get(char **bpp, char *dest, int bufsize)
1095 if (bp[0] == '\\' && bp[1] == 'x') { 1185 if (bp[0] == '\\' && bp[1] == 'x') {
1096 /* HEX STRING */ 1186 /* HEX STRING */
1097 bp += 2; 1187 bp += 2;
1098 while (isxdigit(bp[0]) && isxdigit(bp[1]) && len < bufsize) { 1188 while (len < bufsize) {
1099 int byte = isdigit(*bp) ? *bp-'0' : toupper(*bp)-'A'+10; 1189 int h, l;
1100 bp++; 1190
1101 byte <<= 4; 1191 h = hex_to_bin(bp[0]);
1102 byte |= isdigit(*bp) ? *bp-'0' : toupper(*bp)-'A'+10; 1192 if (h < 0)
1103 *dest++ = byte; 1193 break;
1104 bp++; 1194
1195 l = hex_to_bin(bp[1]);
1196 if (l < 0)
1197 break;
1198
1199 *dest++ = (h << 4) | l;
1200 bp += 2;
1105 len++; 1201 len++;
1106 } 1202 }
1107 } else { 1203 } else {
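The qword_get() hunk above swaps the hand-rolled isdigit()/toupper() arithmetic for hex_to_bin(), which returns the nibble value or a negative result for a non-hex character, so the first bad character simply ends the loop. The same pairwise decode as a small sketch (the surrounding "\x" detection and dst sizing are assumed handled by the caller):

#include <linux/kernel.h>	/* hex_to_bin() lives here in this series */
#include <linux/types.h>

static int decode_hex_pairs(const char *src, u8 *dst, int dstlen)
{
	int n = 0;

	while (n < dstlen) {
		int hi, lo;

		hi = hex_to_bin(src[0]);
		if (hi < 0)
			break;			/* not hex: stop, as qword_get() does */
		lo = hex_to_bin(src[1]);
		if (lo < 0)
			break;
		dst[n++] = (hi << 4) | lo;
		src += 2;
	}
	return n;				/* bytes actually written */
}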
@@ -1219,7 +1315,8 @@ static int c_show(struct seq_file *m, void *p)
1219 1315
1220 ifdebug(CACHE) 1316 ifdebug(CACHE)
1221 seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n", 1317 seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n",
1222 cp->expiry_time, atomic_read(&cp->ref.refcount), cp->flags); 1318 convert_to_wallclock(cp->expiry_time),
1319 atomic_read(&cp->ref.refcount), cp->flags);
1223 cache_get(cp); 1320 cache_get(cp);
1224 if (cache_check(cd, cp, NULL)) 1321 if (cache_check(cd, cp, NULL))
1225 /* cache_check does a cache_put on failure */ 1322 /* cache_check does a cache_put on failure */
@@ -1285,7 +1382,7 @@ static ssize_t read_flush(struct file *file, char __user *buf,
1285 unsigned long p = *ppos; 1382 unsigned long p = *ppos;
1286 size_t len; 1383 size_t len;
1287 1384
1288 sprintf(tbuf, "%lu\n", cd->flush_time); 1385 sprintf(tbuf, "%lu\n", convert_to_wallclock(cd->flush_time));
1289 len = strlen(tbuf); 1386 len = strlen(tbuf);
1290 if (p >= len) 1387 if (p >= len)
1291 return 0; 1388 return 0;
@@ -1303,19 +1400,20 @@ static ssize_t write_flush(struct file *file, const char __user *buf,
1303 struct cache_detail *cd) 1400 struct cache_detail *cd)
1304{ 1401{
1305 char tbuf[20]; 1402 char tbuf[20];
1306 char *ep; 1403 char *bp, *ep;
1307 long flushtime; 1404
1308 if (*ppos || count > sizeof(tbuf)-1) 1405 if (*ppos || count > sizeof(tbuf)-1)
1309 return -EINVAL; 1406 return -EINVAL;
1310 if (copy_from_user(tbuf, buf, count)) 1407 if (copy_from_user(tbuf, buf, count))
1311 return -EFAULT; 1408 return -EFAULT;
1312 tbuf[count] = 0; 1409 tbuf[count] = 0;
1313 flushtime = simple_strtoul(tbuf, &ep, 0); 1410 simple_strtoul(tbuf, &ep, 0);
1314 if (*ep && *ep != '\n') 1411 if (*ep && *ep != '\n')
1315 return -EINVAL; 1412 return -EINVAL;
1316 1413
1317 cd->flush_time = flushtime; 1414 bp = tbuf;
1318 cd->nextcheck = get_seconds(); 1415 cd->flush_time = get_expiry(&bp);
1416 cd->nextcheck = seconds_since_boot();
1319 cache_flush(); 1417 cache_flush();
1320 1418
1321 *ppos += count; 1419 *ppos += count;
@@ -1348,15 +1446,10 @@ static unsigned int cache_poll_procfs(struct file *filp, poll_table *wait)
1348static long cache_ioctl_procfs(struct file *filp, 1446static long cache_ioctl_procfs(struct file *filp,
1349 unsigned int cmd, unsigned long arg) 1447 unsigned int cmd, unsigned long arg)
1350{ 1448{
1351 long ret;
1352 struct inode *inode = filp->f_path.dentry->d_inode; 1449 struct inode *inode = filp->f_path.dentry->d_inode;
1353 struct cache_detail *cd = PDE(inode)->data; 1450 struct cache_detail *cd = PDE(inode)->data;
1354 1451
1355 lock_kernel(); 1452 return cache_ioctl(inode, filp, cmd, arg, cd);
1356 ret = cache_ioctl(inode, filp, cmd, arg, cd);
1357 unlock_kernel();
1358
1359 return ret;
1360} 1453}
1361 1454
1362static int cache_open_procfs(struct inode *inode, struct file *filp) 1455static int cache_open_procfs(struct inode *inode, struct file *filp)
@@ -1441,10 +1534,13 @@ static const struct file_operations cache_flush_operations_procfs = {
1441 .read = read_flush_procfs, 1534 .read = read_flush_procfs,
1442 .write = write_flush_procfs, 1535 .write = write_flush_procfs,
1443 .release = release_flush_procfs, 1536 .release = release_flush_procfs,
1537 .llseek = no_llseek,
1444}; 1538};
1445 1539
1446static void remove_cache_proc_entries(struct cache_detail *cd) 1540static void remove_cache_proc_entries(struct cache_detail *cd, struct net *net)
1447{ 1541{
1542 struct sunrpc_net *sn;
1543
1448 if (cd->u.procfs.proc_ent == NULL) 1544 if (cd->u.procfs.proc_ent == NULL)
1449 return; 1545 return;
1450 if (cd->u.procfs.flush_ent) 1546 if (cd->u.procfs.flush_ent)
@@ -1454,15 +1550,18 @@ static void remove_cache_proc_entries(struct cache_detail *cd)
1454 if (cd->u.procfs.content_ent) 1550 if (cd->u.procfs.content_ent)
1455 remove_proc_entry("content", cd->u.procfs.proc_ent); 1551 remove_proc_entry("content", cd->u.procfs.proc_ent);
1456 cd->u.procfs.proc_ent = NULL; 1552 cd->u.procfs.proc_ent = NULL;
1457 remove_proc_entry(cd->name, proc_net_rpc); 1553 sn = net_generic(net, sunrpc_net_id);
1554 remove_proc_entry(cd->name, sn->proc_net_rpc);
1458} 1555}
1459 1556
1460#ifdef CONFIG_PROC_FS 1557#ifdef CONFIG_PROC_FS
1461static int create_cache_proc_entries(struct cache_detail *cd) 1558static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
1462{ 1559{
1463 struct proc_dir_entry *p; 1560 struct proc_dir_entry *p;
1561 struct sunrpc_net *sn;
1464 1562
1465 cd->u.procfs.proc_ent = proc_mkdir(cd->name, proc_net_rpc); 1563 sn = net_generic(net, sunrpc_net_id);
1564 cd->u.procfs.proc_ent = proc_mkdir(cd->name, sn->proc_net_rpc);
1466 if (cd->u.procfs.proc_ent == NULL) 1565 if (cd->u.procfs.proc_ent == NULL)
1467 goto out_nomem; 1566 goto out_nomem;
1468 cd->u.procfs.channel_ent = NULL; 1567 cd->u.procfs.channel_ent = NULL;
@@ -1493,11 +1592,11 @@ static int create_cache_proc_entries(struct cache_detail *cd)
1493 } 1592 }
1494 return 0; 1593 return 0;
1495out_nomem: 1594out_nomem:
1496 remove_cache_proc_entries(cd); 1595 remove_cache_proc_entries(cd, net);
1497 return -ENOMEM; 1596 return -ENOMEM;
1498} 1597}
1499#else /* CONFIG_PROC_FS */ 1598#else /* CONFIG_PROC_FS */
1500static int create_cache_proc_entries(struct cache_detail *cd) 1599static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
1501{ 1600{
1502 return 0; 1601 return 0;
1503} 1602}
@@ -1508,23 +1607,33 @@ void __init cache_initialize(void)
1508 INIT_DELAYED_WORK_DEFERRABLE(&cache_cleaner, do_cache_clean); 1607 INIT_DELAYED_WORK_DEFERRABLE(&cache_cleaner, do_cache_clean);
1509} 1608}
1510 1609
1511int cache_register(struct cache_detail *cd) 1610int cache_register_net(struct cache_detail *cd, struct net *net)
1512{ 1611{
1513 int ret; 1612 int ret;
1514 1613
1515 sunrpc_init_cache_detail(cd); 1614 sunrpc_init_cache_detail(cd);
1516 ret = create_cache_proc_entries(cd); 1615 ret = create_cache_proc_entries(cd, net);
1517 if (ret) 1616 if (ret)
1518 sunrpc_destroy_cache_detail(cd); 1617 sunrpc_destroy_cache_detail(cd);
1519 return ret; 1618 return ret;
1520} 1619}
1620
1621int cache_register(struct cache_detail *cd)
1622{
1623 return cache_register_net(cd, &init_net);
1624}
1521EXPORT_SYMBOL_GPL(cache_register); 1625EXPORT_SYMBOL_GPL(cache_register);
1522 1626
1523void cache_unregister(struct cache_detail *cd) 1627void cache_unregister_net(struct cache_detail *cd, struct net *net)
1524{ 1628{
1525 remove_cache_proc_entries(cd); 1629 remove_cache_proc_entries(cd, net);
1526 sunrpc_destroy_cache_detail(cd); 1630 sunrpc_destroy_cache_detail(cd);
1527} 1631}
1632
1633void cache_unregister(struct cache_detail *cd)
1634{
1635 cache_unregister_net(cd, &init_net);
1636}
1528EXPORT_SYMBOL_GPL(cache_unregister); 1637EXPORT_SYMBOL_GPL(cache_unregister);
1529 1638
1530static ssize_t cache_read_pipefs(struct file *filp, char __user *buf, 1639static ssize_t cache_read_pipefs(struct file *filp, char __user *buf,
@@ -1555,13 +1664,8 @@ static long cache_ioctl_pipefs(struct file *filp,
1555{ 1664{
1556 struct inode *inode = filp->f_dentry->d_inode; 1665 struct inode *inode = filp->f_dentry->d_inode;
1557 struct cache_detail *cd = RPC_I(inode)->private; 1666 struct cache_detail *cd = RPC_I(inode)->private;
1558 long ret;
1559
1560 lock_kernel();
1561 ret = cache_ioctl(inode, filp, cmd, arg, cd);
1562 unlock_kernel();
1563 1667
1564 return ret; 1668 return cache_ioctl(inode, filp, cmd, arg, cd);
1565} 1669}
1566 1670
1567static int cache_open_pipefs(struct inode *inode, struct file *filp) 1671static int cache_open_pipefs(struct inode *inode, struct file *filp)
@@ -1646,6 +1750,7 @@ const struct file_operations cache_flush_operations_pipefs = {
1646 .read = read_flush_pipefs, 1750 .read = read_flush_pipefs,
1647 .write = write_flush_pipefs, 1751 .write = write_flush_pipefs,
1648 .release = release_flush_pipefs, 1752 .release = release_flush_pipefs,
1753 .llseek = no_llseek,
1649}; 1754};
1650 1755
1651int sunrpc_cache_register_pipefs(struct dentry *parent, 1756int sunrpc_cache_register_pipefs(struct dentry *parent,
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 2388d83b68ff..9dab9573be41 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -226,7 +226,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
226 goto out_no_principal; 226 goto out_no_principal;
227 } 227 }
228 228
229 kref_init(&clnt->cl_kref); 229 atomic_set(&clnt->cl_count, 1);
230 230
231 err = rpc_setup_pipedir(clnt, program->pipe_dir_name); 231 err = rpc_setup_pipedir(clnt, program->pipe_dir_name);
232 if (err < 0) 232 if (err < 0)
@@ -284,6 +284,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
284 struct rpc_xprt *xprt; 284 struct rpc_xprt *xprt;
285 struct rpc_clnt *clnt; 285 struct rpc_clnt *clnt;
286 struct xprt_create xprtargs = { 286 struct xprt_create xprtargs = {
287 .net = args->net,
287 .ident = args->protocol, 288 .ident = args->protocol,
288 .srcaddr = args->saddress, 289 .srcaddr = args->saddress,
289 .dstaddr = args->address, 290 .dstaddr = args->address,
@@ -390,14 +391,14 @@ rpc_clone_client(struct rpc_clnt *clnt)
390 if (new->cl_principal == NULL) 391 if (new->cl_principal == NULL)
391 goto out_no_principal; 392 goto out_no_principal;
392 } 393 }
393 kref_init(&new->cl_kref); 394 atomic_set(&new->cl_count, 1);
394 err = rpc_setup_pipedir(new, clnt->cl_program->pipe_dir_name); 395 err = rpc_setup_pipedir(new, clnt->cl_program->pipe_dir_name);
395 if (err != 0) 396 if (err != 0)
396 goto out_no_path; 397 goto out_no_path;
397 if (new->cl_auth) 398 if (new->cl_auth)
398 atomic_inc(&new->cl_auth->au_count); 399 atomic_inc(&new->cl_auth->au_count);
399 xprt_get(clnt->cl_xprt); 400 xprt_get(clnt->cl_xprt);
400 kref_get(&clnt->cl_kref); 401 atomic_inc(&clnt->cl_count);
401 rpc_register_client(new); 402 rpc_register_client(new);
402 rpciod_up(); 403 rpciod_up();
403 return new; 404 return new;
@@ -465,10 +466,8 @@ EXPORT_SYMBOL_GPL(rpc_shutdown_client);
465 * Free an RPC client 466 * Free an RPC client
466 */ 467 */
467static void 468static void
468rpc_free_client(struct kref *kref) 469rpc_free_client(struct rpc_clnt *clnt)
469{ 470{
470 struct rpc_clnt *clnt = container_of(kref, struct rpc_clnt, cl_kref);
471
472 dprintk("RPC: destroying %s client for %s\n", 471 dprintk("RPC: destroying %s client for %s\n",
473 clnt->cl_protname, clnt->cl_server); 472 clnt->cl_protname, clnt->cl_server);
474 if (!IS_ERR(clnt->cl_path.dentry)) { 473 if (!IS_ERR(clnt->cl_path.dentry)) {
@@ -495,12 +494,10 @@ out_free:
495 * Free an RPC client 494 * Free an RPC client
496 */ 495 */
497static void 496static void
498rpc_free_auth(struct kref *kref) 497rpc_free_auth(struct rpc_clnt *clnt)
499{ 498{
500 struct rpc_clnt *clnt = container_of(kref, struct rpc_clnt, cl_kref);
501
502 if (clnt->cl_auth == NULL) { 499 if (clnt->cl_auth == NULL) {
503 rpc_free_client(kref); 500 rpc_free_client(clnt);
504 return; 501 return;
505 } 502 }
506 503
@@ -509,10 +506,11 @@ rpc_free_auth(struct kref *kref)
509 * release remaining GSS contexts. This mechanism ensures 506 * release remaining GSS contexts. This mechanism ensures
510 * that it can do so safely. 507 * that it can do so safely.
511 */ 508 */
512 kref_init(kref); 509 atomic_inc(&clnt->cl_count);
513 rpcauth_release(clnt->cl_auth); 510 rpcauth_release(clnt->cl_auth);
514 clnt->cl_auth = NULL; 511 clnt->cl_auth = NULL;
515 kref_put(kref, rpc_free_client); 512 if (atomic_dec_and_test(&clnt->cl_count))
513 rpc_free_client(clnt);
516} 514}
517 515
518/* 516/*
@@ -525,7 +523,8 @@ rpc_release_client(struct rpc_clnt *clnt)
525 523
526 if (list_empty(&clnt->cl_tasks)) 524 if (list_empty(&clnt->cl_tasks))
527 wake_up(&destroy_wait); 525 wake_up(&destroy_wait);
528 kref_put(&clnt->cl_kref, rpc_free_auth); 526 if (atomic_dec_and_test(&clnt->cl_count))
527 rpc_free_auth(clnt);
529} 528}
530 529
531/** 530/**
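Dropping cl_kref in favour of a plain atomic cl_count, as the clnt.c hunks above do, lets rpc_release_client() decide at the call site whether the last reference goes through rpc_free_auth() or rpc_free_client(), instead of routing that choice through kref release callbacks and container_of(). The open-coded pattern, sketched with illustrative names (struct obj stands in for rpc_clnt):

#include <asm/atomic.h>
#include <linux/slab.h>

struct obj {
	atomic_t count;
	/* ... payload ... */
};

static void obj_init(struct obj *o) { atomic_set(&o->count, 1); }
static void obj_get(struct obj *o)  { atomic_inc(&o->count); }

static void obj_put(struct obj *o)
{
	if (atomic_dec_and_test(&o->count))	/* true only for the final put */
		kfree(o);			/* stands in for rpc_free_client() */
}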
@@ -588,7 +587,7 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
588 if (clnt != NULL) { 587 if (clnt != NULL) {
589 rpc_task_release_client(task); 588 rpc_task_release_client(task);
590 task->tk_client = clnt; 589 task->tk_client = clnt;
591 kref_get(&clnt->cl_kref); 590 atomic_inc(&clnt->cl_count);
592 if (clnt->cl_softrtry) 591 if (clnt->cl_softrtry)
593 task->tk_flags |= RPC_TASK_SOFT; 592 task->tk_flags |= RPC_TASK_SOFT;
594 /* Add to the client's list of all tasks */ 593 /* Add to the client's list of all tasks */
@@ -931,7 +930,7 @@ call_reserveresult(struct rpc_task *task)
931 task->tk_status = 0; 930 task->tk_status = 0;
932 if (status >= 0) { 931 if (status >= 0) {
933 if (task->tk_rqstp) { 932 if (task->tk_rqstp) {
934 task->tk_action = call_allocate; 933 task->tk_action = call_refresh;
935 return; 934 return;
936 } 935 }
937 936
@@ -966,13 +965,54 @@ call_reserveresult(struct rpc_task *task)
966} 965}
967 966
968/* 967/*
969 * 2. Allocate the buffer. For details, see sched.c:rpc_malloc. 968 * 2. Bind and/or refresh the credentials
969 */
970static void
971call_refresh(struct rpc_task *task)
972{
973 dprint_status(task);
974
975 task->tk_action = call_refreshresult;
976 task->tk_status = 0;
977 task->tk_client->cl_stats->rpcauthrefresh++;
978 rpcauth_refreshcred(task);
979}
980
981/*
982 * 2a. Process the results of a credential refresh
983 */
984static void
985call_refreshresult(struct rpc_task *task)
986{
987 int status = task->tk_status;
988
989 dprint_status(task);
990
991 task->tk_status = 0;
992 task->tk_action = call_allocate;
993 if (status >= 0 && rpcauth_uptodatecred(task))
994 return;
995 switch (status) {
996 case -EACCES:
997 rpc_exit(task, -EACCES);
998 return;
999 case -ENOMEM:
1000 rpc_exit(task, -ENOMEM);
1001 return;
1002 case -ETIMEDOUT:
1003 rpc_delay(task, 3*HZ);
1004 }
1005 task->tk_action = call_refresh;
1006}
1007
1008/*
1009 * 2b. Allocate the buffer. For details, see sched.c:rpc_malloc.
970 * (Note: buffer memory is freed in xprt_release). 1010 * (Note: buffer memory is freed in xprt_release).
971 */ 1011 */
972static void 1012static void
973call_allocate(struct rpc_task *task) 1013call_allocate(struct rpc_task *task)
974{ 1014{
975 unsigned int slack = task->tk_client->cl_auth->au_cslack; 1015 unsigned int slack = task->tk_rqstp->rq_cred->cr_auth->au_cslack;
976 struct rpc_rqst *req = task->tk_rqstp; 1016 struct rpc_rqst *req = task->tk_rqstp;
977 struct rpc_xprt *xprt = task->tk_xprt; 1017 struct rpc_xprt *xprt = task->tk_xprt;
978 struct rpc_procinfo *proc = task->tk_msg.rpc_proc; 1018 struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
@@ -980,7 +1020,7 @@ call_allocate(struct rpc_task *task)
980 dprint_status(task); 1020 dprint_status(task);
981 1021
982 task->tk_status = 0; 1022 task->tk_status = 0;
983 task->tk_action = call_refresh; 1023 task->tk_action = call_bind;
984 1024
985 if (req->rq_buffer) 1025 if (req->rq_buffer)
986 return; 1026 return;
@@ -1017,47 +1057,6 @@ call_allocate(struct rpc_task *task)
1017 rpc_exit(task, -ERESTARTSYS); 1057 rpc_exit(task, -ERESTARTSYS);
1018} 1058}
1019 1059
1020/*
1021 * 2a. Bind and/or refresh the credentials
1022 */
1023static void
1024call_refresh(struct rpc_task *task)
1025{
1026 dprint_status(task);
1027
1028 task->tk_action = call_refreshresult;
1029 task->tk_status = 0;
1030 task->tk_client->cl_stats->rpcauthrefresh++;
1031 rpcauth_refreshcred(task);
1032}
1033
1034/*
1035 * 2b. Process the results of a credential refresh
1036 */
1037static void
1038call_refreshresult(struct rpc_task *task)
1039{
1040 int status = task->tk_status;
1041
1042 dprint_status(task);
1043
1044 task->tk_status = 0;
1045 task->tk_action = call_bind;
1046 if (status >= 0 && rpcauth_uptodatecred(task))
1047 return;
1048 switch (status) {
1049 case -EACCES:
1050 rpc_exit(task, -EACCES);
1051 return;
1052 case -ENOMEM:
1053 rpc_exit(task, -ENOMEM);
1054 return;
1055 case -ETIMEDOUT:
1056 rpc_delay(task, 3*HZ);
1057 }
1058 task->tk_action = call_refresh;
1059}
1060
1061static inline int 1060static inline int
1062rpc_task_need_encode(struct rpc_task *task) 1061rpc_task_need_encode(struct rpc_task *task)
1063{ 1062{
@@ -1677,7 +1676,7 @@ rpc_verify_header(struct rpc_task *task)
1677 rpcauth_invalcred(task); 1676 rpcauth_invalcred(task);
1678 /* Ensure we obtain a new XID! */ 1677 /* Ensure we obtain a new XID! */
1679 xprt_release(task); 1678 xprt_release(task);
1680 task->tk_action = call_refresh; 1679 task->tk_action = call_reserve;
1681 goto out_retry; 1680 goto out_retry;
1682 case RPC_AUTH_BADCRED: 1681 case RPC_AUTH_BADCRED:
1683 case RPC_AUTH_BADVERF: 1682 case RPC_AUTH_BADVERF:
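[Editor's note] Taken together, the clnt.c hunks reorder the RPC call state machine: call_reserveresult() now hands off to call_refresh(), call_refreshresult() advances to call_allocate() on success (retrying or exiting on error), and call_allocate() then proceeds to call_bind(); buffer sizing reads the slack from the request's bound credential (rq_cred->cr_auth) rather than the client-wide auth. A rough userspace sketch of the tk_action dispatch style with simplified step functions only; it is not the kernel implementation and omits all status handling.

#include <stdio.h>

struct task {
        void (*action)(struct task *);
        int done;
};

static void call_bind(struct task *t)     { printf("bind\n");     t->done = 1; }
static void call_allocate(struct task *t) { printf("allocate\n"); t->action = call_bind; }
static void call_refreshresult(struct task *t)
{
        printf("refreshresult\n");
        t->action = call_allocate;      /* on success; an error would retry call_refresh */
}
static void call_refresh(struct task *t)  { printf("refresh\n");  t->action = call_refreshresult; }
static void call_reserveresult(struct task *t)
{
        printf("reserveresult\n");
        t->action = call_refresh;       /* was call_allocate before this change */
}

int main(void)
{
        struct task t = { .action = call_reserveresult, .done = 0 };

        while (!t.done)
                t.action(&t);           /* the scheduler repeatedly runs tk_action */
        return 0;
}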
diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h
new file mode 100644
index 000000000000..d013bf211cae
--- /dev/null
+++ b/net/sunrpc/netns.h
@@ -0,0 +1,19 @@
1#ifndef __SUNRPC_NETNS_H__
2#define __SUNRPC_NETNS_H__
3
4#include <net/net_namespace.h>
5#include <net/netns/generic.h>
6
7struct cache_detail;
8
9struct sunrpc_net {
10 struct proc_dir_entry *proc_net_rpc;
11 struct cache_detail *ip_map_cache;
12};
13
14extern int sunrpc_net_id;
15
16int ip_map_cache_create(struct net *);
17void ip_map_cache_destroy(struct net *);
18
19#endif
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 95ccbcf45d3e..10a17a37ec4e 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -27,9 +27,8 @@
27#include <linux/workqueue.h> 27#include <linux/workqueue.h>
28#include <linux/sunrpc/rpc_pipe_fs.h> 28#include <linux/sunrpc/rpc_pipe_fs.h>
29#include <linux/sunrpc/cache.h> 29#include <linux/sunrpc/cache.h>
30#include <linux/smp_lock.h>
31 30
32static struct vfsmount *rpc_mount __read_mostly; 31static struct vfsmount *rpc_mnt __read_mostly;
33static int rpc_mount_count; 32static int rpc_mount_count;
34 33
35static struct file_system_type rpc_pipe_fs_type; 34static struct file_system_type rpc_pipe_fs_type;
@@ -48,7 +47,7 @@ static void rpc_purge_list(struct rpc_inode *rpci, struct list_head *head,
48 return; 47 return;
49 do { 48 do {
50 msg = list_entry(head->next, struct rpc_pipe_msg, list); 49 msg = list_entry(head->next, struct rpc_pipe_msg, list);
51 list_del(&msg->list); 50 list_del_init(&msg->list);
52 msg->errno = err; 51 msg->errno = err;
53 destroy_msg(msg); 52 destroy_msg(msg);
54 } while (!list_empty(head)); 53 } while (!list_empty(head));
@@ -204,11 +203,11 @@ rpc_pipe_release(struct inode *inode, struct file *filp)
204 mutex_lock(&inode->i_mutex); 203 mutex_lock(&inode->i_mutex);
205 if (rpci->ops == NULL) 204 if (rpci->ops == NULL)
206 goto out; 205 goto out;
207 msg = (struct rpc_pipe_msg *)filp->private_data; 206 msg = filp->private_data;
208 if (msg != NULL) { 207 if (msg != NULL) {
209 spin_lock(&inode->i_lock); 208 spin_lock(&inode->i_lock);
210 msg->errno = -EAGAIN; 209 msg->errno = -EAGAIN;
211 list_del(&msg->list); 210 list_del_init(&msg->list);
212 spin_unlock(&inode->i_lock); 211 spin_unlock(&inode->i_lock);
213 rpci->ops->destroy_msg(msg); 212 rpci->ops->destroy_msg(msg);
214 } 213 }
@@ -268,7 +267,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
268 if (res < 0 || msg->len == msg->copied) { 267 if (res < 0 || msg->len == msg->copied) {
269 filp->private_data = NULL; 268 filp->private_data = NULL;
270 spin_lock(&inode->i_lock); 269 spin_lock(&inode->i_lock);
271 list_del(&msg->list); 270 list_del_init(&msg->list);
272 spin_unlock(&inode->i_lock); 271 spin_unlock(&inode->i_lock);
273 rpci->ops->destroy_msg(msg); 272 rpci->ops->destroy_msg(msg);
274 } 273 }
@@ -309,40 +308,33 @@ rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait)
309 return mask; 308 return mask;
310} 309}
311 310
312static int 311static long
313rpc_pipe_ioctl_unlocked(struct file *filp, unsigned int cmd, unsigned long arg) 312rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
314{ 313{
315 struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode); 314 struct inode *inode = filp->f_path.dentry->d_inode;
315 struct rpc_inode *rpci = RPC_I(inode);
316 int len; 316 int len;
317 317
318 switch (cmd) { 318 switch (cmd) {
319 case FIONREAD: 319 case FIONREAD:
320 if (rpci->ops == NULL) 320 spin_lock(&inode->i_lock);
321 if (rpci->ops == NULL) {
322 spin_unlock(&inode->i_lock);
321 return -EPIPE; 323 return -EPIPE;
324 }
322 len = rpci->pipelen; 325 len = rpci->pipelen;
323 if (filp->private_data) { 326 if (filp->private_data) {
324 struct rpc_pipe_msg *msg; 327 struct rpc_pipe_msg *msg;
325 msg = (struct rpc_pipe_msg *)filp->private_data; 328 msg = filp->private_data;
326 len += msg->len - msg->copied; 329 len += msg->len - msg->copied;
327 } 330 }
331 spin_unlock(&inode->i_lock);
328 return put_user(len, (int __user *)arg); 332 return put_user(len, (int __user *)arg);
329 default: 333 default:
330 return -EINVAL; 334 return -EINVAL;
331 } 335 }
332} 336}
333 337
334static long
335rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
336{
337 long ret;
338
339 lock_kernel();
340 ret = rpc_pipe_ioctl_unlocked(filp, cmd, arg);
341 unlock_kernel();
342
343 return ret;
344}
345
346static const struct file_operations rpc_pipe_fops = { 338static const struct file_operations rpc_pipe_fops = {
347 .owner = THIS_MODULE, 339 .owner = THIS_MODULE,
348 .llseek = no_llseek, 340 .llseek = no_llseek,
@@ -371,21 +363,23 @@ rpc_show_info(struct seq_file *m, void *v)
371static int 363static int
372rpc_info_open(struct inode *inode, struct file *file) 364rpc_info_open(struct inode *inode, struct file *file)
373{ 365{
374 struct rpc_clnt *clnt; 366 struct rpc_clnt *clnt = NULL;
375 int ret = single_open(file, rpc_show_info, NULL); 367 int ret = single_open(file, rpc_show_info, NULL);
376 368
377 if (!ret) { 369 if (!ret) {
378 struct seq_file *m = file->private_data; 370 struct seq_file *m = file->private_data;
379 mutex_lock(&inode->i_mutex); 371
380 clnt = RPC_I(inode)->private; 372 spin_lock(&file->f_path.dentry->d_lock);
381 if (clnt) { 373 if (!d_unhashed(file->f_path.dentry))
382 kref_get(&clnt->cl_kref); 374 clnt = RPC_I(inode)->private;
375 if (clnt != NULL && atomic_inc_not_zero(&clnt->cl_count)) {
376 spin_unlock(&file->f_path.dentry->d_lock);
383 m->private = clnt; 377 m->private = clnt;
384 } else { 378 } else {
379 spin_unlock(&file->f_path.dentry->d_lock);
385 single_release(inode, file); 380 single_release(inode, file);
386 ret = -EINVAL; 381 ret = -EINVAL;
387 } 382 }
388 mutex_unlock(&inode->i_mutex);
389 } 383 }
390 return ret; 384 return ret;
391} 385}
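[Editor's note] rpc_info_open() above stops taking the inode mutex and a plain kref_get(); instead it looks the client up under the dentry lock, skips unhashed dentries, and takes the reference with atomic_inc_not_zero(), so a client whose count has already dropped to zero is never revived. A small userspace sketch of the "take a reference only if the object is still live" idiom; locking and the surrounding filesystem machinery are omitted.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
        atomic_int count;
};

/* like atomic_inc_not_zero(): increment unless the count is already zero */
static bool obj_get_not_zero(struct obj *o)
{
        int old = atomic_load(&o->count);

        while (old != 0) {
                if (atomic_compare_exchange_weak(&o->count, &old, old + 1))
                        return true;    /* reference taken */
                /* old was reloaded by the failed CAS; loop and retry */
        }
        return false;                   /* object is already being torn down */
}

int main(void)
{
        struct obj o;

        atomic_init(&o.count, 1);
        if (obj_get_not_zero(&o))
                printf("got reference, count=%d\n", atomic_load(&o.count));

        atomic_store(&o.count, 0);      /* simulate the last put */
        if (!obj_get_not_zero(&o))
                printf("object dying, lookup refused\n");
        return 0;
}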
@@ -423,16 +417,16 @@ struct vfsmount *rpc_get_mount(void)
423{ 417{
424 int err; 418 int err;
425 419
426 err = simple_pin_fs(&rpc_pipe_fs_type, &rpc_mount, &rpc_mount_count); 420 err = simple_pin_fs(&rpc_pipe_fs_type, &rpc_mnt, &rpc_mount_count);
427 if (err != 0) 421 if (err != 0)
428 return ERR_PTR(err); 422 return ERR_PTR(err);
429 return rpc_mount; 423 return rpc_mnt;
430} 424}
431EXPORT_SYMBOL_GPL(rpc_get_mount); 425EXPORT_SYMBOL_GPL(rpc_get_mount);
432 426
433void rpc_put_mount(void) 427void rpc_put_mount(void)
434{ 428{
435 simple_release_fs(&rpc_mount, &rpc_mount_count); 429 simple_release_fs(&rpc_mnt, &rpc_mount_count);
436} 430}
437EXPORT_SYMBOL_GPL(rpc_put_mount); 431EXPORT_SYMBOL_GPL(rpc_put_mount);
438 432
@@ -451,6 +445,7 @@ rpc_get_inode(struct super_block *sb, umode_t mode)
451 struct inode *inode = new_inode(sb); 445 struct inode *inode = new_inode(sb);
452 if (!inode) 446 if (!inode)
453 return NULL; 447 return NULL;
448 inode->i_ino = get_next_ino();
454 inode->i_mode = mode; 449 inode->i_mode = mode;
455 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 450 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
456 switch(mode & S_IFMT) { 451 switch(mode & S_IFMT) {
@@ -1023,17 +1018,17 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
1023 return 0; 1018 return 0;
1024} 1019}
1025 1020
1026static int 1021static struct dentry *
1027rpc_get_sb(struct file_system_type *fs_type, 1022rpc_mount(struct file_system_type *fs_type,
1028 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1023 int flags, const char *dev_name, void *data)
1029{ 1024{
1030 return get_sb_single(fs_type, flags, data, rpc_fill_super, mnt); 1025 return mount_single(fs_type, flags, data, rpc_fill_super);
1031} 1026}
1032 1027
1033static struct file_system_type rpc_pipe_fs_type = { 1028static struct file_system_type rpc_pipe_fs_type = {
1034 .owner = THIS_MODULE, 1029 .owner = THIS_MODULE,
1035 .name = "rpc_pipefs", 1030 .name = "rpc_pipefs",
1036 .get_sb = rpc_get_sb, 1031 .mount = rpc_mount,
1037 .kill_sb = kill_litter_super, 1032 .kill_sb = kill_litter_super,
1038}; 1033};
1039 1034
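[Editor's note] The final rpc_pipe.c hunk is the mechanical conversion from the old .get_sb interface to the new .mount one that lands in this merge window: the callback now returns a struct dentry * from mount_single() instead of filling in a caller-supplied vfsmount. A skeleton of the same conversion for a hypothetical filesystem; it only compiles against kernel headers of this era and all names are placeholders.

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
        /* set up the root inode and dentry here, as rpc_fill_super() does */
        return 0;
}

static struct dentry *example_mount(struct file_system_type *fs_type,
                                    int flags, const char *dev_name, void *data)
{
        return mount_single(fs_type, flags, data, example_fill_super);
}

static struct file_system_type example_fs_type = {
        .owner   = THIS_MODULE,
        .name    = "examplefs",
        .mount   = example_mount,
        .kill_sb = kill_litter_super,
};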
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index dac219a56ae1..fa6d7ca2c851 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -177,6 +177,7 @@ static DEFINE_MUTEX(rpcb_create_local_mutex);
177static int rpcb_create_local(void) 177static int rpcb_create_local(void)
178{ 178{
179 struct rpc_create_args args = { 179 struct rpc_create_args args = {
180 .net = &init_net,
180 .protocol = XPRT_TRANSPORT_TCP, 181 .protocol = XPRT_TRANSPORT_TCP,
181 .address = (struct sockaddr *)&rpcb_inaddr_loopback, 182 .address = (struct sockaddr *)&rpcb_inaddr_loopback,
182 .addrsize = sizeof(rpcb_inaddr_loopback), 183 .addrsize = sizeof(rpcb_inaddr_loopback),
@@ -211,8 +212,9 @@ static int rpcb_create_local(void)
211 */ 212 */
212 clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4); 213 clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4);
213 if (IS_ERR(clnt4)) { 214 if (IS_ERR(clnt4)) {
214 dprintk("RPC: failed to create local rpcbind v4 " 215 dprintk("RPC: failed to bind second program to "
215 "cleint (errno %ld).\n", PTR_ERR(clnt4)); 216 "rpcbind v4 client (errno %ld).\n",
217 PTR_ERR(clnt4));
216 clnt4 = NULL; 218 clnt4 = NULL;
217 } 219 }
218 220
@@ -228,6 +230,7 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
228 size_t salen, int proto, u32 version) 230 size_t salen, int proto, u32 version)
229{ 231{
230 struct rpc_create_args args = { 232 struct rpc_create_args args = {
233 .net = &init_net,
231 .protocol = proto, 234 .protocol = proto,
232 .address = srvaddr, 235 .address = srvaddr,
233 .addrsize = salen, 236 .addrsize = salen,
@@ -247,7 +250,7 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
247 ((struct sockaddr_in6 *)srvaddr)->sin6_port = htons(RPCBIND_PORT); 250 ((struct sockaddr_in6 *)srvaddr)->sin6_port = htons(RPCBIND_PORT);
248 break; 251 break;
249 default: 252 default:
250 return NULL; 253 return ERR_PTR(-EAFNOSUPPORT);
251 } 254 }
252 255
253 return rpc_create(&args); 256 return rpc_create(&args);
@@ -475,57 +478,6 @@ int rpcb_v4_register(const u32 program, const u32 version,
475 return -EAFNOSUPPORT; 478 return -EAFNOSUPPORT;
476} 479}
477 480
478/**
479 * rpcb_getport_sync - obtain the port for an RPC service on a given host
480 * @sin: address of remote peer
481 * @prog: RPC program number to bind
482 * @vers: RPC version number to bind
483 * @prot: transport protocol to use to make this request
484 *
485 * Return value is the requested advertised port number,
486 * or a negative errno value.
487 *
488 * Called from outside the RPC client in a synchronous task context.
489 * Uses default timeout parameters specified by underlying transport.
490 *
491 * XXX: Needs to support IPv6
492 */
493int rpcb_getport_sync(struct sockaddr_in *sin, u32 prog, u32 vers, int prot)
494{
495 struct rpcbind_args map = {
496 .r_prog = prog,
497 .r_vers = vers,
498 .r_prot = prot,
499 .r_port = 0,
500 };
501 struct rpc_message msg = {
502 .rpc_proc = &rpcb_procedures2[RPCBPROC_GETPORT],
503 .rpc_argp = &map,
504 .rpc_resp = &map,
505 };
506 struct rpc_clnt *rpcb_clnt;
507 int status;
508
509 dprintk("RPC: %s(%pI4, %u, %u, %d)\n",
510 __func__, &sin->sin_addr.s_addr, prog, vers, prot);
511
512 rpcb_clnt = rpcb_create(NULL, (struct sockaddr *)sin,
513 sizeof(*sin), prot, RPCBVERS_2);
514 if (IS_ERR(rpcb_clnt))
515 return PTR_ERR(rpcb_clnt);
516
517 status = rpc_call_sync(rpcb_clnt, &msg, 0);
518 rpc_shutdown_client(rpcb_clnt);
519
520 if (status >= 0) {
521 if (map.r_port != 0)
522 return map.r_port;
523 status = -EACCES;
524 }
525 return status;
526}
527EXPORT_SYMBOL_GPL(rpcb_getport_sync);
528
529static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbind_args *map, struct rpc_procinfo *proc) 481static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbind_args *map, struct rpc_procinfo *proc)
530{ 482{
531 struct rpc_message msg = { 483 struct rpc_message msg = {
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index cace6049e4a5..243fc09b164e 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -376,7 +376,7 @@ int rpc_queue_empty(struct rpc_wait_queue *queue)
376 spin_lock_bh(&queue->lock); 376 spin_lock_bh(&queue->lock);
377 res = queue->qlen; 377 res = queue->qlen;
378 spin_unlock_bh(&queue->lock); 378 spin_unlock_bh(&queue->lock);
379 return (res == 0); 379 return res == 0;
380} 380}
381EXPORT_SYMBOL_GPL(rpc_queue_empty); 381EXPORT_SYMBOL_GPL(rpc_queue_empty);
382 382
@@ -908,7 +908,7 @@ static int rpciod_start(void)
908 * Create the rpciod thread and wait for it to start. 908 * Create the rpciod thread and wait for it to start.
909 */ 909 */
910 dprintk("RPC: creating workqueue rpciod\n"); 910 dprintk("RPC: creating workqueue rpciod\n");
911 wq = create_workqueue("rpciod"); 911 wq = alloc_workqueue("rpciod", WQ_RESCUER, 0);
912 rpciod_workqueue = wq; 912 rpciod_workqueue = wq;
913 return rpciod_workqueue != NULL; 913 return rpciod_workqueue != NULL;
914} 914}
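[Editor's note] rpciod is now created with alloc_workqueue() and WQ_RESCUER (the flag later known as WQ_MEM_RECLAIM), which gives the queue a rescuer thread so it can make forward progress under memory pressure. A minimal module-style sketch of creating and tearing down such a queue; the work function and names are illustrative and this fragment is not runnable outside a kernel build.

static struct workqueue_struct *example_wq;

static void example_fn(struct work_struct *work)
{
        /* deferred work runs here */
}
static DECLARE_WORK(example_work, example_fn);

static int example_start(void)
{
        example_wq = alloc_workqueue("example", WQ_RESCUER, 0);
        if (!example_wq)
                return -ENOMEM;
        queue_work(example_wq, &example_work);
        return 0;
}

static void example_stop(void)
{
        destroy_workqueue(example_wq);  /* drains remaining work before freeing */
}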
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index ea1046f3f9a3..f71a73107ae9 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -22,11 +22,10 @@
22#include <linux/sunrpc/clnt.h> 22#include <linux/sunrpc/clnt.h>
23#include <linux/sunrpc/svcsock.h> 23#include <linux/sunrpc/svcsock.h>
24#include <linux/sunrpc/metrics.h> 24#include <linux/sunrpc/metrics.h>
25#include <net/net_namespace.h>
26 25
27#define RPCDBG_FACILITY RPCDBG_MISC 26#include "netns.h"
28 27
29struct proc_dir_entry *proc_net_rpc = NULL; 28#define RPCDBG_FACILITY RPCDBG_MISC
30 29
31/* 30/*
32 * Get RPC client stats 31 * Get RPC client stats
@@ -218,10 +217,11 @@ EXPORT_SYMBOL_GPL(rpc_print_iostats);
218static inline struct proc_dir_entry * 217static inline struct proc_dir_entry *
219do_register(const char *name, void *data, const struct file_operations *fops) 218do_register(const char *name, void *data, const struct file_operations *fops)
220{ 219{
221 rpc_proc_init(); 220 struct sunrpc_net *sn;
222 dprintk("RPC: registering /proc/net/rpc/%s\n", name);
223 221
224 return proc_create_data(name, 0, proc_net_rpc, fops, data); 222 dprintk("RPC: registering /proc/net/rpc/%s\n", name);
223 sn = net_generic(&init_net, sunrpc_net_id);
224 return proc_create_data(name, 0, sn->proc_net_rpc, fops, data);
225} 225}
226 226
227struct proc_dir_entry * 227struct proc_dir_entry *
@@ -234,7 +234,10 @@ EXPORT_SYMBOL_GPL(rpc_proc_register);
234void 234void
235rpc_proc_unregister(const char *name) 235rpc_proc_unregister(const char *name)
236{ 236{
237 remove_proc_entry(name, proc_net_rpc); 237 struct sunrpc_net *sn;
238
239 sn = net_generic(&init_net, sunrpc_net_id);
240 remove_proc_entry(name, sn->proc_net_rpc);
238} 241}
239EXPORT_SYMBOL_GPL(rpc_proc_unregister); 242EXPORT_SYMBOL_GPL(rpc_proc_unregister);
240 243
@@ -248,25 +251,29 @@ EXPORT_SYMBOL_GPL(svc_proc_register);
248void 251void
249svc_proc_unregister(const char *name) 252svc_proc_unregister(const char *name)
250{ 253{
251 remove_proc_entry(name, proc_net_rpc); 254 struct sunrpc_net *sn;
255
256 sn = net_generic(&init_net, sunrpc_net_id);
257 remove_proc_entry(name, sn->proc_net_rpc);
252} 258}
253EXPORT_SYMBOL_GPL(svc_proc_unregister); 259EXPORT_SYMBOL_GPL(svc_proc_unregister);
254 260
255void 261int rpc_proc_init(struct net *net)
256rpc_proc_init(void)
257{ 262{
263 struct sunrpc_net *sn;
264
258 dprintk("RPC: registering /proc/net/rpc\n"); 265 dprintk("RPC: registering /proc/net/rpc\n");
259 if (!proc_net_rpc) 266 sn = net_generic(net, sunrpc_net_id);
260 proc_net_rpc = proc_mkdir("rpc", init_net.proc_net); 267 sn->proc_net_rpc = proc_mkdir("rpc", net->proc_net);
268 if (sn->proc_net_rpc == NULL)
269 return -ENOMEM;
270
271 return 0;
261} 272}
262 273
263void 274void rpc_proc_exit(struct net *net)
264rpc_proc_exit(void)
265{ 275{
266 dprintk("RPC: unregistering /proc/net/rpc\n"); 276 dprintk("RPC: unregistering /proc/net/rpc\n");
267 if (proc_net_rpc) { 277 remove_proc_entry("rpc", net->proc_net);
268 proc_net_rpc = NULL;
269 remove_proc_entry("rpc", init_net.proc_net);
270 }
271} 278}
272 279
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index c0d085013a2b..9d0809160994 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -22,7 +22,44 @@
22#include <linux/sunrpc/rpc_pipe_fs.h> 22#include <linux/sunrpc/rpc_pipe_fs.h>
23#include <linux/sunrpc/xprtsock.h> 23#include <linux/sunrpc/xprtsock.h>
24 24
25extern struct cache_detail ip_map_cache, unix_gid_cache; 25#include "netns.h"
26
27int sunrpc_net_id;
28
29static __net_init int sunrpc_init_net(struct net *net)
30{
31 int err;
32
33 err = rpc_proc_init(net);
34 if (err)
35 goto err_proc;
36
37 err = ip_map_cache_create(net);
38 if (err)
39 goto err_ipmap;
40
41 return 0;
42
43err_ipmap:
44 rpc_proc_exit(net);
45err_proc:
46 return err;
47}
48
49static __net_exit void sunrpc_exit_net(struct net *net)
50{
51 ip_map_cache_destroy(net);
52 rpc_proc_exit(net);
53}
54
55static struct pernet_operations sunrpc_net_ops = {
56 .init = sunrpc_init_net,
57 .exit = sunrpc_exit_net,
58 .id = &sunrpc_net_id,
59 .size = sizeof(struct sunrpc_net),
60};
61
62extern struct cache_detail unix_gid_cache;
26 63
27extern void cleanup_rpcb_clnt(void); 64extern void cleanup_rpcb_clnt(void);
28 65
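[Editor's note] The block above is the standard pattern for per-network-namespace state: a module-wide id, a pernet_operations with .size so the core allocates (and zeroes) one struct sunrpc_net per namespace, and net_generic() to fetch it from any struct net. A hedged sketch of the same pattern for a hypothetical subsystem, following the registration shown here; the struct and callbacks are placeholders.

#include <net/net_namespace.h>
#include <net/netns/generic.h>

struct example_net {
        int counter;                    /* whatever per-namespace state is needed */
};

static int example_net_id;

static __net_init int example_init_net(struct net *net)
{
        struct example_net *en = net_generic(net, example_net_id);

        en->counter = 0;                /* storage was allocated by the pernet core */
        return 0;
}

static __net_exit void example_exit_net(struct net *net)
{
        /* release anything example_init_net() set up */
}

static struct pernet_operations example_net_ops = {
        .init = example_init_net,
        .exit = example_exit_net,
        .id   = &example_net_id,
        .size = sizeof(struct example_net),
};

/* in module init: register_pernet_subsys(&example_net_ops); */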
@@ -38,18 +75,22 @@ init_sunrpc(void)
38 err = rpcauth_init_module(); 75 err = rpcauth_init_module();
39 if (err) 76 if (err)
40 goto out3; 77 goto out3;
78
79 cache_initialize();
80
81 err = register_pernet_subsys(&sunrpc_net_ops);
82 if (err)
83 goto out4;
41#ifdef RPC_DEBUG 84#ifdef RPC_DEBUG
42 rpc_register_sysctl(); 85 rpc_register_sysctl();
43#endif 86#endif
44#ifdef CONFIG_PROC_FS
45 rpc_proc_init();
46#endif
47 cache_initialize();
48 cache_register(&ip_map_cache);
49 cache_register(&unix_gid_cache); 87 cache_register(&unix_gid_cache);
50 svc_init_xprt_sock(); /* svc sock transport */ 88 svc_init_xprt_sock(); /* svc sock transport */
51 init_socket_xprt(); /* clnt sock transport */ 89 init_socket_xprt(); /* clnt sock transport */
52 return 0; 90 return 0;
91
92out4:
93 rpcauth_remove_module();
53out3: 94out3:
54 rpc_destroy_mempool(); 95 rpc_destroy_mempool();
55out2: 96out2:
@@ -67,14 +108,11 @@ cleanup_sunrpc(void)
67 svc_cleanup_xprt_sock(); 108 svc_cleanup_xprt_sock();
68 unregister_rpc_pipefs(); 109 unregister_rpc_pipefs();
69 rpc_destroy_mempool(); 110 rpc_destroy_mempool();
70 cache_unregister(&ip_map_cache);
71 cache_unregister(&unix_gid_cache); 111 cache_unregister(&unix_gid_cache);
112 unregister_pernet_subsys(&sunrpc_net_ops);
72#ifdef RPC_DEBUG 113#ifdef RPC_DEBUG
73 rpc_unregister_sysctl(); 114 rpc_unregister_sysctl();
74#endif 115#endif
75#ifdef CONFIG_PROC_FS
76 rpc_proc_exit();
77#endif
78 rcu_barrier(); /* Wait for completion of call_rcu()'s */ 116 rcu_barrier(); /* Wait for completion of call_rcu()'s */
79} 117}
80MODULE_LICENSE("GPL"); 118MODULE_LICENSE("GPL");
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index d9017d64597e..6359c42c4941 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1055,6 +1055,9 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
1055 goto err_bad; 1055 goto err_bad;
1056 case SVC_DENIED: 1056 case SVC_DENIED:
1057 goto err_bad_auth; 1057 goto err_bad_auth;
1058 case SVC_CLOSE:
1059 if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
1060 svc_close_xprt(rqstp->rq_xprt);
1058 case SVC_DROP: 1061 case SVC_DROP:
1059 goto dropit; 1062 goto dropit;
1060 case SVC_COMPLETE: 1063 case SVC_COMPLETE:
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index cbc084939dd8..c82fe739fbdc 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -100,16 +100,14 @@ EXPORT_SYMBOL_GPL(svc_unreg_xprt_class);
100 */ 100 */
101int svc_print_xprts(char *buf, int maxlen) 101int svc_print_xprts(char *buf, int maxlen)
102{ 102{
103 struct list_head *le; 103 struct svc_xprt_class *xcl;
104 char tmpstr[80]; 104 char tmpstr[80];
105 int len = 0; 105 int len = 0;
106 buf[0] = '\0'; 106 buf[0] = '\0';
107 107
108 spin_lock(&svc_xprt_class_lock); 108 spin_lock(&svc_xprt_class_lock);
109 list_for_each(le, &svc_xprt_class_list) { 109 list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
110 int slen; 110 int slen;
111 struct svc_xprt_class *xcl =
112 list_entry(le, struct svc_xprt_class, xcl_list);
113 111
114 sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload); 112 sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload);
115 slen = strlen(tmpstr); 113 slen = strlen(tmpstr);
@@ -128,9 +126,9 @@ static void svc_xprt_free(struct kref *kref)
128 struct svc_xprt *xprt = 126 struct svc_xprt *xprt =
129 container_of(kref, struct svc_xprt, xpt_ref); 127 container_of(kref, struct svc_xprt, xpt_ref);
130 struct module *owner = xprt->xpt_class->xcl_owner; 128 struct module *owner = xprt->xpt_class->xcl_owner;
131 if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags) && 129 if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags))
132 xprt->xpt_auth_cache != NULL) 130 svcauth_unix_info_release(xprt);
133 svcauth_unix_info_release(xprt->xpt_auth_cache); 131 put_net(xprt->xpt_net);
134 xprt->xpt_ops->xpo_free(xprt); 132 xprt->xpt_ops->xpo_free(xprt);
135 module_put(owner); 133 module_put(owner);
136} 134}
@@ -156,15 +154,18 @@ void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt,
156 INIT_LIST_HEAD(&xprt->xpt_list); 154 INIT_LIST_HEAD(&xprt->xpt_list);
157 INIT_LIST_HEAD(&xprt->xpt_ready); 155 INIT_LIST_HEAD(&xprt->xpt_ready);
158 INIT_LIST_HEAD(&xprt->xpt_deferred); 156 INIT_LIST_HEAD(&xprt->xpt_deferred);
157 INIT_LIST_HEAD(&xprt->xpt_users);
159 mutex_init(&xprt->xpt_mutex); 158 mutex_init(&xprt->xpt_mutex);
160 spin_lock_init(&xprt->xpt_lock); 159 spin_lock_init(&xprt->xpt_lock);
161 set_bit(XPT_BUSY, &xprt->xpt_flags); 160 set_bit(XPT_BUSY, &xprt->xpt_flags);
162 rpc_init_wait_queue(&xprt->xpt_bc_pending, "xpt_bc_pending"); 161 rpc_init_wait_queue(&xprt->xpt_bc_pending, "xpt_bc_pending");
162 xprt->xpt_net = get_net(&init_net);
163} 163}
164EXPORT_SYMBOL_GPL(svc_xprt_init); 164EXPORT_SYMBOL_GPL(svc_xprt_init);
165 165
166static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, 166static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
167 struct svc_serv *serv, 167 struct svc_serv *serv,
168 struct net *net,
168 const int family, 169 const int family,
169 const unsigned short port, 170 const unsigned short port,
170 int flags) 171 int flags)
@@ -199,12 +200,12 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
199 return ERR_PTR(-EAFNOSUPPORT); 200 return ERR_PTR(-EAFNOSUPPORT);
200 } 201 }
201 202
202 return xcl->xcl_ops->xpo_create(serv, sap, len, flags); 203 return xcl->xcl_ops->xpo_create(serv, net, sap, len, flags);
203} 204}
204 205
205int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, 206int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
206 const int family, const unsigned short port, 207 struct net *net, const int family,
207 int flags) 208 const unsigned short port, int flags)
208{ 209{
209 struct svc_xprt_class *xcl; 210 struct svc_xprt_class *xcl;
210 211
@@ -220,7 +221,7 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
220 goto err; 221 goto err;
221 222
222 spin_unlock(&svc_xprt_class_lock); 223 spin_unlock(&svc_xprt_class_lock);
223 newxprt = __svc_xpo_create(xcl, serv, family, port, flags); 224 newxprt = __svc_xpo_create(xcl, serv, net, family, port, flags);
224 if (IS_ERR(newxprt)) { 225 if (IS_ERR(newxprt)) {
225 module_put(xcl->xcl_owner); 226 module_put(xcl->xcl_owner);
226 return PTR_ERR(newxprt); 227 return PTR_ERR(newxprt);
@@ -329,12 +330,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
329 "svc_xprt_enqueue: " 330 "svc_xprt_enqueue: "
330 "threads and transports both waiting??\n"); 331 "threads and transports both waiting??\n");
331 332
332 if (test_bit(XPT_DEAD, &xprt->xpt_flags)) {
333 /* Don't enqueue dead transports */
334 dprintk("svc: transport %p is dead, not enqueued\n", xprt);
335 goto out_unlock;
336 }
337
338 pool->sp_stats.packets++; 333 pool->sp_stats.packets++;
339 334
340 /* Mark transport as busy. It will remain in this state until 335 /* Mark transport as busy. It will remain in this state until
@@ -651,6 +646,11 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
651 if (signalled() || kthread_should_stop()) 646 if (signalled() || kthread_should_stop())
652 return -EINTR; 647 return -EINTR;
653 648
649 /* Normally we will wait up to 5 seconds for any required
650 * cache information to be provided.
651 */
652 rqstp->rq_chandle.thread_wait = 5*HZ;
653
654 spin_lock_bh(&pool->sp_lock); 654 spin_lock_bh(&pool->sp_lock);
655 xprt = svc_xprt_dequeue(pool); 655 xprt = svc_xprt_dequeue(pool);
656 if (xprt) { 656 if (xprt) {
@@ -658,6 +658,12 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
658 svc_xprt_get(xprt); 658 svc_xprt_get(xprt);
659 rqstp->rq_reserved = serv->sv_max_mesg; 659 rqstp->rq_reserved = serv->sv_max_mesg;
660 atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); 660 atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
661
662 /* As there is a shortage of threads and this request
663 * had to be queued, don't allow the thread to wait so
664 * long for cache updates.
665 */
666 rqstp->rq_chandle.thread_wait = 1*HZ;
661 } else { 667 } else {
662 /* No data pending. Go to sleep */ 668 /* No data pending. Go to sleep */
663 svc_thread_enqueue(pool, rqstp); 669 svc_thread_enqueue(pool, rqstp);
@@ -868,6 +874,19 @@ static void svc_age_temp_xprts(unsigned long closure)
868 mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); 874 mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ);
869} 875}
870 876
877static void call_xpt_users(struct svc_xprt *xprt)
878{
879 struct svc_xpt_user *u;
880
881 spin_lock(&xprt->xpt_lock);
882 while (!list_empty(&xprt->xpt_users)) {
883 u = list_first_entry(&xprt->xpt_users, struct svc_xpt_user, list);
884 list_del(&u->list);
885 u->callback(u);
886 }
887 spin_unlock(&xprt->xpt_lock);
888}
889
871/* 890/*
872 * Remove a dead transport 891 * Remove a dead transport
873 */ 892 */
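[Editor's note] svc_xprt gains an xpt_users list so other code can register a callback to run when the transport is deleted; call_xpt_users() above simply drains that list under xpt_lock and invokes each callback, and svc_delete_xprt() calls it before dropping its reference. A small userspace sketch of the same drain-and-notify idiom with a hand-rolled singly linked list; the kernel version uses list_head plus a spinlock.

#include <stdio.h>

struct xpt_user {
        struct xpt_user *next;
        void (*callback)(struct xpt_user *);
};

struct xprt {
        struct xpt_user *users;         /* registered death notifications */
};

static void xprt_register_user(struct xprt *x, struct xpt_user *u)
{
        u->next = x->users;
        x->users = u;
}

static void call_xpt_users(struct xprt *x)
{
        /* the kernel walks xpt_users under xpt_lock; single-threaded here */
        while (x->users) {
                struct xpt_user *u = x->users;

                x->users = u->next;     /* unlink first, then notify */
                u->callback(u);
        }
}

static void say_goodbye(struct xpt_user *u)
{
        printf("transport gone, user %p notified\n", (void *)u);
}

int main(void)
{
        struct xprt x = { .users = NULL };
        struct xpt_user a = { .callback = say_goodbye };
        struct xpt_user b = { .callback = say_goodbye };

        xprt_register_user(&x, &a);
        xprt_register_user(&x, &b);
        call_xpt_users(&x);             /* mirrors what svc_delete_xprt() does */
        return 0;
}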
@@ -878,7 +897,7 @@ void svc_delete_xprt(struct svc_xprt *xprt)
878 897
879 /* Only do this once */ 898 /* Only do this once */
880 if (test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) 899 if (test_and_set_bit(XPT_DEAD, &xprt->xpt_flags))
881 return; 900 BUG();
882 901
883 dprintk("svc: svc_delete_xprt(%p)\n", xprt); 902 dprintk("svc: svc_delete_xprt(%p)\n", xprt);
884 xprt->xpt_ops->xpo_detach(xprt); 903 xprt->xpt_ops->xpo_detach(xprt);
@@ -900,6 +919,7 @@ void svc_delete_xprt(struct svc_xprt *xprt)
900 while ((dr = svc_deferred_dequeue(xprt)) != NULL) 919 while ((dr = svc_deferred_dequeue(xprt)) != NULL)
901 kfree(dr); 920 kfree(dr);
902 921
922 call_xpt_users(xprt);
903 svc_xprt_put(xprt); 923 svc_xprt_put(xprt);
904} 924}
905 925
@@ -910,10 +930,7 @@ void svc_close_xprt(struct svc_xprt *xprt)
910 /* someone else will have to effect the close */ 930 /* someone else will have to effect the close */
911 return; 931 return;
912 932
913 svc_xprt_get(xprt);
914 svc_delete_xprt(xprt); 933 svc_delete_xprt(xprt);
915 clear_bit(XPT_BUSY, &xprt->xpt_flags);
916 svc_xprt_put(xprt);
917} 934}
918EXPORT_SYMBOL_GPL(svc_close_xprt); 935EXPORT_SYMBOL_GPL(svc_close_xprt);
919 936
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 207311610988..560677d187f1 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -18,6 +18,8 @@
18 18
19#include <linux/sunrpc/clnt.h> 19#include <linux/sunrpc/clnt.h>
20 20
21#include "netns.h"
22
21/* 23/*
22 * AUTHUNIX and AUTHNULL credentials are both handled here. 24 * AUTHUNIX and AUTHNULL credentials are both handled here.
23 * AUTHNULL is treated just like AUTHUNIX except that the uid/gid 25 * AUTHNULL is treated just like AUTHUNIX except that the uid/gid
@@ -92,7 +94,6 @@ struct ip_map {
92 struct unix_domain *m_client; 94 struct unix_domain *m_client;
93 int m_add_change; 95 int m_add_change;
94}; 96};
95static struct cache_head *ip_table[IP_HASHMAX];
96 97
97static void ip_map_put(struct kref *kref) 98static void ip_map_put(struct kref *kref)
98{ 99{
@@ -178,8 +179,8 @@ static int ip_map_upcall(struct cache_detail *cd, struct cache_head *h)
178 return sunrpc_cache_pipe_upcall(cd, h, ip_map_request); 179 return sunrpc_cache_pipe_upcall(cd, h, ip_map_request);
179} 180}
180 181
181static struct ip_map *ip_map_lookup(char *class, struct in6_addr *addr); 182static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class, struct in6_addr *addr);
182static int ip_map_update(struct ip_map *ipm, struct unix_domain *udom, time_t expiry); 183static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, struct unix_domain *udom, time_t expiry);
183 184
184static int ip_map_parse(struct cache_detail *cd, 185static int ip_map_parse(struct cache_detail *cd,
185 char *mesg, int mlen) 186 char *mesg, int mlen)
@@ -219,10 +220,9 @@ static int ip_map_parse(struct cache_detail *cd,
219 switch (address.sa.sa_family) { 220 switch (address.sa.sa_family) {
220 case AF_INET: 221 case AF_INET:
221 /* Form a mapped IPv4 address in sin6 */ 222 /* Form a mapped IPv4 address in sin6 */
222 memset(&sin6, 0, sizeof(sin6));
223 sin6.sin6_family = AF_INET6; 223 sin6.sin6_family = AF_INET6;
224 sin6.sin6_addr.s6_addr32[2] = htonl(0xffff); 224 ipv6_addr_set_v4mapped(address.s4.sin_addr.s_addr,
225 sin6.sin6_addr.s6_addr32[3] = address.s4.sin_addr.s_addr; 225 &sin6.sin6_addr);
226 break; 226 break;
227#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 227#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
228 case AF_INET6: 228 case AF_INET6:
@@ -249,9 +249,9 @@ static int ip_map_parse(struct cache_detail *cd,
249 dom = NULL; 249 dom = NULL;
250 250
251 /* IPv6 scope IDs are ignored for now */ 251 /* IPv6 scope IDs are ignored for now */
252 ipmp = ip_map_lookup(class, &sin6.sin6_addr); 252 ipmp = __ip_map_lookup(cd, class, &sin6.sin6_addr);
253 if (ipmp) { 253 if (ipmp) {
254 err = ip_map_update(ipmp, 254 err = __ip_map_update(cd, ipmp,
255 container_of(dom, struct unix_domain, h), 255 container_of(dom, struct unix_domain, h),
256 expiry); 256 expiry);
257 } else 257 } else
@@ -294,29 +294,15 @@ static int ip_map_show(struct seq_file *m,
294} 294}
295 295
296 296
297struct cache_detail ip_map_cache = { 297static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class,
298 .owner = THIS_MODULE, 298 struct in6_addr *addr)
299 .hash_size = IP_HASHMAX,
300 .hash_table = ip_table,
301 .name = "auth.unix.ip",
302 .cache_put = ip_map_put,
303 .cache_upcall = ip_map_upcall,
304 .cache_parse = ip_map_parse,
305 .cache_show = ip_map_show,
306 .match = ip_map_match,
307 .init = ip_map_init,
308 .update = update,
309 .alloc = ip_map_alloc,
310};
311
312static struct ip_map *ip_map_lookup(char *class, struct in6_addr *addr)
313{ 299{
314 struct ip_map ip; 300 struct ip_map ip;
315 struct cache_head *ch; 301 struct cache_head *ch;
316 302
317 strcpy(ip.m_class, class); 303 strcpy(ip.m_class, class);
318 ipv6_addr_copy(&ip.m_addr, addr); 304 ipv6_addr_copy(&ip.m_addr, addr);
319 ch = sunrpc_cache_lookup(&ip_map_cache, &ip.h, 305 ch = sunrpc_cache_lookup(cd, &ip.h,
320 hash_str(class, IP_HASHBITS) ^ 306 hash_str(class, IP_HASHBITS) ^
321 hash_ip6(*addr)); 307 hash_ip6(*addr));
322 308
@@ -326,7 +312,17 @@ static struct ip_map *ip_map_lookup(char *class, struct in6_addr *addr)
326 return NULL; 312 return NULL;
327} 313}
328 314
329static int ip_map_update(struct ip_map *ipm, struct unix_domain *udom, time_t expiry) 315static inline struct ip_map *ip_map_lookup(struct net *net, char *class,
316 struct in6_addr *addr)
317{
318 struct sunrpc_net *sn;
319
320 sn = net_generic(net, sunrpc_net_id);
321 return __ip_map_lookup(sn->ip_map_cache, class, addr);
322}
323
324static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm,
325 struct unix_domain *udom, time_t expiry)
330{ 326{
331 struct ip_map ip; 327 struct ip_map ip;
332 struct cache_head *ch; 328 struct cache_head *ch;
@@ -344,17 +340,25 @@ static int ip_map_update(struct ip_map *ipm, struct unix_domain *udom, time_t ex
344 ip.m_add_change++; 340 ip.m_add_change++;
345 } 341 }
346 ip.h.expiry_time = expiry; 342 ip.h.expiry_time = expiry;
347 ch = sunrpc_cache_update(&ip_map_cache, 343 ch = sunrpc_cache_update(cd, &ip.h, &ipm->h,
348 &ip.h, &ipm->h,
349 hash_str(ipm->m_class, IP_HASHBITS) ^ 344 hash_str(ipm->m_class, IP_HASHBITS) ^
350 hash_ip6(ipm->m_addr)); 345 hash_ip6(ipm->m_addr));
351 if (!ch) 346 if (!ch)
352 return -ENOMEM; 347 return -ENOMEM;
353 cache_put(ch, &ip_map_cache); 348 cache_put(ch, cd);
354 return 0; 349 return 0;
355} 350}
356 351
357int auth_unix_add_addr(struct in6_addr *addr, struct auth_domain *dom) 352static inline int ip_map_update(struct net *net, struct ip_map *ipm,
353 struct unix_domain *udom, time_t expiry)
354{
355 struct sunrpc_net *sn;
356
357 sn = net_generic(net, sunrpc_net_id);
358 return __ip_map_update(sn->ip_map_cache, ipm, udom, expiry);
359}
360
361int auth_unix_add_addr(struct net *net, struct in6_addr *addr, struct auth_domain *dom)
358{ 362{
359 struct unix_domain *udom; 363 struct unix_domain *udom;
360 struct ip_map *ipmp; 364 struct ip_map *ipmp;
@@ -362,10 +366,10 @@ int auth_unix_add_addr(struct in6_addr *addr, struct auth_domain *dom)
362 if (dom->flavour != &svcauth_unix) 366 if (dom->flavour != &svcauth_unix)
363 return -EINVAL; 367 return -EINVAL;
364 udom = container_of(dom, struct unix_domain, h); 368 udom = container_of(dom, struct unix_domain, h);
365 ipmp = ip_map_lookup("nfsd", addr); 369 ipmp = ip_map_lookup(net, "nfsd", addr);
366 370
367 if (ipmp) 371 if (ipmp)
368 return ip_map_update(ipmp, udom, NEVER); 372 return ip_map_update(net, ipmp, udom, NEVER);
369 else 373 else
370 return -ENOMEM; 374 return -ENOMEM;
371} 375}
@@ -383,16 +387,18 @@ int auth_unix_forget_old(struct auth_domain *dom)
383} 387}
384EXPORT_SYMBOL_GPL(auth_unix_forget_old); 388EXPORT_SYMBOL_GPL(auth_unix_forget_old);
385 389
386struct auth_domain *auth_unix_lookup(struct in6_addr *addr) 390struct auth_domain *auth_unix_lookup(struct net *net, struct in6_addr *addr)
387{ 391{
388 struct ip_map *ipm; 392 struct ip_map *ipm;
389 struct auth_domain *rv; 393 struct auth_domain *rv;
394 struct sunrpc_net *sn;
390 395
391 ipm = ip_map_lookup("nfsd", addr); 396 sn = net_generic(net, sunrpc_net_id);
397 ipm = ip_map_lookup(net, "nfsd", addr);
392 398
393 if (!ipm) 399 if (!ipm)
394 return NULL; 400 return NULL;
395 if (cache_check(&ip_map_cache, &ipm->h, NULL)) 401 if (cache_check(sn->ip_map_cache, &ipm->h, NULL))
396 return NULL; 402 return NULL;
397 403
398 if ((ipm->m_client->addr_changes - ipm->m_add_change) >0) { 404 if ((ipm->m_client->addr_changes - ipm->m_add_change) >0) {
@@ -403,22 +409,29 @@ struct auth_domain *auth_unix_lookup(struct in6_addr *addr)
403 rv = &ipm->m_client->h; 409 rv = &ipm->m_client->h;
404 kref_get(&rv->ref); 410 kref_get(&rv->ref);
405 } 411 }
406 cache_put(&ipm->h, &ip_map_cache); 412 cache_put(&ipm->h, sn->ip_map_cache);
407 return rv; 413 return rv;
408} 414}
409EXPORT_SYMBOL_GPL(auth_unix_lookup); 415EXPORT_SYMBOL_GPL(auth_unix_lookup);
410 416
411void svcauth_unix_purge(void) 417void svcauth_unix_purge(void)
412{ 418{
413 cache_purge(&ip_map_cache); 419 struct net *net;
420
421 for_each_net(net) {
422 struct sunrpc_net *sn;
423
424 sn = net_generic(net, sunrpc_net_id);
425 cache_purge(sn->ip_map_cache);
426 }
414} 427}
415EXPORT_SYMBOL_GPL(svcauth_unix_purge); 428EXPORT_SYMBOL_GPL(svcauth_unix_purge);
416 429
417static inline struct ip_map * 430static inline struct ip_map *
418ip_map_cached_get(struct svc_rqst *rqstp) 431ip_map_cached_get(struct svc_xprt *xprt)
419{ 432{
420 struct ip_map *ipm = NULL; 433 struct ip_map *ipm = NULL;
421 struct svc_xprt *xprt = rqstp->rq_xprt; 434 struct sunrpc_net *sn;
422 435
423 if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) { 436 if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) {
424 spin_lock(&xprt->xpt_lock); 437 spin_lock(&xprt->xpt_lock);
@@ -430,9 +443,10 @@ ip_map_cached_get(struct svc_rqst *rqstp)
430 * remembered, e.g. by a second mount from the 443 * remembered, e.g. by a second mount from the
431 * same IP address. 444 * same IP address.
432 */ 445 */
446 sn = net_generic(xprt->xpt_net, sunrpc_net_id);
433 xprt->xpt_auth_cache = NULL; 447 xprt->xpt_auth_cache = NULL;
434 spin_unlock(&xprt->xpt_lock); 448 spin_unlock(&xprt->xpt_lock);
435 cache_put(&ipm->h, &ip_map_cache); 449 cache_put(&ipm->h, sn->ip_map_cache);
436 return NULL; 450 return NULL;
437 } 451 }
438 cache_get(&ipm->h); 452 cache_get(&ipm->h);
@@ -443,10 +457,8 @@ ip_map_cached_get(struct svc_rqst *rqstp)
443} 457}
444 458
445static inline void 459static inline void
446ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm) 460ip_map_cached_put(struct svc_xprt *xprt, struct ip_map *ipm)
447{ 461{
448 struct svc_xprt *xprt = rqstp->rq_xprt;
449
450 if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) { 462 if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) {
451 spin_lock(&xprt->xpt_lock); 463 spin_lock(&xprt->xpt_lock);
452 if (xprt->xpt_auth_cache == NULL) { 464 if (xprt->xpt_auth_cache == NULL) {
@@ -456,15 +468,26 @@ ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm)
456 } 468 }
457 spin_unlock(&xprt->xpt_lock); 469 spin_unlock(&xprt->xpt_lock);
458 } 470 }
459 if (ipm) 471 if (ipm) {
460 cache_put(&ipm->h, &ip_map_cache); 472 struct sunrpc_net *sn;
473
474 sn = net_generic(xprt->xpt_net, sunrpc_net_id);
475 cache_put(&ipm->h, sn->ip_map_cache);
476 }
461} 477}
462 478
463void 479void
464svcauth_unix_info_release(void *info) 480svcauth_unix_info_release(struct svc_xprt *xpt)
465{ 481{
466 struct ip_map *ipm = info; 482 struct ip_map *ipm;
467 cache_put(&ipm->h, &ip_map_cache); 483
484 ipm = xpt->xpt_auth_cache;
485 if (ipm != NULL) {
486 struct sunrpc_net *sn;
487
488 sn = net_generic(xpt->xpt_net, sunrpc_net_id);
489 cache_put(&ipm->h, sn->ip_map_cache);
490 }
468} 491}
469 492
470/**************************************************************************** 493/****************************************************************************
@@ -674,6 +697,8 @@ static struct group_info *unix_gid_find(uid_t uid, struct svc_rqst *rqstp)
674 switch (ret) { 697 switch (ret) {
675 case -ENOENT: 698 case -ENOENT:
676 return ERR_PTR(-ENOENT); 699 return ERR_PTR(-ENOENT);
700 case -ETIMEDOUT:
701 return ERR_PTR(-ESHUTDOWN);
677 case 0: 702 case 0:
678 gi = get_group_info(ug->gi); 703 gi = get_group_info(ug->gi);
679 cache_put(&ug->h, &unix_gid_cache); 704 cache_put(&ug->h, &unix_gid_cache);
@@ -691,6 +716,9 @@ svcauth_unix_set_client(struct svc_rqst *rqstp)
691 struct ip_map *ipm; 716 struct ip_map *ipm;
692 struct group_info *gi; 717 struct group_info *gi;
693 struct svc_cred *cred = &rqstp->rq_cred; 718 struct svc_cred *cred = &rqstp->rq_cred;
719 struct svc_xprt *xprt = rqstp->rq_xprt;
720 struct net *net = xprt->xpt_net;
721 struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
694 722
695 switch (rqstp->rq_addr.ss_family) { 723 switch (rqstp->rq_addr.ss_family) {
696 case AF_INET: 724 case AF_INET:
@@ -709,26 +737,27 @@ svcauth_unix_set_client(struct svc_rqst *rqstp)
709 if (rqstp->rq_proc == 0) 737 if (rqstp->rq_proc == 0)
710 return SVC_OK; 738 return SVC_OK;
711 739
712 ipm = ip_map_cached_get(rqstp); 740 ipm = ip_map_cached_get(xprt);
713 if (ipm == NULL) 741 if (ipm == NULL)
714 ipm = ip_map_lookup(rqstp->rq_server->sv_program->pg_class, 742 ipm = __ip_map_lookup(sn->ip_map_cache, rqstp->rq_server->sv_program->pg_class,
715 &sin6->sin6_addr); 743 &sin6->sin6_addr);
716 744
717 if (ipm == NULL) 745 if (ipm == NULL)
718 return SVC_DENIED; 746 return SVC_DENIED;
719 747
720 switch (cache_check(&ip_map_cache, &ipm->h, &rqstp->rq_chandle)) { 748 switch (cache_check(sn->ip_map_cache, &ipm->h, &rqstp->rq_chandle)) {
721 default: 749 default:
722 BUG(); 750 BUG();
723 case -EAGAIN:
724 case -ETIMEDOUT: 751 case -ETIMEDOUT:
752 return SVC_CLOSE;
753 case -EAGAIN:
725 return SVC_DROP; 754 return SVC_DROP;
726 case -ENOENT: 755 case -ENOENT:
727 return SVC_DENIED; 756 return SVC_DENIED;
728 case 0: 757 case 0:
729 rqstp->rq_client = &ipm->m_client->h; 758 rqstp->rq_client = &ipm->m_client->h;
730 kref_get(&rqstp->rq_client->ref); 759 kref_get(&rqstp->rq_client->ref);
731 ip_map_cached_put(rqstp, ipm); 760 ip_map_cached_put(xprt, ipm);
732 break; 761 break;
733 } 762 }
734 763
@@ -736,6 +765,8 @@ svcauth_unix_set_client(struct svc_rqst *rqstp)
736 switch (PTR_ERR(gi)) { 765 switch (PTR_ERR(gi)) {
737 case -EAGAIN: 766 case -EAGAIN:
738 return SVC_DROP; 767 return SVC_DROP;
768 case -ESHUTDOWN:
769 return SVC_CLOSE;
739 case -ENOENT: 770 case -ENOENT:
740 break; 771 break;
741 default: 772 default:
@@ -776,7 +807,7 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp)
776 cred->cr_gid = (gid_t) -1; 807 cred->cr_gid = (gid_t) -1;
777 cred->cr_group_info = groups_alloc(0); 808 cred->cr_group_info = groups_alloc(0);
778 if (cred->cr_group_info == NULL) 809 if (cred->cr_group_info == NULL)
779 return SVC_DROP; /* kmalloc failure - client must retry */ 810 return SVC_CLOSE; /* kmalloc failure - client must retry */
780 811
781 /* Put NULL verifier */ 812 /* Put NULL verifier */
782 svc_putnl(resv, RPC_AUTH_NULL); 813 svc_putnl(resv, RPC_AUTH_NULL);
@@ -840,7 +871,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
840 goto badcred; 871 goto badcred;
841 cred->cr_group_info = groups_alloc(slen); 872 cred->cr_group_info = groups_alloc(slen);
842 if (cred->cr_group_info == NULL) 873 if (cred->cr_group_info == NULL)
843 return SVC_DROP; 874 return SVC_CLOSE;
844 for (i = 0; i < slen; i++) 875 for (i = 0; i < slen; i++)
845 GROUP_AT(cred->cr_group_info, i) = svc_getnl(argv); 876 GROUP_AT(cred->cr_group_info, i) = svc_getnl(argv);
846 if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { 877 if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) {
@@ -886,3 +917,56 @@ struct auth_ops svcauth_unix = {
886 .set_client = svcauth_unix_set_client, 917 .set_client = svcauth_unix_set_client,
887}; 918};
888 919
920int ip_map_cache_create(struct net *net)
921{
922 int err = -ENOMEM;
923 struct cache_detail *cd;
924 struct cache_head **tbl;
925 struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
926
927 cd = kzalloc(sizeof(struct cache_detail), GFP_KERNEL);
928 if (cd == NULL)
929 goto err_cd;
930
931 tbl = kzalloc(IP_HASHMAX * sizeof(struct cache_head *), GFP_KERNEL);
932 if (tbl == NULL)
933 goto err_tbl;
934
935 cd->owner = THIS_MODULE,
936 cd->hash_size = IP_HASHMAX,
937 cd->hash_table = tbl,
938 cd->name = "auth.unix.ip",
939 cd->cache_put = ip_map_put,
940 cd->cache_upcall = ip_map_upcall,
941 cd->cache_parse = ip_map_parse,
942 cd->cache_show = ip_map_show,
943 cd->match = ip_map_match,
944 cd->init = ip_map_init,
945 cd->update = update,
946 cd->alloc = ip_map_alloc,
947
948 err = cache_register_net(cd, net);
949 if (err)
950 goto err_reg;
951
952 sn->ip_map_cache = cd;
953 return 0;
954
955err_reg:
956 kfree(tbl);
957err_tbl:
958 kfree(cd);
959err_cd:
960 return err;
961}
962
963void ip_map_cache_destroy(struct net *net)
964{
965 struct sunrpc_net *sn;
966
967 sn = net_generic(net, sunrpc_net_id);
968 cache_purge(sn->ip_map_cache);
969 cache_unregister_net(sn->ip_map_cache, net);
970 kfree(sn->ip_map_cache->hash_table);
971 kfree(sn->ip_map_cache);
972}
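[Editor's note] ip_map_cache_create() above follows the usual kernel error-unwinding shape: allocate each resource in turn and, on failure, jump to a label that frees only what has already been set up, in reverse order (err_reg, err_tbl, err_cd). A tiny userspace sketch of the same goto-unwind idiom; the resources are stand-ins.

#include <stdlib.h>

struct thing {
        int *a;
        int *b;
};

static int thing_create(struct thing *t)
{
        int err = -1;                   /* kernel code would use -ENOMEM */

        t->a = malloc(sizeof(*t->a));
        if (t->a == NULL)
                goto err_a;

        t->b = malloc(sizeof(*t->b));
        if (t->b == NULL)
                goto err_b;

        return 0;                       /* everything set up */

err_b:
        free(t->a);                     /* undo only what succeeded, newest first */
err_a:
        return err;
}

int main(void)
{
        struct thing t;

        if (thing_create(&t))
                return 1;
        free(t.b);
        free(t.a);
        return 0;
}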
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 7e534dd09077..07919e16be3e 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -64,7 +64,8 @@ static void svc_tcp_sock_detach(struct svc_xprt *);
64static void svc_sock_free(struct svc_xprt *); 64static void svc_sock_free(struct svc_xprt *);
65 65
66static struct svc_xprt *svc_create_socket(struct svc_serv *, int, 66static struct svc_xprt *svc_create_socket(struct svc_serv *, int,
67 struct sockaddr *, int, int); 67 struct net *, struct sockaddr *,
68 int, int);
68#ifdef CONFIG_DEBUG_LOCK_ALLOC 69#ifdef CONFIG_DEBUG_LOCK_ALLOC
69static struct lock_class_key svc_key[2]; 70static struct lock_class_key svc_key[2];
70static struct lock_class_key svc_slock_key[2]; 71static struct lock_class_key svc_slock_key[2];
@@ -657,10 +658,11 @@ static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt)
657} 658}
658 659
659static struct svc_xprt *svc_udp_create(struct svc_serv *serv, 660static struct svc_xprt *svc_udp_create(struct svc_serv *serv,
661 struct net *net,
660 struct sockaddr *sa, int salen, 662 struct sockaddr *sa, int salen,
661 int flags) 663 int flags)
662{ 664{
663 return svc_create_socket(serv, IPPROTO_UDP, sa, salen, flags); 665 return svc_create_socket(serv, IPPROTO_UDP, net, sa, salen, flags);
664} 666}
665 667
666static struct svc_xprt_ops svc_udp_ops = { 668static struct svc_xprt_ops svc_udp_ops = {
@@ -1133,9 +1135,6 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
1133 reclen = htonl(0x80000000|((xbufp->len ) - 4)); 1135 reclen = htonl(0x80000000|((xbufp->len ) - 4));
1134 memcpy(xbufp->head[0].iov_base, &reclen, 4); 1136 memcpy(xbufp->head[0].iov_base, &reclen, 4);
1135 1137
1136 if (test_bit(XPT_DEAD, &rqstp->rq_xprt->xpt_flags))
1137 return -ENOTCONN;
1138
1139 sent = svc_sendto(rqstp, &rqstp->rq_res); 1138 sent = svc_sendto(rqstp, &rqstp->rq_res);
1140 if (sent != xbufp->len) { 1139 if (sent != xbufp->len) {
1141 printk(KERN_NOTICE 1140 printk(KERN_NOTICE
@@ -1178,10 +1177,11 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt)
1178} 1177}
1179 1178
1180static struct svc_xprt *svc_tcp_create(struct svc_serv *serv, 1179static struct svc_xprt *svc_tcp_create(struct svc_serv *serv,
1180 struct net *net,
1181 struct sockaddr *sa, int salen, 1181 struct sockaddr *sa, int salen,
1182 int flags) 1182 int flags)
1183{ 1183{
1184 return svc_create_socket(serv, IPPROTO_TCP, sa, salen, flags); 1184 return svc_create_socket(serv, IPPROTO_TCP, net, sa, salen, flags);
1185} 1185}
1186 1186
1187static struct svc_xprt_ops svc_tcp_ops = { 1187static struct svc_xprt_ops svc_tcp_ops = {
@@ -1258,19 +1258,13 @@ void svc_sock_update_bufs(struct svc_serv *serv)
1258 * The number of server threads has changed. Update 1258 * The number of server threads has changed. Update
1259 * rcvbuf and sndbuf accordingly on all sockets 1259 * rcvbuf and sndbuf accordingly on all sockets
1260 */ 1260 */
1261 struct list_head *le; 1261 struct svc_sock *svsk;
1262 1262
1263 spin_lock_bh(&serv->sv_lock); 1263 spin_lock_bh(&serv->sv_lock);
1264 list_for_each(le, &serv->sv_permsocks) { 1264 list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list)
1265 struct svc_sock *svsk =
1266 list_entry(le, struct svc_sock, sk_xprt.xpt_list);
1267 set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); 1265 set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
1268 } 1266 list_for_each_entry(svsk, &serv->sv_tempsocks, sk_xprt.xpt_list)
1269 list_for_each(le, &serv->sv_tempsocks) {
1270 struct svc_sock *svsk =
1271 list_entry(le, struct svc_sock, sk_xprt.xpt_list);
1272 set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); 1267 set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
1273 }
1274 spin_unlock_bh(&serv->sv_lock); 1268 spin_unlock_bh(&serv->sv_lock);
1275} 1269}
1276EXPORT_SYMBOL_GPL(svc_sock_update_bufs); 1270EXPORT_SYMBOL_GPL(svc_sock_update_bufs);
@@ -1385,6 +1379,7 @@ EXPORT_SYMBOL_GPL(svc_addsock);
1385 */ 1379 */
1386static struct svc_xprt *svc_create_socket(struct svc_serv *serv, 1380static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
1387 int protocol, 1381 int protocol,
1382 struct net *net,
1388 struct sockaddr *sin, int len, 1383 struct sockaddr *sin, int len,
1389 int flags) 1384 int flags)
1390{ 1385{
@@ -1421,7 +1416,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
1421 return ERR_PTR(-EINVAL); 1416 return ERR_PTR(-EINVAL);
1422 } 1417 }
1423 1418
1424 error = sock_create_kern(family, type, protocol, &sock); 1419 error = __sock_create(net, family, type, protocol, &sock, 1);
1425 if (error < 0) 1420 if (error < 0)
1426 return ERR_PTR(error); 1421 return ERR_PTR(error);
1427 1422
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index a1f82a87d34d..cd9e841e7492 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -111,6 +111,23 @@ xdr_decode_string_inplace(__be32 *p, char **sp,
111} 111}
112EXPORT_SYMBOL_GPL(xdr_decode_string_inplace); 112EXPORT_SYMBOL_GPL(xdr_decode_string_inplace);
113 113
114/**
115 * xdr_terminate_string - '\0'-terminate a string residing in an xdr_buf
116 * @buf: XDR buffer where string resides
117 * @len: length of string, in bytes
118 *
119 */
120void
121xdr_terminate_string(struct xdr_buf *buf, const u32 len)
122{
123 char *kaddr;
124
125 kaddr = kmap_atomic(buf->pages[0], KM_USER0);
126 kaddr[buf->page_base + len] = '\0';
127 kunmap_atomic(kaddr, KM_USER0);
128}
129EXPORT_SYMBOL(xdr_terminate_string);
130
114void 131void
115xdr_encode_pages(struct xdr_buf *xdr, struct page **pages, unsigned int base, 132xdr_encode_pages(struct xdr_buf *xdr, struct page **pages, unsigned int base,
116 unsigned int len) 133 unsigned int len)
@@ -395,24 +412,29 @@ xdr_shrink_pagelen(struct xdr_buf *buf, size_t len)
395{ 412{
396 struct kvec *tail; 413 struct kvec *tail;
397 size_t copy; 414 size_t copy;
398 char *p;
399 unsigned int pglen = buf->page_len; 415 unsigned int pglen = buf->page_len;
416 unsigned int tailbuf_len;
400 417
401 tail = buf->tail; 418 tail = buf->tail;
402 BUG_ON (len > pglen); 419 BUG_ON (len > pglen);
403 420
421 tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len;
422
404 /* Shift the tail first */ 423 /* Shift the tail first */
405 if (tail->iov_len != 0) { 424 if (tailbuf_len != 0) {
406 p = (char *)tail->iov_base + len; 425 unsigned int free_space = tailbuf_len - tail->iov_len;
426
427 if (len < free_space)
428 free_space = len;
429 tail->iov_len += free_space;
430
431 copy = len;
407 if (tail->iov_len > len) { 432 if (tail->iov_len > len) {
408 copy = tail->iov_len - len; 433 char *p = (char *)tail->iov_base + len;
409 memmove(p, tail->iov_base, copy); 434 memmove(p, tail->iov_base, tail->iov_len - len);
410 } else 435 } else
411 buf->buflen -= len;
412 /* Copy from the inlined pages into the tail */
413 copy = len;
414 if (copy > tail->iov_len)
415 copy = tail->iov_len; 436 copy = tail->iov_len;
437 /* Copy from the inlined pages into the tail */
416 _copy_from_pages((char *)tail->iov_base, 438 _copy_from_pages((char *)tail->iov_base,
417 buf->pages, buf->page_base + pglen - len, 439 buf->pages, buf->page_base + pglen - len,
418 copy); 440 copy);
@@ -551,6 +573,27 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
551EXPORT_SYMBOL_GPL(xdr_init_decode); 573EXPORT_SYMBOL_GPL(xdr_init_decode);
552 574
553/** 575/**
576 * xdr_inline_peek - Allow read-ahead in the XDR data stream
577 * @xdr: pointer to xdr_stream struct
578 * @nbytes: number of bytes of data to decode
579 *
580 * Check if the input buffer is long enough to enable us to decode
581 * 'nbytes' more bytes of data starting at the current position.
582 * If so return the current pointer without updating the current
583 * pointer position.
584 */
585__be32 * xdr_inline_peek(struct xdr_stream *xdr, size_t nbytes)
586{
587 __be32 *p = xdr->p;
588 __be32 *q = p + XDR_QUADLEN(nbytes);
589
590 if (unlikely(q > xdr->end || q < p))
591 return NULL;
592 return p;
593}
594EXPORT_SYMBOL_GPL(xdr_inline_peek);
595
596/**
554 * xdr_inline_decode - Retrieve non-page XDR data to decode 597 * xdr_inline_decode - Retrieve non-page XDR data to decode
555 * @xdr: pointer to xdr_stream struct 598 * @xdr: pointer to xdr_stream struct
556 * @nbytes: number of bytes of data to decode 599 * @nbytes: number of bytes of data to decode
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 970fb00f388c..4c8f18aff7c3 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -199,8 +199,6 @@ int xprt_reserve_xprt(struct rpc_task *task)
199 if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { 199 if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
200 if (task == xprt->snd_task) 200 if (task == xprt->snd_task)
201 return 1; 201 return 1;
202 if (task == NULL)
203 return 0;
204 goto out_sleep; 202 goto out_sleep;
205 } 203 }
206 xprt->snd_task = task; 204 xprt->snd_task = task;
@@ -757,13 +755,11 @@ static void xprt_connect_status(struct rpc_task *task)
757 */ 755 */
758struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) 756struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
759{ 757{
760 struct list_head *pos; 758 struct rpc_rqst *entry;
761 759
762 list_for_each(pos, &xprt->recv) { 760 list_for_each_entry(entry, &xprt->recv, rq_list)
763 struct rpc_rqst *entry = list_entry(pos, struct rpc_rqst, rq_list);
764 if (entry->rq_xid == xid) 761 if (entry->rq_xid == xid)
765 return entry; 762 return entry;
766 }
767 763
768 dprintk("RPC: xprt_lookup_rqst did not find xid %08x\n", 764 dprintk("RPC: xprt_lookup_rqst did not find xid %08x\n",
769 ntohl(xid)); 765 ntohl(xid));
@@ -962,6 +958,37 @@ static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
962 spin_unlock(&xprt->reserve_lock); 958 spin_unlock(&xprt->reserve_lock);
963} 959}
964 960
961struct rpc_xprt *xprt_alloc(struct net *net, int size, int max_req)
962{
963 struct rpc_xprt *xprt;
964
965 xprt = kzalloc(size, GFP_KERNEL);
966 if (xprt == NULL)
967 goto out;
968
969 xprt->max_reqs = max_req;
970 xprt->slot = kcalloc(max_req, sizeof(struct rpc_rqst), GFP_KERNEL);
971 if (xprt->slot == NULL)
972 goto out_free;
973
974 xprt->xprt_net = get_net(net);
975 return xprt;
976
977out_free:
978 kfree(xprt);
979out:
980 return NULL;
981}
982EXPORT_SYMBOL_GPL(xprt_alloc);
983
984void xprt_free(struct rpc_xprt *xprt)
985{
986 put_net(xprt->xprt_net);
987 kfree(xprt->slot);
988 kfree(xprt);
989}
990EXPORT_SYMBOL_GPL(xprt_free);
991
965/** 992/**
966 * xprt_reserve - allocate an RPC request slot 993 * xprt_reserve - allocate an RPC request slot
967 * @task: RPC task requesting a slot allocation 994 * @task: RPC task requesting a slot allocation
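
Note on the xprt.c hunk: xprt_alloc()/xprt_free() stop every transport from open-coding "allocate the xprt plus its request slot table, free both on teardown", and they also take and drop the network-namespace reference. A rough userspace sketch of the same paired constructor/destructor with goto-based unwind (stand-in types, not the kernel API):

#include <stdlib.h>

struct rqst { int xid; };

struct xprt {
	size_t max_reqs;
	struct rqst *slot;
	/* a larger transport-specific struct would embed this one */
};

/* Allocate 'size' bytes (>= sizeof(struct xprt)) plus a slot table.
 * On any failure undo what was done so far and return NULL. */
static struct xprt *xprt_alloc_sketch(size_t size, size_t max_req)
{
	struct xprt *xprt = calloc(1, size);

	if (xprt == NULL)
		goto out;
	xprt->max_reqs = max_req;
	xprt->slot = calloc(max_req, sizeof(struct rqst));
	if (xprt->slot == NULL)
		goto out_free;
	return xprt;

out_free:
	free(xprt);
out:
	return NULL;
}

static void xprt_free_sketch(struct xprt *xprt)
{
	free(xprt->slot);
	free(xprt);
}

int main(void)
{
	struct xprt *x = xprt_alloc_sketch(sizeof(struct xprt), 16);

	if (x)
		xprt_free_sketch(x);
	return 0;
}
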
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index e5e28d1946a4..2ac3f6e8adff 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -249,6 +249,8 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
249 req->rl_nchunks = nchunks; 249 req->rl_nchunks = nchunks;
250 250
251 BUG_ON(nchunks == 0); 251 BUG_ON(nchunks == 0);
252 BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
253 && (nchunks > 3));
252 254
253 /* 255 /*
254 * finish off header. If write, marshal discrim and nchunks. 256 * finish off header. If write, marshal discrim and nchunks.
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index d718b8fa9525..09af4fab1a45 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -43,6 +43,7 @@
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/fs.h> 44#include <linux/fs.h>
45#include <linux/sysctl.h> 45#include <linux/sysctl.h>
46#include <linux/workqueue.h>
46#include <linux/sunrpc/clnt.h> 47#include <linux/sunrpc/clnt.h>
47#include <linux/sunrpc/sched.h> 48#include <linux/sunrpc/sched.h>
48#include <linux/sunrpc/svc_rdma.h> 49#include <linux/sunrpc/svc_rdma.h>
@@ -74,6 +75,8 @@ atomic_t rdma_stat_sq_prod;
74struct kmem_cache *svc_rdma_map_cachep; 75struct kmem_cache *svc_rdma_map_cachep;
75struct kmem_cache *svc_rdma_ctxt_cachep; 76struct kmem_cache *svc_rdma_ctxt_cachep;
76 77
78struct workqueue_struct *svc_rdma_wq;
79
77/* 80/*
78 * This function implements reading and resetting an atomic_t stat 81 * This function implements reading and resetting an atomic_t stat
79 * variable through read/write to a proc file. Any write to the file 82 * variable through read/write to a proc file. Any write to the file
@@ -231,7 +234,7 @@ static ctl_table svcrdma_root_table[] = {
231void svc_rdma_cleanup(void) 234void svc_rdma_cleanup(void)
232{ 235{
233 dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); 236 dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
234 flush_scheduled_work(); 237 destroy_workqueue(svc_rdma_wq);
235 if (svcrdma_table_header) { 238 if (svcrdma_table_header) {
236 unregister_sysctl_table(svcrdma_table_header); 239 unregister_sysctl_table(svcrdma_table_header);
237 svcrdma_table_header = NULL; 240 svcrdma_table_header = NULL;
@@ -249,6 +252,11 @@ int svc_rdma_init(void)
249 dprintk("\tsq_depth : %d\n", 252 dprintk("\tsq_depth : %d\n",
250 svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT); 253 svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
251 dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); 254 dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);
255
256 svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0);
257 if (!svc_rdma_wq)
258 return -ENOMEM;
259
252 if (!svcrdma_table_header) 260 if (!svcrdma_table_header)
253 svcrdma_table_header = 261 svcrdma_table_header =
254 register_sysctl_table(svcrdma_root_table); 262 register_sysctl_table(svcrdma_root_table);
@@ -283,6 +291,7 @@ int svc_rdma_init(void)
283 kmem_cache_destroy(svc_rdma_map_cachep); 291 kmem_cache_destroy(svc_rdma_map_cachep);
284 err0: 292 err0:
285 unregister_sysctl_table(svcrdma_table_header); 293 unregister_sysctl_table(svcrdma_table_header);
294 destroy_workqueue(svc_rdma_wq);
286 return -ENOMEM; 295 return -ENOMEM;
287} 296}
288MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>"); 297MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
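
Note on the svc_rdma.c hunks: svc_rdma_init() now creates a private "svc_rdma" workqueue before anything else, and both svc_rdma_cleanup() and the init error path destroy it, replacing the old reliance on the shared kernel workqueue and flush_scheduled_work(). A minimal sketch of that acquire-early / release-on-every-error-path shape, with stand-in resources in place of real workqueues and caches:

#include <stdlib.h>

/* Stand-ins for the module's resources (not kernel calls). */
static void *create_queue(void) { return malloc(1); }
static void destroy_queue(void *q) { free(q); }
static void *create_cache(void) { return malloc(1); }
static void destroy_cache(void *c) { free(c); }

static void *wq, *map_cache, *ctxt_cache;

static int module_init_sketch(void)
{
	wq = create_queue();		/* created first ... */
	if (!wq)
		return -1;
	map_cache = create_cache();
	if (!map_cache)
		goto err0;
	ctxt_cache = create_cache();
	if (!ctxt_cache)
		goto err1;
	return 0;

err1:
	destroy_cache(map_cache);
err0:
	destroy_queue(wq);		/* ... so every later failure can undo it */
	return -1;
}

static void module_exit_sketch(void)
{
	destroy_cache(ctxt_cache);
	destroy_cache(map_cache);
	destroy_queue(wq);
}

int main(void)
{
	if (module_init_sketch() == 0)
		module_exit_sketch();
	return 0;
}
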
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 0194de814933..df67211c4baf 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -263,9 +263,9 @@ static int fast_reg_read_chunks(struct svcxprt_rdma *xprt,
263 frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT; 263 frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT;
264 for (page_no = 0; page_no < frmr->page_list_len; page_no++) { 264 for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
265 frmr->page_list->page_list[page_no] = 265 frmr->page_list->page_list[page_no] =
266 ib_dma_map_single(xprt->sc_cm_id->device, 266 ib_dma_map_page(xprt->sc_cm_id->device,
267 page_address(rqstp->rq_arg.pages[page_no]), 267 rqstp->rq_arg.pages[page_no], 0,
268 PAGE_SIZE, DMA_FROM_DEVICE); 268 PAGE_SIZE, DMA_FROM_DEVICE);
269 if (ib_dma_mapping_error(xprt->sc_cm_id->device, 269 if (ib_dma_mapping_error(xprt->sc_cm_id->device,
270 frmr->page_list->page_list[page_no])) 270 frmr->page_list->page_list[page_no]))
271 goto fatal_err; 271 goto fatal_err;
@@ -309,17 +309,21 @@ static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt,
309 int count) 309 int count)
310{ 310{
311 int i; 311 int i;
312 unsigned long off;
312 313
313 ctxt->count = count; 314 ctxt->count = count;
314 ctxt->direction = DMA_FROM_DEVICE; 315 ctxt->direction = DMA_FROM_DEVICE;
315 for (i = 0; i < count; i++) { 316 for (i = 0; i < count; i++) {
316 ctxt->sge[i].length = 0; /* in case map fails */ 317 ctxt->sge[i].length = 0; /* in case map fails */
317 if (!frmr) { 318 if (!frmr) {
319 BUG_ON(0 == virt_to_page(vec[i].iov_base));
320 off = (unsigned long)vec[i].iov_base & ~PAGE_MASK;
318 ctxt->sge[i].addr = 321 ctxt->sge[i].addr =
319 ib_dma_map_single(xprt->sc_cm_id->device, 322 ib_dma_map_page(xprt->sc_cm_id->device,
320 vec[i].iov_base, 323 virt_to_page(vec[i].iov_base),
321 vec[i].iov_len, 324 off,
322 DMA_FROM_DEVICE); 325 vec[i].iov_len,
326 DMA_FROM_DEVICE);
323 if (ib_dma_mapping_error(xprt->sc_cm_id->device, 327 if (ib_dma_mapping_error(xprt->sc_cm_id->device,
324 ctxt->sge[i].addr)) 328 ctxt->sge[i].addr))
325 return -EINVAL; 329 return -EINVAL;
@@ -491,6 +495,7 @@ next_sge:
491 printk(KERN_ERR "svcrdma: Error %d posting RDMA_READ\n", 495 printk(KERN_ERR "svcrdma: Error %d posting RDMA_READ\n",
492 err); 496 err);
493 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); 497 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
498 svc_rdma_unmap_dma(ctxt);
494 svc_rdma_put_context(ctxt, 0); 499 svc_rdma_put_context(ctxt, 0);
495 goto out; 500 goto out;
496 } 501 }
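
Note on the svc_rdma_recvfrom.c hunks: the receive side stops mapping kernel virtual addresses with ib_dma_map_single() and instead maps the backing page plus an in-page offset (virt_to_page() and the address's low bits), which also works for memory without a permanent kernel mapping. The arithmetic is a simple page split; a small userspace illustration, assuming 4 KiB pages:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

int main(void)
{
	uintptr_t addr = 0x12345678;

	uintptr_t page_base = addr & PAGE_MASK;   /* what virt_to_page() identifies */
	uintptr_t offset    = addr & ~PAGE_MASK;  /* in-page offset passed to the mapping */

	printf("page base 0x%lx, offset 0x%lx\n",
	       (unsigned long)page_base, (unsigned long)offset);
	return 0;
}
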
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index b15e1ebb2bfa..249a835b703f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -70,8 +70,8 @@
70 * on extra page for the RPCRMDA header. 70 * on extra page for the RPCRMDA header.
71 */ 71 */
72static int fast_reg_xdr(struct svcxprt_rdma *xprt, 72static int fast_reg_xdr(struct svcxprt_rdma *xprt,
73 struct xdr_buf *xdr, 73 struct xdr_buf *xdr,
74 struct svc_rdma_req_map *vec) 74 struct svc_rdma_req_map *vec)
75{ 75{
76 int sge_no; 76 int sge_no;
77 u32 sge_bytes; 77 u32 sge_bytes;
@@ -96,21 +96,25 @@ static int fast_reg_xdr(struct svcxprt_rdma *xprt,
96 vec->count = 2; 96 vec->count = 2;
97 sge_no++; 97 sge_no++;
98 98
99 /* Build the FRMR */ 99 /* Map the XDR head */
100 frmr->kva = frva; 100 frmr->kva = frva;
101 frmr->direction = DMA_TO_DEVICE; 101 frmr->direction = DMA_TO_DEVICE;
102 frmr->access_flags = 0; 102 frmr->access_flags = 0;
103 frmr->map_len = PAGE_SIZE; 103 frmr->map_len = PAGE_SIZE;
104 frmr->page_list_len = 1; 104 frmr->page_list_len = 1;
105 page_off = (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK;
105 frmr->page_list->page_list[page_no] = 106 frmr->page_list->page_list[page_no] =
106 ib_dma_map_single(xprt->sc_cm_id->device, 107 ib_dma_map_page(xprt->sc_cm_id->device,
107 (void *)xdr->head[0].iov_base, 108 virt_to_page(xdr->head[0].iov_base),
108 PAGE_SIZE, DMA_TO_DEVICE); 109 page_off,
110 PAGE_SIZE - page_off,
111 DMA_TO_DEVICE);
109 if (ib_dma_mapping_error(xprt->sc_cm_id->device, 112 if (ib_dma_mapping_error(xprt->sc_cm_id->device,
110 frmr->page_list->page_list[page_no])) 113 frmr->page_list->page_list[page_no]))
111 goto fatal_err; 114 goto fatal_err;
112 atomic_inc(&xprt->sc_dma_used); 115 atomic_inc(&xprt->sc_dma_used);
113 116
117 /* Map the XDR page list */
114 page_off = xdr->page_base; 118 page_off = xdr->page_base;
115 page_bytes = xdr->page_len + page_off; 119 page_bytes = xdr->page_len + page_off;
116 if (!page_bytes) 120 if (!page_bytes)
@@ -128,9 +132,9 @@ static int fast_reg_xdr(struct svcxprt_rdma *xprt,
128 page_bytes -= sge_bytes; 132 page_bytes -= sge_bytes;
129 133
130 frmr->page_list->page_list[page_no] = 134 frmr->page_list->page_list[page_no] =
131 ib_dma_map_single(xprt->sc_cm_id->device, 135 ib_dma_map_page(xprt->sc_cm_id->device,
132 page_address(page), 136 page, page_off,
133 PAGE_SIZE, DMA_TO_DEVICE); 137 sge_bytes, DMA_TO_DEVICE);
134 if (ib_dma_mapping_error(xprt->sc_cm_id->device, 138 if (ib_dma_mapping_error(xprt->sc_cm_id->device,
135 frmr->page_list->page_list[page_no])) 139 frmr->page_list->page_list[page_no]))
136 goto fatal_err; 140 goto fatal_err;
@@ -166,8 +170,10 @@ static int fast_reg_xdr(struct svcxprt_rdma *xprt,
166 vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; 170 vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
167 171
168 frmr->page_list->page_list[page_no] = 172 frmr->page_list->page_list[page_no] =
169 ib_dma_map_single(xprt->sc_cm_id->device, va, PAGE_SIZE, 173 ib_dma_map_page(xprt->sc_cm_id->device, virt_to_page(va),
170 DMA_TO_DEVICE); 174 page_off,
175 PAGE_SIZE,
176 DMA_TO_DEVICE);
171 if (ib_dma_mapping_error(xprt->sc_cm_id->device, 177 if (ib_dma_mapping_error(xprt->sc_cm_id->device,
172 frmr->page_list->page_list[page_no])) 178 frmr->page_list->page_list[page_no]))
173 goto fatal_err; 179 goto fatal_err;
@@ -245,6 +251,35 @@ static int map_xdr(struct svcxprt_rdma *xprt,
245 return 0; 251 return 0;
246} 252}
247 253
254static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
255 struct xdr_buf *xdr,
256 u32 xdr_off, size_t len, int dir)
257{
258 struct page *page;
259 dma_addr_t dma_addr;
260 if (xdr_off < xdr->head[0].iov_len) {
261 /* This offset is in the head */
262 xdr_off += (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK;
263 page = virt_to_page(xdr->head[0].iov_base);
264 } else {
265 xdr_off -= xdr->head[0].iov_len;
266 if (xdr_off < xdr->page_len) {
267 /* This offset is in the page list */
268 page = xdr->pages[xdr_off >> PAGE_SHIFT];
269 xdr_off &= ~PAGE_MASK;
270 } else {
271 /* This offset is in the tail */
272 xdr_off -= xdr->page_len;
273 xdr_off += (unsigned long)
274 xdr->tail[0].iov_base & ~PAGE_MASK;
275 page = virt_to_page(xdr->tail[0].iov_base);
276 }
277 }
278 dma_addr = ib_dma_map_page(xprt->sc_cm_id->device, page, xdr_off,
279 min_t(size_t, PAGE_SIZE, len), dir);
280 return dma_addr;
281}
282
248/* Assumptions: 283/* Assumptions:
249 * - We are using FRMR 284 * - We are using FRMR
250 * - or - 285 * - or -
@@ -293,10 +328,9 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
293 sge[sge_no].length = sge_bytes; 328 sge[sge_no].length = sge_bytes;
294 if (!vec->frmr) { 329 if (!vec->frmr) {
295 sge[sge_no].addr = 330 sge[sge_no].addr =
296 ib_dma_map_single(xprt->sc_cm_id->device, 331 dma_map_xdr(xprt, &rqstp->rq_res, xdr_off,
297 (void *) 332 sge_bytes, DMA_TO_DEVICE);
298 vec->sge[xdr_sge_no].iov_base + sge_off, 333 xdr_off += sge_bytes;
299 sge_bytes, DMA_TO_DEVICE);
300 if (ib_dma_mapping_error(xprt->sc_cm_id->device, 334 if (ib_dma_mapping_error(xprt->sc_cm_id->device,
301 sge[sge_no].addr)) 335 sge[sge_no].addr))
302 goto err; 336 goto err;
@@ -333,6 +367,8 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
333 goto err; 367 goto err;
334 return 0; 368 return 0;
335 err: 369 err:
370 svc_rdma_unmap_dma(ctxt);
371 svc_rdma_put_frmr(xprt, vec->frmr);
336 svc_rdma_put_context(ctxt, 0); 372 svc_rdma_put_context(ctxt, 0);
337 /* Fatal error, close transport */ 373 /* Fatal error, close transport */
338 return -EIO; 374 return -EIO;
@@ -494,7 +530,8 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
494 * In all three cases, this function prepares the RPCRDMA header in 530 * In all three cases, this function prepares the RPCRDMA header in
495 * sge[0], the 'type' parameter indicates the type to place in the 531 * sge[0], the 'type' parameter indicates the type to place in the
496 * RPCRDMA header, and the 'byte_count' field indicates how much of 532 * RPCRDMA header, and the 'byte_count' field indicates how much of
497 * the XDR to include in this RDMA_SEND. 533 * the XDR to include in this RDMA_SEND. NB: The offset of the payload
534 * to send is zero in the XDR.
498 */ 535 */
499static int send_reply(struct svcxprt_rdma *rdma, 536static int send_reply(struct svcxprt_rdma *rdma,
500 struct svc_rqst *rqstp, 537 struct svc_rqst *rqstp,
@@ -536,23 +573,24 @@ static int send_reply(struct svcxprt_rdma *rdma,
536 ctxt->sge[0].lkey = rdma->sc_dma_lkey; 573 ctxt->sge[0].lkey = rdma->sc_dma_lkey;
537 ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); 574 ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
538 ctxt->sge[0].addr = 575 ctxt->sge[0].addr =
539 ib_dma_map_single(rdma->sc_cm_id->device, page_address(page), 576 ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
540 ctxt->sge[0].length, DMA_TO_DEVICE); 577 ctxt->sge[0].length, DMA_TO_DEVICE);
541 if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) 578 if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
542 goto err; 579 goto err;
543 atomic_inc(&rdma->sc_dma_used); 580 atomic_inc(&rdma->sc_dma_used);
544 581
545 ctxt->direction = DMA_TO_DEVICE; 582 ctxt->direction = DMA_TO_DEVICE;
546 583
547 /* Determine how many of our SGE are to be transmitted */ 584 /* Map the payload indicated by 'byte_count' */
548 for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { 585 for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
586 int xdr_off = 0;
549 sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); 587 sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
550 byte_count -= sge_bytes; 588 byte_count -= sge_bytes;
551 if (!vec->frmr) { 589 if (!vec->frmr) {
552 ctxt->sge[sge_no].addr = 590 ctxt->sge[sge_no].addr =
553 ib_dma_map_single(rdma->sc_cm_id->device, 591 dma_map_xdr(rdma, &rqstp->rq_res, xdr_off,
554 vec->sge[sge_no].iov_base, 592 sge_bytes, DMA_TO_DEVICE);
555 sge_bytes, DMA_TO_DEVICE); 593 xdr_off += sge_bytes;
556 if (ib_dma_mapping_error(rdma->sc_cm_id->device, 594 if (ib_dma_mapping_error(rdma->sc_cm_id->device,
557 ctxt->sge[sge_no].addr)) 595 ctxt->sge[sge_no].addr))
558 goto err; 596 goto err;
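
Note on the svc_rdma_sendto.c hunks: the new dma_map_xdr() turns a byte offset into the reply xdr_buf into a page and in-page offset by walking the buffer's three regions in order: head, page list, tail. A toy, self-contained version of that resolution logic (structure names and sizes are made up for illustration):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_MASK  (~((1UL << PAGE_SHIFT) - 1))

struct toy_xdr_buf {
	unsigned long head_len;   /* bytes in the head iovec */
	unsigned long page_len;   /* bytes spread across the page list */
	unsigned long tail_len;   /* bytes in the tail iovec */
};

/* Classify an offset into the buffer and, when it falls in the page list,
 * report the page index and in-page offset. */
static void resolve(const struct toy_xdr_buf *b, unsigned long off)
{
	if (off < b->head_len) {
		printf("offset %lu: head\n", off);
	} else {
		off -= b->head_len;
		if (off < b->page_len)
			printf("offset-in-pages %lu: page %lu, offset 0x%lx\n",
			       off, off >> PAGE_SHIFT, off & ~PAGE_MASK);
		else
			printf("offset %lu into tail\n", off - b->page_len);
	}
}

int main(void)
{
	struct toy_xdr_buf b = { 128, 3 * 4096, 64 };

	resolve(&b, 100);                 /* head */
	resolve(&b, 128 + 5000);          /* second page of the page list */
	resolve(&b, 128 + 3 * 4096 + 10); /* tail */
	return 0;
}
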
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index edea15a54e51..9df1eadc912a 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -45,6 +45,7 @@
45#include <linux/sched.h> 45#include <linux/sched.h>
46#include <linux/slab.h> 46#include <linux/slab.h>
47#include <linux/spinlock.h> 47#include <linux/spinlock.h>
48#include <linux/workqueue.h>
48#include <rdma/ib_verbs.h> 49#include <rdma/ib_verbs.h>
49#include <rdma/rdma_cm.h> 50#include <rdma/rdma_cm.h>
50#include <linux/sunrpc/svc_rdma.h> 51#include <linux/sunrpc/svc_rdma.h>
@@ -52,6 +53,7 @@
52#define RPCDBG_FACILITY RPCDBG_SVCXPRT 53#define RPCDBG_FACILITY RPCDBG_SVCXPRT
53 54
54static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, 55static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
56 struct net *net,
55 struct sockaddr *sa, int salen, 57 struct sockaddr *sa, int salen,
56 int flags); 58 int flags);
57static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt); 59static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt);
@@ -89,6 +91,9 @@ struct svc_xprt_class svc_rdma_class = {
89/* WR context cache. Created in svc_rdma.c */ 91/* WR context cache. Created in svc_rdma.c */
90extern struct kmem_cache *svc_rdma_ctxt_cachep; 92extern struct kmem_cache *svc_rdma_ctxt_cachep;
91 93
94/* Workqueue created in svc_rdma.c */
95extern struct workqueue_struct *svc_rdma_wq;
96
92struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) 97struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
93{ 98{
94 struct svc_rdma_op_ctxt *ctxt; 99 struct svc_rdma_op_ctxt *ctxt;
@@ -120,7 +125,7 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
120 */ 125 */
121 if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) { 126 if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) {
122 atomic_dec(&xprt->sc_dma_used); 127 atomic_dec(&xprt->sc_dma_used);
123 ib_dma_unmap_single(xprt->sc_cm_id->device, 128 ib_dma_unmap_page(xprt->sc_cm_id->device,
124 ctxt->sge[i].addr, 129 ctxt->sge[i].addr,
125 ctxt->sge[i].length, 130 ctxt->sge[i].length,
126 ctxt->direction); 131 ctxt->direction);
@@ -502,8 +507,8 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
502 BUG_ON(sge_no >= xprt->sc_max_sge); 507 BUG_ON(sge_no >= xprt->sc_max_sge);
503 page = svc_rdma_get_page(); 508 page = svc_rdma_get_page();
504 ctxt->pages[sge_no] = page; 509 ctxt->pages[sge_no] = page;
505 pa = ib_dma_map_single(xprt->sc_cm_id->device, 510 pa = ib_dma_map_page(xprt->sc_cm_id->device,
506 page_address(page), PAGE_SIZE, 511 page, 0, PAGE_SIZE,
507 DMA_FROM_DEVICE); 512 DMA_FROM_DEVICE);
508 if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa)) 513 if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa))
509 goto err_put_ctxt; 514 goto err_put_ctxt;
@@ -511,9 +516,9 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
511 ctxt->sge[sge_no].addr = pa; 516 ctxt->sge[sge_no].addr = pa;
512 ctxt->sge[sge_no].length = PAGE_SIZE; 517 ctxt->sge[sge_no].length = PAGE_SIZE;
513 ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey; 518 ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey;
519 ctxt->count = sge_no + 1;
514 buflen += PAGE_SIZE; 520 buflen += PAGE_SIZE;
515 } 521 }
516 ctxt->count = sge_no;
517 recv_wr.next = NULL; 522 recv_wr.next = NULL;
518 recv_wr.sg_list = &ctxt->sge[0]; 523 recv_wr.sg_list = &ctxt->sge[0];
519 recv_wr.num_sge = ctxt->count; 524 recv_wr.num_sge = ctxt->count;
@@ -529,6 +534,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
529 return ret; 534 return ret;
530 535
531 err_put_ctxt: 536 err_put_ctxt:
537 svc_rdma_unmap_dma(ctxt);
532 svc_rdma_put_context(ctxt, 1); 538 svc_rdma_put_context(ctxt, 1);
533 return -ENOMEM; 539 return -ENOMEM;
534} 540}
@@ -670,6 +676,7 @@ static int rdma_cma_handler(struct rdma_cm_id *cma_id,
670 * Create a listening RDMA service endpoint. 676 * Create a listening RDMA service endpoint.
671 */ 677 */
672static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, 678static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
679 struct net *net,
673 struct sockaddr *sa, int salen, 680 struct sockaddr *sa, int salen,
674 int flags) 681 int flags)
675{ 682{
@@ -798,8 +805,8 @@ static void frmr_unmap_dma(struct svcxprt_rdma *xprt,
798 if (ib_dma_mapping_error(frmr->mr->device, addr)) 805 if (ib_dma_mapping_error(frmr->mr->device, addr))
799 continue; 806 continue;
800 atomic_dec(&xprt->sc_dma_used); 807 atomic_dec(&xprt->sc_dma_used);
801 ib_dma_unmap_single(frmr->mr->device, addr, PAGE_SIZE, 808 ib_dma_unmap_page(frmr->mr->device, addr, PAGE_SIZE,
802 frmr->direction); 809 frmr->direction);
803 } 810 }
804} 811}
805 812
@@ -1184,7 +1191,7 @@ static void svc_rdma_free(struct svc_xprt *xprt)
1184 struct svcxprt_rdma *rdma = 1191 struct svcxprt_rdma *rdma =
1185 container_of(xprt, struct svcxprt_rdma, sc_xprt); 1192 container_of(xprt, struct svcxprt_rdma, sc_xprt);
1186 INIT_WORK(&rdma->sc_work, __svc_rdma_free); 1193 INIT_WORK(&rdma->sc_work, __svc_rdma_free);
1187 schedule_work(&rdma->sc_work); 1194 queue_work(svc_rdma_wq, &rdma->sc_work);
1188} 1195}
1189 1196
1190static int svc_rdma_has_wspace(struct svc_xprt *xprt) 1197static int svc_rdma_has_wspace(struct svc_xprt *xprt)
@@ -1274,7 +1281,7 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
1274 atomic_read(&xprt->sc_sq_count) < 1281 atomic_read(&xprt->sc_sq_count) <
1275 xprt->sc_sq_depth); 1282 xprt->sc_sq_depth);
1276 if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) 1283 if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
1277 return 0; 1284 return -ENOTCONN;
1278 continue; 1285 continue;
1279 } 1286 }
1280 /* Take a transport ref for each WR posted */ 1287 /* Take a transport ref for each WR posted */
@@ -1306,7 +1313,6 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
1306 enum rpcrdma_errcode err) 1313 enum rpcrdma_errcode err)
1307{ 1314{
1308 struct ib_send_wr err_wr; 1315 struct ib_send_wr err_wr;
1309 struct ib_sge sge;
1310 struct page *p; 1316 struct page *p;
1311 struct svc_rdma_op_ctxt *ctxt; 1317 struct svc_rdma_op_ctxt *ctxt;
1312 u32 *va; 1318 u32 *va;
@@ -1319,26 +1325,27 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
1319 /* XDR encode error */ 1325 /* XDR encode error */
1320 length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); 1326 length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
1321 1327
1328 ctxt = svc_rdma_get_context(xprt);
1329 ctxt->direction = DMA_FROM_DEVICE;
1330 ctxt->count = 1;
1331 ctxt->pages[0] = p;
1332
1322 /* Prepare SGE for local address */ 1333 /* Prepare SGE for local address */
1323 sge.addr = ib_dma_map_single(xprt->sc_cm_id->device, 1334 ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device,
1324 page_address(p), PAGE_SIZE, DMA_FROM_DEVICE); 1335 p, 0, length, DMA_FROM_DEVICE);
1325 if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge.addr)) { 1336 if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) {
1326 put_page(p); 1337 put_page(p);
1327 return; 1338 return;
1328 } 1339 }
1329 atomic_inc(&xprt->sc_dma_used); 1340 atomic_inc(&xprt->sc_dma_used);
1330 sge.lkey = xprt->sc_dma_lkey; 1341 ctxt->sge[0].lkey = xprt->sc_dma_lkey;
1331 sge.length = length; 1342 ctxt->sge[0].length = length;
1332
1333 ctxt = svc_rdma_get_context(xprt);
1334 ctxt->count = 1;
1335 ctxt->pages[0] = p;
1336 1343
1337 /* Prepare SEND WR */ 1344 /* Prepare SEND WR */
1338 memset(&err_wr, 0, sizeof err_wr); 1345 memset(&err_wr, 0, sizeof err_wr);
1339 ctxt->wr_op = IB_WR_SEND; 1346 ctxt->wr_op = IB_WR_SEND;
1340 err_wr.wr_id = (unsigned long)ctxt; 1347 err_wr.wr_id = (unsigned long)ctxt;
1341 err_wr.sg_list = &sge; 1348 err_wr.sg_list = ctxt->sge;
1342 err_wr.num_sge = 1; 1349 err_wr.num_sge = 1;
1343 err_wr.opcode = IB_WR_SEND; 1350 err_wr.opcode = IB_WR_SEND;
1344 err_wr.send_flags = IB_SEND_SIGNALED; 1351 err_wr.send_flags = IB_SEND_SIGNALED;
@@ -1348,9 +1355,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
1348 if (ret) { 1355 if (ret) {
1349 dprintk("svcrdma: Error %d posting send for protocol error\n", 1356 dprintk("svcrdma: Error %d posting send for protocol error\n",
1350 ret); 1357 ret);
1351 ib_dma_unmap_single(xprt->sc_cm_id->device, 1358 svc_rdma_unmap_dma(ctxt);
1352 sge.addr, PAGE_SIZE,
1353 DMA_FROM_DEVICE);
1354 svc_rdma_put_context(ctxt, 1); 1359 svc_rdma_put_context(ctxt, 1);
1355 } 1360 }
1356} 1361}
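
Note on the svc_rdma_transport.c hunks: error paths now call svc_rdma_unmap_dma() before dropping the context, and svc_rdma_post_recv() records ctxt->count inside the mapping loop (sge_no + 1) so a failure partway through unwinds exactly the SGEs already mapped. The general pattern, sketched with plain allocations in place of DMA mappings:

#include <stdlib.h>

#define NBUF 8

/* Acquire up to NBUF resources; keep 'count' current so the error path can
 * release exactly the ones already acquired. */
static int acquire_all(void *bufs[NBUF], int *count)
{
	int i;

	*count = 0;
	for (i = 0; i < NBUF; i++) {
		bufs[i] = malloc(4096);
		if (bufs[i] == NULL)
			goto err;
		*count = i + 1;	/* updated as we go, not after the loop */
	}
	return 0;

err:
	while (*count > 0)
		free(bufs[--(*count)]);
	return -1;
}

int main(void)
{
	void *bufs[NBUF];
	int count;

	if (acquire_all(bufs, &count) == 0)
		while (count > 0)
			free(bufs[--count]);
	return 0;
}
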
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index a85e866a77f7..0867070bb5ca 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -237,8 +237,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
237 237
238 dprintk("RPC: %s: called\n", __func__); 238 dprintk("RPC: %s: called\n", __func__);
239 239
240 cancel_delayed_work(&r_xprt->rdma_connect); 240 cancel_delayed_work_sync(&r_xprt->rdma_connect);
241 flush_scheduled_work();
242 241
243 xprt_clear_connected(xprt); 242 xprt_clear_connected(xprt);
244 243
@@ -251,9 +250,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
251 250
252 xprt_rdma_free_addresses(xprt); 251 xprt_rdma_free_addresses(xprt);
253 252
254 kfree(xprt->slot); 253 xprt_free(xprt);
255 xprt->slot = NULL;
256 kfree(xprt);
257 254
258 dprintk("RPC: %s: returning\n", __func__); 255 dprintk("RPC: %s: returning\n", __func__);
259 256
@@ -285,23 +282,14 @@ xprt_setup_rdma(struct xprt_create *args)
285 return ERR_PTR(-EBADF); 282 return ERR_PTR(-EBADF);
286 } 283 }
287 284
288 xprt = kzalloc(sizeof(struct rpcrdma_xprt), GFP_KERNEL); 285 xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt),
286 xprt_rdma_slot_table_entries);
289 if (xprt == NULL) { 287 if (xprt == NULL) {
290 dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n", 288 dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n",
291 __func__); 289 __func__);
292 return ERR_PTR(-ENOMEM); 290 return ERR_PTR(-ENOMEM);
293 } 291 }
294 292
295 xprt->max_reqs = xprt_rdma_slot_table_entries;
296 xprt->slot = kcalloc(xprt->max_reqs,
297 sizeof(struct rpc_rqst), GFP_KERNEL);
298 if (xprt->slot == NULL) {
299 dprintk("RPC: %s: couldn't allocate %d slots\n",
300 __func__, xprt->max_reqs);
301 kfree(xprt);
302 return ERR_PTR(-ENOMEM);
303 }
304
305 /* 60 second timeout, no retries */ 293 /* 60 second timeout, no retries */
306 xprt->timeout = &xprt_rdma_default_timeout; 294 xprt->timeout = &xprt_rdma_default_timeout;
307 xprt->bind_timeout = (60U * HZ); 295 xprt->bind_timeout = (60U * HZ);
@@ -410,8 +398,7 @@ out3:
410out2: 398out2:
411 rpcrdma_ia_close(&new_xprt->rx_ia); 399 rpcrdma_ia_close(&new_xprt->rx_ia);
412out1: 400out1:
413 kfree(xprt->slot); 401 xprt_free(xprt);
414 kfree(xprt);
415 return ERR_PTR(rc); 402 return ERR_PTR(rc);
416} 403}
417 404
@@ -460,7 +447,7 @@ xprt_rdma_connect(struct rpc_task *task)
460 } else { 447 } else {
461 schedule_delayed_work(&r_xprt->rdma_connect, 0); 448 schedule_delayed_work(&r_xprt->rdma_connect, 0);
462 if (!RPC_IS_ASYNC(task)) 449 if (!RPC_IS_ASYNC(task))
463 flush_scheduled_work(); 450 flush_delayed_work(&r_xprt->rdma_connect);
464 } 451 }
465} 452}
466 453
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 27015c6d8eb5..5f4c7b3bc711 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -650,10 +650,22 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
650 ep->rep_attr.cap.max_send_wr = cdata->max_requests; 650 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
651 switch (ia->ri_memreg_strategy) { 651 switch (ia->ri_memreg_strategy) {
652 case RPCRDMA_FRMR: 652 case RPCRDMA_FRMR:
653 /* Add room for frmr register and invalidate WRs */ 653 /* Add room for frmr register and invalidate WRs.
654 ep->rep_attr.cap.max_send_wr *= 3; 654 * 1. FRMR reg WR for head
655 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) 655 * 2. FRMR invalidate WR for head
656 return -EINVAL; 656 * 3. FRMR reg WR for pagelist
657 * 4. FRMR invalidate WR for pagelist
658 * 5. FRMR reg WR for tail
659 * 6. FRMR invalidate WR for tail
660 * 7. The RDMA_SEND WR
661 */
662 ep->rep_attr.cap.max_send_wr *= 7;
663 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
664 cdata->max_requests = devattr.max_qp_wr / 7;
665 if (!cdata->max_requests)
666 return -EINVAL;
667 ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
668 }
657 break; 669 break;
658 case RPCRDMA_MEMWINDOWS_ASYNC: 670 case RPCRDMA_MEMWINDOWS_ASYNC:
659 case RPCRDMA_MEMWINDOWS: 671 case RPCRDMA_MEMWINDOWS:
@@ -1490,7 +1502,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1490 memset(&frmr_wr, 0, sizeof frmr_wr); 1502 memset(&frmr_wr, 0, sizeof frmr_wr);
1491 frmr_wr.opcode = IB_WR_FAST_REG_MR; 1503 frmr_wr.opcode = IB_WR_FAST_REG_MR;
1492 frmr_wr.send_flags = 0; /* unsignaled */ 1504 frmr_wr.send_flags = 0; /* unsignaled */
1493 frmr_wr.wr.fast_reg.iova_start = (unsigned long)seg1->mr_dma; 1505 frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1494 frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl; 1506 frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
1495 frmr_wr.wr.fast_reg.page_list_len = i; 1507 frmr_wr.wr.fast_reg.page_list_len = i;
1496 frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; 1508 frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
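
Note on the verbs.c hunk: the FRMR case now budgets up to seven work requests per RPC (register/invalidate pairs for head, page list and tail, plus the SEND itself) and, rather than failing outright, clamps the credit count to what the device's max_qp_wr allows. The sizing rule in isolation (the constant and names are for illustration only):

#include <stdio.h>

#define WRS_PER_RPC 7	/* 3 reg + 3 inval + 1 send, as in the FRMR case */

/* Return the credit limit to use, or 0 if even one RPC cannot fit. */
static unsigned int clamp_credits(unsigned int requested,
				  unsigned int device_max_qp_wr)
{
	unsigned int max_send_wr = requested * WRS_PER_RPC;

	if (max_send_wr > device_max_qp_wr)
		requested = device_max_qp_wr / WRS_PER_RPC;
	return requested;
}

int main(void)
{
	printf("%u\n", clamp_credits(32, 16384)); /* fits: stays 32 */
	printf("%u\n", clamp_credits(32, 100));   /* clamped to 14 */
	printf("%u\n", clamp_credits(32, 5));     /* 0: caller must fail */
	return 0;
}
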
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 49a62f0c4b87..dfcab5ac65af 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -774,8 +774,7 @@ static void xs_destroy(struct rpc_xprt *xprt)
774 774
775 xs_close(xprt); 775 xs_close(xprt);
776 xs_free_peer_addresses(xprt); 776 xs_free_peer_addresses(xprt);
777 kfree(xprt->slot); 777 xprt_free(xprt);
778 kfree(xprt);
779 module_put(THIS_MODULE); 778 module_put(THIS_MODULE);
780} 779}
781 780
@@ -800,7 +799,7 @@ static void xs_udp_data_ready(struct sock *sk, int len)
800 u32 _xid; 799 u32 _xid;
801 __be32 *xp; 800 __be32 *xp;
802 801
803 read_lock(&sk->sk_callback_lock); 802 read_lock_bh(&sk->sk_callback_lock);
804 dprintk("RPC: xs_udp_data_ready...\n"); 803 dprintk("RPC: xs_udp_data_ready...\n");
805 if (!(xprt = xprt_from_sock(sk))) 804 if (!(xprt = xprt_from_sock(sk)))
806 goto out; 805 goto out;
@@ -852,7 +851,7 @@ static void xs_udp_data_ready(struct sock *sk, int len)
852 dropit: 851 dropit:
853 skb_free_datagram(sk, skb); 852 skb_free_datagram(sk, skb);
854 out: 853 out:
855 read_unlock(&sk->sk_callback_lock); 854 read_unlock_bh(&sk->sk_callback_lock);
856} 855}
857 856
858static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc) 857static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc)
@@ -1229,7 +1228,7 @@ static void xs_tcp_data_ready(struct sock *sk, int bytes)
1229 1228
1230 dprintk("RPC: xs_tcp_data_ready...\n"); 1229 dprintk("RPC: xs_tcp_data_ready...\n");
1231 1230
1232 read_lock(&sk->sk_callback_lock); 1231 read_lock_bh(&sk->sk_callback_lock);
1233 if (!(xprt = xprt_from_sock(sk))) 1232 if (!(xprt = xprt_from_sock(sk)))
1234 goto out; 1233 goto out;
1235 if (xprt->shutdown) 1234 if (xprt->shutdown)
@@ -1248,7 +1247,7 @@ static void xs_tcp_data_ready(struct sock *sk, int bytes)
1248 read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); 1247 read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
1249 } while (read > 0); 1248 } while (read > 0);
1250out: 1249out:
1251 read_unlock(&sk->sk_callback_lock); 1250 read_unlock_bh(&sk->sk_callback_lock);
1252} 1251}
1253 1252
1254/* 1253/*
@@ -1301,18 +1300,19 @@ static void xs_tcp_state_change(struct sock *sk)
1301{ 1300{
1302 struct rpc_xprt *xprt; 1301 struct rpc_xprt *xprt;
1303 1302
1304 read_lock(&sk->sk_callback_lock); 1303 read_lock_bh(&sk->sk_callback_lock);
1305 if (!(xprt = xprt_from_sock(sk))) 1304 if (!(xprt = xprt_from_sock(sk)))
1306 goto out; 1305 goto out;
1307 dprintk("RPC: xs_tcp_state_change client %p...\n", xprt); 1306 dprintk("RPC: xs_tcp_state_change client %p...\n", xprt);
1308 dprintk("RPC: state %x conn %d dead %d zapped %d\n", 1307 dprintk("RPC: state %x conn %d dead %d zapped %d sk_shutdown %d\n",
1309 sk->sk_state, xprt_connected(xprt), 1308 sk->sk_state, xprt_connected(xprt),
1310 sock_flag(sk, SOCK_DEAD), 1309 sock_flag(sk, SOCK_DEAD),
1311 sock_flag(sk, SOCK_ZAPPED)); 1310 sock_flag(sk, SOCK_ZAPPED),
1311 sk->sk_shutdown);
1312 1312
1313 switch (sk->sk_state) { 1313 switch (sk->sk_state) {
1314 case TCP_ESTABLISHED: 1314 case TCP_ESTABLISHED:
1315 spin_lock_bh(&xprt->transport_lock); 1315 spin_lock(&xprt->transport_lock);
1316 if (!xprt_test_and_set_connected(xprt)) { 1316 if (!xprt_test_and_set_connected(xprt)) {
1317 struct sock_xprt *transport = container_of(xprt, 1317 struct sock_xprt *transport = container_of(xprt,
1318 struct sock_xprt, xprt); 1318 struct sock_xprt, xprt);
@@ -1326,7 +1326,7 @@ static void xs_tcp_state_change(struct sock *sk)
1326 1326
1327 xprt_wake_pending_tasks(xprt, -EAGAIN); 1327 xprt_wake_pending_tasks(xprt, -EAGAIN);
1328 } 1328 }
1329 spin_unlock_bh(&xprt->transport_lock); 1329 spin_unlock(&xprt->transport_lock);
1330 break; 1330 break;
1331 case TCP_FIN_WAIT1: 1331 case TCP_FIN_WAIT1:
1332 /* The client initiated a shutdown of the socket */ 1332 /* The client initiated a shutdown of the socket */
@@ -1364,7 +1364,7 @@ static void xs_tcp_state_change(struct sock *sk)
1364 xs_sock_mark_closed(xprt); 1364 xs_sock_mark_closed(xprt);
1365 } 1365 }
1366 out: 1366 out:
1367 read_unlock(&sk->sk_callback_lock); 1367 read_unlock_bh(&sk->sk_callback_lock);
1368} 1368}
1369 1369
1370/** 1370/**
@@ -1375,7 +1375,7 @@ static void xs_error_report(struct sock *sk)
1375{ 1375{
1376 struct rpc_xprt *xprt; 1376 struct rpc_xprt *xprt;
1377 1377
1378 read_lock(&sk->sk_callback_lock); 1378 read_lock_bh(&sk->sk_callback_lock);
1379 if (!(xprt = xprt_from_sock(sk))) 1379 if (!(xprt = xprt_from_sock(sk)))
1380 goto out; 1380 goto out;
1381 dprintk("RPC: %s client %p...\n" 1381 dprintk("RPC: %s client %p...\n"
@@ -1383,7 +1383,7 @@ static void xs_error_report(struct sock *sk)
1383 __func__, xprt, sk->sk_err); 1383 __func__, xprt, sk->sk_err);
1384 xprt_wake_pending_tasks(xprt, -EAGAIN); 1384 xprt_wake_pending_tasks(xprt, -EAGAIN);
1385out: 1385out:
1386 read_unlock(&sk->sk_callback_lock); 1386 read_unlock_bh(&sk->sk_callback_lock);
1387} 1387}
1388 1388
1389static void xs_write_space(struct sock *sk) 1389static void xs_write_space(struct sock *sk)
@@ -1415,13 +1415,13 @@ static void xs_write_space(struct sock *sk)
1415 */ 1415 */
1416static void xs_udp_write_space(struct sock *sk) 1416static void xs_udp_write_space(struct sock *sk)
1417{ 1417{
1418 read_lock(&sk->sk_callback_lock); 1418 read_lock_bh(&sk->sk_callback_lock);
1419 1419
1420 /* from net/core/sock.c:sock_def_write_space */ 1420 /* from net/core/sock.c:sock_def_write_space */
1421 if (sock_writeable(sk)) 1421 if (sock_writeable(sk))
1422 xs_write_space(sk); 1422 xs_write_space(sk);
1423 1423
1424 read_unlock(&sk->sk_callback_lock); 1424 read_unlock_bh(&sk->sk_callback_lock);
1425} 1425}
1426 1426
1427/** 1427/**
@@ -1436,13 +1436,13 @@ static void xs_udp_write_space(struct sock *sk)
1436 */ 1436 */
1437static void xs_tcp_write_space(struct sock *sk) 1437static void xs_tcp_write_space(struct sock *sk)
1438{ 1438{
1439 read_lock(&sk->sk_callback_lock); 1439 read_lock_bh(&sk->sk_callback_lock);
1440 1440
1441 /* from net/core/stream.c:sk_stream_write_space */ 1441 /* from net/core/stream.c:sk_stream_write_space */
1442 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) 1442 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
1443 xs_write_space(sk); 1443 xs_write_space(sk);
1444 1444
1445 read_unlock(&sk->sk_callback_lock); 1445 read_unlock_bh(&sk->sk_callback_lock);
1446} 1446}
1447 1447
1448static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt) 1448static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt)
@@ -1515,7 +1515,7 @@ static void xs_set_port(struct rpc_xprt *xprt, unsigned short port)
1515 xs_update_peer_port(xprt); 1515 xs_update_peer_port(xprt);
1516} 1516}
1517 1517
1518static unsigned short xs_get_srcport(struct sock_xprt *transport, struct socket *sock) 1518static unsigned short xs_get_srcport(struct sock_xprt *transport)
1519{ 1519{
1520 unsigned short port = transport->srcport; 1520 unsigned short port = transport->srcport;
1521 1521
@@ -1524,7 +1524,7 @@ static unsigned short xs_get_srcport(struct sock_xprt *transport, struct socket
1524 return port; 1524 return port;
1525} 1525}
1526 1526
1527static unsigned short xs_next_srcport(struct sock_xprt *transport, struct socket *sock, unsigned short port) 1527static unsigned short xs_next_srcport(struct sock_xprt *transport, unsigned short port)
1528{ 1528{
1529 if (transport->srcport != 0) 1529 if (transport->srcport != 0)
1530 transport->srcport = 0; 1530 transport->srcport = 0;
@@ -1534,23 +1534,18 @@ static unsigned short xs_next_srcport(struct sock_xprt *transport, struct socket
1534 return xprt_max_resvport; 1534 return xprt_max_resvport;
1535 return --port; 1535 return --port;
1536} 1536}
1537 1537static int xs_bind(struct sock_xprt *transport, struct socket *sock)
1538static int xs_bind4(struct sock_xprt *transport, struct socket *sock)
1539{ 1538{
1540 struct sockaddr_in myaddr = { 1539 struct sockaddr_storage myaddr;
1541 .sin_family = AF_INET,
1542 };
1543 struct sockaddr_in *sa;
1544 int err, nloop = 0; 1540 int err, nloop = 0;
1545 unsigned short port = xs_get_srcport(transport, sock); 1541 unsigned short port = xs_get_srcport(transport);
1546 unsigned short last; 1542 unsigned short last;
1547 1543
1548 sa = (struct sockaddr_in *)&transport->srcaddr; 1544 memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen);
1549 myaddr.sin_addr = sa->sin_addr;
1550 do { 1545 do {
1551 myaddr.sin_port = htons(port); 1546 rpc_set_port((struct sockaddr *)&myaddr, port);
1552 err = kernel_bind(sock, (struct sockaddr *) &myaddr, 1547 err = kernel_bind(sock, (struct sockaddr *)&myaddr,
1553 sizeof(myaddr)); 1548 transport->xprt.addrlen);
1554 if (port == 0) 1549 if (port == 0)
1555 break; 1550 break;
1556 if (err == 0) { 1551 if (err == 0) {
@@ -1558,48 +1553,23 @@ static int xs_bind4(struct sock_xprt *transport, struct socket *sock)
1558 break; 1553 break;
1559 } 1554 }
1560 last = port; 1555 last = port;
1561 port = xs_next_srcport(transport, sock, port); 1556 port = xs_next_srcport(transport, port);
1562 if (port > last) 1557 if (port > last)
1563 nloop++; 1558 nloop++;
1564 } while (err == -EADDRINUSE && nloop != 2); 1559 } while (err == -EADDRINUSE && nloop != 2);
1565 dprintk("RPC: %s %pI4:%u: %s (%d)\n",
1566 __func__, &myaddr.sin_addr,
1567 port, err ? "failed" : "ok", err);
1568 return err;
1569}
1570
1571static int xs_bind6(struct sock_xprt *transport, struct socket *sock)
1572{
1573 struct sockaddr_in6 myaddr = {
1574 .sin6_family = AF_INET6,
1575 };
1576 struct sockaddr_in6 *sa;
1577 int err, nloop = 0;
1578 unsigned short port = xs_get_srcport(transport, sock);
1579 unsigned short last;
1580 1560
1581 sa = (struct sockaddr_in6 *)&transport->srcaddr; 1561 if (myaddr.ss_family == AF_INET)
1582 myaddr.sin6_addr = sa->sin6_addr; 1562 dprintk("RPC: %s %pI4:%u: %s (%d)\n", __func__,
1583 do { 1563 &((struct sockaddr_in *)&myaddr)->sin_addr,
1584 myaddr.sin6_port = htons(port); 1564 port, err ? "failed" : "ok", err);
1585 err = kernel_bind(sock, (struct sockaddr *) &myaddr, 1565 else
1586 sizeof(myaddr)); 1566 dprintk("RPC: %s %pI6:%u: %s (%d)\n", __func__,
1587 if (port == 0) 1567 &((struct sockaddr_in6 *)&myaddr)->sin6_addr,
1588 break; 1568 port, err ? "failed" : "ok", err);
1589 if (err == 0) {
1590 transport->srcport = port;
1591 break;
1592 }
1593 last = port;
1594 port = xs_next_srcport(transport, sock, port);
1595 if (port > last)
1596 nloop++;
1597 } while (err == -EADDRINUSE && nloop != 2);
1598 dprintk("RPC: xs_bind6 %pI6:%u: %s (%d)\n",
1599 &myaddr.sin6_addr, port, err ? "failed" : "ok", err);
1600 return err; 1569 return err;
1601} 1570}
1602 1571
1572
1603#ifdef CONFIG_DEBUG_LOCK_ALLOC 1573#ifdef CONFIG_DEBUG_LOCK_ALLOC
1604static struct lock_class_key xs_key[2]; 1574static struct lock_class_key xs_key[2];
1605static struct lock_class_key xs_slock_key[2]; 1575static struct lock_class_key xs_slock_key[2];
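
Note on the hunk above: xs_bind4()/xs_bind6() collapse into one xs_bind() that works on a sockaddr_storage and sets the port via rpc_set_port(), keeping the descending port-probe loop. A runnable userspace sketch of the probe-downwards-until-bind-succeeds idea (port range, address family and error handling simplified relative to the kernel code):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Try to bind 'fd' to 'port', then port-1, ..., down to 'lowest'.  The real
 * code also wraps around the range and retries only on EADDRINUSE. */
static int bind_some_port(int fd, unsigned short port, unsigned short lowest)
{
	struct sockaddr_in addr;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

	for (; port >= lowest; port--) {
		addr.sin_port = htons(port);
		if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) == 0)
			return port;
	}
	return -1;
}

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int port = bind_some_port(fd, 61010, 61000);

	printf("bound to %d\n", port);
	if (fd >= 0)
		close(fd);
	return 0;
}
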
@@ -1621,6 +1591,18 @@ static inline void xs_reclassify_socket6(struct socket *sock)
1621 sock_lock_init_class_and_name(sk, "slock-AF_INET6-RPC", 1591 sock_lock_init_class_and_name(sk, "slock-AF_INET6-RPC",
1622 &xs_slock_key[1], "sk_lock-AF_INET6-RPC", &xs_key[1]); 1592 &xs_slock_key[1], "sk_lock-AF_INET6-RPC", &xs_key[1]);
1623} 1593}
1594
1595static inline void xs_reclassify_socket(int family, struct socket *sock)
1596{
1597 switch (family) {
1598 case AF_INET:
1599 xs_reclassify_socket4(sock);
1600 break;
1601 case AF_INET6:
1602 xs_reclassify_socket6(sock);
1603 break;
1604 }
1605}
1624#else 1606#else
1625static inline void xs_reclassify_socket4(struct socket *sock) 1607static inline void xs_reclassify_socket4(struct socket *sock)
1626{ 1608{
@@ -1629,8 +1611,36 @@ static inline void xs_reclassify_socket4(struct socket *sock)
1629static inline void xs_reclassify_socket6(struct socket *sock) 1611static inline void xs_reclassify_socket6(struct socket *sock)
1630{ 1612{
1631} 1613}
1614
1615static inline void xs_reclassify_socket(int family, struct socket *sock)
1616{
1617}
1632#endif 1618#endif
1633 1619
1620static struct socket *xs_create_sock(struct rpc_xprt *xprt,
1621 struct sock_xprt *transport, int family, int type, int protocol)
1622{
1623 struct socket *sock;
1624 int err;
1625
1626 err = __sock_create(xprt->xprt_net, family, type, protocol, &sock, 1);
1627 if (err < 0) {
1628 dprintk("RPC: can't create %d transport socket (%d).\n",
1629 protocol, -err);
1630 goto out;
1631 }
1632 xs_reclassify_socket(family, sock);
1633
1634 if (xs_bind(transport, sock)) {
1635 sock_release(sock);
1636 goto out;
1637 }
1638
1639 return sock;
1640out:
1641 return ERR_PTR(err);
1642}
1643
1634static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) 1644static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
1635{ 1645{
1636 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); 1646 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
@@ -1660,82 +1670,23 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
1660 xs_udp_do_set_buffer_size(xprt); 1670 xs_udp_do_set_buffer_size(xprt);
1661} 1671}
1662 1672
1663/** 1673static void xs_udp_setup_socket(struct work_struct *work)
1664 * xs_udp_connect_worker4 - set up a UDP socket
1665 * @work: RPC transport to connect
1666 *
1667 * Invoked by a work queue tasklet.
1668 */
1669static void xs_udp_connect_worker4(struct work_struct *work)
1670{ 1674{
1671 struct sock_xprt *transport = 1675 struct sock_xprt *transport =
1672 container_of(work, struct sock_xprt, connect_worker.work); 1676 container_of(work, struct sock_xprt, connect_worker.work);
1673 struct rpc_xprt *xprt = &transport->xprt; 1677 struct rpc_xprt *xprt = &transport->xprt;
1674 struct socket *sock = transport->sock; 1678 struct socket *sock = transport->sock;
1675 int err, status = -EIO; 1679 int status = -EIO;
1676
1677 if (xprt->shutdown)
1678 goto out;
1679
1680 /* Start by resetting any existing state */
1681 xs_reset_transport(transport);
1682
1683 err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
1684 if (err < 0) {
1685 dprintk("RPC: can't create UDP transport socket (%d).\n", -err);
1686 goto out;
1687 }
1688 xs_reclassify_socket4(sock);
1689
1690 if (xs_bind4(transport, sock)) {
1691 sock_release(sock);
1692 goto out;
1693 }
1694
1695 dprintk("RPC: worker connecting xprt %p via %s to "
1696 "%s (port %s)\n", xprt,
1697 xprt->address_strings[RPC_DISPLAY_PROTO],
1698 xprt->address_strings[RPC_DISPLAY_ADDR],
1699 xprt->address_strings[RPC_DISPLAY_PORT]);
1700
1701 xs_udp_finish_connecting(xprt, sock);
1702 status = 0;
1703out:
1704 xprt_clear_connecting(xprt);
1705 xprt_wake_pending_tasks(xprt, status);
1706}
1707
1708/**
1709 * xs_udp_connect_worker6 - set up a UDP socket
1710 * @work: RPC transport to connect
1711 *
1712 * Invoked by a work queue tasklet.
1713 */
1714static void xs_udp_connect_worker6(struct work_struct *work)
1715{
1716 struct sock_xprt *transport =
1717 container_of(work, struct sock_xprt, connect_worker.work);
1718 struct rpc_xprt *xprt = &transport->xprt;
1719 struct socket *sock = transport->sock;
1720 int err, status = -EIO;
1721 1680
1722 if (xprt->shutdown) 1681 if (xprt->shutdown)
1723 goto out; 1682 goto out;
1724 1683
1725 /* Start by resetting any existing state */ 1684 /* Start by resetting any existing state */
1726 xs_reset_transport(transport); 1685 xs_reset_transport(transport);
1727 1686 sock = xs_create_sock(xprt, transport,
1728 err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock); 1687 xs_addr(xprt)->sa_family, SOCK_DGRAM, IPPROTO_UDP);
1729 if (err < 0) { 1688 if (IS_ERR(sock))
1730 dprintk("RPC: can't create UDP transport socket (%d).\n", -err);
1731 goto out; 1689 goto out;
1732 }
1733 xs_reclassify_socket6(sock);
1734
1735 if (xs_bind6(transport, sock) < 0) {
1736 sock_release(sock);
1737 goto out;
1738 }
1739 1690
1740 dprintk("RPC: worker connecting xprt %p via %s to " 1691 dprintk("RPC: worker connecting xprt %p via %s to "
1741 "%s (port %s)\n", xprt, 1692 "%s (port %s)\n", xprt,
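
Note on the hunk above: the separate IPv4 and IPv6 UDP connect workers merge into xs_udp_setup_socket(), which asks the new xs_create_sock() for a socket whose family comes from the stored destination address rather than being baked into the function. A tiny illustration of creating a socket keyed off ss_family (the address filled in here is arbitrary):

#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* One family-agnostic creator instead of parallel IPv4/IPv6 copies: the
 * family is taken from the destination address captured earlier. */
static int create_dgram_sock(const struct sockaddr_storage *dst)
{
	return socket(dst->ss_family, SOCK_DGRAM, 0);
}

int main(void)
{
	struct sockaddr_storage dst;
	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&dst;
	int fd;

	memset(&dst, 0, sizeof(dst));
	sin6->sin6_family = AF_INET6;
	sin6->sin6_addr = in6addr_loopback;
	sin6->sin6_port = htons(2049);

	fd = create_dgram_sock(&dst);
	printf("socket fd %d (family %d)\n", fd, dst.ss_family);
	if (fd >= 0)
		close(fd);
	return 0;
}
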
@@ -1754,12 +1705,12 @@ out:
1754 * We need to preserve the port number so the reply cache on the server can 1705 * We need to preserve the port number so the reply cache on the server can
1755 * find our cached RPC replies when we get around to reconnecting. 1706 * find our cached RPC replies when we get around to reconnecting.
1756 */ 1707 */
1757static void xs_abort_connection(struct rpc_xprt *xprt, struct sock_xprt *transport) 1708static void xs_abort_connection(struct sock_xprt *transport)
1758{ 1709{
1759 int result; 1710 int result;
1760 struct sockaddr any; 1711 struct sockaddr any;
1761 1712
1762 dprintk("RPC: disconnecting xprt %p to reuse port\n", xprt); 1713 dprintk("RPC: disconnecting xprt %p to reuse port\n", transport);
1763 1714
1764 /* 1715 /*
1765 * Disconnect the transport socket by doing a connect operation 1716 * Disconnect the transport socket by doing a connect operation
@@ -1769,21 +1720,36 @@ static void xs_abort_connection(struct rpc_xprt *xprt, struct sock_xprt *transpo
1769 any.sa_family = AF_UNSPEC; 1720 any.sa_family = AF_UNSPEC;
1770 result = kernel_connect(transport->sock, &any, sizeof(any), 0); 1721 result = kernel_connect(transport->sock, &any, sizeof(any), 0);
1771 if (!result) 1722 if (!result)
1772 xs_sock_mark_closed(xprt); 1723 xs_sock_mark_closed(&transport->xprt);
1773 else 1724 else
1774 dprintk("RPC: AF_UNSPEC connect return code %d\n", 1725 dprintk("RPC: AF_UNSPEC connect return code %d\n",
1775 result); 1726 result);
1776} 1727}
1777 1728
1778static void xs_tcp_reuse_connection(struct rpc_xprt *xprt, struct sock_xprt *transport) 1729static void xs_tcp_reuse_connection(struct sock_xprt *transport)
1779{ 1730{
1780 unsigned int state = transport->inet->sk_state; 1731 unsigned int state = transport->inet->sk_state;
1781 1732
1782 if (state == TCP_CLOSE && transport->sock->state == SS_UNCONNECTED) 1733 if (state == TCP_CLOSE && transport->sock->state == SS_UNCONNECTED) {
1783 return; 1734 /* we don't need to abort the connection if the socket
1784 if ((1 << state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT)) 1735 * hasn't undergone a shutdown
1785 return; 1736 */
1786 xs_abort_connection(xprt, transport); 1737 if (transport->inet->sk_shutdown == 0)
1738 return;
1739 dprintk("RPC: %s: TCP_CLOSEd and sk_shutdown set to %d\n",
1740 __func__, transport->inet->sk_shutdown);
1741 }
1742 if ((1 << state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT)) {
1743 /* we don't need to abort the connection if the socket
1744 * hasn't undergone a shutdown
1745 */
1746 if (transport->inet->sk_shutdown == 0)
1747 return;
1748 dprintk("RPC: %s: ESTABLISHED/SYN_SENT "
1749 "sk_shutdown set to %d\n",
1750 __func__, transport->inet->sk_shutdown);
1751 }
1752 xs_abort_connection(transport);
1787} 1753}
1788 1754
1789static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) 1755static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
@@ -1836,12 +1802,12 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
1836 * 1802 *
1837 * Invoked by a work queue tasklet. 1803 * Invoked by a work queue tasklet.
1838 */ 1804 */
1839static void xs_tcp_setup_socket(struct rpc_xprt *xprt, 1805static void xs_tcp_setup_socket(struct work_struct *work)
1840 struct sock_xprt *transport,
1841 struct socket *(*create_sock)(struct rpc_xprt *,
1842 struct sock_xprt *))
1843{ 1806{
1807 struct sock_xprt *transport =
1808 container_of(work, struct sock_xprt, connect_worker.work);
1844 struct socket *sock = transport->sock; 1809 struct socket *sock = transport->sock;
1810 struct rpc_xprt *xprt = &transport->xprt;
1845 int status = -EIO; 1811 int status = -EIO;
1846 1812
1847 if (xprt->shutdown) 1813 if (xprt->shutdown)
@@ -1849,7 +1815,8 @@ static void xs_tcp_setup_socket(struct rpc_xprt *xprt,
1849 1815
1850 if (!sock) { 1816 if (!sock) {
1851 clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); 1817 clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
1852 sock = create_sock(xprt, transport); 1818 sock = xs_create_sock(xprt, transport,
1819 xs_addr(xprt)->sa_family, SOCK_STREAM, IPPROTO_TCP);
1853 if (IS_ERR(sock)) { 1820 if (IS_ERR(sock)) {
1854 status = PTR_ERR(sock); 1821 status = PTR_ERR(sock);
1855 goto out; 1822 goto out;
@@ -1860,7 +1827,7 @@ static void xs_tcp_setup_socket(struct rpc_xprt *xprt,
1860 abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT, 1827 abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT,
1861 &xprt->state); 1828 &xprt->state);
1862 /* "close" the socket, preserving the local port */ 1829 /* "close" the socket, preserving the local port */
1863 xs_tcp_reuse_connection(xprt, transport); 1830 xs_tcp_reuse_connection(transport);
1864 1831
1865 if (abort_and_exit) 1832 if (abort_and_exit)
1866 goto out_eagain; 1833 goto out_eagain;
@@ -1909,84 +1876,6 @@ out:
1909 xprt_wake_pending_tasks(xprt, status); 1876 xprt_wake_pending_tasks(xprt, status);
1910} 1877}
1911 1878
1912static struct socket *xs_create_tcp_sock4(struct rpc_xprt *xprt,
1913 struct sock_xprt *transport)
1914{
1915 struct socket *sock;
1916 int err;
1917
1918 /* start from scratch */
1919 err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
1920 if (err < 0) {
1921 dprintk("RPC: can't create TCP transport socket (%d).\n",
1922 -err);
1923 goto out_err;
1924 }
1925 xs_reclassify_socket4(sock);
1926
1927 if (xs_bind4(transport, sock) < 0) {
1928 sock_release(sock);
1929 goto out_err;
1930 }
1931 return sock;
1932out_err:
1933 return ERR_PTR(-EIO);
1934}
1935
1936/**
1937 * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint
1938 * @work: RPC transport to connect
1939 *
1940 * Invoked by a work queue tasklet.
1941 */
1942static void xs_tcp_connect_worker4(struct work_struct *work)
1943{
1944 struct sock_xprt *transport =
1945 container_of(work, struct sock_xprt, connect_worker.work);
1946 struct rpc_xprt *xprt = &transport->xprt;
1947
1948 xs_tcp_setup_socket(xprt, transport, xs_create_tcp_sock4);
1949}
1950
1951static struct socket *xs_create_tcp_sock6(struct rpc_xprt *xprt,
1952 struct sock_xprt *transport)
1953{
1954 struct socket *sock;
1955 int err;
1956
1957 /* start from scratch */
1958 err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock);
1959 if (err < 0) {
1960 dprintk("RPC: can't create TCP transport socket (%d).\n",
1961 -err);
1962 goto out_err;
1963 }
1964 xs_reclassify_socket6(sock);
1965
1966 if (xs_bind6(transport, sock) < 0) {
1967 sock_release(sock);
1968 goto out_err;
1969 }
1970 return sock;
1971out_err:
1972 return ERR_PTR(-EIO);
1973}
1974
1975/**
1976 * xs_tcp_connect_worker6 - connect a TCP socket to a remote endpoint
1977 * @work: RPC transport to connect
1978 *
1979 * Invoked by a work queue tasklet.
1980 */
1981static void xs_tcp_connect_worker6(struct work_struct *work)
1982{
1983 struct sock_xprt *transport =
1984 container_of(work, struct sock_xprt, connect_worker.work);
1985 struct rpc_xprt *xprt = &transport->xprt;
1986
1987 xs_tcp_setup_socket(xprt, transport, xs_create_tcp_sock6);
1988}
1989
1990/** 1879/**
1991 * xs_connect - connect a socket to a remote endpoint 1880 * xs_connect - connect a socket to a remote endpoint
1992 * @task: address of RPC task that manages state of connect request 1881 * @task: address of RPC task that manages state of connect request
@@ -2246,6 +2135,31 @@ static struct rpc_xprt_ops bc_tcp_ops = {
2246 .print_stats = xs_tcp_print_stats, 2135 .print_stats = xs_tcp_print_stats,
2247}; 2136};
2248 2137
2138static int xs_init_anyaddr(const int family, struct sockaddr *sap)
2139{
2140 static const struct sockaddr_in sin = {
2141 .sin_family = AF_INET,
2142 .sin_addr.s_addr = htonl(INADDR_ANY),
2143 };
2144 static const struct sockaddr_in6 sin6 = {
2145 .sin6_family = AF_INET6,
2146 .sin6_addr = IN6ADDR_ANY_INIT,
2147 };
2148
2149 switch (family) {
2150 case AF_INET:
2151 memcpy(sap, &sin, sizeof(sin));
2152 break;
2153 case AF_INET6:
2154 memcpy(sap, &sin6, sizeof(sin6));
2155 break;
2156 default:
2157 dprintk("RPC: %s: Bad address family\n", __func__);
2158 return -EAFNOSUPPORT;
2159 }
2160 return 0;
2161}
2162
2249static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args, 2163static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
2250 unsigned int slot_table_size) 2164 unsigned int slot_table_size)
2251{ 2165{
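
Note on the hunk above: xs_init_anyaddr() fills in a wildcard source address when the caller did not supply one, so the later bind can let the stack pick the source. A userspace rendering of the same switch (essentially the logic from the hunk, minus the RPC-specific types):

#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

/* Fill 'sap' with the wildcard ("any") address for the given family. */
static int init_anyaddr(int family, struct sockaddr_storage *sap)
{
	static const struct sockaddr_in sin = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = INADDR_ANY,
	};
	static const struct sockaddr_in6 sin6 = {
		.sin6_family = AF_INET6,
		.sin6_addr = IN6ADDR_ANY_INIT,
	};

	switch (family) {
	case AF_INET:
		memcpy(sap, &sin, sizeof(sin));
		return 0;
	case AF_INET6:
		memcpy(sap, &sin6, sizeof(sin6));
		return 0;
	default:
		return -1;
	}
}

int main(void)
{
	struct sockaddr_storage ss;

	printf("AF_INET: %d\n", init_anyaddr(AF_INET, &ss));
	printf("AF_UNIX: %d\n", init_anyaddr(AF_UNIX, &ss));
	return 0;
}
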
@@ -2257,27 +2171,25 @@ static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
2257 return ERR_PTR(-EBADF); 2171 return ERR_PTR(-EBADF);
2258 } 2172 }
2259 2173
2260 new = kzalloc(sizeof(*new), GFP_KERNEL); 2174 xprt = xprt_alloc(args->net, sizeof(*new), slot_table_size);
2261 if (new == NULL) { 2175 if (xprt == NULL) {
2262 dprintk("RPC: xs_setup_xprt: couldn't allocate " 2176 dprintk("RPC: xs_setup_xprt: couldn't allocate "
2263 "rpc_xprt\n"); 2177 "rpc_xprt\n");
2264 return ERR_PTR(-ENOMEM); 2178 return ERR_PTR(-ENOMEM);
2265 } 2179 }
2266 xprt = &new->xprt;
2267
2268 xprt->max_reqs = slot_table_size;
2269 xprt->slot = kcalloc(xprt->max_reqs, sizeof(struct rpc_rqst), GFP_KERNEL);
2270 if (xprt->slot == NULL) {
2271 kfree(xprt);
2272 dprintk("RPC: xs_setup_xprt: couldn't allocate slot "
2273 "table\n");
2274 return ERR_PTR(-ENOMEM);
2275 }
2276 2180
2181 new = container_of(xprt, struct sock_xprt, xprt);
2277 memcpy(&xprt->addr, args->dstaddr, args->addrlen); 2182 memcpy(&xprt->addr, args->dstaddr, args->addrlen);
2278 xprt->addrlen = args->addrlen; 2183 xprt->addrlen = args->addrlen;
2279 if (args->srcaddr) 2184 if (args->srcaddr)
2280 memcpy(&new->srcaddr, args->srcaddr, args->addrlen); 2185 memcpy(&new->srcaddr, args->srcaddr, args->addrlen);
2186 else {
2187 int err;
2188 err = xs_init_anyaddr(args->dstaddr->sa_family,
2189 (struct sockaddr *)&new->srcaddr);
2190 if (err != 0)
2191 return ERR_PTR(err);
2192 }
2281 2193
2282 return xprt; 2194 return xprt;
2283} 2195}
@@ -2325,7 +2237,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
2325 xprt_set_bound(xprt); 2237 xprt_set_bound(xprt);
2326 2238
2327 INIT_DELAYED_WORK(&transport->connect_worker, 2239 INIT_DELAYED_WORK(&transport->connect_worker,
2328 xs_udp_connect_worker4); 2240 xs_udp_setup_socket);
2329 xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP); 2241 xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP);
2330 break; 2242 break;
2331 case AF_INET6: 2243 case AF_INET6:
@@ -2333,7 +2245,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
2333 xprt_set_bound(xprt); 2245 xprt_set_bound(xprt);
2334 2246
2335 INIT_DELAYED_WORK(&transport->connect_worker, 2247 INIT_DELAYED_WORK(&transport->connect_worker,
2336 xs_udp_connect_worker6); 2248 xs_udp_setup_socket);
2337 xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP6); 2249 xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP6);
2338 break; 2250 break;
2339 default: 2251 default:
@@ -2355,8 +2267,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
2355 return xprt; 2267 return xprt;
2356 ret = ERR_PTR(-EINVAL); 2268 ret = ERR_PTR(-EINVAL);
2357out_err: 2269out_err:
2358 kfree(xprt->slot); 2270 xprt_free(xprt);
2359 kfree(xprt);
2360 return ret; 2271 return ret;
2361} 2272}
2362 2273
@@ -2400,7 +2311,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
2400 xprt_set_bound(xprt); 2311 xprt_set_bound(xprt);
2401 2312
2402 INIT_DELAYED_WORK(&transport->connect_worker, 2313 INIT_DELAYED_WORK(&transport->connect_worker,
2403 xs_tcp_connect_worker4); 2314 xs_tcp_setup_socket);
2404 xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP); 2315 xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP);
2405 break; 2316 break;
2406 case AF_INET6: 2317 case AF_INET6:
@@ -2408,7 +2319,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
2408 xprt_set_bound(xprt); 2319 xprt_set_bound(xprt);
2409 2320
2410 INIT_DELAYED_WORK(&transport->connect_worker, 2321 INIT_DELAYED_WORK(&transport->connect_worker,
2411 xs_tcp_connect_worker6); 2322 xs_tcp_setup_socket);
2412 xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6); 2323 xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6);
2413 break; 2324 break;
2414 default: 2325 default:
@@ -2431,8 +2342,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
2431 return xprt; 2342 return xprt;
2432 ret = ERR_PTR(-EINVAL); 2343 ret = ERR_PTR(-EINVAL);
2433out_err: 2344out_err:
2434 kfree(xprt->slot); 2345 xprt_free(xprt);
2435 kfree(xprt);
2436 return ret; 2346 return ret;
2437} 2347}
2438 2348
@@ -2491,15 +2401,10 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
2491 goto out_err; 2401 goto out_err;
2492 } 2402 }
2493 2403
2494 if (xprt_bound(xprt)) 2404 dprintk("RPC: set up xprt to %s (port %s) via %s\n",
2495 dprintk("RPC: set up xprt to %s (port %s) via %s\n", 2405 xprt->address_strings[RPC_DISPLAY_ADDR],
2496 xprt->address_strings[RPC_DISPLAY_ADDR], 2406 xprt->address_strings[RPC_DISPLAY_PORT],
2497 xprt->address_strings[RPC_DISPLAY_PORT], 2407 xprt->address_strings[RPC_DISPLAY_PROTO]);
2498 xprt->address_strings[RPC_DISPLAY_PROTO]);
2499 else
2500 dprintk("RPC: set up xprt to %s (autobind) via %s\n",
2501 xprt->address_strings[RPC_DISPLAY_ADDR],
2502 xprt->address_strings[RPC_DISPLAY_PROTO]);
2503 2408
2504 /* 2409 /*
2505 * Since we don't want connections for the backchannel, we set 2410 * Since we don't want connections for the backchannel, we set
@@ -2512,8 +2417,7 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
2512 return xprt; 2417 return xprt;
2513 ret = ERR_PTR(-EINVAL); 2418 ret = ERR_PTR(-EINVAL);
2514out_err: 2419out_err:
2515 kfree(xprt->slot); 2420 xprt_free(xprt);
2516 kfree(xprt);
2517 return ret; 2421 return ret;
2518} 2422}
2519 2423
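
Note: the xprtsock.c hunks above replace the transport's open-coded kzalloc()/kcalloc() of the rpc_xprt and its request-slot table with the xprt_alloc()/xprt_free() helpers, and recover the enclosing sock_xprt from the embedded rpc_xprt via container_of(). A minimal user-space sketch of that embedded-struct pattern follows; the *_demo names are illustrative stand-ins, not the kernel definitions.

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

/* container_of() as used in the kernel: recover the outer struct
 * from a pointer to one of its members. */
#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct rpc_xprt_demo {             /* stand-in for struct rpc_xprt */
        size_t addrlen;
};

struct sock_xprt_demo {            /* stand-in for struct sock_xprt */
        struct rpc_xprt_demo xprt; /* embedded generic transport */
        int srcport;
};

/* Stand-in for xprt_alloc(): allocate the outer object, hand back
 * a pointer to the embedded generic part. */
static struct rpc_xprt_demo *xprt_alloc_demo(void)
{
        struct sock_xprt_demo *new = calloc(1, sizeof(*new));

        return new ? &new->xprt : NULL;
}

int main(void)
{
        struct rpc_xprt_demo *xprt = xprt_alloc_demo();
        struct sock_xprt_demo *new;

        if (!xprt)
                return 1;
        new = container_of(xprt, struct sock_xprt_demo, xprt);
        new->srcport = 0;
        printf("outer=%p inner=%p\n", (void *)new, (void *)xprt);
        free(new);                 /* models xprt_free() on the error path */
        return 0;
}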
diff --git a/net/tipc/addr.c b/net/tipc/addr.c
index c048543ffbeb..8a2e89bffde5 100644
--- a/net/tipc/addr.c
+++ b/net/tipc/addr.c
@@ -41,11 +41,6 @@
41#include "cluster.h" 41#include "cluster.h"
42#include "net.h" 42#include "net.h"
43 43
44u32 tipc_get_addr(void)
45{
46 return tipc_own_addr;
47}
48
49/** 44/**
50 * tipc_addr_domain_valid - validates a network domain address 45 * tipc_addr_domain_valid - validates a network domain address
51 * 46 *
@@ -89,7 +84,7 @@ int tipc_addr_domain_valid(u32 addr)
89 84
90int tipc_addr_node_valid(u32 addr) 85int tipc_addr_node_valid(u32 addr)
91{ 86{
92 return (tipc_addr_domain_valid(addr) && tipc_node(addr)); 87 return tipc_addr_domain_valid(addr) && tipc_node(addr);
93} 88}
94 89
95int tipc_in_scope(u32 domain, u32 addr) 90int tipc_in_scope(u32 domain, u32 addr)
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index a008c6689305..22a60fc98392 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -121,6 +121,9 @@ static DEFINE_SPINLOCK(bc_lock);
121 121
122const char tipc_bclink_name[] = "broadcast-link"; 122const char tipc_bclink_name[] = "broadcast-link";
123 123
124static void tipc_nmap_diff(struct tipc_node_map *nm_a,
125 struct tipc_node_map *nm_b,
126 struct tipc_node_map *nm_diff);
124 127
125static u32 buf_seqno(struct sk_buff *buf) 128static u32 buf_seqno(struct sk_buff *buf)
126{ 129{
@@ -143,6 +146,19 @@ static void bcbuf_decr_acks(struct sk_buff *buf)
143} 146}
144 147
145 148
149static void bclink_set_last_sent(void)
150{
151 if (bcl->next_out)
152 bcl->fsm_msg_cnt = mod(buf_seqno(bcl->next_out) - 1);
153 else
154 bcl->fsm_msg_cnt = mod(bcl->next_out_no - 1);
155}
156
157u32 tipc_bclink_get_last_sent(void)
158{
159 return bcl->fsm_msg_cnt;
160}
161
146/** 162/**
147 * bclink_set_gap - set gap according to contents of current deferred pkt queue 163 * bclink_set_gap - set gap according to contents of current deferred pkt queue
148 * 164 *
@@ -171,7 +187,7 @@ static void bclink_set_gap(struct tipc_node *n_ptr)
171 187
172static int bclink_ack_allowed(u32 n) 188static int bclink_ack_allowed(u32 n)
173{ 189{
174 return((n % TIPC_MIN_LINK_WIN) == tipc_own_tag); 190 return (n % TIPC_MIN_LINK_WIN) == tipc_own_tag;
175} 191}
176 192
177 193
@@ -237,8 +253,10 @@ void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked)
237 253
238 /* Try resolving broadcast link congestion, if necessary */ 254 /* Try resolving broadcast link congestion, if necessary */
239 255
240 if (unlikely(bcl->next_out)) 256 if (unlikely(bcl->next_out)) {
241 tipc_link_push_queue(bcl); 257 tipc_link_push_queue(bcl);
258 bclink_set_last_sent();
259 }
242 if (unlikely(released && !list_empty(&bcl->waiting_ports))) 260 if (unlikely(released && !list_empty(&bcl->waiting_ports)))
243 tipc_link_wakeup_ports(bcl, 0); 261 tipc_link_wakeup_ports(bcl, 0);
244 spin_unlock_bh(&bc_lock); 262 spin_unlock_bh(&bc_lock);
@@ -272,7 +290,7 @@ static void bclink_send_nack(struct tipc_node *n_ptr)
272 if (!less(n_ptr->bclink.gap_after, n_ptr->bclink.gap_to)) 290 if (!less(n_ptr->bclink.gap_after, n_ptr->bclink.gap_to))
273 return; 291 return;
274 292
275 buf = buf_acquire(INT_H_SIZE); 293 buf = tipc_buf_acquire(INT_H_SIZE);
276 if (buf) { 294 if (buf) {
277 msg = buf_msg(buf); 295 msg = buf_msg(buf);
278 tipc_msg_init(msg, BCAST_PROTOCOL, STATE_MSG, 296 tipc_msg_init(msg, BCAST_PROTOCOL, STATE_MSG,
@@ -395,7 +413,7 @@ int tipc_bclink_send_msg(struct sk_buff *buf)
395 if (unlikely(res == -ELINKCONG)) 413 if (unlikely(res == -ELINKCONG))
396 buf_discard(buf); 414 buf_discard(buf);
397 else 415 else
398 bcl->stats.sent_info++; 416 bclink_set_last_sent();
399 417
400 if (bcl->out_queue_size > bcl->stats.max_queue_sz) 418 if (bcl->out_queue_size > bcl->stats.max_queue_sz)
401 bcl->stats.max_queue_sz = bcl->out_queue_size; 419 bcl->stats.max_queue_sz = bcl->out_queue_size;
@@ -529,15 +547,6 @@ receive:
529 tipc_node_unlock(node); 547 tipc_node_unlock(node);
530} 548}
531 549
532u32 tipc_bclink_get_last_sent(void)
533{
534 u32 last_sent = mod(bcl->next_out_no - 1);
535
536 if (bcl->next_out)
537 last_sent = mod(buf_seqno(bcl->next_out) - 1);
538 return last_sent;
539}
540
541u32 tipc_bclink_acks_missing(struct tipc_node *n_ptr) 550u32 tipc_bclink_acks_missing(struct tipc_node *n_ptr)
542{ 551{
543 return (n_ptr->bclink.supported && 552 return (n_ptr->bclink.supported &&
@@ -570,6 +579,7 @@ static int tipc_bcbearer_send(struct sk_buff *buf,
570 msg = buf_msg(buf); 579 msg = buf_msg(buf);
571 msg_set_non_seq(msg, 1); 580 msg_set_non_seq(msg, 1);
572 msg_set_mc_netid(msg, tipc_net_id); 581 msg_set_mc_netid(msg, tipc_net_id);
582 bcl->stats.sent_info++;
573 } 583 }
574 584
575 /* Send buffer over bearers until all targets reached */ 585 /* Send buffer over bearers until all targets reached */
@@ -609,11 +619,13 @@ static int tipc_bcbearer_send(struct sk_buff *buf,
609 bcbearer->remains = bcbearer->remains_new; 619 bcbearer->remains = bcbearer->remains_new;
610 } 620 }
611 621
612 /* Unable to reach all targets */ 622 /*
623 * Unable to reach all targets (indicate success, since currently
624 * there isn't code in place to properly block & unblock the
625 * pseudo-bearer used by the broadcast link)
626 */
613 627
614 bcbearer->bearer.publ.blocked = 1; 628 return TIPC_OK;
615 bcl->stats.bearer_congs++;
616 return 1;
617} 629}
618 630
619/** 631/**
@@ -862,8 +874,9 @@ void tipc_nmap_remove(struct tipc_node_map *nm_ptr, u32 node)
862 * @nm_diff: output node map A-B (i.e. nodes of A that are not in B) 874 * @nm_diff: output node map A-B (i.e. nodes of A that are not in B)
863 */ 875 */
864 876
865void tipc_nmap_diff(struct tipc_node_map *nm_a, struct tipc_node_map *nm_b, 877static void tipc_nmap_diff(struct tipc_node_map *nm_a,
866 struct tipc_node_map *nm_diff) 878 struct tipc_node_map *nm_b,
879 struct tipc_node_map *nm_diff)
867{ 880{
868 int stop = ARRAY_SIZE(nm_a->map); 881 int stop = ARRAY_SIZE(nm_a->map);
869 int w; 882 int w;
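
Note: the bclink_set_last_sent()/tipc_bclink_get_last_sent() pair added above derives the last broadcast sequence number actually sent: if packets are still queued for transmission (next_out is set), the last one sent is the packet just before the head of that queue; otherwise it is next_out_no minus one. A small sketch of that derivation on wrapping 16-bit sequence numbers; the field and helper names are illustrative.

#include <stdio.h>
#include <stdint.h>

struct bclink_demo {
        uint32_t next_out_no;    /* seqno the next new packet will get */
        int      have_next_out;  /* models bcl->next_out != NULL */
        uint32_t next_out_seqno; /* seqno of first not-yet-sent packet */
};

/* Mirrors bclink_set_last_sent(): last-sent is (head of unsent queue) - 1
 * when such a queue exists, otherwise next_out_no - 1, kept in 16 bits. */
static uint32_t last_sent_demo(const struct bclink_demo *bcl)
{
        uint32_t raw = bcl->have_next_out ? bcl->next_out_seqno - 1
                                          : bcl->next_out_no - 1;

        return raw & 0xffff;
}

int main(void)
{
        struct bclink_demo idle    = { .next_out_no = 105, .have_next_out = 0 };
        struct bclink_demo backlog = { .next_out_no = 105, .have_next_out = 1,
                                       .next_out_seqno = 101 };

        printf("idle link:    last sent = %u\n", last_sent_demo(&idle));
        printf("with backlog: last sent = %u\n", last_sent_demo(&backlog));
        return 0;
}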
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index e8c2b81658c7..011c03f0a4ab 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -84,9 +84,6 @@ static inline int tipc_nmap_equal(struct tipc_node_map *nm_a, struct tipc_node_m
84 return !memcmp(nm_a, nm_b, sizeof(*nm_a)); 84 return !memcmp(nm_a, nm_b, sizeof(*nm_a));
85} 85}
86 86
87void tipc_nmap_diff(struct tipc_node_map *nm_a, struct tipc_node_map *nm_b,
88 struct tipc_node_map *nm_diff);
89
90void tipc_port_list_add(struct port_list *pl_ptr, u32 port); 87void tipc_port_list_add(struct port_list *pl_ptr, u32 port);
91void tipc_port_list_free(struct port_list *pl_ptr); 88void tipc_port_list_free(struct port_list *pl_ptr);
92 89
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 52ae17b2583e..9927d1d56c4f 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -63,7 +63,7 @@ static int media_name_valid(const char *name)
63 len = strlen(name); 63 len = strlen(name);
64 if ((len + 1) > TIPC_MAX_MEDIA_NAME) 64 if ((len + 1) > TIPC_MAX_MEDIA_NAME)
65 return 0; 65 return 0;
66 return (strspn(name, tipc_alphabet) == len); 66 return strspn(name, tipc_alphabet) == len;
67} 67}
68 68
69/** 69/**
@@ -288,9 +288,6 @@ static struct bearer *bearer_find(const char *name)
288 struct bearer *b_ptr; 288 struct bearer *b_ptr;
289 u32 i; 289 u32 i;
290 290
291 if (tipc_mode != TIPC_NET_MODE)
292 return NULL;
293
294 for (i = 0, b_ptr = tipc_bearers; i < MAX_BEARERS; i++, b_ptr++) { 291 for (i = 0, b_ptr = tipc_bearers; i < MAX_BEARERS; i++, b_ptr++) {
295 if (b_ptr->active && (!strcmp(b_ptr->publ.name, name))) 292 if (b_ptr->active && (!strcmp(b_ptr->publ.name, name)))
296 return b_ptr; 293 return b_ptr;
@@ -559,8 +556,6 @@ restart:
559 } 556 }
560 557
561 b_ptr = &tipc_bearers[bearer_id]; 558 b_ptr = &tipc_bearers[bearer_id];
562 memset(b_ptr, 0, sizeof(struct bearer));
563
564 strcpy(b_ptr->publ.name, name); 559 strcpy(b_ptr->publ.name, name);
565 res = m_ptr->enable_bearer(&b_ptr->publ); 560 res = m_ptr->enable_bearer(&b_ptr->publ);
566 if (res) { 561 if (res) {
@@ -630,30 +625,17 @@ int tipc_block_bearer(const char *name)
630 * Note: This routine assumes caller holds tipc_net_lock. 625 * Note: This routine assumes caller holds tipc_net_lock.
631 */ 626 */
632 627
633static int bearer_disable(const char *name) 628static int bearer_disable(struct bearer *b_ptr)
634{ 629{
635 struct bearer *b_ptr;
636 struct link *l_ptr; 630 struct link *l_ptr;
637 struct link *temp_l_ptr; 631 struct link *temp_l_ptr;
638 632
639 b_ptr = bearer_find(name); 633 info("Disabling bearer <%s>\n", b_ptr->publ.name);
640 if (!b_ptr) {
641 warn("Attempt to disable unknown bearer <%s>\n", name);
642 return -EINVAL;
643 }
644
645 info("Disabling bearer <%s>\n", name);
646 tipc_disc_stop_link_req(b_ptr->link_req); 634 tipc_disc_stop_link_req(b_ptr->link_req);
647 spin_lock_bh(&b_ptr->publ.lock); 635 spin_lock_bh(&b_ptr->publ.lock);
648 b_ptr->link_req = NULL; 636 b_ptr->link_req = NULL;
649 b_ptr->publ.blocked = 1; 637 b_ptr->publ.blocked = 1;
650 if (b_ptr->media->disable_bearer) { 638 b_ptr->media->disable_bearer(&b_ptr->publ);
651 spin_unlock_bh(&b_ptr->publ.lock);
652 write_unlock_bh(&tipc_net_lock);
653 b_ptr->media->disable_bearer(&b_ptr->publ);
654 write_lock_bh(&tipc_net_lock);
655 spin_lock_bh(&b_ptr->publ.lock);
656 }
657 list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) { 639 list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) {
658 tipc_link_delete(l_ptr); 640 tipc_link_delete(l_ptr);
659 } 641 }
@@ -664,10 +646,16 @@ static int bearer_disable(const char *name)
664 646
665int tipc_disable_bearer(const char *name) 647int tipc_disable_bearer(const char *name)
666{ 648{
649 struct bearer *b_ptr;
667 int res; 650 int res;
668 651
669 write_lock_bh(&tipc_net_lock); 652 write_lock_bh(&tipc_net_lock);
670 res = bearer_disable(name); 653 b_ptr = bearer_find(name);
654 if (b_ptr == NULL) {
655 warn("Attempt to disable unknown bearer <%s>\n", name);
656 res = -EINVAL;
657 } else
658 res = bearer_disable(b_ptr);
671 write_unlock_bh(&tipc_net_lock); 659 write_unlock_bh(&tipc_net_lock);
672 return res; 660 return res;
673} 661}
@@ -680,13 +668,7 @@ void tipc_bearer_stop(void)
680 668
681 for (i = 0; i < MAX_BEARERS; i++) { 669 for (i = 0; i < MAX_BEARERS; i++) {
682 if (tipc_bearers[i].active) 670 if (tipc_bearers[i].active)
683 tipc_bearers[i].publ.blocked = 1; 671 bearer_disable(&tipc_bearers[i]);
684 }
685 for (i = 0; i < MAX_BEARERS; i++) {
686 if (tipc_bearers[i].active)
687 bearer_disable(tipc_bearers[i].publ.name);
688 } 672 }
689 media_count = 0; 673 media_count = 0;
690} 674}
691
692
diff --git a/net/tipc/cluster.c b/net/tipc/cluster.c
index e68f705381bc..7fea14b98b97 100644
--- a/net/tipc/cluster.c
+++ b/net/tipc/cluster.c
@@ -113,25 +113,6 @@ void tipc_cltr_delete(struct cluster *c_ptr)
113 kfree(c_ptr); 113 kfree(c_ptr);
114} 114}
115 115
116u32 tipc_cltr_next_node(struct cluster *c_ptr, u32 addr)
117{
118 struct tipc_node *n_ptr;
119 u32 n_num = tipc_node(addr) + 1;
120
121 if (!c_ptr)
122 return addr;
123 for (; n_num <= c_ptr->highest_node; n_num++) {
124 n_ptr = c_ptr->nodes[n_num];
125 if (n_ptr && tipc_node_has_active_links(n_ptr))
126 return n_ptr->addr;
127 }
128 for (n_num = 1; n_num < tipc_node(addr); n_num++) {
129 n_ptr = c_ptr->nodes[n_num];
130 if (n_ptr && tipc_node_has_active_links(n_ptr))
131 return n_ptr->addr;
132 }
133 return 0;
134}
135 116
136void tipc_cltr_attach_node(struct cluster *c_ptr, struct tipc_node *n_ptr) 117void tipc_cltr_attach_node(struct cluster *c_ptr, struct tipc_node *n_ptr)
137{ 118{
@@ -232,7 +213,7 @@ struct tipc_node *tipc_cltr_select_node(struct cluster *c_ptr, u32 selector)
232static struct sk_buff *tipc_cltr_prepare_routing_msg(u32 data_size, u32 dest) 213static struct sk_buff *tipc_cltr_prepare_routing_msg(u32 data_size, u32 dest)
233{ 214{
234 u32 size = INT_H_SIZE + data_size; 215 u32 size = INT_H_SIZE + data_size;
235 struct sk_buff *buf = buf_acquire(size); 216 struct sk_buff *buf = tipc_buf_acquire(size);
236 struct tipc_msg *msg; 217 struct tipc_msg *msg;
237 218
238 if (buf) { 219 if (buf) {
diff --git a/net/tipc/cluster.h b/net/tipc/cluster.h
index 333efb0b9c44..32636d98c9c6 100644
--- a/net/tipc/cluster.h
+++ b/net/tipc/cluster.h
@@ -75,7 +75,7 @@ void tipc_cltr_attach_node(struct cluster *c_ptr, struct tipc_node *n_ptr);
75void tipc_cltr_send_slave_routes(struct cluster *c_ptr, u32 dest); 75void tipc_cltr_send_slave_routes(struct cluster *c_ptr, u32 dest);
76void tipc_cltr_broadcast(struct sk_buff *buf); 76void tipc_cltr_broadcast(struct sk_buff *buf);
77int tipc_cltr_init(void); 77int tipc_cltr_init(void);
78u32 tipc_cltr_next_node(struct cluster *c_ptr, u32 addr); 78
79void tipc_cltr_bcast_new_route(struct cluster *c_ptr, u32 dest, u32 lo, u32 hi); 79void tipc_cltr_bcast_new_route(struct cluster *c_ptr, u32 dest, u32 lo, u32 hi);
80void tipc_cltr_send_local_routes(struct cluster *c_ptr, u32 dest); 80void tipc_cltr_send_local_routes(struct cluster *c_ptr, u32 dest);
81void tipc_cltr_bcast_lost_route(struct cluster *c_ptr, u32 dest, u32 lo, u32 hi); 81void tipc_cltr_bcast_lost_route(struct cluster *c_ptr, u32 dest, u32 lo, u32 hi);
diff --git a/net/tipc/config.c b/net/tipc/config.c
index 961d1b097146..50a6133a3668 100644
--- a/net/tipc/config.c
+++ b/net/tipc/config.c
@@ -95,7 +95,7 @@ int tipc_cfg_append_tlv(struct sk_buff *buf, int tlv_type,
95 return 1; 95 return 1;
96} 96}
97 97
98struct sk_buff *tipc_cfg_reply_unsigned_type(u16 tlv_type, u32 value) 98static struct sk_buff *tipc_cfg_reply_unsigned_type(u16 tlv_type, u32 value)
99{ 99{
100 struct sk_buff *buf; 100 struct sk_buff *buf;
101 __be32 value_net; 101 __be32 value_net;
@@ -109,6 +109,11 @@ struct sk_buff *tipc_cfg_reply_unsigned_type(u16 tlv_type, u32 value)
109 return buf; 109 return buf;
110} 110}
111 111
112static struct sk_buff *tipc_cfg_reply_unsigned(u32 value)
113{
114 return tipc_cfg_reply_unsigned_type(TIPC_TLV_UNSIGNED, value);
115}
116
112struct sk_buff *tipc_cfg_reply_string_type(u16 tlv_type, char *string) 117struct sk_buff *tipc_cfg_reply_string_type(u16 tlv_type, char *string)
113{ 118{
114 struct sk_buff *buf; 119 struct sk_buff *buf;
@@ -120,139 +125,6 @@ struct sk_buff *tipc_cfg_reply_string_type(u16 tlv_type, char *string)
120 return buf; 125 return buf;
121} 126}
122 127
123
124#if 0
125
126/* Now obsolete code for handling commands not yet implemented the new way */
127
128/*
129 * Some of this code assumed that the manager structure contains two added
130 * fields:
131 * u32 link_subscriptions;
132 * struct list_head link_subscribers;
133 * which are currently not present. These fields may need to be re-introduced
134 * if and when support for link subscriptions is added.
135 */
136
137void tipc_cfg_link_event(u32 addr, char *name, int up)
138{
139 /* TIPC DOESN'T HANDLE LINK EVENT SUBSCRIPTIONS AT THE MOMENT */
140}
141
142int tipc_cfg_cmd(const struct tipc_cmd_msg * msg,
143 char *data,
144 u32 sz,
145 u32 *ret_size,
146 struct tipc_portid *orig)
147{
148 int rv = -EINVAL;
149 u32 cmd = msg->cmd;
150
151 *ret_size = 0;
152 switch (cmd) {
153 case TIPC_REMOVE_LINK:
154 case TIPC_CMD_BLOCK_LINK:
155 case TIPC_CMD_UNBLOCK_LINK:
156 if (!cfg_check_connection(orig))
157 rv = link_control(msg->argv.link_name, msg->cmd, 0);
158 break;
159 case TIPC_ESTABLISH:
160 {
161 int connected;
162
163 tipc_isconnected(mng.conn_port_ref, &connected);
164 if (connected || !orig) {
165 rv = TIPC_FAILURE;
166 break;
167 }
168 rv = tipc_connect2port(mng.conn_port_ref, orig);
169 if (rv == TIPC_OK)
170 orig = 0;
171 break;
172 }
173 case TIPC_GET_PEER_ADDRESS:
174 *ret_size = link_peer_addr(msg->argv.link_name, data, sz);
175 break;
176 case TIPC_GET_ROUTES:
177 rv = TIPC_OK;
178 break;
179 default: {}
180 }
181 if (*ret_size)
182 rv = TIPC_OK;
183 return rv;
184}
185
186static void cfg_cmd_event(struct tipc_cmd_msg *msg,
187 char *data,
188 u32 sz,
189 struct tipc_portid const *orig)
190{
191 int rv = -EINVAL;
192 struct tipc_cmd_result_msg rmsg;
193 struct iovec msg_sect[2];
194 int *arg;
195
196 msg->cmd = ntohl(msg->cmd);
197
198 cfg_prepare_res_msg(msg->cmd, msg->usr_handle, rv, &rmsg, msg_sect,
199 data, 0);
200 if (ntohl(msg->magic) != TIPC_MAGIC)
201 goto exit;
202
203 switch (msg->cmd) {
204 case TIPC_CREATE_LINK:
205 if (!cfg_check_connection(orig))
206 rv = disc_create_link(&msg->argv.create_link);
207 break;
208 case TIPC_LINK_SUBSCRIBE:
209 {
210 struct subscr_data *sub;
211
212 if (mng.link_subscriptions > 64)
213 break;
214 sub = kmalloc(sizeof(*sub),
215 GFP_ATOMIC);
216 if (sub == NULL) {
217 warn("Memory squeeze; dropped remote link subscription\n");
218 break;
219 }
220 INIT_LIST_HEAD(&sub->subd_list);
221 tipc_createport(mng.user_ref,
222 (void *)sub,
223 TIPC_HIGH_IMPORTANCE,
224 0,
225 0,
226 (tipc_conn_shutdown_event)cfg_linksubscr_cancel,
227 0,
228 0,
229 (tipc_conn_msg_event)cfg_linksubscr_cancel,
230 0,
231 &sub->port_ref);
232 if (!sub->port_ref) {
233 kfree(sub);
234 break;
235 }
236 memcpy(sub->usr_handle,msg->usr_handle,
237 sizeof(sub->usr_handle));
238 sub->domain = msg->argv.domain;
239 list_add_tail(&sub->subd_list, &mng.link_subscribers);
240 tipc_connect2port(sub->port_ref, orig);
241 rmsg.retval = TIPC_OK;
242 tipc_send(sub->port_ref, 2u, msg_sect);
243 mng.link_subscriptions++;
244 return;
245 }
246 default:
247 rv = tipc_cfg_cmd(msg, data, sz, (u32 *)&msg_sect[1].iov_len, orig);
248 }
249exit:
250 rmsg.result_len = htonl(msg_sect[1].iov_len);
251 rmsg.retval = htonl(rv);
252 tipc_cfg_respond(msg_sect, 2u, orig);
253}
254#endif
255
256#define MAX_STATS_INFO 2000 128#define MAX_STATS_INFO 2000
257 129
258static struct sk_buff *tipc_show_stats(void) 130static struct sk_buff *tipc_show_stats(void)
@@ -557,14 +429,6 @@ struct sk_buff *tipc_cfg_do_cmd(u32 orig_node, u16 cmd, const void *request_area
557 case TIPC_CMD_SHOW_PORTS: 429 case TIPC_CMD_SHOW_PORTS:
558 rep_tlv_buf = tipc_port_get_ports(); 430 rep_tlv_buf = tipc_port_get_ports();
559 break; 431 break;
560#if 0
561 case TIPC_CMD_SHOW_PORT_STATS:
562 rep_tlv_buf = port_show_stats(req_tlv_area, req_tlv_space);
563 break;
564 case TIPC_CMD_RESET_PORT_STATS:
565 rep_tlv_buf = tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED);
566 break;
567#endif
568 case TIPC_CMD_SET_LOG_SIZE: 432 case TIPC_CMD_SET_LOG_SIZE:
569 rep_tlv_buf = tipc_log_resize_cmd(req_tlv_area, req_tlv_space); 433 rep_tlv_buf = tipc_log_resize_cmd(req_tlv_area, req_tlv_space);
570 break; 434 break;
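
Note: the config.c hunks above move tipc_cfg_reply_unsigned() next to tipc_cfg_reply_unsigned_type(), which packs a host-order u32 into a big-endian (__be32) TLV payload before appending it to the reply buffer. A stand-alone sketch of just that byte-order step; the TLV append itself is stubbed out and the names ending in _demo, as well as the tlv_type value, are illustrative.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>   /* htonl() */

/* Model of the conversion in tipc_cfg_reply_unsigned_type(): the reply
 * value goes into the TLV payload in network byte order, so peers on
 * any architecture decode it the same way. */
static void append_unsigned_tlv_demo(uint16_t tlv_type, uint32_t value)
{
        uint32_t value_net = htonl(value);
        unsigned char payload[sizeof(value_net)];

        memcpy(payload, &value_net, sizeof(value_net));
        printf("TLV type %u, payload bytes: %02x %02x %02x %02x\n",
               tlv_type, payload[0], payload[1], payload[2], payload[3]);
}

int main(void)
{
        append_unsigned_tlv_demo(1 /* illustrative tlv_type */, 4096);
        return 0;
}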
diff --git a/net/tipc/config.h b/net/tipc/config.h
index 5cd7cc56c54d..481e12ece715 100644
--- a/net/tipc/config.h
+++ b/net/tipc/config.h
@@ -45,7 +45,6 @@
45struct sk_buff *tipc_cfg_reply_alloc(int payload_size); 45struct sk_buff *tipc_cfg_reply_alloc(int payload_size);
46int tipc_cfg_append_tlv(struct sk_buff *buf, int tlv_type, 46int tipc_cfg_append_tlv(struct sk_buff *buf, int tlv_type,
47 void *tlv_data, int tlv_data_size); 47 void *tlv_data, int tlv_data_size);
48struct sk_buff *tipc_cfg_reply_unsigned_type(u16 tlv_type, u32 value);
49struct sk_buff *tipc_cfg_reply_string_type(u16 tlv_type, char *string); 48struct sk_buff *tipc_cfg_reply_string_type(u16 tlv_type, char *string);
50 49
51static inline struct sk_buff *tipc_cfg_reply_none(void) 50static inline struct sk_buff *tipc_cfg_reply_none(void)
@@ -53,11 +52,6 @@ static inline struct sk_buff *tipc_cfg_reply_none(void)
53 return tipc_cfg_reply_alloc(0); 52 return tipc_cfg_reply_alloc(0);
54} 53}
55 54
56static inline struct sk_buff *tipc_cfg_reply_unsigned(u32 value)
57{
58 return tipc_cfg_reply_unsigned_type(TIPC_TLV_UNSIGNED, value);
59}
60
61static inline struct sk_buff *tipc_cfg_reply_error_string(char *string) 55static inline struct sk_buff *tipc_cfg_reply_error_string(char *string)
62{ 56{
63 return tipc_cfg_reply_string_type(TIPC_TLV_ERROR_STRING, string); 57 return tipc_cfg_reply_string_type(TIPC_TLV_ERROR_STRING, string);
diff --git a/net/tipc/core.c b/net/tipc/core.c
index 696468117985..e2a09eb8efd4 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -96,13 +96,8 @@ int tipc_net_id;
96int tipc_remote_management; 96int tipc_remote_management;
97 97
98 98
99int tipc_get_mode(void)
100{
101 return tipc_mode;
102}
103
104/** 99/**
105 * buf_acquire - creates a TIPC message buffer 100 * tipc_buf_acquire - creates a TIPC message buffer
106 * @size: message size (including TIPC header) 101 * @size: message size (including TIPC header)
107 * 102 *
108 * Returns a new buffer with data pointers set to the specified size. 103 * Returns a new buffer with data pointers set to the specified size.
@@ -111,7 +106,7 @@ int tipc_get_mode(void)
111 * There may also be unrequested tailroom present at the buffer's end. 106 * There may also be unrequested tailroom present at the buffer's end.
112 */ 107 */
113 108
114struct sk_buff *buf_acquire(u32 size) 109struct sk_buff *tipc_buf_acquire(u32 size)
115{ 110{
116 struct sk_buff *skb; 111 struct sk_buff *skb;
117 unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u; 112 unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u;
@@ -129,7 +124,7 @@ struct sk_buff *buf_acquire(u32 size)
129 * tipc_core_stop_net - shut down TIPC networking sub-systems 124 * tipc_core_stop_net - shut down TIPC networking sub-systems
130 */ 125 */
131 126
132void tipc_core_stop_net(void) 127static void tipc_core_stop_net(void)
133{ 128{
134 tipc_eth_media_stop(); 129 tipc_eth_media_stop();
135 tipc_net_stop(); 130 tipc_net_stop();
@@ -154,7 +149,7 @@ int tipc_core_start_net(unsigned long addr)
154 * tipc_core_stop - switch TIPC from SINGLE NODE to NOT RUNNING mode 149 * tipc_core_stop - switch TIPC from SINGLE NODE to NOT RUNNING mode
155 */ 150 */
156 151
157void tipc_core_stop(void) 152static void tipc_core_stop(void)
158{ 153{
159 if (tipc_mode != TIPC_NODE_MODE) 154 if (tipc_mode != TIPC_NODE_MODE)
160 return; 155 return;
@@ -169,13 +164,14 @@ void tipc_core_stop(void)
169 tipc_nametbl_stop(); 164 tipc_nametbl_stop();
170 tipc_ref_table_stop(); 165 tipc_ref_table_stop();
171 tipc_socket_stop(); 166 tipc_socket_stop();
167 tipc_log_resize(0);
172} 168}
173 169
174/** 170/**
175 * tipc_core_start - switch TIPC from NOT RUNNING to SINGLE NODE mode 171 * tipc_core_start - switch TIPC from NOT RUNNING to SINGLE NODE mode
176 */ 172 */
177 173
178int tipc_core_start(void) 174static int tipc_core_start(void)
179{ 175{
180 int res; 176 int res;
181 177
@@ -203,7 +199,9 @@ static int __init tipc_init(void)
203{ 199{
204 int res; 200 int res;
205 201
206 tipc_log_resize(CONFIG_TIPC_LOG); 202 if (tipc_log_resize(CONFIG_TIPC_LOG) != 0)
203 warn("Unable to create log buffer\n");
204
207 info("Activated (version " TIPC_MOD_VER 205 info("Activated (version " TIPC_MOD_VER
208 " compiled " __DATE__ " " __TIME__ ")\n"); 206 " compiled " __DATE__ " " __TIME__ ")\n");
209 207
@@ -230,7 +228,6 @@ static void __exit tipc_exit(void)
230 tipc_core_stop_net(); 228 tipc_core_stop_net();
231 tipc_core_stop(); 229 tipc_core_stop();
232 info("Deactivated\n"); 230 info("Deactivated\n");
233 tipc_log_resize(0);
234} 231}
235 232
236module_init(tipc_init); 233module_init(tipc_init);
@@ -244,8 +241,6 @@ MODULE_VERSION(TIPC_MOD_VER);
244 241
245EXPORT_SYMBOL(tipc_attach); 242EXPORT_SYMBOL(tipc_attach);
246EXPORT_SYMBOL(tipc_detach); 243EXPORT_SYMBOL(tipc_detach);
247EXPORT_SYMBOL(tipc_get_addr);
248EXPORT_SYMBOL(tipc_get_mode);
249EXPORT_SYMBOL(tipc_createport); 244EXPORT_SYMBOL(tipc_createport);
250EXPORT_SYMBOL(tipc_deleteport); 245EXPORT_SYMBOL(tipc_deleteport);
251EXPORT_SYMBOL(tipc_ownidentity); 246EXPORT_SYMBOL(tipc_ownidentity);
@@ -260,23 +255,10 @@ EXPORT_SYMBOL(tipc_withdraw);
260EXPORT_SYMBOL(tipc_connect2port); 255EXPORT_SYMBOL(tipc_connect2port);
261EXPORT_SYMBOL(tipc_disconnect); 256EXPORT_SYMBOL(tipc_disconnect);
262EXPORT_SYMBOL(tipc_shutdown); 257EXPORT_SYMBOL(tipc_shutdown);
263EXPORT_SYMBOL(tipc_isconnected);
264EXPORT_SYMBOL(tipc_peer);
265EXPORT_SYMBOL(tipc_ref_valid);
266EXPORT_SYMBOL(tipc_send); 258EXPORT_SYMBOL(tipc_send);
267EXPORT_SYMBOL(tipc_send_buf);
268EXPORT_SYMBOL(tipc_send2name); 259EXPORT_SYMBOL(tipc_send2name);
269EXPORT_SYMBOL(tipc_forward2name);
270EXPORT_SYMBOL(tipc_send_buf2name);
271EXPORT_SYMBOL(tipc_forward_buf2name);
272EXPORT_SYMBOL(tipc_send2port); 260EXPORT_SYMBOL(tipc_send2port);
273EXPORT_SYMBOL(tipc_forward2port);
274EXPORT_SYMBOL(tipc_send_buf2port);
275EXPORT_SYMBOL(tipc_forward_buf2port);
276EXPORT_SYMBOL(tipc_multicast); 261EXPORT_SYMBOL(tipc_multicast);
277/* EXPORT_SYMBOL(tipc_multicast_buf); not available yet */
278EXPORT_SYMBOL(tipc_ispublished);
279EXPORT_SYMBOL(tipc_available_nodes);
280 262
281/* TIPC API for external bearers (see tipc_bearer.h) */ 263/* TIPC API for external bearers (see tipc_bearer.h) */
282 264
@@ -293,6 +275,4 @@ EXPORT_SYMBOL(tipc_createport_raw);
293EXPORT_SYMBOL(tipc_reject_msg); 275EXPORT_SYMBOL(tipc_reject_msg);
294EXPORT_SYMBOL(tipc_send_buf_fast); 276EXPORT_SYMBOL(tipc_send_buf_fast);
295EXPORT_SYMBOL(tipc_acknowledge); 277EXPORT_SYMBOL(tipc_acknowledge);
296EXPORT_SYMBOL(tipc_get_port);
297EXPORT_SYMBOL(tipc_get_handle);
298 278
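
Note: the renamed tipc_buf_acquire() above still sizes its sk_buff as (BUF_HEADROOM + size + 3) & ~3u, i.e. headroom plus the requested message size rounded up to the next 4-byte multiple, which is where the "unrequested tailroom" mentioned in its comment comes from. A quick arithmetic sketch of that rounding; the headroom value here is a plain parameter, not the kernel's BUF_HEADROOM.

#include <stdio.h>

/* Round (headroom + size) up to a multiple of 4, exactly as the
 * tipc_buf_acquire() hunk above computes buf_size. */
static unsigned int tipc_buf_size_demo(unsigned int headroom, unsigned int size)
{
        return (headroom + size + 3) & ~3u;
}

int main(void)
{
        unsigned int headroom = 80;      /* illustrative, not BUF_HEADROOM */
        unsigned int sizes[] = { 21, 22, 23, 24 };
        unsigned int i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
                printf("size %u -> buf_size %u\n", sizes[i],
                       tipc_buf_size_demo(headroom, sizes[i]));
        return 0;
}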
diff --git a/net/tipc/core.h b/net/tipc/core.h
index 188799017abd..e19389e57227 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -83,9 +83,7 @@
83 * Note: TIPC_LOG is configured to echo its output to the system console; 83 * Note: TIPC_LOG is configured to echo its output to the system console;
84 * user-defined buffers can be configured to do the same thing. 84 * user-defined buffers can be configured to do the same thing.
85 */ 85 */
86
87extern struct print_buf *const TIPC_NULL; 86extern struct print_buf *const TIPC_NULL;
88extern struct print_buf *const TIPC_CONS;
89extern struct print_buf *const TIPC_LOG; 87extern struct print_buf *const TIPC_LOG;
90 88
91void tipc_printf(struct print_buf *, const char *fmt, ...); 89void tipc_printf(struct print_buf *, const char *fmt, ...);
@@ -204,10 +202,7 @@ extern atomic_t tipc_user_count;
204 * Routines available to privileged subsystems 202 * Routines available to privileged subsystems
205 */ 203 */
206 204
207extern int tipc_core_start(void); 205extern int tipc_core_start_net(unsigned long);
208extern void tipc_core_stop(void);
209extern int tipc_core_start_net(unsigned long addr);
210extern void tipc_core_stop_net(void);
211extern int tipc_handler_start(void); 206extern int tipc_handler_start(void);
212extern void tipc_handler_stop(void); 207extern void tipc_handler_stop(void);
213extern int tipc_netlink_start(void); 208extern int tipc_netlink_start(void);
@@ -328,7 +323,7 @@ static inline struct tipc_msg *buf_msg(struct sk_buff *skb)
328 return (struct tipc_msg *)skb->data; 323 return (struct tipc_msg *)skb->data;
329} 324}
330 325
331extern struct sk_buff *buf_acquire(u32 size); 326extern struct sk_buff *tipc_buf_acquire(u32 size);
332 327
333/** 328/**
334 * buf_discard - frees a TIPC message buffer 329 * buf_discard - frees a TIPC message buffer
diff --git a/net/tipc/dbg.c b/net/tipc/dbg.c
index 1885a7edb0c8..46f51d208e5e 100644
--- a/net/tipc/dbg.c
+++ b/net/tipc/dbg.c
@@ -52,7 +52,7 @@ static struct print_buf null_buf = { NULL, 0, NULL, 0 };
52struct print_buf *const TIPC_NULL = &null_buf; 52struct print_buf *const TIPC_NULL = &null_buf;
53 53
54static struct print_buf cons_buf = { NULL, 0, NULL, 1 }; 54static struct print_buf cons_buf = { NULL, 0, NULL, 1 };
55struct print_buf *const TIPC_CONS = &cons_buf; 55static struct print_buf *const TIPC_CONS = &cons_buf;
56 56
57static struct print_buf log_buf = { NULL, 0, NULL, 1 }; 57static struct print_buf log_buf = { NULL, 0, NULL, 1 };
58struct print_buf *const TIPC_LOG = &log_buf; 58struct print_buf *const TIPC_LOG = &log_buf;
@@ -76,6 +76,10 @@ struct print_buf *const TIPC_LOG = &log_buf;
76static char print_string[TIPC_PB_MAX_STR]; 76static char print_string[TIPC_PB_MAX_STR];
77static DEFINE_SPINLOCK(print_lock); 77static DEFINE_SPINLOCK(print_lock);
78 78
79static void tipc_printbuf_reset(struct print_buf *pb);
80static int tipc_printbuf_empty(struct print_buf *pb);
81static void tipc_printbuf_move(struct print_buf *pb_to,
82 struct print_buf *pb_from);
79 83
80#define FORMAT(PTR,LEN,FMT) \ 84#define FORMAT(PTR,LEN,FMT) \
81{\ 85{\
@@ -116,7 +120,7 @@ void tipc_printbuf_init(struct print_buf *pb, char *raw, u32 size)
116 * @pb: pointer to print buffer structure 120 * @pb: pointer to print buffer structure
117 */ 121 */
118 122
119void tipc_printbuf_reset(struct print_buf *pb) 123static void tipc_printbuf_reset(struct print_buf *pb)
120{ 124{
121 if (pb->buf) { 125 if (pb->buf) {
122 pb->crs = pb->buf; 126 pb->crs = pb->buf;
@@ -132,9 +136,9 @@ void tipc_printbuf_reset(struct print_buf *pb)
132 * Returns non-zero if print buffer is empty. 136 * Returns non-zero if print buffer is empty.
133 */ 137 */
134 138
135int tipc_printbuf_empty(struct print_buf *pb) 139static int tipc_printbuf_empty(struct print_buf *pb)
136{ 140{
137 return (!pb->buf || (pb->crs == pb->buf)); 141 return !pb->buf || (pb->crs == pb->buf);
138} 142}
139 143
140/** 144/**
@@ -169,7 +173,7 @@ int tipc_printbuf_validate(struct print_buf *pb)
169 tipc_printf(pb, err); 173 tipc_printf(pb, err);
170 } 174 }
171 } 175 }
172 return (pb->crs - pb->buf + 1); 176 return pb->crs - pb->buf + 1;
173} 177}
174 178
175/** 179/**
@@ -181,7 +185,8 @@ int tipc_printbuf_validate(struct print_buf *pb)
181 * Source print buffer becomes empty if a successful move occurs. 185 * Source print buffer becomes empty if a successful move occurs.
182 */ 186 */
183 187
184void tipc_printbuf_move(struct print_buf *pb_to, struct print_buf *pb_from) 188static void tipc_printbuf_move(struct print_buf *pb_to,
189 struct print_buf *pb_from)
185{ 190{
186 int len; 191 int len;
187 192
diff --git a/net/tipc/dbg.h b/net/tipc/dbg.h
index 5ef1bc8f64ef..3ba6ba8b434a 100644
--- a/net/tipc/dbg.h
+++ b/net/tipc/dbg.h
@@ -56,10 +56,7 @@ struct print_buf {
56#define TIPC_PB_MAX_STR 512 /* max printable string (with trailing NUL) */ 56#define TIPC_PB_MAX_STR 512 /* max printable string (with trailing NUL) */
57 57
58void tipc_printbuf_init(struct print_buf *pb, char *buf, u32 size); 58void tipc_printbuf_init(struct print_buf *pb, char *buf, u32 size);
59void tipc_printbuf_reset(struct print_buf *pb);
60int tipc_printbuf_empty(struct print_buf *pb);
61int tipc_printbuf_validate(struct print_buf *pb); 59int tipc_printbuf_validate(struct print_buf *pb);
62void tipc_printbuf_move(struct print_buf *pb_to, struct print_buf *pb_from);
63 60
64int tipc_log_resize(int log_size); 61int tipc_log_resize(int log_size);
65 62
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index fc1fcf5e6b53..4a7cd3719b78 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -46,16 +46,6 @@
46#define TIPC_LINK_REQ_FAST 2000 /* normal delay if bearer has no links */ 46#define TIPC_LINK_REQ_FAST 2000 /* normal delay if bearer has no links */
47#define TIPC_LINK_REQ_SLOW 600000 /* normal delay if bearer has links */ 47#define TIPC_LINK_REQ_SLOW 600000 /* normal delay if bearer has links */
48 48
49#if 0
50#define GET_NODE_INFO 300
51#define GET_NODE_INFO_RESULT 301
52#define FORWARD_LINK_PROBE 302
53#define LINK_REQUEST_REJECTED 303
54#define LINK_REQUEST_ACCEPTED 304
55#define DROP_LINK_REQUEST 305
56#define CHECK_LINK_COUNT 306
57#endif
58
59/* 49/*
60 * TODO: Most of the inter-cluster setup stuff should be 50 * TODO: Most of the inter-cluster setup stuff should be
61 * rewritten, and be made conformant with specification. 51 * rewritten, and be made conformant with specification.
@@ -78,30 +68,6 @@ struct link_req {
78 unsigned int timer_intv; 68 unsigned int timer_intv;
79}; 69};
80 70
81
82#if 0
83int disc_create_link(const struct tipc_link_create *argv)
84{
85 /*
86 * Code for inter cluster link setup here
87 */
88 return TIPC_OK;
89}
90#endif
91
92/*
93 * disc_lost_link(): A link has lost contact
94 */
95
96void tipc_disc_link_event(u32 addr, char *name, int up)
97{
98 if (in_own_cluster(addr))
99 return;
100 /*
101 * Code for inter cluster link setup here
102 */
103}
104
105/** 71/**
106 * tipc_disc_init_msg - initialize a link setup message 72 * tipc_disc_init_msg - initialize a link setup message
107 * @type: message type (request or response) 73 * @type: message type (request or response)
@@ -115,7 +81,7 @@ static struct sk_buff *tipc_disc_init_msg(u32 type,
115 u32 dest_domain, 81 u32 dest_domain,
116 struct bearer *b_ptr) 82 struct bearer *b_ptr)
117{ 83{
118 struct sk_buff *buf = buf_acquire(DSC_H_SIZE); 84 struct sk_buff *buf = tipc_buf_acquire(DSC_H_SIZE);
119 struct tipc_msg *msg; 85 struct tipc_msg *msg;
120 86
121 if (buf) { 87 if (buf) {
@@ -203,6 +169,14 @@ void tipc_disc_recv_msg(struct sk_buff *buf, struct bearer *b_ptr)
203 return; 169 return;
204 } 170 }
205 spin_lock_bh(&n_ptr->lock); 171 spin_lock_bh(&n_ptr->lock);
172
173 /* Don't talk to neighbor during cleanup after last session */
174
175 if (n_ptr->cleanup_required) {
176 spin_unlock_bh(&n_ptr->lock);
177 return;
178 }
179
206 link = n_ptr->links[b_ptr->identity]; 180 link = n_ptr->links[b_ptr->identity];
207 if (!link) { 181 if (!link) {
208 dbg("creating link\n"); 182 dbg("creating link\n");
diff --git a/net/tipc/discover.h b/net/tipc/discover.h
index c36eaeb7d5d0..f8e750636123 100644
--- a/net/tipc/discover.h
+++ b/net/tipc/discover.h
@@ -50,9 +50,4 @@ void tipc_disc_stop_link_req(struct link_req *req);
50 50
51void tipc_disc_recv_msg(struct sk_buff *buf, struct bearer *b_ptr); 51void tipc_disc_recv_msg(struct sk_buff *buf, struct bearer *b_ptr);
52 52
53void tipc_disc_link_event(u32 addr, char *name, int up);
54#if 0
55int disc_create_link(const struct tipc_link_create *argv);
56#endif
57
58#endif 53#endif
diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c
index 6230d16020c4..6e988ba485fd 100644
--- a/net/tipc/eth_media.c
+++ b/net/tipc/eth_media.c
@@ -72,17 +72,26 @@ static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr,
72{ 72{
73 struct sk_buff *clone; 73 struct sk_buff *clone;
74 struct net_device *dev; 74 struct net_device *dev;
75 int delta;
75 76
76 clone = skb_clone(buf, GFP_ATOMIC); 77 clone = skb_clone(buf, GFP_ATOMIC);
77 if (clone) { 78 if (!clone)
78 skb_reset_network_header(clone); 79 return 0;
79 dev = ((struct eth_bearer *)(tb_ptr->usr_handle))->dev; 80
80 clone->dev = dev; 81 dev = ((struct eth_bearer *)(tb_ptr->usr_handle))->dev;
81 dev_hard_header(clone, dev, ETH_P_TIPC, 82 delta = dev->hard_header_len - skb_headroom(buf);
82 &dest->dev_addr.eth_addr, 83
83 dev->dev_addr, clone->len); 84 if ((delta > 0) &&
84 dev_queue_xmit(clone); 85 pskb_expand_head(clone, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
86 kfree_skb(clone);
87 return 0;
85 } 88 }
89
90 skb_reset_network_header(clone);
91 clone->dev = dev;
92 dev_hard_header(clone, dev, ETH_P_TIPC, &dest->dev_addr.eth_addr,
93 dev->dev_addr, clone->len);
94 dev_queue_xmit(clone);
86 return 0; 95 return 0;
87} 96}
88 97
@@ -92,15 +101,12 @@ static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr,
92 * Accept only packets explicitly sent to this node, or broadcast packets; 101 * Accept only packets explicitly sent to this node, or broadcast packets;
93 * ignores packets sent using Ethernet multicast, and traffic sent to other 102 * ignores packets sent using Ethernet multicast, and traffic sent to other
94 * nodes (which can happen if interface is running in promiscuous mode). 103 * nodes (which can happen if interface is running in promiscuous mode).
95 * Routine truncates any Ethernet padding/CRC appended to the message,
96 * and ensures message size matches actual length
97 */ 104 */
98 105
99static int recv_msg(struct sk_buff *buf, struct net_device *dev, 106static int recv_msg(struct sk_buff *buf, struct net_device *dev,
100 struct packet_type *pt, struct net_device *orig_dev) 107 struct packet_type *pt, struct net_device *orig_dev)
101{ 108{
102 struct eth_bearer *eb_ptr = (struct eth_bearer *)pt->af_packet_priv; 109 struct eth_bearer *eb_ptr = (struct eth_bearer *)pt->af_packet_priv;
103 u32 size;
104 110
105 if (!net_eq(dev_net(dev), &init_net)) { 111 if (!net_eq(dev_net(dev), &init_net)) {
106 kfree_skb(buf); 112 kfree_skb(buf);
@@ -109,13 +115,9 @@ static int recv_msg(struct sk_buff *buf, struct net_device *dev,
109 115
110 if (likely(eb_ptr->bearer)) { 116 if (likely(eb_ptr->bearer)) {
111 if (likely(buf->pkt_type <= PACKET_BROADCAST)) { 117 if (likely(buf->pkt_type <= PACKET_BROADCAST)) {
112 size = msg_size((struct tipc_msg *)buf->data); 118 buf->next = NULL;
113 skb_trim(buf, size); 119 tipc_recv_msg(buf, eb_ptr->bearer);
114 if (likely(buf->len == size)) { 120 return 0;
115 buf->next = NULL;
116 tipc_recv_msg(buf, eb_ptr->bearer);
117 return 0;
118 }
119 } 121 }
120 } 122 }
121 kfree_skb(buf); 123 kfree_skb(buf);
@@ -133,6 +135,16 @@ static int enable_bearer(struct tipc_bearer *tb_ptr)
133 struct eth_bearer *eb_ptr = &eth_bearers[0]; 135 struct eth_bearer *eb_ptr = &eth_bearers[0];
134 struct eth_bearer *stop = &eth_bearers[MAX_ETH_BEARERS]; 136 struct eth_bearer *stop = &eth_bearers[MAX_ETH_BEARERS];
135 char *driver_name = strchr((const char *)tb_ptr->name, ':') + 1; 137 char *driver_name = strchr((const char *)tb_ptr->name, ':') + 1;
138 int pending_dev = 0;
139
140 /* Find unused Ethernet bearer structure */
141
142 while (eb_ptr->dev) {
143 if (!eb_ptr->bearer)
144 pending_dev++;
145 if (++eb_ptr == stop)
146 return pending_dev ? -EAGAIN : -EDQUOT;
147 }
136 148
137 /* Find device with specified name */ 149 /* Find device with specified name */
138 150
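
Note: the send_msg() rewrite above first checks whether the cloned buffer has enough headroom for the device's link-layer header, and only when it falls short does it call pskb_expand_head() with the shortfall rounded up by SKB_DATA_ALIGN(). A user-space sketch of just that decision; the skb machinery is modelled, not real.

#include <stdio.h>

/* Model of the headroom check in send_msg(): delta is how many bytes
 * of link-layer header space are missing in front of the payload. */
static int needs_head_expansion(int hard_header_len, int headroom, int *delta)
{
        *delta = hard_header_len - headroom;
        return *delta > 0;
}

int main(void)
{
        int delta;

        /* Plenty of headroom: no reallocation needed. */
        if (!needs_head_expansion(14, 64, &delta))
                printf("14-byte header, 64 bytes headroom: send as is\n");

        /* Headroom too small: the kernel code would call
         * pskb_expand_head(clone, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC). */
        if (needs_head_expansion(18, 2, &delta))
                printf("18-byte header, 2 bytes headroom: expand by >= %d\n",
                       delta);
        return 0;
}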
diff --git a/net/tipc/link.c b/net/tipc/link.c
index a3616b99529b..b31992ccd5d3 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -99,23 +99,6 @@ struct link_name {
99 char if_peer[TIPC_MAX_IF_NAME]; 99 char if_peer[TIPC_MAX_IF_NAME];
100}; 100};
101 101
102#if 0
103
104/* LINK EVENT CODE IS NOT SUPPORTED AT PRESENT */
105
106/**
107 * struct link_event - link up/down event notification
108 */
109
110struct link_event {
111 u32 addr;
112 int up;
113 void (*fcn)(u32, char *, int);
114 char name[TIPC_MAX_LINK_NAME];
115};
116
117#endif
118
119static void link_handle_out_of_seq_msg(struct link *l_ptr, 102static void link_handle_out_of_seq_msg(struct link *l_ptr,
120 struct sk_buff *buf); 103 struct sk_buff *buf);
121static void link_recv_proto_msg(struct link *l_ptr, struct sk_buff *buf); 104static void link_recv_proto_msg(struct link *l_ptr, struct sk_buff *buf);
@@ -129,6 +112,9 @@ static void link_state_event(struct link *l_ptr, u32 event);
129static void link_reset_statistics(struct link *l_ptr); 112static void link_reset_statistics(struct link *l_ptr);
130static void link_print(struct link *l_ptr, struct print_buf *buf, 113static void link_print(struct link *l_ptr, struct print_buf *buf,
131 const char *str); 114 const char *str);
115static void link_start(struct link *l_ptr);
116static int link_send_long_buf(struct link *l_ptr, struct sk_buff *buf);
117
132 118
133/* 119/*
134 * Debugging code used by link routines only 120 * Debugging code used by link routines only
@@ -239,13 +225,13 @@ int tipc_link_is_up(struct link *l_ptr)
239{ 225{
240 if (!l_ptr) 226 if (!l_ptr)
241 return 0; 227 return 0;
242 return (link_working_working(l_ptr) || link_working_unknown(l_ptr)); 228 return link_working_working(l_ptr) || link_working_unknown(l_ptr);
243} 229}
244 230
245int tipc_link_is_active(struct link *l_ptr) 231int tipc_link_is_active(struct link *l_ptr)
246{ 232{
247 return ((l_ptr->owner->active_links[0] == l_ptr) || 233 return (l_ptr->owner->active_links[0] == l_ptr) ||
248 (l_ptr->owner->active_links[1] == l_ptr)); 234 (l_ptr->owner->active_links[1] == l_ptr);
249} 235}
250 236
251/** 237/**
@@ -459,7 +445,7 @@ struct link *tipc_link_create(struct bearer *b_ptr, const u32 peer,
459 445
460 k_init_timer(&l_ptr->timer, (Handler)link_timeout, (unsigned long)l_ptr); 446 k_init_timer(&l_ptr->timer, (Handler)link_timeout, (unsigned long)l_ptr);
461 list_add_tail(&l_ptr->link_list, &b_ptr->links); 447 list_add_tail(&l_ptr->link_list, &b_ptr->links);
462 tipc_k_signal((Handler)tipc_link_start, (unsigned long)l_ptr); 448 tipc_k_signal((Handler)link_start, (unsigned long)l_ptr);
463 449
464 dbg("tipc_link_create(): tolerance = %u,cont intv = %u, abort_limit = %u\n", 450 dbg("tipc_link_create(): tolerance = %u,cont intv = %u, abort_limit = %u\n",
465 l_ptr->tolerance, l_ptr->continuity_interval, l_ptr->abort_limit); 451 l_ptr->tolerance, l_ptr->continuity_interval, l_ptr->abort_limit);
@@ -499,9 +485,9 @@ void tipc_link_delete(struct link *l_ptr)
499 kfree(l_ptr); 485 kfree(l_ptr);
500} 486}
501 487
502void tipc_link_start(struct link *l_ptr) 488static void link_start(struct link *l_ptr)
503{ 489{
504 dbg("tipc_link_start %x\n", l_ptr); 490 dbg("link_start %x\n", l_ptr);
505 link_state_event(l_ptr, STARTING_EVT); 491 link_state_event(l_ptr, STARTING_EVT);
506} 492}
507 493
@@ -634,39 +620,9 @@ void tipc_link_stop(struct link *l_ptr)
634 l_ptr->proto_msg_queue = NULL; 620 l_ptr->proto_msg_queue = NULL;
635} 621}
636 622
637#if 0
638
639/* LINK EVENT CODE IS NOT SUPPORTED AT PRESENT */ 623/* LINK EVENT CODE IS NOT SUPPORTED AT PRESENT */
640
641static void link_recv_event(struct link_event *ev)
642{
643 ev->fcn(ev->addr, ev->name, ev->up);
644 kfree(ev);
645}
646
647static void link_send_event(void (*fcn)(u32 a, char *n, int up),
648 struct link *l_ptr, int up)
649{
650 struct link_event *ev;
651
652 ev = kmalloc(sizeof(*ev), GFP_ATOMIC);
653 if (!ev) {
654 warn("Link event allocation failure\n");
655 return;
656 }
657 ev->addr = l_ptr->addr;
658 ev->up = up;
659 ev->fcn = fcn;
660 memcpy(ev->name, l_ptr->name, TIPC_MAX_LINK_NAME);
661 tipc_k_signal((Handler)link_recv_event, (unsigned long)ev);
662}
663
664#else
665
666#define link_send_event(fcn, l_ptr, up) do { } while (0) 624#define link_send_event(fcn, l_ptr, up) do { } while (0)
667 625
668#endif
669
670void tipc_link_reset(struct link *l_ptr) 626void tipc_link_reset(struct link *l_ptr)
671{ 627{
672 struct sk_buff *buf; 628 struct sk_buff *buf;
@@ -690,10 +646,7 @@ void tipc_link_reset(struct link *l_ptr)
690 646
691 tipc_node_link_down(l_ptr->owner, l_ptr); 647 tipc_node_link_down(l_ptr->owner, l_ptr);
692 tipc_bearer_remove_dest(l_ptr->b_ptr, l_ptr->addr); 648 tipc_bearer_remove_dest(l_ptr->b_ptr, l_ptr->addr);
693#if 0 649
694 tipc_printf(TIPC_CONS, "\nReset link <%s>\n", l_ptr->name);
695 dbg_link_dump();
696#endif
697 if (was_active_link && tipc_node_has_active_links(l_ptr->owner) && 650 if (was_active_link && tipc_node_has_active_links(l_ptr->owner) &&
698 l_ptr->owner->permit_changeover) { 651 l_ptr->owner->permit_changeover) {
699 l_ptr->reset_checkpoint = checkpoint; 652 l_ptr->reset_checkpoint = checkpoint;
@@ -1050,7 +1003,7 @@ int tipc_link_send_buf(struct link *l_ptr, struct sk_buff *buf)
1050 /* Fragmentation needed ? */ 1003 /* Fragmentation needed ? */
1051 1004
1052 if (size > max_packet) 1005 if (size > max_packet)
1053 return tipc_link_send_long_buf(l_ptr, buf); 1006 return link_send_long_buf(l_ptr, buf);
1054 1007
1055 /* Packet can be queued or sent: */ 1008 /* Packet can be queued or sent: */
1056 1009
@@ -1086,7 +1039,7 @@ int tipc_link_send_buf(struct link *l_ptr, struct sk_buff *buf)
1086 /* Try creating a new bundle */ 1039 /* Try creating a new bundle */
1087 1040
1088 if (size <= max_packet * 2 / 3) { 1041 if (size <= max_packet * 2 / 3) {
1089 struct sk_buff *bundler = buf_acquire(max_packet); 1042 struct sk_buff *bundler = tipc_buf_acquire(max_packet);
1090 struct tipc_msg bundler_hdr; 1043 struct tipc_msg bundler_hdr;
1091 1044
1092 if (bundler) { 1045 if (bundler) {
@@ -1362,7 +1315,7 @@ again:
1362 1315
1363 /* Prepare header of first fragment: */ 1316 /* Prepare header of first fragment: */
1364 1317
1365 buf_chain = buf = buf_acquire(max_pkt); 1318 buf_chain = buf = tipc_buf_acquire(max_pkt);
1366 if (!buf) 1319 if (!buf)
1367 return -ENOMEM; 1320 return -ENOMEM;
1368 buf->next = NULL; 1321 buf->next = NULL;
@@ -1419,7 +1372,7 @@ error:
1419 msg_set_size(&fragm_hdr, fragm_sz + INT_H_SIZE); 1372 msg_set_size(&fragm_hdr, fragm_sz + INT_H_SIZE);
1420 msg_set_fragm_no(&fragm_hdr, ++fragm_no); 1373 msg_set_fragm_no(&fragm_hdr, ++fragm_no);
1421 prev = buf; 1374 prev = buf;
1422 buf = buf_acquire(fragm_sz + INT_H_SIZE); 1375 buf = tipc_buf_acquire(fragm_sz + INT_H_SIZE);
1423 if (!buf) 1376 if (!buf)
1424 goto error; 1377 goto error;
1425 1378
@@ -1802,6 +1755,15 @@ static int link_recv_buf_validate(struct sk_buff *buf)
1802 return pskb_may_pull(buf, hdr_size); 1755 return pskb_may_pull(buf, hdr_size);
1803} 1756}
1804 1757
1758/**
1759 * tipc_recv_msg - process TIPC messages arriving from off-node
1760 * @head: pointer to message buffer chain
1761 * @tb_ptr: pointer to bearer message arrived on
1762 *
1763 * Invoked with no locks held. Bearer pointer must point to a valid bearer
1764 * structure (i.e. cannot be NULL), but bearer can be inactive.
1765 */
1766
1805void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr) 1767void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr)
1806{ 1768{
1807 read_lock_bh(&tipc_net_lock); 1769 read_lock_bh(&tipc_net_lock);
@@ -1819,6 +1781,11 @@ void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr)
1819 1781
1820 head = head->next; 1782 head = head->next;
1821 1783
1784 /* Ensure bearer is still enabled */
1785
1786 if (unlikely(!b_ptr->active))
1787 goto cont;
1788
1822 /* Ensure message is well-formed */ 1789 /* Ensure message is well-formed */
1823 1790
1824 if (unlikely(!link_recv_buf_validate(buf))) 1791 if (unlikely(!link_recv_buf_validate(buf)))
@@ -1855,13 +1822,22 @@ void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr)
1855 goto cont; 1822 goto cont;
1856 } 1823 }
1857 1824
1858 /* Locate unicast link endpoint that should handle message */ 1825 /* Locate neighboring node that sent message */
1859 1826
1860 n_ptr = tipc_node_find(msg_prevnode(msg)); 1827 n_ptr = tipc_node_find(msg_prevnode(msg));
1861 if (unlikely(!n_ptr)) 1828 if (unlikely(!n_ptr))
1862 goto cont; 1829 goto cont;
1863 tipc_node_lock(n_ptr); 1830 tipc_node_lock(n_ptr);
1864 1831
1832 /* Don't talk to neighbor during cleanup after last session */
1833
1834 if (n_ptr->cleanup_required) {
1835 tipc_node_unlock(n_ptr);
1836 goto cont;
1837 }
1838
1839 /* Locate unicast link endpoint that should handle message */
1840
1865 l_ptr = n_ptr->links[b_ptr->identity]; 1841 l_ptr = n_ptr->links[b_ptr->identity];
1866 if (unlikely(!l_ptr)) { 1842 if (unlikely(!l_ptr)) {
1867 tipc_node_unlock(n_ptr); 1843 tipc_node_unlock(n_ptr);
@@ -2172,7 +2148,7 @@ void tipc_link_send_proto_msg(struct link *l_ptr, u32 msg_typ, int probe_msg,
2172 if (tipc_bearer_congested(l_ptr->b_ptr, l_ptr)) { 2148 if (tipc_bearer_congested(l_ptr->b_ptr, l_ptr)) {
2173 if (!l_ptr->proto_msg_queue) { 2149 if (!l_ptr->proto_msg_queue) {
2174 l_ptr->proto_msg_queue = 2150 l_ptr->proto_msg_queue =
2175 buf_acquire(sizeof(l_ptr->proto_msg)); 2151 tipc_buf_acquire(sizeof(l_ptr->proto_msg));
2176 } 2152 }
2177 buf = l_ptr->proto_msg_queue; 2153 buf = l_ptr->proto_msg_queue;
2178 if (!buf) 2154 if (!buf)
@@ -2186,7 +2162,7 @@ void tipc_link_send_proto_msg(struct link *l_ptr, u32 msg_typ, int probe_msg,
2186 2162
2187 msg_dbg(msg, ">>"); 2163 msg_dbg(msg, ">>");
2188 2164
2189 buf = buf_acquire(msg_size); 2165 buf = tipc_buf_acquire(msg_size);
2190 if (!buf) 2166 if (!buf)
2191 return; 2167 return;
2192 2168
@@ -2345,10 +2321,10 @@ exit:
2345 * tipc_link_tunnel(): Send one message via a link belonging to 2321 * tipc_link_tunnel(): Send one message via a link belonging to
2346 * another bearer. Owner node is locked. 2322 * another bearer. Owner node is locked.
2347 */ 2323 */
2348void tipc_link_tunnel(struct link *l_ptr, 2324static void tipc_link_tunnel(struct link *l_ptr,
2349 struct tipc_msg *tunnel_hdr, 2325 struct tipc_msg *tunnel_hdr,
2350 struct tipc_msg *msg, 2326 struct tipc_msg *msg,
2351 u32 selector) 2327 u32 selector)
2352{ 2328{
2353 struct link *tunnel; 2329 struct link *tunnel;
2354 struct sk_buff *buf; 2330 struct sk_buff *buf;
@@ -2361,7 +2337,7 @@ void tipc_link_tunnel(struct link *l_ptr,
2361 return; 2337 return;
2362 } 2338 }
2363 msg_set_size(tunnel_hdr, length + INT_H_SIZE); 2339 msg_set_size(tunnel_hdr, length + INT_H_SIZE);
2364 buf = buf_acquire(length + INT_H_SIZE); 2340 buf = tipc_buf_acquire(length + INT_H_SIZE);
2365 if (!buf) { 2341 if (!buf) {
2366 warn("Link changeover error, " 2342 warn("Link changeover error, "
2367 "unable to send tunnel msg\n"); 2343 "unable to send tunnel msg\n");
@@ -2407,7 +2383,7 @@ void tipc_link_changeover(struct link *l_ptr)
2407 if (!l_ptr->first_out) { 2383 if (!l_ptr->first_out) {
2408 struct sk_buff *buf; 2384 struct sk_buff *buf;
2409 2385
2410 buf = buf_acquire(INT_H_SIZE); 2386 buf = tipc_buf_acquire(INT_H_SIZE);
2411 if (buf) { 2387 if (buf) {
2412 skb_copy_to_linear_data(buf, &tunnel_hdr, INT_H_SIZE); 2388 skb_copy_to_linear_data(buf, &tunnel_hdr, INT_H_SIZE);
2413 msg_set_size(&tunnel_hdr, INT_H_SIZE); 2389 msg_set_size(&tunnel_hdr, INT_H_SIZE);
@@ -2468,7 +2444,7 @@ void tipc_link_send_duplicate(struct link *l_ptr, struct link *tunnel)
2468 msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); /* Update */ 2444 msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); /* Update */
2469 msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); 2445 msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in);
2470 msg_set_size(&tunnel_hdr, length + INT_H_SIZE); 2446 msg_set_size(&tunnel_hdr, length + INT_H_SIZE);
2471 outbuf = buf_acquire(length + INT_H_SIZE); 2447 outbuf = tipc_buf_acquire(length + INT_H_SIZE);
2472 if (outbuf == NULL) { 2448 if (outbuf == NULL) {
2473 warn("Link changeover error, " 2449 warn("Link changeover error, "
2474 "unable to send duplicate msg\n"); 2450 "unable to send duplicate msg\n");
@@ -2504,7 +2480,7 @@ static struct sk_buff *buf_extract(struct sk_buff *skb, u32 from_pos)
2504 u32 size = msg_size(msg); 2480 u32 size = msg_size(msg);
2505 struct sk_buff *eb; 2481 struct sk_buff *eb;
2506 2482
2507 eb = buf_acquire(size); 2483 eb = tipc_buf_acquire(size);
2508 if (eb) 2484 if (eb)
2509 skb_copy_to_linear_data(eb, msg, size); 2485 skb_copy_to_linear_data(eb, msg, size);
2510 return eb; 2486 return eb;
@@ -2632,11 +2608,11 @@ void tipc_link_recv_bundle(struct sk_buff *buf)
2632 2608
2633 2609
2634/* 2610/*
2635 * tipc_link_send_long_buf: Entry for buffers needing fragmentation. 2611 * link_send_long_buf: Entry for buffers needing fragmentation.
2636 * The buffer is complete, inclusive total message length. 2612 * The buffer is complete, inclusive total message length.
2637 * Returns user data length. 2613 * Returns user data length.
2638 */ 2614 */
2639int tipc_link_send_long_buf(struct link *l_ptr, struct sk_buff *buf) 2615static int link_send_long_buf(struct link *l_ptr, struct sk_buff *buf)
2640{ 2616{
2641 struct tipc_msg *inmsg = buf_msg(buf); 2617 struct tipc_msg *inmsg = buf_msg(buf);
2642 struct tipc_msg fragm_hdr; 2618 struct tipc_msg fragm_hdr;
@@ -2675,7 +2651,7 @@ int tipc_link_send_long_buf(struct link *l_ptr, struct sk_buff *buf)
2675 fragm_sz = rest; 2651 fragm_sz = rest;
2676 msg_set_type(&fragm_hdr, LAST_FRAGMENT); 2652 msg_set_type(&fragm_hdr, LAST_FRAGMENT);
2677 } 2653 }
2678 fragm = buf_acquire(fragm_sz + INT_H_SIZE); 2654 fragm = tipc_buf_acquire(fragm_sz + INT_H_SIZE);
2679 if (fragm == NULL) { 2655 if (fragm == NULL) {
2680 warn("Link unable to fragment message\n"); 2656 warn("Link unable to fragment message\n");
2681 dsz = -ENOMEM; 2657 dsz = -ENOMEM;
@@ -2780,7 +2756,7 @@ int tipc_link_recv_fragment(struct sk_buff **pending, struct sk_buff **fb,
2780 buf_discard(fbuf); 2756 buf_discard(fbuf);
2781 return 0; 2757 return 0;
2782 } 2758 }
2783 pbuf = buf_acquire(msg_size(imsg)); 2759 pbuf = tipc_buf_acquire(msg_size(imsg));
2784 if (pbuf != NULL) { 2760 if (pbuf != NULL) {
2785 pbuf->next = *pending; 2761 pbuf->next = *pending;
2786 *pending = pbuf; 2762 *pending = pbuf;
@@ -3174,44 +3150,6 @@ struct sk_buff *tipc_link_cmd_show_stats(const void *req_tlv_area, int req_tlv_s
3174 return buf; 3150 return buf;
3175} 3151}
3176 3152
3177#if 0
3178int link_control(const char *name, u32 op, u32 val)
3179{
3180 int res = -EINVAL;
3181 struct link *l_ptr;
3182 u32 bearer_id;
3183 struct tipc_node * node;
3184 u32 a;
3185
3186 a = link_name2addr(name, &bearer_id);
3187 read_lock_bh(&tipc_net_lock);
3188 node = tipc_node_find(a);
3189 if (node) {
3190 tipc_node_lock(node);
3191 l_ptr = node->links[bearer_id];
3192 if (l_ptr) {
3193 if (op == TIPC_REMOVE_LINK) {
3194 struct bearer *b_ptr = l_ptr->b_ptr;
3195 spin_lock_bh(&b_ptr->publ.lock);
3196 tipc_link_delete(l_ptr);
3197 spin_unlock_bh(&b_ptr->publ.lock);
3198 }
3199 if (op == TIPC_CMD_BLOCK_LINK) {
3200 tipc_link_reset(l_ptr);
3201 l_ptr->blocked = 1;
3202 }
3203 if (op == TIPC_CMD_UNBLOCK_LINK) {
3204 l_ptr->blocked = 0;
3205 }
3206 res = 0;
3207 }
3208 tipc_node_unlock(node);
3209 }
3210 read_unlock_bh(&tipc_net_lock);
3211 return res;
3212}
3213#endif
3214
3215/** 3153/**
3216 * tipc_link_get_max_pkt - get maximum packet size to use when sending to destination 3154 * tipc_link_get_max_pkt - get maximum packet size to use when sending to destination
3217 * @dest: network address of destination node 3155 * @dest: network address of destination node
@@ -3242,28 +3180,6 @@ u32 tipc_link_get_max_pkt(u32 dest, u32 selector)
3242 return res; 3180 return res;
3243} 3181}
3244 3182
3245#if 0
3246static void link_dump_rec_queue(struct link *l_ptr)
3247{
3248 struct sk_buff *crs;
3249
3250 if (!l_ptr->oldest_deferred_in) {
3251 info("Reception queue empty\n");
3252 return;
3253 }
3254 info("Contents of Reception queue:\n");
3255 crs = l_ptr->oldest_deferred_in;
3256 while (crs) {
3257 if (crs->data == (void *)0x0000a3a3) {
3258 info("buffer %x invalid\n", crs);
3259 return;
3260 }
3261 msg_dbg(buf_msg(crs), "In rec queue:\n");
3262 crs = crs->next;
3263 }
3264}
3265#endif
3266
3267static void link_dump_send_queue(struct link *l_ptr) 3183static void link_dump_send_queue(struct link *l_ptr)
3268{ 3184{
3269 if (l_ptr->next_out) { 3185 if (l_ptr->next_out) {
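
Note: among the link.c changes above, link_send_long_buf() (formerly tipc_link_send_long_buf) splits an oversized message into fragments, each acquiring a buffer of fragm_sz payload bytes plus an INT_H_SIZE fragment header, with the final fragment flagged LAST_FRAGMENT and sized to whatever payload remains. A sketch of that sizing loop; header size and payload-per-fragment are treated as plain parameters with illustrative values.

#include <stdio.h>

/* Model of the fragment sizing in link_send_long_buf(): every fragment
 * carries fragm_sz bytes of payload plus a fragment header, except the
 * last, which carries whatever remains. */
static void fragment_demo(int total_payload, int fragm_sz, int hdr_sz)
{
        int rest = total_payload;
        int fragm_no = 0;

        while (rest > 0) {
                int this_sz = (rest <= fragm_sz) ? rest : fragm_sz;
                int last = (rest <= fragm_sz);

                fragm_no++;
                printf("fragment %d: buffer %d bytes%s\n",
                       fragm_no, this_sz + hdr_sz,
                       last ? " (LAST_FRAGMENT)" : "");
                rest -= this_sz;
        }
}

int main(void)
{
        /* 2500 bytes of payload, 1000-byte fragments, 40-byte header
         * (all values illustrative, not taken from msg.h). */
        fragment_demo(2500, 1000, 40);
        return 0;
}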
diff --git a/net/tipc/link.h b/net/tipc/link.h
index 2e5385c47d30..f98bc613de67 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -210,10 +210,6 @@ struct link {
210 u32 msg_length_counts; 210 u32 msg_length_counts;
211 u32 msg_lengths_total; 211 u32 msg_lengths_total;
212 u32 msg_length_profile[7]; 212 u32 msg_length_profile[7];
213#if 0
214 u32 sent_tunneled;
215 u32 recv_tunneled;
216#endif
217 } stats; 213 } stats;
218 214
219 struct print_buf print_buf; 215 struct print_buf print_buf;
@@ -229,7 +225,6 @@ void tipc_link_send_duplicate(struct link *l_ptr, struct link *dest);
229void tipc_link_reset_fragments(struct link *l_ptr); 225void tipc_link_reset_fragments(struct link *l_ptr);
230int tipc_link_is_up(struct link *l_ptr); 226int tipc_link_is_up(struct link *l_ptr);
231int tipc_link_is_active(struct link *l_ptr); 227int tipc_link_is_active(struct link *l_ptr);
232void tipc_link_start(struct link *l_ptr);
233u32 tipc_link_push_packet(struct link *l_ptr); 228u32 tipc_link_push_packet(struct link *l_ptr);
234void tipc_link_stop(struct link *l_ptr); 229void tipc_link_stop(struct link *l_ptr);
235struct sk_buff *tipc_link_cmd_config(const void *req_tlv_area, int req_tlv_space, u16 cmd); 230struct sk_buff *tipc_link_cmd_config(const void *req_tlv_area, int req_tlv_space, u16 cmd);
@@ -243,9 +238,6 @@ int tipc_link_send_sections_fast(struct port* sender,
243 struct iovec const *msg_sect, 238 struct iovec const *msg_sect,
244 const u32 num_sect, 239 const u32 num_sect,
245 u32 destnode); 240 u32 destnode);
246int tipc_link_send_long_buf(struct link *l_ptr, struct sk_buff *buf);
247void tipc_link_tunnel(struct link *l_ptr, struct tipc_msg *tnl_hdr,
248 struct tipc_msg *msg, u32 selector);
249void tipc_link_recv_bundle(struct sk_buff *buf); 241void tipc_link_recv_bundle(struct sk_buff *buf);
250int tipc_link_recv_fragment(struct sk_buff **pending, 242int tipc_link_recv_fragment(struct sk_buff **pending,
251 struct sk_buff **fb, 243 struct sk_buff **fb,
@@ -279,12 +271,12 @@ static inline int between(u32 lower, u32 upper, u32 n)
279 271
280static inline int less_eq(u32 left, u32 right) 272static inline int less_eq(u32 left, u32 right)
281{ 273{
282 return (mod(right - left) < 32768u); 274 return mod(right - left) < 32768u;
283} 275}
284 276
285static inline int less(u32 left, u32 right) 277static inline int less(u32 left, u32 right)
286{ 278{
287 return (less_eq(left, right) && (mod(right) != mod(left))); 279 return less_eq(left, right) && (mod(right) != mod(left));
288} 280}
289 281
290static inline u32 lesser(u32 left, u32 right) 282static inline u32 lesser(u32 left, u32 right)
@@ -299,32 +291,32 @@ static inline u32 lesser(u32 left, u32 right)
299 291
300static inline int link_working_working(struct link *l_ptr) 292static inline int link_working_working(struct link *l_ptr)
301{ 293{
302 return (l_ptr->state == WORKING_WORKING); 294 return l_ptr->state == WORKING_WORKING;
303} 295}
304 296
305static inline int link_working_unknown(struct link *l_ptr) 297static inline int link_working_unknown(struct link *l_ptr)
306{ 298{
307 return (l_ptr->state == WORKING_UNKNOWN); 299 return l_ptr->state == WORKING_UNKNOWN;
308} 300}
309 301
310static inline int link_reset_unknown(struct link *l_ptr) 302static inline int link_reset_unknown(struct link *l_ptr)
311{ 303{
312 return (l_ptr->state == RESET_UNKNOWN); 304 return l_ptr->state == RESET_UNKNOWN;
313} 305}
314 306
315static inline int link_reset_reset(struct link *l_ptr) 307static inline int link_reset_reset(struct link *l_ptr)
316{ 308{
317 return (l_ptr->state == RESET_RESET); 309 return l_ptr->state == RESET_RESET;
318} 310}
319 311
320static inline int link_blocked(struct link *l_ptr) 312static inline int link_blocked(struct link *l_ptr)
321{ 313{
322 return (l_ptr->exp_msg_count || l_ptr->blocked); 314 return l_ptr->exp_msg_count || l_ptr->blocked;
323} 315}
324 316
325static inline int link_congested(struct link *l_ptr) 317static inline int link_congested(struct link *l_ptr)
326{ 318{
327 return (l_ptr->out_queue_size >= l_ptr->queue_limit[0]); 319 return l_ptr->out_queue_size >= l_ptr->queue_limit[0];
328} 320}
329 321
330#endif 322#endif
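
The less_eq()/less() helpers retained above perform wraparound-safe comparison of TIPC link sequence numbers. The following stand-alone sketch (not part of this patch) reproduces the idea in user space; it assumes mod(x) masks to a 16-bit sequence space, which is how the TIPC header normally defines it.

#include <assert.h>

/* assumption: TIPC link sequence numbers live in a 16-bit space */
static unsigned int mod(unsigned int x)
{
	return x & 0xffff;
}

/* "left" precedes or equals "right" if it is no more than half the
 * sequence space behind it; this stays correct across wraparound */
static int less_eq(unsigned int left, unsigned int right)
{
	return mod(right - left) < 32768u;
}

int main(void)
{
	assert(less_eq(10, 20));     /* ordinary case             */
	assert(less_eq(65530, 3));   /* still ordered across wrap */
	assert(!less_eq(3, 65530));  /* 3 is ahead of 65530 here  */
	return 0;
}
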
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 381063817b41..ecb532fb0351 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -112,7 +112,7 @@ int tipc_msg_build(struct tipc_msg *hdr,
112 return dsz; 112 return dsz;
113 } 113 }
114 114
115 *buf = buf_acquire(sz); 115 *buf = tipc_buf_acquire(sz);
116 if (!(*buf)) 116 if (!(*buf))
117 return -ENOMEM; 117 return -ENOMEM;
118 skb_copy_to_linear_data(*buf, hdr, hsz); 118 skb_copy_to_linear_data(*buf, hdr, hsz);
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 995d2da35b01..031aad18efce 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -104,7 +104,7 @@ static inline u32 msg_user(struct tipc_msg *m)
104 104
105static inline u32 msg_isdata(struct tipc_msg *m) 105static inline u32 msg_isdata(struct tipc_msg *m)
106{ 106{
107 return (msg_user(m) <= TIPC_CRITICAL_IMPORTANCE); 107 return msg_user(m) <= TIPC_CRITICAL_IMPORTANCE;
108} 108}
109 109
110static inline void msg_set_user(struct tipc_msg *m, u32 n) 110static inline void msg_set_user(struct tipc_msg *m, u32 n)
@@ -289,7 +289,7 @@ static inline void msg_set_destnode(struct tipc_msg *m, u32 a)
289 289
290static inline int msg_is_dest(struct tipc_msg *m, u32 d) 290static inline int msg_is_dest(struct tipc_msg *m, u32 d)
291{ 291{
292 return(msg_short(m) || (msg_destnode(m) == d)); 292 return msg_short(m) || (msg_destnode(m) == d);
293} 293}
294 294
295static inline u32 msg_routed(struct tipc_msg *m) 295static inline u32 msg_routed(struct tipc_msg *m)
@@ -632,7 +632,7 @@ static inline void msg_set_bcast_tag(struct tipc_msg *m, u32 n)
632 632
633static inline u32 msg_max_pkt(struct tipc_msg *m) 633static inline u32 msg_max_pkt(struct tipc_msg *m)
634{ 634{
635 return (msg_bits(m, 9, 16, 0xffff) * 4); 635 return msg_bits(m, 9, 16, 0xffff) * 4;
636} 636}
637 637
638static inline void msg_set_max_pkt(struct tipc_msg *m, u32 n) 638static inline void msg_set_max_pkt(struct tipc_msg *m, u32 n)
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 6ac3c543250b..7b907171f879 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -98,7 +98,7 @@ static void publ_to_item(struct distr_item *i, struct publication *p)
98 98
99static struct sk_buff *named_prepare_buf(u32 type, u32 size, u32 dest) 99static struct sk_buff *named_prepare_buf(u32 type, u32 size, u32 dest)
100{ 100{
101 struct sk_buff *buf = buf_acquire(LONG_H_SIZE + size); 101 struct sk_buff *buf = tipc_buf_acquire(LONG_H_SIZE + size);
102 struct tipc_msg *msg; 102 struct tipc_msg *msg;
103 103
104 if (buf != NULL) { 104 if (buf != NULL) {
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 8ba79620db3f..3a8de4334da1 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -116,7 +116,7 @@ DEFINE_RWLOCK(tipc_nametbl_lock);
116 116
117static int hash(int x) 117static int hash(int x)
118{ 118{
119 return(x & (tipc_nametbl_size - 1)); 119 return x & (tipc_nametbl_size - 1);
120} 120}
121 121
122/** 122/**
@@ -613,8 +613,7 @@ struct publication *tipc_nametbl_remove_publ(u32 type, u32 lower,
613} 613}
614 614
615/* 615/*
616 * tipc_nametbl_translate(): Translate tipc_name -> tipc_portid. 616 * tipc_nametbl_translate - translate name to port id
617 * Very time-critical.
618 * 617 *
619 * Note: on entry 'destnode' is the search domain used during translation; 618 * Note: on entry 'destnode' is the search domain used during translation;
620 * on exit it passes back the node address of the matching port (if any) 619 * on exit it passes back the node address of the matching port (if any)
@@ -685,7 +684,6 @@ found:
685 } 684 }
686 spin_unlock_bh(&seq->lock); 685 spin_unlock_bh(&seq->lock);
687not_found: 686not_found:
688 *destnode = 0;
689 read_unlock_bh(&tipc_nametbl_lock); 687 read_unlock_bh(&tipc_nametbl_lock);
690 return 0; 688 return 0;
691} 689}
@@ -877,7 +875,7 @@ static void subseq_list(struct sub_seq *sseq, struct print_buf *buf, u32 depth,
877 u32 index) 875 u32 index)
878{ 876{
879 char portIdStr[27]; 877 char portIdStr[27];
880 char *scopeStr; 878 const char *scope_str[] = {"", " zone", " cluster", " node"};
881 struct publication *publ = sseq->zone_list; 879 struct publication *publ = sseq->zone_list;
882 880
883 tipc_printf(buf, "%-10u %-10u ", sseq->lower, sseq->upper); 881 tipc_printf(buf, "%-10u %-10u ", sseq->lower, sseq->upper);
@@ -893,15 +891,8 @@ static void subseq_list(struct sub_seq *sseq, struct print_buf *buf, u32 depth,
893 tipc_node(publ->node), publ->ref); 891 tipc_node(publ->node), publ->ref);
894 tipc_printf(buf, "%-26s ", portIdStr); 892 tipc_printf(buf, "%-26s ", portIdStr);
895 if (depth > 3) { 893 if (depth > 3) {
896 if (publ->node != tipc_own_addr) 894 tipc_printf(buf, "%-10u %s", publ->key,
897 scopeStr = ""; 895 scope_str[publ->scope]);
898 else if (publ->scope == TIPC_NODE_SCOPE)
899 scopeStr = "node";
900 else if (publ->scope == TIPC_CLUSTER_SCOPE)
901 scopeStr = "cluster";
902 else
903 scopeStr = "zone";
904 tipc_printf(buf, "%-10u %s", publ->key, scopeStr);
905 } 896 }
906 897
907 publ = publ->zone_list_next; 898 publ = publ->zone_list_next;
@@ -951,24 +942,19 @@ static void nameseq_list(struct name_seq *seq, struct print_buf *buf, u32 depth,
951 942
952static void nametbl_header(struct print_buf *buf, u32 depth) 943static void nametbl_header(struct print_buf *buf, u32 depth)
953{ 944{
954 tipc_printf(buf, "Type "); 945 const char *header[] = {
955 946 "Type ",
956 if (depth > 1) 947 "Lower Upper ",
957 tipc_printf(buf, "Lower Upper "); 948 "Port Identity ",
958 if (depth > 2) 949 "Publication Scope"
959 tipc_printf(buf, "Port Identity "); 950 };
960 if (depth > 3) 951
961 tipc_printf(buf, "Publication"); 952 int i;
962 953
963 tipc_printf(buf, "\n-----------"); 954 if (depth > 4)
964 955 depth = 4;
965 if (depth > 1) 956 for (i = 0; i < depth; i++)
966 tipc_printf(buf, "--------------------- "); 957 tipc_printf(buf, header[i]);
967 if (depth > 2)
968 tipc_printf(buf, "-------------------------- ");
969 if (depth > 3)
970 tipc_printf(buf, "------------------");
971
972 tipc_printf(buf, "\n"); 958 tipc_printf(buf, "\n");
973} 959}
974 960
@@ -1023,16 +1009,6 @@ static void nametbl_list(struct print_buf *buf, u32 depth_info,
1023 } 1009 }
1024} 1010}
1025 1011
1026#if 0
1027void tipc_nametbl_print(struct print_buf *buf, const char *str)
1028{
1029 tipc_printf(buf, str);
1030 read_lock_bh(&tipc_nametbl_lock);
1031 nametbl_list(buf, 0, 0, 0, 0);
1032 read_unlock_bh(&tipc_nametbl_lock);
1033}
1034#endif
1035
1036#define MAX_NAME_TBL_QUERY 32768 1012#define MAX_NAME_TBL_QUERY 32768
1037 1013
1038struct sk_buff *tipc_nametbl_get(const void *req_tlv_area, int req_tlv_space) 1014struct sk_buff *tipc_nametbl_get(const void *req_tlv_area, int req_tlv_space)
@@ -1065,13 +1041,6 @@ struct sk_buff *tipc_nametbl_get(const void *req_tlv_area, int req_tlv_space)
1065 return buf; 1041 return buf;
1066} 1042}
1067 1043
1068#if 0
1069void tipc_nametbl_dump(void)
1070{
1071 nametbl_list(TIPC_CONS, 0, 0, 0, 0);
1072}
1073#endif
1074
1075int tipc_nametbl_init(void) 1044int tipc_nametbl_init(void)
1076{ 1045{
1077 table.types = kcalloc(tipc_nametbl_size, sizeof(struct hlist_head), 1046 table.types = kcalloc(tipc_nametbl_size, sizeof(struct hlist_head),
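
The subseq_list() and nametbl_header() hunks above replace repeated if/else chains with constant lookup tables indexed by small integer codes. A minimal user-space sketch of the same idiom follows; the scope values (1 = zone, 2 = cluster, 3 = node) are assumptions based on the usual TIPC definitions and are not shown in this hunk.

#include <stdio.h>

int main(void)
{
	/* index 0 is unused padding so scope values map directly to strings */
	static const char * const scope_str[] = {"", " zone", " cluster", " node"};
	unsigned int scope;

	for (scope = 1; scope <= 3; scope++)
		printf("scope %u ->%s\n", scope, scope_str[scope]);
	return 0;
}
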
diff --git a/net/tipc/net.c b/net/tipc/net.c
index f61b7694138b..1a621cfd6604 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -129,15 +129,6 @@ u32 tipc_net_select_router(u32 addr, u32 ref)
129 return tipc_zone_select_router(tipc_net.zones[tipc_zone(addr)], addr, ref); 129 return tipc_zone_select_router(tipc_net.zones[tipc_zone(addr)], addr, ref);
130} 130}
131 131
132#if 0
133u32 tipc_net_next_node(u32 a)
134{
135 if (tipc_net.zones[tipc_zone(a)])
136 return tipc_zone_next_node(a);
137 return 0;
138}
139#endif
140
141void tipc_net_remove_as_router(u32 router) 132void tipc_net_remove_as_router(u32 router)
142{ 133{
143 u32 z_num; 134 u32 z_num;
@@ -248,6 +239,7 @@ void tipc_net_route_msg(struct sk_buff *buf)
248 239
249 /* Handle message for another node */ 240 /* Handle message for another node */
250 msg_dbg(msg, "NET>SEND>: "); 241 msg_dbg(msg, "NET>SEND>: ");
242 skb_trim(buf, msg_size(msg));
251 tipc_link_send(buf, dnode, msg_link_selector(msg)); 243 tipc_link_send(buf, dnode, msg_link_selector(msg));
252} 244}
253 245
diff --git a/net/tipc/node.c b/net/tipc/node.c
index b634942caba5..b4d87eb2dc5d 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -50,7 +50,8 @@ void node_print(struct print_buf *buf, struct tipc_node *n_ptr, char *str);
50static void node_lost_contact(struct tipc_node *n_ptr); 50static void node_lost_contact(struct tipc_node *n_ptr);
51static void node_established_contact(struct tipc_node *n_ptr); 51static void node_established_contact(struct tipc_node *n_ptr);
52 52
53struct tipc_node *tipc_nodes = NULL; /* sorted list of nodes within cluster */ 53/* sorted list of nodes within cluster */
54static struct tipc_node *tipc_nodes = NULL;
54 55
55static DEFINE_SPINLOCK(node_create_lock); 56static DEFINE_SPINLOCK(node_create_lock);
56 57
@@ -125,16 +126,6 @@ void tipc_node_delete(struct tipc_node *n_ptr)
125 if (!n_ptr) 126 if (!n_ptr)
126 return; 127 return;
127 128
128#if 0
129 /* Not needed because links are already deleted via tipc_bearer_stop() */
130
131 u32 l_num;
132
133 for (l_num = 0; l_num < MAX_BEARERS; l_num++) {
134 link_delete(n_ptr->links[l_num]);
135 }
136#endif
137
138 dbg("node %x deleted\n", n_ptr->addr); 129 dbg("node %x deleted\n", n_ptr->addr);
139 kfree(n_ptr); 130 kfree(n_ptr);
140} 131}
@@ -237,23 +228,22 @@ void tipc_node_link_down(struct tipc_node *n_ptr, struct link *l_ptr)
237 228
238int tipc_node_has_active_links(struct tipc_node *n_ptr) 229int tipc_node_has_active_links(struct tipc_node *n_ptr)
239{ 230{
240 return (n_ptr && 231 return n_ptr->active_links[0] != NULL;
241 ((n_ptr->active_links[0]) || (n_ptr->active_links[1])));
242} 232}
243 233
244int tipc_node_has_redundant_links(struct tipc_node *n_ptr) 234int tipc_node_has_redundant_links(struct tipc_node *n_ptr)
245{ 235{
246 return (n_ptr->working_links > 1); 236 return n_ptr->working_links > 1;
247} 237}
248 238
249static int tipc_node_has_active_routes(struct tipc_node *n_ptr) 239static int tipc_node_has_active_routes(struct tipc_node *n_ptr)
250{ 240{
251 return (n_ptr && (n_ptr->last_router >= 0)); 241 return n_ptr && (n_ptr->last_router >= 0);
252} 242}
253 243
254int tipc_node_is_up(struct tipc_node *n_ptr) 244int tipc_node_is_up(struct tipc_node *n_ptr)
255{ 245{
256 return (tipc_node_has_active_links(n_ptr) || tipc_node_has_active_routes(n_ptr)); 246 return tipc_node_has_active_links(n_ptr) || tipc_node_has_active_routes(n_ptr);
257} 247}
258 248
259struct tipc_node *tipc_node_attach_link(struct link *l_ptr) 249struct tipc_node *tipc_node_attach_link(struct link *l_ptr)
@@ -384,6 +374,20 @@ static void node_established_contact(struct tipc_node *n_ptr)
384 tipc_highest_allowed_slave); 374 tipc_highest_allowed_slave);
385} 375}
386 376
377static void node_cleanup_finished(unsigned long node_addr)
378{
379 struct tipc_node *n_ptr;
380
381 read_lock_bh(&tipc_net_lock);
382 n_ptr = tipc_node_find(node_addr);
383 if (n_ptr) {
384 tipc_node_lock(n_ptr);
385 n_ptr->cleanup_required = 0;
386 tipc_node_unlock(n_ptr);
387 }
388 read_unlock_bh(&tipc_net_lock);
389}
390
387static void node_lost_contact(struct tipc_node *n_ptr) 391static void node_lost_contact(struct tipc_node *n_ptr)
388{ 392{
389 struct cluster *c_ptr; 393 struct cluster *c_ptr;
@@ -458,6 +462,11 @@ static void node_lost_contact(struct tipc_node *n_ptr)
458 tipc_k_signal((Handler)ns->handle_node_down, 462 tipc_k_signal((Handler)ns->handle_node_down,
459 (unsigned long)ns->usr_handle); 463 (unsigned long)ns->usr_handle);
460 } 464 }
465
466 /* Prevent re-contact with node until all cleanup is done */
467
468 n_ptr->cleanup_required = 1;
469 tipc_k_signal((Handler)node_cleanup_finished, n_ptr->addr);
461} 470}
462 471
463/** 472/**
@@ -579,38 +588,6 @@ void tipc_node_remove_router(struct tipc_node *n_ptr, u32 router)
579 node_lost_contact(n_ptr); 588 node_lost_contact(n_ptr);
580} 589}
581 590
582#if 0
583void node_print(struct print_buf *buf, struct tipc_node *n_ptr, char *str)
584{
585 u32 i;
586
587 tipc_printf(buf, "\n\n%s", str);
588 for (i = 0; i < MAX_BEARERS; i++) {
589 if (!n_ptr->links[i])
590 continue;
591 tipc_printf(buf, "Links[%u]: %x, ", i, n_ptr->links[i]);
592 }
593 tipc_printf(buf, "Active links: [%x,%x]\n",
594 n_ptr->active_links[0], n_ptr->active_links[1]);
595}
596#endif
597
598u32 tipc_available_nodes(const u32 domain)
599{
600 struct tipc_node *n_ptr;
601 u32 cnt = 0;
602
603 read_lock_bh(&tipc_net_lock);
604 for (n_ptr = tipc_nodes; n_ptr; n_ptr = n_ptr->next) {
605 if (!tipc_in_scope(domain, n_ptr->addr))
606 continue;
607 if (tipc_node_is_up(n_ptr))
608 cnt++;
609 }
610 read_unlock_bh(&tipc_net_lock);
611 return cnt;
612}
613
614struct sk_buff *tipc_node_get_nodes(const void *req_tlv_area, int req_tlv_space) 591struct sk_buff *tipc_node_get_nodes(const void *req_tlv_area, int req_tlv_space)
615{ 592{
616 u32 domain; 593 u32 domain;
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 6f990da5d143..fff331b2d26c 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -52,6 +52,7 @@
52 * @active_links: pointers to active links to node 52 * @active_links: pointers to active links to node
53 * @links: pointers to all links to node 53 * @links: pointers to all links to node
54 * @working_links: number of working links to node (both active and standby) 54 * @working_links: number of working links to node (both active and standby)
55 * @cleanup_required: non-zero if cleaning up after a prior loss of contact
55 * @link_cnt: number of links to node 56 * @link_cnt: number of links to node
56 * @permit_changeover: non-zero if node has redundant links to this system 57 * @permit_changeover: non-zero if node has redundant links to this system
57 * @routers: bitmap (used for multicluster communication) 58 * @routers: bitmap (used for multicluster communication)
@@ -78,6 +79,7 @@ struct tipc_node {
78 struct link *links[MAX_BEARERS]; 79 struct link *links[MAX_BEARERS];
79 int link_cnt; 80 int link_cnt;
80 int working_links; 81 int working_links;
82 int cleanup_required;
81 int permit_changeover; 83 int permit_changeover;
82 u32 routers[512/32]; 84 u32 routers[512/32];
83 int last_router; 85 int last_router;
@@ -94,7 +96,6 @@ struct tipc_node {
94 } bclink; 96 } bclink;
95}; 97};
96 98
97extern struct tipc_node *tipc_nodes;
98extern u32 tipc_own_tag; 99extern u32 tipc_own_tag;
99 100
100struct tipc_node *tipc_node_create(u32 addr); 101struct tipc_node *tipc_node_create(u32 addr);
diff --git a/net/tipc/port.c b/net/tipc/port.c
index 0737680e9266..82092eaa1536 100644
--- a/net/tipc/port.c
+++ b/net/tipc/port.c
@@ -293,34 +293,6 @@ int tipc_deleteport(u32 ref)
293 return 0; 293 return 0;
294} 294}
295 295
296/**
297 * tipc_get_port() - return port associated with 'ref'
298 *
299 * Note: Port is not locked.
300 */
301
302struct tipc_port *tipc_get_port(const u32 ref)
303{
304 return (struct tipc_port *)tipc_ref_deref(ref);
305}
306
307/**
308 * tipc_get_handle - return user handle associated to port 'ref'
309 */
310
311void *tipc_get_handle(const u32 ref)
312{
313 struct port *p_ptr;
314 void * handle;
315
316 p_ptr = tipc_port_lock(ref);
317 if (!p_ptr)
318 return NULL;
319 handle = p_ptr->publ.usr_handle;
320 tipc_port_unlock(p_ptr);
321 return handle;
322}
323
324static int port_unreliable(struct port *p_ptr) 296static int port_unreliable(struct port *p_ptr)
325{ 297{
326 return msg_src_droppable(&p_ptr->publ.phdr); 298 return msg_src_droppable(&p_ptr->publ.phdr);
@@ -392,7 +364,7 @@ static struct sk_buff *port_build_proto_msg(u32 destport, u32 destnode,
392 struct sk_buff *buf; 364 struct sk_buff *buf;
393 struct tipc_msg *msg; 365 struct tipc_msg *msg;
394 366
395 buf = buf_acquire(LONG_H_SIZE); 367 buf = tipc_buf_acquire(LONG_H_SIZE);
396 if (buf) { 368 if (buf) {
397 msg = buf_msg(buf); 369 msg = buf_msg(buf);
398 tipc_msg_init(msg, usr, type, LONG_H_SIZE, destnode); 370 tipc_msg_init(msg, usr, type, LONG_H_SIZE, destnode);
@@ -433,7 +405,7 @@ int tipc_reject_msg(struct sk_buff *buf, u32 err)
433 hdr_sz = MCAST_H_SIZE; 405 hdr_sz = MCAST_H_SIZE;
434 else 406 else
435 hdr_sz = LONG_H_SIZE; 407 hdr_sz = LONG_H_SIZE;
436 rbuf = buf_acquire(data_sz + hdr_sz); 408 rbuf = tipc_buf_acquire(data_sz + hdr_sz);
437 if (rbuf == NULL) { 409 if (rbuf == NULL) {
438 buf_discard(buf); 410 buf_discard(buf);
439 return data_sz; 411 return data_sz;
@@ -588,19 +560,10 @@ void tipc_port_recv_proto_msg(struct sk_buff *buf)
588 if (!p_ptr) { 560 if (!p_ptr) {
589 err = TIPC_ERR_NO_PORT; 561 err = TIPC_ERR_NO_PORT;
590 } else if (p_ptr->publ.connected) { 562 } else if (p_ptr->publ.connected) {
591 if (port_peernode(p_ptr) != msg_orignode(msg)) 563 if ((port_peernode(p_ptr) != msg_orignode(msg)) ||
564 (port_peerport(p_ptr) != msg_origport(msg))) {
592 err = TIPC_ERR_NO_PORT; 565 err = TIPC_ERR_NO_PORT;
593 if (port_peerport(p_ptr) != msg_origport(msg)) 566 } else if (msg_type(msg) == CONN_ACK) {
594 err = TIPC_ERR_NO_PORT;
595 if (!err && msg_routed(msg)) {
596 u32 seqno = msg_transp_seqno(msg);
597 u32 myno = ++p_ptr->last_in_seqno;
598 if (seqno != myno) {
599 err = TIPC_ERR_NO_PORT;
600 abort_buf = port_build_self_abort_msg(p_ptr, err);
601 }
602 }
603 if (msg_type(msg) == CONN_ACK) {
604 int wakeup = tipc_port_congested(p_ptr) && 567 int wakeup = tipc_port_congested(p_ptr) &&
605 p_ptr->publ.congested && 568 p_ptr->publ.congested &&
606 p_ptr->wakeup; 569 p_ptr->wakeup;
@@ -719,50 +682,6 @@ struct sk_buff *tipc_port_get_ports(void)
719 return buf; 682 return buf;
720} 683}
721 684
722#if 0
723
724#define MAX_PORT_STATS 2000
725
726struct sk_buff *port_show_stats(const void *req_tlv_area, int req_tlv_space)
727{
728 u32 ref;
729 struct port *p_ptr;
730 struct sk_buff *buf;
731 struct tlv_desc *rep_tlv;
732 struct print_buf pb;
733 int str_len;
734
735 if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_PORT_REF))
736 return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
737
738 ref = *(u32 *)TLV_DATA(req_tlv_area);
739 ref = ntohl(ref);
740
741 p_ptr = tipc_port_lock(ref);
742 if (!p_ptr)
743 return cfg_reply_error_string("port not found");
744
745 buf = tipc_cfg_reply_alloc(TLV_SPACE(MAX_PORT_STATS));
746 if (!buf) {
747 tipc_port_unlock(p_ptr);
748 return NULL;
749 }
750 rep_tlv = (struct tlv_desc *)buf->data;
751
752 tipc_printbuf_init(&pb, TLV_DATA(rep_tlv), MAX_PORT_STATS);
753 port_print(p_ptr, &pb, 1);
754 /* NEED TO FILL IN ADDITIONAL PORT STATISTICS HERE */
755 tipc_port_unlock(p_ptr);
756 str_len = tipc_printbuf_validate(&pb);
757
758 skb_put(buf, TLV_SPACE(str_len));
759 TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len);
760
761 return buf;
762}
763
764#endif
765
766void tipc_port_reinit(void) 685void tipc_port_reinit(void)
767{ 686{
768 struct port *p_ptr; 687 struct port *p_ptr;
@@ -1295,50 +1214,13 @@ int tipc_shutdown(u32 ref)
1295 return tipc_disconnect(ref); 1214 return tipc_disconnect(ref);
1296} 1215}
1297 1216
1298int tipc_isconnected(u32 ref, int *isconnected)
1299{
1300 struct port *p_ptr;
1301
1302 p_ptr = tipc_port_lock(ref);
1303 if (!p_ptr)
1304 return -EINVAL;
1305 *isconnected = p_ptr->publ.connected;
1306 tipc_port_unlock(p_ptr);
1307 return 0;
1308}
1309
1310int tipc_peer(u32 ref, struct tipc_portid *peer)
1311{
1312 struct port *p_ptr;
1313 int res;
1314
1315 p_ptr = tipc_port_lock(ref);
1316 if (!p_ptr)
1317 return -EINVAL;
1318 if (p_ptr->publ.connected) {
1319 peer->ref = port_peerport(p_ptr);
1320 peer->node = port_peernode(p_ptr);
1321 res = 0;
1322 } else
1323 res = -ENOTCONN;
1324 tipc_port_unlock(p_ptr);
1325 return res;
1326}
1327
1328int tipc_ref_valid(u32 ref)
1329{
1330 /* Works irrespective of type */
1331 return !!tipc_ref_deref(ref);
1332}
1333
1334
1335/* 1217/*
1336 * tipc_port_recv_sections(): Concatenate and deliver sectioned 1218 * tipc_port_recv_sections(): Concatenate and deliver sectioned
1337 * message for this node. 1219 * message for this node.
1338 */ 1220 */
1339 1221
1340int tipc_port_recv_sections(struct port *sender, unsigned int num_sect, 1222static int tipc_port_recv_sections(struct port *sender, unsigned int num_sect,
1341 struct iovec const *msg_sect) 1223 struct iovec const *msg_sect)
1342{ 1224{
1343 struct sk_buff *buf; 1225 struct sk_buff *buf;
1344 int res; 1226 int res;
@@ -1389,65 +1271,16 @@ int tipc_send(u32 ref, unsigned int num_sect, struct iovec const *msg_sect)
1389} 1271}
1390 1272
1391/** 1273/**
1392 * tipc_send_buf - send message buffer on connection
1393 */
1394
1395int tipc_send_buf(u32 ref, struct sk_buff *buf, unsigned int dsz)
1396{
1397 struct port *p_ptr;
1398 struct tipc_msg *msg;
1399 u32 destnode;
1400 u32 hsz;
1401 u32 sz;
1402 u32 res;
1403
1404 p_ptr = tipc_port_deref(ref);
1405 if (!p_ptr || !p_ptr->publ.connected)
1406 return -EINVAL;
1407
1408 msg = &p_ptr->publ.phdr;
1409 hsz = msg_hdr_sz(msg);
1410 sz = hsz + dsz;
1411 msg_set_size(msg, sz);
1412 if (skb_cow(buf, hsz))
1413 return -ENOMEM;
1414
1415 skb_push(buf, hsz);
1416 skb_copy_to_linear_data(buf, msg, hsz);
1417 destnode = msg_destnode(msg);
1418 p_ptr->publ.congested = 1;
1419 if (!tipc_port_congested(p_ptr)) {
1420 if (likely(destnode != tipc_own_addr))
1421 res = tipc_send_buf_fast(buf, destnode);
1422 else {
1423 tipc_port_recv_msg(buf);
1424 res = sz;
1425 }
1426 if (likely(res != -ELINKCONG)) {
1427 port_incr_out_seqno(p_ptr);
1428 p_ptr->sent++;
1429 p_ptr->publ.congested = 0;
1430 return res;
1431 }
1432 }
1433 if (port_unreliable(p_ptr)) {
1434 p_ptr->publ.congested = 0;
1435 return dsz;
1436 }
1437 return -ELINKCONG;
1438}
1439
1440/**
1441 * tipc_forward2name - forward message sections to port name 1274 * tipc_forward2name - forward message sections to port name
1442 */ 1275 */
1443 1276
1444int tipc_forward2name(u32 ref, 1277static int tipc_forward2name(u32 ref,
1445 struct tipc_name const *name, 1278 struct tipc_name const *name,
1446 u32 domain, 1279 u32 domain,
1447 u32 num_sect, 1280 u32 num_sect,
1448 struct iovec const *msg_sect, 1281 struct iovec const *msg_sect,
1449 struct tipc_portid const *orig, 1282 struct tipc_portid const *orig,
1450 unsigned int importance) 1283 unsigned int importance)
1451{ 1284{
1452 struct port *p_ptr; 1285 struct port *p_ptr;
1453 struct tipc_msg *msg; 1286 struct tipc_msg *msg;
@@ -1473,7 +1306,7 @@ int tipc_forward2name(u32 ref,
1473 msg_set_destnode(msg, destnode); 1306 msg_set_destnode(msg, destnode);
1474 msg_set_destport(msg, destport); 1307 msg_set_destport(msg, destport);
1475 1308
1476 if (likely(destport || destnode)) { 1309 if (likely(destport)) {
1477 p_ptr->sent++; 1310 p_ptr->sent++;
1478 if (likely(destnode == tipc_own_addr)) 1311 if (likely(destnode == tipc_own_addr))
1479 return tipc_port_recv_sections(p_ptr, num_sect, msg_sect); 1312 return tipc_port_recv_sections(p_ptr, num_sect, msg_sect);
@@ -1510,89 +1343,15 @@ int tipc_send2name(u32 ref,
1510} 1343}
1511 1344
1512/** 1345/**
1513 * tipc_forward_buf2name - forward message buffer to port name
1514 */
1515
1516int tipc_forward_buf2name(u32 ref,
1517 struct tipc_name const *name,
1518 u32 domain,
1519 struct sk_buff *buf,
1520 unsigned int dsz,
1521 struct tipc_portid const *orig,
1522 unsigned int importance)
1523{
1524 struct port *p_ptr;
1525 struct tipc_msg *msg;
1526 u32 destnode = domain;
1527 u32 destport;
1528 int res;
1529
1530 p_ptr = (struct port *)tipc_ref_deref(ref);
1531 if (!p_ptr || p_ptr->publ.connected)
1532 return -EINVAL;
1533
1534 msg = &p_ptr->publ.phdr;
1535 if (importance <= TIPC_CRITICAL_IMPORTANCE)
1536 msg_set_importance(msg, importance);
1537 msg_set_type(msg, TIPC_NAMED_MSG);
1538 msg_set_orignode(msg, orig->node);
1539 msg_set_origport(msg, orig->ref);
1540 msg_set_nametype(msg, name->type);
1541 msg_set_nameinst(msg, name->instance);
1542 msg_set_lookup_scope(msg, tipc_addr_scope(domain));
1543 msg_set_hdr_sz(msg, LONG_H_SIZE);
1544 msg_set_size(msg, LONG_H_SIZE + dsz);
1545 destport = tipc_nametbl_translate(name->type, name->instance, &destnode);
1546 msg_set_destnode(msg, destnode);
1547 msg_set_destport(msg, destport);
1548 msg_dbg(msg, "forw2name ==> ");
1549 if (skb_cow(buf, LONG_H_SIZE))
1550 return -ENOMEM;
1551 skb_push(buf, LONG_H_SIZE);
1552 skb_copy_to_linear_data(buf, msg, LONG_H_SIZE);
1553 msg_dbg(buf_msg(buf),"PREP:");
1554 if (likely(destport || destnode)) {
1555 p_ptr->sent++;
1556 if (destnode == tipc_own_addr)
1557 return tipc_port_recv_msg(buf);
1558 res = tipc_send_buf_fast(buf, destnode);
1559 if (likely(res != -ELINKCONG))
1560 return res;
1561 if (port_unreliable(p_ptr))
1562 return dsz;
1563 return -ELINKCONG;
1564 }
1565 return tipc_reject_msg(buf, TIPC_ERR_NO_NAME);
1566}
1567
1568/**
1569 * tipc_send_buf2name - send message buffer to port name
1570 */
1571
1572int tipc_send_buf2name(u32 ref,
1573 struct tipc_name const *dest,
1574 u32 domain,
1575 struct sk_buff *buf,
1576 unsigned int dsz)
1577{
1578 struct tipc_portid orig;
1579
1580 orig.ref = ref;
1581 orig.node = tipc_own_addr;
1582 return tipc_forward_buf2name(ref, dest, domain, buf, dsz, &orig,
1583 TIPC_PORT_IMPORTANCE);
1584}
1585
1586/**
1587 * tipc_forward2port - forward message sections to port identity 1346 * tipc_forward2port - forward message sections to port identity
1588 */ 1347 */
1589 1348
1590int tipc_forward2port(u32 ref, 1349static int tipc_forward2port(u32 ref,
1591 struct tipc_portid const *dest, 1350 struct tipc_portid const *dest,
1592 unsigned int num_sect, 1351 unsigned int num_sect,
1593 struct iovec const *msg_sect, 1352 struct iovec const *msg_sect,
1594 struct tipc_portid const *orig, 1353 struct tipc_portid const *orig,
1595 unsigned int importance) 1354 unsigned int importance)
1596{ 1355{
1597 struct port *p_ptr; 1356 struct port *p_ptr;
1598 struct tipc_msg *msg; 1357 struct tipc_msg *msg;
@@ -1644,12 +1403,12 @@ int tipc_send2port(u32 ref,
1644/** 1403/**
1645 * tipc_forward_buf2port - forward message buffer to port identity 1404 * tipc_forward_buf2port - forward message buffer to port identity
1646 */ 1405 */
1647int tipc_forward_buf2port(u32 ref, 1406static int tipc_forward_buf2port(u32 ref,
1648 struct tipc_portid const *dest, 1407 struct tipc_portid const *dest,
1649 struct sk_buff *buf, 1408 struct sk_buff *buf,
1650 unsigned int dsz, 1409 unsigned int dsz,
1651 struct tipc_portid const *orig, 1410 struct tipc_portid const *orig,
1652 unsigned int importance) 1411 unsigned int importance)
1653{ 1412{
1654 struct port *p_ptr; 1413 struct port *p_ptr;
1655 struct tipc_msg *msg; 1414 struct tipc_msg *msg;
diff --git a/net/tipc/port.h b/net/tipc/port.h
index 8d1652aab298..73bbf442b346 100644
--- a/net/tipc/port.h
+++ b/net/tipc/port.h
@@ -109,8 +109,6 @@ struct port {
109extern spinlock_t tipc_port_list_lock; 109extern spinlock_t tipc_port_list_lock;
110struct port_list; 110struct port_list;
111 111
112int tipc_port_recv_sections(struct port *p_ptr, u32 num_sect,
113 struct iovec const *msg_sect);
114int tipc_port_reject_sections(struct port *p_ptr, struct tipc_msg *hdr, 112int tipc_port_reject_sections(struct port *p_ptr, struct tipc_msg *hdr,
115 struct iovec const *msg_sect, u32 num_sect, 113 struct iovec const *msg_sect, u32 num_sect,
116 int err); 114 int err);
@@ -157,7 +155,7 @@ static inline u32 tipc_peer_node(struct port *p_ptr)
157 155
158static inline int tipc_port_congested(struct port *p_ptr) 156static inline int tipc_port_congested(struct port *p_ptr)
159{ 157{
160 return((p_ptr->sent - p_ptr->acked) >= (TIPC_FLOW_CONTROL_WIN * 2)); 158 return (p_ptr->sent - p_ptr->acked) >= (TIPC_FLOW_CONTROL_WIN * 2);
161} 159}
162 160
163/** 161/**
diff --git a/net/tipc/ref.c b/net/tipc/ref.c
index 8dea66500cf5..ab8ad32d8c20 100644
--- a/net/tipc/ref.c
+++ b/net/tipc/ref.c
@@ -282,23 +282,6 @@ void *tipc_ref_lock(u32 ref)
282 return NULL; 282 return NULL;
283} 283}
284 284
285/**
286 * tipc_ref_unlock - unlock referenced object
287 */
288
289void tipc_ref_unlock(u32 ref)
290{
291 if (likely(tipc_ref_table.entries)) {
292 struct reference *entry;
293
294 entry = &tipc_ref_table.entries[ref &
295 tipc_ref_table.index_mask];
296 if (likely((entry->ref == ref) && (entry->object)))
297 spin_unlock_bh(&entry->lock);
298 else
299 err("Attempt to unlock non-existent reference\n");
300 }
301}
302 285
303/** 286/**
304 * tipc_ref_deref - return pointer referenced object (without locking it) 287 * tipc_ref_deref - return pointer referenced object (without locking it)
diff --git a/net/tipc/ref.h b/net/tipc/ref.h
index 7e3798ea93b9..5bc8e7ab84de 100644
--- a/net/tipc/ref.h
+++ b/net/tipc/ref.h
@@ -44,7 +44,6 @@ u32 tipc_ref_acquire(void *object, spinlock_t **lock);
44void tipc_ref_discard(u32 ref); 44void tipc_ref_discard(u32 ref);
45 45
46void *tipc_ref_lock(u32 ref); 46void *tipc_ref_lock(u32 ref);
47void tipc_ref_unlock(u32 ref);
48void *tipc_ref_deref(u32 ref); 47void *tipc_ref_deref(u32 ref);
49 48
50#endif 49#endif
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 66e889ba48fd..33217fc3d697 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -64,6 +64,7 @@ struct tipc_sock {
64 struct sock sk; 64 struct sock sk;
65 struct tipc_port *p; 65 struct tipc_port *p;
66 struct tipc_portid peer_name; 66 struct tipc_portid peer_name;
67 long conn_timeout;
67}; 68};
68 69
69#define tipc_sk(sk) ((struct tipc_sock *)(sk)) 70#define tipc_sk(sk) ((struct tipc_sock *)(sk))
@@ -240,9 +241,9 @@ static int tipc_create(struct net *net, struct socket *sock, int protocol,
240 sock->state = state; 241 sock->state = state;
241 242
242 sock_init_data(sock, sk); 243 sock_init_data(sock, sk);
243 sk->sk_rcvtimeo = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT);
244 sk->sk_backlog_rcv = backlog_rcv; 244 sk->sk_backlog_rcv = backlog_rcv;
245 tipc_sk(sk)->p = tp_ptr; 245 tipc_sk(sk)->p = tp_ptr;
246 tipc_sk(sk)->conn_timeout = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT);
246 247
247 spin_unlock_bh(tp_ptr->lock); 248 spin_unlock_bh(tp_ptr->lock);
248 249
@@ -429,36 +430,55 @@ static int get_name(struct socket *sock, struct sockaddr *uaddr,
429 * to handle any preventable race conditions, so TIPC will do the same ... 430 * to handle any preventable race conditions, so TIPC will do the same ...
430 * 431 *
431 * TIPC sets the returned events as follows: 432 * TIPC sets the returned events as follows:
432 * a) POLLRDNORM and POLLIN are set if the socket's receive queue is non-empty 433 *
433 * or if a connection-oriented socket is does not have an active connection 434 * socket state flags set
434 * (i.e. a read operation will not block). 435 * ------------ ---------
435 * b) POLLOUT is set except when a socket's connection has been terminated 436 * unconnected no read flags
436 * (i.e. a write operation will not block). 437 * no write flags
437 * c) POLLHUP is set when a socket's connection has been terminated. 438 *
438 * 439 * connecting POLLIN/POLLRDNORM if ACK/NACK in rx queue
439 * IMPORTANT: The fact that a read or write operation will not block does NOT 440 * no write flags
440 * imply that the operation will succeed! 441 *
442 * connected POLLIN/POLLRDNORM if data in rx queue
443 * POLLOUT if port is not congested
444 *
445 * disconnecting POLLIN/POLLRDNORM/POLLHUP
446 * no write flags
447 *
448 * listening POLLIN if SYN in rx queue
449 * no write flags
450 *
451 * ready POLLIN/POLLRDNORM if data in rx queue
452 * [connectionless] POLLOUT (since port cannot be congested)
453 *
454 * IMPORTANT: The fact that a read or write operation is indicated does NOT
455 * imply that the operation will succeed, merely that it should be performed
456 * and will not block.
441 */ 457 */
442 458
443static unsigned int poll(struct file *file, struct socket *sock, 459static unsigned int poll(struct file *file, struct socket *sock,
444 poll_table *wait) 460 poll_table *wait)
445{ 461{
446 struct sock *sk = sock->sk; 462 struct sock *sk = sock->sk;
447 u32 mask; 463 u32 mask = 0;
448 464
449 poll_wait(file, sk_sleep(sk), wait); 465 poll_wait(file, sk_sleep(sk), wait);
450 466
451 if (!skb_queue_empty(&sk->sk_receive_queue) || 467 switch ((int)sock->state) {
452 (sock->state == SS_UNCONNECTED) || 468 case SS_READY:
453 (sock->state == SS_DISCONNECTING)) 469 case SS_CONNECTED:
454 mask = (POLLRDNORM | POLLIN); 470 if (!tipc_sk_port(sk)->congested)
455 else 471 mask |= POLLOUT;
456 mask = 0; 472 /* fall thru' */
457 473 case SS_CONNECTING:
458 if (sock->state == SS_DISCONNECTING) 474 case SS_LISTENING:
459 mask |= POLLHUP; 475 if (!skb_queue_empty(&sk->sk_receive_queue))
460 else 476 mask |= (POLLIN | POLLRDNORM);
461 mask |= POLLOUT; 477 break;
478 case SS_DISCONNECTING:
479 mask = (POLLIN | POLLRDNORM | POLLHUP);
480 break;
481 }
462 482
463 return mask; 483 return mask;
464} 484}
@@ -1026,9 +1046,8 @@ static int recv_stream(struct kiocb *iocb, struct socket *sock,
1026 struct sk_buff *buf; 1046 struct sk_buff *buf;
1027 struct tipc_msg *msg; 1047 struct tipc_msg *msg;
1028 unsigned int sz; 1048 unsigned int sz;
1029 int sz_to_copy; 1049 int sz_to_copy, target, needed;
1030 int sz_copied = 0; 1050 int sz_copied = 0;
1031 int needed;
1032 char __user *crs = m->msg_iov->iov_base; 1051 char __user *crs = m->msg_iov->iov_base;
1033 unsigned char *buf_crs; 1052 unsigned char *buf_crs;
1034 u32 err; 1053 u32 err;
@@ -1050,6 +1069,8 @@ static int recv_stream(struct kiocb *iocb, struct socket *sock,
1050 goto exit; 1069 goto exit;
1051 } 1070 }
1052 1071
1072 target = sock_rcvlowat(sk, flags & MSG_WAITALL, buf_len);
1073
1053restart: 1074restart:
1054 1075
1055 /* Look for a message in receive queue; wait if necessary */ 1076 /* Look for a message in receive queue; wait if necessary */
@@ -1138,7 +1159,7 @@ restart:
1138 1159
1139 if ((sz_copied < buf_len) && /* didn't get all requested data */ 1160 if ((sz_copied < buf_len) && /* didn't get all requested data */
1140 (!skb_queue_empty(&sk->sk_receive_queue) || 1161 (!skb_queue_empty(&sk->sk_receive_queue) ||
1141 (flags & MSG_WAITALL)) && /* and more is ready or required */ 1162 (sz_copied < target)) && /* and more is ready or required */
1142 (!(flags & MSG_PEEK)) && /* and aren't just peeking at data */ 1163 (!(flags & MSG_PEEK)) && /* and aren't just peeking at data */
1143 (!err)) /* and haven't reached a FIN */ 1164 (!err)) /* and haven't reached a FIN */
1144 goto restart; 1165 goto restart;
@@ -1174,7 +1195,7 @@ static int rx_queue_full(struct tipc_msg *msg, u32 queue_size, u32 base)
1174 if (msg_connected(msg)) 1195 if (msg_connected(msg))
1175 threshold *= 4; 1196 threshold *= 4;
1176 1197
1177 return (queue_size >= threshold); 1198 return queue_size >= threshold;
1178} 1199}
1179 1200
1180/** 1201/**
@@ -1365,6 +1386,7 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen,
1365 struct msghdr m = {NULL,}; 1386 struct msghdr m = {NULL,};
1366 struct sk_buff *buf; 1387 struct sk_buff *buf;
1367 struct tipc_msg *msg; 1388 struct tipc_msg *msg;
1389 long timeout;
1368 int res; 1390 int res;
1369 1391
1370 lock_sock(sk); 1392 lock_sock(sk);
@@ -1379,7 +1401,7 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen,
1379 /* For now, TIPC does not support the non-blocking form of connect() */ 1401 /* For now, TIPC does not support the non-blocking form of connect() */
1380 1402
1381 if (flags & O_NONBLOCK) { 1403 if (flags & O_NONBLOCK) {
1382 res = -EWOULDBLOCK; 1404 res = -EOPNOTSUPP;
1383 goto exit; 1405 goto exit;
1384 } 1406 }
1385 1407
@@ -1425,11 +1447,12 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen,
1425 1447
1426 /* Wait until an 'ACK' or 'RST' arrives, or a timeout occurs */ 1448 /* Wait until an 'ACK' or 'RST' arrives, or a timeout occurs */
1427 1449
1450 timeout = tipc_sk(sk)->conn_timeout;
1428 release_sock(sk); 1451 release_sock(sk);
1429 res = wait_event_interruptible_timeout(*sk_sleep(sk), 1452 res = wait_event_interruptible_timeout(*sk_sleep(sk),
1430 (!skb_queue_empty(&sk->sk_receive_queue) || 1453 (!skb_queue_empty(&sk->sk_receive_queue) ||
1431 (sock->state != SS_CONNECTING)), 1454 (sock->state != SS_CONNECTING)),
1432 sk->sk_rcvtimeo); 1455 timeout ? timeout : MAX_SCHEDULE_TIMEOUT);
1433 lock_sock(sk); 1456 lock_sock(sk);
1434 1457
1435 if (res > 0) { 1458 if (res > 0) {
@@ -1692,7 +1715,7 @@ static int setsockopt(struct socket *sock,
1692 res = tipc_set_portunreturnable(tport->ref, value); 1715 res = tipc_set_portunreturnable(tport->ref, value);
1693 break; 1716 break;
1694 case TIPC_CONN_TIMEOUT: 1717 case TIPC_CONN_TIMEOUT:
1695 sk->sk_rcvtimeo = msecs_to_jiffies(value); 1718 tipc_sk(sk)->conn_timeout = msecs_to_jiffies(value);
1696 /* no need to set "res", since already 0 at this point */ 1719 /* no need to set "res", since already 0 at this point */
1697 break; 1720 break;
1698 default: 1721 default:
@@ -1747,7 +1770,7 @@ static int getsockopt(struct socket *sock,
1747 res = tipc_portunreturnable(tport->ref, &value); 1770 res = tipc_portunreturnable(tport->ref, &value);
1748 break; 1771 break;
1749 case TIPC_CONN_TIMEOUT: 1772 case TIPC_CONN_TIMEOUT:
1750 value = jiffies_to_msecs(sk->sk_rcvtimeo); 1773 value = jiffies_to_msecs(tipc_sk(sk)->conn_timeout);
1751 /* no need to set "res", since already 0 at this point */ 1774 /* no need to set "res", since already 0 at this point */
1752 break; 1775 break;
1753 case TIPC_NODE_RECVQ_DEPTH: 1776 case TIPC_NODE_RECVQ_DEPTH:
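
The reworked poll() above reports readability, writability and hang-up per socket state, as laid out in the new comment. Below is a user-space illustration (not from this patch) of how a caller consumes that mask; sock_fd is assumed to be an already created socket descriptor, and error handling is trimmed.

#include <poll.h>
#include <stdio.h>

void wait_for_events(int sock_fd)
{
	struct pollfd pfd = { .fd = sock_fd, .events = POLLIN | POLLOUT };

	if (poll(&pfd, 1, 5000 /* ms */) <= 0)
		return;                 /* timed out or failed */
	if (pfd.revents & POLLHUP)
		printf("connection terminated by peer\n");
	if (pfd.revents & POLLIN)
		printf("receive will not block (data, SYN or ACK/NACK queued)\n");
	if (pfd.revents & POLLOUT)
		printf("send will not block (port not congested)\n");
}
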
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index ab6eab4c45e2..33313961d010 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -76,6 +76,19 @@ struct top_srv {
76static struct top_srv topsrv = { 0 }; 76static struct top_srv topsrv = { 0 };
77 77
78/** 78/**
79 * htohl - convert value to endianness used by destination
80 * @in: value to convert
81 * @swap: non-zero if endianness must be reversed
82 *
83 * Returns converted value
84 */
85
86static u32 htohl(u32 in, int swap)
87{
88 return swap ? swab32(in) : in;
89}
90
91/**
79 * subscr_send_event - send a message containing a tipc_event to the subscriber 92 * subscr_send_event - send a message containing a tipc_event to the subscriber
80 * 93 *
81 * Note: Must not hold subscriber's server port lock, since tipc_send() will 94 * Note: Must not hold subscriber's server port lock, since tipc_send() will
@@ -94,11 +107,11 @@ static void subscr_send_event(struct subscription *sub,
94 msg_sect.iov_base = (void *)&sub->evt; 107 msg_sect.iov_base = (void *)&sub->evt;
95 msg_sect.iov_len = sizeof(struct tipc_event); 108 msg_sect.iov_len = sizeof(struct tipc_event);
96 109
97 sub->evt.event = htonl(event); 110 sub->evt.event = htohl(event, sub->swap);
98 sub->evt.found_lower = htonl(found_lower); 111 sub->evt.found_lower = htohl(found_lower, sub->swap);
99 sub->evt.found_upper = htonl(found_upper); 112 sub->evt.found_upper = htohl(found_upper, sub->swap);
100 sub->evt.port.ref = htonl(port_ref); 113 sub->evt.port.ref = htohl(port_ref, sub->swap);
101 sub->evt.port.node = htonl(node); 114 sub->evt.port.node = htohl(node, sub->swap);
102 tipc_send(sub->server_ref, 1, &msg_sect); 115 tipc_send(sub->server_ref, 1, &msg_sect);
103} 116}
104 117
@@ -274,29 +287,16 @@ static void subscr_cancel(struct tipc_subscr *s,
274{ 287{
275 struct subscription *sub; 288 struct subscription *sub;
276 struct subscription *sub_temp; 289 struct subscription *sub_temp;
277 __u32 type, lower, upper, timeout, filter;
278 int found = 0; 290 int found = 0;
279 291
280 /* Find first matching subscription, exit if not found */ 292 /* Find first matching subscription, exit if not found */
281 293
282 type = ntohl(s->seq.type);
283 lower = ntohl(s->seq.lower);
284 upper = ntohl(s->seq.upper);
285 timeout = ntohl(s->timeout);
286 filter = ntohl(s->filter) & ~TIPC_SUB_CANCEL;
287
288 list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list, 294 list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list,
289 subscription_list) { 295 subscription_list) {
290 if ((type == sub->seq.type) && 296 if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) {
291 (lower == sub->seq.lower) && 297 found = 1;
292 (upper == sub->seq.upper) && 298 break;
293 (timeout == sub->timeout) && 299 }
294 (filter == sub->filter) &&
295 !memcmp(s->usr_handle,sub->evt.s.usr_handle,
296 sizeof(s->usr_handle)) ){
297 found = 1;
298 break;
299 }
300 } 300 }
301 if (!found) 301 if (!found)
302 return; 302 return;
@@ -310,7 +310,7 @@ static void subscr_cancel(struct tipc_subscr *s,
310 k_term_timer(&sub->timer); 310 k_term_timer(&sub->timer);
311 spin_lock_bh(subscriber->lock); 311 spin_lock_bh(subscriber->lock);
312 } 312 }
313 dbg("Cancel: removing sub %u,%u,%u from subscriber %p list\n", 313 dbg("Cancel: removing sub %u,%u,%u from subscriber %x list\n",
314 sub->seq.type, sub->seq.lower, sub->seq.upper, subscriber); 314 sub->seq.type, sub->seq.lower, sub->seq.upper, subscriber);
315 subscr_del(sub); 315 subscr_del(sub);
316} 316}
@@ -325,10 +325,16 @@ static struct subscription *subscr_subscribe(struct tipc_subscr *s,
325 struct subscriber *subscriber) 325 struct subscriber *subscriber)
326{ 326{
327 struct subscription *sub; 327 struct subscription *sub;
328 int swap;
329
330 /* Determine subscriber's endianness */
331
332 swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE));
328 333
329 /* Detect & process a subscription cancellation request */ 334 /* Detect & process a subscription cancellation request */
330 335
331 if (ntohl(s->filter) & TIPC_SUB_CANCEL) { 336 if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) {
337 s->filter &= ~htohl(TIPC_SUB_CANCEL, swap);
332 subscr_cancel(s, subscriber); 338 subscr_cancel(s, subscriber);
333 return NULL; 339 return NULL;
334 } 340 }
@@ -353,12 +359,13 @@ static struct subscription *subscr_subscribe(struct tipc_subscr *s,
353 359
354 /* Initialize subscription object */ 360 /* Initialize subscription object */
355 361
356 sub->seq.type = ntohl(s->seq.type); 362 sub->seq.type = htohl(s->seq.type, swap);
357 sub->seq.lower = ntohl(s->seq.lower); 363 sub->seq.lower = htohl(s->seq.lower, swap);
358 sub->seq.upper = ntohl(s->seq.upper); 364 sub->seq.upper = htohl(s->seq.upper, swap);
359 sub->timeout = ntohl(s->timeout); 365 sub->timeout = htohl(s->timeout, swap);
360 sub->filter = ntohl(s->filter); 366 sub->filter = htohl(s->filter, swap);
361 if ((sub->filter && (sub->filter != TIPC_SUB_PORTS)) || 367 if ((!(sub->filter & TIPC_SUB_PORTS) ==
368 !(sub->filter & TIPC_SUB_SERVICE)) ||
362 (sub->seq.lower > sub->seq.upper)) { 369 (sub->seq.lower > sub->seq.upper)) {
363 warn("Subscription rejected, illegal request\n"); 370 warn("Subscription rejected, illegal request\n");
364 kfree(sub); 371 kfree(sub);
@@ -369,6 +376,7 @@ static struct subscription *subscr_subscribe(struct tipc_subscr *s,
369 INIT_LIST_HEAD(&sub->nameseq_list); 376 INIT_LIST_HEAD(&sub->nameseq_list);
370 list_add(&sub->subscription_list, &subscriber->subscription_list); 377 list_add(&sub->subscription_list, &subscriber->subscription_list);
371 sub->server_ref = subscriber->port_ref; 378 sub->server_ref = subscriber->port_ref;
379 sub->swap = swap;
372 memcpy(&sub->evt.s, s, sizeof(struct tipc_subscr)); 380 memcpy(&sub->evt.s, s, sizeof(struct tipc_subscr));
373 atomic_inc(&topsrv.subscription_count); 381 atomic_inc(&topsrv.subscription_count);
374 if (sub->timeout != TIPC_WAIT_FOREVER) { 382 if (sub->timeout != TIPC_WAIT_FOREVER) {
@@ -598,12 +606,3 @@ void tipc_subscr_stop(void)
598 topsrv.user_ref = 0; 606 topsrv.user_ref = 0;
599 } 607 }
600} 608}
601
602
603int tipc_ispublished(struct tipc_name const *name)
604{
605 u32 domain = 0;
606
607 return(tipc_nametbl_translate(name->type, name->instance,&domain) != 0);
608}
609
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index c20f496d95b2..45d89bf4d202 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -53,6 +53,7 @@ typedef void (*tipc_subscr_event) (struct subscription *sub,
53 * @nameseq_list: adjacent subscriptions in name sequence's subscription list 53 * @nameseq_list: adjacent subscriptions in name sequence's subscription list
54 * @subscription_list: adjacent subscriptions in subscriber's subscription list 54 * @subscription_list: adjacent subscriptions in subscriber's subscription list
55 * @server_ref: object reference of server port associated with subscription 55 * @server_ref: object reference of server port associated with subscription
56 * @swap: indicates if subscriber uses opposite endianness in its messages
56 * @evt: template for events generated by subscription 57 * @evt: template for events generated by subscription
57 */ 58 */
58 59
@@ -65,6 +66,7 @@ struct subscription {
65 struct list_head nameseq_list; 66 struct list_head nameseq_list;
66 struct list_head subscription_list; 67 struct list_head subscription_list;
67 u32 server_ref; 68 u32 server_ref;
69 int swap;
68 struct tipc_event evt; 70 struct tipc_event evt;
69}; 71};
70 72
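
The subscr.c/subscr.h changes above stop assuming network byte order in subscription requests: the server infers the subscriber's endianness from the filter field and converts every 32-bit word with htohl(). A user-space sketch of the same detection logic follows; the filter bit values are assumptions taken from the usual linux/tipc.h definitions, and the kernel's swab32() is replaced by a compiler builtin.

#include <stdint.h>

#define TIPC_SUB_PORTS   0x01    /* assumed values (see linux/tipc.h) */
#define TIPC_SUB_SERVICE 0x02

/* convert a 32-bit word to this host's endianness if the peer differs */
uint32_t htohl(uint32_t in, int swap)
{
	return swap ? __builtin_bswap32(in) : in;
}

/* a well-formed request has PORTS or SERVICE set; if neither bit is
 * visible, the words were written on a machine of opposite endianness */
int peer_needs_swap(uint32_t filter_as_received)
{
	return !(filter_as_received & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE));
}
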
diff --git a/net/tipc/zone.c b/net/tipc/zone.c
index 2c01ba2d86bf..83f8b5e91fc8 100644
--- a/net/tipc/zone.c
+++ b/net/tipc/zone.c
@@ -160,14 +160,3 @@ u32 tipc_zone_select_router(struct _zone *z_ptr, u32 addr, u32 ref)
160 } 160 }
161 return 0; 161 return 0;
162} 162}
163
164
165u32 tipc_zone_next_node(u32 addr)
166{
167 struct cluster *c_ptr = tipc_cltr_find(addr);
168
169 if (c_ptr)
170 return tipc_cltr_next_node(c_ptr, addr);
171 return 0;
172}
173
diff --git a/net/tipc/zone.h b/net/tipc/zone.h
index 7bdc3406ba9b..bd1c20ce9d06 100644
--- a/net/tipc/zone.h
+++ b/net/tipc/zone.h
@@ -61,7 +61,6 @@ void tipc_zone_send_external_routes(struct _zone *z_ptr, u32 dest);
61struct _zone *tipc_zone_create(u32 addr); 61struct _zone *tipc_zone_create(u32 addr);
62void tipc_zone_delete(struct _zone *z_ptr); 62void tipc_zone_delete(struct _zone *z_ptr);
63void tipc_zone_attach_cluster(struct _zone *z_ptr, struct cluster *c_ptr); 63void tipc_zone_attach_cluster(struct _zone *z_ptr, struct cluster *c_ptr);
64u32 tipc_zone_next_node(u32 addr);
65 64
66static inline struct _zone *tipc_zone_find(u32 addr) 65static inline struct _zone *tipc_zone_find(u32 addr)
67{ 66{
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 4414a18c63b4..3c95304a0817 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -117,7 +117,7 @@
117 117
118static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; 118static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
119static DEFINE_SPINLOCK(unix_table_lock); 119static DEFINE_SPINLOCK(unix_table_lock);
120static atomic_t unix_nr_socks = ATOMIC_INIT(0); 120static atomic_long_t unix_nr_socks;
121 121
122#define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) 122#define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE])
123 123
@@ -360,13 +360,13 @@ static void unix_sock_destructor(struct sock *sk)
360 if (u->addr) 360 if (u->addr)
361 unix_release_addr(u->addr); 361 unix_release_addr(u->addr);
362 362
363 atomic_dec(&unix_nr_socks); 363 atomic_long_dec(&unix_nr_socks);
364 local_bh_disable(); 364 local_bh_disable();
365 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 365 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
366 local_bh_enable(); 366 local_bh_enable();
367#ifdef UNIX_REFCNT_DEBUG 367#ifdef UNIX_REFCNT_DEBUG
368 printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, 368 printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
369 atomic_read(&unix_nr_socks)); 369 atomic_long_read(&unix_nr_socks));
370#endif 370#endif
371} 371}
372 372
@@ -606,8 +606,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
606 struct sock *sk = NULL; 606 struct sock *sk = NULL;
607 struct unix_sock *u; 607 struct unix_sock *u;
608 608
609 atomic_inc(&unix_nr_socks); 609 atomic_long_inc(&unix_nr_socks);
610 if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) 610 if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
611 goto out; 611 goto out;
612 612
613 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); 613 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
@@ -632,7 +632,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
632 unix_insert_socket(unix_sockets_unbound, sk); 632 unix_insert_socket(unix_sockets_unbound, sk);
633out: 633out:
634 if (sk == NULL) 634 if (sk == NULL)
635 atomic_dec(&unix_nr_socks); 635 atomic_long_dec(&unix_nr_socks);
636 else { 636 else {
637 local_bh_disable(); 637 local_bh_disable();
638 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 638 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
@@ -692,6 +692,7 @@ static int unix_autobind(struct socket *sock)
692 static u32 ordernum = 1; 692 static u32 ordernum = 1;
693 struct unix_address *addr; 693 struct unix_address *addr;
694 int err; 694 int err;
695 unsigned int retries = 0;
695 696
696 mutex_lock(&u->readlock); 697 mutex_lock(&u->readlock);
697 698
@@ -717,9 +718,17 @@ retry:
717 if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type, 718 if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
718 addr->hash)) { 719 addr->hash)) {
719 spin_unlock(&unix_table_lock); 720 spin_unlock(&unix_table_lock);
720 /* Sanity yield. It is unusual case, but yet... */ 721 /*
721 if (!(ordernum&0xFF)) 722 * __unix_find_socket_byname() may take long time if many names
722 yield(); 723 * are already in use.
724 */
725 cond_resched();
726 /* Give up if all names seems to be in use. */
727 if (retries++ == 0xFFFFF) {
728 err = -ENOSPC;
729 kfree(addr);
730 goto out;
731 }
723 goto retry; 732 goto retry;
724 } 733 }
725 addr->hash ^= sk->sk_type; 734 addr->hash ^= sk->sk_type;
@@ -1502,6 +1511,8 @@ restart:
1502 goto restart; 1511 goto restart;
1503 } 1512 }
1504 1513
1514 if (sock_flag(other, SOCK_RCVTSTAMP))
1515 __net_timestamp(skb);
1505 skb_queue_tail(&other->sk_receive_queue, skb); 1516 skb_queue_tail(&other->sk_receive_queue, skb);
1506 unix_state_unlock(other); 1517 unix_state_unlock(other);
1507 other->sk_data_ready(other, len); 1518 other->sk_data_ready(other, len);
@@ -1713,6 +1724,9 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1713 if (err) 1724 if (err)
1714 goto out_free; 1725 goto out_free;
1715 1726
1727 if (sock_flag(sk, SOCK_RCVTSTAMP))
1728 __sock_recv_timestamp(msg, sk, skb);
1729
1716 if (!siocb->scm) { 1730 if (!siocb->scm) {
1717 siocb->scm = &tmp_scm; 1731 siocb->scm = &tmp_scm;
1718 memset(&tmp_scm, 0, sizeof(tmp_scm)); 1732 memset(&tmp_scm, 0, sizeof(tmp_scm));
@@ -2024,11 +2038,10 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table
2024 if (sk->sk_shutdown == SHUTDOWN_MASK) 2038 if (sk->sk_shutdown == SHUTDOWN_MASK)
2025 mask |= POLLHUP; 2039 mask |= POLLHUP;
2026 if (sk->sk_shutdown & RCV_SHUTDOWN) 2040 if (sk->sk_shutdown & RCV_SHUTDOWN)
2027 mask |= POLLRDHUP; 2041 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2028 2042
2029 /* readable? */ 2043 /* readable? */
2030 if (!skb_queue_empty(&sk->sk_receive_queue) || 2044 if (!skb_queue_empty(&sk->sk_receive_queue))
2031 (sk->sk_shutdown & RCV_SHUTDOWN))
2032 mask |= POLLIN | POLLRDNORM; 2045 mask |= POLLIN | POLLRDNORM;
2033 2046
2034 /* Connection-based need to check for termination and startup */ 2047 /* Connection-based need to check for termination and startup */
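
The af_unix.c hunks above add receive timestamping to AF_UNIX datagram sockets (the sender stamps the skb when SOCK_RCVTSTAMP is set, and the receiver delivers it via __sock_recv_timestamp()). A user-space sketch of how that surfaces through the standard SO_TIMESTAMP/SCM_TIMESTAMP interface follows; socket setup and error handling are trimmed, and fd is assumed to be a bound AF_UNIX SOCK_DGRAM socket.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/uio.h>

void recv_with_timestamp(int fd)
{
	int on = 1;
	char data[256];
	char cbuf[CMSG_SPACE(sizeof(struct timeval))];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cm;

	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMP, &on, sizeof(on));
	if (recvmsg(fd, &msg, 0) < 0)
		return;
	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_TIMESTAMP) {
			struct timeval tv;

			memcpy(&tv, CMSG_DATA(cm), sizeof(tv));
			printf("datagram received at %ld.%06ld\n",
			       (long)tv.tv_sec, (long)tv.tv_usec);
		}
	}
}
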
diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c
index 2bf23406637a..74944a2dd436 100644
--- a/net/wanrouter/wanmain.c
+++ b/net/wanrouter/wanmain.c
@@ -471,7 +471,7 @@ static int wanrouter_device_setup(struct wan_device *wandev,
471 data = vmalloc(conf->data_size); 471 data = vmalloc(conf->data_size);
472 if (!data) { 472 if (!data) {
473 printk(KERN_INFO 473 printk(KERN_INFO
474 "%s: ERROR, Faild allocate kernel memory !\n", 474 "%s: ERROR, Failed allocate kernel memory !\n",
475 wandev->name); 475 wandev->name);
476 kfree(conf); 476 kfree(conf);
477 return -ENOBUFS; 477 return -ENOBUFS;
@@ -481,7 +481,7 @@ static int wanrouter_device_setup(struct wan_device *wandev,
481 err = wandev->setup(wandev, conf); 481 err = wandev->setup(wandev, conf);
482 } else { 482 } else {
483 printk(KERN_INFO 483 printk(KERN_INFO
484 "%s: ERROR, Faild to copy from user data !\n", 484 "%s: ERROR, Failed to copy from user data !\n",
485 wandev->name); 485 wandev->name);
486 err = -EFAULT; 486 err = -EFAULT;
487 } 487 }
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 541e2fff5e9c..9c21ebf9780e 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -253,11 +253,16 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
253 WARN_ON(err); 253 WARN_ON(err);
254 wdev->netdev->features |= NETIF_F_NETNS_LOCAL; 254 wdev->netdev->features |= NETIF_F_NETNS_LOCAL;
255 } 255 }
256
257 return err;
256 } 258 }
257 259
258 wiphy_net_set(&rdev->wiphy, net); 260 wiphy_net_set(&rdev->wiphy, net);
259 261
260 return err; 262 err = device_rename(&rdev->wiphy.dev, dev_name(&rdev->wiphy.dev));
263 WARN_ON(err);
264
265 return 0;
261} 266}
262 267
263static void cfg80211_rfkill_poll(struct rfkill *rfkill, void *data) 268static void cfg80211_rfkill_poll(struct rfkill *rfkill, void *data)
@@ -428,7 +433,7 @@ int wiphy_register(struct wiphy *wiphy)
428 433
429 /* sanity check ifmodes */ 434 /* sanity check ifmodes */
430 WARN_ON(!ifmodes); 435 WARN_ON(!ifmodes);
431 ifmodes &= ((1 << __NL80211_IFTYPE_AFTER_LAST) - 1) & ~1; 436 ifmodes &= ((1 << NUM_NL80211_IFTYPES) - 1) & ~1;
432 if (WARN_ON(ifmodes != wiphy->interface_modes)) 437 if (WARN_ON(ifmodes != wiphy->interface_modes))
433 wiphy->interface_modes = ifmodes; 438 wiphy->interface_modes = ifmodes;
434 439
@@ -475,12 +480,10 @@ int wiphy_register(struct wiphy *wiphy)
475 mutex_lock(&cfg80211_mutex); 480 mutex_lock(&cfg80211_mutex);
476 481
477 res = device_add(&rdev->wiphy.dev); 482 res = device_add(&rdev->wiphy.dev);
478 if (res) 483 if (res) {
479 goto out_unlock; 484 mutex_unlock(&cfg80211_mutex);
480 485 return res;
481 res = rfkill_register(rdev->rfkill); 486 }
482 if (res)
483 goto out_rm_dev;
484 487
485 /* set up regulatory info */ 488 /* set up regulatory info */
486 wiphy_update_regulatory(wiphy, NL80211_REGDOM_SET_BY_CORE); 489 wiphy_update_regulatory(wiphy, NL80211_REGDOM_SET_BY_CORE);
@@ -509,13 +512,18 @@ int wiphy_register(struct wiphy *wiphy)
509 cfg80211_debugfs_rdev_add(rdev); 512 cfg80211_debugfs_rdev_add(rdev);
510 mutex_unlock(&cfg80211_mutex); 513 mutex_unlock(&cfg80211_mutex);
511 514
515 /*
516 * due to a locking dependency this has to be outside of the
517 * cfg80211_mutex lock
518 */
519 res = rfkill_register(rdev->rfkill);
520 if (res)
521 goto out_rm_dev;
522
512 return 0; 523 return 0;
513 524
514out_rm_dev: 525out_rm_dev:
515 device_del(&rdev->wiphy.dev); 526 device_del(&rdev->wiphy.dev);
516
517out_unlock:
518 mutex_unlock(&cfg80211_mutex);
519 return res; 527 return res;
520} 528}
521EXPORT_SYMBOL(wiphy_register); 529EXPORT_SYMBOL(wiphy_register);
@@ -680,8 +688,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb,
680 INIT_WORK(&wdev->cleanup_work, wdev_cleanup_work); 688 INIT_WORK(&wdev->cleanup_work, wdev_cleanup_work);
681 INIT_LIST_HEAD(&wdev->event_list); 689 INIT_LIST_HEAD(&wdev->event_list);
682 spin_lock_init(&wdev->event_lock); 690 spin_lock_init(&wdev->event_lock);
683 INIT_LIST_HEAD(&wdev->action_registrations); 691 INIT_LIST_HEAD(&wdev->mgmt_registrations);
684 spin_lock_init(&wdev->action_registrations_lock); 692 spin_lock_init(&wdev->mgmt_registrations_lock);
685 693
686 mutex_lock(&rdev->devlist_mtx); 694 mutex_lock(&rdev->devlist_mtx);
687 list_add_rcu(&wdev->list, &rdev->netdev_list); 695 list_add_rcu(&wdev->list, &rdev->netdev_list);
@@ -721,6 +729,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb,
721 dev->ethtool_ops = &cfg80211_ethtool_ops; 729 dev->ethtool_ops = &cfg80211_ethtool_ops;
722 730
723 if ((wdev->iftype == NL80211_IFTYPE_STATION || 731 if ((wdev->iftype == NL80211_IFTYPE_STATION ||
732 wdev->iftype == NL80211_IFTYPE_P2P_CLIENT ||
724 wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr) 733 wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr)
725 dev->priv_flags |= IFF_DONT_BRIDGE; 734 dev->priv_flags |= IFF_DONT_BRIDGE;
726 break; 735 break;
@@ -729,6 +738,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb,
729 case NL80211_IFTYPE_ADHOC: 738 case NL80211_IFTYPE_ADHOC:
730 cfg80211_leave_ibss(rdev, dev, true); 739 cfg80211_leave_ibss(rdev, dev, true);
731 break; 740 break;
741 case NL80211_IFTYPE_P2P_CLIENT:
732 case NL80211_IFTYPE_STATION: 742 case NL80211_IFTYPE_STATION:
733 wdev_lock(wdev); 743 wdev_lock(wdev);
734#ifdef CONFIG_CFG80211_WEXT 744#ifdef CONFIG_CFG80211_WEXT
@@ -801,7 +811,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb,
801 sysfs_remove_link(&dev->dev.kobj, "phy80211"); 811 sysfs_remove_link(&dev->dev.kobj, "phy80211");
802 list_del_rcu(&wdev->list); 812 list_del_rcu(&wdev->list);
803 rdev->devlist_generation++; 813 rdev->devlist_generation++;
804 cfg80211_mlme_purge_actions(wdev); 814 cfg80211_mlme_purge_registrations(wdev);
805#ifdef CONFIG_CFG80211_WEXT 815#ifdef CONFIG_CFG80211_WEXT
806 kfree(wdev->wext.keys); 816 kfree(wdev->wext.keys);
807#endif 817#endif
@@ -907,52 +917,3 @@ static void __exit cfg80211_exit(void)
907 destroy_workqueue(cfg80211_wq); 917 destroy_workqueue(cfg80211_wq);
908} 918}
909module_exit(cfg80211_exit); 919module_exit(cfg80211_exit);
910
911static int ___wiphy_printk(const char *level, const struct wiphy *wiphy,
912 struct va_format *vaf)
913{
914 if (!wiphy)
915 return printk("%s(NULL wiphy *): %pV", level, vaf);
916
917 return printk("%s%s: %pV", level, wiphy_name(wiphy), vaf);
918}
919
920int __wiphy_printk(const char *level, const struct wiphy *wiphy,
921 const char *fmt, ...)
922{
923 struct va_format vaf;
924 va_list args;
925 int r;
926
927 va_start(args, fmt);
928
929 vaf.fmt = fmt;
930 vaf.va = &args;
931
932 r = ___wiphy_printk(level, wiphy, &vaf);
933 va_end(args);
934
935 return r;
936}
937EXPORT_SYMBOL(__wiphy_printk);
938
939#define define_wiphy_printk_level(func, kern_level) \
940int func(const struct wiphy *wiphy, const char *fmt, ...) \
941{ \
942 struct va_format vaf; \
943 va_list args; \
944 int r; \
945 \
946 va_start(args, fmt); \
947 \
948 vaf.fmt = fmt; \
949 vaf.va = &args; \
950 \
951 r = ___wiphy_printk(kern_level, wiphy, &vaf); \
952 va_end(args); \
953 \
954 return r; \
955} \
956EXPORT_SYMBOL(func);
957
958define_wiphy_printk_level(wiphy_debug, KERN_DEBUG);
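In the wireless/core.c hunks above, wiphy_register() stops using the shared out_unlock error path: a device_add() failure now drops cfg80211_mutex and returns directly, and rfkill_register() moves to after the mutex is released (the in-tree comment cites a locking dependency). A kernel-style sketch of that ordering, under the assumption that the rfkill hookup must not run with the subsystem mutex held; struct example_dev, subsys_mutex and example_register() are hypothetical, not cfg80211's code:

    #include <linux/device.h>
    #include <linux/rfkill.h>

    struct example_dev {
            struct device dev;
            struct rfkill *rfkill;
    };

    static DEFINE_MUTEX(subsys_mutex);

    static int example_register(struct example_dev *ed)
    {
            int res;

            mutex_lock(&subsys_mutex);

            res = device_add(&ed->dev);
            if (res) {
                    mutex_unlock(&subsys_mutex);
                    return res;             /* unwind inline, no shared label */
            }

            /* ... set up state that genuinely needs subsys_mutex ... */

            mutex_unlock(&subsys_mutex);

            /*
             * rfkill_register() presumably takes locks that must not nest
             * inside subsys_mutex, so it runs with the mutex released.
             */
            res = rfkill_register(ed->rfkill);
            if (res) {
                    device_del(&ed->dev);
                    return res;
            }

            return 0;
    }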
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 63d57ae399c3..6583cca0e2ee 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -86,7 +86,7 @@ struct cfg80211_registered_device *wiphy_to_dev(struct wiphy *wiphy)
86static inline 86static inline
87bool wiphy_idx_valid(int wiphy_idx) 87bool wiphy_idx_valid(int wiphy_idx)
88{ 88{
89 return (wiphy_idx >= 0); 89 return wiphy_idx >= 0;
90} 90}
91 91
92 92
@@ -95,7 +95,10 @@ extern struct mutex cfg80211_mutex;
95extern struct list_head cfg80211_rdev_list; 95extern struct list_head cfg80211_rdev_list;
96extern int cfg80211_rdev_list_generation; 96extern int cfg80211_rdev_list_generation;
97 97
98#define assert_cfg80211_lock() WARN_ON(!mutex_is_locked(&cfg80211_mutex)) 98static inline void assert_cfg80211_lock(void)
99{
100 lockdep_assert_held(&cfg80211_mutex);
101}
99 102
100/* 103/*
101 * You can use this to mark a wiphy_idx as not having an associated wiphy. 104 * You can use this to mark a wiphy_idx as not having an associated wiphy.
@@ -202,8 +205,8 @@ static inline void wdev_unlock(struct wireless_dev *wdev)
202 mutex_unlock(&wdev->mtx); 205 mutex_unlock(&wdev->mtx);
203} 206}
204 207
205#define ASSERT_RDEV_LOCK(rdev) WARN_ON(!mutex_is_locked(&(rdev)->mtx)); 208#define ASSERT_RDEV_LOCK(rdev) lockdep_assert_held(&(rdev)->mtx)
206#define ASSERT_WDEV_LOCK(wdev) WARN_ON(!mutex_is_locked(&(wdev)->mtx)); 209#define ASSERT_WDEV_LOCK(wdev) lockdep_assert_held(&(wdev)->mtx)
207 210
208enum cfg80211_event_type { 211enum cfg80211_event_type {
209 EVENT_CONNECT_RESULT, 212 EVENT_CONNECT_RESULT,
@@ -331,16 +334,17 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
331 const u8 *resp_ie, size_t resp_ie_len, 334 const u8 *resp_ie, size_t resp_ie_len,
332 u16 status, bool wextev, 335 u16 status, bool wextev,
333 struct cfg80211_bss *bss); 336 struct cfg80211_bss *bss);
334int cfg80211_mlme_register_action(struct wireless_dev *wdev, u32 snd_pid, 337int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid,
335 const u8 *match_data, int match_len); 338 u16 frame_type, const u8 *match_data,
336void cfg80211_mlme_unregister_actions(struct wireless_dev *wdev, u32 nlpid); 339 int match_len);
337void cfg80211_mlme_purge_actions(struct wireless_dev *wdev); 340void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid);
338int cfg80211_mlme_action(struct cfg80211_registered_device *rdev, 341void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev);
339 struct net_device *dev, 342int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
340 struct ieee80211_channel *chan, 343 struct net_device *dev,
341 enum nl80211_channel_type channel_type, 344 struct ieee80211_channel *chan,
342 bool channel_type_valid, 345 enum nl80211_channel_type channel_type,
343 const u8 *buf, size_t len, u64 *cookie); 346 bool channel_type_valid,
347 const u8 *buf, size_t len, u64 *cookie);
344 348
345/* SME */ 349/* SME */
346int __cfg80211_connect(struct cfg80211_registered_device *rdev, 350int __cfg80211_connect(struct cfg80211_registered_device *rdev,
@@ -371,7 +375,7 @@ bool cfg80211_sme_failed_reassoc(struct wireless_dev *wdev);
371/* internal helpers */ 375/* internal helpers */
372int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, 376int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
373 struct key_params *params, int key_idx, 377 struct key_params *params, int key_idx,
374 const u8 *mac_addr); 378 bool pairwise, const u8 *mac_addr);
375void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, 379void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
376 size_t ie_len, u16 reason, bool from_ap); 380 size_t ie_len, u16 reason, bool from_ap);
377void cfg80211_sme_scan_done(struct net_device *dev); 381void cfg80211_sme_scan_done(struct net_device *dev);
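The core.h hunk replaces the open-coded WARN_ON(!mutex_is_locked(...)) assertions with lockdep_assert_held(), which checks that the current task actually holds the lock when lockdep is enabled and compiles away otherwise, whereas mutex_is_locked() only reports that somebody holds it. A small kernel-style sketch contrasting the two; my_lock and the touch_state_*() helpers are hypothetical:

    #include <linux/mutex.h>
    #include <linux/lockdep.h>

    static DEFINE_MUTEX(my_lock);

    static void touch_state_old(void)
    {
            WARN_ON(!mutex_is_locked(&my_lock));    /* true if *anyone* holds it */
            /* ... modify state guarded by my_lock ... */
    }

    static void touch_state_new(void)
    {
            lockdep_assert_held(&my_lock);          /* current task must hold it;
                                                       free without lockdep */
            /* ... modify state guarded by my_lock ... */
    }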
diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c
index a4991a3efec0..39765bcfb472 100644
--- a/net/wireless/debugfs.c
+++ b/net/wireless/debugfs.c
@@ -34,6 +34,7 @@ static ssize_t name## _read(struct file *file, char __user *userbuf, \
34static const struct file_operations name## _ops = { \ 34static const struct file_operations name## _ops = { \
35 .read = name## _read, \ 35 .read = name## _read, \
36 .open = cfg80211_open_file_generic, \ 36 .open = cfg80211_open_file_generic, \
37 .llseek = generic_file_llseek, \
37}; 38};
38 39
39DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d", 40DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d",
@@ -102,6 +103,7 @@ static ssize_t ht40allow_map_read(struct file *file,
102static const struct file_operations ht40allow_map_ops = { 103static const struct file_operations ht40allow_map_ops = {
103 .read = ht40allow_map_read, 104 .read = ht40allow_map_read,
104 .open = cfg80211_open_file_generic, 105 .open = cfg80211_open_file_generic,
106 .llseek = default_llseek,
105}; 107};
106 108
107#define DEBUGFS_ADD(name) \ 109#define DEBUGFS_ADD(name) \
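The debugfs.c hunks add an explicit .llseek to each file_operations, apparently part of the tree-wide sweep in this merge to stop relying on the implicit default seek behaviour. A kernel-style sketch of a read-only debugfs fops with the field spelled out; example_read() and example_open() are hypothetical handlers:

    static const struct file_operations example_debugfs_ops = {
            .read   = example_read,                 /* hypothetical read handler */
            .open   = example_open,                 /* hypothetical open helper */
            .llseek = generic_file_llseek,          /* explicit, not implied */
    };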
diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c
index 27a8ce9343c3..f33fbb79437c 100644
--- a/net/wireless/ibss.c
+++ b/net/wireless/ibss.c
@@ -88,6 +88,25 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
88 if (wdev->ssid_len) 88 if (wdev->ssid_len)
89 return -EALREADY; 89 return -EALREADY;
90 90
91 if (!params->basic_rates) {
92 /*
93 * If no rates were explicitly configured,
94 * use the mandatory rate set for 11b or
95 * 11a for maximum compatibility.
96 */
97 struct ieee80211_supported_band *sband =
98 rdev->wiphy.bands[params->channel->band];
99 int j;
100 u32 flag = params->channel->band == IEEE80211_BAND_5GHZ ?
101 IEEE80211_RATE_MANDATORY_A :
102 IEEE80211_RATE_MANDATORY_B;
103
104 for (j = 0; j < sband->n_bitrates; j++) {
105 if (sband->bitrates[j].flags & flag)
106 params->basic_rates |= BIT(j);
107 }
108 }
109
91 if (WARN_ON(wdev->connect_keys)) 110 if (WARN_ON(wdev->connect_keys))
92 kfree(wdev->connect_keys); 111 kfree(wdev->connect_keys);
93 wdev->connect_keys = connkeys; 112 wdev->connect_keys = connkeys;
@@ -141,7 +160,7 @@ static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext)
141 */ 160 */
142 if (rdev->ops->del_key) 161 if (rdev->ops->del_key)
143 for (i = 0; i < 6; i++) 162 for (i = 0; i < 6; i++)
144 rdev->ops->del_key(wdev->wiphy, dev, i, NULL); 163 rdev->ops->del_key(wdev->wiphy, dev, i, false, NULL);
145 164
146 if (wdev->current_bss) { 165 if (wdev->current_bss) {
147 cfg80211_unhold_bss(wdev->current_bss); 166 cfg80211_unhold_bss(wdev->current_bss);
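The ibss.c hunk makes __cfg80211_join_ibss() fill in params->basic_rates with the band's mandatory rate set (11a rates on 5 GHz, 11b on 2.4 GHz) when userspace configured none. A plain-C sketch of the same bitmap construction; the rate table and flag values are illustrative, not the kernel's:

    #include <stdint.h>
    #include <stdio.h>

    #define RATE_MANDATORY_A 0x1
    #define RATE_MANDATORY_B 0x2

    struct rate { int bitrate_100kbps; unsigned int flags; };

    static uint32_t mandatory_rate_mask(const struct rate *rates, int n,
                                        int band_5ghz)
    {
            unsigned int want = band_5ghz ? RATE_MANDATORY_A : RATE_MANDATORY_B;
            uint32_t mask = 0;
            int j;

            for (j = 0; j < n; j++)
                    if (rates[j].flags & want)
                            mask |= 1u << j;        /* BIT(j) in kernel terms */
            return mask;
    }

    int main(void)
    {
            const struct rate rates[] = {
                    { 10, RATE_MANDATORY_B }, { 20, RATE_MANDATORY_B },
                    { 55, 0 },                { 60, RATE_MANDATORY_A },
            };

            printf("basic_rates = 0x%x\n",
                   mandatory_rate_mask(rates, 4, /* band_5ghz */ 0));
            return 0;
    }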
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index d1a3fb99fdf2..26838d903b9a 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -149,7 +149,7 @@ void __cfg80211_send_deauth(struct net_device *dev,
149 struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; 149 struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf;
150 const u8 *bssid = mgmt->bssid; 150 const u8 *bssid = mgmt->bssid;
151 int i; 151 int i;
152 bool found = false; 152 bool found = false, was_current = false;
153 153
154 ASSERT_WDEV_LOCK(wdev); 154 ASSERT_WDEV_LOCK(wdev);
155 155
@@ -159,6 +159,7 @@ void __cfg80211_send_deauth(struct net_device *dev,
159 cfg80211_put_bss(&wdev->current_bss->pub); 159 cfg80211_put_bss(&wdev->current_bss->pub);
160 wdev->current_bss = NULL; 160 wdev->current_bss = NULL;
161 found = true; 161 found = true;
162 was_current = true;
162 } else for (i = 0; i < MAX_AUTH_BSSES; i++) { 163 } else for (i = 0; i < MAX_AUTH_BSSES; i++) {
163 if (wdev->auth_bsses[i] && 164 if (wdev->auth_bsses[i] &&
164 memcmp(wdev->auth_bsses[i]->pub.bssid, bssid, ETH_ALEN) == 0) { 165 memcmp(wdev->auth_bsses[i]->pub.bssid, bssid, ETH_ALEN) == 0) {
@@ -183,7 +184,7 @@ void __cfg80211_send_deauth(struct net_device *dev,
183 184
184 nl80211_send_deauth(rdev, dev, buf, len, GFP_KERNEL); 185 nl80211_send_deauth(rdev, dev, buf, len, GFP_KERNEL);
185 186
186 if (wdev->sme_state == CFG80211_SME_CONNECTED) { 187 if (wdev->sme_state == CFG80211_SME_CONNECTED && was_current) {
187 u16 reason_code; 188 u16 reason_code;
188 bool from_ap; 189 bool from_ap;
189 190
@@ -747,31 +748,53 @@ void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr,
747} 748}
748EXPORT_SYMBOL(cfg80211_new_sta); 749EXPORT_SYMBOL(cfg80211_new_sta);
749 750
750struct cfg80211_action_registration { 751struct cfg80211_mgmt_registration {
751 struct list_head list; 752 struct list_head list;
752 753
753 u32 nlpid; 754 u32 nlpid;
754 755
755 int match_len; 756 int match_len;
756 757
758 __le16 frame_type;
759
757 u8 match[]; 760 u8 match[];
758}; 761};
759 762
760int cfg80211_mlme_register_action(struct wireless_dev *wdev, u32 snd_pid, 763int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid,
761 const u8 *match_data, int match_len) 764 u16 frame_type, const u8 *match_data,
765 int match_len)
762{ 766{
763 struct cfg80211_action_registration *reg, *nreg; 767 struct wiphy *wiphy = wdev->wiphy;
768 struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
769 struct cfg80211_mgmt_registration *reg, *nreg;
764 int err = 0; 770 int err = 0;
771 u16 mgmt_type;
772
773 if (!wdev->wiphy->mgmt_stypes)
774 return -EOPNOTSUPP;
775
776 if ((frame_type & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_MGMT)
777 return -EINVAL;
778
779 if (frame_type & ~(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE))
780 return -EINVAL;
781
782 mgmt_type = (frame_type & IEEE80211_FCTL_STYPE) >> 4;
783 if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].rx & BIT(mgmt_type)))
784 return -EINVAL;
765 785
766 nreg = kzalloc(sizeof(*reg) + match_len, GFP_KERNEL); 786 nreg = kzalloc(sizeof(*reg) + match_len, GFP_KERNEL);
767 if (!nreg) 787 if (!nreg)
768 return -ENOMEM; 788 return -ENOMEM;
769 789
770 spin_lock_bh(&wdev->action_registrations_lock); 790 spin_lock_bh(&wdev->mgmt_registrations_lock);
771 791
772 list_for_each_entry(reg, &wdev->action_registrations, list) { 792 list_for_each_entry(reg, &wdev->mgmt_registrations, list) {
773 int mlen = min(match_len, reg->match_len); 793 int mlen = min(match_len, reg->match_len);
774 794
795 if (frame_type != le16_to_cpu(reg->frame_type))
796 continue;
797
775 if (memcmp(reg->match, match_data, mlen) == 0) { 798 if (memcmp(reg->match, match_data, mlen) == 0) {
776 err = -EALREADY; 799 err = -EALREADY;
777 break; 800 break;
@@ -786,140 +809,212 @@ int cfg80211_mlme_register_action(struct wireless_dev *wdev, u32 snd_pid,
786 memcpy(nreg->match, match_data, match_len); 809 memcpy(nreg->match, match_data, match_len);
787 nreg->match_len = match_len; 810 nreg->match_len = match_len;
788 nreg->nlpid = snd_pid; 811 nreg->nlpid = snd_pid;
789 list_add(&nreg->list, &wdev->action_registrations); 812 nreg->frame_type = cpu_to_le16(frame_type);
813 list_add(&nreg->list, &wdev->mgmt_registrations);
814
815 if (rdev->ops->mgmt_frame_register)
816 rdev->ops->mgmt_frame_register(wiphy, wdev->netdev,
817 frame_type, true);
790 818
791 out: 819 out:
792 spin_unlock_bh(&wdev->action_registrations_lock); 820 spin_unlock_bh(&wdev->mgmt_registrations_lock);
821
793 return err; 822 return err;
794} 823}
795 824
796void cfg80211_mlme_unregister_actions(struct wireless_dev *wdev, u32 nlpid) 825void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid)
797{ 826{
798 struct cfg80211_action_registration *reg, *tmp; 827 struct wiphy *wiphy = wdev->wiphy;
828 struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
829 struct cfg80211_mgmt_registration *reg, *tmp;
799 830
800 spin_lock_bh(&wdev->action_registrations_lock); 831 spin_lock_bh(&wdev->mgmt_registrations_lock);
801 832
802 list_for_each_entry_safe(reg, tmp, &wdev->action_registrations, list) { 833 list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) {
803 if (reg->nlpid == nlpid) { 834 if (reg->nlpid != nlpid)
804 list_del(&reg->list); 835 continue;
805 kfree(reg); 836
837 if (rdev->ops->mgmt_frame_register) {
838 u16 frame_type = le16_to_cpu(reg->frame_type);
839
840 rdev->ops->mgmt_frame_register(wiphy, wdev->netdev,
841 frame_type, false);
806 } 842 }
843
844 list_del(&reg->list);
845 kfree(reg);
807 } 846 }
808 847
809 spin_unlock_bh(&wdev->action_registrations_lock); 848 spin_unlock_bh(&wdev->mgmt_registrations_lock);
810} 849}
811 850
812void cfg80211_mlme_purge_actions(struct wireless_dev *wdev) 851void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev)
813{ 852{
814 struct cfg80211_action_registration *reg, *tmp; 853 struct cfg80211_mgmt_registration *reg, *tmp;
815 854
816 spin_lock_bh(&wdev->action_registrations_lock); 855 spin_lock_bh(&wdev->mgmt_registrations_lock);
817 856
818 list_for_each_entry_safe(reg, tmp, &wdev->action_registrations, list) { 857 list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) {
819 list_del(&reg->list); 858 list_del(&reg->list);
820 kfree(reg); 859 kfree(reg);
821 } 860 }
822 861
823 spin_unlock_bh(&wdev->action_registrations_lock); 862 spin_unlock_bh(&wdev->mgmt_registrations_lock);
824} 863}
825 864
826int cfg80211_mlme_action(struct cfg80211_registered_device *rdev, 865int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
827 struct net_device *dev, 866 struct net_device *dev,
828 struct ieee80211_channel *chan, 867 struct ieee80211_channel *chan,
829 enum nl80211_channel_type channel_type, 868 enum nl80211_channel_type channel_type,
830 bool channel_type_valid, 869 bool channel_type_valid,
831 const u8 *buf, size_t len, u64 *cookie) 870 const u8 *buf, size_t len, u64 *cookie)
832{ 871{
833 struct wireless_dev *wdev = dev->ieee80211_ptr; 872 struct wireless_dev *wdev = dev->ieee80211_ptr;
834 const struct ieee80211_mgmt *mgmt; 873 const struct ieee80211_mgmt *mgmt;
874 u16 stype;
835 875
836 if (rdev->ops->action == NULL) 876 if (!wdev->wiphy->mgmt_stypes)
837 return -EOPNOTSUPP; 877 return -EOPNOTSUPP;
878
879 if (!rdev->ops->mgmt_tx)
880 return -EOPNOTSUPP;
881
838 if (len < 24 + 1) 882 if (len < 24 + 1)
839 return -EINVAL; 883 return -EINVAL;
840 884
841 mgmt = (const struct ieee80211_mgmt *) buf; 885 mgmt = (const struct ieee80211_mgmt *) buf;
842 if (!ieee80211_is_action(mgmt->frame_control)) 886
887 if (!ieee80211_is_mgmt(mgmt->frame_control))
843 return -EINVAL; 888 return -EINVAL;
844 if (mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) { 889
845 /* Verify that we are associated with the destination AP */ 890 stype = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE;
891 if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].tx & BIT(stype >> 4)))
892 return -EINVAL;
893
894 if (ieee80211_is_action(mgmt->frame_control) &&
895 mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) {
896 int err = 0;
897
846 wdev_lock(wdev); 898 wdev_lock(wdev);
847 899
848 if (!wdev->current_bss || 900 switch (wdev->iftype) {
849 memcmp(wdev->current_bss->pub.bssid, mgmt->bssid, 901 case NL80211_IFTYPE_ADHOC:
850 ETH_ALEN) != 0 || 902 case NL80211_IFTYPE_STATION:
851 (wdev->iftype == NL80211_IFTYPE_STATION && 903 case NL80211_IFTYPE_P2P_CLIENT:
852 memcmp(wdev->current_bss->pub.bssid, mgmt->da, 904 if (!wdev->current_bss) {
853 ETH_ALEN) != 0)) { 905 err = -ENOTCONN;
854 wdev_unlock(wdev); 906 break;
855 return -ENOTCONN; 907 }
856 } 908
909 if (memcmp(wdev->current_bss->pub.bssid,
910 mgmt->bssid, ETH_ALEN)) {
911 err = -ENOTCONN;
912 break;
913 }
914
915 /*
916 * check for IBSS DA must be done by driver as
917 * cfg80211 doesn't track the stations
918 */
919 if (wdev->iftype == NL80211_IFTYPE_ADHOC)
920 break;
857 921
922 /* for station, check that DA is the AP */
923 if (memcmp(wdev->current_bss->pub.bssid,
924 mgmt->da, ETH_ALEN)) {
925 err = -ENOTCONN;
926 break;
927 }
928 break;
929 case NL80211_IFTYPE_AP:
930 case NL80211_IFTYPE_P2P_GO:
931 case NL80211_IFTYPE_AP_VLAN:
932 if (memcmp(mgmt->bssid, dev->dev_addr, ETH_ALEN))
933 err = -EINVAL;
934 break;
935 default:
936 err = -EOPNOTSUPP;
937 break;
938 }
858 wdev_unlock(wdev); 939 wdev_unlock(wdev);
940
941 if (err)
942 return err;
859 } 943 }
860 944
861 if (memcmp(mgmt->sa, dev->dev_addr, ETH_ALEN) != 0) 945 if (memcmp(mgmt->sa, dev->dev_addr, ETH_ALEN) != 0)
862 return -EINVAL; 946 return -EINVAL;
863 947
864 /* Transmit the Action frame as requested by user space */ 948 /* Transmit the Action frame as requested by user space */
865 return rdev->ops->action(&rdev->wiphy, dev, chan, channel_type, 949 return rdev->ops->mgmt_tx(&rdev->wiphy, dev, chan, channel_type,
866 channel_type_valid, buf, len, cookie); 950 channel_type_valid, buf, len, cookie);
867} 951}
868 952
869bool cfg80211_rx_action(struct net_device *dev, int freq, const u8 *buf, 953bool cfg80211_rx_mgmt(struct net_device *dev, int freq, const u8 *buf,
870 size_t len, gfp_t gfp) 954 size_t len, gfp_t gfp)
871{ 955{
872 struct wireless_dev *wdev = dev->ieee80211_ptr; 956 struct wireless_dev *wdev = dev->ieee80211_ptr;
873 struct wiphy *wiphy = wdev->wiphy; 957 struct wiphy *wiphy = wdev->wiphy;
874 struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); 958 struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
875 struct cfg80211_action_registration *reg; 959 struct cfg80211_mgmt_registration *reg;
876 const u8 *action_data; 960 const struct ieee80211_txrx_stypes *stypes =
877 int action_data_len; 961 &wiphy->mgmt_stypes[wdev->iftype];
962 struct ieee80211_mgmt *mgmt = (void *)buf;
963 const u8 *data;
964 int data_len;
878 bool result = false; 965 bool result = false;
966 __le16 ftype = mgmt->frame_control &
967 cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE);
968 u16 stype;
879 969
880 /* frame length - min size excluding category */ 970 stype = (le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE) >> 4;
881 action_data_len = len - (IEEE80211_MIN_ACTION_SIZE - 1);
882 971
883 /* action data starts with category */ 972 if (!(stypes->rx & BIT(stype)))
884 action_data = buf + IEEE80211_MIN_ACTION_SIZE - 1; 973 return false;
885 974
886 spin_lock_bh(&wdev->action_registrations_lock); 975 data = buf + ieee80211_hdrlen(mgmt->frame_control);
976 data_len = len - ieee80211_hdrlen(mgmt->frame_control);
977
978 spin_lock_bh(&wdev->mgmt_registrations_lock);
979
980 list_for_each_entry(reg, &wdev->mgmt_registrations, list) {
981 if (reg->frame_type != ftype)
982 continue;
887 983
888 list_for_each_entry(reg, &wdev->action_registrations, list) { 984 if (reg->match_len > data_len)
889 if (reg->match_len > action_data_len)
890 continue; 985 continue;
891 986
892 if (memcmp(reg->match, action_data, reg->match_len)) 987 if (memcmp(reg->match, data, reg->match_len))
893 continue; 988 continue;
894 989
895 /* found match! */ 990 /* found match! */
896 991
897 /* Indicate the received Action frame to user space */ 992 /* Indicate the received Action frame to user space */
898 if (nl80211_send_action(rdev, dev, reg->nlpid, freq, 993 if (nl80211_send_mgmt(rdev, dev, reg->nlpid, freq,
899 buf, len, gfp)) 994 buf, len, gfp))
900 continue; 995 continue;
901 996
902 result = true; 997 result = true;
903 break; 998 break;
904 } 999 }
905 1000
906 spin_unlock_bh(&wdev->action_registrations_lock); 1001 spin_unlock_bh(&wdev->mgmt_registrations_lock);
907 1002
908 return result; 1003 return result;
909} 1004}
910EXPORT_SYMBOL(cfg80211_rx_action); 1005EXPORT_SYMBOL(cfg80211_rx_mgmt);
911 1006
912void cfg80211_action_tx_status(struct net_device *dev, u64 cookie, 1007void cfg80211_mgmt_tx_status(struct net_device *dev, u64 cookie,
913 const u8 *buf, size_t len, bool ack, gfp_t gfp) 1008 const u8 *buf, size_t len, bool ack, gfp_t gfp)
914{ 1009{
915 struct wireless_dev *wdev = dev->ieee80211_ptr; 1010 struct wireless_dev *wdev = dev->ieee80211_ptr;
916 struct wiphy *wiphy = wdev->wiphy; 1011 struct wiphy *wiphy = wdev->wiphy;
917 struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); 1012 struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
918 1013
919 /* Indicate TX status of the Action frame to user space */ 1014 /* Indicate TX status of the Action frame to user space */
920 nl80211_send_action_tx_status(rdev, dev, cookie, buf, len, ack, gfp); 1015 nl80211_send_mgmt_tx_status(rdev, dev, cookie, buf, len, ack, gfp);
921} 1016}
922EXPORT_SYMBOL(cfg80211_action_tx_status); 1017EXPORT_SYMBOL(cfg80211_mgmt_tx_status);
923 1018
924void cfg80211_cqm_rssi_notify(struct net_device *dev, 1019void cfg80211_cqm_rssi_notify(struct net_device *dev,
925 enum nl80211_cqm_rssi_threshold_event rssi_event, 1020 enum nl80211_cqm_rssi_threshold_event rssi_event,
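The mlme.c rework above generalises Action-frame registration to arbitrary management subtypes: a registration is accepted only if the requested frame type is a management frame with no stray bits outside type/subtype, and only if the driver's per-interface-type rx bitmap has the bit for (frame_control & IEEE80211_FCTL_STYPE) >> 4 set. A plain-C sketch of those checks; the frame-control masks mirror include/linux/ieee80211.h, the stypes bitmap is illustrative:

    #include <stdint.h>

    #define IEEE80211_FCTL_FTYPE  0x000c
    #define IEEE80211_FCTL_STYPE  0x00f0
    #define IEEE80211_FTYPE_MGMT  0x0000

    struct txrx_stypes { uint16_t tx, rx; };

    /* returns 0 if frame_type may be registered for RX on this iftype */
    static int mgmt_register_ok(uint16_t frame_type,
                                const struct txrx_stypes *st)
    {
            uint16_t mgmt_type;

            if ((frame_type & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_MGMT)
                    return -1;      /* not a management frame */
            if (frame_type & ~(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE))
                    return -1;      /* stray bits outside type/subtype */

            mgmt_type = (frame_type & IEEE80211_FCTL_STYPE) >> 4;
            if (!(st->rx & (1u << mgmt_type)))
                    return -1;      /* driver does not allow RX of this subtype */
            return 0;
    }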
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 37902a54e9c1..c506241f8637 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -23,6 +23,11 @@
23#include "nl80211.h" 23#include "nl80211.h"
24#include "reg.h" 24#include "reg.h"
25 25
26static int nl80211_pre_doit(struct genl_ops *ops, struct sk_buff *skb,
27 struct genl_info *info);
28static void nl80211_post_doit(struct genl_ops *ops, struct sk_buff *skb,
29 struct genl_info *info);
30
26/* the netlink family */ 31/* the netlink family */
27static struct genl_family nl80211_fam = { 32static struct genl_family nl80211_fam = {
28 .id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */ 33 .id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */
@@ -31,6 +36,8 @@ static struct genl_family nl80211_fam = {
31 .version = 1, /* no particular meaning now */ 36 .version = 1, /* no particular meaning now */
32 .maxattr = NL80211_ATTR_MAX, 37 .maxattr = NL80211_ATTR_MAX,
33 .netnsok = true, 38 .netnsok = true,
39 .pre_doit = nl80211_pre_doit,
40 .post_doit = nl80211_post_doit,
34}; 41};
35 42
36/* internal helper: get rdev and dev */ 43/* internal helper: get rdev and dev */
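Registering .pre_doit/.post_doit on the genl_family lets nl80211 centralise the per-command boilerplate (taking the RTNL, looking up the registered device and netdev, dropping references afterwards) and hand the results to each handler through info->user_ptr[]; most of the rtnl_lock()/get_rdev_dev_by_info_ifindex() deletions later in this file follow from that. A kernel-style sketch of the pattern, not nl80211's actual hooks; example_lookup(), example_put() and struct example_dev are hypothetical:

    #include <net/genetlink.h>

    static int example_pre_doit(struct genl_ops *ops, struct sk_buff *skb,
                                struct genl_info *info)
    {
            struct example_dev *ed = example_lookup(info);  /* may lock / take a ref */

            if (!ed)
                    return -ENODEV;

            info->user_ptr[0] = ed;         /* every handler reads it from here */
            return 0;
    }

    static void example_post_doit(struct genl_ops *ops, struct sk_buff *skb,
                                  struct genl_info *info)
    {
            example_put(info->user_ptr[0]); /* undo whatever pre_doit took */
    }

    static struct genl_family example_family = {
            .id        = GENL_ID_GENERATE,
            .name      = "example",
            .maxattr   = 1,
            .pre_doit  = example_pre_doit,
            .post_doit = example_post_doit,
    };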
@@ -86,6 +93,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = {
86 [NL80211_ATTR_KEY_CIPHER] = { .type = NLA_U32 }, 93 [NL80211_ATTR_KEY_CIPHER] = { .type = NLA_U32 },
87 [NL80211_ATTR_KEY_DEFAULT] = { .type = NLA_FLAG }, 94 [NL80211_ATTR_KEY_DEFAULT] = { .type = NLA_FLAG },
88 [NL80211_ATTR_KEY_SEQ] = { .type = NLA_BINARY, .len = 8 }, 95 [NL80211_ATTR_KEY_SEQ] = { .type = NLA_BINARY, .len = 8 },
96 [NL80211_ATTR_KEY_TYPE] = { .type = NLA_U32 },
89 97
90 [NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 }, 98 [NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 },
91 [NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 }, 99 [NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 },
@@ -136,6 +144,8 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = {
136 .len = sizeof(struct nl80211_sta_flag_update), 144 .len = sizeof(struct nl80211_sta_flag_update),
137 }, 145 },
138 [NL80211_ATTR_CONTROL_PORT] = { .type = NLA_FLAG }, 146 [NL80211_ATTR_CONTROL_PORT] = { .type = NLA_FLAG },
147 [NL80211_ATTR_CONTROL_PORT_ETHERTYPE] = { .type = NLA_U16 },
148 [NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT] = { .type = NLA_FLAG },
139 [NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG }, 149 [NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG },
140 [NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 }, 150 [NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 },
141 [NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 }, 151 [NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 },
@@ -156,9 +166,10 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = {
156 166
157 [NL80211_ATTR_WIPHY_TX_POWER_SETTING] = { .type = NLA_U32 }, 167 [NL80211_ATTR_WIPHY_TX_POWER_SETTING] = { .type = NLA_U32 },
158 [NL80211_ATTR_WIPHY_TX_POWER_LEVEL] = { .type = NLA_U32 }, 168 [NL80211_ATTR_WIPHY_TX_POWER_LEVEL] = { .type = NLA_U32 },
169 [NL80211_ATTR_FRAME_TYPE] = { .type = NLA_U16 },
159}; 170};
160 171
161/* policy for the attributes */ 172/* policy for the key attributes */
162static const struct nla_policy nl80211_key_policy[NL80211_KEY_MAX + 1] = { 173static const struct nla_policy nl80211_key_policy[NL80211_KEY_MAX + 1] = {
163 [NL80211_KEY_DATA] = { .type = NLA_BINARY, .len = WLAN_MAX_KEY_LEN }, 174 [NL80211_KEY_DATA] = { .type = NLA_BINARY, .len = WLAN_MAX_KEY_LEN },
164 [NL80211_KEY_IDX] = { .type = NLA_U8 }, 175 [NL80211_KEY_IDX] = { .type = NLA_U8 },
@@ -166,6 +177,7 @@ static const struct nla_policy nl80211_key_policy[NL80211_KEY_MAX + 1] = {
166 [NL80211_KEY_SEQ] = { .type = NLA_BINARY, .len = 8 }, 177 [NL80211_KEY_SEQ] = { .type = NLA_BINARY, .len = 8 },
167 [NL80211_KEY_DEFAULT] = { .type = NLA_FLAG }, 178 [NL80211_KEY_DEFAULT] = { .type = NLA_FLAG },
168 [NL80211_KEY_DEFAULT_MGMT] = { .type = NLA_FLAG }, 179 [NL80211_KEY_DEFAULT_MGMT] = { .type = NLA_FLAG },
180 [NL80211_KEY_TYPE] = { .type = NLA_U32 },
169}; 181};
170 182
171/* ifidx get helper */ 183/* ifidx get helper */
@@ -188,6 +200,47 @@ static int nl80211_get_ifidx(struct netlink_callback *cb)
188 return res; 200 return res;
189} 201}
190 202
203static int nl80211_prepare_netdev_dump(struct sk_buff *skb,
204 struct netlink_callback *cb,
205 struct cfg80211_registered_device **rdev,
206 struct net_device **dev)
207{
208 int ifidx = cb->args[0];
209 int err;
210
211 if (!ifidx)
212 ifidx = nl80211_get_ifidx(cb);
213 if (ifidx < 0)
214 return ifidx;
215
216 cb->args[0] = ifidx;
217
218 rtnl_lock();
219
220 *dev = __dev_get_by_index(sock_net(skb->sk), ifidx);
221 if (!*dev) {
222 err = -ENODEV;
223 goto out_rtnl;
224 }
225
226 *rdev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx);
227 if (IS_ERR(dev)) {
228 err = PTR_ERR(dev);
229 goto out_rtnl;
230 }
231
232 return 0;
233 out_rtnl:
234 rtnl_unlock();
235 return err;
236}
237
238static void nl80211_finish_netdev_dump(struct cfg80211_registered_device *rdev)
239{
240 cfg80211_unlock_rdev(rdev);
241 rtnl_unlock();
242}
243
191/* IE validation */ 244/* IE validation */
192static bool is_valid_ie_attr(const struct nlattr *attr) 245static bool is_valid_ie_attr(const struct nlattr *attr)
193{ 246{
@@ -255,6 +308,7 @@ static int nl80211_msg_put_channel(struct sk_buff *msg,
255struct key_parse { 308struct key_parse {
256 struct key_params p; 309 struct key_params p;
257 int idx; 310 int idx;
311 int type;
258 bool def, defmgmt; 312 bool def, defmgmt;
259}; 313};
260 314
@@ -285,6 +339,12 @@ static int nl80211_parse_key_new(struct nlattr *key, struct key_parse *k)
285 if (tb[NL80211_KEY_CIPHER]) 339 if (tb[NL80211_KEY_CIPHER])
286 k->p.cipher = nla_get_u32(tb[NL80211_KEY_CIPHER]); 340 k->p.cipher = nla_get_u32(tb[NL80211_KEY_CIPHER]);
287 341
342 if (tb[NL80211_KEY_TYPE]) {
343 k->type = nla_get_u32(tb[NL80211_KEY_TYPE]);
344 if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES)
345 return -EINVAL;
346 }
347
288 return 0; 348 return 0;
289} 349}
290 350
@@ -309,6 +369,12 @@ static int nl80211_parse_key_old(struct genl_info *info, struct key_parse *k)
309 k->def = !!info->attrs[NL80211_ATTR_KEY_DEFAULT]; 369 k->def = !!info->attrs[NL80211_ATTR_KEY_DEFAULT];
310 k->defmgmt = !!info->attrs[NL80211_ATTR_KEY_DEFAULT_MGMT]; 370 k->defmgmt = !!info->attrs[NL80211_ATTR_KEY_DEFAULT_MGMT];
311 371
372 if (info->attrs[NL80211_ATTR_KEY_TYPE]) {
373 k->type = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]);
374 if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES)
375 return -EINVAL;
376 }
377
312 return 0; 378 return 0;
313} 379}
314 380
@@ -318,6 +384,7 @@ static int nl80211_parse_key(struct genl_info *info, struct key_parse *k)
318 384
319 memset(k, 0, sizeof(*k)); 385 memset(k, 0, sizeof(*k));
320 k->idx = -1; 386 k->idx = -1;
387 k->type = -1;
321 388
322 if (info->attrs[NL80211_ATTR_KEY]) 389 if (info->attrs[NL80211_ATTR_KEY])
323 err = nl80211_parse_key_new(info->attrs[NL80211_ATTR_KEY], k); 390 err = nl80211_parse_key_new(info->attrs[NL80211_ATTR_KEY], k);
@@ -382,7 +449,7 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev,
382 } else if (parse.defmgmt) 449 } else if (parse.defmgmt)
383 goto error; 450 goto error;
384 err = cfg80211_validate_key_settings(rdev, &parse.p, 451 err = cfg80211_validate_key_settings(rdev, &parse.p,
385 parse.idx, NULL); 452 parse.idx, false, NULL);
386 if (err) 453 if (err)
387 goto error; 454 goto error;
388 result->params[parse.idx].cipher = parse.p.cipher; 455 result->params[parse.idx].cipher = parse.p.cipher;
@@ -401,18 +468,17 @@ static int nl80211_key_allowed(struct wireless_dev *wdev)
401{ 468{
402 ASSERT_WDEV_LOCK(wdev); 469 ASSERT_WDEV_LOCK(wdev);
403 470
404 if (!netif_running(wdev->netdev))
405 return -ENETDOWN;
406
407 switch (wdev->iftype) { 471 switch (wdev->iftype) {
408 case NL80211_IFTYPE_AP: 472 case NL80211_IFTYPE_AP:
409 case NL80211_IFTYPE_AP_VLAN: 473 case NL80211_IFTYPE_AP_VLAN:
474 case NL80211_IFTYPE_P2P_GO:
410 break; 475 break;
411 case NL80211_IFTYPE_ADHOC: 476 case NL80211_IFTYPE_ADHOC:
412 if (!wdev->current_bss) 477 if (!wdev->current_bss)
413 return -ENOLINK; 478 return -ENOLINK;
414 break; 479 break;
415 case NL80211_IFTYPE_STATION: 480 case NL80211_IFTYPE_STATION:
481 case NL80211_IFTYPE_P2P_CLIENT:
416 if (wdev->sme_state != CFG80211_SME_CONNECTED) 482 if (wdev->sme_state != CFG80211_SME_CONNECTED)
417 return -ENOLINK; 483 return -ENOLINK;
418 break; 484 break;
@@ -437,6 +503,8 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
437 struct ieee80211_rate *rate; 503 struct ieee80211_rate *rate;
438 int i; 504 int i;
439 u16 ifmodes = dev->wiphy.interface_modes; 505 u16 ifmodes = dev->wiphy.interface_modes;
506 const struct ieee80211_txrx_stypes *mgmt_stypes =
507 dev->wiphy.mgmt_stypes;
440 508
441 hdr = nl80211hdr_put(msg, pid, seq, flags, NL80211_CMD_NEW_WIPHY); 509 hdr = nl80211hdr_put(msg, pid, seq, flags, NL80211_CMD_NEW_WIPHY);
442 if (!hdr) 510 if (!hdr)
@@ -464,6 +532,9 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
464 NLA_PUT_U16(msg, NL80211_ATTR_MAX_SCAN_IE_LEN, 532 NLA_PUT_U16(msg, NL80211_ATTR_MAX_SCAN_IE_LEN,
465 dev->wiphy.max_scan_ie_len); 533 dev->wiphy.max_scan_ie_len);
466 534
535 if (dev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)
536 NLA_PUT_FLAG(msg, NL80211_ATTR_SUPPORT_IBSS_RSN);
537
467 NLA_PUT(msg, NL80211_ATTR_CIPHER_SUITES, 538 NLA_PUT(msg, NL80211_ATTR_CIPHER_SUITES,
468 sizeof(u32) * dev->wiphy.n_cipher_suites, 539 sizeof(u32) * dev->wiphy.n_cipher_suites,
469 dev->wiphy.cipher_suites); 540 dev->wiphy.cipher_suites);
@@ -471,6 +542,9 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
471 NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_PMKIDS, 542 NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_PMKIDS,
472 dev->wiphy.max_num_pmkids); 543 dev->wiphy.max_num_pmkids);
473 544
545 if (dev->wiphy.flags & WIPHY_FLAG_CONTROL_PORT_PROTOCOL)
546 NLA_PUT_FLAG(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE);
547
474 nl_modes = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_IFTYPES); 548 nl_modes = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_IFTYPES);
475 if (!nl_modes) 549 if (!nl_modes)
476 goto nla_put_failure; 550 goto nla_put_failure;
@@ -587,12 +661,13 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
587 CMD(flush_pmksa, FLUSH_PMKSA); 661 CMD(flush_pmksa, FLUSH_PMKSA);
588 CMD(remain_on_channel, REMAIN_ON_CHANNEL); 662 CMD(remain_on_channel, REMAIN_ON_CHANNEL);
589 CMD(set_bitrate_mask, SET_TX_BITRATE_MASK); 663 CMD(set_bitrate_mask, SET_TX_BITRATE_MASK);
590 CMD(action, ACTION); 664 CMD(mgmt_tx, FRAME);
591 if (dev->wiphy.flags & WIPHY_FLAG_NETNS_OK) { 665 if (dev->wiphy.flags & WIPHY_FLAG_NETNS_OK) {
592 i++; 666 i++;
593 NLA_PUT_U32(msg, i, NL80211_CMD_SET_WIPHY_NETNS); 667 NLA_PUT_U32(msg, i, NL80211_CMD_SET_WIPHY_NETNS);
594 } 668 }
595 CMD(set_channel, SET_CHANNEL); 669 CMD(set_channel, SET_CHANNEL);
670 CMD(set_wds_peer, SET_WDS_PEER);
596 671
597#undef CMD 672#undef CMD
598 673
@@ -608,6 +683,55 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
608 683
609 nla_nest_end(msg, nl_cmds); 684 nla_nest_end(msg, nl_cmds);
610 685
686 if (mgmt_stypes) {
687 u16 stypes;
688 struct nlattr *nl_ftypes, *nl_ifs;
689 enum nl80211_iftype ift;
690
691 nl_ifs = nla_nest_start(msg, NL80211_ATTR_TX_FRAME_TYPES);
692 if (!nl_ifs)
693 goto nla_put_failure;
694
695 for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) {
696 nl_ftypes = nla_nest_start(msg, ift);
697 if (!nl_ftypes)
698 goto nla_put_failure;
699 i = 0;
700 stypes = mgmt_stypes[ift].tx;
701 while (stypes) {
702 if (stypes & 1)
703 NLA_PUT_U16(msg, NL80211_ATTR_FRAME_TYPE,
704 (i << 4) | IEEE80211_FTYPE_MGMT);
705 stypes >>= 1;
706 i++;
707 }
708 nla_nest_end(msg, nl_ftypes);
709 }
710
711 nla_nest_end(msg, nl_ifs);
712
713 nl_ifs = nla_nest_start(msg, NL80211_ATTR_RX_FRAME_TYPES);
714 if (!nl_ifs)
715 goto nla_put_failure;
716
717 for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) {
718 nl_ftypes = nla_nest_start(msg, ift);
719 if (!nl_ftypes)
720 goto nla_put_failure;
721 i = 0;
722 stypes = mgmt_stypes[ift].rx;
723 while (stypes) {
724 if (stypes & 1)
725 NLA_PUT_U16(msg, NL80211_ATTR_FRAME_TYPE,
726 (i << 4) | IEEE80211_FTYPE_MGMT);
727 stypes >>= 1;
728 i++;
729 }
730 nla_nest_end(msg, nl_ftypes);
731 }
732 nla_nest_end(msg, nl_ifs);
733 }
734
611 return genlmsg_end(msg, hdr); 735 return genlmsg_end(msg, hdr);
612 736
613 nla_put_failure: 737 nla_put_failure:
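Each per-iftype nest above emits one NL80211_ATTR_FRAME_TYPE u16 for every set bit in the driver's tx (or rx) stypes bitmap, encoding the subtype index back into a frame-control-style value as (i << 4) | IEEE80211_FTYPE_MGMT. A small runnable sketch of that bitmap walk:

    #include <stdint.h>
    #include <stdio.h>

    #define IEEE80211_FTYPE_MGMT 0x0000

    static void emit_frame_types(uint16_t stypes)
    {
            int i = 0;

            while (stypes) {
                    if (stypes & 1)
                            printf("frame_type 0x%04x\n",
                                   (i << 4) | IEEE80211_FTYPE_MGMT);
                    stypes >>= 1;
                    i++;
            }
    }

    int main(void)
    {
            emit_frame_types(0x0020 | 0x0040);      /* e.g. subtypes 5 and 6 */
            return 0;
    }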
@@ -644,28 +768,18 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb)
644static int nl80211_get_wiphy(struct sk_buff *skb, struct genl_info *info) 768static int nl80211_get_wiphy(struct sk_buff *skb, struct genl_info *info)
645{ 769{
646 struct sk_buff *msg; 770 struct sk_buff *msg;
647 struct cfg80211_registered_device *dev; 771 struct cfg80211_registered_device *dev = info->user_ptr[0];
648
649 dev = cfg80211_get_dev_from_info(info);
650 if (IS_ERR(dev))
651 return PTR_ERR(dev);
652 772
653 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 773 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
654 if (!msg) 774 if (!msg)
655 goto out_err; 775 return -ENOMEM;
656
657 if (nl80211_send_wiphy(msg, info->snd_pid, info->snd_seq, 0, dev) < 0)
658 goto out_free;
659 776
660 cfg80211_unlock_rdev(dev); 777 if (nl80211_send_wiphy(msg, info->snd_pid, info->snd_seq, 0, dev) < 0) {
778 nlmsg_free(msg);
779 return -ENOBUFS;
780 }
661 781
662 return genlmsg_reply(msg, info); 782 return genlmsg_reply(msg, info);
663
664 out_free:
665 nlmsg_free(msg);
666 out_err:
667 cfg80211_unlock_rdev(dev);
668 return -ENOBUFS;
669} 783}
670 784
671static const struct nla_policy txq_params_policy[NL80211_TXQ_ATTR_MAX + 1] = { 785static const struct nla_policy txq_params_policy[NL80211_TXQ_ATTR_MAX + 1] = {
@@ -709,7 +823,8 @@ static bool nl80211_can_set_dev_channel(struct wireless_dev *wdev)
709 wdev->iftype == NL80211_IFTYPE_AP || 823 wdev->iftype == NL80211_IFTYPE_AP ||
710 wdev->iftype == NL80211_IFTYPE_WDS || 824 wdev->iftype == NL80211_IFTYPE_WDS ||
711 wdev->iftype == NL80211_IFTYPE_MESH_POINT || 825 wdev->iftype == NL80211_IFTYPE_MESH_POINT ||
712 wdev->iftype == NL80211_IFTYPE_MONITOR; 826 wdev->iftype == NL80211_IFTYPE_MONITOR ||
827 wdev->iftype == NL80211_IFTYPE_P2P_GO;
713} 828}
714 829
715static int __nl80211_set_channel(struct cfg80211_registered_device *rdev, 830static int __nl80211_set_channel(struct cfg80211_registered_device *rdev,
@@ -753,38 +868,48 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev,
753 868
754static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info) 869static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info)
755{ 870{
756 struct cfg80211_registered_device *rdev; 871 struct cfg80211_registered_device *rdev = info->user_ptr[0];
757 struct net_device *netdev; 872 struct net_device *netdev = info->user_ptr[1];
758 int result;
759 873
760 rtnl_lock(); 874 return __nl80211_set_channel(rdev, netdev->ieee80211_ptr, info);
875}
761 876
762 result = get_rdev_dev_by_info_ifindex(info, &rdev, &netdev); 877static int nl80211_set_wds_peer(struct sk_buff *skb, struct genl_info *info)
763 if (result) 878{
764 goto unlock; 879 struct cfg80211_registered_device *rdev = info->user_ptr[0];
880 struct net_device *dev = info->user_ptr[1];
881 struct wireless_dev *wdev = dev->ieee80211_ptr;
882 const u8 *bssid;
765 883
766 result = __nl80211_set_channel(rdev, netdev->ieee80211_ptr, info); 884 if (!info->attrs[NL80211_ATTR_MAC])
885 return -EINVAL;
767 886
768 unlock: 887 if (netif_running(dev))
769 rtnl_unlock(); 888 return -EBUSY;
770 889
771 return result; 890 if (!rdev->ops->set_wds_peer)
891 return -EOPNOTSUPP;
892
893 if (wdev->iftype != NL80211_IFTYPE_WDS)
894 return -EOPNOTSUPP;
895
896 bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
897 return rdev->ops->set_wds_peer(wdev->wiphy, dev, bssid);
772} 898}
773 899
900
774static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) 901static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
775{ 902{
776 struct cfg80211_registered_device *rdev; 903 struct cfg80211_registered_device *rdev;
777 struct net_device *netdev = NULL; 904 struct net_device *netdev = NULL;
778 struct wireless_dev *wdev; 905 struct wireless_dev *wdev;
779 int result, rem_txq_params = 0; 906 int result = 0, rem_txq_params = 0;
780 struct nlattr *nl_txq_params; 907 struct nlattr *nl_txq_params;
781 u32 changed; 908 u32 changed;
782 u8 retry_short = 0, retry_long = 0; 909 u8 retry_short = 0, retry_long = 0;
783 u32 frag_threshold = 0, rts_threshold = 0; 910 u32 frag_threshold = 0, rts_threshold = 0;
784 u8 coverage_class = 0; 911 u8 coverage_class = 0;
785 912
786 rtnl_lock();
787
788 /* 913 /*
789 * Try to find the wiphy and netdev. Normally this 914 * Try to find the wiphy and netdev. Normally this
790 * function shouldn't need the netdev, but this is 915 * function shouldn't need the netdev, but this is
@@ -811,8 +936,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
811 rdev = __cfg80211_rdev_from_info(info); 936 rdev = __cfg80211_rdev_from_info(info);
812 if (IS_ERR(rdev)) { 937 if (IS_ERR(rdev)) {
813 mutex_unlock(&cfg80211_mutex); 938 mutex_unlock(&cfg80211_mutex);
814 result = PTR_ERR(rdev); 939 return PTR_ERR(rdev);
815 goto unlock;
816 } 940 }
817 wdev = NULL; 941 wdev = NULL;
818 netdev = NULL; 942 netdev = NULL;
@@ -994,8 +1118,6 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
994 mutex_unlock(&rdev->mtx); 1118 mutex_unlock(&rdev->mtx);
995 if (netdev) 1119 if (netdev)
996 dev_put(netdev); 1120 dev_put(netdev);
997 unlock:
998 rtnl_unlock();
999 return result; 1121 return result;
1000} 1122}
1001 1123
@@ -1075,33 +1197,20 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
1075static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info) 1197static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info)
1076{ 1198{
1077 struct sk_buff *msg; 1199 struct sk_buff *msg;
1078 struct cfg80211_registered_device *dev; 1200 struct cfg80211_registered_device *dev = info->user_ptr[0];
1079 struct net_device *netdev; 1201 struct net_device *netdev = info->user_ptr[1];
1080 int err;
1081
1082 err = get_rdev_dev_by_info_ifindex(info, &dev, &netdev);
1083 if (err)
1084 return err;
1085 1202
1086 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 1203 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
1087 if (!msg) 1204 if (!msg)
1088 goto out_err; 1205 return -ENOMEM;
1089 1206
1090 if (nl80211_send_iface(msg, info->snd_pid, info->snd_seq, 0, 1207 if (nl80211_send_iface(msg, info->snd_pid, info->snd_seq, 0,
1091 dev, netdev) < 0) 1208 dev, netdev) < 0) {
1092 goto out_free; 1209 nlmsg_free(msg);
1093 1210 return -ENOBUFS;
1094 dev_put(netdev); 1211 }
1095 cfg80211_unlock_rdev(dev);
1096 1212
1097 return genlmsg_reply(msg, info); 1213 return genlmsg_reply(msg, info);
1098
1099 out_free:
1100 nlmsg_free(msg);
1101 out_err:
1102 dev_put(netdev);
1103 cfg80211_unlock_rdev(dev);
1104 return -ENOBUFS;
1105} 1214}
1106 1215
1107static const struct nla_policy mntr_flags_policy[NL80211_MNTR_FLAG_MAX + 1] = { 1216static const struct nla_policy mntr_flags_policy[NL80211_MNTR_FLAG_MAX + 1] = {
@@ -1161,39 +1270,29 @@ static int nl80211_valid_4addr(struct cfg80211_registered_device *rdev,
1161 1270
1162static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) 1271static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
1163{ 1272{
1164 struct cfg80211_registered_device *rdev; 1273 struct cfg80211_registered_device *rdev = info->user_ptr[0];
1165 struct vif_params params; 1274 struct vif_params params;
1166 int err; 1275 int err;
1167 enum nl80211_iftype otype, ntype; 1276 enum nl80211_iftype otype, ntype;
1168 struct net_device *dev; 1277 struct net_device *dev = info->user_ptr[1];
1169 u32 _flags, *flags = NULL; 1278 u32 _flags, *flags = NULL;
1170 bool change = false; 1279 bool change = false;
1171 1280
1172 memset(&params, 0, sizeof(params)); 1281 memset(&params, 0, sizeof(params));
1173 1282
1174 rtnl_lock();
1175
1176 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
1177 if (err)
1178 goto unlock_rtnl;
1179
1180 otype = ntype = dev->ieee80211_ptr->iftype; 1283 otype = ntype = dev->ieee80211_ptr->iftype;
1181 1284
1182 if (info->attrs[NL80211_ATTR_IFTYPE]) { 1285 if (info->attrs[NL80211_ATTR_IFTYPE]) {
1183 ntype = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]); 1286 ntype = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]);
1184 if (otype != ntype) 1287 if (otype != ntype)
1185 change = true; 1288 change = true;
1186 if (ntype > NL80211_IFTYPE_MAX) { 1289 if (ntype > NL80211_IFTYPE_MAX)
1187 err = -EINVAL; 1290 return -EINVAL;
1188 goto unlock;
1189 }
1190 } 1291 }
1191 1292
1192 if (info->attrs[NL80211_ATTR_MESH_ID]) { 1293 if (info->attrs[NL80211_ATTR_MESH_ID]) {
1193 if (ntype != NL80211_IFTYPE_MESH_POINT) { 1294 if (ntype != NL80211_IFTYPE_MESH_POINT)
1194 err = -EINVAL; 1295 return -EINVAL;
1195 goto unlock;
1196 }
1197 params.mesh_id = nla_data(info->attrs[NL80211_ATTR_MESH_ID]); 1296 params.mesh_id = nla_data(info->attrs[NL80211_ATTR_MESH_ID]);
1198 params.mesh_id_len = nla_len(info->attrs[NL80211_ATTR_MESH_ID]); 1297 params.mesh_id_len = nla_len(info->attrs[NL80211_ATTR_MESH_ID]);
1199 change = true; 1298 change = true;
@@ -1204,20 +1303,18 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
1204 change = true; 1303 change = true;
1205 err = nl80211_valid_4addr(rdev, dev, params.use_4addr, ntype); 1304 err = nl80211_valid_4addr(rdev, dev, params.use_4addr, ntype);
1206 if (err) 1305 if (err)
1207 goto unlock; 1306 return err;
1208 } else { 1307 } else {
1209 params.use_4addr = -1; 1308 params.use_4addr = -1;
1210 } 1309 }
1211 1310
1212 if (info->attrs[NL80211_ATTR_MNTR_FLAGS]) { 1311 if (info->attrs[NL80211_ATTR_MNTR_FLAGS]) {
1213 if (ntype != NL80211_IFTYPE_MONITOR) { 1312 if (ntype != NL80211_IFTYPE_MONITOR)
1214 err = -EINVAL; 1313 return -EINVAL;
1215 goto unlock;
1216 }
1217 err = parse_monitor_flags(info->attrs[NL80211_ATTR_MNTR_FLAGS], 1314 err = parse_monitor_flags(info->attrs[NL80211_ATTR_MNTR_FLAGS],
1218 &_flags); 1315 &_flags);
1219 if (err) 1316 if (err)
1220 goto unlock; 1317 return err;
1221 1318
1222 flags = &_flags; 1319 flags = &_flags;
1223 change = true; 1320 change = true;
@@ -1231,17 +1328,12 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
1231 if (!err && params.use_4addr != -1) 1328 if (!err && params.use_4addr != -1)
1232 dev->ieee80211_ptr->use_4addr = params.use_4addr; 1329 dev->ieee80211_ptr->use_4addr = params.use_4addr;
1233 1330
1234 unlock:
1235 dev_put(dev);
1236 cfg80211_unlock_rdev(rdev);
1237 unlock_rtnl:
1238 rtnl_unlock();
1239 return err; 1331 return err;
1240} 1332}
1241 1333
1242static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) 1334static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
1243{ 1335{
1244 struct cfg80211_registered_device *rdev; 1336 struct cfg80211_registered_device *rdev = info->user_ptr[0];
1245 struct vif_params params; 1337 struct vif_params params;
1246 int err; 1338 int err;
1247 enum nl80211_iftype type = NL80211_IFTYPE_UNSPECIFIED; 1339 enum nl80211_iftype type = NL80211_IFTYPE_UNSPECIFIED;
@@ -1258,19 +1350,9 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
1258 return -EINVAL; 1350 return -EINVAL;
1259 } 1351 }
1260 1352
1261 rtnl_lock();
1262
1263 rdev = cfg80211_get_dev_from_info(info);
1264 if (IS_ERR(rdev)) {
1265 err = PTR_ERR(rdev);
1266 goto unlock_rtnl;
1267 }
1268
1269 if (!rdev->ops->add_virtual_intf || 1353 if (!rdev->ops->add_virtual_intf ||
1270 !(rdev->wiphy.interface_modes & (1 << type))) { 1354 !(rdev->wiphy.interface_modes & (1 << type)))
1271 err = -EOPNOTSUPP; 1355 return -EOPNOTSUPP;
1272 goto unlock;
1273 }
1274 1356
1275 if (type == NL80211_IFTYPE_MESH_POINT && 1357 if (type == NL80211_IFTYPE_MESH_POINT &&
1276 info->attrs[NL80211_ATTR_MESH_ID]) { 1358 info->attrs[NL80211_ATTR_MESH_ID]) {
@@ -1282,7 +1364,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
1282 params.use_4addr = !!nla_get_u8(info->attrs[NL80211_ATTR_4ADDR]); 1364 params.use_4addr = !!nla_get_u8(info->attrs[NL80211_ATTR_4ADDR]);
1283 err = nl80211_valid_4addr(rdev, NULL, params.use_4addr, type); 1365 err = nl80211_valid_4addr(rdev, NULL, params.use_4addr, type);
1284 if (err) 1366 if (err)
1285 goto unlock; 1367 return err;
1286 } 1368 }
1287 1369
1288 err = parse_monitor_flags(type == NL80211_IFTYPE_MONITOR ? 1370 err = parse_monitor_flags(type == NL80211_IFTYPE_MONITOR ?
@@ -1292,38 +1374,18 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
1292 nla_data(info->attrs[NL80211_ATTR_IFNAME]), 1374 nla_data(info->attrs[NL80211_ATTR_IFNAME]),
1293 type, err ? NULL : &flags, &params); 1375 type, err ? NULL : &flags, &params);
1294 1376
1295 unlock:
1296 cfg80211_unlock_rdev(rdev);
1297 unlock_rtnl:
1298 rtnl_unlock();
1299 return err; 1377 return err;
1300} 1378}
1301 1379
1302static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info) 1380static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info)
1303{ 1381{
1304 struct cfg80211_registered_device *rdev; 1382 struct cfg80211_registered_device *rdev = info->user_ptr[0];
1305 int err; 1383 struct net_device *dev = info->user_ptr[1];
1306 struct net_device *dev;
1307 1384
1308 rtnl_lock(); 1385 if (!rdev->ops->del_virtual_intf)
1309 1386 return -EOPNOTSUPP;
1310 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
1311 if (err)
1312 goto unlock_rtnl;
1313
1314 if (!rdev->ops->del_virtual_intf) {
1315 err = -EOPNOTSUPP;
1316 goto out;
1317 }
1318
1319 err = rdev->ops->del_virtual_intf(&rdev->wiphy, dev);
1320 1387
1321 out: 1388 return rdev->ops->del_virtual_intf(&rdev->wiphy, dev);
1322 cfg80211_unlock_rdev(rdev);
1323 dev_put(dev);
1324 unlock_rtnl:
1325 rtnl_unlock();
1326 return err;
1327} 1389}
1328 1390
1329struct get_key_cookie { 1391struct get_key_cookie {
@@ -1376,11 +1438,12 @@ static void get_key_callback(void *c, struct key_params *params)
1376 1438
1377static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) 1439static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
1378{ 1440{
1379 struct cfg80211_registered_device *rdev; 1441 struct cfg80211_registered_device *rdev = info->user_ptr[0];
1380 int err; 1442 int err;
1381 struct net_device *dev; 1443 struct net_device *dev = info->user_ptr[1];
1382 u8 key_idx = 0; 1444 u8 key_idx = 0;
1383 u8 *mac_addr = NULL; 1445 const u8 *mac_addr = NULL;
1446 bool pairwise;
1384 struct get_key_cookie cookie = { 1447 struct get_key_cookie cookie = {
1385 .error = 0, 1448 .error = 0,
1386 }; 1449 };
@@ -1396,30 +1459,28 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
1396 if (info->attrs[NL80211_ATTR_MAC]) 1459 if (info->attrs[NL80211_ATTR_MAC])
1397 mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); 1460 mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
1398 1461
1399 rtnl_lock(); 1462 pairwise = !!mac_addr;
1400 1463 if (info->attrs[NL80211_ATTR_KEY_TYPE]) {
1401 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); 1464 u32 kt = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]);
1402 if (err) 1465 if (kt >= NUM_NL80211_KEYTYPES)
1403 goto unlock_rtnl; 1466 return -EINVAL;
1404 1467 if (kt != NL80211_KEYTYPE_GROUP &&
1405 if (!rdev->ops->get_key) { 1468 kt != NL80211_KEYTYPE_PAIRWISE)
1406 err = -EOPNOTSUPP; 1469 return -EINVAL;
1407 goto out; 1470 pairwise = kt == NL80211_KEYTYPE_PAIRWISE;
1408 } 1471 }
1409 1472
1473 if (!rdev->ops->get_key)
1474 return -EOPNOTSUPP;
1475
1410 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 1476 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
1411 if (!msg) { 1477 if (!msg)
1412 err = -ENOMEM; 1478 return -ENOMEM;
1413 goto out;
1414 }
1415 1479
1416 hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, 1480 hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
1417 NL80211_CMD_NEW_KEY); 1481 NL80211_CMD_NEW_KEY);
1418 1482 if (IS_ERR(hdr))
1419 if (IS_ERR(hdr)) { 1483 return PTR_ERR(hdr);
1420 err = PTR_ERR(hdr);
1421 goto free_msg;
1422 }
1423 1484
1424 cookie.msg = msg; 1485 cookie.msg = msg;
1425 cookie.idx = key_idx; 1486 cookie.idx = key_idx;
@@ -1429,8 +1490,12 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
1429 if (mac_addr) 1490 if (mac_addr)
1430 NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr); 1491 NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr);
1431 1492
1432 err = rdev->ops->get_key(&rdev->wiphy, dev, key_idx, mac_addr, 1493 if (pairwise && mac_addr &&
1433 &cookie, get_key_callback); 1494 !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
1495 return -ENOENT;
1496
1497 err = rdev->ops->get_key(&rdev->wiphy, dev, key_idx, pairwise,
1498 mac_addr, &cookie, get_key_callback);
1434 1499
1435 if (err) 1500 if (err)
1436 goto free_msg; 1501 goto free_msg;
@@ -1439,28 +1504,21 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
1439 goto nla_put_failure; 1504 goto nla_put_failure;
1440 1505
1441 genlmsg_end(msg, hdr); 1506 genlmsg_end(msg, hdr);
1442 err = genlmsg_reply(msg, info); 1507 return genlmsg_reply(msg, info);
1443 goto out;
1444 1508
1445 nla_put_failure: 1509 nla_put_failure:
1446 err = -ENOBUFS; 1510 err = -ENOBUFS;
1447 free_msg: 1511 free_msg:
1448 nlmsg_free(msg); 1512 nlmsg_free(msg);
1449 out:
1450 cfg80211_unlock_rdev(rdev);
1451 dev_put(dev);
1452 unlock_rtnl:
1453 rtnl_unlock();
1454
1455 return err; 1513 return err;
1456} 1514}
1457 1515
1458static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) 1516static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info)
1459{ 1517{
1460 struct cfg80211_registered_device *rdev; 1518 struct cfg80211_registered_device *rdev = info->user_ptr[0];
1461 struct key_parse key; 1519 struct key_parse key;
1462 int err; 1520 int err;
1463 struct net_device *dev; 1521 struct net_device *dev = info->user_ptr[1];
1464 int (*func)(struct wiphy *wiphy, struct net_device *netdev, 1522 int (*func)(struct wiphy *wiphy, struct net_device *netdev,
1465 u8 key_index); 1523 u8 key_index);
1466 1524
@@ -1475,21 +1533,13 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info)
1475 if (!key.def && !key.defmgmt) 1533 if (!key.def && !key.defmgmt)
1476 return -EINVAL; 1534 return -EINVAL;
1477 1535
1478 rtnl_lock();
1479
1480 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
1481 if (err)
1482 goto unlock_rtnl;
1483
1484 if (key.def) 1536 if (key.def)
1485 func = rdev->ops->set_default_key; 1537 func = rdev->ops->set_default_key;
1486 else 1538 else
1487 func = rdev->ops->set_default_mgmt_key; 1539 func = rdev->ops->set_default_mgmt_key;
1488 1540
1489 if (!func) { 1541 if (!func)
1490 err = -EOPNOTSUPP; 1542 return -EOPNOTSUPP;
1491 goto out;
1492 }
1493 1543
1494 wdev_lock(dev->ieee80211_ptr); 1544 wdev_lock(dev->ieee80211_ptr);
1495 err = nl80211_key_allowed(dev->ieee80211_ptr); 1545 err = nl80211_key_allowed(dev->ieee80211_ptr);
@@ -1506,23 +1556,16 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info)
1506#endif 1556#endif
1507 wdev_unlock(dev->ieee80211_ptr); 1557 wdev_unlock(dev->ieee80211_ptr);
1508 1558
1509 out:
1510 cfg80211_unlock_rdev(rdev);
1511 dev_put(dev);
1512
1513 unlock_rtnl:
1514 rtnl_unlock();
1515
1516 return err; 1559 return err;
1517} 1560}
1518 1561
1519static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) 1562static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info)
1520{ 1563{
1521 struct cfg80211_registered_device *rdev; 1564 struct cfg80211_registered_device *rdev = info->user_ptr[0];
1522 int err; 1565 int err;
1523 struct net_device *dev; 1566 struct net_device *dev = info->user_ptr[1];
1524 struct key_parse key; 1567 struct key_parse key;
1525 u8 *mac_addr = NULL; 1568 const u8 *mac_addr = NULL;
1526 1569
1527 err = nl80211_parse_key(info, &key); 1570 err = nl80211_parse_key(info, &key);
1528 if (err) 1571 if (err)
@@ -1534,43 +1577,42 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info)
1534 if (info->attrs[NL80211_ATTR_MAC]) 1577 if (info->attrs[NL80211_ATTR_MAC])
1535 mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); 1578 mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
1536 1579
1537 rtnl_lock(); 1580 if (key.type == -1) {
1581 if (mac_addr)
1582 key.type = NL80211_KEYTYPE_PAIRWISE;
1583 else
1584 key.type = NL80211_KEYTYPE_GROUP;
1585 }
1538 1586
1539 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); 1587 /* for now */
1540 if (err) 1588 if (key.type != NL80211_KEYTYPE_PAIRWISE &&
1541 goto unlock_rtnl; 1589 key.type != NL80211_KEYTYPE_GROUP)
1590 return -EINVAL;
1542 1591
1543 if (!rdev->ops->add_key) { 1592 if (!rdev->ops->add_key)
1544 err = -EOPNOTSUPP; 1593 return -EOPNOTSUPP;
1545 goto out;
1546 }
1547 1594
1548 if (cfg80211_validate_key_settings(rdev, &key.p, key.idx, mac_addr)) { 1595 if (cfg80211_validate_key_settings(rdev, &key.p, key.idx,
1549 err = -EINVAL; 1596 key.type == NL80211_KEYTYPE_PAIRWISE,
1550 goto out; 1597 mac_addr))
1551 } 1598 return -EINVAL;
1552 1599
1553 wdev_lock(dev->ieee80211_ptr); 1600 wdev_lock(dev->ieee80211_ptr);
1554 err = nl80211_key_allowed(dev->ieee80211_ptr); 1601 err = nl80211_key_allowed(dev->ieee80211_ptr);
1555 if (!err) 1602 if (!err)
1556 err = rdev->ops->add_key(&rdev->wiphy, dev, key.idx, 1603 err = rdev->ops->add_key(&rdev->wiphy, dev, key.idx,
1604 key.type == NL80211_KEYTYPE_PAIRWISE,
1557 mac_addr, &key.p); 1605 mac_addr, &key.p);
1558 wdev_unlock(dev->ieee80211_ptr); 1606 wdev_unlock(dev->ieee80211_ptr);
1559 1607
1560 out:
1561 cfg80211_unlock_rdev(rdev);
1562 dev_put(dev);
1563 unlock_rtnl:
1564 rtnl_unlock();
1565
1566 return err; 1608 return err;
1567} 1609}
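The new_key hunk above also introduces a default for an unspecified key type: pairwise when a peer MAC address was supplied, group otherwise, and anything else is rejected "for now". A small stand-alone sketch of that decision, with illustrative enum values in place of the NL80211_KEYTYPE_* constants:

#include <stdio.h>

/* Illustrative stand-ins for the nl80211 key type values. */
enum keytype { KEYTYPE_UNSPEC = -1, KEYTYPE_GROUP = 0, KEYTYPE_PAIRWISE = 1 };

/* Default and validate the key type the way the diff does. */
static int resolve_key_type(int type, const unsigned char *mac_addr)
{
	if (type == KEYTYPE_UNSPEC)
		type = mac_addr ? KEYTYPE_PAIRWISE : KEYTYPE_GROUP;

	/* "for now" only pairwise and group keys are accepted */
	if (type != KEYTYPE_PAIRWISE && type != KEYTYPE_GROUP)
		return -1;
	return type;
}

int main(void)
{
	unsigned char peer[6] = { 0x02, 0, 0, 0, 0, 1 };

	printf("with MAC: %d\n", resolve_key_type(KEYTYPE_UNSPEC, peer));
	printf("no MAC:   %d\n", resolve_key_type(KEYTYPE_UNSPEC, NULL));
	return 0;
}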
1568 1610
1569static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info) 1611static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info)
1570{ 1612{
1571 struct cfg80211_registered_device *rdev; 1613 struct cfg80211_registered_device *rdev = info->user_ptr[0];
1572 int err; 1614 int err;
1573 struct net_device *dev; 1615 struct net_device *dev = info->user_ptr[1];
1574 u8 *mac_addr = NULL; 1616 u8 *mac_addr = NULL;
1575 struct key_parse key; 1617 struct key_parse key;
1576 1618
@@ -1581,21 +1623,32 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info)
1581 if (info->attrs[NL80211_ATTR_MAC]) 1623 if (info->attrs[NL80211_ATTR_MAC])
1582 mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); 1624 mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
1583 1625
1584 rtnl_lock(); 1626 if (key.type == -1) {
1627 if (mac_addr)
1628 key.type = NL80211_KEYTYPE_PAIRWISE;
1629 else
1630 key.type = NL80211_KEYTYPE_GROUP;
1631 }
1585 1632
1586 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); 1633 /* for now */
1587 if (err) 1634 if (key.type != NL80211_KEYTYPE_PAIRWISE &&
1588 goto unlock_rtnl; 1635 key.type != NL80211_KEYTYPE_GROUP)
1636 return -EINVAL;
1589 1637
1590 if (!rdev->ops->del_key) { 1638 if (!rdev->ops->del_key)
1591 err = -EOPNOTSUPP; 1639 return -EOPNOTSUPP;
1592 goto out;
1593 }
1594 1640
1595 wdev_lock(dev->ieee80211_ptr); 1641 wdev_lock(dev->ieee80211_ptr);
1596 err = nl80211_key_allowed(dev->ieee80211_ptr); 1642 err = nl80211_key_allowed(dev->ieee80211_ptr);
1643
1644 if (key.type == NL80211_KEYTYPE_PAIRWISE && mac_addr &&
1645 !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
1646 err = -ENOENT;
1647
1597 if (!err) 1648 if (!err)
1598 err = rdev->ops->del_key(&rdev->wiphy, dev, key.idx, mac_addr); 1649 err = rdev->ops->del_key(&rdev->wiphy, dev, key.idx,
1650 key.type == NL80211_KEYTYPE_PAIRWISE,
1651 mac_addr);
1599 1652
1600#ifdef CONFIG_CFG80211_WEXT 1653#ifdef CONFIG_CFG80211_WEXT
1601 if (!err) { 1654 if (!err) {
@@ -1607,13 +1660,6 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info)
1607#endif 1660#endif
1608 wdev_unlock(dev->ieee80211_ptr); 1661 wdev_unlock(dev->ieee80211_ptr);
1609 1662
1610 out:
1611 cfg80211_unlock_rdev(rdev);
1612 dev_put(dev);
1613
1614 unlock_rtnl:
1615 rtnl_unlock();
1616
1617 return err; 1663 return err;
1618} 1664}
1619 1665
@@ -1621,35 +1667,25 @@ static int nl80211_addset_beacon(struct sk_buff *skb, struct genl_info *info)
1621{ 1667{
1622 int (*call)(struct wiphy *wiphy, struct net_device *dev, 1668 int (*call)(struct wiphy *wiphy, struct net_device *dev,
1623 struct beacon_parameters *info); 1669 struct beacon_parameters *info);
1624 struct cfg80211_registered_device *rdev; 1670 struct cfg80211_registered_device *rdev = info->user_ptr[0];
1625 int err; 1671 struct net_device *dev = info->user_ptr[1];
1626 struct net_device *dev;
1627 struct beacon_parameters params; 1672 struct beacon_parameters params;
1628 int haveinfo = 0; 1673 int haveinfo = 0;
1629 1674
1630 if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_BEACON_TAIL])) 1675 if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_BEACON_TAIL]))
1631 return -EINVAL; 1676 return -EINVAL;
1632 1677
1633 rtnl_lock(); 1678 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
1634 1679 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
1635 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); 1680 return -EOPNOTSUPP;
1636 if (err)
1637 goto unlock_rtnl;
1638
1639 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP) {
1640 err = -EOPNOTSUPP;
1641 goto out;
1642 }
1643 1681
1644 switch (info->genlhdr->cmd) { 1682 switch (info->genlhdr->cmd) {
1645 case NL80211_CMD_NEW_BEACON: 1683 case NL80211_CMD_NEW_BEACON:
1646 /* these are required for NEW_BEACON */ 1684 /* these are required for NEW_BEACON */
1647 if (!info->attrs[NL80211_ATTR_BEACON_INTERVAL] || 1685 if (!info->attrs[NL80211_ATTR_BEACON_INTERVAL] ||
1648 !info->attrs[NL80211_ATTR_DTIM_PERIOD] || 1686 !info->attrs[NL80211_ATTR_DTIM_PERIOD] ||
1649 !info->attrs[NL80211_ATTR_BEACON_HEAD]) { 1687 !info->attrs[NL80211_ATTR_BEACON_HEAD])
1650 err = -EINVAL; 1688 return -EINVAL;
1651 goto out;
1652 }
1653 1689
1654 call = rdev->ops->add_beacon; 1690 call = rdev->ops->add_beacon;
1655 break; 1691 break;
@@ -1658,14 +1694,11 @@ static int nl80211_addset_beacon(struct sk_buff *skb, struct genl_info *info)
1658 break; 1694 break;
1659 default: 1695 default:
1660 WARN_ON(1); 1696 WARN_ON(1);
1661 err = -EOPNOTSUPP; 1697 return -EOPNOTSUPP;
1662 goto out;
1663 } 1698 }
1664 1699
1665 if (!call) { 1700 if (!call)
1666 err = -EOPNOTSUPP; 1701 return -EOPNOTSUPP;
1667 goto out;
1668 }
1669 1702
1670 memset(&params, 0, sizeof(params)); 1703 memset(&params, 0, sizeof(params));
1671 1704
@@ -1695,52 +1728,25 @@ static int nl80211_addset_beacon(struct sk_buff *skb, struct genl_info *info)
1695 haveinfo = 1; 1728 haveinfo = 1;
1696 } 1729 }
1697 1730
1698 if (!haveinfo) { 1731 if (!haveinfo)
1699 err = -EINVAL; 1732 return -EINVAL;
1700 goto out;
1701 }
1702
1703 err = call(&rdev->wiphy, dev, &params);
1704
1705 out:
1706 cfg80211_unlock_rdev(rdev);
1707 dev_put(dev);
1708 unlock_rtnl:
1709 rtnl_unlock();
1710 1733
1711 return err; 1734 return call(&rdev->wiphy, dev, &params);
1712} 1735}
1713 1736
1714static int nl80211_del_beacon(struct sk_buff *skb, struct genl_info *info) 1737static int nl80211_del_beacon(struct sk_buff *skb, struct genl_info *info)
1715{ 1738{
1716 struct cfg80211_registered_device *rdev; 1739 struct cfg80211_registered_device *rdev = info->user_ptr[0];
1717 int err; 1740 struct net_device *dev = info->user_ptr[1];
1718 struct net_device *dev;
1719 1741
1720 rtnl_lock(); 1742 if (!rdev->ops->del_beacon)
1721 1743 return -EOPNOTSUPP;
1722 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
1723 if (err)
1724 goto unlock_rtnl;
1725
1726 if (!rdev->ops->del_beacon) {
1727 err = -EOPNOTSUPP;
1728 goto out;
1729 }
1730
1731 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP) {
1732 err = -EOPNOTSUPP;
1733 goto out;
1734 }
1735 err = rdev->ops->del_beacon(&rdev->wiphy, dev);
1736 1744
1737 out: 1745 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
1738 cfg80211_unlock_rdev(rdev); 1746 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
1739 dev_put(dev); 1747 return -EOPNOTSUPP;
1740 unlock_rtnl:
1741 rtnl_unlock();
1742 1748
1743 return err; 1749 return rdev->ops->del_beacon(&rdev->wiphy, dev);
1744} 1750}
1745 1751
1746static const struct nla_policy sta_flags_policy[NL80211_STA_FLAG_MAX + 1] = { 1752static const struct nla_policy sta_flags_policy[NL80211_STA_FLAG_MAX + 1] = {
@@ -1861,6 +1867,12 @@ static int nl80211_send_station(struct sk_buff *msg, u32 pid, u32 seq,
1861 if (sinfo->filled & STATION_INFO_TX_PACKETS) 1867 if (sinfo->filled & STATION_INFO_TX_PACKETS)
1862 NLA_PUT_U32(msg, NL80211_STA_INFO_TX_PACKETS, 1868 NLA_PUT_U32(msg, NL80211_STA_INFO_TX_PACKETS,
1863 sinfo->tx_packets); 1869 sinfo->tx_packets);
1870 if (sinfo->filled & STATION_INFO_TX_RETRIES)
1871 NLA_PUT_U32(msg, NL80211_STA_INFO_TX_RETRIES,
1872 sinfo->tx_retries);
1873 if (sinfo->filled & STATION_INFO_TX_FAILED)
1874 NLA_PUT_U32(msg, NL80211_STA_INFO_TX_FAILED,
1875 sinfo->tx_failed);
1864 nla_nest_end(msg, sinfoattr); 1876 nla_nest_end(msg, sinfoattr);
1865 1877
1866 return genlmsg_end(msg, hdr); 1878 return genlmsg_end(msg, hdr);
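The send_station hunk above only emits statistics whose bit is set in sinfo->filled, which is how the new TX_RETRIES and TX_FAILED counters can be added without requiring every driver to report them. A self-contained sketch of that bitmask-gated attribute fill, with printf standing in for the NLA_PUT_U32 macros:

#include <stdio.h>
#include <stdint.h>

/* Illustrative bitmask flags modelled on STATION_INFO_*. */
#define INFO_TX_PACKETS (1u << 0)
#define INFO_TX_RETRIES (1u << 1)
#define INFO_TX_FAILED  (1u << 2)

struct sta_info_sketch {
	uint32_t filled;	/* which of the fields below are valid */
	uint32_t tx_packets, tx_retries, tx_failed;
};

/* Emit only the attributes the driver actually filled in. */
static void send_station(const struct sta_info_sketch *s)
{
	if (s->filled & INFO_TX_PACKETS)
		printf("TX_PACKETS=%u\n", s->tx_packets);
	if (s->filled & INFO_TX_RETRIES)
		printf("TX_RETRIES=%u\n", s->tx_retries);
	if (s->filled & INFO_TX_FAILED)
		printf("TX_FAILED=%u\n", s->tx_failed);
}

int main(void)
{
	struct sta_info_sketch s = {
		.filled = INFO_TX_PACKETS | INFO_TX_FAILED,
		.tx_packets = 1200,
		.tx_failed = 3,
	};

	send_station(&s);	/* prints packets and failed, skips retries */
	return 0;
}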
@@ -1877,28 +1889,12 @@ static int nl80211_dump_station(struct sk_buff *skb,
1877 struct cfg80211_registered_device *dev; 1889 struct cfg80211_registered_device *dev;
1878 struct net_device *netdev; 1890 struct net_device *netdev;
1879 u8 mac_addr[ETH_ALEN]; 1891 u8 mac_addr[ETH_ALEN];
1880 int ifidx = cb->args[0];
1881 int sta_idx = cb->args[1]; 1892 int sta_idx = cb->args[1];
1882 int err; 1893 int err;
1883 1894
1884 if (!ifidx) 1895 err = nl80211_prepare_netdev_dump(skb, cb, &dev, &netdev);
1885 ifidx = nl80211_get_ifidx(cb); 1896 if (err)
1886 if (ifidx < 0) 1897 return err;
1887 return ifidx;
1888
1889 rtnl_lock();
1890
1891 netdev = __dev_get_by_index(sock_net(skb->sk), ifidx);
1892 if (!netdev) {
1893 err = -ENODEV;
1894 goto out_rtnl;
1895 }
1896
1897 dev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx);
1898 if (IS_ERR(dev)) {
1899 err = PTR_ERR(dev);
1900 goto out_rtnl;
1901 }
1902 1898
1903 if (!dev->ops->dump_station) { 1899 if (!dev->ops->dump_station) {
1904 err = -EOPNOTSUPP; 1900 err = -EOPNOTSUPP;
@@ -1928,21 +1924,19 @@ static int nl80211_dump_station(struct sk_buff *skb,
1928 cb->args[1] = sta_idx; 1924 cb->args[1] = sta_idx;
1929 err = skb->len; 1925 err = skb->len;
1930 out_err: 1926 out_err:
1931 cfg80211_unlock_rdev(dev); 1927 nl80211_finish_netdev_dump(dev);
1932 out_rtnl:
1933 rtnl_unlock();
1934 1928
1935 return err; 1929 return err;
1936} 1930}
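nl80211_dump_station above keeps its resume point in cb->args[1] (the last station index already delivered), so a dump that overflows one message can pick up where it left off on the next callback. A sketch of that cursor-based continuation, with a small fixed-size budget standing in for the netlink skb:

#include <stdio.h>

#define TOTAL_ITEMS 10
#define PER_MESSAGE 4	/* pretend the skb only fits four entries */

struct cb_sketch { long args[2]; };	/* args[1] = resume index */

/* One "dump" pass: emit items until the buffer is full, save the cursor. */
static int dump_pass(struct cb_sketch *cb)
{
	int idx = (int)cb->args[1];
	int emitted = 0;

	while (idx < TOTAL_ITEMS && emitted < PER_MESSAGE) {
		printf("item %d\n", idx);
		idx++;
		emitted++;
	}
	cb->args[1] = idx;	/* resume here on the next callback */
	return emitted;		/* 0 means the dump is complete */
}

int main(void)
{
	struct cb_sketch cb = { { 0, 0 } };

	while (dump_pass(&cb) > 0)
		printf("-- message boundary --\n");
	return 0;
}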
1937 1931
1938static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) 1932static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info)
1939{ 1933{
1940 struct cfg80211_registered_device *rdev; 1934 struct cfg80211_registered_device *rdev = info->user_ptr[0];
1941 int err; 1935 struct net_device *dev = info->user_ptr[1];
1942 struct net_device *dev;
1943 struct station_info sinfo; 1936 struct station_info sinfo;
1944 struct sk_buff *msg; 1937 struct sk_buff *msg;
1945 u8 *mac_addr = NULL; 1938 u8 *mac_addr = NULL;
1939 int err;
1946 1940
1947 memset(&sinfo, 0, sizeof(sinfo)); 1941 memset(&sinfo, 0, sizeof(sinfo));
1948 1942
@@ -1951,41 +1945,24 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info)
1951 1945
1952 mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); 1946 mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
1953 1947
1954 rtnl_lock(); 1948 if (!rdev->ops->get_station)
1955 1949 return -EOPNOTSUPP;
1956 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
1957 if (err)
1958 goto out_rtnl;
1959
1960 if (!rdev->ops->get_station) {
1961 err = -EOPNOTSUPP;
1962 goto out;
1963 }
1964 1950
1965 err = rdev->ops->get_station(&rdev->wiphy, dev, mac_addr, &sinfo); 1951 err = rdev->ops->get_station(&rdev->wiphy, dev, mac_addr, &sinfo);
1966 if (err) 1952 if (err)
1967 goto out; 1953 return err;
1968 1954
1969 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 1955 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
1970 if (!msg) 1956 if (!msg)
1971 goto out; 1957 return -ENOMEM;
1972 1958
1973 if (nl80211_send_station(msg, info->snd_pid, info->snd_seq, 0, 1959 if (nl80211_send_station(msg, info->snd_pid, info->snd_seq, 0,
1974 dev, mac_addr, &sinfo) < 0) 1960 dev, mac_addr, &sinfo) < 0) {
1975 goto out_free; 1961 nlmsg_free(msg);
1976 1962 return -ENOBUFS;
1977 err = genlmsg_reply(msg, info); 1963 }
1978 goto out;
1979
1980 out_free:
1981 nlmsg_free(msg);
1982 out:
1983 cfg80211_unlock_rdev(rdev);
1984 dev_put(dev);
1985 out_rtnl:
1986 rtnl_unlock();
1987 1964
1988 return err; 1965 return genlmsg_reply(msg, info);
1989} 1966}
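The simplified get_station reply path above follows a single ownership rule: the handler frees the message only when filling it fails; on success genlmsg_reply takes over the buffer. A minimal sketch of that hand-off, with malloc/free and stub fill/send functions standing in for the nlmsg/genlmsg helpers:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stub "fill": returns <0 if the reply does not fit. */
static int fill_reply(char *buf, size_t len, const char *payload)
{
	if (strlen(payload) + 1 > len)
		return -1;
	strcpy(buf, payload);
	return 0;
}

/* Stub "reply": consumes (frees) the buffer, like genlmsg_reply. */
static int send_reply(char *buf)
{
	printf("reply: %s\n", buf);
	free(buf);
	return 0;
}

static int get_station_sketch(const char *payload)
{
	char *msg = malloc(32);

	if (!msg)
		return -12;		/* -ENOMEM */

	if (fill_reply(msg, 32, payload) < 0) {
		free(msg);		/* still ours on a fill failure */
		return -105;		/* -ENOBUFS */
	}

	return send_reply(msg);		/* ownership passes on success */
}

int main(void)
{
	get_station_sketch("tx_packets=1200");		/* fits: reply sent */
	get_station_sketch("this payload is far too long to fit");	/* freed, -ENOBUFS */
	return 0;
}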
1990 1967
1991/* 1968/*
@@ -2015,9 +1992,9 @@ static int get_vlan(struct genl_info *info,
2015 1992
2016static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) 1993static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
2017{ 1994{
2018 struct cfg80211_registered_device *rdev; 1995 struct cfg80211_registered_device *rdev = info->user_ptr[0];
2019 int err; 1996 int err;
2020 struct net_device *dev; 1997 struct net_device *dev = info->user_ptr[1];
2021 struct station_parameters params; 1998 struct station_parameters params;
2022 u8 *mac_addr = NULL; 1999 u8 *mac_addr = NULL;
2023 2000
@@ -2055,12 +2032,6 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
2055 params.plink_action = 2032 params.plink_action =
2056 nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]); 2033 nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]);
2057 2034
2058 rtnl_lock();
2059
2060 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
2061 if (err)
2062 goto out_rtnl;
2063
2064 err = get_vlan(info, rdev, &params.vlan); 2035 err = get_vlan(info, rdev, &params.vlan);
2065 if (err) 2036 if (err)
2066 goto out; 2037 goto out;
@@ -2071,10 +2042,12 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
2071 switch (dev->ieee80211_ptr->iftype) { 2042 switch (dev->ieee80211_ptr->iftype) {
2072 case NL80211_IFTYPE_AP: 2043 case NL80211_IFTYPE_AP:
2073 case NL80211_IFTYPE_AP_VLAN: 2044 case NL80211_IFTYPE_AP_VLAN:
2045 case NL80211_IFTYPE_P2P_GO:
2074 /* disallow mesh-specific things */ 2046 /* disallow mesh-specific things */
2075 if (params.plink_action) 2047 if (params.plink_action)
2076 err = -EINVAL; 2048 err = -EINVAL;
2077 break; 2049 break;
2050 case NL80211_IFTYPE_P2P_CLIENT:
2078 case NL80211_IFTYPE_STATION: 2051 case NL80211_IFTYPE_STATION:
2079 /* disallow everything but AUTHORIZED flag */ 2052 /* disallow everything but AUTHORIZED flag */
2080 if (params.plink_action) 2053 if (params.plink_action)
@@ -2120,19 +2093,15 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
2120 out: 2093 out:
2121 if (params.vlan) 2094 if (params.vlan)
2122 dev_put(params.vlan); 2095 dev_put(params.vlan);
2123 cfg80211_unlock_rdev(rdev);
2124 dev_put(dev);
2125 out_rtnl:
2126 rtnl_unlock();
2127 2096
2128 return err; 2097 return err;
2129} 2098}
2130 2099
2131static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) 2100static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
2132{ 2101{
2133 struct cfg80211_registered_device *rdev; 2102 struct cfg80211_registered_device *rdev = info->user_ptr[0];
2134 int err; 2103 int err;
2135 struct net_device *dev; 2104 struct net_device *dev = info->user_ptr[1];
2136 struct station_parameters params; 2105 struct station_parameters params;
2137 u8 *mac_addr = NULL; 2106 u8 *mac_addr = NULL;
2138 2107
@@ -2169,17 +2138,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
2169 if (parse_station_flags(info, &params)) 2138 if (parse_station_flags(info, &params))
2170 return -EINVAL; 2139 return -EINVAL;
2171 2140
2172 rtnl_lock();
2173
2174 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
2175 if (err)
2176 goto out_rtnl;
2177
2178 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && 2141 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
2179 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN) { 2142 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN &&
2180 err = -EINVAL; 2143 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
2181 goto out; 2144 return -EINVAL;
2182 }
2183 2145
2184 err = get_vlan(info, rdev, &params.vlan); 2146 err = get_vlan(info, rdev, &params.vlan);
2185 if (err) 2147 if (err)
@@ -2193,61 +2155,33 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
2193 goto out; 2155 goto out;
2194 } 2156 }
2195 2157
2196 if (!netif_running(dev)) {
2197 err = -ENETDOWN;
2198 goto out;
2199 }
2200
2201 err = rdev->ops->add_station(&rdev->wiphy, dev, mac_addr, &params); 2158 err = rdev->ops->add_station(&rdev->wiphy, dev, mac_addr, &params);
2202 2159
2203 out: 2160 out:
2204 if (params.vlan) 2161 if (params.vlan)
2205 dev_put(params.vlan); 2162 dev_put(params.vlan);
2206 cfg80211_unlock_rdev(rdev);
2207 dev_put(dev);
2208 out_rtnl:
2209 rtnl_unlock();
2210
2211 return err; 2163 return err;
2212} 2164}
2213 2165
2214static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info) 2166static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info)
2215{ 2167{
2216 struct cfg80211_registered_device *rdev; 2168 struct cfg80211_registered_device *rdev = info->user_ptr[0];
2217 int err; 2169 struct net_device *dev = info->user_ptr[1];
2218 struct net_device *dev;
2219 u8 *mac_addr = NULL; 2170 u8 *mac_addr = NULL;
2220 2171
2221 if (info->attrs[NL80211_ATTR_MAC]) 2172 if (info->attrs[NL80211_ATTR_MAC])
2222 mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); 2173 mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
2223 2174
2224 rtnl_lock();
2225
2226 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
2227 if (err)
2228 goto out_rtnl;
2229
2230 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && 2175 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
2231 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN && 2176 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN &&
2232 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) { 2177 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT &&
2233 err = -EINVAL; 2178 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
2234 goto out; 2179 return -EINVAL;
2235 }
2236
2237 if (!rdev->ops->del_station) {
2238 err = -EOPNOTSUPP;
2239 goto out;
2240 }
2241
2242 err = rdev->ops->del_station(&rdev->wiphy, dev, mac_addr);
2243 2180
2244 out: 2181 if (!rdev->ops->del_station)
2245 cfg80211_unlock_rdev(rdev); 2182 return -EOPNOTSUPP;
2246 dev_put(dev);
2247 out_rtnl:
2248 rtnl_unlock();
2249 2183
2250 return err; 2184 return rdev->ops->del_station(&rdev->wiphy, dev, mac_addr);
2251} 2185}
2252 2186
2253static int nl80211_send_mpath(struct sk_buff *msg, u32 pid, u32 seq, 2187static int nl80211_send_mpath(struct sk_buff *msg, u32 pid, u32 seq,
@@ -2310,28 +2244,12 @@ static int nl80211_dump_mpath(struct sk_buff *skb,
2310 struct net_device *netdev; 2244 struct net_device *netdev;
2311 u8 dst[ETH_ALEN]; 2245 u8 dst[ETH_ALEN];
2312 u8 next_hop[ETH_ALEN]; 2246 u8 next_hop[ETH_ALEN];
2313 int ifidx = cb->args[0];
2314 int path_idx = cb->args[1]; 2247 int path_idx = cb->args[1];
2315 int err; 2248 int err;
2316 2249
2317 if (!ifidx) 2250 err = nl80211_prepare_netdev_dump(skb, cb, &dev, &netdev);
2318 ifidx = nl80211_get_ifidx(cb); 2251 if (err)
2319 if (ifidx < 0) 2252 return err;
2320 return ifidx;
2321
2322 rtnl_lock();
2323
2324 netdev = __dev_get_by_index(sock_net(skb->sk), ifidx);
2325 if (!netdev) {
2326 err = -ENODEV;
2327 goto out_rtnl;
2328 }
2329
2330 dev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx);
2331 if (IS_ERR(dev)) {
2332 err = PTR_ERR(dev);
2333 goto out_rtnl;
2334 }
2335 2253
2336 if (!dev->ops->dump_mpath) { 2254 if (!dev->ops->dump_mpath) {
2337 err = -EOPNOTSUPP; 2255 err = -EOPNOTSUPP;
@@ -2365,18 +2283,15 @@ static int nl80211_dump_mpath(struct sk_buff *skb,
2365 cb->args[1] = path_idx; 2283 cb->args[1] = path_idx;
2366 err = skb->len; 2284 err = skb->len;
2367 out_err: 2285 out_err:
2368 cfg80211_unlock_rdev(dev); 2286 nl80211_finish_netdev_dump(dev);
2369 out_rtnl:
2370 rtnl_unlock();
2371
2372 return err; 2287 return err;
2373} 2288}
2374 2289
2375static int nl80211_get_mpath(struct sk_buff *skb, struct genl_info *info) 2290static int nl80211_get_mpath(struct sk_buff *skb, struct genl_info *info)
2376{ 2291{
2377 struct cfg80211_registered_device *rdev; 2292 struct cfg80211_registered_device *rdev = info->user_ptr[0];
2378 int err; 2293 int err;
2379 struct net_device *dev; 2294 struct net_device *dev = info->user_ptr[1];
2380 struct mpath_info pinfo; 2295 struct mpath_info pinfo;
2381 struct sk_buff *msg; 2296 struct sk_buff *msg;
2382 u8 *dst = NULL; 2297 u8 *dst = NULL;
@@ -2389,53 +2304,33 @@ static int nl80211_get_mpath(struct sk_buff *skb, struct genl_info *info)
2389 2304
2390 dst = nla_data(info->attrs[NL80211_ATTR_MAC]); 2305 dst = nla_data(info->attrs[NL80211_ATTR_MAC]);
2391 2306
2392 rtnl_lock(); 2307 if (!rdev->ops->get_mpath)
2393 2308 return -EOPNOTSUPP;
2394 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
2395 if (err)
2396 goto out_rtnl;
2397
2398 if (!rdev->ops->get_mpath) {
2399 err = -EOPNOTSUPP;
2400 goto out;
2401 }
2402 2309
2403 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) { 2310 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
2404 err = -EOPNOTSUPP; 2311 return -EOPNOTSUPP;
2405 goto out;
2406 }
2407 2312
2408 err = rdev->ops->get_mpath(&rdev->wiphy, dev, dst, next_hop, &pinfo); 2313 err = rdev->ops->get_mpath(&rdev->wiphy, dev, dst, next_hop, &pinfo);
2409 if (err) 2314 if (err)
2410 goto out; 2315 return err;
2411 2316
2412 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 2317 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
2413 if (!msg) 2318 if (!msg)
2414 goto out; 2319 return -ENOMEM;
2415 2320
2416 if (nl80211_send_mpath(msg, info->snd_pid, info->snd_seq, 0, 2321 if (nl80211_send_mpath(msg, info->snd_pid, info->snd_seq, 0,
2417 dev, dst, next_hop, &pinfo) < 0) 2322 dev, dst, next_hop, &pinfo) < 0) {
2418 goto out_free; 2323 nlmsg_free(msg);
2419 2324 return -ENOBUFS;
2420 err = genlmsg_reply(msg, info); 2325 }
2421 goto out;
2422
2423 out_free:
2424 nlmsg_free(msg);
2425 out:
2426 cfg80211_unlock_rdev(rdev);
2427 dev_put(dev);
2428 out_rtnl:
2429 rtnl_unlock();
2430 2326
2431 return err; 2327 return genlmsg_reply(msg, info);
2432} 2328}
2433 2329
2434static int nl80211_set_mpath(struct sk_buff *skb, struct genl_info *info) 2330static int nl80211_set_mpath(struct sk_buff *skb, struct genl_info *info)
2435{ 2331{
2436 struct cfg80211_registered_device *rdev; 2332 struct cfg80211_registered_device *rdev = info->user_ptr[0];
2437 int err; 2333 struct net_device *dev = info->user_ptr[1];
2438 struct net_device *dev;
2439 u8 *dst = NULL; 2334 u8 *dst = NULL;
2440 u8 *next_hop = NULL; 2335 u8 *next_hop = NULL;
2441 2336
@@ -2448,42 +2343,19 @@ static int nl80211_set_mpath(struct sk_buff *skb, struct genl_info *info)
2448 dst = nla_data(info->attrs[NL80211_ATTR_MAC]); 2343 dst = nla_data(info->attrs[NL80211_ATTR_MAC]);
2449 next_hop = nla_data(info->attrs[NL80211_ATTR_MPATH_NEXT_HOP]); 2344 next_hop = nla_data(info->attrs[NL80211_ATTR_MPATH_NEXT_HOP]);
2450 2345
2451 rtnl_lock(); 2346 if (!rdev->ops->change_mpath)
2452 2347 return -EOPNOTSUPP;
2453 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
2454 if (err)
2455 goto out_rtnl;
2456
2457 if (!rdev->ops->change_mpath) {
2458 err = -EOPNOTSUPP;
2459 goto out;
2460 }
2461
2462 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) {
2463 err = -EOPNOTSUPP;
2464 goto out;
2465 }
2466
2467 if (!netif_running(dev)) {
2468 err = -ENETDOWN;
2469 goto out;
2470 }
2471
2472 err = rdev->ops->change_mpath(&rdev->wiphy, dev, dst, next_hop);
2473 2348
2474 out: 2349 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
2475 cfg80211_unlock_rdev(rdev); 2350 return -EOPNOTSUPP;
2476 dev_put(dev);
2477 out_rtnl:
2478 rtnl_unlock();
2479 2351
2480 return err; 2352 return rdev->ops->change_mpath(&rdev->wiphy, dev, dst, next_hop);
2481} 2353}
2354
2482static int nl80211_new_mpath(struct sk_buff *skb, struct genl_info *info) 2355static int nl80211_new_mpath(struct sk_buff *skb, struct genl_info *info)
2483{ 2356{
2484 struct cfg80211_registered_device *rdev; 2357 struct cfg80211_registered_device *rdev = info->user_ptr[0];
2485 int err; 2358 struct net_device *dev = info->user_ptr[1];
2486 struct net_device *dev;
2487 u8 *dst = NULL; 2359 u8 *dst = NULL;
2488 u8 *next_hop = NULL; 2360 u8 *next_hop = NULL;
2489 2361
@@ -2496,75 +2368,34 @@ static int nl80211_new_mpath(struct sk_buff *skb, struct genl_info *info)
2496 dst = nla_data(info->attrs[NL80211_ATTR_MAC]); 2368 dst = nla_data(info->attrs[NL80211_ATTR_MAC]);
2497 next_hop = nla_data(info->attrs[NL80211_ATTR_MPATH_NEXT_HOP]); 2369 next_hop = nla_data(info->attrs[NL80211_ATTR_MPATH_NEXT_HOP]);
2498 2370
2499 rtnl_lock(); 2371 if (!rdev->ops->add_mpath)
2500 2372 return -EOPNOTSUPP;
2501 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
2502 if (err)
2503 goto out_rtnl;
2504
2505 if (!rdev->ops->add_mpath) {
2506 err = -EOPNOTSUPP;
2507 goto out;
2508 }
2509
2510 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) {
2511 err = -EOPNOTSUPP;
2512 goto out;
2513 }
2514
2515 if (!netif_running(dev)) {
2516 err = -ENETDOWN;
2517 goto out;
2518 }
2519
2520 err = rdev->ops->add_mpath(&rdev->wiphy, dev, dst, next_hop);
2521 2373
2522 out: 2374 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
2523 cfg80211_unlock_rdev(rdev); 2375 return -EOPNOTSUPP;
2524 dev_put(dev);
2525 out_rtnl:
2526 rtnl_unlock();
2527 2376
2528 return err; 2377 return rdev->ops->add_mpath(&rdev->wiphy, dev, dst, next_hop);
2529} 2378}
2530 2379
2531static int nl80211_del_mpath(struct sk_buff *skb, struct genl_info *info) 2380static int nl80211_del_mpath(struct sk_buff *skb, struct genl_info *info)
2532{ 2381{
2533 struct cfg80211_registered_device *rdev; 2382 struct cfg80211_registered_device *rdev = info->user_ptr[0];
2534 int err; 2383 struct net_device *dev = info->user_ptr[1];
2535 struct net_device *dev;
2536 u8 *dst = NULL; 2384 u8 *dst = NULL;
2537 2385
2538 if (info->attrs[NL80211_ATTR_MAC]) 2386 if (info->attrs[NL80211_ATTR_MAC])
2539 dst = nla_data(info->attrs[NL80211_ATTR_MAC]); 2387 dst = nla_data(info->attrs[NL80211_ATTR_MAC]);
2540 2388
2541 rtnl_lock(); 2389 if (!rdev->ops->del_mpath)
2542 2390 return -EOPNOTSUPP;
2543 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
2544 if (err)
2545 goto out_rtnl;
2546
2547 if (!rdev->ops->del_mpath) {
2548 err = -EOPNOTSUPP;
2549 goto out;
2550 }
2551
2552 err = rdev->ops->del_mpath(&rdev->wiphy, dev, dst);
2553
2554 out:
2555 cfg80211_unlock_rdev(rdev);
2556 dev_put(dev);
2557 out_rtnl:
2558 rtnl_unlock();
2559 2391
2560 return err; 2392 return rdev->ops->del_mpath(&rdev->wiphy, dev, dst);
2561} 2393}
2562 2394
2563static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) 2395static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
2564{ 2396{
2565 struct cfg80211_registered_device *rdev; 2397 struct cfg80211_registered_device *rdev = info->user_ptr[0];
2566 int err; 2398 struct net_device *dev = info->user_ptr[1];
2567 struct net_device *dev;
2568 struct bss_parameters params; 2399 struct bss_parameters params;
2569 2400
2570 memset(&params, 0, sizeof(params)); 2401 memset(&params, 0, sizeof(params));
@@ -2592,31 +2423,14 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
2592 if (info->attrs[NL80211_ATTR_AP_ISOLATE]) 2423 if (info->attrs[NL80211_ATTR_AP_ISOLATE])
2593 params.ap_isolate = !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]); 2424 params.ap_isolate = !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]);
2594 2425
2595 rtnl_lock(); 2426 if (!rdev->ops->change_bss)
2596 2427 return -EOPNOTSUPP;
2597 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
2598 if (err)
2599 goto out_rtnl;
2600
2601 if (!rdev->ops->change_bss) {
2602 err = -EOPNOTSUPP;
2603 goto out;
2604 }
2605
2606 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP) {
2607 err = -EOPNOTSUPP;
2608 goto out;
2609 }
2610
2611 err = rdev->ops->change_bss(&rdev->wiphy, dev, &params);
2612 2428
2613 out: 2429 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
2614 cfg80211_unlock_rdev(rdev); 2430 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
2615 dev_put(dev); 2431 return -EOPNOTSUPP;
2616 out_rtnl:
2617 rtnl_unlock();
2618 2432
2619 return err; 2433 return rdev->ops->change_bss(&rdev->wiphy, dev, &params);
2620} 2434}
2621 2435
2622static const struct nla_policy reg_rule_policy[NL80211_REG_RULE_ATTR_MAX + 1] = { 2436static const struct nla_policy reg_rule_policy[NL80211_REG_RULE_ATTR_MAX + 1] = {
@@ -2695,37 +2509,26 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info)
2695static int nl80211_get_mesh_params(struct sk_buff *skb, 2509static int nl80211_get_mesh_params(struct sk_buff *skb,
2696 struct genl_info *info) 2510 struct genl_info *info)
2697{ 2511{
2698 struct cfg80211_registered_device *rdev; 2512 struct cfg80211_registered_device *rdev = info->user_ptr[0];
2699 struct mesh_config cur_params; 2513 struct mesh_config cur_params;
2700 int err; 2514 int err;
2701 struct net_device *dev; 2515 struct net_device *dev = info->user_ptr[1];
2702 void *hdr; 2516 void *hdr;
2703 struct nlattr *pinfoattr; 2517 struct nlattr *pinfoattr;
2704 struct sk_buff *msg; 2518 struct sk_buff *msg;
2705 2519
2706 rtnl_lock(); 2520 if (!rdev->ops->get_mesh_params)
2707 2521 return -EOPNOTSUPP;
2708 /* Look up our device */
2709 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
2710 if (err)
2711 goto out_rtnl;
2712
2713 if (!rdev->ops->get_mesh_params) {
2714 err = -EOPNOTSUPP;
2715 goto out;
2716 }
2717 2522
2718 /* Get the mesh params */ 2523 /* Get the mesh params */
2719 err = rdev->ops->get_mesh_params(&rdev->wiphy, dev, &cur_params); 2524 err = rdev->ops->get_mesh_params(&rdev->wiphy, dev, &cur_params);
2720 if (err) 2525 if (err)
2721 goto out; 2526 return err;
2722 2527
2723 /* Draw up a netlink message to send back */ 2528 /* Draw up a netlink message to send back */
2724 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 2529 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
2725 if (!msg) { 2530 if (!msg)
2726 err = -ENOBUFS; 2531 return -ENOMEM;
2727 goto out;
2728 }
2729 hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, 2532 hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
2730 NL80211_CMD_GET_MESH_PARAMS); 2533 NL80211_CMD_GET_MESH_PARAMS);
2731 if (!hdr) 2534 if (!hdr)
@@ -2764,21 +2567,12 @@ static int nl80211_get_mesh_params(struct sk_buff *skb,
2764 cur_params.dot11MeshHWMPRootMode); 2567 cur_params.dot11MeshHWMPRootMode);
2765 nla_nest_end(msg, pinfoattr); 2568 nla_nest_end(msg, pinfoattr);
2766 genlmsg_end(msg, hdr); 2569 genlmsg_end(msg, hdr);
2767 err = genlmsg_reply(msg, info); 2570 return genlmsg_reply(msg, info);
2768 goto out;
2769 2571
2770 nla_put_failure: 2572 nla_put_failure:
2771 genlmsg_cancel(msg, hdr); 2573 genlmsg_cancel(msg, hdr);
2772 nlmsg_free(msg); 2574 nlmsg_free(msg);
2773 err = -EMSGSIZE; 2575 return -ENOBUFS;
2774 out:
2775 /* Cleanup */
2776 cfg80211_unlock_rdev(rdev);
2777 dev_put(dev);
2778 out_rtnl:
2779 rtnl_unlock();
2780
2781 return err;
2782} 2576}
2783 2577
2784#define FILL_IN_MESH_PARAM_IF_SET(table, cfg, param, mask, attr_num, nla_fn) \ 2578#define FILL_IN_MESH_PARAM_IF_SET(table, cfg, param, mask, attr_num, nla_fn) \
@@ -2808,10 +2602,9 @@ static const struct nla_policy nl80211_meshconf_params_policy[NL80211_MESHCONF_A
2808 2602
2809static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info) 2603static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info)
2810{ 2604{
2811 int err;
2812 u32 mask; 2605 u32 mask;
2813 struct cfg80211_registered_device *rdev; 2606 struct cfg80211_registered_device *rdev = info->user_ptr[0];
2814 struct net_device *dev; 2607 struct net_device *dev = info->user_ptr[1];
2815 struct mesh_config cfg; 2608 struct mesh_config cfg;
2816 struct nlattr *tb[NL80211_MESHCONF_ATTR_MAX + 1]; 2609 struct nlattr *tb[NL80211_MESHCONF_ATTR_MAX + 1];
2817 struct nlattr *parent_attr; 2610 struct nlattr *parent_attr;
@@ -2823,16 +2616,8 @@ static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info)
2823 parent_attr, nl80211_meshconf_params_policy)) 2616 parent_attr, nl80211_meshconf_params_policy))
2824 return -EINVAL; 2617 return -EINVAL;
2825 2618
2826 rtnl_lock(); 2619 if (!rdev->ops->set_mesh_params)
2827 2620 return -EOPNOTSUPP;
2828 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
2829 if (err)
2830 goto out_rtnl;
2831
2832 if (!rdev->ops->set_mesh_params) {
2833 err = -EOPNOTSUPP;
2834 goto out;
2835 }
2836 2621
2837 /* This makes sure that there aren't more than 32 mesh config 2622 /* This makes sure that there aren't more than 32 mesh config
2838 * parameters (otherwise our bitfield scheme would not work.) */ 2623 * parameters (otherwise our bitfield scheme would not work.) */
@@ -2878,16 +2663,7 @@ static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info)
2878 nla_get_u8); 2663 nla_get_u8);
2879 2664
2880 /* Apply changes */ 2665 /* Apply changes */
2881 err = rdev->ops->set_mesh_params(&rdev->wiphy, dev, &cfg, mask); 2666 return rdev->ops->set_mesh_params(&rdev->wiphy, dev, &cfg, mask);
2882
2883 out:
2884 /* cleanup */
2885 cfg80211_unlock_rdev(rdev);
2886 dev_put(dev);
2887 out_rtnl:
2888 rtnl_unlock();
2889
2890 return err;
2891} 2667}
2892 2668
2893#undef FILL_IN_MESH_PARAM_IF_SET 2669#undef FILL_IN_MESH_PARAM_IF_SET
@@ -3070,8 +2846,8 @@ static int validate_scan_freqs(struct nlattr *freqs)
3070 2846
3071static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) 2847static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
3072{ 2848{
3073 struct cfg80211_registered_device *rdev; 2849 struct cfg80211_registered_device *rdev = info->user_ptr[0];
3074 struct net_device *dev; 2850 struct net_device *dev = info->user_ptr[1];
3075 struct cfg80211_scan_request *request; 2851 struct cfg80211_scan_request *request;
3076 struct cfg80211_ssid *ssid; 2852 struct cfg80211_ssid *ssid;
3077 struct ieee80211_channel *channel; 2853 struct ieee80211_channel *channel;
@@ -3084,36 +2860,19 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
3084 if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) 2860 if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
3085 return -EINVAL; 2861 return -EINVAL;
3086 2862
3087 rtnl_lock();
3088
3089 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
3090 if (err)
3091 goto out_rtnl;
3092
3093 wiphy = &rdev->wiphy; 2863 wiphy = &rdev->wiphy;
3094 2864
3095 if (!rdev->ops->scan) { 2865 if (!rdev->ops->scan)
3096 err = -EOPNOTSUPP; 2866 return -EOPNOTSUPP;
3097 goto out;
3098 }
3099
3100 if (!netif_running(dev)) {
3101 err = -ENETDOWN;
3102 goto out;
3103 }
3104 2867
3105 if (rdev->scan_req) { 2868 if (rdev->scan_req)
3106 err = -EBUSY; 2869 return -EBUSY;
3107 goto out;
3108 }
3109 2870
3110 if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { 2871 if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) {
3111 n_channels = validate_scan_freqs( 2872 n_channels = validate_scan_freqs(
3112 info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]); 2873 info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]);
3113 if (!n_channels) { 2874 if (!n_channels)
3114 err = -EINVAL; 2875 return -EINVAL;
3115 goto out;
3116 }
3117 } else { 2876 } else {
3118 n_channels = 0; 2877 n_channels = 0;
3119 2878
@@ -3126,29 +2885,23 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
3126 nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp) 2885 nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp)
3127 n_ssids++; 2886 n_ssids++;
3128 2887
3129 if (n_ssids > wiphy->max_scan_ssids) { 2888 if (n_ssids > wiphy->max_scan_ssids)
3130 err = -EINVAL; 2889 return -EINVAL;
3131 goto out;
3132 }
3133 2890
3134 if (info->attrs[NL80211_ATTR_IE]) 2891 if (info->attrs[NL80211_ATTR_IE])
3135 ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); 2892 ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
3136 else 2893 else
3137 ie_len = 0; 2894 ie_len = 0;
3138 2895
3139 if (ie_len > wiphy->max_scan_ie_len) { 2896 if (ie_len > wiphy->max_scan_ie_len)
3140 err = -EINVAL; 2897 return -EINVAL;
3141 goto out;
3142 }
3143 2898
3144 request = kzalloc(sizeof(*request) 2899 request = kzalloc(sizeof(*request)
3145 + sizeof(*ssid) * n_ssids 2900 + sizeof(*ssid) * n_ssids
3146 + sizeof(channel) * n_channels 2901 + sizeof(channel) * n_channels
3147 + ie_len, GFP_KERNEL); 2902 + ie_len, GFP_KERNEL);
3148 if (!request) { 2903 if (!request)
3149 err = -ENOMEM; 2904 return -ENOMEM;
3150 goto out;
3151 }
3152 2905
3153 if (n_ssids) 2906 if (n_ssids)
3154 request->ssids = (void *)&request->channels[n_channels]; 2907 request->ssids = (void *)&request->channels[n_channels];
@@ -3236,18 +2989,11 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
3236 if (!err) { 2989 if (!err) {
3237 nl80211_send_scan_start(rdev, dev); 2990 nl80211_send_scan_start(rdev, dev);
3238 dev_hold(dev); 2991 dev_hold(dev);
3239 } 2992 } else {
3240
3241 out_free: 2993 out_free:
3242 if (err) {
3243 rdev->scan_req = NULL; 2994 rdev->scan_req = NULL;
3244 kfree(request); 2995 kfree(request);
3245 } 2996 }
3246 out:
3247 cfg80211_unlock_rdev(rdev);
3248 dev_put(dev);
3249 out_rtnl:
3250 rtnl_unlock();
3251 2997
3252 return err; 2998 return err;
3253} 2999}
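nl80211_trigger_scan above sizes one kzalloc to hold the request structure plus the channel pointer array, the SSID array, and the trailing IE bytes, then points request->ssids and request->ie into the tail of that same block. A user-space sketch of the same single-allocation layout; the structure names are invented for the example:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct ssid_sketch { unsigned char ssid[32]; size_t len; };

struct scan_req_sketch {
	struct ssid_sketch *ssids;	/* points into the same allocation */
	unsigned char *ie;		/* likewise */
	int n_ssids;
	size_t ie_len;
	/* the kernel version keeps its channel array in the tail too */
	void *channels[];
};

static struct scan_req_sketch *alloc_scan_req(int n_channels, int n_ssids,
					      size_t ie_len)
{
	/* One allocation: header + channels[] + ssids[] + ie bytes. */
	size_t size = sizeof(struct scan_req_sketch)
		    + sizeof(void *) * n_channels
		    + sizeof(struct ssid_sketch) * n_ssids
		    + ie_len;
	struct scan_req_sketch *req = calloc(1, size);

	if (!req)
		return NULL;

	/* Carve the trailing regions out of the same block. */
	req->ssids = (struct ssid_sketch *)&req->channels[n_channels];
	req->ie = (unsigned char *)(req->ssids + n_ssids);
	req->n_ssids = n_ssids;
	req->ie_len = ie_len;
	return req;
}

int main(void)
{
	struct scan_req_sketch *req = alloc_scan_req(2, 1, 8);

	if (!req)
		return 1;
	memcpy(req->ssids[0].ssid, "example", 7);
	req->ssids[0].len = 7;
	memset(req->ie, 0xdd, req->ie_len);
	printf("ssid[0]=%.*s ie_len=%zu\n",
	       (int)req->ssids[0].len, req->ssids[0].ssid, req->ie_len);
	free(req);	/* everything came from one allocation */
	return 0;
}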
@@ -3306,6 +3052,7 @@ static int nl80211_send_bss(struct sk_buff *msg, u32 pid, u32 seq, int flags,
3306 } 3052 }
3307 3053
3308 switch (wdev->iftype) { 3054 switch (wdev->iftype) {
3055 case NL80211_IFTYPE_P2P_CLIENT:
3309 case NL80211_IFTYPE_STATION: 3056 case NL80211_IFTYPE_STATION:
3310 if (intbss == wdev->current_bss) 3057 if (intbss == wdev->current_bss)
3311 NLA_PUT_U32(msg, NL80211_BSS_STATUS, 3058 NLA_PUT_U32(msg, NL80211_BSS_STATUS,
@@ -3343,25 +3090,12 @@ static int nl80211_dump_scan(struct sk_buff *skb,
3343 struct net_device *dev; 3090 struct net_device *dev;
3344 struct cfg80211_internal_bss *scan; 3091 struct cfg80211_internal_bss *scan;
3345 struct wireless_dev *wdev; 3092 struct wireless_dev *wdev;
3346 int ifidx = cb->args[0];
3347 int start = cb->args[1], idx = 0; 3093 int start = cb->args[1], idx = 0;
3348 int err; 3094 int err;
3349 3095
3350 if (!ifidx) 3096 err = nl80211_prepare_netdev_dump(skb, cb, &rdev, &dev);
3351 ifidx = nl80211_get_ifidx(cb); 3097 if (err)
3352 if (ifidx < 0) 3098 return err;
3353 return ifidx;
3354 cb->args[0] = ifidx;
3355
3356 dev = dev_get_by_index(sock_net(skb->sk), ifidx);
3357 if (!dev)
3358 return -ENODEV;
3359
3360 rdev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx);
3361 if (IS_ERR(rdev)) {
3362 err = PTR_ERR(rdev);
3363 goto out_put_netdev;
3364 }
3365 3099
3366 wdev = dev->ieee80211_ptr; 3100 wdev = dev->ieee80211_ptr;
3367 3101
@@ -3377,21 +3111,17 @@ static int nl80211_dump_scan(struct sk_buff *skb,
3377 cb->nlh->nlmsg_seq, NLM_F_MULTI, 3111 cb->nlh->nlmsg_seq, NLM_F_MULTI,
3378 rdev, wdev, scan) < 0) { 3112 rdev, wdev, scan) < 0) {
3379 idx--; 3113 idx--;
3380 goto out; 3114 break;
3381 } 3115 }
3382 } 3116 }
3383 3117
3384 out:
3385 spin_unlock_bh(&rdev->bss_lock); 3118 spin_unlock_bh(&rdev->bss_lock);
3386 wdev_unlock(wdev); 3119 wdev_unlock(wdev);
3387 3120
3388 cb->args[1] = idx; 3121 cb->args[1] = idx;
3389 err = skb->len; 3122 nl80211_finish_netdev_dump(rdev);
3390 cfg80211_unlock_rdev(rdev);
3391 out_put_netdev:
3392 dev_put(dev);
3393 3123
3394 return err; 3124 return skb->len;
3395} 3125}
3396 3126
3397static int nl80211_send_survey(struct sk_buff *msg, u32 pid, u32 seq, 3127static int nl80211_send_survey(struct sk_buff *msg, u32 pid, u32 seq,
@@ -3421,6 +3151,23 @@ static int nl80211_send_survey(struct sk_buff *msg, u32 pid, u32 seq,
3421 if (survey->filled & SURVEY_INFO_NOISE_DBM) 3151 if (survey->filled & SURVEY_INFO_NOISE_DBM)
3422 NLA_PUT_U8(msg, NL80211_SURVEY_INFO_NOISE, 3152 NLA_PUT_U8(msg, NL80211_SURVEY_INFO_NOISE,
3423 survey->noise); 3153 survey->noise);
3154 if (survey->filled & SURVEY_INFO_IN_USE)
3155 NLA_PUT_FLAG(msg, NL80211_SURVEY_INFO_IN_USE);
3156 if (survey->filled & SURVEY_INFO_CHANNEL_TIME)
3157 NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME,
3158 survey->channel_time);
3159 if (survey->filled & SURVEY_INFO_CHANNEL_TIME_BUSY)
3160 NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_BUSY,
3161 survey->channel_time_busy);
3162 if (survey->filled & SURVEY_INFO_CHANNEL_TIME_EXT_BUSY)
3163 NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_EXT_BUSY,
3164 survey->channel_time_ext_busy);
3165 if (survey->filled & SURVEY_INFO_CHANNEL_TIME_RX)
3166 NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_RX,
3167 survey->channel_time_rx);
3168 if (survey->filled & SURVEY_INFO_CHANNEL_TIME_TX)
3169 NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_TX,
3170 survey->channel_time_tx);
3424 3171
3425 nla_nest_end(msg, infoattr); 3172 nla_nest_end(msg, infoattr);
3426 3173
@@ -3437,29 +3184,12 @@ static int nl80211_dump_survey(struct sk_buff *skb,
3437 struct survey_info survey; 3184 struct survey_info survey;
3438 struct cfg80211_registered_device *dev; 3185 struct cfg80211_registered_device *dev;
3439 struct net_device *netdev; 3186 struct net_device *netdev;
3440 int ifidx = cb->args[0];
3441 int survey_idx = cb->args[1]; 3187 int survey_idx = cb->args[1];
3442 int res; 3188 int res;
3443 3189
3444 if (!ifidx) 3190 res = nl80211_prepare_netdev_dump(skb, cb, &dev, &netdev);
3445 ifidx = nl80211_get_ifidx(cb); 3191 if (res)
3446 if (ifidx < 0) 3192 return res;
3447 return ifidx;
3448 cb->args[0] = ifidx;
3449
3450 rtnl_lock();
3451
3452 netdev = __dev_get_by_index(sock_net(skb->sk), ifidx);
3453 if (!netdev) {
3454 res = -ENODEV;
3455 goto out_rtnl;
3456 }
3457
3458 dev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx);
3459 if (IS_ERR(dev)) {
3460 res = PTR_ERR(dev);
3461 goto out_rtnl;
3462 }
3463 3193
3464 if (!dev->ops->dump_survey) { 3194 if (!dev->ops->dump_survey) {
3465 res = -EOPNOTSUPP; 3195 res = -EOPNOTSUPP;
@@ -3487,10 +3217,7 @@ static int nl80211_dump_survey(struct sk_buff *skb,
3487 cb->args[1] = survey_idx; 3217 cb->args[1] = survey_idx;
3488 res = skb->len; 3218 res = skb->len;
3489 out_err: 3219 out_err:
3490 cfg80211_unlock_rdev(dev); 3220 nl80211_finish_netdev_dump(dev);
3491 out_rtnl:
3492 rtnl_unlock();
3493
3494 return res; 3221 return res;
3495} 3222}
3496 3223
@@ -3523,8 +3250,8 @@ static bool nl80211_valid_cipher_suite(u32 cipher)
3523 3250
3524static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) 3251static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
3525{ 3252{
3526 struct cfg80211_registered_device *rdev; 3253 struct cfg80211_registered_device *rdev = info->user_ptr[0];
3527 struct net_device *dev; 3254 struct net_device *dev = info->user_ptr[1];
3528 struct ieee80211_channel *chan; 3255 struct ieee80211_channel *chan;
3529 const u8 *bssid, *ssid, *ie = NULL; 3256 const u8 *bssid, *ssid, *ie = NULL;
3530 int err, ssid_len, ie_len = 0; 3257 int err, ssid_len, ie_len = 0;
@@ -3552,6 +3279,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
3552 return err; 3279 return err;
3553 3280
3554 if (key.idx >= 0) { 3281 if (key.idx >= 0) {
3282 if (key.type != -1 && key.type != NL80211_KEYTYPE_GROUP)
3283 return -EINVAL;
3555 if (!key.p.key || !key.p.key_len) 3284 if (!key.p.key || !key.p.key_len)
3556 return -EINVAL; 3285 return -EINVAL;
3557 if ((key.p.cipher != WLAN_CIPHER_SUITE_WEP40 || 3286 if ((key.p.cipher != WLAN_CIPHER_SUITE_WEP40 ||
@@ -3566,34 +3295,31 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
3566 key.p.key = NULL; 3295 key.p.key = NULL;
3567 } 3296 }
3568 3297
3569 rtnl_lock(); 3298 if (key.idx >= 0) {
3570 3299 int i;
3571 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); 3300 bool ok = false;
3572 if (err) 3301 for (i = 0; i < rdev->wiphy.n_cipher_suites; i++) {
3573 goto unlock_rtnl; 3302 if (key.p.cipher == rdev->wiphy.cipher_suites[i]) {
3574 3303 ok = true;
3575 if (!rdev->ops->auth) { 3304 break;
3576 err = -EOPNOTSUPP; 3305 }
3577 goto out; 3306 }
3307 if (!ok)
3308 return -EINVAL;
3578 } 3309 }
3579 3310
3580 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) { 3311 if (!rdev->ops->auth)
3581 err = -EOPNOTSUPP; 3312 return -EOPNOTSUPP;
3582 goto out;
3583 }
3584 3313
3585 if (!netif_running(dev)) { 3314 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
3586 err = -ENETDOWN; 3315 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
3587 goto out; 3316 return -EOPNOTSUPP;
3588 }
3589 3317
3590 bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); 3318 bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
3591 chan = ieee80211_get_channel(&rdev->wiphy, 3319 chan = ieee80211_get_channel(&rdev->wiphy,
3592 nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); 3320 nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]));
3593 if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED)) { 3321 if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED))
3594 err = -EINVAL; 3322 return -EINVAL;
3595 goto out;
3596 }
3597 3323
3598 ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); 3324 ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
3599 ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); 3325 ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);
@@ -3604,27 +3330,19 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
3604 } 3330 }
3605 3331
3606 auth_type = nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]); 3332 auth_type = nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]);
3607 if (!nl80211_valid_auth_type(auth_type)) { 3333 if (!nl80211_valid_auth_type(auth_type))
3608 err = -EINVAL; 3334 return -EINVAL;
3609 goto out;
3610 }
3611 3335
3612 local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE]; 3336 local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE];
3613 3337
3614 err = cfg80211_mlme_auth(rdev, dev, chan, auth_type, bssid, 3338 return cfg80211_mlme_auth(rdev, dev, chan, auth_type, bssid,
3615 ssid, ssid_len, ie, ie_len, 3339 ssid, ssid_len, ie, ie_len,
3616 key.p.key, key.p.key_len, key.idx, 3340 key.p.key, key.p.key_len, key.idx,
3617 local_state_change); 3341 local_state_change);
3618
3619out:
3620 cfg80211_unlock_rdev(rdev);
3621 dev_put(dev);
3622unlock_rtnl:
3623 rtnl_unlock();
3624 return err;
3625} 3342}
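The new check in nl80211_authenticate above rejects a WEP key whose cipher is not in the wiphy's advertised cipher_suites array, using a plain linear scan. A stand-alone version of that membership test; the suite selector values are the standard 00-0F-AC ones, written out numerically:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/* Standard cipher suite selectors (00-0F-AC OUI + type). */
#define CIPHER_WEP40  0x000FAC01u
#define CIPHER_TKIP   0x000FAC02u
#define CIPHER_CCMP   0x000FAC04u
#define CIPHER_WEP104 0x000FAC05u

/* Linear scan over the suites the hardware advertises, as in the diff. */
static bool cipher_supported(uint32_t cipher,
			     const uint32_t *suites, int n_suites)
{
	int i;

	for (i = 0; i < n_suites; i++)
		if (cipher == suites[i])
			return true;
	return false;
}

int main(void)
{
	/* Suppose the wiphy only advertises TKIP and CCMP. */
	const uint32_t suites[] = { CIPHER_TKIP, CIPHER_CCMP };
	int n = sizeof(suites) / sizeof(suites[0]);

	printf("WEP40 ok: %d\n", cipher_supported(CIPHER_WEP40, suites, n));
	printf("CCMP  ok: %d\n", cipher_supported(CIPHER_CCMP, suites, n));
	return 0;
}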
3626 3343
3627static int nl80211_crypto_settings(struct genl_info *info, 3344static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
3345 struct genl_info *info,
3628 struct cfg80211_crypto_settings *settings, 3346 struct cfg80211_crypto_settings *settings,
3629 int cipher_limit) 3347 int cipher_limit)
3630{ 3348{
@@ -3632,6 +3350,19 @@ static int nl80211_crypto_settings(struct genl_info *info,
3632 3350
3633 settings->control_port = info->attrs[NL80211_ATTR_CONTROL_PORT]; 3351 settings->control_port = info->attrs[NL80211_ATTR_CONTROL_PORT];
3634 3352
3353 if (info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]) {
3354 u16 proto;
3355 proto = nla_get_u16(
3356 info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]);
3357 settings->control_port_ethertype = cpu_to_be16(proto);
3358 if (!(rdev->wiphy.flags & WIPHY_FLAG_CONTROL_PORT_PROTOCOL) &&
3359 proto != ETH_P_PAE)
3360 return -EINVAL;
3361 if (info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT])
3362 settings->control_port_no_encrypt = true;
3363 } else
3364 settings->control_port_ethertype = cpu_to_be16(ETH_P_PAE);
3365
3635 if (info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]) { 3366 if (info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]) {
3636 void *data; 3367 void *data;
3637 int len, i; 3368 int len, i;
@@ -3691,8 +3422,8 @@ static int nl80211_crypto_settings(struct genl_info *info,
3691 3422
3692static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) 3423static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
3693{ 3424{
3694 struct cfg80211_registered_device *rdev; 3425 struct cfg80211_registered_device *rdev = info->user_ptr[0];
3695 struct net_device *dev; 3426 struct net_device *dev = info->user_ptr[1];
3696 struct cfg80211_crypto_settings crypto; 3427 struct cfg80211_crypto_settings crypto;
3697 struct ieee80211_channel *chan; 3428 struct ieee80211_channel *chan;
3698 const u8 *bssid, *ssid, *ie = NULL, *prev_bssid = NULL; 3429 const u8 *bssid, *ssid, *ie = NULL, *prev_bssid = NULL;
@@ -3707,35 +3438,19 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
3707 !info->attrs[NL80211_ATTR_WIPHY_FREQ]) 3438 !info->attrs[NL80211_ATTR_WIPHY_FREQ])
3708 return -EINVAL; 3439 return -EINVAL;
3709 3440
3710 rtnl_lock(); 3441 if (!rdev->ops->assoc)
3711 3442 return -EOPNOTSUPP;
3712 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
3713 if (err)
3714 goto unlock_rtnl;
3715
3716 if (!rdev->ops->assoc) {
3717 err = -EOPNOTSUPP;
3718 goto out;
3719 }
3720
3721 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
3722 err = -EOPNOTSUPP;
3723 goto out;
3724 }
3725 3443
3726 if (!netif_running(dev)) { 3444 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
3727 err = -ENETDOWN; 3445 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
3728 goto out; 3446 return -EOPNOTSUPP;
3729 }
3730 3447
3731 bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); 3448 bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
3732 3449
3733 chan = ieee80211_get_channel(&rdev->wiphy, 3450 chan = ieee80211_get_channel(&rdev->wiphy,
3734 nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); 3451 nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]));
3735 if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED)) { 3452 if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED))
3736 err = -EINVAL; 3453 return -EINVAL;
3737 goto out;
3738 }
3739 3454
3740 ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); 3455 ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
3741 ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); 3456 ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);
@@ -3750,35 +3465,28 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
3750 nla_get_u32(info->attrs[NL80211_ATTR_USE_MFP]); 3465 nla_get_u32(info->attrs[NL80211_ATTR_USE_MFP]);
3751 if (mfp == NL80211_MFP_REQUIRED) 3466 if (mfp == NL80211_MFP_REQUIRED)
3752 use_mfp = true; 3467 use_mfp = true;
3753 else if (mfp != NL80211_MFP_NO) { 3468 else if (mfp != NL80211_MFP_NO)
3754 err = -EINVAL; 3469 return -EINVAL;
3755 goto out;
3756 }
3757 } 3470 }
3758 3471
3759 if (info->attrs[NL80211_ATTR_PREV_BSSID]) 3472 if (info->attrs[NL80211_ATTR_PREV_BSSID])
3760 prev_bssid = nla_data(info->attrs[NL80211_ATTR_PREV_BSSID]); 3473 prev_bssid = nla_data(info->attrs[NL80211_ATTR_PREV_BSSID]);
3761 3474
3762 err = nl80211_crypto_settings(info, &crypto, 1); 3475 err = nl80211_crypto_settings(rdev, info, &crypto, 1);
3763 if (!err) 3476 if (!err)
3764 err = cfg80211_mlme_assoc(rdev, dev, chan, bssid, prev_bssid, 3477 err = cfg80211_mlme_assoc(rdev, dev, chan, bssid, prev_bssid,
3765 ssid, ssid_len, ie, ie_len, use_mfp, 3478 ssid, ssid_len, ie, ie_len, use_mfp,
3766 &crypto); 3479 &crypto);
3767 3480
3768out:
3769 cfg80211_unlock_rdev(rdev);
3770 dev_put(dev);
3771unlock_rtnl:
3772 rtnl_unlock();
3773 return err; 3481 return err;
3774} 3482}
3775 3483
3776static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info) 3484static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info)
3777{ 3485{
3778 struct cfg80211_registered_device *rdev; 3486 struct cfg80211_registered_device *rdev = info->user_ptr[0];
3779 struct net_device *dev; 3487 struct net_device *dev = info->user_ptr[1];
3780 const u8 *ie = NULL, *bssid; 3488 const u8 *ie = NULL, *bssid;
3781 int err, ie_len = 0; 3489 int ie_len = 0;
3782 u16 reason_code; 3490 u16 reason_code;
3783 bool local_state_change; 3491 bool local_state_change;
3784 3492
@@ -3791,34 +3499,19 @@ static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info)
3791 if (!info->attrs[NL80211_ATTR_REASON_CODE]) 3499 if (!info->attrs[NL80211_ATTR_REASON_CODE])
3792 return -EINVAL; 3500 return -EINVAL;
3793 3501
3794 rtnl_lock(); 3502 if (!rdev->ops->deauth)
3795 3503 return -EOPNOTSUPP;
3796 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
3797 if (err)
3798 goto unlock_rtnl;
3799
3800 if (!rdev->ops->deauth) {
3801 err = -EOPNOTSUPP;
3802 goto out;
3803 }
3804
3805 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
3806 err = -EOPNOTSUPP;
3807 goto out;
3808 }
3809 3504
3810 if (!netif_running(dev)) { 3505 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
3811 err = -ENETDOWN; 3506 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
3812 goto out; 3507 return -EOPNOTSUPP;
3813 }
3814 3508
3815 bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); 3509 bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
3816 3510
3817 reason_code = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]); 3511 reason_code = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]);
3818 if (reason_code == 0) { 3512 if (reason_code == 0) {
3819 /* Reason Code 0 is reserved */ 3513 /* Reason Code 0 is reserved */
3820 err = -EINVAL; 3514 return -EINVAL;
3821 goto out;
3822 } 3515 }
3823 3516
3824 if (info->attrs[NL80211_ATTR_IE]) { 3517 if (info->attrs[NL80211_ATTR_IE]) {
@@ -3828,23 +3521,16 @@ static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info)
3828 3521
3829 local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE]; 3522 local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE];
3830 3523
3831 err = cfg80211_mlme_deauth(rdev, dev, bssid, ie, ie_len, reason_code, 3524 return cfg80211_mlme_deauth(rdev, dev, bssid, ie, ie_len, reason_code,
3832 local_state_change); 3525 local_state_change);
3833
3834out:
3835 cfg80211_unlock_rdev(rdev);
3836 dev_put(dev);
3837unlock_rtnl:
3838 rtnl_unlock();
3839 return err;
3840} 3526}
3841 3527
3842static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info) 3528static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info)
3843{ 3529{
3844 struct cfg80211_registered_device *rdev; 3530 struct cfg80211_registered_device *rdev = info->user_ptr[0];
3845 struct net_device *dev; 3531 struct net_device *dev = info->user_ptr[1];
3846 const u8 *ie = NULL, *bssid; 3532 const u8 *ie = NULL, *bssid;
3847 int err, ie_len = 0; 3533 int ie_len = 0;
3848 u16 reason_code; 3534 u16 reason_code;
3849 bool local_state_change; 3535 bool local_state_change;
3850 3536
@@ -3857,34 +3543,19 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info)
3857 if (!info->attrs[NL80211_ATTR_REASON_CODE]) 3543 if (!info->attrs[NL80211_ATTR_REASON_CODE])
3858 return -EINVAL; 3544 return -EINVAL;
3859 3545
3860 rtnl_lock(); 3546 if (!rdev->ops->disassoc)
3861 3547 return -EOPNOTSUPP;
3862 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
3863 if (err)
3864 goto unlock_rtnl;
3865
3866 if (!rdev->ops->disassoc) {
3867 err = -EOPNOTSUPP;
3868 goto out;
3869 }
3870
3871 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
3872 err = -EOPNOTSUPP;
3873 goto out;
3874 }
3875 3548
3876 if (!netif_running(dev)) { 3549 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
3877 err = -ENETDOWN; 3550 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
3878 goto out; 3551 return -EOPNOTSUPP;
3879 }
3880 3552
3881 bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); 3553 bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
3882 3554
3883 reason_code = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]); 3555 reason_code = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]);
3884 if (reason_code == 0) { 3556 if (reason_code == 0) {
3885 /* Reason Code 0 is reserved */ 3557 /* Reason Code 0 is reserved */
3886 err = -EINVAL; 3558 return -EINVAL;
3887 goto out;
3888 } 3559 }
3889 3560
3890 if (info->attrs[NL80211_ATTR_IE]) { 3561 if (info->attrs[NL80211_ATTR_IE]) {
@@ -3894,21 +3565,14 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info)
3894 3565
3895 local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE]; 3566 local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE];
3896 3567
3897 err = cfg80211_mlme_disassoc(rdev, dev, bssid, ie, ie_len, reason_code, 3568 return cfg80211_mlme_disassoc(rdev, dev, bssid, ie, ie_len, reason_code,
3898 local_state_change); 3569 local_state_change);
3899
3900out:
3901 cfg80211_unlock_rdev(rdev);
3902 dev_put(dev);
3903unlock_rtnl:
3904 rtnl_unlock();
3905 return err;
3906} 3570}
3907 3571
3908static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) 3572static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
3909{ 3573{
3910 struct cfg80211_registered_device *rdev; 3574 struct cfg80211_registered_device *rdev = info->user_ptr[0];
3911 struct net_device *dev; 3575 struct net_device *dev = info->user_ptr[1];
3912 struct cfg80211_ibss_params ibss; 3576 struct cfg80211_ibss_params ibss;
3913 struct wiphy *wiphy; 3577 struct wiphy *wiphy;
3914 struct cfg80211_cached_keys *connkeys = NULL; 3578 struct cfg80211_cached_keys *connkeys = NULL;
@@ -3933,26 +3597,11 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
3933 return -EINVAL; 3597 return -EINVAL;
3934 } 3598 }
3935 3599
3936 rtnl_lock(); 3600 if (!rdev->ops->join_ibss)
3937 3601 return -EOPNOTSUPP;
3938 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
3939 if (err)
3940 goto unlock_rtnl;
3941
3942 if (!rdev->ops->join_ibss) {
3943 err = -EOPNOTSUPP;
3944 goto out;
3945 }
3946
3947 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) {
3948 err = -EOPNOTSUPP;
3949 goto out;
3950 }
3951 3602
3952 if (!netif_running(dev)) { 3603 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC)
3953 err = -ENETDOWN; 3604 return -EOPNOTSUPP;
3954 goto out;
3955 }
3956 3605
3957 wiphy = &rdev->wiphy; 3606 wiphy = &rdev->wiphy;
3958 3607
@@ -3970,24 +3619,12 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
3970 nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); 3619 nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]));
3971 if (!ibss.channel || 3620 if (!ibss.channel ||
3972 ibss.channel->flags & IEEE80211_CHAN_NO_IBSS || 3621 ibss.channel->flags & IEEE80211_CHAN_NO_IBSS ||
3973 ibss.channel->flags & IEEE80211_CHAN_DISABLED) { 3622 ibss.channel->flags & IEEE80211_CHAN_DISABLED)
3974 err = -EINVAL; 3623 return -EINVAL;
3975 goto out;
3976 }
3977 3624
3978 ibss.channel_fixed = !!info->attrs[NL80211_ATTR_FREQ_FIXED]; 3625 ibss.channel_fixed = !!info->attrs[NL80211_ATTR_FREQ_FIXED];
3979 ibss.privacy = !!info->attrs[NL80211_ATTR_PRIVACY]; 3626 ibss.privacy = !!info->attrs[NL80211_ATTR_PRIVACY];
3980 3627
3981 if (ibss.privacy && info->attrs[NL80211_ATTR_KEYS]) {
3982 connkeys = nl80211_parse_connkeys(rdev,
3983 info->attrs[NL80211_ATTR_KEYS]);
3984 if (IS_ERR(connkeys)) {
3985 err = PTR_ERR(connkeys);
3986 connkeys = NULL;
3987 goto out;
3988 }
3989 }
3990
3991 if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) { 3628 if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) {
3992 u8 *rates = 3629 u8 *rates =
3993 nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]); 3630 nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]);
@@ -3997,10 +3634,8 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
3997 wiphy->bands[ibss.channel->band]; 3634 wiphy->bands[ibss.channel->band];
3998 int i, j; 3635 int i, j;
3999 3636
4000 if (n_rates == 0) { 3637 if (n_rates == 0)
4001 err = -EINVAL; 3638 return -EINVAL;
4002 goto out;
4003 }
4004 3639
4005 for (i = 0; i < n_rates; i++) { 3640 for (i = 0; i < n_rates; i++) {
4006 int rate = (rates[i] & 0x7f) * 5; 3641 int rate = (rates[i] & 0x7f) * 5;
@@ -4013,77 +3648,36 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
4013 break; 3648 break;
4014 } 3649 }
4015 } 3650 }
4016 if (!found) { 3651 if (!found)
4017 err = -EINVAL; 3652 return -EINVAL;
4018 goto out;
4019 }
4020 }
4021 } else {
4022 /*
4023 * If no rates were explicitly configured,
4024 * use the mandatory rate set for 11b or
4025 * 11a for maximum compatibility.
4026 */
4027 struct ieee80211_supported_band *sband =
4028 wiphy->bands[ibss.channel->band];
4029 int j;
4030 u32 flag = ibss.channel->band == IEEE80211_BAND_5GHZ ?
4031 IEEE80211_RATE_MANDATORY_A :
4032 IEEE80211_RATE_MANDATORY_B;
4033
4034 for (j = 0; j < sband->n_bitrates; j++) {
4035 if (sband->bitrates[j].flags & flag)
4036 ibss.basic_rates |= BIT(j);
4037 } 3653 }
4038 } 3654 }
4039 3655
4040 err = cfg80211_join_ibss(rdev, dev, &ibss, connkeys); 3656 if (ibss.privacy && info->attrs[NL80211_ATTR_KEYS]) {
3657 connkeys = nl80211_parse_connkeys(rdev,
3658 info->attrs[NL80211_ATTR_KEYS]);
3659 if (IS_ERR(connkeys))
3660 return PTR_ERR(connkeys);
3661 }
4041 3662
4042out: 3663 err = cfg80211_join_ibss(rdev, dev, &ibss, connkeys);
4043 cfg80211_unlock_rdev(rdev);
4044 dev_put(dev);
4045unlock_rtnl:
4046 if (err) 3664 if (err)
4047 kfree(connkeys); 3665 kfree(connkeys);
4048 rtnl_unlock();
4049 return err; 3666 return err;
4050} 3667}
4051 3668
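Editor's note, not part of the patch: one detail worth calling out in the new nl80211_join_ibss() is the connection-key handling. With the lock/refcount boilerplate gone, the parsed keys are the only resource the handler still owns, so error paths return directly and the keys are freed exactly once, only when the join itself fails (on success cfg80211 takes ownership). A minimal sketch of that pattern, with the surrounding attribute validation elided:

	/* Illustrative sketch only; helper name is made up for clarity. */
	static int join_keys_sketch(struct cfg80211_registered_device *rdev,
				    struct net_device *dev,
				    struct genl_info *info,
				    struct cfg80211_ibss_params *ibss)
	{
		struct cfg80211_cached_keys *connkeys = NULL;
		int err;

		if (ibss->privacy && info->attrs[NL80211_ATTR_KEYS]) {
			connkeys = nl80211_parse_connkeys(rdev,
					info->attrs[NL80211_ATTR_KEYS]);
			if (IS_ERR(connkeys))
				return PTR_ERR(connkeys); /* nothing else held yet */
		}

		err = cfg80211_join_ibss(rdev, dev, ibss, connkeys);
		if (err)
			kfree(connkeys);	/* on success, cfg80211 owns the keys */
		return err;
	}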
4052static int nl80211_leave_ibss(struct sk_buff *skb, struct genl_info *info) 3669static int nl80211_leave_ibss(struct sk_buff *skb, struct genl_info *info)
4053{ 3670{
4054 struct cfg80211_registered_device *rdev; 3671 struct cfg80211_registered_device *rdev = info->user_ptr[0];
4055 struct net_device *dev; 3672 struct net_device *dev = info->user_ptr[1];
4056 int err;
4057
4058 rtnl_lock();
4059
4060 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
4061 if (err)
4062 goto unlock_rtnl;
4063
4064 if (!rdev->ops->leave_ibss) {
4065 err = -EOPNOTSUPP;
4066 goto out;
4067 }
4068 3673
4069 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) { 3674 if (!rdev->ops->leave_ibss)
4070 err = -EOPNOTSUPP; 3675 return -EOPNOTSUPP;
4071 goto out;
4072 }
4073
4074 if (!netif_running(dev)) {
4075 err = -ENETDOWN;
4076 goto out;
4077 }
4078 3676
4079 err = cfg80211_leave_ibss(rdev, dev, false); 3677 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC)
3678 return -EOPNOTSUPP;
4080 3679
4081out: 3680 return cfg80211_leave_ibss(rdev, dev, false);
4082 cfg80211_unlock_rdev(rdev);
4083 dev_put(dev);
4084unlock_rtnl:
4085 rtnl_unlock();
4086 return err;
4087} 3681}
4088 3682
4089#ifdef CONFIG_NL80211_TESTMODE 3683#ifdef CONFIG_NL80211_TESTMODE
@@ -4093,20 +3687,12 @@ static struct genl_multicast_group nl80211_testmode_mcgrp = {
4093 3687
4094static int nl80211_testmode_do(struct sk_buff *skb, struct genl_info *info) 3688static int nl80211_testmode_do(struct sk_buff *skb, struct genl_info *info)
4095{ 3689{
4096 struct cfg80211_registered_device *rdev; 3690 struct cfg80211_registered_device *rdev = info->user_ptr[0];
4097 int err; 3691 int err;
4098 3692
4099 if (!info->attrs[NL80211_ATTR_TESTDATA]) 3693 if (!info->attrs[NL80211_ATTR_TESTDATA])
4100 return -EINVAL; 3694 return -EINVAL;
4101 3695
4102 rtnl_lock();
4103
4104 rdev = cfg80211_get_dev_from_info(info);
4105 if (IS_ERR(rdev)) {
4106 err = PTR_ERR(rdev);
4107 goto unlock_rtnl;
4108 }
4109
4110 err = -EOPNOTSUPP; 3696 err = -EOPNOTSUPP;
4111 if (rdev->ops->testmode_cmd) { 3697 if (rdev->ops->testmode_cmd) {
4112 rdev->testmode_info = info; 3698 rdev->testmode_info = info;
@@ -4116,10 +3702,6 @@ static int nl80211_testmode_do(struct sk_buff *skb, struct genl_info *info)
4116 rdev->testmode_info = NULL; 3702 rdev->testmode_info = NULL;
4117 } 3703 }
4118 3704
4119 cfg80211_unlock_rdev(rdev);
4120
4121 unlock_rtnl:
4122 rtnl_unlock();
4123 return err; 3705 return err;
4124} 3706}
4125 3707
@@ -4210,8 +3792,8 @@ EXPORT_SYMBOL(cfg80211_testmode_event);
4210 3792
4211static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) 3793static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
4212{ 3794{
4213 struct cfg80211_registered_device *rdev; 3795 struct cfg80211_registered_device *rdev = info->user_ptr[0];
4214 struct net_device *dev; 3796 struct net_device *dev = info->user_ptr[1];
4215 struct cfg80211_connect_params connect; 3797 struct cfg80211_connect_params connect;
4216 struct wiphy *wiphy; 3798 struct wiphy *wiphy;
4217 struct cfg80211_cached_keys *connkeys = NULL; 3799 struct cfg80211_cached_keys *connkeys = NULL;
@@ -4236,25 +3818,14 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
4236 3818
4237 connect.privacy = info->attrs[NL80211_ATTR_PRIVACY]; 3819 connect.privacy = info->attrs[NL80211_ATTR_PRIVACY];
4238 3820
4239 err = nl80211_crypto_settings(info, &connect.crypto, 3821 err = nl80211_crypto_settings(rdev, info, &connect.crypto,
4240 NL80211_MAX_NR_CIPHER_SUITES); 3822 NL80211_MAX_NR_CIPHER_SUITES);
4241 if (err) 3823 if (err)
4242 return err; 3824 return err;
4243 rtnl_lock();
4244 3825
4245 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); 3826 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
4246 if (err) 3827 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
4247 goto unlock_rtnl; 3828 return -EOPNOTSUPP;
4248
4249 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
4250 err = -EOPNOTSUPP;
4251 goto out;
4252 }
4253
4254 if (!netif_running(dev)) {
4255 err = -ENETDOWN;
4256 goto out;
4257 }
4258 3829
4259 wiphy = &rdev->wiphy; 3830 wiphy = &rdev->wiphy;
4260 3831
@@ -4273,39 +3844,27 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
4273 ieee80211_get_channel(wiphy, 3844 ieee80211_get_channel(wiphy,
4274 nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); 3845 nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]));
4275 if (!connect.channel || 3846 if (!connect.channel ||
4276 connect.channel->flags & IEEE80211_CHAN_DISABLED) { 3847 connect.channel->flags & IEEE80211_CHAN_DISABLED)
4277 err = -EINVAL; 3848 return -EINVAL;
4278 goto out;
4279 }
4280 } 3849 }
4281 3850
4282 if (connect.privacy && info->attrs[NL80211_ATTR_KEYS]) { 3851 if (connect.privacy && info->attrs[NL80211_ATTR_KEYS]) {
4283 connkeys = nl80211_parse_connkeys(rdev, 3852 connkeys = nl80211_parse_connkeys(rdev,
4284 info->attrs[NL80211_ATTR_KEYS]); 3853 info->attrs[NL80211_ATTR_KEYS]);
4285 if (IS_ERR(connkeys)) { 3854 if (IS_ERR(connkeys))
4286 err = PTR_ERR(connkeys); 3855 return PTR_ERR(connkeys);
4287 connkeys = NULL;
4288 goto out;
4289 }
4290 } 3856 }
4291 3857
4292 err = cfg80211_connect(rdev, dev, &connect, connkeys); 3858 err = cfg80211_connect(rdev, dev, &connect, connkeys);
4293
4294out:
4295 cfg80211_unlock_rdev(rdev);
4296 dev_put(dev);
4297unlock_rtnl:
4298 if (err) 3859 if (err)
4299 kfree(connkeys); 3860 kfree(connkeys);
4300 rtnl_unlock();
4301 return err; 3861 return err;
4302} 3862}
4303 3863
4304static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info) 3864static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info)
4305{ 3865{
4306 struct cfg80211_registered_device *rdev; 3866 struct cfg80211_registered_device *rdev = info->user_ptr[0];
4307 struct net_device *dev; 3867 struct net_device *dev = info->user_ptr[1];
4308 int err;
4309 u16 reason; 3868 u16 reason;
4310 3869
4311 if (!info->attrs[NL80211_ATTR_REASON_CODE]) 3870 if (!info->attrs[NL80211_ATTR_REASON_CODE])
@@ -4316,35 +3875,16 @@ static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info)
4316 if (reason == 0) 3875 if (reason == 0)
4317 return -EINVAL; 3876 return -EINVAL;
4318 3877
4319 rtnl_lock(); 3878 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
4320 3879 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
4321 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); 3880 return -EOPNOTSUPP;
4322 if (err)
4323 goto unlock_rtnl;
4324
4325 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
4326 err = -EOPNOTSUPP;
4327 goto out;
4328 }
4329
4330 if (!netif_running(dev)) {
4331 err = -ENETDOWN;
4332 goto out;
4333 }
4334
4335 err = cfg80211_disconnect(rdev, dev, reason, true);
4336 3881
4337out: 3882 return cfg80211_disconnect(rdev, dev, reason, true);
4338 cfg80211_unlock_rdev(rdev);
4339 dev_put(dev);
4340unlock_rtnl:
4341 rtnl_unlock();
4342 return err;
4343} 3883}
4344 3884
4345static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info) 3885static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info)
4346{ 3886{
4347 struct cfg80211_registered_device *rdev; 3887 struct cfg80211_registered_device *rdev = info->user_ptr[0];
4348 struct net *net; 3888 struct net *net;
4349 int err; 3889 int err;
4350 u32 pid; 3890 u32 pid;
@@ -4354,43 +3894,26 @@ static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info)
4354 3894
4355 pid = nla_get_u32(info->attrs[NL80211_ATTR_PID]); 3895 pid = nla_get_u32(info->attrs[NL80211_ATTR_PID]);
4356 3896
4357 rtnl_lock();
4358
4359 rdev = cfg80211_get_dev_from_info(info);
4360 if (IS_ERR(rdev)) {
4361 err = PTR_ERR(rdev);
4362 goto out_rtnl;
4363 }
4364
4365 net = get_net_ns_by_pid(pid); 3897 net = get_net_ns_by_pid(pid);
4366 if (IS_ERR(net)) { 3898 if (IS_ERR(net))
4367 err = PTR_ERR(net); 3899 return PTR_ERR(net);
4368 goto out;
4369 }
4370 3900
4371 err = 0; 3901 err = 0;
4372 3902
4373 /* check if anything to do */ 3903 /* check if anything to do */
4374 if (net_eq(wiphy_net(&rdev->wiphy), net)) 3904 if (!net_eq(wiphy_net(&rdev->wiphy), net))
4375 goto out_put_net; 3905 err = cfg80211_switch_netns(rdev, net);
4376 3906
4377 err = cfg80211_switch_netns(rdev, net);
4378 out_put_net:
4379 put_net(net); 3907 put_net(net);
4380 out:
4381 cfg80211_unlock_rdev(rdev);
4382 out_rtnl:
4383 rtnl_unlock();
4384 return err; 3908 return err;
4385} 3909}
4386 3910
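Editor's note, not part of the patch: the netns handler now reduces to resolving the target namespace, switching only when it differs from the wiphy's current one, and always dropping the namespace reference. A condensed sketch of the new flow, using only calls visible in the hunk above:

	/* Sketch of the simplified NL80211_CMD_SET_WIPHY_NETNS flow. */
	static int netns_sketch(struct cfg80211_registered_device *rdev, u32 pid)
	{
		struct net *net = get_net_ns_by_pid(pid);
		int err = 0;

		if (IS_ERR(net))
			return PTR_ERR(net);

		if (!net_eq(wiphy_net(&rdev->wiphy), net))
			err = cfg80211_switch_netns(rdev, net);

		put_net(net);	/* drop the ref taken by get_net_ns_by_pid() */
		return err;
	}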
4387static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info) 3911static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info)
4388{ 3912{
4389 struct cfg80211_registered_device *rdev; 3913 struct cfg80211_registered_device *rdev = info->user_ptr[0];
4390 int (*rdev_ops)(struct wiphy *wiphy, struct net_device *dev, 3914 int (*rdev_ops)(struct wiphy *wiphy, struct net_device *dev,
4391 struct cfg80211_pmksa *pmksa) = NULL; 3915 struct cfg80211_pmksa *pmksa) = NULL;
4392 int err; 3916 struct net_device *dev = info->user_ptr[1];
4393 struct net_device *dev;
4394 struct cfg80211_pmksa pmksa; 3917 struct cfg80211_pmksa pmksa;
4395 3918
4396 memset(&pmksa, 0, sizeof(struct cfg80211_pmksa)); 3919 memset(&pmksa, 0, sizeof(struct cfg80211_pmksa));
@@ -4401,19 +3924,12 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info)
4401 if (!info->attrs[NL80211_ATTR_PMKID]) 3924 if (!info->attrs[NL80211_ATTR_PMKID])
4402 return -EINVAL; 3925 return -EINVAL;
4403 3926
4404 rtnl_lock();
4405
4406 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
4407 if (err)
4408 goto out_rtnl;
4409
4410 pmksa.pmkid = nla_data(info->attrs[NL80211_ATTR_PMKID]); 3927 pmksa.pmkid = nla_data(info->attrs[NL80211_ATTR_PMKID]);
4411 pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); 3928 pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
4412 3929
4413 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) { 3930 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
4414 err = -EOPNOTSUPP; 3931 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
4415 goto out; 3932 return -EOPNOTSUPP;
4416 }
4417 3933
4418 switch (info->genlhdr->cmd) { 3934 switch (info->genlhdr->cmd) {
4419 case NL80211_CMD_SET_PMKSA: 3935 case NL80211_CMD_SET_PMKSA:
@@ -4427,61 +3943,32 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info)
4427 break; 3943 break;
4428 } 3944 }
4429 3945
4430 if (!rdev_ops) { 3946 if (!rdev_ops)
4431 err = -EOPNOTSUPP; 3947 return -EOPNOTSUPP;
4432 goto out;
4433 }
4434
4435 err = rdev_ops(&rdev->wiphy, dev, &pmksa);
4436
4437 out:
4438 cfg80211_unlock_rdev(rdev);
4439 dev_put(dev);
4440 out_rtnl:
4441 rtnl_unlock();
4442 3948
4443 return err; 3949 return rdev_ops(&rdev->wiphy, dev, &pmksa);
4444} 3950}
4445 3951
4446static int nl80211_flush_pmksa(struct sk_buff *skb, struct genl_info *info) 3952static int nl80211_flush_pmksa(struct sk_buff *skb, struct genl_info *info)
4447{ 3953{
4448 struct cfg80211_registered_device *rdev; 3954 struct cfg80211_registered_device *rdev = info->user_ptr[0];
4449 int err; 3955 struct net_device *dev = info->user_ptr[1];
4450 struct net_device *dev;
4451
4452 rtnl_lock();
4453
4454 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
4455 if (err)
4456 goto out_rtnl;
4457
4458 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
4459 err = -EOPNOTSUPP;
4460 goto out;
4461 }
4462
4463 if (!rdev->ops->flush_pmksa) {
4464 err = -EOPNOTSUPP;
4465 goto out;
4466 }
4467
4468 err = rdev->ops->flush_pmksa(&rdev->wiphy, dev);
4469 3956
4470 out: 3957 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
4471 cfg80211_unlock_rdev(rdev); 3958 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
4472 dev_put(dev); 3959 return -EOPNOTSUPP;
4473 out_rtnl:
4474 rtnl_unlock();
4475 3960
4476 return err; 3961 if (!rdev->ops->flush_pmksa)
3962 return -EOPNOTSUPP;
4477 3963
3964 return rdev->ops->flush_pmksa(&rdev->wiphy, dev);
4478} 3965}
4479 3966
4480static int nl80211_remain_on_channel(struct sk_buff *skb, 3967static int nl80211_remain_on_channel(struct sk_buff *skb,
4481 struct genl_info *info) 3968 struct genl_info *info)
4482{ 3969{
4483 struct cfg80211_registered_device *rdev; 3970 struct cfg80211_registered_device *rdev = info->user_ptr[0];
4484 struct net_device *dev; 3971 struct net_device *dev = info->user_ptr[1];
4485 struct ieee80211_channel *chan; 3972 struct ieee80211_channel *chan;
4486 struct sk_buff *msg; 3973 struct sk_buff *msg;
4487 void *hdr; 3974 void *hdr;
@@ -4503,21 +3990,8 @@ static int nl80211_remain_on_channel(struct sk_buff *skb,
4503 if (!duration || !msecs_to_jiffies(duration) || duration > 5000) 3990 if (!duration || !msecs_to_jiffies(duration) || duration > 5000)
4504 return -EINVAL; 3991 return -EINVAL;
4505 3992
4506 rtnl_lock(); 3993 if (!rdev->ops->remain_on_channel)
4507 3994 return -EOPNOTSUPP;
4508 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
4509 if (err)
4510 goto unlock_rtnl;
4511
4512 if (!rdev->ops->remain_on_channel) {
4513 err = -EOPNOTSUPP;
4514 goto out;
4515 }
4516
4517 if (!netif_running(dev)) {
4518 err = -ENETDOWN;
4519 goto out;
4520 }
4521 3995
4522 if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) { 3996 if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) {
4523 channel_type = nla_get_u32( 3997 channel_type = nla_get_u32(
@@ -4525,24 +3999,18 @@ static int nl80211_remain_on_channel(struct sk_buff *skb,
4525 if (channel_type != NL80211_CHAN_NO_HT && 3999 if (channel_type != NL80211_CHAN_NO_HT &&
4526 channel_type != NL80211_CHAN_HT20 && 4000 channel_type != NL80211_CHAN_HT20 &&
4527 channel_type != NL80211_CHAN_HT40PLUS && 4001 channel_type != NL80211_CHAN_HT40PLUS &&
4528 channel_type != NL80211_CHAN_HT40MINUS) { 4002 channel_type != NL80211_CHAN_HT40MINUS)
4529 err = -EINVAL; 4003 return -EINVAL;
4530 goto out;
4531 }
4532 } 4004 }
4533 4005
4534 freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]); 4006 freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]);
4535 chan = rdev_freq_to_chan(rdev, freq, channel_type); 4007 chan = rdev_freq_to_chan(rdev, freq, channel_type);
4536 if (chan == NULL) { 4008 if (chan == NULL)
4537 err = -EINVAL; 4009 return -EINVAL;
4538 goto out;
4539 }
4540 4010
4541 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 4011 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
4542 if (!msg) { 4012 if (!msg)
4543 err = -ENOMEM; 4013 return -ENOMEM;
4544 goto out;
4545 }
4546 4014
4547 hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, 4015 hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
4548 NL80211_CMD_REMAIN_ON_CHANNEL); 4016 NL80211_CMD_REMAIN_ON_CHANNEL);
@@ -4561,58 +4029,32 @@ static int nl80211_remain_on_channel(struct sk_buff *skb,
4561 NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie); 4029 NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie);
4562 4030
4563 genlmsg_end(msg, hdr); 4031 genlmsg_end(msg, hdr);
4564 err = genlmsg_reply(msg, info); 4032
4565 goto out; 4033 return genlmsg_reply(msg, info);
4566 4034
4567 nla_put_failure: 4035 nla_put_failure:
4568 err = -ENOBUFS; 4036 err = -ENOBUFS;
4569 free_msg: 4037 free_msg:
4570 nlmsg_free(msg); 4038 nlmsg_free(msg);
4571 out:
4572 cfg80211_unlock_rdev(rdev);
4573 dev_put(dev);
4574 unlock_rtnl:
4575 rtnl_unlock();
4576 return err; 4039 return err;
4577} 4040}
4578 4041
4579static int nl80211_cancel_remain_on_channel(struct sk_buff *skb, 4042static int nl80211_cancel_remain_on_channel(struct sk_buff *skb,
4580 struct genl_info *info) 4043 struct genl_info *info)
4581{ 4044{
4582 struct cfg80211_registered_device *rdev; 4045 struct cfg80211_registered_device *rdev = info->user_ptr[0];
4583 struct net_device *dev; 4046 struct net_device *dev = info->user_ptr[1];
4584 u64 cookie; 4047 u64 cookie;
4585 int err;
4586 4048
4587 if (!info->attrs[NL80211_ATTR_COOKIE]) 4049 if (!info->attrs[NL80211_ATTR_COOKIE])
4588 return -EINVAL; 4050 return -EINVAL;
4589 4051
4590 rtnl_lock(); 4052 if (!rdev->ops->cancel_remain_on_channel)
4591 4053 return -EOPNOTSUPP;
4592 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
4593 if (err)
4594 goto unlock_rtnl;
4595
4596 if (!rdev->ops->cancel_remain_on_channel) {
4597 err = -EOPNOTSUPP;
4598 goto out;
4599 }
4600
4601 if (!netif_running(dev)) {
4602 err = -ENETDOWN;
4603 goto out;
4604 }
4605 4054
4606 cookie = nla_get_u64(info->attrs[NL80211_ATTR_COOKIE]); 4055 cookie = nla_get_u64(info->attrs[NL80211_ATTR_COOKIE]);
4607 4056
4608 err = rdev->ops->cancel_remain_on_channel(&rdev->wiphy, dev, cookie); 4057 return rdev->ops->cancel_remain_on_channel(&rdev->wiphy, dev, cookie);
4609
4610 out:
4611 cfg80211_unlock_rdev(rdev);
4612 dev_put(dev);
4613 unlock_rtnl:
4614 rtnl_unlock();
4615 return err;
4616} 4058}
4617 4059
4618static u32 rateset_to_mask(struct ieee80211_supported_band *sband, 4060static u32 rateset_to_mask(struct ieee80211_supported_band *sband,
@@ -4648,26 +4090,18 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb,
4648 struct genl_info *info) 4090 struct genl_info *info)
4649{ 4091{
4650 struct nlattr *tb[NL80211_TXRATE_MAX + 1]; 4092 struct nlattr *tb[NL80211_TXRATE_MAX + 1];
4651 struct cfg80211_registered_device *rdev; 4093 struct cfg80211_registered_device *rdev = info->user_ptr[0];
4652 struct cfg80211_bitrate_mask mask; 4094 struct cfg80211_bitrate_mask mask;
4653 int err, rem, i; 4095 int rem, i;
4654 struct net_device *dev; 4096 struct net_device *dev = info->user_ptr[1];
4655 struct nlattr *tx_rates; 4097 struct nlattr *tx_rates;
4656 struct ieee80211_supported_band *sband; 4098 struct ieee80211_supported_band *sband;
4657 4099
4658 if (info->attrs[NL80211_ATTR_TX_RATES] == NULL) 4100 if (info->attrs[NL80211_ATTR_TX_RATES] == NULL)
4659 return -EINVAL; 4101 return -EINVAL;
4660 4102
4661 rtnl_lock(); 4103 if (!rdev->ops->set_bitrate_mask)
4662 4104 return -EOPNOTSUPP;
4663 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
4664 if (err)
4665 goto unlock_rtnl;
4666
4667 if (!rdev->ops->set_bitrate_mask) {
4668 err = -EOPNOTSUPP;
4669 goto unlock;
4670 }
4671 4105
4672 memset(&mask, 0, sizeof(mask)); 4106 memset(&mask, 0, sizeof(mask));
4673 /* Default to all rates enabled */ 4107 /* Default to all rates enabled */
@@ -4684,15 +4118,11 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb,
4684 nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) 4118 nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem)
4685 { 4119 {
4686 enum ieee80211_band band = nla_type(tx_rates); 4120 enum ieee80211_band band = nla_type(tx_rates);
4687 if (band < 0 || band >= IEEE80211_NUM_BANDS) { 4121 if (band < 0 || band >= IEEE80211_NUM_BANDS)
4688 err = -EINVAL; 4122 return -EINVAL;
4689 goto unlock;
4690 }
4691 sband = rdev->wiphy.bands[band]; 4123 sband = rdev->wiphy.bands[band];
4692 if (sband == NULL) { 4124 if (sband == NULL)
4693 err = -EINVAL; 4125 return -EINVAL;
4694 goto unlock;
4695 }
4696 nla_parse(tb, NL80211_TXRATE_MAX, nla_data(tx_rates), 4126 nla_parse(tb, NL80211_TXRATE_MAX, nla_data(tx_rates),
4697 nla_len(tx_rates), nl80211_txattr_policy); 4127 nla_len(tx_rates), nl80211_txattr_policy);
4698 if (tb[NL80211_TXRATE_LEGACY]) { 4128 if (tb[NL80211_TXRATE_LEGACY]) {
@@ -4700,68 +4130,48 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb,
4700 sband, 4130 sband,
4701 nla_data(tb[NL80211_TXRATE_LEGACY]), 4131 nla_data(tb[NL80211_TXRATE_LEGACY]),
4702 nla_len(tb[NL80211_TXRATE_LEGACY])); 4132 nla_len(tb[NL80211_TXRATE_LEGACY]));
4703 if (mask.control[band].legacy == 0) { 4133 if (mask.control[band].legacy == 0)
4704 err = -EINVAL; 4134 return -EINVAL;
4705 goto unlock;
4706 }
4707 } 4135 }
4708 } 4136 }
4709 4137
4710 err = rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, NULL, &mask); 4138 return rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, NULL, &mask);
4711
4712 unlock:
4713 dev_put(dev);
4714 cfg80211_unlock_rdev(rdev);
4715 unlock_rtnl:
4716 rtnl_unlock();
4717 return err;
4718} 4139}
4719 4140
4720static int nl80211_register_action(struct sk_buff *skb, struct genl_info *info) 4141static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info)
4721{ 4142{
4722 struct cfg80211_registered_device *rdev; 4143 struct cfg80211_registered_device *rdev = info->user_ptr[0];
4723 struct net_device *dev; 4144 struct net_device *dev = info->user_ptr[1];
4724 int err; 4145 u16 frame_type = IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION;
4725 4146
4726 if (!info->attrs[NL80211_ATTR_FRAME_MATCH]) 4147 if (!info->attrs[NL80211_ATTR_FRAME_MATCH])
4727 return -EINVAL; 4148 return -EINVAL;
4728 4149
4729 if (nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH]) < 1) 4150 if (info->attrs[NL80211_ATTR_FRAME_TYPE])
4730 return -EINVAL; 4151 frame_type = nla_get_u16(info->attrs[NL80211_ATTR_FRAME_TYPE]);
4731
4732 rtnl_lock();
4733
4734 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
4735 if (err)
4736 goto unlock_rtnl;
4737 4152
4738 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && 4153 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
4739 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) { 4154 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC &&
4740 err = -EOPNOTSUPP; 4155 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT &&
4741 goto out; 4156 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
4742 } 4157 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN &&
4158 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
4159 return -EOPNOTSUPP;
4743 4160
4744 /* not much point in registering if we can't reply */ 4161 /* not much point in registering if we can't reply */
4745 if (!rdev->ops->action) { 4162 if (!rdev->ops->mgmt_tx)
4746 err = -EOPNOTSUPP; 4163 return -EOPNOTSUPP;
4747 goto out;
4748 }
4749 4164
4750 err = cfg80211_mlme_register_action(dev->ieee80211_ptr, info->snd_pid, 4165 return cfg80211_mlme_register_mgmt(dev->ieee80211_ptr, info->snd_pid,
4166 frame_type,
4751 nla_data(info->attrs[NL80211_ATTR_FRAME_MATCH]), 4167 nla_data(info->attrs[NL80211_ATTR_FRAME_MATCH]),
4752 nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH])); 4168 nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH]));
4753 out:
4754 cfg80211_unlock_rdev(rdev);
4755 dev_put(dev);
4756 unlock_rtnl:
4757 rtnl_unlock();
4758 return err;
4759} 4169}
4760 4170
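Editor's note, not part of the patch: the rename from "action" to "mgmt" registration also widens the interface. The handler now accepts an optional NL80211_ATTR_FRAME_TYPE and defaults it to Action frames, presumably so existing userspace that never sends the attribute keeps its old behaviour. The core of the new handler, as a sketch:

	/* Sketch: frame registration after the Action -> mgmt rename. */
	u16 frame_type = IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION;

	if (info->attrs[NL80211_ATTR_FRAME_TYPE])
		frame_type = nla_get_u16(info->attrs[NL80211_ATTR_FRAME_TYPE]);

	return cfg80211_mlme_register_mgmt(dev->ieee80211_ptr, info->snd_pid,
			frame_type,
			nla_data(info->attrs[NL80211_ATTR_FRAME_MATCH]),
			nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH]));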
4761static int nl80211_action(struct sk_buff *skb, struct genl_info *info) 4171static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info)
4762{ 4172{
4763 struct cfg80211_registered_device *rdev; 4173 struct cfg80211_registered_device *rdev = info->user_ptr[0];
4764 struct net_device *dev; 4174 struct net_device *dev = info->user_ptr[1];
4765 struct ieee80211_channel *chan; 4175 struct ieee80211_channel *chan;
4766 enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT; 4176 enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT;
4767 bool channel_type_valid = false; 4177 bool channel_type_valid = false;
@@ -4775,27 +4185,16 @@ static int nl80211_action(struct sk_buff *skb, struct genl_info *info)
4775 !info->attrs[NL80211_ATTR_WIPHY_FREQ]) 4185 !info->attrs[NL80211_ATTR_WIPHY_FREQ])
4776 return -EINVAL; 4186 return -EINVAL;
4777 4187
4778 rtnl_lock(); 4188 if (!rdev->ops->mgmt_tx)
4779 4189 return -EOPNOTSUPP;
4780 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
4781 if (err)
4782 goto unlock_rtnl;
4783
4784 if (!rdev->ops->action) {
4785 err = -EOPNOTSUPP;
4786 goto out;
4787 }
4788 4190
4789 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && 4191 if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
4790 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) { 4192 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC &&
4791 err = -EOPNOTSUPP; 4193 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT &&
4792 goto out; 4194 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
4793 } 4195 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN &&
4794 4196 dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
4795 if (!netif_running(dev)) { 4197 return -EOPNOTSUPP;
4796 err = -ENETDOWN;
4797 goto out;
4798 }
4799 4198
4800 if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) { 4199 if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) {
4801 channel_type = nla_get_u32( 4200 channel_type = nla_get_u32(
@@ -4803,147 +4202,104 @@ static int nl80211_action(struct sk_buff *skb, struct genl_info *info)
4803 if (channel_type != NL80211_CHAN_NO_HT && 4202 if (channel_type != NL80211_CHAN_NO_HT &&
4804 channel_type != NL80211_CHAN_HT20 && 4203 channel_type != NL80211_CHAN_HT20 &&
4805 channel_type != NL80211_CHAN_HT40PLUS && 4204 channel_type != NL80211_CHAN_HT40PLUS &&
4806 channel_type != NL80211_CHAN_HT40MINUS) { 4205 channel_type != NL80211_CHAN_HT40MINUS)
4807 err = -EINVAL; 4206 return -EINVAL;
4808 goto out;
4809 }
4810 channel_type_valid = true; 4207 channel_type_valid = true;
4811 } 4208 }
4812 4209
4813 freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]); 4210 freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]);
4814 chan = rdev_freq_to_chan(rdev, freq, channel_type); 4211 chan = rdev_freq_to_chan(rdev, freq, channel_type);
4815 if (chan == NULL) { 4212 if (chan == NULL)
4816 err = -EINVAL; 4213 return -EINVAL;
4817 goto out;
4818 }
4819 4214
4820 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 4215 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
4821 if (!msg) { 4216 if (!msg)
4822 err = -ENOMEM; 4217 return -ENOMEM;
4823 goto out;
4824 }
4825 4218
4826 hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, 4219 hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
4827 NL80211_CMD_ACTION); 4220 NL80211_CMD_FRAME);
4828 4221
4829 if (IS_ERR(hdr)) { 4222 if (IS_ERR(hdr)) {
4830 err = PTR_ERR(hdr); 4223 err = PTR_ERR(hdr);
4831 goto free_msg; 4224 goto free_msg;
4832 } 4225 }
4833 err = cfg80211_mlme_action(rdev, dev, chan, channel_type, 4226 err = cfg80211_mlme_mgmt_tx(rdev, dev, chan, channel_type,
4834 channel_type_valid, 4227 channel_type_valid,
4835 nla_data(info->attrs[NL80211_ATTR_FRAME]), 4228 nla_data(info->attrs[NL80211_ATTR_FRAME]),
4836 nla_len(info->attrs[NL80211_ATTR_FRAME]), 4229 nla_len(info->attrs[NL80211_ATTR_FRAME]),
4837 &cookie); 4230 &cookie);
4838 if (err) 4231 if (err)
4839 goto free_msg; 4232 goto free_msg;
4840 4233
4841 NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie); 4234 NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie);
4842 4235
4843 genlmsg_end(msg, hdr); 4236 genlmsg_end(msg, hdr);
4844 err = genlmsg_reply(msg, info); 4237 return genlmsg_reply(msg, info);
4845 goto out;
4846 4238
4847 nla_put_failure: 4239 nla_put_failure:
4848 err = -ENOBUFS; 4240 err = -ENOBUFS;
4849 free_msg: 4241 free_msg:
4850 nlmsg_free(msg); 4242 nlmsg_free(msg);
4851 out:
4852 cfg80211_unlock_rdev(rdev);
4853 dev_put(dev);
4854unlock_rtnl:
4855 rtnl_unlock();
4856 return err; 4243 return err;
4857} 4244}
4858 4245
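Editor's note, not part of the patch: the TX side follows the same rename (NL80211_CMD_ACTION becomes NL80211_CMD_FRAME, cfg80211_mlme_action() becomes cfg80211_mlme_mgmt_tx()), and the reply path now hands the finished message straight to genlmsg_reply() instead of going through an out label. Condensed sketch of the tail of the handler; the only error labels left are for message construction (NLA_PUT_U64 jumps to nla_put_failure if the attribute does not fit):

	err = cfg80211_mlme_mgmt_tx(rdev, dev, chan, channel_type,
				    channel_type_valid,
				    nla_data(info->attrs[NL80211_ATTR_FRAME]),
				    nla_len(info->attrs[NL80211_ATTR_FRAME]),
				    &cookie);
	if (err)
		goto free_msg;

	NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie);
	genlmsg_end(msg, hdr);
	return genlmsg_reply(msg, info);	/* consumes msg */

 nla_put_failure:
	err = -ENOBUFS;
 free_msg:
	nlmsg_free(msg);
	return err;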
4859static int nl80211_set_power_save(struct sk_buff *skb, struct genl_info *info) 4246static int nl80211_set_power_save(struct sk_buff *skb, struct genl_info *info)
4860{ 4247{
4861 struct cfg80211_registered_device *rdev; 4248 struct cfg80211_registered_device *rdev = info->user_ptr[0];
4862 struct wireless_dev *wdev; 4249 struct wireless_dev *wdev;
4863 struct net_device *dev; 4250 struct net_device *dev = info->user_ptr[1];
4864 u8 ps_state; 4251 u8 ps_state;
4865 bool state; 4252 bool state;
4866 int err; 4253 int err;
4867 4254
4868 if (!info->attrs[NL80211_ATTR_PS_STATE]) { 4255 if (!info->attrs[NL80211_ATTR_PS_STATE])
4869 err = -EINVAL; 4256 return -EINVAL;
4870 goto out;
4871 }
4872 4257
4873 ps_state = nla_get_u32(info->attrs[NL80211_ATTR_PS_STATE]); 4258 ps_state = nla_get_u32(info->attrs[NL80211_ATTR_PS_STATE]);
4874 4259
4875 if (ps_state != NL80211_PS_DISABLED && ps_state != NL80211_PS_ENABLED) { 4260 if (ps_state != NL80211_PS_DISABLED && ps_state != NL80211_PS_ENABLED)
4876 err = -EINVAL; 4261 return -EINVAL;
4877 goto out;
4878 }
4879
4880 rtnl_lock();
4881
4882 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
4883 if (err)
4884 goto unlock_rdev;
4885 4262
4886 wdev = dev->ieee80211_ptr; 4263 wdev = dev->ieee80211_ptr;
4887 4264
4888 if (!rdev->ops->set_power_mgmt) { 4265 if (!rdev->ops->set_power_mgmt)
4889 err = -EOPNOTSUPP; 4266 return -EOPNOTSUPP;
4890 goto unlock_rdev;
4891 }
4892 4267
4893 state = (ps_state == NL80211_PS_ENABLED) ? true : false; 4268 state = (ps_state == NL80211_PS_ENABLED) ? true : false;
4894 4269
4895 if (state == wdev->ps) 4270 if (state == wdev->ps)
4896 goto unlock_rdev; 4271 return 0;
4897
4898 wdev->ps = state;
4899
4900 if (rdev->ops->set_power_mgmt(wdev->wiphy, dev, wdev->ps,
4901 wdev->ps_timeout))
4902 /* assume this means it's off */
4903 wdev->ps = false;
4904
4905unlock_rdev:
4906 cfg80211_unlock_rdev(rdev);
4907 dev_put(dev);
4908 rtnl_unlock();
4909 4272
4910out: 4273 err = rdev->ops->set_power_mgmt(wdev->wiphy, dev, state,
4274 wdev->ps_timeout);
4275 if (!err)
4276 wdev->ps = state;
4911 return err; 4277 return err;
4912} 4278}
4913 4279
4914static int nl80211_get_power_save(struct sk_buff *skb, struct genl_info *info) 4280static int nl80211_get_power_save(struct sk_buff *skb, struct genl_info *info)
4915{ 4281{
4916 struct cfg80211_registered_device *rdev; 4282 struct cfg80211_registered_device *rdev = info->user_ptr[0];
4917 enum nl80211_ps_state ps_state; 4283 enum nl80211_ps_state ps_state;
4918 struct wireless_dev *wdev; 4284 struct wireless_dev *wdev;
4919 struct net_device *dev; 4285 struct net_device *dev = info->user_ptr[1];
4920 struct sk_buff *msg; 4286 struct sk_buff *msg;
4921 void *hdr; 4287 void *hdr;
4922 int err; 4288 int err;
4923 4289
4924 rtnl_lock();
4925
4926 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
4927 if (err)
4928 goto unlock_rtnl;
4929
4930 wdev = dev->ieee80211_ptr; 4290 wdev = dev->ieee80211_ptr;
4931 4291
4932 if (!rdev->ops->set_power_mgmt) { 4292 if (!rdev->ops->set_power_mgmt)
4933 err = -EOPNOTSUPP; 4293 return -EOPNOTSUPP;
4934 goto out;
4935 }
4936 4294
4937 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 4295 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
4938 if (!msg) { 4296 if (!msg)
4939 err = -ENOMEM; 4297 return -ENOMEM;
4940 goto out;
4941 }
4942 4298
4943 hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, 4299 hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
4944 NL80211_CMD_GET_POWER_SAVE); 4300 NL80211_CMD_GET_POWER_SAVE);
4945 if (!hdr) { 4301 if (!hdr) {
4946 err = -ENOMEM; 4302 err = -ENOBUFS;
4947 goto free_msg; 4303 goto free_msg;
4948 } 4304 }
4949 4305
@@ -4955,22 +4311,12 @@ static int nl80211_get_power_save(struct sk_buff *skb, struct genl_info *info)
4955 NLA_PUT_U32(msg, NL80211_ATTR_PS_STATE, ps_state); 4311 NLA_PUT_U32(msg, NL80211_ATTR_PS_STATE, ps_state);
4956 4312
4957 genlmsg_end(msg, hdr); 4313 genlmsg_end(msg, hdr);
4958 err = genlmsg_reply(msg, info); 4314 return genlmsg_reply(msg, info);
4959 goto out;
4960 4315
4961nla_put_failure: 4316 nla_put_failure:
4962 err = -ENOBUFS; 4317 err = -ENOBUFS;
4963 4318 free_msg:
4964free_msg:
4965 nlmsg_free(msg); 4319 nlmsg_free(msg);
4966
4967out:
4968 cfg80211_unlock_rdev(rdev);
4969 dev_put(dev);
4970
4971unlock_rtnl:
4972 rtnl_unlock();
4973
4974 return err; 4320 return err;
4975} 4321}
4976 4322
@@ -4984,41 +4330,24 @@ nl80211_attr_cqm_policy[NL80211_ATTR_CQM_MAX + 1] __read_mostly = {
4984static int nl80211_set_cqm_rssi(struct genl_info *info, 4330static int nl80211_set_cqm_rssi(struct genl_info *info,
4985 s32 threshold, u32 hysteresis) 4331 s32 threshold, u32 hysteresis)
4986{ 4332{
4987 struct cfg80211_registered_device *rdev; 4333 struct cfg80211_registered_device *rdev = info->user_ptr[0];
4988 struct wireless_dev *wdev; 4334 struct wireless_dev *wdev;
4989 struct net_device *dev; 4335 struct net_device *dev = info->user_ptr[1];
4990 int err;
4991 4336
4992 if (threshold > 0) 4337 if (threshold > 0)
4993 return -EINVAL; 4338 return -EINVAL;
4994 4339
4995 rtnl_lock();
4996
4997 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
4998 if (err)
4999 goto unlock_rdev;
5000
5001 wdev = dev->ieee80211_ptr; 4340 wdev = dev->ieee80211_ptr;
5002 4341
5003 if (!rdev->ops->set_cqm_rssi_config) { 4342 if (!rdev->ops->set_cqm_rssi_config)
5004 err = -EOPNOTSUPP; 4343 return -EOPNOTSUPP;
5005 goto unlock_rdev;
5006 }
5007
5008 if (wdev->iftype != NL80211_IFTYPE_STATION) {
5009 err = -EOPNOTSUPP;
5010 goto unlock_rdev;
5011 }
5012
5013 err = rdev->ops->set_cqm_rssi_config(wdev->wiphy, dev,
5014 threshold, hysteresis);
5015 4344
5016unlock_rdev: 4345 if (wdev->iftype != NL80211_IFTYPE_STATION &&
5017 cfg80211_unlock_rdev(rdev); 4346 wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)
5018 dev_put(dev); 4347 return -EOPNOTSUPP;
5019 rtnl_unlock();
5020 4348
5021 return err; 4349 return rdev->ops->set_cqm_rssi_config(wdev->wiphy, dev,
4350 threshold, hysteresis);
5022} 4351}
5023 4352
5024static int nl80211_set_cqm(struct sk_buff *skb, struct genl_info *info) 4353static int nl80211_set_cqm(struct sk_buff *skb, struct genl_info *info)
@@ -5052,6 +4381,65 @@ out:
5052 return err; 4381 return err;
5053} 4382}
5054 4383
4384#define NL80211_FLAG_NEED_WIPHY 0x01
4385#define NL80211_FLAG_NEED_NETDEV 0x02
4386#define NL80211_FLAG_NEED_RTNL 0x04
4387#define NL80211_FLAG_CHECK_NETDEV_UP 0x08
4388#define NL80211_FLAG_NEED_NETDEV_UP (NL80211_FLAG_NEED_NETDEV |\
4389 NL80211_FLAG_CHECK_NETDEV_UP)
4390
4391static int nl80211_pre_doit(struct genl_ops *ops, struct sk_buff *skb,
4392 struct genl_info *info)
4393{
4394 struct cfg80211_registered_device *rdev;
4395 struct net_device *dev;
4396 int err;
4397 bool rtnl = ops->internal_flags & NL80211_FLAG_NEED_RTNL;
4398
4399 if (rtnl)
4400 rtnl_lock();
4401
4402 if (ops->internal_flags & NL80211_FLAG_NEED_WIPHY) {
4403 rdev = cfg80211_get_dev_from_info(info);
4404 if (IS_ERR(rdev)) {
4405 if (rtnl)
4406 rtnl_unlock();
4407 return PTR_ERR(rdev);
4408 }
4409 info->user_ptr[0] = rdev;
4410 } else if (ops->internal_flags & NL80211_FLAG_NEED_NETDEV) {
4411 err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
4412 if (err) {
4413 if (rtnl)
4414 rtnl_unlock();
4415 return err;
4416 }
4417 if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP &&
4418 !netif_running(dev)) {
4419 cfg80211_unlock_rdev(rdev);
4420 dev_put(dev);
4421 if (rtnl)
4422 rtnl_unlock();
4423 return -ENETDOWN;
4424 }
4425 info->user_ptr[0] = rdev;
4426 info->user_ptr[1] = dev;
4427 }
4428
4429 return 0;
4430}
4431
4432static void nl80211_post_doit(struct genl_ops *ops, struct sk_buff *skb,
4433 struct genl_info *info)
4434{
4435 if (info->user_ptr[0])
4436 cfg80211_unlock_rdev(info->user_ptr[0]);
4437 if (info->user_ptr[1])
4438 dev_put(info->user_ptr[1]);
4439 if (ops->internal_flags & NL80211_FLAG_NEED_RTNL)
4440 rtnl_unlock();
4441}
4442
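Editor's note, not part of the patch: these two hooks are what make every per-handler simplification above possible. nl80211_pre_doit() takes the RTNL when the op requests it, resolves the registered device (and optionally the netdev, verifying it is up), and stashes both in info->user_ptr[]; nl80211_post_doit() undoes the locking and reference counting. A handler therefore shrinks to its actual command logic. A sketch of what that looks like, with a hypothetical command and driver op standing in for the real ones:

	/* Sketch: a doit handler under the new scheme.
	 * "nl80211_do_foo", NL80211_CMD_FOO and ->foo are hypothetical. */
	static int nl80211_do_foo(struct sk_buff *skb, struct genl_info *info)
	{
		struct cfg80211_registered_device *rdev = info->user_ptr[0];
		struct net_device *dev = info->user_ptr[1];

		if (!rdev->ops->foo)
			return -EOPNOTSUPP;

		/* no rtnl_lock()/rtnl_unlock(), no dev_put(), no goto ladder:
		 * nl80211_pre_doit()/nl80211_post_doit() handle all of that */
		return rdev->ops->foo(&rdev->wiphy, dev);
	}

	/* ...and its genl_ops entry declares what the hooks must set up: */
	{
		.cmd = NL80211_CMD_FOO,		/* hypothetical */
		.doit = nl80211_do_foo,
		.policy = nl80211_policy,
		.flags = GENL_ADMIN_PERM,
		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
				  NL80211_FLAG_NEED_RTNL,
	},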
5055static struct genl_ops nl80211_ops[] = { 4443static struct genl_ops nl80211_ops[] = {
5056 { 4444 {
5057 .cmd = NL80211_CMD_GET_WIPHY, 4445 .cmd = NL80211_CMD_GET_WIPHY,
@@ -5059,12 +4447,14 @@ static struct genl_ops nl80211_ops[] = {
5059 .dumpit = nl80211_dump_wiphy, 4447 .dumpit = nl80211_dump_wiphy,
5060 .policy = nl80211_policy, 4448 .policy = nl80211_policy,
5061 /* can be retrieved by unprivileged users */ 4449 /* can be retrieved by unprivileged users */
4450 .internal_flags = NL80211_FLAG_NEED_WIPHY,
5062 }, 4451 },
5063 { 4452 {
5064 .cmd = NL80211_CMD_SET_WIPHY, 4453 .cmd = NL80211_CMD_SET_WIPHY,
5065 .doit = nl80211_set_wiphy, 4454 .doit = nl80211_set_wiphy,
5066 .policy = nl80211_policy, 4455 .policy = nl80211_policy,
5067 .flags = GENL_ADMIN_PERM, 4456 .flags = GENL_ADMIN_PERM,
4457 .internal_flags = NL80211_FLAG_NEED_RTNL,
5068 }, 4458 },
5069 { 4459 {
5070 .cmd = NL80211_CMD_GET_INTERFACE, 4460 .cmd = NL80211_CMD_GET_INTERFACE,
@@ -5072,90 +4462,119 @@ static struct genl_ops nl80211_ops[] = {
5072 .dumpit = nl80211_dump_interface, 4462 .dumpit = nl80211_dump_interface,
5073 .policy = nl80211_policy, 4463 .policy = nl80211_policy,
5074 /* can be retrieved by unprivileged users */ 4464 /* can be retrieved by unprivileged users */
4465 .internal_flags = NL80211_FLAG_NEED_NETDEV,
5075 }, 4466 },
5076 { 4467 {
5077 .cmd = NL80211_CMD_SET_INTERFACE, 4468 .cmd = NL80211_CMD_SET_INTERFACE,
5078 .doit = nl80211_set_interface, 4469 .doit = nl80211_set_interface,
5079 .policy = nl80211_policy, 4470 .policy = nl80211_policy,
5080 .flags = GENL_ADMIN_PERM, 4471 .flags = GENL_ADMIN_PERM,
4472 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4473 NL80211_FLAG_NEED_RTNL,
5081 }, 4474 },
5082 { 4475 {
5083 .cmd = NL80211_CMD_NEW_INTERFACE, 4476 .cmd = NL80211_CMD_NEW_INTERFACE,
5084 .doit = nl80211_new_interface, 4477 .doit = nl80211_new_interface,
5085 .policy = nl80211_policy, 4478 .policy = nl80211_policy,
5086 .flags = GENL_ADMIN_PERM, 4479 .flags = GENL_ADMIN_PERM,
4480 .internal_flags = NL80211_FLAG_NEED_WIPHY |
4481 NL80211_FLAG_NEED_RTNL,
5087 }, 4482 },
5088 { 4483 {
5089 .cmd = NL80211_CMD_DEL_INTERFACE, 4484 .cmd = NL80211_CMD_DEL_INTERFACE,
5090 .doit = nl80211_del_interface, 4485 .doit = nl80211_del_interface,
5091 .policy = nl80211_policy, 4486 .policy = nl80211_policy,
5092 .flags = GENL_ADMIN_PERM, 4487 .flags = GENL_ADMIN_PERM,
4488 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4489 NL80211_FLAG_NEED_RTNL,
5093 }, 4490 },
5094 { 4491 {
5095 .cmd = NL80211_CMD_GET_KEY, 4492 .cmd = NL80211_CMD_GET_KEY,
5096 .doit = nl80211_get_key, 4493 .doit = nl80211_get_key,
5097 .policy = nl80211_policy, 4494 .policy = nl80211_policy,
5098 .flags = GENL_ADMIN_PERM, 4495 .flags = GENL_ADMIN_PERM,
4496 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4497 NL80211_FLAG_NEED_RTNL,
5099 }, 4498 },
5100 { 4499 {
5101 .cmd = NL80211_CMD_SET_KEY, 4500 .cmd = NL80211_CMD_SET_KEY,
5102 .doit = nl80211_set_key, 4501 .doit = nl80211_set_key,
5103 .policy = nl80211_policy, 4502 .policy = nl80211_policy,
5104 .flags = GENL_ADMIN_PERM, 4503 .flags = GENL_ADMIN_PERM,
4504 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
4505 NL80211_FLAG_NEED_RTNL,
5105 }, 4506 },
5106 { 4507 {
5107 .cmd = NL80211_CMD_NEW_KEY, 4508 .cmd = NL80211_CMD_NEW_KEY,
5108 .doit = nl80211_new_key, 4509 .doit = nl80211_new_key,
5109 .policy = nl80211_policy, 4510 .policy = nl80211_policy,
5110 .flags = GENL_ADMIN_PERM, 4511 .flags = GENL_ADMIN_PERM,
4512 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
4513 NL80211_FLAG_NEED_RTNL,
5111 }, 4514 },
5112 { 4515 {
5113 .cmd = NL80211_CMD_DEL_KEY, 4516 .cmd = NL80211_CMD_DEL_KEY,
5114 .doit = nl80211_del_key, 4517 .doit = nl80211_del_key,
5115 .policy = nl80211_policy, 4518 .policy = nl80211_policy,
5116 .flags = GENL_ADMIN_PERM, 4519 .flags = GENL_ADMIN_PERM,
4520 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
4521 NL80211_FLAG_NEED_RTNL,
5117 }, 4522 },
5118 { 4523 {
5119 .cmd = NL80211_CMD_SET_BEACON, 4524 .cmd = NL80211_CMD_SET_BEACON,
5120 .policy = nl80211_policy, 4525 .policy = nl80211_policy,
5121 .flags = GENL_ADMIN_PERM, 4526 .flags = GENL_ADMIN_PERM,
5122 .doit = nl80211_addset_beacon, 4527 .doit = nl80211_addset_beacon,
4528 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4529 NL80211_FLAG_NEED_RTNL,
5123 }, 4530 },
5124 { 4531 {
5125 .cmd = NL80211_CMD_NEW_BEACON, 4532 .cmd = NL80211_CMD_NEW_BEACON,
5126 .policy = nl80211_policy, 4533 .policy = nl80211_policy,
5127 .flags = GENL_ADMIN_PERM, 4534 .flags = GENL_ADMIN_PERM,
5128 .doit = nl80211_addset_beacon, 4535 .doit = nl80211_addset_beacon,
4536 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4537 NL80211_FLAG_NEED_RTNL,
5129 }, 4538 },
5130 { 4539 {
5131 .cmd = NL80211_CMD_DEL_BEACON, 4540 .cmd = NL80211_CMD_DEL_BEACON,
5132 .policy = nl80211_policy, 4541 .policy = nl80211_policy,
5133 .flags = GENL_ADMIN_PERM, 4542 .flags = GENL_ADMIN_PERM,
5134 .doit = nl80211_del_beacon, 4543 .doit = nl80211_del_beacon,
4544 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4545 NL80211_FLAG_NEED_RTNL,
5135 }, 4546 },
5136 { 4547 {
5137 .cmd = NL80211_CMD_GET_STATION, 4548 .cmd = NL80211_CMD_GET_STATION,
5138 .doit = nl80211_get_station, 4549 .doit = nl80211_get_station,
5139 .dumpit = nl80211_dump_station, 4550 .dumpit = nl80211_dump_station,
5140 .policy = nl80211_policy, 4551 .policy = nl80211_policy,
4552 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4553 NL80211_FLAG_NEED_RTNL,
5141 }, 4554 },
5142 { 4555 {
5143 .cmd = NL80211_CMD_SET_STATION, 4556 .cmd = NL80211_CMD_SET_STATION,
5144 .doit = nl80211_set_station, 4557 .doit = nl80211_set_station,
5145 .policy = nl80211_policy, 4558 .policy = nl80211_policy,
5146 .flags = GENL_ADMIN_PERM, 4559 .flags = GENL_ADMIN_PERM,
4560 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4561 NL80211_FLAG_NEED_RTNL,
5147 }, 4562 },
5148 { 4563 {
5149 .cmd = NL80211_CMD_NEW_STATION, 4564 .cmd = NL80211_CMD_NEW_STATION,
5150 .doit = nl80211_new_station, 4565 .doit = nl80211_new_station,
5151 .policy = nl80211_policy, 4566 .policy = nl80211_policy,
5152 .flags = GENL_ADMIN_PERM, 4567 .flags = GENL_ADMIN_PERM,
4568 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
4569 NL80211_FLAG_NEED_RTNL,
5153 }, 4570 },
5154 { 4571 {
5155 .cmd = NL80211_CMD_DEL_STATION, 4572 .cmd = NL80211_CMD_DEL_STATION,
5156 .doit = nl80211_del_station, 4573 .doit = nl80211_del_station,
5157 .policy = nl80211_policy, 4574 .policy = nl80211_policy,
5158 .flags = GENL_ADMIN_PERM, 4575 .flags = GENL_ADMIN_PERM,
4576 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4577 NL80211_FLAG_NEED_RTNL,
5159 }, 4578 },
5160 { 4579 {
5161 .cmd = NL80211_CMD_GET_MPATH, 4580 .cmd = NL80211_CMD_GET_MPATH,
@@ -5163,30 +4582,40 @@ static struct genl_ops nl80211_ops[] = {
5163 .dumpit = nl80211_dump_mpath, 4582 .dumpit = nl80211_dump_mpath,
5164 .policy = nl80211_policy, 4583 .policy = nl80211_policy,
5165 .flags = GENL_ADMIN_PERM, 4584 .flags = GENL_ADMIN_PERM,
4585 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
4586 NL80211_FLAG_NEED_RTNL,
5166 }, 4587 },
5167 { 4588 {
5168 .cmd = NL80211_CMD_SET_MPATH, 4589 .cmd = NL80211_CMD_SET_MPATH,
5169 .doit = nl80211_set_mpath, 4590 .doit = nl80211_set_mpath,
5170 .policy = nl80211_policy, 4591 .policy = nl80211_policy,
5171 .flags = GENL_ADMIN_PERM, 4592 .flags = GENL_ADMIN_PERM,
4593 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
4594 NL80211_FLAG_NEED_RTNL,
5172 }, 4595 },
5173 { 4596 {
5174 .cmd = NL80211_CMD_NEW_MPATH, 4597 .cmd = NL80211_CMD_NEW_MPATH,
5175 .doit = nl80211_new_mpath, 4598 .doit = nl80211_new_mpath,
5176 .policy = nl80211_policy, 4599 .policy = nl80211_policy,
5177 .flags = GENL_ADMIN_PERM, 4600 .flags = GENL_ADMIN_PERM,
4601 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
4602 NL80211_FLAG_NEED_RTNL,
5178 }, 4603 },
5179 { 4604 {
5180 .cmd = NL80211_CMD_DEL_MPATH, 4605 .cmd = NL80211_CMD_DEL_MPATH,
5181 .doit = nl80211_del_mpath, 4606 .doit = nl80211_del_mpath,
5182 .policy = nl80211_policy, 4607 .policy = nl80211_policy,
5183 .flags = GENL_ADMIN_PERM, 4608 .flags = GENL_ADMIN_PERM,
4609 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4610 NL80211_FLAG_NEED_RTNL,
5184 }, 4611 },
5185 { 4612 {
5186 .cmd = NL80211_CMD_SET_BSS, 4613 .cmd = NL80211_CMD_SET_BSS,
5187 .doit = nl80211_set_bss, 4614 .doit = nl80211_set_bss,
5188 .policy = nl80211_policy, 4615 .policy = nl80211_policy,
5189 .flags = GENL_ADMIN_PERM, 4616 .flags = GENL_ADMIN_PERM,
4617 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4618 NL80211_FLAG_NEED_RTNL,
5190 }, 4619 },
5191 { 4620 {
5192 .cmd = NL80211_CMD_GET_REG, 4621 .cmd = NL80211_CMD_GET_REG,
@@ -5211,18 +4640,24 @@ static struct genl_ops nl80211_ops[] = {
5211 .doit = nl80211_get_mesh_params, 4640 .doit = nl80211_get_mesh_params,
5212 .policy = nl80211_policy, 4641 .policy = nl80211_policy,
5213 /* can be retrieved by unprivileged users */ 4642 /* can be retrieved by unprivileged users */
4643 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4644 NL80211_FLAG_NEED_RTNL,
5214 }, 4645 },
5215 { 4646 {
5216 .cmd = NL80211_CMD_SET_MESH_PARAMS, 4647 .cmd = NL80211_CMD_SET_MESH_PARAMS,
5217 .doit = nl80211_set_mesh_params, 4648 .doit = nl80211_set_mesh_params,
5218 .policy = nl80211_policy, 4649 .policy = nl80211_policy,
5219 .flags = GENL_ADMIN_PERM, 4650 .flags = GENL_ADMIN_PERM,
4651 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4652 NL80211_FLAG_NEED_RTNL,
5220 }, 4653 },
5221 { 4654 {
5222 .cmd = NL80211_CMD_TRIGGER_SCAN, 4655 .cmd = NL80211_CMD_TRIGGER_SCAN,
5223 .doit = nl80211_trigger_scan, 4656 .doit = nl80211_trigger_scan,
5224 .policy = nl80211_policy, 4657 .policy = nl80211_policy,
5225 .flags = GENL_ADMIN_PERM, 4658 .flags = GENL_ADMIN_PERM,
4659 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
4660 NL80211_FLAG_NEED_RTNL,
5226 }, 4661 },
5227 { 4662 {
5228 .cmd = NL80211_CMD_GET_SCAN, 4663 .cmd = NL80211_CMD_GET_SCAN,
@@ -5234,36 +4669,48 @@ static struct genl_ops nl80211_ops[] = {
5234 .doit = nl80211_authenticate, 4669 .doit = nl80211_authenticate,
5235 .policy = nl80211_policy, 4670 .policy = nl80211_policy,
5236 .flags = GENL_ADMIN_PERM, 4671 .flags = GENL_ADMIN_PERM,
4672 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
4673 NL80211_FLAG_NEED_RTNL,
5237 }, 4674 },
5238 { 4675 {
5239 .cmd = NL80211_CMD_ASSOCIATE, 4676 .cmd = NL80211_CMD_ASSOCIATE,
5240 .doit = nl80211_associate, 4677 .doit = nl80211_associate,
5241 .policy = nl80211_policy, 4678 .policy = nl80211_policy,
5242 .flags = GENL_ADMIN_PERM, 4679 .flags = GENL_ADMIN_PERM,
4680 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
4681 NL80211_FLAG_NEED_RTNL,
5243 }, 4682 },
5244 { 4683 {
5245 .cmd = NL80211_CMD_DEAUTHENTICATE, 4684 .cmd = NL80211_CMD_DEAUTHENTICATE,
5246 .doit = nl80211_deauthenticate, 4685 .doit = nl80211_deauthenticate,
5247 .policy = nl80211_policy, 4686 .policy = nl80211_policy,
5248 .flags = GENL_ADMIN_PERM, 4687 .flags = GENL_ADMIN_PERM,
4688 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
4689 NL80211_FLAG_NEED_RTNL,
5249 }, 4690 },
5250 { 4691 {
5251 .cmd = NL80211_CMD_DISASSOCIATE, 4692 .cmd = NL80211_CMD_DISASSOCIATE,
5252 .doit = nl80211_disassociate, 4693 .doit = nl80211_disassociate,
5253 .policy = nl80211_policy, 4694 .policy = nl80211_policy,
5254 .flags = GENL_ADMIN_PERM, 4695 .flags = GENL_ADMIN_PERM,
4696 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
4697 NL80211_FLAG_NEED_RTNL,
5255 }, 4698 },
5256 { 4699 {
5257 .cmd = NL80211_CMD_JOIN_IBSS, 4700 .cmd = NL80211_CMD_JOIN_IBSS,
5258 .doit = nl80211_join_ibss, 4701 .doit = nl80211_join_ibss,
5259 .policy = nl80211_policy, 4702 .policy = nl80211_policy,
5260 .flags = GENL_ADMIN_PERM, 4703 .flags = GENL_ADMIN_PERM,
4704 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
4705 NL80211_FLAG_NEED_RTNL,
5261 }, 4706 },
5262 { 4707 {
5263 .cmd = NL80211_CMD_LEAVE_IBSS, 4708 .cmd = NL80211_CMD_LEAVE_IBSS,
5264 .doit = nl80211_leave_ibss, 4709 .doit = nl80211_leave_ibss,
5265 .policy = nl80211_policy, 4710 .policy = nl80211_policy,
5266 .flags = GENL_ADMIN_PERM, 4711 .flags = GENL_ADMIN_PERM,
4712 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
4713 NL80211_FLAG_NEED_RTNL,
5267 }, 4714 },
5268#ifdef CONFIG_NL80211_TESTMODE 4715#ifdef CONFIG_NL80211_TESTMODE
5269 { 4716 {
@@ -5271,6 +4718,8 @@ static struct genl_ops nl80211_ops[] = {
5271 .doit = nl80211_testmode_do, 4718 .doit = nl80211_testmode_do,
5272 .policy = nl80211_policy, 4719 .policy = nl80211_policy,
5273 .flags = GENL_ADMIN_PERM, 4720 .flags = GENL_ADMIN_PERM,
4721 .internal_flags = NL80211_FLAG_NEED_WIPHY |
4722 NL80211_FLAG_NEED_RTNL,
5274 }, 4723 },
5275#endif 4724#endif
5276 { 4725 {
@@ -5278,18 +4727,24 @@ static struct genl_ops nl80211_ops[] = {
5278 .doit = nl80211_connect, 4727 .doit = nl80211_connect,
5279 .policy = nl80211_policy, 4728 .policy = nl80211_policy,
5280 .flags = GENL_ADMIN_PERM, 4729 .flags = GENL_ADMIN_PERM,
4730 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
4731 NL80211_FLAG_NEED_RTNL,
5281 }, 4732 },
5282 { 4733 {
5283 .cmd = NL80211_CMD_DISCONNECT, 4734 .cmd = NL80211_CMD_DISCONNECT,
5284 .doit = nl80211_disconnect, 4735 .doit = nl80211_disconnect,
5285 .policy = nl80211_policy, 4736 .policy = nl80211_policy,
5286 .flags = GENL_ADMIN_PERM, 4737 .flags = GENL_ADMIN_PERM,
4738 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
4739 NL80211_FLAG_NEED_RTNL,
5287 }, 4740 },
5288 { 4741 {
5289 .cmd = NL80211_CMD_SET_WIPHY_NETNS, 4742 .cmd = NL80211_CMD_SET_WIPHY_NETNS,
5290 .doit = nl80211_wiphy_netns, 4743 .doit = nl80211_wiphy_netns,
5291 .policy = nl80211_policy, 4744 .policy = nl80211_policy,
5292 .flags = GENL_ADMIN_PERM, 4745 .flags = GENL_ADMIN_PERM,
4746 .internal_flags = NL80211_FLAG_NEED_WIPHY |
4747 NL80211_FLAG_NEED_RTNL,
5293 }, 4748 },
5294 { 4749 {
5295 .cmd = NL80211_CMD_GET_SURVEY, 4750 .cmd = NL80211_CMD_GET_SURVEY,
@@ -5301,72 +4756,104 @@ static struct genl_ops nl80211_ops[] = {
5301 .doit = nl80211_setdel_pmksa, 4756 .doit = nl80211_setdel_pmksa,
5302 .policy = nl80211_policy, 4757 .policy = nl80211_policy,
5303 .flags = GENL_ADMIN_PERM, 4758 .flags = GENL_ADMIN_PERM,
4759 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4760 NL80211_FLAG_NEED_RTNL,
5304 }, 4761 },
5305 { 4762 {
5306 .cmd = NL80211_CMD_DEL_PMKSA, 4763 .cmd = NL80211_CMD_DEL_PMKSA,
5307 .doit = nl80211_setdel_pmksa, 4764 .doit = nl80211_setdel_pmksa,
5308 .policy = nl80211_policy, 4765 .policy = nl80211_policy,
5309 .flags = GENL_ADMIN_PERM, 4766 .flags = GENL_ADMIN_PERM,
4767 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4768 NL80211_FLAG_NEED_RTNL,
5310 }, 4769 },
5311 { 4770 {
5312 .cmd = NL80211_CMD_FLUSH_PMKSA, 4771 .cmd = NL80211_CMD_FLUSH_PMKSA,
5313 .doit = nl80211_flush_pmksa, 4772 .doit = nl80211_flush_pmksa,
5314 .policy = nl80211_policy, 4773 .policy = nl80211_policy,
5315 .flags = GENL_ADMIN_PERM, 4774 .flags = GENL_ADMIN_PERM,
4775 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4776 NL80211_FLAG_NEED_RTNL,
5316 }, 4777 },
5317 { 4778 {
5318 .cmd = NL80211_CMD_REMAIN_ON_CHANNEL, 4779 .cmd = NL80211_CMD_REMAIN_ON_CHANNEL,
5319 .doit = nl80211_remain_on_channel, 4780 .doit = nl80211_remain_on_channel,
5320 .policy = nl80211_policy, 4781 .policy = nl80211_policy,
5321 .flags = GENL_ADMIN_PERM, 4782 .flags = GENL_ADMIN_PERM,
4783 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
4784 NL80211_FLAG_NEED_RTNL,
5322 }, 4785 },
5323 { 4786 {
5324 .cmd = NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, 4787 .cmd = NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL,
5325 .doit = nl80211_cancel_remain_on_channel, 4788 .doit = nl80211_cancel_remain_on_channel,
5326 .policy = nl80211_policy, 4789 .policy = nl80211_policy,
5327 .flags = GENL_ADMIN_PERM, 4790 .flags = GENL_ADMIN_PERM,
4791 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
4792 NL80211_FLAG_NEED_RTNL,
5328 }, 4793 },
5329 { 4794 {
5330 .cmd = NL80211_CMD_SET_TX_BITRATE_MASK, 4795 .cmd = NL80211_CMD_SET_TX_BITRATE_MASK,
5331 .doit = nl80211_set_tx_bitrate_mask, 4796 .doit = nl80211_set_tx_bitrate_mask,
5332 .policy = nl80211_policy, 4797 .policy = nl80211_policy,
5333 .flags = GENL_ADMIN_PERM, 4798 .flags = GENL_ADMIN_PERM,
4799 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4800 NL80211_FLAG_NEED_RTNL,
5334 }, 4801 },
5335 { 4802 {
5336 .cmd = NL80211_CMD_REGISTER_ACTION, 4803 .cmd = NL80211_CMD_REGISTER_FRAME,
5337 .doit = nl80211_register_action, 4804 .doit = nl80211_register_mgmt,
5338 .policy = nl80211_policy, 4805 .policy = nl80211_policy,
5339 .flags = GENL_ADMIN_PERM, 4806 .flags = GENL_ADMIN_PERM,
4807 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4808 NL80211_FLAG_NEED_RTNL,
5340 }, 4809 },
5341 { 4810 {
5342 .cmd = NL80211_CMD_ACTION, 4811 .cmd = NL80211_CMD_FRAME,
5343 .doit = nl80211_action, 4812 .doit = nl80211_tx_mgmt,
5344 .policy = nl80211_policy, 4813 .policy = nl80211_policy,
5345 .flags = GENL_ADMIN_PERM, 4814 .flags = GENL_ADMIN_PERM,
4815 .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
4816 NL80211_FLAG_NEED_RTNL,
5346 }, 4817 },
5347 { 4818 {
5348 .cmd = NL80211_CMD_SET_POWER_SAVE, 4819 .cmd = NL80211_CMD_SET_POWER_SAVE,
5349 .doit = nl80211_set_power_save, 4820 .doit = nl80211_set_power_save,
5350 .policy = nl80211_policy, 4821 .policy = nl80211_policy,
5351 .flags = GENL_ADMIN_PERM, 4822 .flags = GENL_ADMIN_PERM,
4823 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4824 NL80211_FLAG_NEED_RTNL,
5352 }, 4825 },
5353 { 4826 {
5354 .cmd = NL80211_CMD_GET_POWER_SAVE, 4827 .cmd = NL80211_CMD_GET_POWER_SAVE,
5355 .doit = nl80211_get_power_save, 4828 .doit = nl80211_get_power_save,
5356 .policy = nl80211_policy, 4829 .policy = nl80211_policy,
5357 /* can be retrieved by unprivileged users */ 4830 /* can be retrieved by unprivileged users */
4831 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4832 NL80211_FLAG_NEED_RTNL,
5358 }, 4833 },
5359 { 4834 {
5360 .cmd = NL80211_CMD_SET_CQM, 4835 .cmd = NL80211_CMD_SET_CQM,
5361 .doit = nl80211_set_cqm, 4836 .doit = nl80211_set_cqm,
5362 .policy = nl80211_policy, 4837 .policy = nl80211_policy,
5363 .flags = GENL_ADMIN_PERM, 4838 .flags = GENL_ADMIN_PERM,
4839 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4840 NL80211_FLAG_NEED_RTNL,
5364 }, 4841 },
5365 { 4842 {
5366 .cmd = NL80211_CMD_SET_CHANNEL, 4843 .cmd = NL80211_CMD_SET_CHANNEL,
5367 .doit = nl80211_set_channel, 4844 .doit = nl80211_set_channel,
5368 .policy = nl80211_policy, 4845 .policy = nl80211_policy,
5369 .flags = GENL_ADMIN_PERM, 4846 .flags = GENL_ADMIN_PERM,
4847 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4848 NL80211_FLAG_NEED_RTNL,
4849 },
4850 {
4851 .cmd = NL80211_CMD_SET_WDS_PEER,
4852 .doit = nl80211_set_wds_peer,
4853 .policy = nl80211_policy,
4854 .flags = GENL_ADMIN_PERM,
4855 .internal_flags = NL80211_FLAG_NEED_NETDEV |
4856 NL80211_FLAG_NEED_RTNL,
5370 }, 4857 },
5371}; 4858};
5372 4859
@@ -6040,9 +5527,9 @@ void nl80211_send_sta_event(struct cfg80211_registered_device *rdev,
6040 nl80211_mlme_mcgrp.id, gfp); 5527 nl80211_mlme_mcgrp.id, gfp);
6041} 5528}
6042 5529
6043int nl80211_send_action(struct cfg80211_registered_device *rdev, 5530int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
6044 struct net_device *netdev, u32 nlpid, 5531 struct net_device *netdev, u32 nlpid,
6045 int freq, const u8 *buf, size_t len, gfp_t gfp) 5532 int freq, const u8 *buf, size_t len, gfp_t gfp)
6046{ 5533{
6047 struct sk_buff *msg; 5534 struct sk_buff *msg;
6048 void *hdr; 5535 void *hdr;
@@ -6052,7 +5539,7 @@ int nl80211_send_action(struct cfg80211_registered_device *rdev,
6052 if (!msg) 5539 if (!msg)
6053 return -ENOMEM; 5540 return -ENOMEM;
6054 5541
6055 hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_ACTION); 5542 hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME);
6056 if (!hdr) { 5543 if (!hdr) {
6057 nlmsg_free(msg); 5544 nlmsg_free(msg);
6058 return -ENOMEM; 5545 return -ENOMEM;
@@ -6080,10 +5567,10 @@ int nl80211_send_action(struct cfg80211_registered_device *rdev,
6080 return -ENOBUFS; 5567 return -ENOBUFS;
6081} 5568}
6082 5569
6083void nl80211_send_action_tx_status(struct cfg80211_registered_device *rdev, 5570void nl80211_send_mgmt_tx_status(struct cfg80211_registered_device *rdev,
6084 struct net_device *netdev, u64 cookie, 5571 struct net_device *netdev, u64 cookie,
6085 const u8 *buf, size_t len, bool ack, 5572 const u8 *buf, size_t len, bool ack,
6086 gfp_t gfp) 5573 gfp_t gfp)
6087{ 5574{
6088 struct sk_buff *msg; 5575 struct sk_buff *msg;
6089 void *hdr; 5576 void *hdr;
@@ -6092,7 +5579,7 @@ void nl80211_send_action_tx_status(struct cfg80211_registered_device *rdev,
6092 if (!msg) 5579 if (!msg)
6093 return; 5580 return;
6094 5581
6095 hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_ACTION_TX_STATUS); 5582 hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME_TX_STATUS);
6096 if (!hdr) { 5583 if (!hdr) {
6097 nlmsg_free(msg); 5584 nlmsg_free(msg);
6098 return; 5585 return;
@@ -6179,7 +5666,7 @@ static int nl80211_netlink_notify(struct notifier_block * nb,
6179 5666
6180 list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list) 5667 list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list)
6181 list_for_each_entry_rcu(wdev, &rdev->netdev_list, list) 5668 list_for_each_entry_rcu(wdev, &rdev->netdev_list, list)
6182 cfg80211_mlme_unregister_actions(wdev, notify->pid); 5669 cfg80211_mlme_unregister_socket(wdev, notify->pid);
6183 5670
6184 rcu_read_unlock(); 5671 rcu_read_unlock();
6185 5672
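
The new .internal_flags values above (NL80211_FLAG_NEED_NETDEV, NL80211_FLAG_NEED_NETDEV_UP, NL80211_FLAG_NEED_RTNL) move the "look up the netdev, take the RTNL" boilerplate out of every handler and into a single pre/post hook that reads these flags, presumably added elsewhere in this patch. The toy dispatcher below is a minimal, userspace-only sketch of that pattern; the names op_entry, NEED_NETDEV, NEED_RTNL and dispatch() are made up, not the kernel's.

```c
#include <stdio.h>

#define NEED_NETDEV (1 << 0)
#define NEED_RTNL   (1 << 1)

struct op_entry {
	const char *name;
	int (*doit)(void);
	unsigned int internal_flags;
};

static int do_get_power_save(void) { puts("get_power_save"); return 0; }
static int do_set_channel(void)    { puts("set_channel");    return 0; }

static const struct op_entry ops[] = {
	{ "GET_POWER_SAVE", do_get_power_save, NEED_NETDEV | NEED_RTNL },
	{ "SET_CHANNEL",    do_set_channel,    NEED_NETDEV | NEED_RTNL },
};

/* One central place performs the per-flag setup and teardown. */
static int dispatch(const struct op_entry *op)
{
	int ret;

	if (op->internal_flags & NEED_RTNL)
		printf("[lock rtnl]\n");
	if (op->internal_flags & NEED_NETDEV)
		printf("[resolve netdev]\n");
	ret = op->doit();
	if (op->internal_flags & NEED_RTNL)
		printf("[unlock rtnl]\n");
	return ret;
}

int main(void)
{
	unsigned int i;

	for (i = 0; i < sizeof(ops) / sizeof(ops[0]); i++)
		dispatch(&ops[i]);
	return 0;
}
```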
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index 2ad7fbc7d9f1..30d2f939150d 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -74,13 +74,13 @@ void nl80211_send_sta_event(struct cfg80211_registered_device *rdev,
74 struct net_device *dev, const u8 *mac_addr, 74 struct net_device *dev, const u8 *mac_addr,
75 struct station_info *sinfo, gfp_t gfp); 75 struct station_info *sinfo, gfp_t gfp);
76 76
77int nl80211_send_action(struct cfg80211_registered_device *rdev, 77int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
78 struct net_device *netdev, u32 nlpid, int freq, 78 struct net_device *netdev, u32 nlpid, int freq,
79 const u8 *buf, size_t len, gfp_t gfp); 79 const u8 *buf, size_t len, gfp_t gfp);
80void nl80211_send_action_tx_status(struct cfg80211_registered_device *rdev, 80void nl80211_send_mgmt_tx_status(struct cfg80211_registered_device *rdev,
81 struct net_device *netdev, u64 cookie, 81 struct net_device *netdev, u64 cookie,
82 const u8 *buf, size_t len, bool ack, 82 const u8 *buf, size_t len, bool ack,
83 gfp_t gfp); 83 gfp_t gfp);
84 84
85void 85void
86nl80211_send_cqm_rssi_notify(struct cfg80211_registered_device *rdev, 86nl80211_send_cqm_rssi_notify(struct cfg80211_registered_device *rdev,
diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c
index 1332c445d1c7..dbe35e138e94 100644
--- a/net/wireless/radiotap.c
+++ b/net/wireless/radiotap.c
@@ -14,6 +14,7 @@
14 * See COPYING for more details. 14 * See COPYING for more details.
15 */ 15 */
16 16
17#include <linux/kernel.h>
17#include <net/cfg80211.h> 18#include <net/cfg80211.h>
18#include <net/ieee80211_radiotap.h> 19#include <net/ieee80211_radiotap.h>
19#include <asm/unaligned.h> 20#include <asm/unaligned.h>
@@ -45,7 +46,7 @@ static const struct radiotap_align_size rtap_namespace_sizes[] = {
45}; 46};
46 47
47static const struct ieee80211_radiotap_namespace radiotap_ns = { 48static const struct ieee80211_radiotap_namespace radiotap_ns = {
48 .n_bits = sizeof(rtap_namespace_sizes) / sizeof(rtap_namespace_sizes[0]), 49 .n_bits = ARRAY_SIZE(rtap_namespace_sizes),
49 .align_size = rtap_namespace_sizes, 50 .align_size = rtap_namespace_sizes,
50}; 51};
51 52
@@ -200,7 +201,7 @@ int ieee80211_radiotap_iterator_next(
200{ 201{
201 while (1) { 202 while (1) {
202 int hit = 0; 203 int hit = 0;
203 int pad, align, size, subns, vnslen; 204 int pad, align, size, subns;
204 uint32_t oui; 205 uint32_t oui;
205 206
206 /* if no more EXT bits, that's it */ 207 /* if no more EXT bits, that's it */
@@ -260,6 +261,27 @@ int ieee80211_radiotap_iterator_next(
260 if (pad) 261 if (pad)
261 iterator->_arg += align - pad; 262 iterator->_arg += align - pad;
262 263
264 if (iterator->_arg_index % 32 == IEEE80211_RADIOTAP_VENDOR_NAMESPACE) {
265 int vnslen;
266
267 if ((unsigned long)iterator->_arg + size -
268 (unsigned long)iterator->_rtheader >
269 (unsigned long)iterator->_max_length)
270 return -EINVAL;
271
272 oui = (*iterator->_arg << 16) |
273 (*(iterator->_arg + 1) << 8) |
274 *(iterator->_arg + 2);
275 subns = *(iterator->_arg + 3);
276
277 find_ns(iterator, oui, subns);
278
279 vnslen = get_unaligned_le16(iterator->_arg + 4);
280 iterator->_next_ns_data = iterator->_arg + size + vnslen;
281 if (!iterator->current_namespace)
282 size += vnslen;
283 }
284
263 /* 285 /*
264 * this is what we will return to user, but we need to 286 * this is what we will return to user, but we need to
265 * move on first so next call has something fresh to test 287 * move on first so next call has something fresh to test
@@ -286,40 +308,25 @@ int ieee80211_radiotap_iterator_next(
286 /* these special ones are valid in each bitmap word */ 308 /* these special ones are valid in each bitmap word */
287 switch (iterator->_arg_index % 32) { 309 switch (iterator->_arg_index % 32) {
288 case IEEE80211_RADIOTAP_VENDOR_NAMESPACE: 310 case IEEE80211_RADIOTAP_VENDOR_NAMESPACE:
289 iterator->_bitmap_shifter >>= 1;
290 iterator->_arg_index++;
291
292 iterator->_reset_on_ext = 1; 311 iterator->_reset_on_ext = 1;
293 312
294 vnslen = get_unaligned_le16(iterator->this_arg + 4);
295 iterator->_next_ns_data = iterator->_arg + vnslen;
296 oui = (*iterator->this_arg << 16) |
297 (*(iterator->this_arg + 1) << 8) |
298 *(iterator->this_arg + 2);
299 subns = *(iterator->this_arg + 3);
300
301 find_ns(iterator, oui, subns);
302
303 iterator->is_radiotap_ns = 0; 313 iterator->is_radiotap_ns = 0;
304 /* allow parsers to show this information */ 314 /*
315 * If parser didn't register this vendor
316 * namespace with us, allow it to show it
 317 * as 'raw'. To do that, set the argument index
318 * to vendor namespace.
319 */
305 iterator->this_arg_index = 320 iterator->this_arg_index =
306 IEEE80211_RADIOTAP_VENDOR_NAMESPACE; 321 IEEE80211_RADIOTAP_VENDOR_NAMESPACE;
307 iterator->this_arg_size += vnslen; 322 if (!iterator->current_namespace)
308 if ((unsigned long)iterator->this_arg + 323 hit = 1;
309 iterator->this_arg_size - 324 goto next_entry;
310 (unsigned long)iterator->_rtheader >
311 (unsigned long)(unsigned long)iterator->_max_length)
312 return -EINVAL;
313 hit = 1;
314 break;
315 case IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE: 325 case IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE:
316 iterator->_bitmap_shifter >>= 1;
317 iterator->_arg_index++;
318
319 iterator->_reset_on_ext = 1; 326 iterator->_reset_on_ext = 1;
320 iterator->current_namespace = &radiotap_ns; 327 iterator->current_namespace = &radiotap_ns;
321 iterator->is_radiotap_ns = 1; 328 iterator->is_radiotap_ns = 1;
322 break; 329 goto next_entry;
323 case IEEE80211_RADIOTAP_EXT: 330 case IEEE80211_RADIOTAP_EXT:
324 /* 331 /*
325 * bit 31 was set, there is more 332 * bit 31 was set, there is more
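
The rewritten vendor-namespace handling above decodes the 6-byte argument header directly: a 3-byte OUI in big-endian byte order, a one-byte sub-namespace, and a little-endian 16-bit length that tells the iterator how far _next_ns_data lies beyond the current argument. A minimal standalone sketch of that decoding, with hypothetical sample bytes:

```c
#include <stdint.h>
#include <stdio.h>

/* Byte-order helper analogous to get_unaligned_le16(). */
static uint16_t read_le16(const uint8_t *p)
{
	return (uint16_t)(p[0] | (p[1] << 8));
}

int main(void)
{
	/* Hypothetical vendor-namespace argument: OUI, sub-namespace, length. */
	const uint8_t arg[6] = { 0x00, 0x10, 0x18, 0x02, 0x0a, 0x00 };

	uint32_t oui    = (arg[0] << 16) | (arg[1] << 8) | arg[2];
	uint8_t  subns  = arg[3];
	uint16_t vnslen = read_le16(&arg[4]);	/* vendor data that follows */

	printf("oui=%06x subns=%u skip=%u bytes\n",
	       (unsigned)oui, (unsigned)subns, (unsigned)vnslen);
	return 0;
}
```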
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index f180db0de66c..4b9f8912526c 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -36,6 +36,7 @@
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/list.h> 37#include <linux/list.h>
38#include <linux/random.h> 38#include <linux/random.h>
39#include <linux/ctype.h>
39#include <linux/nl80211.h> 40#include <linux/nl80211.h>
40#include <linux/platform_device.h> 41#include <linux/platform_device.h>
41#include <net/cfg80211.h> 42#include <net/cfg80211.h>
@@ -73,7 +74,11 @@ const struct ieee80211_regdomain *cfg80211_regdomain;
73 * - last_request 74 * - last_request
74 */ 75 */
75static DEFINE_MUTEX(reg_mutex); 76static DEFINE_MUTEX(reg_mutex);
76#define assert_reg_lock() WARN_ON(!mutex_is_locked(&reg_mutex)) 77
78static inline void assert_reg_lock(void)
79{
80 lockdep_assert_held(&reg_mutex);
81}
77 82
78/* Used to queue up regulatory hints */ 83/* Used to queue up regulatory hints */
79static LIST_HEAD(reg_requests_list); 84static LIST_HEAD(reg_requests_list);
@@ -181,14 +186,6 @@ static bool is_alpha2_set(const char *alpha2)
181 return false; 186 return false;
182} 187}
183 188
184static bool is_alpha_upper(char letter)
185{
186 /* ASCII A - Z */
187 if (letter >= 65 && letter <= 90)
188 return true;
189 return false;
190}
191
192static bool is_unknown_alpha2(const char *alpha2) 189static bool is_unknown_alpha2(const char *alpha2)
193{ 190{
194 if (!alpha2) 191 if (!alpha2)
@@ -220,7 +217,7 @@ static bool is_an_alpha2(const char *alpha2)
220{ 217{
221 if (!alpha2) 218 if (!alpha2)
222 return false; 219 return false;
223 if (is_alpha_upper(alpha2[0]) && is_alpha_upper(alpha2[1])) 220 if (isalpha(alpha2[0]) && isalpha(alpha2[1]))
224 return true; 221 return true;
225 return false; 222 return false;
226} 223}
@@ -1170,7 +1167,7 @@ static int ignore_request(struct wiphy *wiphy,
1170 return 0; 1167 return 0;
1171 return -EALREADY; 1168 return -EALREADY;
1172 } 1169 }
1173 return REG_INTERSECT; 1170 return 0;
1174 case NL80211_REGDOM_SET_BY_DRIVER: 1171 case NL80211_REGDOM_SET_BY_DRIVER:
1175 if (last_request->initiator == NL80211_REGDOM_SET_BY_CORE) { 1172 if (last_request->initiator == NL80211_REGDOM_SET_BY_CORE) {
1176 if (regdom_changes(pending_request->alpha2)) 1173 if (regdom_changes(pending_request->alpha2))
@@ -1399,6 +1396,11 @@ static DECLARE_WORK(reg_work, reg_todo);
1399 1396
1400static void queue_regulatory_request(struct regulatory_request *request) 1397static void queue_regulatory_request(struct regulatory_request *request)
1401{ 1398{
1399 if (isalpha(request->alpha2[0]))
1400 request->alpha2[0] = toupper(request->alpha2[0]);
1401 if (isalpha(request->alpha2[1]))
1402 request->alpha2[1] = toupper(request->alpha2[1]);
1403
1402 spin_lock(&reg_requests_lock); 1404 spin_lock(&reg_requests_lock);
1403 list_add_tail(&request->list, &reg_requests_list); 1405 list_add_tail(&request->list, &reg_requests_list);
1404 spin_unlock(&reg_requests_lock); 1406 spin_unlock(&reg_requests_lock);
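
reg.c drops the hand-rolled is_alpha_upper() in favour of isalpha() from <linux/ctype.h>, and since isalpha() also accepts lower-case letters, queue_regulatory_request() now upper-cases the alpha2 hint before queueing it, apparently so the rest of the code can keep assuming upper-case country codes. A small userspace sketch of the same normalisation:

```c
#include <ctype.h>
#include <stdio.h>

/* Accept either case for the two-letter country code and normalise it
 * to upper case, testing validity with isalpha() rather than a
 * hand-rolled "is it A..Z" helper. */
static void normalize_alpha2(char alpha2[2])
{
	if (isalpha((unsigned char)alpha2[0]))
		alpha2[0] = toupper((unsigned char)alpha2[0]);
	if (isalpha((unsigned char)alpha2[1]))
		alpha2[1] = toupper((unsigned char)alpha2[1]);
}

int main(void)
{
	char cc[2] = { 'd', 'e' };	/* hypothetical regulatory hint "de" */

	normalize_alpha2(cc);
	printf("%c%c valid=%d\n", cc[0], cc[1],
	       isalpha((unsigned char)cc[0]) && isalpha((unsigned char)cc[1]));
	return 0;
}
```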
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 5ca8c7180141..503ebb86ba18 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -650,14 +650,14 @@ void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *pub)
650 bss = container_of(pub, struct cfg80211_internal_bss, pub); 650 bss = container_of(pub, struct cfg80211_internal_bss, pub);
651 651
652 spin_lock_bh(&dev->bss_lock); 652 spin_lock_bh(&dev->bss_lock);
653 if (!list_empty(&bss->list)) {
654 list_del_init(&bss->list);
655 dev->bss_generation++;
656 rb_erase(&bss->rbn, &dev->bss_tree);
653 657
654 list_del(&bss->list); 658 kref_put(&bss->ref, bss_release);
655 dev->bss_generation++; 659 }
656 rb_erase(&bss->rbn, &dev->bss_tree);
657
658 spin_unlock_bh(&dev->bss_lock); 660 spin_unlock_bh(&dev->bss_lock);
659
660 kref_put(&bss->ref, bss_release);
661} 661}
662EXPORT_SYMBOL(cfg80211_unlink_bss); 662EXPORT_SYMBOL(cfg80211_unlink_bss);
663 663
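
The cfg80211_unlink_bss() change drops the list's reference while still holding bss_lock, and only if the entry is actually still linked, so a racing expiry cannot put the same reference twice. The code below is a userspace analogue of that pattern only; the struct, lock and helpers are hypothetical.

```c
/* Build with: cc -pthread unlink.c */
#include <pthread.h>
#include <stdio.h>

struct entry {
	int on_list;
	int refcount;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void put_entry(struct entry *e)
{
	if (--e->refcount == 0)
		printf("released\n");
}

static void unlink_entry(struct entry *e)
{
	pthread_mutex_lock(&list_lock);
	if (e->on_list) {
		e->on_list = 0;		/* list_del_init() + rb_erase() */
		put_entry(e);		/* drop the list's own reference */
	}
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	struct entry e = { .on_list = 1, .refcount = 2 };

	unlink_entry(&e);
	unlink_entry(&e);	/* second call is now a harmless no-op */
	put_entry(&e);		/* caller drops its own reference */
	return 0;
}
```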
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index a8c2d6b877ae..e17b0bee6bdc 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -411,7 +411,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
411 411
412 ASSERT_WDEV_LOCK(wdev); 412 ASSERT_WDEV_LOCK(wdev);
413 413
414 if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) 414 if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
415 wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
415 return; 416 return;
416 417
417 if (wdev->sme_state != CFG80211_SME_CONNECTING) 418 if (wdev->sme_state != CFG80211_SME_CONNECTING)
@@ -548,7 +549,8 @@ void __cfg80211_roamed(struct wireless_dev *wdev, const u8 *bssid,
548 549
549 ASSERT_WDEV_LOCK(wdev); 550 ASSERT_WDEV_LOCK(wdev);
550 551
551 if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) 552 if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
553 wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
552 return; 554 return;
553 555
554 if (wdev->sme_state != CFG80211_SME_CONNECTED) 556 if (wdev->sme_state != CFG80211_SME_CONNECTED)
@@ -644,7 +646,8 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
644 646
645 ASSERT_WDEV_LOCK(wdev); 647 ASSERT_WDEV_LOCK(wdev);
646 648
647 if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) 649 if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
650 wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
648 return; 651 return;
649 652
650 if (wdev->sme_state != CFG80211_SME_CONNECTED) 653 if (wdev->sme_state != CFG80211_SME_CONNECTED)
@@ -695,7 +698,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
695 */ 698 */
696 if (rdev->ops->del_key) 699 if (rdev->ops->del_key)
697 for (i = 0; i < 6; i++) 700 for (i = 0; i < 6; i++)
698 rdev->ops->del_key(wdev->wiphy, dev, i, NULL); 701 rdev->ops->del_key(wdev->wiphy, dev, i, false, NULL);
699 702
700#ifdef CONFIG_CFG80211_WEXT 703#ifdef CONFIG_CFG80211_WEXT
701 memset(&wrqu, 0, sizeof(wrqu)); 704 memset(&wrqu, 0, sizeof(wrqu));
diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
index 9f2cef3e0ca0..4294fa22bb2d 100644
--- a/net/wireless/sysfs.c
+++ b/net/wireless/sysfs.c
@@ -35,6 +35,14 @@ SHOW_FMT(index, "%d", wiphy_idx);
35SHOW_FMT(macaddress, "%pM", wiphy.perm_addr); 35SHOW_FMT(macaddress, "%pM", wiphy.perm_addr);
36SHOW_FMT(address_mask, "%pM", wiphy.addr_mask); 36SHOW_FMT(address_mask, "%pM", wiphy.addr_mask);
37 37
38static ssize_t name_show(struct device *dev,
39 struct device_attribute *attr,
40 char *buf) {
41 struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy;
42 return sprintf(buf, "%s\n", dev_name(&wiphy->dev));
43}
44
45
38static ssize_t addresses_show(struct device *dev, 46static ssize_t addresses_show(struct device *dev,
39 struct device_attribute *attr, 47 struct device_attribute *attr,
40 char *buf) 48 char *buf)
@@ -57,6 +65,7 @@ static struct device_attribute ieee80211_dev_attrs[] = {
57 __ATTR_RO(macaddress), 65 __ATTR_RO(macaddress),
58 __ATTR_RO(address_mask), 66 __ATTR_RO(address_mask),
59 __ATTR_RO(addresses), 67 __ATTR_RO(addresses),
68 __ATTR_RO(name),
60 {} 69 {}
61}; 70};
62 71
@@ -110,6 +119,13 @@ static int wiphy_resume(struct device *dev)
110 return ret; 119 return ret;
111} 120}
112 121
122static const void *wiphy_namespace(struct device *d)
123{
124 struct wiphy *wiphy = container_of(d, struct wiphy, dev);
125
126 return wiphy_net(wiphy);
127}
128
113struct class ieee80211_class = { 129struct class ieee80211_class = {
114 .name = "ieee80211", 130 .name = "ieee80211",
115 .owner = THIS_MODULE, 131 .owner = THIS_MODULE,
@@ -120,6 +136,8 @@ struct class ieee80211_class = {
120#endif 136#endif
121 .suspend = wiphy_suspend, 137 .suspend = wiphy_suspend,
122 .resume = wiphy_resume, 138 .resume = wiphy_resume,
139 .ns_type = &net_ns_type_operations,
140 .namespace = wiphy_namespace,
123}; 141};
124 142
125int wiphy_sysfs_init(void) 143int wiphy_sysfs_init(void)
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 0c8a1e8b7690..76120aeda57d 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -144,19 +144,25 @@ void ieee80211_set_bitrate_flags(struct wiphy *wiphy)
144 144
145int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, 145int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
146 struct key_params *params, int key_idx, 146 struct key_params *params, int key_idx,
147 const u8 *mac_addr) 147 bool pairwise, const u8 *mac_addr)
148{ 148{
149 int i; 149 int i;
150 150
151 if (key_idx > 5) 151 if (key_idx > 5)
152 return -EINVAL; 152 return -EINVAL;
153 153
154 if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
155 return -EINVAL;
156
157 if (pairwise && !mac_addr)
158 return -EINVAL;
159
154 /* 160 /*
155 * Disallow pairwise keys with non-zero index unless it's WEP 161 * Disallow pairwise keys with non-zero index unless it's WEP
156 * (because current deployments use pairwise WEP keys with 162 * (because current deployments use pairwise WEP keys with
157 * non-zero indizes but 802.11i clearly specifies to use zero) 163 * non-zero indizes but 802.11i clearly specifies to use zero)
158 */ 164 */
159 if (mac_addr && key_idx && 165 if (pairwise && key_idx &&
160 params->cipher != WLAN_CIPHER_SUITE_WEP40 && 166 params->cipher != WLAN_CIPHER_SUITE_WEP40 &&
161 params->cipher != WLAN_CIPHER_SUITE_WEP104) 167 params->cipher != WLAN_CIPHER_SUITE_WEP104)
162 return -EINVAL; 168 return -EINVAL;
@@ -183,7 +189,14 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
183 return -EINVAL; 189 return -EINVAL;
184 break; 190 break;
185 default: 191 default:
186 return -EINVAL; 192 /*
193 * We don't know anything about this algorithm,
194 * allow using it -- but the driver must check
195 * all parameters! We still check below whether
196 * or not the driver supports this algorithm,
197 * of course.
198 */
199 break;
187 } 200 }
188 201
189 if (params->seq) { 202 if (params->seq) {
@@ -221,7 +234,7 @@ const unsigned char bridge_tunnel_header[] __aligned(2) =
221 { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 }; 234 { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 };
222EXPORT_SYMBOL(bridge_tunnel_header); 235EXPORT_SYMBOL(bridge_tunnel_header);
223 236
224unsigned int ieee80211_hdrlen(__le16 fc) 237unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc)
225{ 238{
226 unsigned int hdrlen = 24; 239 unsigned int hdrlen = 24;
227 240
@@ -319,7 +332,8 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
319 cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) { 332 cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) {
320 case cpu_to_le16(IEEE80211_FCTL_TODS): 333 case cpu_to_le16(IEEE80211_FCTL_TODS):
321 if (unlikely(iftype != NL80211_IFTYPE_AP && 334 if (unlikely(iftype != NL80211_IFTYPE_AP &&
322 iftype != NL80211_IFTYPE_AP_VLAN)) 335 iftype != NL80211_IFTYPE_AP_VLAN &&
336 iftype != NL80211_IFTYPE_P2P_GO))
323 return -1; 337 return -1;
324 break; 338 break;
325 case cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS): 339 case cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS):
@@ -347,7 +361,8 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
347 break; 361 break;
348 case cpu_to_le16(IEEE80211_FCTL_FROMDS): 362 case cpu_to_le16(IEEE80211_FCTL_FROMDS):
349 if ((iftype != NL80211_IFTYPE_STATION && 363 if ((iftype != NL80211_IFTYPE_STATION &&
350 iftype != NL80211_IFTYPE_MESH_POINT) || 364 iftype != NL80211_IFTYPE_P2P_CLIENT &&
365 iftype != NL80211_IFTYPE_MESH_POINT) ||
351 (is_multicast_ether_addr(dst) && 366 (is_multicast_ether_addr(dst) &&
352 !compare_ether_addr(src, addr))) 367 !compare_ether_addr(src, addr)))
353 return -1; 368 return -1;
@@ -424,6 +439,7 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
424 switch (iftype) { 439 switch (iftype) {
425 case NL80211_IFTYPE_AP: 440 case NL80211_IFTYPE_AP:
426 case NL80211_IFTYPE_AP_VLAN: 441 case NL80211_IFTYPE_AP_VLAN:
442 case NL80211_IFTYPE_P2P_GO:
427 fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); 443 fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS);
428 /* DA BSSID SA */ 444 /* DA BSSID SA */
429 memcpy(hdr.addr1, skb->data, ETH_ALEN); 445 memcpy(hdr.addr1, skb->data, ETH_ALEN);
@@ -432,6 +448,7 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
432 hdrlen = 24; 448 hdrlen = 24;
433 break; 449 break;
434 case NL80211_IFTYPE_STATION: 450 case NL80211_IFTYPE_STATION:
451 case NL80211_IFTYPE_P2P_CLIENT:
435 fc |= cpu_to_le16(IEEE80211_FCTL_TODS); 452 fc |= cpu_to_le16(IEEE80211_FCTL_TODS);
436 /* BSSID SA DA */ 453 /* BSSID SA DA */
437 memcpy(hdr.addr1, bssid, ETH_ALEN); 454 memcpy(hdr.addr1, bssid, ETH_ALEN);
@@ -666,7 +683,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev)
666 for (i = 0; i < 6; i++) { 683 for (i = 0; i < 6; i++) {
667 if (!wdev->connect_keys->params[i].cipher) 684 if (!wdev->connect_keys->params[i].cipher)
668 continue; 685 continue;
669 if (rdev->ops->add_key(wdev->wiphy, dev, i, NULL, 686 if (rdev->ops->add_key(wdev->wiphy, dev, i, false, NULL,
670 &wdev->connect_keys->params[i])) { 687 &wdev->connect_keys->params[i])) {
671 printk(KERN_ERR "%s: failed to set key %d\n", 688 printk(KERN_ERR "%s: failed to set key %d\n",
672 dev->name, i); 689 dev->name, i);
@@ -771,7 +788,9 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
771 788
772 /* if it's part of a bridge, reject changing type to station/ibss */ 789 /* if it's part of a bridge, reject changing type to station/ibss */
773 if ((dev->priv_flags & IFF_BRIDGE_PORT) && 790 if ((dev->priv_flags & IFF_BRIDGE_PORT) &&
774 (ntype == NL80211_IFTYPE_ADHOC || ntype == NL80211_IFTYPE_STATION)) 791 (ntype == NL80211_IFTYPE_ADHOC ||
792 ntype == NL80211_IFTYPE_STATION ||
793 ntype == NL80211_IFTYPE_P2P_CLIENT))
775 return -EBUSY; 794 return -EBUSY;
776 795
777 if (ntype != otype) { 796 if (ntype != otype) {
@@ -782,6 +801,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
782 cfg80211_leave_ibss(rdev, dev, false); 801 cfg80211_leave_ibss(rdev, dev, false);
783 break; 802 break;
784 case NL80211_IFTYPE_STATION: 803 case NL80211_IFTYPE_STATION:
804 case NL80211_IFTYPE_P2P_CLIENT:
785 cfg80211_disconnect(rdev, dev, 805 cfg80211_disconnect(rdev, dev,
786 WLAN_REASON_DEAUTH_LEAVING, true); 806 WLAN_REASON_DEAUTH_LEAVING, true);
787 break; 807 break;
@@ -810,9 +830,11 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
810 if (dev->ieee80211_ptr->use_4addr) 830 if (dev->ieee80211_ptr->use_4addr)
811 break; 831 break;
812 /* fall through */ 832 /* fall through */
833 case NL80211_IFTYPE_P2P_CLIENT:
813 case NL80211_IFTYPE_ADHOC: 834 case NL80211_IFTYPE_ADHOC:
814 dev->priv_flags |= IFF_DONT_BRIDGE; 835 dev->priv_flags |= IFF_DONT_BRIDGE;
815 break; 836 break;
837 case NL80211_IFTYPE_P2P_GO:
816 case NL80211_IFTYPE_AP: 838 case NL80211_IFTYPE_AP:
817 case NL80211_IFTYPE_AP_VLAN: 839 case NL80211_IFTYPE_AP_VLAN:
818 case NL80211_IFTYPE_WDS: 840 case NL80211_IFTYPE_WDS:
@@ -823,7 +845,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
823 /* monitor can't bridge anyway */ 845 /* monitor can't bridge anyway */
824 break; 846 break;
825 case NL80211_IFTYPE_UNSPECIFIED: 847 case NL80211_IFTYPE_UNSPECIFIED:
826 case __NL80211_IFTYPE_AFTER_LAST: 848 case NUM_NL80211_IFTYPES:
827 /* not happening */ 849 /* not happening */
828 break; 850 break;
829 } 851 }
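
cfg80211_validate_key_settings() now takes an explicit pairwise flag instead of inferring the key type from the MAC address: a pairwise key must carry a peer address, and a group key addressed to a specific station is only allowed when the wiphy advertises WIPHY_FLAG_IBSS_RSN. A simplified, self-contained sketch of those argument checks, returning -1 instead of -EINVAL, with a plain bool standing in for the wiphy flag:

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static int validate_key(int key_idx, bool pairwise, const unsigned char *mac,
			bool supports_ibss_rsn)
{
	if (key_idx > 5)
		return -1;
	/* group key bound to a station needs per-station group key support */
	if (!pairwise && mac && !supports_ibss_rsn)
		return -1;
	/* pairwise keys always need a peer address */
	if (pairwise && !mac)
		return -1;
	return 0;
}

int main(void)
{
	const unsigned char peer[6] = { 0x02, 0, 0, 0, 0, 1 };

	printf("%d\n", validate_key(0, true,  peer, false));	/* ok */
	printf("%d\n", validate_key(0, true,  NULL, false));	/* rejected */
	printf("%d\n", validate_key(1, false, peer, false));	/* rejected */
	return 0;
}
```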
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index bb5e0a5ecfa1..12222ee6ebf2 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -432,14 +432,17 @@ int cfg80211_wext_giwretry(struct net_device *dev,
432EXPORT_SYMBOL_GPL(cfg80211_wext_giwretry); 432EXPORT_SYMBOL_GPL(cfg80211_wext_giwretry);
433 433
434static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, 434static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
435 struct net_device *dev, const u8 *addr, 435 struct net_device *dev, bool pairwise,
436 bool remove, bool tx_key, int idx, 436 const u8 *addr, bool remove, bool tx_key,
437 struct key_params *params) 437 int idx, struct key_params *params)
438{ 438{
439 struct wireless_dev *wdev = dev->ieee80211_ptr; 439 struct wireless_dev *wdev = dev->ieee80211_ptr;
440 int err, i; 440 int err, i;
441 bool rejoin = false; 441 bool rejoin = false;
442 442
443 if (pairwise && !addr)
444 return -EINVAL;
445
443 if (!wdev->wext.keys) { 446 if (!wdev->wext.keys) {
444 wdev->wext.keys = kzalloc(sizeof(*wdev->wext.keys), 447 wdev->wext.keys = kzalloc(sizeof(*wdev->wext.keys),
445 GFP_KERNEL); 448 GFP_KERNEL);
@@ -478,7 +481,13 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
478 __cfg80211_leave_ibss(rdev, wdev->netdev, true); 481 __cfg80211_leave_ibss(rdev, wdev->netdev, true);
479 rejoin = true; 482 rejoin = true;
480 } 483 }
481 err = rdev->ops->del_key(&rdev->wiphy, dev, idx, addr); 484
485 if (!pairwise && addr &&
486 !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
487 err = -ENOENT;
488 else
489 err = rdev->ops->del_key(&rdev->wiphy, dev, idx,
490 pairwise, addr);
482 } 491 }
483 wdev->wext.connect.privacy = false; 492 wdev->wext.connect.privacy = false;
484 /* 493 /*
@@ -507,12 +516,13 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
507 if (addr) 516 if (addr)
508 tx_key = false; 517 tx_key = false;
509 518
510 if (cfg80211_validate_key_settings(rdev, params, idx, addr)) 519 if (cfg80211_validate_key_settings(rdev, params, idx, pairwise, addr))
511 return -EINVAL; 520 return -EINVAL;
512 521
513 err = 0; 522 err = 0;
514 if (wdev->current_bss) 523 if (wdev->current_bss)
515 err = rdev->ops->add_key(&rdev->wiphy, dev, idx, addr, params); 524 err = rdev->ops->add_key(&rdev->wiphy, dev, idx,
525 pairwise, addr, params);
516 if (err) 526 if (err)
517 return err; 527 return err;
518 528
@@ -563,17 +573,17 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
563} 573}
564 574
565static int cfg80211_set_encryption(struct cfg80211_registered_device *rdev, 575static int cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
566 struct net_device *dev, const u8 *addr, 576 struct net_device *dev, bool pairwise,
567 bool remove, bool tx_key, int idx, 577 const u8 *addr, bool remove, bool tx_key,
568 struct key_params *params) 578 int idx, struct key_params *params)
569{ 579{
570 int err; 580 int err;
571 581
572 /* devlist mutex needed for possible IBSS re-join */ 582 /* devlist mutex needed for possible IBSS re-join */
573 mutex_lock(&rdev->devlist_mtx); 583 mutex_lock(&rdev->devlist_mtx);
574 wdev_lock(dev->ieee80211_ptr); 584 wdev_lock(dev->ieee80211_ptr);
575 err = __cfg80211_set_encryption(rdev, dev, addr, remove, 585 err = __cfg80211_set_encryption(rdev, dev, pairwise, addr,
576 tx_key, idx, params); 586 remove, tx_key, idx, params);
577 wdev_unlock(dev->ieee80211_ptr); 587 wdev_unlock(dev->ieee80211_ptr);
578 mutex_unlock(&rdev->devlist_mtx); 588 mutex_unlock(&rdev->devlist_mtx);
579 589
@@ -635,7 +645,7 @@ int cfg80211_wext_siwencode(struct net_device *dev,
635 else if (!remove) 645 else if (!remove)
636 return -EINVAL; 646 return -EINVAL;
637 647
638 return cfg80211_set_encryption(rdev, dev, NULL, remove, 648 return cfg80211_set_encryption(rdev, dev, false, NULL, remove,
639 wdev->wext.default_key == -1, 649 wdev->wext.default_key == -1,
640 idx, &params); 650 idx, &params);
641} 651}
@@ -725,7 +735,9 @@ int cfg80211_wext_siwencodeext(struct net_device *dev,
725 } 735 }
726 736
727 return cfg80211_set_encryption( 737 return cfg80211_set_encryption(
728 rdev, dev, addr, remove, 738 rdev, dev,
739 !(ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY),
740 addr, remove,
729 ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY, 741 ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY,
730 idx, &params); 742 idx, &params);
731} 743}
@@ -1354,6 +1366,10 @@ struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev)
1354 } 1366 }
1355 1367
1356 wstats.qual.updated |= IW_QUAL_NOISE_INVALID; 1368 wstats.qual.updated |= IW_QUAL_NOISE_INVALID;
1369 if (sinfo.filled & STATION_INFO_RX_DROP_MISC)
1370 wstats.discard.misc = sinfo.rx_dropped_misc;
1371 if (sinfo.filled & STATION_INFO_TX_FAILED)
1372 wstats.discard.retries = sinfo.tx_failed;
1357 1373
1358 return &wstats; 1374 return &wstats;
1359} 1375}
@@ -1420,6 +1436,9 @@ int cfg80211_wext_giwessid(struct net_device *dev,
1420{ 1436{
1421 struct wireless_dev *wdev = dev->ieee80211_ptr; 1437 struct wireless_dev *wdev = dev->ieee80211_ptr;
1422 1438
1439 data->flags = 0;
1440 data->length = 0;
1441
1423 switch (wdev->iftype) { 1442 switch (wdev->iftype) {
1424 case NL80211_IFTYPE_ADHOC: 1443 case NL80211_IFTYPE_ADHOC:
1425 return cfg80211_ibss_wext_giwessid(dev, info, data, ssid); 1444 return cfg80211_ibss_wext_giwessid(dev, info, data, ssid);
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 0ef17bc42bac..dc675a3daa3d 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -611,7 +611,7 @@ struct iw_statistics *get_wireless_stats(struct net_device *dev)
611#endif 611#endif
612 612
613#ifdef CONFIG_CFG80211_WEXT 613#ifdef CONFIG_CFG80211_WEXT
614 if (dev->ieee80211_ptr && dev->ieee80211_ptr && 614 if (dev->ieee80211_ptr &&
615 dev->ieee80211_ptr->wiphy && 615 dev->ieee80211_ptr->wiphy &&
616 dev->ieee80211_ptr->wiphy->wext && 616 dev->ieee80211_ptr->wiphy->wext &&
617 dev->ieee80211_ptr->wiphy->wext->get_wireless_stats) 617 dev->ieee80211_ptr->wiphy->wext->get_wireless_stats)
@@ -782,6 +782,22 @@ static int ioctl_standard_iw_point(struct iw_point *iwp, unsigned int cmd,
782 } 782 }
783 } 783 }
784 784
785 if (IW_IS_GET(cmd) && !(descr->flags & IW_DESCR_FLAG_NOMAX)) {
786 /*
787 * If this is a GET, but not NOMAX, it means that the extra
788 * data is not bounded by userspace, but by max_tokens. Thus
789 * set the length to max_tokens. This matches the extra data
790 * allocation.
791 * The driver should fill it with the number of tokens it
792 * provided, and it may check iwp->length rather than having
793 * knowledge of max_tokens. If the driver doesn't change the
794 * iwp->length, this ioctl just copies back max_token tokens
795 * filled with zeroes. Hopefully the driver isn't claiming
796 * them to be valid data.
797 */
798 iwp->length = descr->max_tokens;
799 }
800
785 err = handler(dev, info, (union iwreq_data *) iwp, extra); 801 err = handler(dev, info, (union iwreq_data *) iwp, extra);
786 802
787 iwp->length += essid_compat; 803 iwp->length += essid_compat;
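
The new block in ioctl_standard_iw_point() pre-sets iwp->length to descr->max_tokens for GET requests whose reply is bounded by max_tokens, so a handler that forgets to update the length still yields a well-defined (zero-filled) reply instead of a stale value. A toy model of that convention, with invented names:

```c
#include <stdio.h>

struct request {
	int length;		/* tokens actually returned */
};

static void lazy_handler(struct request *req)
{
	(void)req;		/* driver forgot to set req->length */
}

static void good_handler(struct request *req)
{
	req->length = 3;	/* driver reports what it really filled */
}

static void do_get(struct request *req, int max_tokens,
		   void (*handler)(struct request *))
{
	req->length = max_tokens;	/* pre-set to capacity, as in the patch */
	handler(req);
}

int main(void)
{
	struct request a = { 0 }, b = { 0 };

	do_get(&a, 16, lazy_handler);
	do_get(&b, 16, good_handler);
	printf("lazy=%d good=%d\n", a.length, b.length);
	return 0;
}
```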
diff --git a/net/wireless/wext-priv.c b/net/wireless/wext-priv.c
index 3feb28e41c53..674d426a9d24 100644
--- a/net/wireless/wext-priv.c
+++ b/net/wireless/wext-priv.c
@@ -152,7 +152,7 @@ static int ioctl_private_iw_point(struct iw_point *iwp, unsigned int cmd,
152 } else if (!iwp->pointer) 152 } else if (!iwp->pointer)
153 return -EFAULT; 153 return -EFAULT;
154 154
155 extra = kmalloc(extra_size, GFP_KERNEL); 155 extra = kzalloc(extra_size, GFP_KERNEL);
156 if (!extra) 156 if (!extra)
157 return -ENOMEM; 157 return -ENOMEM;
158 158
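
The kmalloc()-to-kzalloc() switch in ioctl_private_iw_point() ensures the extra buffer, which may later be copied back to userspace, never carries stale heap contents in its unwritten tail. A userspace analogue using calloc() in place of kzalloc():

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	size_t extra_size = 32;
	unsigned char *extra = calloc(1, extra_size);	/* zeroed, like kzalloc() */
	size_t i;

	if (!extra)
		return 1;

	memcpy(extra, "short reply", 11);	/* handler fills 11 bytes only */

	/* The remaining bytes are guaranteed zero, not leftover heap data. */
	for (i = 11; i < extra_size; i++)
		if (extra[i] != 0)
			puts("leak!");
	puts("tail is clean");
	free(extra);
	return 0;
}
```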
diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c
index 9818198add8a..6fffe62d7c25 100644
--- a/net/wireless/wext-sme.c
+++ b/net/wireless/wext-sme.c
@@ -197,6 +197,8 @@ int cfg80211_mgd_wext_siwessid(struct net_device *dev,
197 wdev->wext.connect.ssid_len = len; 197 wdev->wext.connect.ssid_len = len;
198 198
199 wdev->wext.connect.crypto.control_port = false; 199 wdev->wext.connect.crypto.control_port = false;
200 wdev->wext.connect.crypto.control_port_ethertype =
201 cpu_to_be16(ETH_P_PAE);
200 202
201 err = cfg80211_mgd_wext_connect(rdev, wdev); 203 err = cfg80211_mgd_wext_connect(rdev, wdev);
202 out: 204 out:
diff --git a/net/x25/Kconfig b/net/x25/Kconfig
index e6759c9660bb..2196e55e4f61 100644
--- a/net/x25/Kconfig
+++ b/net/x25/Kconfig
@@ -5,6 +5,7 @@
5config X25 5config X25
6 tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)" 6 tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)"
7 depends on EXPERIMENTAL 7 depends on EXPERIMENTAL
8 depends on BKL # should be fixable
8 ---help--- 9 ---help---
9 X.25 is a set of standardized network protocols, similar in scope to 10 X.25 is a set of standardized network protocols, similar in scope to
10 frame relay; the one physical line from your box to the X.25 network 11 frame relay; the one physical line from your box to the X.25 network
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 5e86d4e97dce..f7af98dff409 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -507,14 +507,14 @@ static int x25_listen(struct socket *sock, int backlog)
507 struct sock *sk = sock->sk; 507 struct sock *sk = sock->sk;
508 int rc = -EOPNOTSUPP; 508 int rc = -EOPNOTSUPP;
509 509
510 lock_kernel(); 510 lock_sock(sk);
511 if (sk->sk_state != TCP_LISTEN) { 511 if (sk->sk_state != TCP_LISTEN) {
512 memset(&x25_sk(sk)->dest_addr, 0, X25_ADDR_LEN); 512 memset(&x25_sk(sk)->dest_addr, 0, X25_ADDR_LEN);
513 sk->sk_max_ack_backlog = backlog; 513 sk->sk_max_ack_backlog = backlog;
514 sk->sk_state = TCP_LISTEN; 514 sk->sk_state = TCP_LISTEN;
515 rc = 0; 515 rc = 0;
516 } 516 }
517 unlock_kernel(); 517 release_sock(sk);
518 518
519 return rc; 519 return rc;
520} 520}
@@ -688,7 +688,6 @@ static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
688 struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; 688 struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr;
689 int len, i, rc = 0; 689 int len, i, rc = 0;
690 690
691 lock_kernel();
692 if (!sock_flag(sk, SOCK_ZAPPED) || 691 if (!sock_flag(sk, SOCK_ZAPPED) ||
693 addr_len != sizeof(struct sockaddr_x25) || 692 addr_len != sizeof(struct sockaddr_x25) ||
694 addr->sx25_family != AF_X25) { 693 addr->sx25_family != AF_X25) {
@@ -704,12 +703,13 @@ static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
704 } 703 }
705 } 704 }
706 705
706 lock_sock(sk);
707 x25_sk(sk)->source_addr = addr->sx25_addr; 707 x25_sk(sk)->source_addr = addr->sx25_addr;
708 x25_insert_socket(sk); 708 x25_insert_socket(sk);
709 sock_reset_flag(sk, SOCK_ZAPPED); 709 sock_reset_flag(sk, SOCK_ZAPPED);
710 release_sock(sk);
710 SOCK_DEBUG(sk, "x25_bind: socket is bound\n"); 711 SOCK_DEBUG(sk, "x25_bind: socket is bound\n");
711out: 712out:
712 unlock_kernel();
713 return rc; 713 return rc;
714} 714}
715 715
@@ -751,7 +751,6 @@ static int x25_connect(struct socket *sock, struct sockaddr *uaddr,
751 struct x25_route *rt; 751 struct x25_route *rt;
752 int rc = 0; 752 int rc = 0;
753 753
754 lock_kernel();
755 lock_sock(sk); 754 lock_sock(sk);
756 if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { 755 if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) {
757 sock->state = SS_CONNECTED; 756 sock->state = SS_CONNECTED;
@@ -829,7 +828,6 @@ out_put_route:
829 x25_route_put(rt); 828 x25_route_put(rt);
830out: 829out:
831 release_sock(sk); 830 release_sock(sk);
832 unlock_kernel();
833 return rc; 831 return rc;
834} 832}
835 833
@@ -869,8 +867,7 @@ static int x25_accept(struct socket *sock, struct socket *newsock, int flags)
869 struct sk_buff *skb; 867 struct sk_buff *skb;
870 int rc = -EINVAL; 868 int rc = -EINVAL;
871 869
872 lock_kernel(); 870 if (!sk)
873 if (!sk || sk->sk_state != TCP_LISTEN)
874 goto out; 871 goto out;
875 872
876 rc = -EOPNOTSUPP; 873 rc = -EOPNOTSUPP;
@@ -878,6 +875,10 @@ static int x25_accept(struct socket *sock, struct socket *newsock, int flags)
878 goto out; 875 goto out;
879 876
880 lock_sock(sk); 877 lock_sock(sk);
878 rc = -EINVAL;
879 if (sk->sk_state != TCP_LISTEN)
880 goto out2;
881
881 rc = x25_wait_for_data(sk, sk->sk_rcvtimeo); 882 rc = x25_wait_for_data(sk, sk->sk_rcvtimeo);
882 if (rc) 883 if (rc)
883 goto out2; 884 goto out2;
@@ -897,7 +898,6 @@ static int x25_accept(struct socket *sock, struct socket *newsock, int flags)
897out2: 898out2:
898 release_sock(sk); 899 release_sock(sk);
899out: 900out:
900 unlock_kernel();
901 return rc; 901 return rc;
902} 902}
903 903
@@ -909,7 +909,6 @@ static int x25_getname(struct socket *sock, struct sockaddr *uaddr,
909 struct x25_sock *x25 = x25_sk(sk); 909 struct x25_sock *x25 = x25_sk(sk);
910 int rc = 0; 910 int rc = 0;
911 911
912 lock_kernel();
913 if (peer) { 912 if (peer) {
914 if (sk->sk_state != TCP_ESTABLISHED) { 913 if (sk->sk_state != TCP_ESTABLISHED) {
915 rc = -ENOTCONN; 914 rc = -ENOTCONN;
@@ -923,19 +922,6 @@ static int x25_getname(struct socket *sock, struct sockaddr *uaddr,
923 *uaddr_len = sizeof(*sx25); 922 *uaddr_len = sizeof(*sx25);
924 923
925out: 924out:
926 unlock_kernel();
927 return rc;
928}
929
930static unsigned int x25_datagram_poll(struct file *file, struct socket *sock,
931 poll_table *wait)
932{
933 int rc;
934
935 lock_kernel();
936 rc = datagram_poll(file, sock, wait);
937 unlock_kernel();
938
939 return rc; 925 return rc;
940} 926}
941 927
@@ -1746,7 +1732,7 @@ static const struct proto_ops x25_proto_ops = {
1746 .socketpair = sock_no_socketpair, 1732 .socketpair = sock_no_socketpair,
1747 .accept = x25_accept, 1733 .accept = x25_accept,
1748 .getname = x25_getname, 1734 .getname = x25_getname,
1749 .poll = x25_datagram_poll, 1735 .poll = datagram_poll,
1750 .ioctl = x25_ioctl, 1736 .ioctl = x25_ioctl,
1751#ifdef CONFIG_COMPAT 1737#ifdef CONFIG_COMPAT
1752 .compat_ioctl = compat_x25_ioctl, 1738 .compat_ioctl = compat_x25_ioctl,
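
The af_x25 hunks above replace the big kernel lock (lock_kernel()/unlock_kernel()) with the per-socket lock_sock()/release_sock() pair, and x25_accept() re-checks TCP_LISTEN after taking the socket lock because the state can change in between. The code below is only a userspace analogue of that per-object locking; all names are hypothetical.

```c
/* Build with: cc -pthread x25_lock.c */
#include <pthread.h>
#include <stdio.h>

struct toy_sock {
	pthread_mutex_t lock;	/* one lock per socket, not one global lock */
	int state;
};

static void toy_lock_sock(struct toy_sock *sk)    { pthread_mutex_lock(&sk->lock); }
static void toy_release_sock(struct toy_sock *sk) { pthread_mutex_unlock(&sk->lock); }

static int toy_listen(struct toy_sock *sk, int backlog)
{
	int rc = -1;

	toy_lock_sock(sk);	/* was: one process-wide lock for everything */
	if (sk->state != 2) {
		sk->state = 2;	/* roughly TCP_LISTEN */
		rc = backlog >= 0 ? 0 : -1;
	}
	toy_release_sock(sk);
	return rc;
}

int main(void)
{
	struct toy_sock sk = { .lock = PTHREAD_MUTEX_INITIALIZER, .state = 0 };

	printf("%d %d\n", toy_listen(&sk, 5), toy_listen(&sk, 5));
	return 0;
}
```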
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index a3cca0a94346..64f2ae1fdc15 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -101,7 +101,7 @@ resume:
101 err = -EHOSTUNREACH; 101 err = -EHOSTUNREACH;
102 goto error_nolock; 102 goto error_nolock;
103 } 103 }
104 skb_dst_set_noref(skb, dst); 104 skb_dst_set(skb, dst_clone(dst));
105 x = dst->xfrm; 105 x = dst->xfrm;
106 } while (x && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL)); 106 } while (x && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL));
107 107
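
The xfrm_output.c hunk swaps the borrowed skb_dst_set_noref() for skb_dst_set(skb, dst_clone(dst)), i.e. the skb now holds its own reference to the dst rather than a pointer whose lifetime must be guaranteed elsewhere. A tiny refcounting sketch of the difference; the struct and helpers are stand-ins for dst_entry and dst_clone()/dst_release():

```c
#include <stdio.h>

struct toy_dst {
	int refcnt;
};

static struct toy_dst *toy_dst_clone(struct toy_dst *dst)
{
	dst->refcnt++;
	return dst;
}

static void toy_dst_release(struct toy_dst *dst)
{
	if (--dst->refcnt == 0)
		printf("dst freed\n");
}

struct toy_skb {
	struct toy_dst *dst;
};

int main(void)
{
	struct toy_dst route = { .refcnt = 1 };	/* owner's reference */
	struct toy_skb skb;

	skb.dst = toy_dst_clone(&route);	/* skb takes its own reference */
	toy_dst_release(&route);		/* owner drops its reference... */
	toy_dst_release(skb.dst);		/* ...and the skb's copy stays valid */
	return 0;
}
```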
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 2b3ed7ad4933..044e77898512 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -50,6 +50,9 @@ static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
50static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo); 50static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo);
51static void xfrm_init_pmtu(struct dst_entry *dst); 51static void xfrm_init_pmtu(struct dst_entry *dst);
52static int stale_bundle(struct dst_entry *dst); 52static int stale_bundle(struct dst_entry *dst);
53static int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *xdst,
54 struct flowi *fl, int family, int strict);
55
53 56
54static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, 57static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
55 int dir); 58 int dir);
@@ -1175,9 +1178,8 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, struct flowi *fl,
1175 tmpl->mode == XFRM_MODE_BEET) { 1178 tmpl->mode == XFRM_MODE_BEET) {
1176 remote = &tmpl->id.daddr; 1179 remote = &tmpl->id.daddr;
1177 local = &tmpl->saddr; 1180 local = &tmpl->saddr;
1178 family = tmpl->encap_family; 1181 if (xfrm_addr_any(local, tmpl->encap_family)) {
1179 if (xfrm_addr_any(local, family)) { 1182 error = xfrm_get_saddr(net, &tmp, remote, tmpl->encap_family);
1180 error = xfrm_get_saddr(net, &tmp, remote, family);
1181 if (error) 1183 if (error)
1182 goto fail; 1184 goto fail;
1183 local = &tmp; 1185 local = &tmp;
@@ -2277,7 +2279,7 @@ static void xfrm_init_pmtu(struct dst_entry *dst)
2277 * still valid. 2279 * still valid.
2278 */ 2280 */
2279 2281
2280int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first, 2282static int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
2281 struct flowi *fl, int family, int strict) 2283 struct flowi *fl, int family, int strict)
2282{ 2284{
2283 struct dst_entry *dst = &first->u.dst; 2285 struct dst_entry *dst = &first->u.dst;
@@ -2359,8 +2361,6 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
2359 return 1; 2361 return 1;
2360} 2362}
2361 2363
2362EXPORT_SYMBOL(xfrm_bundle_ok);
2363
2364int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) 2364int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
2365{ 2365{
2366 struct net *net; 2366 struct net *net;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 5208b12fbfb4..eb96ce52f178 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -656,15 +656,23 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si)
656EXPORT_SYMBOL(xfrm_sad_getinfo); 656EXPORT_SYMBOL(xfrm_sad_getinfo);
657 657
658static int 658static int
659xfrm_init_tempsel(struct xfrm_state *x, struct flowi *fl, 659xfrm_init_tempstate(struct xfrm_state *x, struct flowi *fl,
660 struct xfrm_tmpl *tmpl, 660 struct xfrm_tmpl *tmpl,
661 xfrm_address_t *daddr, xfrm_address_t *saddr, 661 xfrm_address_t *daddr, xfrm_address_t *saddr,
662 unsigned short family) 662 unsigned short family)
663{ 663{
664 struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); 664 struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
665 if (!afinfo) 665 if (!afinfo)
666 return -1; 666 return -1;
667 afinfo->init_tempsel(x, fl, tmpl, daddr, saddr); 667 afinfo->init_tempsel(&x->sel, fl);
668
669 if (family != tmpl->encap_family) {
670 xfrm_state_put_afinfo(afinfo);
671 afinfo = xfrm_state_get_afinfo(tmpl->encap_family);
672 if (!afinfo)
673 return -1;
674 }
675 afinfo->init_temprop(x, tmpl, daddr, saddr);
668 xfrm_state_put_afinfo(afinfo); 676 xfrm_state_put_afinfo(afinfo);
669 return 0; 677 return 0;
670} 678}
@@ -790,37 +798,38 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
790 int error = 0; 798 int error = 0;
791 struct xfrm_state *best = NULL; 799 struct xfrm_state *best = NULL;
792 u32 mark = pol->mark.v & pol->mark.m; 800 u32 mark = pol->mark.v & pol->mark.m;
801 unsigned short encap_family = tmpl->encap_family;
793 802
794 to_put = NULL; 803 to_put = NULL;
795 804
796 spin_lock_bh(&xfrm_state_lock); 805 spin_lock_bh(&xfrm_state_lock);
797 h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, family); 806 h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family);
798 hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) { 807 hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) {
799 if (x->props.family == family && 808 if (x->props.family == encap_family &&
800 x->props.reqid == tmpl->reqid && 809 x->props.reqid == tmpl->reqid &&
801 (mark & x->mark.m) == x->mark.v && 810 (mark & x->mark.m) == x->mark.v &&
802 !(x->props.flags & XFRM_STATE_WILDRECV) && 811 !(x->props.flags & XFRM_STATE_WILDRECV) &&
803 xfrm_state_addr_check(x, daddr, saddr, family) && 812 xfrm_state_addr_check(x, daddr, saddr, encap_family) &&
804 tmpl->mode == x->props.mode && 813 tmpl->mode == x->props.mode &&
805 tmpl->id.proto == x->id.proto && 814 tmpl->id.proto == x->id.proto &&
806 (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) 815 (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
807 xfrm_state_look_at(pol, x, fl, family, daddr, saddr, 816 xfrm_state_look_at(pol, x, fl, encap_family, daddr, saddr,
808 &best, &acquire_in_progress, &error); 817 &best, &acquire_in_progress, &error);
809 } 818 }
810 if (best) 819 if (best)
811 goto found; 820 goto found;
812 821
813 h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, family); 822 h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family);
814 hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h_wildcard, bydst) { 823 hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h_wildcard, bydst) {
815 if (x->props.family == family && 824 if (x->props.family == encap_family &&
816 x->props.reqid == tmpl->reqid && 825 x->props.reqid == tmpl->reqid &&
817 (mark & x->mark.m) == x->mark.v && 826 (mark & x->mark.m) == x->mark.v &&
818 !(x->props.flags & XFRM_STATE_WILDRECV) && 827 !(x->props.flags & XFRM_STATE_WILDRECV) &&
819 xfrm_state_addr_check(x, daddr, saddr, family) && 828 xfrm_state_addr_check(x, daddr, saddr, encap_family) &&
820 tmpl->mode == x->props.mode && 829 tmpl->mode == x->props.mode &&
821 tmpl->id.proto == x->id.proto && 830 tmpl->id.proto == x->id.proto &&
822 (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) 831 (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
823 xfrm_state_look_at(pol, x, fl, family, daddr, saddr, 832 xfrm_state_look_at(pol, x, fl, encap_family, daddr, saddr,
824 &best, &acquire_in_progress, &error); 833 &best, &acquire_in_progress, &error);
825 } 834 }
826 835
@@ -829,7 +838,7 @@ found:
829 if (!x && !error && !acquire_in_progress) { 838 if (!x && !error && !acquire_in_progress) {
830 if (tmpl->id.spi && 839 if (tmpl->id.spi &&
831 (x0 = __xfrm_state_lookup(net, mark, daddr, tmpl->id.spi, 840 (x0 = __xfrm_state_lookup(net, mark, daddr, tmpl->id.spi,
832 tmpl->id.proto, family)) != NULL) { 841 tmpl->id.proto, encap_family)) != NULL) {
833 to_put = x0; 842 to_put = x0;
834 error = -EEXIST; 843 error = -EEXIST;
835 goto out; 844 goto out;
@@ -839,9 +848,9 @@ found:
839 error = -ENOMEM; 848 error = -ENOMEM;
840 goto out; 849 goto out;
841 } 850 }
842 /* Initialize temporary selector matching only 851 /* Initialize temporary state matching only
843 * to current session. */ 852 * to current session. */
844 xfrm_init_tempsel(x, fl, tmpl, daddr, saddr, family); 853 xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family);
845 memcpy(&x->mark, &pol->mark, sizeof(x->mark)); 854 memcpy(&x->mark, &pol->mark, sizeof(x->mark));
846 855
847 error = security_xfrm_state_alloc_acquire(x, pol->security, fl->secid); 856 error = security_xfrm_state_alloc_acquire(x, pol->security, fl->secid);
@@ -856,10 +865,10 @@ found:
856 x->km.state = XFRM_STATE_ACQ; 865 x->km.state = XFRM_STATE_ACQ;
857 list_add(&x->km.all, &net->xfrm.state_all); 866 list_add(&x->km.all, &net->xfrm.state_all);
858 hlist_add_head(&x->bydst, net->xfrm.state_bydst+h); 867 hlist_add_head(&x->bydst, net->xfrm.state_bydst+h);
859 h = xfrm_src_hash(net, daddr, saddr, family); 868 h = xfrm_src_hash(net, daddr, saddr, encap_family);
860 hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h); 869 hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
861 if (x->id.spi) { 870 if (x->id.spi) {
862 h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, family); 871 h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family);
863 hlist_add_head(&x->byspi, net->xfrm.state_byspi+h); 872 hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
864 } 873 }
865 x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; 874 x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index ba59983aaffe..8bae6b22c846 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1801,7 +1801,7 @@ static int xfrm_add_sa_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
1801 struct xfrm_user_expire *ue = nlmsg_data(nlh); 1801 struct xfrm_user_expire *ue = nlmsg_data(nlh);
1802 struct xfrm_usersa_info *p = &ue->state; 1802 struct xfrm_usersa_info *p = &ue->state;
1803 struct xfrm_mark m; 1803 struct xfrm_mark m;
1804 u32 mark = xfrm_mark_get(attrs, &m);; 1804 u32 mark = xfrm_mark_get(attrs, &m);
1805 1805
1806 x = xfrm_state_lookup(net, mark, &p->id.daddr, p->id.spi, p->id.proto, p->family); 1806 x = xfrm_state_lookup(net, mark, &p->id.daddr, p->id.spi, p->id.proto, p->family);
1807 1807
@@ -2504,7 +2504,7 @@ static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt,
2504 if (p->dir > XFRM_POLICY_OUT) 2504 if (p->dir > XFRM_POLICY_OUT)
2505 return NULL; 2505 return NULL;
2506 2506
2507 xp = xfrm_policy_alloc(net, GFP_KERNEL); 2507 xp = xfrm_policy_alloc(net, GFP_ATOMIC);
2508 if (xp == NULL) { 2508 if (xp == NULL) {
2509 *dir = -ENOBUFS; 2509 *dir = -ENOBUFS;
2510 return NULL; 2510 return NULL;